diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,77196 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 1.0, + "eval_steps": 500, + "global_step": 11022, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 9.072763563781528e-05, + "grad_norm": 820.4292708239255, + "learning_rate": 3.0211480362537765e-06, + "loss": 9.1447, + "step": 1 + }, + { + "epoch": 0.00018145527127563056, + "grad_norm": 786.6947804459147, + "learning_rate": 6.042296072507553e-06, + "loss": 9.1951, + "step": 2 + }, + { + "epoch": 0.00027218290691344586, + "grad_norm": 138.06996959903077, + "learning_rate": 9.06344410876133e-06, + "loss": 5.3362, + "step": 3 + }, + { + "epoch": 0.0003629105425512611, + "grad_norm": 189.49227190126584, + "learning_rate": 1.2084592145015106e-05, + "loss": 5.4067, + "step": 4 + }, + { + "epoch": 0.0004536381781890764, + "grad_norm": 80.48753283811283, + "learning_rate": 1.5105740181268884e-05, + "loss": 4.8299, + "step": 5 + }, + { + "epoch": 0.0005443658138268917, + "grad_norm": 164.5289187307465, + "learning_rate": 1.812688821752266e-05, + "loss": 5.8277, + "step": 6 + }, + { + "epoch": 0.000635093449464707, + "grad_norm": 99.52689820913142, + "learning_rate": 2.1148036253776438e-05, + "loss": 4.4248, + "step": 7 + }, + { + "epoch": 0.0007258210851025222, + "grad_norm": 34.59926088510973, + "learning_rate": 2.4169184290030212e-05, + "loss": 3.8422, + "step": 8 + }, + { + "epoch": 0.0008165487207403375, + "grad_norm": 44.74502538343502, + "learning_rate": 2.7190332326283987e-05, + "loss": 3.6366, + "step": 9 + }, + { + "epoch": 0.0009072763563781528, + "grad_norm": 50.247407143283546, + "learning_rate": 3.0211480362537768e-05, + "loss": 3.5254, + "step": 10 + }, + { + "epoch": 0.000998003992015968, + "grad_norm": 26.76751901785166, + "learning_rate": 3.323262839879154e-05, + "loss": 3.324, + "step": 11 + }, + { + "epoch": 0.0010887316276537834, + "grad_norm": 25.048587822911195, + "learning_rate": 3.625377643504532e-05, + "loss": 3.1885, + "step": 12 + }, + { + "epoch": 0.0011794592632915987, + "grad_norm": 24.016298184039318, + "learning_rate": 3.927492447129909e-05, + "loss": 3.2104, + "step": 13 + }, + { + "epoch": 0.001270186898929414, + "grad_norm": 14.384042903184529, + "learning_rate": 4.2296072507552875e-05, + "loss": 2.961, + "step": 14 + }, + { + "epoch": 0.0013609145345672292, + "grad_norm": 9.711932936007067, + "learning_rate": 4.5317220543806646e-05, + "loss": 2.862, + "step": 15 + }, + { + "epoch": 0.0014516421702050445, + "grad_norm": 8.522219132419734, + "learning_rate": 4.8338368580060424e-05, + "loss": 2.7848, + "step": 16 + }, + { + "epoch": 0.0015423698058428597, + "grad_norm": 6.114026275048351, + "learning_rate": 5.13595166163142e-05, + "loss": 2.7201, + "step": 17 + }, + { + "epoch": 0.001633097441480675, + "grad_norm": 6.656793931172403, + "learning_rate": 5.438066465256797e-05, + "loss": 2.7008, + "step": 18 + }, + { + "epoch": 0.0017238250771184902, + "grad_norm": 5.733828860928352, + "learning_rate": 5.740181268882176e-05, + "loss": 2.6027, + "step": 19 + }, + { + "epoch": 0.0018145527127563057, + "grad_norm": 11.150066388031325, + "learning_rate": 6.0422960725075535e-05, + "loss": 2.6224, + "step": 20 + }, + { + "epoch": 0.001905280348394121, + "grad_norm": 6.2928959751628835, + "learning_rate": 6.34441087613293e-05, + "loss": 2.531, + "step": 21 + }, + { + "epoch": 
0.001996007984031936, + "grad_norm": 4.956684129188433, + "learning_rate": 6.646525679758308e-05, + "loss": 2.4763, + "step": 22 + }, + { + "epoch": 0.0020867356196697514, + "grad_norm": 5.471355604275741, + "learning_rate": 6.948640483383687e-05, + "loss": 2.4986, + "step": 23 + }, + { + "epoch": 0.002177463255307567, + "grad_norm": 5.3348934221414925, + "learning_rate": 7.250755287009064e-05, + "loss": 2.4645, + "step": 24 + }, + { + "epoch": 0.002268190890945382, + "grad_norm": 3.111899215663545, + "learning_rate": 7.552870090634441e-05, + "loss": 2.4276, + "step": 25 + }, + { + "epoch": 0.0023589185265831974, + "grad_norm": 2.3183974707687964, + "learning_rate": 7.854984894259818e-05, + "loss": 2.4015, + "step": 26 + }, + { + "epoch": 0.0024496461622210124, + "grad_norm": 2.2310915312571518, + "learning_rate": 8.157099697885197e-05, + "loss": 2.3666, + "step": 27 + }, + { + "epoch": 0.002540373797858828, + "grad_norm": 1.796715857406065, + "learning_rate": 8.459214501510575e-05, + "loss": 2.4146, + "step": 28 + }, + { + "epoch": 0.002631101433496643, + "grad_norm": 1.878669531054621, + "learning_rate": 8.761329305135952e-05, + "loss": 2.3582, + "step": 29 + }, + { + "epoch": 0.0027218290691344584, + "grad_norm": 1.6379751765742399, + "learning_rate": 9.063444108761329e-05, + "loss": 2.3554, + "step": 30 + }, + { + "epoch": 0.0028125567047722734, + "grad_norm": 1.400374389976707, + "learning_rate": 9.365558912386706e-05, + "loss": 2.3598, + "step": 31 + }, + { + "epoch": 0.002903284340410089, + "grad_norm": 1.5649779242627133, + "learning_rate": 9.667673716012085e-05, + "loss": 2.335, + "step": 32 + }, + { + "epoch": 0.0029940119760479044, + "grad_norm": 1.5892294305314516, + "learning_rate": 9.969788519637463e-05, + "loss": 2.3703, + "step": 33 + }, + { + "epoch": 0.0030847396116857194, + "grad_norm": 2.434194090132403, + "learning_rate": 0.0001027190332326284, + "loss": 2.298, + "step": 34 + }, + { + "epoch": 0.003175467247323535, + "grad_norm": 7.5929393726290515, + "learning_rate": 0.00010574018126888218, + "loss": 2.3627, + "step": 35 + }, + { + "epoch": 0.00326619488296135, + "grad_norm": 6.862310486769989, + "learning_rate": 0.00010876132930513595, + "loss": 2.3634, + "step": 36 + }, + { + "epoch": 0.0033569225185991654, + "grad_norm": 2.3993423570221903, + "learning_rate": 0.00011178247734138973, + "loss": 2.3197, + "step": 37 + }, + { + "epoch": 0.0034476501542369804, + "grad_norm": 1.9594524426612088, + "learning_rate": 0.00011480362537764352, + "loss": 2.2989, + "step": 38 + }, + { + "epoch": 0.003538377789874796, + "grad_norm": 2.0891948230206343, + "learning_rate": 0.00011782477341389729, + "loss": 2.3218, + "step": 39 + }, + { + "epoch": 0.0036291054255126113, + "grad_norm": 1.6281196546698677, + "learning_rate": 0.00012084592145015107, + "loss": 2.3326, + "step": 40 + }, + { + "epoch": 0.0037198330611504264, + "grad_norm": 1.2394252909009817, + "learning_rate": 0.00012386706948640483, + "loss": 2.3216, + "step": 41 + }, + { + "epoch": 0.003810560696788242, + "grad_norm": 1.4787078308220734, + "learning_rate": 0.0001268882175226586, + "loss": 2.3364, + "step": 42 + }, + { + "epoch": 0.003901288332426057, + "grad_norm": 1.1116157991475009, + "learning_rate": 0.0001299093655589124, + "loss": 2.323, + "step": 43 + }, + { + "epoch": 0.003992015968063872, + "grad_norm": 0.9307980950615132, + "learning_rate": 0.00013293051359516617, + "loss": 2.2979, + "step": 44 + }, + { + "epoch": 0.004082743603701687, + "grad_norm": 1.2451945842342473, + "learning_rate": 
0.00013595166163141994, + "loss": 2.3288, + "step": 45 + }, + { + "epoch": 0.004173471239339503, + "grad_norm": 0.8531747741954858, + "learning_rate": 0.00013897280966767374, + "loss": 2.291, + "step": 46 + }, + { + "epoch": 0.004264198874977318, + "grad_norm": 0.9043824177833142, + "learning_rate": 0.0001419939577039275, + "loss": 2.3277, + "step": 47 + }, + { + "epoch": 0.004354926510615134, + "grad_norm": 1.2828658623949485, + "learning_rate": 0.00014501510574018128, + "loss": 2.2476, + "step": 48 + }, + { + "epoch": 0.004445654146252948, + "grad_norm": 0.7494531663996155, + "learning_rate": 0.00014803625377643502, + "loss": 2.275, + "step": 49 + }, + { + "epoch": 0.004536381781890764, + "grad_norm": 1.496162680000621, + "learning_rate": 0.00015105740181268882, + "loss": 2.2773, + "step": 50 + }, + { + "epoch": 0.004627109417528579, + "grad_norm": 0.883026724479648, + "learning_rate": 0.0001540785498489426, + "loss": 2.2307, + "step": 51 + }, + { + "epoch": 0.004717837053166395, + "grad_norm": 1.3830373737473032, + "learning_rate": 0.00015709969788519636, + "loss": 2.292, + "step": 52 + }, + { + "epoch": 0.004808564688804209, + "grad_norm": 2.876324446592375, + "learning_rate": 0.00016012084592145016, + "loss": 2.2363, + "step": 53 + }, + { + "epoch": 0.004899292324442025, + "grad_norm": 1.9404638777963144, + "learning_rate": 0.00016314199395770393, + "loss": 2.2876, + "step": 54 + }, + { + "epoch": 0.00499001996007984, + "grad_norm": 1.3761181406841545, + "learning_rate": 0.0001661631419939577, + "loss": 2.294, + "step": 55 + }, + { + "epoch": 0.005080747595717656, + "grad_norm": 0.8499136073323302, + "learning_rate": 0.0001691842900302115, + "loss": 2.2354, + "step": 56 + }, + { + "epoch": 0.005171475231355471, + "grad_norm": 1.369659695181981, + "learning_rate": 0.00017220543806646527, + "loss": 2.2473, + "step": 57 + }, + { + "epoch": 0.005262202866993286, + "grad_norm": 0.694439486925276, + "learning_rate": 0.00017522658610271904, + "loss": 2.264, + "step": 58 + }, + { + "epoch": 0.005352930502631101, + "grad_norm": 0.8051420939430045, + "learning_rate": 0.0001782477341389728, + "loss": 2.2822, + "step": 59 + }, + { + "epoch": 0.005443658138268917, + "grad_norm": 0.8244007408125796, + "learning_rate": 0.00018126888217522659, + "loss": 2.2732, + "step": 60 + }, + { + "epoch": 0.005534385773906732, + "grad_norm": 0.6832848906000261, + "learning_rate": 0.00018429003021148036, + "loss": 2.2266, + "step": 61 + }, + { + "epoch": 0.005625113409544547, + "grad_norm": 0.648216548706016, + "learning_rate": 0.00018731117824773413, + "loss": 2.2626, + "step": 62 + }, + { + "epoch": 0.005715841045182362, + "grad_norm": 0.6348033694381108, + "learning_rate": 0.00019033232628398793, + "loss": 2.2303, + "step": 63 + }, + { + "epoch": 0.005806568680820178, + "grad_norm": 0.6106414280185232, + "learning_rate": 0.0001933534743202417, + "loss": 2.2215, + "step": 64 + }, + { + "epoch": 0.005897296316457993, + "grad_norm": 0.8003566726867068, + "learning_rate": 0.00019637462235649547, + "loss": 2.2607, + "step": 65 + }, + { + "epoch": 0.005988023952095809, + "grad_norm": 0.6577719933055758, + "learning_rate": 0.00019939577039274927, + "loss": 2.2282, + "step": 66 + }, + { + "epoch": 0.006078751587733623, + "grad_norm": 0.6167316526764866, + "learning_rate": 0.00020241691842900304, + "loss": 2.2559, + "step": 67 + }, + { + "epoch": 0.006169479223371439, + "grad_norm": 0.6817836266273232, + "learning_rate": 0.0002054380664652568, + "loss": 2.266, + "step": 68 + }, + { + "epoch": 0.006260206859009254, + 
"grad_norm": 0.5809607940895084, + "learning_rate": 0.00020845921450151058, + "loss": 2.2342, + "step": 69 + }, + { + "epoch": 0.00635093449464707, + "grad_norm": 0.6046783280638626, + "learning_rate": 0.00021148036253776435, + "loss": 2.2807, + "step": 70 + }, + { + "epoch": 0.006441662130284884, + "grad_norm": 0.44327342918901, + "learning_rate": 0.00021450151057401812, + "loss": 2.2428, + "step": 71 + }, + { + "epoch": 0.0065323897659227, + "grad_norm": 0.7273191431266415, + "learning_rate": 0.0002175226586102719, + "loss": 2.2492, + "step": 72 + }, + { + "epoch": 0.006623117401560515, + "grad_norm": 0.5975498108336026, + "learning_rate": 0.0002205438066465257, + "loss": 2.2065, + "step": 73 + }, + { + "epoch": 0.006713845037198331, + "grad_norm": 0.7560929043058257, + "learning_rate": 0.00022356495468277946, + "loss": 2.255, + "step": 74 + }, + { + "epoch": 0.006804572672836146, + "grad_norm": 0.7916841323438787, + "learning_rate": 0.00022658610271903323, + "loss": 2.2234, + "step": 75 + }, + { + "epoch": 0.006895300308473961, + "grad_norm": 0.9107783109606902, + "learning_rate": 0.00022960725075528703, + "loss": 2.2207, + "step": 76 + }, + { + "epoch": 0.006986027944111776, + "grad_norm": 1.195117581309827, + "learning_rate": 0.0002326283987915408, + "loss": 2.1884, + "step": 77 + }, + { + "epoch": 0.007076755579749592, + "grad_norm": 1.3250324898800747, + "learning_rate": 0.00023564954682779457, + "loss": 2.2494, + "step": 78 + }, + { + "epoch": 0.007167483215387407, + "grad_norm": 1.4576441809312517, + "learning_rate": 0.00023867069486404834, + "loss": 2.2078, + "step": 79 + }, + { + "epoch": 0.007258210851025223, + "grad_norm": 0.9941865717013201, + "learning_rate": 0.00024169184290030214, + "loss": 2.2482, + "step": 80 + }, + { + "epoch": 0.007348938486663037, + "grad_norm": 1.0534796017980255, + "learning_rate": 0.00024471299093655586, + "loss": 2.2251, + "step": 81 + }, + { + "epoch": 0.007439666122300853, + "grad_norm": 0.9628377707465988, + "learning_rate": 0.00024773413897280966, + "loss": 2.2363, + "step": 82 + }, + { + "epoch": 0.007530393757938668, + "grad_norm": 0.5977995581474005, + "learning_rate": 0.00025075528700906345, + "loss": 2.1708, + "step": 83 + }, + { + "epoch": 0.007621121393576484, + "grad_norm": 0.6094892347849484, + "learning_rate": 0.0002537764350453172, + "loss": 2.2287, + "step": 84 + }, + { + "epoch": 0.007711849029214298, + "grad_norm": 0.7572654503624252, + "learning_rate": 0.00025679758308157105, + "loss": 2.1992, + "step": 85 + }, + { + "epoch": 0.007802576664852114, + "grad_norm": 0.6677663044766768, + "learning_rate": 0.0002598187311178248, + "loss": 2.2035, + "step": 86 + }, + { + "epoch": 0.007893304300489928, + "grad_norm": 0.9007720727173132, + "learning_rate": 0.00026283987915407854, + "loss": 2.1779, + "step": 87 + }, + { + "epoch": 0.007984031936127744, + "grad_norm": 1.292709523490692, + "learning_rate": 0.00026586102719033234, + "loss": 2.1846, + "step": 88 + }, + { + "epoch": 0.00807475957176556, + "grad_norm": 0.8724457194717196, + "learning_rate": 0.0002688821752265861, + "loss": 2.2156, + "step": 89 + }, + { + "epoch": 0.008165487207403375, + "grad_norm": 0.74861170509202, + "learning_rate": 0.0002719033232628399, + "loss": 2.2154, + "step": 90 + }, + { + "epoch": 0.00825621484304119, + "grad_norm": 0.8258880483260447, + "learning_rate": 0.0002749244712990936, + "loss": 2.2278, + "step": 91 + }, + { + "epoch": 0.008346942478679006, + "grad_norm": 1.1342307855590084, + "learning_rate": 0.0002779456193353475, + "loss": 2.1807, + 
"step": 92 + }, + { + "epoch": 0.008437670114316821, + "grad_norm": 0.8183713002805795, + "learning_rate": 0.0002809667673716012, + "loss": 2.2369, + "step": 93 + }, + { + "epoch": 0.008528397749954637, + "grad_norm": 0.7210757396412787, + "learning_rate": 0.000283987915407855, + "loss": 2.2136, + "step": 94 + }, + { + "epoch": 0.008619125385592452, + "grad_norm": 0.5677025822598858, + "learning_rate": 0.00028700906344410876, + "loss": 2.1936, + "step": 95 + }, + { + "epoch": 0.008709853021230268, + "grad_norm": 0.6026342931207197, + "learning_rate": 0.00029003021148036256, + "loss": 2.1888, + "step": 96 + }, + { + "epoch": 0.008800580656868081, + "grad_norm": 0.6611372530400874, + "learning_rate": 0.0002930513595166163, + "loss": 2.1992, + "step": 97 + }, + { + "epoch": 0.008891308292505897, + "grad_norm": 0.520844450054285, + "learning_rate": 0.00029607250755287005, + "loss": 2.2295, + "step": 98 + }, + { + "epoch": 0.008982035928143712, + "grad_norm": 0.6743679447744166, + "learning_rate": 0.0002990936555891239, + "loss": 2.1865, + "step": 99 + }, + { + "epoch": 0.009072763563781528, + "grad_norm": 0.6499460355933928, + "learning_rate": 0.00030211480362537764, + "loss": 2.1903, + "step": 100 + }, + { + "epoch": 0.009163491199419343, + "grad_norm": 1.0997225052767212, + "learning_rate": 0.00030513595166163144, + "loss": 2.2144, + "step": 101 + }, + { + "epoch": 0.009254218835057159, + "grad_norm": 0.6544849974188365, + "learning_rate": 0.0003081570996978852, + "loss": 2.197, + "step": 102 + }, + { + "epoch": 0.009344946470694974, + "grad_norm": 0.6235985319088647, + "learning_rate": 0.000311178247734139, + "loss": 2.1764, + "step": 103 + }, + { + "epoch": 0.00943567410633279, + "grad_norm": 0.4788139826559429, + "learning_rate": 0.00031419939577039273, + "loss": 2.177, + "step": 104 + }, + { + "epoch": 0.009526401741970605, + "grad_norm": 0.9638150891936241, + "learning_rate": 0.0003172205438066466, + "loss": 2.1996, + "step": 105 + }, + { + "epoch": 0.009617129377608419, + "grad_norm": 0.6157767985483467, + "learning_rate": 0.0003202416918429003, + "loss": 2.2164, + "step": 106 + }, + { + "epoch": 0.009707857013246234, + "grad_norm": 0.8877359612111287, + "learning_rate": 0.00032326283987915407, + "loss": 2.1834, + "step": 107 + }, + { + "epoch": 0.00979858464888405, + "grad_norm": 1.0872349024346932, + "learning_rate": 0.00032628398791540787, + "loss": 2.2062, + "step": 108 + }, + { + "epoch": 0.009889312284521865, + "grad_norm": 1.3982351604301595, + "learning_rate": 0.0003293051359516616, + "loss": 2.2302, + "step": 109 + }, + { + "epoch": 0.00998003992015968, + "grad_norm": 1.351286782593136, + "learning_rate": 0.0003323262839879154, + "loss": 2.2105, + "step": 110 + }, + { + "epoch": 0.010070767555797496, + "grad_norm": 0.707727974500633, + "learning_rate": 0.00033534743202416915, + "loss": 2.1873, + "step": 111 + }, + { + "epoch": 0.010161495191435312, + "grad_norm": 0.7864926701120595, + "learning_rate": 0.000338368580060423, + "loss": 2.1724, + "step": 112 + }, + { + "epoch": 0.010252222827073127, + "grad_norm": 0.6016277296585074, + "learning_rate": 0.00034138972809667675, + "loss": 2.2164, + "step": 113 + }, + { + "epoch": 0.010342950462710943, + "grad_norm": 0.7344814586487206, + "learning_rate": 0.00034441087613293055, + "loss": 2.207, + "step": 114 + }, + { + "epoch": 0.010433678098348756, + "grad_norm": 0.5857569967762618, + "learning_rate": 0.0003474320241691843, + "loss": 2.2142, + "step": 115 + }, + { + "epoch": 0.010524405733986572, + "grad_norm": 0.5450597425281701, + 
"learning_rate": 0.0003504531722054381, + "loss": 2.1477, + "step": 116 + }, + { + "epoch": 0.010615133369624387, + "grad_norm": 0.6705351005346584, + "learning_rate": 0.00035347432024169183, + "loss": 2.1858, + "step": 117 + }, + { + "epoch": 0.010705861005262203, + "grad_norm": 0.5682725411158781, + "learning_rate": 0.0003564954682779456, + "loss": 2.2062, + "step": 118 + }, + { + "epoch": 0.010796588640900018, + "grad_norm": 0.8753302572355304, + "learning_rate": 0.00035951661631419943, + "loss": 2.1566, + "step": 119 + }, + { + "epoch": 0.010887316276537834, + "grad_norm": 0.9249830735788512, + "learning_rate": 0.00036253776435045317, + "loss": 2.2074, + "step": 120 + }, + { + "epoch": 0.010978043912175649, + "grad_norm": 0.5765175197797057, + "learning_rate": 0.00036555891238670697, + "loss": 2.1905, + "step": 121 + }, + { + "epoch": 0.011068771547813465, + "grad_norm": 0.7125721427983491, + "learning_rate": 0.0003685800604229607, + "loss": 2.1649, + "step": 122 + }, + { + "epoch": 0.01115949918345128, + "grad_norm": 0.8571813740087499, + "learning_rate": 0.0003716012084592145, + "loss": 2.1788, + "step": 123 + }, + { + "epoch": 0.011250226819089094, + "grad_norm": 0.9555105563218532, + "learning_rate": 0.00037462235649546826, + "loss": 2.2213, + "step": 124 + }, + { + "epoch": 0.01134095445472691, + "grad_norm": 0.9364221162229269, + "learning_rate": 0.0003776435045317221, + "loss": 2.1893, + "step": 125 + }, + { + "epoch": 0.011431682090364725, + "grad_norm": 0.681298854339522, + "learning_rate": 0.00038066465256797585, + "loss": 2.1934, + "step": 126 + }, + { + "epoch": 0.01152240972600254, + "grad_norm": 0.8945762439788384, + "learning_rate": 0.00038368580060422965, + "loss": 2.1652, + "step": 127 + }, + { + "epoch": 0.011613137361640356, + "grad_norm": 0.9957532399996867, + "learning_rate": 0.0003867069486404834, + "loss": 2.1766, + "step": 128 + }, + { + "epoch": 0.011703864997278171, + "grad_norm": 0.9527206015754074, + "learning_rate": 0.00038972809667673714, + "loss": 2.2199, + "step": 129 + }, + { + "epoch": 0.011794592632915987, + "grad_norm": 0.8492911520614113, + "learning_rate": 0.00039274924471299094, + "loss": 2.2213, + "step": 130 + }, + { + "epoch": 0.011885320268553802, + "grad_norm": 0.8596140376120798, + "learning_rate": 0.0003957703927492447, + "loss": 2.1821, + "step": 131 + }, + { + "epoch": 0.011976047904191617, + "grad_norm": 0.7475976792758132, + "learning_rate": 0.00039879154078549853, + "loss": 2.1915, + "step": 132 + }, + { + "epoch": 0.012066775539829431, + "grad_norm": 0.7605990255605074, + "learning_rate": 0.0004018126888217523, + "loss": 2.1192, + "step": 133 + }, + { + "epoch": 0.012157503175467247, + "grad_norm": 0.5054165859114896, + "learning_rate": 0.0004048338368580061, + "loss": 2.162, + "step": 134 + }, + { + "epoch": 0.012248230811105062, + "grad_norm": 1.0394387342184437, + "learning_rate": 0.0004078549848942598, + "loss": 2.1726, + "step": 135 + }, + { + "epoch": 0.012338958446742878, + "grad_norm": 1.7156108221328628, + "learning_rate": 0.0004108761329305136, + "loss": 2.1836, + "step": 136 + }, + { + "epoch": 0.012429686082380693, + "grad_norm": 1.2160519819132387, + "learning_rate": 0.00041389728096676736, + "loss": 2.1676, + "step": 137 + }, + { + "epoch": 0.012520413718018509, + "grad_norm": 1.0174384355350117, + "learning_rate": 0.00041691842900302116, + "loss": 2.1729, + "step": 138 + }, + { + "epoch": 0.012611141353656324, + "grad_norm": 0.6347406206734989, + "learning_rate": 0.00041993957703927496, + "loss": 2.1731, + "step": 139 + 
}, + { + "epoch": 0.01270186898929414, + "grad_norm": 0.8189831081733057, + "learning_rate": 0.0004229607250755287, + "loss": 2.1661, + "step": 140 + }, + { + "epoch": 0.012792596624931955, + "grad_norm": 0.7566078466683789, + "learning_rate": 0.0004259818731117825, + "loss": 2.25, + "step": 141 + }, + { + "epoch": 0.012883324260569769, + "grad_norm": 0.9294489818935237, + "learning_rate": 0.00042900302114803624, + "loss": 2.1923, + "step": 142 + }, + { + "epoch": 0.012974051896207584, + "grad_norm": 0.700885738384636, + "learning_rate": 0.00043202416918429004, + "loss": 2.1815, + "step": 143 + }, + { + "epoch": 0.0130647795318454, + "grad_norm": 0.7262407075257339, + "learning_rate": 0.0004350453172205438, + "loss": 2.2013, + "step": 144 + }, + { + "epoch": 0.013155507167483215, + "grad_norm": 0.5900520064526659, + "learning_rate": 0.0004380664652567976, + "loss": 2.2423, + "step": 145 + }, + { + "epoch": 0.01324623480312103, + "grad_norm": 0.7512867881841373, + "learning_rate": 0.0004410876132930514, + "loss": 2.1549, + "step": 146 + }, + { + "epoch": 0.013336962438758846, + "grad_norm": 0.7127281086644328, + "learning_rate": 0.0004441087613293052, + "loss": 2.1811, + "step": 147 + }, + { + "epoch": 0.013427690074396662, + "grad_norm": 0.645205013084596, + "learning_rate": 0.0004471299093655589, + "loss": 2.1523, + "step": 148 + }, + { + "epoch": 0.013518417710034477, + "grad_norm": 0.9452934059192973, + "learning_rate": 0.0004501510574018127, + "loss": 2.1907, + "step": 149 + }, + { + "epoch": 0.013609145345672292, + "grad_norm": 1.0022894234423791, + "learning_rate": 0.00045317220543806646, + "loss": 2.1786, + "step": 150 + }, + { + "epoch": 0.013699872981310108, + "grad_norm": 0.7494499005099639, + "learning_rate": 0.0004561933534743202, + "loss": 2.2158, + "step": 151 + }, + { + "epoch": 0.013790600616947922, + "grad_norm": 1.5164872301191674, + "learning_rate": 0.00045921450151057406, + "loss": 2.1925, + "step": 152 + }, + { + "epoch": 0.013881328252585737, + "grad_norm": 0.888825213325083, + "learning_rate": 0.0004622356495468278, + "loss": 2.2028, + "step": 153 + }, + { + "epoch": 0.013972055888223553, + "grad_norm": 0.7173181177977059, + "learning_rate": 0.0004652567975830816, + "loss": 2.1813, + "step": 154 + }, + { + "epoch": 0.014062783523861368, + "grad_norm": 0.5371160851347164, + "learning_rate": 0.00046827794561933535, + "loss": 2.2017, + "step": 155 + }, + { + "epoch": 0.014153511159499184, + "grad_norm": 0.5071978944476353, + "learning_rate": 0.00047129909365558915, + "loss": 2.1723, + "step": 156 + }, + { + "epoch": 0.014244238795136999, + "grad_norm": 0.425241711540431, + "learning_rate": 0.0004743202416918429, + "loss": 2.1645, + "step": 157 + }, + { + "epoch": 0.014334966430774814, + "grad_norm": 0.5960283422878132, + "learning_rate": 0.0004773413897280967, + "loss": 2.1922, + "step": 158 + }, + { + "epoch": 0.01442569406641263, + "grad_norm": 0.4657172565433116, + "learning_rate": 0.0004803625377643505, + "loss": 2.1918, + "step": 159 + }, + { + "epoch": 0.014516421702050445, + "grad_norm": 0.5694968815537725, + "learning_rate": 0.0004833836858006043, + "loss": 2.1882, + "step": 160 + }, + { + "epoch": 0.014607149337688259, + "grad_norm": 0.41609250334151004, + "learning_rate": 0.00048640483383685803, + "loss": 2.1932, + "step": 161 + }, + { + "epoch": 0.014697876973326075, + "grad_norm": 0.589908473856481, + "learning_rate": 0.0004894259818731117, + "loss": 2.188, + "step": 162 + }, + { + "epoch": 0.01478860460896389, + "grad_norm": 0.4069218637420419, + 
"learning_rate": 0.0004924471299093656, + "loss": 2.1698, + "step": 163 + }, + { + "epoch": 0.014879332244601706, + "grad_norm": 0.5962038004043367, + "learning_rate": 0.0004954682779456193, + "loss": 2.1881, + "step": 164 + }, + { + "epoch": 0.014970059880239521, + "grad_norm": 0.8444269722048197, + "learning_rate": 0.0004984894259818731, + "loss": 2.1651, + "step": 165 + }, + { + "epoch": 0.015060787515877336, + "grad_norm": 0.46064404060604197, + "learning_rate": 0.0005015105740181269, + "loss": 2.2008, + "step": 166 + }, + { + "epoch": 0.015151515151515152, + "grad_norm": 0.6910508088372356, + "learning_rate": 0.0005045317220543807, + "loss": 2.1846, + "step": 167 + }, + { + "epoch": 0.015242242787152967, + "grad_norm": 0.4647076278879488, + "learning_rate": 0.0005075528700906344, + "loss": 2.1675, + "step": 168 + }, + { + "epoch": 0.015332970422790783, + "grad_norm": 0.4592190360698178, + "learning_rate": 0.0005105740181268882, + "loss": 2.1924, + "step": 169 + }, + { + "epoch": 0.015423698058428597, + "grad_norm": 0.4732378401606886, + "learning_rate": 0.0005135951661631421, + "loss": 2.1687, + "step": 170 + }, + { + "epoch": 0.015514425694066412, + "grad_norm": 0.7480730535594214, + "learning_rate": 0.0005166163141993958, + "loss": 2.1654, + "step": 171 + }, + { + "epoch": 0.015605153329704228, + "grad_norm": 0.7612574178521395, + "learning_rate": 0.0005196374622356496, + "loss": 2.1706, + "step": 172 + }, + { + "epoch": 0.015695880965342045, + "grad_norm": 0.6084857502325823, + "learning_rate": 0.0005226586102719033, + "loss": 2.114, + "step": 173 + }, + { + "epoch": 0.015786608600979857, + "grad_norm": 0.6057838919973224, + "learning_rate": 0.0005256797583081571, + "loss": 2.1245, + "step": 174 + }, + { + "epoch": 0.015877336236617672, + "grad_norm": 0.9771297662894174, + "learning_rate": 0.0005287009063444109, + "loss": 2.1468, + "step": 175 + }, + { + "epoch": 0.015968063872255488, + "grad_norm": 0.9891196308639384, + "learning_rate": 0.0005317220543806647, + "loss": 2.1704, + "step": 176 + }, + { + "epoch": 0.016058791507893303, + "grad_norm": 0.6241618528711156, + "learning_rate": 0.0005347432024169185, + "loss": 2.1782, + "step": 177 + }, + { + "epoch": 0.01614951914353112, + "grad_norm": 0.6553118945057888, + "learning_rate": 0.0005377643504531722, + "loss": 2.1257, + "step": 178 + }, + { + "epoch": 0.016240246779168934, + "grad_norm": 0.9628565358279236, + "learning_rate": 0.0005407854984894261, + "loss": 2.1527, + "step": 179 + }, + { + "epoch": 0.01633097441480675, + "grad_norm": 0.8129631323993676, + "learning_rate": 0.0005438066465256798, + "loss": 2.1524, + "step": 180 + }, + { + "epoch": 0.016421702050444565, + "grad_norm": 1.537199300620452, + "learning_rate": 0.0005468277945619336, + "loss": 2.1529, + "step": 181 + }, + { + "epoch": 0.01651242968608238, + "grad_norm": 1.696978959875208, + "learning_rate": 0.0005498489425981872, + "loss": 2.1746, + "step": 182 + }, + { + "epoch": 0.016603157321720196, + "grad_norm": 0.9360017790364128, + "learning_rate": 0.000552870090634441, + "loss": 2.1665, + "step": 183 + }, + { + "epoch": 0.01669388495735801, + "grad_norm": 0.6868276562691026, + "learning_rate": 0.000555891238670695, + "loss": 2.1229, + "step": 184 + }, + { + "epoch": 0.016784612592995827, + "grad_norm": 0.555495360123967, + "learning_rate": 0.0005589123867069486, + "loss": 2.162, + "step": 185 + }, + { + "epoch": 0.016875340228633642, + "grad_norm": 0.6425162102728619, + "learning_rate": 0.0005619335347432024, + "loss": 2.1768, + "step": 186 + }, + { + "epoch": 
0.016966067864271458, + "grad_norm": 0.5455130193455889, + "learning_rate": 0.0005649546827794561, + "loss": 2.1879, + "step": 187 + }, + { + "epoch": 0.017056795499909273, + "grad_norm": 0.563548886824068, + "learning_rate": 0.00056797583081571, + "loss": 2.1197, + "step": 188 + }, + { + "epoch": 0.01714752313554709, + "grad_norm": 0.5347641846465149, + "learning_rate": 0.0005709969788519637, + "loss": 2.1375, + "step": 189 + }, + { + "epoch": 0.017238250771184904, + "grad_norm": 0.4782749446134411, + "learning_rate": 0.0005740181268882175, + "loss": 2.1607, + "step": 190 + }, + { + "epoch": 0.01732897840682272, + "grad_norm": 0.39147839722277383, + "learning_rate": 0.0005770392749244713, + "loss": 2.1673, + "step": 191 + }, + { + "epoch": 0.017419706042460535, + "grad_norm": 0.5954227383436532, + "learning_rate": 0.0005800604229607251, + "loss": 2.1895, + "step": 192 + }, + { + "epoch": 0.017510433678098347, + "grad_norm": 0.4588028111404706, + "learning_rate": 0.0005830815709969789, + "loss": 2.1642, + "step": 193 + }, + { + "epoch": 0.017601161313736163, + "grad_norm": 0.6027099879901632, + "learning_rate": 0.0005861027190332326, + "loss": 2.1405, + "step": 194 + }, + { + "epoch": 0.017691888949373978, + "grad_norm": 0.396315640694945, + "learning_rate": 0.0005891238670694864, + "loss": 2.1243, + "step": 195 + }, + { + "epoch": 0.017782616585011794, + "grad_norm": 0.794668446052775, + "learning_rate": 0.0005921450151057401, + "loss": 2.1537, + "step": 196 + }, + { + "epoch": 0.01787334422064961, + "grad_norm": 0.8496693044863525, + "learning_rate": 0.000595166163141994, + "loss": 2.1006, + "step": 197 + }, + { + "epoch": 0.017964071856287425, + "grad_norm": 0.8423631628349483, + "learning_rate": 0.0005981873111782478, + "loss": 2.1517, + "step": 198 + }, + { + "epoch": 0.01805479949192524, + "grad_norm": 0.5359242994412692, + "learning_rate": 0.0006012084592145015, + "loss": 2.1407, + "step": 199 + }, + { + "epoch": 0.018145527127563055, + "grad_norm": 0.8541968522542311, + "learning_rate": 0.0006042296072507553, + "loss": 2.138, + "step": 200 + }, + { + "epoch": 0.01823625476320087, + "grad_norm": 0.5699106159688756, + "learning_rate": 0.0006072507552870091, + "loss": 2.1336, + "step": 201 + }, + { + "epoch": 0.018326982398838686, + "grad_norm": 0.4792329016846558, + "learning_rate": 0.0006102719033232629, + "loss": 2.1425, + "step": 202 + }, + { + "epoch": 0.018417710034476502, + "grad_norm": 0.560043689671445, + "learning_rate": 0.0006132930513595167, + "loss": 2.145, + "step": 203 + }, + { + "epoch": 0.018508437670114317, + "grad_norm": 0.5044555837498224, + "learning_rate": 0.0006163141993957704, + "loss": 2.1122, + "step": 204 + }, + { + "epoch": 0.018599165305752133, + "grad_norm": 1.0571812433247745, + "learning_rate": 0.0006193353474320242, + "loss": 2.1311, + "step": 205 + }, + { + "epoch": 0.018689892941389948, + "grad_norm": 0.4666979747579337, + "learning_rate": 0.000622356495468278, + "loss": 2.1156, + "step": 206 + }, + { + "epoch": 0.018780620577027764, + "grad_norm": 0.5166239718939568, + "learning_rate": 0.0006253776435045318, + "loss": 2.0978, + "step": 207 + }, + { + "epoch": 0.01887134821266558, + "grad_norm": 0.4109878219246611, + "learning_rate": 0.0006283987915407855, + "loss": 2.1047, + "step": 208 + }, + { + "epoch": 0.018962075848303395, + "grad_norm": 0.4565801841735219, + "learning_rate": 0.0006314199395770393, + "loss": 2.1539, + "step": 209 + }, + { + "epoch": 0.01905280348394121, + "grad_norm": 0.4232454745419823, + "learning_rate": 
0.0006344410876132932, + "loss": 2.1545, + "step": 210 + }, + { + "epoch": 0.019143531119579022, + "grad_norm": 0.473005307569919, + "learning_rate": 0.0006374622356495468, + "loss": 2.1086, + "step": 211 + }, + { + "epoch": 0.019234258755216838, + "grad_norm": 0.5173048430359828, + "learning_rate": 0.0006404833836858006, + "loss": 2.1292, + "step": 212 + }, + { + "epoch": 0.019324986390854653, + "grad_norm": 0.4457285171639899, + "learning_rate": 0.0006435045317220543, + "loss": 2.0936, + "step": 213 + }, + { + "epoch": 0.01941571402649247, + "grad_norm": 0.36673075630632973, + "learning_rate": 0.0006465256797583081, + "loss": 2.1004, + "step": 214 + }, + { + "epoch": 0.019506441662130284, + "grad_norm": 0.5112680728530213, + "learning_rate": 0.0006495468277945619, + "loss": 2.1449, + "step": 215 + }, + { + "epoch": 0.0195971692977681, + "grad_norm": 0.5450123542130839, + "learning_rate": 0.0006525679758308157, + "loss": 2.1243, + "step": 216 + }, + { + "epoch": 0.019687896933405915, + "grad_norm": 0.5802281795461045, + "learning_rate": 0.0006555891238670695, + "loss": 2.1231, + "step": 217 + }, + { + "epoch": 0.01977862456904373, + "grad_norm": 0.5002512725628194, + "learning_rate": 0.0006586102719033232, + "loss": 2.101, + "step": 218 + }, + { + "epoch": 0.019869352204681546, + "grad_norm": 0.34516797911457525, + "learning_rate": 0.0006616314199395771, + "loss": 2.1075, + "step": 219 + }, + { + "epoch": 0.01996007984031936, + "grad_norm": 0.6244119678319301, + "learning_rate": 0.0006646525679758308, + "loss": 2.1277, + "step": 220 + }, + { + "epoch": 0.020050807475957177, + "grad_norm": 0.5825406402046905, + "learning_rate": 0.0006676737160120846, + "loss": 2.1466, + "step": 221 + }, + { + "epoch": 0.020141535111594992, + "grad_norm": 1.1416540217910731, + "learning_rate": 0.0006706948640483383, + "loss": 2.13, + "step": 222 + }, + { + "epoch": 0.020232262747232808, + "grad_norm": 1.5539261460154343, + "learning_rate": 0.0006737160120845922, + "loss": 2.1377, + "step": 223 + }, + { + "epoch": 0.020322990382870623, + "grad_norm": 0.7764884898267884, + "learning_rate": 0.000676737160120846, + "loss": 2.1253, + "step": 224 + }, + { + "epoch": 0.02041371801850844, + "grad_norm": 0.6951115909936142, + "learning_rate": 0.0006797583081570997, + "loss": 2.1141, + "step": 225 + }, + { + "epoch": 0.020504445654146254, + "grad_norm": 0.37296608199594455, + "learning_rate": 0.0006827794561933535, + "loss": 2.1407, + "step": 226 + }, + { + "epoch": 0.02059517328978407, + "grad_norm": 0.3859530636081249, + "learning_rate": 0.0006858006042296072, + "loss": 2.1398, + "step": 227 + }, + { + "epoch": 0.020685900925421885, + "grad_norm": 0.7308868218042848, + "learning_rate": 0.0006888217522658611, + "loss": 2.1305, + "step": 228 + }, + { + "epoch": 0.020776628561059697, + "grad_norm": 0.607778945866942, + "learning_rate": 0.0006918429003021148, + "loss": 2.088, + "step": 229 + }, + { + "epoch": 0.020867356196697513, + "grad_norm": 0.49360848193179296, + "learning_rate": 0.0006948640483383686, + "loss": 2.178, + "step": 230 + }, + { + "epoch": 0.020958083832335328, + "grad_norm": 0.5252707046965568, + "learning_rate": 0.0006978851963746224, + "loss": 2.0781, + "step": 231 + }, + { + "epoch": 0.021048811467973143, + "grad_norm": 0.3547039054558016, + "learning_rate": 0.0007009063444108762, + "loss": 2.0923, + "step": 232 + }, + { + "epoch": 0.02113953910361096, + "grad_norm": 0.3428080038285104, + "learning_rate": 0.00070392749244713, + "loss": 2.1437, + "step": 233 + }, + { + "epoch": 
0.021230266739248774, + "grad_norm": 0.34080504467981576, + "learning_rate": 0.0007069486404833837, + "loss": 2.107, + "step": 234 + }, + { + "epoch": 0.02132099437488659, + "grad_norm": 0.3323650353633009, + "learning_rate": 0.0007099697885196375, + "loss": 2.0933, + "step": 235 + }, + { + "epoch": 0.021411722010524405, + "grad_norm": 0.3256639112125338, + "learning_rate": 0.0007129909365558912, + "loss": 2.1135, + "step": 236 + }, + { + "epoch": 0.02150244964616222, + "grad_norm": 0.3656387428461839, + "learning_rate": 0.0007160120845921451, + "loss": 2.073, + "step": 237 + }, + { + "epoch": 0.021593177281800036, + "grad_norm": 0.342422224209234, + "learning_rate": 0.0007190332326283989, + "loss": 2.1077, + "step": 238 + }, + { + "epoch": 0.021683904917437852, + "grad_norm": 0.312704832234747, + "learning_rate": 0.0007220543806646525, + "loss": 2.1362, + "step": 239 + }, + { + "epoch": 0.021774632553075667, + "grad_norm": 0.3135468014200134, + "learning_rate": 0.0007250755287009063, + "loss": 2.125, + "step": 240 + }, + { + "epoch": 0.021865360188713483, + "grad_norm": 0.3523364913342266, + "learning_rate": 0.0007280966767371601, + "loss": 2.0821, + "step": 241 + }, + { + "epoch": 0.021956087824351298, + "grad_norm": 0.30803724065040455, + "learning_rate": 0.0007311178247734139, + "loss": 2.0766, + "step": 242 + }, + { + "epoch": 0.022046815459989114, + "grad_norm": 0.3175392846648793, + "learning_rate": 0.0007341389728096676, + "loss": 2.1505, + "step": 243 + }, + { + "epoch": 0.02213754309562693, + "grad_norm": 0.41690994402331844, + "learning_rate": 0.0007371601208459214, + "loss": 2.0963, + "step": 244 + }, + { + "epoch": 0.022228270731264745, + "grad_norm": 0.3543764453388643, + "learning_rate": 0.0007401812688821753, + "loss": 2.0917, + "step": 245 + }, + { + "epoch": 0.02231899836690256, + "grad_norm": 0.3546131055185969, + "learning_rate": 0.000743202416918429, + "loss": 2.0893, + "step": 246 + }, + { + "epoch": 0.022409726002540376, + "grad_norm": 0.4445762980604206, + "learning_rate": 0.0007462235649546828, + "loss": 2.0845, + "step": 247 + }, + { + "epoch": 0.022500453638178187, + "grad_norm": 0.9112198938917672, + "learning_rate": 0.0007492447129909365, + "loss": 2.1002, + "step": 248 + }, + { + "epoch": 0.022591181273816003, + "grad_norm": 0.7566561493816926, + "learning_rate": 0.0007522658610271903, + "loss": 2.1335, + "step": 249 + }, + { + "epoch": 0.02268190890945382, + "grad_norm": 0.6273191141874328, + "learning_rate": 0.0007552870090634442, + "loss": 2.1254, + "step": 250 + }, + { + "epoch": 0.022772636545091634, + "grad_norm": 0.5108015398491754, + "learning_rate": 0.0007583081570996979, + "loss": 2.1142, + "step": 251 + }, + { + "epoch": 0.02286336418072945, + "grad_norm": 0.6016623616608251, + "learning_rate": 0.0007613293051359517, + "loss": 2.1258, + "step": 252 + }, + { + "epoch": 0.022954091816367265, + "grad_norm": 0.4825901224126855, + "learning_rate": 0.0007643504531722054, + "loss": 2.078, + "step": 253 + }, + { + "epoch": 0.02304481945200508, + "grad_norm": 0.39907146706292634, + "learning_rate": 0.0007673716012084593, + "loss": 2.0547, + "step": 254 + }, + { + "epoch": 0.023135547087642896, + "grad_norm": 0.3638678802688706, + "learning_rate": 0.000770392749244713, + "loss": 2.0686, + "step": 255 + }, + { + "epoch": 0.02322627472328071, + "grad_norm": 0.38055471341799474, + "learning_rate": 0.0007734138972809668, + "loss": 2.0852, + "step": 256 + }, + { + "epoch": 0.023317002358918527, + "grad_norm": 0.3972165609206255, + "learning_rate": 
0.0007764350453172206, + "loss": 2.0843, + "step": 257 + }, + { + "epoch": 0.023407729994556342, + "grad_norm": 0.46817665754885635, + "learning_rate": 0.0007794561933534743, + "loss": 2.105, + "step": 258 + }, + { + "epoch": 0.023498457630194158, + "grad_norm": 0.4141091059595687, + "learning_rate": 0.0007824773413897282, + "loss": 2.0567, + "step": 259 + }, + { + "epoch": 0.023589185265831973, + "grad_norm": 0.5119374944667764, + "learning_rate": 0.0007854984894259819, + "loss": 2.0538, + "step": 260 + }, + { + "epoch": 0.02367991290146979, + "grad_norm": 0.5236417160689418, + "learning_rate": 0.0007885196374622357, + "loss": 2.0634, + "step": 261 + }, + { + "epoch": 0.023770640537107604, + "grad_norm": 0.6983378811481711, + "learning_rate": 0.0007915407854984894, + "loss": 2.0989, + "step": 262 + }, + { + "epoch": 0.02386136817274542, + "grad_norm": 0.956474631323365, + "learning_rate": 0.0007945619335347433, + "loss": 2.0775, + "step": 263 + }, + { + "epoch": 0.023952095808383235, + "grad_norm": 0.8709421861758767, + "learning_rate": 0.0007975830815709971, + "loss": 2.0997, + "step": 264 + }, + { + "epoch": 0.02404282344402105, + "grad_norm": 0.439061471290149, + "learning_rate": 0.0008006042296072508, + "loss": 2.1014, + "step": 265 + }, + { + "epoch": 0.024133551079658862, + "grad_norm": 0.6151314827363878, + "learning_rate": 0.0008036253776435046, + "loss": 2.0945, + "step": 266 + }, + { + "epoch": 0.024224278715296678, + "grad_norm": 0.5527870163147932, + "learning_rate": 0.0008066465256797582, + "loss": 2.0862, + "step": 267 + }, + { + "epoch": 0.024315006350934493, + "grad_norm": 0.4828308882383162, + "learning_rate": 0.0008096676737160121, + "loss": 2.0431, + "step": 268 + }, + { + "epoch": 0.02440573398657231, + "grad_norm": 0.527873089260123, + "learning_rate": 0.0008126888217522658, + "loss": 2.0783, + "step": 269 + }, + { + "epoch": 0.024496461622210124, + "grad_norm": 0.4885821265114626, + "learning_rate": 0.0008157099697885196, + "loss": 2.1127, + "step": 270 + }, + { + "epoch": 0.02458718925784794, + "grad_norm": 0.44113050909479534, + "learning_rate": 0.0008187311178247734, + "loss": 2.0935, + "step": 271 + }, + { + "epoch": 0.024677916893485755, + "grad_norm": 0.41563099962973116, + "learning_rate": 0.0008217522658610272, + "loss": 2.03, + "step": 272 + }, + { + "epoch": 0.02476864452912357, + "grad_norm": 0.44128650635232014, + "learning_rate": 0.000824773413897281, + "loss": 2.131, + "step": 273 + }, + { + "epoch": 0.024859372164761386, + "grad_norm": 0.5073048511337941, + "learning_rate": 0.0008277945619335347, + "loss": 2.0433, + "step": 274 + }, + { + "epoch": 0.0249500998003992, + "grad_norm": 0.5506594742791696, + "learning_rate": 0.0008308157099697885, + "loss": 2.0718, + "step": 275 + }, + { + "epoch": 0.025040827436037017, + "grad_norm": 0.37998305083113343, + "learning_rate": 0.0008338368580060423, + "loss": 2.0483, + "step": 276 + }, + { + "epoch": 0.025131555071674833, + "grad_norm": 0.38384649469305093, + "learning_rate": 0.0008368580060422961, + "loss": 2.0352, + "step": 277 + }, + { + "epoch": 0.025222282707312648, + "grad_norm": 0.5773673318852669, + "learning_rate": 0.0008398791540785499, + "loss": 2.0849, + "step": 278 + }, + { + "epoch": 0.025313010342950464, + "grad_norm": 0.8021561249474198, + "learning_rate": 0.0008429003021148036, + "loss": 2.0774, + "step": 279 + }, + { + "epoch": 0.02540373797858828, + "grad_norm": 0.7551914613140209, + "learning_rate": 0.0008459214501510574, + "loss": 2.0927, + "step": 280 + }, + { + "epoch": 
0.025494465614226094, + "grad_norm": 0.4478579370819027, + "learning_rate": 0.0008489425981873112, + "loss": 2.0757, + "step": 281 + }, + { + "epoch": 0.02558519324986391, + "grad_norm": 0.5721666903091793, + "learning_rate": 0.000851963746223565, + "loss": 2.0743, + "step": 282 + }, + { + "epoch": 0.025675920885501725, + "grad_norm": 0.3900383717207425, + "learning_rate": 0.0008549848942598187, + "loss": 2.0329, + "step": 283 + }, + { + "epoch": 0.025766648521139537, + "grad_norm": 0.3745588402054664, + "learning_rate": 0.0008580060422960725, + "loss": 2.0912, + "step": 284 + }, + { + "epoch": 0.025857376156777353, + "grad_norm": 0.4448173118806305, + "learning_rate": 0.0008610271903323264, + "loss": 2.0721, + "step": 285 + }, + { + "epoch": 0.02594810379241517, + "grad_norm": 0.4563553156245527, + "learning_rate": 0.0008640483383685801, + "loss": 2.0672, + "step": 286 + }, + { + "epoch": 0.026038831428052984, + "grad_norm": 0.4183503310975339, + "learning_rate": 0.0008670694864048339, + "loss": 2.0642, + "step": 287 + }, + { + "epoch": 0.0261295590636908, + "grad_norm": 0.3304357164569519, + "learning_rate": 0.0008700906344410876, + "loss": 2.0435, + "step": 288 + }, + { + "epoch": 0.026220286699328615, + "grad_norm": 0.3963375947725998, + "learning_rate": 0.0008731117824773414, + "loss": 2.041, + "step": 289 + }, + { + "epoch": 0.02631101433496643, + "grad_norm": 0.415705357768151, + "learning_rate": 0.0008761329305135952, + "loss": 2.0471, + "step": 290 + }, + { + "epoch": 0.026401741970604246, + "grad_norm": 0.3304961542779854, + "learning_rate": 0.000879154078549849, + "loss": 2.0436, + "step": 291 + }, + { + "epoch": 0.02649246960624206, + "grad_norm": 0.3671295313190496, + "learning_rate": 0.0008821752265861028, + "loss": 2.0448, + "step": 292 + }, + { + "epoch": 0.026583197241879877, + "grad_norm": 0.47629842234976694, + "learning_rate": 0.0008851963746223565, + "loss": 2.082, + "step": 293 + }, + { + "epoch": 0.026673924877517692, + "grad_norm": 0.8376714288304078, + "learning_rate": 0.0008882175226586104, + "loss": 2.1054, + "step": 294 + }, + { + "epoch": 0.026764652513155508, + "grad_norm": 0.8587343326872804, + "learning_rate": 0.000891238670694864, + "loss": 2.0927, + "step": 295 + }, + { + "epoch": 0.026855380148793323, + "grad_norm": 0.6510373449964173, + "learning_rate": 0.0008942598187311178, + "loss": 2.0509, + "step": 296 + }, + { + "epoch": 0.02694610778443114, + "grad_norm": 0.7943361883419603, + "learning_rate": 0.0008972809667673715, + "loss": 2.0661, + "step": 297 + }, + { + "epoch": 0.027036835420068954, + "grad_norm": 0.4801285631703695, + "learning_rate": 0.0009003021148036254, + "loss": 2.0615, + "step": 298 + }, + { + "epoch": 0.02712756305570677, + "grad_norm": 0.4397140278788078, + "learning_rate": 0.0009033232628398792, + "loss": 2.0696, + "step": 299 + }, + { + "epoch": 0.027218290691344585, + "grad_norm": 0.45402997451160837, + "learning_rate": 0.0009063444108761329, + "loss": 2.0734, + "step": 300 + }, + { + "epoch": 0.0273090183269824, + "grad_norm": 0.3478425508455393, + "learning_rate": 0.0009093655589123867, + "loss": 2.0557, + "step": 301 + }, + { + "epoch": 0.027399745962620216, + "grad_norm": 0.4895189575524895, + "learning_rate": 0.0009123867069486404, + "loss": 2.0468, + "step": 302 + }, + { + "epoch": 0.027490473598258028, + "grad_norm": 0.41860661394706555, + "learning_rate": 0.0009154078549848943, + "loss": 2.0532, + "step": 303 + }, + { + "epoch": 0.027581201233895843, + "grad_norm": 0.4506381263879676, + "learning_rate": 
0.0009184290030211481, + "loss": 2.0304, + "step": 304 + }, + { + "epoch": 0.02767192886953366, + "grad_norm": 0.3984096611769323, + "learning_rate": 0.0009214501510574018, + "loss": 1.9963, + "step": 305 + }, + { + "epoch": 0.027762656505171474, + "grad_norm": 0.4246256275506215, + "learning_rate": 0.0009244712990936556, + "loss": 2.0616, + "step": 306 + }, + { + "epoch": 0.02785338414080929, + "grad_norm": 0.39586417545250707, + "learning_rate": 0.0009274924471299094, + "loss": 2.004, + "step": 307 + }, + { + "epoch": 0.027944111776447105, + "grad_norm": 0.3739518349898912, + "learning_rate": 0.0009305135951661632, + "loss": 2.0386, + "step": 308 + }, + { + "epoch": 0.02803483941208492, + "grad_norm": 0.4399626227522267, + "learning_rate": 0.0009335347432024169, + "loss": 1.9869, + "step": 309 + }, + { + "epoch": 0.028125567047722736, + "grad_norm": 0.38923863960392113, + "learning_rate": 0.0009365558912386707, + "loss": 1.9953, + "step": 310 + }, + { + "epoch": 0.02821629468336055, + "grad_norm": 0.37800799682965974, + "learning_rate": 0.0009395770392749245, + "loss": 2.0046, + "step": 311 + }, + { + "epoch": 0.028307022318998367, + "grad_norm": 0.36954177548149525, + "learning_rate": 0.0009425981873111783, + "loss": 2.0302, + "step": 312 + }, + { + "epoch": 0.028397749954636183, + "grad_norm": 0.5007625777619573, + "learning_rate": 0.0009456193353474321, + "loss": 2.0145, + "step": 313 + }, + { + "epoch": 0.028488477590273998, + "grad_norm": 0.40606787358269436, + "learning_rate": 0.0009486404833836858, + "loss": 1.9899, + "step": 314 + }, + { + "epoch": 0.028579205225911813, + "grad_norm": 0.33931616925642827, + "learning_rate": 0.0009516616314199396, + "loss": 2.0078, + "step": 315 + }, + { + "epoch": 0.02866993286154963, + "grad_norm": 0.4223517207937608, + "learning_rate": 0.0009546827794561934, + "loss": 2.0158, + "step": 316 + }, + { + "epoch": 0.028760660497187444, + "grad_norm": 0.5410301066318202, + "learning_rate": 0.0009577039274924472, + "loss": 2.0135, + "step": 317 + }, + { + "epoch": 0.02885138813282526, + "grad_norm": 0.6617690604391794, + "learning_rate": 0.000960725075528701, + "loss": 1.9848, + "step": 318 + }, + { + "epoch": 0.028942115768463075, + "grad_norm": 0.9817109496583062, + "learning_rate": 0.0009637462235649547, + "loss": 2.0143, + "step": 319 + }, + { + "epoch": 0.02903284340410089, + "grad_norm": 0.8290935051877456, + "learning_rate": 0.0009667673716012086, + "loss": 2.0442, + "step": 320 + }, + { + "epoch": 0.029123571039738703, + "grad_norm": 0.5144233722691229, + "learning_rate": 0.0009697885196374623, + "loss": 2.0492, + "step": 321 + }, + { + "epoch": 0.029214298675376518, + "grad_norm": 0.6063166459020966, + "learning_rate": 0.0009728096676737161, + "loss": 2.0563, + "step": 322 + }, + { + "epoch": 0.029305026311014334, + "grad_norm": 0.4069644378805369, + "learning_rate": 0.0009758308157099697, + "loss": 2.0196, + "step": 323 + }, + { + "epoch": 0.02939575394665215, + "grad_norm": 0.44417010387225586, + "learning_rate": 0.0009788519637462234, + "loss": 2.0224, + "step": 324 + }, + { + "epoch": 0.029486481582289965, + "grad_norm": 0.4210773566271339, + "learning_rate": 0.0009818731117824774, + "loss": 2.0598, + "step": 325 + }, + { + "epoch": 0.02957720921792778, + "grad_norm": 0.3970546977402958, + "learning_rate": 0.0009848942598187312, + "loss": 2.0197, + "step": 326 + }, + { + "epoch": 0.029667936853565596, + "grad_norm": 0.4510143400057626, + "learning_rate": 0.0009879154078549848, + "loss": 2.0421, + "step": 327 + }, + { + "epoch": 
0.02975866448920341, + "grad_norm": 0.3738273921281319, + "learning_rate": 0.0009909365558912386, + "loss": 2.012, + "step": 328 + }, + { + "epoch": 0.029849392124841227, + "grad_norm": 0.36047545550032656, + "learning_rate": 0.0009939577039274924, + "loss": 2.0327, + "step": 329 + }, + { + "epoch": 0.029940119760479042, + "grad_norm": 0.3741144696883902, + "learning_rate": 0.0009969788519637462, + "loss": 1.9926, + "step": 330 + }, + { + "epoch": 0.030030847396116857, + "grad_norm": 0.40987270388762986, + "learning_rate": 0.001, + "loss": 1.9493, + "step": 331 + }, + { + "epoch": 0.030121575031754673, + "grad_norm": 0.39742920246569957, + "learning_rate": 0.0009999999784124631, + "loss": 1.981, + "step": 332 + }, + { + "epoch": 0.03021230266739249, + "grad_norm": 0.3606050739155054, + "learning_rate": 0.0009999999136498545, + "loss": 1.9875, + "step": 333 + }, + { + "epoch": 0.030303030303030304, + "grad_norm": 0.33315107432913693, + "learning_rate": 0.0009999998057121796, + "loss": 1.994, + "step": 334 + }, + { + "epoch": 0.03039375793866812, + "grad_norm": 0.35143856337888246, + "learning_rate": 0.0009999996545994477, + "loss": 2.0003, + "step": 335 + }, + { + "epoch": 0.030484485574305935, + "grad_norm": 0.4217143361612878, + "learning_rate": 0.000999999460311672, + "loss": 2.0011, + "step": 336 + }, + { + "epoch": 0.03057521320994375, + "grad_norm": 0.379056250805123, + "learning_rate": 0.0009999992228488692, + "loss": 1.9422, + "step": 337 + }, + { + "epoch": 0.030665940845581566, + "grad_norm": 0.47654640613522975, + "learning_rate": 0.0009999989422110598, + "loss": 1.946, + "step": 338 + }, + { + "epoch": 0.030756668481219378, + "grad_norm": 0.589994207984377, + "learning_rate": 0.0009999986183982681, + "loss": 1.9612, + "step": 339 + }, + { + "epoch": 0.030847396116857193, + "grad_norm": 0.5641152818031184, + "learning_rate": 0.0009999982514105222, + "loss": 1.983, + "step": 340 + }, + { + "epoch": 0.03093812375249501, + "grad_norm": 0.533417333197279, + "learning_rate": 0.0009999978412478531, + "loss": 2.0076, + "step": 341 + }, + { + "epoch": 0.031028851388132824, + "grad_norm": 0.5553100676582603, + "learning_rate": 0.000999997387910297, + "loss": 1.9679, + "step": 342 + }, + { + "epoch": 0.03111957902377064, + "grad_norm": 0.470199004470659, + "learning_rate": 0.000999996891397893, + "loss": 1.9188, + "step": 343 + }, + { + "epoch": 0.031210306659408455, + "grad_norm": 0.5068885369661538, + "learning_rate": 0.0009999963517106833, + "loss": 1.949, + "step": 344 + }, + { + "epoch": 0.03130103429504627, + "grad_norm": 0.47207495615096223, + "learning_rate": 0.0009999957688487154, + "loss": 1.9638, + "step": 345 + }, + { + "epoch": 0.03139176193068409, + "grad_norm": 0.4652846009324016, + "learning_rate": 0.000999995142812039, + "loss": 1.9732, + "step": 346 + }, + { + "epoch": 0.0314824895663219, + "grad_norm": 0.37492502747433054, + "learning_rate": 0.000999994473600708, + "loss": 1.9603, + "step": 347 + }, + { + "epoch": 0.031573217201959713, + "grad_norm": 0.4838138877647507, + "learning_rate": 0.0009999937612147807, + "loss": 1.9684, + "step": 348 + }, + { + "epoch": 0.03166394483759753, + "grad_norm": 0.3712207538926669, + "learning_rate": 0.0009999930056543184, + "loss": 1.9108, + "step": 349 + }, + { + "epoch": 0.031754672473235344, + "grad_norm": 0.38158246938738055, + "learning_rate": 0.0009999922069193865, + "loss": 1.9486, + "step": 350 + }, + { + "epoch": 0.03184540010887316, + "grad_norm": 0.43475793649520805, + "learning_rate": 0.0009999913650100539, + "loss": 
1.9529, + "step": 351 + }, + { + "epoch": 0.031936127744510975, + "grad_norm": 0.5074529788111684, + "learning_rate": 0.000999990479926393, + "loss": 1.9138, + "step": 352 + }, + { + "epoch": 0.032026855380148794, + "grad_norm": 0.6377603600612806, + "learning_rate": 0.0009999895516684808, + "loss": 1.9536, + "step": 353 + }, + { + "epoch": 0.032117583015786606, + "grad_norm": 0.49093886045496277, + "learning_rate": 0.0009999885802363967, + "loss": 1.9015, + "step": 354 + }, + { + "epoch": 0.032208310651424425, + "grad_norm": 0.5746157173193734, + "learning_rate": 0.0009999875656302253, + "loss": 1.9515, + "step": 355 + }, + { + "epoch": 0.03229903828706224, + "grad_norm": 0.540347647919806, + "learning_rate": 0.000999986507850054, + "loss": 1.9416, + "step": 356 + }, + { + "epoch": 0.032389765922700056, + "grad_norm": 0.5555798234729963, + "learning_rate": 0.0009999854068959736, + "loss": 1.9657, + "step": 357 + }, + { + "epoch": 0.03248049355833787, + "grad_norm": 0.3758254561618239, + "learning_rate": 0.00099998426276808, + "loss": 1.8999, + "step": 358 + }, + { + "epoch": 0.03257122119397569, + "grad_norm": 0.568951626624065, + "learning_rate": 0.0009999830754664717, + "loss": 1.9226, + "step": 359 + }, + { + "epoch": 0.0326619488296135, + "grad_norm": 0.7362348572810091, + "learning_rate": 0.0009999818449912508, + "loss": 1.963, + "step": 360 + }, + { + "epoch": 0.03275267646525132, + "grad_norm": 0.809741179989046, + "learning_rate": 0.0009999805713425242, + "loss": 1.9353, + "step": 361 + }, + { + "epoch": 0.03284340410088913, + "grad_norm": 0.5442421181933006, + "learning_rate": 0.0009999792545204013, + "loss": 1.9036, + "step": 362 + }, + { + "epoch": 0.03293413173652695, + "grad_norm": 0.49272201619690587, + "learning_rate": 0.0009999778945249964, + "loss": 1.9177, + "step": 363 + }, + { + "epoch": 0.03302485937216476, + "grad_norm": 0.5911319283571196, + "learning_rate": 0.0009999764913564263, + "loss": 1.9614, + "step": 364 + }, + { + "epoch": 0.03311558700780258, + "grad_norm": 0.40505752876106976, + "learning_rate": 0.0009999750450148128, + "loss": 1.9246, + "step": 365 + }, + { + "epoch": 0.03320631464344039, + "grad_norm": 0.5047633842237526, + "learning_rate": 0.0009999735555002802, + "loss": 1.9091, + "step": 366 + }, + { + "epoch": 0.033297042279078204, + "grad_norm": 0.4841406642803596, + "learning_rate": 0.0009999720228129577, + "loss": 1.926, + "step": 367 + }, + { + "epoch": 0.03338776991471602, + "grad_norm": 0.5372728655678465, + "learning_rate": 0.000999970446952977, + "loss": 1.9162, + "step": 368 + }, + { + "epoch": 0.033478497550353835, + "grad_norm": 0.4982783421174916, + "learning_rate": 0.0009999688279204745, + "loss": 1.9522, + "step": 369 + }, + { + "epoch": 0.033569225185991654, + "grad_norm": 0.4113365375839873, + "learning_rate": 0.0009999671657155904, + "loss": 1.9099, + "step": 370 + }, + { + "epoch": 0.033659952821629466, + "grad_norm": 0.4579536600937145, + "learning_rate": 0.0009999654603384677, + "loss": 1.9051, + "step": 371 + }, + { + "epoch": 0.033750680457267285, + "grad_norm": 0.43802626386668375, + "learning_rate": 0.0009999637117892538, + "loss": 1.9129, + "step": 372 + }, + { + "epoch": 0.0338414080929051, + "grad_norm": 0.5471209678694965, + "learning_rate": 0.0009999619200680996, + "loss": 1.9083, + "step": 373 + }, + { + "epoch": 0.033932135728542916, + "grad_norm": 0.5174588298463658, + "learning_rate": 0.0009999600851751597, + "loss": 1.9001, + "step": 374 + }, + { + "epoch": 0.03402286336418073, + "grad_norm": 0.4405283433530588, + 
"learning_rate": 0.0009999582071105932, + "loss": 1.8764, + "step": 375 + }, + { + "epoch": 0.03411359099981855, + "grad_norm": 0.4442077069208036, + "learning_rate": 0.0009999562858745616, + "loss": 1.8749, + "step": 376 + }, + { + "epoch": 0.03420431863545636, + "grad_norm": 0.41791247672037757, + "learning_rate": 0.000999954321467231, + "loss": 1.8586, + "step": 377 + }, + { + "epoch": 0.03429504627109418, + "grad_norm": 0.5742685169831021, + "learning_rate": 0.0009999523138887712, + "loss": 1.9052, + "step": 378 + }, + { + "epoch": 0.03438577390673199, + "grad_norm": 0.7337397929170781, + "learning_rate": 0.0009999502631393552, + "loss": 1.8911, + "step": 379 + }, + { + "epoch": 0.03447650154236981, + "grad_norm": 0.7306291071274811, + "learning_rate": 0.0009999481692191607, + "loss": 1.9238, + "step": 380 + }, + { + "epoch": 0.03456722917800762, + "grad_norm": 0.5638035995184348, + "learning_rate": 0.0009999460321283677, + "loss": 1.8631, + "step": 381 + }, + { + "epoch": 0.03465795681364544, + "grad_norm": 0.5669211031760943, + "learning_rate": 0.0009999438518671613, + "loss": 1.8884, + "step": 382 + }, + { + "epoch": 0.03474868444928325, + "grad_norm": 0.5023793644729507, + "learning_rate": 0.0009999416284357297, + "loss": 1.8792, + "step": 383 + }, + { + "epoch": 0.03483941208492107, + "grad_norm": 0.4648585425479646, + "learning_rate": 0.0009999393618342646, + "loss": 1.8397, + "step": 384 + }, + { + "epoch": 0.03493013972055888, + "grad_norm": 0.5262079957548562, + "learning_rate": 0.000999937052062962, + "loss": 1.8889, + "step": 385 + }, + { + "epoch": 0.035020867356196694, + "grad_norm": 0.4955829300215121, + "learning_rate": 0.0009999346991220214, + "loss": 1.8584, + "step": 386 + }, + { + "epoch": 0.03511159499183451, + "grad_norm": 0.5155492752010905, + "learning_rate": 0.0009999323030116458, + "loss": 1.8967, + "step": 387 + }, + { + "epoch": 0.035202322627472325, + "grad_norm": 0.39502443343000027, + "learning_rate": 0.000999929863732042, + "loss": 1.8761, + "step": 388 + }, + { + "epoch": 0.035293050263110144, + "grad_norm": 0.5369900216507983, + "learning_rate": 0.000999927381283421, + "loss": 1.8866, + "step": 389 + }, + { + "epoch": 0.035383777898747956, + "grad_norm": 0.385155682962812, + "learning_rate": 0.0009999248556659967, + "loss": 1.8602, + "step": 390 + }, + { + "epoch": 0.035474505534385775, + "grad_norm": 0.49421774826965126, + "learning_rate": 0.0009999222868799875, + "loss": 1.8204, + "step": 391 + }, + { + "epoch": 0.03556523317002359, + "grad_norm": 0.48447296064228834, + "learning_rate": 0.0009999196749256151, + "loss": 1.8253, + "step": 392 + }, + { + "epoch": 0.035655960805661406, + "grad_norm": 0.6872629167332811, + "learning_rate": 0.0009999170198031052, + "loss": 1.8382, + "step": 393 + }, + { + "epoch": 0.03574668844129922, + "grad_norm": 0.5880753795354492, + "learning_rate": 0.000999914321512687, + "loss": 1.8344, + "step": 394 + }, + { + "epoch": 0.03583741607693704, + "grad_norm": 0.4931631445159973, + "learning_rate": 0.0009999115800545936, + "loss": 1.8558, + "step": 395 + }, + { + "epoch": 0.03592814371257485, + "grad_norm": 0.5638777129972092, + "learning_rate": 0.0009999087954290612, + "loss": 1.8417, + "step": 396 + }, + { + "epoch": 0.03601887134821267, + "grad_norm": 0.4601198771429398, + "learning_rate": 0.0009999059676363308, + "loss": 1.8493, + "step": 397 + }, + { + "epoch": 0.03610959898385048, + "grad_norm": 0.43472140360018086, + "learning_rate": 0.0009999030966766464, + "loss": 1.8533, + "step": 398 + }, + { + "epoch": 
0.0362003266194883, + "grad_norm": 0.4513407448648774, + "learning_rate": 0.0009999001825502561, + "loss": 1.7843, + "step": 399 + }, + { + "epoch": 0.03629105425512611, + "grad_norm": 0.4688908459317649, + "learning_rate": 0.0009998972252574113, + "loss": 1.8434, + "step": 400 + }, + { + "epoch": 0.03638178189076393, + "grad_norm": 0.49731874676239424, + "learning_rate": 0.0009998942247983673, + "loss": 1.8104, + "step": 401 + }, + { + "epoch": 0.03647250952640174, + "grad_norm": 0.4786322400816938, + "learning_rate": 0.0009998911811733833, + "loss": 1.8805, + "step": 402 + }, + { + "epoch": 0.036563237162039554, + "grad_norm": 0.43818064666092243, + "learning_rate": 0.0009998880943827221, + "loss": 1.8252, + "step": 403 + }, + { + "epoch": 0.03665396479767737, + "grad_norm": 0.4474606417303887, + "learning_rate": 0.0009998849644266502, + "loss": 1.8347, + "step": 404 + }, + { + "epoch": 0.036744692433315185, + "grad_norm": 0.3918998137647274, + "learning_rate": 0.0009998817913054383, + "loss": 1.8322, + "step": 405 + }, + { + "epoch": 0.036835420068953004, + "grad_norm": 0.49021050289190277, + "learning_rate": 0.0009998785750193599, + "loss": 1.8265, + "step": 406 + }, + { + "epoch": 0.036926147704590816, + "grad_norm": 0.43913778991106556, + "learning_rate": 0.0009998753155686928, + "loss": 1.7733, + "step": 407 + }, + { + "epoch": 0.037016875340228635, + "grad_norm": 0.40226388286443654, + "learning_rate": 0.0009998720129537186, + "loss": 1.807, + "step": 408 + }, + { + "epoch": 0.03710760297586645, + "grad_norm": 0.4815517793101482, + "learning_rate": 0.0009998686671747223, + "loss": 1.8179, + "step": 409 + }, + { + "epoch": 0.037198330611504266, + "grad_norm": 0.44601809035718787, + "learning_rate": 0.000999865278231993, + "loss": 1.7923, + "step": 410 + }, + { + "epoch": 0.03728905824714208, + "grad_norm": 0.4355512548302105, + "learning_rate": 0.0009998618461258232, + "loss": 1.8326, + "step": 411 + }, + { + "epoch": 0.037379785882779896, + "grad_norm": 0.4123359468553089, + "learning_rate": 0.0009998583708565093, + "loss": 1.8188, + "step": 412 + }, + { + "epoch": 0.03747051351841771, + "grad_norm": 0.48189220786807774, + "learning_rate": 0.0009998548524243515, + "loss": 1.8204, + "step": 413 + }, + { + "epoch": 0.03756124115405553, + "grad_norm": 0.46807271046951887, + "learning_rate": 0.0009998512908296535, + "loss": 1.8183, + "step": 414 + }, + { + "epoch": 0.03765196878969334, + "grad_norm": 0.5323675433494917, + "learning_rate": 0.0009998476860727228, + "loss": 1.8646, + "step": 415 + }, + { + "epoch": 0.03774269642533116, + "grad_norm": 0.48419350804630384, + "learning_rate": 0.0009998440381538706, + "loss": 1.8071, + "step": 416 + }, + { + "epoch": 0.03783342406096897, + "grad_norm": 0.38442185262682954, + "learning_rate": 0.0009998403470734122, + "loss": 1.8115, + "step": 417 + }, + { + "epoch": 0.03792415169660679, + "grad_norm": 0.5745929838102614, + "learning_rate": 0.0009998366128316663, + "loss": 1.8144, + "step": 418 + }, + { + "epoch": 0.0380148793322446, + "grad_norm": 0.6011534109851349, + "learning_rate": 0.0009998328354289552, + "loss": 1.7965, + "step": 419 + }, + { + "epoch": 0.03810560696788242, + "grad_norm": 0.505538750318415, + "learning_rate": 0.0009998290148656049, + "loss": 1.8077, + "step": 420 + }, + { + "epoch": 0.03819633460352023, + "grad_norm": 0.4773023901059015, + "learning_rate": 0.0009998251511419455, + "loss": 1.7935, + "step": 421 + }, + { + "epoch": 0.038287062239158044, + "grad_norm": 0.6033302789948958, + "learning_rate": 
0.0009998212442583108, + "loss": 1.7607, + "step": 422 + }, + { + "epoch": 0.03837778987479586, + "grad_norm": 0.5227278913484302, + "learning_rate": 0.0009998172942150378, + "loss": 1.8079, + "step": 423 + }, + { + "epoch": 0.038468517510433675, + "grad_norm": 0.5050002846783994, + "learning_rate": 0.000999813301012468, + "loss": 1.7694, + "step": 424 + }, + { + "epoch": 0.038559245146071494, + "grad_norm": 0.47320330076590095, + "learning_rate": 0.0009998092646509458, + "loss": 1.7752, + "step": 425 + }, + { + "epoch": 0.038649972781709306, + "grad_norm": 0.5023986350438835, + "learning_rate": 0.00099980518513082, + "loss": 1.8337, + "step": 426 + }, + { + "epoch": 0.038740700417347125, + "grad_norm": 0.4883714449155776, + "learning_rate": 0.0009998010624524428, + "loss": 1.7908, + "step": 427 + }, + { + "epoch": 0.03883142805298494, + "grad_norm": 0.39554786757820626, + "learning_rate": 0.00099979689661617, + "loss": 1.7824, + "step": 428 + }, + { + "epoch": 0.038922155688622756, + "grad_norm": 0.4784090793986327, + "learning_rate": 0.000999792687622362, + "loss": 1.7722, + "step": 429 + }, + { + "epoch": 0.03901288332426057, + "grad_norm": 0.45900198586451774, + "learning_rate": 0.0009997884354713813, + "loss": 1.7475, + "step": 430 + }, + { + "epoch": 0.03910361095989839, + "grad_norm": 0.5335731585695797, + "learning_rate": 0.0009997841401635959, + "loss": 1.7563, + "step": 431 + }, + { + "epoch": 0.0391943385955362, + "grad_norm": 0.4983470011186999, + "learning_rate": 0.000999779801699376, + "loss": 1.7835, + "step": 432 + }, + { + "epoch": 0.03928506623117402, + "grad_norm": 0.41330635468487775, + "learning_rate": 0.0009997754200790968, + "loss": 1.8032, + "step": 433 + }, + { + "epoch": 0.03937579386681183, + "grad_norm": 0.4630619902123537, + "learning_rate": 0.0009997709953031362, + "loss": 1.7868, + "step": 434 + }, + { + "epoch": 0.03946652150244965, + "grad_norm": 0.42647773426084945, + "learning_rate": 0.0009997665273718767, + "loss": 1.7556, + "step": 435 + }, + { + "epoch": 0.03955724913808746, + "grad_norm": 0.394655633303643, + "learning_rate": 0.0009997620162857036, + "loss": 1.8221, + "step": 436 + }, + { + "epoch": 0.03964797677372528, + "grad_norm": 0.48879947314401784, + "learning_rate": 0.0009997574620450073, + "loss": 1.7605, + "step": 437 + }, + { + "epoch": 0.03973870440936309, + "grad_norm": 0.4464020380280768, + "learning_rate": 0.00099975286465018, + "loss": 1.7436, + "step": 438 + }, + { + "epoch": 0.03982943204500091, + "grad_norm": 0.41494914073841754, + "learning_rate": 0.0009997482241016194, + "loss": 1.7212, + "step": 439 + }, + { + "epoch": 0.03992015968063872, + "grad_norm": 0.47633185460568556, + "learning_rate": 0.0009997435403997258, + "loss": 1.6804, + "step": 440 + }, + { + "epoch": 0.040010887316276535, + "grad_norm": 0.4711308026655034, + "learning_rate": 0.0009997388135449042, + "loss": 1.7446, + "step": 441 + }, + { + "epoch": 0.040101614951914354, + "grad_norm": 0.5058090539230174, + "learning_rate": 0.000999734043537562, + "loss": 1.7115, + "step": 442 + }, + { + "epoch": 0.040192342587552166, + "grad_norm": 0.5077041227151656, + "learning_rate": 0.0009997292303781118, + "loss": 1.7441, + "step": 443 + }, + { + "epoch": 0.040283070223189985, + "grad_norm": 0.4843982288983942, + "learning_rate": 0.0009997243740669686, + "loss": 1.714, + "step": 444 + }, + { + "epoch": 0.0403737978588278, + "grad_norm": 0.4305617654482929, + "learning_rate": 0.0009997194746045523, + "loss": 1.7388, + "step": 445 + }, + { + "epoch": 0.040464525494465615, + 
"grad_norm": 0.4474779587662717, + "learning_rate": 0.0009997145319912855, + "loss": 1.7422, + "step": 446 + }, + { + "epoch": 0.04055525313010343, + "grad_norm": 0.5014371893847576, + "learning_rate": 0.0009997095462275954, + "loss": 1.7596, + "step": 447 + }, + { + "epoch": 0.040645980765741246, + "grad_norm": 0.47049499862034483, + "learning_rate": 0.0009997045173139123, + "loss": 1.7312, + "step": 448 + }, + { + "epoch": 0.04073670840137906, + "grad_norm": 0.5374726882238579, + "learning_rate": 0.0009996994452506705, + "loss": 1.7253, + "step": 449 + }, + { + "epoch": 0.04082743603701688, + "grad_norm": 0.41749988720422, + "learning_rate": 0.0009996943300383079, + "loss": 1.7138, + "step": 450 + }, + { + "epoch": 0.04091816367265469, + "grad_norm": 0.4985090414790096, + "learning_rate": 0.0009996891716772662, + "loss": 1.7228, + "step": 451 + }, + { + "epoch": 0.04100889130829251, + "grad_norm": 0.4838182982711945, + "learning_rate": 0.000999683970167991, + "loss": 1.7539, + "step": 452 + }, + { + "epoch": 0.04109961894393032, + "grad_norm": 0.4860115576282074, + "learning_rate": 0.0009996787255109312, + "loss": 1.7534, + "step": 453 + }, + { + "epoch": 0.04119034657956814, + "grad_norm": 0.4376774679322773, + "learning_rate": 0.0009996734377065398, + "loss": 1.727, + "step": 454 + }, + { + "epoch": 0.04128107421520595, + "grad_norm": 0.5846545645910486, + "learning_rate": 0.0009996681067552735, + "loss": 1.7129, + "step": 455 + }, + { + "epoch": 0.04137180185084377, + "grad_norm": 0.5192879258551646, + "learning_rate": 0.0009996627326575925, + "loss": 1.7283, + "step": 456 + }, + { + "epoch": 0.04146252948648158, + "grad_norm": 0.4451095583828364, + "learning_rate": 0.000999657315413961, + "loss": 1.7321, + "step": 457 + }, + { + "epoch": 0.041553257122119394, + "grad_norm": 0.5352123510639898, + "learning_rate": 0.0009996518550248466, + "loss": 1.7406, + "step": 458 + }, + { + "epoch": 0.04164398475775721, + "grad_norm": 0.5262122570707228, + "learning_rate": 0.0009996463514907207, + "loss": 1.7218, + "step": 459 + }, + { + "epoch": 0.041734712393395025, + "grad_norm": 0.431466597736113, + "learning_rate": 0.0009996408048120588, + "loss": 1.7289, + "step": 460 + }, + { + "epoch": 0.041825440029032844, + "grad_norm": 0.4787616508651492, + "learning_rate": 0.00099963521498934, + "loss": 1.7632, + "step": 461 + }, + { + "epoch": 0.041916167664670656, + "grad_norm": 0.36255396507517385, + "learning_rate": 0.0009996295820230467, + "loss": 1.6915, + "step": 462 + }, + { + "epoch": 0.042006895300308475, + "grad_norm": 0.4080198144495661, + "learning_rate": 0.000999623905913665, + "loss": 1.7228, + "step": 463 + }, + { + "epoch": 0.04209762293594629, + "grad_norm": 0.4008858982100864, + "learning_rate": 0.0009996181866616858, + "loss": 1.6497, + "step": 464 + }, + { + "epoch": 0.042188350571584106, + "grad_norm": 0.3883234711962411, + "learning_rate": 0.0009996124242676022, + "loss": 1.7168, + "step": 465 + }, + { + "epoch": 0.04227907820722192, + "grad_norm": 0.4136242798096432, + "learning_rate": 0.0009996066187319124, + "loss": 1.7232, + "step": 466 + }, + { + "epoch": 0.04236980584285974, + "grad_norm": 0.4103455170818197, + "learning_rate": 0.0009996007700551173, + "loss": 1.6845, + "step": 467 + }, + { + "epoch": 0.04246053347849755, + "grad_norm": 0.443425012293563, + "learning_rate": 0.000999594878237722, + "loss": 1.6891, + "step": 468 + }, + { + "epoch": 0.04255126111413537, + "grad_norm": 0.367285980390157, + "learning_rate": 0.0009995889432802354, + "loss": 1.7017, + "step": 469 
+ }, + { + "epoch": 0.04264198874977318, + "grad_norm": 0.4347631076020054, + "learning_rate": 0.00099958296518317, + "loss": 1.675, + "step": 470 + }, + { + "epoch": 0.042732716385411, + "grad_norm": 0.4366342640655164, + "learning_rate": 0.000999576943947042, + "loss": 1.6901, + "step": 471 + }, + { + "epoch": 0.04282344402104881, + "grad_norm": 0.4528294672374248, + "learning_rate": 0.000999570879572371, + "loss": 1.7007, + "step": 472 + }, + { + "epoch": 0.04291417165668663, + "grad_norm": 0.37806051084458, + "learning_rate": 0.0009995647720596813, + "loss": 1.6942, + "step": 473 + }, + { + "epoch": 0.04300489929232444, + "grad_norm": 0.4379125605485645, + "learning_rate": 0.0009995586214094996, + "loss": 1.6933, + "step": 474 + }, + { + "epoch": 0.04309562692796226, + "grad_norm": 0.536382741918843, + "learning_rate": 0.0009995524276223574, + "loss": 1.7231, + "step": 475 + }, + { + "epoch": 0.04318635456360007, + "grad_norm": 0.3726037202059723, + "learning_rate": 0.0009995461906987893, + "loss": 1.6579, + "step": 476 + }, + { + "epoch": 0.043277082199237885, + "grad_norm": 0.48802750411016466, + "learning_rate": 0.0009995399106393341, + "loss": 1.6836, + "step": 477 + }, + { + "epoch": 0.043367809834875704, + "grad_norm": 0.3873161607187404, + "learning_rate": 0.0009995335874445341, + "loss": 1.6756, + "step": 478 + }, + { + "epoch": 0.043458537470513516, + "grad_norm": 0.3921933813505868, + "learning_rate": 0.000999527221114935, + "loss": 1.6855, + "step": 479 + }, + { + "epoch": 0.043549265106151334, + "grad_norm": 0.36918754146675103, + "learning_rate": 0.0009995208116510869, + "loss": 1.7257, + "step": 480 + }, + { + "epoch": 0.043639992741789146, + "grad_norm": 0.4223841746413618, + "learning_rate": 0.0009995143590535431, + "loss": 1.6953, + "step": 481 + }, + { + "epoch": 0.043730720377426965, + "grad_norm": 0.3931194231244593, + "learning_rate": 0.0009995078633228606, + "loss": 1.6992, + "step": 482 + }, + { + "epoch": 0.04382144801306478, + "grad_norm": 0.49401601882869606, + "learning_rate": 0.0009995013244596008, + "loss": 1.7156, + "step": 483 + }, + { + "epoch": 0.043912175648702596, + "grad_norm": 0.4039389231752028, + "learning_rate": 0.0009994947424643277, + "loss": 1.6606, + "step": 484 + }, + { + "epoch": 0.04400290328434041, + "grad_norm": 0.46924699485009785, + "learning_rate": 0.00099948811733761, + "loss": 1.6865, + "step": 485 + }, + { + "epoch": 0.04409363091997823, + "grad_norm": 0.49718958447572853, + "learning_rate": 0.00099948144908002, + "loss": 1.6467, + "step": 486 + }, + { + "epoch": 0.04418435855561604, + "grad_norm": 0.4398859456390439, + "learning_rate": 0.000999474737692133, + "loss": 1.6962, + "step": 487 + }, + { + "epoch": 0.04427508619125386, + "grad_norm": 0.3738893279083037, + "learning_rate": 0.000999467983174529, + "loss": 1.6852, + "step": 488 + }, + { + "epoch": 0.04436581382689167, + "grad_norm": 0.4294785925715381, + "learning_rate": 0.000999461185527791, + "loss": 1.7131, + "step": 489 + }, + { + "epoch": 0.04445654146252949, + "grad_norm": 0.39701260184053516, + "learning_rate": 0.000999454344752506, + "loss": 1.6725, + "step": 490 + }, + { + "epoch": 0.0445472690981673, + "grad_norm": 0.46243512844367973, + "learning_rate": 0.0009994474608492644, + "loss": 1.6917, + "step": 491 + }, + { + "epoch": 0.04463799673380512, + "grad_norm": 0.4685712998498576, + "learning_rate": 0.0009994405338186612, + "loss": 1.6961, + "step": 492 + }, + { + "epoch": 0.04472872436944293, + "grad_norm": 0.47014048243658824, + "learning_rate": 
0.0009994335636612944, + "loss": 1.6868, + "step": 493 + }, + { + "epoch": 0.04481945200508075, + "grad_norm": 0.3942260522655613, + "learning_rate": 0.0009994265503777656, + "loss": 1.6948, + "step": 494 + }, + { + "epoch": 0.04491017964071856, + "grad_norm": 0.39781780035329073, + "learning_rate": 0.0009994194939686807, + "loss": 1.6846, + "step": 495 + }, + { + "epoch": 0.045000907276356375, + "grad_norm": 0.37727871449446676, + "learning_rate": 0.0009994123944346489, + "loss": 1.676, + "step": 496 + }, + { + "epoch": 0.045091634911994194, + "grad_norm": 0.447123920864723, + "learning_rate": 0.000999405251776283, + "loss": 1.6898, + "step": 497 + }, + { + "epoch": 0.045182362547632006, + "grad_norm": 0.4160722830542454, + "learning_rate": 0.0009993980659942002, + "loss": 1.6785, + "step": 498 + }, + { + "epoch": 0.045273090183269825, + "grad_norm": 0.3896448465949544, + "learning_rate": 0.0009993908370890209, + "loss": 1.6382, + "step": 499 + }, + { + "epoch": 0.04536381781890764, + "grad_norm": 0.40246973700984073, + "learning_rate": 0.000999383565061369, + "loss": 1.7495, + "step": 500 + }, + { + "epoch": 0.045454545454545456, + "grad_norm": 0.3681764893652991, + "learning_rate": 0.0009993762499118726, + "loss": 1.635, + "step": 501 + }, + { + "epoch": 0.04554527309018327, + "grad_norm": 0.41427683096445655, + "learning_rate": 0.0009993688916411637, + "loss": 1.6965, + "step": 502 + }, + { + "epoch": 0.04563600072582109, + "grad_norm": 0.3710000026562126, + "learning_rate": 0.0009993614902498772, + "loss": 1.6443, + "step": 503 + }, + { + "epoch": 0.0457267283614589, + "grad_norm": 0.38369825640712135, + "learning_rate": 0.0009993540457386525, + "loss": 1.6659, + "step": 504 + }, + { + "epoch": 0.04581745599709672, + "grad_norm": 0.363553703683683, + "learning_rate": 0.0009993465581081323, + "loss": 1.678, + "step": 505 + }, + { + "epoch": 0.04590818363273453, + "grad_norm": 0.3347847430866614, + "learning_rate": 0.0009993390273589633, + "loss": 1.6448, + "step": 506 + }, + { + "epoch": 0.04599891126837235, + "grad_norm": 0.3637215463330027, + "learning_rate": 0.0009993314534917956, + "loss": 1.661, + "step": 507 + }, + { + "epoch": 0.04608963890401016, + "grad_norm": 0.40839167947465776, + "learning_rate": 0.0009993238365072832, + "loss": 1.6503, + "step": 508 + }, + { + "epoch": 0.04618036653964798, + "grad_norm": 0.39594834186827915, + "learning_rate": 0.000999316176406084, + "loss": 1.6532, + "step": 509 + }, + { + "epoch": 0.04627109417528579, + "grad_norm": 0.4125590573556423, + "learning_rate": 0.0009993084731888596, + "loss": 1.6425, + "step": 510 + }, + { + "epoch": 0.04636182181092361, + "grad_norm": 0.3988840853508271, + "learning_rate": 0.0009993007268562749, + "loss": 1.6634, + "step": 511 + }, + { + "epoch": 0.04645254944656142, + "grad_norm": 0.35860013323274004, + "learning_rate": 0.0009992929374089984, + "loss": 1.6397, + "step": 512 + }, + { + "epoch": 0.046543277082199234, + "grad_norm": 0.33264105159109986, + "learning_rate": 0.0009992851048477036, + "loss": 1.6533, + "step": 513 + }, + { + "epoch": 0.04663400471783705, + "grad_norm": 0.43056789346767843, + "learning_rate": 0.0009992772291730664, + "loss": 1.6303, + "step": 514 + }, + { + "epoch": 0.046724732353474865, + "grad_norm": 0.3636862907757154, + "learning_rate": 0.0009992693103857667, + "loss": 1.6667, + "step": 515 + }, + { + "epoch": 0.046815459989112684, + "grad_norm": 0.41819412014373253, + "learning_rate": 0.0009992613484864886, + "loss": 1.6159, + "step": 516 + }, + { + "epoch": 
0.046906187624750496, + "grad_norm": 0.4185895993504394, + "learning_rate": 0.0009992533434759194, + "loss": 1.6945, + "step": 517 + }, + { + "epoch": 0.046996915260388315, + "grad_norm": 0.34988677072675956, + "learning_rate": 0.0009992452953547504, + "loss": 1.6528, + "step": 518 + }, + { + "epoch": 0.04708764289602613, + "grad_norm": 0.39051961002761104, + "learning_rate": 0.0009992372041236766, + "loss": 1.6591, + "step": 519 + }, + { + "epoch": 0.047178370531663946, + "grad_norm": 0.3925955978308973, + "learning_rate": 0.0009992290697833966, + "loss": 1.6419, + "step": 520 + }, + { + "epoch": 0.04726909816730176, + "grad_norm": 0.45402063307698054, + "learning_rate": 0.000999220892334613, + "loss": 1.6741, + "step": 521 + }, + { + "epoch": 0.04735982580293958, + "grad_norm": 0.38988533517337315, + "learning_rate": 0.0009992126717780316, + "loss": 1.6756, + "step": 522 + }, + { + "epoch": 0.04745055343857739, + "grad_norm": 0.39090332026883956, + "learning_rate": 0.0009992044081143625, + "loss": 1.6219, + "step": 523 + }, + { + "epoch": 0.04754128107421521, + "grad_norm": 0.3849310576051253, + "learning_rate": 0.0009991961013443192, + "loss": 1.6371, + "step": 524 + }, + { + "epoch": 0.04763200870985302, + "grad_norm": 0.3598662360259466, + "learning_rate": 0.0009991877514686188, + "loss": 1.6596, + "step": 525 + }, + { + "epoch": 0.04772273634549084, + "grad_norm": 0.39319737520858916, + "learning_rate": 0.0009991793584879828, + "loss": 1.6536, + "step": 526 + }, + { + "epoch": 0.04781346398112865, + "grad_norm": 0.3718633497552256, + "learning_rate": 0.0009991709224031352, + "loss": 1.6263, + "step": 527 + }, + { + "epoch": 0.04790419161676647, + "grad_norm": 0.37960047881197856, + "learning_rate": 0.0009991624432148052, + "loss": 1.6644, + "step": 528 + }, + { + "epoch": 0.04799491925240428, + "grad_norm": 0.3768485757304554, + "learning_rate": 0.0009991539209237247, + "loss": 1.6504, + "step": 529 + }, + { + "epoch": 0.0480856468880421, + "grad_norm": 0.3553160132032334, + "learning_rate": 0.0009991453555306291, + "loss": 1.6711, + "step": 530 + }, + { + "epoch": 0.04817637452367991, + "grad_norm": 0.3461233611713523, + "learning_rate": 0.0009991367470362589, + "loss": 1.6367, + "step": 531 + }, + { + "epoch": 0.048267102159317725, + "grad_norm": 0.321333552303932, + "learning_rate": 0.000999128095441357, + "loss": 1.6431, + "step": 532 + }, + { + "epoch": 0.048357829794955544, + "grad_norm": 0.3336422629731072, + "learning_rate": 0.0009991194007466704, + "loss": 1.6297, + "step": 533 + }, + { + "epoch": 0.048448557430593356, + "grad_norm": 0.34396476603828935, + "learning_rate": 0.00099911066295295, + "loss": 1.6859, + "step": 534 + }, + { + "epoch": 0.048539285066231175, + "grad_norm": 0.3366020289008908, + "learning_rate": 0.0009991018820609504, + "loss": 1.6514, + "step": 535 + }, + { + "epoch": 0.04863001270186899, + "grad_norm": 0.33802869173342287, + "learning_rate": 0.0009990930580714298, + "loss": 1.6115, + "step": 536 + }, + { + "epoch": 0.048720740337506806, + "grad_norm": 0.3538998248955065, + "learning_rate": 0.0009990841909851497, + "loss": 1.6616, + "step": 537 + }, + { + "epoch": 0.04881146797314462, + "grad_norm": 0.37589014340668075, + "learning_rate": 0.0009990752808028765, + "loss": 1.628, + "step": 538 + }, + { + "epoch": 0.04890219560878244, + "grad_norm": 0.34809705765845655, + "learning_rate": 0.000999066327525379, + "loss": 1.6272, + "step": 539 + }, + { + "epoch": 0.04899292324442025, + "grad_norm": 0.3546527568729994, + "learning_rate": 
0.0009990573311534309, + "loss": 1.6416, + "step": 540 + }, + { + "epoch": 0.04908365088005807, + "grad_norm": 0.3752691564506957, + "learning_rate": 0.0009990482916878082, + "loss": 1.6334, + "step": 541 + }, + { + "epoch": 0.04917437851569588, + "grad_norm": 0.3498423738828038, + "learning_rate": 0.0009990392091292924, + "loss": 1.6043, + "step": 542 + }, + { + "epoch": 0.0492651061513337, + "grad_norm": 0.40058771894341133, + "learning_rate": 0.000999030083478667, + "loss": 1.6224, + "step": 543 + }, + { + "epoch": 0.04935583378697151, + "grad_norm": 0.3445302469855495, + "learning_rate": 0.0009990209147367206, + "loss": 1.6403, + "step": 544 + }, + { + "epoch": 0.04944656142260933, + "grad_norm": 0.3635975288822811, + "learning_rate": 0.0009990117029042445, + "loss": 1.6452, + "step": 545 + }, + { + "epoch": 0.04953728905824714, + "grad_norm": 0.3389556319117152, + "learning_rate": 0.0009990024479820346, + "loss": 1.6669, + "step": 546 + }, + { + "epoch": 0.04962801669388496, + "grad_norm": 0.3407377326878335, + "learning_rate": 0.0009989931499708895, + "loss": 1.6425, + "step": 547 + }, + { + "epoch": 0.04971874432952277, + "grad_norm": 0.3463576851848017, + "learning_rate": 0.0009989838088716124, + "loss": 1.6567, + "step": 548 + }, + { + "epoch": 0.04980947196516059, + "grad_norm": 0.38479440639461154, + "learning_rate": 0.00099897442468501, + "loss": 1.6608, + "step": 549 + }, + { + "epoch": 0.0499001996007984, + "grad_norm": 0.39141857246541467, + "learning_rate": 0.0009989649974118922, + "loss": 1.6287, + "step": 550 + }, + { + "epoch": 0.049990927236436215, + "grad_norm": 0.41149725909805296, + "learning_rate": 0.0009989555270530734, + "loss": 1.6262, + "step": 551 + }, + { + "epoch": 0.050081654872074034, + "grad_norm": 0.41744405451903527, + "learning_rate": 0.0009989460136093716, + "loss": 1.6522, + "step": 552 + }, + { + "epoch": 0.050172382507711846, + "grad_norm": 0.36076669620042345, + "learning_rate": 0.0009989364570816078, + "loss": 1.6254, + "step": 553 + }, + { + "epoch": 0.050263110143349665, + "grad_norm": 0.35633681718566573, + "learning_rate": 0.0009989268574706073, + "loss": 1.6382, + "step": 554 + }, + { + "epoch": 0.05035383777898748, + "grad_norm": 0.42066217803472494, + "learning_rate": 0.0009989172147771993, + "loss": 1.6588, + "step": 555 + }, + { + "epoch": 0.050444565414625296, + "grad_norm": 0.3515167535279943, + "learning_rate": 0.000998907529002216, + "loss": 1.6055, + "step": 556 + }, + { + "epoch": 0.05053529305026311, + "grad_norm": 0.36397618984271385, + "learning_rate": 0.0009988978001464943, + "loss": 1.6025, + "step": 557 + }, + { + "epoch": 0.05062602068590093, + "grad_norm": 0.40610572602781086, + "learning_rate": 0.000998888028210874, + "loss": 1.6621, + "step": 558 + }, + { + "epoch": 0.05071674832153874, + "grad_norm": 0.39635734589781907, + "learning_rate": 0.000998878213196199, + "loss": 1.6348, + "step": 559 + }, + { + "epoch": 0.05080747595717656, + "grad_norm": 0.5361673725430128, + "learning_rate": 0.0009988683551033165, + "loss": 1.6073, + "step": 560 + }, + { + "epoch": 0.05089820359281437, + "grad_norm": 0.3903170926558178, + "learning_rate": 0.0009988584539330782, + "loss": 1.5891, + "step": 561 + }, + { + "epoch": 0.05098893122845219, + "grad_norm": 0.3571369184673008, + "learning_rate": 0.0009988485096863388, + "loss": 1.6116, + "step": 562 + }, + { + "epoch": 0.05107965886409, + "grad_norm": 0.3729339803682821, + "learning_rate": 0.0009988385223639572, + "loss": 1.6206, + "step": 563 + }, + { + "epoch": 0.05117038649972782, + 
"grad_norm": 0.3965969751671563, + "learning_rate": 0.0009988284919667954, + "loss": 1.6426, + "step": 564 + }, + { + "epoch": 0.05126111413536563, + "grad_norm": 0.44145980245203004, + "learning_rate": 0.0009988184184957201, + "loss": 1.6633, + "step": 565 + }, + { + "epoch": 0.05135184177100345, + "grad_norm": 0.3822285174281755, + "learning_rate": 0.0009988083019516006, + "loss": 1.6074, + "step": 566 + }, + { + "epoch": 0.05144256940664126, + "grad_norm": 0.4136666171002365, + "learning_rate": 0.0009987981423353108, + "loss": 1.6525, + "step": 567 + }, + { + "epoch": 0.051533297042279075, + "grad_norm": 0.39593302705608524, + "learning_rate": 0.000998787939647728, + "loss": 1.6225, + "step": 568 + }, + { + "epoch": 0.051624024677916894, + "grad_norm": 0.3765744442058003, + "learning_rate": 0.000998777693889733, + "loss": 1.6214, + "step": 569 + }, + { + "epoch": 0.051714752313554706, + "grad_norm": 0.35892612982178207, + "learning_rate": 0.0009987674050622106, + "loss": 1.6449, + "step": 570 + }, + { + "epoch": 0.051805479949192525, + "grad_norm": 0.37371527401506144, + "learning_rate": 0.0009987570731660495, + "loss": 1.6168, + "step": 571 + }, + { + "epoch": 0.05189620758483034, + "grad_norm": 0.39180187679424083, + "learning_rate": 0.0009987466982021413, + "loss": 1.5706, + "step": 572 + }, + { + "epoch": 0.051986935220468156, + "grad_norm": 0.4378694943288744, + "learning_rate": 0.0009987362801713823, + "loss": 1.617, + "step": 573 + }, + { + "epoch": 0.05207766285610597, + "grad_norm": 0.39543514616009495, + "learning_rate": 0.000998725819074672, + "loss": 1.6257, + "step": 574 + }, + { + "epoch": 0.05216839049174379, + "grad_norm": 0.3951647333904274, + "learning_rate": 0.0009987153149129137, + "loss": 1.6483, + "step": 575 + }, + { + "epoch": 0.0522591181273816, + "grad_norm": 0.34179431341507055, + "learning_rate": 0.0009987047676870147, + "loss": 1.6391, + "step": 576 + }, + { + "epoch": 0.05234984576301942, + "grad_norm": 0.3719996652628996, + "learning_rate": 0.0009986941773978853, + "loss": 1.6018, + "step": 577 + }, + { + "epoch": 0.05244057339865723, + "grad_norm": 0.37404978057740745, + "learning_rate": 0.00099868354404644, + "loss": 1.6056, + "step": 578 + }, + { + "epoch": 0.05253130103429505, + "grad_norm": 0.3930849579167211, + "learning_rate": 0.0009986728676335975, + "loss": 1.6146, + "step": 579 + }, + { + "epoch": 0.05262202866993286, + "grad_norm": 0.4358693561311562, + "learning_rate": 0.0009986621481602792, + "loss": 1.5994, + "step": 580 + }, + { + "epoch": 0.05271275630557068, + "grad_norm": 0.38712765423976736, + "learning_rate": 0.0009986513856274108, + "loss": 1.5993, + "step": 581 + }, + { + "epoch": 0.05280348394120849, + "grad_norm": 0.3784808957032984, + "learning_rate": 0.0009986405800359221, + "loss": 1.6033, + "step": 582 + }, + { + "epoch": 0.05289421157684631, + "grad_norm": 0.4176527131609666, + "learning_rate": 0.0009986297313867457, + "loss": 1.6323, + "step": 583 + }, + { + "epoch": 0.05298493921248412, + "grad_norm": 0.4464566127654728, + "learning_rate": 0.0009986188396808183, + "loss": 1.6482, + "step": 584 + }, + { + "epoch": 0.05307566684812194, + "grad_norm": 0.42178831879275575, + "learning_rate": 0.0009986079049190807, + "loss": 1.5968, + "step": 585 + }, + { + "epoch": 0.05316639448375975, + "grad_norm": 0.4176603655489731, + "learning_rate": 0.000998596927102477, + "loss": 1.5884, + "step": 586 + }, + { + "epoch": 0.053257122119397565, + "grad_norm": 0.423922600004041, + "learning_rate": 0.0009985859062319553, + "loss": 1.6169, + 
"step": 587 + }, + { + "epoch": 0.053347849755035384, + "grad_norm": 0.4190211676568518, + "learning_rate": 0.000998574842308467, + "loss": 1.6511, + "step": 588 + }, + { + "epoch": 0.053438577390673196, + "grad_norm": 0.41390601316271464, + "learning_rate": 0.0009985637353329677, + "loss": 1.6277, + "step": 589 + }, + { + "epoch": 0.053529305026311015, + "grad_norm": 0.405938191053153, + "learning_rate": 0.0009985525853064164, + "loss": 1.6503, + "step": 590 + }, + { + "epoch": 0.05362003266194883, + "grad_norm": 0.39048656540849996, + "learning_rate": 0.0009985413922297757, + "loss": 1.6212, + "step": 591 + }, + { + "epoch": 0.053710760297586646, + "grad_norm": 0.4148041263958534, + "learning_rate": 0.0009985301561040124, + "loss": 1.6389, + "step": 592 + }, + { + "epoch": 0.05380148793322446, + "grad_norm": 0.35902557849640654, + "learning_rate": 0.0009985188769300966, + "loss": 1.6165, + "step": 593 + }, + { + "epoch": 0.05389221556886228, + "grad_norm": 0.4202596770916417, + "learning_rate": 0.0009985075547090023, + "loss": 1.6031, + "step": 594 + }, + { + "epoch": 0.05398294320450009, + "grad_norm": 0.3577738205247725, + "learning_rate": 0.0009984961894417073, + "loss": 1.601, + "step": 595 + }, + { + "epoch": 0.05407367084013791, + "grad_norm": 0.43955103242107735, + "learning_rate": 0.0009984847811291928, + "loss": 1.6395, + "step": 596 + }, + { + "epoch": 0.05416439847577572, + "grad_norm": 0.36799754249905664, + "learning_rate": 0.0009984733297724439, + "loss": 1.6018, + "step": 597 + }, + { + "epoch": 0.05425512611141354, + "grad_norm": 0.4620799166750375, + "learning_rate": 0.0009984618353724496, + "loss": 1.5914, + "step": 598 + }, + { + "epoch": 0.05434585374705135, + "grad_norm": 0.3768158644053066, + "learning_rate": 0.0009984502979302023, + "loss": 1.6456, + "step": 599 + }, + { + "epoch": 0.05443658138268917, + "grad_norm": 0.39545439037844604, + "learning_rate": 0.0009984387174466983, + "loss": 1.6134, + "step": 600 + }, + { + "epoch": 0.05452730901832698, + "grad_norm": 0.43445378550847835, + "learning_rate": 0.0009984270939229377, + "loss": 1.6272, + "step": 601 + }, + { + "epoch": 0.0546180366539648, + "grad_norm": 0.362508061921959, + "learning_rate": 0.000998415427359924, + "loss": 1.617, + "step": 602 + }, + { + "epoch": 0.05470876428960261, + "grad_norm": 0.38213055940083734, + "learning_rate": 0.0009984037177586645, + "loss": 1.6462, + "step": 603 + }, + { + "epoch": 0.05479949192524043, + "grad_norm": 0.37277036648167605, + "learning_rate": 0.0009983919651201708, + "loss": 1.6204, + "step": 604 + }, + { + "epoch": 0.054890219560878244, + "grad_norm": 0.32947648512166183, + "learning_rate": 0.0009983801694454573, + "loss": 1.6118, + "step": 605 + }, + { + "epoch": 0.054980947196516056, + "grad_norm": 0.38102000078558446, + "learning_rate": 0.0009983683307355428, + "loss": 1.5822, + "step": 606 + }, + { + "epoch": 0.055071674832153875, + "grad_norm": 0.37444488253741326, + "learning_rate": 0.0009983564489914494, + "loss": 1.6212, + "step": 607 + }, + { + "epoch": 0.05516240246779169, + "grad_norm": 0.3436443025377652, + "learning_rate": 0.0009983445242142033, + "loss": 1.6132, + "step": 608 + }, + { + "epoch": 0.055253130103429506, + "grad_norm": 0.3848500025816333, + "learning_rate": 0.000998332556404834, + "loss": 1.6188, + "step": 609 + }, + { + "epoch": 0.05534385773906732, + "grad_norm": 0.3717379373716313, + "learning_rate": 0.000998320545564375, + "loss": 1.5903, + "step": 610 + }, + { + "epoch": 0.055434585374705136, + "grad_norm": 0.3608561438932713, + 
"learning_rate": 0.0009983084916938634, + "loss": 1.59, + "step": 611 + }, + { + "epoch": 0.05552531301034295, + "grad_norm": 0.3590699806218024, + "learning_rate": 0.00099829639479434, + "loss": 1.6045, + "step": 612 + }, + { + "epoch": 0.05561604064598077, + "grad_norm": 0.6760392237577582, + "learning_rate": 0.0009982842548668497, + "loss": 1.6299, + "step": 613 + }, + { + "epoch": 0.05570676828161858, + "grad_norm": 0.3541429589388504, + "learning_rate": 0.0009982720719124408, + "loss": 1.6092, + "step": 614 + }, + { + "epoch": 0.0557974959172564, + "grad_norm": 0.34007711522466577, + "learning_rate": 0.0009982598459321646, + "loss": 1.6129, + "step": 615 + }, + { + "epoch": 0.05588822355289421, + "grad_norm": 0.3962289411507396, + "learning_rate": 0.0009982475769270774, + "loss": 1.5794, + "step": 616 + }, + { + "epoch": 0.05597895118853203, + "grad_norm": 0.3617523289478849, + "learning_rate": 0.0009982352648982386, + "loss": 1.6229, + "step": 617 + }, + { + "epoch": 0.05606967882416984, + "grad_norm": 0.37989548350055163, + "learning_rate": 0.000998222909846711, + "loss": 1.5987, + "step": 618 + }, + { + "epoch": 0.05616040645980766, + "grad_norm": 0.3364907705961388, + "learning_rate": 0.0009982105117735621, + "loss": 1.6439, + "step": 619 + }, + { + "epoch": 0.05625113409544547, + "grad_norm": 0.3318476534852213, + "learning_rate": 0.0009981980706798618, + "loss": 1.6232, + "step": 620 + }, + { + "epoch": 0.05634186173108329, + "grad_norm": 0.3218476068267926, + "learning_rate": 0.0009981855865666847, + "loss": 1.5848, + "step": 621 + }, + { + "epoch": 0.0564325893667211, + "grad_norm": 0.3054803691087536, + "learning_rate": 0.0009981730594351087, + "loss": 1.6149, + "step": 622 + }, + { + "epoch": 0.056523317002358915, + "grad_norm": 0.34424788911364335, + "learning_rate": 0.0009981604892862158, + "loss": 1.6326, + "step": 623 + }, + { + "epoch": 0.056614044637996734, + "grad_norm": 0.3331969878738117, + "learning_rate": 0.0009981478761210908, + "loss": 1.5952, + "step": 624 + }, + { + "epoch": 0.056704772273634546, + "grad_norm": 0.328535687591045, + "learning_rate": 0.0009981352199408238, + "loss": 1.6121, + "step": 625 + }, + { + "epoch": 0.056795499909272365, + "grad_norm": 0.373830547559748, + "learning_rate": 0.0009981225207465068, + "loss": 1.6115, + "step": 626 + }, + { + "epoch": 0.05688622754491018, + "grad_norm": 0.42436992855655536, + "learning_rate": 0.000998109778539237, + "loss": 1.6326, + "step": 627 + }, + { + "epoch": 0.056976955180547996, + "grad_norm": 0.3666123263588699, + "learning_rate": 0.000998096993320114, + "loss": 1.5804, + "step": 628 + }, + { + "epoch": 0.05706768281618581, + "grad_norm": 0.37878152150883626, + "learning_rate": 0.0009980841650902427, + "loss": 1.6051, + "step": 629 + }, + { + "epoch": 0.05715841045182363, + "grad_norm": 0.38447398895785384, + "learning_rate": 0.00099807129385073, + "loss": 1.6001, + "step": 630 + }, + { + "epoch": 0.05724913808746144, + "grad_norm": 0.3529286789508739, + "learning_rate": 0.0009980583796026878, + "loss": 1.574, + "step": 631 + }, + { + "epoch": 0.05733986572309926, + "grad_norm": 0.33729563264853574, + "learning_rate": 0.0009980454223472311, + "loss": 1.6454, + "step": 632 + }, + { + "epoch": 0.05743059335873707, + "grad_norm": 0.33147692950528296, + "learning_rate": 0.0009980324220854788, + "loss": 1.6078, + "step": 633 + }, + { + "epoch": 0.05752132099437489, + "grad_norm": 0.3191625427466357, + "learning_rate": 0.0009980193788185535, + "loss": 1.5875, + "step": 634 + }, + { + "epoch": 
0.0576120486300127, + "grad_norm": 0.3893469077355295, + "learning_rate": 0.0009980062925475813, + "loss": 1.5736, + "step": 635 + }, + { + "epoch": 0.05770277626565052, + "grad_norm": 0.36116885157225237, + "learning_rate": 0.0009979931632736923, + "loss": 1.5957, + "step": 636 + }, + { + "epoch": 0.05779350390128833, + "grad_norm": 0.35319392439353114, + "learning_rate": 0.0009979799909980204, + "loss": 1.6027, + "step": 637 + }, + { + "epoch": 0.05788423153692615, + "grad_norm": 0.36712026510645224, + "learning_rate": 0.0009979667757217029, + "loss": 1.6394, + "step": 638 + }, + { + "epoch": 0.05797495917256396, + "grad_norm": 0.342073821305968, + "learning_rate": 0.0009979535174458806, + "loss": 1.5643, + "step": 639 + }, + { + "epoch": 0.05806568680820178, + "grad_norm": 0.34254095668247575, + "learning_rate": 0.0009979402161716991, + "loss": 1.6119, + "step": 640 + }, + { + "epoch": 0.058156414443839594, + "grad_norm": 0.33096720551993886, + "learning_rate": 0.0009979268719003062, + "loss": 1.586, + "step": 641 + }, + { + "epoch": 0.058247142079477406, + "grad_norm": 0.3513325555699978, + "learning_rate": 0.000997913484632855, + "loss": 1.6062, + "step": 642 + }, + { + "epoch": 0.058337869715115225, + "grad_norm": 0.3299525516199118, + "learning_rate": 0.0009979000543705006, + "loss": 1.5904, + "step": 643 + }, + { + "epoch": 0.058428597350753037, + "grad_norm": 0.3394403539546374, + "learning_rate": 0.000997886581114403, + "loss": 1.6292, + "step": 644 + }, + { + "epoch": 0.058519324986390855, + "grad_norm": 0.33111810985220086, + "learning_rate": 0.000997873064865726, + "loss": 1.6689, + "step": 645 + }, + { + "epoch": 0.05861005262202867, + "grad_norm": 0.3173924942778653, + "learning_rate": 0.0009978595056256364, + "loss": 1.5777, + "step": 646 + }, + { + "epoch": 0.058700780257666486, + "grad_norm": 0.3513908011644241, + "learning_rate": 0.0009978459033953054, + "loss": 1.5623, + "step": 647 + }, + { + "epoch": 0.0587915078933043, + "grad_norm": 0.3137748020295769, + "learning_rate": 0.000997832258175907, + "loss": 1.592, + "step": 648 + }, + { + "epoch": 0.05888223552894212, + "grad_norm": 0.34569999188315814, + "learning_rate": 0.0009978185699686198, + "loss": 1.6044, + "step": 649 + }, + { + "epoch": 0.05897296316457993, + "grad_norm": 0.3030068525691039, + "learning_rate": 0.0009978048387746256, + "loss": 1.5958, + "step": 650 + }, + { + "epoch": 0.05906369080021775, + "grad_norm": 0.32337413122103553, + "learning_rate": 0.0009977910645951103, + "loss": 1.572, + "step": 651 + }, + { + "epoch": 0.05915441843585556, + "grad_norm": 0.3003584955328216, + "learning_rate": 0.0009977772474312632, + "loss": 1.5678, + "step": 652 + }, + { + "epoch": 0.05924514607149338, + "grad_norm": 0.3152188130102941, + "learning_rate": 0.0009977633872842774, + "loss": 1.6092, + "step": 653 + }, + { + "epoch": 0.05933587370713119, + "grad_norm": 0.32962325853098495, + "learning_rate": 0.0009977494841553495, + "loss": 1.6187, + "step": 654 + }, + { + "epoch": 0.05942660134276901, + "grad_norm": 0.3065665997556033, + "learning_rate": 0.0009977355380456807, + "loss": 1.5691, + "step": 655 + }, + { + "epoch": 0.05951732897840682, + "grad_norm": 0.3542052642201851, + "learning_rate": 0.0009977215489564747, + "loss": 1.6247, + "step": 656 + }, + { + "epoch": 0.05960805661404464, + "grad_norm": 0.3195065424996426, + "learning_rate": 0.0009977075168889397, + "loss": 1.6142, + "step": 657 + }, + { + "epoch": 0.05969878424968245, + "grad_norm": 0.284240423029075, + "learning_rate": 0.0009976934418442869, + 
"loss": 1.6145, + "step": 658 + }, + { + "epoch": 0.05978951188532027, + "grad_norm": 0.3660036908090987, + "learning_rate": 0.0009976793238237324, + "loss": 1.5931, + "step": 659 + }, + { + "epoch": 0.059880239520958084, + "grad_norm": 0.30275465848651617, + "learning_rate": 0.0009976651628284948, + "loss": 1.6408, + "step": 660 + }, + { + "epoch": 0.059970967156595896, + "grad_norm": 0.3078654414225349, + "learning_rate": 0.0009976509588597968, + "loss": 1.5887, + "step": 661 + }, + { + "epoch": 0.060061694792233715, + "grad_norm": 0.2916203683932255, + "learning_rate": 0.0009976367119188655, + "loss": 1.5651, + "step": 662 + }, + { + "epoch": 0.06015242242787153, + "grad_norm": 0.30548703823566814, + "learning_rate": 0.0009976224220069306, + "loss": 1.5638, + "step": 663 + }, + { + "epoch": 0.060243150063509346, + "grad_norm": 0.31303513190598287, + "learning_rate": 0.0009976080891252264, + "loss": 1.5824, + "step": 664 + }, + { + "epoch": 0.06033387769914716, + "grad_norm": 0.3144702517027741, + "learning_rate": 0.0009975937132749902, + "loss": 1.5912, + "step": 665 + }, + { + "epoch": 0.06042460533478498, + "grad_norm": 0.3127218120102603, + "learning_rate": 0.0009975792944574636, + "loss": 1.5793, + "step": 666 + }, + { + "epoch": 0.06051533297042279, + "grad_norm": 0.34562613587652524, + "learning_rate": 0.0009975648326738915, + "loss": 1.6109, + "step": 667 + }, + { + "epoch": 0.06060606060606061, + "grad_norm": 0.39200341598882726, + "learning_rate": 0.000997550327925523, + "loss": 1.6183, + "step": 668 + }, + { + "epoch": 0.06069678824169842, + "grad_norm": 0.3256489317005186, + "learning_rate": 0.00099753578021361, + "loss": 1.6271, + "step": 669 + }, + { + "epoch": 0.06078751587733624, + "grad_norm": 0.3216968215859242, + "learning_rate": 0.0009975211895394093, + "loss": 1.6243, + "step": 670 + }, + { + "epoch": 0.06087824351297405, + "grad_norm": 0.49466150260547004, + "learning_rate": 0.0009975065559041805, + "loss": 1.5868, + "step": 671 + }, + { + "epoch": 0.06096897114861187, + "grad_norm": 0.3179810687828202, + "learning_rate": 0.0009974918793091872, + "loss": 1.5556, + "step": 672 + }, + { + "epoch": 0.06105969878424968, + "grad_norm": 0.3842905675842209, + "learning_rate": 0.0009974771597556968, + "loss": 1.5689, + "step": 673 + }, + { + "epoch": 0.0611504264198875, + "grad_norm": 0.3343093171125265, + "learning_rate": 0.0009974623972449804, + "loss": 1.601, + "step": 674 + }, + { + "epoch": 0.06124115405552531, + "grad_norm": 0.3238771144769286, + "learning_rate": 0.0009974475917783128, + "loss": 1.5648, + "step": 675 + }, + { + "epoch": 0.06133188169116313, + "grad_norm": 0.37799621303951664, + "learning_rate": 0.000997432743356972, + "loss": 1.6143, + "step": 676 + }, + { + "epoch": 0.061422609326800943, + "grad_norm": 0.3311587910280495, + "learning_rate": 0.0009974178519822408, + "loss": 1.58, + "step": 677 + }, + { + "epoch": 0.061513336962438755, + "grad_norm": 0.342948916981437, + "learning_rate": 0.0009974029176554047, + "loss": 1.5682, + "step": 678 + }, + { + "epoch": 0.061604064598076574, + "grad_norm": 0.33645659026006247, + "learning_rate": 0.0009973879403777533, + "loss": 1.5885, + "step": 679 + }, + { + "epoch": 0.061694792233714386, + "grad_norm": 0.32866781300452697, + "learning_rate": 0.00099737292015058, + "loss": 1.5365, + "step": 680 + }, + { + "epoch": 0.061785519869352205, + "grad_norm": 0.34341212763802187, + "learning_rate": 0.0009973578569751817, + "loss": 1.5803, + "step": 681 + }, + { + "epoch": 0.06187624750499002, + "grad_norm": 
0.33966676131822654, + "learning_rate": 0.0009973427508528593, + "loss": 1.5514, + "step": 682 + }, + { + "epoch": 0.061966975140627836, + "grad_norm": 0.37149186305258525, + "learning_rate": 0.0009973276017849167, + "loss": 1.6203, + "step": 683 + }, + { + "epoch": 0.06205770277626565, + "grad_norm": 0.36747517033294824, + "learning_rate": 0.0009973124097726626, + "loss": 1.532, + "step": 684 + }, + { + "epoch": 0.06214843041190347, + "grad_norm": 0.7513163738223906, + "learning_rate": 0.0009972971748174087, + "loss": 1.5669, + "step": 685 + }, + { + "epoch": 0.06223915804754128, + "grad_norm": 0.39255697599269074, + "learning_rate": 0.0009972818969204704, + "loss": 1.5571, + "step": 686 + }, + { + "epoch": 0.0623298856831791, + "grad_norm": 0.34639763573602234, + "learning_rate": 0.000997266576083167, + "loss": 1.6235, + "step": 687 + }, + { + "epoch": 0.06242061331881691, + "grad_norm": 0.31556538297761483, + "learning_rate": 0.0009972512123068214, + "loss": 1.6323, + "step": 688 + }, + { + "epoch": 0.06251134095445472, + "grad_norm": 0.36122500123709717, + "learning_rate": 0.0009972358055927604, + "loss": 1.6134, + "step": 689 + }, + { + "epoch": 0.06260206859009254, + "grad_norm": 0.3032505999069524, + "learning_rate": 0.0009972203559423143, + "loss": 1.5747, + "step": 690 + }, + { + "epoch": 0.06269279622573036, + "grad_norm": 0.3451149198354989, + "learning_rate": 0.000997204863356817, + "loss": 1.574, + "step": 691 + }, + { + "epoch": 0.06278352386136818, + "grad_norm": 0.32989893761177896, + "learning_rate": 0.0009971893278376068, + "loss": 1.5971, + "step": 692 + }, + { + "epoch": 0.06287425149700598, + "grad_norm": 0.3071427349654422, + "learning_rate": 0.0009971737493860247, + "loss": 1.576, + "step": 693 + }, + { + "epoch": 0.0629649791326438, + "grad_norm": 0.3246465407094245, + "learning_rate": 0.000997158128003416, + "loss": 1.5485, + "step": 694 + }, + { + "epoch": 0.06305570676828162, + "grad_norm": 0.3329169685949843, + "learning_rate": 0.0009971424636911297, + "loss": 1.5413, + "step": 695 + }, + { + "epoch": 0.06314643440391943, + "grad_norm": 0.2785128142445482, + "learning_rate": 0.0009971267564505184, + "loss": 1.5717, + "step": 696 + }, + { + "epoch": 0.06323716203955725, + "grad_norm": 0.320878877018018, + "learning_rate": 0.0009971110062829385, + "loss": 1.6108, + "step": 697 + }, + { + "epoch": 0.06332788967519506, + "grad_norm": 0.28227168728271584, + "learning_rate": 0.0009970952131897499, + "loss": 1.5775, + "step": 698 + }, + { + "epoch": 0.06341861731083288, + "grad_norm": 0.2886769595680048, + "learning_rate": 0.0009970793771723163, + "loss": 1.5579, + "step": 699 + }, + { + "epoch": 0.06350934494647069, + "grad_norm": 0.28034405928529454, + "learning_rate": 0.0009970634982320052, + "loss": 1.587, + "step": 700 + }, + { + "epoch": 0.06360007258210851, + "grad_norm": 0.29693603596047435, + "learning_rate": 0.000997047576370188, + "loss": 1.5709, + "step": 701 + }, + { + "epoch": 0.06369080021774633, + "grad_norm": 0.31187527846328533, + "learning_rate": 0.0009970316115882393, + "loss": 1.5747, + "step": 702 + }, + { + "epoch": 0.06378152785338415, + "grad_norm": 0.2882636405841557, + "learning_rate": 0.0009970156038875374, + "loss": 1.599, + "step": 703 + }, + { + "epoch": 0.06387225548902195, + "grad_norm": 0.28060816175128955, + "learning_rate": 0.0009969995532694651, + "loss": 1.5945, + "step": 704 + }, + { + "epoch": 0.06396298312465977, + "grad_norm": 0.29319475362948394, + "learning_rate": 0.0009969834597354078, + "loss": 1.5501, + "step": 705 + }, + 
{ + "epoch": 0.06405371076029759, + "grad_norm": 0.28477016647376036, + "learning_rate": 0.0009969673232867557, + "loss": 1.5955, + "step": 706 + }, + { + "epoch": 0.06414443839593541, + "grad_norm": 0.27955060518018826, + "learning_rate": 0.0009969511439249022, + "loss": 1.6105, + "step": 707 + }, + { + "epoch": 0.06423516603157321, + "grad_norm": 0.2965491384567928, + "learning_rate": 0.000996934921651244, + "loss": 1.5933, + "step": 708 + }, + { + "epoch": 0.06432589366721103, + "grad_norm": 0.2863463847718415, + "learning_rate": 0.0009969186564671821, + "loss": 1.565, + "step": 709 + }, + { + "epoch": 0.06441662130284885, + "grad_norm": 0.31139396400365804, + "learning_rate": 0.0009969023483741208, + "loss": 1.5473, + "step": 710 + }, + { + "epoch": 0.06450734893848667, + "grad_norm": 0.31285131647114267, + "learning_rate": 0.0009968859973734688, + "loss": 1.5745, + "step": 711 + }, + { + "epoch": 0.06459807657412447, + "grad_norm": 0.3176260594283475, + "learning_rate": 0.0009968696034666374, + "loss": 1.6496, + "step": 712 + }, + { + "epoch": 0.0646888042097623, + "grad_norm": 0.3288652986096973, + "learning_rate": 0.0009968531666550426, + "loss": 1.5253, + "step": 713 + }, + { + "epoch": 0.06477953184540011, + "grad_norm": 0.29213956833243804, + "learning_rate": 0.0009968366869401038, + "loss": 1.5571, + "step": 714 + }, + { + "epoch": 0.06487025948103792, + "grad_norm": 0.3441467985405209, + "learning_rate": 0.0009968201643232436, + "loss": 1.5779, + "step": 715 + }, + { + "epoch": 0.06496098711667574, + "grad_norm": 0.3182840954804447, + "learning_rate": 0.0009968035988058893, + "loss": 1.5616, + "step": 716 + }, + { + "epoch": 0.06505171475231356, + "grad_norm": 0.30387434793021373, + "learning_rate": 0.0009967869903894707, + "loss": 1.5871, + "step": 717 + }, + { + "epoch": 0.06514244238795137, + "grad_norm": 0.30271285267613784, + "learning_rate": 0.0009967703390754224, + "loss": 1.5976, + "step": 718 + }, + { + "epoch": 0.06523317002358918, + "grad_norm": 0.293100687549073, + "learning_rate": 0.000996753644865182, + "loss": 1.5984, + "step": 719 + }, + { + "epoch": 0.065323897659227, + "grad_norm": 0.3297381136542756, + "learning_rate": 0.0009967369077601913, + "loss": 1.5989, + "step": 720 + }, + { + "epoch": 0.06541462529486482, + "grad_norm": 0.3096813308763651, + "learning_rate": 0.0009967201277618954, + "loss": 1.5932, + "step": 721 + }, + { + "epoch": 0.06550535293050264, + "grad_norm": 0.2923115690008248, + "learning_rate": 0.0009967033048717431, + "loss": 1.5201, + "step": 722 + }, + { + "epoch": 0.06559608056614044, + "grad_norm": 0.292462682842043, + "learning_rate": 0.0009966864390911873, + "loss": 1.5789, + "step": 723 + }, + { + "epoch": 0.06568680820177826, + "grad_norm": 0.2937027053057862, + "learning_rate": 0.0009966695304216844, + "loss": 1.5903, + "step": 724 + }, + { + "epoch": 0.06577753583741608, + "grad_norm": 0.30537157953695154, + "learning_rate": 0.0009966525788646942, + "loss": 1.5698, + "step": 725 + }, + { + "epoch": 0.0658682634730539, + "grad_norm": 0.35510863668043513, + "learning_rate": 0.0009966355844216808, + "loss": 1.5866, + "step": 726 + }, + { + "epoch": 0.0659589911086917, + "grad_norm": 0.3325938196015394, + "learning_rate": 0.0009966185470941114, + "loss": 1.5914, + "step": 727 + }, + { + "epoch": 0.06604971874432952, + "grad_norm": 0.31454932334444363, + "learning_rate": 0.0009966014668834572, + "loss": 1.5576, + "step": 728 + }, + { + "epoch": 0.06614044637996734, + "grad_norm": 0.32362274917247036, + "learning_rate": 
0.000996584343791193, + "loss": 1.5793, + "step": 729 + }, + { + "epoch": 0.06623117401560516, + "grad_norm": 0.3407604846085727, + "learning_rate": 0.0009965671778187977, + "loss": 1.5562, + "step": 730 + }, + { + "epoch": 0.06632190165124296, + "grad_norm": 0.38046764345664885, + "learning_rate": 0.0009965499689677535, + "loss": 1.5918, + "step": 731 + }, + { + "epoch": 0.06641262928688078, + "grad_norm": 0.37790416579167596, + "learning_rate": 0.0009965327172395462, + "loss": 1.5942, + "step": 732 + }, + { + "epoch": 0.0665033569225186, + "grad_norm": 0.3497349729974826, + "learning_rate": 0.0009965154226356657, + "loss": 1.5657, + "step": 733 + }, + { + "epoch": 0.06659408455815641, + "grad_norm": 0.35575673955680714, + "learning_rate": 0.000996498085157605, + "loss": 1.5983, + "step": 734 + }, + { + "epoch": 0.06668481219379423, + "grad_norm": 0.3891528705372433, + "learning_rate": 0.0009964807048068616, + "loss": 1.5395, + "step": 735 + }, + { + "epoch": 0.06677553982943205, + "grad_norm": 0.31520951408956194, + "learning_rate": 0.000996463281584936, + "loss": 1.5882, + "step": 736 + }, + { + "epoch": 0.06686626746506986, + "grad_norm": 0.339034519258965, + "learning_rate": 0.0009964458154933333, + "loss": 1.5718, + "step": 737 + }, + { + "epoch": 0.06695699510070767, + "grad_norm": 0.3062373798301534, + "learning_rate": 0.000996428306533561, + "loss": 1.5685, + "step": 738 + }, + { + "epoch": 0.06704772273634549, + "grad_norm": 0.2977828059477051, + "learning_rate": 0.0009964107547071313, + "loss": 1.607, + "step": 739 + }, + { + "epoch": 0.06713845037198331, + "grad_norm": 0.31922805644707125, + "learning_rate": 0.0009963931600155598, + "loss": 1.5838, + "step": 740 + }, + { + "epoch": 0.06722917800762113, + "grad_norm": 0.2997296275226702, + "learning_rate": 0.0009963755224603656, + "loss": 1.5999, + "step": 741 + }, + { + "epoch": 0.06731990564325893, + "grad_norm": 0.31054964237106564, + "learning_rate": 0.0009963578420430722, + "loss": 1.5756, + "step": 742 + }, + { + "epoch": 0.06741063327889675, + "grad_norm": 0.33695453991785623, + "learning_rate": 0.0009963401187652056, + "loss": 1.5775, + "step": 743 + }, + { + "epoch": 0.06750136091453457, + "grad_norm": 0.29909039365422374, + "learning_rate": 0.0009963223526282968, + "loss": 1.5365, + "step": 744 + }, + { + "epoch": 0.06759208855017239, + "grad_norm": 0.356020111523233, + "learning_rate": 0.0009963045436338798, + "loss": 1.5929, + "step": 745 + }, + { + "epoch": 0.0676828161858102, + "grad_norm": 0.28579559120662396, + "learning_rate": 0.0009962866917834921, + "loss": 1.564, + "step": 746 + }, + { + "epoch": 0.06777354382144801, + "grad_norm": 0.3181350694408455, + "learning_rate": 0.0009962687970786754, + "loss": 1.6038, + "step": 747 + }, + { + "epoch": 0.06786427145708583, + "grad_norm": 0.29455466991690443, + "learning_rate": 0.0009962508595209752, + "loss": 1.5743, + "step": 748 + }, + { + "epoch": 0.06795499909272365, + "grad_norm": 0.3138907250194058, + "learning_rate": 0.00099623287911194, + "loss": 1.5406, + "step": 749 + }, + { + "epoch": 0.06804572672836146, + "grad_norm": 0.32600362659346827, + "learning_rate": 0.0009962148558531224, + "loss": 1.5887, + "step": 750 + }, + { + "epoch": 0.06813645436399927, + "grad_norm": 0.3319160368457971, + "learning_rate": 0.000996196789746079, + "loss": 1.5774, + "step": 751 + }, + { + "epoch": 0.0682271819996371, + "grad_norm": 0.30829652246271766, + "learning_rate": 0.0009961786807923697, + "loss": 1.539, + "step": 752 + }, + { + "epoch": 0.0683179096352749, + 
"grad_norm": 0.33116043348858565, + "learning_rate": 0.0009961605289935582, + "loss": 1.5861, + "step": 753 + }, + { + "epoch": 0.06840863727091272, + "grad_norm": 0.31575906406123916, + "learning_rate": 0.0009961423343512119, + "loss": 1.5673, + "step": 754 + }, + { + "epoch": 0.06849936490655054, + "grad_norm": 0.3056772866542024, + "learning_rate": 0.0009961240968669018, + "loss": 1.5566, + "step": 755 + }, + { + "epoch": 0.06859009254218836, + "grad_norm": 0.31918880084517304, + "learning_rate": 0.0009961058165422027, + "loss": 1.5325, + "step": 756 + }, + { + "epoch": 0.06868082017782616, + "grad_norm": 0.3479407375088862, + "learning_rate": 0.0009960874933786935, + "loss": 1.583, + "step": 757 + }, + { + "epoch": 0.06877154781346398, + "grad_norm": 0.38724252564477996, + "learning_rate": 0.000996069127377956, + "loss": 1.5414, + "step": 758 + }, + { + "epoch": 0.0688622754491018, + "grad_norm": 0.36682672144502904, + "learning_rate": 0.0009960507185415763, + "loss": 1.5752, + "step": 759 + }, + { + "epoch": 0.06895300308473962, + "grad_norm": 0.3134346213862008, + "learning_rate": 0.0009960322668711439, + "loss": 1.6091, + "step": 760 + }, + { + "epoch": 0.06904373072037742, + "grad_norm": 0.3458590226506286, + "learning_rate": 0.000996013772368252, + "loss": 1.5812, + "step": 761 + }, + { + "epoch": 0.06913445835601524, + "grad_norm": 0.28909048899863005, + "learning_rate": 0.0009959952350344981, + "loss": 1.6111, + "step": 762 + }, + { + "epoch": 0.06922518599165306, + "grad_norm": 0.3164167789545052, + "learning_rate": 0.0009959766548714823, + "loss": 1.5837, + "step": 763 + }, + { + "epoch": 0.06931591362729088, + "grad_norm": 0.29309088703961544, + "learning_rate": 0.0009959580318808093, + "loss": 1.5767, + "step": 764 + }, + { + "epoch": 0.06940664126292868, + "grad_norm": 0.3049154315917614, + "learning_rate": 0.0009959393660640872, + "loss": 1.5825, + "step": 765 + }, + { + "epoch": 0.0694973688985665, + "grad_norm": 0.30900604109484736, + "learning_rate": 0.0009959206574229277, + "loss": 1.5975, + "step": 766 + }, + { + "epoch": 0.06958809653420432, + "grad_norm": 0.27485870394628803, + "learning_rate": 0.0009959019059589463, + "loss": 1.5546, + "step": 767 + }, + { + "epoch": 0.06967882416984214, + "grad_norm": 0.2891328442327511, + "learning_rate": 0.0009958831116737622, + "loss": 1.5889, + "step": 768 + }, + { + "epoch": 0.06976955180547995, + "grad_norm": 0.30826480654317556, + "learning_rate": 0.0009958642745689983, + "loss": 1.5579, + "step": 769 + }, + { + "epoch": 0.06986027944111776, + "grad_norm": 0.29683763690278103, + "learning_rate": 0.0009958453946462813, + "loss": 1.5723, + "step": 770 + }, + { + "epoch": 0.06995100707675558, + "grad_norm": 0.3085394356277712, + "learning_rate": 0.0009958264719072416, + "loss": 1.565, + "step": 771 + }, + { + "epoch": 0.07004173471239339, + "grad_norm": 0.3219472378610613, + "learning_rate": 0.0009958075063535128, + "loss": 1.5516, + "step": 772 + }, + { + "epoch": 0.07013246234803121, + "grad_norm": 0.3020505837562614, + "learning_rate": 0.0009957884979867326, + "loss": 1.5556, + "step": 773 + }, + { + "epoch": 0.07022318998366903, + "grad_norm": 0.30538360793801556, + "learning_rate": 0.0009957694468085427, + "loss": 1.5927, + "step": 774 + }, + { + "epoch": 0.07031391761930685, + "grad_norm": 0.2997676229147533, + "learning_rate": 0.000995750352820588, + "loss": 1.581, + "step": 775 + }, + { + "epoch": 0.07040464525494465, + "grad_norm": 0.28596655884358024, + "learning_rate": 0.0009957312160245172, + "loss": 1.5878, + 
"step": 776 + }, + { + "epoch": 0.07049537289058247, + "grad_norm": 0.29241513504601424, + "learning_rate": 0.0009957120364219828, + "loss": 1.5627, + "step": 777 + }, + { + "epoch": 0.07058610052622029, + "grad_norm": 0.30946025325521326, + "learning_rate": 0.000995692814014641, + "loss": 1.5845, + "step": 778 + }, + { + "epoch": 0.07067682816185811, + "grad_norm": 0.2950113645112398, + "learning_rate": 0.0009956735488041517, + "loss": 1.6067, + "step": 779 + }, + { + "epoch": 0.07076755579749591, + "grad_norm": 0.7241211054062756, + "learning_rate": 0.0009956542407921784, + "loss": 1.5907, + "step": 780 + }, + { + "epoch": 0.07085828343313373, + "grad_norm": 0.318011523091178, + "learning_rate": 0.0009956348899803882, + "loss": 1.5559, + "step": 781 + }, + { + "epoch": 0.07094901106877155, + "grad_norm": 0.32544398429995763, + "learning_rate": 0.0009956154963704524, + "loss": 1.5418, + "step": 782 + }, + { + "epoch": 0.07103973870440937, + "grad_norm": 0.3285976806558467, + "learning_rate": 0.0009955960599640455, + "loss": 1.5632, + "step": 783 + }, + { + "epoch": 0.07113046634004717, + "grad_norm": 0.32713612892083216, + "learning_rate": 0.0009955765807628456, + "loss": 1.5497, + "step": 784 + }, + { + "epoch": 0.071221193975685, + "grad_norm": 0.33697217368307725, + "learning_rate": 0.000995557058768535, + "loss": 1.592, + "step": 785 + }, + { + "epoch": 0.07131192161132281, + "grad_norm": 0.3208211812961102, + "learning_rate": 0.0009955374939827994, + "loss": 1.5712, + "step": 786 + }, + { + "epoch": 0.07140264924696063, + "grad_norm": 0.32166540854193815, + "learning_rate": 0.000995517886407328, + "loss": 1.5665, + "step": 787 + }, + { + "epoch": 0.07149337688259844, + "grad_norm": 0.32065758014916756, + "learning_rate": 0.0009954982360438143, + "loss": 1.5248, + "step": 788 + }, + { + "epoch": 0.07158410451823626, + "grad_norm": 0.3261790789960232, + "learning_rate": 0.0009954785428939548, + "loss": 1.5817, + "step": 789 + }, + { + "epoch": 0.07167483215387407, + "grad_norm": 0.3068629524265977, + "learning_rate": 0.0009954588069594498, + "loss": 1.5448, + "step": 790 + }, + { + "epoch": 0.07176555978951188, + "grad_norm": 0.32358400467724474, + "learning_rate": 0.000995439028242004, + "loss": 1.5623, + "step": 791 + }, + { + "epoch": 0.0718562874251497, + "grad_norm": 0.3378376872394572, + "learning_rate": 0.0009954192067433251, + "loss": 1.5805, + "step": 792 + }, + { + "epoch": 0.07194701506078752, + "grad_norm": 0.34895497890466803, + "learning_rate": 0.000995399342465125, + "loss": 1.5612, + "step": 793 + }, + { + "epoch": 0.07203774269642534, + "grad_norm": 0.26870455566250034, + "learning_rate": 0.0009953794354091183, + "loss": 1.5236, + "step": 794 + }, + { + "epoch": 0.07212847033206314, + "grad_norm": 0.30302437752599254, + "learning_rate": 0.0009953594855770245, + "loss": 1.5546, + "step": 795 + }, + { + "epoch": 0.07221919796770096, + "grad_norm": 0.27632748999622886, + "learning_rate": 0.0009953394929705659, + "loss": 1.5887, + "step": 796 + }, + { + "epoch": 0.07230992560333878, + "grad_norm": 0.3670055197162124, + "learning_rate": 0.0009953194575914692, + "loss": 1.5641, + "step": 797 + }, + { + "epoch": 0.0724006532389766, + "grad_norm": 0.29279771590440873, + "learning_rate": 0.0009952993794414644, + "loss": 1.5654, + "step": 798 + }, + { + "epoch": 0.0724913808746144, + "grad_norm": 0.3236656615179228, + "learning_rate": 0.0009952792585222851, + "loss": 1.5394, + "step": 799 + }, + { + "epoch": 0.07258210851025222, + "grad_norm": 0.4285465983406476, + 
"learning_rate": 0.0009952590948356687, + "loss": 1.5314, + "step": 800 + }, + { + "epoch": 0.07267283614589004, + "grad_norm": 0.3126536501803067, + "learning_rate": 0.0009952388883833567, + "loss": 1.5753, + "step": 801 + }, + { + "epoch": 0.07276356378152786, + "grad_norm": 0.29574782071175315, + "learning_rate": 0.0009952186391670936, + "loss": 1.5674, + "step": 802 + }, + { + "epoch": 0.07285429141716566, + "grad_norm": 0.2610796543814839, + "learning_rate": 0.0009951983471886282, + "loss": 1.6135, + "step": 803 + }, + { + "epoch": 0.07294501905280348, + "grad_norm": 0.2972964352642521, + "learning_rate": 0.0009951780124497123, + "loss": 1.5552, + "step": 804 + }, + { + "epoch": 0.0730357466884413, + "grad_norm": 0.3163770557818192, + "learning_rate": 0.0009951576349521022, + "loss": 1.5729, + "step": 805 + }, + { + "epoch": 0.07312647432407911, + "grad_norm": 0.2830167932865456, + "learning_rate": 0.0009951372146975571, + "loss": 1.594, + "step": 806 + }, + { + "epoch": 0.07321720195971693, + "grad_norm": 0.301628265033494, + "learning_rate": 0.0009951167516878408, + "loss": 1.5857, + "step": 807 + }, + { + "epoch": 0.07330792959535475, + "grad_norm": 0.3059247977743202, + "learning_rate": 0.0009950962459247198, + "loss": 1.5601, + "step": 808 + }, + { + "epoch": 0.07339865723099256, + "grad_norm": 0.28877009318559105, + "learning_rate": 0.0009950756974099653, + "loss": 1.5519, + "step": 809 + }, + { + "epoch": 0.07348938486663037, + "grad_norm": 0.2802019670374271, + "learning_rate": 0.000995055106145351, + "loss": 1.5275, + "step": 810 + }, + { + "epoch": 0.07358011250226819, + "grad_norm": 0.2892397591620191, + "learning_rate": 0.0009950344721326556, + "loss": 1.5464, + "step": 811 + }, + { + "epoch": 0.07367084013790601, + "grad_norm": 0.29419096286479196, + "learning_rate": 0.0009950137953736605, + "loss": 1.5581, + "step": 812 + }, + { + "epoch": 0.07376156777354383, + "grad_norm": 0.2852739348945415, + "learning_rate": 0.000994993075870151, + "loss": 1.6072, + "step": 813 + }, + { + "epoch": 0.07385229540918163, + "grad_norm": 0.28767951130416153, + "learning_rate": 0.0009949723136239168, + "loss": 1.5964, + "step": 814 + }, + { + "epoch": 0.07394302304481945, + "grad_norm": 0.2995906379318244, + "learning_rate": 0.0009949515086367501, + "loss": 1.5912, + "step": 815 + }, + { + "epoch": 0.07403375068045727, + "grad_norm": 0.2808103906044148, + "learning_rate": 0.0009949306609104479, + "loss": 1.5349, + "step": 816 + }, + { + "epoch": 0.07412447831609509, + "grad_norm": 0.2724775414925694, + "learning_rate": 0.00099490977044681, + "loss": 1.6103, + "step": 817 + }, + { + "epoch": 0.0742152059517329, + "grad_norm": 0.26628394031978175, + "learning_rate": 0.0009948888372476406, + "loss": 1.5441, + "step": 818 + }, + { + "epoch": 0.07430593358737071, + "grad_norm": 0.30833186332255663, + "learning_rate": 0.0009948678613147471, + "loss": 1.5199, + "step": 819 + }, + { + "epoch": 0.07439666122300853, + "grad_norm": 0.311419782233401, + "learning_rate": 0.0009948468426499407, + "loss": 1.5233, + "step": 820 + }, + { + "epoch": 0.07448738885864635, + "grad_norm": 0.27339113040386515, + "learning_rate": 0.0009948257812550365, + "loss": 1.5458, + "step": 821 + }, + { + "epoch": 0.07457811649428416, + "grad_norm": 0.30049756037737746, + "learning_rate": 0.0009948046771318536, + "loss": 1.5704, + "step": 822 + }, + { + "epoch": 0.07466884412992197, + "grad_norm": 0.2841987226337639, + "learning_rate": 0.0009947835302822135, + "loss": 1.513, + "step": 823 + }, + { + "epoch": 
0.07475957176555979, + "grad_norm": 0.299905276714924, + "learning_rate": 0.0009947623407079427, + "loss": 1.5842, + "step": 824 + }, + { + "epoch": 0.0748502994011976, + "grad_norm": 0.4142954063751624, + "learning_rate": 0.000994741108410871, + "loss": 1.5495, + "step": 825 + }, + { + "epoch": 0.07494102703683542, + "grad_norm": 0.29091605750815586, + "learning_rate": 0.0009947198333928316, + "loss": 1.5569, + "step": 826 + }, + { + "epoch": 0.07503175467247324, + "grad_norm": 0.2822913672568809, + "learning_rate": 0.0009946985156556614, + "loss": 1.5179, + "step": 827 + }, + { + "epoch": 0.07512248230811105, + "grad_norm": 0.28886109726059683, + "learning_rate": 0.0009946771552012017, + "loss": 1.5553, + "step": 828 + }, + { + "epoch": 0.07521320994374886, + "grad_norm": 0.2798141817706489, + "learning_rate": 0.000994655752031297, + "loss": 1.563, + "step": 829 + }, + { + "epoch": 0.07530393757938668, + "grad_norm": 0.33476685619453483, + "learning_rate": 0.0009946343061477947, + "loss": 1.5526, + "step": 830 + }, + { + "epoch": 0.0753946652150245, + "grad_norm": 0.27957960897649153, + "learning_rate": 0.0009946128175525475, + "loss": 1.5722, + "step": 831 + }, + { + "epoch": 0.07548539285066232, + "grad_norm": 0.31926765557778986, + "learning_rate": 0.0009945912862474105, + "loss": 1.5896, + "step": 832 + }, + { + "epoch": 0.07557612048630012, + "grad_norm": 0.271826332719705, + "learning_rate": 0.000994569712234243, + "loss": 1.5414, + "step": 833 + }, + { + "epoch": 0.07566684812193794, + "grad_norm": 0.32793847781998986, + "learning_rate": 0.000994548095514908, + "loss": 1.5821, + "step": 834 + }, + { + "epoch": 0.07575757575757576, + "grad_norm": 0.30435295722291783, + "learning_rate": 0.0009945264360912722, + "loss": 1.531, + "step": 835 + }, + { + "epoch": 0.07584830339321358, + "grad_norm": 0.3130180170208902, + "learning_rate": 0.0009945047339652057, + "loss": 1.5517, + "step": 836 + }, + { + "epoch": 0.07593903102885138, + "grad_norm": 0.29512398150459906, + "learning_rate": 0.0009944829891385825, + "loss": 1.5525, + "step": 837 + }, + { + "epoch": 0.0760297586644892, + "grad_norm": 0.2852206453697002, + "learning_rate": 0.0009944612016132802, + "loss": 1.5879, + "step": 838 + }, + { + "epoch": 0.07612048630012702, + "grad_norm": 0.30067741723678, + "learning_rate": 0.0009944393713911804, + "loss": 1.5299, + "step": 839 + }, + { + "epoch": 0.07621121393576484, + "grad_norm": 0.35692871257861547, + "learning_rate": 0.0009944174984741678, + "loss": 1.5796, + "step": 840 + }, + { + "epoch": 0.07630194157140265, + "grad_norm": 0.3068783754133656, + "learning_rate": 0.0009943955828641317, + "loss": 1.546, + "step": 841 + }, + { + "epoch": 0.07639266920704046, + "grad_norm": 0.30596526372644095, + "learning_rate": 0.0009943736245629638, + "loss": 1.5641, + "step": 842 + }, + { + "epoch": 0.07648339684267828, + "grad_norm": 0.6881679557607587, + "learning_rate": 0.0009943516235725606, + "loss": 1.5847, + "step": 843 + }, + { + "epoch": 0.07657412447831609, + "grad_norm": 0.39883633175094907, + "learning_rate": 0.000994329579894822, + "loss": 1.5695, + "step": 844 + }, + { + "epoch": 0.07666485211395391, + "grad_norm": 0.3789117921478979, + "learning_rate": 0.0009943074935316514, + "loss": 1.5578, + "step": 845 + }, + { + "epoch": 0.07675557974959173, + "grad_norm": 0.3050977411578192, + "learning_rate": 0.0009942853644849556, + "loss": 1.5311, + "step": 846 + }, + { + "epoch": 0.07684630738522955, + "grad_norm": 0.3031613261507933, + "learning_rate": 0.000994263192756646, + "loss": 
1.5866, + "step": 847 + }, + { + "epoch": 0.07693703502086735, + "grad_norm": 0.33765033890098367, + "learning_rate": 0.0009942409783486367, + "loss": 1.5526, + "step": 848 + }, + { + "epoch": 0.07702776265650517, + "grad_norm": 0.2755190009917202, + "learning_rate": 0.0009942187212628462, + "loss": 1.608, + "step": 849 + }, + { + "epoch": 0.07711849029214299, + "grad_norm": 0.28956722550940306, + "learning_rate": 0.000994196421501196, + "loss": 1.5572, + "step": 850 + }, + { + "epoch": 0.07720921792778081, + "grad_norm": 0.3022341430766907, + "learning_rate": 0.0009941740790656121, + "loss": 1.5419, + "step": 851 + }, + { + "epoch": 0.07729994556341861, + "grad_norm": 0.3002361769586046, + "learning_rate": 0.0009941516939580238, + "loss": 1.5595, + "step": 852 + }, + { + "epoch": 0.07739067319905643, + "grad_norm": 0.3036474800777086, + "learning_rate": 0.0009941292661803638, + "loss": 1.5935, + "step": 853 + }, + { + "epoch": 0.07748140083469425, + "grad_norm": 0.31213422828759774, + "learning_rate": 0.0009941067957345688, + "loss": 1.5323, + "step": 854 + }, + { + "epoch": 0.07757212847033207, + "grad_norm": 0.2937625564699382, + "learning_rate": 0.0009940842826225793, + "loss": 1.5805, + "step": 855 + }, + { + "epoch": 0.07766285610596987, + "grad_norm": 0.292980820016962, + "learning_rate": 0.000994061726846339, + "loss": 1.5166, + "step": 856 + }, + { + "epoch": 0.07775358374160769, + "grad_norm": 0.31146771164332426, + "learning_rate": 0.0009940391284077958, + "loss": 1.543, + "step": 857 + }, + { + "epoch": 0.07784431137724551, + "grad_norm": 0.34221804073998574, + "learning_rate": 0.000994016487308901, + "loss": 1.5452, + "step": 858 + }, + { + "epoch": 0.07793503901288333, + "grad_norm": 0.3237705512004156, + "learning_rate": 0.0009939938035516099, + "loss": 1.5652, + "step": 859 + }, + { + "epoch": 0.07802576664852114, + "grad_norm": 0.31666015266289566, + "learning_rate": 0.000993971077137881, + "loss": 1.5652, + "step": 860 + }, + { + "epoch": 0.07811649428415895, + "grad_norm": 0.3345269071616816, + "learning_rate": 0.0009939483080696767, + "loss": 1.5422, + "step": 861 + }, + { + "epoch": 0.07820722191979677, + "grad_norm": 0.33400546121101105, + "learning_rate": 0.0009939254963489633, + "loss": 1.5824, + "step": 862 + }, + { + "epoch": 0.07829794955543458, + "grad_norm": 0.379297826324954, + "learning_rate": 0.0009939026419777104, + "loss": 1.5457, + "step": 863 + }, + { + "epoch": 0.0783886771910724, + "grad_norm": 0.3039021879102997, + "learning_rate": 0.0009938797449578916, + "loss": 1.5459, + "step": 864 + }, + { + "epoch": 0.07847940482671022, + "grad_norm": 0.31626696054326914, + "learning_rate": 0.000993856805291484, + "loss": 1.5499, + "step": 865 + }, + { + "epoch": 0.07857013246234804, + "grad_norm": 0.29618091259456364, + "learning_rate": 0.0009938338229804685, + "loss": 1.5587, + "step": 866 + }, + { + "epoch": 0.07866086009798584, + "grad_norm": 0.34033019006887, + "learning_rate": 0.0009938107980268297, + "loss": 1.553, + "step": 867 + }, + { + "epoch": 0.07875158773362366, + "grad_norm": 0.31580884196037673, + "learning_rate": 0.0009937877304325555, + "loss": 1.5545, + "step": 868 + }, + { + "epoch": 0.07884231536926148, + "grad_norm": 0.3266631346424324, + "learning_rate": 0.0009937646201996382, + "loss": 1.5452, + "step": 869 + }, + { + "epoch": 0.0789330430048993, + "grad_norm": 0.3673401877257416, + "learning_rate": 0.000993741467330073, + "loss": 1.589, + "step": 870 + }, + { + "epoch": 0.0790237706405371, + "grad_norm": 0.4218658711434203, + 
"learning_rate": 0.0009937182718258596, + "loss": 1.5843, + "step": 871 + }, + { + "epoch": 0.07911449827617492, + "grad_norm": 0.329537355866133, + "learning_rate": 0.0009936950336890003, + "loss": 1.5504, + "step": 872 + }, + { + "epoch": 0.07920522591181274, + "grad_norm": 0.2983508066138841, + "learning_rate": 0.0009936717529215023, + "loss": 1.5413, + "step": 873 + }, + { + "epoch": 0.07929595354745056, + "grad_norm": 0.29309344568345524, + "learning_rate": 0.0009936484295253755, + "loss": 1.6012, + "step": 874 + }, + { + "epoch": 0.07938668118308836, + "grad_norm": 0.3069203353561492, + "learning_rate": 0.0009936250635026343, + "loss": 1.5447, + "step": 875 + }, + { + "epoch": 0.07947740881872618, + "grad_norm": 0.29416242291556977, + "learning_rate": 0.000993601654855296, + "loss": 1.5669, + "step": 876 + }, + { + "epoch": 0.079568136454364, + "grad_norm": 0.2775751731655097, + "learning_rate": 0.000993578203585382, + "loss": 1.5698, + "step": 877 + }, + { + "epoch": 0.07965886409000182, + "grad_norm": 0.2981020937281569, + "learning_rate": 0.0009935547096949174, + "loss": 1.5408, + "step": 878 + }, + { + "epoch": 0.07974959172563963, + "grad_norm": 0.2962842893852791, + "learning_rate": 0.0009935311731859308, + "loss": 1.563, + "step": 879 + }, + { + "epoch": 0.07984031936127745, + "grad_norm": 0.31251839070589466, + "learning_rate": 0.000993507594060455, + "loss": 1.5615, + "step": 880 + }, + { + "epoch": 0.07993104699691526, + "grad_norm": 0.27067389184504426, + "learning_rate": 0.0009934839723205254, + "loss": 1.5395, + "step": 881 + }, + { + "epoch": 0.08002177463255307, + "grad_norm": 0.4155088238949986, + "learning_rate": 0.000993460307968182, + "loss": 1.578, + "step": 882 + }, + { + "epoch": 0.08011250226819089, + "grad_norm": 0.3146207833865088, + "learning_rate": 0.0009934366010054686, + "loss": 1.5339, + "step": 883 + }, + { + "epoch": 0.08020322990382871, + "grad_norm": 0.30274200045255373, + "learning_rate": 0.0009934128514344318, + "loss": 1.524, + "step": 884 + }, + { + "epoch": 0.08029395753946653, + "grad_norm": 0.29539999927097715, + "learning_rate": 0.0009933890592571223, + "loss": 1.5613, + "step": 885 + }, + { + "epoch": 0.08038468517510433, + "grad_norm": 0.2870964295114381, + "learning_rate": 0.0009933652244755952, + "loss": 1.5253, + "step": 886 + }, + { + "epoch": 0.08047541281074215, + "grad_norm": 0.2998178084134325, + "learning_rate": 0.0009933413470919082, + "loss": 1.6028, + "step": 887 + }, + { + "epoch": 0.08056614044637997, + "grad_norm": 0.3127950812644323, + "learning_rate": 0.000993317427108123, + "loss": 1.5883, + "step": 888 + }, + { + "epoch": 0.08065686808201779, + "grad_norm": 0.29975795721905385, + "learning_rate": 0.0009932934645263054, + "loss": 1.5603, + "step": 889 + }, + { + "epoch": 0.0807475957176556, + "grad_norm": 0.29650237164729276, + "learning_rate": 0.0009932694593485242, + "loss": 1.5423, + "step": 890 + }, + { + "epoch": 0.08083832335329341, + "grad_norm": 0.2898570887232849, + "learning_rate": 0.0009932454115768527, + "loss": 1.5197, + "step": 891 + }, + { + "epoch": 0.08092905098893123, + "grad_norm": 0.2705405734410793, + "learning_rate": 0.0009932213212133672, + "loss": 1.5495, + "step": 892 + }, + { + "epoch": 0.08101977862456905, + "grad_norm": 0.26310215789373004, + "learning_rate": 0.000993197188260148, + "loss": 1.5706, + "step": 893 + }, + { + "epoch": 0.08111050626020685, + "grad_norm": 0.3100260126038494, + "learning_rate": 0.000993173012719279, + "loss": 1.605, + "step": 894 + }, + { + "epoch": 
0.08120123389584467, + "grad_norm": 0.2674618936022376, + "learning_rate": 0.0009931487945928472, + "loss": 1.5094, + "step": 895 + }, + { + "epoch": 0.08129196153148249, + "grad_norm": 0.38963944871821393, + "learning_rate": 0.0009931245338829448, + "loss": 1.5658, + "step": 896 + }, + { + "epoch": 0.08138268916712031, + "grad_norm": 0.27162175071808975, + "learning_rate": 0.0009931002305916658, + "loss": 1.5743, + "step": 897 + }, + { + "epoch": 0.08147341680275812, + "grad_norm": 0.27635711687704384, + "learning_rate": 0.0009930758847211095, + "loss": 1.5381, + "step": 898 + }, + { + "epoch": 0.08156414443839594, + "grad_norm": 0.31751066547183177, + "learning_rate": 0.0009930514962733776, + "loss": 1.5594, + "step": 899 + }, + { + "epoch": 0.08165487207403375, + "grad_norm": 0.2705343835334727, + "learning_rate": 0.0009930270652505767, + "loss": 1.513, + "step": 900 + }, + { + "epoch": 0.08174559970967156, + "grad_norm": 0.2778331017525457, + "learning_rate": 0.0009930025916548158, + "loss": 1.5037, + "step": 901 + }, + { + "epoch": 0.08183632734530938, + "grad_norm": 0.3378035677151701, + "learning_rate": 0.0009929780754882086, + "loss": 1.5325, + "step": 902 + }, + { + "epoch": 0.0819270549809472, + "grad_norm": 0.26018182793531225, + "learning_rate": 0.0009929535167528718, + "loss": 1.5572, + "step": 903 + }, + { + "epoch": 0.08201778261658502, + "grad_norm": 0.27392491160332305, + "learning_rate": 0.0009929289154509261, + "loss": 1.523, + "step": 904 + }, + { + "epoch": 0.08210851025222282, + "grad_norm": 0.27486192134154885, + "learning_rate": 0.000992904271584496, + "loss": 1.5233, + "step": 905 + }, + { + "epoch": 0.08219923788786064, + "grad_norm": 0.32064815736244323, + "learning_rate": 0.0009928795851557096, + "loss": 1.5118, + "step": 906 + }, + { + "epoch": 0.08228996552349846, + "grad_norm": 0.27826274008032026, + "learning_rate": 0.0009928548561666981, + "loss": 1.539, + "step": 907 + }, + { + "epoch": 0.08238069315913628, + "grad_norm": 0.2961622196153335, + "learning_rate": 0.0009928300846195972, + "loss": 1.5599, + "step": 908 + }, + { + "epoch": 0.08247142079477408, + "grad_norm": 0.27319277273298026, + "learning_rate": 0.0009928052705165458, + "loss": 1.5467, + "step": 909 + }, + { + "epoch": 0.0825621484304119, + "grad_norm": 0.28591671341000924, + "learning_rate": 0.0009927804138596867, + "loss": 1.5802, + "step": 910 + }, + { + "epoch": 0.08265287606604972, + "grad_norm": 0.25777649661544266, + "learning_rate": 0.0009927555146511664, + "loss": 1.5784, + "step": 911 + }, + { + "epoch": 0.08274360370168754, + "grad_norm": 0.26934773030474396, + "learning_rate": 0.0009927305728931347, + "loss": 1.4966, + "step": 912 + }, + { + "epoch": 0.08283433133732535, + "grad_norm": 0.2608946247994796, + "learning_rate": 0.0009927055885877454, + "loss": 1.5577, + "step": 913 + }, + { + "epoch": 0.08292505897296316, + "grad_norm": 0.24869774692327046, + "learning_rate": 0.0009926805617371556, + "loss": 1.5136, + "step": 914 + }, + { + "epoch": 0.08301578660860098, + "grad_norm": 0.29107506521965276, + "learning_rate": 0.0009926554923435271, + "loss": 1.5455, + "step": 915 + }, + { + "epoch": 0.08310651424423879, + "grad_norm": 0.2648174204040714, + "learning_rate": 0.0009926303804090241, + "loss": 1.5385, + "step": 916 + }, + { + "epoch": 0.08319724187987661, + "grad_norm": 0.25760763378661083, + "learning_rate": 0.0009926052259358151, + "loss": 1.5529, + "step": 917 + }, + { + "epoch": 0.08328796951551443, + "grad_norm": 0.25685698379991445, + "learning_rate": 
0.0009925800289260723, + "loss": 1.547, + "step": 918 + }, + { + "epoch": 0.08337869715115225, + "grad_norm": 0.2819426973143449, + "learning_rate": 0.0009925547893819713, + "loss": 1.5617, + "step": 919 + }, + { + "epoch": 0.08346942478679005, + "grad_norm": 0.2601537966819283, + "learning_rate": 0.0009925295073056918, + "loss": 1.5427, + "step": 920 + }, + { + "epoch": 0.08356015242242787, + "grad_norm": 0.2637589115096302, + "learning_rate": 0.0009925041826994167, + "loss": 1.6176, + "step": 921 + }, + { + "epoch": 0.08365088005806569, + "grad_norm": 0.24828825260185122, + "learning_rate": 0.000992478815565333, + "loss": 1.5444, + "step": 922 + }, + { + "epoch": 0.08374160769370351, + "grad_norm": 0.24743242387192108, + "learning_rate": 0.0009924534059056306, + "loss": 1.5285, + "step": 923 + }, + { + "epoch": 0.08383233532934131, + "grad_norm": 0.2384911975840947, + "learning_rate": 0.0009924279537225043, + "loss": 1.5375, + "step": 924 + }, + { + "epoch": 0.08392306296497913, + "grad_norm": 0.24130719921338256, + "learning_rate": 0.0009924024590181515, + "loss": 1.5415, + "step": 925 + }, + { + "epoch": 0.08401379060061695, + "grad_norm": 0.24738842312391218, + "learning_rate": 0.0009923769217947739, + "loss": 1.5654, + "step": 926 + }, + { + "epoch": 0.08410451823625477, + "grad_norm": 0.2838610898920616, + "learning_rate": 0.0009923513420545763, + "loss": 1.552, + "step": 927 + }, + { + "epoch": 0.08419524587189257, + "grad_norm": 0.24799991925491538, + "learning_rate": 0.000992325719799768, + "loss": 1.5174, + "step": 928 + }, + { + "epoch": 0.08428597350753039, + "grad_norm": 0.2526419394626361, + "learning_rate": 0.000992300055032561, + "loss": 1.5278, + "step": 929 + }, + { + "epoch": 0.08437670114316821, + "grad_norm": 0.2412967685636844, + "learning_rate": 0.000992274347755172, + "loss": 1.5318, + "step": 930 + }, + { + "epoch": 0.08446742877880603, + "grad_norm": 0.24330943942896696, + "learning_rate": 0.0009922485979698202, + "loss": 1.5678, + "step": 931 + }, + { + "epoch": 0.08455815641444384, + "grad_norm": 0.23918600557722985, + "learning_rate": 0.0009922228056787295, + "loss": 1.5223, + "step": 932 + }, + { + "epoch": 0.08464888405008165, + "grad_norm": 0.2516249407192343, + "learning_rate": 0.000992196970884127, + "loss": 1.5199, + "step": 933 + }, + { + "epoch": 0.08473961168571947, + "grad_norm": 0.239946932400916, + "learning_rate": 0.0009921710935882437, + "loss": 1.5586, + "step": 934 + }, + { + "epoch": 0.08483033932135728, + "grad_norm": 0.28570341867782456, + "learning_rate": 0.0009921451737933138, + "loss": 1.5281, + "step": 935 + }, + { + "epoch": 0.0849210669569951, + "grad_norm": 0.2500823929983036, + "learning_rate": 0.0009921192115015757, + "loss": 1.5108, + "step": 936 + }, + { + "epoch": 0.08501179459263292, + "grad_norm": 0.2577813463323016, + "learning_rate": 0.000992093206715271, + "loss": 1.5391, + "step": 937 + }, + { + "epoch": 0.08510252222827074, + "grad_norm": 0.24799878553565247, + "learning_rate": 0.0009920671594366454, + "loss": 1.5177, + "step": 938 + }, + { + "epoch": 0.08519324986390854, + "grad_norm": 0.25652297671565993, + "learning_rate": 0.000992041069667948, + "loss": 1.5491, + "step": 939 + }, + { + "epoch": 0.08528397749954636, + "grad_norm": 0.2604338950236376, + "learning_rate": 0.000992014937411432, + "loss": 1.5601, + "step": 940 + }, + { + "epoch": 0.08537470513518418, + "grad_norm": 0.2760265103304715, + "learning_rate": 0.0009919887626693533, + "loss": 1.5263, + "step": 941 + }, + { + "epoch": 0.085465432770822, + 
"grad_norm": 0.26612916814818227, + "learning_rate": 0.0009919625454439726, + "loss": 1.5345, + "step": 942 + }, + { + "epoch": 0.0855561604064598, + "grad_norm": 0.2535089962420092, + "learning_rate": 0.0009919362857375535, + "loss": 1.509, + "step": 943 + }, + { + "epoch": 0.08564688804209762, + "grad_norm": 0.29583567330444266, + "learning_rate": 0.000991909983552364, + "loss": 1.5218, + "step": 944 + }, + { + "epoch": 0.08573761567773544, + "grad_norm": 0.2505318800760225, + "learning_rate": 0.0009918836388906744, + "loss": 1.5325, + "step": 945 + }, + { + "epoch": 0.08582834331337326, + "grad_norm": 0.25469092801408316, + "learning_rate": 0.0009918572517547603, + "loss": 1.5292, + "step": 946 + }, + { + "epoch": 0.08591907094901106, + "grad_norm": 0.2476873877111172, + "learning_rate": 0.0009918308221469002, + "loss": 1.56, + "step": 947 + }, + { + "epoch": 0.08600979858464888, + "grad_norm": 0.24679587838118391, + "learning_rate": 0.0009918043500693758, + "loss": 1.5726, + "step": 948 + }, + { + "epoch": 0.0861005262202867, + "grad_norm": 0.26342023188714564, + "learning_rate": 0.0009917778355244735, + "loss": 1.5235, + "step": 949 + }, + { + "epoch": 0.08619125385592452, + "grad_norm": 0.23224819734286978, + "learning_rate": 0.0009917512785144824, + "loss": 1.5561, + "step": 950 + }, + { + "epoch": 0.08628198149156233, + "grad_norm": 0.22271539751786437, + "learning_rate": 0.0009917246790416962, + "loss": 1.564, + "step": 951 + }, + { + "epoch": 0.08637270912720015, + "grad_norm": 0.2523992024856635, + "learning_rate": 0.0009916980371084112, + "loss": 1.5245, + "step": 952 + }, + { + "epoch": 0.08646343676283796, + "grad_norm": 0.2645523502578023, + "learning_rate": 0.0009916713527169285, + "loss": 1.5038, + "step": 953 + }, + { + "epoch": 0.08655416439847577, + "grad_norm": 0.2603422646393085, + "learning_rate": 0.000991644625869552, + "loss": 1.5325, + "step": 954 + }, + { + "epoch": 0.08664489203411359, + "grad_norm": 0.2726317245549138, + "learning_rate": 0.0009916178565685895, + "loss": 1.5125, + "step": 955 + }, + { + "epoch": 0.08673561966975141, + "grad_norm": 0.2496039544468738, + "learning_rate": 0.0009915910448163525, + "loss": 1.533, + "step": 956 + }, + { + "epoch": 0.08682634730538923, + "grad_norm": 0.28283293763263817, + "learning_rate": 0.0009915641906151566, + "loss": 1.5304, + "step": 957 + }, + { + "epoch": 0.08691707494102703, + "grad_norm": 0.2791282559979197, + "learning_rate": 0.0009915372939673201, + "loss": 1.5226, + "step": 958 + }, + { + "epoch": 0.08700780257666485, + "grad_norm": 0.2771664482635427, + "learning_rate": 0.000991510354875166, + "loss": 1.5406, + "step": 959 + }, + { + "epoch": 0.08709853021230267, + "grad_norm": 0.26897923791051237, + "learning_rate": 0.0009914833733410205, + "loss": 1.5468, + "step": 960 + }, + { + "epoch": 0.08718925784794049, + "grad_norm": 0.2573384235057213, + "learning_rate": 0.000991456349367213, + "loss": 1.494, + "step": 961 + }, + { + "epoch": 0.08727998548357829, + "grad_norm": 0.26552487998952407, + "learning_rate": 0.0009914292829560775, + "loss": 1.5171, + "step": 962 + }, + { + "epoch": 0.08737071311921611, + "grad_norm": 0.2849422949059693, + "learning_rate": 0.0009914021741099508, + "loss": 1.5077, + "step": 963 + }, + { + "epoch": 0.08746144075485393, + "grad_norm": 0.30254044739650426, + "learning_rate": 0.000991375022831174, + "loss": 1.5294, + "step": 964 + }, + { + "epoch": 0.08755216839049175, + "grad_norm": 0.2845859117637198, + "learning_rate": 0.0009913478291220914, + "loss": 1.543, + "step": 965 
+ }, + { + "epoch": 0.08764289602612955, + "grad_norm": 0.2756817232974474, + "learning_rate": 0.0009913205929850514, + "loss": 1.5557, + "step": 966 + }, + { + "epoch": 0.08773362366176737, + "grad_norm": 0.3518290787456415, + "learning_rate": 0.0009912933144224062, + "loss": 1.5884, + "step": 967 + }, + { + "epoch": 0.08782435129740519, + "grad_norm": 0.32655690458021386, + "learning_rate": 0.0009912659934365104, + "loss": 1.5421, + "step": 968 + }, + { + "epoch": 0.08791507893304301, + "grad_norm": 0.27905837109221854, + "learning_rate": 0.000991238630029724, + "loss": 1.5577, + "step": 969 + }, + { + "epoch": 0.08800580656868082, + "grad_norm": 0.2461157200992668, + "learning_rate": 0.000991211224204409, + "loss": 1.5137, + "step": 970 + }, + { + "epoch": 0.08809653420431864, + "grad_norm": 0.25648065714054724, + "learning_rate": 0.0009911837759629329, + "loss": 1.5331, + "step": 971 + }, + { + "epoch": 0.08818726183995645, + "grad_norm": 0.2579076545051402, + "learning_rate": 0.0009911562853076653, + "loss": 1.5155, + "step": 972 + }, + { + "epoch": 0.08827798947559426, + "grad_norm": 0.2641794693183072, + "learning_rate": 0.0009911287522409798, + "loss": 1.5146, + "step": 973 + }, + { + "epoch": 0.08836871711123208, + "grad_norm": 0.2841203171731336, + "learning_rate": 0.0009911011767652544, + "loss": 1.5359, + "step": 974 + }, + { + "epoch": 0.0884594447468699, + "grad_norm": 0.29539697694051986, + "learning_rate": 0.00099107355888287, + "loss": 1.5243, + "step": 975 + }, + { + "epoch": 0.08855017238250772, + "grad_norm": 0.2950395593250475, + "learning_rate": 0.0009910458985962112, + "loss": 1.5068, + "step": 976 + }, + { + "epoch": 0.08864090001814552, + "grad_norm": 0.2702013569296065, + "learning_rate": 0.0009910181959076668, + "loss": 1.5328, + "step": 977 + }, + { + "epoch": 0.08873162765378334, + "grad_norm": 0.2844026245656121, + "learning_rate": 0.0009909904508196288, + "loss": 1.5288, + "step": 978 + }, + { + "epoch": 0.08882235528942116, + "grad_norm": 0.23854168632668782, + "learning_rate": 0.000990962663334493, + "loss": 1.5621, + "step": 979 + }, + { + "epoch": 0.08891308292505898, + "grad_norm": 0.24286921106765377, + "learning_rate": 0.0009909348334546588, + "loss": 1.5484, + "step": 980 + }, + { + "epoch": 0.08900381056069678, + "grad_norm": 0.2685470533414942, + "learning_rate": 0.0009909069611825296, + "loss": 1.5241, + "step": 981 + }, + { + "epoch": 0.0890945381963346, + "grad_norm": 0.2536293574431169, + "learning_rate": 0.0009908790465205117, + "loss": 1.5561, + "step": 982 + }, + { + "epoch": 0.08918526583197242, + "grad_norm": 0.23730359147824848, + "learning_rate": 0.000990851089471016, + "loss": 1.4981, + "step": 983 + }, + { + "epoch": 0.08927599346761024, + "grad_norm": 0.2473453668227584, + "learning_rate": 0.0009908230900364564, + "loss": 1.4868, + "step": 984 + }, + { + "epoch": 0.08936672110324805, + "grad_norm": 0.32744156356174486, + "learning_rate": 0.0009907950482192505, + "loss": 1.5509, + "step": 985 + }, + { + "epoch": 0.08945744873888586, + "grad_norm": 0.25679938890920084, + "learning_rate": 0.0009907669640218197, + "loss": 1.5497, + "step": 986 + }, + { + "epoch": 0.08954817637452368, + "grad_norm": 0.859594274122978, + "learning_rate": 0.0009907388374465894, + "loss": 1.5755, + "step": 987 + }, + { + "epoch": 0.0896389040101615, + "grad_norm": 0.3281302733506906, + "learning_rate": 0.000990710668495988, + "loss": 1.5264, + "step": 988 + }, + { + "epoch": 0.08972963164579931, + "grad_norm": 0.2708458942137128, + "learning_rate": 
0.0009906824571724484, + "loss": 1.5262, + "step": 989 + }, + { + "epoch": 0.08982035928143713, + "grad_norm": 0.2705852864577125, + "learning_rate": 0.000990654203478406, + "loss": 1.5451, + "step": 990 + }, + { + "epoch": 0.08991108691707494, + "grad_norm": 0.2722822830623868, + "learning_rate": 0.000990625907416301, + "loss": 1.5336, + "step": 991 + }, + { + "epoch": 0.09000181455271275, + "grad_norm": 0.29162330792228097, + "learning_rate": 0.0009905975689885765, + "loss": 1.5232, + "step": 992 + }, + { + "epoch": 0.09009254218835057, + "grad_norm": 0.2705697899064425, + "learning_rate": 0.0009905691881976796, + "loss": 1.5311, + "step": 993 + }, + { + "epoch": 0.09018326982398839, + "grad_norm": 0.2830999635320247, + "learning_rate": 0.000990540765046061, + "loss": 1.5433, + "step": 994 + }, + { + "epoch": 0.0902739974596262, + "grad_norm": 0.25018248475063737, + "learning_rate": 0.000990512299536175, + "loss": 1.5334, + "step": 995 + }, + { + "epoch": 0.09036472509526401, + "grad_norm": 0.27995598207987576, + "learning_rate": 0.00099048379167048, + "loss": 1.5315, + "step": 996 + }, + { + "epoch": 0.09045545273090183, + "grad_norm": 0.26227849593930685, + "learning_rate": 0.0009904552414514366, + "loss": 1.5394, + "step": 997 + }, + { + "epoch": 0.09054618036653965, + "grad_norm": 0.25192214633237703, + "learning_rate": 0.0009904266488815114, + "loss": 1.5086, + "step": 998 + }, + { + "epoch": 0.09063690800217747, + "grad_norm": 0.2589919391921309, + "learning_rate": 0.0009903980139631726, + "loss": 1.5718, + "step": 999 + }, + { + "epoch": 0.09072763563781527, + "grad_norm": 0.2421671537442652, + "learning_rate": 0.000990369336698893, + "loss": 1.5185, + "step": 1000 + }, + { + "epoch": 0.09081836327345309, + "grad_norm": 0.3371025399245189, + "learning_rate": 0.000990340617091149, + "loss": 1.533, + "step": 1001 + }, + { + "epoch": 0.09090909090909091, + "grad_norm": 0.23872073694679727, + "learning_rate": 0.0009903118551424202, + "loss": 1.5294, + "step": 1002 + }, + { + "epoch": 0.09099981854472873, + "grad_norm": 0.24900248492168783, + "learning_rate": 0.0009902830508551907, + "loss": 1.5435, + "step": 1003 + }, + { + "epoch": 0.09109054618036654, + "grad_norm": 0.33238490296288314, + "learning_rate": 0.0009902542042319474, + "loss": 1.5249, + "step": 1004 + }, + { + "epoch": 0.09118127381600435, + "grad_norm": 0.32531866446320856, + "learning_rate": 0.0009902253152751811, + "loss": 1.5381, + "step": 1005 + }, + { + "epoch": 0.09127200145164217, + "grad_norm": 0.25320823399168985, + "learning_rate": 0.000990196383987387, + "loss": 1.5721, + "step": 1006 + }, + { + "epoch": 0.09136272908727998, + "grad_norm": 0.23934977334284382, + "learning_rate": 0.0009901674103710626, + "loss": 1.5746, + "step": 1007 + }, + { + "epoch": 0.0914534567229178, + "grad_norm": 0.26213087869723645, + "learning_rate": 0.0009901383944287102, + "loss": 1.5573, + "step": 1008 + }, + { + "epoch": 0.09154418435855562, + "grad_norm": 0.23694911906228688, + "learning_rate": 0.000990109336162835, + "loss": 1.4954, + "step": 1009 + }, + { + "epoch": 0.09163491199419344, + "grad_norm": 0.26230295908867524, + "learning_rate": 0.0009900802355759467, + "loss": 1.5651, + "step": 1010 + }, + { + "epoch": 0.09172563962983124, + "grad_norm": 0.250338521061609, + "learning_rate": 0.0009900510926705577, + "loss": 1.5554, + "step": 1011 + }, + { + "epoch": 0.09181636726546906, + "grad_norm": 0.2400826861690977, + "learning_rate": 0.0009900219074491846, + "loss": 1.5243, + "step": 1012 + }, + { + "epoch": 
0.09190709490110688, + "grad_norm": 0.23929338880629158, + "learning_rate": 0.0009899926799143476, + "loss": 1.5402, + "step": 1013 + }, + { + "epoch": 0.0919978225367447, + "grad_norm": 0.30069646701875213, + "learning_rate": 0.0009899634100685704, + "loss": 1.5489, + "step": 1014 + }, + { + "epoch": 0.0920885501723825, + "grad_norm": 0.2480555751850939, + "learning_rate": 0.0009899340979143804, + "loss": 1.5116, + "step": 1015 + }, + { + "epoch": 0.09217927780802032, + "grad_norm": 0.24600175364299248, + "learning_rate": 0.0009899047434543092, + "loss": 1.5286, + "step": 1016 + }, + { + "epoch": 0.09227000544365814, + "grad_norm": 0.26793410366112097, + "learning_rate": 0.000989875346690891, + "loss": 1.5027, + "step": 1017 + }, + { + "epoch": 0.09236073307929596, + "grad_norm": 0.2656891866431258, + "learning_rate": 0.0009898459076266642, + "loss": 1.5681, + "step": 1018 + }, + { + "epoch": 0.09245146071493376, + "grad_norm": 0.24036381893790526, + "learning_rate": 0.0009898164262641714, + "loss": 1.5346, + "step": 1019 + }, + { + "epoch": 0.09254218835057158, + "grad_norm": 0.24319782559259803, + "learning_rate": 0.0009897869026059577, + "loss": 1.5195, + "step": 1020 + }, + { + "epoch": 0.0926329159862094, + "grad_norm": 0.24403292006733462, + "learning_rate": 0.000989757336654573, + "loss": 1.5371, + "step": 1021 + }, + { + "epoch": 0.09272364362184722, + "grad_norm": 0.2411175471830929, + "learning_rate": 0.0009897277284125702, + "loss": 1.5247, + "step": 1022 + }, + { + "epoch": 0.09281437125748503, + "grad_norm": 0.24138414625131285, + "learning_rate": 0.0009896980778825058, + "loss": 1.5184, + "step": 1023 + }, + { + "epoch": 0.09290509889312284, + "grad_norm": 0.24060490014023186, + "learning_rate": 0.0009896683850669401, + "loss": 1.532, + "step": 1024 + }, + { + "epoch": 0.09299582652876066, + "grad_norm": 0.2514277462302056, + "learning_rate": 0.000989638649968437, + "loss": 1.5636, + "step": 1025 + }, + { + "epoch": 0.09308655416439847, + "grad_norm": 0.24160748041143223, + "learning_rate": 0.0009896088725895647, + "loss": 1.5478, + "step": 1026 + }, + { + "epoch": 0.09317728180003629, + "grad_norm": 0.2610751351484801, + "learning_rate": 0.000989579052932894, + "loss": 1.5107, + "step": 1027 + }, + { + "epoch": 0.0932680094356741, + "grad_norm": 0.2467383482829909, + "learning_rate": 0.0009895491910009997, + "loss": 1.537, + "step": 1028 + }, + { + "epoch": 0.09335873707131193, + "grad_norm": 0.2560566791250042, + "learning_rate": 0.0009895192867964608, + "loss": 1.5393, + "step": 1029 + }, + { + "epoch": 0.09344946470694973, + "grad_norm": 0.24819427733997595, + "learning_rate": 0.0009894893403218593, + "loss": 1.5826, + "step": 1030 + }, + { + "epoch": 0.09354019234258755, + "grad_norm": 0.2384184749229187, + "learning_rate": 0.0009894593515797812, + "loss": 1.5262, + "step": 1031 + }, + { + "epoch": 0.09363091997822537, + "grad_norm": 0.24382571676988204, + "learning_rate": 0.0009894293205728157, + "loss": 1.5437, + "step": 1032 + }, + { + "epoch": 0.09372164761386319, + "grad_norm": 0.26172641288886267, + "learning_rate": 0.0009893992473035563, + "loss": 1.5507, + "step": 1033 + }, + { + "epoch": 0.09381237524950099, + "grad_norm": 0.22887000630769783, + "learning_rate": 0.0009893691317745998, + "loss": 1.5433, + "step": 1034 + }, + { + "epoch": 0.09390310288513881, + "grad_norm": 0.2426145442280011, + "learning_rate": 0.0009893389739885467, + "loss": 1.5173, + "step": 1035 + }, + { + "epoch": 0.09399383052077663, + "grad_norm": 0.2328069651859722, + "learning_rate": 
0.0009893087739480011, + "loss": 1.5084, + "step": 1036 + }, + { + "epoch": 0.09408455815641445, + "grad_norm": 0.2602581819814345, + "learning_rate": 0.0009892785316555708, + "loss": 1.521, + "step": 1037 + }, + { + "epoch": 0.09417528579205225, + "grad_norm": 0.2423885905364033, + "learning_rate": 0.000989248247113867, + "loss": 1.5463, + "step": 1038 + }, + { + "epoch": 0.09426601342769007, + "grad_norm": 0.24881014435692236, + "learning_rate": 0.0009892179203255052, + "loss": 1.5354, + "step": 1039 + }, + { + "epoch": 0.09435674106332789, + "grad_norm": 0.269439893161174, + "learning_rate": 0.0009891875512931037, + "loss": 1.5041, + "step": 1040 + }, + { + "epoch": 0.09444746869896571, + "grad_norm": 0.2946427030906177, + "learning_rate": 0.000989157140019285, + "loss": 1.5387, + "step": 1041 + }, + { + "epoch": 0.09453819633460352, + "grad_norm": 0.24324392689582888, + "learning_rate": 0.0009891266865066752, + "loss": 1.5156, + "step": 1042 + }, + { + "epoch": 0.09462892397024134, + "grad_norm": 0.24250526727572252, + "learning_rate": 0.0009890961907579041, + "loss": 1.5244, + "step": 1043 + }, + { + "epoch": 0.09471965160587915, + "grad_norm": 0.2287969791678899, + "learning_rate": 0.0009890656527756047, + "loss": 1.5395, + "step": 1044 + }, + { + "epoch": 0.09481037924151696, + "grad_norm": 0.29561234271983117, + "learning_rate": 0.0009890350725624143, + "loss": 1.5265, + "step": 1045 + }, + { + "epoch": 0.09490110687715478, + "grad_norm": 0.23594854985115662, + "learning_rate": 0.000989004450120973, + "loss": 1.5469, + "step": 1046 + }, + { + "epoch": 0.0949918345127926, + "grad_norm": 0.23796615591149767, + "learning_rate": 0.0009889737854539254, + "loss": 1.5427, + "step": 1047 + }, + { + "epoch": 0.09508256214843042, + "grad_norm": 0.22902283361669032, + "learning_rate": 0.0009889430785639194, + "loss": 1.5274, + "step": 1048 + }, + { + "epoch": 0.09517328978406822, + "grad_norm": 0.2278528857729931, + "learning_rate": 0.0009889123294536068, + "loss": 1.4997, + "step": 1049 + }, + { + "epoch": 0.09526401741970604, + "grad_norm": 0.21333482985484856, + "learning_rate": 0.0009888815381256422, + "loss": 1.5024, + "step": 1050 + }, + { + "epoch": 0.09535474505534386, + "grad_norm": 0.22351728256093603, + "learning_rate": 0.0009888507045826846, + "loss": 1.528, + "step": 1051 + }, + { + "epoch": 0.09544547269098168, + "grad_norm": 0.22632301990147985, + "learning_rate": 0.0009888198288273968, + "loss": 1.5307, + "step": 1052 + }, + { + "epoch": 0.09553620032661948, + "grad_norm": 0.23587164547508063, + "learning_rate": 0.000988788910862445, + "loss": 1.5165, + "step": 1053 + }, + { + "epoch": 0.0956269279622573, + "grad_norm": 0.23494334838773073, + "learning_rate": 0.0009887579506904982, + "loss": 1.5431, + "step": 1054 + }, + { + "epoch": 0.09571765559789512, + "grad_norm": 0.243363355291682, + "learning_rate": 0.0009887269483142304, + "loss": 1.5076, + "step": 1055 + }, + { + "epoch": 0.09580838323353294, + "grad_norm": 0.2300728621444557, + "learning_rate": 0.0009886959037363188, + "loss": 1.5191, + "step": 1056 + }, + { + "epoch": 0.09589911086917075, + "grad_norm": 0.2353344862258975, + "learning_rate": 0.0009886648169594439, + "loss": 1.5137, + "step": 1057 + }, + { + "epoch": 0.09598983850480856, + "grad_norm": 0.27312075201521924, + "learning_rate": 0.00098863368798629, + "loss": 1.5156, + "step": 1058 + }, + { + "epoch": 0.09608056614044638, + "grad_norm": 0.2240039365526822, + "learning_rate": 0.000988602516819545, + "loss": 1.4748, + "step": 1059 + }, + { + "epoch": 
0.0961712937760842, + "grad_norm": 0.22180920509047372, + "learning_rate": 0.0009885713034619007, + "loss": 1.4897, + "step": 1060 + }, + { + "epoch": 0.096262021411722, + "grad_norm": 0.2836038109892051, + "learning_rate": 0.0009885400479160524, + "loss": 1.5189, + "step": 1061 + }, + { + "epoch": 0.09635274904735983, + "grad_norm": 0.2760187495911597, + "learning_rate": 0.000988508750184699, + "loss": 1.5117, + "step": 1062 + }, + { + "epoch": 0.09644347668299764, + "grad_norm": 0.23761595316703799, + "learning_rate": 0.000988477410270543, + "loss": 1.54, + "step": 1063 + }, + { + "epoch": 0.09653420431863545, + "grad_norm": 0.262145001421214, + "learning_rate": 0.0009884460281762905, + "loss": 1.5077, + "step": 1064 + }, + { + "epoch": 0.09662493195427327, + "grad_norm": 0.23811682463023015, + "learning_rate": 0.0009884146039046515, + "loss": 1.5424, + "step": 1065 + }, + { + "epoch": 0.09671565958991109, + "grad_norm": 0.23794676591030395, + "learning_rate": 0.0009883831374583396, + "loss": 1.4911, + "step": 1066 + }, + { + "epoch": 0.0968063872255489, + "grad_norm": 0.24781832512086627, + "learning_rate": 0.0009883516288400718, + "loss": 1.5056, + "step": 1067 + }, + { + "epoch": 0.09689711486118671, + "grad_norm": 0.24921996809886357, + "learning_rate": 0.0009883200780525687, + "loss": 1.526, + "step": 1068 + }, + { + "epoch": 0.09698784249682453, + "grad_norm": 0.2420986569118293, + "learning_rate": 0.000988288485098555, + "loss": 1.5712, + "step": 1069 + }, + { + "epoch": 0.09707857013246235, + "grad_norm": 0.22909467472036457, + "learning_rate": 0.0009882568499807586, + "loss": 1.5492, + "step": 1070 + }, + { + "epoch": 0.09716929776810017, + "grad_norm": 0.2453217603464456, + "learning_rate": 0.0009882251727019113, + "loss": 1.5792, + "step": 1071 + }, + { + "epoch": 0.09726002540373797, + "grad_norm": 0.24482834032878814, + "learning_rate": 0.0009881934532647483, + "loss": 1.5404, + "step": 1072 + }, + { + "epoch": 0.09735075303937579, + "grad_norm": 0.24566365200318738, + "learning_rate": 0.0009881616916720087, + "loss": 1.491, + "step": 1073 + }, + { + "epoch": 0.09744148067501361, + "grad_norm": 0.253441653697992, + "learning_rate": 0.0009881298879264352, + "loss": 1.5184, + "step": 1074 + }, + { + "epoch": 0.09753220831065143, + "grad_norm": 0.2333179393211054, + "learning_rate": 0.0009880980420307738, + "loss": 1.5539, + "step": 1075 + }, + { + "epoch": 0.09762293594628924, + "grad_norm": 0.2596653591895023, + "learning_rate": 0.0009880661539877745, + "loss": 1.503, + "step": 1076 + }, + { + "epoch": 0.09771366358192705, + "grad_norm": 0.24314799657627903, + "learning_rate": 0.0009880342238001909, + "loss": 1.5527, + "step": 1077 + }, + { + "epoch": 0.09780439121756487, + "grad_norm": 0.2466298835377919, + "learning_rate": 0.0009880022514707804, + "loss": 1.5051, + "step": 1078 + }, + { + "epoch": 0.09789511885320269, + "grad_norm": 0.24004006415899345, + "learning_rate": 0.000987970237002303, + "loss": 1.5708, + "step": 1079 + }, + { + "epoch": 0.0979858464888405, + "grad_norm": 0.2349325194644158, + "learning_rate": 0.0009879381803975242, + "loss": 1.5427, + "step": 1080 + }, + { + "epoch": 0.09807657412447832, + "grad_norm": 0.24221084457588143, + "learning_rate": 0.0009879060816592115, + "loss": 1.5363, + "step": 1081 + }, + { + "epoch": 0.09816730176011614, + "grad_norm": 0.42168262801741924, + "learning_rate": 0.0009878739407901368, + "loss": 1.5116, + "step": 1082 + }, + { + "epoch": 0.09825802939575394, + "grad_norm": 0.25054100825456693, + "learning_rate": 
0.0009878417577930752, + "loss": 1.5038, + "step": 1083 + }, + { + "epoch": 0.09834875703139176, + "grad_norm": 0.26962119424406134, + "learning_rate": 0.000987809532670806, + "loss": 1.5085, + "step": 1084 + }, + { + "epoch": 0.09843948466702958, + "grad_norm": 0.2689116876685714, + "learning_rate": 0.0009877772654261119, + "loss": 1.5659, + "step": 1085 + }, + { + "epoch": 0.0985302123026674, + "grad_norm": 0.23726880683446464, + "learning_rate": 0.0009877449560617788, + "loss": 1.5588, + "step": 1086 + }, + { + "epoch": 0.0986209399383052, + "grad_norm": 0.24400437177424794, + "learning_rate": 0.0009877126045805971, + "loss": 1.522, + "step": 1087 + }, + { + "epoch": 0.09871166757394302, + "grad_norm": 0.21944528295558666, + "learning_rate": 0.00098768021098536, + "loss": 1.5352, + "step": 1088 + }, + { + "epoch": 0.09880239520958084, + "grad_norm": 0.277528771457675, + "learning_rate": 0.0009876477752788647, + "loss": 1.5113, + "step": 1089 + }, + { + "epoch": 0.09889312284521866, + "grad_norm": 0.23728869689470775, + "learning_rate": 0.0009876152974639123, + "loss": 1.5025, + "step": 1090 + }, + { + "epoch": 0.09898385048085646, + "grad_norm": 0.30766698467769465, + "learning_rate": 0.000987582777543307, + "loss": 1.5193, + "step": 1091 + }, + { + "epoch": 0.09907457811649428, + "grad_norm": 0.22724585928206203, + "learning_rate": 0.000987550215519857, + "loss": 1.5071, + "step": 1092 + }, + { + "epoch": 0.0991653057521321, + "grad_norm": 0.27627248514652997, + "learning_rate": 0.0009875176113963739, + "loss": 1.4457, + "step": 1093 + }, + { + "epoch": 0.09925603338776992, + "grad_norm": 0.2453845436181277, + "learning_rate": 0.0009874849651756734, + "loss": 1.5675, + "step": 1094 + }, + { + "epoch": 0.09934676102340773, + "grad_norm": 0.2547986954099667, + "learning_rate": 0.0009874522768605744, + "loss": 1.5013, + "step": 1095 + }, + { + "epoch": 0.09943748865904554, + "grad_norm": 0.23131681395440118, + "learning_rate": 0.0009874195464538993, + "loss": 1.5712, + "step": 1096 + }, + { + "epoch": 0.09952821629468336, + "grad_norm": 0.2978079192573597, + "learning_rate": 0.0009873867739584746, + "loss": 1.5327, + "step": 1097 + }, + { + "epoch": 0.09961894393032118, + "grad_norm": 0.22363331538310116, + "learning_rate": 0.00098735395937713, + "loss": 1.495, + "step": 1098 + }, + { + "epoch": 0.09970967156595899, + "grad_norm": 0.2333508083191555, + "learning_rate": 0.0009873211027126992, + "loss": 1.5289, + "step": 1099 + }, + { + "epoch": 0.0998003992015968, + "grad_norm": 0.25987497648177926, + "learning_rate": 0.0009872882039680194, + "loss": 1.5484, + "step": 1100 + }, + { + "epoch": 0.09989112683723463, + "grad_norm": 0.232897023884946, + "learning_rate": 0.0009872552631459315, + "loss": 1.5125, + "step": 1101 + }, + { + "epoch": 0.09998185447287243, + "grad_norm": 0.24289901786321708, + "learning_rate": 0.0009872222802492796, + "loss": 1.5184, + "step": 1102 + }, + { + "epoch": 0.10007258210851025, + "grad_norm": 0.23049762109816735, + "learning_rate": 0.0009871892552809122, + "loss": 1.5427, + "step": 1103 + }, + { + "epoch": 0.10016330974414807, + "grad_norm": 0.2472473530906148, + "learning_rate": 0.0009871561882436805, + "loss": 1.5526, + "step": 1104 + }, + { + "epoch": 0.10025403737978589, + "grad_norm": 0.24211878778443127, + "learning_rate": 0.0009871230791404404, + "loss": 1.4987, + "step": 1105 + }, + { + "epoch": 0.10034476501542369, + "grad_norm": 0.2451640482680216, + "learning_rate": 0.0009870899279740507, + "loss": 1.5315, + "step": 1106 + }, + { + "epoch": 
0.10043549265106151, + "grad_norm": 0.22745781354058006, + "learning_rate": 0.0009870567347473737, + "loss": 1.5134, + "step": 1107 + }, + { + "epoch": 0.10052622028669933, + "grad_norm": 0.23688209775452046, + "learning_rate": 0.000987023499463276, + "loss": 1.4951, + "step": 1108 + }, + { + "epoch": 0.10061694792233715, + "grad_norm": 0.24650146402943368, + "learning_rate": 0.0009869902221246272, + "loss": 1.5179, + "step": 1109 + }, + { + "epoch": 0.10070767555797495, + "grad_norm": 0.24279699655750797, + "learning_rate": 0.0009869569027343011, + "loss": 1.4625, + "step": 1110 + }, + { + "epoch": 0.10079840319361277, + "grad_norm": 0.24617573272572474, + "learning_rate": 0.0009869235412951745, + "loss": 1.478, + "step": 1111 + }, + { + "epoch": 0.10088913082925059, + "grad_norm": 0.26014172656491413, + "learning_rate": 0.0009868901378101286, + "loss": 1.5306, + "step": 1112 + }, + { + "epoch": 0.10097985846488841, + "grad_norm": 0.2686046126177649, + "learning_rate": 0.0009868566922820474, + "loss": 1.5036, + "step": 1113 + }, + { + "epoch": 0.10107058610052622, + "grad_norm": 0.23548321818993542, + "learning_rate": 0.000986823204713819, + "loss": 1.5299, + "step": 1114 + }, + { + "epoch": 0.10116131373616404, + "grad_norm": 0.24222648700032745, + "learning_rate": 0.0009867896751083352, + "loss": 1.5292, + "step": 1115 + }, + { + "epoch": 0.10125204137180185, + "grad_norm": 0.26524136316038954, + "learning_rate": 0.0009867561034684912, + "loss": 1.5224, + "step": 1116 + }, + { + "epoch": 0.10134276900743966, + "grad_norm": 0.2294209815959494, + "learning_rate": 0.000986722489797186, + "loss": 1.4985, + "step": 1117 + }, + { + "epoch": 0.10143349664307748, + "grad_norm": 0.23174509887168995, + "learning_rate": 0.000986688834097322, + "loss": 1.5603, + "step": 1118 + }, + { + "epoch": 0.1015242242787153, + "grad_norm": 0.24697825739776935, + "learning_rate": 0.0009866551363718055, + "loss": 1.5068, + "step": 1119 + }, + { + "epoch": 0.10161495191435312, + "grad_norm": 0.28464362463522336, + "learning_rate": 0.000986621396623546, + "loss": 1.5413, + "step": 1120 + }, + { + "epoch": 0.10170567954999092, + "grad_norm": 0.2487311591224258, + "learning_rate": 0.0009865876148554575, + "loss": 1.5362, + "step": 1121 + }, + { + "epoch": 0.10179640718562874, + "grad_norm": 0.24893126272170243, + "learning_rate": 0.0009865537910704566, + "loss": 1.5573, + "step": 1122 + }, + { + "epoch": 0.10188713482126656, + "grad_norm": 0.23408355303962342, + "learning_rate": 0.000986519925271464, + "loss": 1.5435, + "step": 1123 + }, + { + "epoch": 0.10197786245690438, + "grad_norm": 0.23185796868911782, + "learning_rate": 0.0009864860174614045, + "loss": 1.523, + "step": 1124 + }, + { + "epoch": 0.10206859009254218, + "grad_norm": 0.23032960723518864, + "learning_rate": 0.0009864520676432053, + "loss": 1.5163, + "step": 1125 + }, + { + "epoch": 0.10215931772818, + "grad_norm": 0.23717534670240503, + "learning_rate": 0.0009864180758197988, + "loss": 1.5662, + "step": 1126 + }, + { + "epoch": 0.10225004536381782, + "grad_norm": 0.23715737217578628, + "learning_rate": 0.0009863840419941194, + "loss": 1.5633, + "step": 1127 + }, + { + "epoch": 0.10234077299945564, + "grad_norm": 0.25108488376652244, + "learning_rate": 0.0009863499661691064, + "loss": 1.501, + "step": 1128 + }, + { + "epoch": 0.10243150063509344, + "grad_norm": 0.25068274387218004, + "learning_rate": 0.000986315848347702, + "loss": 1.5212, + "step": 1129 + }, + { + "epoch": 0.10252222827073126, + "grad_norm": 0.2191993115932023, + "learning_rate": 
0.0009862816885328526, + "loss": 1.5416, + "step": 1130 + }, + { + "epoch": 0.10261295590636908, + "grad_norm": 0.23811143323257994, + "learning_rate": 0.0009862474867275077, + "loss": 1.4965, + "step": 1131 + }, + { + "epoch": 0.1027036835420069, + "grad_norm": 0.23438274602202872, + "learning_rate": 0.0009862132429346207, + "loss": 1.5095, + "step": 1132 + }, + { + "epoch": 0.1027944111776447, + "grad_norm": 0.2210515467219483, + "learning_rate": 0.0009861789571571483, + "loss": 1.4882, + "step": 1133 + }, + { + "epoch": 0.10288513881328253, + "grad_norm": 0.22398112991286076, + "learning_rate": 0.0009861446293980517, + "loss": 1.5401, + "step": 1134 + }, + { + "epoch": 0.10297586644892034, + "grad_norm": 0.2657772490498837, + "learning_rate": 0.0009861102596602942, + "loss": 1.5073, + "step": 1135 + }, + { + "epoch": 0.10306659408455815, + "grad_norm": 0.2790679938278512, + "learning_rate": 0.0009860758479468446, + "loss": 1.4921, + "step": 1136 + }, + { + "epoch": 0.10315732172019597, + "grad_norm": 0.24888377757630117, + "learning_rate": 0.0009860413942606736, + "loss": 1.5143, + "step": 1137 + }, + { + "epoch": 0.10324804935583379, + "grad_norm": 0.24366005048298875, + "learning_rate": 0.0009860068986047566, + "loss": 1.5967, + "step": 1138 + }, + { + "epoch": 0.1033387769914716, + "grad_norm": 0.23495235606557682, + "learning_rate": 0.0009859723609820722, + "loss": 1.5046, + "step": 1139 + }, + { + "epoch": 0.10342950462710941, + "grad_norm": 0.25183666592383125, + "learning_rate": 0.000985937781395603, + "loss": 1.4785, + "step": 1140 + }, + { + "epoch": 0.10352023226274723, + "grad_norm": 0.23277078999288955, + "learning_rate": 0.0009859031598483347, + "loss": 1.5459, + "step": 1141 + }, + { + "epoch": 0.10361095989838505, + "grad_norm": 0.24925603857276388, + "learning_rate": 0.0009858684963432568, + "loss": 1.5444, + "step": 1142 + }, + { + "epoch": 0.10370168753402287, + "grad_norm": 0.2675398544930922, + "learning_rate": 0.0009858337908833627, + "loss": 1.5424, + "step": 1143 + }, + { + "epoch": 0.10379241516966067, + "grad_norm": 0.2428779218632576, + "learning_rate": 0.0009857990434716493, + "loss": 1.5477, + "step": 1144 + }, + { + "epoch": 0.10388314280529849, + "grad_norm": 0.24834246162879173, + "learning_rate": 0.0009857642541111168, + "loss": 1.5173, + "step": 1145 + }, + { + "epoch": 0.10397387044093631, + "grad_norm": 0.2218289636419925, + "learning_rate": 0.0009857294228047694, + "loss": 1.5316, + "step": 1146 + }, + { + "epoch": 0.10406459807657413, + "grad_norm": 0.24466641924113017, + "learning_rate": 0.0009856945495556146, + "loss": 1.5139, + "step": 1147 + }, + { + "epoch": 0.10415532571221194, + "grad_norm": 0.27557822223902234, + "learning_rate": 0.0009856596343666639, + "loss": 1.5324, + "step": 1148 + }, + { + "epoch": 0.10424605334784975, + "grad_norm": 0.24266870228158682, + "learning_rate": 0.0009856246772409322, + "loss": 1.4825, + "step": 1149 + }, + { + "epoch": 0.10433678098348757, + "grad_norm": 0.24303249038958613, + "learning_rate": 0.0009855896781814381, + "loss": 1.5095, + "step": 1150 + }, + { + "epoch": 0.10442750861912539, + "grad_norm": 0.23436110500161653, + "learning_rate": 0.000985554637191204, + "loss": 1.5013, + "step": 1151 + }, + { + "epoch": 0.1045182362547632, + "grad_norm": 0.23610042517604202, + "learning_rate": 0.000985519554273255, + "loss": 1.5218, + "step": 1152 + }, + { + "epoch": 0.10460896389040102, + "grad_norm": 0.22497542672432339, + "learning_rate": 0.000985484429430621, + "loss": 1.5077, + "step": 1153 + }, + { + 
"epoch": 0.10469969152603883, + "grad_norm": 0.23192415746613784, + "learning_rate": 0.000985449262666335, + "loss": 1.5063, + "step": 1154 + }, + { + "epoch": 0.10479041916167664, + "grad_norm": 0.20835703225024904, + "learning_rate": 0.0009854140539834338, + "loss": 1.5515, + "step": 1155 + }, + { + "epoch": 0.10488114679731446, + "grad_norm": 0.32953211106041214, + "learning_rate": 0.0009853788033849574, + "loss": 1.5151, + "step": 1156 + }, + { + "epoch": 0.10497187443295228, + "grad_norm": 0.23599800185934483, + "learning_rate": 0.0009853435108739498, + "loss": 1.5119, + "step": 1157 + }, + { + "epoch": 0.1050626020685901, + "grad_norm": 0.22797860315012333, + "learning_rate": 0.0009853081764534585, + "loss": 1.5177, + "step": 1158 + }, + { + "epoch": 0.1051533297042279, + "grad_norm": 0.24507671342798887, + "learning_rate": 0.0009852728001265344, + "loss": 1.517, + "step": 1159 + }, + { + "epoch": 0.10524405733986572, + "grad_norm": 0.24128875754317905, + "learning_rate": 0.0009852373818962327, + "loss": 1.511, + "step": 1160 + }, + { + "epoch": 0.10533478497550354, + "grad_norm": 0.2202390047166169, + "learning_rate": 0.0009852019217656113, + "loss": 1.4874, + "step": 1161 + }, + { + "epoch": 0.10542551261114136, + "grad_norm": 0.2545061065294962, + "learning_rate": 0.0009851664197377328, + "loss": 1.4859, + "step": 1162 + }, + { + "epoch": 0.10551624024677916, + "grad_norm": 0.22179823055630513, + "learning_rate": 0.0009851308758156623, + "loss": 1.514, + "step": 1163 + }, + { + "epoch": 0.10560696788241698, + "grad_norm": 0.23246582427550586, + "learning_rate": 0.0009850952900024692, + "loss": 1.5448, + "step": 1164 + }, + { + "epoch": 0.1056976955180548, + "grad_norm": 0.2480941222294245, + "learning_rate": 0.0009850596623012264, + "loss": 1.5033, + "step": 1165 + }, + { + "epoch": 0.10578842315369262, + "grad_norm": 0.22627414381780173, + "learning_rate": 0.0009850239927150103, + "loss": 1.5255, + "step": 1166 + }, + { + "epoch": 0.10587915078933043, + "grad_norm": 0.2317800514015811, + "learning_rate": 0.0009849882812469006, + "loss": 1.4868, + "step": 1167 + }, + { + "epoch": 0.10596987842496824, + "grad_norm": 0.2438450431309767, + "learning_rate": 0.0009849525278999816, + "loss": 1.4816, + "step": 1168 + }, + { + "epoch": 0.10606060606060606, + "grad_norm": 0.24543039195431443, + "learning_rate": 0.0009849167326773402, + "loss": 1.5173, + "step": 1169 + }, + { + "epoch": 0.10615133369624388, + "grad_norm": 0.2342990598890355, + "learning_rate": 0.0009848808955820675, + "loss": 1.5394, + "step": 1170 + }, + { + "epoch": 0.10624206133188169, + "grad_norm": 0.24131000911381367, + "learning_rate": 0.0009848450166172582, + "loss": 1.5142, + "step": 1171 + }, + { + "epoch": 0.1063327889675195, + "grad_norm": 0.2272856656620883, + "learning_rate": 0.00098480909578601, + "loss": 1.5019, + "step": 1172 + }, + { + "epoch": 0.10642351660315733, + "grad_norm": 0.24890495263946205, + "learning_rate": 0.0009847731330914252, + "loss": 1.5015, + "step": 1173 + }, + { + "epoch": 0.10651424423879513, + "grad_norm": 0.23370747471637526, + "learning_rate": 0.0009847371285366087, + "loss": 1.536, + "step": 1174 + }, + { + "epoch": 0.10660497187443295, + "grad_norm": 0.2831460915438235, + "learning_rate": 0.0009847010821246698, + "loss": 1.5305, + "step": 1175 + }, + { + "epoch": 0.10669569951007077, + "grad_norm": 0.2275842424198593, + "learning_rate": 0.000984664993858721, + "loss": 1.486, + "step": 1176 + }, + { + "epoch": 0.10678642714570859, + "grad_norm": 0.23576534770611085, + 
"learning_rate": 0.0009846288637418786, + "loss": 1.4821, + "step": 1177 + }, + { + "epoch": 0.10687715478134639, + "grad_norm": 0.2262688633082464, + "learning_rate": 0.0009845926917772623, + "loss": 1.5465, + "step": 1178 + }, + { + "epoch": 0.10696788241698421, + "grad_norm": 0.2424524372452606, + "learning_rate": 0.0009845564779679956, + "loss": 1.5264, + "step": 1179 + }, + { + "epoch": 0.10705861005262203, + "grad_norm": 0.2266222142630314, + "learning_rate": 0.0009845202223172057, + "loss": 1.5129, + "step": 1180 + }, + { + "epoch": 0.10714933768825985, + "grad_norm": 0.23011320743397481, + "learning_rate": 0.0009844839248280232, + "loss": 1.5167, + "step": 1181 + }, + { + "epoch": 0.10724006532389765, + "grad_norm": 0.24431084175616657, + "learning_rate": 0.0009844475855035823, + "loss": 1.5371, + "step": 1182 + }, + { + "epoch": 0.10733079295953547, + "grad_norm": 0.2457471885436868, + "learning_rate": 0.000984411204347021, + "loss": 1.5292, + "step": 1183 + }, + { + "epoch": 0.10742152059517329, + "grad_norm": 0.2240184042747362, + "learning_rate": 0.0009843747813614808, + "loss": 1.527, + "step": 1184 + }, + { + "epoch": 0.10751224823081111, + "grad_norm": 0.2428332094845028, + "learning_rate": 0.000984338316550107, + "loss": 1.5298, + "step": 1185 + }, + { + "epoch": 0.10760297586644892, + "grad_norm": 0.4034568984104106, + "learning_rate": 0.000984301809916048, + "loss": 1.5115, + "step": 1186 + }, + { + "epoch": 0.10769370350208674, + "grad_norm": 0.24577748844241806, + "learning_rate": 0.0009842652614624565, + "loss": 1.5312, + "step": 1187 + }, + { + "epoch": 0.10778443113772455, + "grad_norm": 0.25441053596957613, + "learning_rate": 0.000984228671192488, + "loss": 1.5188, + "step": 1188 + }, + { + "epoch": 0.10787515877336237, + "grad_norm": 0.2605751145379587, + "learning_rate": 0.0009841920391093027, + "loss": 1.4677, + "step": 1189 + }, + { + "epoch": 0.10796588640900018, + "grad_norm": 0.25288610440474374, + "learning_rate": 0.0009841553652160634, + "loss": 1.5183, + "step": 1190 + }, + { + "epoch": 0.108056614044638, + "grad_norm": 0.2795734097890809, + "learning_rate": 0.0009841186495159369, + "loss": 1.5126, + "step": 1191 + }, + { + "epoch": 0.10814734168027582, + "grad_norm": 0.22913068292293695, + "learning_rate": 0.0009840818920120935, + "loss": 1.4978, + "step": 1192 + }, + { + "epoch": 0.10823806931591362, + "grad_norm": 0.23243314505004983, + "learning_rate": 0.0009840450927077076, + "loss": 1.4941, + "step": 1193 + }, + { + "epoch": 0.10832879695155144, + "grad_norm": 0.2435442434611003, + "learning_rate": 0.0009840082516059565, + "loss": 1.5646, + "step": 1194 + }, + { + "epoch": 0.10841952458718926, + "grad_norm": 0.2451857943153742, + "learning_rate": 0.0009839713687100216, + "loss": 1.4967, + "step": 1195 + }, + { + "epoch": 0.10851025222282708, + "grad_norm": 0.23898091935068116, + "learning_rate": 0.0009839344440230877, + "loss": 1.5218, + "step": 1196 + }, + { + "epoch": 0.10860097985846488, + "grad_norm": 0.24377378582177472, + "learning_rate": 0.0009838974775483432, + "loss": 1.5184, + "step": 1197 + }, + { + "epoch": 0.1086917074941027, + "grad_norm": 0.2686377323473461, + "learning_rate": 0.00098386046928898, + "loss": 1.5171, + "step": 1198 + }, + { + "epoch": 0.10878243512974052, + "grad_norm": 0.2516252287579682, + "learning_rate": 0.0009838234192481943, + "loss": 1.5189, + "step": 1199 + }, + { + "epoch": 0.10887316276537834, + "grad_norm": 0.24458906576145717, + "learning_rate": 0.0009837863274291848, + "loss": 1.5348, + "step": 1200 + }, + { 
+ "epoch": 0.10896389040101614, + "grad_norm": 0.25570048723948186, + "learning_rate": 0.0009837491938351549, + "loss": 1.5282, + "step": 1201 + }, + { + "epoch": 0.10905461803665396, + "grad_norm": 0.2447585767619514, + "learning_rate": 0.0009837120184693106, + "loss": 1.4819, + "step": 1202 + }, + { + "epoch": 0.10914534567229178, + "grad_norm": 0.242196891632013, + "learning_rate": 0.000983674801334862, + "loss": 1.4879, + "step": 1203 + }, + { + "epoch": 0.1092360733079296, + "grad_norm": 0.2804634311467624, + "learning_rate": 0.0009836375424350234, + "loss": 1.5467, + "step": 1204 + }, + { + "epoch": 0.1093268009435674, + "grad_norm": 0.24049503551585336, + "learning_rate": 0.0009836002417730116, + "loss": 1.4991, + "step": 1205 + }, + { + "epoch": 0.10941752857920523, + "grad_norm": 0.21937677007479833, + "learning_rate": 0.0009835628993520478, + "loss": 1.4797, + "step": 1206 + }, + { + "epoch": 0.10950825621484304, + "grad_norm": 0.22292956122947322, + "learning_rate": 0.0009835255151753562, + "loss": 1.555, + "step": 1207 + }, + { + "epoch": 0.10959898385048086, + "grad_norm": 0.2471347679366293, + "learning_rate": 0.000983488089246165, + "loss": 1.5476, + "step": 1208 + }, + { + "epoch": 0.10968971148611867, + "grad_norm": 0.2317720268229185, + "learning_rate": 0.0009834506215677062, + "loss": 1.4979, + "step": 1209 + }, + { + "epoch": 0.10978043912175649, + "grad_norm": 0.2463886079194716, + "learning_rate": 0.000983413112143215, + "loss": 1.5159, + "step": 1210 + }, + { + "epoch": 0.1098711667573943, + "grad_norm": 0.2456263044242982, + "learning_rate": 0.00098337556097593, + "loss": 1.5181, + "step": 1211 + }, + { + "epoch": 0.10996189439303211, + "grad_norm": 0.2274281276411643, + "learning_rate": 0.0009833379680690944, + "loss": 1.5169, + "step": 1212 + }, + { + "epoch": 0.11005262202866993, + "grad_norm": 0.22173058845249952, + "learning_rate": 0.000983300333425954, + "loss": 1.5349, + "step": 1213 + }, + { + "epoch": 0.11014334966430775, + "grad_norm": 0.2390238179882533, + "learning_rate": 0.0009832626570497585, + "loss": 1.5153, + "step": 1214 + }, + { + "epoch": 0.11023407729994557, + "grad_norm": 0.23277991091874084, + "learning_rate": 0.0009832249389437613, + "loss": 1.5214, + "step": 1215 + }, + { + "epoch": 0.11032480493558337, + "grad_norm": 0.2229474090056004, + "learning_rate": 0.0009831871791112195, + "loss": 1.5147, + "step": 1216 + }, + { + "epoch": 0.11041553257122119, + "grad_norm": 0.23507422161868113, + "learning_rate": 0.0009831493775553934, + "loss": 1.5593, + "step": 1217 + }, + { + "epoch": 0.11050626020685901, + "grad_norm": 0.22480655550680595, + "learning_rate": 0.0009831115342795475, + "loss": 1.5083, + "step": 1218 + }, + { + "epoch": 0.11059698784249683, + "grad_norm": 0.2547644565258582, + "learning_rate": 0.0009830736492869494, + "loss": 1.5238, + "step": 1219 + }, + { + "epoch": 0.11068771547813464, + "grad_norm": 0.23193012113239855, + "learning_rate": 0.0009830357225808705, + "loss": 1.4724, + "step": 1220 + }, + { + "epoch": 0.11077844311377245, + "grad_norm": 0.24844119034856305, + "learning_rate": 0.0009829977541645856, + "loss": 1.5218, + "step": 1221 + }, + { + "epoch": 0.11086917074941027, + "grad_norm": 0.26652587164817443, + "learning_rate": 0.0009829597440413737, + "loss": 1.5038, + "step": 1222 + }, + { + "epoch": 0.11095989838504809, + "grad_norm": 0.22341595519251997, + "learning_rate": 0.0009829216922145165, + "loss": 1.5417, + "step": 1223 + }, + { + "epoch": 0.1110506260206859, + "grad_norm": 0.24630538482235864, + 
"learning_rate": 0.0009828835986873002, + "loss": 1.5393, + "step": 1224 + }, + { + "epoch": 0.11114135365632372, + "grad_norm": 0.24155777292828498, + "learning_rate": 0.0009828454634630138, + "loss": 1.4855, + "step": 1225 + }, + { + "epoch": 0.11123208129196153, + "grad_norm": 0.2655898701193555, + "learning_rate": 0.0009828072865449503, + "loss": 1.4849, + "step": 1226 + }, + { + "epoch": 0.11132280892759934, + "grad_norm": 0.23595901761212887, + "learning_rate": 0.0009827690679364068, + "loss": 1.5372, + "step": 1227 + }, + { + "epoch": 0.11141353656323716, + "grad_norm": 0.23642454495173065, + "learning_rate": 0.000982730807640683, + "loss": 1.5083, + "step": 1228 + }, + { + "epoch": 0.11150426419887498, + "grad_norm": 0.22773220789921142, + "learning_rate": 0.000982692505661083, + "loss": 1.5273, + "step": 1229 + }, + { + "epoch": 0.1115949918345128, + "grad_norm": 0.2527334130761488, + "learning_rate": 0.0009826541620009137, + "loss": 1.4816, + "step": 1230 + }, + { + "epoch": 0.1116857194701506, + "grad_norm": 0.25318670611357735, + "learning_rate": 0.0009826157766634864, + "loss": 1.5554, + "step": 1231 + }, + { + "epoch": 0.11177644710578842, + "grad_norm": 0.25987684017940765, + "learning_rate": 0.000982577349652116, + "loss": 1.4858, + "step": 1232 + }, + { + "epoch": 0.11186717474142624, + "grad_norm": 0.25839355258158975, + "learning_rate": 0.0009825388809701198, + "loss": 1.5277, + "step": 1233 + }, + { + "epoch": 0.11195790237706406, + "grad_norm": 0.24451337980313964, + "learning_rate": 0.0009825003706208206, + "loss": 1.4876, + "step": 1234 + }, + { + "epoch": 0.11204863001270186, + "grad_norm": 0.324980571679887, + "learning_rate": 0.0009824618186075429, + "loss": 1.5256, + "step": 1235 + }, + { + "epoch": 0.11213935764833968, + "grad_norm": 0.2562746960164097, + "learning_rate": 0.0009824232249336163, + "loss": 1.5121, + "step": 1236 + }, + { + "epoch": 0.1122300852839775, + "grad_norm": 0.23054858309919324, + "learning_rate": 0.000982384589602373, + "loss": 1.5333, + "step": 1237 + }, + { + "epoch": 0.11232081291961532, + "grad_norm": 0.2727409664410058, + "learning_rate": 0.0009823459126171495, + "loss": 1.4883, + "step": 1238 + }, + { + "epoch": 0.11241154055525313, + "grad_norm": 0.2569577196341418, + "learning_rate": 0.000982307193981285, + "loss": 1.4735, + "step": 1239 + }, + { + "epoch": 0.11250226819089094, + "grad_norm": 0.2374254563283485, + "learning_rate": 0.0009822684336981235, + "loss": 1.5246, + "step": 1240 + }, + { + "epoch": 0.11259299582652876, + "grad_norm": 0.4273570092581285, + "learning_rate": 0.0009822296317710116, + "loss": 1.5007, + "step": 1241 + }, + { + "epoch": 0.11268372346216658, + "grad_norm": 0.7310068240739607, + "learning_rate": 0.0009821907882033, + "loss": 1.5134, + "step": 1242 + }, + { + "epoch": 0.11277445109780439, + "grad_norm": 0.2181265847520133, + "learning_rate": 0.0009821519029983427, + "loss": 1.5101, + "step": 1243 + }, + { + "epoch": 0.1128651787334422, + "grad_norm": 0.2688102881347941, + "learning_rate": 0.0009821129761594972, + "loss": 1.5189, + "step": 1244 + }, + { + "epoch": 0.11295590636908003, + "grad_norm": 0.27652792112115737, + "learning_rate": 0.0009820740076901255, + "loss": 1.5621, + "step": 1245 + }, + { + "epoch": 0.11304663400471783, + "grad_norm": 0.290162244345911, + "learning_rate": 0.0009820349975935923, + "loss": 1.5346, + "step": 1246 + }, + { + "epoch": 0.11313736164035565, + "grad_norm": 0.2219602014137726, + "learning_rate": 0.0009819959458732658, + "loss": 1.5414, + "step": 1247 + }, + { + 
"epoch": 0.11322808927599347, + "grad_norm": 0.2537020225704424, + "learning_rate": 0.0009819568525325185, + "loss": 1.5221, + "step": 1248 + }, + { + "epoch": 0.11331881691163129, + "grad_norm": 0.22874179509063663, + "learning_rate": 0.0009819177175747257, + "loss": 1.5226, + "step": 1249 + }, + { + "epoch": 0.11340954454726909, + "grad_norm": 0.24579153749212468, + "learning_rate": 0.0009818785410032672, + "loss": 1.5474, + "step": 1250 + }, + { + "epoch": 0.11350027218290691, + "grad_norm": 0.22534648920146239, + "learning_rate": 0.0009818393228215253, + "loss": 1.4766, + "step": 1251 + }, + { + "epoch": 0.11359099981854473, + "grad_norm": 0.2279144043136576, + "learning_rate": 0.0009818000630328872, + "loss": 1.4968, + "step": 1252 + }, + { + "epoch": 0.11368172745418255, + "grad_norm": 0.21842302293527396, + "learning_rate": 0.0009817607616407426, + "loss": 1.5411, + "step": 1253 + }, + { + "epoch": 0.11377245508982035, + "grad_norm": 0.21719008074290744, + "learning_rate": 0.0009817214186484853, + "loss": 1.4921, + "step": 1254 + }, + { + "epoch": 0.11386318272545817, + "grad_norm": 0.20947836120786745, + "learning_rate": 0.0009816820340595124, + "loss": 1.5126, + "step": 1255 + }, + { + "epoch": 0.11395391036109599, + "grad_norm": 0.24100439104936106, + "learning_rate": 0.0009816426078772247, + "loss": 1.4623, + "step": 1256 + }, + { + "epoch": 0.11404463799673381, + "grad_norm": 0.2212802372224098, + "learning_rate": 0.0009816031401050271, + "loss": 1.5199, + "step": 1257 + }, + { + "epoch": 0.11413536563237162, + "grad_norm": 0.21226633254059984, + "learning_rate": 0.000981563630746327, + "loss": 1.5101, + "step": 1258 + }, + { + "epoch": 0.11422609326800943, + "grad_norm": 0.21432743592745274, + "learning_rate": 0.0009815240798045368, + "loss": 1.4953, + "step": 1259 + }, + { + "epoch": 0.11431682090364725, + "grad_norm": 0.19780507928727825, + "learning_rate": 0.0009814844872830712, + "loss": 1.5219, + "step": 1260 + }, + { + "epoch": 0.11440754853928507, + "grad_norm": 0.2277526302126086, + "learning_rate": 0.0009814448531853493, + "loss": 1.5001, + "step": 1261 + }, + { + "epoch": 0.11449827617492288, + "grad_norm": 0.22475524631264815, + "learning_rate": 0.000981405177514793, + "loss": 1.5082, + "step": 1262 + }, + { + "epoch": 0.1145890038105607, + "grad_norm": 0.21560097701329067, + "learning_rate": 0.000981365460274829, + "loss": 1.5394, + "step": 1263 + }, + { + "epoch": 0.11467973144619852, + "grad_norm": 0.21241909842511889, + "learning_rate": 0.0009813257014688866, + "loss": 1.5196, + "step": 1264 + }, + { + "epoch": 0.11477045908183632, + "grad_norm": 0.20261230378581108, + "learning_rate": 0.0009812859011003987, + "loss": 1.5201, + "step": 1265 + }, + { + "epoch": 0.11486118671747414, + "grad_norm": 0.24842061634626292, + "learning_rate": 0.0009812460591728024, + "loss": 1.522, + "step": 1266 + }, + { + "epoch": 0.11495191435311196, + "grad_norm": 0.2019454662486865, + "learning_rate": 0.0009812061756895379, + "loss": 1.5217, + "step": 1267 + }, + { + "epoch": 0.11504264198874978, + "grad_norm": 0.24114785581968498, + "learning_rate": 0.0009811662506540493, + "loss": 1.5269, + "step": 1268 + }, + { + "epoch": 0.11513336962438758, + "grad_norm": 0.20730344077250903, + "learning_rate": 0.000981126284069784, + "loss": 1.5055, + "step": 1269 + }, + { + "epoch": 0.1152240972600254, + "grad_norm": 0.20179322276627037, + "learning_rate": 0.0009810862759401932, + "loss": 1.5379, + "step": 1270 + }, + { + "epoch": 0.11531482489566322, + "grad_norm": 0.22023929732224706, + 
"learning_rate": 0.0009810462262687316, + "loss": 1.4956, + "step": 1271 + }, + { + "epoch": 0.11540555253130104, + "grad_norm": 0.20706148310914194, + "learning_rate": 0.0009810061350588573, + "loss": 1.4968, + "step": 1272 + }, + { + "epoch": 0.11549628016693884, + "grad_norm": 0.22055465561097243, + "learning_rate": 0.0009809660023140326, + "loss": 1.5177, + "step": 1273 + }, + { + "epoch": 0.11558700780257666, + "grad_norm": 0.22412054763521982, + "learning_rate": 0.0009809258280377225, + "loss": 1.5352, + "step": 1274 + }, + { + "epoch": 0.11567773543821448, + "grad_norm": 0.2092200401291364, + "learning_rate": 0.0009808856122333963, + "loss": 1.5301, + "step": 1275 + }, + { + "epoch": 0.1157684630738523, + "grad_norm": 0.20802481518368376, + "learning_rate": 0.0009808453549045267, + "loss": 1.5115, + "step": 1276 + }, + { + "epoch": 0.1158591907094901, + "grad_norm": 0.21151482425804075, + "learning_rate": 0.0009808050560545895, + "loss": 1.4813, + "step": 1277 + }, + { + "epoch": 0.11594991834512793, + "grad_norm": 0.21251811854914904, + "learning_rate": 0.0009807647156870652, + "loss": 1.4777, + "step": 1278 + }, + { + "epoch": 0.11604064598076574, + "grad_norm": 0.2042764387610411, + "learning_rate": 0.0009807243338054367, + "loss": 1.4876, + "step": 1279 + }, + { + "epoch": 0.11613137361640356, + "grad_norm": 0.20418949992464125, + "learning_rate": 0.0009806839104131912, + "loss": 1.5061, + "step": 1280 + }, + { + "epoch": 0.11622210125204137, + "grad_norm": 0.20915454540325745, + "learning_rate": 0.0009806434455138194, + "loss": 1.5442, + "step": 1281 + }, + { + "epoch": 0.11631282888767919, + "grad_norm": 0.23506237261279161, + "learning_rate": 0.0009806029391108148, + "loss": 1.5232, + "step": 1282 + }, + { + "epoch": 0.116403556523317, + "grad_norm": 0.20314947325728902, + "learning_rate": 0.0009805623912076758, + "loss": 1.5338, + "step": 1283 + }, + { + "epoch": 0.11649428415895481, + "grad_norm": 0.21635178207342257, + "learning_rate": 0.0009805218018079035, + "loss": 1.5162, + "step": 1284 + }, + { + "epoch": 0.11658501179459263, + "grad_norm": 0.21832486731792783, + "learning_rate": 0.0009804811709150027, + "loss": 1.5152, + "step": 1285 + }, + { + "epoch": 0.11667573943023045, + "grad_norm": 0.2857778188292202, + "learning_rate": 0.0009804404985324822, + "loss": 1.5453, + "step": 1286 + }, + { + "epoch": 0.11676646706586827, + "grad_norm": 0.21299619503286368, + "learning_rate": 0.0009803997846638537, + "loss": 1.4865, + "step": 1287 + }, + { + "epoch": 0.11685719470150607, + "grad_norm": 0.20908244757189656, + "learning_rate": 0.000980359029312633, + "loss": 1.5153, + "step": 1288 + }, + { + "epoch": 0.11694792233714389, + "grad_norm": 0.2010312237184306, + "learning_rate": 0.0009803182324823391, + "loss": 1.4972, + "step": 1289 + }, + { + "epoch": 0.11703864997278171, + "grad_norm": 0.2501104339621092, + "learning_rate": 0.0009802773941764954, + "loss": 1.5065, + "step": 1290 + }, + { + "epoch": 0.11712937760841953, + "grad_norm": 0.24129850904250294, + "learning_rate": 0.0009802365143986276, + "loss": 1.4977, + "step": 1291 + }, + { + "epoch": 0.11722010524405733, + "grad_norm": 0.22119010389545818, + "learning_rate": 0.0009801955931522663, + "loss": 1.5153, + "step": 1292 + }, + { + "epoch": 0.11731083287969515, + "grad_norm": 0.2231924478620759, + "learning_rate": 0.0009801546304409445, + "loss": 1.5071, + "step": 1293 + }, + { + "epoch": 0.11740156051533297, + "grad_norm": 0.2609393512995722, + "learning_rate": 0.0009801136262681997, + "loss": 1.5245, + "step": 
1294 + }, + { + "epoch": 0.11749228815097079, + "grad_norm": 0.2697633750204792, + "learning_rate": 0.0009800725806375725, + "loss": 1.5104, + "step": 1295 + }, + { + "epoch": 0.1175830157866086, + "grad_norm": 0.2121532682965755, + "learning_rate": 0.0009800314935526073, + "loss": 1.4674, + "step": 1296 + }, + { + "epoch": 0.11767374342224642, + "grad_norm": 0.23923294478381973, + "learning_rate": 0.0009799903650168518, + "loss": 1.4963, + "step": 1297 + }, + { + "epoch": 0.11776447105788423, + "grad_norm": 0.2303002151757583, + "learning_rate": 0.0009799491950338577, + "loss": 1.5353, + "step": 1298 + }, + { + "epoch": 0.11785519869352205, + "grad_norm": 0.2835514731711105, + "learning_rate": 0.0009799079836071796, + "loss": 1.533, + "step": 1299 + }, + { + "epoch": 0.11794592632915986, + "grad_norm": 0.26026533211704267, + "learning_rate": 0.0009798667307403767, + "loss": 1.496, + "step": 1300 + }, + { + "epoch": 0.11803665396479768, + "grad_norm": 0.2272989362861702, + "learning_rate": 0.0009798254364370106, + "loss": 1.502, + "step": 1301 + }, + { + "epoch": 0.1181273816004355, + "grad_norm": 0.2996726883808252, + "learning_rate": 0.0009797841007006473, + "loss": 1.5119, + "step": 1302 + }, + { + "epoch": 0.1182181092360733, + "grad_norm": 0.24102319777239267, + "learning_rate": 0.0009797427235348564, + "loss": 1.5189, + "step": 1303 + }, + { + "epoch": 0.11830883687171112, + "grad_norm": 0.32278453704479454, + "learning_rate": 0.0009797013049432106, + "loss": 1.5378, + "step": 1304 + }, + { + "epoch": 0.11839956450734894, + "grad_norm": 0.24154972670411073, + "learning_rate": 0.0009796598449292862, + "loss": 1.4774, + "step": 1305 + }, + { + "epoch": 0.11849029214298676, + "grad_norm": 0.22495266234712563, + "learning_rate": 0.0009796183434966636, + "loss": 1.4936, + "step": 1306 + }, + { + "epoch": 0.11858101977862456, + "grad_norm": 0.2284337896047979, + "learning_rate": 0.0009795768006489265, + "loss": 1.5454, + "step": 1307 + }, + { + "epoch": 0.11867174741426238, + "grad_norm": 0.20535595072950927, + "learning_rate": 0.0009795352163896617, + "loss": 1.4947, + "step": 1308 + }, + { + "epoch": 0.1187624750499002, + "grad_norm": 0.22967234362191422, + "learning_rate": 0.0009794935907224606, + "loss": 1.5208, + "step": 1309 + }, + { + "epoch": 0.11885320268553802, + "grad_norm": 0.21888548207184724, + "learning_rate": 0.0009794519236509171, + "loss": 1.5283, + "step": 1310 + }, + { + "epoch": 0.11894393032117583, + "grad_norm": 0.2159976583985646, + "learning_rate": 0.0009794102151786295, + "loss": 1.4966, + "step": 1311 + }, + { + "epoch": 0.11903465795681364, + "grad_norm": 0.26473657082031005, + "learning_rate": 0.000979368465309199, + "loss": 1.4918, + "step": 1312 + }, + { + "epoch": 0.11912538559245146, + "grad_norm": 0.21136606890607487, + "learning_rate": 0.000979326674046231, + "loss": 1.5349, + "step": 1313 + }, + { + "epoch": 0.11921611322808928, + "grad_norm": 0.224700481688445, + "learning_rate": 0.0009792848413933339, + "loss": 1.499, + "step": 1314 + }, + { + "epoch": 0.11930684086372709, + "grad_norm": 0.21046059996444688, + "learning_rate": 0.0009792429673541202, + "loss": 1.4964, + "step": 1315 + }, + { + "epoch": 0.1193975684993649, + "grad_norm": 0.20588774942508076, + "learning_rate": 0.0009792010519322054, + "loss": 1.475, + "step": 1316 + }, + { + "epoch": 0.11948829613500273, + "grad_norm": 0.2740519701055802, + "learning_rate": 0.0009791590951312094, + "loss": 1.5012, + "step": 1317 + }, + { + "epoch": 0.11957902377064054, + "grad_norm": 0.22284735749286577, 
+ "learning_rate": 0.0009791170969547548, + "loss": 1.4735, + "step": 1318 + }, + { + "epoch": 0.11966975140627835, + "grad_norm": 0.2179116241593899, + "learning_rate": 0.0009790750574064685, + "loss": 1.5221, + "step": 1319 + }, + { + "epoch": 0.11976047904191617, + "grad_norm": 0.2267847782681148, + "learning_rate": 0.00097903297648998, + "loss": 1.5045, + "step": 1320 + }, + { + "epoch": 0.11985120667755399, + "grad_norm": 0.2318208288497616, + "learning_rate": 0.0009789908542089235, + "loss": 1.5145, + "step": 1321 + }, + { + "epoch": 0.11994193431319179, + "grad_norm": 0.2210047944372597, + "learning_rate": 0.0009789486905669362, + "loss": 1.4888, + "step": 1322 + }, + { + "epoch": 0.12003266194882961, + "grad_norm": 0.20901333446996267, + "learning_rate": 0.000978906485567659, + "loss": 1.5172, + "step": 1323 + }, + { + "epoch": 0.12012338958446743, + "grad_norm": 0.21573710040916208, + "learning_rate": 0.0009788642392147362, + "loss": 1.4701, + "step": 1324 + }, + { + "epoch": 0.12021411722010525, + "grad_norm": 0.21784392161292232, + "learning_rate": 0.0009788219515118155, + "loss": 1.5168, + "step": 1325 + }, + { + "epoch": 0.12030484485574305, + "grad_norm": 0.21503944558323293, + "learning_rate": 0.0009787796224625489, + "loss": 1.5215, + "step": 1326 + }, + { + "epoch": 0.12039557249138087, + "grad_norm": 0.20074649741616438, + "learning_rate": 0.0009787372520705913, + "loss": 1.5183, + "step": 1327 + }, + { + "epoch": 0.12048630012701869, + "grad_norm": 0.23124378775670548, + "learning_rate": 0.0009786948403396015, + "loss": 1.4997, + "step": 1328 + }, + { + "epoch": 0.12057702776265651, + "grad_norm": 0.22976528771200627, + "learning_rate": 0.0009786523872732417, + "loss": 1.5107, + "step": 1329 + }, + { + "epoch": 0.12066775539829432, + "grad_norm": 0.20833934502654083, + "learning_rate": 0.0009786098928751775, + "loss": 1.4979, + "step": 1330 + }, + { + "epoch": 0.12075848303393213, + "grad_norm": 0.22770575887199607, + "learning_rate": 0.0009785673571490786, + "loss": 1.5086, + "step": 1331 + }, + { + "epoch": 0.12084921066956995, + "grad_norm": 0.2916848106359406, + "learning_rate": 0.0009785247800986178, + "loss": 1.5076, + "step": 1332 + }, + { + "epoch": 0.12093993830520777, + "grad_norm": 0.21565731518075995, + "learning_rate": 0.000978482161727472, + "loss": 1.5277, + "step": 1333 + }, + { + "epoch": 0.12103066594084558, + "grad_norm": 0.271821307618932, + "learning_rate": 0.0009784395020393207, + "loss": 1.5079, + "step": 1334 + }, + { + "epoch": 0.1211213935764834, + "grad_norm": 0.22518914350563607, + "learning_rate": 0.0009783968010378481, + "loss": 1.5194, + "step": 1335 + }, + { + "epoch": 0.12121212121212122, + "grad_norm": 0.23935215879565172, + "learning_rate": 0.0009783540587267412, + "loss": 1.5232, + "step": 1336 + }, + { + "epoch": 0.12130284884775902, + "grad_norm": 0.21028198606552173, + "learning_rate": 0.0009783112751096906, + "loss": 1.5045, + "step": 1337 + }, + { + "epoch": 0.12139357648339684, + "grad_norm": 0.22645519861021043, + "learning_rate": 0.000978268450190391, + "loss": 1.5043, + "step": 1338 + }, + { + "epoch": 0.12148430411903466, + "grad_norm": 0.2135913343269859, + "learning_rate": 0.0009782255839725406, + "loss": 1.5251, + "step": 1339 + }, + { + "epoch": 0.12157503175467248, + "grad_norm": 0.2246935208410568, + "learning_rate": 0.0009781826764598401, + "loss": 1.5173, + "step": 1340 + }, + { + "epoch": 0.12166575939031028, + "grad_norm": 0.24451458961077152, + "learning_rate": 0.0009781397276559952, + "loss": 1.4988, + "step": 
1341 + }, + { + "epoch": 0.1217564870259481, + "grad_norm": 0.22710369730632465, + "learning_rate": 0.000978096737564714, + "loss": 1.5216, + "step": 1342 + }, + { + "epoch": 0.12184721466158592, + "grad_norm": 0.24438101554548927, + "learning_rate": 0.0009780537061897096, + "loss": 1.5071, + "step": 1343 + }, + { + "epoch": 0.12193794229722374, + "grad_norm": 0.2320206075250568, + "learning_rate": 0.0009780106335346968, + "loss": 1.4938, + "step": 1344 + }, + { + "epoch": 0.12202866993286154, + "grad_norm": 0.23054723178990358, + "learning_rate": 0.0009779675196033956, + "loss": 1.5234, + "step": 1345 + }, + { + "epoch": 0.12211939756849936, + "grad_norm": 0.23032388228763803, + "learning_rate": 0.0009779243643995286, + "loss": 1.52, + "step": 1346 + }, + { + "epoch": 0.12221012520413718, + "grad_norm": 0.23416792908019438, + "learning_rate": 0.0009778811679268223, + "loss": 1.5033, + "step": 1347 + }, + { + "epoch": 0.122300852839775, + "grad_norm": 0.20707373592617684, + "learning_rate": 0.0009778379301890066, + "loss": 1.5029, + "step": 1348 + }, + { + "epoch": 0.1223915804754128, + "grad_norm": 0.20512485840760303, + "learning_rate": 0.0009777946511898153, + "loss": 1.479, + "step": 1349 + }, + { + "epoch": 0.12248230811105063, + "grad_norm": 0.2143205469730299, + "learning_rate": 0.0009777513309329852, + "loss": 1.5127, + "step": 1350 + }, + { + "epoch": 0.12257303574668844, + "grad_norm": 0.23096919709996652, + "learning_rate": 0.0009777079694222575, + "loss": 1.4692, + "step": 1351 + }, + { + "epoch": 0.12266376338232626, + "grad_norm": 0.205992648644257, + "learning_rate": 0.000977664566661376, + "loss": 1.506, + "step": 1352 + }, + { + "epoch": 0.12275449101796407, + "grad_norm": 0.19782829244609768, + "learning_rate": 0.0009776211226540893, + "loss": 1.5102, + "step": 1353 + }, + { + "epoch": 0.12284521865360189, + "grad_norm": 0.23536132054405984, + "learning_rate": 0.0009775776374041478, + "loss": 1.4971, + "step": 1354 + }, + { + "epoch": 0.1229359462892397, + "grad_norm": 0.2314248130102739, + "learning_rate": 0.000977534110915307, + "loss": 1.5341, + "step": 1355 + }, + { + "epoch": 0.12302667392487751, + "grad_norm": 0.19792737178749487, + "learning_rate": 0.0009774905431913254, + "loss": 1.5028, + "step": 1356 + }, + { + "epoch": 0.12311740156051533, + "grad_norm": 0.20216717398687292, + "learning_rate": 0.0009774469342359652, + "loss": 1.5248, + "step": 1357 + }, + { + "epoch": 0.12320812919615315, + "grad_norm": 0.20504821900558012, + "learning_rate": 0.0009774032840529916, + "loss": 1.4705, + "step": 1358 + }, + { + "epoch": 0.12329885683179097, + "grad_norm": 0.20662398541952087, + "learning_rate": 0.000977359592646174, + "loss": 1.4601, + "step": 1359 + }, + { + "epoch": 0.12338958446742877, + "grad_norm": 0.20968953995405767, + "learning_rate": 0.0009773158600192855, + "loss": 1.5081, + "step": 1360 + }, + { + "epoch": 0.12348031210306659, + "grad_norm": 0.2216705248942976, + "learning_rate": 0.000977272086176102, + "loss": 1.5076, + "step": 1361 + }, + { + "epoch": 0.12357103973870441, + "grad_norm": 0.22003050072289268, + "learning_rate": 0.0009772282711204034, + "loss": 1.5045, + "step": 1362 + }, + { + "epoch": 0.12366176737434223, + "grad_norm": 0.2196784909270421, + "learning_rate": 0.0009771844148559734, + "loss": 1.5347, + "step": 1363 + }, + { + "epoch": 0.12375249500998003, + "grad_norm": 0.19563531152563765, + "learning_rate": 0.0009771405173865989, + "loss": 1.4872, + "step": 1364 + }, + { + "epoch": 0.12384322264561785, + "grad_norm": 
0.20549074466594597, + "learning_rate": 0.00097709657871607, + "loss": 1.4804, + "step": 1365 + }, + { + "epoch": 0.12393395028125567, + "grad_norm": 0.20773014023675315, + "learning_rate": 0.0009770525988481817, + "loss": 1.5308, + "step": 1366 + }, + { + "epoch": 0.12402467791689349, + "grad_norm": 0.1994347913304232, + "learning_rate": 0.000977008577786731, + "loss": 1.5654, + "step": 1367 + }, + { + "epoch": 0.1241154055525313, + "grad_norm": 0.19494912645910198, + "learning_rate": 0.0009769645155355193, + "loss": 1.4985, + "step": 1368 + }, + { + "epoch": 0.12420613318816912, + "grad_norm": 0.1961067101118581, + "learning_rate": 0.0009769204120983516, + "loss": 1.4927, + "step": 1369 + }, + { + "epoch": 0.12429686082380693, + "grad_norm": 0.21304277630485502, + "learning_rate": 0.0009768762674790357, + "loss": 1.5187, + "step": 1370 + }, + { + "epoch": 0.12438758845944475, + "grad_norm": 0.19558050292655416, + "learning_rate": 0.0009768320816813838, + "loss": 1.4674, + "step": 1371 + }, + { + "epoch": 0.12447831609508256, + "grad_norm": 0.22935012637202157, + "learning_rate": 0.0009767878547092114, + "loss": 1.5444, + "step": 1372 + }, + { + "epoch": 0.12456904373072038, + "grad_norm": 0.19639476287153632, + "learning_rate": 0.0009767435865663376, + "loss": 1.5122, + "step": 1373 + }, + { + "epoch": 0.1246597713663582, + "grad_norm": 0.21906821161220716, + "learning_rate": 0.000976699277256585, + "loss": 1.4978, + "step": 1374 + }, + { + "epoch": 0.124750499001996, + "grad_norm": 0.1942364206881126, + "learning_rate": 0.0009766549267837793, + "loss": 1.4988, + "step": 1375 + }, + { + "epoch": 0.12484122663763382, + "grad_norm": 0.1995463174585507, + "learning_rate": 0.0009766105351517505, + "loss": 1.5047, + "step": 1376 + }, + { + "epoch": 0.12493195427327164, + "grad_norm": 0.19982281897503873, + "learning_rate": 0.0009765661023643317, + "loss": 1.4553, + "step": 1377 + }, + { + "epoch": 0.12502268190890944, + "grad_norm": 0.21216765141487795, + "learning_rate": 0.0009765216284253598, + "loss": 1.5081, + "step": 1378 + }, + { + "epoch": 0.12511340954454728, + "grad_norm": 0.19930569627078734, + "learning_rate": 0.0009764771133386749, + "loss": 1.4724, + "step": 1379 + }, + { + "epoch": 0.12520413718018508, + "grad_norm": 0.23049239765718563, + "learning_rate": 0.0009764325571081212, + "loss": 1.5095, + "step": 1380 + }, + { + "epoch": 0.1252948648158229, + "grad_norm": 0.2060888441808469, + "learning_rate": 0.000976387959737546, + "loss": 1.5102, + "step": 1381 + }, + { + "epoch": 0.12538559245146072, + "grad_norm": 0.19576564052418152, + "learning_rate": 0.0009763433212308001, + "loss": 1.4676, + "step": 1382 + }, + { + "epoch": 0.12547632008709853, + "grad_norm": 0.19430836804943638, + "learning_rate": 0.0009762986415917383, + "loss": 1.4753, + "step": 1383 + }, + { + "epoch": 0.12556704772273636, + "grad_norm": 0.1907351851902473, + "learning_rate": 0.0009762539208242185, + "loss": 1.4735, + "step": 1384 + }, + { + "epoch": 0.12565777535837416, + "grad_norm": 0.20022631263362883, + "learning_rate": 0.0009762091589321025, + "loss": 1.5096, + "step": 1385 + }, + { + "epoch": 0.12574850299401197, + "grad_norm": 0.22248434875580933, + "learning_rate": 0.0009761643559192556, + "loss": 1.4736, + "step": 1386 + }, + { + "epoch": 0.1258392306296498, + "grad_norm": 0.20282453286718213, + "learning_rate": 0.0009761195117895462, + "loss": 1.4891, + "step": 1387 + }, + { + "epoch": 0.1259299582652876, + "grad_norm": 0.21791273957701135, + "learning_rate": 0.0009760746265468468, + "loss": 
1.4641, + "step": 1388 + }, + { + "epoch": 0.1260206859009254, + "grad_norm": 0.20574978761285545, + "learning_rate": 0.0009760297001950334, + "loss": 1.4749, + "step": 1389 + }, + { + "epoch": 0.12611141353656324, + "grad_norm": 0.20244047325417225, + "learning_rate": 0.0009759847327379849, + "loss": 1.4806, + "step": 1390 + }, + { + "epoch": 0.12620214117220105, + "grad_norm": 0.22422917243068205, + "learning_rate": 0.0009759397241795849, + "loss": 1.52, + "step": 1391 + }, + { + "epoch": 0.12629286880783885, + "grad_norm": 0.21149683920818854, + "learning_rate": 0.0009758946745237194, + "loss": 1.5297, + "step": 1392 + }, + { + "epoch": 0.1263835964434767, + "grad_norm": 0.2092810449022671, + "learning_rate": 0.0009758495837742787, + "loss": 1.4996, + "step": 1393 + }, + { + "epoch": 0.1264743240791145, + "grad_norm": 0.2092999499538956, + "learning_rate": 0.0009758044519351562, + "loss": 1.513, + "step": 1394 + }, + { + "epoch": 0.12656505171475232, + "grad_norm": 0.21298274273417492, + "learning_rate": 0.0009757592790102492, + "loss": 1.4988, + "step": 1395 + }, + { + "epoch": 0.12665577935039013, + "grad_norm": 0.20343612452969573, + "learning_rate": 0.0009757140650034584, + "loss": 1.5266, + "step": 1396 + }, + { + "epoch": 0.12674650698602793, + "grad_norm": 0.2061730079670881, + "learning_rate": 0.0009756688099186878, + "loss": 1.5252, + "step": 1397 + }, + { + "epoch": 0.12683723462166577, + "grad_norm": 0.2087420284823541, + "learning_rate": 0.0009756235137598457, + "loss": 1.5228, + "step": 1398 + }, + { + "epoch": 0.12692796225730357, + "grad_norm": 0.2209087968833071, + "learning_rate": 0.0009755781765308428, + "loss": 1.4962, + "step": 1399 + }, + { + "epoch": 0.12701868989294138, + "grad_norm": 0.21379101840346854, + "learning_rate": 0.0009755327982355944, + "loss": 1.4363, + "step": 1400 + }, + { + "epoch": 0.1271094175285792, + "grad_norm": 0.23822492242510165, + "learning_rate": 0.0009754873788780186, + "loss": 1.4845, + "step": 1401 + }, + { + "epoch": 0.12720014516421702, + "grad_norm": 0.2075412835611438, + "learning_rate": 0.0009754419184620378, + "loss": 1.4719, + "step": 1402 + }, + { + "epoch": 0.12729087279985485, + "grad_norm": 0.2156195444382119, + "learning_rate": 0.000975396416991577, + "loss": 1.468, + "step": 1403 + }, + { + "epoch": 0.12738160043549265, + "grad_norm": 0.23711806442828384, + "learning_rate": 0.0009753508744705657, + "loss": 1.5285, + "step": 1404 + }, + { + "epoch": 0.12747232807113046, + "grad_norm": 0.2093625074062136, + "learning_rate": 0.0009753052909029363, + "loss": 1.4962, + "step": 1405 + }, + { + "epoch": 0.1275630557067683, + "grad_norm": 0.23659251613811652, + "learning_rate": 0.0009752596662926249, + "loss": 1.5119, + "step": 1406 + }, + { + "epoch": 0.1276537833424061, + "grad_norm": 0.19831955925182312, + "learning_rate": 0.0009752140006435714, + "loss": 1.4881, + "step": 1407 + }, + { + "epoch": 0.1277445109780439, + "grad_norm": 0.1888074023869569, + "learning_rate": 0.000975168293959719, + "loss": 1.5121, + "step": 1408 + }, + { + "epoch": 0.12783523861368173, + "grad_norm": 0.20483146920536172, + "learning_rate": 0.0009751225462450142, + "loss": 1.48, + "step": 1409 + }, + { + "epoch": 0.12792596624931954, + "grad_norm": 0.19248141549728343, + "learning_rate": 0.0009750767575034075, + "loss": 1.5285, + "step": 1410 + }, + { + "epoch": 0.12801669388495734, + "grad_norm": 0.1914463219925047, + "learning_rate": 0.0009750309277388529, + "loss": 1.5233, + "step": 1411 + }, + { + "epoch": 0.12810742152059518, + "grad_norm": 
0.20580795818469502, + "learning_rate": 0.0009749850569553076, + "loss": 1.4896, + "step": 1412 + }, + { + "epoch": 0.12819814915623298, + "grad_norm": 0.2080395521280534, + "learning_rate": 0.0009749391451567325, + "loss": 1.5064, + "step": 1413 + }, + { + "epoch": 0.12828887679187082, + "grad_norm": 0.21987321761655923, + "learning_rate": 0.0009748931923470923, + "loss": 1.5032, + "step": 1414 + }, + { + "epoch": 0.12837960442750862, + "grad_norm": 0.2221649066373174, + "learning_rate": 0.0009748471985303551, + "loss": 1.4936, + "step": 1415 + }, + { + "epoch": 0.12847033206314643, + "grad_norm": 0.19543362137076828, + "learning_rate": 0.0009748011637104921, + "loss": 1.4722, + "step": 1416 + }, + { + "epoch": 0.12856105969878426, + "grad_norm": 0.1938612694868798, + "learning_rate": 0.0009747550878914788, + "loss": 1.5063, + "step": 1417 + }, + { + "epoch": 0.12865178733442206, + "grad_norm": 0.19739772966306984, + "learning_rate": 0.0009747089710772936, + "loss": 1.5276, + "step": 1418 + }, + { + "epoch": 0.12874251497005987, + "grad_norm": 0.19695650457108455, + "learning_rate": 0.0009746628132719188, + "loss": 1.5097, + "step": 1419 + }, + { + "epoch": 0.1288332426056977, + "grad_norm": 0.20203698152495628, + "learning_rate": 0.0009746166144793401, + "loss": 1.5222, + "step": 1420 + }, + { + "epoch": 0.1289239702413355, + "grad_norm": 0.20605226713360217, + "learning_rate": 0.0009745703747035469, + "loss": 1.5332, + "step": 1421 + }, + { + "epoch": 0.12901469787697334, + "grad_norm": 0.20462736205378157, + "learning_rate": 0.0009745240939485318, + "loss": 1.4954, + "step": 1422 + }, + { + "epoch": 0.12910542551261114, + "grad_norm": 0.1935526437825326, + "learning_rate": 0.0009744777722182912, + "loss": 1.4911, + "step": 1423 + }, + { + "epoch": 0.12919615314824895, + "grad_norm": 0.19939573780552355, + "learning_rate": 0.0009744314095168252, + "loss": 1.5022, + "step": 1424 + }, + { + "epoch": 0.12928688078388678, + "grad_norm": 0.1936090023956459, + "learning_rate": 0.0009743850058481369, + "loss": 1.5191, + "step": 1425 + }, + { + "epoch": 0.1293776084195246, + "grad_norm": 0.19255203218956216, + "learning_rate": 0.0009743385612162335, + "loss": 1.4954, + "step": 1426 + }, + { + "epoch": 0.1294683360551624, + "grad_norm": 0.19455264991550522, + "learning_rate": 0.0009742920756251255, + "loss": 1.5025, + "step": 1427 + }, + { + "epoch": 0.12955906369080022, + "grad_norm": 0.20404231209860996, + "learning_rate": 0.0009742455490788267, + "loss": 1.4936, + "step": 1428 + }, + { + "epoch": 0.12964979132643803, + "grad_norm": 0.22319332535680325, + "learning_rate": 0.0009741989815813551, + "loss": 1.466, + "step": 1429 + }, + { + "epoch": 0.12974051896207583, + "grad_norm": 0.20971031105065585, + "learning_rate": 0.0009741523731367312, + "loss": 1.5079, + "step": 1430 + }, + { + "epoch": 0.12983124659771367, + "grad_norm": 0.22957390555306634, + "learning_rate": 0.0009741057237489802, + "loss": 1.5006, + "step": 1431 + }, + { + "epoch": 0.12992197423335147, + "grad_norm": 0.22405923568633765, + "learning_rate": 0.00097405903342213, + "loss": 1.4788, + "step": 1432 + }, + { + "epoch": 0.1300127018689893, + "grad_norm": 0.21085960256661393, + "learning_rate": 0.0009740123021602126, + "loss": 1.4703, + "step": 1433 + }, + { + "epoch": 0.1301034295046271, + "grad_norm": 0.24619151574594084, + "learning_rate": 0.0009739655299672628, + "loss": 1.4928, + "step": 1434 + }, + { + "epoch": 0.13019415714026492, + "grad_norm": 0.20343095666266978, + "learning_rate": 0.0009739187168473198, + "loss": 
1.4841, + "step": 1435 + }, + { + "epoch": 0.13028488477590275, + "grad_norm": 0.22336070252021056, + "learning_rate": 0.0009738718628044256, + "loss": 1.4683, + "step": 1436 + }, + { + "epoch": 0.13037561241154055, + "grad_norm": 0.20188840116084064, + "learning_rate": 0.0009738249678426263, + "loss": 1.5188, + "step": 1437 + }, + { + "epoch": 0.13046634004717836, + "grad_norm": 0.32310415046676383, + "learning_rate": 0.0009737780319659712, + "loss": 1.4814, + "step": 1438 + }, + { + "epoch": 0.1305570676828162, + "grad_norm": 0.24462657369573754, + "learning_rate": 0.0009737310551785133, + "loss": 1.4724, + "step": 1439 + }, + { + "epoch": 0.130647795318454, + "grad_norm": 0.2187719165879749, + "learning_rate": 0.0009736840374843088, + "loss": 1.5204, + "step": 1440 + }, + { + "epoch": 0.13073852295409183, + "grad_norm": 0.22018979875248298, + "learning_rate": 0.0009736369788874178, + "loss": 1.5258, + "step": 1441 + }, + { + "epoch": 0.13082925058972963, + "grad_norm": 0.22017405477679888, + "learning_rate": 0.000973589879391904, + "loss": 1.4488, + "step": 1442 + }, + { + "epoch": 0.13091997822536744, + "grad_norm": 0.2548526417418072, + "learning_rate": 0.0009735427390018343, + "loss": 1.4861, + "step": 1443 + }, + { + "epoch": 0.13101070586100527, + "grad_norm": 0.2356232672601463, + "learning_rate": 0.0009734955577212793, + "loss": 1.4875, + "step": 1444 + }, + { + "epoch": 0.13110143349664308, + "grad_norm": 0.27935839550867386, + "learning_rate": 0.0009734483355543131, + "loss": 1.5016, + "step": 1445 + }, + { + "epoch": 0.13119216113228088, + "grad_norm": 0.20775506571943014, + "learning_rate": 0.0009734010725050133, + "loss": 1.4889, + "step": 1446 + }, + { + "epoch": 0.13128288876791872, + "grad_norm": 0.2084373670900712, + "learning_rate": 0.0009733537685774612, + "loss": 1.4948, + "step": 1447 + }, + { + "epoch": 0.13137361640355652, + "grad_norm": 0.2114424245544846, + "learning_rate": 0.0009733064237757413, + "loss": 1.5268, + "step": 1448 + }, + { + "epoch": 0.13146434403919433, + "grad_norm": 0.20690885919638405, + "learning_rate": 0.000973259038103942, + "loss": 1.516, + "step": 1449 + }, + { + "epoch": 0.13155507167483216, + "grad_norm": 0.21553658787848062, + "learning_rate": 0.000973211611566155, + "loss": 1.4991, + "step": 1450 + }, + { + "epoch": 0.13164579931046996, + "grad_norm": 0.213915917094399, + "learning_rate": 0.0009731641441664756, + "loss": 1.4381, + "step": 1451 + }, + { + "epoch": 0.1317365269461078, + "grad_norm": 0.23725759593994084, + "learning_rate": 0.0009731166359090026, + "loss": 1.4965, + "step": 1452 + }, + { + "epoch": 0.1318272545817456, + "grad_norm": 0.2180808111050868, + "learning_rate": 0.0009730690867978385, + "loss": 1.4966, + "step": 1453 + }, + { + "epoch": 0.1319179822173834, + "grad_norm": 0.21454334372615252, + "learning_rate": 0.000973021496837089, + "loss": 1.4904, + "step": 1454 + }, + { + "epoch": 0.13200870985302124, + "grad_norm": 0.21153353899165142, + "learning_rate": 0.0009729738660308634, + "loss": 1.4749, + "step": 1455 + }, + { + "epoch": 0.13209943748865904, + "grad_norm": 0.22303751465787122, + "learning_rate": 0.0009729261943832748, + "loss": 1.4667, + "step": 1456 + }, + { + "epoch": 0.13219016512429685, + "grad_norm": 0.21314655232640253, + "learning_rate": 0.0009728784818984395, + "loss": 1.4793, + "step": 1457 + }, + { + "epoch": 0.13228089275993468, + "grad_norm": 0.23790208707840851, + "learning_rate": 0.0009728307285804778, + "loss": 1.4884, + "step": 1458 + }, + { + "epoch": 0.1323716203955725, + "grad_norm": 
0.20358844006883953, + "learning_rate": 0.0009727829344335129, + "loss": 1.5284, + "step": 1459 + }, + { + "epoch": 0.13246234803121032, + "grad_norm": 0.2228222806900039, + "learning_rate": 0.0009727350994616719, + "loss": 1.5346, + "step": 1460 + }, + { + "epoch": 0.13255307566684812, + "grad_norm": 0.29179434073552696, + "learning_rate": 0.0009726872236690856, + "loss": 1.4863, + "step": 1461 + }, + { + "epoch": 0.13264380330248593, + "grad_norm": 0.4204307555429662, + "learning_rate": 0.0009726393070598876, + "loss": 1.48, + "step": 1462 + }, + { + "epoch": 0.13273453093812376, + "grad_norm": 0.2108407040714185, + "learning_rate": 0.0009725913496382159, + "loss": 1.4767, + "step": 1463 + }, + { + "epoch": 0.13282525857376157, + "grad_norm": 0.21359995360245787, + "learning_rate": 0.0009725433514082115, + "loss": 1.5009, + "step": 1464 + }, + { + "epoch": 0.13291598620939937, + "grad_norm": 0.2242322151448446, + "learning_rate": 0.000972495312374019, + "loss": 1.4653, + "step": 1465 + }, + { + "epoch": 0.1330067138450372, + "grad_norm": 0.21289723868037647, + "learning_rate": 0.0009724472325397868, + "loss": 1.4819, + "step": 1466 + }, + { + "epoch": 0.133097441480675, + "grad_norm": 0.20754916192370432, + "learning_rate": 0.0009723991119096662, + "loss": 1.5416, + "step": 1467 + }, + { + "epoch": 0.13318816911631282, + "grad_norm": 0.22166430850580682, + "learning_rate": 0.0009723509504878129, + "loss": 1.5328, + "step": 1468 + }, + { + "epoch": 0.13327889675195065, + "grad_norm": 0.2492508315818585, + "learning_rate": 0.0009723027482783853, + "loss": 1.4704, + "step": 1469 + }, + { + "epoch": 0.13336962438758845, + "grad_norm": 0.20045420260713534, + "learning_rate": 0.0009722545052855457, + "loss": 1.5017, + "step": 1470 + }, + { + "epoch": 0.1334603520232263, + "grad_norm": 0.2937425322859459, + "learning_rate": 0.00097220622151346, + "loss": 1.4763, + "step": 1471 + }, + { + "epoch": 0.1335510796588641, + "grad_norm": 0.214029437105581, + "learning_rate": 0.0009721578969662976, + "loss": 1.4854, + "step": 1472 + }, + { + "epoch": 0.1336418072945019, + "grad_norm": 0.2124832104353197, + "learning_rate": 0.0009721095316482312, + "loss": 1.4913, + "step": 1473 + }, + { + "epoch": 0.13373253493013973, + "grad_norm": 0.2224801435393726, + "learning_rate": 0.0009720611255634369, + "loss": 1.5081, + "step": 1474 + }, + { + "epoch": 0.13382326256577753, + "grad_norm": 0.23573378510593188, + "learning_rate": 0.000972012678716095, + "loss": 1.5306, + "step": 1475 + }, + { + "epoch": 0.13391399020141534, + "grad_norm": 0.21132447226856627, + "learning_rate": 0.0009719641911103888, + "loss": 1.4865, + "step": 1476 + }, + { + "epoch": 0.13400471783705317, + "grad_norm": 0.21594143902620866, + "learning_rate": 0.000971915662750505, + "loss": 1.4866, + "step": 1477 + }, + { + "epoch": 0.13409544547269098, + "grad_norm": 0.21436291129508162, + "learning_rate": 0.0009718670936406344, + "loss": 1.502, + "step": 1478 + }, + { + "epoch": 0.1341861731083288, + "grad_norm": 0.21104573490627468, + "learning_rate": 0.0009718184837849705, + "loss": 1.5159, + "step": 1479 + }, + { + "epoch": 0.13427690074396662, + "grad_norm": 0.2270471488152515, + "learning_rate": 0.000971769833187711, + "loss": 1.4795, + "step": 1480 + }, + { + "epoch": 0.13436762837960442, + "grad_norm": 0.21520121052146637, + "learning_rate": 0.0009717211418530569, + "loss": 1.4953, + "step": 1481 + }, + { + "epoch": 0.13445835601524225, + "grad_norm": 0.21785140784436274, + "learning_rate": 0.0009716724097852127, + "loss": 1.4592, + 
"step": 1482 + }, + { + "epoch": 0.13454908365088006, + "grad_norm": 0.2298182757428026, + "learning_rate": 0.0009716236369883864, + "loss": 1.4788, + "step": 1483 + }, + { + "epoch": 0.13463981128651786, + "grad_norm": 0.2566851113721963, + "learning_rate": 0.0009715748234667896, + "loss": 1.476, + "step": 1484 + }, + { + "epoch": 0.1347305389221557, + "grad_norm": 0.25088584018196813, + "learning_rate": 0.0009715259692246372, + "loss": 1.4966, + "step": 1485 + }, + { + "epoch": 0.1348212665577935, + "grad_norm": 0.23794174095695278, + "learning_rate": 0.0009714770742661478, + "loss": 1.4522, + "step": 1486 + }, + { + "epoch": 0.1349119941934313, + "grad_norm": 0.2606406420053497, + "learning_rate": 0.0009714281385955437, + "loss": 1.5402, + "step": 1487 + }, + { + "epoch": 0.13500272182906914, + "grad_norm": 0.23622747349413334, + "learning_rate": 0.0009713791622170502, + "loss": 1.4717, + "step": 1488 + }, + { + "epoch": 0.13509344946470694, + "grad_norm": 0.22716534360494128, + "learning_rate": 0.0009713301451348968, + "loss": 1.506, + "step": 1489 + }, + { + "epoch": 0.13518417710034478, + "grad_norm": 0.22603467459851406, + "learning_rate": 0.0009712810873533158, + "loss": 1.4688, + "step": 1490 + }, + { + "epoch": 0.13527490473598258, + "grad_norm": 0.26830424512142437, + "learning_rate": 0.0009712319888765433, + "loss": 1.5038, + "step": 1491 + }, + { + "epoch": 0.1353656323716204, + "grad_norm": 0.23294735561958296, + "learning_rate": 0.0009711828497088192, + "loss": 1.4834, + "step": 1492 + }, + { + "epoch": 0.13545636000725822, + "grad_norm": 0.2120265160980375, + "learning_rate": 0.0009711336698543867, + "loss": 1.4992, + "step": 1493 + }, + { + "epoch": 0.13554708764289602, + "grad_norm": 0.22044363833822453, + "learning_rate": 0.0009710844493174922, + "loss": 1.5004, + "step": 1494 + }, + { + "epoch": 0.13563781527853383, + "grad_norm": 0.22696545597524442, + "learning_rate": 0.0009710351881023861, + "loss": 1.4919, + "step": 1495 + }, + { + "epoch": 0.13572854291417166, + "grad_norm": 0.22633597073173178, + "learning_rate": 0.0009709858862133221, + "loss": 1.4877, + "step": 1496 + }, + { + "epoch": 0.13581927054980947, + "grad_norm": 0.2564014387793154, + "learning_rate": 0.0009709365436545574, + "loss": 1.5151, + "step": 1497 + }, + { + "epoch": 0.1359099981854473, + "grad_norm": 0.3216058632529646, + "learning_rate": 0.0009708871604303528, + "loss": 1.4867, + "step": 1498 + }, + { + "epoch": 0.1360007258210851, + "grad_norm": 0.22872807662453268, + "learning_rate": 0.0009708377365449726, + "loss": 1.5069, + "step": 1499 + }, + { + "epoch": 0.1360914534567229, + "grad_norm": 0.21724655275118812, + "learning_rate": 0.0009707882720026841, + "loss": 1.4805, + "step": 1500 + }, + { + "epoch": 0.13618218109236074, + "grad_norm": 0.19327621459517239, + "learning_rate": 0.0009707387668077592, + "loss": 1.4805, + "step": 1501 + }, + { + "epoch": 0.13627290872799855, + "grad_norm": 0.22101328513931412, + "learning_rate": 0.0009706892209644725, + "loss": 1.5168, + "step": 1502 + }, + { + "epoch": 0.13636363636363635, + "grad_norm": 0.21591642440574765, + "learning_rate": 0.0009706396344771021, + "loss": 1.4957, + "step": 1503 + }, + { + "epoch": 0.1364543639992742, + "grad_norm": 0.19655825548726868, + "learning_rate": 0.0009705900073499296, + "loss": 1.5073, + "step": 1504 + }, + { + "epoch": 0.136545091634912, + "grad_norm": 0.1939293851015894, + "learning_rate": 0.000970540339587241, + "loss": 1.4856, + "step": 1505 + }, + { + "epoch": 0.1366358192705498, + "grad_norm": 
0.189038368160877, + "learning_rate": 0.0009704906311933246, + "loss": 1.5123, + "step": 1506 + }, + { + "epoch": 0.13672654690618763, + "grad_norm": 0.20751284792037483, + "learning_rate": 0.0009704408821724728, + "loss": 1.5056, + "step": 1507 + }, + { + "epoch": 0.13681727454182543, + "grad_norm": 0.19335058885332476, + "learning_rate": 0.0009703910925289815, + "loss": 1.5087, + "step": 1508 + }, + { + "epoch": 0.13690800217746327, + "grad_norm": 0.19687585604945368, + "learning_rate": 0.00097034126226715, + "loss": 1.4987, + "step": 1509 + }, + { + "epoch": 0.13699872981310107, + "grad_norm": 0.1992702748042019, + "learning_rate": 0.0009702913913912812, + "loss": 1.4957, + "step": 1510 + }, + { + "epoch": 0.13708945744873888, + "grad_norm": 0.20630189818364725, + "learning_rate": 0.0009702414799056815, + "loss": 1.507, + "step": 1511 + }, + { + "epoch": 0.1371801850843767, + "grad_norm": 0.2161601274557326, + "learning_rate": 0.0009701915278146607, + "loss": 1.4861, + "step": 1512 + }, + { + "epoch": 0.13727091272001452, + "grad_norm": 0.21740237132043075, + "learning_rate": 0.0009701415351225322, + "loss": 1.4435, + "step": 1513 + }, + { + "epoch": 0.13736164035565232, + "grad_norm": 0.20681604317177277, + "learning_rate": 0.0009700915018336127, + "loss": 1.4531, + "step": 1514 + }, + { + "epoch": 0.13745236799129015, + "grad_norm": 0.23849249196100603, + "learning_rate": 0.000970041427952223, + "loss": 1.494, + "step": 1515 + }, + { + "epoch": 0.13754309562692796, + "grad_norm": 0.22586086139376688, + "learning_rate": 0.0009699913134826865, + "loss": 1.5286, + "step": 1516 + }, + { + "epoch": 0.1376338232625658, + "grad_norm": 0.21234528981816117, + "learning_rate": 0.0009699411584293308, + "loss": 1.5136, + "step": 1517 + }, + { + "epoch": 0.1377245508982036, + "grad_norm": 0.22797656013631967, + "learning_rate": 0.0009698909627964869, + "loss": 1.4898, + "step": 1518 + }, + { + "epoch": 0.1378152785338414, + "grad_norm": 0.24819133545463318, + "learning_rate": 0.000969840726588489, + "loss": 1.4679, + "step": 1519 + }, + { + "epoch": 0.13790600616947923, + "grad_norm": 0.23940645884477685, + "learning_rate": 0.0009697904498096752, + "loss": 1.5294, + "step": 1520 + }, + { + "epoch": 0.13799673380511704, + "grad_norm": 0.22923472845513865, + "learning_rate": 0.0009697401324643869, + "loss": 1.525, + "step": 1521 + }, + { + "epoch": 0.13808746144075484, + "grad_norm": 0.21002066784086754, + "learning_rate": 0.0009696897745569688, + "loss": 1.4879, + "step": 1522 + }, + { + "epoch": 0.13817818907639268, + "grad_norm": 0.21218932153403242, + "learning_rate": 0.0009696393760917696, + "loss": 1.4786, + "step": 1523 + }, + { + "epoch": 0.13826891671203048, + "grad_norm": 0.22686570924085503, + "learning_rate": 0.0009695889370731409, + "loss": 1.4716, + "step": 1524 + }, + { + "epoch": 0.1383596443476683, + "grad_norm": 0.21772589067907833, + "learning_rate": 0.0009695384575054382, + "loss": 1.4834, + "step": 1525 + }, + { + "epoch": 0.13845037198330612, + "grad_norm": 0.2718171583929527, + "learning_rate": 0.0009694879373930207, + "loss": 1.4832, + "step": 1526 + }, + { + "epoch": 0.13854109961894392, + "grad_norm": 0.2168511115503537, + "learning_rate": 0.0009694373767402504, + "loss": 1.4802, + "step": 1527 + }, + { + "epoch": 0.13863182725458176, + "grad_norm": 0.19439803073950382, + "learning_rate": 0.0009693867755514937, + "loss": 1.4886, + "step": 1528 + }, + { + "epoch": 0.13872255489021956, + "grad_norm": 0.31476938960084233, + "learning_rate": 0.0009693361338311195, + "loss": 
1.4783, + "step": 1529 + }, + { + "epoch": 0.13881328252585737, + "grad_norm": 0.2530194360298416, + "learning_rate": 0.0009692854515835011, + "loss": 1.5214, + "step": 1530 + }, + { + "epoch": 0.1389040101614952, + "grad_norm": 0.26060579567803077, + "learning_rate": 0.0009692347288130147, + "loss": 1.4874, + "step": 1531 + }, + { + "epoch": 0.138994737797133, + "grad_norm": 0.21302360803107653, + "learning_rate": 0.0009691839655240405, + "loss": 1.5015, + "step": 1532 + }, + { + "epoch": 0.1390854654327708, + "grad_norm": 0.29001486187437353, + "learning_rate": 0.0009691331617209616, + "loss": 1.4945, + "step": 1533 + }, + { + "epoch": 0.13917619306840864, + "grad_norm": 0.825330529960664, + "learning_rate": 0.000969082317408165, + "loss": 1.5242, + "step": 1534 + }, + { + "epoch": 0.13926692070404645, + "grad_norm": 0.2309051973827533, + "learning_rate": 0.0009690314325900411, + "loss": 1.4736, + "step": 1535 + }, + { + "epoch": 0.13935764833968428, + "grad_norm": 0.22989840240152967, + "learning_rate": 0.000968980507270984, + "loss": 1.506, + "step": 1536 + }, + { + "epoch": 0.1394483759753221, + "grad_norm": 0.22731942868216573, + "learning_rate": 0.0009689295414553909, + "loss": 1.478, + "step": 1537 + }, + { + "epoch": 0.1395391036109599, + "grad_norm": 0.20323435622848104, + "learning_rate": 0.0009688785351476629, + "loss": 1.4827, + "step": 1538 + }, + { + "epoch": 0.13962983124659772, + "grad_norm": 0.25877249688528364, + "learning_rate": 0.0009688274883522042, + "loss": 1.5308, + "step": 1539 + }, + { + "epoch": 0.13972055888223553, + "grad_norm": 0.22171423109731522, + "learning_rate": 0.0009687764010734228, + "loss": 1.4698, + "step": 1540 + }, + { + "epoch": 0.13981128651787333, + "grad_norm": 0.2250638612213158, + "learning_rate": 0.0009687252733157301, + "loss": 1.5001, + "step": 1541 + }, + { + "epoch": 0.13990201415351117, + "grad_norm": 0.21450238094917692, + "learning_rate": 0.0009686741050835408, + "loss": 1.5387, + "step": 1542 + }, + { + "epoch": 0.13999274178914897, + "grad_norm": 0.23036135679494216, + "learning_rate": 0.0009686228963812736, + "loss": 1.5201, + "step": 1543 + }, + { + "epoch": 0.14008346942478678, + "grad_norm": 0.2323868568682578, + "learning_rate": 0.0009685716472133503, + "loss": 1.4764, + "step": 1544 + }, + { + "epoch": 0.1401741970604246, + "grad_norm": 0.21747963735057965, + "learning_rate": 0.0009685203575841961, + "loss": 1.5068, + "step": 1545 + }, + { + "epoch": 0.14026492469606242, + "grad_norm": 0.21240633610679713, + "learning_rate": 0.0009684690274982399, + "loss": 1.4982, + "step": 1546 + }, + { + "epoch": 0.14035565233170025, + "grad_norm": 0.2234461797545051, + "learning_rate": 0.0009684176569599143, + "loss": 1.5364, + "step": 1547 + }, + { + "epoch": 0.14044637996733805, + "grad_norm": 0.2219944975038985, + "learning_rate": 0.0009683662459736549, + "loss": 1.499, + "step": 1548 + }, + { + "epoch": 0.14053710760297586, + "grad_norm": 0.22220974048187242, + "learning_rate": 0.000968314794543901, + "loss": 1.4807, + "step": 1549 + }, + { + "epoch": 0.1406278352386137, + "grad_norm": 0.2129653552470787, + "learning_rate": 0.0009682633026750957, + "loss": 1.4961, + "step": 1550 + }, + { + "epoch": 0.1407185628742515, + "grad_norm": 0.23134543060526244, + "learning_rate": 0.0009682117703716852, + "loss": 1.4696, + "step": 1551 + }, + { + "epoch": 0.1408092905098893, + "grad_norm": 0.23273616522432225, + "learning_rate": 0.0009681601976381193, + "loss": 1.4798, + "step": 1552 + }, + { + "epoch": 0.14090001814552713, + "grad_norm": 
0.20313183601771098, + "learning_rate": 0.0009681085844788515, + "loss": 1.5092, + "step": 1553 + }, + { + "epoch": 0.14099074578116494, + "grad_norm": 0.21461795078845597, + "learning_rate": 0.0009680569308983382, + "loss": 1.5142, + "step": 1554 + }, + { + "epoch": 0.14108147341680277, + "grad_norm": 0.2160991079297226, + "learning_rate": 0.00096800523690104, + "loss": 1.475, + "step": 1555 + }, + { + "epoch": 0.14117220105244058, + "grad_norm": 0.22667694011988973, + "learning_rate": 0.0009679535024914207, + "loss": 1.5266, + "step": 1556 + }, + { + "epoch": 0.14126292868807838, + "grad_norm": 0.2092473635061854, + "learning_rate": 0.0009679017276739474, + "loss": 1.5135, + "step": 1557 + }, + { + "epoch": 0.14135365632371621, + "grad_norm": 0.2234538777481957, + "learning_rate": 0.000967849912453091, + "loss": 1.4916, + "step": 1558 + }, + { + "epoch": 0.14144438395935402, + "grad_norm": 0.24058028710995802, + "learning_rate": 0.0009677980568333257, + "loss": 1.5099, + "step": 1559 + }, + { + "epoch": 0.14153511159499182, + "grad_norm": 0.2143084361150511, + "learning_rate": 0.0009677461608191292, + "loss": 1.4709, + "step": 1560 + }, + { + "epoch": 0.14162583923062966, + "grad_norm": 0.20902491398608053, + "learning_rate": 0.0009676942244149828, + "loss": 1.4758, + "step": 1561 + }, + { + "epoch": 0.14171656686626746, + "grad_norm": 0.22008352521064875, + "learning_rate": 0.0009676422476253713, + "loss": 1.5, + "step": 1562 + }, + { + "epoch": 0.14180729450190527, + "grad_norm": 0.21459336590108538, + "learning_rate": 0.0009675902304547826, + "loss": 1.5363, + "step": 1563 + }, + { + "epoch": 0.1418980221375431, + "grad_norm": 0.24956922699661505, + "learning_rate": 0.0009675381729077087, + "loss": 1.4963, + "step": 1564 + }, + { + "epoch": 0.1419887497731809, + "grad_norm": 0.2207112189008068, + "learning_rate": 0.0009674860749886446, + "loss": 1.4961, + "step": 1565 + }, + { + "epoch": 0.14207947740881874, + "grad_norm": 0.23375784216278456, + "learning_rate": 0.000967433936702089, + "loss": 1.4707, + "step": 1566 + }, + { + "epoch": 0.14217020504445654, + "grad_norm": 0.2172464118688708, + "learning_rate": 0.0009673817580525441, + "loss": 1.4952, + "step": 1567 + }, + { + "epoch": 0.14226093268009435, + "grad_norm": 0.22421909586173747, + "learning_rate": 0.0009673295390445156, + "loss": 1.4834, + "step": 1568 + }, + { + "epoch": 0.14235166031573218, + "grad_norm": 0.2196992381308549, + "learning_rate": 0.0009672772796825124, + "loss": 1.4991, + "step": 1569 + }, + { + "epoch": 0.14244238795137, + "grad_norm": 0.2145199116232832, + "learning_rate": 0.0009672249799710474, + "loss": 1.5107, + "step": 1570 + }, + { + "epoch": 0.1425331155870078, + "grad_norm": 0.20659704982454205, + "learning_rate": 0.0009671726399146363, + "loss": 1.4746, + "step": 1571 + }, + { + "epoch": 0.14262384322264562, + "grad_norm": 0.21417659039733117, + "learning_rate": 0.0009671202595177991, + "loss": 1.5085, + "step": 1572 + }, + { + "epoch": 0.14271457085828343, + "grad_norm": 0.20979570789069574, + "learning_rate": 0.0009670678387850585, + "loss": 1.4976, + "step": 1573 + }, + { + "epoch": 0.14280529849392126, + "grad_norm": 0.2182839347995015, + "learning_rate": 0.0009670153777209413, + "loss": 1.4702, + "step": 1574 + }, + { + "epoch": 0.14289602612955907, + "grad_norm": 0.2422318195391616, + "learning_rate": 0.0009669628763299774, + "loss": 1.513, + "step": 1575 + }, + { + "epoch": 0.14298675376519687, + "grad_norm": 0.22596969434471115, + "learning_rate": 0.0009669103346167002, + "loss": 1.5126, + 
"step": 1576 + }, + { + "epoch": 0.1430774814008347, + "grad_norm": 0.21978424586159778, + "learning_rate": 0.0009668577525856468, + "loss": 1.4771, + "step": 1577 + }, + { + "epoch": 0.1431682090364725, + "grad_norm": 0.21533107037107344, + "learning_rate": 0.0009668051302413577, + "loss": 1.5237, + "step": 1578 + }, + { + "epoch": 0.14325893667211032, + "grad_norm": 0.2150696448308193, + "learning_rate": 0.0009667524675883767, + "loss": 1.4793, + "step": 1579 + }, + { + "epoch": 0.14334966430774815, + "grad_norm": 0.21578700989908245, + "learning_rate": 0.0009666997646312514, + "loss": 1.494, + "step": 1580 + }, + { + "epoch": 0.14344039194338595, + "grad_norm": 0.22882853183750157, + "learning_rate": 0.0009666470213745327, + "loss": 1.4988, + "step": 1581 + }, + { + "epoch": 0.14353111957902376, + "grad_norm": 0.2058390502574839, + "learning_rate": 0.000966594237822775, + "loss": 1.4799, + "step": 1582 + }, + { + "epoch": 0.1436218472146616, + "grad_norm": 0.24062593440880264, + "learning_rate": 0.000966541413980536, + "loss": 1.4857, + "step": 1583 + }, + { + "epoch": 0.1437125748502994, + "grad_norm": 0.20986770814030128, + "learning_rate": 0.000966488549852377, + "loss": 1.5238, + "step": 1584 + }, + { + "epoch": 0.14380330248593723, + "grad_norm": 0.19378429731276842, + "learning_rate": 0.0009664356454428631, + "loss": 1.4512, + "step": 1585 + }, + { + "epoch": 0.14389403012157503, + "grad_norm": 0.19999770590204569, + "learning_rate": 0.0009663827007565624, + "loss": 1.503, + "step": 1586 + }, + { + "epoch": 0.14398475775721284, + "grad_norm": 0.19619507770850192, + "learning_rate": 0.0009663297157980468, + "loss": 1.477, + "step": 1587 + }, + { + "epoch": 0.14407548539285067, + "grad_norm": 0.20444030490280005, + "learning_rate": 0.0009662766905718916, + "loss": 1.5445, + "step": 1588 + }, + { + "epoch": 0.14416621302848848, + "grad_norm": 0.20494004621015444, + "learning_rate": 0.0009662236250826755, + "loss": 1.506, + "step": 1589 + }, + { + "epoch": 0.14425694066412628, + "grad_norm": 0.2115840419569785, + "learning_rate": 0.0009661705193349804, + "loss": 1.508, + "step": 1590 + }, + { + "epoch": 0.14434766829976411, + "grad_norm": 0.1943744197357427, + "learning_rate": 0.0009661173733333925, + "loss": 1.451, + "step": 1591 + }, + { + "epoch": 0.14443839593540192, + "grad_norm": 0.20039910967464616, + "learning_rate": 0.0009660641870825005, + "loss": 1.5057, + "step": 1592 + }, + { + "epoch": 0.14452912357103972, + "grad_norm": 0.24106750699256588, + "learning_rate": 0.0009660109605868975, + "loss": 1.5201, + "step": 1593 + }, + { + "epoch": 0.14461985120667756, + "grad_norm": 0.2180058539042007, + "learning_rate": 0.0009659576938511791, + "loss": 1.4778, + "step": 1594 + }, + { + "epoch": 0.14471057884231536, + "grad_norm": 0.20545055517306773, + "learning_rate": 0.0009659043868799454, + "loss": 1.4697, + "step": 1595 + }, + { + "epoch": 0.1448013064779532, + "grad_norm": 0.20842005136755526, + "learning_rate": 0.000965851039677799, + "loss": 1.4458, + "step": 1596 + }, + { + "epoch": 0.144892034113591, + "grad_norm": 0.19925474214983707, + "learning_rate": 0.0009657976522493468, + "loss": 1.5002, + "step": 1597 + }, + { + "epoch": 0.1449827617492288, + "grad_norm": 0.19910857305077256, + "learning_rate": 0.0009657442245991985, + "loss": 1.4653, + "step": 1598 + }, + { + "epoch": 0.14507348938486664, + "grad_norm": 0.23119079731956946, + "learning_rate": 0.000965690756731968, + "loss": 1.5127, + "step": 1599 + }, + { + "epoch": 0.14516421702050444, + "grad_norm": 
0.20948458221099123, + "learning_rate": 0.0009656372486522719, + "loss": 1.5421, + "step": 1600 + }, + { + "epoch": 0.14525494465614225, + "grad_norm": 0.19560683454830363, + "learning_rate": 0.0009655837003647307, + "loss": 1.4716, + "step": 1601 + }, + { + "epoch": 0.14534567229178008, + "grad_norm": 0.19419127890972085, + "learning_rate": 0.0009655301118739686, + "loss": 1.5179, + "step": 1602 + }, + { + "epoch": 0.1454363999274179, + "grad_norm": 0.20543105873616987, + "learning_rate": 0.0009654764831846126, + "loss": 1.5072, + "step": 1603 + }, + { + "epoch": 0.14552712756305572, + "grad_norm": 0.19241726749149518, + "learning_rate": 0.0009654228143012936, + "loss": 1.5136, + "step": 1604 + }, + { + "epoch": 0.14561785519869352, + "grad_norm": 0.19715140211212284, + "learning_rate": 0.000965369105228646, + "loss": 1.4751, + "step": 1605 + }, + { + "epoch": 0.14570858283433133, + "grad_norm": 0.2140727780330096, + "learning_rate": 0.0009653153559713076, + "loss": 1.495, + "step": 1606 + }, + { + "epoch": 0.14579931046996916, + "grad_norm": 0.21123660831030325, + "learning_rate": 0.0009652615665339196, + "loss": 1.5162, + "step": 1607 + }, + { + "epoch": 0.14589003810560697, + "grad_norm": 0.25116090024837295, + "learning_rate": 0.0009652077369211267, + "loss": 1.4632, + "step": 1608 + }, + { + "epoch": 0.14598076574124477, + "grad_norm": 0.22045777343863066, + "learning_rate": 0.0009651538671375774, + "loss": 1.5074, + "step": 1609 + }, + { + "epoch": 0.1460714933768826, + "grad_norm": 0.2089016866143971, + "learning_rate": 0.0009650999571879229, + "loss": 1.4949, + "step": 1610 + }, + { + "epoch": 0.1461622210125204, + "grad_norm": 0.19711792434289133, + "learning_rate": 0.0009650460070768185, + "loss": 1.4799, + "step": 1611 + }, + { + "epoch": 0.14625294864815822, + "grad_norm": 0.20102901542652407, + "learning_rate": 0.000964992016808923, + "loss": 1.4585, + "step": 1612 + }, + { + "epoch": 0.14634367628379605, + "grad_norm": 1.2325486364747293, + "learning_rate": 0.0009649379863888983, + "loss": 1.4865, + "step": 1613 + }, + { + "epoch": 0.14643440391943385, + "grad_norm": 0.19360209287164395, + "learning_rate": 0.00096488391582141, + "loss": 1.4807, + "step": 1614 + }, + { + "epoch": 0.14652513155507169, + "grad_norm": 0.20888935829978963, + "learning_rate": 0.0009648298051111268, + "loss": 1.4942, + "step": 1615 + }, + { + "epoch": 0.1466158591907095, + "grad_norm": 0.2328104215947702, + "learning_rate": 0.0009647756542627218, + "loss": 1.5252, + "step": 1616 + }, + { + "epoch": 0.1467065868263473, + "grad_norm": 0.20764582987030275, + "learning_rate": 0.0009647214632808702, + "loss": 1.4966, + "step": 1617 + }, + { + "epoch": 0.14679731446198513, + "grad_norm": 0.27847763912312523, + "learning_rate": 0.0009646672321702519, + "loss": 1.4891, + "step": 1618 + }, + { + "epoch": 0.14688804209762293, + "grad_norm": 0.2613477287855903, + "learning_rate": 0.0009646129609355497, + "loss": 1.4606, + "step": 1619 + }, + { + "epoch": 0.14697876973326074, + "grad_norm": 0.221681762179266, + "learning_rate": 0.0009645586495814497, + "loss": 1.5048, + "step": 1620 + }, + { + "epoch": 0.14706949736889857, + "grad_norm": 0.21192464446519665, + "learning_rate": 0.0009645042981126419, + "loss": 1.4944, + "step": 1621 + }, + { + "epoch": 0.14716022500453638, + "grad_norm": 0.22232818291173778, + "learning_rate": 0.0009644499065338195, + "loss": 1.4766, + "step": 1622 + }, + { + "epoch": 0.1472509526401742, + "grad_norm": 0.2649569215966306, + "learning_rate": 0.0009643954748496793, + "loss": 
1.5207, + "step": 1623 + }, + { + "epoch": 0.14734168027581201, + "grad_norm": 0.21928536685659053, + "learning_rate": 0.0009643410030649212, + "loss": 1.5, + "step": 1624 + }, + { + "epoch": 0.14743240791144982, + "grad_norm": 0.2436212661771552, + "learning_rate": 0.0009642864911842493, + "loss": 1.4711, + "step": 1625 + }, + { + "epoch": 0.14752313554708765, + "grad_norm": 0.23253806679647016, + "learning_rate": 0.0009642319392123702, + "loss": 1.5132, + "step": 1626 + }, + { + "epoch": 0.14761386318272546, + "grad_norm": 0.21652788386948432, + "learning_rate": 0.0009641773471539949, + "loss": 1.5034, + "step": 1627 + }, + { + "epoch": 0.14770459081836326, + "grad_norm": 0.2294454091124724, + "learning_rate": 0.0009641227150138372, + "loss": 1.4897, + "step": 1628 + }, + { + "epoch": 0.1477953184540011, + "grad_norm": 0.21781242376307436, + "learning_rate": 0.0009640680427966147, + "loss": 1.4898, + "step": 1629 + }, + { + "epoch": 0.1478860460896389, + "grad_norm": 0.23355485623686068, + "learning_rate": 0.0009640133305070482, + "loss": 1.5289, + "step": 1630 + }, + { + "epoch": 0.1479767737252767, + "grad_norm": 0.22717114807521618, + "learning_rate": 0.0009639585781498623, + "loss": 1.4874, + "step": 1631 + }, + { + "epoch": 0.14806750136091454, + "grad_norm": 0.25765241966516955, + "learning_rate": 0.0009639037857297847, + "loss": 1.5027, + "step": 1632 + }, + { + "epoch": 0.14815822899655234, + "grad_norm": 0.21508884294477404, + "learning_rate": 0.000963848953251547, + "loss": 1.532, + "step": 1633 + }, + { + "epoch": 0.14824895663219018, + "grad_norm": 0.21591282527901998, + "learning_rate": 0.0009637940807198837, + "loss": 1.4864, + "step": 1634 + }, + { + "epoch": 0.14833968426782798, + "grad_norm": 0.21004417924684549, + "learning_rate": 0.0009637391681395334, + "loss": 1.478, + "step": 1635 + }, + { + "epoch": 0.1484304119034658, + "grad_norm": 0.2454377055280665, + "learning_rate": 0.0009636842155152372, + "loss": 1.5013, + "step": 1636 + }, + { + "epoch": 0.14852113953910362, + "grad_norm": 0.25993224359242145, + "learning_rate": 0.0009636292228517409, + "loss": 1.4623, + "step": 1637 + }, + { + "epoch": 0.14861186717474142, + "grad_norm": 0.2592409533991936, + "learning_rate": 0.0009635741901537929, + "loss": 1.5104, + "step": 1638 + }, + { + "epoch": 0.14870259481037923, + "grad_norm": 0.384431315956382, + "learning_rate": 0.0009635191174261452, + "loss": 1.5218, + "step": 1639 + }, + { + "epoch": 0.14879332244601706, + "grad_norm": 0.2403730791583455, + "learning_rate": 0.0009634640046735533, + "loss": 1.5466, + "step": 1640 + }, + { + "epoch": 0.14888405008165487, + "grad_norm": 0.235197474728611, + "learning_rate": 0.0009634088519007764, + "loss": 1.4683, + "step": 1641 + }, + { + "epoch": 0.1489747777172927, + "grad_norm": 0.23748593416929917, + "learning_rate": 0.0009633536591125768, + "loss": 1.5209, + "step": 1642 + }, + { + "epoch": 0.1490655053529305, + "grad_norm": 0.21246911426173365, + "learning_rate": 0.0009632984263137205, + "loss": 1.4977, + "step": 1643 + }, + { + "epoch": 0.1491562329885683, + "grad_norm": 0.2141506833734812, + "learning_rate": 0.0009632431535089767, + "loss": 1.4811, + "step": 1644 + }, + { + "epoch": 0.14924696062420614, + "grad_norm": 0.20100835787109797, + "learning_rate": 0.0009631878407031183, + "loss": 1.4991, + "step": 1645 + }, + { + "epoch": 0.14933768825984395, + "grad_norm": 0.20148080910969998, + "learning_rate": 0.0009631324879009218, + "loss": 1.4883, + "step": 1646 + }, + { + "epoch": 0.14942841589548175, + "grad_norm": 
0.20657914746210532, + "learning_rate": 0.0009630770951071666, + "loss": 1.4664, + "step": 1647 + }, + { + "epoch": 0.14951914353111959, + "grad_norm": 0.20711840213938618, + "learning_rate": 0.0009630216623266359, + "loss": 1.4982, + "step": 1648 + }, + { + "epoch": 0.1496098711667574, + "grad_norm": 0.20381710365863714, + "learning_rate": 0.0009629661895641165, + "loss": 1.4892, + "step": 1649 + }, + { + "epoch": 0.1497005988023952, + "grad_norm": 0.18691368842020217, + "learning_rate": 0.0009629106768243983, + "loss": 1.4512, + "step": 1650 + }, + { + "epoch": 0.14979132643803303, + "grad_norm": 0.22627393392039807, + "learning_rate": 0.0009628551241122749, + "loss": 1.469, + "step": 1651 + }, + { + "epoch": 0.14988205407367083, + "grad_norm": 0.19807925267012702, + "learning_rate": 0.0009627995314325436, + "loss": 1.512, + "step": 1652 + }, + { + "epoch": 0.14997278170930867, + "grad_norm": 0.19392626426005344, + "learning_rate": 0.0009627438987900044, + "loss": 1.4867, + "step": 1653 + }, + { + "epoch": 0.15006350934494647, + "grad_norm": 0.21435086461264916, + "learning_rate": 0.0009626882261894612, + "loss": 1.4419, + "step": 1654 + }, + { + "epoch": 0.15015423698058428, + "grad_norm": 0.22498115466606813, + "learning_rate": 0.0009626325136357216, + "loss": 1.5055, + "step": 1655 + }, + { + "epoch": 0.1502449646162221, + "grad_norm": 0.1999356335377401, + "learning_rate": 0.0009625767611335963, + "loss": 1.4483, + "step": 1656 + }, + { + "epoch": 0.15033569225185991, + "grad_norm": 0.2063290546610908, + "learning_rate": 0.0009625209686878993, + "loss": 1.528, + "step": 1657 + }, + { + "epoch": 0.15042641988749772, + "grad_norm": 0.21554568895124185, + "learning_rate": 0.0009624651363034487, + "loss": 1.521, + "step": 1658 + }, + { + "epoch": 0.15051714752313555, + "grad_norm": 0.21592350175696998, + "learning_rate": 0.0009624092639850654, + "loss": 1.4812, + "step": 1659 + }, + { + "epoch": 0.15060787515877336, + "grad_norm": 0.22821444630501633, + "learning_rate": 0.0009623533517375738, + "loss": 1.4881, + "step": 1660 + }, + { + "epoch": 0.1506986027944112, + "grad_norm": 0.21972048442660966, + "learning_rate": 0.0009622973995658024, + "loss": 1.481, + "step": 1661 + }, + { + "epoch": 0.150789330430049, + "grad_norm": 0.22407586960557263, + "learning_rate": 0.0009622414074745823, + "loss": 1.4691, + "step": 1662 + }, + { + "epoch": 0.1508800580656868, + "grad_norm": 0.49988435725477776, + "learning_rate": 0.0009621853754687485, + "loss": 1.5078, + "step": 1663 + }, + { + "epoch": 0.15097078570132463, + "grad_norm": 0.22272264136591302, + "learning_rate": 0.0009621293035531395, + "loss": 1.4912, + "step": 1664 + }, + { + "epoch": 0.15106151333696244, + "grad_norm": 0.2218963778346908, + "learning_rate": 0.0009620731917325968, + "loss": 1.4977, + "step": 1665 + }, + { + "epoch": 0.15115224097260024, + "grad_norm": 0.2620871836552729, + "learning_rate": 0.0009620170400119661, + "loss": 1.5119, + "step": 1666 + }, + { + "epoch": 0.15124296860823808, + "grad_norm": 0.23556152011621295, + "learning_rate": 0.000961960848396096, + "loss": 1.499, + "step": 1667 + }, + { + "epoch": 0.15133369624387588, + "grad_norm": 0.24884973055716367, + "learning_rate": 0.0009619046168898384, + "loss": 1.5089, + "step": 1668 + }, + { + "epoch": 0.1514244238795137, + "grad_norm": 0.2535726575694708, + "learning_rate": 0.0009618483454980491, + "loss": 1.4769, + "step": 1669 + }, + { + "epoch": 0.15151515151515152, + "grad_norm": 0.23775274712476566, + "learning_rate": 0.000961792034225587, + "loss": 
1.4755, + "step": 1670 + }, + { + "epoch": 0.15160587915078932, + "grad_norm": 0.22150571737186442, + "learning_rate": 0.0009617356830773148, + "loss": 1.5028, + "step": 1671 + }, + { + "epoch": 0.15169660678642716, + "grad_norm": 0.21535249638794185, + "learning_rate": 0.0009616792920580982, + "loss": 1.5361, + "step": 1672 + }, + { + "epoch": 0.15178733442206496, + "grad_norm": 0.2326413425012696, + "learning_rate": 0.0009616228611728069, + "loss": 1.4971, + "step": 1673 + }, + { + "epoch": 0.15187806205770277, + "grad_norm": 0.2696139779880897, + "learning_rate": 0.0009615663904263132, + "loss": 1.4988, + "step": 1674 + }, + { + "epoch": 0.1519687896933406, + "grad_norm": 0.2106940385492926, + "learning_rate": 0.0009615098798234938, + "loss": 1.4803, + "step": 1675 + }, + { + "epoch": 0.1520595173289784, + "grad_norm": 0.23131315168093583, + "learning_rate": 0.0009614533293692282, + "loss": 1.4618, + "step": 1676 + }, + { + "epoch": 0.1521502449646162, + "grad_norm": 0.21018201585894838, + "learning_rate": 0.0009613967390683998, + "loss": 1.4934, + "step": 1677 + }, + { + "epoch": 0.15224097260025404, + "grad_norm": 0.23040607332685692, + "learning_rate": 0.0009613401089258949, + "loss": 1.5047, + "step": 1678 + }, + { + "epoch": 0.15233170023589185, + "grad_norm": 0.22187518369647855, + "learning_rate": 0.0009612834389466034, + "loss": 1.5094, + "step": 1679 + }, + { + "epoch": 0.15242242787152968, + "grad_norm": 0.22011381430677593, + "learning_rate": 0.0009612267291354191, + "loss": 1.4923, + "step": 1680 + }, + { + "epoch": 0.15251315550716749, + "grad_norm": 0.20257865214397033, + "learning_rate": 0.0009611699794972389, + "loss": 1.4737, + "step": 1681 + }, + { + "epoch": 0.1526038831428053, + "grad_norm": 0.24085843814191962, + "learning_rate": 0.0009611131900369627, + "loss": 1.5088, + "step": 1682 + }, + { + "epoch": 0.15269461077844312, + "grad_norm": 0.2306133010046884, + "learning_rate": 0.0009610563607594948, + "loss": 1.5154, + "step": 1683 + }, + { + "epoch": 0.15278533841408093, + "grad_norm": 0.23938073981712166, + "learning_rate": 0.0009609994916697422, + "loss": 1.503, + "step": 1684 + }, + { + "epoch": 0.15287606604971873, + "grad_norm": 0.21343598630946356, + "learning_rate": 0.0009609425827726154, + "loss": 1.5022, + "step": 1685 + }, + { + "epoch": 0.15296679368535657, + "grad_norm": 0.20518149609991043, + "learning_rate": 0.0009608856340730288, + "loss": 1.498, + "step": 1686 + }, + { + "epoch": 0.15305752132099437, + "grad_norm": 0.2513721016322373, + "learning_rate": 0.0009608286455758996, + "loss": 1.488, + "step": 1687 + }, + { + "epoch": 0.15314824895663218, + "grad_norm": 0.2272277034135155, + "learning_rate": 0.0009607716172861492, + "loss": 1.4871, + "step": 1688 + }, + { + "epoch": 0.15323897659227, + "grad_norm": 0.19911247824231254, + "learning_rate": 0.0009607145492087015, + "loss": 1.4883, + "step": 1689 + }, + { + "epoch": 0.15332970422790781, + "grad_norm": 0.3320101251918071, + "learning_rate": 0.0009606574413484847, + "loss": 1.4886, + "step": 1690 + }, + { + "epoch": 0.15342043186354565, + "grad_norm": 0.2100053351635548, + "learning_rate": 0.0009606002937104299, + "loss": 1.4958, + "step": 1691 + }, + { + "epoch": 0.15351115949918345, + "grad_norm": 0.20541955348917748, + "learning_rate": 0.0009605431062994718, + "loss": 1.4828, + "step": 1692 + }, + { + "epoch": 0.15360188713482126, + "grad_norm": 0.2181970858396196, + "learning_rate": 0.0009604858791205487, + "loss": 1.4999, + "step": 1693 + }, + { + "epoch": 0.1536926147704591, + "grad_norm": 
0.19978728418554395, + "learning_rate": 0.000960428612178602, + "loss": 1.4873, + "step": 1694 + }, + { + "epoch": 0.1537833424060969, + "grad_norm": 0.2135707022325032, + "learning_rate": 0.0009603713054785768, + "loss": 1.5245, + "step": 1695 + }, + { + "epoch": 0.1538740700417347, + "grad_norm": 0.21681394197634307, + "learning_rate": 0.0009603139590254215, + "loss": 1.4712, + "step": 1696 + }, + { + "epoch": 0.15396479767737253, + "grad_norm": 0.21835975551720893, + "learning_rate": 0.000960256572824088, + "loss": 1.4425, + "step": 1697 + }, + { + "epoch": 0.15405552531301034, + "grad_norm": 0.21710006490151135, + "learning_rate": 0.0009601991468795316, + "loss": 1.4948, + "step": 1698 + }, + { + "epoch": 0.15414625294864817, + "grad_norm": 0.5484136316500835, + "learning_rate": 0.000960141681196711, + "loss": 1.4967, + "step": 1699 + }, + { + "epoch": 0.15423698058428598, + "grad_norm": 0.21576266133558522, + "learning_rate": 0.0009600841757805885, + "loss": 1.4597, + "step": 1700 + }, + { + "epoch": 0.15432770821992378, + "grad_norm": 0.20968217253550506, + "learning_rate": 0.0009600266306361296, + "loss": 1.4548, + "step": 1701 + }, + { + "epoch": 0.15441843585556161, + "grad_norm": 0.22768224169326226, + "learning_rate": 0.0009599690457683033, + "loss": 1.47, + "step": 1702 + }, + { + "epoch": 0.15450916349119942, + "grad_norm": 0.24669779970873504, + "learning_rate": 0.0009599114211820821, + "loss": 1.4588, + "step": 1703 + }, + { + "epoch": 0.15459989112683722, + "grad_norm": 0.20330334892386825, + "learning_rate": 0.0009598537568824419, + "loss": 1.4943, + "step": 1704 + }, + { + "epoch": 0.15469061876247506, + "grad_norm": 0.20404746350054637, + "learning_rate": 0.0009597960528743621, + "loss": 1.4577, + "step": 1705 + }, + { + "epoch": 0.15478134639811286, + "grad_norm": 0.2096537182814436, + "learning_rate": 0.0009597383091628252, + "loss": 1.4782, + "step": 1706 + }, + { + "epoch": 0.15487207403375067, + "grad_norm": 0.2234405420029924, + "learning_rate": 0.0009596805257528177, + "loss": 1.4865, + "step": 1707 + }, + { + "epoch": 0.1549628016693885, + "grad_norm": 0.21092152905534894, + "learning_rate": 0.000959622702649329, + "loss": 1.5156, + "step": 1708 + }, + { + "epoch": 0.1550535293050263, + "grad_norm": 0.19175415065625348, + "learning_rate": 0.0009595648398573522, + "loss": 1.4992, + "step": 1709 + }, + { + "epoch": 0.15514425694066414, + "grad_norm": 0.2082205904336217, + "learning_rate": 0.0009595069373818836, + "loss": 1.4818, + "step": 1710 + }, + { + "epoch": 0.15523498457630194, + "grad_norm": 0.19413465746708378, + "learning_rate": 0.0009594489952279235, + "loss": 1.4894, + "step": 1711 + }, + { + "epoch": 0.15532571221193975, + "grad_norm": 0.19532551526621247, + "learning_rate": 0.0009593910134004748, + "loss": 1.503, + "step": 1712 + }, + { + "epoch": 0.15541643984757758, + "grad_norm": 0.18384814914069614, + "learning_rate": 0.0009593329919045444, + "loss": 1.5129, + "step": 1713 + }, + { + "epoch": 0.15550716748321539, + "grad_norm": 0.18567868400619558, + "learning_rate": 0.0009592749307451424, + "loss": 1.4733, + "step": 1714 + }, + { + "epoch": 0.1555978951188532, + "grad_norm": 0.1863347128501986, + "learning_rate": 0.0009592168299272825, + "loss": 1.4871, + "step": 1715 + }, + { + "epoch": 0.15568862275449102, + "grad_norm": 0.1973814354215678, + "learning_rate": 0.0009591586894559817, + "loss": 1.4965, + "step": 1716 + }, + { + "epoch": 0.15577935039012883, + "grad_norm": 0.2023154651438877, + "learning_rate": 0.0009591005093362603, + "loss": 
1.4892, + "step": 1717 + }, + { + "epoch": 0.15587007802576666, + "grad_norm": 0.19969007539125394, + "learning_rate": 0.0009590422895731422, + "loss": 1.4848, + "step": 1718 + }, + { + "epoch": 0.15596080566140447, + "grad_norm": 0.2585888166428666, + "learning_rate": 0.0009589840301716549, + "loss": 1.4769, + "step": 1719 + }, + { + "epoch": 0.15605153329704227, + "grad_norm": 0.1865369651283062, + "learning_rate": 0.0009589257311368289, + "loss": 1.4819, + "step": 1720 + }, + { + "epoch": 0.1561422609326801, + "grad_norm": 0.19239927967563009, + "learning_rate": 0.0009588673924736983, + "loss": 1.487, + "step": 1721 + }, + { + "epoch": 0.1562329885683179, + "grad_norm": 0.17652705945930616, + "learning_rate": 0.0009588090141873007, + "loss": 1.5372, + "step": 1722 + }, + { + "epoch": 0.15632371620395571, + "grad_norm": 0.18475289170745146, + "learning_rate": 0.0009587505962826773, + "loss": 1.5012, + "step": 1723 + }, + { + "epoch": 0.15641444383959355, + "grad_norm": 0.18378618026094773, + "learning_rate": 0.0009586921387648721, + "loss": 1.478, + "step": 1724 + }, + { + "epoch": 0.15650517147523135, + "grad_norm": 0.18389799279986838, + "learning_rate": 0.0009586336416389331, + "loss": 1.475, + "step": 1725 + }, + { + "epoch": 0.15659589911086916, + "grad_norm": 0.19051521612440556, + "learning_rate": 0.0009585751049099117, + "loss": 1.49, + "step": 1726 + }, + { + "epoch": 0.156686626746507, + "grad_norm": 0.19177228397035256, + "learning_rate": 0.0009585165285828623, + "loss": 1.4856, + "step": 1727 + }, + { + "epoch": 0.1567773543821448, + "grad_norm": 0.191766806204871, + "learning_rate": 0.0009584579126628432, + "loss": 1.4752, + "step": 1728 + }, + { + "epoch": 0.15686808201778263, + "grad_norm": 0.19574899939626533, + "learning_rate": 0.0009583992571549157, + "loss": 1.4595, + "step": 1729 + }, + { + "epoch": 0.15695880965342043, + "grad_norm": 0.2028281511938759, + "learning_rate": 0.0009583405620641448, + "loss": 1.4639, + "step": 1730 + }, + { + "epoch": 0.15704953728905824, + "grad_norm": 0.19781128637670733, + "learning_rate": 0.0009582818273955988, + "loss": 1.5026, + "step": 1731 + }, + { + "epoch": 0.15714026492469607, + "grad_norm": 0.23629409051468708, + "learning_rate": 0.0009582230531543494, + "loss": 1.5366, + "step": 1732 + }, + { + "epoch": 0.15723099256033388, + "grad_norm": 0.19017637772851537, + "learning_rate": 0.0009581642393454719, + "loss": 1.4811, + "step": 1733 + }, + { + "epoch": 0.15732172019597168, + "grad_norm": 0.19846727240328138, + "learning_rate": 0.0009581053859740447, + "loss": 1.4727, + "step": 1734 + }, + { + "epoch": 0.15741244783160951, + "grad_norm": 0.22350394699968623, + "learning_rate": 0.00095804649304515, + "loss": 1.5174, + "step": 1735 + }, + { + "epoch": 0.15750317546724732, + "grad_norm": 0.1968527094072719, + "learning_rate": 0.0009579875605638732, + "loss": 1.5065, + "step": 1736 + }, + { + "epoch": 0.15759390310288515, + "grad_norm": 0.2033702168205132, + "learning_rate": 0.0009579285885353029, + "loss": 1.5147, + "step": 1737 + }, + { + "epoch": 0.15768463073852296, + "grad_norm": 0.3753789384332225, + "learning_rate": 0.0009578695769645316, + "loss": 1.4744, + "step": 1738 + }, + { + "epoch": 0.15777535837416076, + "grad_norm": 0.20782577343068181, + "learning_rate": 0.0009578105258566547, + "loss": 1.4824, + "step": 1739 + }, + { + "epoch": 0.1578660860097986, + "grad_norm": 0.18612562873892832, + "learning_rate": 0.0009577514352167715, + "loss": 1.4379, + "step": 1740 + }, + { + "epoch": 0.1579568136454364, + "grad_norm": 
0.18304732992902692, + "learning_rate": 0.0009576923050499844, + "loss": 1.4918, + "step": 1741 + }, + { + "epoch": 0.1580475412810742, + "grad_norm": 0.1842366067559241, + "learning_rate": 0.0009576331353613994, + "loss": 1.4597, + "step": 1742 + }, + { + "epoch": 0.15813826891671204, + "grad_norm": 0.21520743389075242, + "learning_rate": 0.0009575739261561256, + "loss": 1.4785, + "step": 1743 + }, + { + "epoch": 0.15822899655234984, + "grad_norm": 0.19090933688345102, + "learning_rate": 0.0009575146774392758, + "loss": 1.4439, + "step": 1744 + }, + { + "epoch": 0.15831972418798765, + "grad_norm": 0.22982477029462517, + "learning_rate": 0.0009574553892159663, + "loss": 1.4675, + "step": 1745 + }, + { + "epoch": 0.15841045182362548, + "grad_norm": 0.19508715578288885, + "learning_rate": 0.0009573960614913164, + "loss": 1.4914, + "step": 1746 + }, + { + "epoch": 0.15850117945926329, + "grad_norm": 0.19769598820479453, + "learning_rate": 0.0009573366942704492, + "loss": 1.4888, + "step": 1747 + }, + { + "epoch": 0.15859190709490112, + "grad_norm": 0.1962585441746241, + "learning_rate": 0.0009572772875584911, + "loss": 1.4552, + "step": 1748 + }, + { + "epoch": 0.15868263473053892, + "grad_norm": 0.20856454230062932, + "learning_rate": 0.0009572178413605718, + "loss": 1.4927, + "step": 1749 + }, + { + "epoch": 0.15877336236617673, + "grad_norm": 0.19421351003477655, + "learning_rate": 0.0009571583556818245, + "loss": 1.4817, + "step": 1750 + }, + { + "epoch": 0.15886409000181456, + "grad_norm": 0.196454299192572, + "learning_rate": 0.0009570988305273858, + "loss": 1.4818, + "step": 1751 + }, + { + "epoch": 0.15895481763745237, + "grad_norm": 0.18393876915838078, + "learning_rate": 0.0009570392659023957, + "loss": 1.5201, + "step": 1752 + }, + { + "epoch": 0.15904554527309017, + "grad_norm": 0.18520110641557203, + "learning_rate": 0.0009569796618119977, + "loss": 1.555, + "step": 1753 + }, + { + "epoch": 0.159136272908728, + "grad_norm": 0.19214000609149687, + "learning_rate": 0.0009569200182613385, + "loss": 1.5216, + "step": 1754 + }, + { + "epoch": 0.1592270005443658, + "grad_norm": 0.2056394484126395, + "learning_rate": 0.0009568603352555684, + "loss": 1.4838, + "step": 1755 + }, + { + "epoch": 0.15931772818000364, + "grad_norm": 0.20330390967986206, + "learning_rate": 0.000956800612799841, + "loss": 1.4906, + "step": 1756 + }, + { + "epoch": 0.15940845581564145, + "grad_norm": 0.18665517797997794, + "learning_rate": 0.0009567408508993134, + "loss": 1.4952, + "step": 1757 + }, + { + "epoch": 0.15949918345127925, + "grad_norm": 0.19407003821669444, + "learning_rate": 0.0009566810495591459, + "loss": 1.4792, + "step": 1758 + }, + { + "epoch": 0.15958991108691709, + "grad_norm": 0.18869830186584532, + "learning_rate": 0.0009566212087845025, + "loss": 1.4529, + "step": 1759 + }, + { + "epoch": 0.1596806387225549, + "grad_norm": 0.17909678347396768, + "learning_rate": 0.0009565613285805506, + "loss": 1.4889, + "step": 1760 + }, + { + "epoch": 0.1597713663581927, + "grad_norm": 0.19131710941902288, + "learning_rate": 0.0009565014089524604, + "loss": 1.4691, + "step": 1761 + }, + { + "epoch": 0.15986209399383053, + "grad_norm": 0.1960228928062618, + "learning_rate": 0.0009564414499054065, + "loss": 1.5236, + "step": 1762 + }, + { + "epoch": 0.15995282162946833, + "grad_norm": 0.1912112023017314, + "learning_rate": 0.000956381451444566, + "loss": 1.5143, + "step": 1763 + }, + { + "epoch": 0.16004354926510614, + "grad_norm": 0.1859608390952751, + "learning_rate": 0.0009563214135751199, + "loss": 
1.4719, + "step": 1764 + }, + { + "epoch": 0.16013427690074397, + "grad_norm": 0.19034564880682317, + "learning_rate": 0.0009562613363022526, + "loss": 1.4699, + "step": 1765 + }, + { + "epoch": 0.16022500453638178, + "grad_norm": 0.18423564428736267, + "learning_rate": 0.0009562012196311515, + "loss": 1.4438, + "step": 1766 + }, + { + "epoch": 0.1603157321720196, + "grad_norm": 0.18971490719326747, + "learning_rate": 0.0009561410635670079, + "loss": 1.4943, + "step": 1767 + }, + { + "epoch": 0.16040645980765741, + "grad_norm": 0.18691980735657582, + "learning_rate": 0.0009560808681150164, + "loss": 1.4879, + "step": 1768 + }, + { + "epoch": 0.16049718744329522, + "grad_norm": 0.2108218592233075, + "learning_rate": 0.0009560206332803745, + "loss": 1.4795, + "step": 1769 + }, + { + "epoch": 0.16058791507893305, + "grad_norm": 0.18283749130474017, + "learning_rate": 0.0009559603590682837, + "loss": 1.5021, + "step": 1770 + }, + { + "epoch": 0.16067864271457086, + "grad_norm": 0.218763893654201, + "learning_rate": 0.0009559000454839488, + "loss": 1.4848, + "step": 1771 + }, + { + "epoch": 0.16076937035020866, + "grad_norm": 0.19535421359117192, + "learning_rate": 0.0009558396925325778, + "loss": 1.4925, + "step": 1772 + }, + { + "epoch": 0.1608600979858465, + "grad_norm": 0.19982995688256208, + "learning_rate": 0.000955779300219382, + "loss": 1.5022, + "step": 1773 + }, + { + "epoch": 0.1609508256214843, + "grad_norm": 0.18889796315592228, + "learning_rate": 0.0009557188685495767, + "loss": 1.5032, + "step": 1774 + }, + { + "epoch": 0.16104155325712213, + "grad_norm": 0.2677016512608687, + "learning_rate": 0.0009556583975283798, + "loss": 1.4626, + "step": 1775 + }, + { + "epoch": 0.16113228089275994, + "grad_norm": 0.19317790430587645, + "learning_rate": 0.0009555978871610131, + "loss": 1.4852, + "step": 1776 + }, + { + "epoch": 0.16122300852839774, + "grad_norm": 0.19171414006934073, + "learning_rate": 0.0009555373374527016, + "loss": 1.4977, + "step": 1777 + }, + { + "epoch": 0.16131373616403558, + "grad_norm": 0.227676083066595, + "learning_rate": 0.0009554767484086741, + "loss": 1.493, + "step": 1778 + }, + { + "epoch": 0.16140446379967338, + "grad_norm": 0.20216546351003103, + "learning_rate": 0.0009554161200341622, + "loss": 1.4981, + "step": 1779 + }, + { + "epoch": 0.1614951914353112, + "grad_norm": 0.2058210466151213, + "learning_rate": 0.0009553554523344011, + "loss": 1.5054, + "step": 1780 + }, + { + "epoch": 0.16158591907094902, + "grad_norm": 0.1857169134056215, + "learning_rate": 0.0009552947453146297, + "loss": 1.4347, + "step": 1781 + }, + { + "epoch": 0.16167664670658682, + "grad_norm": 0.19180303930803017, + "learning_rate": 0.0009552339989800898, + "loss": 1.4335, + "step": 1782 + }, + { + "epoch": 0.16176737434222463, + "grad_norm": 0.2357819477088831, + "learning_rate": 0.0009551732133360271, + "loss": 1.4941, + "step": 1783 + }, + { + "epoch": 0.16185810197786246, + "grad_norm": 0.19039848582971491, + "learning_rate": 0.0009551123883876902, + "loss": 1.5393, + "step": 1784 + }, + { + "epoch": 0.16194882961350027, + "grad_norm": 0.20858755897575254, + "learning_rate": 0.0009550515241403317, + "loss": 1.4721, + "step": 1785 + }, + { + "epoch": 0.1620395572491381, + "grad_norm": 0.19132197455002437, + "learning_rate": 0.000954990620599207, + "loss": 1.493, + "step": 1786 + }, + { + "epoch": 0.1621302848847759, + "grad_norm": 0.18702197992240863, + "learning_rate": 0.0009549296777695748, + "loss": 1.4938, + "step": 1787 + }, + { + "epoch": 0.1622210125204137, + "grad_norm": 
0.189291980441887, + "learning_rate": 0.0009548686956566984, + "loss": 1.4966, + "step": 1788 + }, + { + "epoch": 0.16231174015605154, + "grad_norm": 0.19084257051758738, + "learning_rate": 0.0009548076742658427, + "loss": 1.4616, + "step": 1789 + }, + { + "epoch": 0.16240246779168935, + "grad_norm": 0.18742599805837698, + "learning_rate": 0.0009547466136022774, + "loss": 1.5125, + "step": 1790 + }, + { + "epoch": 0.16249319542732715, + "grad_norm": 0.19694591497007485, + "learning_rate": 0.0009546855136712752, + "loss": 1.486, + "step": 1791 + }, + { + "epoch": 0.16258392306296499, + "grad_norm": 0.21677255959227318, + "learning_rate": 0.0009546243744781116, + "loss": 1.477, + "step": 1792 + }, + { + "epoch": 0.1626746506986028, + "grad_norm": 0.18693121458365508, + "learning_rate": 0.0009545631960280662, + "loss": 1.4911, + "step": 1793 + }, + { + "epoch": 0.16276537833424062, + "grad_norm": 0.1966923370402943, + "learning_rate": 0.000954501978326422, + "loss": 1.4502, + "step": 1794 + }, + { + "epoch": 0.16285610596987843, + "grad_norm": 0.1806080872279899, + "learning_rate": 0.0009544407213784651, + "loss": 1.4595, + "step": 1795 + }, + { + "epoch": 0.16294683360551623, + "grad_norm": 0.19277417590177942, + "learning_rate": 0.0009543794251894847, + "loss": 1.4803, + "step": 1796 + }, + { + "epoch": 0.16303756124115407, + "grad_norm": 0.19735288052515712, + "learning_rate": 0.0009543180897647741, + "loss": 1.4787, + "step": 1797 + }, + { + "epoch": 0.16312828887679187, + "grad_norm": 0.17486581457613784, + "learning_rate": 0.0009542567151096294, + "loss": 1.4924, + "step": 1798 + }, + { + "epoch": 0.16321901651242968, + "grad_norm": 0.17702816594844228, + "learning_rate": 0.0009541953012293505, + "loss": 1.4849, + "step": 1799 + }, + { + "epoch": 0.1633097441480675, + "grad_norm": 0.18277085879769536, + "learning_rate": 0.0009541338481292404, + "loss": 1.4825, + "step": 1800 + }, + { + "epoch": 0.16340047178370531, + "grad_norm": 0.1782208068666128, + "learning_rate": 0.0009540723558146055, + "loss": 1.4818, + "step": 1801 + }, + { + "epoch": 0.16349119941934312, + "grad_norm": 0.17580886036051865, + "learning_rate": 0.0009540108242907557, + "loss": 1.4758, + "step": 1802 + }, + { + "epoch": 0.16358192705498095, + "grad_norm": 0.19246360851006244, + "learning_rate": 0.0009539492535630044, + "loss": 1.499, + "step": 1803 + }, + { + "epoch": 0.16367265469061876, + "grad_norm": 0.1866024710553281, + "learning_rate": 0.0009538876436366681, + "loss": 1.5195, + "step": 1804 + }, + { + "epoch": 0.1637633823262566, + "grad_norm": 0.20263794171399616, + "learning_rate": 0.0009538259945170671, + "loss": 1.4825, + "step": 1805 + }, + { + "epoch": 0.1638541099618944, + "grad_norm": 0.17876432004236112, + "learning_rate": 0.0009537643062095243, + "loss": 1.4678, + "step": 1806 + }, + { + "epoch": 0.1639448375975322, + "grad_norm": 0.1875876919073715, + "learning_rate": 0.000953702578719367, + "loss": 1.5068, + "step": 1807 + }, + { + "epoch": 0.16403556523317003, + "grad_norm": 0.18515503891702967, + "learning_rate": 0.0009536408120519249, + "loss": 1.4854, + "step": 1808 + }, + { + "epoch": 0.16412629286880784, + "grad_norm": 0.20492349364672277, + "learning_rate": 0.000953579006212532, + "loss": 1.4646, + "step": 1809 + }, + { + "epoch": 0.16421702050444564, + "grad_norm": 0.18586778509995577, + "learning_rate": 0.000953517161206525, + "loss": 1.4522, + "step": 1810 + }, + { + "epoch": 0.16430774814008348, + "grad_norm": 0.19214735715683529, + "learning_rate": 0.0009534552770392444, + "loss": 
1.4769, + "step": 1811 + }, + { + "epoch": 0.16439847577572128, + "grad_norm": 0.1844848092656225, + "learning_rate": 0.0009533933537160338, + "loss": 1.4728, + "step": 1812 + }, + { + "epoch": 0.1644892034113591, + "grad_norm": 0.18759963241082572, + "learning_rate": 0.0009533313912422401, + "loss": 1.4376, + "step": 1813 + }, + { + "epoch": 0.16457993104699692, + "grad_norm": 0.2082563757377944, + "learning_rate": 0.0009532693896232141, + "loss": 1.5164, + "step": 1814 + }, + { + "epoch": 0.16467065868263472, + "grad_norm": 0.19435712839311106, + "learning_rate": 0.0009532073488643094, + "loss": 1.5015, + "step": 1815 + }, + { + "epoch": 0.16476138631827256, + "grad_norm": 0.18327887574321478, + "learning_rate": 0.0009531452689708833, + "loss": 1.4431, + "step": 1816 + }, + { + "epoch": 0.16485211395391036, + "grad_norm": 0.20788950393885103, + "learning_rate": 0.0009530831499482966, + "loss": 1.4611, + "step": 1817 + }, + { + "epoch": 0.16494284158954817, + "grad_norm": 0.18371745749279494, + "learning_rate": 0.000953020991801913, + "loss": 1.4907, + "step": 1818 + }, + { + "epoch": 0.165033569225186, + "grad_norm": 0.1796619883374961, + "learning_rate": 0.0009529587945371, + "loss": 1.461, + "step": 1819 + }, + { + "epoch": 0.1651242968608238, + "grad_norm": 0.19679060521091932, + "learning_rate": 0.0009528965581592284, + "loss": 1.48, + "step": 1820 + }, + { + "epoch": 0.1652150244964616, + "grad_norm": 0.19202366913175076, + "learning_rate": 0.0009528342826736722, + "loss": 1.4693, + "step": 1821 + }, + { + "epoch": 0.16530575213209944, + "grad_norm": 0.18070311170792847, + "learning_rate": 0.0009527719680858089, + "loss": 1.4669, + "step": 1822 + }, + { + "epoch": 0.16539647976773725, + "grad_norm": 0.1899370861430928, + "learning_rate": 0.0009527096144010196, + "loss": 1.4806, + "step": 1823 + }, + { + "epoch": 0.16548720740337508, + "grad_norm": 0.1929394216014914, + "learning_rate": 0.0009526472216246882, + "loss": 1.4943, + "step": 1824 + }, + { + "epoch": 0.16557793503901289, + "grad_norm": 0.18783544859373824, + "learning_rate": 0.0009525847897622026, + "loss": 1.5234, + "step": 1825 + }, + { + "epoch": 0.1656686626746507, + "grad_norm": 0.1947725390662169, + "learning_rate": 0.0009525223188189536, + "loss": 1.4813, + "step": 1826 + }, + { + "epoch": 0.16575939031028852, + "grad_norm": 0.1972865962346976, + "learning_rate": 0.0009524598088003357, + "loss": 1.454, + "step": 1827 + }, + { + "epoch": 0.16585011794592633, + "grad_norm": 0.1792790085366141, + "learning_rate": 0.0009523972597117467, + "loss": 1.4517, + "step": 1828 + }, + { + "epoch": 0.16594084558156413, + "grad_norm": 0.21279987720207097, + "learning_rate": 0.0009523346715585877, + "loss": 1.4691, + "step": 1829 + }, + { + "epoch": 0.16603157321720197, + "grad_norm": 0.22452098411765983, + "learning_rate": 0.0009522720443462629, + "loss": 1.4427, + "step": 1830 + }, + { + "epoch": 0.16612230085283977, + "grad_norm": 0.19127885091527821, + "learning_rate": 0.0009522093780801806, + "loss": 1.4849, + "step": 1831 + }, + { + "epoch": 0.16621302848847758, + "grad_norm": 0.19355719345915431, + "learning_rate": 0.0009521466727657518, + "loss": 1.4771, + "step": 1832 + }, + { + "epoch": 0.1663037561241154, + "grad_norm": 0.1886666891459, + "learning_rate": 0.0009520839284083913, + "loss": 1.4833, + "step": 1833 + }, + { + "epoch": 0.16639448375975321, + "grad_norm": 0.2071846243256204, + "learning_rate": 0.0009520211450135168, + "loss": 1.4488, + "step": 1834 + }, + { + "epoch": 0.16648521139539105, + "grad_norm": 
0.22158520618539415, + "learning_rate": 0.0009519583225865498, + "loss": 1.5061, + "step": 1835 + }, + { + "epoch": 0.16657593903102885, + "grad_norm": 0.20455960294466116, + "learning_rate": 0.0009518954611329152, + "loss": 1.495, + "step": 1836 + }, + { + "epoch": 0.16666666666666666, + "grad_norm": 0.20477211960862113, + "learning_rate": 0.0009518325606580407, + "loss": 1.5115, + "step": 1837 + }, + { + "epoch": 0.1667573943023045, + "grad_norm": 0.2123269680844291, + "learning_rate": 0.0009517696211673581, + "loss": 1.5166, + "step": 1838 + }, + { + "epoch": 0.1668481219379423, + "grad_norm": 0.18717309954230224, + "learning_rate": 0.0009517066426663021, + "loss": 1.5031, + "step": 1839 + }, + { + "epoch": 0.1669388495735801, + "grad_norm": 0.19468524110259844, + "learning_rate": 0.000951643625160311, + "loss": 1.4892, + "step": 1840 + }, + { + "epoch": 0.16702957720921793, + "grad_norm": 0.2126593264944013, + "learning_rate": 0.0009515805686548262, + "loss": 1.4832, + "step": 1841 + }, + { + "epoch": 0.16712030484485574, + "grad_norm": 0.1949010410694185, + "learning_rate": 0.0009515174731552929, + "loss": 1.4657, + "step": 1842 + }, + { + "epoch": 0.16721103248049357, + "grad_norm": 0.6825604949939889, + "learning_rate": 0.000951454338667159, + "loss": 1.434, + "step": 1843 + }, + { + "epoch": 0.16730176011613138, + "grad_norm": 0.18569314773617557, + "learning_rate": 0.0009513911651958766, + "loss": 1.4799, + "step": 1844 + }, + { + "epoch": 0.16739248775176918, + "grad_norm": 0.1926469370097391, + "learning_rate": 0.0009513279527469005, + "loss": 1.447, + "step": 1845 + }, + { + "epoch": 0.16748321538740701, + "grad_norm": 0.19981562002907874, + "learning_rate": 0.0009512647013256892, + "loss": 1.4962, + "step": 1846 + }, + { + "epoch": 0.16757394302304482, + "grad_norm": 0.38977920055990867, + "learning_rate": 0.0009512014109377045, + "loss": 1.4557, + "step": 1847 + }, + { + "epoch": 0.16766467065868262, + "grad_norm": 0.18828815981283115, + "learning_rate": 0.0009511380815884114, + "loss": 1.498, + "step": 1848 + }, + { + "epoch": 0.16775539829432046, + "grad_norm": 0.1924757327209146, + "learning_rate": 0.0009510747132832785, + "loss": 1.4893, + "step": 1849 + }, + { + "epoch": 0.16784612592995826, + "grad_norm": 0.19096030148992452, + "learning_rate": 0.0009510113060277774, + "loss": 1.4794, + "step": 1850 + }, + { + "epoch": 0.16793685356559607, + "grad_norm": 0.1930967568373273, + "learning_rate": 0.0009509478598273837, + "loss": 1.4659, + "step": 1851 + }, + { + "epoch": 0.1680275812012339, + "grad_norm": 0.18833833144329945, + "learning_rate": 0.0009508843746875759, + "loss": 1.4479, + "step": 1852 + }, + { + "epoch": 0.1681183088368717, + "grad_norm": 0.1804860212170497, + "learning_rate": 0.0009508208506138358, + "loss": 1.446, + "step": 1853 + }, + { + "epoch": 0.16820903647250954, + "grad_norm": 0.18784616776160026, + "learning_rate": 0.000950757287611649, + "loss": 1.4683, + "step": 1854 + }, + { + "epoch": 0.16829976410814734, + "grad_norm": 0.1976039932733615, + "learning_rate": 0.0009506936856865038, + "loss": 1.4711, + "step": 1855 + }, + { + "epoch": 0.16839049174378515, + "grad_norm": 0.18944045703784945, + "learning_rate": 0.0009506300448438924, + "loss": 1.4866, + "step": 1856 + }, + { + "epoch": 0.16848121937942298, + "grad_norm": 0.20193256367240814, + "learning_rate": 0.0009505663650893104, + "loss": 1.463, + "step": 1857 + }, + { + "epoch": 0.16857194701506079, + "grad_norm": 0.17959965345200696, + "learning_rate": 0.0009505026464282563, + "loss": 1.4573, 
+ "step": 1858 + }, + { + "epoch": 0.1686626746506986, + "grad_norm": 0.18791241227128874, + "learning_rate": 0.0009504388888662321, + "loss": 1.4177, + "step": 1859 + }, + { + "epoch": 0.16875340228633642, + "grad_norm": 0.19055557229908934, + "learning_rate": 0.0009503750924087436, + "loss": 1.5024, + "step": 1860 + }, + { + "epoch": 0.16884412992197423, + "grad_norm": 0.19037843638461036, + "learning_rate": 0.0009503112570612993, + "loss": 1.4757, + "step": 1861 + }, + { + "epoch": 0.16893485755761206, + "grad_norm": 0.19761930001944067, + "learning_rate": 0.0009502473828294118, + "loss": 1.495, + "step": 1862 + }, + { + "epoch": 0.16902558519324987, + "grad_norm": 0.19731559636997825, + "learning_rate": 0.0009501834697185964, + "loss": 1.5011, + "step": 1863 + }, + { + "epoch": 0.16911631282888767, + "grad_norm": 0.20619279993681552, + "learning_rate": 0.0009501195177343721, + "loss": 1.5154, + "step": 1864 + }, + { + "epoch": 0.1692070404645255, + "grad_norm": 0.1779529591636492, + "learning_rate": 0.000950055526882261, + "loss": 1.4683, + "step": 1865 + }, + { + "epoch": 0.1692977681001633, + "grad_norm": 0.18667518507421096, + "learning_rate": 0.0009499914971677889, + "loss": 1.4528, + "step": 1866 + }, + { + "epoch": 0.16938849573580111, + "grad_norm": 0.19075593327836657, + "learning_rate": 0.0009499274285964846, + "loss": 1.5091, + "step": 1867 + }, + { + "epoch": 0.16947922337143895, + "grad_norm": 0.17778894951833032, + "learning_rate": 0.0009498633211738806, + "loss": 1.4784, + "step": 1868 + }, + { + "epoch": 0.16956995100707675, + "grad_norm": 0.18354715462583854, + "learning_rate": 0.0009497991749055125, + "loss": 1.475, + "step": 1869 + }, + { + "epoch": 0.16966067864271456, + "grad_norm": 0.18923762792433305, + "learning_rate": 0.0009497349897969194, + "loss": 1.4318, + "step": 1870 + }, + { + "epoch": 0.1697514062783524, + "grad_norm": 0.17640750797551802, + "learning_rate": 0.0009496707658536435, + "loss": 1.4655, + "step": 1871 + }, + { + "epoch": 0.1698421339139902, + "grad_norm": 0.18555668507635545, + "learning_rate": 0.0009496065030812308, + "loss": 1.4785, + "step": 1872 + }, + { + "epoch": 0.16993286154962803, + "grad_norm": 0.18224096141088122, + "learning_rate": 0.0009495422014852302, + "loss": 1.4604, + "step": 1873 + }, + { + "epoch": 0.17002358918526583, + "grad_norm": 0.18304127560798406, + "learning_rate": 0.0009494778610711945, + "loss": 1.4712, + "step": 1874 + }, + { + "epoch": 0.17011431682090364, + "grad_norm": 0.18829587009596765, + "learning_rate": 0.0009494134818446789, + "loss": 1.4648, + "step": 1875 + }, + { + "epoch": 0.17020504445654147, + "grad_norm": 0.1980680590989037, + "learning_rate": 0.0009493490638112432, + "loss": 1.4486, + "step": 1876 + }, + { + "epoch": 0.17029577209217928, + "grad_norm": 0.28683196586661974, + "learning_rate": 0.0009492846069764495, + "loss": 1.4622, + "step": 1877 + }, + { + "epoch": 0.17038649972781708, + "grad_norm": 0.19238746743750432, + "learning_rate": 0.0009492201113458637, + "loss": 1.4957, + "step": 1878 + }, + { + "epoch": 0.17047722736345491, + "grad_norm": 0.19205304605244372, + "learning_rate": 0.0009491555769250552, + "loss": 1.4902, + "step": 1879 + }, + { + "epoch": 0.17056795499909272, + "grad_norm": 0.1852327870457006, + "learning_rate": 0.0009490910037195964, + "loss": 1.4464, + "step": 1880 + }, + { + "epoch": 0.17065868263473055, + "grad_norm": 0.1970266066921997, + "learning_rate": 0.0009490263917350632, + "loss": 1.4577, + "step": 1881 + }, + { + "epoch": 0.17074941027036836, + "grad_norm": 
0.19576694499638195, + "learning_rate": 0.0009489617409770348, + "loss": 1.4937, + "step": 1882 + }, + { + "epoch": 0.17084013790600616, + "grad_norm": 0.19007125740424394, + "learning_rate": 0.000948897051451094, + "loss": 1.4741, + "step": 1883 + }, + { + "epoch": 0.170930865541644, + "grad_norm": 0.1871089602402652, + "learning_rate": 0.0009488323231628266, + "loss": 1.4657, + "step": 1884 + }, + { + "epoch": 0.1710215931772818, + "grad_norm": 0.18754775468724283, + "learning_rate": 0.0009487675561178221, + "loss": 1.4722, + "step": 1885 + }, + { + "epoch": 0.1711123208129196, + "grad_norm": 0.1903480121410397, + "learning_rate": 0.0009487027503216729, + "loss": 1.4901, + "step": 1886 + }, + { + "epoch": 0.17120304844855744, + "grad_norm": 0.18801752836673133, + "learning_rate": 0.000948637905779975, + "loss": 1.4775, + "step": 1887 + }, + { + "epoch": 0.17129377608419524, + "grad_norm": 0.19248189306386826, + "learning_rate": 0.000948573022498328, + "loss": 1.481, + "step": 1888 + }, + { + "epoch": 0.17138450371983305, + "grad_norm": 0.6636429347632209, + "learning_rate": 0.0009485081004823342, + "loss": 1.4728, + "step": 1889 + }, + { + "epoch": 0.17147523135547088, + "grad_norm": 0.19369867867800133, + "learning_rate": 0.0009484431397375998, + "loss": 1.5158, + "step": 1890 + }, + { + "epoch": 0.17156595899110869, + "grad_norm": 0.22334332291257442, + "learning_rate": 0.0009483781402697344, + "loss": 1.5262, + "step": 1891 + }, + { + "epoch": 0.17165668662674652, + "grad_norm": 0.19199036032376812, + "learning_rate": 0.0009483131020843503, + "loss": 1.5069, + "step": 1892 + }, + { + "epoch": 0.17174741426238432, + "grad_norm": 0.2577900509404644, + "learning_rate": 0.0009482480251870639, + "loss": 1.4579, + "step": 1893 + }, + { + "epoch": 0.17183814189802213, + "grad_norm": 0.1751208740611575, + "learning_rate": 0.0009481829095834943, + "loss": 1.4687, + "step": 1894 + }, + { + "epoch": 0.17192886953365996, + "grad_norm": 0.19710947303021337, + "learning_rate": 0.0009481177552792645, + "loss": 1.4822, + "step": 1895 + }, + { + "epoch": 0.17201959716929777, + "grad_norm": 0.2240961886100372, + "learning_rate": 0.0009480525622800006, + "loss": 1.4647, + "step": 1896 + }, + { + "epoch": 0.17211032480493557, + "grad_norm": 0.3400226791660721, + "learning_rate": 0.0009479873305913318, + "loss": 1.4958, + "step": 1897 + }, + { + "epoch": 0.1722010524405734, + "grad_norm": 0.1942978039815146, + "learning_rate": 0.000947922060218891, + "loss": 1.495, + "step": 1898 + }, + { + "epoch": 0.1722917800762112, + "grad_norm": 0.2125087354458267, + "learning_rate": 0.0009478567511683142, + "loss": 1.4527, + "step": 1899 + }, + { + "epoch": 0.17238250771184904, + "grad_norm": 0.1899935611768517, + "learning_rate": 0.0009477914034452411, + "loss": 1.4822, + "step": 1900 + }, + { + "epoch": 0.17247323534748685, + "grad_norm": 0.20135628334692796, + "learning_rate": 0.0009477260170553142, + "loss": 1.4826, + "step": 1901 + }, + { + "epoch": 0.17256396298312465, + "grad_norm": 0.18039041223572633, + "learning_rate": 0.0009476605920041796, + "loss": 1.4669, + "step": 1902 + }, + { + "epoch": 0.17265469061876249, + "grad_norm": 0.1993530153835063, + "learning_rate": 0.0009475951282974871, + "loss": 1.4621, + "step": 1903 + }, + { + "epoch": 0.1727454182544003, + "grad_norm": 0.18496499705598593, + "learning_rate": 0.0009475296259408892, + "loss": 1.4862, + "step": 1904 + }, + { + "epoch": 0.1728361458900381, + "grad_norm": 0.17733934808019203, + "learning_rate": 0.0009474640849400422, + "loss": 1.4979, + 
"step": 1905 + }, + { + "epoch": 0.17292687352567593, + "grad_norm": 0.177984581899002, + "learning_rate": 0.0009473985053006055, + "loss": 1.4994, + "step": 1906 + }, + { + "epoch": 0.17301760116131373, + "grad_norm": 0.24460950415289914, + "learning_rate": 0.000947332887028242, + "loss": 1.4862, + "step": 1907 + }, + { + "epoch": 0.17310832879695154, + "grad_norm": 0.1831912935209421, + "learning_rate": 0.0009472672301286176, + "loss": 1.4885, + "step": 1908 + }, + { + "epoch": 0.17319905643258937, + "grad_norm": 0.19642417180000496, + "learning_rate": 0.0009472015346074021, + "loss": 1.462, + "step": 1909 + }, + { + "epoch": 0.17328978406822718, + "grad_norm": 0.17905872030288314, + "learning_rate": 0.0009471358004702682, + "loss": 1.4725, + "step": 1910 + }, + { + "epoch": 0.173380511703865, + "grad_norm": 0.18986269618286766, + "learning_rate": 0.000947070027722892, + "loss": 1.4635, + "step": 1911 + }, + { + "epoch": 0.17347123933950281, + "grad_norm": 0.19311639601637792, + "learning_rate": 0.0009470042163709529, + "loss": 1.4699, + "step": 1912 + }, + { + "epoch": 0.17356196697514062, + "grad_norm": 0.17635544302552023, + "learning_rate": 0.000946938366420134, + "loss": 1.4823, + "step": 1913 + }, + { + "epoch": 0.17365269461077845, + "grad_norm": 0.1846317063045804, + "learning_rate": 0.0009468724778761212, + "loss": 1.4471, + "step": 1914 + }, + { + "epoch": 0.17374342224641626, + "grad_norm": 0.195633120498795, + "learning_rate": 0.000946806550744604, + "loss": 1.4701, + "step": 1915 + }, + { + "epoch": 0.17383414988205406, + "grad_norm": 0.1937746391741126, + "learning_rate": 0.0009467405850312753, + "loss": 1.4678, + "step": 1916 + }, + { + "epoch": 0.1739248775176919, + "grad_norm": 0.184480256855427, + "learning_rate": 0.0009466745807418315, + "loss": 1.4776, + "step": 1917 + }, + { + "epoch": 0.1740156051533297, + "grad_norm": 0.1973941619044804, + "learning_rate": 0.0009466085378819715, + "loss": 1.4136, + "step": 1918 + }, + { + "epoch": 0.17410633278896753, + "grad_norm": 0.18160272453494172, + "learning_rate": 0.0009465424564573985, + "loss": 1.4671, + "step": 1919 + }, + { + "epoch": 0.17419706042460534, + "grad_norm": 0.20059080493161097, + "learning_rate": 0.0009464763364738187, + "loss": 1.4859, + "step": 1920 + }, + { + "epoch": 0.17428778806024314, + "grad_norm": 0.22277306302472183, + "learning_rate": 0.0009464101779369414, + "loss": 1.4725, + "step": 1921 + }, + { + "epoch": 0.17437851569588098, + "grad_norm": 0.18743127843435, + "learning_rate": 0.0009463439808524794, + "loss": 1.5067, + "step": 1922 + }, + { + "epoch": 0.17446924333151878, + "grad_norm": 0.17719002982400114, + "learning_rate": 0.0009462777452261489, + "loss": 1.4609, + "step": 1923 + }, + { + "epoch": 0.17455997096715659, + "grad_norm": 0.17696080193743477, + "learning_rate": 0.0009462114710636694, + "loss": 1.4617, + "step": 1924 + }, + { + "epoch": 0.17465069860279442, + "grad_norm": 0.19499714382821434, + "learning_rate": 0.0009461451583707635, + "loss": 1.4622, + "step": 1925 + }, + { + "epoch": 0.17474142623843222, + "grad_norm": 0.1841126417338815, + "learning_rate": 0.0009460788071531574, + "loss": 1.4862, + "step": 1926 + }, + { + "epoch": 0.17483215387407003, + "grad_norm": 0.18124003649758608, + "learning_rate": 0.0009460124174165807, + "loss": 1.4155, + "step": 1927 + }, + { + "epoch": 0.17492288150970786, + "grad_norm": 0.1816900114858022, + "learning_rate": 0.0009459459891667659, + "loss": 1.4491, + "step": 1928 + }, + { + "epoch": 0.17501360914534567, + "grad_norm": 
0.17861900578927514, + "learning_rate": 0.0009458795224094492, + "loss": 1.4594, + "step": 1929 + }, + { + "epoch": 0.1751043367809835, + "grad_norm": 0.18302536461622793, + "learning_rate": 0.0009458130171503701, + "loss": 1.4832, + "step": 1930 + }, + { + "epoch": 0.1751950644166213, + "grad_norm": 0.2182151249488229, + "learning_rate": 0.0009457464733952711, + "loss": 1.4832, + "step": 1931 + }, + { + "epoch": 0.1752857920522591, + "grad_norm": 0.17350290414356317, + "learning_rate": 0.0009456798911498985, + "loss": 1.4888, + "step": 1932 + }, + { + "epoch": 0.17537651968789694, + "grad_norm": 0.1806478215282395, + "learning_rate": 0.0009456132704200017, + "loss": 1.5228, + "step": 1933 + }, + { + "epoch": 0.17546724732353475, + "grad_norm": 0.17475138297978723, + "learning_rate": 0.0009455466112113333, + "loss": 1.4717, + "step": 1934 + }, + { + "epoch": 0.17555797495917255, + "grad_norm": 0.16880463020342254, + "learning_rate": 0.0009454799135296492, + "loss": 1.4537, + "step": 1935 + }, + { + "epoch": 0.17564870259481039, + "grad_norm": 0.18844528550722975, + "learning_rate": 0.0009454131773807091, + "loss": 1.4825, + "step": 1936 + }, + { + "epoch": 0.1757394302304482, + "grad_norm": 0.22476339530721198, + "learning_rate": 0.0009453464027702754, + "loss": 1.4915, + "step": 1937 + }, + { + "epoch": 0.17583015786608602, + "grad_norm": 0.1772739746103308, + "learning_rate": 0.0009452795897041142, + "loss": 1.4813, + "step": 1938 + }, + { + "epoch": 0.17592088550172383, + "grad_norm": 0.17938558950304556, + "learning_rate": 0.000945212738187995, + "loss": 1.4872, + "step": 1939 + }, + { + "epoch": 0.17601161313736163, + "grad_norm": 0.1936004426309901, + "learning_rate": 0.00094514584822769, + "loss": 1.5027, + "step": 1940 + }, + { + "epoch": 0.17610234077299947, + "grad_norm": 0.19025266618666306, + "learning_rate": 0.0009450789198289753, + "loss": 1.4549, + "step": 1941 + }, + { + "epoch": 0.17619306840863727, + "grad_norm": 0.18336434747348818, + "learning_rate": 0.0009450119529976304, + "loss": 1.4639, + "step": 1942 + }, + { + "epoch": 0.17628379604427508, + "grad_norm": 0.19079189104744423, + "learning_rate": 0.0009449449477394379, + "loss": 1.4551, + "step": 1943 + }, + { + "epoch": 0.1763745236799129, + "grad_norm": 0.1918198222966092, + "learning_rate": 0.0009448779040601835, + "loss": 1.4856, + "step": 1944 + }, + { + "epoch": 0.17646525131555071, + "grad_norm": 0.2092049295200832, + "learning_rate": 0.0009448108219656565, + "loss": 1.4558, + "step": 1945 + }, + { + "epoch": 0.17655597895118852, + "grad_norm": 0.18389915580098698, + "learning_rate": 0.0009447437014616495, + "loss": 1.4656, + "step": 1946 + }, + { + "epoch": 0.17664670658682635, + "grad_norm": 0.20209809180024563, + "learning_rate": 0.0009446765425539582, + "loss": 1.4565, + "step": 1947 + }, + { + "epoch": 0.17673743422246416, + "grad_norm": 0.202085842148024, + "learning_rate": 0.0009446093452483821, + "loss": 1.4915, + "step": 1948 + }, + { + "epoch": 0.176828161858102, + "grad_norm": 0.22580270250227585, + "learning_rate": 0.0009445421095507233, + "loss": 1.4425, + "step": 1949 + }, + { + "epoch": 0.1769188894937398, + "grad_norm": 0.24257897849197227, + "learning_rate": 0.000944474835466788, + "loss": 1.4889, + "step": 1950 + }, + { + "epoch": 0.1770096171293776, + "grad_norm": 0.18978827601716663, + "learning_rate": 0.000944407523002385, + "loss": 1.486, + "step": 1951 + }, + { + "epoch": 0.17710034476501543, + "grad_norm": 0.19617455172128645, + "learning_rate": 0.000944340172163327, + "loss": 1.4809, + 
"step": 1952 + }, + { + "epoch": 0.17719107240065324, + "grad_norm": 0.28378430976574587, + "learning_rate": 0.0009442727829554297, + "loss": 1.5078, + "step": 1953 + }, + { + "epoch": 0.17728180003629104, + "grad_norm": 0.18258997021290224, + "learning_rate": 0.0009442053553845119, + "loss": 1.485, + "step": 1954 + }, + { + "epoch": 0.17737252767192888, + "grad_norm": 0.21075463117346793, + "learning_rate": 0.0009441378894563963, + "loss": 1.5123, + "step": 1955 + }, + { + "epoch": 0.17746325530756668, + "grad_norm": 0.25696451052243535, + "learning_rate": 0.0009440703851769086, + "loss": 1.4822, + "step": 1956 + }, + { + "epoch": 0.1775539829432045, + "grad_norm": 0.1944951880058975, + "learning_rate": 0.0009440028425518777, + "loss": 1.4377, + "step": 1957 + }, + { + "epoch": 0.17764471057884232, + "grad_norm": 0.20451934331933824, + "learning_rate": 0.0009439352615871358, + "loss": 1.4823, + "step": 1958 + }, + { + "epoch": 0.17773543821448012, + "grad_norm": 0.19134219737724503, + "learning_rate": 0.0009438676422885185, + "loss": 1.4729, + "step": 1959 + }, + { + "epoch": 0.17782616585011796, + "grad_norm": 0.19898235673207143, + "learning_rate": 0.000943799984661865, + "loss": 1.5158, + "step": 1960 + }, + { + "epoch": 0.17791689348575576, + "grad_norm": 0.22591176778282324, + "learning_rate": 0.0009437322887130174, + "loss": 1.4886, + "step": 1961 + }, + { + "epoch": 0.17800762112139357, + "grad_norm": 0.20882159291951155, + "learning_rate": 0.0009436645544478213, + "loss": 1.4917, + "step": 1962 + }, + { + "epoch": 0.1780983487570314, + "grad_norm": 0.2218355188665145, + "learning_rate": 0.0009435967818721256, + "loss": 1.4795, + "step": 1963 + }, + { + "epoch": 0.1781890763926692, + "grad_norm": 0.2215319747713021, + "learning_rate": 0.0009435289709917822, + "loss": 1.4662, + "step": 1964 + }, + { + "epoch": 0.178279804028307, + "grad_norm": 0.18660122657038905, + "learning_rate": 0.000943461121812647, + "loss": 1.5039, + "step": 1965 + }, + { + "epoch": 0.17837053166394484, + "grad_norm": 0.18273625553744655, + "learning_rate": 0.0009433932343405785, + "loss": 1.4509, + "step": 1966 + }, + { + "epoch": 0.17846125929958265, + "grad_norm": 0.2044310895450619, + "learning_rate": 0.0009433253085814388, + "loss": 1.4527, + "step": 1967 + }, + { + "epoch": 0.17855198693522048, + "grad_norm": 0.187994524257715, + "learning_rate": 0.0009432573445410934, + "loss": 1.4549, + "step": 1968 + }, + { + "epoch": 0.17864271457085829, + "grad_norm": 0.2552780861175998, + "learning_rate": 0.000943189342225411, + "loss": 1.4621, + "step": 1969 + }, + { + "epoch": 0.1787334422064961, + "grad_norm": 0.20031011120285885, + "learning_rate": 0.0009431213016402635, + "loss": 1.4735, + "step": 1970 + }, + { + "epoch": 0.17882416984213392, + "grad_norm": 0.1890104220416182, + "learning_rate": 0.0009430532227915265, + "loss": 1.4439, + "step": 1971 + }, + { + "epoch": 0.17891489747777173, + "grad_norm": 0.23794737407966016, + "learning_rate": 0.0009429851056850782, + "loss": 1.4935, + "step": 1972 + }, + { + "epoch": 0.17900562511340953, + "grad_norm": 0.19060050620027238, + "learning_rate": 0.0009429169503268009, + "loss": 1.4713, + "step": 1973 + }, + { + "epoch": 0.17909635274904737, + "grad_norm": 0.1971079912091221, + "learning_rate": 0.0009428487567225795, + "loss": 1.4952, + "step": 1974 + }, + { + "epoch": 0.17918708038468517, + "grad_norm": 0.18802540921722544, + "learning_rate": 0.0009427805248783028, + "loss": 1.5153, + "step": 1975 + }, + { + "epoch": 0.179277808020323, + "grad_norm": 
0.180235527245577, + "learning_rate": 0.0009427122547998625, + "loss": 1.4905, + "step": 1976 + }, + { + "epoch": 0.1793685356559608, + "grad_norm": 0.20909266975403903, + "learning_rate": 0.0009426439464931537, + "loss": 1.4887, + "step": 1977 + }, + { + "epoch": 0.17945926329159861, + "grad_norm": 0.1797749514141923, + "learning_rate": 0.0009425755999640748, + "loss": 1.4781, + "step": 1978 + }, + { + "epoch": 0.17954999092723645, + "grad_norm": 0.19024817125293628, + "learning_rate": 0.0009425072152185278, + "loss": 1.452, + "step": 1979 + }, + { + "epoch": 0.17964071856287425, + "grad_norm": 0.19941414460923612, + "learning_rate": 0.0009424387922624174, + "loss": 1.5147, + "step": 1980 + }, + { + "epoch": 0.17973144619851206, + "grad_norm": 0.19528071074989664, + "learning_rate": 0.0009423703311016523, + "loss": 1.4484, + "step": 1981 + }, + { + "epoch": 0.1798221738341499, + "grad_norm": 0.17893171395879284, + "learning_rate": 0.0009423018317421437, + "loss": 1.4582, + "step": 1982 + }, + { + "epoch": 0.1799129014697877, + "grad_norm": 0.1752533051541751, + "learning_rate": 0.0009422332941898067, + "loss": 1.4886, + "step": 1983 + }, + { + "epoch": 0.1800036291054255, + "grad_norm": 0.19137954626161216, + "learning_rate": 0.0009421647184505597, + "loss": 1.4267, + "step": 1984 + }, + { + "epoch": 0.18009435674106333, + "grad_norm": 0.187929937098986, + "learning_rate": 0.000942096104530324, + "loss": 1.5166, + "step": 1985 + }, + { + "epoch": 0.18018508437670114, + "grad_norm": 0.17579493292140044, + "learning_rate": 0.0009420274524350247, + "loss": 1.4746, + "step": 1986 + }, + { + "epoch": 0.18027581201233897, + "grad_norm": 0.1692474858623412, + "learning_rate": 0.0009419587621705897, + "loss": 1.4512, + "step": 1987 + }, + { + "epoch": 0.18036653964797678, + "grad_norm": 0.1633282469814194, + "learning_rate": 0.0009418900337429502, + "loss": 1.4662, + "step": 1988 + }, + { + "epoch": 0.18045726728361458, + "grad_norm": 0.170057300140603, + "learning_rate": 0.0009418212671580413, + "loss": 1.4369, + "step": 1989 + }, + { + "epoch": 0.1805479949192524, + "grad_norm": 0.16602572651489003, + "learning_rate": 0.000941752462421801, + "loss": 1.5101, + "step": 1990 + }, + { + "epoch": 0.18063872255489022, + "grad_norm": 0.2485888130996302, + "learning_rate": 0.0009416836195401703, + "loss": 1.4617, + "step": 1991 + }, + { + "epoch": 0.18072945019052802, + "grad_norm": 0.21390739733097622, + "learning_rate": 0.0009416147385190939, + "loss": 1.4529, + "step": 1992 + }, + { + "epoch": 0.18082017782616586, + "grad_norm": 0.18139704778311047, + "learning_rate": 0.0009415458193645199, + "loss": 1.4441, + "step": 1993 + }, + { + "epoch": 0.18091090546180366, + "grad_norm": 0.19843194442568257, + "learning_rate": 0.0009414768620823993, + "loss": 1.4378, + "step": 1994 + }, + { + "epoch": 0.1810016330974415, + "grad_norm": 0.2333485636019649, + "learning_rate": 0.0009414078666786865, + "loss": 1.4807, + "step": 1995 + }, + { + "epoch": 0.1810923607330793, + "grad_norm": 0.19195693581088663, + "learning_rate": 0.0009413388331593394, + "loss": 1.4781, + "step": 1996 + }, + { + "epoch": 0.1811830883687171, + "grad_norm": 0.19187092143172532, + "learning_rate": 0.000941269761530319, + "loss": 1.4594, + "step": 1997 + }, + { + "epoch": 0.18127381600435494, + "grad_norm": 0.17158883951183093, + "learning_rate": 0.0009412006517975898, + "loss": 1.4994, + "step": 1998 + }, + { + "epoch": 0.18136454363999274, + "grad_norm": 0.18970140355094406, + "learning_rate": 0.000941131503967119, + "loss": 1.4472, + 
"step": 1999 + }, + { + "epoch": 0.18145527127563055, + "grad_norm": 0.18800834564786414, + "learning_rate": 0.0009410623180448781, + "loss": 1.4834, + "step": 2000 + }, + { + "epoch": 0.18154599891126838, + "grad_norm": 0.18791084963515603, + "learning_rate": 0.0009409930940368407, + "loss": 1.4578, + "step": 2001 + }, + { + "epoch": 0.18163672654690619, + "grad_norm": 0.18978506569345335, + "learning_rate": 0.000940923831948985, + "loss": 1.477, + "step": 2002 + }, + { + "epoch": 0.181727454182544, + "grad_norm": 0.1848478485647636, + "learning_rate": 0.0009408545317872912, + "loss": 1.4861, + "step": 2003 + }, + { + "epoch": 0.18181818181818182, + "grad_norm": 0.19180739898901247, + "learning_rate": 0.0009407851935577436, + "loss": 1.4613, + "step": 2004 + }, + { + "epoch": 0.18190890945381963, + "grad_norm": 0.187358142006433, + "learning_rate": 0.0009407158172663298, + "loss": 1.4941, + "step": 2005 + }, + { + "epoch": 0.18199963708945746, + "grad_norm": 0.1818550561932149, + "learning_rate": 0.00094064640291904, + "loss": 1.471, + "step": 2006 + }, + { + "epoch": 0.18209036472509527, + "grad_norm": 0.18346047325118026, + "learning_rate": 0.0009405769505218684, + "loss": 1.5114, + "step": 2007 + }, + { + "epoch": 0.18218109236073307, + "grad_norm": 0.1750798585458085, + "learning_rate": 0.0009405074600808122, + "loss": 1.5043, + "step": 2008 + }, + { + "epoch": 0.1822718199963709, + "grad_norm": 0.27195025815124196, + "learning_rate": 0.000940437931601872, + "loss": 1.4579, + "step": 2009 + }, + { + "epoch": 0.1823625476320087, + "grad_norm": 0.1754074030130297, + "learning_rate": 0.0009403683650910514, + "loss": 1.4531, + "step": 2010 + }, + { + "epoch": 0.18245327526764651, + "grad_norm": 0.17742405849441067, + "learning_rate": 0.0009402987605543576, + "loss": 1.4894, + "step": 2011 + }, + { + "epoch": 0.18254400290328435, + "grad_norm": 0.18539373464164519, + "learning_rate": 0.0009402291179978009, + "loss": 1.4671, + "step": 2012 + }, + { + "epoch": 0.18263473053892215, + "grad_norm": 0.1991716925646394, + "learning_rate": 0.000940159437427395, + "loss": 1.5092, + "step": 2013 + }, + { + "epoch": 0.18272545817455996, + "grad_norm": 0.20624780083787275, + "learning_rate": 0.0009400897188491568, + "loss": 1.4671, + "step": 2014 + }, + { + "epoch": 0.1828161858101978, + "grad_norm": 0.19471333863441964, + "learning_rate": 0.0009400199622691065, + "loss": 1.436, + "step": 2015 + }, + { + "epoch": 0.1829069134458356, + "grad_norm": 0.20049820808758675, + "learning_rate": 0.0009399501676932675, + "loss": 1.4798, + "step": 2016 + }, + { + "epoch": 0.18299764108147343, + "grad_norm": 0.19759517163936915, + "learning_rate": 0.0009398803351276668, + "loss": 1.5132, + "step": 2017 + }, + { + "epoch": 0.18308836871711123, + "grad_norm": 0.20711408917364543, + "learning_rate": 0.0009398104645783344, + "loss": 1.4782, + "step": 2018 + }, + { + "epoch": 0.18317909635274904, + "grad_norm": 0.18609331867041823, + "learning_rate": 0.0009397405560513035, + "loss": 1.4488, + "step": 2019 + }, + { + "epoch": 0.18326982398838687, + "grad_norm": 0.22281756786506624, + "learning_rate": 0.0009396706095526107, + "loss": 1.5018, + "step": 2020 + }, + { + "epoch": 0.18336055162402468, + "grad_norm": 0.18564401711476441, + "learning_rate": 0.000939600625088296, + "loss": 1.4572, + "step": 2021 + }, + { + "epoch": 0.18345127925966248, + "grad_norm": 0.19202328841001925, + "learning_rate": 0.0009395306026644026, + "loss": 1.4539, + "step": 2022 + }, + { + "epoch": 0.1835420068953003, + "grad_norm": 
0.21085356043820716, + "learning_rate": 0.0009394605422869769, + "loss": 1.4529, + "step": 2023 + }, + { + "epoch": 0.18363273453093812, + "grad_norm": 0.21525150888677158, + "learning_rate": 0.0009393904439620685, + "loss": 1.5033, + "step": 2024 + }, + { + "epoch": 0.18372346216657595, + "grad_norm": 0.18728955068925918, + "learning_rate": 0.0009393203076957307, + "loss": 1.4839, + "step": 2025 + }, + { + "epoch": 0.18381418980221376, + "grad_norm": 0.21831515549218197, + "learning_rate": 0.0009392501334940193, + "loss": 1.4693, + "step": 2026 + }, + { + "epoch": 0.18390491743785156, + "grad_norm": 0.21317673134424908, + "learning_rate": 0.0009391799213629942, + "loss": 1.4527, + "step": 2027 + }, + { + "epoch": 0.1839956450734894, + "grad_norm": 0.19636532949772345, + "learning_rate": 0.0009391096713087183, + "loss": 1.4639, + "step": 2028 + }, + { + "epoch": 0.1840863727091272, + "grad_norm": 0.17113037474410708, + "learning_rate": 0.0009390393833372575, + "loss": 1.4623, + "step": 2029 + }, + { + "epoch": 0.184177100344765, + "grad_norm": 0.1916194581770531, + "learning_rate": 0.0009389690574546812, + "loss": 1.4516, + "step": 2030 + }, + { + "epoch": 0.18426782798040284, + "grad_norm": 0.17798554831510127, + "learning_rate": 0.000938898693667062, + "loss": 1.4949, + "step": 2031 + }, + { + "epoch": 0.18435855561604064, + "grad_norm": 0.18936452259002762, + "learning_rate": 0.0009388282919804761, + "loss": 1.4866, + "step": 2032 + }, + { + "epoch": 0.18444928325167845, + "grad_norm": 0.19618936496560863, + "learning_rate": 0.0009387578524010026, + "loss": 1.4789, + "step": 2033 + }, + { + "epoch": 0.18454001088731628, + "grad_norm": 0.18719348919452888, + "learning_rate": 0.0009386873749347236, + "loss": 1.4716, + "step": 2034 + }, + { + "epoch": 0.18463073852295409, + "grad_norm": 0.1805561556420591, + "learning_rate": 0.0009386168595877253, + "loss": 1.4662, + "step": 2035 + }, + { + "epoch": 0.18472146615859192, + "grad_norm": 0.185088753362548, + "learning_rate": 0.0009385463063660964, + "loss": 1.4578, + "step": 2036 + }, + { + "epoch": 0.18481219379422972, + "grad_norm": 0.17583736351149054, + "learning_rate": 0.0009384757152759293, + "loss": 1.457, + "step": 2037 + }, + { + "epoch": 0.18490292142986753, + "grad_norm": 0.17817988711017577, + "learning_rate": 0.0009384050863233198, + "loss": 1.4569, + "step": 2038 + }, + { + "epoch": 0.18499364906550536, + "grad_norm": 0.17186378128787744, + "learning_rate": 0.0009383344195143663, + "loss": 1.4853, + "step": 2039 + }, + { + "epoch": 0.18508437670114317, + "grad_norm": 0.16766084585666094, + "learning_rate": 0.0009382637148551711, + "loss": 1.5111, + "step": 2040 + }, + { + "epoch": 0.18517510433678097, + "grad_norm": 0.18309055404860444, + "learning_rate": 0.0009381929723518395, + "loss": 1.4537, + "step": 2041 + }, + { + "epoch": 0.1852658319724188, + "grad_norm": 0.17172312256656114, + "learning_rate": 0.0009381221920104801, + "loss": 1.4744, + "step": 2042 + }, + { + "epoch": 0.1853565596080566, + "grad_norm": 0.2130920466593361, + "learning_rate": 0.000938051373837205, + "loss": 1.4924, + "step": 2043 + }, + { + "epoch": 0.18544728724369444, + "grad_norm": 0.180223404099451, + "learning_rate": 0.0009379805178381291, + "loss": 1.4706, + "step": 2044 + }, + { + "epoch": 0.18553801487933225, + "grad_norm": 0.19639161167215788, + "learning_rate": 0.000937909624019371, + "loss": 1.4455, + "step": 2045 + }, + { + "epoch": 0.18562874251497005, + "grad_norm": 0.18671127345963429, + "learning_rate": 0.0009378386923870523, + "loss": 
1.4323, + "step": 2046 + }, + { + "epoch": 0.18571947015060788, + "grad_norm": 0.18575899565719275, + "learning_rate": 0.0009377677229472981, + "loss": 1.4951, + "step": 2047 + }, + { + "epoch": 0.1858101977862457, + "grad_norm": 0.18606824019992355, + "learning_rate": 0.0009376967157062363, + "loss": 1.4857, + "step": 2048 + }, + { + "epoch": 0.1859009254218835, + "grad_norm": 0.19277206675093192, + "learning_rate": 0.0009376256706699986, + "loss": 1.4765, + "step": 2049 + }, + { + "epoch": 0.18599165305752133, + "grad_norm": 0.20392313806818183, + "learning_rate": 0.0009375545878447198, + "loss": 1.456, + "step": 2050 + }, + { + "epoch": 0.18608238069315913, + "grad_norm": 0.1846117075052089, + "learning_rate": 0.0009374834672365379, + "loss": 1.4701, + "step": 2051 + }, + { + "epoch": 0.18617310832879694, + "grad_norm": 0.21304583646361783, + "learning_rate": 0.000937412308851594, + "loss": 1.4836, + "step": 2052 + }, + { + "epoch": 0.18626383596443477, + "grad_norm": 0.18436770255356866, + "learning_rate": 0.0009373411126960329, + "loss": 1.4718, + "step": 2053 + }, + { + "epoch": 0.18635456360007258, + "grad_norm": 0.17988600493958898, + "learning_rate": 0.0009372698787760021, + "loss": 1.4728, + "step": 2054 + }, + { + "epoch": 0.1864452912357104, + "grad_norm": 0.1755372955438893, + "learning_rate": 0.0009371986070976531, + "loss": 1.4931, + "step": 2055 + }, + { + "epoch": 0.1865360188713482, + "grad_norm": 0.16493461659446926, + "learning_rate": 0.0009371272976671397, + "loss": 1.4694, + "step": 2056 + }, + { + "epoch": 0.18662674650698602, + "grad_norm": 0.1931568178651495, + "learning_rate": 0.0009370559504906198, + "loss": 1.5242, + "step": 2057 + }, + { + "epoch": 0.18671747414262385, + "grad_norm": 0.1750423407924894, + "learning_rate": 0.0009369845655742542, + "loss": 1.5186, + "step": 2058 + }, + { + "epoch": 0.18680820177826166, + "grad_norm": 0.17167343454692496, + "learning_rate": 0.0009369131429242068, + "loss": 1.4592, + "step": 2059 + }, + { + "epoch": 0.18689892941389946, + "grad_norm": 0.17260456507266697, + "learning_rate": 0.0009368416825466453, + "loss": 1.4922, + "step": 2060 + }, + { + "epoch": 0.1869896570495373, + "grad_norm": 0.16910301651596202, + "learning_rate": 0.00093677018444774, + "loss": 1.4786, + "step": 2061 + }, + { + "epoch": 0.1870803846851751, + "grad_norm": 0.17101148822792275, + "learning_rate": 0.0009366986486336649, + "loss": 1.4591, + "step": 2062 + }, + { + "epoch": 0.18717111232081293, + "grad_norm": 0.1738015979324423, + "learning_rate": 0.0009366270751105972, + "loss": 1.4865, + "step": 2063 + }, + { + "epoch": 0.18726183995645074, + "grad_norm": 0.16894274123202857, + "learning_rate": 0.0009365554638847171, + "loss": 1.4816, + "step": 2064 + }, + { + "epoch": 0.18735256759208854, + "grad_norm": 0.16758892388732072, + "learning_rate": 0.0009364838149622086, + "loss": 1.4778, + "step": 2065 + }, + { + "epoch": 0.18744329522772638, + "grad_norm": 0.18591542965071994, + "learning_rate": 0.0009364121283492582, + "loss": 1.4325, + "step": 2066 + }, + { + "epoch": 0.18753402286336418, + "grad_norm": 0.18105515746099904, + "learning_rate": 0.000936340404052056, + "loss": 1.447, + "step": 2067 + }, + { + "epoch": 0.18762475049900199, + "grad_norm": 0.16985036746417229, + "learning_rate": 0.0009362686420767959, + "loss": 1.4928, + "step": 2068 + }, + { + "epoch": 0.18771547813463982, + "grad_norm": 0.18813010613626296, + "learning_rate": 0.000936196842429674, + "loss": 1.4346, + "step": 2069 + }, + { + "epoch": 0.18780620577027762, + 
"grad_norm": 0.18309986840953374, + "learning_rate": 0.0009361250051168907, + "loss": 1.4778, + "step": 2070 + }, + { + "epoch": 0.18789693340591543, + "grad_norm": 0.1762884640584661, + "learning_rate": 0.0009360531301446489, + "loss": 1.4589, + "step": 2071 + }, + { + "epoch": 0.18798766104155326, + "grad_norm": 0.1785027533980283, + "learning_rate": 0.0009359812175191549, + "loss": 1.4911, + "step": 2072 + }, + { + "epoch": 0.18807838867719107, + "grad_norm": 0.18133677649733643, + "learning_rate": 0.0009359092672466185, + "loss": 1.4452, + "step": 2073 + }, + { + "epoch": 0.1881691163128289, + "grad_norm": 0.17131411834682564, + "learning_rate": 0.0009358372793332528, + "loss": 1.4891, + "step": 2074 + }, + { + "epoch": 0.1882598439484667, + "grad_norm": 0.1770301872594754, + "learning_rate": 0.0009357652537852737, + "loss": 1.4664, + "step": 2075 + }, + { + "epoch": 0.1883505715841045, + "grad_norm": 0.18934790650188996, + "learning_rate": 0.0009356931906089008, + "loss": 1.4777, + "step": 2076 + }, + { + "epoch": 0.18844129921974234, + "grad_norm": 0.17622158738177093, + "learning_rate": 0.0009356210898103565, + "loss": 1.4552, + "step": 2077 + }, + { + "epoch": 0.18853202685538015, + "grad_norm": 0.18423674982559649, + "learning_rate": 0.0009355489513958671, + "loss": 1.4893, + "step": 2078 + }, + { + "epoch": 0.18862275449101795, + "grad_norm": 0.18320024544486335, + "learning_rate": 0.0009354767753716613, + "loss": 1.481, + "step": 2079 + }, + { + "epoch": 0.18871348212665578, + "grad_norm": 0.17389509047476853, + "learning_rate": 0.0009354045617439719, + "loss": 1.4824, + "step": 2080 + }, + { + "epoch": 0.1888042097622936, + "grad_norm": 0.16977733694351424, + "learning_rate": 0.0009353323105190345, + "loss": 1.4855, + "step": 2081 + }, + { + "epoch": 0.18889493739793142, + "grad_norm": 0.17935310979032326, + "learning_rate": 0.0009352600217030877, + "loss": 1.503, + "step": 2082 + }, + { + "epoch": 0.18898566503356923, + "grad_norm": 0.18869277706577176, + "learning_rate": 0.0009351876953023741, + "loss": 1.507, + "step": 2083 + }, + { + "epoch": 0.18907639266920703, + "grad_norm": 0.17772513523943498, + "learning_rate": 0.0009351153313231389, + "loss": 1.4194, + "step": 2084 + }, + { + "epoch": 0.18916712030484487, + "grad_norm": 0.17552666095597963, + "learning_rate": 0.0009350429297716305, + "loss": 1.4928, + "step": 2085 + }, + { + "epoch": 0.18925784794048267, + "grad_norm": 0.17472385207964122, + "learning_rate": 0.0009349704906541013, + "loss": 1.4623, + "step": 2086 + }, + { + "epoch": 0.18934857557612048, + "grad_norm": 0.19848091933939885, + "learning_rate": 0.0009348980139768058, + "loss": 1.4941, + "step": 2087 + }, + { + "epoch": 0.1894393032117583, + "grad_norm": 0.18633497294900941, + "learning_rate": 0.0009348254997460028, + "loss": 1.4551, + "step": 2088 + }, + { + "epoch": 0.1895300308473961, + "grad_norm": 0.16999235825772485, + "learning_rate": 0.0009347529479679539, + "loss": 1.4773, + "step": 2089 + }, + { + "epoch": 0.18962075848303392, + "grad_norm": 0.1839080639452047, + "learning_rate": 0.0009346803586489238, + "loss": 1.4629, + "step": 2090 + }, + { + "epoch": 0.18971148611867175, + "grad_norm": 0.17862744541881695, + "learning_rate": 0.0009346077317951806, + "loss": 1.448, + "step": 2091 + }, + { + "epoch": 0.18980221375430956, + "grad_norm": 0.18434928678383564, + "learning_rate": 0.0009345350674129958, + "loss": 1.5076, + "step": 2092 + }, + { + "epoch": 0.1898929413899474, + "grad_norm": 0.18226484084110706, + "learning_rate": 
0.0009344623655086438, + "loss": 1.4597, + "step": 2093 + }, + { + "epoch": 0.1899836690255852, + "grad_norm": 0.1761038832371258, + "learning_rate": 0.0009343896260884026, + "loss": 1.4711, + "step": 2094 + }, + { + "epoch": 0.190074396661223, + "grad_norm": 0.17853053156006432, + "learning_rate": 0.0009343168491585532, + "loss": 1.4879, + "step": 2095 + }, + { + "epoch": 0.19016512429686083, + "grad_norm": 0.18628807864549857, + "learning_rate": 0.0009342440347253798, + "loss": 1.4886, + "step": 2096 + }, + { + "epoch": 0.19025585193249864, + "grad_norm": 0.1770153764057505, + "learning_rate": 0.00093417118279517, + "loss": 1.4831, + "step": 2097 + }, + { + "epoch": 0.19034657956813644, + "grad_norm": 0.17880499871120614, + "learning_rate": 0.0009340982933742145, + "loss": 1.4582, + "step": 2098 + }, + { + "epoch": 0.19043730720377428, + "grad_norm": 0.1936451248172185, + "learning_rate": 0.0009340253664688075, + "loss": 1.4557, + "step": 2099 + }, + { + "epoch": 0.19052803483941208, + "grad_norm": 0.19490601841371208, + "learning_rate": 0.0009339524020852461, + "loss": 1.5352, + "step": 2100 + }, + { + "epoch": 0.1906187624750499, + "grad_norm": 0.18799124882951163, + "learning_rate": 0.0009338794002298307, + "loss": 1.4697, + "step": 2101 + }, + { + "epoch": 0.19070949011068772, + "grad_norm": 0.18055338694504536, + "learning_rate": 0.0009338063609088654, + "loss": 1.4501, + "step": 2102 + }, + { + "epoch": 0.19080021774632552, + "grad_norm": 0.17773071899263054, + "learning_rate": 0.0009337332841286567, + "loss": 1.4511, + "step": 2103 + }, + { + "epoch": 0.19089094538196336, + "grad_norm": 0.17307923198623026, + "learning_rate": 0.000933660169895515, + "loss": 1.4731, + "step": 2104 + }, + { + "epoch": 0.19098167301760116, + "grad_norm": 0.1598402409438659, + "learning_rate": 0.0009335870182157537, + "loss": 1.4327, + "step": 2105 + }, + { + "epoch": 0.19107240065323897, + "grad_norm": 0.17424844092166214, + "learning_rate": 0.0009335138290956896, + "loss": 1.4703, + "step": 2106 + }, + { + "epoch": 0.1911631282888768, + "grad_norm": 0.17055056613081884, + "learning_rate": 0.0009334406025416425, + "loss": 1.4934, + "step": 2107 + }, + { + "epoch": 0.1912538559245146, + "grad_norm": 0.1781384065200643, + "learning_rate": 0.0009333673385599352, + "loss": 1.4578, + "step": 2108 + }, + { + "epoch": 0.1913445835601524, + "grad_norm": 0.19210662801431977, + "learning_rate": 0.0009332940371568945, + "loss": 1.4667, + "step": 2109 + }, + { + "epoch": 0.19143531119579024, + "grad_norm": 0.197022746208077, + "learning_rate": 0.00093322069833885, + "loss": 1.4862, + "step": 2110 + }, + { + "epoch": 0.19152603883142805, + "grad_norm": 0.19346702382598419, + "learning_rate": 0.0009331473221121341, + "loss": 1.4735, + "step": 2111 + }, + { + "epoch": 0.19161676646706588, + "grad_norm": 0.20629235928204875, + "learning_rate": 0.0009330739084830832, + "loss": 1.4859, + "step": 2112 + }, + { + "epoch": 0.19170749410270368, + "grad_norm": 0.20849862995590931, + "learning_rate": 0.0009330004574580365, + "loss": 1.4678, + "step": 2113 + }, + { + "epoch": 0.1917982217383415, + "grad_norm": 0.1817073175248271, + "learning_rate": 0.0009329269690433364, + "loss": 1.449, + "step": 2114 + }, + { + "epoch": 0.19188894937397932, + "grad_norm": 0.18633660753857706, + "learning_rate": 0.0009328534432453289, + "loss": 1.4477, + "step": 2115 + }, + { + "epoch": 0.19197967700961713, + "grad_norm": 0.21170289939050782, + "learning_rate": 0.0009327798800703626, + "loss": 1.4603, + "step": 2116 + }, + { + "epoch": 
0.19207040464525493, + "grad_norm": 0.204293894858702, + "learning_rate": 0.00093270627952479, + "loss": 1.424, + "step": 2117 + }, + { + "epoch": 0.19216113228089277, + "grad_norm": 0.22996587191522055, + "learning_rate": 0.0009326326416149662, + "loss": 1.4448, + "step": 2118 + }, + { + "epoch": 0.19225185991653057, + "grad_norm": 0.20682530689481352, + "learning_rate": 0.0009325589663472503, + "loss": 1.4459, + "step": 2119 + }, + { + "epoch": 0.1923425875521684, + "grad_norm": 0.18934301042514107, + "learning_rate": 0.0009324852537280036, + "loss": 1.4787, + "step": 2120 + }, + { + "epoch": 0.1924333151878062, + "grad_norm": 0.20539979502756447, + "learning_rate": 0.0009324115037635917, + "loss": 1.4755, + "step": 2121 + }, + { + "epoch": 0.192524042823444, + "grad_norm": 0.19451035438710532, + "learning_rate": 0.0009323377164603825, + "loss": 1.4823, + "step": 2122 + }, + { + "epoch": 0.19261477045908185, + "grad_norm": 0.1975359008251044, + "learning_rate": 0.0009322638918247482, + "loss": 1.4955, + "step": 2123 + }, + { + "epoch": 0.19270549809471965, + "grad_norm": 0.18067566637862742, + "learning_rate": 0.0009321900298630628, + "loss": 1.4805, + "step": 2124 + }, + { + "epoch": 0.19279622573035746, + "grad_norm": 0.18232328724635163, + "learning_rate": 0.0009321161305817046, + "loss": 1.4398, + "step": 2125 + }, + { + "epoch": 0.1928869533659953, + "grad_norm": 0.17089974736533584, + "learning_rate": 0.0009320421939870549, + "loss": 1.4682, + "step": 2126 + }, + { + "epoch": 0.1929776810016331, + "grad_norm": 0.19358101871262987, + "learning_rate": 0.0009319682200854981, + "loss": 1.4824, + "step": 2127 + }, + { + "epoch": 0.1930684086372709, + "grad_norm": 0.1944507417464099, + "learning_rate": 0.0009318942088834219, + "loss": 1.4721, + "step": 2128 + }, + { + "epoch": 0.19315913627290873, + "grad_norm": 0.19264186317838708, + "learning_rate": 0.0009318201603872169, + "loss": 1.4328, + "step": 2129 + }, + { + "epoch": 0.19324986390854654, + "grad_norm": 0.19783342909645538, + "learning_rate": 0.0009317460746032777, + "loss": 1.4674, + "step": 2130 + }, + { + "epoch": 0.19334059154418437, + "grad_norm": 0.17969577397792152, + "learning_rate": 0.0009316719515380011, + "loss": 1.445, + "step": 2131 + }, + { + "epoch": 0.19343131917982218, + "grad_norm": 0.18306108034692592, + "learning_rate": 0.000931597791197788, + "loss": 1.4181, + "step": 2132 + }, + { + "epoch": 0.19352204681545998, + "grad_norm": 0.18434495004167534, + "learning_rate": 0.0009315235935890418, + "loss": 1.4859, + "step": 2133 + }, + { + "epoch": 0.1936127744510978, + "grad_norm": 0.18458723256139092, + "learning_rate": 0.0009314493587181699, + "loss": 1.4851, + "step": 2134 + }, + { + "epoch": 0.19370350208673562, + "grad_norm": 0.19644937769116874, + "learning_rate": 0.0009313750865915822, + "loss": 1.4673, + "step": 2135 + }, + { + "epoch": 0.19379422972237342, + "grad_norm": 0.29012529013280597, + "learning_rate": 0.0009313007772156923, + "loss": 1.4944, + "step": 2136 + }, + { + "epoch": 0.19388495735801126, + "grad_norm": 0.18630115849725665, + "learning_rate": 0.0009312264305969166, + "loss": 1.4611, + "step": 2137 + }, + { + "epoch": 0.19397568499364906, + "grad_norm": 0.23446360642773534, + "learning_rate": 0.0009311520467416751, + "loss": 1.4803, + "step": 2138 + }, + { + "epoch": 0.1940664126292869, + "grad_norm": 0.1958066602996492, + "learning_rate": 0.0009310776256563908, + "loss": 1.4957, + "step": 2139 + }, + { + "epoch": 0.1941571402649247, + "grad_norm": 0.23268458253710253, + "learning_rate": 
0.00093100316734749, + "loss": 1.469, + "step": 2140 + }, + { + "epoch": 0.1942478679005625, + "grad_norm": 0.1818556828947201, + "learning_rate": 0.0009309286718214022, + "loss": 1.4538, + "step": 2141 + }, + { + "epoch": 0.19433859553620034, + "grad_norm": 0.2066014998401165, + "learning_rate": 0.0009308541390845601, + "loss": 1.4657, + "step": 2142 + }, + { + "epoch": 0.19442932317183814, + "grad_norm": 0.17299858127489265, + "learning_rate": 0.0009307795691433996, + "loss": 1.444, + "step": 2143 + }, + { + "epoch": 0.19452005080747595, + "grad_norm": 0.1992779393617308, + "learning_rate": 0.0009307049620043597, + "loss": 1.4864, + "step": 2144 + }, + { + "epoch": 0.19461077844311378, + "grad_norm": 0.17772805131067473, + "learning_rate": 0.0009306303176738829, + "loss": 1.5246, + "step": 2145 + }, + { + "epoch": 0.19470150607875158, + "grad_norm": 0.19106522994219155, + "learning_rate": 0.0009305556361584147, + "loss": 1.4378, + "step": 2146 + }, + { + "epoch": 0.1947922337143894, + "grad_norm": 0.18993064309141144, + "learning_rate": 0.0009304809174644038, + "loss": 1.5039, + "step": 2147 + }, + { + "epoch": 0.19488296135002722, + "grad_norm": 0.18214800983165008, + "learning_rate": 0.0009304061615983024, + "loss": 1.4917, + "step": 2148 + }, + { + "epoch": 0.19497368898566503, + "grad_norm": 0.18284860104587028, + "learning_rate": 0.0009303313685665655, + "loss": 1.4592, + "step": 2149 + }, + { + "epoch": 0.19506441662130286, + "grad_norm": 0.19231837591184123, + "learning_rate": 0.0009302565383756514, + "loss": 1.487, + "step": 2150 + }, + { + "epoch": 0.19515514425694067, + "grad_norm": 0.18753596104659698, + "learning_rate": 0.0009301816710320217, + "loss": 1.4324, + "step": 2151 + }, + { + "epoch": 0.19524587189257847, + "grad_norm": 0.1976452932615034, + "learning_rate": 0.0009301067665421414, + "loss": 1.5019, + "step": 2152 + }, + { + "epoch": 0.1953365995282163, + "grad_norm": 0.17569313083992827, + "learning_rate": 0.0009300318249124785, + "loss": 1.4325, + "step": 2153 + }, + { + "epoch": 0.1954273271638541, + "grad_norm": 0.19486282231254032, + "learning_rate": 0.000929956846149504, + "loss": 1.4396, + "step": 2154 + }, + { + "epoch": 0.1955180547994919, + "grad_norm": 0.20079043768240507, + "learning_rate": 0.0009298818302596926, + "loss": 1.4539, + "step": 2155 + }, + { + "epoch": 0.19560878243512975, + "grad_norm": 0.1810734930493356, + "learning_rate": 0.0009298067772495218, + "loss": 1.4597, + "step": 2156 + }, + { + "epoch": 0.19569951007076755, + "grad_norm": 0.18231160206014976, + "learning_rate": 0.0009297316871254725, + "loss": 1.4888, + "step": 2157 + }, + { + "epoch": 0.19579023770640538, + "grad_norm": 0.1759741512798001, + "learning_rate": 0.0009296565598940284, + "loss": 1.4647, + "step": 2158 + }, + { + "epoch": 0.1958809653420432, + "grad_norm": 0.19301920226349767, + "learning_rate": 0.0009295813955616772, + "loss": 1.4646, + "step": 2159 + }, + { + "epoch": 0.195971692977681, + "grad_norm": 0.1754916087760411, + "learning_rate": 0.0009295061941349092, + "loss": 1.4823, + "step": 2160 + }, + { + "epoch": 0.19606242061331883, + "grad_norm": 0.20050698779304588, + "learning_rate": 0.0009294309556202179, + "loss": 1.4694, + "step": 2161 + }, + { + "epoch": 0.19615314824895663, + "grad_norm": 0.1813866685700504, + "learning_rate": 0.0009293556800241005, + "loss": 1.4541, + "step": 2162 + }, + { + "epoch": 0.19624387588459444, + "grad_norm": 0.18749517318582712, + "learning_rate": 0.0009292803673530567, + "loss": 1.4368, + "step": 2163 + }, + { + "epoch": 
0.19633460352023227, + "grad_norm": 0.2198981839119871, + "learning_rate": 0.00092920501761359, + "loss": 1.5277, + "step": 2164 + }, + { + "epoch": 0.19642533115587008, + "grad_norm": 0.21252144777562473, + "learning_rate": 0.0009291296308122067, + "loss": 1.456, + "step": 2165 + }, + { + "epoch": 0.19651605879150788, + "grad_norm": 0.19008460746621622, + "learning_rate": 0.0009290542069554166, + "loss": 1.4828, + "step": 2166 + }, + { + "epoch": 0.1966067864271457, + "grad_norm": 0.1979293078098261, + "learning_rate": 0.0009289787460497323, + "loss": 1.4493, + "step": 2167 + }, + { + "epoch": 0.19669751406278352, + "grad_norm": 0.17961367796796007, + "learning_rate": 0.0009289032481016704, + "loss": 1.4783, + "step": 2168 + }, + { + "epoch": 0.19678824169842135, + "grad_norm": 0.17266347396644136, + "learning_rate": 0.0009288277131177495, + "loss": 1.5025, + "step": 2169 + }, + { + "epoch": 0.19687896933405916, + "grad_norm": 0.21323047579705104, + "learning_rate": 0.0009287521411044925, + "loss": 1.4579, + "step": 2170 + }, + { + "epoch": 0.19696969696969696, + "grad_norm": 0.17769374700467658, + "learning_rate": 0.0009286765320684249, + "loss": 1.4567, + "step": 2171 + }, + { + "epoch": 0.1970604246053348, + "grad_norm": 0.18736590774474426, + "learning_rate": 0.0009286008860160755, + "loss": 1.5166, + "step": 2172 + }, + { + "epoch": 0.1971511522409726, + "grad_norm": 0.1910355225431383, + "learning_rate": 0.0009285252029539766, + "loss": 1.461, + "step": 2173 + }, + { + "epoch": 0.1972418798766104, + "grad_norm": 0.18558485966753985, + "learning_rate": 0.0009284494828886631, + "loss": 1.458, + "step": 2174 + }, + { + "epoch": 0.19733260751224824, + "grad_norm": 0.21364076471319007, + "learning_rate": 0.0009283737258266737, + "loss": 1.456, + "step": 2175 + }, + { + "epoch": 0.19742333514788604, + "grad_norm": 0.22067482211159536, + "learning_rate": 0.0009282979317745499, + "loss": 1.4721, + "step": 2176 + }, + { + "epoch": 0.19751406278352387, + "grad_norm": 0.20001136159951016, + "learning_rate": 0.0009282221007388366, + "loss": 1.4622, + "step": 2177 + }, + { + "epoch": 0.19760479041916168, + "grad_norm": 0.20706416118154103, + "learning_rate": 0.0009281462327260818, + "loss": 1.4986, + "step": 2178 + }, + { + "epoch": 0.19769551805479949, + "grad_norm": 0.19976455569221083, + "learning_rate": 0.0009280703277428366, + "loss": 1.4639, + "step": 2179 + }, + { + "epoch": 0.19778624569043732, + "grad_norm": 0.18251128086784443, + "learning_rate": 0.0009279943857956556, + "loss": 1.4893, + "step": 2180 + }, + { + "epoch": 0.19787697332607512, + "grad_norm": 0.19688866056705828, + "learning_rate": 0.0009279184068910965, + "loss": 1.4701, + "step": 2181 + }, + { + "epoch": 0.19796770096171293, + "grad_norm": 0.18610718125620157, + "learning_rate": 0.0009278423910357195, + "loss": 1.4449, + "step": 2182 + }, + { + "epoch": 0.19805842859735076, + "grad_norm": 0.19371069520586603, + "learning_rate": 0.0009277663382360892, + "loss": 1.5007, + "step": 2183 + }, + { + "epoch": 0.19814915623298857, + "grad_norm": 0.1820942368804205, + "learning_rate": 0.0009276902484987725, + "loss": 1.4896, + "step": 2184 + }, + { + "epoch": 0.19823988386862637, + "grad_norm": 0.17604650571484146, + "learning_rate": 0.0009276141218303398, + "loss": 1.4782, + "step": 2185 + }, + { + "epoch": 0.1983306115042642, + "grad_norm": 0.19914946780567086, + "learning_rate": 0.0009275379582373647, + "loss": 1.4882, + "step": 2186 + }, + { + "epoch": 0.198421339139902, + "grad_norm": 0.18112189754389174, + "learning_rate": 
0.0009274617577264238, + "loss": 1.4425, + "step": 2187 + }, + { + "epoch": 0.19851206677553984, + "grad_norm": 0.19407613234537083, + "learning_rate": 0.0009273855203040973, + "loss": 1.4491, + "step": 2188 + }, + { + "epoch": 0.19860279441117765, + "grad_norm": 0.19100872344363626, + "learning_rate": 0.0009273092459769678, + "loss": 1.4616, + "step": 2189 + }, + { + "epoch": 0.19869352204681545, + "grad_norm": 0.18286159292512347, + "learning_rate": 0.0009272329347516223, + "loss": 1.4682, + "step": 2190 + }, + { + "epoch": 0.19878424968245328, + "grad_norm": 0.17894024127056007, + "learning_rate": 0.0009271565866346498, + "loss": 1.4681, + "step": 2191 + }, + { + "epoch": 0.1988749773180911, + "grad_norm": 0.18124652083552678, + "learning_rate": 0.0009270802016326429, + "loss": 1.4774, + "step": 2192 + }, + { + "epoch": 0.1989657049537289, + "grad_norm": 0.1821214564175679, + "learning_rate": 0.000927003779752198, + "loss": 1.491, + "step": 2193 + }, + { + "epoch": 0.19905643258936673, + "grad_norm": 0.17617383374484294, + "learning_rate": 0.0009269273209999134, + "loss": 1.5135, + "step": 2194 + }, + { + "epoch": 0.19914716022500453, + "grad_norm": 0.18336048284701859, + "learning_rate": 0.000926850825382392, + "loss": 1.449, + "step": 2195 + }, + { + "epoch": 0.19923788786064237, + "grad_norm": 0.19140701435724197, + "learning_rate": 0.0009267742929062389, + "loss": 1.4893, + "step": 2196 + }, + { + "epoch": 0.19932861549628017, + "grad_norm": 0.17515670776274309, + "learning_rate": 0.0009266977235780625, + "loss": 1.4721, + "step": 2197 + }, + { + "epoch": 0.19941934313191798, + "grad_norm": 0.20991495370473154, + "learning_rate": 0.0009266211174044749, + "loss": 1.4731, + "step": 2198 + }, + { + "epoch": 0.1995100707675558, + "grad_norm": 0.1772922644436165, + "learning_rate": 0.0009265444743920909, + "loss": 1.4126, + "step": 2199 + }, + { + "epoch": 0.1996007984031936, + "grad_norm": 0.2039632580466463, + "learning_rate": 0.0009264677945475286, + "loss": 1.4608, + "step": 2200 + }, + { + "epoch": 0.19969152603883142, + "grad_norm": 0.18237273261001866, + "learning_rate": 0.0009263910778774096, + "loss": 1.4378, + "step": 2201 + }, + { + "epoch": 0.19978225367446925, + "grad_norm": 0.18326031905123463, + "learning_rate": 0.0009263143243883581, + "loss": 1.4817, + "step": 2202 + }, + { + "epoch": 0.19987298131010706, + "grad_norm": 0.17522838267806362, + "learning_rate": 0.0009262375340870017, + "loss": 1.4519, + "step": 2203 + }, + { + "epoch": 0.19996370894574486, + "grad_norm": 0.18123877554628667, + "learning_rate": 0.0009261607069799716, + "loss": 1.4517, + "step": 2204 + }, + { + "epoch": 0.2000544365813827, + "grad_norm": 0.16843320789162539, + "learning_rate": 0.0009260838430739016, + "loss": 1.4337, + "step": 2205 + }, + { + "epoch": 0.2001451642170205, + "grad_norm": 0.17666823578787486, + "learning_rate": 0.0009260069423754289, + "loss": 1.4487, + "step": 2206 + }, + { + "epoch": 0.20023589185265833, + "grad_norm": 0.17254884206709165, + "learning_rate": 0.000925930004891194, + "loss": 1.4777, + "step": 2207 + }, + { + "epoch": 0.20032661948829614, + "grad_norm": 0.17754294601284623, + "learning_rate": 0.0009258530306278404, + "loss": 1.453, + "step": 2208 + }, + { + "epoch": 0.20041734712393394, + "grad_norm": 0.17556191305138863, + "learning_rate": 0.0009257760195920148, + "loss": 1.433, + "step": 2209 + }, + { + "epoch": 0.20050807475957177, + "grad_norm": 0.19178074676556708, + "learning_rate": 0.0009256989717903671, + "loss": 1.4652, + "step": 2210 + }, + { + 
"epoch": 0.20059880239520958, + "grad_norm": 0.17999019543808487, + "learning_rate": 0.0009256218872295505, + "loss": 1.4547, + "step": 2211 + }, + { + "epoch": 0.20068953003084739, + "grad_norm": 0.21204851665658112, + "learning_rate": 0.0009255447659162213, + "loss": 1.4826, + "step": 2212 + }, + { + "epoch": 0.20078025766648522, + "grad_norm": 0.17126950702676602, + "learning_rate": 0.0009254676078570388, + "loss": 1.4694, + "step": 2213 + }, + { + "epoch": 0.20087098530212302, + "grad_norm": 0.17856606112888854, + "learning_rate": 0.0009253904130586654, + "loss": 1.4785, + "step": 2214 + }, + { + "epoch": 0.20096171293776086, + "grad_norm": 0.17228259634136583, + "learning_rate": 0.0009253131815277674, + "loss": 1.4471, + "step": 2215 + }, + { + "epoch": 0.20105244057339866, + "grad_norm": 0.1758901773065002, + "learning_rate": 0.0009252359132710136, + "loss": 1.4518, + "step": 2216 + }, + { + "epoch": 0.20114316820903647, + "grad_norm": 0.1794887300565247, + "learning_rate": 0.0009251586082950758, + "loss": 1.4955, + "step": 2217 + }, + { + "epoch": 0.2012338958446743, + "grad_norm": 0.18730548778231898, + "learning_rate": 0.0009250812666066295, + "loss": 1.4712, + "step": 2218 + }, + { + "epoch": 0.2013246234803121, + "grad_norm": 0.18087177612716124, + "learning_rate": 0.0009250038882123533, + "loss": 1.4333, + "step": 2219 + }, + { + "epoch": 0.2014153511159499, + "grad_norm": 0.21264552823784147, + "learning_rate": 0.0009249264731189287, + "loss": 1.4654, + "step": 2220 + }, + { + "epoch": 0.20150607875158774, + "grad_norm": 0.1897747646322132, + "learning_rate": 0.0009248490213330405, + "loss": 1.4618, + "step": 2221 + }, + { + "epoch": 0.20159680638722555, + "grad_norm": 0.1857050922856305, + "learning_rate": 0.0009247715328613766, + "loss": 1.5002, + "step": 2222 + }, + { + "epoch": 0.20168753402286335, + "grad_norm": 0.19972446081237302, + "learning_rate": 0.0009246940077106282, + "loss": 1.4978, + "step": 2223 + }, + { + "epoch": 0.20177826165850118, + "grad_norm": 0.17645613462453036, + "learning_rate": 0.0009246164458874899, + "loss": 1.4426, + "step": 2224 + }, + { + "epoch": 0.201868989294139, + "grad_norm": 0.16657934524764684, + "learning_rate": 0.0009245388473986587, + "loss": 1.4422, + "step": 2225 + }, + { + "epoch": 0.20195971692977682, + "grad_norm": 0.18868933102660035, + "learning_rate": 0.0009244612122508355, + "loss": 1.463, + "step": 2226 + }, + { + "epoch": 0.20205044456541463, + "grad_norm": 0.2083717104316274, + "learning_rate": 0.0009243835404507242, + "loss": 1.4738, + "step": 2227 + }, + { + "epoch": 0.20214117220105243, + "grad_norm": 0.19060394068749684, + "learning_rate": 0.0009243058320050313, + "loss": 1.484, + "step": 2228 + }, + { + "epoch": 0.20223189983669027, + "grad_norm": 0.1949868264778469, + "learning_rate": 0.0009242280869204676, + "loss": 1.4476, + "step": 2229 + }, + { + "epoch": 0.20232262747232807, + "grad_norm": 0.18542103851615035, + "learning_rate": 0.0009241503052037458, + "loss": 1.4345, + "step": 2230 + }, + { + "epoch": 0.20241335510796588, + "grad_norm": 0.18490692399971048, + "learning_rate": 0.0009240724868615828, + "loss": 1.4811, + "step": 2231 + }, + { + "epoch": 0.2025040827436037, + "grad_norm": 0.1922082427437554, + "learning_rate": 0.0009239946319006982, + "loss": 1.4492, + "step": 2232 + }, + { + "epoch": 0.2025948103792415, + "grad_norm": 0.18855498318496403, + "learning_rate": 0.0009239167403278143, + "loss": 1.4573, + "step": 2233 + }, + { + "epoch": 0.20268553801487932, + "grad_norm": 0.18831771110963658, + 
"learning_rate": 0.0009238388121496577, + "loss": 1.4677, + "step": 2234 + }, + { + "epoch": 0.20277626565051715, + "grad_norm": 0.19253984723252768, + "learning_rate": 0.0009237608473729571, + "loss": 1.5011, + "step": 2235 + }, + { + "epoch": 0.20286699328615496, + "grad_norm": 0.1905503442060351, + "learning_rate": 0.0009236828460044449, + "loss": 1.4509, + "step": 2236 + }, + { + "epoch": 0.2029577209217928, + "grad_norm": 0.20869087141932305, + "learning_rate": 0.0009236048080508564, + "loss": 1.4722, + "step": 2237 + }, + { + "epoch": 0.2030484485574306, + "grad_norm": 0.1903971769572516, + "learning_rate": 0.0009235267335189303, + "loss": 1.4521, + "step": 2238 + }, + { + "epoch": 0.2031391761930684, + "grad_norm": 0.2011055009923827, + "learning_rate": 0.0009234486224154085, + "loss": 1.4594, + "step": 2239 + }, + { + "epoch": 0.20322990382870623, + "grad_norm": 0.1953248179602791, + "learning_rate": 0.0009233704747470356, + "loss": 1.4648, + "step": 2240 + }, + { + "epoch": 0.20332063146434404, + "grad_norm": 0.18533114245015306, + "learning_rate": 0.0009232922905205599, + "loss": 1.4438, + "step": 2241 + }, + { + "epoch": 0.20341135909998184, + "grad_norm": 0.22109566063854716, + "learning_rate": 0.0009232140697427324, + "loss": 1.4434, + "step": 2242 + }, + { + "epoch": 0.20350208673561967, + "grad_norm": 0.194806060729964, + "learning_rate": 0.0009231358124203077, + "loss": 1.4876, + "step": 2243 + }, + { + "epoch": 0.20359281437125748, + "grad_norm": 0.1817127475152464, + "learning_rate": 0.0009230575185600431, + "loss": 1.4698, + "step": 2244 + }, + { + "epoch": 0.2036835420068953, + "grad_norm": 0.1688586853649706, + "learning_rate": 0.0009229791881686997, + "loss": 1.4443, + "step": 2245 + }, + { + "epoch": 0.20377426964253312, + "grad_norm": 0.1839252787827211, + "learning_rate": 0.0009229008212530409, + "loss": 1.4282, + "step": 2246 + }, + { + "epoch": 0.20386499727817092, + "grad_norm": 0.18955227582428785, + "learning_rate": 0.0009228224178198338, + "loss": 1.4493, + "step": 2247 + }, + { + "epoch": 0.20395572491380876, + "grad_norm": 0.2004201906354816, + "learning_rate": 0.0009227439778758486, + "loss": 1.4419, + "step": 2248 + }, + { + "epoch": 0.20404645254944656, + "grad_norm": 0.17704063369087483, + "learning_rate": 0.0009226655014278587, + "loss": 1.4398, + "step": 2249 + }, + { + "epoch": 0.20413718018508437, + "grad_norm": 0.2008532877307107, + "learning_rate": 0.0009225869884826403, + "loss": 1.4876, + "step": 2250 + }, + { + "epoch": 0.2042279078207222, + "grad_norm": 0.20598710448127336, + "learning_rate": 0.0009225084390469733, + "loss": 1.4831, + "step": 2251 + }, + { + "epoch": 0.20431863545636, + "grad_norm": 0.19516533610659254, + "learning_rate": 0.0009224298531276404, + "loss": 1.4566, + "step": 2252 + }, + { + "epoch": 0.2044093630919978, + "grad_norm": 0.23652807254811936, + "learning_rate": 0.0009223512307314272, + "loss": 1.4753, + "step": 2253 + }, + { + "epoch": 0.20450009072763564, + "grad_norm": 0.21503379249342688, + "learning_rate": 0.0009222725718651231, + "loss": 1.4494, + "step": 2254 + }, + { + "epoch": 0.20459081836327345, + "grad_norm": 0.2090662406119697, + "learning_rate": 0.0009221938765355202, + "loss": 1.4783, + "step": 2255 + }, + { + "epoch": 0.20468154599891128, + "grad_norm": 0.2538977495226577, + "learning_rate": 0.0009221151447494138, + "loss": 1.4927, + "step": 2256 + }, + { + "epoch": 0.20477227363454908, + "grad_norm": 0.20758882555401345, + "learning_rate": 0.0009220363765136024, + "loss": 1.46, + "step": 2257 + }, + { 
+ "epoch": 0.2048630012701869, + "grad_norm": 0.20374264657788127, + "learning_rate": 0.0009219575718348879, + "loss": 1.4824, + "step": 2258 + }, + { + "epoch": 0.20495372890582472, + "grad_norm": 0.19981465739606402, + "learning_rate": 0.0009218787307200746, + "loss": 1.4365, + "step": 2259 + }, + { + "epoch": 0.20504445654146253, + "grad_norm": 0.19497254957928878, + "learning_rate": 0.0009217998531759708, + "loss": 1.4736, + "step": 2260 + }, + { + "epoch": 0.20513518417710033, + "grad_norm": 0.18620480967052633, + "learning_rate": 0.0009217209392093876, + "loss": 1.46, + "step": 2261 + }, + { + "epoch": 0.20522591181273817, + "grad_norm": 0.2103380710786011, + "learning_rate": 0.000921641988827139, + "loss": 1.4543, + "step": 2262 + }, + { + "epoch": 0.20531663944837597, + "grad_norm": 0.19053218238853556, + "learning_rate": 0.0009215630020360427, + "loss": 1.4628, + "step": 2263 + }, + { + "epoch": 0.2054073670840138, + "grad_norm": 0.18737976363988904, + "learning_rate": 0.000921483978842919, + "loss": 1.4909, + "step": 2264 + }, + { + "epoch": 0.2054980947196516, + "grad_norm": 0.28285184502219884, + "learning_rate": 0.0009214049192545915, + "loss": 1.4665, + "step": 2265 + }, + { + "epoch": 0.2055888223552894, + "grad_norm": 0.18391504463789018, + "learning_rate": 0.0009213258232778871, + "loss": 1.4544, + "step": 2266 + }, + { + "epoch": 0.20567954999092725, + "grad_norm": 0.18065394161409587, + "learning_rate": 0.000921246690919636, + "loss": 1.4572, + "step": 2267 + }, + { + "epoch": 0.20577027762656505, + "grad_norm": 0.1755812337375875, + "learning_rate": 0.0009211675221866708, + "loss": 1.4252, + "step": 2268 + }, + { + "epoch": 0.20586100526220286, + "grad_norm": 0.3966630961719673, + "learning_rate": 0.0009210883170858282, + "loss": 1.5001, + "step": 2269 + }, + { + "epoch": 0.2059517328978407, + "grad_norm": 0.17451990178676052, + "learning_rate": 0.0009210090756239471, + "loss": 1.4992, + "step": 2270 + }, + { + "epoch": 0.2060424605334785, + "grad_norm": 0.16936897954643887, + "learning_rate": 0.0009209297978078705, + "loss": 1.4476, + "step": 2271 + }, + { + "epoch": 0.2061331881691163, + "grad_norm": 0.17702294126226584, + "learning_rate": 0.0009208504836444436, + "loss": 1.4838, + "step": 2272 + }, + { + "epoch": 0.20622391580475413, + "grad_norm": 0.17837055405464555, + "learning_rate": 0.0009207711331405156, + "loss": 1.48, + "step": 2273 + }, + { + "epoch": 0.20631464344039194, + "grad_norm": 0.1694060177687306, + "learning_rate": 0.0009206917463029381, + "loss": 1.4862, + "step": 2274 + }, + { + "epoch": 0.20640537107602977, + "grad_norm": 0.18257093943207484, + "learning_rate": 0.0009206123231385665, + "loss": 1.4809, + "step": 2275 + }, + { + "epoch": 0.20649609871166758, + "grad_norm": 0.19186928729027544, + "learning_rate": 0.0009205328636542587, + "loss": 1.4572, + "step": 2276 + }, + { + "epoch": 0.20658682634730538, + "grad_norm": 0.17731788880503052, + "learning_rate": 0.000920453367856876, + "loss": 1.4474, + "step": 2277 + }, + { + "epoch": 0.2066775539829432, + "grad_norm": 0.16951928831022, + "learning_rate": 0.0009203738357532831, + "loss": 1.4659, + "step": 2278 + }, + { + "epoch": 0.20676828161858102, + "grad_norm": 0.16706522061513043, + "learning_rate": 0.0009202942673503476, + "loss": 1.4461, + "step": 2279 + }, + { + "epoch": 0.20685900925421882, + "grad_norm": 0.21045681546410205, + "learning_rate": 0.0009202146626549402, + "loss": 1.4722, + "step": 2280 + }, + { + "epoch": 0.20694973688985666, + "grad_norm": 0.16547637196277143, + 
"learning_rate": 0.0009201350216739347, + "loss": 1.492, + "step": 2281 + }, + { + "epoch": 0.20704046452549446, + "grad_norm": 0.1627617643144374, + "learning_rate": 0.0009200553444142081, + "loss": 1.4911, + "step": 2282 + }, + { + "epoch": 0.2071311921611323, + "grad_norm": 0.16726373945277287, + "learning_rate": 0.0009199756308826407, + "loss": 1.4166, + "step": 2283 + }, + { + "epoch": 0.2072219197967701, + "grad_norm": 0.16123849900186704, + "learning_rate": 0.0009198958810861155, + "loss": 1.4522, + "step": 2284 + }, + { + "epoch": 0.2073126474324079, + "grad_norm": 0.16783933755216668, + "learning_rate": 0.0009198160950315193, + "loss": 1.4691, + "step": 2285 + }, + { + "epoch": 0.20740337506804574, + "grad_norm": 0.16151346339348427, + "learning_rate": 0.0009197362727257412, + "loss": 1.4716, + "step": 2286 + }, + { + "epoch": 0.20749410270368354, + "grad_norm": 0.1677121184586632, + "learning_rate": 0.0009196564141756743, + "loss": 1.4894, + "step": 2287 + }, + { + "epoch": 0.20758483033932135, + "grad_norm": 0.16282249336187063, + "learning_rate": 0.000919576519388214, + "loss": 1.4837, + "step": 2288 + }, + { + "epoch": 0.20767555797495918, + "grad_norm": 0.16145060430171296, + "learning_rate": 0.0009194965883702596, + "loss": 1.4494, + "step": 2289 + }, + { + "epoch": 0.20776628561059698, + "grad_norm": 0.16687355562163694, + "learning_rate": 0.000919416621128713, + "loss": 1.4693, + "step": 2290 + }, + { + "epoch": 0.2078570132462348, + "grad_norm": 0.1750299826959227, + "learning_rate": 0.0009193366176704792, + "loss": 1.475, + "step": 2291 + }, + { + "epoch": 0.20794774088187262, + "grad_norm": 0.17627362685640882, + "learning_rate": 0.0009192565780024667, + "loss": 1.4707, + "step": 2292 + }, + { + "epoch": 0.20803846851751043, + "grad_norm": 0.17527761376800863, + "learning_rate": 0.0009191765021315868, + "loss": 1.4894, + "step": 2293 + }, + { + "epoch": 0.20812919615314826, + "grad_norm": 0.177546054899134, + "learning_rate": 0.0009190963900647543, + "loss": 1.4442, + "step": 2294 + }, + { + "epoch": 0.20821992378878607, + "grad_norm": 0.16096916766409924, + "learning_rate": 0.0009190162418088865, + "loss": 1.4534, + "step": 2295 + }, + { + "epoch": 0.20831065142442387, + "grad_norm": 0.20947435655825647, + "learning_rate": 0.0009189360573709047, + "loss": 1.429, + "step": 2296 + }, + { + "epoch": 0.2084013790600617, + "grad_norm": 0.18229533156192207, + "learning_rate": 0.0009188558367577327, + "loss": 1.4753, + "step": 2297 + }, + { + "epoch": 0.2084921066956995, + "grad_norm": 0.17742307697986376, + "learning_rate": 0.0009187755799762972, + "loss": 1.455, + "step": 2298 + }, + { + "epoch": 0.2085828343313373, + "grad_norm": 0.16853890257047263, + "learning_rate": 0.0009186952870335288, + "loss": 1.4278, + "step": 2299 + }, + { + "epoch": 0.20867356196697515, + "grad_norm": 0.16680509171176872, + "learning_rate": 0.0009186149579363605, + "loss": 1.4978, + "step": 2300 + }, + { + "epoch": 0.20876428960261295, + "grad_norm": 0.1736521291471785, + "learning_rate": 0.000918534592691729, + "loss": 1.4516, + "step": 2301 + }, + { + "epoch": 0.20885501723825078, + "grad_norm": 0.1749575777574434, + "learning_rate": 0.0009184541913065739, + "loss": 1.5207, + "step": 2302 + }, + { + "epoch": 0.2089457448738886, + "grad_norm": 0.17211702753951508, + "learning_rate": 0.0009183737537878374, + "loss": 1.444, + "step": 2303 + }, + { + "epoch": 0.2090364725095264, + "grad_norm": 0.18576847419099063, + "learning_rate": 0.0009182932801424657, + "loss": 1.4873, + "step": 2304 + }, + { 
+ "epoch": 0.20912720014516423, + "grad_norm": 0.18474332610815503, + "learning_rate": 0.0009182127703774077, + "loss": 1.4447, + "step": 2305 + }, + { + "epoch": 0.20921792778080203, + "grad_norm": 0.17876130627437586, + "learning_rate": 0.0009181322244996153, + "loss": 1.4589, + "step": 2306 + }, + { + "epoch": 0.20930865541643984, + "grad_norm": 0.16858236365600254, + "learning_rate": 0.0009180516425160436, + "loss": 1.4799, + "step": 2307 + }, + { + "epoch": 0.20939938305207767, + "grad_norm": 0.174384845502563, + "learning_rate": 0.0009179710244336512, + "loss": 1.4757, + "step": 2308 + }, + { + "epoch": 0.20949011068771548, + "grad_norm": 0.16953410626735105, + "learning_rate": 0.0009178903702593991, + "loss": 1.4639, + "step": 2309 + }, + { + "epoch": 0.20958083832335328, + "grad_norm": 0.2289254511802314, + "learning_rate": 0.0009178096800002518, + "loss": 1.4582, + "step": 2310 + }, + { + "epoch": 0.2096715659589911, + "grad_norm": 0.17902527650154598, + "learning_rate": 0.0009177289536631772, + "loss": 1.4424, + "step": 2311 + }, + { + "epoch": 0.20976229359462892, + "grad_norm": 0.1711555140723602, + "learning_rate": 0.0009176481912551458, + "loss": 1.3919, + "step": 2312 + }, + { + "epoch": 0.20985302123026675, + "grad_norm": 0.1855174341268104, + "learning_rate": 0.0009175673927831316, + "loss": 1.4696, + "step": 2313 + }, + { + "epoch": 0.20994374886590456, + "grad_norm": 0.2264997156651889, + "learning_rate": 0.0009174865582541115, + "loss": 1.4522, + "step": 2314 + }, + { + "epoch": 0.21003447650154236, + "grad_norm": 0.16140176675807938, + "learning_rate": 0.0009174056876750655, + "loss": 1.4576, + "step": 2315 + }, + { + "epoch": 0.2101252041371802, + "grad_norm": 0.17275692316977315, + "learning_rate": 0.0009173247810529768, + "loss": 1.4773, + "step": 2316 + }, + { + "epoch": 0.210215931772818, + "grad_norm": 0.18008641353367402, + "learning_rate": 0.0009172438383948318, + "loss": 1.4481, + "step": 2317 + }, + { + "epoch": 0.2103066594084558, + "grad_norm": 0.17323711793192495, + "learning_rate": 0.00091716285970762, + "loss": 1.4785, + "step": 2318 + }, + { + "epoch": 0.21039738704409364, + "grad_norm": 0.1907779199000258, + "learning_rate": 0.0009170818449983336, + "loss": 1.436, + "step": 2319 + }, + { + "epoch": 0.21048811467973144, + "grad_norm": 0.22305032950607925, + "learning_rate": 0.0009170007942739685, + "loss": 1.4455, + "step": 2320 + }, + { + "epoch": 0.21057884231536927, + "grad_norm": 0.17262104801565656, + "learning_rate": 0.0009169197075415233, + "loss": 1.4587, + "step": 2321 + }, + { + "epoch": 0.21066956995100708, + "grad_norm": 0.16874876633842623, + "learning_rate": 0.0009168385848080001, + "loss": 1.429, + "step": 2322 + }, + { + "epoch": 0.21076029758664488, + "grad_norm": 0.18579947906102112, + "learning_rate": 0.0009167574260804034, + "loss": 1.4326, + "step": 2323 + }, + { + "epoch": 0.21085102522228272, + "grad_norm": 0.18063547273551678, + "learning_rate": 0.0009166762313657417, + "loss": 1.4962, + "step": 2324 + }, + { + "epoch": 0.21094175285792052, + "grad_norm": 0.19470943978338467, + "learning_rate": 0.0009165950006710259, + "loss": 1.4462, + "step": 2325 + }, + { + "epoch": 0.21103248049355833, + "grad_norm": 0.19150576279897685, + "learning_rate": 0.0009165137340032705, + "loss": 1.4513, + "step": 2326 + }, + { + "epoch": 0.21112320812919616, + "grad_norm": 0.16854414674586776, + "learning_rate": 0.0009164324313694928, + "loss": 1.4768, + "step": 2327 + }, + { + "epoch": 0.21121393576483397, + "grad_norm": 0.18986247301916648, + 
"learning_rate": 0.0009163510927767131, + "loss": 1.4645, + "step": 2328 + }, + { + "epoch": 0.21130466340047177, + "grad_norm": 0.1653516346732713, + "learning_rate": 0.0009162697182319553, + "loss": 1.4525, + "step": 2329 + }, + { + "epoch": 0.2113953910361096, + "grad_norm": 0.18722991409410722, + "learning_rate": 0.0009161883077422459, + "loss": 1.4395, + "step": 2330 + }, + { + "epoch": 0.2114861186717474, + "grad_norm": 0.17501313238336375, + "learning_rate": 0.0009161068613146149, + "loss": 1.4482, + "step": 2331 + }, + { + "epoch": 0.21157684630738524, + "grad_norm": 0.1922151692509321, + "learning_rate": 0.000916025378956095, + "loss": 1.4397, + "step": 2332 + }, + { + "epoch": 0.21166757394302305, + "grad_norm": 0.18524617169841112, + "learning_rate": 0.0009159438606737223, + "loss": 1.4365, + "step": 2333 + }, + { + "epoch": 0.21175830157866085, + "grad_norm": 0.1940672803794177, + "learning_rate": 0.000915862306474536, + "loss": 1.4379, + "step": 2334 + }, + { + "epoch": 0.21184902921429868, + "grad_norm": 0.20201732457549282, + "learning_rate": 0.0009157807163655782, + "loss": 1.472, + "step": 2335 + }, + { + "epoch": 0.2119397568499365, + "grad_norm": 0.16968511387744684, + "learning_rate": 0.0009156990903538944, + "loss": 1.4658, + "step": 2336 + }, + { + "epoch": 0.2120304844855743, + "grad_norm": 0.18050669433836478, + "learning_rate": 0.0009156174284465327, + "loss": 1.4576, + "step": 2337 + }, + { + "epoch": 0.21212121212121213, + "grad_norm": 0.16815235816601548, + "learning_rate": 0.0009155357306505447, + "loss": 1.4809, + "step": 2338 + }, + { + "epoch": 0.21221193975684993, + "grad_norm": 0.17876264643848971, + "learning_rate": 0.0009154539969729854, + "loss": 1.4689, + "step": 2339 + }, + { + "epoch": 0.21230266739248776, + "grad_norm": 0.18885733315871547, + "learning_rate": 0.0009153722274209121, + "loss": 1.4322, + "step": 2340 + }, + { + "epoch": 0.21239339502812557, + "grad_norm": 0.1818925514965045, + "learning_rate": 0.0009152904220013858, + "loss": 1.4758, + "step": 2341 + }, + { + "epoch": 0.21248412266376338, + "grad_norm": 0.17573112298359017, + "learning_rate": 0.0009152085807214703, + "loss": 1.4294, + "step": 2342 + }, + { + "epoch": 0.2125748502994012, + "grad_norm": 0.17277678206994518, + "learning_rate": 0.0009151267035882326, + "loss": 1.4526, + "step": 2343 + }, + { + "epoch": 0.212665577935039, + "grad_norm": 0.17908170667501255, + "learning_rate": 0.0009150447906087429, + "loss": 1.5004, + "step": 2344 + }, + { + "epoch": 0.21275630557067682, + "grad_norm": 0.1730142105416477, + "learning_rate": 0.0009149628417900743, + "loss": 1.4772, + "step": 2345 + }, + { + "epoch": 0.21284703320631465, + "grad_norm": 0.1794597438881871, + "learning_rate": 0.0009148808571393034, + "loss": 1.4347, + "step": 2346 + }, + { + "epoch": 0.21293776084195246, + "grad_norm": 0.17528478522282778, + "learning_rate": 0.0009147988366635092, + "loss": 1.4246, + "step": 2347 + }, + { + "epoch": 0.21302848847759026, + "grad_norm": 0.15807182160782624, + "learning_rate": 0.0009147167803697742, + "loss": 1.4448, + "step": 2348 + }, + { + "epoch": 0.2131192161132281, + "grad_norm": 0.173082891201107, + "learning_rate": 0.0009146346882651841, + "loss": 1.4635, + "step": 2349 + }, + { + "epoch": 0.2132099437488659, + "grad_norm": 0.18487029568851474, + "learning_rate": 0.0009145525603568277, + "loss": 1.4793, + "step": 2350 + }, + { + "epoch": 0.21330067138450373, + "grad_norm": 0.19333890367927428, + "learning_rate": 0.0009144703966517966, + "loss": 1.4005, + "step": 2351 + }, 
+ { + "epoch": 0.21339139902014154, + "grad_norm": 0.16450603235253813, + "learning_rate": 0.0009143881971571857, + "loss": 1.4952, + "step": 2352 + }, + { + "epoch": 0.21348212665577934, + "grad_norm": 0.16252615396827982, + "learning_rate": 0.0009143059618800927, + "loss": 1.4392, + "step": 2353 + }, + { + "epoch": 0.21357285429141717, + "grad_norm": 0.18775290428193617, + "learning_rate": 0.000914223690827619, + "loss": 1.4626, + "step": 2354 + }, + { + "epoch": 0.21366358192705498, + "grad_norm": 0.16977711419812164, + "learning_rate": 0.0009141413840068684, + "loss": 1.4688, + "step": 2355 + }, + { + "epoch": 0.21375430956269278, + "grad_norm": 0.17684024867213416, + "learning_rate": 0.0009140590414249485, + "loss": 1.4619, + "step": 2356 + }, + { + "epoch": 0.21384503719833062, + "grad_norm": 0.17559843984832108, + "learning_rate": 0.0009139766630889692, + "loss": 1.4298, + "step": 2357 + }, + { + "epoch": 0.21393576483396842, + "grad_norm": 0.16729428685954395, + "learning_rate": 0.000913894249006044, + "loss": 1.4198, + "step": 2358 + }, + { + "epoch": 0.21402649246960626, + "grad_norm": 0.1939800014642457, + "learning_rate": 0.0009138117991832894, + "loss": 1.4432, + "step": 2359 + }, + { + "epoch": 0.21411722010524406, + "grad_norm": 0.1926619685800366, + "learning_rate": 0.0009137293136278248, + "loss": 1.4576, + "step": 2360 + }, + { + "epoch": 0.21420794774088187, + "grad_norm": 0.21316075140752733, + "learning_rate": 0.0009136467923467733, + "loss": 1.4639, + "step": 2361 + }, + { + "epoch": 0.2142986753765197, + "grad_norm": 0.17072263437493956, + "learning_rate": 0.0009135642353472602, + "loss": 1.4526, + "step": 2362 + }, + { + "epoch": 0.2143894030121575, + "grad_norm": 0.18027086382387422, + "learning_rate": 0.0009134816426364144, + "loss": 1.4525, + "step": 2363 + }, + { + "epoch": 0.2144801306477953, + "grad_norm": 0.1720660497165026, + "learning_rate": 0.0009133990142213678, + "loss": 1.4525, + "step": 2364 + }, + { + "epoch": 0.21457085828343314, + "grad_norm": 0.17905480941546345, + "learning_rate": 0.0009133163501092555, + "loss": 1.4802, + "step": 2365 + }, + { + "epoch": 0.21466158591907095, + "grad_norm": 0.2202458620436215, + "learning_rate": 0.0009132336503072153, + "loss": 1.4466, + "step": 2366 + }, + { + "epoch": 0.21475231355470875, + "grad_norm": 0.16742088935614144, + "learning_rate": 0.0009131509148223886, + "loss": 1.4841, + "step": 2367 + }, + { + "epoch": 0.21484304119034658, + "grad_norm": 0.17469709367405795, + "learning_rate": 0.0009130681436619193, + "loss": 1.4604, + "step": 2368 + }, + { + "epoch": 0.2149337688259844, + "grad_norm": 0.17163457778595356, + "learning_rate": 0.0009129853368329552, + "loss": 1.4769, + "step": 2369 + }, + { + "epoch": 0.21502449646162222, + "grad_norm": 0.29902299208552807, + "learning_rate": 0.0009129024943426463, + "loss": 1.4654, + "step": 2370 + }, + { + "epoch": 0.21511522409726003, + "grad_norm": 0.17117287643643178, + "learning_rate": 0.0009128196161981462, + "loss": 1.4168, + "step": 2371 + }, + { + "epoch": 0.21520595173289783, + "grad_norm": 0.17600023669006912, + "learning_rate": 0.0009127367024066112, + "loss": 1.4407, + "step": 2372 + }, + { + "epoch": 0.21529667936853567, + "grad_norm": 0.17302259712978027, + "learning_rate": 0.0009126537529752012, + "loss": 1.492, + "step": 2373 + }, + { + "epoch": 0.21538740700417347, + "grad_norm": 0.17846099485381042, + "learning_rate": 0.0009125707679110789, + "loss": 1.4562, + "step": 2374 + }, + { + "epoch": 0.21547813463981128, + "grad_norm": 
0.1757979084983244, + "learning_rate": 0.00091248774722141, + "loss": 1.4652, + "step": 2375 + }, + { + "epoch": 0.2155688622754491, + "grad_norm": 0.15802476197736975, + "learning_rate": 0.0009124046909133633, + "loss": 1.4273, + "step": 2376 + }, + { + "epoch": 0.2156595899110869, + "grad_norm": 0.16954925122257497, + "learning_rate": 0.0009123215989941107, + "loss": 1.4798, + "step": 2377 + }, + { + "epoch": 0.21575031754672475, + "grad_norm": 0.1816149164685076, + "learning_rate": 0.0009122384714708273, + "loss": 1.4703, + "step": 2378 + }, + { + "epoch": 0.21584104518236255, + "grad_norm": 0.17442132495932933, + "learning_rate": 0.000912155308350691, + "loss": 1.4296, + "step": 2379 + }, + { + "epoch": 0.21593177281800036, + "grad_norm": 0.1873397264673129, + "learning_rate": 0.0009120721096408833, + "loss": 1.4691, + "step": 2380 + }, + { + "epoch": 0.2160225004536382, + "grad_norm": 0.17802328042142151, + "learning_rate": 0.0009119888753485881, + "loss": 1.4881, + "step": 2381 + }, + { + "epoch": 0.216113228089276, + "grad_norm": 0.177595012384486, + "learning_rate": 0.0009119056054809929, + "loss": 1.4283, + "step": 2382 + }, + { + "epoch": 0.2162039557249138, + "grad_norm": 0.20053419421957588, + "learning_rate": 0.0009118223000452877, + "loss": 1.435, + "step": 2383 + }, + { + "epoch": 0.21629468336055163, + "grad_norm": 0.17487392757259398, + "learning_rate": 0.0009117389590486665, + "loss": 1.4396, + "step": 2384 + }, + { + "epoch": 0.21638541099618944, + "grad_norm": 0.16768365352157094, + "learning_rate": 0.0009116555824983252, + "loss": 1.4683, + "step": 2385 + }, + { + "epoch": 0.21647613863182724, + "grad_norm": 0.18365662915613398, + "learning_rate": 0.0009115721704014639, + "loss": 1.4345, + "step": 2386 + }, + { + "epoch": 0.21656686626746507, + "grad_norm": 0.16524167646512478, + "learning_rate": 0.000911488722765285, + "loss": 1.4225, + "step": 2387 + }, + { + "epoch": 0.21665759390310288, + "grad_norm": 0.17469199603529098, + "learning_rate": 0.0009114052395969942, + "loss": 1.4768, + "step": 2388 + }, + { + "epoch": 0.2167483215387407, + "grad_norm": 0.15305190270795224, + "learning_rate": 0.0009113217209038003, + "loss": 1.466, + "step": 2389 + }, + { + "epoch": 0.21683904917437852, + "grad_norm": 0.18226751260482327, + "learning_rate": 0.0009112381666929153, + "loss": 1.4702, + "step": 2390 + }, + { + "epoch": 0.21692977681001632, + "grad_norm": 0.42961799211650104, + "learning_rate": 0.0009111545769715538, + "loss": 1.4877, + "step": 2391 + }, + { + "epoch": 0.21702050444565416, + "grad_norm": 0.15645257571628196, + "learning_rate": 0.0009110709517469341, + "loss": 1.4502, + "step": 2392 + }, + { + "epoch": 0.21711123208129196, + "grad_norm": 0.17747902860523404, + "learning_rate": 0.0009109872910262771, + "loss": 1.4424, + "step": 2393 + }, + { + "epoch": 0.21720195971692977, + "grad_norm": 0.1870367177576535, + "learning_rate": 0.0009109035948168069, + "loss": 1.5017, + "step": 2394 + }, + { + "epoch": 0.2172926873525676, + "grad_norm": 0.18072262836603017, + "learning_rate": 0.0009108198631257508, + "loss": 1.4893, + "step": 2395 + }, + { + "epoch": 0.2173834149882054, + "grad_norm": 0.15784195738068948, + "learning_rate": 0.0009107360959603391, + "loss": 1.4124, + "step": 2396 + }, + { + "epoch": 0.21747414262384324, + "grad_norm": 0.16041841462002132, + "learning_rate": 0.0009106522933278047, + "loss": 1.4557, + "step": 2397 + }, + { + "epoch": 0.21756487025948104, + "grad_norm": 0.16449165556822984, + "learning_rate": 0.0009105684552353844, + "loss": 
1.4628, + "step": 2398 + }, + { + "epoch": 0.21765559789511885, + "grad_norm": 0.1657735040600592, + "learning_rate": 0.0009104845816903174, + "loss": 1.4701, + "step": 2399 + }, + { + "epoch": 0.21774632553075668, + "grad_norm": 0.1882743649431247, + "learning_rate": 0.0009104006726998464, + "loss": 1.4754, + "step": 2400 + }, + { + "epoch": 0.21783705316639448, + "grad_norm": 0.15688733254915543, + "learning_rate": 0.0009103167282712167, + "loss": 1.4461, + "step": 2401 + }, + { + "epoch": 0.2179277808020323, + "grad_norm": 0.16797434315425583, + "learning_rate": 0.0009102327484116771, + "loss": 1.4641, + "step": 2402 + }, + { + "epoch": 0.21801850843767012, + "grad_norm": 0.17089293684850979, + "learning_rate": 0.0009101487331284792, + "loss": 1.4946, + "step": 2403 + }, + { + "epoch": 0.21810923607330793, + "grad_norm": 0.18461441421948982, + "learning_rate": 0.0009100646824288778, + "loss": 1.4506, + "step": 2404 + }, + { + "epoch": 0.21819996370894573, + "grad_norm": 0.17654005840919318, + "learning_rate": 0.0009099805963201305, + "loss": 1.5026, + "step": 2405 + }, + { + "epoch": 0.21829069134458357, + "grad_norm": 0.20917004582325355, + "learning_rate": 0.0009098964748094985, + "loss": 1.4863, + "step": 2406 + }, + { + "epoch": 0.21838141898022137, + "grad_norm": 0.18036347453373205, + "learning_rate": 0.0009098123179042452, + "loss": 1.4406, + "step": 2407 + }, + { + "epoch": 0.2184721466158592, + "grad_norm": 0.1782459834851043, + "learning_rate": 0.0009097281256116381, + "loss": 1.471, + "step": 2408 + }, + { + "epoch": 0.218562874251497, + "grad_norm": 0.1851916027096605, + "learning_rate": 0.0009096438979389468, + "loss": 1.464, + "step": 2409 + }, + { + "epoch": 0.2186536018871348, + "grad_norm": 0.18454738172579388, + "learning_rate": 0.0009095596348934448, + "loss": 1.4746, + "step": 2410 + }, + { + "epoch": 0.21874432952277265, + "grad_norm": 0.16535486782135994, + "learning_rate": 0.0009094753364824076, + "loss": 1.4683, + "step": 2411 + }, + { + "epoch": 0.21883505715841045, + "grad_norm": 0.17610531831707052, + "learning_rate": 0.000909391002713115, + "loss": 1.4514, + "step": 2412 + }, + { + "epoch": 0.21892578479404826, + "grad_norm": 0.16898704696207872, + "learning_rate": 0.0009093066335928488, + "loss": 1.4772, + "step": 2413 + }, + { + "epoch": 0.2190165124296861, + "grad_norm": 0.17597182461392344, + "learning_rate": 0.0009092222291288947, + "loss": 1.4734, + "step": 2414 + }, + { + "epoch": 0.2191072400653239, + "grad_norm": 0.1769115117010821, + "learning_rate": 0.0009091377893285406, + "loss": 1.4402, + "step": 2415 + }, + { + "epoch": 0.21919796770096173, + "grad_norm": 0.1796435591915719, + "learning_rate": 0.000909053314199078, + "loss": 1.4558, + "step": 2416 + }, + { + "epoch": 0.21928869533659953, + "grad_norm": 0.16987391363269944, + "learning_rate": 0.0009089688037478016, + "loss": 1.4559, + "step": 2417 + }, + { + "epoch": 0.21937942297223734, + "grad_norm": 0.18344218914935123, + "learning_rate": 0.0009088842579820087, + "loss": 1.4664, + "step": 2418 + }, + { + "epoch": 0.21947015060787517, + "grad_norm": 0.18262842534221754, + "learning_rate": 0.000908799676909, + "loss": 1.436, + "step": 2419 + }, + { + "epoch": 0.21956087824351297, + "grad_norm": 0.2203115566376877, + "learning_rate": 0.0009087150605360788, + "loss": 1.4334, + "step": 2420 + }, + { + "epoch": 0.21965160587915078, + "grad_norm": 0.1748593570082947, + "learning_rate": 0.0009086304088705518, + "loss": 1.4837, + "step": 2421 + }, + { + "epoch": 0.2197423335147886, + "grad_norm": 
0.17897385813512606, + "learning_rate": 0.0009085457219197288, + "loss": 1.4956, + "step": 2422 + }, + { + "epoch": 0.21983306115042642, + "grad_norm": 0.17505380168872597, + "learning_rate": 0.0009084609996909226, + "loss": 1.4725, + "step": 2423 + }, + { + "epoch": 0.21992378878606422, + "grad_norm": 0.17290900449742813, + "learning_rate": 0.0009083762421914489, + "loss": 1.4574, + "step": 2424 + }, + { + "epoch": 0.22001451642170206, + "grad_norm": 0.19650878619422835, + "learning_rate": 0.0009082914494286266, + "loss": 1.4476, + "step": 2425 + }, + { + "epoch": 0.22010524405733986, + "grad_norm": 0.1886129989872123, + "learning_rate": 0.0009082066214097772, + "loss": 1.4758, + "step": 2426 + }, + { + "epoch": 0.2201959716929777, + "grad_norm": 0.17272213269677605, + "learning_rate": 0.0009081217581422259, + "loss": 1.4662, + "step": 2427 + }, + { + "epoch": 0.2202866993286155, + "grad_norm": 0.1851240183241611, + "learning_rate": 0.0009080368596333006, + "loss": 1.4623, + "step": 2428 + }, + { + "epoch": 0.2203774269642533, + "grad_norm": 0.17605786591177885, + "learning_rate": 0.0009079519258903326, + "loss": 1.4083, + "step": 2429 + }, + { + "epoch": 0.22046815459989114, + "grad_norm": 0.1937693338081879, + "learning_rate": 0.0009078669569206555, + "loss": 1.4837, + "step": 2430 + }, + { + "epoch": 0.22055888223552894, + "grad_norm": 0.20796307919988385, + "learning_rate": 0.0009077819527316066, + "loss": 1.447, + "step": 2431 + }, + { + "epoch": 0.22064960987116675, + "grad_norm": 0.18277961586674737, + "learning_rate": 0.000907696913330526, + "loss": 1.4706, + "step": 2432 + }, + { + "epoch": 0.22074033750680458, + "grad_norm": 0.23206897441617622, + "learning_rate": 0.0009076118387247568, + "loss": 1.4545, + "step": 2433 + }, + { + "epoch": 0.22083106514244238, + "grad_norm": 0.1757419705788909, + "learning_rate": 0.0009075267289216451, + "loss": 1.4633, + "step": 2434 + }, + { + "epoch": 0.22092179277808022, + "grad_norm": 0.19336072058491435, + "learning_rate": 0.0009074415839285405, + "loss": 1.4517, + "step": 2435 + }, + { + "epoch": 0.22101252041371802, + "grad_norm": 0.17122630422176047, + "learning_rate": 0.0009073564037527951, + "loss": 1.4642, + "step": 2436 + }, + { + "epoch": 0.22110324804935583, + "grad_norm": 0.20580184497524187, + "learning_rate": 0.0009072711884017641, + "loss": 1.459, + "step": 2437 + }, + { + "epoch": 0.22119397568499366, + "grad_norm": 0.1827211948675623, + "learning_rate": 0.000907185937882806, + "loss": 1.4513, + "step": 2438 + }, + { + "epoch": 0.22128470332063147, + "grad_norm": 0.18203001817702738, + "learning_rate": 0.000907100652203282, + "loss": 1.4856, + "step": 2439 + }, + { + "epoch": 0.22137543095626927, + "grad_norm": 0.19979453363437585, + "learning_rate": 0.0009070153313705569, + "loss": 1.4705, + "step": 2440 + }, + { + "epoch": 0.2214661585919071, + "grad_norm": 0.19294490984791102, + "learning_rate": 0.0009069299753919979, + "loss": 1.4237, + "step": 2441 + }, + { + "epoch": 0.2215568862275449, + "grad_norm": 0.22436136196795345, + "learning_rate": 0.0009068445842749754, + "loss": 1.4114, + "step": 2442 + }, + { + "epoch": 0.2216476138631827, + "grad_norm": 0.18032020240592245, + "learning_rate": 0.0009067591580268632, + "loss": 1.4424, + "step": 2443 + }, + { + "epoch": 0.22173834149882055, + "grad_norm": 0.25586766550810375, + "learning_rate": 0.0009066736966550378, + "loss": 1.4864, + "step": 2444 + }, + { + "epoch": 0.22182906913445835, + "grad_norm": 0.18007156828120455, + "learning_rate": 0.0009065882001668786, + "loss": 
1.4618, + "step": 2445 + }, + { + "epoch": 0.22191979677009618, + "grad_norm": 0.17179755556467094, + "learning_rate": 0.0009065026685697686, + "loss": 1.5158, + "step": 2446 + }, + { + "epoch": 0.222010524405734, + "grad_norm": 0.17197005314131425, + "learning_rate": 0.0009064171018710931, + "loss": 1.4638, + "step": 2447 + }, + { + "epoch": 0.2221012520413718, + "grad_norm": 0.16822506575040433, + "learning_rate": 0.0009063315000782411, + "loss": 1.4553, + "step": 2448 + }, + { + "epoch": 0.22219197967700963, + "grad_norm": 0.1846182977541171, + "learning_rate": 0.0009062458631986039, + "loss": 1.4672, + "step": 2449 + }, + { + "epoch": 0.22228270731264743, + "grad_norm": 0.17295733154131512, + "learning_rate": 0.0009061601912395767, + "loss": 1.4574, + "step": 2450 + }, + { + "epoch": 0.22237343494828524, + "grad_norm": 0.16575259665107403, + "learning_rate": 0.0009060744842085573, + "loss": 1.4528, + "step": 2451 + }, + { + "epoch": 0.22246416258392307, + "grad_norm": 0.16061438341563292, + "learning_rate": 0.0009059887421129461, + "loss": 1.4619, + "step": 2452 + }, + { + "epoch": 0.22255489021956087, + "grad_norm": 0.18275396257980175, + "learning_rate": 0.0009059029649601474, + "loss": 1.4603, + "step": 2453 + }, + { + "epoch": 0.22264561785519868, + "grad_norm": 0.17104502253453344, + "learning_rate": 0.0009058171527575676, + "loss": 1.4186, + "step": 2454 + }, + { + "epoch": 0.2227363454908365, + "grad_norm": 0.17036020297212173, + "learning_rate": 0.000905731305512617, + "loss": 1.4958, + "step": 2455 + }, + { + "epoch": 0.22282707312647432, + "grad_norm": 0.17350513918055924, + "learning_rate": 0.0009056454232327083, + "loss": 1.4581, + "step": 2456 + }, + { + "epoch": 0.22291780076211215, + "grad_norm": 0.16742178374354513, + "learning_rate": 0.0009055595059252575, + "loss": 1.4406, + "step": 2457 + }, + { + "epoch": 0.22300852839774996, + "grad_norm": 0.1657515413190582, + "learning_rate": 0.0009054735535976837, + "loss": 1.4906, + "step": 2458 + }, + { + "epoch": 0.22309925603338776, + "grad_norm": 0.17246854480397966, + "learning_rate": 0.0009053875662574087, + "loss": 1.5157, + "step": 2459 + }, + { + "epoch": 0.2231899836690256, + "grad_norm": 0.1644007130131495, + "learning_rate": 0.0009053015439118577, + "loss": 1.4707, + "step": 2460 + }, + { + "epoch": 0.2232807113046634, + "grad_norm": 0.165971942376175, + "learning_rate": 0.0009052154865684585, + "loss": 1.4249, + "step": 2461 + }, + { + "epoch": 0.2233714389403012, + "grad_norm": 0.15773312086956714, + "learning_rate": 0.0009051293942346425, + "loss": 1.4627, + "step": 2462 + }, + { + "epoch": 0.22346216657593904, + "grad_norm": 0.15812348457275305, + "learning_rate": 0.0009050432669178434, + "loss": 1.4696, + "step": 2463 + }, + { + "epoch": 0.22355289421157684, + "grad_norm": 0.17872078970090283, + "learning_rate": 0.0009049571046254988, + "loss": 1.4329, + "step": 2464 + }, + { + "epoch": 0.22364362184721467, + "grad_norm": 0.1612278247172875, + "learning_rate": 0.0009048709073650482, + "loss": 1.4237, + "step": 2465 + }, + { + "epoch": 0.22373434948285248, + "grad_norm": 0.1522254392977764, + "learning_rate": 0.0009047846751439353, + "loss": 1.4768, + "step": 2466 + }, + { + "epoch": 0.22382507711849028, + "grad_norm": 0.23139303058053454, + "learning_rate": 0.0009046984079696059, + "loss": 1.4458, + "step": 2467 + }, + { + "epoch": 0.22391580475412812, + "grad_norm": 0.16151818731366588, + "learning_rate": 0.0009046121058495093, + "loss": 1.4551, + "step": 2468 + }, + { + "epoch": 0.22400653238976592, + 
"grad_norm": 0.15710208956261998, + "learning_rate": 0.0009045257687910978, + "loss": 1.4647, + "step": 2469 + }, + { + "epoch": 0.22409726002540373, + "grad_norm": 0.17029370334541835, + "learning_rate": 0.0009044393968018265, + "loss": 1.491, + "step": 2470 + }, + { + "epoch": 0.22418798766104156, + "grad_norm": 0.20243072612831442, + "learning_rate": 0.0009043529898891538, + "loss": 1.4274, + "step": 2471 + }, + { + "epoch": 0.22427871529667937, + "grad_norm": 0.1848969222783181, + "learning_rate": 0.0009042665480605408, + "loss": 1.468, + "step": 2472 + }, + { + "epoch": 0.22436944293231717, + "grad_norm": 0.16827994653241304, + "learning_rate": 0.0009041800713234517, + "loss": 1.4388, + "step": 2473 + }, + { + "epoch": 0.224460170567955, + "grad_norm": 0.16019763613728105, + "learning_rate": 0.0009040935596853539, + "loss": 1.471, + "step": 2474 + }, + { + "epoch": 0.2245508982035928, + "grad_norm": 0.17754758544816604, + "learning_rate": 0.0009040070131537177, + "loss": 1.4753, + "step": 2475 + }, + { + "epoch": 0.22464162583923064, + "grad_norm": 0.15964979664774684, + "learning_rate": 0.0009039204317360163, + "loss": 1.4673, + "step": 2476 + }, + { + "epoch": 0.22473235347486845, + "grad_norm": 0.16074996849390952, + "learning_rate": 0.0009038338154397261, + "loss": 1.4414, + "step": 2477 + }, + { + "epoch": 0.22482308111050625, + "grad_norm": 0.15400124977733115, + "learning_rate": 0.0009037471642723265, + "loss": 1.4787, + "step": 2478 + }, + { + "epoch": 0.22491380874614408, + "grad_norm": 0.16165398970975947, + "learning_rate": 0.0009036604782412997, + "loss": 1.4402, + "step": 2479 + }, + { + "epoch": 0.2250045363817819, + "grad_norm": 0.16267193246609288, + "learning_rate": 0.0009035737573541312, + "loss": 1.4408, + "step": 2480 + }, + { + "epoch": 0.2250952640174197, + "grad_norm": 0.16194493710948707, + "learning_rate": 0.0009034870016183092, + "loss": 1.4587, + "step": 2481 + }, + { + "epoch": 0.22518599165305753, + "grad_norm": 0.16080083520606608, + "learning_rate": 0.0009034002110413251, + "loss": 1.4799, + "step": 2482 + }, + { + "epoch": 0.22527671928869533, + "grad_norm": 0.16483025725630632, + "learning_rate": 0.0009033133856306733, + "loss": 1.4578, + "step": 2483 + }, + { + "epoch": 0.22536744692433316, + "grad_norm": 0.17460926350143435, + "learning_rate": 0.0009032265253938513, + "loss": 1.4514, + "step": 2484 + }, + { + "epoch": 0.22545817455997097, + "grad_norm": 0.16182445247913044, + "learning_rate": 0.0009031396303383595, + "loss": 1.4566, + "step": 2485 + }, + { + "epoch": 0.22554890219560877, + "grad_norm": 0.1582468917288134, + "learning_rate": 0.0009030527004717009, + "loss": 1.4173, + "step": 2486 + }, + { + "epoch": 0.2256396298312466, + "grad_norm": 0.15600801161846498, + "learning_rate": 0.0009029657358013825, + "loss": 1.4483, + "step": 2487 + }, + { + "epoch": 0.2257303574668844, + "grad_norm": 0.1665265586570981, + "learning_rate": 0.0009028787363349133, + "loss": 1.4675, + "step": 2488 + }, + { + "epoch": 0.22582108510252222, + "grad_norm": 0.1671771588752467, + "learning_rate": 0.0009027917020798058, + "loss": 1.4719, + "step": 2489 + }, + { + "epoch": 0.22591181273816005, + "grad_norm": 0.1564927520939081, + "learning_rate": 0.0009027046330435755, + "loss": 1.486, + "step": 2490 + }, + { + "epoch": 0.22600254037379786, + "grad_norm": 0.16773838484973644, + "learning_rate": 0.0009026175292337409, + "loss": 1.459, + "step": 2491 + }, + { + "epoch": 0.22609326800943566, + "grad_norm": 0.15330284322538432, + "learning_rate": 0.0009025303906578231, 
+ "loss": 1.4616, + "step": 2492 + }, + { + "epoch": 0.2261839956450735, + "grad_norm": 0.21682875337839164, + "learning_rate": 0.0009024432173233468, + "loss": 1.455, + "step": 2493 + }, + { + "epoch": 0.2262747232807113, + "grad_norm": 0.16195442765045828, + "learning_rate": 0.0009023560092378393, + "loss": 1.4647, + "step": 2494 + }, + { + "epoch": 0.22636545091634913, + "grad_norm": 0.17024065016697917, + "learning_rate": 0.0009022687664088314, + "loss": 1.4683, + "step": 2495 + }, + { + "epoch": 0.22645617855198694, + "grad_norm": 0.17359708014520017, + "learning_rate": 0.000902181488843856, + "loss": 1.4813, + "step": 2496 + }, + { + "epoch": 0.22654690618762474, + "grad_norm": 0.1709562774801282, + "learning_rate": 0.0009020941765504498, + "loss": 1.4271, + "step": 2497 + }, + { + "epoch": 0.22663763382326257, + "grad_norm": 0.28192830403666524, + "learning_rate": 0.0009020068295361522, + "loss": 1.4507, + "step": 2498 + }, + { + "epoch": 0.22672836145890038, + "grad_norm": 0.16519756840079972, + "learning_rate": 0.0009019194478085055, + "loss": 1.4259, + "step": 2499 + }, + { + "epoch": 0.22681908909453818, + "grad_norm": 0.18191440304621126, + "learning_rate": 0.0009018320313750554, + "loss": 1.4549, + "step": 2500 + }, + { + "epoch": 0.22690981673017602, + "grad_norm": 0.15844276302384575, + "learning_rate": 0.0009017445802433501, + "loss": 1.4557, + "step": 2501 + }, + { + "epoch": 0.22700054436581382, + "grad_norm": 0.16901481151451905, + "learning_rate": 0.0009016570944209413, + "loss": 1.4226, + "step": 2502 + }, + { + "epoch": 0.22709127200145166, + "grad_norm": 0.1618986817286491, + "learning_rate": 0.000901569573915383, + "loss": 1.4528, + "step": 2503 + }, + { + "epoch": 0.22718199963708946, + "grad_norm": 0.17361593149936536, + "learning_rate": 0.0009014820187342327, + "loss": 1.4311, + "step": 2504 + }, + { + "epoch": 0.22727272727272727, + "grad_norm": 0.17521285780805082, + "learning_rate": 0.0009013944288850511, + "loss": 1.4561, + "step": 2505 + }, + { + "epoch": 0.2273634549083651, + "grad_norm": 0.17495763342848056, + "learning_rate": 0.0009013068043754014, + "loss": 1.4625, + "step": 2506 + }, + { + "epoch": 0.2274541825440029, + "grad_norm": 0.18350091140318056, + "learning_rate": 0.0009012191452128499, + "loss": 1.4135, + "step": 2507 + }, + { + "epoch": 0.2275449101796407, + "grad_norm": 0.19004475597157722, + "learning_rate": 0.000901131451404966, + "loss": 1.4683, + "step": 2508 + }, + { + "epoch": 0.22763563781527854, + "grad_norm": 0.25533178262626705, + "learning_rate": 0.0009010437229593223, + "loss": 1.4612, + "step": 2509 + }, + { + "epoch": 0.22772636545091635, + "grad_norm": 0.18358220602128006, + "learning_rate": 0.000900955959883494, + "loss": 1.4358, + "step": 2510 + }, + { + "epoch": 0.22781709308655415, + "grad_norm": 0.17999676182749055, + "learning_rate": 0.0009008681621850594, + "loss": 1.4782, + "step": 2511 + }, + { + "epoch": 0.22790782072219198, + "grad_norm": 0.1959590131649435, + "learning_rate": 0.0009007803298716, + "loss": 1.4745, + "step": 2512 + }, + { + "epoch": 0.2279985483578298, + "grad_norm": 0.17399036964249381, + "learning_rate": 0.0009006924629507, + "loss": 1.4671, + "step": 2513 + }, + { + "epoch": 0.22808927599346762, + "grad_norm": 0.17008554500794854, + "learning_rate": 0.0009006045614299467, + "loss": 1.4276, + "step": 2514 + }, + { + "epoch": 0.22818000362910543, + "grad_norm": 0.18016727043387165, + "learning_rate": 0.0009005166253169307, + "loss": 1.4564, + "step": 2515 + }, + { + "epoch": 0.22827073126474323, + 
"grad_norm": 0.17452813118499289, + "learning_rate": 0.0009004286546192449, + "loss": 1.4554, + "step": 2516 + }, + { + "epoch": 0.22836145890038106, + "grad_norm": 0.17002742254395206, + "learning_rate": 0.0009003406493444856, + "loss": 1.4891, + "step": 2517 + }, + { + "epoch": 0.22845218653601887, + "grad_norm": 0.17844800326669283, + "learning_rate": 0.0009002526095002526, + "loss": 1.4573, + "step": 2518 + }, + { + "epoch": 0.22854291417165667, + "grad_norm": 0.17346715258970194, + "learning_rate": 0.0009001645350941475, + "loss": 1.4589, + "step": 2519 + }, + { + "epoch": 0.2286336418072945, + "grad_norm": 0.17217573092177246, + "learning_rate": 0.0009000764261337759, + "loss": 1.438, + "step": 2520 + }, + { + "epoch": 0.2287243694429323, + "grad_norm": 0.1839259803125763, + "learning_rate": 0.000899988282626746, + "loss": 1.4398, + "step": 2521 + }, + { + "epoch": 0.22881509707857015, + "grad_norm": 0.17284448543798345, + "learning_rate": 0.0008999001045806688, + "loss": 1.4654, + "step": 2522 + }, + { + "epoch": 0.22890582471420795, + "grad_norm": 0.17238274309699178, + "learning_rate": 0.0008998118920031589, + "loss": 1.4777, + "step": 2523 + }, + { + "epoch": 0.22899655234984576, + "grad_norm": 0.17140937560953176, + "learning_rate": 0.0008997236449018328, + "loss": 1.4475, + "step": 2524 + }, + { + "epoch": 0.2290872799854836, + "grad_norm": 0.1748547420592097, + "learning_rate": 0.0008996353632843112, + "loss": 1.4431, + "step": 2525 + }, + { + "epoch": 0.2291780076211214, + "grad_norm": 0.19481118180732304, + "learning_rate": 0.0008995470471582172, + "loss": 1.4612, + "step": 2526 + }, + { + "epoch": 0.2292687352567592, + "grad_norm": 0.1954289984730676, + "learning_rate": 0.0008994586965311768, + "loss": 1.4395, + "step": 2527 + }, + { + "epoch": 0.22935946289239703, + "grad_norm": 0.19462782766461967, + "learning_rate": 0.0008993703114108189, + "loss": 1.4763, + "step": 2528 + }, + { + "epoch": 0.22945019052803484, + "grad_norm": 0.185099442850869, + "learning_rate": 0.000899281891804776, + "loss": 1.4479, + "step": 2529 + }, + { + "epoch": 0.22954091816367264, + "grad_norm": 0.17211781354821185, + "learning_rate": 0.0008991934377206828, + "loss": 1.4831, + "step": 2530 + }, + { + "epoch": 0.22963164579931047, + "grad_norm": 0.17773471730745322, + "learning_rate": 0.0008991049491661774, + "loss": 1.4416, + "step": 2531 + }, + { + "epoch": 0.22972237343494828, + "grad_norm": 0.23068386686929765, + "learning_rate": 0.0008990164261489007, + "loss": 1.4385, + "step": 2532 + }, + { + "epoch": 0.2298131010705861, + "grad_norm": 0.19105679464967412, + "learning_rate": 0.0008989278686764968, + "loss": 1.4738, + "step": 2533 + }, + { + "epoch": 0.22990382870622392, + "grad_norm": 0.17937716880417248, + "learning_rate": 0.0008988392767566128, + "loss": 1.4694, + "step": 2534 + }, + { + "epoch": 0.22999455634186172, + "grad_norm": 0.17693602893981591, + "learning_rate": 0.0008987506503968984, + "loss": 1.4629, + "step": 2535 + }, + { + "epoch": 0.23008528397749956, + "grad_norm": 0.1910775156167919, + "learning_rate": 0.0008986619896050066, + "loss": 1.467, + "step": 2536 + }, + { + "epoch": 0.23017601161313736, + "grad_norm": 0.18193438803022882, + "learning_rate": 0.0008985732943885931, + "loss": 1.4622, + "step": 2537 + }, + { + "epoch": 0.23026673924877517, + "grad_norm": 0.17928736686397642, + "learning_rate": 0.000898484564755317, + "loss": 1.4326, + "step": 2538 + }, + { + "epoch": 0.230357466884413, + "grad_norm": 0.17865975348266613, + "learning_rate": 0.0008983958007128401, + 
"loss": 1.4592, + "step": 2539 + }, + { + "epoch": 0.2304481945200508, + "grad_norm": 0.1666929778252728, + "learning_rate": 0.0008983070022688269, + "loss": 1.4409, + "step": 2540 + }, + { + "epoch": 0.23053892215568864, + "grad_norm": 0.1872057811164316, + "learning_rate": 0.0008982181694309455, + "loss": 1.4377, + "step": 2541 + }, + { + "epoch": 0.23062964979132644, + "grad_norm": 0.1753186324337673, + "learning_rate": 0.0008981293022068664, + "loss": 1.4463, + "step": 2542 + }, + { + "epoch": 0.23072037742696425, + "grad_norm": 0.19120675696719802, + "learning_rate": 0.0008980404006042634, + "loss": 1.4562, + "step": 2543 + }, + { + "epoch": 0.23081110506260208, + "grad_norm": 0.1696452780832263, + "learning_rate": 0.0008979514646308131, + "loss": 1.4296, + "step": 2544 + }, + { + "epoch": 0.23090183269823988, + "grad_norm": 0.16805526618550695, + "learning_rate": 0.0008978624942941952, + "loss": 1.4567, + "step": 2545 + }, + { + "epoch": 0.2309925603338777, + "grad_norm": 0.1866462315126493, + "learning_rate": 0.0008977734896020924, + "loss": 1.4489, + "step": 2546 + }, + { + "epoch": 0.23108328796951552, + "grad_norm": 0.16377566976077568, + "learning_rate": 0.0008976844505621899, + "loss": 1.4301, + "step": 2547 + }, + { + "epoch": 0.23117401560515333, + "grad_norm": 0.19415802697514187, + "learning_rate": 0.0008975953771821766, + "loss": 1.4246, + "step": 2548 + }, + { + "epoch": 0.23126474324079113, + "grad_norm": 0.20362403328422007, + "learning_rate": 0.000897506269469744, + "loss": 1.448, + "step": 2549 + }, + { + "epoch": 0.23135547087642896, + "grad_norm": 0.16881952637203393, + "learning_rate": 0.0008974171274325863, + "loss": 1.4445, + "step": 2550 + }, + { + "epoch": 0.23144619851206677, + "grad_norm": 0.16716733743237858, + "learning_rate": 0.0008973279510784011, + "loss": 1.4992, + "step": 2551 + }, + { + "epoch": 0.2315369261477046, + "grad_norm": 0.17050471525766264, + "learning_rate": 0.0008972387404148888, + "loss": 1.46, + "step": 2552 + }, + { + "epoch": 0.2316276537833424, + "grad_norm": 0.16563147287919383, + "learning_rate": 0.0008971494954497527, + "loss": 1.456, + "step": 2553 + }, + { + "epoch": 0.2317183814189802, + "grad_norm": 0.17578702289775042, + "learning_rate": 0.0008970602161906991, + "loss": 1.4616, + "step": 2554 + }, + { + "epoch": 0.23180910905461805, + "grad_norm": 0.1718675717298807, + "learning_rate": 0.0008969709026454373, + "loss": 1.4791, + "step": 2555 + }, + { + "epoch": 0.23189983669025585, + "grad_norm": 0.1746935983545294, + "learning_rate": 0.0008968815548216797, + "loss": 1.4829, + "step": 2556 + }, + { + "epoch": 0.23199056432589366, + "grad_norm": 0.17626022644749142, + "learning_rate": 0.0008967921727271412, + "loss": 1.4575, + "step": 2557 + }, + { + "epoch": 0.2320812919615315, + "grad_norm": 0.17313022758349145, + "learning_rate": 0.0008967027563695401, + "loss": 1.4401, + "step": 2558 + }, + { + "epoch": 0.2321720195971693, + "grad_norm": 0.16430024507250374, + "learning_rate": 0.0008966133057565977, + "loss": 1.4377, + "step": 2559 + }, + { + "epoch": 0.23226274723280713, + "grad_norm": 0.16029191086887912, + "learning_rate": 0.0008965238208960375, + "loss": 1.4577, + "step": 2560 + }, + { + "epoch": 0.23235347486844493, + "grad_norm": 0.18718458078143005, + "learning_rate": 0.0008964343017955874, + "loss": 1.4473, + "step": 2561 + }, + { + "epoch": 0.23244420250408274, + "grad_norm": 0.16139378412475797, + "learning_rate": 0.0008963447484629767, + "loss": 1.4961, + "step": 2562 + }, + { + "epoch": 0.23253493013972057, + 
"grad_norm": 0.16094769265769018, + "learning_rate": 0.0008962551609059384, + "loss": 1.4426, + "step": 2563 + }, + { + "epoch": 0.23262565777535837, + "grad_norm": 0.15840451451007587, + "learning_rate": 0.0008961655391322088, + "loss": 1.4278, + "step": 2564 + }, + { + "epoch": 0.23271638541099618, + "grad_norm": 0.15452223589218916, + "learning_rate": 0.0008960758831495264, + "loss": 1.4627, + "step": 2565 + }, + { + "epoch": 0.232807113046634, + "grad_norm": 0.1677649379030899, + "learning_rate": 0.0008959861929656331, + "loss": 1.4573, + "step": 2566 + }, + { + "epoch": 0.23289784068227182, + "grad_norm": 0.1993692616226082, + "learning_rate": 0.0008958964685882736, + "loss": 1.4578, + "step": 2567 + }, + { + "epoch": 0.23298856831790962, + "grad_norm": 0.16769312316583276, + "learning_rate": 0.0008958067100251958, + "loss": 1.4307, + "step": 2568 + }, + { + "epoch": 0.23307929595354746, + "grad_norm": 0.16872415099468832, + "learning_rate": 0.0008957169172841504, + "loss": 1.4524, + "step": 2569 + }, + { + "epoch": 0.23317002358918526, + "grad_norm": 0.16099155362094647, + "learning_rate": 0.0008956270903728906, + "loss": 1.4536, + "step": 2570 + }, + { + "epoch": 0.2332607512248231, + "grad_norm": 0.16300129780681843, + "learning_rate": 0.0008955372292991734, + "loss": 1.4446, + "step": 2571 + }, + { + "epoch": 0.2333514788604609, + "grad_norm": 0.16511354200791123, + "learning_rate": 0.0008954473340707581, + "loss": 1.4489, + "step": 2572 + }, + { + "epoch": 0.2334422064960987, + "grad_norm": 0.15636932011112953, + "learning_rate": 0.0008953574046954071, + "loss": 1.4727, + "step": 2573 + }, + { + "epoch": 0.23353293413173654, + "grad_norm": 0.33474958818381145, + "learning_rate": 0.0008952674411808861, + "loss": 1.4856, + "step": 2574 + }, + { + "epoch": 0.23362366176737434, + "grad_norm": 0.1559042190787926, + "learning_rate": 0.0008951774435349634, + "loss": 1.4561, + "step": 2575 + }, + { + "epoch": 0.23371438940301215, + "grad_norm": 0.172980547814752, + "learning_rate": 0.00089508741176541, + "loss": 1.4557, + "step": 2576 + }, + { + "epoch": 0.23380511703864998, + "grad_norm": 0.16160117339982571, + "learning_rate": 0.0008949973458800003, + "loss": 1.4919, + "step": 2577 + }, + { + "epoch": 0.23389584467428778, + "grad_norm": 0.16917662666541627, + "learning_rate": 0.0008949072458865117, + "loss": 1.4694, + "step": 2578 + }, + { + "epoch": 0.23398657230992562, + "grad_norm": 0.15319636816198726, + "learning_rate": 0.0008948171117927242, + "loss": 1.4839, + "step": 2579 + }, + { + "epoch": 0.23407729994556342, + "grad_norm": 0.15109227520876545, + "learning_rate": 0.000894726943606421, + "loss": 1.4926, + "step": 2580 + }, + { + "epoch": 0.23416802758120123, + "grad_norm": 0.14902285570087592, + "learning_rate": 0.000894636741335388, + "loss": 1.4592, + "step": 2581 + }, + { + "epoch": 0.23425875521683906, + "grad_norm": 0.15265394197648308, + "learning_rate": 0.000894546504987414, + "loss": 1.4299, + "step": 2582 + }, + { + "epoch": 0.23434948285247686, + "grad_norm": 0.19518147031148556, + "learning_rate": 0.0008944562345702913, + "loss": 1.4272, + "step": 2583 + }, + { + "epoch": 0.23444021048811467, + "grad_norm": 0.15697644897884883, + "learning_rate": 0.0008943659300918148, + "loss": 1.4517, + "step": 2584 + }, + { + "epoch": 0.2345309381237525, + "grad_norm": 0.14820748948319504, + "learning_rate": 0.0008942755915597819, + "loss": 1.436, + "step": 2585 + }, + { + "epoch": 0.2346216657593903, + "grad_norm": 0.1643728211059675, + "learning_rate": 0.0008941852189819936, + 
"loss": 1.404, + "step": 2586 + }, + { + "epoch": 0.2347123933950281, + "grad_norm": 0.16424476391537682, + "learning_rate": 0.0008940948123662536, + "loss": 1.4833, + "step": 2587 + }, + { + "epoch": 0.23480312103066595, + "grad_norm": 0.1574335335494688, + "learning_rate": 0.0008940043717203684, + "loss": 1.4748, + "step": 2588 + }, + { + "epoch": 0.23489384866630375, + "grad_norm": 0.15549374254138643, + "learning_rate": 0.0008939138970521475, + "loss": 1.4456, + "step": 2589 + }, + { + "epoch": 0.23498457630194158, + "grad_norm": 0.15904647693799606, + "learning_rate": 0.0008938233883694038, + "loss": 1.4543, + "step": 2590 + }, + { + "epoch": 0.2350753039375794, + "grad_norm": 0.1859194653454555, + "learning_rate": 0.0008937328456799522, + "loss": 1.4723, + "step": 2591 + }, + { + "epoch": 0.2351660315732172, + "grad_norm": 0.16018567270693615, + "learning_rate": 0.0008936422689916115, + "loss": 1.4638, + "step": 2592 + }, + { + "epoch": 0.23525675920885503, + "grad_norm": 0.16995890578560086, + "learning_rate": 0.000893551658312203, + "loss": 1.4266, + "step": 2593 + }, + { + "epoch": 0.23534748684449283, + "grad_norm": 0.16120462933826374, + "learning_rate": 0.0008934610136495506, + "loss": 1.4381, + "step": 2594 + }, + { + "epoch": 0.23543821448013064, + "grad_norm": 0.17364508701755965, + "learning_rate": 0.0008933703350114817, + "loss": 1.424, + "step": 2595 + }, + { + "epoch": 0.23552894211576847, + "grad_norm": 0.1597833606730182, + "learning_rate": 0.0008932796224058264, + "loss": 1.4347, + "step": 2596 + }, + { + "epoch": 0.23561966975140627, + "grad_norm": 0.1741534862751746, + "learning_rate": 0.0008931888758404178, + "loss": 1.4451, + "step": 2597 + }, + { + "epoch": 0.2357103973870441, + "grad_norm": 0.16537935754480781, + "learning_rate": 0.0008930980953230918, + "loss": 1.4489, + "step": 2598 + }, + { + "epoch": 0.2358011250226819, + "grad_norm": 0.17906579936303138, + "learning_rate": 0.0008930072808616873, + "loss": 1.4652, + "step": 2599 + }, + { + "epoch": 0.23589185265831972, + "grad_norm": 0.15869385590280236, + "learning_rate": 0.0008929164324640462, + "loss": 1.4321, + "step": 2600 + }, + { + "epoch": 0.23598258029395755, + "grad_norm": 0.15406476127629823, + "learning_rate": 0.0008928255501380132, + "loss": 1.4813, + "step": 2601 + }, + { + "epoch": 0.23607330792959536, + "grad_norm": 0.1457284030972597, + "learning_rate": 0.0008927346338914361, + "loss": 1.4513, + "step": 2602 + }, + { + "epoch": 0.23616403556523316, + "grad_norm": 0.15683986656782847, + "learning_rate": 0.0008926436837321655, + "loss": 1.4349, + "step": 2603 + }, + { + "epoch": 0.236254763200871, + "grad_norm": 0.1561730259720366, + "learning_rate": 0.0008925526996680548, + "loss": 1.416, + "step": 2604 + }, + { + "epoch": 0.2363454908365088, + "grad_norm": 0.16433653984451152, + "learning_rate": 0.0008924616817069608, + "loss": 1.4677, + "step": 2605 + }, + { + "epoch": 0.2364362184721466, + "grad_norm": 0.15799314744427656, + "learning_rate": 0.0008923706298567427, + "loss": 1.4794, + "step": 2606 + }, + { + "epoch": 0.23652694610778444, + "grad_norm": 0.15959861582643123, + "learning_rate": 0.0008922795441252629, + "loss": 1.4687, + "step": 2607 + }, + { + "epoch": 0.23661767374342224, + "grad_norm": 0.16019064450026987, + "learning_rate": 0.0008921884245203866, + "loss": 1.4562, + "step": 2608 + }, + { + "epoch": 0.23670840137906007, + "grad_norm": 0.1701196124228823, + "learning_rate": 0.0008920972710499819, + "loss": 1.4746, + "step": 2609 + }, + { + "epoch": 0.23679912901469788, + 
"grad_norm": 0.15656826227785856, + "learning_rate": 0.0008920060837219203, + "loss": 1.4624, + "step": 2610 + }, + { + "epoch": 0.23688985665033568, + "grad_norm": 0.15732338844189767, + "learning_rate": 0.0008919148625440755, + "loss": 1.4744, + "step": 2611 + }, + { + "epoch": 0.23698058428597352, + "grad_norm": 0.16001200009041508, + "learning_rate": 0.0008918236075243246, + "loss": 1.4433, + "step": 2612 + }, + { + "epoch": 0.23707131192161132, + "grad_norm": 0.16987316033340766, + "learning_rate": 0.0008917323186705474, + "loss": 1.4442, + "step": 2613 + }, + { + "epoch": 0.23716203955724913, + "grad_norm": 0.14875354590239717, + "learning_rate": 0.0008916409959906267, + "loss": 1.4191, + "step": 2614 + }, + { + "epoch": 0.23725276719288696, + "grad_norm": 0.16014740400946723, + "learning_rate": 0.0008915496394924484, + "loss": 1.4292, + "step": 2615 + }, + { + "epoch": 0.23734349482852476, + "grad_norm": 0.1727221747012365, + "learning_rate": 0.000891458249183901, + "loss": 1.5105, + "step": 2616 + }, + { + "epoch": 0.2374342224641626, + "grad_norm": 0.1457423633624541, + "learning_rate": 0.0008913668250728759, + "loss": 1.4778, + "step": 2617 + }, + { + "epoch": 0.2375249500998004, + "grad_norm": 0.1574593316376205, + "learning_rate": 0.000891275367167268, + "loss": 1.4392, + "step": 2618 + }, + { + "epoch": 0.2376156777354382, + "grad_norm": 0.16583706684327812, + "learning_rate": 0.0008911838754749743, + "loss": 1.4244, + "step": 2619 + }, + { + "epoch": 0.23770640537107604, + "grad_norm": 0.15446294576765732, + "learning_rate": 0.0008910923500038955, + "loss": 1.4638, + "step": 2620 + }, + { + "epoch": 0.23779713300671385, + "grad_norm": 0.15132283482499942, + "learning_rate": 0.0008910007907619344, + "loss": 1.4424, + "step": 2621 + }, + { + "epoch": 0.23788786064235165, + "grad_norm": 0.15587186197362354, + "learning_rate": 0.0008909091977569976, + "loss": 1.4793, + "step": 2622 + }, + { + "epoch": 0.23797858827798948, + "grad_norm": 0.1502311694700533, + "learning_rate": 0.0008908175709969936, + "loss": 1.4354, + "step": 2623 + }, + { + "epoch": 0.2380693159136273, + "grad_norm": 0.1540840377063716, + "learning_rate": 0.0008907259104898352, + "loss": 1.4627, + "step": 2624 + }, + { + "epoch": 0.2381600435492651, + "grad_norm": 0.1576577615009266, + "learning_rate": 0.0008906342162434366, + "loss": 1.4494, + "step": 2625 + }, + { + "epoch": 0.23825077118490293, + "grad_norm": 0.14619979741757452, + "learning_rate": 0.000890542488265716, + "loss": 1.4463, + "step": 2626 + }, + { + "epoch": 0.23834149882054073, + "grad_norm": 0.14990667610692335, + "learning_rate": 0.0008904507265645938, + "loss": 1.4492, + "step": 2627 + }, + { + "epoch": 0.23843222645617856, + "grad_norm": 0.1735076159721931, + "learning_rate": 0.0008903589311479939, + "loss": 1.4655, + "step": 2628 + }, + { + "epoch": 0.23852295409181637, + "grad_norm": 0.1566152638020614, + "learning_rate": 0.0008902671020238427, + "loss": 1.4651, + "step": 2629 + }, + { + "epoch": 0.23861368172745417, + "grad_norm": 0.1495252599758567, + "learning_rate": 0.0008901752392000699, + "loss": 1.4337, + "step": 2630 + }, + { + "epoch": 0.238704409363092, + "grad_norm": 0.1513753864838902, + "learning_rate": 0.0008900833426846075, + "loss": 1.4441, + "step": 2631 + }, + { + "epoch": 0.2387951369987298, + "grad_norm": 0.15479640116443694, + "learning_rate": 0.000889991412485391, + "loss": 1.4612, + "step": 2632 + }, + { + "epoch": 0.23888586463436762, + "grad_norm": 0.14934703794693666, + "learning_rate": 0.0008898994486103587, + 
"loss": 1.4496, + "step": 2633 + }, + { + "epoch": 0.23897659227000545, + "grad_norm": 0.15190313898417152, + "learning_rate": 0.0008898074510674515, + "loss": 1.4424, + "step": 2634 + }, + { + "epoch": 0.23906731990564326, + "grad_norm": 0.15538865485275613, + "learning_rate": 0.0008897154198646133, + "loss": 1.4132, + "step": 2635 + }, + { + "epoch": 0.2391580475412811, + "grad_norm": 0.15726856884978635, + "learning_rate": 0.0008896233550097912, + "loss": 1.4383, + "step": 2636 + }, + { + "epoch": 0.2392487751769189, + "grad_norm": 0.15626213468732422, + "learning_rate": 0.000889531256510935, + "loss": 1.4729, + "step": 2637 + }, + { + "epoch": 0.2393395028125567, + "grad_norm": 0.14705294452263767, + "learning_rate": 0.0008894391243759974, + "loss": 1.4611, + "step": 2638 + }, + { + "epoch": 0.23943023044819453, + "grad_norm": 0.15283691954179865, + "learning_rate": 0.000889346958612934, + "loss": 1.4454, + "step": 2639 + }, + { + "epoch": 0.23952095808383234, + "grad_norm": 0.14555510206295608, + "learning_rate": 0.0008892547592297033, + "loss": 1.3956, + "step": 2640 + }, + { + "epoch": 0.23961168571947014, + "grad_norm": 0.15088453080245862, + "learning_rate": 0.0008891625262342669, + "loss": 1.45, + "step": 2641 + }, + { + "epoch": 0.23970241335510797, + "grad_norm": 0.1518032823921633, + "learning_rate": 0.0008890702596345889, + "loss": 1.456, + "step": 2642 + }, + { + "epoch": 0.23979314099074578, + "grad_norm": 0.14567280429921248, + "learning_rate": 0.0008889779594386367, + "loss": 1.4503, + "step": 2643 + }, + { + "epoch": 0.23988386862638358, + "grad_norm": 0.15681141867640777, + "learning_rate": 0.0008888856256543804, + "loss": 1.4519, + "step": 2644 + }, + { + "epoch": 0.23997459626202142, + "grad_norm": 0.1600791887917163, + "learning_rate": 0.0008887932582897929, + "loss": 1.4705, + "step": 2645 + }, + { + "epoch": 0.24006532389765922, + "grad_norm": 0.15773485725674052, + "learning_rate": 0.0008887008573528504, + "loss": 1.4926, + "step": 2646 + }, + { + "epoch": 0.24015605153329705, + "grad_norm": 0.18583514528832137, + "learning_rate": 0.0008886084228515313, + "loss": 1.4649, + "step": 2647 + }, + { + "epoch": 0.24024677916893486, + "grad_norm": 0.16595218317478885, + "learning_rate": 0.0008885159547938178, + "loss": 1.4701, + "step": 2648 + }, + { + "epoch": 0.24033750680457266, + "grad_norm": 0.15547916944935908, + "learning_rate": 0.0008884234531876943, + "loss": 1.4353, + "step": 2649 + }, + { + "epoch": 0.2404282344402105, + "grad_norm": 0.16090759174034427, + "learning_rate": 0.0008883309180411484, + "loss": 1.4537, + "step": 2650 + }, + { + "epoch": 0.2405189620758483, + "grad_norm": 0.19240240977992906, + "learning_rate": 0.0008882383493621706, + "loss": 1.445, + "step": 2651 + }, + { + "epoch": 0.2406096897114861, + "grad_norm": 0.18127078504882002, + "learning_rate": 0.0008881457471587539, + "loss": 1.5073, + "step": 2652 + }, + { + "epoch": 0.24070041734712394, + "grad_norm": 0.17153513703595405, + "learning_rate": 0.0008880531114388948, + "loss": 1.4532, + "step": 2653 + }, + { + "epoch": 0.24079114498276175, + "grad_norm": 0.17581516699716343, + "learning_rate": 0.0008879604422105925, + "loss": 1.4237, + "step": 2654 + }, + { + "epoch": 0.24088187261839955, + "grad_norm": 0.1623879531984882, + "learning_rate": 0.0008878677394818487, + "loss": 1.4491, + "step": 2655 + }, + { + "epoch": 0.24097260025403738, + "grad_norm": 0.16616848612593563, + "learning_rate": 0.0008877750032606683, + "loss": 1.4435, + "step": 2656 + }, + { + "epoch": 0.2410633278896752, + 
"grad_norm": 0.16863735395074514, + "learning_rate": 0.0008876822335550594, + "loss": 1.4927, + "step": 2657 + }, + { + "epoch": 0.24115405552531302, + "grad_norm": 0.16037600179073375, + "learning_rate": 0.0008875894303730323, + "loss": 1.4506, + "step": 2658 + }, + { + "epoch": 0.24124478316095083, + "grad_norm": 0.1659844472301839, + "learning_rate": 0.0008874965937226009, + "loss": 1.432, + "step": 2659 + }, + { + "epoch": 0.24133551079658863, + "grad_norm": 0.1642617215174311, + "learning_rate": 0.0008874037236117815, + "loss": 1.4919, + "step": 2660 + }, + { + "epoch": 0.24142623843222646, + "grad_norm": 0.16136605083401775, + "learning_rate": 0.0008873108200485936, + "loss": 1.4495, + "step": 2661 + }, + { + "epoch": 0.24151696606786427, + "grad_norm": 0.17139712662145543, + "learning_rate": 0.0008872178830410592, + "loss": 1.4366, + "step": 2662 + }, + { + "epoch": 0.24160769370350207, + "grad_norm": 0.15696979312214343, + "learning_rate": 0.0008871249125972035, + "loss": 1.4673, + "step": 2663 + }, + { + "epoch": 0.2416984213391399, + "grad_norm": 0.1605292080705529, + "learning_rate": 0.0008870319087250546, + "loss": 1.4322, + "step": 2664 + }, + { + "epoch": 0.2417891489747777, + "grad_norm": 0.16640082747877558, + "learning_rate": 0.0008869388714326433, + "loss": 1.469, + "step": 2665 + }, + { + "epoch": 0.24187987661041555, + "grad_norm": 0.1943256068034806, + "learning_rate": 0.0008868458007280034, + "loss": 1.4462, + "step": 2666 + }, + { + "epoch": 0.24197060424605335, + "grad_norm": 0.15437757203057256, + "learning_rate": 0.0008867526966191716, + "loss": 1.4692, + "step": 2667 + }, + { + "epoch": 0.24206133188169116, + "grad_norm": 0.1579701062687973, + "learning_rate": 0.0008866595591141875, + "loss": 1.4826, + "step": 2668 + }, + { + "epoch": 0.242152059517329, + "grad_norm": 0.16210089783944445, + "learning_rate": 0.0008865663882210935, + "loss": 1.4725, + "step": 2669 + }, + { + "epoch": 0.2422427871529668, + "grad_norm": 0.16153076939063823, + "learning_rate": 0.0008864731839479347, + "loss": 1.4141, + "step": 2670 + }, + { + "epoch": 0.2423335147886046, + "grad_norm": 0.15828084672169584, + "learning_rate": 0.0008863799463027597, + "loss": 1.4719, + "step": 2671 + }, + { + "epoch": 0.24242424242424243, + "grad_norm": 0.1522308700362036, + "learning_rate": 0.0008862866752936194, + "loss": 1.4398, + "step": 2672 + }, + { + "epoch": 0.24251497005988024, + "grad_norm": 0.1687940341713481, + "learning_rate": 0.0008861933709285677, + "loss": 1.4463, + "step": 2673 + }, + { + "epoch": 0.24260569769551804, + "grad_norm": 0.16180900552576027, + "learning_rate": 0.0008861000332156615, + "loss": 1.4267, + "step": 2674 + }, + { + "epoch": 0.24269642533115587, + "grad_norm": 0.14981880278118573, + "learning_rate": 0.0008860066621629606, + "loss": 1.4019, + "step": 2675 + }, + { + "epoch": 0.24278715296679368, + "grad_norm": 0.1527346408747331, + "learning_rate": 0.0008859132577785274, + "loss": 1.4596, + "step": 2676 + }, + { + "epoch": 0.2428778806024315, + "grad_norm": 0.15480536243165074, + "learning_rate": 0.0008858198200704275, + "loss": 1.4451, + "step": 2677 + }, + { + "epoch": 0.24296860823806932, + "grad_norm": 0.16845593220756336, + "learning_rate": 0.0008857263490467293, + "loss": 1.4441, + "step": 2678 + }, + { + "epoch": 0.24305933587370712, + "grad_norm": 0.16290630758066457, + "learning_rate": 0.0008856328447155041, + "loss": 1.4865, + "step": 2679 + }, + { + "epoch": 0.24315006350934495, + "grad_norm": 0.15577246233230052, + "learning_rate": 0.0008855393070848258, 
+ "loss": 1.463, + "step": 2680 + }, + { + "epoch": 0.24324079114498276, + "grad_norm": 0.15845900033264912, + "learning_rate": 0.0008854457361627717, + "loss": 1.4476, + "step": 2681 + }, + { + "epoch": 0.24333151878062056, + "grad_norm": 0.16881196822072403, + "learning_rate": 0.0008853521319574214, + "loss": 1.4154, + "step": 2682 + }, + { + "epoch": 0.2434222464162584, + "grad_norm": 0.22817578995315474, + "learning_rate": 0.0008852584944768576, + "loss": 1.461, + "step": 2683 + }, + { + "epoch": 0.2435129740518962, + "grad_norm": 0.15691157402434328, + "learning_rate": 0.0008851648237291661, + "loss": 1.4616, + "step": 2684 + }, + { + "epoch": 0.24360370168753404, + "grad_norm": 0.20994448877611646, + "learning_rate": 0.0008850711197224353, + "loss": 1.453, + "step": 2685 + }, + { + "epoch": 0.24369442932317184, + "grad_norm": 0.1548935921940971, + "learning_rate": 0.0008849773824647565, + "loss": 1.4442, + "step": 2686 + }, + { + "epoch": 0.24378515695880965, + "grad_norm": 0.16216985188206942, + "learning_rate": 0.000884883611964224, + "loss": 1.4366, + "step": 2687 + }, + { + "epoch": 0.24387588459444748, + "grad_norm": 0.16787152242315237, + "learning_rate": 0.0008847898082289349, + "loss": 1.4733, + "step": 2688 + }, + { + "epoch": 0.24396661223008528, + "grad_norm": 0.15146733335802717, + "learning_rate": 0.0008846959712669892, + "loss": 1.4566, + "step": 2689 + }, + { + "epoch": 0.2440573398657231, + "grad_norm": 0.16308936350489342, + "learning_rate": 0.0008846021010864896, + "loss": 1.4371, + "step": 2690 + }, + { + "epoch": 0.24414806750136092, + "grad_norm": 0.1568465903062381, + "learning_rate": 0.000884508197695542, + "loss": 1.462, + "step": 2691 + }, + { + "epoch": 0.24423879513699873, + "grad_norm": 0.193421364720221, + "learning_rate": 0.0008844142611022548, + "loss": 1.4391, + "step": 2692 + }, + { + "epoch": 0.24432952277263653, + "grad_norm": 0.15497492056539236, + "learning_rate": 0.0008843202913147394, + "loss": 1.4366, + "step": 2693 + }, + { + "epoch": 0.24442025040827436, + "grad_norm": 0.1867727874932381, + "learning_rate": 0.0008842262883411103, + "loss": 1.4732, + "step": 2694 + }, + { + "epoch": 0.24451097804391217, + "grad_norm": 0.1665422441303654, + "learning_rate": 0.0008841322521894846, + "loss": 1.445, + "step": 2695 + }, + { + "epoch": 0.24460170567955, + "grad_norm": 0.15959995188068754, + "learning_rate": 0.0008840381828679823, + "loss": 1.4634, + "step": 2696 + }, + { + "epoch": 0.2446924333151878, + "grad_norm": 0.1576125470120173, + "learning_rate": 0.0008839440803847263, + "loss": 1.4455, + "step": 2697 + }, + { + "epoch": 0.2447831609508256, + "grad_norm": 0.15600167855037053, + "learning_rate": 0.0008838499447478423, + "loss": 1.4769, + "step": 2698 + }, + { + "epoch": 0.24487388858646345, + "grad_norm": 0.17328935006371654, + "learning_rate": 0.0008837557759654591, + "loss": 1.4584, + "step": 2699 + }, + { + "epoch": 0.24496461622210125, + "grad_norm": 0.16317231002877838, + "learning_rate": 0.000883661574045708, + "loss": 1.4153, + "step": 2700 + }, + { + "epoch": 0.24505534385773906, + "grad_norm": 0.15813123359396442, + "learning_rate": 0.0008835673389967235, + "loss": 1.4658, + "step": 2701 + }, + { + "epoch": 0.2451460714933769, + "grad_norm": 0.16426735649785038, + "learning_rate": 0.0008834730708266427, + "loss": 1.4526, + "step": 2702 + }, + { + "epoch": 0.2452367991290147, + "grad_norm": 0.1674021016150136, + "learning_rate": 0.0008833787695436057, + "loss": 1.4318, + "step": 2703 + }, + { + "epoch": 0.24532752676465253, + 
"grad_norm": 0.15131643764663574, + "learning_rate": 0.0008832844351557555, + "loss": 1.4194, + "step": 2704 + }, + { + "epoch": 0.24541825440029033, + "grad_norm": 0.14849008982175452, + "learning_rate": 0.0008831900676712378, + "loss": 1.4321, + "step": 2705 + }, + { + "epoch": 0.24550898203592814, + "grad_norm": 0.15830389338158724, + "learning_rate": 0.0008830956670982013, + "loss": 1.4532, + "step": 2706 + }, + { + "epoch": 0.24559970967156597, + "grad_norm": 0.1554372664415459, + "learning_rate": 0.0008830012334447975, + "loss": 1.4452, + "step": 2707 + }, + { + "epoch": 0.24569043730720377, + "grad_norm": 0.16834703946680235, + "learning_rate": 0.0008829067667191807, + "loss": 1.4526, + "step": 2708 + }, + { + "epoch": 0.24578116494284158, + "grad_norm": 0.16603150753432097, + "learning_rate": 0.0008828122669295082, + "loss": 1.4586, + "step": 2709 + }, + { + "epoch": 0.2458718925784794, + "grad_norm": 0.17105947322309928, + "learning_rate": 0.00088271773408394, + "loss": 1.4842, + "step": 2710 + }, + { + "epoch": 0.24596262021411722, + "grad_norm": 0.2841157919802381, + "learning_rate": 0.000882623168190639, + "loss": 1.4597, + "step": 2711 + }, + { + "epoch": 0.24605334784975502, + "grad_norm": 0.16625975256014414, + "learning_rate": 0.0008825285692577712, + "loss": 1.4845, + "step": 2712 + }, + { + "epoch": 0.24614407548539285, + "grad_norm": 0.1834411673379599, + "learning_rate": 0.0008824339372935048, + "loss": 1.4521, + "step": 2713 + }, + { + "epoch": 0.24623480312103066, + "grad_norm": 0.1675349536725267, + "learning_rate": 0.0008823392723060117, + "loss": 1.4411, + "step": 2714 + }, + { + "epoch": 0.2463255307566685, + "grad_norm": 0.16551139557562763, + "learning_rate": 0.0008822445743034661, + "loss": 1.4283, + "step": 2715 + }, + { + "epoch": 0.2464162583923063, + "grad_norm": 0.16313462037908152, + "learning_rate": 0.0008821498432940452, + "loss": 1.4553, + "step": 2716 + }, + { + "epoch": 0.2465069860279441, + "grad_norm": 0.16130351448858402, + "learning_rate": 0.000882055079285929, + "loss": 1.4614, + "step": 2717 + }, + { + "epoch": 0.24659771366358194, + "grad_norm": 0.1651355259964269, + "learning_rate": 0.0008819602822873004, + "loss": 1.4609, + "step": 2718 + }, + { + "epoch": 0.24668844129921974, + "grad_norm": 0.16747808279431695, + "learning_rate": 0.0008818654523063451, + "loss": 1.4448, + "step": 2719 + }, + { + "epoch": 0.24677916893485755, + "grad_norm": 0.1625203169607356, + "learning_rate": 0.0008817705893512518, + "loss": 1.4594, + "step": 2720 + }, + { + "epoch": 0.24686989657049538, + "grad_norm": 0.16772496225048583, + "learning_rate": 0.0008816756934302117, + "loss": 1.4252, + "step": 2721 + }, + { + "epoch": 0.24696062420613318, + "grad_norm": 0.1597191522748346, + "learning_rate": 0.0008815807645514192, + "loss": 1.429, + "step": 2722 + }, + { + "epoch": 0.24705135184177102, + "grad_norm": 0.16285548658868895, + "learning_rate": 0.0008814858027230716, + "loss": 1.4567, + "step": 2723 + }, + { + "epoch": 0.24714207947740882, + "grad_norm": 0.16337501949124367, + "learning_rate": 0.0008813908079533686, + "loss": 1.4576, + "step": 2724 + }, + { + "epoch": 0.24723280711304663, + "grad_norm": 0.1572983922713275, + "learning_rate": 0.000881295780250513, + "loss": 1.4672, + "step": 2725 + }, + { + "epoch": 0.24732353474868446, + "grad_norm": 0.15694761027505832, + "learning_rate": 0.0008812007196227108, + "loss": 1.4558, + "step": 2726 + }, + { + "epoch": 0.24741426238432226, + "grad_norm": 0.15825811580057458, + "learning_rate": 0.0008811056260781703, 
+ "loss": 1.4072, + "step": 2727 + }, + { + "epoch": 0.24750499001996007, + "grad_norm": 0.15430066310958593, + "learning_rate": 0.0008810104996251027, + "loss": 1.4611, + "step": 2728 + }, + { + "epoch": 0.2475957176555979, + "grad_norm": 0.15884703285789595, + "learning_rate": 0.0008809153402717223, + "loss": 1.4694, + "step": 2729 + }, + { + "epoch": 0.2476864452912357, + "grad_norm": 0.16038589812752643, + "learning_rate": 0.0008808201480262461, + "loss": 1.4588, + "step": 2730 + }, + { + "epoch": 0.2477771729268735, + "grad_norm": 0.15761634121159954, + "learning_rate": 0.000880724922896894, + "loss": 1.4582, + "step": 2731 + }, + { + "epoch": 0.24786790056251135, + "grad_norm": 0.15697506152114876, + "learning_rate": 0.0008806296648918888, + "loss": 1.4457, + "step": 2732 + }, + { + "epoch": 0.24795862819814915, + "grad_norm": 0.15728992877347028, + "learning_rate": 0.0008805343740194558, + "loss": 1.4787, + "step": 2733 + }, + { + "epoch": 0.24804935583378698, + "grad_norm": 0.15613125488307214, + "learning_rate": 0.0008804390502878237, + "loss": 1.4349, + "step": 2734 + }, + { + "epoch": 0.2481400834694248, + "grad_norm": 0.16178961779641002, + "learning_rate": 0.0008803436937052234, + "loss": 1.4967, + "step": 2735 + }, + { + "epoch": 0.2482308111050626, + "grad_norm": 0.1651840257007129, + "learning_rate": 0.0008802483042798891, + "loss": 1.427, + "step": 2736 + }, + { + "epoch": 0.24832153874070043, + "grad_norm": 0.1596454447154968, + "learning_rate": 0.0008801528820200577, + "loss": 1.4166, + "step": 2737 + }, + { + "epoch": 0.24841226637633823, + "grad_norm": 0.22527625832101253, + "learning_rate": 0.0008800574269339689, + "loss": 1.4565, + "step": 2738 + }, + { + "epoch": 0.24850299401197604, + "grad_norm": 0.17984042537820452, + "learning_rate": 0.0008799619390298653, + "loss": 1.4451, + "step": 2739 + }, + { + "epoch": 0.24859372164761387, + "grad_norm": 0.16702074488473853, + "learning_rate": 0.0008798664183159923, + "loss": 1.4516, + "step": 2740 + }, + { + "epoch": 0.24868444928325167, + "grad_norm": 0.16187408881859808, + "learning_rate": 0.000879770864800598, + "loss": 1.4667, + "step": 2741 + }, + { + "epoch": 0.2487751769188895, + "grad_norm": 0.15927165062068435, + "learning_rate": 0.0008796752784919335, + "loss": 1.4614, + "step": 2742 + }, + { + "epoch": 0.2488659045545273, + "grad_norm": 0.1705637465002048, + "learning_rate": 0.0008795796593982529, + "loss": 1.4397, + "step": 2743 + }, + { + "epoch": 0.24895663219016512, + "grad_norm": 0.16075234007075284, + "learning_rate": 0.0008794840075278127, + "loss": 1.4513, + "step": 2744 + }, + { + "epoch": 0.24904735982580295, + "grad_norm": 0.16768655648637387, + "learning_rate": 0.0008793883228888726, + "loss": 1.4895, + "step": 2745 + }, + { + "epoch": 0.24913808746144075, + "grad_norm": 0.158982389809172, + "learning_rate": 0.0008792926054896948, + "loss": 1.4429, + "step": 2746 + }, + { + "epoch": 0.24922881509707856, + "grad_norm": 0.15774871742120344, + "learning_rate": 0.0008791968553385445, + "loss": 1.4502, + "step": 2747 + }, + { + "epoch": 0.2493195427327164, + "grad_norm": 0.2106717518106777, + "learning_rate": 0.0008791010724436901, + "loss": 1.4423, + "step": 2748 + }, + { + "epoch": 0.2494102703683542, + "grad_norm": 0.23843065649083994, + "learning_rate": 0.0008790052568134021, + "loss": 1.4558, + "step": 2749 + }, + { + "epoch": 0.249500998003992, + "grad_norm": 0.15722130369002804, + "learning_rate": 0.0008789094084559544, + "loss": 1.4847, + "step": 2750 + }, + { + "epoch": 0.24959172563962984, + 
"grad_norm": 0.16679613582057987, + "learning_rate": 0.0008788135273796233, + "loss": 1.4526, + "step": 2751 + }, + { + "epoch": 0.24968245327526764, + "grad_norm": 0.16217884117816186, + "learning_rate": 0.0008787176135926883, + "loss": 1.4591, + "step": 2752 + }, + { + "epoch": 0.24977318091090547, + "grad_norm": 0.1505005951768302, + "learning_rate": 0.0008786216671034316, + "loss": 1.4773, + "step": 2753 + }, + { + "epoch": 0.24986390854654328, + "grad_norm": 0.16276395858524353, + "learning_rate": 0.0008785256879201382, + "loss": 1.4653, + "step": 2754 + }, + { + "epoch": 0.24995463618218108, + "grad_norm": 0.16254127854353043, + "learning_rate": 0.0008784296760510957, + "loss": 1.4131, + "step": 2755 + }, + { + "epoch": 0.2500453638178189, + "grad_norm": 0.15901492304841375, + "learning_rate": 0.000878333631504595, + "loss": 1.4206, + "step": 2756 + }, + { + "epoch": 0.2501360914534567, + "grad_norm": 0.1799793394538731, + "learning_rate": 0.0008782375542889293, + "loss": 1.459, + "step": 2757 + }, + { + "epoch": 0.25022681908909455, + "grad_norm": 0.16739755761828498, + "learning_rate": 0.0008781414444123953, + "loss": 1.4369, + "step": 2758 + }, + { + "epoch": 0.25031754672473233, + "grad_norm": 0.16575177929975485, + "learning_rate": 0.0008780453018832918, + "loss": 1.4049, + "step": 2759 + }, + { + "epoch": 0.25040827436037016, + "grad_norm": 0.1709452979524678, + "learning_rate": 0.0008779491267099207, + "loss": 1.4939, + "step": 2760 + }, + { + "epoch": 0.250499001996008, + "grad_norm": 0.16520030219499168, + "learning_rate": 0.0008778529189005867, + "loss": 1.4608, + "step": 2761 + }, + { + "epoch": 0.2505897296316458, + "grad_norm": 0.20379272657759157, + "learning_rate": 0.0008777566784635975, + "loss": 1.4643, + "step": 2762 + }, + { + "epoch": 0.2506804572672836, + "grad_norm": 0.16325475246589147, + "learning_rate": 0.0008776604054072637, + "loss": 1.4397, + "step": 2763 + }, + { + "epoch": 0.25077118490292144, + "grad_norm": 0.16728646202564784, + "learning_rate": 0.0008775640997398979, + "loss": 1.4442, + "step": 2764 + }, + { + "epoch": 0.2508619125385592, + "grad_norm": 0.1562847710642401, + "learning_rate": 0.0008774677614698165, + "loss": 1.5192, + "step": 2765 + }, + { + "epoch": 0.25095264017419705, + "grad_norm": 0.17404636127574163, + "learning_rate": 0.0008773713906053384, + "loss": 1.4662, + "step": 2766 + }, + { + "epoch": 0.2510433678098349, + "grad_norm": 0.1725161561306144, + "learning_rate": 0.000877274987154785, + "loss": 1.4253, + "step": 2767 + }, + { + "epoch": 0.2511340954454727, + "grad_norm": 0.16120582517007703, + "learning_rate": 0.0008771785511264809, + "loss": 1.4575, + "step": 2768 + }, + { + "epoch": 0.2512248230811105, + "grad_norm": 0.1537140838545632, + "learning_rate": 0.0008770820825287533, + "loss": 1.4635, + "step": 2769 + }, + { + "epoch": 0.2513155507167483, + "grad_norm": 0.24858680695823035, + "learning_rate": 0.0008769855813699324, + "loss": 1.4246, + "step": 2770 + }, + { + "epoch": 0.25140627835238616, + "grad_norm": 0.15787002068340472, + "learning_rate": 0.0008768890476583508, + "loss": 1.4341, + "step": 2771 + }, + { + "epoch": 0.25149700598802394, + "grad_norm": 0.16700574908479113, + "learning_rate": 0.0008767924814023446, + "loss": 1.4068, + "step": 2772 + }, + { + "epoch": 0.25158773362366177, + "grad_norm": 0.16958950147298774, + "learning_rate": 0.000876695882610252, + "loss": 1.4638, + "step": 2773 + }, + { + "epoch": 0.2516784612592996, + "grad_norm": 0.14958301371389046, + "learning_rate": 0.0008765992512904144, + 
"loss": 1.4477, + "step": 2774 + }, + { + "epoch": 0.2517691888949374, + "grad_norm": 0.1486271743716594, + "learning_rate": 0.0008765025874511758, + "loss": 1.4249, + "step": 2775 + }, + { + "epoch": 0.2518599165305752, + "grad_norm": 0.17700956381101107, + "learning_rate": 0.0008764058911008835, + "loss": 1.4718, + "step": 2776 + }, + { + "epoch": 0.25195064416621304, + "grad_norm": 0.16133772574570138, + "learning_rate": 0.0008763091622478869, + "loss": 1.4519, + "step": 2777 + }, + { + "epoch": 0.2520413718018508, + "grad_norm": 0.14869194356016477, + "learning_rate": 0.0008762124009005388, + "loss": 1.4457, + "step": 2778 + }, + { + "epoch": 0.25213209943748865, + "grad_norm": 0.1767556025076581, + "learning_rate": 0.0008761156070671943, + "loss": 1.4674, + "step": 2779 + }, + { + "epoch": 0.2522228270731265, + "grad_norm": 0.15257588237957667, + "learning_rate": 0.0008760187807562119, + "loss": 1.4642, + "step": 2780 + }, + { + "epoch": 0.25231355470876426, + "grad_norm": 0.16082100540089045, + "learning_rate": 0.0008759219219759522, + "loss": 1.4647, + "step": 2781 + }, + { + "epoch": 0.2524042823444021, + "grad_norm": 0.15507699756599908, + "learning_rate": 0.0008758250307347792, + "loss": 1.4482, + "step": 2782 + }, + { + "epoch": 0.25249500998003993, + "grad_norm": 0.22051196727533987, + "learning_rate": 0.0008757281070410592, + "loss": 1.4773, + "step": 2783 + }, + { + "epoch": 0.2525857376156777, + "grad_norm": 0.16915186389664552, + "learning_rate": 0.000875631150903162, + "loss": 1.457, + "step": 2784 + }, + { + "epoch": 0.25267646525131554, + "grad_norm": 0.16071353559941542, + "learning_rate": 0.0008755341623294595, + "loss": 1.44, + "step": 2785 + }, + { + "epoch": 0.2527671928869534, + "grad_norm": 0.15022046193579744, + "learning_rate": 0.0008754371413283267, + "loss": 1.4179, + "step": 2786 + }, + { + "epoch": 0.2528579205225912, + "grad_norm": 0.15013623686977415, + "learning_rate": 0.0008753400879081414, + "loss": 1.4319, + "step": 2787 + }, + { + "epoch": 0.252948648158229, + "grad_norm": 0.14565763928747363, + "learning_rate": 0.0008752430020772844, + "loss": 1.4441, + "step": 2788 + }, + { + "epoch": 0.2530393757938668, + "grad_norm": 0.14363253118120506, + "learning_rate": 0.0008751458838441386, + "loss": 1.4216, + "step": 2789 + }, + { + "epoch": 0.25313010342950465, + "grad_norm": 0.14445795581372467, + "learning_rate": 0.0008750487332170906, + "loss": 1.4708, + "step": 2790 + }, + { + "epoch": 0.2532208310651424, + "grad_norm": 0.16251230951399886, + "learning_rate": 0.0008749515502045291, + "loss": 1.4484, + "step": 2791 + }, + { + "epoch": 0.25331155870078026, + "grad_norm": 0.17261896132841562, + "learning_rate": 0.0008748543348148461, + "loss": 1.4848, + "step": 2792 + }, + { + "epoch": 0.2534022863364181, + "grad_norm": 0.14344622617555491, + "learning_rate": 0.0008747570870564358, + "loss": 1.4384, + "step": 2793 + }, + { + "epoch": 0.25349301397205587, + "grad_norm": 0.14445641540239038, + "learning_rate": 0.0008746598069376961, + "loss": 1.4259, + "step": 2794 + }, + { + "epoch": 0.2535837416076937, + "grad_norm": 0.15659335906327063, + "learning_rate": 0.0008745624944670267, + "loss": 1.4176, + "step": 2795 + }, + { + "epoch": 0.25367446924333154, + "grad_norm": 0.1446459532006596, + "learning_rate": 0.0008744651496528308, + "loss": 1.4478, + "step": 2796 + }, + { + "epoch": 0.2537651968789693, + "grad_norm": 0.15769903565871385, + "learning_rate": 0.000874367772503514, + "loss": 1.437, + "step": 2797 + }, + { + "epoch": 0.25385592451460715, + 
"grad_norm": 0.1513034849140805, + "learning_rate": 0.0008742703630274847, + "loss": 1.4685, + "step": 2798 + }, + { + "epoch": 0.253946652150245, + "grad_norm": 0.14475293915310555, + "learning_rate": 0.0008741729212331545, + "loss": 1.4768, + "step": 2799 + }, + { + "epoch": 0.25403737978588276, + "grad_norm": 0.14998646472791374, + "learning_rate": 0.0008740754471289374, + "loss": 1.4546, + "step": 2800 + }, + { + "epoch": 0.2541281074215206, + "grad_norm": 0.1513981830756029, + "learning_rate": 0.0008739779407232504, + "loss": 1.4407, + "step": 2801 + }, + { + "epoch": 0.2542188350571584, + "grad_norm": 0.15246456921592533, + "learning_rate": 0.000873880402024513, + "loss": 1.4601, + "step": 2802 + }, + { + "epoch": 0.2543095626927962, + "grad_norm": 0.15925103139543662, + "learning_rate": 0.0008737828310411477, + "loss": 1.4424, + "step": 2803 + }, + { + "epoch": 0.25440029032843403, + "grad_norm": 0.16651541248502877, + "learning_rate": 0.0008736852277815801, + "loss": 1.436, + "step": 2804 + }, + { + "epoch": 0.25449101796407186, + "grad_norm": 0.15570715684993736, + "learning_rate": 0.0008735875922542378, + "loss": 1.468, + "step": 2805 + }, + { + "epoch": 0.2545817455997097, + "grad_norm": 0.16393986786709527, + "learning_rate": 0.0008734899244675519, + "loss": 1.4395, + "step": 2806 + }, + { + "epoch": 0.2546724732353475, + "grad_norm": 0.16171564900297924, + "learning_rate": 0.0008733922244299559, + "loss": 1.4556, + "step": 2807 + }, + { + "epoch": 0.2547632008709853, + "grad_norm": 0.16507756833229031, + "learning_rate": 0.0008732944921498864, + "loss": 1.4736, + "step": 2808 + }, + { + "epoch": 0.25485392850662314, + "grad_norm": 0.16958510127697726, + "learning_rate": 0.0008731967276357826, + "loss": 1.4391, + "step": 2809 + }, + { + "epoch": 0.2549446561422609, + "grad_norm": 0.1526521509658183, + "learning_rate": 0.0008730989308960861, + "loss": 1.4453, + "step": 2810 + }, + { + "epoch": 0.25503538377789875, + "grad_norm": 0.15940856477753432, + "learning_rate": 0.0008730011019392421, + "loss": 1.4697, + "step": 2811 + }, + { + "epoch": 0.2551261114135366, + "grad_norm": 0.1730635109527544, + "learning_rate": 0.0008729032407736979, + "loss": 1.4628, + "step": 2812 + }, + { + "epoch": 0.25521683904917436, + "grad_norm": 0.1876581365351861, + "learning_rate": 0.0008728053474079039, + "loss": 1.4279, + "step": 2813 + }, + { + "epoch": 0.2553075666848122, + "grad_norm": 0.15896217603539936, + "learning_rate": 0.0008727074218503133, + "loss": 1.4593, + "step": 2814 + }, + { + "epoch": 0.25539829432045, + "grad_norm": 0.16705689053661762, + "learning_rate": 0.0008726094641093818, + "loss": 1.463, + "step": 2815 + }, + { + "epoch": 0.2554890219560878, + "grad_norm": 0.18414805706886414, + "learning_rate": 0.0008725114741935683, + "loss": 1.4371, + "step": 2816 + }, + { + "epoch": 0.25557974959172564, + "grad_norm": 0.1728456845509467, + "learning_rate": 0.0008724134521113338, + "loss": 1.4308, + "step": 2817 + }, + { + "epoch": 0.25567047722736347, + "grad_norm": 0.17733822884874156, + "learning_rate": 0.0008723153978711431, + "loss": 1.453, + "step": 2818 + }, + { + "epoch": 0.25576120486300125, + "grad_norm": 0.1992730977390446, + "learning_rate": 0.0008722173114814628, + "loss": 1.4496, + "step": 2819 + }, + { + "epoch": 0.2558519324986391, + "grad_norm": 0.16992113474675394, + "learning_rate": 0.0008721191929507628, + "loss": 1.4805, + "step": 2820 + }, + { + "epoch": 0.2559426601342769, + "grad_norm": 0.18159085903576797, + "learning_rate": 0.0008720210422875157, + "loss": 
1.448, + "step": 2821 + }, + { + "epoch": 0.2560333877699147, + "grad_norm": 0.17878882172823707, + "learning_rate": 0.0008719228595001967, + "loss": 1.412, + "step": 2822 + }, + { + "epoch": 0.2561241154055525, + "grad_norm": 0.16998193791880037, + "learning_rate": 0.000871824644597284, + "loss": 1.4609, + "step": 2823 + }, + { + "epoch": 0.25621484304119035, + "grad_norm": 0.18968111020125866, + "learning_rate": 0.0008717263975872583, + "loss": 1.4559, + "step": 2824 + }, + { + "epoch": 0.2563055706768282, + "grad_norm": 0.1645519443854356, + "learning_rate": 0.0008716281184786037, + "loss": 1.4406, + "step": 2825 + }, + { + "epoch": 0.25639629831246596, + "grad_norm": 0.1634008780586828, + "learning_rate": 0.0008715298072798061, + "loss": 1.457, + "step": 2826 + }, + { + "epoch": 0.2564870259481038, + "grad_norm": 0.18969709970670903, + "learning_rate": 0.0008714314639993548, + "loss": 1.4166, + "step": 2827 + }, + { + "epoch": 0.25657775358374163, + "grad_norm": 0.1644102980886399, + "learning_rate": 0.0008713330886457419, + "loss": 1.4418, + "step": 2828 + }, + { + "epoch": 0.2566684812193794, + "grad_norm": 0.1598074273381622, + "learning_rate": 0.0008712346812274621, + "loss": 1.4234, + "step": 2829 + }, + { + "epoch": 0.25675920885501724, + "grad_norm": 0.21705373898228347, + "learning_rate": 0.0008711362417530129, + "loss": 1.4622, + "step": 2830 + }, + { + "epoch": 0.2568499364906551, + "grad_norm": 0.16819067430592063, + "learning_rate": 0.0008710377702308944, + "loss": 1.4409, + "step": 2831 + }, + { + "epoch": 0.25694066412629285, + "grad_norm": 0.24744390909747344, + "learning_rate": 0.0008709392666696098, + "loss": 1.434, + "step": 2832 + }, + { + "epoch": 0.2570313917619307, + "grad_norm": 0.1499321041470763, + "learning_rate": 0.0008708407310776649, + "loss": 1.4678, + "step": 2833 + }, + { + "epoch": 0.2571221193975685, + "grad_norm": 0.1544682772311153, + "learning_rate": 0.0008707421634635684, + "loss": 1.4421, + "step": 2834 + }, + { + "epoch": 0.2572128470332063, + "grad_norm": 0.16873083836653832, + "learning_rate": 0.0008706435638358313, + "loss": 1.4252, + "step": 2835 + }, + { + "epoch": 0.2573035746688441, + "grad_norm": 0.1479251282229246, + "learning_rate": 0.0008705449322029677, + "loss": 1.4645, + "step": 2836 + }, + { + "epoch": 0.25739430230448196, + "grad_norm": 0.15679364771911486, + "learning_rate": 0.0008704462685734946, + "loss": 1.4321, + "step": 2837 + }, + { + "epoch": 0.25748502994011974, + "grad_norm": 0.1897275887253787, + "learning_rate": 0.0008703475729559318, + "loss": 1.4488, + "step": 2838 + }, + { + "epoch": 0.25757575757575757, + "grad_norm": 0.1587649657949999, + "learning_rate": 0.0008702488453588013, + "loss": 1.447, + "step": 2839 + }, + { + "epoch": 0.2576664852113954, + "grad_norm": 0.1761281490271154, + "learning_rate": 0.0008701500857906285, + "loss": 1.4714, + "step": 2840 + }, + { + "epoch": 0.2577572128470332, + "grad_norm": 0.15998821141532274, + "learning_rate": 0.0008700512942599412, + "loss": 1.4693, + "step": 2841 + }, + { + "epoch": 0.257847940482671, + "grad_norm": 0.14181298946766566, + "learning_rate": 0.0008699524707752702, + "loss": 1.4392, + "step": 2842 + }, + { + "epoch": 0.25793866811830884, + "grad_norm": 0.15383764719612583, + "learning_rate": 0.0008698536153451488, + "loss": 1.4523, + "step": 2843 + }, + { + "epoch": 0.2580293957539467, + "grad_norm": 0.16276818841732324, + "learning_rate": 0.0008697547279781132, + "loss": 1.4422, + "step": 2844 + }, + { + "epoch": 0.25812012338958445, + "grad_norm": 
0.1473697131373423, + "learning_rate": 0.0008696558086827022, + "loss": 1.4232, + "step": 2845 + }, + { + "epoch": 0.2582108510252223, + "grad_norm": 0.16095550179598833, + "learning_rate": 0.000869556857467458, + "loss": 1.4758, + "step": 2846 + }, + { + "epoch": 0.2583015786608601, + "grad_norm": 0.15570509729139462, + "learning_rate": 0.0008694578743409242, + "loss": 1.4469, + "step": 2847 + }, + { + "epoch": 0.2583923062964979, + "grad_norm": 0.15132080919204835, + "learning_rate": 0.0008693588593116488, + "loss": 1.4393, + "step": 2848 + }, + { + "epoch": 0.25848303393213573, + "grad_norm": 0.15336294602460712, + "learning_rate": 0.0008692598123881814, + "loss": 1.4682, + "step": 2849 + }, + { + "epoch": 0.25857376156777356, + "grad_norm": 0.15457019024883675, + "learning_rate": 0.0008691607335790749, + "loss": 1.4218, + "step": 2850 + }, + { + "epoch": 0.25866448920341134, + "grad_norm": 0.1510326325265926, + "learning_rate": 0.0008690616228928845, + "loss": 1.4536, + "step": 2851 + }, + { + "epoch": 0.2587552168390492, + "grad_norm": 0.1617526790178137, + "learning_rate": 0.0008689624803381686, + "loss": 1.4505, + "step": 2852 + }, + { + "epoch": 0.258845944474687, + "grad_norm": 0.16244776239965453, + "learning_rate": 0.0008688633059234881, + "loss": 1.4439, + "step": 2853 + }, + { + "epoch": 0.2589366721103248, + "grad_norm": 0.16529029963125982, + "learning_rate": 0.0008687640996574068, + "loss": 1.4308, + "step": 2854 + }, + { + "epoch": 0.2590273997459626, + "grad_norm": 0.1533623810134481, + "learning_rate": 0.0008686648615484912, + "loss": 1.4204, + "step": 2855 + }, + { + "epoch": 0.25911812738160045, + "grad_norm": 0.14569346053605609, + "learning_rate": 0.0008685655916053105, + "loss": 1.4496, + "step": 2856 + }, + { + "epoch": 0.2592088550172382, + "grad_norm": 0.1583829597322643, + "learning_rate": 0.0008684662898364365, + "loss": 1.4607, + "step": 2857 + }, + { + "epoch": 0.25929958265287606, + "grad_norm": 0.1587214164554824, + "learning_rate": 0.0008683669562504441, + "loss": 1.4371, + "step": 2858 + }, + { + "epoch": 0.2593903102885139, + "grad_norm": 0.1590174955884066, + "learning_rate": 0.0008682675908559108, + "loss": 1.449, + "step": 2859 + }, + { + "epoch": 0.25948103792415167, + "grad_norm": 0.15701126758060996, + "learning_rate": 0.0008681681936614168, + "loss": 1.4264, + "step": 2860 + }, + { + "epoch": 0.2595717655597895, + "grad_norm": 0.15103385803693353, + "learning_rate": 0.0008680687646755449, + "loss": 1.4623, + "step": 2861 + }, + { + "epoch": 0.25966249319542734, + "grad_norm": 0.17331321415919934, + "learning_rate": 0.000867969303906881, + "loss": 1.4464, + "step": 2862 + }, + { + "epoch": 0.25975322083106517, + "grad_norm": 0.15436924708162975, + "learning_rate": 0.0008678698113640135, + "loss": 1.4368, + "step": 2863 + }, + { + "epoch": 0.25984394846670295, + "grad_norm": 0.17639093117188967, + "learning_rate": 0.0008677702870555336, + "loss": 1.423, + "step": 2864 + }, + { + "epoch": 0.2599346761023408, + "grad_norm": 0.15389404899908496, + "learning_rate": 0.000867670730990035, + "loss": 1.4689, + "step": 2865 + }, + { + "epoch": 0.2600254037379786, + "grad_norm": 0.15396842902034458, + "learning_rate": 0.0008675711431761147, + "loss": 1.4113, + "step": 2866 + }, + { + "epoch": 0.2601161313736164, + "grad_norm": 0.1709706958689735, + "learning_rate": 0.000867471523622372, + "loss": 1.4225, + "step": 2867 + }, + { + "epoch": 0.2602068590092542, + "grad_norm": 0.14244390326528927, + "learning_rate": 0.0008673718723374091, + "loss": 1.4415, + 
"step": 2868 + }, + { + "epoch": 0.26029758664489205, + "grad_norm": 0.15356686696090457, + "learning_rate": 0.0008672721893298309, + "loss": 1.436, + "step": 2869 + }, + { + "epoch": 0.26038831428052983, + "grad_norm": 0.1500483454484577, + "learning_rate": 0.000867172474608245, + "loss": 1.4648, + "step": 2870 + }, + { + "epoch": 0.26047904191616766, + "grad_norm": 0.14919845299058826, + "learning_rate": 0.0008670727281812618, + "loss": 1.4496, + "step": 2871 + }, + { + "epoch": 0.2605697695518055, + "grad_norm": 0.15785569358732304, + "learning_rate": 0.0008669729500574943, + "loss": 1.4769, + "step": 2872 + }, + { + "epoch": 0.2606604971874433, + "grad_norm": 0.1517896198453089, + "learning_rate": 0.0008668731402455586, + "loss": 1.4599, + "step": 2873 + }, + { + "epoch": 0.2607512248230811, + "grad_norm": 0.14894200966806997, + "learning_rate": 0.0008667732987540733, + "loss": 1.455, + "step": 2874 + }, + { + "epoch": 0.26084195245871894, + "grad_norm": 0.14680897035903015, + "learning_rate": 0.0008666734255916594, + "loss": 1.4253, + "step": 2875 + }, + { + "epoch": 0.2609326800943567, + "grad_norm": 0.1718658658184039, + "learning_rate": 0.0008665735207669412, + "loss": 1.4429, + "step": 2876 + }, + { + "epoch": 0.26102340772999455, + "grad_norm": 0.15039599779629237, + "learning_rate": 0.0008664735842885455, + "loss": 1.4754, + "step": 2877 + }, + { + "epoch": 0.2611141353656324, + "grad_norm": 0.14347279006825223, + "learning_rate": 0.0008663736161651017, + "loss": 1.4227, + "step": 2878 + }, + { + "epoch": 0.26120486300127016, + "grad_norm": 0.14680537655629958, + "learning_rate": 0.0008662736164052423, + "loss": 1.4524, + "step": 2879 + }, + { + "epoch": 0.261295590636908, + "grad_norm": 0.1472490194867123, + "learning_rate": 0.000866173585017602, + "loss": 1.4444, + "step": 2880 + }, + { + "epoch": 0.2613863182725458, + "grad_norm": 0.14865501740036174, + "learning_rate": 0.0008660735220108187, + "loss": 1.4183, + "step": 2881 + }, + { + "epoch": 0.26147704590818366, + "grad_norm": 0.1578641678910363, + "learning_rate": 0.0008659734273935328, + "loss": 1.4746, + "step": 2882 + }, + { + "epoch": 0.26156777354382144, + "grad_norm": 0.16307321458404195, + "learning_rate": 0.0008658733011743876, + "loss": 1.417, + "step": 2883 + }, + { + "epoch": 0.26165850117945927, + "grad_norm": 0.15622117696154836, + "learning_rate": 0.0008657731433620289, + "loss": 1.467, + "step": 2884 + }, + { + "epoch": 0.2617492288150971, + "grad_norm": 0.15998371272672726, + "learning_rate": 0.0008656729539651051, + "loss": 1.4222, + "step": 2885 + }, + { + "epoch": 0.2618399564507349, + "grad_norm": 0.15573517466603143, + "learning_rate": 0.0008655727329922681, + "loss": 1.445, + "step": 2886 + }, + { + "epoch": 0.2619306840863727, + "grad_norm": 0.16019027039922085, + "learning_rate": 0.0008654724804521718, + "loss": 1.4609, + "step": 2887 + }, + { + "epoch": 0.26202141172201054, + "grad_norm": 0.15104731455183776, + "learning_rate": 0.0008653721963534728, + "loss": 1.3856, + "step": 2888 + }, + { + "epoch": 0.2621121393576483, + "grad_norm": 0.15296952557845345, + "learning_rate": 0.0008652718807048307, + "loss": 1.4703, + "step": 2889 + }, + { + "epoch": 0.26220286699328615, + "grad_norm": 0.15678053296065925, + "learning_rate": 0.000865171533514908, + "loss": 1.4595, + "step": 2890 + }, + { + "epoch": 0.262293594628924, + "grad_norm": 0.16323622361010576, + "learning_rate": 0.0008650711547923695, + "loss": 1.4764, + "step": 2891 + }, + { + "epoch": 0.26238432226456176, + "grad_norm": 
0.16383192639673474, + "learning_rate": 0.0008649707445458831, + "loss": 1.4622, + "step": 2892 + }, + { + "epoch": 0.2624750499001996, + "grad_norm": 0.15403342182121194, + "learning_rate": 0.0008648703027841191, + "loss": 1.4441, + "step": 2893 + }, + { + "epoch": 0.26256577753583743, + "grad_norm": 0.15222738581499234, + "learning_rate": 0.0008647698295157505, + "loss": 1.415, + "step": 2894 + }, + { + "epoch": 0.2626565051714752, + "grad_norm": 0.1499776732713575, + "learning_rate": 0.0008646693247494534, + "loss": 1.4334, + "step": 2895 + }, + { + "epoch": 0.26274723280711304, + "grad_norm": 0.15397180813981046, + "learning_rate": 0.0008645687884939065, + "loss": 1.4104, + "step": 2896 + }, + { + "epoch": 0.2628379604427509, + "grad_norm": 0.15580040239729995, + "learning_rate": 0.0008644682207577909, + "loss": 1.4833, + "step": 2897 + }, + { + "epoch": 0.26292868807838865, + "grad_norm": 0.1700873664827329, + "learning_rate": 0.0008643676215497909, + "loss": 1.4382, + "step": 2898 + }, + { + "epoch": 0.2630194157140265, + "grad_norm": 0.16107541433472075, + "learning_rate": 0.0008642669908785929, + "loss": 1.5006, + "step": 2899 + }, + { + "epoch": 0.2631101433496643, + "grad_norm": 0.1594019442975969, + "learning_rate": 0.0008641663287528864, + "loss": 1.4643, + "step": 2900 + }, + { + "epoch": 0.26320087098530215, + "grad_norm": 0.15360904162474343, + "learning_rate": 0.000864065635181364, + "loss": 1.4284, + "step": 2901 + }, + { + "epoch": 0.2632915986209399, + "grad_norm": 0.14790028946263742, + "learning_rate": 0.0008639649101727202, + "loss": 1.4414, + "step": 2902 + }, + { + "epoch": 0.26338232625657776, + "grad_norm": 0.14748739975939, + "learning_rate": 0.0008638641537356529, + "loss": 1.418, + "step": 2903 + }, + { + "epoch": 0.2634730538922156, + "grad_norm": 0.14487260188842085, + "learning_rate": 0.0008637633658788622, + "loss": 1.4695, + "step": 2904 + }, + { + "epoch": 0.26356378152785337, + "grad_norm": 0.14837012926150733, + "learning_rate": 0.0008636625466110513, + "loss": 1.4397, + "step": 2905 + }, + { + "epoch": 0.2636545091634912, + "grad_norm": 0.14353152525481128, + "learning_rate": 0.0008635616959409259, + "loss": 1.457, + "step": 2906 + }, + { + "epoch": 0.26374523679912903, + "grad_norm": 0.15452963306695566, + "learning_rate": 0.0008634608138771942, + "loss": 1.4297, + "step": 2907 + }, + { + "epoch": 0.2638359644347668, + "grad_norm": 0.14414532961407298, + "learning_rate": 0.0008633599004285679, + "loss": 1.4313, + "step": 2908 + }, + { + "epoch": 0.26392669207040464, + "grad_norm": 0.15875967767174531, + "learning_rate": 0.0008632589556037606, + "loss": 1.4739, + "step": 2909 + }, + { + "epoch": 0.2640174197060425, + "grad_norm": 0.19215914355682165, + "learning_rate": 0.000863157979411489, + "loss": 1.4614, + "step": 2910 + }, + { + "epoch": 0.26410814734168025, + "grad_norm": 0.1526514578715362, + "learning_rate": 0.0008630569718604724, + "loss": 1.4504, + "step": 2911 + }, + { + "epoch": 0.2641988749773181, + "grad_norm": 0.14000270529373937, + "learning_rate": 0.0008629559329594327, + "loss": 1.4069, + "step": 2912 + }, + { + "epoch": 0.2642896026129559, + "grad_norm": 0.1587996063983217, + "learning_rate": 0.0008628548627170947, + "loss": 1.4337, + "step": 2913 + }, + { + "epoch": 0.2643803302485937, + "grad_norm": 0.15543103410687886, + "learning_rate": 0.0008627537611421857, + "loss": 1.4393, + "step": 2914 + }, + { + "epoch": 0.26447105788423153, + "grad_norm": 0.15527430874490783, + "learning_rate": 0.0008626526282434361, + "loss": 1.4542, + 
"step": 2915 + }, + { + "epoch": 0.26456178551986936, + "grad_norm": 0.147104773584927, + "learning_rate": 0.0008625514640295786, + "loss": 1.4727, + "step": 2916 + }, + { + "epoch": 0.26465251315550714, + "grad_norm": 0.15374979341089526, + "learning_rate": 0.0008624502685093487, + "loss": 1.4531, + "step": 2917 + }, + { + "epoch": 0.264743240791145, + "grad_norm": 0.14555695672548913, + "learning_rate": 0.0008623490416914848, + "loss": 1.4485, + "step": 2918 + }, + { + "epoch": 0.2648339684267828, + "grad_norm": 0.1489144889188526, + "learning_rate": 0.0008622477835847275, + "loss": 1.4724, + "step": 2919 + }, + { + "epoch": 0.26492469606242064, + "grad_norm": 0.15459836872322952, + "learning_rate": 0.000862146494197821, + "loss": 1.4435, + "step": 2920 + }, + { + "epoch": 0.2650154236980584, + "grad_norm": 0.15486656305650456, + "learning_rate": 0.0008620451735395112, + "loss": 1.4135, + "step": 2921 + }, + { + "epoch": 0.26510615133369625, + "grad_norm": 0.16655200797660233, + "learning_rate": 0.0008619438216185473, + "loss": 1.4441, + "step": 2922 + }, + { + "epoch": 0.2651968789693341, + "grad_norm": 0.15663372699677736, + "learning_rate": 0.0008618424384436809, + "loss": 1.5112, + "step": 2923 + }, + { + "epoch": 0.26528760660497186, + "grad_norm": 0.19269067745598986, + "learning_rate": 0.0008617410240236669, + "loss": 1.4205, + "step": 2924 + }, + { + "epoch": 0.2653783342406097, + "grad_norm": 0.16603977360014718, + "learning_rate": 0.000861639578367262, + "loss": 1.4323, + "step": 2925 + }, + { + "epoch": 0.2654690618762475, + "grad_norm": 0.15883669522606741, + "learning_rate": 0.0008615381014832264, + "loss": 1.4412, + "step": 2926 + }, + { + "epoch": 0.2655597895118853, + "grad_norm": 0.1575475991308535, + "learning_rate": 0.0008614365933803222, + "loss": 1.4112, + "step": 2927 + }, + { + "epoch": 0.26565051714752314, + "grad_norm": 0.16692661053579888, + "learning_rate": 0.000861335054067315, + "loss": 1.4458, + "step": 2928 + }, + { + "epoch": 0.26574124478316097, + "grad_norm": 0.16206111830551298, + "learning_rate": 0.0008612334835529726, + "loss": 1.4324, + "step": 2929 + }, + { + "epoch": 0.26583197241879875, + "grad_norm": 0.16915063974984113, + "learning_rate": 0.0008611318818460657, + "loss": 1.4518, + "step": 2930 + }, + { + "epoch": 0.2659227000544366, + "grad_norm": 0.16253739971912462, + "learning_rate": 0.0008610302489553675, + "loss": 1.4818, + "step": 2931 + }, + { + "epoch": 0.2660134276900744, + "grad_norm": 0.16750821445466674, + "learning_rate": 0.0008609285848896542, + "loss": 1.4822, + "step": 2932 + }, + { + "epoch": 0.2661041553257122, + "grad_norm": 0.1536175137886269, + "learning_rate": 0.0008608268896577043, + "loss": 1.4381, + "step": 2933 + }, + { + "epoch": 0.26619488296135, + "grad_norm": 0.1555812399451693, + "learning_rate": 0.0008607251632682993, + "loss": 1.4562, + "step": 2934 + }, + { + "epoch": 0.26628561059698785, + "grad_norm": 0.16861337475686453, + "learning_rate": 0.0008606234057302233, + "loss": 1.4817, + "step": 2935 + }, + { + "epoch": 0.26637633823262563, + "grad_norm": 0.14894030039431552, + "learning_rate": 0.0008605216170522632, + "loss": 1.4338, + "step": 2936 + }, + { + "epoch": 0.26646706586826346, + "grad_norm": 0.1561152434361984, + "learning_rate": 0.0008604197972432082, + "loss": 1.4278, + "step": 2937 + }, + { + "epoch": 0.2665577935039013, + "grad_norm": 0.14958015009028597, + "learning_rate": 0.0008603179463118507, + "loss": 1.4315, + "step": 2938 + }, + { + "epoch": 0.26664852113953913, + "grad_norm": 
0.18575495143968732, + "learning_rate": 0.0008602160642669852, + "loss": 1.4383, + "step": 2939 + }, + { + "epoch": 0.2667392487751769, + "grad_norm": 0.14901394501440313, + "learning_rate": 0.0008601141511174096, + "loss": 1.4677, + "step": 2940 + }, + { + "epoch": 0.26682997641081474, + "grad_norm": 0.15118295421536213, + "learning_rate": 0.0008600122068719241, + "loss": 1.4424, + "step": 2941 + }, + { + "epoch": 0.2669207040464526, + "grad_norm": 0.18186606857845375, + "learning_rate": 0.0008599102315393313, + "loss": 1.4535, + "step": 2942 + }, + { + "epoch": 0.26701143168209035, + "grad_norm": 0.14852495540975527, + "learning_rate": 0.000859808225128437, + "loss": 1.4072, + "step": 2943 + }, + { + "epoch": 0.2671021593177282, + "grad_norm": 0.15670437522845188, + "learning_rate": 0.0008597061876480495, + "loss": 1.4779, + "step": 2944 + }, + { + "epoch": 0.267192886953366, + "grad_norm": 0.1594431981893196, + "learning_rate": 0.0008596041191069795, + "loss": 1.4098, + "step": 2945 + }, + { + "epoch": 0.2672836145890038, + "grad_norm": 0.15663859500508276, + "learning_rate": 0.000859502019514041, + "loss": 1.4125, + "step": 2946 + }, + { + "epoch": 0.2673743422246416, + "grad_norm": 0.1804072113848158, + "learning_rate": 0.0008593998888780501, + "loss": 1.4332, + "step": 2947 + }, + { + "epoch": 0.26746506986027946, + "grad_norm": 0.16295908141630822, + "learning_rate": 0.0008592977272078258, + "loss": 1.4669, + "step": 2948 + }, + { + "epoch": 0.26755579749591724, + "grad_norm": 0.15650773179820593, + "learning_rate": 0.0008591955345121898, + "loss": 1.4837, + "step": 2949 + }, + { + "epoch": 0.26764652513155507, + "grad_norm": 0.14573202023168716, + "learning_rate": 0.0008590933107999664, + "loss": 1.4789, + "step": 2950 + }, + { + "epoch": 0.2677372527671929, + "grad_norm": 0.14960710133589616, + "learning_rate": 0.0008589910560799827, + "loss": 1.443, + "step": 2951 + }, + { + "epoch": 0.2678279804028307, + "grad_norm": 0.1527023735446857, + "learning_rate": 0.0008588887703610686, + "loss": 1.4642, + "step": 2952 + }, + { + "epoch": 0.2679187080384685, + "grad_norm": 0.15815208349793272, + "learning_rate": 0.000858786453652056, + "loss": 1.4471, + "step": 2953 + }, + { + "epoch": 0.26800943567410634, + "grad_norm": 0.149592480575069, + "learning_rate": 0.0008586841059617804, + "loss": 1.4255, + "step": 2954 + }, + { + "epoch": 0.2681001633097441, + "grad_norm": 0.15288560788805164, + "learning_rate": 0.0008585817272990794, + "loss": 1.443, + "step": 2955 + }, + { + "epoch": 0.26819089094538195, + "grad_norm": 0.15411613673626234, + "learning_rate": 0.0008584793176727933, + "loss": 1.4324, + "step": 2956 + }, + { + "epoch": 0.2682816185810198, + "grad_norm": 0.15306272274647778, + "learning_rate": 0.0008583768770917654, + "loss": 1.4717, + "step": 2957 + }, + { + "epoch": 0.2683723462166576, + "grad_norm": 0.16113249352747788, + "learning_rate": 0.0008582744055648413, + "loss": 1.4752, + "step": 2958 + }, + { + "epoch": 0.2684630738522954, + "grad_norm": 0.16992760043509306, + "learning_rate": 0.0008581719031008695, + "loss": 1.4058, + "step": 2959 + }, + { + "epoch": 0.26855380148793323, + "grad_norm": 0.15516613569100496, + "learning_rate": 0.000858069369708701, + "loss": 1.4361, + "step": 2960 + }, + { + "epoch": 0.26864452912357106, + "grad_norm": 0.16686806367657253, + "learning_rate": 0.0008579668053971896, + "loss": 1.4351, + "step": 2961 + }, + { + "epoch": 0.26873525675920884, + "grad_norm": 0.16269718035045602, + "learning_rate": 0.0008578642101751919, + "loss": 1.4067, + 
"step": 2962 + }, + { + "epoch": 0.2688259843948467, + "grad_norm": 0.1649098448304937, + "learning_rate": 0.0008577615840515669, + "loss": 1.4547, + "step": 2963 + }, + { + "epoch": 0.2689167120304845, + "grad_norm": 0.15809229718250473, + "learning_rate": 0.0008576589270351763, + "loss": 1.4369, + "step": 2964 + }, + { + "epoch": 0.2690074396661223, + "grad_norm": 0.23650423700847145, + "learning_rate": 0.0008575562391348847, + "loss": 1.4244, + "step": 2965 + }, + { + "epoch": 0.2690981673017601, + "grad_norm": 0.15446632331754007, + "learning_rate": 0.0008574535203595593, + "loss": 1.4568, + "step": 2966 + }, + { + "epoch": 0.26918889493739795, + "grad_norm": 0.1664175526014279, + "learning_rate": 0.0008573507707180695, + "loss": 1.452, + "step": 2967 + }, + { + "epoch": 0.2692796225730357, + "grad_norm": 0.15728802847998066, + "learning_rate": 0.0008572479902192881, + "loss": 1.4451, + "step": 2968 + }, + { + "epoch": 0.26937035020867356, + "grad_norm": 0.14740330438099536, + "learning_rate": 0.0008571451788720901, + "loss": 1.4418, + "step": 2969 + }, + { + "epoch": 0.2694610778443114, + "grad_norm": 0.149979897138873, + "learning_rate": 0.0008570423366853532, + "loss": 1.4728, + "step": 2970 + }, + { + "epoch": 0.26955180547994917, + "grad_norm": 0.14630446178207215, + "learning_rate": 0.0008569394636679579, + "loss": 1.4566, + "step": 2971 + }, + { + "epoch": 0.269642533115587, + "grad_norm": 0.14340656831945603, + "learning_rate": 0.0008568365598287875, + "loss": 1.4371, + "step": 2972 + }, + { + "epoch": 0.26973326075122483, + "grad_norm": 0.1491583911358256, + "learning_rate": 0.0008567336251767273, + "loss": 1.422, + "step": 2973 + }, + { + "epoch": 0.2698239883868626, + "grad_norm": 0.15232179648160155, + "learning_rate": 0.0008566306597206662, + "loss": 1.4404, + "step": 2974 + }, + { + "epoch": 0.26991471602250044, + "grad_norm": 0.16117640480363754, + "learning_rate": 0.000856527663469495, + "loss": 1.4392, + "step": 2975 + }, + { + "epoch": 0.2700054436581383, + "grad_norm": 0.15317106045736556, + "learning_rate": 0.0008564246364321074, + "loss": 1.4706, + "step": 2976 + }, + { + "epoch": 0.2700961712937761, + "grad_norm": 0.15771189448807305, + "learning_rate": 0.0008563215786174, + "loss": 1.4219, + "step": 2977 + }, + { + "epoch": 0.2701868989294139, + "grad_norm": 0.15026064295013045, + "learning_rate": 0.0008562184900342718, + "loss": 1.4759, + "step": 2978 + }, + { + "epoch": 0.2702776265650517, + "grad_norm": 0.15479839776078042, + "learning_rate": 0.0008561153706916245, + "loss": 1.4334, + "step": 2979 + }, + { + "epoch": 0.27036835420068955, + "grad_norm": 0.16510177584718433, + "learning_rate": 0.0008560122205983622, + "loss": 1.4606, + "step": 2980 + }, + { + "epoch": 0.27045908183632733, + "grad_norm": 0.16602432953919308, + "learning_rate": 0.0008559090397633925, + "loss": 1.4124, + "step": 2981 + }, + { + "epoch": 0.27054980947196516, + "grad_norm": 0.17631137938547167, + "learning_rate": 0.0008558058281956247, + "loss": 1.4339, + "step": 2982 + }, + { + "epoch": 0.270640537107603, + "grad_norm": 0.15234786558742996, + "learning_rate": 0.0008557025859039711, + "loss": 1.4284, + "step": 2983 + }, + { + "epoch": 0.2707312647432408, + "grad_norm": 0.17832517395815356, + "learning_rate": 0.0008555993128973468, + "loss": 1.4705, + "step": 2984 + }, + { + "epoch": 0.2708219923788786, + "grad_norm": 0.16263614232278448, + "learning_rate": 0.0008554960091846695, + "loss": 1.4334, + "step": 2985 + }, + { + "epoch": 0.27091272001451644, + "grad_norm": 
0.16481590286189077, + "learning_rate": 0.0008553926747748593, + "loss": 1.4478, + "step": 2986 + }, + { + "epoch": 0.2710034476501542, + "grad_norm": 0.17328176467223394, + "learning_rate": 0.0008552893096768394, + "loss": 1.5067, + "step": 2987 + }, + { + "epoch": 0.27109417528579205, + "grad_norm": 0.15220775165153355, + "learning_rate": 0.0008551859138995351, + "loss": 1.4528, + "step": 2988 + }, + { + "epoch": 0.2711849029214299, + "grad_norm": 0.15885804916316376, + "learning_rate": 0.0008550824874518749, + "loss": 1.4674, + "step": 2989 + }, + { + "epoch": 0.27127563055706766, + "grad_norm": 0.2865730268539177, + "learning_rate": 0.0008549790303427894, + "loss": 1.457, + "step": 2990 + }, + { + "epoch": 0.2713663581927055, + "grad_norm": 0.16596582095214069, + "learning_rate": 0.0008548755425812124, + "loss": 1.4312, + "step": 2991 + }, + { + "epoch": 0.2714570858283433, + "grad_norm": 0.33215382873357274, + "learning_rate": 0.0008547720241760801, + "loss": 1.4218, + "step": 2992 + }, + { + "epoch": 0.2715478134639811, + "grad_norm": 0.16078486480714413, + "learning_rate": 0.0008546684751363312, + "loss": 1.4492, + "step": 2993 + }, + { + "epoch": 0.27163854109961894, + "grad_norm": 0.1707748299792241, + "learning_rate": 0.0008545648954709071, + "loss": 1.435, + "step": 2994 + }, + { + "epoch": 0.27172926873525677, + "grad_norm": 0.1666985905848025, + "learning_rate": 0.0008544612851887521, + "loss": 1.4518, + "step": 2995 + }, + { + "epoch": 0.2718199963708946, + "grad_norm": 0.1871433923416216, + "learning_rate": 0.0008543576442988128, + "loss": 1.4262, + "step": 2996 + }, + { + "epoch": 0.2719107240065324, + "grad_norm": 0.20562835536313948, + "learning_rate": 0.0008542539728100388, + "loss": 1.4575, + "step": 2997 + }, + { + "epoch": 0.2720014516421702, + "grad_norm": 0.15613247743792064, + "learning_rate": 0.000854150270731382, + "loss": 1.4611, + "step": 2998 + }, + { + "epoch": 0.27209217927780804, + "grad_norm": 0.1508802256736549, + "learning_rate": 0.000854046538071797, + "loss": 1.4315, + "step": 2999 + }, + { + "epoch": 0.2721829069134458, + "grad_norm": 0.16213014178867827, + "learning_rate": 0.0008539427748402415, + "loss": 1.4441, + "step": 3000 + }, + { + "epoch": 0.27227363454908365, + "grad_norm": 0.15648063929603717, + "learning_rate": 0.000853838981045675, + "loss": 1.4277, + "step": 3001 + }, + { + "epoch": 0.2723643621847215, + "grad_norm": 0.16701298550139781, + "learning_rate": 0.0008537351566970604, + "loss": 1.4303, + "step": 3002 + }, + { + "epoch": 0.27245508982035926, + "grad_norm": 0.1659237622216556, + "learning_rate": 0.0008536313018033629, + "loss": 1.4748, + "step": 3003 + }, + { + "epoch": 0.2725458174559971, + "grad_norm": 0.16115565340175286, + "learning_rate": 0.0008535274163735503, + "loss": 1.4748, + "step": 3004 + }, + { + "epoch": 0.27263654509163493, + "grad_norm": 0.1466015490360477, + "learning_rate": 0.0008534235004165933, + "loss": 1.4369, + "step": 3005 + }, + { + "epoch": 0.2727272727272727, + "grad_norm": 0.15364629715922112, + "learning_rate": 0.000853319553941465, + "loss": 1.4399, + "step": 3006 + }, + { + "epoch": 0.27281800036291054, + "grad_norm": 0.16192101986100352, + "learning_rate": 0.0008532155769571411, + "loss": 1.4135, + "step": 3007 + }, + { + "epoch": 0.2729087279985484, + "grad_norm": 0.16533088281164351, + "learning_rate": 0.0008531115694726, + "loss": 1.4699, + "step": 3008 + }, + { + "epoch": 0.27299945563418615, + "grad_norm": 0.15883757761088638, + "learning_rate": 0.0008530075314968228, + "loss": 1.4738, + 
"step": 3009 + }, + { + "epoch": 0.273090183269824, + "grad_norm": 0.15901831336375744, + "learning_rate": 0.0008529034630387933, + "loss": 1.474, + "step": 3010 + }, + { + "epoch": 0.2731809109054618, + "grad_norm": 0.1772759292923947, + "learning_rate": 0.0008527993641074978, + "loss": 1.4586, + "step": 3011 + }, + { + "epoch": 0.2732716385410996, + "grad_norm": 0.15520829555106333, + "learning_rate": 0.000852695234711925, + "loss": 1.4001, + "step": 3012 + }, + { + "epoch": 0.2733623661767374, + "grad_norm": 0.17049392429073887, + "learning_rate": 0.000852591074861067, + "loss": 1.4337, + "step": 3013 + }, + { + "epoch": 0.27345309381237526, + "grad_norm": 0.16126723149704436, + "learning_rate": 0.0008524868845639175, + "loss": 1.4007, + "step": 3014 + }, + { + "epoch": 0.2735438214480131, + "grad_norm": 0.17393710972204332, + "learning_rate": 0.0008523826638294738, + "loss": 1.467, + "step": 3015 + }, + { + "epoch": 0.27363454908365087, + "grad_norm": 0.16866498451051243, + "learning_rate": 0.0008522784126667349, + "loss": 1.4366, + "step": 3016 + }, + { + "epoch": 0.2737252767192887, + "grad_norm": 0.17389430958953137, + "learning_rate": 0.0008521741310847031, + "loss": 1.4403, + "step": 3017 + }, + { + "epoch": 0.27381600435492653, + "grad_norm": 0.16161264515491391, + "learning_rate": 0.0008520698190923834, + "loss": 1.517, + "step": 3018 + }, + { + "epoch": 0.2739067319905643, + "grad_norm": 0.1649976239182748, + "learning_rate": 0.0008519654766987829, + "loss": 1.4666, + "step": 3019 + }, + { + "epoch": 0.27399745962620214, + "grad_norm": 0.15279784670260219, + "learning_rate": 0.0008518611039129114, + "loss": 1.4328, + "step": 3020 + }, + { + "epoch": 0.27408818726184, + "grad_norm": 0.1467965430376585, + "learning_rate": 0.000851756700743782, + "loss": 1.4439, + "step": 3021 + }, + { + "epoch": 0.27417891489747775, + "grad_norm": 0.19235954484893716, + "learning_rate": 0.0008516522672004094, + "loss": 1.4539, + "step": 3022 + }, + { + "epoch": 0.2742696425331156, + "grad_norm": 0.14582229119189893, + "learning_rate": 0.0008515478032918119, + "loss": 1.4403, + "step": 3023 + }, + { + "epoch": 0.2743603701687534, + "grad_norm": 0.18164154898164034, + "learning_rate": 0.0008514433090270096, + "loss": 1.4258, + "step": 3024 + }, + { + "epoch": 0.2744510978043912, + "grad_norm": 0.1585525973210937, + "learning_rate": 0.000851338784415026, + "loss": 1.4406, + "step": 3025 + }, + { + "epoch": 0.27454182544002903, + "grad_norm": 0.18021650252575583, + "learning_rate": 0.0008512342294648864, + "loss": 1.4512, + "step": 3026 + }, + { + "epoch": 0.27463255307566686, + "grad_norm": 0.1463941540959393, + "learning_rate": 0.0008511296441856193, + "loss": 1.4587, + "step": 3027 + }, + { + "epoch": 0.27472328071130464, + "grad_norm": 0.16302638730470465, + "learning_rate": 0.0008510250285862557, + "loss": 1.4691, + "step": 3028 + }, + { + "epoch": 0.2748140083469425, + "grad_norm": 0.15525804713184596, + "learning_rate": 0.0008509203826758292, + "loss": 1.4604, + "step": 3029 + }, + { + "epoch": 0.2749047359825803, + "grad_norm": 0.14836080924997888, + "learning_rate": 0.000850815706463376, + "loss": 1.4592, + "step": 3030 + }, + { + "epoch": 0.2749954636182181, + "grad_norm": 0.16526210135734962, + "learning_rate": 0.0008507109999579348, + "loss": 1.4724, + "step": 3031 + }, + { + "epoch": 0.2750861912538559, + "grad_norm": 0.14560588530783108, + "learning_rate": 0.0008506062631685469, + "loss": 1.4457, + "step": 3032 + }, + { + "epoch": 0.27517691888949375, + "grad_norm": 0.16197411425012456, 
+ "learning_rate": 0.0008505014961042566, + "loss": 1.4484, + "step": 3033 + }, + { + "epoch": 0.2752676465251316, + "grad_norm": 0.14585305861354508, + "learning_rate": 0.0008503966987741105, + "loss": 1.4566, + "step": 3034 + }, + { + "epoch": 0.27535837416076936, + "grad_norm": 0.16637167638509998, + "learning_rate": 0.0008502918711871577, + "loss": 1.4682, + "step": 3035 + }, + { + "epoch": 0.2754491017964072, + "grad_norm": 0.16033797754757687, + "learning_rate": 0.0008501870133524503, + "loss": 1.4401, + "step": 3036 + }, + { + "epoch": 0.275539829432045, + "grad_norm": 0.14847870986961878, + "learning_rate": 0.0008500821252790427, + "loss": 1.4677, + "step": 3037 + }, + { + "epoch": 0.2756305570676828, + "grad_norm": 0.1595045339489471, + "learning_rate": 0.000849977206975992, + "loss": 1.4355, + "step": 3038 + }, + { + "epoch": 0.27572128470332063, + "grad_norm": 0.15774177743782478, + "learning_rate": 0.0008498722584523578, + "loss": 1.4182, + "step": 3039 + }, + { + "epoch": 0.27581201233895847, + "grad_norm": 0.14319831445943443, + "learning_rate": 0.0008497672797172026, + "loss": 1.4228, + "step": 3040 + }, + { + "epoch": 0.27590273997459624, + "grad_norm": 0.1495060689100412, + "learning_rate": 0.0008496622707795913, + "loss": 1.4208, + "step": 3041 + }, + { + "epoch": 0.2759934676102341, + "grad_norm": 0.14814132703546595, + "learning_rate": 0.0008495572316485913, + "loss": 1.4642, + "step": 3042 + }, + { + "epoch": 0.2760841952458719, + "grad_norm": 0.1667386182591464, + "learning_rate": 0.0008494521623332727, + "loss": 1.4544, + "step": 3043 + }, + { + "epoch": 0.2761749228815097, + "grad_norm": 0.16096028434008489, + "learning_rate": 0.0008493470628427085, + "loss": 1.4574, + "step": 3044 + }, + { + "epoch": 0.2762656505171475, + "grad_norm": 0.15854431466404711, + "learning_rate": 0.000849241933185974, + "loss": 1.4639, + "step": 3045 + }, + { + "epoch": 0.27635637815278535, + "grad_norm": 0.15398639076148476, + "learning_rate": 0.0008491367733721471, + "loss": 1.4469, + "step": 3046 + }, + { + "epoch": 0.27644710578842313, + "grad_norm": 0.17665405738882817, + "learning_rate": 0.0008490315834103082, + "loss": 1.4256, + "step": 3047 + }, + { + "epoch": 0.27653783342406096, + "grad_norm": 0.14508784008289238, + "learning_rate": 0.0008489263633095407, + "loss": 1.4201, + "step": 3048 + }, + { + "epoch": 0.2766285610596988, + "grad_norm": 0.15001401350953747, + "learning_rate": 0.0008488211130789304, + "loss": 1.4646, + "step": 3049 + }, + { + "epoch": 0.2767192886953366, + "grad_norm": 0.14829787005884534, + "learning_rate": 0.0008487158327275655, + "loss": 1.4507, + "step": 3050 + }, + { + "epoch": 0.2768100163309744, + "grad_norm": 0.15557101488081404, + "learning_rate": 0.000848610522264537, + "loss": 1.4853, + "step": 3051 + }, + { + "epoch": 0.27690074396661224, + "grad_norm": 0.1571390825656429, + "learning_rate": 0.0008485051816989386, + "loss": 1.4053, + "step": 3052 + }, + { + "epoch": 0.2769914716022501, + "grad_norm": 0.1784501189939724, + "learning_rate": 0.0008483998110398665, + "loss": 1.4293, + "step": 3053 + }, + { + "epoch": 0.27708219923788785, + "grad_norm": 0.1605526345907272, + "learning_rate": 0.0008482944102964192, + "loss": 1.4551, + "step": 3054 + }, + { + "epoch": 0.2771729268735257, + "grad_norm": 0.1660924497687302, + "learning_rate": 0.0008481889794776982, + "loss": 1.4109, + "step": 3055 + }, + { + "epoch": 0.2772636545091635, + "grad_norm": 0.15350158663364583, + "learning_rate": 0.0008480835185928075, + "loss": 1.4512, + "step": 3056 + }, + { 
+ "epoch": 0.2773543821448013, + "grad_norm": 0.1649330765400088, + "learning_rate": 0.0008479780276508538, + "loss": 1.4422, + "step": 3057 + }, + { + "epoch": 0.2774451097804391, + "grad_norm": 0.15316098669833547, + "learning_rate": 0.0008478725066609461, + "loss": 1.3783, + "step": 3058 + }, + { + "epoch": 0.27753583741607696, + "grad_norm": 0.14981615586862282, + "learning_rate": 0.0008477669556321961, + "loss": 1.4253, + "step": 3059 + }, + { + "epoch": 0.27762656505171474, + "grad_norm": 0.1472199004379362, + "learning_rate": 0.0008476613745737183, + "loss": 1.4283, + "step": 3060 + }, + { + "epoch": 0.27771729268735257, + "grad_norm": 0.14509210419212576, + "learning_rate": 0.0008475557634946296, + "loss": 1.3961, + "step": 3061 + }, + { + "epoch": 0.2778080203229904, + "grad_norm": 0.15735055910838877, + "learning_rate": 0.0008474501224040494, + "loss": 1.455, + "step": 3062 + }, + { + "epoch": 0.2778987479586282, + "grad_norm": 0.34012118901291416, + "learning_rate": 0.0008473444513110999, + "loss": 1.438, + "step": 3063 + }, + { + "epoch": 0.277989475594266, + "grad_norm": 0.16105228424647294, + "learning_rate": 0.0008472387502249059, + "loss": 1.3964, + "step": 3064 + }, + { + "epoch": 0.27808020322990384, + "grad_norm": 0.14238434741894723, + "learning_rate": 0.0008471330191545947, + "loss": 1.4438, + "step": 3065 + }, + { + "epoch": 0.2781709308655416, + "grad_norm": 0.14324377165795105, + "learning_rate": 0.0008470272581092962, + "loss": 1.3918, + "step": 3066 + }, + { + "epoch": 0.27826165850117945, + "grad_norm": 0.15552545172911583, + "learning_rate": 0.0008469214670981425, + "loss": 1.4651, + "step": 3067 + }, + { + "epoch": 0.2783523861368173, + "grad_norm": 0.13857282362326526, + "learning_rate": 0.0008468156461302692, + "loss": 1.446, + "step": 3068 + }, + { + "epoch": 0.27844311377245506, + "grad_norm": 0.15305869848133266, + "learning_rate": 0.0008467097952148138, + "loss": 1.4404, + "step": 3069 + }, + { + "epoch": 0.2785338414080929, + "grad_norm": 0.15892078459458026, + "learning_rate": 0.0008466039143609163, + "loss": 1.4406, + "step": 3070 + }, + { + "epoch": 0.27862456904373073, + "grad_norm": 0.16599916927004818, + "learning_rate": 0.0008464980035777199, + "loss": 1.4513, + "step": 3071 + }, + { + "epoch": 0.27871529667936856, + "grad_norm": 0.15307644215564434, + "learning_rate": 0.0008463920628743697, + "loss": 1.4278, + "step": 3072 + }, + { + "epoch": 0.27880602431500634, + "grad_norm": 0.14679447697793188, + "learning_rate": 0.0008462860922600139, + "loss": 1.4311, + "step": 3073 + }, + { + "epoch": 0.2788967519506442, + "grad_norm": 0.16078253939137974, + "learning_rate": 0.0008461800917438029, + "loss": 1.4535, + "step": 3074 + }, + { + "epoch": 0.278987479586282, + "grad_norm": 0.16009314658659932, + "learning_rate": 0.0008460740613348899, + "loss": 1.4259, + "step": 3075 + }, + { + "epoch": 0.2790782072219198, + "grad_norm": 0.14505637571770819, + "learning_rate": 0.0008459680010424309, + "loss": 1.4419, + "step": 3076 + }, + { + "epoch": 0.2791689348575576, + "grad_norm": 0.14674212032481923, + "learning_rate": 0.0008458619108755839, + "loss": 1.42, + "step": 3077 + }, + { + "epoch": 0.27925966249319545, + "grad_norm": 0.15918598029095732, + "learning_rate": 0.0008457557908435099, + "loss": 1.4588, + "step": 3078 + }, + { + "epoch": 0.2793503901288332, + "grad_norm": 0.15924731434905273, + "learning_rate": 0.0008456496409553724, + "loss": 1.4746, + "step": 3079 + }, + { + "epoch": 0.27944111776447106, + "grad_norm": 0.15079071873307345, + 
"learning_rate": 0.0008455434612203377, + "loss": 1.4566, + "step": 3080 + }, + { + "epoch": 0.2795318454001089, + "grad_norm": 0.1522577893743572, + "learning_rate": 0.0008454372516475739, + "loss": 1.4759, + "step": 3081 + }, + { + "epoch": 0.27962257303574667, + "grad_norm": 0.1487931896116652, + "learning_rate": 0.0008453310122462526, + "loss": 1.4234, + "step": 3082 + }, + { + "epoch": 0.2797133006713845, + "grad_norm": 0.1583080392201739, + "learning_rate": 0.0008452247430255476, + "loss": 1.4414, + "step": 3083 + }, + { + "epoch": 0.27980402830702233, + "grad_norm": 0.1449991181978033, + "learning_rate": 0.0008451184439946348, + "loss": 1.4545, + "step": 3084 + }, + { + "epoch": 0.2798947559426601, + "grad_norm": 0.1428083129899147, + "learning_rate": 0.000845012115162694, + "loss": 1.455, + "step": 3085 + }, + { + "epoch": 0.27998548357829794, + "grad_norm": 0.13890025853030422, + "learning_rate": 0.0008449057565389058, + "loss": 1.4541, + "step": 3086 + }, + { + "epoch": 0.2800762112139358, + "grad_norm": 0.13478219492157362, + "learning_rate": 0.000844799368132455, + "loss": 1.4275, + "step": 3087 + }, + { + "epoch": 0.28016693884957355, + "grad_norm": 0.14912983355482412, + "learning_rate": 0.0008446929499525277, + "loss": 1.4854, + "step": 3088 + }, + { + "epoch": 0.2802576664852114, + "grad_norm": 0.14702221752335512, + "learning_rate": 0.0008445865020083134, + "loss": 1.4131, + "step": 3089 + }, + { + "epoch": 0.2803483941208492, + "grad_norm": 0.15024674810245522, + "learning_rate": 0.0008444800243090039, + "loss": 1.427, + "step": 3090 + }, + { + "epoch": 0.28043912175648705, + "grad_norm": 0.14341523539834666, + "learning_rate": 0.0008443735168637936, + "loss": 1.4684, + "step": 3091 + }, + { + "epoch": 0.28052984939212483, + "grad_norm": 0.14322354431490772, + "learning_rate": 0.0008442669796818793, + "loss": 1.4388, + "step": 3092 + }, + { + "epoch": 0.28062057702776266, + "grad_norm": 0.14739222403406615, + "learning_rate": 0.0008441604127724607, + "loss": 1.4114, + "step": 3093 + }, + { + "epoch": 0.2807113046634005, + "grad_norm": 0.14711797804527332, + "learning_rate": 0.0008440538161447396, + "loss": 1.4521, + "step": 3094 + }, + { + "epoch": 0.2808020322990383, + "grad_norm": 0.14513351032447744, + "learning_rate": 0.0008439471898079207, + "loss": 1.4278, + "step": 3095 + }, + { + "epoch": 0.2808927599346761, + "grad_norm": 0.14448679364383618, + "learning_rate": 0.0008438405337712114, + "loss": 1.4144, + "step": 3096 + }, + { + "epoch": 0.28098348757031394, + "grad_norm": 0.14636660122639697, + "learning_rate": 0.0008437338480438214, + "loss": 1.4544, + "step": 3097 + }, + { + "epoch": 0.2810742152059517, + "grad_norm": 0.1374487486760958, + "learning_rate": 0.0008436271326349627, + "loss": 1.4379, + "step": 3098 + }, + { + "epoch": 0.28116494284158955, + "grad_norm": 0.15096127992307704, + "learning_rate": 0.0008435203875538506, + "loss": 1.4182, + "step": 3099 + }, + { + "epoch": 0.2812556704772274, + "grad_norm": 0.14458235209246206, + "learning_rate": 0.0008434136128097025, + "loss": 1.4292, + "step": 3100 + }, + { + "epoch": 0.28134639811286516, + "grad_norm": 0.14580855316434274, + "learning_rate": 0.0008433068084117382, + "loss": 1.4522, + "step": 3101 + }, + { + "epoch": 0.281437125748503, + "grad_norm": 0.14434168161310848, + "learning_rate": 0.0008431999743691804, + "loss": 1.4048, + "step": 3102 + }, + { + "epoch": 0.2815278533841408, + "grad_norm": 0.13615648967362606, + "learning_rate": 0.0008430931106912543, + "loss": 1.4261, + "step": 3103 + }, + { 
+ "epoch": 0.2816185810197786, + "grad_norm": 0.1443495389666788, + "learning_rate": 0.0008429862173871873, + "loss": 1.4574, + "step": 3104 + }, + { + "epoch": 0.28170930865541643, + "grad_norm": 0.13749379742143358, + "learning_rate": 0.0008428792944662103, + "loss": 1.4374, + "step": 3105 + }, + { + "epoch": 0.28180003629105427, + "grad_norm": 0.18623137777446572, + "learning_rate": 0.0008427723419375553, + "loss": 1.4103, + "step": 3106 + }, + { + "epoch": 0.28189076392669205, + "grad_norm": 0.14132760257572463, + "learning_rate": 0.0008426653598104583, + "loss": 1.4232, + "step": 3107 + }, + { + "epoch": 0.2819814915623299, + "grad_norm": 0.15083684384553503, + "learning_rate": 0.0008425583480941568, + "loss": 1.4343, + "step": 3108 + }, + { + "epoch": 0.2820722191979677, + "grad_norm": 0.14957260478417495, + "learning_rate": 0.0008424513067978916, + "loss": 1.4411, + "step": 3109 + }, + { + "epoch": 0.28216294683360554, + "grad_norm": 0.1519114676838335, + "learning_rate": 0.0008423442359309054, + "loss": 1.4163, + "step": 3110 + }, + { + "epoch": 0.2822536744692433, + "grad_norm": 0.1642024666136869, + "learning_rate": 0.000842237135502444, + "loss": 1.4654, + "step": 3111 + }, + { + "epoch": 0.28234440210488115, + "grad_norm": 0.1543869119822177, + "learning_rate": 0.0008421300055217558, + "loss": 1.4459, + "step": 3112 + }, + { + "epoch": 0.282435129740519, + "grad_norm": 0.1559231244229888, + "learning_rate": 0.0008420228459980908, + "loss": 1.4068, + "step": 3113 + }, + { + "epoch": 0.28252585737615676, + "grad_norm": 0.19193634539716797, + "learning_rate": 0.000841915656940703, + "loss": 1.426, + "step": 3114 + }, + { + "epoch": 0.2826165850117946, + "grad_norm": 0.15401808954235294, + "learning_rate": 0.0008418084383588476, + "loss": 1.4566, + "step": 3115 + }, + { + "epoch": 0.28270731264743243, + "grad_norm": 0.14372448477025243, + "learning_rate": 0.0008417011902617833, + "loss": 1.4499, + "step": 3116 + }, + { + "epoch": 0.2827980402830702, + "grad_norm": 0.15279052128802534, + "learning_rate": 0.0008415939126587706, + "loss": 1.3952, + "step": 3117 + }, + { + "epoch": 0.28288876791870804, + "grad_norm": 0.14881802577667438, + "learning_rate": 0.0008414866055590734, + "loss": 1.4521, + "step": 3118 + }, + { + "epoch": 0.2829794955543459, + "grad_norm": 0.15026415929076706, + "learning_rate": 0.0008413792689719575, + "loss": 1.4926, + "step": 3119 + }, + { + "epoch": 0.28307022318998365, + "grad_norm": 0.15643532396844018, + "learning_rate": 0.0008412719029066915, + "loss": 1.4303, + "step": 3120 + }, + { + "epoch": 0.2831609508256215, + "grad_norm": 0.15470771269228276, + "learning_rate": 0.0008411645073725461, + "loss": 1.4148, + "step": 3121 + }, + { + "epoch": 0.2832516784612593, + "grad_norm": 0.15255741194393047, + "learning_rate": 0.0008410570823787954, + "loss": 1.465, + "step": 3122 + }, + { + "epoch": 0.2833424060968971, + "grad_norm": 0.17520410004719092, + "learning_rate": 0.0008409496279347153, + "loss": 1.4501, + "step": 3123 + }, + { + "epoch": 0.2834331337325349, + "grad_norm": 0.15561682431643262, + "learning_rate": 0.0008408421440495847, + "loss": 1.4303, + "step": 3124 + }, + { + "epoch": 0.28352386136817276, + "grad_norm": 0.159442074505315, + "learning_rate": 0.0008407346307326846, + "loss": 1.4346, + "step": 3125 + }, + { + "epoch": 0.28361458900381054, + "grad_norm": 0.16723174878223326, + "learning_rate": 0.0008406270879932989, + "loss": 1.4517, + "step": 3126 + }, + { + "epoch": 0.28370531663944837, + "grad_norm": 0.14779624039639444, + 
"learning_rate": 0.0008405195158407142, + "loss": 1.4584, + "step": 3127 + }, + { + "epoch": 0.2837960442750862, + "grad_norm": 0.1679739869574797, + "learning_rate": 0.0008404119142842189, + "loss": 1.4367, + "step": 3128 + }, + { + "epoch": 0.28388677191072403, + "grad_norm": 0.1424200439177149, + "learning_rate": 0.0008403042833331045, + "loss": 1.4584, + "step": 3129 + }, + { + "epoch": 0.2839774995463618, + "grad_norm": 0.16550899043238562, + "learning_rate": 0.0008401966229966653, + "loss": 1.4168, + "step": 3130 + }, + { + "epoch": 0.28406822718199964, + "grad_norm": 0.18294760270962887, + "learning_rate": 0.0008400889332841974, + "loss": 1.4612, + "step": 3131 + }, + { + "epoch": 0.2841589548176375, + "grad_norm": 0.14311914591885855, + "learning_rate": 0.0008399812142050003, + "loss": 1.4231, + "step": 3132 + }, + { + "epoch": 0.28424968245327525, + "grad_norm": 0.14627274282757832, + "learning_rate": 0.0008398734657683749, + "loss": 1.5005, + "step": 3133 + }, + { + "epoch": 0.2843404100889131, + "grad_norm": 0.15339410482443708, + "learning_rate": 0.0008397656879836257, + "loss": 1.4657, + "step": 3134 + }, + { + "epoch": 0.2844311377245509, + "grad_norm": 0.14925842933215824, + "learning_rate": 0.0008396578808600594, + "loss": 1.4336, + "step": 3135 + }, + { + "epoch": 0.2845218653601887, + "grad_norm": 0.16267673486424733, + "learning_rate": 0.000839550044406985, + "loss": 1.4803, + "step": 3136 + }, + { + "epoch": 0.28461259299582653, + "grad_norm": 0.14830305278304368, + "learning_rate": 0.0008394421786337141, + "loss": 1.4131, + "step": 3137 + }, + { + "epoch": 0.28470332063146436, + "grad_norm": 0.1470507823886309, + "learning_rate": 0.0008393342835495612, + "loss": 1.41, + "step": 3138 + }, + { + "epoch": 0.28479404826710214, + "grad_norm": 0.14435137356706382, + "learning_rate": 0.0008392263591638428, + "loss": 1.4194, + "step": 3139 + }, + { + "epoch": 0.28488477590274, + "grad_norm": 0.15386026731978952, + "learning_rate": 0.0008391184054858784, + "loss": 1.4473, + "step": 3140 + }, + { + "epoch": 0.2849755035383778, + "grad_norm": 0.15744891266753025, + "learning_rate": 0.0008390104225249894, + "loss": 1.4143, + "step": 3141 + }, + { + "epoch": 0.2850662311740156, + "grad_norm": 0.16632546385264044, + "learning_rate": 0.0008389024102905008, + "loss": 1.4594, + "step": 3142 + }, + { + "epoch": 0.2851569588096534, + "grad_norm": 0.15255710813054166, + "learning_rate": 0.000838794368791739, + "loss": 1.4338, + "step": 3143 + }, + { + "epoch": 0.28524768644529125, + "grad_norm": 0.14919924780994193, + "learning_rate": 0.0008386862980380335, + "loss": 1.4431, + "step": 3144 + }, + { + "epoch": 0.285338414080929, + "grad_norm": 0.15264654916649698, + "learning_rate": 0.0008385781980387163, + "loss": 1.4206, + "step": 3145 + }, + { + "epoch": 0.28542914171656686, + "grad_norm": 0.15885267471134035, + "learning_rate": 0.0008384700688031217, + "loss": 1.4691, + "step": 3146 + }, + { + "epoch": 0.2855198693522047, + "grad_norm": 0.16569261803535554, + "learning_rate": 0.0008383619103405868, + "loss": 1.4199, + "step": 3147 + }, + { + "epoch": 0.2856105969878425, + "grad_norm": 0.14736644280979577, + "learning_rate": 0.0008382537226604512, + "loss": 1.4677, + "step": 3148 + }, + { + "epoch": 0.2857013246234803, + "grad_norm": 0.14747269461614693, + "learning_rate": 0.0008381455057720567, + "loss": 1.4403, + "step": 3149 + }, + { + "epoch": 0.28579205225911813, + "grad_norm": 0.15474696029236293, + "learning_rate": 0.0008380372596847479, + "loss": 1.4211, + "step": 3150 + }, + { 
+ "epoch": 0.28588277989475597, + "grad_norm": 0.15591512023323253, + "learning_rate": 0.000837928984407872, + "loss": 1.4142, + "step": 3151 + }, + { + "epoch": 0.28597350753039374, + "grad_norm": 0.14530248029683168, + "learning_rate": 0.0008378206799507784, + "loss": 1.4503, + "step": 3152 + }, + { + "epoch": 0.2860642351660316, + "grad_norm": 0.1424284644268107, + "learning_rate": 0.0008377123463228194, + "loss": 1.412, + "step": 3153 + }, + { + "epoch": 0.2861549628016694, + "grad_norm": 0.14749431068764982, + "learning_rate": 0.0008376039835333495, + "loss": 1.4342, + "step": 3154 + }, + { + "epoch": 0.2862456904373072, + "grad_norm": 0.14267983428469402, + "learning_rate": 0.0008374955915917258, + "loss": 1.4542, + "step": 3155 + }, + { + "epoch": 0.286336418072945, + "grad_norm": 0.16653520969348864, + "learning_rate": 0.0008373871705073079, + "loss": 1.4242, + "step": 3156 + }, + { + "epoch": 0.28642714570858285, + "grad_norm": 0.1330234671476022, + "learning_rate": 0.0008372787202894583, + "loss": 1.4321, + "step": 3157 + }, + { + "epoch": 0.28651787334422063, + "grad_norm": 0.14300235952837834, + "learning_rate": 0.0008371702409475415, + "loss": 1.4366, + "step": 3158 + }, + { + "epoch": 0.28660860097985846, + "grad_norm": 0.15054582781728157, + "learning_rate": 0.0008370617324909246, + "loss": 1.434, + "step": 3159 + }, + { + "epoch": 0.2866993286154963, + "grad_norm": 0.13879808016985581, + "learning_rate": 0.0008369531949289775, + "loss": 1.4167, + "step": 3160 + }, + { + "epoch": 0.2867900562511341, + "grad_norm": 0.1557074953857119, + "learning_rate": 0.0008368446282710723, + "loss": 1.4417, + "step": 3161 + }, + { + "epoch": 0.2868807838867719, + "grad_norm": 0.14920637642150095, + "learning_rate": 0.0008367360325265838, + "loss": 1.4399, + "step": 3162 + }, + { + "epoch": 0.28697151152240974, + "grad_norm": 0.15056434998079785, + "learning_rate": 0.0008366274077048894, + "loss": 1.448, + "step": 3163 + }, + { + "epoch": 0.2870622391580475, + "grad_norm": 0.13586951397864633, + "learning_rate": 0.0008365187538153685, + "loss": 1.4115, + "step": 3164 + }, + { + "epoch": 0.28715296679368535, + "grad_norm": 0.15535359933185325, + "learning_rate": 0.0008364100708674038, + "loss": 1.4151, + "step": 3165 + }, + { + "epoch": 0.2872436944293232, + "grad_norm": 0.1521865625499565, + "learning_rate": 0.0008363013588703798, + "loss": 1.4397, + "step": 3166 + }, + { + "epoch": 0.287334422064961, + "grad_norm": 0.1606380127633932, + "learning_rate": 0.000836192617833684, + "loss": 1.4551, + "step": 3167 + }, + { + "epoch": 0.2874251497005988, + "grad_norm": 0.1433568309849207, + "learning_rate": 0.0008360838477667058, + "loss": 1.4247, + "step": 3168 + }, + { + "epoch": 0.2875158773362366, + "grad_norm": 0.1621304866015349, + "learning_rate": 0.0008359750486788382, + "loss": 1.4507, + "step": 3169 + }, + { + "epoch": 0.28760660497187446, + "grad_norm": 0.15791678546005256, + "learning_rate": 0.0008358662205794754, + "loss": 1.4561, + "step": 3170 + }, + { + "epoch": 0.28769733260751224, + "grad_norm": 0.146880328698851, + "learning_rate": 0.0008357573634780152, + "loss": 1.441, + "step": 3171 + }, + { + "epoch": 0.28778806024315007, + "grad_norm": 0.1446541781652539, + "learning_rate": 0.0008356484773838569, + "loss": 1.4301, + "step": 3172 + }, + { + "epoch": 0.2878787878787879, + "grad_norm": 0.14657477723021256, + "learning_rate": 0.0008355395623064031, + "loss": 1.3734, + "step": 3173 + }, + { + "epoch": 0.2879695155144257, + "grad_norm": 0.154654553481021, + "learning_rate": 
0.0008354306182550589, + "loss": 1.4689, + "step": 3174 + }, + { + "epoch": 0.2880602431500635, + "grad_norm": 0.2092742196000887, + "learning_rate": 0.0008353216452392312, + "loss": 1.4321, + "step": 3175 + }, + { + "epoch": 0.28815097078570134, + "grad_norm": 0.14838198508497938, + "learning_rate": 0.0008352126432683299, + "loss": 1.483, + "step": 3176 + }, + { + "epoch": 0.2882416984213391, + "grad_norm": 0.1432803506114661, + "learning_rate": 0.0008351036123517677, + "loss": 1.4308, + "step": 3177 + }, + { + "epoch": 0.28833242605697695, + "grad_norm": 0.14641191752367028, + "learning_rate": 0.0008349945524989589, + "loss": 1.448, + "step": 3178 + }, + { + "epoch": 0.2884231536926148, + "grad_norm": 0.1682435094210582, + "learning_rate": 0.0008348854637193212, + "loss": 1.4343, + "step": 3179 + }, + { + "epoch": 0.28851388132825256, + "grad_norm": 0.14869183842539588, + "learning_rate": 0.0008347763460222746, + "loss": 1.4735, + "step": 3180 + }, + { + "epoch": 0.2886046089638904, + "grad_norm": 0.15112323648692885, + "learning_rate": 0.0008346671994172411, + "loss": 1.4777, + "step": 3181 + }, + { + "epoch": 0.28869533659952823, + "grad_norm": 0.1529621255608737, + "learning_rate": 0.0008345580239136455, + "loss": 1.416, + "step": 3182 + }, + { + "epoch": 0.288786064235166, + "grad_norm": 0.14838803841704323, + "learning_rate": 0.0008344488195209152, + "loss": 1.4338, + "step": 3183 + }, + { + "epoch": 0.28887679187080384, + "grad_norm": 0.17296093545708696, + "learning_rate": 0.0008343395862484799, + "loss": 1.4623, + "step": 3184 + }, + { + "epoch": 0.2889675195064417, + "grad_norm": 0.1433946161579498, + "learning_rate": 0.0008342303241057723, + "loss": 1.4515, + "step": 3185 + }, + { + "epoch": 0.28905824714207945, + "grad_norm": 0.14922685832052954, + "learning_rate": 0.000834121033102227, + "loss": 1.4213, + "step": 3186 + }, + { + "epoch": 0.2891489747777173, + "grad_norm": 0.14205187397564759, + "learning_rate": 0.0008340117132472811, + "loss": 1.4546, + "step": 3187 + }, + { + "epoch": 0.2892397024133551, + "grad_norm": 0.14663113311340262, + "learning_rate": 0.0008339023645503745, + "loss": 1.4652, + "step": 3188 + }, + { + "epoch": 0.28933043004899295, + "grad_norm": 0.1432831612597382, + "learning_rate": 0.0008337929870209495, + "loss": 1.3978, + "step": 3189 + }, + { + "epoch": 0.2894211576846307, + "grad_norm": 0.15544183874426973, + "learning_rate": 0.000833683580668451, + "loss": 1.4535, + "step": 3190 + }, + { + "epoch": 0.28951188532026856, + "grad_norm": 0.14740397940282207, + "learning_rate": 0.0008335741455023261, + "loss": 1.439, + "step": 3191 + }, + { + "epoch": 0.2896026129559064, + "grad_norm": 0.14475092001326814, + "learning_rate": 0.0008334646815320246, + "loss": 1.4555, + "step": 3192 + }, + { + "epoch": 0.28969334059154417, + "grad_norm": 0.14543815778628807, + "learning_rate": 0.0008333551887669987, + "loss": 1.443, + "step": 3193 + }, + { + "epoch": 0.289784068227182, + "grad_norm": 0.15327537918314926, + "learning_rate": 0.000833245667216703, + "loss": 1.4793, + "step": 3194 + }, + { + "epoch": 0.28987479586281983, + "grad_norm": 0.14295308842151971, + "learning_rate": 0.0008331361168905949, + "loss": 1.4306, + "step": 3195 + }, + { + "epoch": 0.2899655234984576, + "grad_norm": 0.14934491303367842, + "learning_rate": 0.000833026537798134, + "loss": 1.4299, + "step": 3196 + }, + { + "epoch": 0.29005625113409544, + "grad_norm": 0.1769867510216634, + "learning_rate": 0.0008329169299487824, + "loss": 1.4357, + "step": 3197 + }, + { + "epoch": 
0.2901469787697333, + "grad_norm": 0.14491125899607696, + "learning_rate": 0.000832807293352005, + "loss": 1.4268, + "step": 3198 + }, + { + "epoch": 0.29023770640537105, + "grad_norm": 0.14235681688988977, + "learning_rate": 0.0008326976280172687, + "loss": 1.4331, + "step": 3199 + }, + { + "epoch": 0.2903284340410089, + "grad_norm": 0.15010391206912901, + "learning_rate": 0.0008325879339540432, + "loss": 1.4708, + "step": 3200 + }, + { + "epoch": 0.2904191616766467, + "grad_norm": 0.1413409012546094, + "learning_rate": 0.0008324782111718005, + "loss": 1.4582, + "step": 3201 + }, + { + "epoch": 0.2905098893122845, + "grad_norm": 0.14547319862657052, + "learning_rate": 0.0008323684596800154, + "loss": 1.4183, + "step": 3202 + }, + { + "epoch": 0.29060061694792233, + "grad_norm": 0.14070834951880237, + "learning_rate": 0.0008322586794881646, + "loss": 1.4056, + "step": 3203 + }, + { + "epoch": 0.29069134458356016, + "grad_norm": 0.1626516207772062, + "learning_rate": 0.000832148870605728, + "loss": 1.4239, + "step": 3204 + }, + { + "epoch": 0.29078207221919794, + "grad_norm": 0.14641028223313654, + "learning_rate": 0.0008320390330421875, + "loss": 1.4158, + "step": 3205 + }, + { + "epoch": 0.2908727998548358, + "grad_norm": 0.1474851814437363, + "learning_rate": 0.0008319291668070274, + "loss": 1.4681, + "step": 3206 + }, + { + "epoch": 0.2909635274904736, + "grad_norm": 0.1424188786613724, + "learning_rate": 0.0008318192719097351, + "loss": 1.4243, + "step": 3207 + }, + { + "epoch": 0.29105425512611144, + "grad_norm": 0.14776223380652048, + "learning_rate": 0.0008317093483597995, + "loss": 1.4274, + "step": 3208 + }, + { + "epoch": 0.2911449827617492, + "grad_norm": 0.14195914837233367, + "learning_rate": 0.000831599396166713, + "loss": 1.4535, + "step": 3209 + }, + { + "epoch": 0.29123571039738705, + "grad_norm": 0.150311708894339, + "learning_rate": 0.0008314894153399697, + "loss": 1.4015, + "step": 3210 + }, + { + "epoch": 0.2913264380330249, + "grad_norm": 0.145145095820288, + "learning_rate": 0.0008313794058890664, + "loss": 1.4079, + "step": 3211 + }, + { + "epoch": 0.29141716566866266, + "grad_norm": 0.15424598030693265, + "learning_rate": 0.0008312693678235026, + "loss": 1.4295, + "step": 3212 + }, + { + "epoch": 0.2915078933043005, + "grad_norm": 0.14365461511629984, + "learning_rate": 0.0008311593011527802, + "loss": 1.4842, + "step": 3213 + }, + { + "epoch": 0.2915986209399383, + "grad_norm": 0.16116454434522418, + "learning_rate": 0.0008310492058864032, + "loss": 1.4369, + "step": 3214 + }, + { + "epoch": 0.2916893485755761, + "grad_norm": 0.17732993856106788, + "learning_rate": 0.0008309390820338784, + "loss": 1.4132, + "step": 3215 + }, + { + "epoch": 0.29178007621121393, + "grad_norm": 0.15051502780521228, + "learning_rate": 0.0008308289296047154, + "loss": 1.467, + "step": 3216 + }, + { + "epoch": 0.29187080384685177, + "grad_norm": 0.15468422047756958, + "learning_rate": 0.0008307187486084253, + "loss": 1.4657, + "step": 3217 + }, + { + "epoch": 0.29196153148248954, + "grad_norm": 0.14267502693622375, + "learning_rate": 0.0008306085390545226, + "loss": 1.4626, + "step": 3218 + }, + { + "epoch": 0.2920522591181274, + "grad_norm": 0.1418187434603406, + "learning_rate": 0.0008304983009525239, + "loss": 1.4693, + "step": 3219 + }, + { + "epoch": 0.2921429867537652, + "grad_norm": 0.14589316038150102, + "learning_rate": 0.0008303880343119481, + "loss": 1.4502, + "step": 3220 + }, + { + "epoch": 0.292233714389403, + "grad_norm": 0.14724698553495022, + "learning_rate": 
0.0008302777391423168, + "loss": 1.4673, + "step": 3221 + }, + { + "epoch": 0.2923244420250408, + "grad_norm": 0.1488646024358881, + "learning_rate": 0.0008301674154531542, + "loss": 1.416, + "step": 3222 + }, + { + "epoch": 0.29241516966067865, + "grad_norm": 0.15008807082383804, + "learning_rate": 0.0008300570632539865, + "loss": 1.4639, + "step": 3223 + }, + { + "epoch": 0.29250589729631643, + "grad_norm": 0.14099378838278387, + "learning_rate": 0.0008299466825543429, + "loss": 1.4284, + "step": 3224 + }, + { + "epoch": 0.29259662493195426, + "grad_norm": 0.14402650309278067, + "learning_rate": 0.0008298362733637544, + "loss": 1.3992, + "step": 3225 + }, + { + "epoch": 0.2926873525675921, + "grad_norm": 0.1459710911206952, + "learning_rate": 0.0008297258356917552, + "loss": 1.3917, + "step": 3226 + }, + { + "epoch": 0.29277808020322993, + "grad_norm": 0.14599783030514993, + "learning_rate": 0.0008296153695478816, + "loss": 1.4318, + "step": 3227 + }, + { + "epoch": 0.2928688078388677, + "grad_norm": 0.14221446300148488, + "learning_rate": 0.0008295048749416721, + "loss": 1.4356, + "step": 3228 + }, + { + "epoch": 0.29295953547450554, + "grad_norm": 0.14496893775077185, + "learning_rate": 0.0008293943518826681, + "loss": 1.4351, + "step": 3229 + }, + { + "epoch": 0.29305026311014337, + "grad_norm": 0.1458469621919066, + "learning_rate": 0.0008292838003804133, + "loss": 1.4724, + "step": 3230 + }, + { + "epoch": 0.29314099074578115, + "grad_norm": 0.15991585321080468, + "learning_rate": 0.0008291732204444537, + "loss": 1.4421, + "step": 3231 + }, + { + "epoch": 0.293231718381419, + "grad_norm": 0.14170267707637096, + "learning_rate": 0.0008290626120843382, + "loss": 1.4569, + "step": 3232 + }, + { + "epoch": 0.2933224460170568, + "grad_norm": 0.1446645890824495, + "learning_rate": 0.0008289519753096175, + "loss": 1.4381, + "step": 3233 + }, + { + "epoch": 0.2934131736526946, + "grad_norm": 0.1451809927835344, + "learning_rate": 0.0008288413101298453, + "loss": 1.4731, + "step": 3234 + }, + { + "epoch": 0.2935039012883324, + "grad_norm": 0.15457725553060017, + "learning_rate": 0.0008287306165545776, + "loss": 1.4535, + "step": 3235 + }, + { + "epoch": 0.29359462892397026, + "grad_norm": 0.14613924538337855, + "learning_rate": 0.0008286198945933725, + "loss": 1.414, + "step": 3236 + }, + { + "epoch": 0.29368535655960804, + "grad_norm": 0.1921105628419289, + "learning_rate": 0.0008285091442557913, + "loss": 1.4467, + "step": 3237 + }, + { + "epoch": 0.29377608419524587, + "grad_norm": 0.14627476119187768, + "learning_rate": 0.0008283983655513969, + "loss": 1.4201, + "step": 3238 + }, + { + "epoch": 0.2938668118308837, + "grad_norm": 0.15854448217129186, + "learning_rate": 0.0008282875584897553, + "loss": 1.4365, + "step": 3239 + }, + { + "epoch": 0.2939575394665215, + "grad_norm": 0.15205155832356226, + "learning_rate": 0.0008281767230804346, + "loss": 1.4212, + "step": 3240 + }, + { + "epoch": 0.2940482671021593, + "grad_norm": 0.1617772256479574, + "learning_rate": 0.0008280658593330056, + "loss": 1.4296, + "step": 3241 + }, + { + "epoch": 0.29413899473779714, + "grad_norm": 0.1455258013329566, + "learning_rate": 0.0008279549672570413, + "loss": 1.3883, + "step": 3242 + }, + { + "epoch": 0.2942297223734349, + "grad_norm": 0.15677925084906652, + "learning_rate": 0.0008278440468621172, + "loss": 1.4607, + "step": 3243 + }, + { + "epoch": 0.29432045000907275, + "grad_norm": 0.14583949740820218, + "learning_rate": 0.0008277330981578112, + "loss": 1.4324, + "step": 3244 + }, + { + "epoch": 
0.2944111776447106, + "grad_norm": 0.17315232568034983, + "learning_rate": 0.0008276221211537039, + "loss": 1.4249, + "step": 3245 + }, + { + "epoch": 0.2945019052803484, + "grad_norm": 0.1547706135654953, + "learning_rate": 0.0008275111158593783, + "loss": 1.4428, + "step": 3246 + }, + { + "epoch": 0.2945926329159862, + "grad_norm": 0.14866928282949152, + "learning_rate": 0.0008274000822844195, + "loss": 1.4286, + "step": 3247 + }, + { + "epoch": 0.29468336055162403, + "grad_norm": 0.16071941111264088, + "learning_rate": 0.0008272890204384152, + "loss": 1.441, + "step": 3248 + }, + { + "epoch": 0.29477408818726186, + "grad_norm": 0.14306154075329042, + "learning_rate": 0.0008271779303309561, + "loss": 1.4255, + "step": 3249 + }, + { + "epoch": 0.29486481582289964, + "grad_norm": 0.16344613330560132, + "learning_rate": 0.000827066811971634, + "loss": 1.395, + "step": 3250 + }, + { + "epoch": 0.2949555434585375, + "grad_norm": 0.13994015360040254, + "learning_rate": 0.0008269556653700449, + "loss": 1.4064, + "step": 3251 + }, + { + "epoch": 0.2950462710941753, + "grad_norm": 0.1358454125099044, + "learning_rate": 0.0008268444905357857, + "loss": 1.4258, + "step": 3252 + }, + { + "epoch": 0.2951369987298131, + "grad_norm": 0.15052188566929645, + "learning_rate": 0.0008267332874784568, + "loss": 1.4288, + "step": 3253 + }, + { + "epoch": 0.2952277263654509, + "grad_norm": 0.14351762563578907, + "learning_rate": 0.0008266220562076601, + "loss": 1.4293, + "step": 3254 + }, + { + "epoch": 0.29531845400108875, + "grad_norm": 0.1359235255167703, + "learning_rate": 0.0008265107967330008, + "loss": 1.452, + "step": 3255 + }, + { + "epoch": 0.2954091816367265, + "grad_norm": 0.16675530198218047, + "learning_rate": 0.0008263995090640861, + "loss": 1.4578, + "step": 3256 + }, + { + "epoch": 0.29549990927236436, + "grad_norm": 0.14930702546868885, + "learning_rate": 0.0008262881932105257, + "loss": 1.4128, + "step": 3257 + }, + { + "epoch": 0.2955906369080022, + "grad_norm": 0.1449381708486386, + "learning_rate": 0.0008261768491819317, + "loss": 1.443, + "step": 3258 + }, + { + "epoch": 0.29568136454363997, + "grad_norm": 0.14916114676864392, + "learning_rate": 0.0008260654769879186, + "loss": 1.4579, + "step": 3259 + }, + { + "epoch": 0.2957720921792778, + "grad_norm": 0.14020053697963805, + "learning_rate": 0.0008259540766381037, + "loss": 1.4525, + "step": 3260 + }, + { + "epoch": 0.29586281981491563, + "grad_norm": 0.14701703272773398, + "learning_rate": 0.0008258426481421062, + "loss": 1.4454, + "step": 3261 + }, + { + "epoch": 0.2959535474505534, + "grad_norm": 0.1452881910850172, + "learning_rate": 0.0008257311915095478, + "loss": 1.4656, + "step": 3262 + }, + { + "epoch": 0.29604427508619124, + "grad_norm": 0.1496298996615985, + "learning_rate": 0.0008256197067500533, + "loss": 1.4488, + "step": 3263 + }, + { + "epoch": 0.2961350027218291, + "grad_norm": 0.14102255538892017, + "learning_rate": 0.0008255081938732489, + "loss": 1.4391, + "step": 3264 + }, + { + "epoch": 0.2962257303574669, + "grad_norm": 0.1390611130493936, + "learning_rate": 0.000825396652888764, + "loss": 1.4362, + "step": 3265 + }, + { + "epoch": 0.2963164579931047, + "grad_norm": 0.14360310311919935, + "learning_rate": 0.0008252850838062304, + "loss": 1.4262, + "step": 3266 + }, + { + "epoch": 0.2964071856287425, + "grad_norm": 0.15867286473172074, + "learning_rate": 0.0008251734866352817, + "loss": 1.4205, + "step": 3267 + }, + { + "epoch": 0.29649791326438035, + "grad_norm": 0.1539733465997558, + "learning_rate": 
0.0008250618613855546, + "loss": 1.4444, + "step": 3268 + }, + { + "epoch": 0.29658864090001813, + "grad_norm": 0.15226748593905903, + "learning_rate": 0.0008249502080666878, + "loss": 1.4686, + "step": 3269 + }, + { + "epoch": 0.29667936853565596, + "grad_norm": 0.18936967734219096, + "learning_rate": 0.0008248385266883225, + "loss": 1.4567, + "step": 3270 + }, + { + "epoch": 0.2967700961712938, + "grad_norm": 0.14595091735024157, + "learning_rate": 0.0008247268172601028, + "loss": 1.4598, + "step": 3271 + }, + { + "epoch": 0.2968608238069316, + "grad_norm": 0.15154768169068863, + "learning_rate": 0.0008246150797916742, + "loss": 1.4804, + "step": 3272 + }, + { + "epoch": 0.2969515514425694, + "grad_norm": 0.15425375946100586, + "learning_rate": 0.000824503314292686, + "loss": 1.4322, + "step": 3273 + }, + { + "epoch": 0.29704227907820724, + "grad_norm": 0.1448328242945287, + "learning_rate": 0.0008243915207727886, + "loss": 1.4057, + "step": 3274 + }, + { + "epoch": 0.297133006713845, + "grad_norm": 0.15038676682834262, + "learning_rate": 0.0008242796992416358, + "loss": 1.4798, + "step": 3275 + }, + { + "epoch": 0.29722373434948285, + "grad_norm": 0.14638806764811285, + "learning_rate": 0.0008241678497088829, + "loss": 1.4252, + "step": 3276 + }, + { + "epoch": 0.2973144619851207, + "grad_norm": 0.15727961488982045, + "learning_rate": 0.0008240559721841884, + "loss": 1.3897, + "step": 3277 + }, + { + "epoch": 0.29740518962075846, + "grad_norm": 0.14180500603794768, + "learning_rate": 0.000823944066677213, + "loss": 1.4124, + "step": 3278 + }, + { + "epoch": 0.2974959172563963, + "grad_norm": 0.1483615976188332, + "learning_rate": 0.0008238321331976197, + "loss": 1.4541, + "step": 3279 + }, + { + "epoch": 0.2975866448920341, + "grad_norm": 0.1475756169605015, + "learning_rate": 0.000823720171755074, + "loss": 1.413, + "step": 3280 + }, + { + "epoch": 0.2976773725276719, + "grad_norm": 0.16619635087097243, + "learning_rate": 0.0008236081823592437, + "loss": 1.4175, + "step": 3281 + }, + { + "epoch": 0.29776810016330973, + "grad_norm": 0.14366991849793234, + "learning_rate": 0.0008234961650197993, + "loss": 1.4334, + "step": 3282 + }, + { + "epoch": 0.29785882779894757, + "grad_norm": 0.1534647604101838, + "learning_rate": 0.0008233841197464133, + "loss": 1.4341, + "step": 3283 + }, + { + "epoch": 0.2979495554345854, + "grad_norm": 0.14853573953761046, + "learning_rate": 0.0008232720465487608, + "loss": 1.4549, + "step": 3284 + }, + { + "epoch": 0.2980402830702232, + "grad_norm": 0.14654371040282665, + "learning_rate": 0.0008231599454365195, + "loss": 1.4505, + "step": 3285 + }, + { + "epoch": 0.298131010705861, + "grad_norm": 0.14736178867758906, + "learning_rate": 0.0008230478164193693, + "loss": 1.4174, + "step": 3286 + }, + { + "epoch": 0.29822173834149884, + "grad_norm": 0.1433707553277556, + "learning_rate": 0.0008229356595069925, + "loss": 1.4502, + "step": 3287 + }, + { + "epoch": 0.2983124659771366, + "grad_norm": 0.14792066642125234, + "learning_rate": 0.0008228234747090741, + "loss": 1.4271, + "step": 3288 + }, + { + "epoch": 0.29840319361277445, + "grad_norm": 0.13827661440246394, + "learning_rate": 0.0008227112620353007, + "loss": 1.4249, + "step": 3289 + }, + { + "epoch": 0.2984939212484123, + "grad_norm": 0.14532510209143412, + "learning_rate": 0.0008225990214953627, + "loss": 1.3927, + "step": 3290 + }, + { + "epoch": 0.29858464888405006, + "grad_norm": 0.15760421021601712, + "learning_rate": 0.0008224867530989513, + "loss": 1.4345, + "step": 3291 + }, + { + "epoch": 
0.2986753765196879, + "grad_norm": 0.14738626549184472, + "learning_rate": 0.0008223744568557614, + "loss": 1.4329, + "step": 3292 + }, + { + "epoch": 0.29876610415532573, + "grad_norm": 0.16460862060899287, + "learning_rate": 0.0008222621327754895, + "loss": 1.4463, + "step": 3293 + }, + { + "epoch": 0.2988568317909635, + "grad_norm": 0.14978339934499868, + "learning_rate": 0.0008221497808678352, + "loss": 1.4616, + "step": 3294 + }, + { + "epoch": 0.29894755942660134, + "grad_norm": 0.145478645279195, + "learning_rate": 0.0008220374011424997, + "loss": 1.4605, + "step": 3295 + }, + { + "epoch": 0.29903828706223917, + "grad_norm": 0.15157323412776805, + "learning_rate": 0.0008219249936091871, + "loss": 1.4545, + "step": 3296 + }, + { + "epoch": 0.29912901469787695, + "grad_norm": 0.14635799648729444, + "learning_rate": 0.0008218125582776039, + "loss": 1.4067, + "step": 3297 + }, + { + "epoch": 0.2992197423335148, + "grad_norm": 0.1405107806809972, + "learning_rate": 0.0008217000951574589, + "loss": 1.4311, + "step": 3298 + }, + { + "epoch": 0.2993104699691526, + "grad_norm": 0.14539733759590961, + "learning_rate": 0.0008215876042584633, + "loss": 1.3923, + "step": 3299 + }, + { + "epoch": 0.2994011976047904, + "grad_norm": 0.14539768390548258, + "learning_rate": 0.0008214750855903306, + "loss": 1.4732, + "step": 3300 + }, + { + "epoch": 0.2994919252404282, + "grad_norm": 0.13383193115920564, + "learning_rate": 0.0008213625391627767, + "loss": 1.44, + "step": 3301 + }, + { + "epoch": 0.29958265287606606, + "grad_norm": 0.15675209397377082, + "learning_rate": 0.0008212499649855204, + "loss": 1.4298, + "step": 3302 + }, + { + "epoch": 0.2996733805117039, + "grad_norm": 0.1565881769640635, + "learning_rate": 0.0008211373630682822, + "loss": 1.4205, + "step": 3303 + }, + { + "epoch": 0.29976410814734167, + "grad_norm": 0.15012901742560142, + "learning_rate": 0.0008210247334207854, + "loss": 1.4398, + "step": 3304 + }, + { + "epoch": 0.2998548357829795, + "grad_norm": 0.13136135467837476, + "learning_rate": 0.0008209120760527555, + "loss": 1.4198, + "step": 3305 + }, + { + "epoch": 0.29994556341861733, + "grad_norm": 0.14161904420118884, + "learning_rate": 0.0008207993909739207, + "loss": 1.4708, + "step": 3306 + }, + { + "epoch": 0.3000362910542551, + "grad_norm": 0.1384680380358639, + "learning_rate": 0.0008206866781940111, + "loss": 1.4268, + "step": 3307 + }, + { + "epoch": 0.30012701868989294, + "grad_norm": 0.15429632542627575, + "learning_rate": 0.0008205739377227595, + "loss": 1.415, + "step": 3308 + }, + { + "epoch": 0.3002177463255308, + "grad_norm": 0.15146876762137135, + "learning_rate": 0.0008204611695699013, + "loss": 1.4333, + "step": 3309 + }, + { + "epoch": 0.30030847396116855, + "grad_norm": 0.15418822785968153, + "learning_rate": 0.0008203483737451737, + "loss": 1.4518, + "step": 3310 + }, + { + "epoch": 0.3003992015968064, + "grad_norm": 0.14852956841018108, + "learning_rate": 0.000820235550258317, + "loss": 1.4359, + "step": 3311 + }, + { + "epoch": 0.3004899292324442, + "grad_norm": 0.14488631602357044, + "learning_rate": 0.0008201226991190731, + "loss": 1.4593, + "step": 3312 + }, + { + "epoch": 0.300580656868082, + "grad_norm": 0.148394324928119, + "learning_rate": 0.0008200098203371871, + "loss": 1.417, + "step": 3313 + }, + { + "epoch": 0.30067138450371983, + "grad_norm": 0.15687099849064456, + "learning_rate": 0.0008198969139224059, + "loss": 1.456, + "step": 3314 + }, + { + "epoch": 0.30076211213935766, + "grad_norm": 0.1403176501501323, + "learning_rate": 
0.0008197839798844791, + "loss": 1.4623, + "step": 3315 + }, + { + "epoch": 0.30085283977499544, + "grad_norm": 0.14239514110831908, + "learning_rate": 0.0008196710182331585, + "loss": 1.4621, + "step": 3316 + }, + { + "epoch": 0.3009435674106333, + "grad_norm": 0.14389906897811466, + "learning_rate": 0.0008195580289781983, + "loss": 1.4105, + "step": 3317 + }, + { + "epoch": 0.3010342950462711, + "grad_norm": 0.14099488891302794, + "learning_rate": 0.0008194450121293553, + "loss": 1.4995, + "step": 3318 + }, + { + "epoch": 0.3011250226819089, + "grad_norm": 0.15854125582709427, + "learning_rate": 0.0008193319676963884, + "loss": 1.3708, + "step": 3319 + }, + { + "epoch": 0.3012157503175467, + "grad_norm": 0.14480393460199717, + "learning_rate": 0.0008192188956890589, + "loss": 1.448, + "step": 3320 + }, + { + "epoch": 0.30130647795318455, + "grad_norm": 0.15472079669227062, + "learning_rate": 0.0008191057961171308, + "loss": 1.4248, + "step": 3321 + }, + { + "epoch": 0.3013972055888224, + "grad_norm": 0.15035585040461275, + "learning_rate": 0.0008189926689903702, + "loss": 1.3946, + "step": 3322 + }, + { + "epoch": 0.30148793322446016, + "grad_norm": 0.14717446016591978, + "learning_rate": 0.0008188795143185454, + "loss": 1.4536, + "step": 3323 + }, + { + "epoch": 0.301578660860098, + "grad_norm": 0.20314636196321026, + "learning_rate": 0.0008187663321114278, + "loss": 1.448, + "step": 3324 + }, + { + "epoch": 0.3016693884957358, + "grad_norm": 0.1589558328895963, + "learning_rate": 0.0008186531223787903, + "loss": 1.4157, + "step": 3325 + }, + { + "epoch": 0.3017601161313736, + "grad_norm": 0.15955798164989446, + "learning_rate": 0.0008185398851304089, + "loss": 1.4451, + "step": 3326 + }, + { + "epoch": 0.30185084376701143, + "grad_norm": 0.15699362233689762, + "learning_rate": 0.0008184266203760613, + "loss": 1.4489, + "step": 3327 + }, + { + "epoch": 0.30194157140264927, + "grad_norm": 0.15697021407228195, + "learning_rate": 0.0008183133281255281, + "loss": 1.4444, + "step": 3328 + }, + { + "epoch": 0.30203229903828704, + "grad_norm": 0.15642126452320368, + "learning_rate": 0.0008182000083885921, + "loss": 1.4609, + "step": 3329 + }, + { + "epoch": 0.3021230266739249, + "grad_norm": 0.19186944314476576, + "learning_rate": 0.0008180866611750386, + "loss": 1.4261, + "step": 3330 + }, + { + "epoch": 0.3022137543095627, + "grad_norm": 0.17705227287875652, + "learning_rate": 0.000817973286494655, + "loss": 1.4544, + "step": 3331 + }, + { + "epoch": 0.3023044819452005, + "grad_norm": 0.1483909165693367, + "learning_rate": 0.0008178598843572311, + "loss": 1.4358, + "step": 3332 + }, + { + "epoch": 0.3023952095808383, + "grad_norm": 0.16150122674983663, + "learning_rate": 0.0008177464547725595, + "loss": 1.4009, + "step": 3333 + }, + { + "epoch": 0.30248593721647615, + "grad_norm": 0.15882346604300832, + "learning_rate": 0.0008176329977504347, + "loss": 1.4188, + "step": 3334 + }, + { + "epoch": 0.30257666485211393, + "grad_norm": 0.16019061511148583, + "learning_rate": 0.0008175195133006537, + "loss": 1.4284, + "step": 3335 + }, + { + "epoch": 0.30266739248775176, + "grad_norm": 0.15763293706864528, + "learning_rate": 0.0008174060014330158, + "loss": 1.4225, + "step": 3336 + }, + { + "epoch": 0.3027581201233896, + "grad_norm": 0.15357129102146652, + "learning_rate": 0.000817292462157323, + "loss": 1.434, + "step": 3337 + }, + { + "epoch": 0.3028488477590274, + "grad_norm": 0.14844137136700503, + "learning_rate": 0.0008171788954833793, + "loss": 1.4482, + "step": 3338 + }, + { + "epoch": 
0.3029395753946652, + "grad_norm": 0.1430826650686725, + "learning_rate": 0.0008170653014209912, + "loss": 1.4305, + "step": 3339 + }, + { + "epoch": 0.30303030303030304, + "grad_norm": 0.1516150064317381, + "learning_rate": 0.0008169516799799676, + "loss": 1.4294, + "step": 3340 + }, + { + "epoch": 0.30312103066594087, + "grad_norm": 0.17808159619836625, + "learning_rate": 0.0008168380311701198, + "loss": 1.3926, + "step": 3341 + }, + { + "epoch": 0.30321175830157865, + "grad_norm": 0.1471744770686188, + "learning_rate": 0.0008167243550012613, + "loss": 1.4755, + "step": 3342 + }, + { + "epoch": 0.3033024859372165, + "grad_norm": 0.15387350344815745, + "learning_rate": 0.000816610651483208, + "loss": 1.4428, + "step": 3343 + }, + { + "epoch": 0.3033932135728543, + "grad_norm": 0.15324256697968663, + "learning_rate": 0.0008164969206257784, + "loss": 1.4453, + "step": 3344 + }, + { + "epoch": 0.3034839412084921, + "grad_norm": 0.14544386082682587, + "learning_rate": 0.000816383162438793, + "loss": 1.4312, + "step": 3345 + }, + { + "epoch": 0.3035746688441299, + "grad_norm": 0.1543264209203691, + "learning_rate": 0.0008162693769320749, + "loss": 1.4366, + "step": 3346 + }, + { + "epoch": 0.30366539647976776, + "grad_norm": 0.15580400644075468, + "learning_rate": 0.0008161555641154492, + "loss": 1.4426, + "step": 3347 + }, + { + "epoch": 0.30375612411540553, + "grad_norm": 0.16854878432433368, + "learning_rate": 0.0008160417239987443, + "loss": 1.4321, + "step": 3348 + }, + { + "epoch": 0.30384685175104337, + "grad_norm": 0.14090345661790665, + "learning_rate": 0.0008159278565917899, + "loss": 1.425, + "step": 3349 + }, + { + "epoch": 0.3039375793866812, + "grad_norm": 0.17339908560380127, + "learning_rate": 0.0008158139619044185, + "loss": 1.4357, + "step": 3350 + }, + { + "epoch": 0.304028307022319, + "grad_norm": 0.15576448437785156, + "learning_rate": 0.0008157000399464649, + "loss": 1.418, + "step": 3351 + }, + { + "epoch": 0.3041190346579568, + "grad_norm": 0.15929526747824135, + "learning_rate": 0.0008155860907277663, + "loss": 1.4093, + "step": 3352 + }, + { + "epoch": 0.30420976229359464, + "grad_norm": 0.14567310366633096, + "learning_rate": 0.0008154721142581622, + "loss": 1.433, + "step": 3353 + }, + { + "epoch": 0.3043004899292324, + "grad_norm": 0.29769872627887684, + "learning_rate": 0.0008153581105474947, + "loss": 1.4393, + "step": 3354 + }, + { + "epoch": 0.30439121756487025, + "grad_norm": 0.1475056190137249, + "learning_rate": 0.0008152440796056077, + "loss": 1.4478, + "step": 3355 + }, + { + "epoch": 0.3044819452005081, + "grad_norm": 0.1519829421540556, + "learning_rate": 0.0008151300214423482, + "loss": 1.4069, + "step": 3356 + }, + { + "epoch": 0.30457267283614586, + "grad_norm": 0.16923211022321802, + "learning_rate": 0.0008150159360675647, + "loss": 1.4386, + "step": 3357 + }, + { + "epoch": 0.3046634004717837, + "grad_norm": 0.15530456408470286, + "learning_rate": 0.0008149018234911088, + "loss": 1.4663, + "step": 3358 + }, + { + "epoch": 0.30475412810742153, + "grad_norm": 0.15552799487551527, + "learning_rate": 0.000814787683722834, + "loss": 1.4207, + "step": 3359 + }, + { + "epoch": 0.30484485574305936, + "grad_norm": 0.1497800606200206, + "learning_rate": 0.0008146735167725963, + "loss": 1.4283, + "step": 3360 + }, + { + "epoch": 0.30493558337869714, + "grad_norm": 0.15611262935028802, + "learning_rate": 0.0008145593226502541, + "loss": 1.4373, + "step": 3361 + }, + { + "epoch": 0.30502631101433497, + "grad_norm": 0.2006061489155502, + "learning_rate": 
0.0008144451013656679, + "loss": 1.4789, + "step": 3362 + }, + { + "epoch": 0.3051170386499728, + "grad_norm": 0.1513811647076985, + "learning_rate": 0.0008143308529287009, + "loss": 1.4536, + "step": 3363 + }, + { + "epoch": 0.3052077662856106, + "grad_norm": 0.142582685304983, + "learning_rate": 0.0008142165773492185, + "loss": 1.445, + "step": 3364 + }, + { + "epoch": 0.3052984939212484, + "grad_norm": 0.13785303323519105, + "learning_rate": 0.0008141022746370883, + "loss": 1.4536, + "step": 3365 + }, + { + "epoch": 0.30538922155688625, + "grad_norm": 0.16963886527802277, + "learning_rate": 0.0008139879448021805, + "loss": 1.4551, + "step": 3366 + }, + { + "epoch": 0.305479949192524, + "grad_norm": 0.15055993633781542, + "learning_rate": 0.0008138735878543672, + "loss": 1.4241, + "step": 3367 + }, + { + "epoch": 0.30557067682816186, + "grad_norm": 0.1440037703847169, + "learning_rate": 0.0008137592038035233, + "loss": 1.4375, + "step": 3368 + }, + { + "epoch": 0.3056614044637997, + "grad_norm": 0.14526469929612243, + "learning_rate": 0.0008136447926595261, + "loss": 1.4595, + "step": 3369 + }, + { + "epoch": 0.30575213209943747, + "grad_norm": 0.15048514342598637, + "learning_rate": 0.0008135303544322547, + "loss": 1.4482, + "step": 3370 + }, + { + "epoch": 0.3058428597350753, + "grad_norm": 0.14927568404821795, + "learning_rate": 0.000813415889131591, + "loss": 1.4677, + "step": 3371 + }, + { + "epoch": 0.30593358737071313, + "grad_norm": 0.14757081846720285, + "learning_rate": 0.0008133013967674193, + "loss": 1.4133, + "step": 3372 + }, + { + "epoch": 0.3060243150063509, + "grad_norm": 0.14281827660295587, + "learning_rate": 0.0008131868773496254, + "loss": 1.4363, + "step": 3373 + }, + { + "epoch": 0.30611504264198874, + "grad_norm": 0.15052621023595728, + "learning_rate": 0.0008130723308880987, + "loss": 1.4493, + "step": 3374 + }, + { + "epoch": 0.3062057702776266, + "grad_norm": 0.14594845283504906, + "learning_rate": 0.00081295775739273, + "loss": 1.3984, + "step": 3375 + }, + { + "epoch": 0.30629649791326435, + "grad_norm": 0.14505448534362608, + "learning_rate": 0.0008128431568734131, + "loss": 1.4546, + "step": 3376 + }, + { + "epoch": 0.3063872255489022, + "grad_norm": 0.1459501099998378, + "learning_rate": 0.0008127285293400432, + "loss": 1.457, + "step": 3377 + }, + { + "epoch": 0.30647795318454, + "grad_norm": 0.14681041709162676, + "learning_rate": 0.000812613874802519, + "loss": 1.4349, + "step": 3378 + }, + { + "epoch": 0.30656868082017785, + "grad_norm": 0.15106225294050388, + "learning_rate": 0.0008124991932707402, + "loss": 1.4315, + "step": 3379 + }, + { + "epoch": 0.30665940845581563, + "grad_norm": 0.17102427916641075, + "learning_rate": 0.0008123844847546102, + "loss": 1.4692, + "step": 3380 + }, + { + "epoch": 0.30675013609145346, + "grad_norm": 0.14452185255719405, + "learning_rate": 0.0008122697492640341, + "loss": 1.4582, + "step": 3381 + }, + { + "epoch": 0.3068408637270913, + "grad_norm": 0.15237813464690925, + "learning_rate": 0.0008121549868089189, + "loss": 1.4453, + "step": 3382 + }, + { + "epoch": 0.3069315913627291, + "grad_norm": 0.1572603172982079, + "learning_rate": 0.0008120401973991747, + "loss": 1.4556, + "step": 3383 + }, + { + "epoch": 0.3070223189983669, + "grad_norm": 0.14000553357673304, + "learning_rate": 0.0008119253810447134, + "loss": 1.3911, + "step": 3384 + }, + { + "epoch": 0.30711304663400474, + "grad_norm": 0.13560849787947113, + "learning_rate": 0.0008118105377554495, + "loss": 1.418, + "step": 3385 + }, + { + "epoch": 
0.3072037742696425, + "grad_norm": 0.15599939619204894, + "learning_rate": 0.0008116956675412997, + "loss": 1.417, + "step": 3386 + }, + { + "epoch": 0.30729450190528035, + "grad_norm": 0.14903466970503518, + "learning_rate": 0.0008115807704121831, + "loss": 1.432, + "step": 3387 + }, + { + "epoch": 0.3073852295409182, + "grad_norm": 0.14076709363187367, + "learning_rate": 0.0008114658463780209, + "loss": 1.4183, + "step": 3388 + }, + { + "epoch": 0.30747595717655596, + "grad_norm": 0.1819444014461679, + "learning_rate": 0.0008113508954487371, + "loss": 1.4257, + "step": 3389 + }, + { + "epoch": 0.3075666848121938, + "grad_norm": 0.1427390588571114, + "learning_rate": 0.0008112359176342576, + "loss": 1.4116, + "step": 3390 + }, + { + "epoch": 0.3076574124478316, + "grad_norm": 0.14453934913169053, + "learning_rate": 0.0008111209129445107, + "loss": 1.4271, + "step": 3391 + }, + { + "epoch": 0.3077481400834694, + "grad_norm": 0.14524641045310968, + "learning_rate": 0.0008110058813894272, + "loss": 1.4297, + "step": 3392 + }, + { + "epoch": 0.30783886771910723, + "grad_norm": 0.16319569664645772, + "learning_rate": 0.0008108908229789399, + "loss": 1.3961, + "step": 3393 + }, + { + "epoch": 0.30792959535474507, + "grad_norm": 0.19216199296501993, + "learning_rate": 0.0008107757377229842, + "loss": 1.4577, + "step": 3394 + }, + { + "epoch": 0.30802032299038284, + "grad_norm": 0.14996391118923252, + "learning_rate": 0.0008106606256314978, + "loss": 1.474, + "step": 3395 + }, + { + "epoch": 0.3081110506260207, + "grad_norm": 0.15338757781462564, + "learning_rate": 0.0008105454867144206, + "loss": 1.4224, + "step": 3396 + }, + { + "epoch": 0.3082017782616585, + "grad_norm": 0.13917677463431816, + "learning_rate": 0.0008104303209816948, + "loss": 1.4546, + "step": 3397 + }, + { + "epoch": 0.30829250589729634, + "grad_norm": 0.15306595679018942, + "learning_rate": 0.0008103151284432651, + "loss": 1.4374, + "step": 3398 + }, + { + "epoch": 0.3083832335329341, + "grad_norm": 0.1431755826579186, + "learning_rate": 0.0008101999091090781, + "loss": 1.4299, + "step": 3399 + }, + { + "epoch": 0.30847396116857195, + "grad_norm": 0.14391432669804988, + "learning_rate": 0.0008100846629890834, + "loss": 1.4492, + "step": 3400 + }, + { + "epoch": 0.3085646888042098, + "grad_norm": 0.13876002172559576, + "learning_rate": 0.0008099693900932326, + "loss": 1.4399, + "step": 3401 + }, + { + "epoch": 0.30865541643984756, + "grad_norm": 0.14124109971871274, + "learning_rate": 0.0008098540904314789, + "loss": 1.4421, + "step": 3402 + }, + { + "epoch": 0.3087461440754854, + "grad_norm": 0.16452034357535925, + "learning_rate": 0.000809738764013779, + "loss": 1.4456, + "step": 3403 + }, + { + "epoch": 0.30883687171112323, + "grad_norm": 0.14420871878125857, + "learning_rate": 0.0008096234108500911, + "loss": 1.4153, + "step": 3404 + }, + { + "epoch": 0.308927599346761, + "grad_norm": 0.13257024567276865, + "learning_rate": 0.0008095080309503762, + "loss": 1.4333, + "step": 3405 + }, + { + "epoch": 0.30901832698239884, + "grad_norm": 0.17005645088486568, + "learning_rate": 0.000809392624324597, + "loss": 1.4224, + "step": 3406 + }, + { + "epoch": 0.30910905461803667, + "grad_norm": 0.15064587295490947, + "learning_rate": 0.0008092771909827193, + "loss": 1.4564, + "step": 3407 + }, + { + "epoch": 0.30919978225367445, + "grad_norm": 0.15018059970426093, + "learning_rate": 0.0008091617309347103, + "loss": 1.4417, + "step": 3408 + }, + { + "epoch": 0.3092905098893123, + "grad_norm": 0.13825140672585137, + "learning_rate": 
0.0008090462441905405, + "loss": 1.4177, + "step": 3409 + }, + { + "epoch": 0.3093812375249501, + "grad_norm": 0.1295673018493993, + "learning_rate": 0.0008089307307601819, + "loss": 1.4567, + "step": 3410 + }, + { + "epoch": 0.3094719651605879, + "grad_norm": 0.14333696353961914, + "learning_rate": 0.0008088151906536092, + "loss": 1.4722, + "step": 3411 + }, + { + "epoch": 0.3095626927962257, + "grad_norm": 0.1373455094149264, + "learning_rate": 0.0008086996238807991, + "loss": 1.4319, + "step": 3412 + }, + { + "epoch": 0.30965342043186356, + "grad_norm": 0.1566991306261173, + "learning_rate": 0.0008085840304517311, + "loss": 1.4611, + "step": 3413 + }, + { + "epoch": 0.30974414806750133, + "grad_norm": 0.1309205864597577, + "learning_rate": 0.0008084684103763866, + "loss": 1.4127, + "step": 3414 + }, + { + "epoch": 0.30983487570313917, + "grad_norm": 0.13282616135990244, + "learning_rate": 0.0008083527636647494, + "loss": 1.4408, + "step": 3415 + }, + { + "epoch": 0.309925603338777, + "grad_norm": 0.13228749696713915, + "learning_rate": 0.0008082370903268057, + "loss": 1.4268, + "step": 3416 + }, + { + "epoch": 0.31001633097441483, + "grad_norm": 0.13871704751621602, + "learning_rate": 0.0008081213903725437, + "loss": 1.4378, + "step": 3417 + }, + { + "epoch": 0.3101070586100526, + "grad_norm": 0.13208214404664134, + "learning_rate": 0.0008080056638119542, + "loss": 1.4917, + "step": 3418 + }, + { + "epoch": 0.31019778624569044, + "grad_norm": 0.16777883861675005, + "learning_rate": 0.0008078899106550303, + "loss": 1.4518, + "step": 3419 + }, + { + "epoch": 0.3102885138813283, + "grad_norm": 0.13986355021930355, + "learning_rate": 0.0008077741309117674, + "loss": 1.4617, + "step": 3420 + }, + { + "epoch": 0.31037924151696605, + "grad_norm": 0.15447896301544323, + "learning_rate": 0.0008076583245921627, + "loss": 1.4527, + "step": 3421 + }, + { + "epoch": 0.3104699691526039, + "grad_norm": 0.13639993816059065, + "learning_rate": 0.0008075424917062164, + "loss": 1.4125, + "step": 3422 + }, + { + "epoch": 0.3105606967882417, + "grad_norm": 0.14571923631314093, + "learning_rate": 0.0008074266322639305, + "loss": 1.4504, + "step": 3423 + }, + { + "epoch": 0.3106514244238795, + "grad_norm": 0.14041492329768293, + "learning_rate": 0.0008073107462753098, + "loss": 1.4248, + "step": 3424 + }, + { + "epoch": 0.31074215205951733, + "grad_norm": 0.13443231577700387, + "learning_rate": 0.0008071948337503608, + "loss": 1.483, + "step": 3425 + }, + { + "epoch": 0.31083287969515516, + "grad_norm": 0.15041358525309362, + "learning_rate": 0.0008070788946990926, + "loss": 1.4475, + "step": 3426 + }, + { + "epoch": 0.31092360733079294, + "grad_norm": 0.13748004902311772, + "learning_rate": 0.0008069629291315166, + "loss": 1.4546, + "step": 3427 + }, + { + "epoch": 0.31101433496643077, + "grad_norm": 0.1476613527287906, + "learning_rate": 0.0008068469370576464, + "loss": 1.4235, + "step": 3428 + }, + { + "epoch": 0.3111050626020686, + "grad_norm": 0.15003193107698592, + "learning_rate": 0.0008067309184874979, + "loss": 1.4389, + "step": 3429 + }, + { + "epoch": 0.3111957902377064, + "grad_norm": 0.1470217268042705, + "learning_rate": 0.0008066148734310894, + "loss": 1.4564, + "step": 3430 + }, + { + "epoch": 0.3112865178733442, + "grad_norm": 0.1550081076093942, + "learning_rate": 0.0008064988018984415, + "loss": 1.4177, + "step": 3431 + }, + { + "epoch": 0.31137724550898205, + "grad_norm": 0.13734929808683216, + "learning_rate": 0.0008063827038995768, + "loss": 1.4671, + "step": 3432 + }, + { + "epoch": 
0.3114679731446198, + "grad_norm": 0.1458632928394415, + "learning_rate": 0.0008062665794445205, + "loss": 1.4818, + "step": 3433 + }, + { + "epoch": 0.31155870078025766, + "grad_norm": 0.1481505068155232, + "learning_rate": 0.0008061504285432999, + "loss": 1.4449, + "step": 3434 + }, + { + "epoch": 0.3116494284158955, + "grad_norm": 0.1464435005234262, + "learning_rate": 0.0008060342512059447, + "loss": 1.4783, + "step": 3435 + }, + { + "epoch": 0.3117401560515333, + "grad_norm": 0.1513482344953118, + "learning_rate": 0.0008059180474424868, + "loss": 1.389, + "step": 3436 + }, + { + "epoch": 0.3118308836871711, + "grad_norm": 0.13282689866985337, + "learning_rate": 0.0008058018172629604, + "loss": 1.3974, + "step": 3437 + }, + { + "epoch": 0.31192161132280893, + "grad_norm": 0.13404360888665987, + "learning_rate": 0.0008056855606774021, + "loss": 1.4171, + "step": 3438 + }, + { + "epoch": 0.31201233895844677, + "grad_norm": 0.1468481967819195, + "learning_rate": 0.0008055692776958503, + "loss": 1.3968, + "step": 3439 + }, + { + "epoch": 0.31210306659408454, + "grad_norm": 0.1487985770789209, + "learning_rate": 0.0008054529683283467, + "loss": 1.455, + "step": 3440 + }, + { + "epoch": 0.3121937942297224, + "grad_norm": 0.15499706985685638, + "learning_rate": 0.0008053366325849339, + "loss": 1.4356, + "step": 3441 + }, + { + "epoch": 0.3122845218653602, + "grad_norm": 0.14787125464558612, + "learning_rate": 0.0008052202704756582, + "loss": 1.4609, + "step": 3442 + }, + { + "epoch": 0.312375249500998, + "grad_norm": 0.17808984926128507, + "learning_rate": 0.0008051038820105671, + "loss": 1.4482, + "step": 3443 + }, + { + "epoch": 0.3124659771366358, + "grad_norm": 0.14561769533490346, + "learning_rate": 0.0008049874671997106, + "loss": 1.4763, + "step": 3444 + }, + { + "epoch": 0.31255670477227365, + "grad_norm": 0.16664136948366393, + "learning_rate": 0.0008048710260531416, + "loss": 1.4282, + "step": 3445 + }, + { + "epoch": 0.31264743240791143, + "grad_norm": 0.15539806608975906, + "learning_rate": 0.0008047545585809144, + "loss": 1.4175, + "step": 3446 + }, + { + "epoch": 0.31273816004354926, + "grad_norm": 0.16264377652765652, + "learning_rate": 0.0008046380647930863, + "loss": 1.4562, + "step": 3447 + }, + { + "epoch": 0.3128288876791871, + "grad_norm": 0.15608030487968197, + "learning_rate": 0.0008045215446997163, + "loss": 1.4628, + "step": 3448 + }, + { + "epoch": 0.3129196153148249, + "grad_norm": 0.15769441374318463, + "learning_rate": 0.0008044049983108661, + "loss": 1.4354, + "step": 3449 + }, + { + "epoch": 0.3130103429504627, + "grad_norm": 0.1441259125370345, + "learning_rate": 0.0008042884256365994, + "loss": 1.4404, + "step": 3450 + }, + { + "epoch": 0.31310107058610054, + "grad_norm": 0.1508066313313853, + "learning_rate": 0.0008041718266869822, + "loss": 1.4422, + "step": 3451 + }, + { + "epoch": 0.3131917982217383, + "grad_norm": 0.13957398028302348, + "learning_rate": 0.0008040552014720831, + "loss": 1.4063, + "step": 3452 + }, + { + "epoch": 0.31328252585737615, + "grad_norm": 0.1504367596693814, + "learning_rate": 0.0008039385500019725, + "loss": 1.4796, + "step": 3453 + }, + { + "epoch": 0.313373253493014, + "grad_norm": 0.13536930557669746, + "learning_rate": 0.0008038218722867232, + "loss": 1.4445, + "step": 3454 + }, + { + "epoch": 0.3134639811286518, + "grad_norm": 0.15463206288243736, + "learning_rate": 0.0008037051683364106, + "loss": 1.4639, + "step": 3455 + }, + { + "epoch": 0.3135547087642896, + "grad_norm": 0.14766790980504235, + "learning_rate": 
0.0008035884381611118, + "loss": 1.4324, + "step": 3456 + }, + { + "epoch": 0.3136454363999274, + "grad_norm": 0.13806600411395478, + "learning_rate": 0.0008034716817709067, + "loss": 1.4563, + "step": 3457 + }, + { + "epoch": 0.31373616403556526, + "grad_norm": 0.13638515556549372, + "learning_rate": 0.0008033548991758772, + "loss": 1.4011, + "step": 3458 + }, + { + "epoch": 0.31382689167120303, + "grad_norm": 0.14892223083499387, + "learning_rate": 0.0008032380903861074, + "loss": 1.4588, + "step": 3459 + }, + { + "epoch": 0.31391761930684087, + "grad_norm": 0.14081904888561347, + "learning_rate": 0.0008031212554116838, + "loss": 1.4486, + "step": 3460 + }, + { + "epoch": 0.3140083469424787, + "grad_norm": 0.13939581147112648, + "learning_rate": 0.0008030043942626951, + "loss": 1.4244, + "step": 3461 + }, + { + "epoch": 0.3140990745781165, + "grad_norm": 0.14607026687192468, + "learning_rate": 0.0008028875069492323, + "loss": 1.4436, + "step": 3462 + }, + { + "epoch": 0.3141898022137543, + "grad_norm": 0.14213132876777312, + "learning_rate": 0.0008027705934813887, + "loss": 1.4338, + "step": 3463 + }, + { + "epoch": 0.31428052984939214, + "grad_norm": 0.14025562109599768, + "learning_rate": 0.0008026536538692596, + "loss": 1.4447, + "step": 3464 + }, + { + "epoch": 0.3143712574850299, + "grad_norm": 0.13946984397350162, + "learning_rate": 0.000802536688122943, + "loss": 1.4336, + "step": 3465 + }, + { + "epoch": 0.31446198512066775, + "grad_norm": 0.13888971979864104, + "learning_rate": 0.0008024196962525388, + "loss": 1.445, + "step": 3466 + }, + { + "epoch": 0.3145527127563056, + "grad_norm": 0.1329234111412585, + "learning_rate": 0.0008023026782681492, + "loss": 1.4184, + "step": 3467 + }, + { + "epoch": 0.31464344039194336, + "grad_norm": 0.1377578429587862, + "learning_rate": 0.0008021856341798788, + "loss": 1.4429, + "step": 3468 + }, + { + "epoch": 0.3147341680275812, + "grad_norm": 0.15225303006397725, + "learning_rate": 0.0008020685639978342, + "loss": 1.4432, + "step": 3469 + }, + { + "epoch": 0.31482489566321903, + "grad_norm": 0.1435859911918125, + "learning_rate": 0.0008019514677321249, + "loss": 1.4121, + "step": 3470 + }, + { + "epoch": 0.3149156232988568, + "grad_norm": 0.14411354878481558, + "learning_rate": 0.0008018343453928616, + "loss": 1.4784, + "step": 3471 + }, + { + "epoch": 0.31500635093449464, + "grad_norm": 0.15539536552294037, + "learning_rate": 0.0008017171969901582, + "loss": 1.4372, + "step": 3472 + }, + { + "epoch": 0.31509707857013247, + "grad_norm": 0.1882037745400522, + "learning_rate": 0.0008016000225341302, + "loss": 1.4283, + "step": 3473 + }, + { + "epoch": 0.3151878062057703, + "grad_norm": 0.18391349767028387, + "learning_rate": 0.0008014828220348959, + "loss": 1.4238, + "step": 3474 + }, + { + "epoch": 0.3152785338414081, + "grad_norm": 0.14825875415530185, + "learning_rate": 0.0008013655955025757, + "loss": 1.4343, + "step": 3475 + }, + { + "epoch": 0.3153692614770459, + "grad_norm": 0.15380732019375112, + "learning_rate": 0.0008012483429472916, + "loss": 1.4472, + "step": 3476 + }, + { + "epoch": 0.31545998911268375, + "grad_norm": 0.1557625516470415, + "learning_rate": 0.0008011310643791689, + "loss": 1.4333, + "step": 3477 + }, + { + "epoch": 0.3155507167483215, + "grad_norm": 0.14907293227590238, + "learning_rate": 0.0008010137598083344, + "loss": 1.4315, + "step": 3478 + }, + { + "epoch": 0.31564144438395936, + "grad_norm": 0.15014697298481755, + "learning_rate": 0.0008008964292449172, + "loss": 1.4376, + "step": 3479 + }, + { + "epoch": 
0.3157321720195972, + "grad_norm": 0.14475761479870772, + "learning_rate": 0.0008007790726990492, + "loss": 1.4384, + "step": 3480 + }, + { + "epoch": 0.31582289965523497, + "grad_norm": 0.15208671331840629, + "learning_rate": 0.0008006616901808638, + "loss": 1.4351, + "step": 3481 + }, + { + "epoch": 0.3159136272908728, + "grad_norm": 0.1585923758196991, + "learning_rate": 0.0008005442817004972, + "loss": 1.4845, + "step": 3482 + }, + { + "epoch": 0.31600435492651063, + "grad_norm": 0.14734356443960056, + "learning_rate": 0.0008004268472680875, + "loss": 1.4262, + "step": 3483 + }, + { + "epoch": 0.3160950825621484, + "grad_norm": 0.1696091080211757, + "learning_rate": 0.0008003093868937754, + "loss": 1.4264, + "step": 3484 + }, + { + "epoch": 0.31618581019778624, + "grad_norm": 0.17625057500564223, + "learning_rate": 0.0008001919005877033, + "loss": 1.4528, + "step": 3485 + }, + { + "epoch": 0.3162765378334241, + "grad_norm": 0.14901991328434286, + "learning_rate": 0.0008000743883600166, + "loss": 1.4147, + "step": 3486 + }, + { + "epoch": 0.31636726546906185, + "grad_norm": 0.1373969593615424, + "learning_rate": 0.000799956850220862, + "loss": 1.431, + "step": 3487 + }, + { + "epoch": 0.3164579931046997, + "grad_norm": 0.13957935103849092, + "learning_rate": 0.0007998392861803892, + "loss": 1.4817, + "step": 3488 + }, + { + "epoch": 0.3165487207403375, + "grad_norm": 0.14641265818886876, + "learning_rate": 0.0007997216962487499, + "loss": 1.4403, + "step": 3489 + }, + { + "epoch": 0.3166394483759753, + "grad_norm": 0.14582181607274, + "learning_rate": 0.000799604080436098, + "loss": 1.4076, + "step": 3490 + }, + { + "epoch": 0.31673017601161313, + "grad_norm": 0.15674578928302071, + "learning_rate": 0.0007994864387525896, + "loss": 1.4216, + "step": 3491 + }, + { + "epoch": 0.31682090364725096, + "grad_norm": 0.13939403689721644, + "learning_rate": 0.0007993687712083828, + "loss": 1.4326, + "step": 3492 + }, + { + "epoch": 0.3169116312828888, + "grad_norm": 0.15170318825967802, + "learning_rate": 0.0007992510778136388, + "loss": 1.4297, + "step": 3493 + }, + { + "epoch": 0.31700235891852657, + "grad_norm": 0.15664289452577482, + "learning_rate": 0.0007991333585785201, + "loss": 1.4173, + "step": 3494 + }, + { + "epoch": 0.3170930865541644, + "grad_norm": 0.15117027392003238, + "learning_rate": 0.0007990156135131917, + "loss": 1.4538, + "step": 3495 + }, + { + "epoch": 0.31718381418980224, + "grad_norm": 0.14322037420815487, + "learning_rate": 0.0007988978426278209, + "loss": 1.4557, + "step": 3496 + }, + { + "epoch": 0.31727454182544, + "grad_norm": 0.14518016581092127, + "learning_rate": 0.0007987800459325775, + "loss": 1.4338, + "step": 3497 + }, + { + "epoch": 0.31736526946107785, + "grad_norm": 0.1398877535775253, + "learning_rate": 0.0007986622234376332, + "loss": 1.4413, + "step": 3498 + }, + { + "epoch": 0.3174559970967157, + "grad_norm": 0.14948929264889307, + "learning_rate": 0.0007985443751531616, + "loss": 1.4589, + "step": 3499 + }, + { + "epoch": 0.31754672473235346, + "grad_norm": 0.15023823503549047, + "learning_rate": 0.0007984265010893395, + "loss": 1.4536, + "step": 3500 + }, + { + "epoch": 0.3176374523679913, + "grad_norm": 0.13840576103697863, + "learning_rate": 0.0007983086012563449, + "loss": 1.4257, + "step": 3501 + }, + { + "epoch": 0.3177281800036291, + "grad_norm": 0.13782278508389756, + "learning_rate": 0.0007981906756643586, + "loss": 1.4388, + "step": 3502 + }, + { + "epoch": 0.3178189076392669, + "grad_norm": 0.14651384393998132, + "learning_rate": 
0.0007980727243235635, + "loss": 1.4297, + "step": 3503 + }, + { + "epoch": 0.31790963527490473, + "grad_norm": 0.1376124201085183, + "learning_rate": 0.0007979547472441447, + "loss": 1.4481, + "step": 3504 + }, + { + "epoch": 0.31800036291054257, + "grad_norm": 0.13650054967169564, + "learning_rate": 0.0007978367444362897, + "loss": 1.4771, + "step": 3505 + }, + { + "epoch": 0.31809109054618034, + "grad_norm": 0.13112440247979532, + "learning_rate": 0.0007977187159101878, + "loss": 1.46, + "step": 3506 + }, + { + "epoch": 0.3181818181818182, + "grad_norm": 0.13100869330095322, + "learning_rate": 0.000797600661676031, + "loss": 1.4104, + "step": 3507 + }, + { + "epoch": 0.318272545817456, + "grad_norm": 0.1426342939007597, + "learning_rate": 0.000797482581744013, + "loss": 1.4252, + "step": 3508 + }, + { + "epoch": 0.3183632734530938, + "grad_norm": 0.137353566473243, + "learning_rate": 0.0007973644761243303, + "loss": 1.4346, + "step": 3509 + }, + { + "epoch": 0.3184540010887316, + "grad_norm": 0.14040950293258214, + "learning_rate": 0.0007972463448271815, + "loss": 1.4148, + "step": 3510 + }, + { + "epoch": 0.31854472872436945, + "grad_norm": 0.13543117069091576, + "learning_rate": 0.0007971281878627667, + "loss": 1.4362, + "step": 3511 + }, + { + "epoch": 0.3186354563600073, + "grad_norm": 0.1352000514055974, + "learning_rate": 0.0007970100052412893, + "loss": 1.4241, + "step": 3512 + }, + { + "epoch": 0.31872618399564506, + "grad_norm": 0.13817386800298603, + "learning_rate": 0.0007968917969729541, + "loss": 1.4423, + "step": 3513 + }, + { + "epoch": 0.3188169116312829, + "grad_norm": 0.13517670024584255, + "learning_rate": 0.0007967735630679684, + "loss": 1.4261, + "step": 3514 + }, + { + "epoch": 0.31890763926692073, + "grad_norm": 0.14354246985437286, + "learning_rate": 0.0007966553035365419, + "loss": 1.4373, + "step": 3515 + }, + { + "epoch": 0.3189983669025585, + "grad_norm": 0.1404575870549827, + "learning_rate": 0.0007965370183888863, + "loss": 1.4228, + "step": 3516 + }, + { + "epoch": 0.31908909453819634, + "grad_norm": 0.13891043347123908, + "learning_rate": 0.0007964187076352152, + "loss": 1.4524, + "step": 3517 + }, + { + "epoch": 0.31917982217383417, + "grad_norm": 0.15465074990531552, + "learning_rate": 0.0007963003712857453, + "loss": 1.43, + "step": 3518 + }, + { + "epoch": 0.31927054980947195, + "grad_norm": 0.1335216162901815, + "learning_rate": 0.0007961820093506944, + "loss": 1.4416, + "step": 3519 + }, + { + "epoch": 0.3193612774451098, + "grad_norm": 0.13749840588794957, + "learning_rate": 0.0007960636218402834, + "loss": 1.462, + "step": 3520 + }, + { + "epoch": 0.3194520050807476, + "grad_norm": 0.15560691429022103, + "learning_rate": 0.0007959452087647352, + "loss": 1.4709, + "step": 3521 + }, + { + "epoch": 0.3195427327163854, + "grad_norm": 0.14280346993521642, + "learning_rate": 0.0007958267701342744, + "loss": 1.3979, + "step": 3522 + }, + { + "epoch": 0.3196334603520232, + "grad_norm": 0.14185102743654004, + "learning_rate": 0.0007957083059591285, + "loss": 1.3944, + "step": 3523 + }, + { + "epoch": 0.31972418798766106, + "grad_norm": 0.1321101620116609, + "learning_rate": 0.0007955898162495267, + "loss": 1.4205, + "step": 3524 + }, + { + "epoch": 0.31981491562329883, + "grad_norm": 0.2419748847072817, + "learning_rate": 0.0007954713010157008, + "loss": 1.4356, + "step": 3525 + }, + { + "epoch": 0.31990564325893667, + "grad_norm": 0.1309621814346022, + "learning_rate": 0.0007953527602678845, + "loss": 1.4565, + "step": 3526 + }, + { + "epoch": 
0.3199963708945745, + "grad_norm": 0.15016798244317678, + "learning_rate": 0.0007952341940163137, + "loss": 1.4457, + "step": 3527 + }, + { + "epoch": 0.3200870985302123, + "grad_norm": 0.14113753387295472, + "learning_rate": 0.0007951156022712269, + "loss": 1.4832, + "step": 3528 + }, + { + "epoch": 0.3201778261658501, + "grad_norm": 0.1459538775295838, + "learning_rate": 0.0007949969850428642, + "loss": 1.4134, + "step": 3529 + }, + { + "epoch": 0.32026855380148794, + "grad_norm": 0.13743876188047863, + "learning_rate": 0.0007948783423414685, + "loss": 1.4111, + "step": 3530 + }, + { + "epoch": 0.3203592814371258, + "grad_norm": 0.13172394467855128, + "learning_rate": 0.0007947596741772844, + "loss": 1.4145, + "step": 3531 + }, + { + "epoch": 0.32045000907276355, + "grad_norm": 0.14177574123384856, + "learning_rate": 0.000794640980560559, + "loss": 1.4269, + "step": 3532 + }, + { + "epoch": 0.3205407367084014, + "grad_norm": 0.13559169983318642, + "learning_rate": 0.0007945222615015416, + "loss": 1.4391, + "step": 3533 + }, + { + "epoch": 0.3206314643440392, + "grad_norm": 0.1362887597998988, + "learning_rate": 0.0007944035170104835, + "loss": 1.419, + "step": 3534 + }, + { + "epoch": 0.320722191979677, + "grad_norm": 0.14087701173673228, + "learning_rate": 0.0007942847470976382, + "loss": 1.4154, + "step": 3535 + }, + { + "epoch": 0.32081291961531483, + "grad_norm": 0.14524613483046014, + "learning_rate": 0.0007941659517732615, + "loss": 1.4474, + "step": 3536 + }, + { + "epoch": 0.32090364725095266, + "grad_norm": 0.15447502451908177, + "learning_rate": 0.0007940471310476119, + "loss": 1.4386, + "step": 3537 + }, + { + "epoch": 0.32099437488659044, + "grad_norm": 0.14448557029932424, + "learning_rate": 0.0007939282849309488, + "loss": 1.4347, + "step": 3538 + }, + { + "epoch": 0.32108510252222827, + "grad_norm": 0.14289281026817444, + "learning_rate": 0.0007938094134335352, + "loss": 1.4228, + "step": 3539 + }, + { + "epoch": 0.3211758301578661, + "grad_norm": 0.16642740576994422, + "learning_rate": 0.0007936905165656354, + "loss": 1.4168, + "step": 3540 + }, + { + "epoch": 0.3212665577935039, + "grad_norm": 0.13812296550047748, + "learning_rate": 0.0007935715943375161, + "loss": 1.4803, + "step": 3541 + }, + { + "epoch": 0.3213572854291417, + "grad_norm": 0.228306201918004, + "learning_rate": 0.0007934526467594465, + "loss": 1.4172, + "step": 3542 + }, + { + "epoch": 0.32144801306477955, + "grad_norm": 0.136771842957294, + "learning_rate": 0.0007933336738416976, + "loss": 1.4253, + "step": 3543 + }, + { + "epoch": 0.3215387407004173, + "grad_norm": 0.13037450996513322, + "learning_rate": 0.0007932146755945426, + "loss": 1.4188, + "step": 3544 + }, + { + "epoch": 0.32162946833605516, + "grad_norm": 0.13334298337734268, + "learning_rate": 0.0007930956520282573, + "loss": 1.429, + "step": 3545 + }, + { + "epoch": 0.321720195971693, + "grad_norm": 0.1311556891738383, + "learning_rate": 0.0007929766031531192, + "loss": 1.437, + "step": 3546 + }, + { + "epoch": 0.32181092360733077, + "grad_norm": 0.1403912938146604, + "learning_rate": 0.0007928575289794082, + "loss": 1.4332, + "step": 3547 + }, + { + "epoch": 0.3219016512429686, + "grad_norm": 0.14095502560899253, + "learning_rate": 0.0007927384295174065, + "loss": 1.4523, + "step": 3548 + }, + { + "epoch": 0.32199237887860643, + "grad_norm": 0.13730094928539976, + "learning_rate": 0.0007926193047773981, + "loss": 1.4524, + "step": 3549 + }, + { + "epoch": 0.32208310651424427, + "grad_norm": 0.16869104918724095, + "learning_rate": 
0.0007925001547696698, + "loss": 1.4461, + "step": 3550 + }, + { + "epoch": 0.32217383414988204, + "grad_norm": 0.1432659065005143, + "learning_rate": 0.0007923809795045098, + "loss": 1.4248, + "step": 3551 + }, + { + "epoch": 0.3222645617855199, + "grad_norm": 0.1465192673536481, + "learning_rate": 0.0007922617789922093, + "loss": 1.4201, + "step": 3552 + }, + { + "epoch": 0.3223552894211577, + "grad_norm": 0.14115861896769166, + "learning_rate": 0.0007921425532430612, + "loss": 1.4366, + "step": 3553 + }, + { + "epoch": 0.3224460170567955, + "grad_norm": 0.14905794763367652, + "learning_rate": 0.0007920233022673604, + "loss": 1.4198, + "step": 3554 + }, + { + "epoch": 0.3225367446924333, + "grad_norm": 0.13314579414790154, + "learning_rate": 0.0007919040260754045, + "loss": 1.4676, + "step": 3555 + }, + { + "epoch": 0.32262747232807115, + "grad_norm": 0.1413701770243017, + "learning_rate": 0.0007917847246774927, + "loss": 1.4405, + "step": 3556 + }, + { + "epoch": 0.32271819996370893, + "grad_norm": 0.1659533559279214, + "learning_rate": 0.0007916653980839272, + "loss": 1.412, + "step": 3557 + }, + { + "epoch": 0.32280892759934676, + "grad_norm": 0.1491757775571294, + "learning_rate": 0.0007915460463050114, + "loss": 1.4291, + "step": 3558 + }, + { + "epoch": 0.3228996552349846, + "grad_norm": 0.14887418620052079, + "learning_rate": 0.0007914266693510517, + "loss": 1.4071, + "step": 3559 + }, + { + "epoch": 0.3229903828706224, + "grad_norm": 0.14931147811086878, + "learning_rate": 0.000791307267232356, + "loss": 1.4344, + "step": 3560 + }, + { + "epoch": 0.3230811105062602, + "grad_norm": 0.13837509274532747, + "learning_rate": 0.0007911878399592349, + "loss": 1.4301, + "step": 3561 + }, + { + "epoch": 0.32317183814189804, + "grad_norm": 0.1860766081433947, + "learning_rate": 0.0007910683875420009, + "loss": 1.4313, + "step": 3562 + }, + { + "epoch": 0.3232625657775358, + "grad_norm": 0.15120423349188025, + "learning_rate": 0.0007909489099909688, + "loss": 1.3981, + "step": 3563 + }, + { + "epoch": 0.32335329341317365, + "grad_norm": 0.15242339735125982, + "learning_rate": 0.0007908294073164554, + "loss": 1.4373, + "step": 3564 + }, + { + "epoch": 0.3234440210488115, + "grad_norm": 0.18266968478873924, + "learning_rate": 0.0007907098795287798, + "loss": 1.4532, + "step": 3565 + }, + { + "epoch": 0.32353474868444926, + "grad_norm": 0.14829239903838645, + "learning_rate": 0.0007905903266382633, + "loss": 1.4555, + "step": 3566 + }, + { + "epoch": 0.3236254763200871, + "grad_norm": 0.1538186951190044, + "learning_rate": 0.0007904707486552292, + "loss": 1.447, + "step": 3567 + }, + { + "epoch": 0.3237162039557249, + "grad_norm": 0.14033724697538436, + "learning_rate": 0.0007903511455900031, + "loss": 1.4494, + "step": 3568 + }, + { + "epoch": 0.32380693159136276, + "grad_norm": 0.1525595940253437, + "learning_rate": 0.0007902315174529128, + "loss": 1.417, + "step": 3569 + }, + { + "epoch": 0.32389765922700053, + "grad_norm": 0.14209315772261386, + "learning_rate": 0.0007901118642542883, + "loss": 1.441, + "step": 3570 + }, + { + "epoch": 0.32398838686263837, + "grad_norm": 0.14031192618574623, + "learning_rate": 0.0007899921860044614, + "loss": 1.4219, + "step": 3571 + }, + { + "epoch": 0.3240791144982762, + "grad_norm": 0.1449103353004689, + "learning_rate": 0.0007898724827137667, + "loss": 1.4066, + "step": 3572 + }, + { + "epoch": 0.324169842133914, + "grad_norm": 0.15845788878185862, + "learning_rate": 0.0007897527543925402, + "loss": 1.4214, + "step": 3573 + }, + { + "epoch": 
0.3242605697695518, + "grad_norm": 0.14670678475364626, + "learning_rate": 0.0007896330010511208, + "loss": 1.4719, + "step": 3574 + }, + { + "epoch": 0.32435129740518964, + "grad_norm": 0.13978157252424067, + "learning_rate": 0.0007895132226998491, + "loss": 1.4533, + "step": 3575 + }, + { + "epoch": 0.3244420250408274, + "grad_norm": 0.15454965390777065, + "learning_rate": 0.0007893934193490678, + "loss": 1.4469, + "step": 3576 + }, + { + "epoch": 0.32453275267646525, + "grad_norm": 0.1624123292209034, + "learning_rate": 0.0007892735910091221, + "loss": 1.4805, + "step": 3577 + }, + { + "epoch": 0.3246234803121031, + "grad_norm": 0.16747087828473667, + "learning_rate": 0.0007891537376903592, + "loss": 1.4129, + "step": 3578 + }, + { + "epoch": 0.32471420794774086, + "grad_norm": 0.17227162568597534, + "learning_rate": 0.0007890338594031286, + "loss": 1.4577, + "step": 3579 + }, + { + "epoch": 0.3248049355833787, + "grad_norm": 0.14664803891394892, + "learning_rate": 0.0007889139561577815, + "loss": 1.4532, + "step": 3580 + }, + { + "epoch": 0.32489566321901653, + "grad_norm": 0.19454232467835988, + "learning_rate": 0.0007887940279646717, + "loss": 1.401, + "step": 3581 + }, + { + "epoch": 0.3249863908546543, + "grad_norm": 0.20363432016737631, + "learning_rate": 0.000788674074834155, + "loss": 1.4499, + "step": 3582 + }, + { + "epoch": 0.32507711849029214, + "grad_norm": 0.135707327844801, + "learning_rate": 0.0007885540967765895, + "loss": 1.4221, + "step": 3583 + }, + { + "epoch": 0.32516784612592997, + "grad_norm": 0.1456940167953669, + "learning_rate": 0.0007884340938023351, + "loss": 1.4545, + "step": 3584 + }, + { + "epoch": 0.32525857376156775, + "grad_norm": 0.1423480703534161, + "learning_rate": 0.0007883140659217543, + "loss": 1.4173, + "step": 3585 + }, + { + "epoch": 0.3253493013972056, + "grad_norm": 0.1373384641458924, + "learning_rate": 0.0007881940131452112, + "loss": 1.4372, + "step": 3586 + }, + { + "epoch": 0.3254400290328434, + "grad_norm": 0.13151035311562675, + "learning_rate": 0.0007880739354830729, + "loss": 1.4383, + "step": 3587 + }, + { + "epoch": 0.32553075666848125, + "grad_norm": 0.13302955651901882, + "learning_rate": 0.0007879538329457076, + "loss": 1.4379, + "step": 3588 + }, + { + "epoch": 0.325621484304119, + "grad_norm": 0.1329195510137683, + "learning_rate": 0.0007878337055434864, + "loss": 1.4309, + "step": 3589 + }, + { + "epoch": 0.32571221193975686, + "grad_norm": 0.13376212908044208, + "learning_rate": 0.0007877135532867823, + "loss": 1.4326, + "step": 3590 + }, + { + "epoch": 0.3258029395753947, + "grad_norm": 0.13873043141537955, + "learning_rate": 0.0007875933761859706, + "loss": 1.4215, + "step": 3591 + }, + { + "epoch": 0.32589366721103247, + "grad_norm": 0.17169060975807732, + "learning_rate": 0.0007874731742514284, + "loss": 1.4023, + "step": 3592 + }, + { + "epoch": 0.3259843948466703, + "grad_norm": 0.15494602136077548, + "learning_rate": 0.0007873529474935353, + "loss": 1.4385, + "step": 3593 + }, + { + "epoch": 0.32607512248230813, + "grad_norm": 0.1446065085582102, + "learning_rate": 0.0007872326959226727, + "loss": 1.4337, + "step": 3594 + }, + { + "epoch": 0.3261658501179459, + "grad_norm": 0.14752412541052518, + "learning_rate": 0.0007871124195492245, + "loss": 1.3991, + "step": 3595 + }, + { + "epoch": 0.32625657775358374, + "grad_norm": 0.15510939544670718, + "learning_rate": 0.0007869921183835766, + "loss": 1.4439, + "step": 3596 + }, + { + "epoch": 0.3263473053892216, + "grad_norm": 0.15976091173260143, + "learning_rate": 
0.0007868717924361168, + "loss": 1.4029, + "step": 3597 + }, + { + "epoch": 0.32643803302485935, + "grad_norm": 0.14976348072048765, + "learning_rate": 0.0007867514417172356, + "loss": 1.4556, + "step": 3598 + }, + { + "epoch": 0.3265287606604972, + "grad_norm": 0.1470448647028396, + "learning_rate": 0.0007866310662373253, + "loss": 1.4114, + "step": 3599 + }, + { + "epoch": 0.326619488296135, + "grad_norm": 0.1632887934580962, + "learning_rate": 0.00078651066600678, + "loss": 1.4129, + "step": 3600 + }, + { + "epoch": 0.3267102159317728, + "grad_norm": 0.14879283524291906, + "learning_rate": 0.0007863902410359966, + "loss": 1.3865, + "step": 3601 + }, + { + "epoch": 0.32680094356741063, + "grad_norm": 0.13691107404947922, + "learning_rate": 0.0007862697913353736, + "loss": 1.4176, + "step": 3602 + }, + { + "epoch": 0.32689167120304846, + "grad_norm": 0.1413087822336289, + "learning_rate": 0.0007861493169153118, + "loss": 1.3941, + "step": 3603 + }, + { + "epoch": 0.32698239883868624, + "grad_norm": 0.13865019958963934, + "learning_rate": 0.0007860288177862145, + "loss": 1.4453, + "step": 3604 + }, + { + "epoch": 0.32707312647432407, + "grad_norm": 0.14539684804001923, + "learning_rate": 0.0007859082939584866, + "loss": 1.4445, + "step": 3605 + }, + { + "epoch": 0.3271638541099619, + "grad_norm": 0.14504201427219615, + "learning_rate": 0.0007857877454425353, + "loss": 1.4593, + "step": 3606 + }, + { + "epoch": 0.3272545817455997, + "grad_norm": 0.16135967776539697, + "learning_rate": 0.00078566717224877, + "loss": 1.4332, + "step": 3607 + }, + { + "epoch": 0.3273453093812375, + "grad_norm": 0.1607077289207464, + "learning_rate": 0.0007855465743876024, + "loss": 1.4158, + "step": 3608 + }, + { + "epoch": 0.32743603701687535, + "grad_norm": 0.15204863162151663, + "learning_rate": 0.0007854259518694459, + "loss": 1.4161, + "step": 3609 + }, + { + "epoch": 0.3275267646525132, + "grad_norm": 0.15476684707466254, + "learning_rate": 0.0007853053047047165, + "loss": 1.4338, + "step": 3610 + }, + { + "epoch": 0.32761749228815096, + "grad_norm": 0.1402692785127267, + "learning_rate": 0.000785184632903832, + "loss": 1.4561, + "step": 3611 + }, + { + "epoch": 0.3277082199237888, + "grad_norm": 0.1441701267526025, + "learning_rate": 0.0007850639364772123, + "loss": 1.4653, + "step": 3612 + }, + { + "epoch": 0.3277989475594266, + "grad_norm": 0.1546144236629825, + "learning_rate": 0.0007849432154352797, + "loss": 1.4307, + "step": 3613 + }, + { + "epoch": 0.3278896751950644, + "grad_norm": 0.1418944728133469, + "learning_rate": 0.0007848224697884585, + "loss": 1.454, + "step": 3614 + }, + { + "epoch": 0.32798040283070223, + "grad_norm": 0.14573400481638163, + "learning_rate": 0.000784701699547175, + "loss": 1.4391, + "step": 3615 + }, + { + "epoch": 0.32807113046634007, + "grad_norm": 0.13881854767663196, + "learning_rate": 0.0007845809047218579, + "loss": 1.3933, + "step": 3616 + }, + { + "epoch": 0.32816185810197784, + "grad_norm": 0.15180445423961428, + "learning_rate": 0.0007844600853229376, + "loss": 1.4485, + "step": 3617 + }, + { + "epoch": 0.3282525857376157, + "grad_norm": 0.1481196277863035, + "learning_rate": 0.000784339241360847, + "loss": 1.4304, + "step": 3618 + }, + { + "epoch": 0.3283433133732535, + "grad_norm": 0.1532803577625145, + "learning_rate": 0.0007842183728460211, + "loss": 1.377, + "step": 3619 + }, + { + "epoch": 0.3284340410088913, + "grad_norm": 0.13935167425009087, + "learning_rate": 0.0007840974797888967, + "loss": 1.4597, + "step": 3620 + }, + { + "epoch": 
0.3285247686445291, + "grad_norm": 0.14006298375502552, + "learning_rate": 0.0007839765621999133, + "loss": 1.4123, + "step": 3621 + }, + { + "epoch": 0.32861549628016695, + "grad_norm": 0.23269052345607558, + "learning_rate": 0.0007838556200895117, + "loss": 1.4214, + "step": 3622 + }, + { + "epoch": 0.32870622391580473, + "grad_norm": 0.1556176206171642, + "learning_rate": 0.0007837346534681355, + "loss": 1.4143, + "step": 3623 + }, + { + "epoch": 0.32879695155144256, + "grad_norm": 0.14276457900269465, + "learning_rate": 0.0007836136623462301, + "loss": 1.4193, + "step": 3624 + }, + { + "epoch": 0.3288876791870804, + "grad_norm": 0.14319506617931047, + "learning_rate": 0.0007834926467342433, + "loss": 1.4357, + "step": 3625 + }, + { + "epoch": 0.3289784068227182, + "grad_norm": 0.13866151911859068, + "learning_rate": 0.0007833716066426246, + "loss": 1.4287, + "step": 3626 + }, + { + "epoch": 0.329069134458356, + "grad_norm": 0.15257399427808896, + "learning_rate": 0.0007832505420818259, + "loss": 1.417, + "step": 3627 + }, + { + "epoch": 0.32915986209399384, + "grad_norm": 0.14608050400146075, + "learning_rate": 0.000783129453062301, + "loss": 1.4016, + "step": 3628 + }, + { + "epoch": 0.32925058972963167, + "grad_norm": 0.13554830808810636, + "learning_rate": 0.0007830083395945062, + "loss": 1.4371, + "step": 3629 + }, + { + "epoch": 0.32934131736526945, + "grad_norm": 0.14920092435586482, + "learning_rate": 0.0007828872016888998, + "loss": 1.3954, + "step": 3630 + }, + { + "epoch": 0.3294320450009073, + "grad_norm": 0.14251057957248894, + "learning_rate": 0.0007827660393559416, + "loss": 1.4057, + "step": 3631 + }, + { + "epoch": 0.3295227726365451, + "grad_norm": 0.13901428932944532, + "learning_rate": 0.0007826448526060942, + "loss": 1.4315, + "step": 3632 + }, + { + "epoch": 0.3296135002721829, + "grad_norm": 0.14749938691782313, + "learning_rate": 0.0007825236414498222, + "loss": 1.4279, + "step": 3633 + }, + { + "epoch": 0.3297042279078207, + "grad_norm": 0.15120437240112145, + "learning_rate": 0.0007824024058975921, + "loss": 1.4051, + "step": 3634 + }, + { + "epoch": 0.32979495554345856, + "grad_norm": 0.14202082151471193, + "learning_rate": 0.0007822811459598727, + "loss": 1.388, + "step": 3635 + }, + { + "epoch": 0.32988568317909633, + "grad_norm": 0.1536709939645876, + "learning_rate": 0.0007821598616471345, + "loss": 1.4579, + "step": 3636 + }, + { + "epoch": 0.32997641081473417, + "grad_norm": 0.13939369994738798, + "learning_rate": 0.000782038552969851, + "loss": 1.4294, + "step": 3637 + }, + { + "epoch": 0.330067138450372, + "grad_norm": 0.1474251863027257, + "learning_rate": 0.0007819172199384967, + "loss": 1.4581, + "step": 3638 + }, + { + "epoch": 0.3301578660860098, + "grad_norm": 0.1525824441088725, + "learning_rate": 0.0007817958625635489, + "loss": 1.4399, + "step": 3639 + }, + { + "epoch": 0.3302485937216476, + "grad_norm": 0.14016223416878718, + "learning_rate": 0.0007816744808554867, + "loss": 1.4789, + "step": 3640 + }, + { + "epoch": 0.33033932135728544, + "grad_norm": 0.1455108823439235, + "learning_rate": 0.0007815530748247919, + "loss": 1.4239, + "step": 3641 + }, + { + "epoch": 0.3304300489929232, + "grad_norm": 0.1449703357086505, + "learning_rate": 0.0007814316444819474, + "loss": 1.447, + "step": 3642 + }, + { + "epoch": 0.33052077662856105, + "grad_norm": 0.1412945249729774, + "learning_rate": 0.000781310189837439, + "loss": 1.4466, + "step": 3643 + }, + { + "epoch": 0.3306115042641989, + "grad_norm": 0.13045976318062966, + "learning_rate": 
0.0007811887109017542, + "loss": 1.456, + "step": 3644 + }, + { + "epoch": 0.33070223189983666, + "grad_norm": 0.14077389476789726, + "learning_rate": 0.0007810672076853827, + "loss": 1.4286, + "step": 3645 + }, + { + "epoch": 0.3307929595354745, + "grad_norm": 0.1532561483898033, + "learning_rate": 0.0007809456801988164, + "loss": 1.4052, + "step": 3646 + }, + { + "epoch": 0.33088368717111233, + "grad_norm": 0.14302385460066308, + "learning_rate": 0.0007808241284525492, + "loss": 1.4648, + "step": 3647 + }, + { + "epoch": 0.33097441480675016, + "grad_norm": 0.13949115440194118, + "learning_rate": 0.0007807025524570772, + "loss": 1.4144, + "step": 3648 + }, + { + "epoch": 0.33106514244238794, + "grad_norm": 0.13602550600729393, + "learning_rate": 0.0007805809522228984, + "loss": 1.4144, + "step": 3649 + }, + { + "epoch": 0.33115587007802577, + "grad_norm": 0.14090099813580803, + "learning_rate": 0.0007804593277605131, + "loss": 1.4002, + "step": 3650 + }, + { + "epoch": 0.3312465977136636, + "grad_norm": 0.13374346837006246, + "learning_rate": 0.0007803376790804233, + "loss": 1.4274, + "step": 3651 + }, + { + "epoch": 0.3313373253493014, + "grad_norm": 0.13647959374222537, + "learning_rate": 0.0007802160061931338, + "loss": 1.4458, + "step": 3652 + }, + { + "epoch": 0.3314280529849392, + "grad_norm": 0.13340593865762426, + "learning_rate": 0.0007800943091091508, + "loss": 1.4268, + "step": 3653 + }, + { + "epoch": 0.33151878062057705, + "grad_norm": 0.13578687888136937, + "learning_rate": 0.000779972587838983, + "loss": 1.4256, + "step": 3654 + }, + { + "epoch": 0.3316095082562148, + "grad_norm": 0.15001454970175737, + "learning_rate": 0.0007798508423931407, + "loss": 1.4142, + "step": 3655 + }, + { + "epoch": 0.33170023589185266, + "grad_norm": 0.14818584179002883, + "learning_rate": 0.0007797290727821371, + "loss": 1.4225, + "step": 3656 + }, + { + "epoch": 0.3317909635274905, + "grad_norm": 0.13223529624562863, + "learning_rate": 0.000779607279016487, + "loss": 1.4284, + "step": 3657 + }, + { + "epoch": 0.33188169116312827, + "grad_norm": 0.14174786546538137, + "learning_rate": 0.000779485461106707, + "loss": 1.4352, + "step": 3658 + }, + { + "epoch": 0.3319724187987661, + "grad_norm": 0.13782984479910657, + "learning_rate": 0.0007793636190633161, + "loss": 1.4259, + "step": 3659 + }, + { + "epoch": 0.33206314643440393, + "grad_norm": 0.15225482189062467, + "learning_rate": 0.0007792417528968357, + "loss": 1.4301, + "step": 3660 + }, + { + "epoch": 0.3321538740700417, + "grad_norm": 0.14946515539885838, + "learning_rate": 0.0007791198626177888, + "loss": 1.4016, + "step": 3661 + }, + { + "epoch": 0.33224460170567954, + "grad_norm": 0.1401858596458253, + "learning_rate": 0.0007789979482367004, + "loss": 1.4528, + "step": 3662 + }, + { + "epoch": 0.3323353293413174, + "grad_norm": 0.14312321362959018, + "learning_rate": 0.0007788760097640983, + "loss": 1.4382, + "step": 3663 + }, + { + "epoch": 0.33242605697695515, + "grad_norm": 0.14237749174560535, + "learning_rate": 0.0007787540472105115, + "loss": 1.4587, + "step": 3664 + }, + { + "epoch": 0.332516784612593, + "grad_norm": 0.14562722163143993, + "learning_rate": 0.0007786320605864718, + "loss": 1.4726, + "step": 3665 + }, + { + "epoch": 0.3326075122482308, + "grad_norm": 0.1348319266052641, + "learning_rate": 0.0007785100499025125, + "loss": 1.4553, + "step": 3666 + }, + { + "epoch": 0.33269823988386865, + "grad_norm": 0.1421828333365168, + "learning_rate": 0.0007783880151691694, + "loss": 1.4466, + "step": 3667 + }, + { + "epoch": 
0.33278896751950643, + "grad_norm": 0.13453718400863104, + "learning_rate": 0.00077826595639698, + "loss": 1.3879, + "step": 3668 + }, + { + "epoch": 0.33287969515514426, + "grad_norm": 0.14326551561442644, + "learning_rate": 0.0007781438735964845, + "loss": 1.4216, + "step": 3669 + }, + { + "epoch": 0.3329704227907821, + "grad_norm": 0.13496779906635917, + "learning_rate": 0.0007780217667782243, + "loss": 1.3841, + "step": 3670 + }, + { + "epoch": 0.33306115042641987, + "grad_norm": 0.14032912853601048, + "learning_rate": 0.0007778996359527436, + "loss": 1.4409, + "step": 3671 + }, + { + "epoch": 0.3331518780620577, + "grad_norm": 0.17649813607893272, + "learning_rate": 0.0007777774811305884, + "loss": 1.4491, + "step": 3672 + }, + { + "epoch": 0.33324260569769554, + "grad_norm": 0.14557827987691171, + "learning_rate": 0.0007776553023223068, + "loss": 1.4546, + "step": 3673 + }, + { + "epoch": 0.3333333333333333, + "grad_norm": 0.1478525390709792, + "learning_rate": 0.000777533099538449, + "loss": 1.4526, + "step": 3674 + }, + { + "epoch": 0.33342406096897115, + "grad_norm": 0.1405788512880921, + "learning_rate": 0.000777410872789567, + "loss": 1.4475, + "step": 3675 + }, + { + "epoch": 0.333514788604609, + "grad_norm": 0.1332695534426928, + "learning_rate": 0.0007772886220862153, + "loss": 1.448, + "step": 3676 + }, + { + "epoch": 0.33360551624024676, + "grad_norm": 0.13991266544920145, + "learning_rate": 0.0007771663474389504, + "loss": 1.4171, + "step": 3677 + }, + { + "epoch": 0.3336962438758846, + "grad_norm": 0.14988594274167727, + "learning_rate": 0.0007770440488583301, + "loss": 1.4481, + "step": 3678 + }, + { + "epoch": 0.3337869715115224, + "grad_norm": 0.13568175946169977, + "learning_rate": 0.0007769217263549157, + "loss": 1.4094, + "step": 3679 + }, + { + "epoch": 0.3338776991471602, + "grad_norm": 0.17157520358234457, + "learning_rate": 0.0007767993799392693, + "loss": 1.4424, + "step": 3680 + }, + { + "epoch": 0.33396842678279803, + "grad_norm": 0.13113418385249648, + "learning_rate": 0.0007766770096219557, + "loss": 1.4193, + "step": 3681 + }, + { + "epoch": 0.33405915441843587, + "grad_norm": 0.13527178689008634, + "learning_rate": 0.0007765546154135417, + "loss": 1.4163, + "step": 3682 + }, + { + "epoch": 0.33414988205407364, + "grad_norm": 0.13884037948977349, + "learning_rate": 0.0007764321973245955, + "loss": 1.4345, + "step": 3683 + }, + { + "epoch": 0.3342406096897115, + "grad_norm": 0.1364251156995652, + "learning_rate": 0.0007763097553656886, + "loss": 1.4509, + "step": 3684 + }, + { + "epoch": 0.3343313373253493, + "grad_norm": 0.1418225387563103, + "learning_rate": 0.0007761872895473936, + "loss": 1.4007, + "step": 3685 + }, + { + "epoch": 0.33442206496098714, + "grad_norm": 0.13366140366355422, + "learning_rate": 0.0007760647998802853, + "loss": 1.4406, + "step": 3686 + }, + { + "epoch": 0.3345127925966249, + "grad_norm": 0.14811591570500693, + "learning_rate": 0.0007759422863749409, + "loss": 1.4067, + "step": 3687 + }, + { + "epoch": 0.33460352023226275, + "grad_norm": 0.13903377499131173, + "learning_rate": 0.0007758197490419394, + "loss": 1.4196, + "step": 3688 + }, + { + "epoch": 0.3346942478679006, + "grad_norm": 0.13667143754132144, + "learning_rate": 0.000775697187891862, + "loss": 1.4624, + "step": 3689 + }, + { + "epoch": 0.33478497550353836, + "grad_norm": 0.1338658006208695, + "learning_rate": 0.0007755746029352917, + "loss": 1.4585, + "step": 3690 + }, + { + "epoch": 0.3348757031391762, + "grad_norm": 0.13467902965945955, + "learning_rate": 
0.0007754519941828139, + "loss": 1.4267, + "step": 3691 + }, + { + "epoch": 0.33496643077481403, + "grad_norm": 0.14384467230019327, + "learning_rate": 0.0007753293616450157, + "loss": 1.4523, + "step": 3692 + }, + { + "epoch": 0.3350571584104518, + "grad_norm": 0.13652153436160347, + "learning_rate": 0.0007752067053324867, + "loss": 1.429, + "step": 3693 + }, + { + "epoch": 0.33514788604608964, + "grad_norm": 0.14902096471901324, + "learning_rate": 0.000775084025255818, + "loss": 1.4447, + "step": 3694 + }, + { + "epoch": 0.33523861368172747, + "grad_norm": 0.17662525296430803, + "learning_rate": 0.0007749613214256033, + "loss": 1.411, + "step": 3695 + }, + { + "epoch": 0.33532934131736525, + "grad_norm": 0.14761147955121096, + "learning_rate": 0.0007748385938524378, + "loss": 1.404, + "step": 3696 + }, + { + "epoch": 0.3354200689530031, + "grad_norm": 0.13470716435575192, + "learning_rate": 0.0007747158425469194, + "loss": 1.4597, + "step": 3697 + }, + { + "epoch": 0.3355107965886409, + "grad_norm": 0.14336428143047342, + "learning_rate": 0.0007745930675196473, + "loss": 1.4084, + "step": 3698 + }, + { + "epoch": 0.3356015242242787, + "grad_norm": 0.15607816109934675, + "learning_rate": 0.0007744702687812235, + "loss": 1.4615, + "step": 3699 + }, + { + "epoch": 0.3356922518599165, + "grad_norm": 0.13398400675297495, + "learning_rate": 0.0007743474463422516, + "loss": 1.4174, + "step": 3700 + }, + { + "epoch": 0.33578297949555436, + "grad_norm": 0.17379880605757625, + "learning_rate": 0.000774224600213337, + "loss": 1.4448, + "step": 3701 + }, + { + "epoch": 0.33587370713119213, + "grad_norm": 0.13811934430645775, + "learning_rate": 0.0007741017304050879, + "loss": 1.4216, + "step": 3702 + }, + { + "epoch": 0.33596443476682997, + "grad_norm": 0.1337201260512974, + "learning_rate": 0.000773978836928114, + "loss": 1.4388, + "step": 3703 + }, + { + "epoch": 0.3360551624024678, + "grad_norm": 0.1357378099484116, + "learning_rate": 0.0007738559197930273, + "loss": 1.4231, + "step": 3704 + }, + { + "epoch": 0.33614589003810563, + "grad_norm": 0.16115545262946643, + "learning_rate": 0.0007737329790104414, + "loss": 1.3964, + "step": 3705 + }, + { + "epoch": 0.3362366176737434, + "grad_norm": 0.13765508927756737, + "learning_rate": 0.0007736100145909724, + "loss": 1.4177, + "step": 3706 + }, + { + "epoch": 0.33632734530938124, + "grad_norm": 0.13631029147411058, + "learning_rate": 0.0007734870265452382, + "loss": 1.4251, + "step": 3707 + }, + { + "epoch": 0.3364180729450191, + "grad_norm": 0.13620459128067564, + "learning_rate": 0.0007733640148838592, + "loss": 1.427, + "step": 3708 + }, + { + "epoch": 0.33650880058065685, + "grad_norm": 0.14530009872292118, + "learning_rate": 0.000773240979617457, + "loss": 1.4384, + "step": 3709 + }, + { + "epoch": 0.3365995282162947, + "grad_norm": 0.1412123957095051, + "learning_rate": 0.0007731179207566561, + "loss": 1.4141, + "step": 3710 + }, + { + "epoch": 0.3366902558519325, + "grad_norm": 0.1759721925841079, + "learning_rate": 0.0007729948383120827, + "loss": 1.4454, + "step": 3711 + }, + { + "epoch": 0.3367809834875703, + "grad_norm": 0.14273093152191219, + "learning_rate": 0.0007728717322943643, + "loss": 1.4448, + "step": 3712 + }, + { + "epoch": 0.33687171112320813, + "grad_norm": 0.14377562610297445, + "learning_rate": 0.0007727486027141319, + "loss": 1.4203, + "step": 3713 + }, + { + "epoch": 0.33696243875884596, + "grad_norm": 0.1618936667611823, + "learning_rate": 0.0007726254495820173, + "loss": 1.4266, + "step": 3714 + }, + { + "epoch": 
0.33705316639448374, + "grad_norm": 0.13260365477729608, + "learning_rate": 0.0007725022729086551, + "loss": 1.4563, + "step": 3715 + }, + { + "epoch": 0.33714389403012157, + "grad_norm": 0.13740932312270818, + "learning_rate": 0.0007723790727046815, + "loss": 1.4311, + "step": 3716 + }, + { + "epoch": 0.3372346216657594, + "grad_norm": 0.13582325247860927, + "learning_rate": 0.0007722558489807347, + "loss": 1.4167, + "step": 3717 + }, + { + "epoch": 0.3373253493013972, + "grad_norm": 0.14300574697229138, + "learning_rate": 0.0007721326017474552, + "loss": 1.4517, + "step": 3718 + }, + { + "epoch": 0.337416076937035, + "grad_norm": 0.13859358236209052, + "learning_rate": 0.0007720093310154855, + "loss": 1.4133, + "step": 3719 + }, + { + "epoch": 0.33750680457267285, + "grad_norm": 0.14535537044831934, + "learning_rate": 0.0007718860367954698, + "loss": 1.4319, + "step": 3720 + }, + { + "epoch": 0.3375975322083106, + "grad_norm": 0.12500684356957142, + "learning_rate": 0.000771762719098055, + "loss": 1.4301, + "step": 3721 + }, + { + "epoch": 0.33768825984394846, + "grad_norm": 0.16811189584211475, + "learning_rate": 0.0007716393779338892, + "loss": 1.4831, + "step": 3722 + }, + { + "epoch": 0.3377789874795863, + "grad_norm": 0.1383008764843661, + "learning_rate": 0.0007715160133136232, + "loss": 1.4662, + "step": 3723 + }, + { + "epoch": 0.3378697151152241, + "grad_norm": 0.142517547925, + "learning_rate": 0.0007713926252479093, + "loss": 1.4558, + "step": 3724 + }, + { + "epoch": 0.3379604427508619, + "grad_norm": 0.13471262336952042, + "learning_rate": 0.0007712692137474025, + "loss": 1.4641, + "step": 3725 + }, + { + "epoch": 0.33805117038649973, + "grad_norm": 0.13804775276776035, + "learning_rate": 0.0007711457788227587, + "loss": 1.4066, + "step": 3726 + }, + { + "epoch": 0.33814189802213757, + "grad_norm": 0.13255820533772722, + "learning_rate": 0.0007710223204846372, + "loss": 1.4162, + "step": 3727 + }, + { + "epoch": 0.33823262565777534, + "grad_norm": 0.13604719899373605, + "learning_rate": 0.0007708988387436984, + "loss": 1.4255, + "step": 3728 + }, + { + "epoch": 0.3383233532934132, + "grad_norm": 0.13442482049203502, + "learning_rate": 0.0007707753336106047, + "loss": 1.4449, + "step": 3729 + }, + { + "epoch": 0.338414080929051, + "grad_norm": 0.14735632790266318, + "learning_rate": 0.0007706518050960212, + "loss": 1.4093, + "step": 3730 + }, + { + "epoch": 0.3385048085646888, + "grad_norm": 0.1371545930241098, + "learning_rate": 0.0007705282532106144, + "loss": 1.4515, + "step": 3731 + }, + { + "epoch": 0.3385955362003266, + "grad_norm": 0.14832543969471212, + "learning_rate": 0.000770404677965053, + "loss": 1.3907, + "step": 3732 + }, + { + "epoch": 0.33868626383596445, + "grad_norm": 0.13978072156426483, + "learning_rate": 0.000770281079370008, + "loss": 1.4354, + "step": 3733 + }, + { + "epoch": 0.33877699147160223, + "grad_norm": 0.14210250537595476, + "learning_rate": 0.0007701574574361518, + "loss": 1.412, + "step": 3734 + }, + { + "epoch": 0.33886771910724006, + "grad_norm": 0.12979954193229712, + "learning_rate": 0.0007700338121741595, + "loss": 1.4179, + "step": 3735 + }, + { + "epoch": 0.3389584467428779, + "grad_norm": 0.12739019575219865, + "learning_rate": 0.0007699101435947077, + "loss": 1.4466, + "step": 3736 + }, + { + "epoch": 0.33904917437851567, + "grad_norm": 0.13765801724907525, + "learning_rate": 0.0007697864517084749, + "loss": 1.43, + "step": 3737 + }, + { + "epoch": 0.3391399020141535, + "grad_norm": 0.1252359369160693, + "learning_rate": 
0.0007696627365261427, + "loss": 1.4544, + "step": 3738 + }, + { + "epoch": 0.33923062964979134, + "grad_norm": 0.13294290709385562, + "learning_rate": 0.0007695389980583932, + "loss": 1.4016, + "step": 3739 + }, + { + "epoch": 0.3393213572854291, + "grad_norm": 0.12662873162835703, + "learning_rate": 0.0007694152363159115, + "loss": 1.4472, + "step": 3740 + }, + { + "epoch": 0.33941208492106695, + "grad_norm": 0.13263725415655242, + "learning_rate": 0.0007692914513093844, + "loss": 1.4263, + "step": 3741 + }, + { + "epoch": 0.3395028125567048, + "grad_norm": 0.14397710781814857, + "learning_rate": 0.0007691676430495007, + "loss": 1.4077, + "step": 3742 + }, + { + "epoch": 0.3395935401923426, + "grad_norm": 0.13678404063181174, + "learning_rate": 0.0007690438115469516, + "loss": 1.4284, + "step": 3743 + }, + { + "epoch": 0.3396842678279804, + "grad_norm": 0.14264277063550174, + "learning_rate": 0.0007689199568124297, + "loss": 1.4593, + "step": 3744 + }, + { + "epoch": 0.3397749954636182, + "grad_norm": 0.14926018859168236, + "learning_rate": 0.0007687960788566298, + "loss": 1.4037, + "step": 3745 + }, + { + "epoch": 0.33986572309925606, + "grad_norm": 0.13349841982974425, + "learning_rate": 0.0007686721776902488, + "loss": 1.4461, + "step": 3746 + }, + { + "epoch": 0.33995645073489383, + "grad_norm": 0.13199211042627393, + "learning_rate": 0.0007685482533239858, + "loss": 1.4287, + "step": 3747 + }, + { + "epoch": 0.34004717837053167, + "grad_norm": 0.13302779272448884, + "learning_rate": 0.0007684243057685414, + "loss": 1.4498, + "step": 3748 + }, + { + "epoch": 0.3401379060061695, + "grad_norm": 0.15722582326187104, + "learning_rate": 0.0007683003350346187, + "loss": 1.433, + "step": 3749 + }, + { + "epoch": 0.3402286336418073, + "grad_norm": 0.1314254434112638, + "learning_rate": 0.0007681763411329226, + "loss": 1.449, + "step": 3750 + }, + { + "epoch": 0.3403193612774451, + "grad_norm": 0.13686537639686008, + "learning_rate": 0.00076805232407416, + "loss": 1.3996, + "step": 3751 + }, + { + "epoch": 0.34041008891308294, + "grad_norm": 0.13009391876651846, + "learning_rate": 0.0007679282838690395, + "loss": 1.4485, + "step": 3752 + }, + { + "epoch": 0.3405008165487207, + "grad_norm": 0.1389490287308042, + "learning_rate": 0.0007678042205282723, + "loss": 1.3988, + "step": 3753 + }, + { + "epoch": 0.34059154418435855, + "grad_norm": 0.14000006551521765, + "learning_rate": 0.000767680134062571, + "loss": 1.4321, + "step": 3754 + }, + { + "epoch": 0.3406822718199964, + "grad_norm": 0.13141792225547122, + "learning_rate": 0.0007675560244826508, + "loss": 1.4082, + "step": 3755 + }, + { + "epoch": 0.34077299945563416, + "grad_norm": 0.13403830979683176, + "learning_rate": 0.0007674318917992287, + "loss": 1.3859, + "step": 3756 + }, + { + "epoch": 0.340863727091272, + "grad_norm": 0.13803301025893205, + "learning_rate": 0.000767307736023023, + "loss": 1.4144, + "step": 3757 + }, + { + "epoch": 0.34095445472690983, + "grad_norm": 0.14324617150737884, + "learning_rate": 0.0007671835571647549, + "loss": 1.4634, + "step": 3758 + }, + { + "epoch": 0.3410451823625476, + "grad_norm": 0.1481598178629098, + "learning_rate": 0.0007670593552351475, + "loss": 1.4644, + "step": 3759 + }, + { + "epoch": 0.34113590999818544, + "grad_norm": 0.2054745654605386, + "learning_rate": 0.0007669351302449253, + "loss": 1.4237, + "step": 3760 + }, + { + "epoch": 0.34122663763382327, + "grad_norm": 0.13555931729324897, + "learning_rate": 0.0007668108822048152, + "loss": 1.449, + "step": 3761 + }, + { + "epoch": 
0.3413173652694611, + "grad_norm": 0.1423928748900926, + "learning_rate": 0.0007666866111255461, + "loss": 1.4365, + "step": 3762 + }, + { + "epoch": 0.3414080929050989, + "grad_norm": 0.14458272370153838, + "learning_rate": 0.0007665623170178489, + "loss": 1.453, + "step": 3763 + }, + { + "epoch": 0.3414988205407367, + "grad_norm": 0.14829036832620016, + "learning_rate": 0.0007664379998924563, + "loss": 1.4196, + "step": 3764 + }, + { + "epoch": 0.34158954817637455, + "grad_norm": 0.13211062651768105, + "learning_rate": 0.0007663136597601031, + "loss": 1.4187, + "step": 3765 + }, + { + "epoch": 0.3416802758120123, + "grad_norm": 0.14201188083790756, + "learning_rate": 0.0007661892966315262, + "loss": 1.4152, + "step": 3766 + }, + { + "epoch": 0.34177100344765016, + "grad_norm": 0.13379030919765478, + "learning_rate": 0.0007660649105174642, + "loss": 1.4264, + "step": 3767 + }, + { + "epoch": 0.341861731083288, + "grad_norm": 0.13616549777404013, + "learning_rate": 0.000765940501428658, + "loss": 1.4095, + "step": 3768 + }, + { + "epoch": 0.34195245871892577, + "grad_norm": 0.13813824481683648, + "learning_rate": 0.0007658160693758504, + "loss": 1.3858, + "step": 3769 + }, + { + "epoch": 0.3420431863545636, + "grad_norm": 0.14880078758619047, + "learning_rate": 0.000765691614369786, + "loss": 1.4514, + "step": 3770 + }, + { + "epoch": 0.34213391399020143, + "grad_norm": 0.13104175817184133, + "learning_rate": 0.0007655671364212116, + "loss": 1.459, + "step": 3771 + }, + { + "epoch": 0.3422246416258392, + "grad_norm": 0.13778830680020465, + "learning_rate": 0.0007654426355408756, + "loss": 1.4389, + "step": 3772 + }, + { + "epoch": 0.34231536926147704, + "grad_norm": 0.13604776269297192, + "learning_rate": 0.0007653181117395292, + "loss": 1.4649, + "step": 3773 + }, + { + "epoch": 0.3424060968971149, + "grad_norm": 0.13312029115242316, + "learning_rate": 0.0007651935650279245, + "loss": 1.4702, + "step": 3774 + }, + { + "epoch": 0.34249682453275265, + "grad_norm": 0.13631702419969988, + "learning_rate": 0.0007650689954168166, + "loss": 1.466, + "step": 3775 + }, + { + "epoch": 0.3425875521683905, + "grad_norm": 0.137831319949764, + "learning_rate": 0.0007649444029169617, + "loss": 1.3926, + "step": 3776 + }, + { + "epoch": 0.3426782798040283, + "grad_norm": 0.15116112920365576, + "learning_rate": 0.0007648197875391185, + "loss": 1.4433, + "step": 3777 + }, + { + "epoch": 0.3427690074396661, + "grad_norm": 0.17668693661315976, + "learning_rate": 0.0007646951492940479, + "loss": 1.4155, + "step": 3778 + }, + { + "epoch": 0.34285973507530393, + "grad_norm": 0.13947855510312793, + "learning_rate": 0.0007645704881925118, + "loss": 1.4494, + "step": 3779 + }, + { + "epoch": 0.34295046271094176, + "grad_norm": 0.13220766647760068, + "learning_rate": 0.0007644458042452753, + "loss": 1.4239, + "step": 3780 + }, + { + "epoch": 0.3430411903465796, + "grad_norm": 0.1545981456510962, + "learning_rate": 0.0007643210974631045, + "loss": 1.4112, + "step": 3781 + }, + { + "epoch": 0.34313191798221737, + "grad_norm": 0.16140163236476723, + "learning_rate": 0.000764196367856768, + "loss": 1.4434, + "step": 3782 + }, + { + "epoch": 0.3432226456178552, + "grad_norm": 0.14552159710940987, + "learning_rate": 0.0007640716154370363, + "loss": 1.4542, + "step": 3783 + }, + { + "epoch": 0.34331337325349304, + "grad_norm": 0.1395791359939996, + "learning_rate": 0.0007639468402146814, + "loss": 1.4143, + "step": 3784 + }, + { + "epoch": 0.3434041008891308, + "grad_norm": 0.13408073629734435, + "learning_rate": 
0.0007638220422004784, + "loss": 1.4312, + "step": 3785 + }, + { + "epoch": 0.34349482852476865, + "grad_norm": 0.14307180206139247, + "learning_rate": 0.0007636972214052028, + "loss": 1.4375, + "step": 3786 + }, + { + "epoch": 0.3435855561604065, + "grad_norm": 0.15642817055894445, + "learning_rate": 0.0007635723778396334, + "loss": 1.4236, + "step": 3787 + }, + { + "epoch": 0.34367628379604426, + "grad_norm": 0.1496795567842592, + "learning_rate": 0.0007634475115145502, + "loss": 1.4207, + "step": 3788 + }, + { + "epoch": 0.3437670114316821, + "grad_norm": 0.13374580588212523, + "learning_rate": 0.0007633226224407358, + "loss": 1.403, + "step": 3789 + }, + { + "epoch": 0.3438577390673199, + "grad_norm": 0.1692867815533983, + "learning_rate": 0.000763197710628974, + "loss": 1.4234, + "step": 3790 + }, + { + "epoch": 0.3439484667029577, + "grad_norm": 0.14320156253070523, + "learning_rate": 0.0007630727760900511, + "loss": 1.4401, + "step": 3791 + }, + { + "epoch": 0.34403919433859553, + "grad_norm": 0.25703176641784975, + "learning_rate": 0.0007629478188347552, + "loss": 1.3761, + "step": 3792 + }, + { + "epoch": 0.34412992197423337, + "grad_norm": 0.16277556330321327, + "learning_rate": 0.0007628228388738765, + "loss": 1.4041, + "step": 3793 + }, + { + "epoch": 0.34422064960987114, + "grad_norm": 0.4015034795854639, + "learning_rate": 0.0007626978362182069, + "loss": 1.403, + "step": 3794 + }, + { + "epoch": 0.344311377245509, + "grad_norm": 0.16563354161078886, + "learning_rate": 0.0007625728108785404, + "loss": 1.4554, + "step": 3795 + }, + { + "epoch": 0.3444021048811468, + "grad_norm": 0.13586695264806148, + "learning_rate": 0.0007624477628656727, + "loss": 1.4454, + "step": 3796 + }, + { + "epoch": 0.3444928325167846, + "grad_norm": 0.1697072906029104, + "learning_rate": 0.0007623226921904024, + "loss": 1.4396, + "step": 3797 + }, + { + "epoch": 0.3445835601524224, + "grad_norm": 0.14204289348132831, + "learning_rate": 0.0007621975988635288, + "loss": 1.408, + "step": 3798 + }, + { + "epoch": 0.34467428778806025, + "grad_norm": 0.14541051735110289, + "learning_rate": 0.0007620724828958539, + "loss": 1.4105, + "step": 3799 + }, + { + "epoch": 0.3447650154236981, + "grad_norm": 0.14963997462508422, + "learning_rate": 0.0007619473442981815, + "loss": 1.4338, + "step": 3800 + }, + { + "epoch": 0.34485574305933586, + "grad_norm": 0.1579796678192043, + "learning_rate": 0.0007618221830813172, + "loss": 1.4257, + "step": 3801 + }, + { + "epoch": 0.3449464706949737, + "grad_norm": 0.14315042637983563, + "learning_rate": 0.0007616969992560689, + "loss": 1.4602, + "step": 3802 + }, + { + "epoch": 0.3450371983306115, + "grad_norm": 0.13499234079468966, + "learning_rate": 0.0007615717928332462, + "loss": 1.4396, + "step": 3803 + }, + { + "epoch": 0.3451279259662493, + "grad_norm": 0.1461818925504072, + "learning_rate": 0.0007614465638236605, + "loss": 1.448, + "step": 3804 + }, + { + "epoch": 0.34521865360188714, + "grad_norm": 0.14587163771904005, + "learning_rate": 0.0007613213122381256, + "loss": 1.4264, + "step": 3805 + }, + { + "epoch": 0.34530938123752497, + "grad_norm": 0.13786447046657688, + "learning_rate": 0.0007611960380874567, + "loss": 1.4353, + "step": 3806 + }, + { + "epoch": 0.34540010887316275, + "grad_norm": 0.1922022979603246, + "learning_rate": 0.0007610707413824717, + "loss": 1.4256, + "step": 3807 + }, + { + "epoch": 0.3454908365088006, + "grad_norm": 0.14918538615303614, + "learning_rate": 0.0007609454221339895, + "loss": 1.4042, + "step": 3808 + }, + { + "epoch": 
0.3455815641444384, + "grad_norm": 0.14907701906967502, + "learning_rate": 0.0007608200803528318, + "loss": 1.4031, + "step": 3809 + }, + { + "epoch": 0.3456722917800762, + "grad_norm": 0.16969394870388532, + "learning_rate": 0.0007606947160498216, + "loss": 1.4121, + "step": 3810 + }, + { + "epoch": 0.345763019415714, + "grad_norm": 0.1430723711160428, + "learning_rate": 0.000760569329235784, + "loss": 1.4322, + "step": 3811 + }, + { + "epoch": 0.34585374705135186, + "grad_norm": 0.14748909440512747, + "learning_rate": 0.0007604439199215468, + "loss": 1.4618, + "step": 3812 + }, + { + "epoch": 0.34594447468698963, + "grad_norm": 0.14721753037553298, + "learning_rate": 0.0007603184881179386, + "loss": 1.4107, + "step": 3813 + }, + { + "epoch": 0.34603520232262747, + "grad_norm": 0.14167652258200775, + "learning_rate": 0.0007601930338357908, + "loss": 1.4117, + "step": 3814 + }, + { + "epoch": 0.3461259299582653, + "grad_norm": 0.14677743470659277, + "learning_rate": 0.0007600675570859357, + "loss": 1.4326, + "step": 3815 + }, + { + "epoch": 0.3462166575939031, + "grad_norm": 0.15853039391271795, + "learning_rate": 0.000759942057879209, + "loss": 1.4443, + "step": 3816 + }, + { + "epoch": 0.3463073852295409, + "grad_norm": 0.14215588437695167, + "learning_rate": 0.0007598165362264473, + "loss": 1.3902, + "step": 3817 + }, + { + "epoch": 0.34639811286517874, + "grad_norm": 0.1503084571352621, + "learning_rate": 0.0007596909921384894, + "loss": 1.4571, + "step": 3818 + }, + { + "epoch": 0.3464888405008166, + "grad_norm": 0.16015897262473563, + "learning_rate": 0.000759565425626176, + "loss": 1.439, + "step": 3819 + }, + { + "epoch": 0.34657956813645435, + "grad_norm": 0.1494099009481822, + "learning_rate": 0.0007594398367003498, + "loss": 1.4262, + "step": 3820 + }, + { + "epoch": 0.3466702957720922, + "grad_norm": 0.14784403382090217, + "learning_rate": 0.0007593142253718556, + "loss": 1.4354, + "step": 3821 + }, + { + "epoch": 0.34676102340773, + "grad_norm": 0.16236818434120998, + "learning_rate": 0.0007591885916515398, + "loss": 1.4674, + "step": 3822 + }, + { + "epoch": 0.3468517510433678, + "grad_norm": 0.15498778445220798, + "learning_rate": 0.0007590629355502508, + "loss": 1.4603, + "step": 3823 + }, + { + "epoch": 0.34694247867900563, + "grad_norm": 0.1402923221198293, + "learning_rate": 0.0007589372570788393, + "loss": 1.4048, + "step": 3824 + }, + { + "epoch": 0.34703320631464346, + "grad_norm": 0.14594662224044475, + "learning_rate": 0.0007588115562481573, + "loss": 1.4281, + "step": 3825 + }, + { + "epoch": 0.34712393395028124, + "grad_norm": 0.1636537096367189, + "learning_rate": 0.0007586858330690593, + "loss": 1.4728, + "step": 3826 + }, + { + "epoch": 0.34721466158591907, + "grad_norm": 0.2211428015723422, + "learning_rate": 0.0007585600875524016, + "loss": 1.4389, + "step": 3827 + }, + { + "epoch": 0.3473053892215569, + "grad_norm": 0.15136135862959688, + "learning_rate": 0.0007584343197090422, + "loss": 1.4479, + "step": 3828 + }, + { + "epoch": 0.3473961168571947, + "grad_norm": 0.1867395600535395, + "learning_rate": 0.0007583085295498412, + "loss": 1.3927, + "step": 3829 + }, + { + "epoch": 0.3474868444928325, + "grad_norm": 0.1420141672487415, + "learning_rate": 0.0007581827170856605, + "loss": 1.3859, + "step": 3830 + }, + { + "epoch": 0.34757757212847035, + "grad_norm": 0.14561392234352385, + "learning_rate": 0.0007580568823273643, + "loss": 1.4267, + "step": 3831 + }, + { + "epoch": 0.3476682997641081, + "grad_norm": 0.1884853438915526, + "learning_rate": 
0.0007579310252858181, + "loss": 1.4245, + "step": 3832 + }, + { + "epoch": 0.34775902739974596, + "grad_norm": 0.1681441787464079, + "learning_rate": 0.00075780514597189, + "loss": 1.429, + "step": 3833 + }, + { + "epoch": 0.3478497550353838, + "grad_norm": 0.14886746541400522, + "learning_rate": 0.0007576792443964495, + "loss": 1.3933, + "step": 3834 + }, + { + "epoch": 0.34794048267102157, + "grad_norm": 0.16516894539909768, + "learning_rate": 0.0007575533205703682, + "loss": 1.4278, + "step": 3835 + }, + { + "epoch": 0.3480312103066594, + "grad_norm": 0.14541289241049304, + "learning_rate": 0.0007574273745045198, + "loss": 1.4631, + "step": 3836 + }, + { + "epoch": 0.34812193794229723, + "grad_norm": 0.17503629427113904, + "learning_rate": 0.0007573014062097796, + "loss": 1.4278, + "step": 3837 + }, + { + "epoch": 0.34821266557793507, + "grad_norm": 0.16334286935015954, + "learning_rate": 0.0007571754156970252, + "loss": 1.4412, + "step": 3838 + }, + { + "epoch": 0.34830339321357284, + "grad_norm": 0.14661739219827202, + "learning_rate": 0.0007570494029771356, + "loss": 1.4374, + "step": 3839 + }, + { + "epoch": 0.3483941208492107, + "grad_norm": 0.14983020308429787, + "learning_rate": 0.0007569233680609921, + "loss": 1.4041, + "step": 3840 + }, + { + "epoch": 0.3484848484848485, + "grad_norm": 0.15134777495354293, + "learning_rate": 0.0007567973109594781, + "loss": 1.4195, + "step": 3841 + }, + { + "epoch": 0.3485755761204863, + "grad_norm": 0.1407117417682839, + "learning_rate": 0.0007566712316834783, + "loss": 1.4276, + "step": 3842 + }, + { + "epoch": 0.3486663037561241, + "grad_norm": 0.141710641993523, + "learning_rate": 0.00075654513024388, + "loss": 1.4232, + "step": 3843 + }, + { + "epoch": 0.34875703139176195, + "grad_norm": 0.152760601983633, + "learning_rate": 0.0007564190066515717, + "loss": 1.4422, + "step": 3844 + }, + { + "epoch": 0.34884775902739973, + "grad_norm": 0.14303504670828868, + "learning_rate": 0.0007562928609174444, + "loss": 1.4067, + "step": 3845 + }, + { + "epoch": 0.34893848666303756, + "grad_norm": 0.13699247087836633, + "learning_rate": 0.0007561666930523908, + "loss": 1.4237, + "step": 3846 + }, + { + "epoch": 0.3490292142986754, + "grad_norm": 0.16070396889664385, + "learning_rate": 0.0007560405030673055, + "loss": 1.4228, + "step": 3847 + }, + { + "epoch": 0.34911994193431317, + "grad_norm": 0.15219544968079263, + "learning_rate": 0.0007559142909730851, + "loss": 1.4186, + "step": 3848 + }, + { + "epoch": 0.349210669569951, + "grad_norm": 0.17868598097561225, + "learning_rate": 0.000755788056780628, + "loss": 1.4263, + "step": 3849 + }, + { + "epoch": 0.34930139720558884, + "grad_norm": 0.14463033264926253, + "learning_rate": 0.0007556618005008343, + "loss": 1.4476, + "step": 3850 + }, + { + "epoch": 0.3493921248412266, + "grad_norm": 0.15832172271338657, + "learning_rate": 0.0007555355221446066, + "loss": 1.4494, + "step": 3851 + }, + { + "epoch": 0.34948285247686445, + "grad_norm": 0.13981062597967223, + "learning_rate": 0.0007554092217228489, + "loss": 1.4517, + "step": 3852 + }, + { + "epoch": 0.3495735801125023, + "grad_norm": 0.14098340269614815, + "learning_rate": 0.0007552828992464674, + "loss": 1.4068, + "step": 3853 + }, + { + "epoch": 0.34966430774814006, + "grad_norm": 0.14077251689176976, + "learning_rate": 0.0007551565547263694, + "loss": 1.4148, + "step": 3854 + }, + { + "epoch": 0.3497550353837779, + "grad_norm": 0.14521230659267498, + "learning_rate": 0.0007550301881734658, + "loss": 1.4685, + "step": 3855 + }, + { + "epoch": 
0.3498457630194157, + "grad_norm": 0.13606529866989028, + "learning_rate": 0.0007549037995986677, + "loss": 1.4494, + "step": 3856 + }, + { + "epoch": 0.34993649065505356, + "grad_norm": 0.14087296420731132, + "learning_rate": 0.000754777389012889, + "loss": 1.3979, + "step": 3857 + }, + { + "epoch": 0.35002721829069133, + "grad_norm": 0.17727113141102696, + "learning_rate": 0.000754650956427045, + "loss": 1.4474, + "step": 3858 + }, + { + "epoch": 0.35011794592632917, + "grad_norm": 0.14968167265145316, + "learning_rate": 0.0007545245018520535, + "loss": 1.4405, + "step": 3859 + }, + { + "epoch": 0.350208673561967, + "grad_norm": 0.4123293766790038, + "learning_rate": 0.0007543980252988339, + "loss": 1.4145, + "step": 3860 + }, + { + "epoch": 0.3502994011976048, + "grad_norm": 0.15362480172072146, + "learning_rate": 0.0007542715267783072, + "loss": 1.4113, + "step": 3861 + }, + { + "epoch": 0.3503901288332426, + "grad_norm": 0.15972746944565058, + "learning_rate": 0.0007541450063013966, + "loss": 1.4647, + "step": 3862 + }, + { + "epoch": 0.35048085646888044, + "grad_norm": 0.20426612946555867, + "learning_rate": 0.0007540184638790275, + "loss": 1.4365, + "step": 3863 + }, + { + "epoch": 0.3505715841045182, + "grad_norm": 0.14854808306826664, + "learning_rate": 0.0007538918995221263, + "loss": 1.4282, + "step": 3864 + }, + { + "epoch": 0.35066231174015605, + "grad_norm": 0.15021734175108611, + "learning_rate": 0.0007537653132416223, + "loss": 1.438, + "step": 3865 + }, + { + "epoch": 0.3507530393757939, + "grad_norm": 0.15300096354349163, + "learning_rate": 0.0007536387050484461, + "loss": 1.4552, + "step": 3866 + }, + { + "epoch": 0.35084376701143166, + "grad_norm": 0.14570435002471468, + "learning_rate": 0.0007535120749535304, + "loss": 1.4425, + "step": 3867 + }, + { + "epoch": 0.3509344946470695, + "grad_norm": 0.14896661735843603, + "learning_rate": 0.0007533854229678096, + "loss": 1.4309, + "step": 3868 + }, + { + "epoch": 0.3510252222827073, + "grad_norm": 0.2890218655434037, + "learning_rate": 0.0007532587491022203, + "loss": 1.4265, + "step": 3869 + }, + { + "epoch": 0.3511159499183451, + "grad_norm": 0.14459691588805543, + "learning_rate": 0.0007531320533677004, + "loss": 1.4295, + "step": 3870 + }, + { + "epoch": 0.35120667755398294, + "grad_norm": 0.15303377615262562, + "learning_rate": 0.0007530053357751906, + "loss": 1.423, + "step": 3871 + }, + { + "epoch": 0.35129740518962077, + "grad_norm": 0.15836074614024573, + "learning_rate": 0.0007528785963356326, + "loss": 1.4292, + "step": 3872 + }, + { + "epoch": 0.35138813282525855, + "grad_norm": 0.17743742111130031, + "learning_rate": 0.0007527518350599708, + "loss": 1.3888, + "step": 3873 + }, + { + "epoch": 0.3514788604608964, + "grad_norm": 0.1448132664582755, + "learning_rate": 0.0007526250519591506, + "loss": 1.3875, + "step": 3874 + }, + { + "epoch": 0.3515695880965342, + "grad_norm": 0.15113327951041908, + "learning_rate": 0.00075249824704412, + "loss": 1.4576, + "step": 3875 + }, + { + "epoch": 0.35166031573217205, + "grad_norm": 0.18618811790205236, + "learning_rate": 0.0007523714203258286, + "loss": 1.4544, + "step": 3876 + }, + { + "epoch": 0.3517510433678098, + "grad_norm": 0.1474945516193842, + "learning_rate": 0.0007522445718152278, + "loss": 1.3856, + "step": 3877 + }, + { + "epoch": 0.35184177100344766, + "grad_norm": 0.27062498628326115, + "learning_rate": 0.000752117701523271, + "loss": 1.4485, + "step": 3878 + }, + { + "epoch": 0.3519324986390855, + "grad_norm": 0.1538344750531927, + "learning_rate": 
0.0007519908094609135, + "loss": 1.4507, + "step": 3879 + }, + { + "epoch": 0.35202322627472327, + "grad_norm": 0.15404582319508459, + "learning_rate": 0.0007518638956391126, + "loss": 1.4131, + "step": 3880 + }, + { + "epoch": 0.3521139539103611, + "grad_norm": 0.15238623374027582, + "learning_rate": 0.0007517369600688271, + "loss": 1.4276, + "step": 3881 + }, + { + "epoch": 0.35220468154599893, + "grad_norm": 0.2362808619767305, + "learning_rate": 0.000751610002761018, + "loss": 1.4479, + "step": 3882 + }, + { + "epoch": 0.3522954091816367, + "grad_norm": 0.14797464218044834, + "learning_rate": 0.000751483023726648, + "loss": 1.4368, + "step": 3883 + }, + { + "epoch": 0.35238613681727454, + "grad_norm": 0.157530686238241, + "learning_rate": 0.0007513560229766819, + "loss": 1.4217, + "step": 3884 + }, + { + "epoch": 0.3524768644529124, + "grad_norm": 0.16162641212038076, + "learning_rate": 0.0007512290005220861, + "loss": 1.4338, + "step": 3885 + }, + { + "epoch": 0.35256759208855015, + "grad_norm": 0.15906491759122274, + "learning_rate": 0.0007511019563738293, + "loss": 1.4404, + "step": 3886 + }, + { + "epoch": 0.352658319724188, + "grad_norm": 0.15852447879090778, + "learning_rate": 0.0007509748905428815, + "loss": 1.429, + "step": 3887 + }, + { + "epoch": 0.3527490473598258, + "grad_norm": 0.18057422480451016, + "learning_rate": 0.0007508478030402147, + "loss": 1.4024, + "step": 3888 + }, + { + "epoch": 0.3528397749954636, + "grad_norm": 0.22905847804137555, + "learning_rate": 0.0007507206938768032, + "loss": 1.417, + "step": 3889 + }, + { + "epoch": 0.35293050263110143, + "grad_norm": 0.15776367782993136, + "learning_rate": 0.0007505935630636229, + "loss": 1.4498, + "step": 3890 + }, + { + "epoch": 0.35302123026673926, + "grad_norm": 0.1488061133337983, + "learning_rate": 0.0007504664106116515, + "loss": 1.4545, + "step": 3891 + }, + { + "epoch": 0.35311195790237704, + "grad_norm": 0.16596532556889676, + "learning_rate": 0.0007503392365318688, + "loss": 1.4484, + "step": 3892 + }, + { + "epoch": 0.35320268553801487, + "grad_norm": 0.14428407887378525, + "learning_rate": 0.0007502120408352557, + "loss": 1.4144, + "step": 3893 + }, + { + "epoch": 0.3532934131736527, + "grad_norm": 0.1709531019875482, + "learning_rate": 0.0007500848235327964, + "loss": 1.4398, + "step": 3894 + }, + { + "epoch": 0.35338414080929054, + "grad_norm": 0.15534879546806404, + "learning_rate": 0.0007499575846354755, + "loss": 1.4171, + "step": 3895 + }, + { + "epoch": 0.3534748684449283, + "grad_norm": 0.14431151648068, + "learning_rate": 0.0007498303241542805, + "loss": 1.4501, + "step": 3896 + }, + { + "epoch": 0.35356559608056615, + "grad_norm": 0.1467933409942153, + "learning_rate": 0.0007497030421002001, + "loss": 1.4301, + "step": 3897 + }, + { + "epoch": 0.353656323716204, + "grad_norm": 0.21097796795894336, + "learning_rate": 0.0007495757384842251, + "loss": 1.452, + "step": 3898 + }, + { + "epoch": 0.35374705135184176, + "grad_norm": 0.14829718441518722, + "learning_rate": 0.0007494484133173484, + "loss": 1.4345, + "step": 3899 + }, + { + "epoch": 0.3538377789874796, + "grad_norm": 0.14911027227862506, + "learning_rate": 0.0007493210666105646, + "loss": 1.4286, + "step": 3900 + }, + { + "epoch": 0.3539285066231174, + "grad_norm": 0.15622076039504104, + "learning_rate": 0.0007491936983748698, + "loss": 1.4382, + "step": 3901 + }, + { + "epoch": 0.3540192342587552, + "grad_norm": 0.14438052135144464, + "learning_rate": 0.0007490663086212624, + "loss": 1.4068, + "step": 3902 + }, + { + "epoch": 
0.35410996189439303, + "grad_norm": 0.1497656411056315, + "learning_rate": 0.0007489388973607425, + "loss": 1.4311, + "step": 3903 + }, + { + "epoch": 0.35420068953003087, + "grad_norm": 0.15443803379588003, + "learning_rate": 0.0007488114646043121, + "loss": 1.4511, + "step": 3904 + }, + { + "epoch": 0.35429141716566864, + "grad_norm": 0.16076486045292723, + "learning_rate": 0.0007486840103629751, + "loss": 1.3934, + "step": 3905 + }, + { + "epoch": 0.3543821448013065, + "grad_norm": 0.14996144137086911, + "learning_rate": 0.0007485565346477374, + "loss": 1.3986, + "step": 3906 + }, + { + "epoch": 0.3544728724369443, + "grad_norm": 0.1545952077130321, + "learning_rate": 0.0007484290374696061, + "loss": 1.4149, + "step": 3907 + }, + { + "epoch": 0.3545636000725821, + "grad_norm": 0.15259408344812392, + "learning_rate": 0.0007483015188395907, + "loss": 1.4238, + "step": 3908 + }, + { + "epoch": 0.3546543277082199, + "grad_norm": 0.15641996026387828, + "learning_rate": 0.0007481739787687028, + "loss": 1.4253, + "step": 3909 + }, + { + "epoch": 0.35474505534385775, + "grad_norm": 0.1521875338611335, + "learning_rate": 0.0007480464172679549, + "loss": 1.4277, + "step": 3910 + }, + { + "epoch": 0.35483578297949553, + "grad_norm": 0.1490289446573743, + "learning_rate": 0.0007479188343483626, + "loss": 1.389, + "step": 3911 + }, + { + "epoch": 0.35492651061513336, + "grad_norm": 0.15827831869416273, + "learning_rate": 0.0007477912300209424, + "loss": 1.4376, + "step": 3912 + }, + { + "epoch": 0.3550172382507712, + "grad_norm": 0.14050716322067758, + "learning_rate": 0.0007476636042967128, + "loss": 1.443, + "step": 3913 + }, + { + "epoch": 0.355107965886409, + "grad_norm": 0.1402291274110973, + "learning_rate": 0.0007475359571866946, + "loss": 1.4609, + "step": 3914 + }, + { + "epoch": 0.3551986935220468, + "grad_norm": 0.14938206307350774, + "learning_rate": 0.00074740828870191, + "loss": 1.446, + "step": 3915 + }, + { + "epoch": 0.35528942115768464, + "grad_norm": 0.15253070656373718, + "learning_rate": 0.0007472805988533831, + "loss": 1.4022, + "step": 3916 + }, + { + "epoch": 0.35538014879332247, + "grad_norm": 0.16735777630501888, + "learning_rate": 0.0007471528876521402, + "loss": 1.4556, + "step": 3917 + }, + { + "epoch": 0.35547087642896025, + "grad_norm": 0.15145455075073203, + "learning_rate": 0.0007470251551092089, + "loss": 1.3916, + "step": 3918 + }, + { + "epoch": 0.3555616040645981, + "grad_norm": 0.14914754651524215, + "learning_rate": 0.0007468974012356192, + "loss": 1.399, + "step": 3919 + }, + { + "epoch": 0.3556523317002359, + "grad_norm": 0.1538436539106682, + "learning_rate": 0.0007467696260424024, + "loss": 1.406, + "step": 3920 + }, + { + "epoch": 0.3557430593358737, + "grad_norm": 0.14195980549383935, + "learning_rate": 0.000746641829540592, + "loss": 1.4322, + "step": 3921 + }, + { + "epoch": 0.3558337869715115, + "grad_norm": 0.14329104660764136, + "learning_rate": 0.0007465140117412233, + "loss": 1.4322, + "step": 3922 + }, + { + "epoch": 0.35592451460714936, + "grad_norm": 0.26880155492281754, + "learning_rate": 0.0007463861726553334, + "loss": 1.3968, + "step": 3923 + }, + { + "epoch": 0.35601524224278713, + "grad_norm": 0.16870721181669884, + "learning_rate": 0.0007462583122939612, + "loss": 1.4724, + "step": 3924 + }, + { + "epoch": 0.35610596987842497, + "grad_norm": 0.2270999751779172, + "learning_rate": 0.0007461304306681472, + "loss": 1.4149, + "step": 3925 + }, + { + "epoch": 0.3561966975140628, + "grad_norm": 0.5237968281878967, + "learning_rate": 
0.0007460025277889345, + "loss": 1.4092, + "step": 3926 + }, + { + "epoch": 0.3562874251497006, + "grad_norm": 0.13358477223711426, + "learning_rate": 0.0007458746036673672, + "loss": 1.3807, + "step": 3927 + }, + { + "epoch": 0.3563781527853384, + "grad_norm": 0.2074770978109366, + "learning_rate": 0.0007457466583144915, + "loss": 1.4397, + "step": 3928 + }, + { + "epoch": 0.35646888042097624, + "grad_norm": 0.15093625002858244, + "learning_rate": 0.0007456186917413559, + "loss": 1.4134, + "step": 3929 + }, + { + "epoch": 0.356559608056614, + "grad_norm": 0.14276346064945514, + "learning_rate": 0.0007454907039590098, + "loss": 1.4369, + "step": 3930 + }, + { + "epoch": 0.35665033569225185, + "grad_norm": 0.8652968013522939, + "learning_rate": 0.0007453626949785055, + "loss": 1.4295, + "step": 3931 + }, + { + "epoch": 0.3567410633278897, + "grad_norm": 0.1875246253556635, + "learning_rate": 0.0007452346648108961, + "loss": 1.4522, + "step": 3932 + }, + { + "epoch": 0.3568317909635275, + "grad_norm": 0.16497880744859536, + "learning_rate": 0.0007451066134672373, + "loss": 1.3945, + "step": 3933 + }, + { + "epoch": 0.3569225185991653, + "grad_norm": 0.1421456939992672, + "learning_rate": 0.0007449785409585863, + "loss": 1.4333, + "step": 3934 + }, + { + "epoch": 0.3570132462348031, + "grad_norm": 0.1484072171175206, + "learning_rate": 0.0007448504472960022, + "loss": 1.4938, + "step": 3935 + }, + { + "epoch": 0.35710397387044096, + "grad_norm": 0.14358788019523452, + "learning_rate": 0.0007447223324905459, + "loss": 1.4256, + "step": 3936 + }, + { + "epoch": 0.35719470150607874, + "grad_norm": 0.1465112742517844, + "learning_rate": 0.0007445941965532801, + "loss": 1.4401, + "step": 3937 + }, + { + "epoch": 0.35728542914171657, + "grad_norm": 0.14840168359191105, + "learning_rate": 0.0007444660394952694, + "loss": 1.4559, + "step": 3938 + }, + { + "epoch": 0.3573761567773544, + "grad_norm": 0.13918955211477224, + "learning_rate": 0.0007443378613275803, + "loss": 1.4348, + "step": 3939 + }, + { + "epoch": 0.3574668844129922, + "grad_norm": 0.14356512633518828, + "learning_rate": 0.0007442096620612806, + "loss": 1.4099, + "step": 3940 + }, + { + "epoch": 0.35755761204863, + "grad_norm": 0.14612248503401287, + "learning_rate": 0.0007440814417074408, + "loss": 1.4394, + "step": 3941 + }, + { + "epoch": 0.35764833968426785, + "grad_norm": 0.14627817017574957, + "learning_rate": 0.0007439532002771324, + "loss": 1.4456, + "step": 3942 + }, + { + "epoch": 0.3577390673199056, + "grad_norm": 0.14162026188712437, + "learning_rate": 0.0007438249377814293, + "loss": 1.4523, + "step": 3943 + }, + { + "epoch": 0.35782979495554346, + "grad_norm": 0.1402633897805565, + "learning_rate": 0.0007436966542314068, + "loss": 1.406, + "step": 3944 + }, + { + "epoch": 0.3579205225911813, + "grad_norm": 0.14140133545943884, + "learning_rate": 0.0007435683496381422, + "loss": 1.413, + "step": 3945 + }, + { + "epoch": 0.35801125022681907, + "grad_norm": 0.14247740181134164, + "learning_rate": 0.0007434400240127149, + "loss": 1.4291, + "step": 3946 + }, + { + "epoch": 0.3581019778624569, + "grad_norm": 0.14094646910009787, + "learning_rate": 0.0007433116773662056, + "loss": 1.4411, + "step": 3947 + }, + { + "epoch": 0.35819270549809473, + "grad_norm": 0.14972691248674333, + "learning_rate": 0.000743183309709697, + "loss": 1.4074, + "step": 3948 + }, + { + "epoch": 0.3582834331337325, + "grad_norm": 0.18802502932355955, + "learning_rate": 0.0007430549210542739, + "loss": 1.4179, + "step": 3949 + }, + { + "epoch": 
0.35837416076937034, + "grad_norm": 0.16350236291478243, + "learning_rate": 0.0007429265114110225, + "loss": 1.4164, + "step": 3950 + }, + { + "epoch": 0.3584648884050082, + "grad_norm": 0.15235356115614979, + "learning_rate": 0.000742798080791031, + "loss": 1.4398, + "step": 3951 + }, + { + "epoch": 0.358555616040646, + "grad_norm": 0.1574809561698526, + "learning_rate": 0.0007426696292053893, + "loss": 1.4331, + "step": 3952 + }, + { + "epoch": 0.3586463436762838, + "grad_norm": 0.20459236541312578, + "learning_rate": 0.0007425411566651897, + "loss": 1.4476, + "step": 3953 + }, + { + "epoch": 0.3587370713119216, + "grad_norm": 0.15845076247405934, + "learning_rate": 0.0007424126631815253, + "loss": 1.4477, + "step": 3954 + }, + { + "epoch": 0.35882779894755945, + "grad_norm": 0.15924538151427067, + "learning_rate": 0.0007422841487654916, + "loss": 1.4141, + "step": 3955 + }, + { + "epoch": 0.35891852658319723, + "grad_norm": 0.16872588572316113, + "learning_rate": 0.000742155613428186, + "loss": 1.4002, + "step": 3956 + }, + { + "epoch": 0.35900925421883506, + "grad_norm": 0.15890249031843248, + "learning_rate": 0.0007420270571807076, + "loss": 1.4074, + "step": 3957 + }, + { + "epoch": 0.3590999818544729, + "grad_norm": 0.1521804353122068, + "learning_rate": 0.0007418984800341571, + "loss": 1.4577, + "step": 3958 + }, + { + "epoch": 0.35919070949011067, + "grad_norm": 0.1763972292347373, + "learning_rate": 0.0007417698819996371, + "loss": 1.4627, + "step": 3959 + }, + { + "epoch": 0.3592814371257485, + "grad_norm": 0.14970798884507155, + "learning_rate": 0.0007416412630882523, + "loss": 1.3949, + "step": 3960 + }, + { + "epoch": 0.35937216476138634, + "grad_norm": 0.15118137735133122, + "learning_rate": 0.0007415126233111087, + "loss": 1.4252, + "step": 3961 + }, + { + "epoch": 0.3594628923970241, + "grad_norm": 0.15155332805494676, + "learning_rate": 0.0007413839626793145, + "loss": 1.4056, + "step": 3962 + }, + { + "epoch": 0.35955362003266195, + "grad_norm": 0.1622407381448003, + "learning_rate": 0.0007412552812039797, + "loss": 1.4309, + "step": 3963 + }, + { + "epoch": 0.3596443476682998, + "grad_norm": 0.17364379619401502, + "learning_rate": 0.0007411265788962156, + "loss": 1.4043, + "step": 3964 + }, + { + "epoch": 0.35973507530393756, + "grad_norm": 0.13635198561327044, + "learning_rate": 0.0007409978557671359, + "loss": 1.4202, + "step": 3965 + }, + { + "epoch": 0.3598258029395754, + "grad_norm": 0.1487396772448617, + "learning_rate": 0.000740869111827856, + "loss": 1.4148, + "step": 3966 + }, + { + "epoch": 0.3599165305752132, + "grad_norm": 0.1427323813842541, + "learning_rate": 0.0007407403470894926, + "loss": 1.4357, + "step": 3967 + }, + { + "epoch": 0.360007258210851, + "grad_norm": 0.14227221647372773, + "learning_rate": 0.0007406115615631649, + "loss": 1.4041, + "step": 3968 + }, + { + "epoch": 0.36009798584648883, + "grad_norm": 0.1357627873080866, + "learning_rate": 0.0007404827552599933, + "loss": 1.4325, + "step": 3969 + }, + { + "epoch": 0.36018871348212667, + "grad_norm": 0.1453615304770718, + "learning_rate": 0.0007403539281911003, + "loss": 1.41, + "step": 3970 + }, + { + "epoch": 0.3602794411177645, + "grad_norm": 0.13933244093256983, + "learning_rate": 0.0007402250803676103, + "loss": 1.427, + "step": 3971 + }, + { + "epoch": 0.3603701687534023, + "grad_norm": 0.1465289259054894, + "learning_rate": 0.0007400962118006492, + "loss": 1.3991, + "step": 3972 + }, + { + "epoch": 0.3604608963890401, + "grad_norm": 0.14381106255338322, + "learning_rate": 
0.0007399673225013448, + "loss": 1.4474, + "step": 3973 + }, + { + "epoch": 0.36055162402467794, + "grad_norm": 0.14128130791312152, + "learning_rate": 0.0007398384124808267, + "loss": 1.4261, + "step": 3974 + }, + { + "epoch": 0.3606423516603157, + "grad_norm": 0.1370302411630563, + "learning_rate": 0.0007397094817502263, + "loss": 1.417, + "step": 3975 + }, + { + "epoch": 0.36073307929595355, + "grad_norm": 0.1354097620611273, + "learning_rate": 0.0007395805303206768, + "loss": 1.451, + "step": 3976 + }, + { + "epoch": 0.3608238069315914, + "grad_norm": 0.20369560206703247, + "learning_rate": 0.0007394515582033132, + "loss": 1.4324, + "step": 3977 + }, + { + "epoch": 0.36091453456722916, + "grad_norm": 0.1486522498626087, + "learning_rate": 0.0007393225654092724, + "loss": 1.4052, + "step": 3978 + }, + { + "epoch": 0.361005262202867, + "grad_norm": 0.15348295931970798, + "learning_rate": 0.0007391935519496926, + "loss": 1.4252, + "step": 3979 + }, + { + "epoch": 0.3610959898385048, + "grad_norm": 0.15081788505058027, + "learning_rate": 0.0007390645178357146, + "loss": 1.4187, + "step": 3980 + }, + { + "epoch": 0.3611867174741426, + "grad_norm": 0.15644230898177292, + "learning_rate": 0.00073893546307848, + "loss": 1.4265, + "step": 3981 + }, + { + "epoch": 0.36127744510978044, + "grad_norm": 0.14501182181853095, + "learning_rate": 0.0007388063876891331, + "loss": 1.4613, + "step": 3982 + }, + { + "epoch": 0.36136817274541827, + "grad_norm": 0.15514978944971508, + "learning_rate": 0.0007386772916788193, + "loss": 1.4301, + "step": 3983 + }, + { + "epoch": 0.36145890038105605, + "grad_norm": 0.15859228751086962, + "learning_rate": 0.0007385481750586863, + "loss": 1.4503, + "step": 3984 + }, + { + "epoch": 0.3615496280166939, + "grad_norm": 0.1378417012234728, + "learning_rate": 0.0007384190378398833, + "loss": 1.4175, + "step": 3985 + }, + { + "epoch": 0.3616403556523317, + "grad_norm": 0.15438011780101404, + "learning_rate": 0.0007382898800335612, + "loss": 1.4363, + "step": 3986 + }, + { + "epoch": 0.3617310832879695, + "grad_norm": 0.14797634178347246, + "learning_rate": 0.0007381607016508727, + "loss": 1.4324, + "step": 3987 + }, + { + "epoch": 0.3618218109236073, + "grad_norm": 0.1626288485603936, + "learning_rate": 0.0007380315027029725, + "loss": 1.4655, + "step": 3988 + }, + { + "epoch": 0.36191253855924516, + "grad_norm": 0.13853446159625774, + "learning_rate": 0.000737902283201017, + "loss": 1.4204, + "step": 3989 + }, + { + "epoch": 0.362003266194883, + "grad_norm": 0.14349340165912688, + "learning_rate": 0.0007377730431561645, + "loss": 1.4377, + "step": 3990 + }, + { + "epoch": 0.36209399383052077, + "grad_norm": 0.14271949496386538, + "learning_rate": 0.0007376437825795743, + "loss": 1.4299, + "step": 3991 + }, + { + "epoch": 0.3621847214661586, + "grad_norm": 0.13754404310926446, + "learning_rate": 0.0007375145014824089, + "loss": 1.4298, + "step": 3992 + }, + { + "epoch": 0.36227544910179643, + "grad_norm": 0.13791329658020782, + "learning_rate": 0.000737385199875831, + "loss": 1.4222, + "step": 3993 + }, + { + "epoch": 0.3623661767374342, + "grad_norm": 0.15506037118819868, + "learning_rate": 0.0007372558777710061, + "loss": 1.4467, + "step": 3994 + }, + { + "epoch": 0.36245690437307204, + "grad_norm": 0.14784202431491653, + "learning_rate": 0.0007371265351791012, + "loss": 1.4362, + "step": 3995 + }, + { + "epoch": 0.3625476320087099, + "grad_norm": 0.15836711854863983, + "learning_rate": 0.000736997172111285, + "loss": 1.4195, + "step": 3996 + }, + { + "epoch": 
0.36263835964434765, + "grad_norm": 0.14166026784114452, + "learning_rate": 0.0007368677885787282, + "loss": 1.4043, + "step": 3997 + }, + { + "epoch": 0.3627290872799855, + "grad_norm": 0.13832820946007954, + "learning_rate": 0.0007367383845926028, + "loss": 1.4536, + "step": 3998 + }, + { + "epoch": 0.3628198149156233, + "grad_norm": 0.15571853019306306, + "learning_rate": 0.0007366089601640831, + "loss": 1.4359, + "step": 3999 + }, + { + "epoch": 0.3629105425512611, + "grad_norm": 0.1580950760559624, + "learning_rate": 0.0007364795153043448, + "loss": 1.4303, + "step": 4000 + }, + { + "epoch": 0.36300127018689893, + "grad_norm": 0.1390971263729551, + "learning_rate": 0.0007363500500245655, + "loss": 1.4366, + "step": 4001 + }, + { + "epoch": 0.36309199782253676, + "grad_norm": 0.1363669897369307, + "learning_rate": 0.0007362205643359246, + "loss": 1.4135, + "step": 4002 + }, + { + "epoch": 0.36318272545817454, + "grad_norm": 0.1312499715748854, + "learning_rate": 0.000736091058249603, + "loss": 1.4088, + "step": 4003 + }, + { + "epoch": 0.36327345309381237, + "grad_norm": 0.1568361971787701, + "learning_rate": 0.0007359615317767839, + "loss": 1.5019, + "step": 4004 + }, + { + "epoch": 0.3633641807294502, + "grad_norm": 0.1378687269799347, + "learning_rate": 0.0007358319849286517, + "loss": 1.4211, + "step": 4005 + }, + { + "epoch": 0.363454908365088, + "grad_norm": 0.13780449871968165, + "learning_rate": 0.0007357024177163927, + "loss": 1.4281, + "step": 4006 + }, + { + "epoch": 0.3635456360007258, + "grad_norm": 0.14730407457476202, + "learning_rate": 0.0007355728301511955, + "loss": 1.4437, + "step": 4007 + }, + { + "epoch": 0.36363636363636365, + "grad_norm": 0.13734116078559033, + "learning_rate": 0.0007354432222442494, + "loss": 1.4191, + "step": 4008 + }, + { + "epoch": 0.3637270912720015, + "grad_norm": 0.1379892558406871, + "learning_rate": 0.0007353135940067465, + "loss": 1.419, + "step": 4009 + }, + { + "epoch": 0.36381781890763926, + "grad_norm": 0.1395653101334405, + "learning_rate": 0.00073518394544988, + "loss": 1.4139, + "step": 4010 + }, + { + "epoch": 0.3639085465432771, + "grad_norm": 0.14752107671074835, + "learning_rate": 0.0007350542765848452, + "loss": 1.4462, + "step": 4011 + }, + { + "epoch": 0.3639992741789149, + "grad_norm": 0.13278204711356267, + "learning_rate": 0.000734924587422839, + "loss": 1.42, + "step": 4012 + }, + { + "epoch": 0.3640900018145527, + "grad_norm": 0.1356371411899509, + "learning_rate": 0.0007347948779750599, + "loss": 1.4502, + "step": 4013 + }, + { + "epoch": 0.36418072945019053, + "grad_norm": 0.13759264204790073, + "learning_rate": 0.0007346651482527087, + "loss": 1.432, + "step": 4014 + }, + { + "epoch": 0.36427145708582837, + "grad_norm": 0.14650126079445203, + "learning_rate": 0.0007345353982669871, + "loss": 1.4138, + "step": 4015 + }, + { + "epoch": 0.36436218472146614, + "grad_norm": 0.13394250740236252, + "learning_rate": 0.0007344056280290995, + "loss": 1.3991, + "step": 4016 + }, + { + "epoch": 0.364452912357104, + "grad_norm": 0.15168089449329142, + "learning_rate": 0.0007342758375502514, + "loss": 1.4016, + "step": 4017 + }, + { + "epoch": 0.3645436399927418, + "grad_norm": 0.13741678119105732, + "learning_rate": 0.00073414602684165, + "loss": 1.431, + "step": 4018 + }, + { + "epoch": 0.3646343676283796, + "grad_norm": 0.14555627484971745, + "learning_rate": 0.0007340161959145049, + "loss": 1.4233, + "step": 4019 + }, + { + "epoch": 0.3647250952640174, + "grad_norm": 0.13679994405547896, + "learning_rate": 
0.0007338863447800267, + "loss": 1.424, + "step": 4020 + }, + { + "epoch": 0.36481582289965525, + "grad_norm": 0.16275392231219163, + "learning_rate": 0.0007337564734494281, + "loss": 1.4418, + "step": 4021 + }, + { + "epoch": 0.36490655053529303, + "grad_norm": 0.13755006113944937, + "learning_rate": 0.0007336265819339236, + "loss": 1.419, + "step": 4022 + }, + { + "epoch": 0.36499727817093086, + "grad_norm": 0.14012264470224667, + "learning_rate": 0.0007334966702447292, + "loss": 1.4268, + "step": 4023 + }, + { + "epoch": 0.3650880058065687, + "grad_norm": 0.14138703425354487, + "learning_rate": 0.0007333667383930632, + "loss": 1.4207, + "step": 4024 + }, + { + "epoch": 0.36517873344220647, + "grad_norm": 0.12979550869066017, + "learning_rate": 0.0007332367863901448, + "loss": 1.4254, + "step": 4025 + }, + { + "epoch": 0.3652694610778443, + "grad_norm": 0.13646289689183494, + "learning_rate": 0.0007331068142471955, + "loss": 1.3995, + "step": 4026 + }, + { + "epoch": 0.36536018871348214, + "grad_norm": 0.16173277788880117, + "learning_rate": 0.0007329768219754383, + "loss": 1.4185, + "step": 4027 + }, + { + "epoch": 0.3654509163491199, + "grad_norm": 0.1409580883539928, + "learning_rate": 0.0007328468095860984, + "loss": 1.4097, + "step": 4028 + }, + { + "epoch": 0.36554164398475775, + "grad_norm": 0.15234370921070137, + "learning_rate": 0.0007327167770904022, + "loss": 1.4358, + "step": 4029 + }, + { + "epoch": 0.3656323716203956, + "grad_norm": 0.16134454309333424, + "learning_rate": 0.0007325867244995776, + "loss": 1.4247, + "step": 4030 + }, + { + "epoch": 0.3657230992560334, + "grad_norm": 0.15361339157673626, + "learning_rate": 0.0007324566518248555, + "loss": 1.4263, + "step": 4031 + }, + { + "epoch": 0.3658138268916712, + "grad_norm": 0.13946543303486691, + "learning_rate": 0.0007323265590774671, + "loss": 1.4182, + "step": 4032 + }, + { + "epoch": 0.365904554527309, + "grad_norm": 0.1486927000436527, + "learning_rate": 0.0007321964462686461, + "loss": 1.3704, + "step": 4033 + }, + { + "epoch": 0.36599528216294686, + "grad_norm": 0.14067120231342894, + "learning_rate": 0.0007320663134096278, + "loss": 1.4107, + "step": 4034 + }, + { + "epoch": 0.36608600979858463, + "grad_norm": 0.14838315863078272, + "learning_rate": 0.000731936160511649, + "loss": 1.3773, + "step": 4035 + }, + { + "epoch": 0.36617673743422247, + "grad_norm": 0.14243214592613285, + "learning_rate": 0.0007318059875859487, + "loss": 1.4271, + "step": 4036 + }, + { + "epoch": 0.3662674650698603, + "grad_norm": 0.1399869844596584, + "learning_rate": 0.000731675794643767, + "loss": 1.4657, + "step": 4037 + }, + { + "epoch": 0.3663581927054981, + "grad_norm": 0.14836994832023562, + "learning_rate": 0.0007315455816963465, + "loss": 1.3944, + "step": 4038 + }, + { + "epoch": 0.3664489203411359, + "grad_norm": 0.1487785613409995, + "learning_rate": 0.0007314153487549308, + "loss": 1.4373, + "step": 4039 + }, + { + "epoch": 0.36653964797677374, + "grad_norm": 0.13509781280018981, + "learning_rate": 0.0007312850958307656, + "loss": 1.4243, + "step": 4040 + }, + { + "epoch": 0.3666303756124115, + "grad_norm": 0.18530592358999295, + "learning_rate": 0.0007311548229350982, + "loss": 1.4494, + "step": 4041 + }, + { + "epoch": 0.36672110324804935, + "grad_norm": 0.15235328475306473, + "learning_rate": 0.000731024530079178, + "loss": 1.4338, + "step": 4042 + }, + { + "epoch": 0.3668118308836872, + "grad_norm": 0.1578132049572913, + "learning_rate": 0.0007308942172742554, + "loss": 1.4311, + "step": 4043 + }, + { + "epoch": 
0.36690255851932496, + "grad_norm": 0.14131151821981627, + "learning_rate": 0.0007307638845315832, + "loss": 1.3835, + "step": 4044 + }, + { + "epoch": 0.3669932861549628, + "grad_norm": 0.14908271534717957, + "learning_rate": 0.0007306335318624155, + "loss": 1.4141, + "step": 4045 + }, + { + "epoch": 0.3670840137906006, + "grad_norm": 0.14262393993081393, + "learning_rate": 0.0007305031592780085, + "loss": 1.4673, + "step": 4046 + }, + { + "epoch": 0.3671747414262384, + "grad_norm": 0.1434588610916839, + "learning_rate": 0.0007303727667896195, + "loss": 1.383, + "step": 4047 + }, + { + "epoch": 0.36726546906187624, + "grad_norm": 0.15651985485893485, + "learning_rate": 0.0007302423544085083, + "loss": 1.4346, + "step": 4048 + }, + { + "epoch": 0.36735619669751407, + "grad_norm": 0.15202306489921052, + "learning_rate": 0.0007301119221459357, + "loss": 1.4134, + "step": 4049 + }, + { + "epoch": 0.3674469243331519, + "grad_norm": 0.15159290873083076, + "learning_rate": 0.0007299814700131649, + "loss": 1.418, + "step": 4050 + }, + { + "epoch": 0.3675376519687897, + "grad_norm": 0.14114686734908508, + "learning_rate": 0.0007298509980214602, + "loss": 1.4213, + "step": 4051 + }, + { + "epoch": 0.3676283796044275, + "grad_norm": 0.21631795235221155, + "learning_rate": 0.0007297205061820879, + "loss": 1.3793, + "step": 4052 + }, + { + "epoch": 0.36771910724006535, + "grad_norm": 0.15345339156209314, + "learning_rate": 0.0007295899945063161, + "loss": 1.4644, + "step": 4053 + }, + { + "epoch": 0.3678098348757031, + "grad_norm": 0.13591742979488172, + "learning_rate": 0.0007294594630054142, + "loss": 1.405, + "step": 4054 + }, + { + "epoch": 0.36790056251134096, + "grad_norm": 0.1500612791026988, + "learning_rate": 0.0007293289116906541, + "loss": 1.419, + "step": 4055 + }, + { + "epoch": 0.3679912901469788, + "grad_norm": 0.14640468151350708, + "learning_rate": 0.0007291983405733087, + "loss": 1.4401, + "step": 4056 + }, + { + "epoch": 0.36808201778261657, + "grad_norm": 0.1442174377234265, + "learning_rate": 0.0007290677496646525, + "loss": 1.433, + "step": 4057 + }, + { + "epoch": 0.3681727454182544, + "grad_norm": 0.16184536735992414, + "learning_rate": 0.0007289371389759627, + "loss": 1.3807, + "step": 4058 + }, + { + "epoch": 0.36826347305389223, + "grad_norm": 0.13601909091451914, + "learning_rate": 0.000728806508518517, + "loss": 1.4209, + "step": 4059 + }, + { + "epoch": 0.36835420068953, + "grad_norm": 0.14611121066167154, + "learning_rate": 0.0007286758583035958, + "loss": 1.3873, + "step": 4060 + }, + { + "epoch": 0.36844492832516784, + "grad_norm": 0.1452516712489142, + "learning_rate": 0.0007285451883424803, + "loss": 1.447, + "step": 4061 + }, + { + "epoch": 0.3685356559608057, + "grad_norm": 0.14948802535392744, + "learning_rate": 0.0007284144986464541, + "loss": 1.4174, + "step": 4062 + }, + { + "epoch": 0.36862638359644345, + "grad_norm": 0.1339807439711549, + "learning_rate": 0.0007282837892268025, + "loss": 1.4233, + "step": 4063 + }, + { + "epoch": 0.3687171112320813, + "grad_norm": 0.13860102800982643, + "learning_rate": 0.0007281530600948119, + "loss": 1.4266, + "step": 4064 + }, + { + "epoch": 0.3688078388677191, + "grad_norm": 0.13535513182557896, + "learning_rate": 0.0007280223112617709, + "loss": 1.4568, + "step": 4065 + }, + { + "epoch": 0.3688985665033569, + "grad_norm": 0.14766393385272822, + "learning_rate": 0.0007278915427389697, + "loss": 1.4264, + "step": 4066 + }, + { + "epoch": 0.36898929413899473, + "grad_norm": 0.13725879317764678, + "learning_rate": 
0.0007277607545377003, + "loss": 1.4377, + "step": 4067 + }, + { + "epoch": 0.36908002177463256, + "grad_norm": 0.14313546537057978, + "learning_rate": 0.0007276299466692562, + "loss": 1.4365, + "step": 4068 + }, + { + "epoch": 0.3691707494102704, + "grad_norm": 0.14037565653830258, + "learning_rate": 0.0007274991191449325, + "loss": 1.4106, + "step": 4069 + }, + { + "epoch": 0.36926147704590817, + "grad_norm": 0.1347663649921093, + "learning_rate": 0.0007273682719760265, + "loss": 1.4294, + "step": 4070 + }, + { + "epoch": 0.369352204681546, + "grad_norm": 0.14234353942778485, + "learning_rate": 0.0007272374051738366, + "loss": 1.4356, + "step": 4071 + }, + { + "epoch": 0.36944293231718384, + "grad_norm": 0.15208968138607512, + "learning_rate": 0.0007271065187496633, + "loss": 1.415, + "step": 4072 + }, + { + "epoch": 0.3695336599528216, + "grad_norm": 0.14121945979441014, + "learning_rate": 0.0007269756127148086, + "loss": 1.4312, + "step": 4073 + }, + { + "epoch": 0.36962438758845945, + "grad_norm": 0.13139939520487276, + "learning_rate": 0.0007268446870805764, + "loss": 1.4566, + "step": 4074 + }, + { + "epoch": 0.3697151152240973, + "grad_norm": 0.15138876465350673, + "learning_rate": 0.000726713741858272, + "loss": 1.4317, + "step": 4075 + }, + { + "epoch": 0.36980584285973506, + "grad_norm": 0.1384008134570833, + "learning_rate": 0.0007265827770592024, + "loss": 1.4114, + "step": 4076 + }, + { + "epoch": 0.3698965704953729, + "grad_norm": 0.1572207404453003, + "learning_rate": 0.0007264517926946769, + "loss": 1.4204, + "step": 4077 + }, + { + "epoch": 0.3699872981310107, + "grad_norm": 0.1404877158959666, + "learning_rate": 0.0007263207887760055, + "loss": 1.3901, + "step": 4078 + }, + { + "epoch": 0.3700780257666485, + "grad_norm": 0.16725393681053233, + "learning_rate": 0.0007261897653145006, + "loss": 1.4091, + "step": 4079 + }, + { + "epoch": 0.37016875340228633, + "grad_norm": 0.13814180022661726, + "learning_rate": 0.0007260587223214763, + "loss": 1.3803, + "step": 4080 + }, + { + "epoch": 0.37025948103792417, + "grad_norm": 0.15550519459976206, + "learning_rate": 0.0007259276598082479, + "loss": 1.435, + "step": 4081 + }, + { + "epoch": 0.37035020867356194, + "grad_norm": 0.1670598662319181, + "learning_rate": 0.0007257965777861329, + "loss": 1.4301, + "step": 4082 + }, + { + "epoch": 0.3704409363091998, + "grad_norm": 0.14156576070272553, + "learning_rate": 0.0007256654762664501, + "loss": 1.4528, + "step": 4083 + }, + { + "epoch": 0.3705316639448376, + "grad_norm": 0.14222113761065272, + "learning_rate": 0.00072553435526052, + "loss": 1.4332, + "step": 4084 + }, + { + "epoch": 0.3706223915804754, + "grad_norm": 0.1444108153715793, + "learning_rate": 0.0007254032147796652, + "loss": 1.4435, + "step": 4085 + }, + { + "epoch": 0.3707131192161132, + "grad_norm": 0.15746209947384576, + "learning_rate": 0.0007252720548352096, + "loss": 1.4075, + "step": 4086 + }, + { + "epoch": 0.37080384685175105, + "grad_norm": 0.14243639561622048, + "learning_rate": 0.0007251408754384789, + "loss": 1.4145, + "step": 4087 + }, + { + "epoch": 0.3708945744873889, + "grad_norm": 0.15430076861839842, + "learning_rate": 0.0007250096766008003, + "loss": 1.4622, + "step": 4088 + }, + { + "epoch": 0.37098530212302666, + "grad_norm": 0.16063188692880753, + "learning_rate": 0.000724878458333503, + "loss": 1.4474, + "step": 4089 + }, + { + "epoch": 0.3710760297586645, + "grad_norm": 0.13529796919679238, + "learning_rate": 0.0007247472206479178, + "loss": 1.4211, + "step": 4090 + }, + { + "epoch": 
0.3711667573943023, + "grad_norm": 0.17704035745788407, + "learning_rate": 0.0007246159635553768, + "loss": 1.4494, + "step": 4091 + }, + { + "epoch": 0.3712574850299401, + "grad_norm": 0.16989423250488325, + "learning_rate": 0.0007244846870672144, + "loss": 1.4595, + "step": 4092 + }, + { + "epoch": 0.37134821266557794, + "grad_norm": 0.15625428929061602, + "learning_rate": 0.0007243533911947661, + "loss": 1.4104, + "step": 4093 + }, + { + "epoch": 0.37143894030121577, + "grad_norm": 0.16171253585504533, + "learning_rate": 0.0007242220759493694, + "loss": 1.4392, + "step": 4094 + }, + { + "epoch": 0.37152966793685355, + "grad_norm": 0.15102233649627547, + "learning_rate": 0.0007240907413423634, + "loss": 1.4508, + "step": 4095 + }, + { + "epoch": 0.3716203955724914, + "grad_norm": 0.14691187340794964, + "learning_rate": 0.0007239593873850888, + "loss": 1.4014, + "step": 4096 + }, + { + "epoch": 0.3717111232081292, + "grad_norm": 0.17073605945099665, + "learning_rate": 0.0007238280140888882, + "loss": 1.4143, + "step": 4097 + }, + { + "epoch": 0.371801850843767, + "grad_norm": 0.1595799220773272, + "learning_rate": 0.0007236966214651055, + "loss": 1.4286, + "step": 4098 + }, + { + "epoch": 0.3718925784794048, + "grad_norm": 0.14442045605763118, + "learning_rate": 0.0007235652095250866, + "loss": 1.4102, + "step": 4099 + }, + { + "epoch": 0.37198330611504266, + "grad_norm": 0.1609537785286632, + "learning_rate": 0.000723433778280179, + "loss": 1.4329, + "step": 4100 + }, + { + "epoch": 0.37207403375068043, + "grad_norm": 0.16155881942558137, + "learning_rate": 0.0007233023277417316, + "loss": 1.4284, + "step": 4101 + }, + { + "epoch": 0.37216476138631827, + "grad_norm": 0.15200870142101663, + "learning_rate": 0.0007231708579210954, + "loss": 1.4343, + "step": 4102 + }, + { + "epoch": 0.3722554890219561, + "grad_norm": 0.1692498782501048, + "learning_rate": 0.0007230393688296226, + "loss": 1.4404, + "step": 4103 + }, + { + "epoch": 0.3723462166575939, + "grad_norm": 0.16670144782117935, + "learning_rate": 0.0007229078604786675, + "loss": 1.3778, + "step": 4104 + }, + { + "epoch": 0.3724369442932317, + "grad_norm": 0.15103507903953192, + "learning_rate": 0.0007227763328795858, + "loss": 1.421, + "step": 4105 + }, + { + "epoch": 0.37252767192886954, + "grad_norm": 0.15035855527017086, + "learning_rate": 0.0007226447860437348, + "loss": 1.4654, + "step": 4106 + }, + { + "epoch": 0.3726183995645074, + "grad_norm": 0.1577444399166555, + "learning_rate": 0.000722513219982474, + "loss": 1.4162, + "step": 4107 + }, + { + "epoch": 0.37270912720014515, + "grad_norm": 0.18599316851612435, + "learning_rate": 0.0007223816347071635, + "loss": 1.4646, + "step": 4108 + }, + { + "epoch": 0.372799854835783, + "grad_norm": 0.1725132207463424, + "learning_rate": 0.0007222500302291663, + "loss": 1.3996, + "step": 4109 + }, + { + "epoch": 0.3728905824714208, + "grad_norm": 0.17776730130647989, + "learning_rate": 0.0007221184065598462, + "loss": 1.3985, + "step": 4110 + }, + { + "epoch": 0.3729813101070586, + "grad_norm": 0.2108237585535805, + "learning_rate": 0.0007219867637105689, + "loss": 1.4448, + "step": 4111 + }, + { + "epoch": 0.3730720377426964, + "grad_norm": 0.15138280734177506, + "learning_rate": 0.0007218551016927019, + "loss": 1.403, + "step": 4112 + }, + { + "epoch": 0.37316276537833426, + "grad_norm": 0.14350328863458645, + "learning_rate": 0.0007217234205176141, + "loss": 1.433, + "step": 4113 + }, + { + "epoch": 0.37325349301397204, + "grad_norm": 0.15761926709192536, + "learning_rate": 
0.0007215917201966763, + "loss": 1.4051, + "step": 4114 + }, + { + "epoch": 0.37334422064960987, + "grad_norm": 0.1520960232873895, + "learning_rate": 0.0007214600007412607, + "loss": 1.4304, + "step": 4115 + }, + { + "epoch": 0.3734349482852477, + "grad_norm": 0.14669009658029955, + "learning_rate": 0.0007213282621627415, + "loss": 1.4395, + "step": 4116 + }, + { + "epoch": 0.3735256759208855, + "grad_norm": 0.14866403436170836, + "learning_rate": 0.0007211965044724943, + "loss": 1.4339, + "step": 4117 + }, + { + "epoch": 0.3736164035565233, + "grad_norm": 0.1385903890193161, + "learning_rate": 0.0007210647276818963, + "loss": 1.4324, + "step": 4118 + }, + { + "epoch": 0.37370713119216115, + "grad_norm": 0.1352716143789953, + "learning_rate": 0.0007209329318023264, + "loss": 1.4487, + "step": 4119 + }, + { + "epoch": 0.3737978588277989, + "grad_norm": 0.14078622286731585, + "learning_rate": 0.0007208011168451654, + "loss": 1.4396, + "step": 4120 + }, + { + "epoch": 0.37388858646343676, + "grad_norm": 0.13581090008899835, + "learning_rate": 0.0007206692828217952, + "loss": 1.4348, + "step": 4121 + }, + { + "epoch": 0.3739793140990746, + "grad_norm": 0.14614691586479941, + "learning_rate": 0.0007205374297436001, + "loss": 1.4566, + "step": 4122 + }, + { + "epoch": 0.37407004173471237, + "grad_norm": 0.18827378862437077, + "learning_rate": 0.0007204055576219653, + "loss": 1.4238, + "step": 4123 + }, + { + "epoch": 0.3741607693703502, + "grad_norm": 0.13809532634100086, + "learning_rate": 0.0007202736664682783, + "loss": 1.4607, + "step": 4124 + }, + { + "epoch": 0.37425149700598803, + "grad_norm": 0.13777098207917726, + "learning_rate": 0.0007201417562939276, + "loss": 1.4256, + "step": 4125 + }, + { + "epoch": 0.37434222464162586, + "grad_norm": 0.1449592763208898, + "learning_rate": 0.0007200098271103039, + "loss": 1.4174, + "step": 4126 + }, + { + "epoch": 0.37443295227726364, + "grad_norm": 0.16905008783243292, + "learning_rate": 0.000719877878928799, + "loss": 1.4556, + "step": 4127 + }, + { + "epoch": 0.3745236799129015, + "grad_norm": 0.172764731715276, + "learning_rate": 0.0007197459117608071, + "loss": 1.4585, + "step": 4128 + }, + { + "epoch": 0.3746144075485393, + "grad_norm": 0.1464653027360009, + "learning_rate": 0.0007196139256177233, + "loss": 1.4022, + "step": 4129 + }, + { + "epoch": 0.3747051351841771, + "grad_norm": 0.15118380101222254, + "learning_rate": 0.0007194819205109446, + "loss": 1.4314, + "step": 4130 + }, + { + "epoch": 0.3747958628198149, + "grad_norm": 0.14951786022039498, + "learning_rate": 0.0007193498964518696, + "loss": 1.3954, + "step": 4131 + }, + { + "epoch": 0.37488659045545275, + "grad_norm": 0.1333621738218547, + "learning_rate": 0.0007192178534518989, + "loss": 1.4364, + "step": 4132 + }, + { + "epoch": 0.37497731809109053, + "grad_norm": 0.15072830004637458, + "learning_rate": 0.000719085791522434, + "loss": 1.4524, + "step": 4133 + }, + { + "epoch": 0.37506804572672836, + "grad_norm": 0.1484915380054989, + "learning_rate": 0.000718953710674879, + "loss": 1.4582, + "step": 4134 + }, + { + "epoch": 0.3751587733623662, + "grad_norm": 0.15610857852080368, + "learning_rate": 0.0007188216109206385, + "loss": 1.4168, + "step": 4135 + }, + { + "epoch": 0.37524950099800397, + "grad_norm": 0.1377760006551461, + "learning_rate": 0.00071868949227112, + "loss": 1.4637, + "step": 4136 + }, + { + "epoch": 0.3753402286336418, + "grad_norm": 0.16703820210249645, + "learning_rate": 0.0007185573547377315, + "loss": 1.3837, + "step": 4137 + }, + { + "epoch": 
0.37543095626927964, + "grad_norm": 0.14566721977401687, + "learning_rate": 0.000718425198331883, + "loss": 1.3919, + "step": 4138 + }, + { + "epoch": 0.3755216839049174, + "grad_norm": 0.14095425350953006, + "learning_rate": 0.0007182930230649866, + "loss": 1.4289, + "step": 4139 + }, + { + "epoch": 0.37561241154055525, + "grad_norm": 0.15019599627702332, + "learning_rate": 0.0007181608289484554, + "loss": 1.4066, + "step": 4140 + }, + { + "epoch": 0.3757031391761931, + "grad_norm": 0.14733285743375749, + "learning_rate": 0.0007180286159937046, + "loss": 1.4087, + "step": 4141 + }, + { + "epoch": 0.37579386681183086, + "grad_norm": 0.13599997297064675, + "learning_rate": 0.0007178963842121507, + "loss": 1.3831, + "step": 4142 + }, + { + "epoch": 0.3758845944474687, + "grad_norm": 0.1398840328687718, + "learning_rate": 0.0007177641336152116, + "loss": 1.4171, + "step": 4143 + }, + { + "epoch": 0.3759753220831065, + "grad_norm": 0.1402776684047314, + "learning_rate": 0.0007176318642143077, + "loss": 1.3912, + "step": 4144 + }, + { + "epoch": 0.37606604971874436, + "grad_norm": 0.1379004323482483, + "learning_rate": 0.0007174995760208603, + "loss": 1.4202, + "step": 4145 + }, + { + "epoch": 0.37615677735438213, + "grad_norm": 0.13762065950565822, + "learning_rate": 0.0007173672690462924, + "loss": 1.413, + "step": 4146 + }, + { + "epoch": 0.37624750499001997, + "grad_norm": 0.15276941553827694, + "learning_rate": 0.0007172349433020287, + "loss": 1.4304, + "step": 4147 + }, + { + "epoch": 0.3763382326256578, + "grad_norm": 0.1430813424498947, + "learning_rate": 0.0007171025987994957, + "loss": 1.4699, + "step": 4148 + }, + { + "epoch": 0.3764289602612956, + "grad_norm": 0.3758154739942178, + "learning_rate": 0.0007169702355501213, + "loss": 1.4431, + "step": 4149 + }, + { + "epoch": 0.3765196878969334, + "grad_norm": 0.16722541613429423, + "learning_rate": 0.0007168378535653351, + "loss": 1.4013, + "step": 4150 + }, + { + "epoch": 0.37661041553257124, + "grad_norm": 0.1333521156837667, + "learning_rate": 0.0007167054528565682, + "loss": 1.4246, + "step": 4151 + }, + { + "epoch": 0.376701143168209, + "grad_norm": 0.12910186868090232, + "learning_rate": 0.0007165730334352535, + "loss": 1.4391, + "step": 4152 + }, + { + "epoch": 0.37679187080384685, + "grad_norm": 0.14111227827857695, + "learning_rate": 0.0007164405953128256, + "loss": 1.4154, + "step": 4153 + }, + { + "epoch": 0.3768825984394847, + "grad_norm": 0.14054082981880678, + "learning_rate": 0.0007163081385007201, + "loss": 1.4172, + "step": 4154 + }, + { + "epoch": 0.37697332607512246, + "grad_norm": 0.1352563264508343, + "learning_rate": 0.0007161756630103753, + "loss": 1.411, + "step": 4155 + }, + { + "epoch": 0.3770640537107603, + "grad_norm": 0.19973126080669915, + "learning_rate": 0.0007160431688532301, + "loss": 1.4754, + "step": 4156 + }, + { + "epoch": 0.3771547813463981, + "grad_norm": 0.12933359060406885, + "learning_rate": 0.0007159106560407252, + "loss": 1.4675, + "step": 4157 + }, + { + "epoch": 0.3772455089820359, + "grad_norm": 0.13897548159727033, + "learning_rate": 0.0007157781245843035, + "loss": 1.4613, + "step": 4158 + }, + { + "epoch": 0.37733623661767374, + "grad_norm": 0.12898972595178937, + "learning_rate": 0.0007156455744954088, + "loss": 1.4098, + "step": 4159 + }, + { + "epoch": 0.37742696425331157, + "grad_norm": 0.14498273552212013, + "learning_rate": 0.0007155130057854871, + "loss": 1.4431, + "step": 4160 + }, + { + "epoch": 0.37751769188894935, + "grad_norm": 0.13395793094094127, + "learning_rate": 
0.0007153804184659855, + "loss": 1.4159, + "step": 4161 + }, + { + "epoch": 0.3776084195245872, + "grad_norm": 0.14100421079836206, + "learning_rate": 0.0007152478125483531, + "loss": 1.4269, + "step": 4162 + }, + { + "epoch": 0.377699147160225, + "grad_norm": 0.13056098895650511, + "learning_rate": 0.0007151151880440403, + "loss": 1.4305, + "step": 4163 + }, + { + "epoch": 0.37778987479586285, + "grad_norm": 0.1317087702945751, + "learning_rate": 0.0007149825449644993, + "loss": 1.4136, + "step": 4164 + }, + { + "epoch": 0.3778806024315006, + "grad_norm": 0.12773461918548326, + "learning_rate": 0.0007148498833211838, + "loss": 1.4196, + "step": 4165 + }, + { + "epoch": 0.37797133006713846, + "grad_norm": 0.13542827405877733, + "learning_rate": 0.0007147172031255493, + "loss": 1.4338, + "step": 4166 + }, + { + "epoch": 0.3780620577027763, + "grad_norm": 0.13561358275096283, + "learning_rate": 0.0007145845043890528, + "loss": 1.4357, + "step": 4167 + }, + { + "epoch": 0.37815278533841407, + "grad_norm": 0.13188936840482787, + "learning_rate": 0.0007144517871231526, + "loss": 1.3902, + "step": 4168 + }, + { + "epoch": 0.3782435129740519, + "grad_norm": 0.14323020093285482, + "learning_rate": 0.0007143190513393089, + "loss": 1.4126, + "step": 4169 + }, + { + "epoch": 0.37833424060968973, + "grad_norm": 0.14830743442426975, + "learning_rate": 0.0007141862970489836, + "loss": 1.448, + "step": 4170 + }, + { + "epoch": 0.3784249682453275, + "grad_norm": 0.13744222531661607, + "learning_rate": 0.0007140535242636399, + "loss": 1.4545, + "step": 4171 + }, + { + "epoch": 0.37851569588096534, + "grad_norm": 0.12402099148382491, + "learning_rate": 0.000713920732994743, + "loss": 1.4347, + "step": 4172 + }, + { + "epoch": 0.3786064235166032, + "grad_norm": 0.1371488118341702, + "learning_rate": 0.0007137879232537592, + "loss": 1.4438, + "step": 4173 + }, + { + "epoch": 0.37869715115224095, + "grad_norm": 0.14442720598728437, + "learning_rate": 0.0007136550950521566, + "loss": 1.3983, + "step": 4174 + }, + { + "epoch": 0.3787878787878788, + "grad_norm": 0.1339308966752494, + "learning_rate": 0.0007135222484014052, + "loss": 1.4506, + "step": 4175 + }, + { + "epoch": 0.3788786064235166, + "grad_norm": 0.14133415393418117, + "learning_rate": 0.0007133893833129761, + "loss": 1.4574, + "step": 4176 + }, + { + "epoch": 0.3789693340591544, + "grad_norm": 0.1310744169928404, + "learning_rate": 0.0007132564997983423, + "loss": 1.4294, + "step": 4177 + }, + { + "epoch": 0.3790600616947922, + "grad_norm": 0.1377848281603666, + "learning_rate": 0.0007131235978689783, + "loss": 1.4113, + "step": 4178 + }, + { + "epoch": 0.37915078933043006, + "grad_norm": 0.13882433695070187, + "learning_rate": 0.0007129906775363603, + "loss": 1.4709, + "step": 4179 + }, + { + "epoch": 0.37924151696606784, + "grad_norm": 0.13346163752966783, + "learning_rate": 0.0007128577388119659, + "loss": 1.4223, + "step": 4180 + }, + { + "epoch": 0.37933224460170567, + "grad_norm": 0.14510131042763807, + "learning_rate": 0.0007127247817072743, + "loss": 1.4173, + "step": 4181 + }, + { + "epoch": 0.3794229722373435, + "grad_norm": 0.13786178281221545, + "learning_rate": 0.0007125918062337665, + "loss": 1.4208, + "step": 4182 + }, + { + "epoch": 0.37951369987298134, + "grad_norm": 0.13676825436030138, + "learning_rate": 0.0007124588124029249, + "loss": 1.4178, + "step": 4183 + }, + { + "epoch": 0.3796044275086191, + "grad_norm": 0.1306823265114464, + "learning_rate": 0.0007123258002262333, + "loss": 1.4126, + "step": 4184 + }, + { + "epoch": 
0.37969515514425695, + "grad_norm": 0.1403593007106075, + "learning_rate": 0.000712192769715178, + "loss": 1.443, + "step": 4185 + }, + { + "epoch": 0.3797858827798948, + "grad_norm": 0.13790051248320362, + "learning_rate": 0.0007120597208812453, + "loss": 1.4264, + "step": 4186 + }, + { + "epoch": 0.37987661041553256, + "grad_norm": 0.14047257684333025, + "learning_rate": 0.0007119266537359249, + "loss": 1.4133, + "step": 4187 + }, + { + "epoch": 0.3799673380511704, + "grad_norm": 0.13109678548245046, + "learning_rate": 0.0007117935682907064, + "loss": 1.436, + "step": 4188 + }, + { + "epoch": 0.3800580656868082, + "grad_norm": 0.149473154162648, + "learning_rate": 0.0007116604645570822, + "loss": 1.4428, + "step": 4189 + }, + { + "epoch": 0.380148793322446, + "grad_norm": 0.13536718664547057, + "learning_rate": 0.0007115273425465456, + "loss": 1.4083, + "step": 4190 + }, + { + "epoch": 0.38023952095808383, + "grad_norm": 0.14241083122359569, + "learning_rate": 0.0007113942022705919, + "loss": 1.4379, + "step": 4191 + }, + { + "epoch": 0.38033024859372166, + "grad_norm": 0.14890310732807602, + "learning_rate": 0.0007112610437407177, + "loss": 1.3959, + "step": 4192 + }, + { + "epoch": 0.38042097622935944, + "grad_norm": 0.14106934675418675, + "learning_rate": 0.000711127866968421, + "loss": 1.4347, + "step": 4193 + }, + { + "epoch": 0.3805117038649973, + "grad_norm": 0.12741860579368286, + "learning_rate": 0.0007109946719652021, + "loss": 1.4384, + "step": 4194 + }, + { + "epoch": 0.3806024315006351, + "grad_norm": 0.14800036258911747, + "learning_rate": 0.0007108614587425622, + "loss": 1.4487, + "step": 4195 + }, + { + "epoch": 0.3806931591362729, + "grad_norm": 0.13260701044161025, + "learning_rate": 0.0007107282273120042, + "loss": 1.4197, + "step": 4196 + }, + { + "epoch": 0.3807838867719107, + "grad_norm": 0.14716340136155254, + "learning_rate": 0.0007105949776850326, + "loss": 1.4278, + "step": 4197 + }, + { + "epoch": 0.38087461440754855, + "grad_norm": 0.13507574288198662, + "learning_rate": 0.0007104617098731537, + "loss": 1.4794, + "step": 4198 + }, + { + "epoch": 0.38096534204318633, + "grad_norm": 0.1348695419210795, + "learning_rate": 0.0007103284238878751, + "loss": 1.3865, + "step": 4199 + }, + { + "epoch": 0.38105606967882416, + "grad_norm": 0.14216897917908985, + "learning_rate": 0.0007101951197407061, + "loss": 1.4085, + "step": 4200 + }, + { + "epoch": 0.381146797314462, + "grad_norm": 0.14803742593172958, + "learning_rate": 0.0007100617974431576, + "loss": 1.457, + "step": 4201 + }, + { + "epoch": 0.3812375249500998, + "grad_norm": 0.16848302809600135, + "learning_rate": 0.000709928457006742, + "loss": 1.4375, + "step": 4202 + }, + { + "epoch": 0.3813282525857376, + "grad_norm": 0.1441228470006403, + "learning_rate": 0.0007097950984429731, + "loss": 1.3856, + "step": 4203 + }, + { + "epoch": 0.38141898022137544, + "grad_norm": 0.14737712565261893, + "learning_rate": 0.0007096617217633665, + "loss": 1.4179, + "step": 4204 + }, + { + "epoch": 0.38150970785701327, + "grad_norm": 0.13840101714512212, + "learning_rate": 0.0007095283269794394, + "loss": 1.429, + "step": 4205 + }, + { + "epoch": 0.38160043549265105, + "grad_norm": 0.14578890486712542, + "learning_rate": 0.0007093949141027103, + "loss": 1.4006, + "step": 4206 + }, + { + "epoch": 0.3816911631282889, + "grad_norm": 0.14281038980857516, + "learning_rate": 0.0007092614831446996, + "loss": 1.4248, + "step": 4207 + }, + { + "epoch": 0.3817818907639267, + "grad_norm": 0.13723191019515904, + "learning_rate": 
0.000709128034116929, + "loss": 1.429, + "step": 4208 + }, + { + "epoch": 0.3818726183995645, + "grad_norm": 0.15544240127985903, + "learning_rate": 0.0007089945670309217, + "loss": 1.3742, + "step": 4209 + }, + { + "epoch": 0.3819633460352023, + "grad_norm": 0.13765452107432835, + "learning_rate": 0.0007088610818982027, + "loss": 1.4574, + "step": 4210 + }, + { + "epoch": 0.38205407367084016, + "grad_norm": 0.14981381160314475, + "learning_rate": 0.0007087275787302987, + "loss": 1.4254, + "step": 4211 + }, + { + "epoch": 0.38214480130647793, + "grad_norm": 0.13868521436052, + "learning_rate": 0.0007085940575387376, + "loss": 1.3984, + "step": 4212 + }, + { + "epoch": 0.38223552894211577, + "grad_norm": 0.14032807612934134, + "learning_rate": 0.0007084605183350484, + "loss": 1.4137, + "step": 4213 + }, + { + "epoch": 0.3823262565777536, + "grad_norm": 0.1687935294394334, + "learning_rate": 0.0007083269611307632, + "loss": 1.4329, + "step": 4214 + }, + { + "epoch": 0.3824169842133914, + "grad_norm": 0.14080628329048078, + "learning_rate": 0.000708193385937414, + "loss": 1.4185, + "step": 4215 + }, + { + "epoch": 0.3825077118490292, + "grad_norm": 0.1422215281402274, + "learning_rate": 0.0007080597927665352, + "loss": 1.4606, + "step": 4216 + }, + { + "epoch": 0.38259843948466704, + "grad_norm": 0.29710307300880423, + "learning_rate": 0.0007079261816296627, + "loss": 1.4383, + "step": 4217 + }, + { + "epoch": 0.3826891671203048, + "grad_norm": 0.13457070231403437, + "learning_rate": 0.0007077925525383337, + "loss": 1.4221, + "step": 4218 + }, + { + "epoch": 0.38277989475594265, + "grad_norm": 0.14041535290936816, + "learning_rate": 0.0007076589055040874, + "loss": 1.4205, + "step": 4219 + }, + { + "epoch": 0.3828706223915805, + "grad_norm": 0.15932292154089778, + "learning_rate": 0.0007075252405384638, + "loss": 1.4591, + "step": 4220 + }, + { + "epoch": 0.3829613500272183, + "grad_norm": 0.15452038826937553, + "learning_rate": 0.0007073915576530051, + "loss": 1.4558, + "step": 4221 + }, + { + "epoch": 0.3830520776628561, + "grad_norm": 0.15315374704393295, + "learning_rate": 0.0007072578568592549, + "loss": 1.4428, + "step": 4222 + }, + { + "epoch": 0.3831428052984939, + "grad_norm": 0.13762955711086142, + "learning_rate": 0.0007071241381687581, + "loss": 1.3909, + "step": 4223 + }, + { + "epoch": 0.38323353293413176, + "grad_norm": 0.143155234610127, + "learning_rate": 0.0007069904015930617, + "loss": 1.414, + "step": 4224 + }, + { + "epoch": 0.38332426056976954, + "grad_norm": 0.14433715178490936, + "learning_rate": 0.0007068566471437132, + "loss": 1.3748, + "step": 4225 + }, + { + "epoch": 0.38341498820540737, + "grad_norm": 0.20979793854215412, + "learning_rate": 0.0007067228748322631, + "loss": 1.4416, + "step": 4226 + }, + { + "epoch": 0.3835057158410452, + "grad_norm": 0.14104111122270646, + "learning_rate": 0.0007065890846702621, + "loss": 1.4232, + "step": 4227 + }, + { + "epoch": 0.383596443476683, + "grad_norm": 0.18044475791868803, + "learning_rate": 0.0007064552766692632, + "loss": 1.4463, + "step": 4228 + }, + { + "epoch": 0.3836871711123208, + "grad_norm": 0.13742257658868778, + "learning_rate": 0.0007063214508408208, + "loss": 1.3867, + "step": 4229 + }, + { + "epoch": 0.38377789874795865, + "grad_norm": 0.1450695611837243, + "learning_rate": 0.0007061876071964905, + "loss": 1.4197, + "step": 4230 + }, + { + "epoch": 0.3838686263835964, + "grad_norm": 0.1673319486844832, + "learning_rate": 0.0007060537457478303, + "loss": 1.4108, + "step": 4231 + }, + { + "epoch": 
0.38395935401923426, + "grad_norm": 0.16713596939937603, + "learning_rate": 0.0007059198665063985, + "loss": 1.4269, + "step": 4232 + }, + { + "epoch": 0.3840500816548721, + "grad_norm": 0.1533439746514492, + "learning_rate": 0.000705785969483756, + "loss": 1.4314, + "step": 4233 + }, + { + "epoch": 0.38414080929050987, + "grad_norm": 0.1518589014282512, + "learning_rate": 0.0007056520546914647, + "loss": 1.4266, + "step": 4234 + }, + { + "epoch": 0.3842315369261477, + "grad_norm": 0.15532746484648285, + "learning_rate": 0.0007055181221410881, + "loss": 1.4306, + "step": 4235 + }, + { + "epoch": 0.38432226456178553, + "grad_norm": 0.15698143132987247, + "learning_rate": 0.0007053841718441914, + "loss": 1.4062, + "step": 4236 + }, + { + "epoch": 0.3844129921974233, + "grad_norm": 0.20443070858456033, + "learning_rate": 0.0007052502038123412, + "loss": 1.4399, + "step": 4237 + }, + { + "epoch": 0.38450371983306114, + "grad_norm": 0.15654758528288867, + "learning_rate": 0.0007051162180571056, + "loss": 1.4243, + "step": 4238 + }, + { + "epoch": 0.384594447468699, + "grad_norm": 0.15162104904645948, + "learning_rate": 0.0007049822145900545, + "loss": 1.437, + "step": 4239 + }, + { + "epoch": 0.3846851751043368, + "grad_norm": 0.15279374362621836, + "learning_rate": 0.0007048481934227587, + "loss": 1.4246, + "step": 4240 + }, + { + "epoch": 0.3847759027399746, + "grad_norm": 0.1581448635415576, + "learning_rate": 0.0007047141545667915, + "loss": 1.4164, + "step": 4241 + }, + { + "epoch": 0.3848666303756124, + "grad_norm": 0.18293524910162426, + "learning_rate": 0.0007045800980337267, + "loss": 1.4184, + "step": 4242 + }, + { + "epoch": 0.38495735801125025, + "grad_norm": 0.16405900670773962, + "learning_rate": 0.0007044460238351403, + "loss": 1.4203, + "step": 4243 + }, + { + "epoch": 0.385048085646888, + "grad_norm": 0.15296740261030214, + "learning_rate": 0.0007043119319826097, + "loss": 1.4275, + "step": 4244 + }, + { + "epoch": 0.38513881328252586, + "grad_norm": 0.173300094976548, + "learning_rate": 0.0007041778224877135, + "loss": 1.3958, + "step": 4245 + }, + { + "epoch": 0.3852295409181637, + "grad_norm": 0.1489471801187758, + "learning_rate": 0.0007040436953620324, + "loss": 1.3997, + "step": 4246 + }, + { + "epoch": 0.38532026855380147, + "grad_norm": 0.1697243936387421, + "learning_rate": 0.000703909550617148, + "loss": 1.3839, + "step": 4247 + }, + { + "epoch": 0.3854109961894393, + "grad_norm": 0.2004364728068382, + "learning_rate": 0.000703775388264644, + "loss": 1.4759, + "step": 4248 + }, + { + "epoch": 0.38550172382507714, + "grad_norm": 0.149381763547734, + "learning_rate": 0.000703641208316105, + "loss": 1.4192, + "step": 4249 + }, + { + "epoch": 0.3855924514607149, + "grad_norm": 0.16287799642865294, + "learning_rate": 0.0007035070107831177, + "loss": 1.4212, + "step": 4250 + }, + { + "epoch": 0.38568317909635275, + "grad_norm": 0.1542061856219318, + "learning_rate": 0.0007033727956772702, + "loss": 1.4106, + "step": 4251 + }, + { + "epoch": 0.3857739067319906, + "grad_norm": 0.14752575205614415, + "learning_rate": 0.0007032385630101516, + "loss": 1.4314, + "step": 4252 + }, + { + "epoch": 0.38586463436762836, + "grad_norm": 0.16352618396944266, + "learning_rate": 0.0007031043127933533, + "loss": 1.4356, + "step": 4253 + }, + { + "epoch": 0.3859553620032662, + "grad_norm": 0.16369762510021785, + "learning_rate": 0.0007029700450384676, + "loss": 1.4514, + "step": 4254 + }, + { + "epoch": 0.386046089638904, + "grad_norm": 0.18419681650358777, + "learning_rate": 
0.0007028357597570885, + "loss": 1.4318, + "step": 4255 + }, + { + "epoch": 0.3861368172745418, + "grad_norm": 0.154829418009125, + "learning_rate": 0.0007027014569608117, + "loss": 1.4167, + "step": 4256 + }, + { + "epoch": 0.38622754491017963, + "grad_norm": 0.1561917737085542, + "learning_rate": 0.0007025671366612343, + "loss": 1.4285, + "step": 4257 + }, + { + "epoch": 0.38631827254581746, + "grad_norm": 0.14417718355376521, + "learning_rate": 0.0007024327988699548, + "loss": 1.3842, + "step": 4258 + }, + { + "epoch": 0.3864090001814553, + "grad_norm": 0.1471783873616724, + "learning_rate": 0.0007022984435985731, + "loss": 1.4181, + "step": 4259 + }, + { + "epoch": 0.3864997278170931, + "grad_norm": 0.15898260393729222, + "learning_rate": 0.0007021640708586911, + "loss": 1.4157, + "step": 4260 + }, + { + "epoch": 0.3865904554527309, + "grad_norm": 0.16925719883457113, + "learning_rate": 0.0007020296806619118, + "loss": 1.4119, + "step": 4261 + }, + { + "epoch": 0.38668118308836874, + "grad_norm": 0.16215432742529204, + "learning_rate": 0.0007018952730198398, + "loss": 1.4149, + "step": 4262 + }, + { + "epoch": 0.3867719107240065, + "grad_norm": 0.16586955677673343, + "learning_rate": 0.0007017608479440812, + "loss": 1.4727, + "step": 4263 + }, + { + "epoch": 0.38686263835964435, + "grad_norm": 0.16197324621994427, + "learning_rate": 0.0007016264054462435, + "loss": 1.4361, + "step": 4264 + }, + { + "epoch": 0.3869533659952822, + "grad_norm": 0.22476937512445533, + "learning_rate": 0.0007014919455379363, + "loss": 1.4105, + "step": 4265 + }, + { + "epoch": 0.38704409363091996, + "grad_norm": 0.155495215642636, + "learning_rate": 0.0007013574682307697, + "loss": 1.4559, + "step": 4266 + }, + { + "epoch": 0.3871348212665578, + "grad_norm": 0.16818144564880005, + "learning_rate": 0.000701222973536356, + "loss": 1.4473, + "step": 4267 + }, + { + "epoch": 0.3872255489021956, + "grad_norm": 0.142240995424903, + "learning_rate": 0.0007010884614663088, + "loss": 1.3882, + "step": 4268 + }, + { + "epoch": 0.3873162765378334, + "grad_norm": 0.2356092282734483, + "learning_rate": 0.0007009539320322435, + "loss": 1.3932, + "step": 4269 + }, + { + "epoch": 0.38740700417347124, + "grad_norm": 0.13807787663532034, + "learning_rate": 0.0007008193852457767, + "loss": 1.4227, + "step": 4270 + }, + { + "epoch": 0.38749773180910907, + "grad_norm": 0.14033694612112496, + "learning_rate": 0.0007006848211185261, + "loss": 1.4651, + "step": 4271 + }, + { + "epoch": 0.38758845944474685, + "grad_norm": 0.14116694729046664, + "learning_rate": 0.0007005502396621116, + "loss": 1.4186, + "step": 4272 + }, + { + "epoch": 0.3876791870803847, + "grad_norm": 0.13658664914944302, + "learning_rate": 0.0007004156408881545, + "loss": 1.4139, + "step": 4273 + }, + { + "epoch": 0.3877699147160225, + "grad_norm": 0.14910121661319542, + "learning_rate": 0.0007002810248082773, + "loss": 1.4139, + "step": 4274 + }, + { + "epoch": 0.3878606423516603, + "grad_norm": 0.14422890073062317, + "learning_rate": 0.000700146391434104, + "loss": 1.4238, + "step": 4275 + }, + { + "epoch": 0.3879513699872981, + "grad_norm": 0.1390275471851676, + "learning_rate": 0.0007000117407772602, + "loss": 1.3834, + "step": 4276 + }, + { + "epoch": 0.38804209762293596, + "grad_norm": 0.13829169527471477, + "learning_rate": 0.0006998770728493734, + "loss": 1.4106, + "step": 4277 + }, + { + "epoch": 0.3881328252585738, + "grad_norm": 0.16020836831312554, + "learning_rate": 0.0006997423876620717, + "loss": 1.4452, + "step": 4278 + }, + { + "epoch": 
0.38822355289421157, + "grad_norm": 0.14650822814667006, + "learning_rate": 0.0006996076852269853, + "loss": 1.4015, + "step": 4279 + }, + { + "epoch": 0.3883142805298494, + "grad_norm": 0.23963794436548777, + "learning_rate": 0.0006994729655557462, + "loss": 1.4359, + "step": 4280 + }, + { + "epoch": 0.38840500816548723, + "grad_norm": 0.16047343016666, + "learning_rate": 0.0006993382286599868, + "loss": 1.4581, + "step": 4281 + }, + { + "epoch": 0.388495735801125, + "grad_norm": 0.14905343031012847, + "learning_rate": 0.000699203474551342, + "loss": 1.3975, + "step": 4282 + }, + { + "epoch": 0.38858646343676284, + "grad_norm": 0.1566651879515466, + "learning_rate": 0.0006990687032414479, + "loss": 1.4713, + "step": 4283 + }, + { + "epoch": 0.3886771910724007, + "grad_norm": 0.13841539305145525, + "learning_rate": 0.0006989339147419419, + "loss": 1.3658, + "step": 4284 + }, + { + "epoch": 0.38876791870803845, + "grad_norm": 0.1456219538683773, + "learning_rate": 0.0006987991090644632, + "loss": 1.3992, + "step": 4285 + }, + { + "epoch": 0.3888586463436763, + "grad_norm": 0.15232644238396933, + "learning_rate": 0.0006986642862206519, + "loss": 1.4302, + "step": 4286 + }, + { + "epoch": 0.3889493739793141, + "grad_norm": 0.16589066237466868, + "learning_rate": 0.0006985294462221503, + "loss": 1.3792, + "step": 4287 + }, + { + "epoch": 0.3890401016149519, + "grad_norm": 0.16618470504488775, + "learning_rate": 0.0006983945890806018, + "loss": 1.4051, + "step": 4288 + }, + { + "epoch": 0.3891308292505897, + "grad_norm": 0.14250766433626127, + "learning_rate": 0.0006982597148076511, + "loss": 1.4521, + "step": 4289 + }, + { + "epoch": 0.38922155688622756, + "grad_norm": 0.13819545872278988, + "learning_rate": 0.0006981248234149451, + "loss": 1.4169, + "step": 4290 + }, + { + "epoch": 0.38931228452186534, + "grad_norm": 0.13831539106823104, + "learning_rate": 0.0006979899149141311, + "loss": 1.3997, + "step": 4291 + }, + { + "epoch": 0.38940301215750317, + "grad_norm": 0.13875706414874958, + "learning_rate": 0.000697854989316859, + "loss": 1.4481, + "step": 4292 + }, + { + "epoch": 0.389493739793141, + "grad_norm": 0.14882919887280202, + "learning_rate": 0.0006977200466347794, + "loss": 1.4277, + "step": 4293 + }, + { + "epoch": 0.3895844674287788, + "grad_norm": 0.14022910852565343, + "learning_rate": 0.0006975850868795446, + "loss": 1.4204, + "step": 4294 + }, + { + "epoch": 0.3896751950644166, + "grad_norm": 0.14251845509979952, + "learning_rate": 0.0006974501100628083, + "loss": 1.474, + "step": 4295 + }, + { + "epoch": 0.38976592270005445, + "grad_norm": 0.1425078915695786, + "learning_rate": 0.000697315116196226, + "loss": 1.4424, + "step": 4296 + }, + { + "epoch": 0.3898566503356923, + "grad_norm": 0.15162206906560108, + "learning_rate": 0.0006971801052914544, + "loss": 1.4336, + "step": 4297 + }, + { + "epoch": 0.38994737797133006, + "grad_norm": 0.14354281846615463, + "learning_rate": 0.0006970450773601516, + "loss": 1.4231, + "step": 4298 + }, + { + "epoch": 0.3900381056069679, + "grad_norm": 0.13877601149521387, + "learning_rate": 0.0006969100324139772, + "loss": 1.4489, + "step": 4299 + }, + { + "epoch": 0.3901288332426057, + "grad_norm": 0.1425782235863104, + "learning_rate": 0.0006967749704645926, + "loss": 1.4274, + "step": 4300 + }, + { + "epoch": 0.3902195608782435, + "grad_norm": 0.13821088449486318, + "learning_rate": 0.0006966398915236602, + "loss": 1.4437, + "step": 4301 + }, + { + "epoch": 0.39031028851388133, + "grad_norm": 0.1466985042822969, + "learning_rate": 
0.0006965047956028444, + "loss": 1.4228, + "step": 4302 + }, + { + "epoch": 0.39040101614951916, + "grad_norm": 0.13522224521775375, + "learning_rate": 0.0006963696827138102, + "loss": 1.4344, + "step": 4303 + }, + { + "epoch": 0.39049174378515694, + "grad_norm": 0.16387830788037114, + "learning_rate": 0.0006962345528682254, + "loss": 1.4326, + "step": 4304 + }, + { + "epoch": 0.3905824714207948, + "grad_norm": 0.12882803676467086, + "learning_rate": 0.0006960994060777577, + "loss": 1.4462, + "step": 4305 + }, + { + "epoch": 0.3906731990564326, + "grad_norm": 0.12825285565225938, + "learning_rate": 0.0006959642423540775, + "loss": 1.3857, + "step": 4306 + }, + { + "epoch": 0.3907639266920704, + "grad_norm": 0.13351585800985846, + "learning_rate": 0.0006958290617088559, + "loss": 1.4148, + "step": 4307 + }, + { + "epoch": 0.3908546543277082, + "grad_norm": 0.14378360979166951, + "learning_rate": 0.0006956938641537663, + "loss": 1.3807, + "step": 4308 + }, + { + "epoch": 0.39094538196334605, + "grad_norm": 0.19475593518315926, + "learning_rate": 0.0006955586497004824, + "loss": 1.4196, + "step": 4309 + }, + { + "epoch": 0.3910361095989838, + "grad_norm": 0.15328263508356949, + "learning_rate": 0.0006954234183606803, + "loss": 1.4424, + "step": 4310 + }, + { + "epoch": 0.39112683723462166, + "grad_norm": 0.16351816959923007, + "learning_rate": 0.0006952881701460374, + "loss": 1.4436, + "step": 4311 + }, + { + "epoch": 0.3912175648702595, + "grad_norm": 0.15203192643151767, + "learning_rate": 0.0006951529050682322, + "loss": 1.4231, + "step": 4312 + }, + { + "epoch": 0.39130829250589727, + "grad_norm": 0.1494795125972681, + "learning_rate": 0.0006950176231389448, + "loss": 1.4158, + "step": 4313 + }, + { + "epoch": 0.3913990201415351, + "grad_norm": 0.30816666893555406, + "learning_rate": 0.0006948823243698569, + "loss": 1.4347, + "step": 4314 + }, + { + "epoch": 0.39148974777717294, + "grad_norm": 0.14310606634690293, + "learning_rate": 0.0006947470087726516, + "loss": 1.4072, + "step": 4315 + }, + { + "epoch": 0.39158047541281077, + "grad_norm": 0.16159733624203518, + "learning_rate": 0.0006946116763590133, + "loss": 1.4366, + "step": 4316 + }, + { + "epoch": 0.39167120304844855, + "grad_norm": 0.14302687450860926, + "learning_rate": 0.0006944763271406282, + "loss": 1.4367, + "step": 4317 + }, + { + "epoch": 0.3917619306840864, + "grad_norm": 0.14365437196265643, + "learning_rate": 0.0006943409611291835, + "loss": 1.4386, + "step": 4318 + }, + { + "epoch": 0.3918526583197242, + "grad_norm": 0.14284408231238535, + "learning_rate": 0.0006942055783363683, + "loss": 1.4254, + "step": 4319 + }, + { + "epoch": 0.391943385955362, + "grad_norm": 0.14212662974595305, + "learning_rate": 0.0006940701787738725, + "loss": 1.4631, + "step": 4320 + }, + { + "epoch": 0.3920341135909998, + "grad_norm": 0.13659373592246832, + "learning_rate": 0.0006939347624533885, + "loss": 1.423, + "step": 4321 + }, + { + "epoch": 0.39212484122663765, + "grad_norm": 0.13775132825262312, + "learning_rate": 0.000693799329386609, + "loss": 1.4384, + "step": 4322 + }, + { + "epoch": 0.39221556886227543, + "grad_norm": 0.1613058924725193, + "learning_rate": 0.0006936638795852288, + "loss": 1.4257, + "step": 4323 + }, + { + "epoch": 0.39230629649791326, + "grad_norm": 0.1427604376009608, + "learning_rate": 0.0006935284130609443, + "loss": 1.4079, + "step": 4324 + }, + { + "epoch": 0.3923970241335511, + "grad_norm": 0.14247313418490173, + "learning_rate": 0.0006933929298254526, + "loss": 1.4239, + "step": 4325 + }, + { + "epoch": 
0.3924877517691889, + "grad_norm": 0.15060149368912013, + "learning_rate": 0.0006932574298904528, + "loss": 1.4104, + "step": 4326 + }, + { + "epoch": 0.3925784794048267, + "grad_norm": 0.14559493513535643, + "learning_rate": 0.0006931219132676456, + "loss": 1.4076, + "step": 4327 + }, + { + "epoch": 0.39266920704046454, + "grad_norm": 0.13886239961449165, + "learning_rate": 0.0006929863799687327, + "loss": 1.4518, + "step": 4328 + }, + { + "epoch": 0.3927599346761023, + "grad_norm": 0.14663679373350705, + "learning_rate": 0.0006928508300054175, + "loss": 1.4304, + "step": 4329 + }, + { + "epoch": 0.39285066231174015, + "grad_norm": 0.15327866670467716, + "learning_rate": 0.0006927152633894046, + "loss": 1.4124, + "step": 4330 + }, + { + "epoch": 0.392941389947378, + "grad_norm": 0.1452051163312567, + "learning_rate": 0.0006925796801324005, + "loss": 1.452, + "step": 4331 + }, + { + "epoch": 0.39303211758301576, + "grad_norm": 0.14751615711905436, + "learning_rate": 0.0006924440802461123, + "loss": 1.399, + "step": 4332 + }, + { + "epoch": 0.3931228452186536, + "grad_norm": 0.14046118212595518, + "learning_rate": 0.0006923084637422497, + "loss": 1.3831, + "step": 4333 + }, + { + "epoch": 0.3932135728542914, + "grad_norm": 0.16901642023389019, + "learning_rate": 0.0006921728306325227, + "loss": 1.435, + "step": 4334 + }, + { + "epoch": 0.39330430048992926, + "grad_norm": 0.15432767295202138, + "learning_rate": 0.0006920371809286437, + "loss": 1.4482, + "step": 4335 + }, + { + "epoch": 0.39339502812556704, + "grad_norm": 0.15642165528464688, + "learning_rate": 0.0006919015146423258, + "loss": 1.4466, + "step": 4336 + }, + { + "epoch": 0.39348575576120487, + "grad_norm": 0.1478150715679178, + "learning_rate": 0.0006917658317852837, + "loss": 1.4239, + "step": 4337 + }, + { + "epoch": 0.3935764833968427, + "grad_norm": 0.13484481543637755, + "learning_rate": 0.0006916301323692338, + "loss": 1.4456, + "step": 4338 + }, + { + "epoch": 0.3936672110324805, + "grad_norm": 0.13697766678214587, + "learning_rate": 0.0006914944164058936, + "loss": 1.4531, + "step": 4339 + }, + { + "epoch": 0.3937579386681183, + "grad_norm": 0.134331432294711, + "learning_rate": 0.0006913586839069825, + "loss": 1.4138, + "step": 4340 + }, + { + "epoch": 0.39384866630375615, + "grad_norm": 0.1690079529322016, + "learning_rate": 0.0006912229348842207, + "loss": 1.4204, + "step": 4341 + }, + { + "epoch": 0.3939393939393939, + "grad_norm": 0.14847051958553023, + "learning_rate": 0.0006910871693493304, + "loss": 1.4106, + "step": 4342 + }, + { + "epoch": 0.39403012157503176, + "grad_norm": 0.13790332612550196, + "learning_rate": 0.0006909513873140349, + "loss": 1.3767, + "step": 4343 + }, + { + "epoch": 0.3941208492106696, + "grad_norm": 0.178767143789779, + "learning_rate": 0.0006908155887900588, + "loss": 1.4111, + "step": 4344 + }, + { + "epoch": 0.39421157684630737, + "grad_norm": 0.15822309072505517, + "learning_rate": 0.0006906797737891285, + "loss": 1.3921, + "step": 4345 + }, + { + "epoch": 0.3943023044819452, + "grad_norm": 0.14562487830860776, + "learning_rate": 0.0006905439423229718, + "loss": 1.4295, + "step": 4346 + }, + { + "epoch": 0.39439303211758303, + "grad_norm": 0.14081581550498457, + "learning_rate": 0.0006904080944033174, + "loss": 1.4225, + "step": 4347 + }, + { + "epoch": 0.3944837597532208, + "grad_norm": 0.14430312418248756, + "learning_rate": 0.0006902722300418963, + "loss": 1.4142, + "step": 4348 + }, + { + "epoch": 0.39457448738885864, + "grad_norm": 0.134770675975257, + "learning_rate": 
0.0006901363492504397, + "loss": 1.3863, + "step": 4349 + }, + { + "epoch": 0.3946652150244965, + "grad_norm": 0.15642325926187955, + "learning_rate": 0.0006900004520406814, + "loss": 1.4343, + "step": 4350 + }, + { + "epoch": 0.39475594266013425, + "grad_norm": 0.14052544312537477, + "learning_rate": 0.0006898645384243563, + "loss": 1.4155, + "step": 4351 + }, + { + "epoch": 0.3948466702957721, + "grad_norm": 0.2127912392717582, + "learning_rate": 0.0006897286084132002, + "loss": 1.4079, + "step": 4352 + }, + { + "epoch": 0.3949373979314099, + "grad_norm": 0.13960476730651974, + "learning_rate": 0.0006895926620189508, + "loss": 1.4268, + "step": 4353 + }, + { + "epoch": 0.39502812556704775, + "grad_norm": 0.1435630724306407, + "learning_rate": 0.0006894566992533469, + "loss": 1.4389, + "step": 4354 + }, + { + "epoch": 0.3951188532026855, + "grad_norm": 0.1407675702346627, + "learning_rate": 0.0006893207201281293, + "loss": 1.4121, + "step": 4355 + }, + { + "epoch": 0.39520958083832336, + "grad_norm": 0.18772308283728265, + "learning_rate": 0.0006891847246550397, + "loss": 1.4004, + "step": 4356 + }, + { + "epoch": 0.3953003084739612, + "grad_norm": 0.17026160441712476, + "learning_rate": 0.0006890487128458209, + "loss": 1.4283, + "step": 4357 + }, + { + "epoch": 0.39539103610959897, + "grad_norm": 0.148359397036965, + "learning_rate": 0.0006889126847122182, + "loss": 1.4277, + "step": 4358 + }, + { + "epoch": 0.3954817637452368, + "grad_norm": 0.15702870849860787, + "learning_rate": 0.0006887766402659773, + "loss": 1.3968, + "step": 4359 + }, + { + "epoch": 0.39557249138087464, + "grad_norm": 0.163281344507715, + "learning_rate": 0.0006886405795188456, + "loss": 1.406, + "step": 4360 + }, + { + "epoch": 0.3956632190165124, + "grad_norm": 0.14335918923203522, + "learning_rate": 0.0006885045024825721, + "loss": 1.423, + "step": 4361 + }, + { + "epoch": 0.39575394665215025, + "grad_norm": 0.142703639702033, + "learning_rate": 0.000688368409168907, + "loss": 1.4248, + "step": 4362 + }, + { + "epoch": 0.3958446742877881, + "grad_norm": 0.1682176075542469, + "learning_rate": 0.0006882322995896021, + "loss": 1.4258, + "step": 4363 + }, + { + "epoch": 0.39593540192342586, + "grad_norm": 0.22256122796747047, + "learning_rate": 0.0006880961737564103, + "loss": 1.4107, + "step": 4364 + }, + { + "epoch": 0.3960261295590637, + "grad_norm": 0.15342988945908032, + "learning_rate": 0.0006879600316810861, + "loss": 1.4171, + "step": 4365 + }, + { + "epoch": 0.3961168571947015, + "grad_norm": 0.1490672367605106, + "learning_rate": 0.0006878238733753856, + "loss": 1.4537, + "step": 4366 + }, + { + "epoch": 0.3962075848303393, + "grad_norm": 0.14015707041051517, + "learning_rate": 0.0006876876988510659, + "loss": 1.4241, + "step": 4367 + }, + { + "epoch": 0.39629831246597713, + "grad_norm": 0.13934124803267045, + "learning_rate": 0.0006875515081198858, + "loss": 1.4332, + "step": 4368 + }, + { + "epoch": 0.39638904010161496, + "grad_norm": 0.1394186524671586, + "learning_rate": 0.0006874153011936051, + "loss": 1.4304, + "step": 4369 + }, + { + "epoch": 0.39647976773725274, + "grad_norm": 0.14832999344192843, + "learning_rate": 0.0006872790780839858, + "loss": 1.4052, + "step": 4370 + }, + { + "epoch": 0.3965704953728906, + "grad_norm": 0.16060005635873648, + "learning_rate": 0.0006871428388027904, + "loss": 1.4394, + "step": 4371 + }, + { + "epoch": 0.3966612230085284, + "grad_norm": 0.14699757042982403, + "learning_rate": 0.0006870065833617833, + "loss": 1.4053, + "step": 4372 + }, + { + "epoch": 
0.39675195064416624, + "grad_norm": 0.1492230379733971, + "learning_rate": 0.0006868703117727302, + "loss": 1.4046, + "step": 4373 + }, + { + "epoch": 0.396842678279804, + "grad_norm": 0.13304876489905323, + "learning_rate": 0.000686734024047398, + "loss": 1.442, + "step": 4374 + }, + { + "epoch": 0.39693340591544185, + "grad_norm": 0.14094455050231053, + "learning_rate": 0.0006865977201975555, + "loss": 1.3951, + "step": 4375 + }, + { + "epoch": 0.3970241335510797, + "grad_norm": 0.1451319029976784, + "learning_rate": 0.0006864614002349722, + "loss": 1.459, + "step": 4376 + }, + { + "epoch": 0.39711486118671746, + "grad_norm": 0.16428917974208102, + "learning_rate": 0.0006863250641714195, + "loss": 1.4302, + "step": 4377 + }, + { + "epoch": 0.3972055888223553, + "grad_norm": 0.138204454712209, + "learning_rate": 0.0006861887120186701, + "loss": 1.4521, + "step": 4378 + }, + { + "epoch": 0.3972963164579931, + "grad_norm": 0.13685897490954135, + "learning_rate": 0.000686052343788498, + "loss": 1.4343, + "step": 4379 + }, + { + "epoch": 0.3973870440936309, + "grad_norm": 0.14310995346105326, + "learning_rate": 0.0006859159594926786, + "loss": 1.4333, + "step": 4380 + }, + { + "epoch": 0.39747777172926874, + "grad_norm": 0.1309563191551465, + "learning_rate": 0.0006857795591429885, + "loss": 1.4124, + "step": 4381 + }, + { + "epoch": 0.39756849936490657, + "grad_norm": 0.15003162975934795, + "learning_rate": 0.0006856431427512065, + "loss": 1.4464, + "step": 4382 + }, + { + "epoch": 0.39765922700054435, + "grad_norm": 0.14321555129730332, + "learning_rate": 0.0006855067103291114, + "loss": 1.4532, + "step": 4383 + }, + { + "epoch": 0.3977499546361822, + "grad_norm": 0.14946807467737805, + "learning_rate": 0.0006853702618884845, + "loss": 1.4417, + "step": 4384 + }, + { + "epoch": 0.39784068227182, + "grad_norm": 0.13296718690927778, + "learning_rate": 0.0006852337974411085, + "loss": 1.4242, + "step": 4385 + }, + { + "epoch": 0.3979314099074578, + "grad_norm": 0.1465610074771242, + "learning_rate": 0.0006850973169987664, + "loss": 1.417, + "step": 4386 + }, + { + "epoch": 0.3980221375430956, + "grad_norm": 0.14175972467172895, + "learning_rate": 0.0006849608205732441, + "loss": 1.4249, + "step": 4387 + }, + { + "epoch": 0.39811286517873345, + "grad_norm": 0.140976600421296, + "learning_rate": 0.0006848243081763273, + "loss": 1.4359, + "step": 4388 + }, + { + "epoch": 0.39820359281437123, + "grad_norm": 0.13341664461445393, + "learning_rate": 0.0006846877798198044, + "loss": 1.4168, + "step": 4389 + }, + { + "epoch": 0.39829432045000907, + "grad_norm": 0.1313446221984066, + "learning_rate": 0.0006845512355154646, + "loss": 1.4203, + "step": 4390 + }, + { + "epoch": 0.3983850480856469, + "grad_norm": 0.13483944978978343, + "learning_rate": 0.0006844146752750984, + "loss": 1.4432, + "step": 4391 + }, + { + "epoch": 0.39847577572128473, + "grad_norm": 0.13267706983490024, + "learning_rate": 0.0006842780991104977, + "loss": 1.4321, + "step": 4392 + }, + { + "epoch": 0.3985665033569225, + "grad_norm": 0.12985078749886253, + "learning_rate": 0.000684141507033456, + "loss": 1.4388, + "step": 4393 + }, + { + "epoch": 0.39865723099256034, + "grad_norm": 0.1568140073260415, + "learning_rate": 0.000684004899055768, + "loss": 1.4145, + "step": 4394 + }, + { + "epoch": 0.3987479586281982, + "grad_norm": 0.12947205562137984, + "learning_rate": 0.00068386827518923, + "loss": 1.4411, + "step": 4395 + }, + { + "epoch": 0.39883868626383595, + "grad_norm": 0.13017334800985586, + "learning_rate": 
0.0006837316354456391, + "loss": 1.4467, + "step": 4396 + }, + { + "epoch": 0.3989294138994738, + "grad_norm": 0.12412007075380431, + "learning_rate": 0.0006835949798367946, + "loss": 1.4332, + "step": 4397 + }, + { + "epoch": 0.3990201415351116, + "grad_norm": 0.14277572633698812, + "learning_rate": 0.0006834583083744964, + "loss": 1.4163, + "step": 4398 + }, + { + "epoch": 0.3991108691707494, + "grad_norm": 0.1354213472995821, + "learning_rate": 0.0006833216210705463, + "loss": 1.4105, + "step": 4399 + }, + { + "epoch": 0.3992015968063872, + "grad_norm": 0.13982934318579582, + "learning_rate": 0.0006831849179367472, + "loss": 1.4136, + "step": 4400 + }, + { + "epoch": 0.39929232444202506, + "grad_norm": 0.14560358720970457, + "learning_rate": 0.0006830481989849034, + "loss": 1.4331, + "step": 4401 + }, + { + "epoch": 0.39938305207766284, + "grad_norm": 0.1284178264735284, + "learning_rate": 0.0006829114642268209, + "loss": 1.4207, + "step": 4402 + }, + { + "epoch": 0.39947377971330067, + "grad_norm": 0.13150127221476052, + "learning_rate": 0.0006827747136743061, + "loss": 1.4357, + "step": 4403 + }, + { + "epoch": 0.3995645073489385, + "grad_norm": 0.14404324408267707, + "learning_rate": 0.0006826379473391681, + "loss": 1.41, + "step": 4404 + }, + { + "epoch": 0.3996552349845763, + "grad_norm": 0.13127888730409618, + "learning_rate": 0.0006825011652332163, + "loss": 1.4209, + "step": 4405 + }, + { + "epoch": 0.3997459626202141, + "grad_norm": 0.13735559977440653, + "learning_rate": 0.000682364367368262, + "loss": 1.4539, + "step": 4406 + }, + { + "epoch": 0.39983669025585195, + "grad_norm": 0.14180397539388978, + "learning_rate": 0.0006822275537561178, + "loss": 1.4467, + "step": 4407 + }, + { + "epoch": 0.3999274178914897, + "grad_norm": 0.13869935965164767, + "learning_rate": 0.0006820907244085973, + "loss": 1.4039, + "step": 4408 + }, + { + "epoch": 0.40001814552712756, + "grad_norm": 0.1317855259890209, + "learning_rate": 0.0006819538793375161, + "loss": 1.383, + "step": 4409 + }, + { + "epoch": 0.4001088731627654, + "grad_norm": 0.143950628521598, + "learning_rate": 0.0006818170185546905, + "loss": 1.4226, + "step": 4410 + }, + { + "epoch": 0.4001996007984032, + "grad_norm": 0.1344217754789263, + "learning_rate": 0.0006816801420719385, + "loss": 1.4316, + "step": 4411 + }, + { + "epoch": 0.400290328434041, + "grad_norm": 0.1315520686779699, + "learning_rate": 0.0006815432499010795, + "loss": 1.395, + "step": 4412 + }, + { + "epoch": 0.40038105606967883, + "grad_norm": 0.13367469712065452, + "learning_rate": 0.0006814063420539342, + "loss": 1.4188, + "step": 4413 + }, + { + "epoch": 0.40047178370531666, + "grad_norm": 0.13311596885504312, + "learning_rate": 0.0006812694185423245, + "loss": 1.4535, + "step": 4414 + }, + { + "epoch": 0.40056251134095444, + "grad_norm": 0.15135408704066255, + "learning_rate": 0.0006811324793780738, + "loss": 1.4026, + "step": 4415 + }, + { + "epoch": 0.4006532389765923, + "grad_norm": 0.13718874530395503, + "learning_rate": 0.0006809955245730068, + "loss": 1.4321, + "step": 4416 + }, + { + "epoch": 0.4007439666122301, + "grad_norm": 0.13798666367291204, + "learning_rate": 0.0006808585541389495, + "loss": 1.4222, + "step": 4417 + }, + { + "epoch": 0.4008346942478679, + "grad_norm": 0.1308561838984346, + "learning_rate": 0.0006807215680877293, + "loss": 1.403, + "step": 4418 + }, + { + "epoch": 0.4009254218835057, + "grad_norm": 0.13681579635639315, + "learning_rate": 0.0006805845664311754, + "loss": 1.4438, + "step": 4419 + }, + { + "epoch": 
0.40101614951914355, + "grad_norm": 0.13174029925702682, + "learning_rate": 0.0006804475491811172, + "loss": 1.4135, + "step": 4420 + }, + { + "epoch": 0.4011068771547813, + "grad_norm": 0.13522481165455172, + "learning_rate": 0.0006803105163493868, + "loss": 1.4279, + "step": 4421 + }, + { + "epoch": 0.40119760479041916, + "grad_norm": 0.1441141608360539, + "learning_rate": 0.0006801734679478166, + "loss": 1.4147, + "step": 4422 + }, + { + "epoch": 0.401288332426057, + "grad_norm": 0.1321328642258642, + "learning_rate": 0.0006800364039882408, + "loss": 1.3992, + "step": 4423 + }, + { + "epoch": 0.40137906006169477, + "grad_norm": 0.13872410180770703, + "learning_rate": 0.0006798993244824952, + "loss": 1.4394, + "step": 4424 + }, + { + "epoch": 0.4014697876973326, + "grad_norm": 0.1360974393028594, + "learning_rate": 0.0006797622294424164, + "loss": 1.3879, + "step": 4425 + }, + { + "epoch": 0.40156051533297044, + "grad_norm": 0.13312759371405747, + "learning_rate": 0.0006796251188798426, + "loss": 1.4363, + "step": 4426 + }, + { + "epoch": 0.4016512429686082, + "grad_norm": 0.12927715481396368, + "learning_rate": 0.0006794879928066131, + "loss": 1.3948, + "step": 4427 + }, + { + "epoch": 0.40174197060424605, + "grad_norm": 0.1463039950851078, + "learning_rate": 0.000679350851234569, + "loss": 1.4432, + "step": 4428 + }, + { + "epoch": 0.4018326982398839, + "grad_norm": 0.36424279612775007, + "learning_rate": 0.0006792136941755527, + "loss": 1.4323, + "step": 4429 + }, + { + "epoch": 0.4019234258755217, + "grad_norm": 0.1210278514351645, + "learning_rate": 0.0006790765216414073, + "loss": 1.4235, + "step": 4430 + }, + { + "epoch": 0.4020141535111595, + "grad_norm": 0.1307754318043805, + "learning_rate": 0.0006789393336439779, + "loss": 1.4337, + "step": 4431 + }, + { + "epoch": 0.4021048811467973, + "grad_norm": 0.13491813502464478, + "learning_rate": 0.0006788021301951107, + "loss": 1.4225, + "step": 4432 + }, + { + "epoch": 0.40219560878243515, + "grad_norm": 0.1496201956680859, + "learning_rate": 0.0006786649113066532, + "loss": 1.4083, + "step": 4433 + }, + { + "epoch": 0.40228633641807293, + "grad_norm": 0.14826806169077048, + "learning_rate": 0.0006785276769904542, + "loss": 1.4005, + "step": 4434 + }, + { + "epoch": 0.40237706405371076, + "grad_norm": 0.13897410732520266, + "learning_rate": 0.0006783904272583641, + "loss": 1.382, + "step": 4435 + }, + { + "epoch": 0.4024677916893486, + "grad_norm": 0.1506346118165891, + "learning_rate": 0.0006782531621222343, + "loss": 1.4288, + "step": 4436 + }, + { + "epoch": 0.4025585193249864, + "grad_norm": 0.15379715008449024, + "learning_rate": 0.0006781158815939177, + "loss": 1.4123, + "step": 4437 + }, + { + "epoch": 0.4026492469606242, + "grad_norm": 0.154359236779013, + "learning_rate": 0.0006779785856852683, + "loss": 1.4735, + "step": 4438 + }, + { + "epoch": 0.40273997459626204, + "grad_norm": 0.1380292206982261, + "learning_rate": 0.0006778412744081419, + "loss": 1.4094, + "step": 4439 + }, + { + "epoch": 0.4028307022318998, + "grad_norm": 0.15705211751690035, + "learning_rate": 0.0006777039477743953, + "loss": 1.4383, + "step": 4440 + }, + { + "epoch": 0.40292142986753765, + "grad_norm": 0.13578626359360502, + "learning_rate": 0.0006775666057958865, + "loss": 1.4143, + "step": 4441 + }, + { + "epoch": 0.4030121575031755, + "grad_norm": 0.13438383588630795, + "learning_rate": 0.0006774292484844753, + "loss": 1.4235, + "step": 4442 + }, + { + "epoch": 0.40310288513881326, + "grad_norm": 0.14205011214907157, + "learning_rate": 
0.0006772918758520221, + "loss": 1.4244, + "step": 4443 + }, + { + "epoch": 0.4031936127744511, + "grad_norm": 0.1430841193897273, + "learning_rate": 0.0006771544879103895, + "loss": 1.3964, + "step": 4444 + }, + { + "epoch": 0.4032843404100889, + "grad_norm": 0.15048922893371144, + "learning_rate": 0.0006770170846714407, + "loss": 1.421, + "step": 4445 + }, + { + "epoch": 0.4033750680457267, + "grad_norm": 0.13682190881503395, + "learning_rate": 0.0006768796661470405, + "loss": 1.4176, + "step": 4446 + }, + { + "epoch": 0.40346579568136454, + "grad_norm": 0.1333922292298454, + "learning_rate": 0.0006767422323490551, + "loss": 1.4652, + "step": 4447 + }, + { + "epoch": 0.40355652331700237, + "grad_norm": 0.1380824076262123, + "learning_rate": 0.0006766047832893519, + "loss": 1.4688, + "step": 4448 + }, + { + "epoch": 0.4036472509526402, + "grad_norm": 0.13629577423011607, + "learning_rate": 0.0006764673189797996, + "loss": 1.4452, + "step": 4449 + }, + { + "epoch": 0.403737978588278, + "grad_norm": 0.1334868384358836, + "learning_rate": 0.0006763298394322683, + "loss": 1.4287, + "step": 4450 + }, + { + "epoch": 0.4038287062239158, + "grad_norm": 0.14520977457749903, + "learning_rate": 0.0006761923446586294, + "loss": 1.4099, + "step": 4451 + }, + { + "epoch": 0.40391943385955364, + "grad_norm": 0.12907423887108782, + "learning_rate": 0.0006760548346707554, + "loss": 1.4642, + "step": 4452 + }, + { + "epoch": 0.4040101614951914, + "grad_norm": 0.17334343155728446, + "learning_rate": 0.0006759173094805209, + "loss": 1.4275, + "step": 4453 + }, + { + "epoch": 0.40410088913082925, + "grad_norm": 0.1492787913052535, + "learning_rate": 0.0006757797690998004, + "loss": 1.4314, + "step": 4454 + }, + { + "epoch": 0.4041916167664671, + "grad_norm": 0.1300654197689449, + "learning_rate": 0.0006756422135404711, + "loss": 1.4233, + "step": 4455 + }, + { + "epoch": 0.40428234440210487, + "grad_norm": 0.1349796404662525, + "learning_rate": 0.0006755046428144107, + "loss": 1.432, + "step": 4456 + }, + { + "epoch": 0.4043730720377427, + "grad_norm": 0.17313988834571264, + "learning_rate": 0.0006753670569334986, + "loss": 1.4135, + "step": 4457 + }, + { + "epoch": 0.40446379967338053, + "grad_norm": 0.15060022964838754, + "learning_rate": 0.0006752294559096152, + "loss": 1.4047, + "step": 4458 + }, + { + "epoch": 0.4045545273090183, + "grad_norm": 0.13017723382698834, + "learning_rate": 0.0006750918397546425, + "loss": 1.4204, + "step": 4459 + }, + { + "epoch": 0.40464525494465614, + "grad_norm": 0.1410651171174092, + "learning_rate": 0.0006749542084804636, + "loss": 1.4354, + "step": 4460 + }, + { + "epoch": 0.404735982580294, + "grad_norm": 0.13712576829518816, + "learning_rate": 0.0006748165620989631, + "loss": 1.4017, + "step": 4461 + }, + { + "epoch": 0.40482671021593175, + "grad_norm": 0.1338569652382106, + "learning_rate": 0.0006746789006220266, + "loss": 1.421, + "step": 4462 + }, + { + "epoch": 0.4049174378515696, + "grad_norm": 0.1419157138573793, + "learning_rate": 0.0006745412240615414, + "loss": 1.4428, + "step": 4463 + }, + { + "epoch": 0.4050081654872074, + "grad_norm": 0.13867190894743353, + "learning_rate": 0.0006744035324293957, + "loss": 1.4328, + "step": 4464 + }, + { + "epoch": 0.4050988931228452, + "grad_norm": 0.1387587615044899, + "learning_rate": 0.0006742658257374793, + "loss": 1.4409, + "step": 4465 + }, + { + "epoch": 0.405189620758483, + "grad_norm": 0.1621020830215749, + "learning_rate": 0.000674128103997683, + "loss": 1.4668, + "step": 4466 + }, + { + "epoch": 
0.40528034839412086, + "grad_norm": 0.14851708436405806, + "learning_rate": 0.0006739903672218994, + "loss": 1.4703, + "step": 4467 + }, + { + "epoch": 0.40537107602975864, + "grad_norm": 0.13649427224232, + "learning_rate": 0.0006738526154220221, + "loss": 1.4483, + "step": 4468 + }, + { + "epoch": 0.40546180366539647, + "grad_norm": 0.1444763776816015, + "learning_rate": 0.0006737148486099456, + "loss": 1.4183, + "step": 4469 + }, + { + "epoch": 0.4055525313010343, + "grad_norm": 0.1316823588186595, + "learning_rate": 0.0006735770667975665, + "loss": 1.4038, + "step": 4470 + }, + { + "epoch": 0.40564325893667214, + "grad_norm": 0.1391499719906858, + "learning_rate": 0.0006734392699967819, + "loss": 1.4309, + "step": 4471 + }, + { + "epoch": 0.4057339865723099, + "grad_norm": 0.1277065992374821, + "learning_rate": 0.0006733014582194908, + "loss": 1.3719, + "step": 4472 + }, + { + "epoch": 0.40582471420794775, + "grad_norm": 0.1385680481221216, + "learning_rate": 0.0006731636314775935, + "loss": 1.4086, + "step": 4473 + }, + { + "epoch": 0.4059154418435856, + "grad_norm": 0.2285254638911184, + "learning_rate": 0.0006730257897829908, + "loss": 1.4013, + "step": 4474 + }, + { + "epoch": 0.40600616947922336, + "grad_norm": 0.14409751624311798, + "learning_rate": 0.0006728879331475859, + "loss": 1.406, + "step": 4475 + }, + { + "epoch": 0.4060968971148612, + "grad_norm": 0.12840344954043423, + "learning_rate": 0.0006727500615832823, + "loss": 1.3895, + "step": 4476 + }, + { + "epoch": 0.406187624750499, + "grad_norm": 0.1386980624541789, + "learning_rate": 0.0006726121751019855, + "loss": 1.455, + "step": 4477 + }, + { + "epoch": 0.4062783523861368, + "grad_norm": 0.12941253413973663, + "learning_rate": 0.0006724742737156018, + "loss": 1.4009, + "step": 4478 + }, + { + "epoch": 0.40636908002177463, + "grad_norm": 0.13776789950532917, + "learning_rate": 0.0006723363574360393, + "loss": 1.4014, + "step": 4479 + }, + { + "epoch": 0.40645980765741246, + "grad_norm": 0.13454471959881584, + "learning_rate": 0.000672198426275207, + "loss": 1.4294, + "step": 4480 + }, + { + "epoch": 0.40655053529305024, + "grad_norm": 0.1346339249812733, + "learning_rate": 0.0006720604802450151, + "loss": 1.4179, + "step": 4481 + }, + { + "epoch": 0.4066412629286881, + "grad_norm": 0.12634041186039502, + "learning_rate": 0.0006719225193573754, + "loss": 1.4338, + "step": 4482 + }, + { + "epoch": 0.4067319905643259, + "grad_norm": 0.13571433886148737, + "learning_rate": 0.0006717845436242007, + "loss": 1.4065, + "step": 4483 + }, + { + "epoch": 0.4068227181999637, + "grad_norm": 0.13644036084796268, + "learning_rate": 0.0006716465530574055, + "loss": 1.423, + "step": 4484 + }, + { + "epoch": 0.4069134458356015, + "grad_norm": 0.12497158549290878, + "learning_rate": 0.0006715085476689051, + "loss": 1.395, + "step": 4485 + }, + { + "epoch": 0.40700417347123935, + "grad_norm": 0.15209272923783534, + "learning_rate": 0.0006713705274706162, + "loss": 1.4503, + "step": 4486 + }, + { + "epoch": 0.4070949011068771, + "grad_norm": 0.1313495437898136, + "learning_rate": 0.0006712324924744572, + "loss": 1.3494, + "step": 4487 + }, + { + "epoch": 0.40718562874251496, + "grad_norm": 0.13195186974892645, + "learning_rate": 0.0006710944426923471, + "loss": 1.4111, + "step": 4488 + }, + { + "epoch": 0.4072763563781528, + "grad_norm": 0.1378402655110281, + "learning_rate": 0.0006709563781362067, + "loss": 1.3931, + "step": 4489 + }, + { + "epoch": 0.4073670840137906, + "grad_norm": 0.13223936227665567, + "learning_rate": 
0.0006708182988179579, + "loss": 1.4094, + "step": 4490 + }, + { + "epoch": 0.4074578116494284, + "grad_norm": 0.13140532079687395, + "learning_rate": 0.0006706802047495238, + "loss": 1.4404, + "step": 4491 + }, + { + "epoch": 0.40754853928506624, + "grad_norm": 0.1349002738999237, + "learning_rate": 0.0006705420959428288, + "loss": 1.4249, + "step": 4492 + }, + { + "epoch": 0.40763926692070407, + "grad_norm": 0.12686413383337627, + "learning_rate": 0.0006704039724097988, + "loss": 1.423, + "step": 4493 + }, + { + "epoch": 0.40772999455634185, + "grad_norm": 0.15017183706885087, + "learning_rate": 0.0006702658341623606, + "loss": 1.3886, + "step": 4494 + }, + { + "epoch": 0.4078207221919797, + "grad_norm": 0.16071239929529216, + "learning_rate": 0.0006701276812124424, + "loss": 1.4047, + "step": 4495 + }, + { + "epoch": 0.4079114498276175, + "grad_norm": 0.15323208315827394, + "learning_rate": 0.000669989513571974, + "loss": 1.4033, + "step": 4496 + }, + { + "epoch": 0.4080021774632553, + "grad_norm": 0.13998399223105853, + "learning_rate": 0.000669851331252886, + "loss": 1.3988, + "step": 4497 + }, + { + "epoch": 0.4080929050988931, + "grad_norm": 0.1332998390933705, + "learning_rate": 0.0006697131342671104, + "loss": 1.4176, + "step": 4498 + }, + { + "epoch": 0.40818363273453095, + "grad_norm": 0.13835011160101307, + "learning_rate": 0.000669574922626581, + "loss": 1.4003, + "step": 4499 + }, + { + "epoch": 0.40827436037016873, + "grad_norm": 0.13618733421135004, + "learning_rate": 0.0006694366963432317, + "loss": 1.3959, + "step": 4500 + }, + { + "epoch": 0.40836508800580656, + "grad_norm": 0.13457794324958308, + "learning_rate": 0.0006692984554289987, + "loss": 1.4262, + "step": 4501 + }, + { + "epoch": 0.4084558156414444, + "grad_norm": 0.13547754426029898, + "learning_rate": 0.0006691601998958193, + "loss": 1.4382, + "step": 4502 + }, + { + "epoch": 0.4085465432770822, + "grad_norm": 0.1411723355748039, + "learning_rate": 0.0006690219297556316, + "loss": 1.4193, + "step": 4503 + }, + { + "epoch": 0.40863727091272, + "grad_norm": 0.14553307019007894, + "learning_rate": 0.0006688836450203754, + "loss": 1.4232, + "step": 4504 + }, + { + "epoch": 0.40872799854835784, + "grad_norm": 0.13746628787131585, + "learning_rate": 0.0006687453457019916, + "loss": 1.4036, + "step": 4505 + }, + { + "epoch": 0.4088187261839956, + "grad_norm": 0.1387248936131232, + "learning_rate": 0.0006686070318124223, + "loss": 1.4324, + "step": 4506 + }, + { + "epoch": 0.40890945381963345, + "grad_norm": 0.13669140507954164, + "learning_rate": 0.000668468703363611, + "loss": 1.4468, + "step": 4507 + }, + { + "epoch": 0.4090001814552713, + "grad_norm": 0.14440927430154918, + "learning_rate": 0.0006683303603675022, + "loss": 1.4283, + "step": 4508 + }, + { + "epoch": 0.4090909090909091, + "grad_norm": 0.13555334717584816, + "learning_rate": 0.000668192002836042, + "loss": 1.4078, + "step": 4509 + }, + { + "epoch": 0.4091816367265469, + "grad_norm": 0.13823408267523005, + "learning_rate": 0.0006680536307811777, + "loss": 1.3944, + "step": 4510 + }, + { + "epoch": 0.4092723643621847, + "grad_norm": 0.2242833237510273, + "learning_rate": 0.0006679152442148574, + "loss": 1.3988, + "step": 4511 + }, + { + "epoch": 0.40936309199782256, + "grad_norm": 0.1360480257021384, + "learning_rate": 0.0006677768431490312, + "loss": 1.4043, + "step": 4512 + }, + { + "epoch": 0.40945381963346034, + "grad_norm": 0.13308823601663267, + "learning_rate": 0.0006676384275956498, + "loss": 1.4247, + "step": 4513 + }, + { + "epoch": 
0.40954454726909817, + "grad_norm": 0.13906482194757577, + "learning_rate": 0.0006674999975666654, + "loss": 1.4342, + "step": 4514 + }, + { + "epoch": 0.409635274904736, + "grad_norm": 0.14356464090353796, + "learning_rate": 0.0006673615530740317, + "loss": 1.4426, + "step": 4515 + }, + { + "epoch": 0.4097260025403738, + "grad_norm": 0.13934470532866225, + "learning_rate": 0.000667223094129703, + "loss": 1.4205, + "step": 4516 + }, + { + "epoch": 0.4098167301760116, + "grad_norm": 0.14680352630529037, + "learning_rate": 0.0006670846207456356, + "loss": 1.4135, + "step": 4517 + }, + { + "epoch": 0.40990745781164944, + "grad_norm": 0.1412727042961075, + "learning_rate": 0.0006669461329337866, + "loss": 1.4543, + "step": 4518 + }, + { + "epoch": 0.4099981854472872, + "grad_norm": 0.13719530400968918, + "learning_rate": 0.0006668076307061145, + "loss": 1.4377, + "step": 4519 + }, + { + "epoch": 0.41008891308292506, + "grad_norm": 0.1489286480299336, + "learning_rate": 0.0006666691140745787, + "loss": 1.4484, + "step": 4520 + }, + { + "epoch": 0.4101796407185629, + "grad_norm": 0.13575281379208604, + "learning_rate": 0.0006665305830511405, + "loss": 1.4063, + "step": 4521 + }, + { + "epoch": 0.41027036835420067, + "grad_norm": 0.13983252494278922, + "learning_rate": 0.0006663920376477618, + "loss": 1.4311, + "step": 4522 + }, + { + "epoch": 0.4103610959898385, + "grad_norm": 0.14691706923813233, + "learning_rate": 0.0006662534778764061, + "loss": 1.4082, + "step": 4523 + }, + { + "epoch": 0.41045182362547633, + "grad_norm": 0.15661145103907487, + "learning_rate": 0.0006661149037490383, + "loss": 1.4086, + "step": 4524 + }, + { + "epoch": 0.4105425512611141, + "grad_norm": 0.14909710236085197, + "learning_rate": 0.0006659763152776237, + "loss": 1.4089, + "step": 4525 + }, + { + "epoch": 0.41063327889675194, + "grad_norm": 0.1613110210104921, + "learning_rate": 0.0006658377124741301, + "loss": 1.4286, + "step": 4526 + }, + { + "epoch": 0.4107240065323898, + "grad_norm": 0.1338067660261556, + "learning_rate": 0.0006656990953505255, + "loss": 1.3711, + "step": 4527 + }, + { + "epoch": 0.4108147341680276, + "grad_norm": 0.13895683212766263, + "learning_rate": 0.0006655604639187796, + "loss": 1.4239, + "step": 4528 + }, + { + "epoch": 0.4109054618036654, + "grad_norm": 0.13862111429705326, + "learning_rate": 0.0006654218181908633, + "loss": 1.4195, + "step": 4529 + }, + { + "epoch": 0.4109961894393032, + "grad_norm": 0.13236618412831333, + "learning_rate": 0.0006652831581787485, + "loss": 1.4489, + "step": 4530 + }, + { + "epoch": 0.41108691707494105, + "grad_norm": 0.15526369137255197, + "learning_rate": 0.0006651444838944088, + "loss": 1.4204, + "step": 4531 + }, + { + "epoch": 0.4111776447105788, + "grad_norm": 0.1384051276977031, + "learning_rate": 0.0006650057953498185, + "loss": 1.4139, + "step": 4532 + }, + { + "epoch": 0.41126837234621666, + "grad_norm": 0.13859737263455948, + "learning_rate": 0.0006648670925569534, + "loss": 1.452, + "step": 4533 + }, + { + "epoch": 0.4113590999818545, + "grad_norm": 0.14749633277533075, + "learning_rate": 0.0006647283755277907, + "loss": 1.412, + "step": 4534 + }, + { + "epoch": 0.41144982761749227, + "grad_norm": 0.12724015045558815, + "learning_rate": 0.0006645896442743082, + "loss": 1.4157, + "step": 4535 + }, + { + "epoch": 0.4115405552531301, + "grad_norm": 0.14546941467229493, + "learning_rate": 0.000664450898808486, + "loss": 1.4062, + "step": 4536 + }, + { + "epoch": 0.41163128288876794, + "grad_norm": 0.14167752155343027, + "learning_rate": 
0.0006643121391423041, + "loss": 1.425, + "step": 4537 + }, + { + "epoch": 0.4117220105244057, + "grad_norm": 0.14813218620622157, + "learning_rate": 0.000664173365287745, + "loss": 1.4502, + "step": 4538 + }, + { + "epoch": 0.41181273816004355, + "grad_norm": 0.1387452673702035, + "learning_rate": 0.0006640345772567917, + "loss": 1.4146, + "step": 4539 + }, + { + "epoch": 0.4119034657956814, + "grad_norm": 0.1295925431454547, + "learning_rate": 0.0006638957750614282, + "loss": 1.4406, + "step": 4540 + }, + { + "epoch": 0.41199419343131916, + "grad_norm": 0.14225360338009416, + "learning_rate": 0.0006637569587136406, + "loss": 1.4194, + "step": 4541 + }, + { + "epoch": 0.412084921066957, + "grad_norm": 0.13440329931770942, + "learning_rate": 0.0006636181282254154, + "loss": 1.379, + "step": 4542 + }, + { + "epoch": 0.4121756487025948, + "grad_norm": 0.14027408581018305, + "learning_rate": 0.0006634792836087408, + "loss": 1.4544, + "step": 4543 + }, + { + "epoch": 0.4122663763382326, + "grad_norm": 0.14958068595351828, + "learning_rate": 0.0006633404248756057, + "loss": 1.4242, + "step": 4544 + }, + { + "epoch": 0.41235710397387043, + "grad_norm": 0.14563165912513124, + "learning_rate": 0.000663201552038001, + "loss": 1.3936, + "step": 4545 + }, + { + "epoch": 0.41244783160950826, + "grad_norm": 0.1425441463342543, + "learning_rate": 0.0006630626651079184, + "loss": 1.3868, + "step": 4546 + }, + { + "epoch": 0.4125385592451461, + "grad_norm": 0.1409195396333766, + "learning_rate": 0.0006629237640973504, + "loss": 1.3682, + "step": 4547 + }, + { + "epoch": 0.4126292868807839, + "grad_norm": 0.18238885546963762, + "learning_rate": 0.0006627848490182914, + "loss": 1.4048, + "step": 4548 + }, + { + "epoch": 0.4127200145164217, + "grad_norm": 0.14289363765033644, + "learning_rate": 0.0006626459198827367, + "loss": 1.413, + "step": 4549 + }, + { + "epoch": 0.41281074215205954, + "grad_norm": 0.1529357505670922, + "learning_rate": 0.0006625069767026829, + "loss": 1.4132, + "step": 4550 + }, + { + "epoch": 0.4129014697876973, + "grad_norm": 0.13744583247155998, + "learning_rate": 0.0006623680194901278, + "loss": 1.4132, + "step": 4551 + }, + { + "epoch": 0.41299219742333515, + "grad_norm": 0.1777220207750682, + "learning_rate": 0.0006622290482570701, + "loss": 1.4356, + "step": 4552 + }, + { + "epoch": 0.413082925058973, + "grad_norm": 0.14496508550159182, + "learning_rate": 0.0006620900630155102, + "loss": 1.4012, + "step": 4553 + }, + { + "epoch": 0.41317365269461076, + "grad_norm": 0.1418831836118273, + "learning_rate": 0.0006619510637774495, + "loss": 1.4104, + "step": 4554 + }, + { + "epoch": 0.4132643803302486, + "grad_norm": 0.1389672953354342, + "learning_rate": 0.0006618120505548906, + "loss": 1.4527, + "step": 4555 + }, + { + "epoch": 0.4133551079658864, + "grad_norm": 0.13818124248577954, + "learning_rate": 0.0006616730233598373, + "loss": 1.3931, + "step": 4556 + }, + { + "epoch": 0.4134458356015242, + "grad_norm": 0.17265491485939152, + "learning_rate": 0.0006615339822042945, + "loss": 1.3997, + "step": 4557 + }, + { + "epoch": 0.41353656323716204, + "grad_norm": 0.1459219636104571, + "learning_rate": 0.0006613949271002687, + "loss": 1.4085, + "step": 4558 + }, + { + "epoch": 0.41362729087279987, + "grad_norm": 0.1346958087293199, + "learning_rate": 0.000661255858059767, + "loss": 1.3965, + "step": 4559 + }, + { + "epoch": 0.41371801850843765, + "grad_norm": 0.13060282480173316, + "learning_rate": 0.0006611167750947984, + "loss": 1.4354, + "step": 4560 + }, + { + "epoch": 
0.4138087461440755, + "grad_norm": 0.1439168343220772, + "learning_rate": 0.0006609776782173723, + "loss": 1.4348, + "step": 4561 + }, + { + "epoch": 0.4138994737797133, + "grad_norm": 0.1727411595837008, + "learning_rate": 0.0006608385674395, + "loss": 1.4366, + "step": 4562 + }, + { + "epoch": 0.4139902014153511, + "grad_norm": 0.13555384049650943, + "learning_rate": 0.0006606994427731938, + "loss": 1.4156, + "step": 4563 + }, + { + "epoch": 0.4140809290509889, + "grad_norm": 0.20893070812786899, + "learning_rate": 0.0006605603042304669, + "loss": 1.4291, + "step": 4564 + }, + { + "epoch": 0.41417165668662675, + "grad_norm": 0.2671082489752668, + "learning_rate": 0.0006604211518233343, + "loss": 1.4307, + "step": 4565 + }, + { + "epoch": 0.4142623843222646, + "grad_norm": 0.14705288317286316, + "learning_rate": 0.0006602819855638113, + "loss": 1.3821, + "step": 4566 + }, + { + "epoch": 0.41435311195790236, + "grad_norm": 0.15388121858803294, + "learning_rate": 0.0006601428054639154, + "loss": 1.4033, + "step": 4567 + }, + { + "epoch": 0.4144438395935402, + "grad_norm": 0.15889544489687515, + "learning_rate": 0.0006600036115356646, + "loss": 1.4276, + "step": 4568 + }, + { + "epoch": 0.41453456722917803, + "grad_norm": 0.16303453828561487, + "learning_rate": 0.0006598644037910784, + "loss": 1.4142, + "step": 4569 + }, + { + "epoch": 0.4146252948648158, + "grad_norm": 0.14902407776646354, + "learning_rate": 0.0006597251822421774, + "loss": 1.4492, + "step": 4570 + }, + { + "epoch": 0.41471602250045364, + "grad_norm": 0.15860572295457134, + "learning_rate": 0.0006595859469009833, + "loss": 1.4304, + "step": 4571 + }, + { + "epoch": 0.4148067501360915, + "grad_norm": 0.2464966318356979, + "learning_rate": 0.000659446697779519, + "loss": 1.4367, + "step": 4572 + }, + { + "epoch": 0.41489747777172925, + "grad_norm": 0.14615998590674956, + "learning_rate": 0.0006593074348898091, + "loss": 1.4304, + "step": 4573 + }, + { + "epoch": 0.4149882054073671, + "grad_norm": 0.14060512245649864, + "learning_rate": 0.0006591681582438786, + "loss": 1.4213, + "step": 4574 + }, + { + "epoch": 0.4150789330430049, + "grad_norm": 0.13777279639512874, + "learning_rate": 0.0006590288678537542, + "loss": 1.4067, + "step": 4575 + }, + { + "epoch": 0.4151696606786427, + "grad_norm": 0.13710873378466565, + "learning_rate": 0.0006588895637314636, + "loss": 1.4083, + "step": 4576 + }, + { + "epoch": 0.4152603883142805, + "grad_norm": 0.13549858087589417, + "learning_rate": 0.0006587502458890359, + "loss": 1.4123, + "step": 4577 + }, + { + "epoch": 0.41535111594991836, + "grad_norm": 0.13971452038010065, + "learning_rate": 0.000658610914338501, + "loss": 1.4028, + "step": 4578 + }, + { + "epoch": 0.41544184358555614, + "grad_norm": 0.17991110067860402, + "learning_rate": 0.00065847156909189, + "loss": 1.4172, + "step": 4579 + }, + { + "epoch": 0.41553257122119397, + "grad_norm": 0.13450958223733642, + "learning_rate": 0.000658332210161236, + "loss": 1.4326, + "step": 4580 + }, + { + "epoch": 0.4156232988568318, + "grad_norm": 0.14934789191844886, + "learning_rate": 0.0006581928375585721, + "loss": 1.4249, + "step": 4581 + }, + { + "epoch": 0.4157140264924696, + "grad_norm": 0.1364899135695582, + "learning_rate": 0.0006580534512959336, + "loss": 1.4364, + "step": 4582 + }, + { + "epoch": 0.4158047541281074, + "grad_norm": 0.1516505751401749, + "learning_rate": 0.000657914051385356, + "loss": 1.4129, + "step": 4583 + }, + { + "epoch": 0.41589548176374525, + "grad_norm": 0.13023028682234028, + "learning_rate": 
0.000657774637838877, + "loss": 1.3905, + "step": 4584 + }, + { + "epoch": 0.4159862093993831, + "grad_norm": 0.13566849908502593, + "learning_rate": 0.0006576352106685348, + "loss": 1.3897, + "step": 4585 + }, + { + "epoch": 0.41607693703502086, + "grad_norm": 0.1380297097461343, + "learning_rate": 0.0006574957698863688, + "loss": 1.3828, + "step": 4586 + }, + { + "epoch": 0.4161676646706587, + "grad_norm": 0.13106545434572506, + "learning_rate": 0.0006573563155044198, + "loss": 1.4433, + "step": 4587 + }, + { + "epoch": 0.4162583923062965, + "grad_norm": 0.13108107713985354, + "learning_rate": 0.0006572168475347299, + "loss": 1.4202, + "step": 4588 + }, + { + "epoch": 0.4163491199419343, + "grad_norm": 0.14339362060878502, + "learning_rate": 0.0006570773659893419, + "loss": 1.4229, + "step": 4589 + }, + { + "epoch": 0.41643984757757213, + "grad_norm": 0.18737261042247372, + "learning_rate": 0.0006569378708803004, + "loss": 1.4366, + "step": 4590 + }, + { + "epoch": 0.41653057521320996, + "grad_norm": 0.14146912020922398, + "learning_rate": 0.0006567983622196504, + "loss": 1.3873, + "step": 4591 + }, + { + "epoch": 0.41662130284884774, + "grad_norm": 0.15597325267863213, + "learning_rate": 0.0006566588400194388, + "loss": 1.4532, + "step": 4592 + }, + { + "epoch": 0.4167120304844856, + "grad_norm": 0.19806246640289912, + "learning_rate": 0.0006565193042917134, + "loss": 1.4156, + "step": 4593 + }, + { + "epoch": 0.4168027581201234, + "grad_norm": 0.13847130976561864, + "learning_rate": 0.0006563797550485228, + "loss": 1.433, + "step": 4594 + }, + { + "epoch": 0.4168934857557612, + "grad_norm": 0.12872092921576636, + "learning_rate": 0.0006562401923019174, + "loss": 1.4013, + "step": 4595 + }, + { + "epoch": 0.416984213391399, + "grad_norm": 0.1364414999448823, + "learning_rate": 0.0006561006160639484, + "loss": 1.4234, + "step": 4596 + }, + { + "epoch": 0.41707494102703685, + "grad_norm": 0.146017511892456, + "learning_rate": 0.0006559610263466683, + "loss": 1.4244, + "step": 4597 + }, + { + "epoch": 0.4171656686626746, + "grad_norm": 0.1892320565003208, + "learning_rate": 0.0006558214231621305, + "loss": 1.4481, + "step": 4598 + }, + { + "epoch": 0.41725639629831246, + "grad_norm": 0.1395875621253417, + "learning_rate": 0.0006556818065223898, + "loss": 1.4103, + "step": 4599 + }, + { + "epoch": 0.4173471239339503, + "grad_norm": 0.14394177896200447, + "learning_rate": 0.0006555421764395021, + "loss": 1.3999, + "step": 4600 + }, + { + "epoch": 0.41743785156958807, + "grad_norm": 0.1576200971564526, + "learning_rate": 0.0006554025329255246, + "loss": 1.4394, + "step": 4601 + }, + { + "epoch": 0.4175285792052259, + "grad_norm": 0.14502782373743311, + "learning_rate": 0.0006552628759925156, + "loss": 1.4641, + "step": 4602 + }, + { + "epoch": 0.41761930684086374, + "grad_norm": 0.129410975848476, + "learning_rate": 0.000655123205652534, + "loss": 1.4268, + "step": 4603 + }, + { + "epoch": 0.41771003447650157, + "grad_norm": 0.14529326025829642, + "learning_rate": 0.0006549835219176411, + "loss": 1.4156, + "step": 4604 + }, + { + "epoch": 0.41780076211213935, + "grad_norm": 0.820915763078422, + "learning_rate": 0.0006548438247998981, + "loss": 1.4557, + "step": 4605 + }, + { + "epoch": 0.4178914897477772, + "grad_norm": 0.14008033424067332, + "learning_rate": 0.0006547041143113681, + "loss": 1.4261, + "step": 4606 + }, + { + "epoch": 0.417982217383415, + "grad_norm": 0.1435444848926053, + "learning_rate": 0.0006545643904641149, + "loss": 1.4354, + "step": 4607 + }, + { + "epoch": 
0.4180729450190528, + "grad_norm": 0.14437277458488565, + "learning_rate": 0.0006544246532702038, + "loss": 1.4158, + "step": 4608 + }, + { + "epoch": 0.4181636726546906, + "grad_norm": 0.143897695538196, + "learning_rate": 0.0006542849027417013, + "loss": 1.4607, + "step": 4609 + }, + { + "epoch": 0.41825440029032845, + "grad_norm": 0.14905747281648724, + "learning_rate": 0.0006541451388906746, + "loss": 1.4229, + "step": 4610 + }, + { + "epoch": 0.41834512792596623, + "grad_norm": 0.14022091464447112, + "learning_rate": 0.0006540053617291924, + "loss": 1.4248, + "step": 4611 + }, + { + "epoch": 0.41843585556160406, + "grad_norm": 0.1628564612932806, + "learning_rate": 0.0006538655712693246, + "loss": 1.4621, + "step": 4612 + }, + { + "epoch": 0.4185265831972419, + "grad_norm": 0.1704767934155269, + "learning_rate": 0.0006537257675231419, + "loss": 1.4381, + "step": 4613 + }, + { + "epoch": 0.4186173108328797, + "grad_norm": 0.148186118825199, + "learning_rate": 0.0006535859505027167, + "loss": 1.4211, + "step": 4614 + }, + { + "epoch": 0.4187080384685175, + "grad_norm": 0.1602269381931011, + "learning_rate": 0.0006534461202201219, + "loss": 1.4469, + "step": 4615 + }, + { + "epoch": 0.41879876610415534, + "grad_norm": 0.1595897480010707, + "learning_rate": 0.0006533062766874322, + "loss": 1.4164, + "step": 4616 + }, + { + "epoch": 0.4188894937397931, + "grad_norm": 0.1467182112840743, + "learning_rate": 0.0006531664199167227, + "loss": 1.4319, + "step": 4617 + }, + { + "epoch": 0.41898022137543095, + "grad_norm": 0.18934048089581165, + "learning_rate": 0.0006530265499200702, + "loss": 1.4147, + "step": 4618 + }, + { + "epoch": 0.4190709490110688, + "grad_norm": 0.1493263634720357, + "learning_rate": 0.0006528866667095528, + "loss": 1.4415, + "step": 4619 + }, + { + "epoch": 0.41916167664670656, + "grad_norm": 0.13372557420908376, + "learning_rate": 0.000652746770297249, + "loss": 1.4473, + "step": 4620 + }, + { + "epoch": 0.4192524042823444, + "grad_norm": 0.14286422839190988, + "learning_rate": 0.0006526068606952394, + "loss": 1.4455, + "step": 4621 + }, + { + "epoch": 0.4193431319179822, + "grad_norm": 0.13565030575163942, + "learning_rate": 0.0006524669379156045, + "loss": 1.4108, + "step": 4622 + }, + { + "epoch": 0.41943385955362006, + "grad_norm": 0.1447791821404428, + "learning_rate": 0.0006523270019704271, + "loss": 1.4546, + "step": 4623 + }, + { + "epoch": 0.41952458718925784, + "grad_norm": 0.16464571999080785, + "learning_rate": 0.0006521870528717909, + "loss": 1.4503, + "step": 4624 + }, + { + "epoch": 0.41961531482489567, + "grad_norm": 0.14166089566331225, + "learning_rate": 0.0006520470906317801, + "loss": 1.4207, + "step": 4625 + }, + { + "epoch": 0.4197060424605335, + "grad_norm": 0.13810392679085856, + "learning_rate": 0.0006519071152624805, + "loss": 1.4184, + "step": 4626 + }, + { + "epoch": 0.4197967700961713, + "grad_norm": 0.15363114601187097, + "learning_rate": 0.0006517671267759792, + "loss": 1.3574, + "step": 4627 + }, + { + "epoch": 0.4198874977318091, + "grad_norm": 0.15048122152894955, + "learning_rate": 0.0006516271251843641, + "loss": 1.4492, + "step": 4628 + }, + { + "epoch": 0.41997822536744694, + "grad_norm": 0.15960409437306491, + "learning_rate": 0.0006514871104997246, + "loss": 1.4502, + "step": 4629 + }, + { + "epoch": 0.4200689530030847, + "grad_norm": 0.13582456498597706, + "learning_rate": 0.0006513470827341504, + "loss": 1.4093, + "step": 4630 + }, + { + "epoch": 0.42015968063872255, + "grad_norm": 0.14216054676469111, + "learning_rate": 
0.0006512070418997336, + "loss": 1.4296, + "step": 4631 + }, + { + "epoch": 0.4202504082743604, + "grad_norm": 0.14684248895740284, + "learning_rate": 0.0006510669880085663, + "loss": 1.4689, + "step": 4632 + }, + { + "epoch": 0.42034113590999816, + "grad_norm": 0.14121265161239155, + "learning_rate": 0.0006509269210727424, + "loss": 1.4545, + "step": 4633 + }, + { + "epoch": 0.420431863545636, + "grad_norm": 0.1490226467620869, + "learning_rate": 0.0006507868411043566, + "loss": 1.3705, + "step": 4634 + }, + { + "epoch": 0.42052259118127383, + "grad_norm": 0.15204576894555683, + "learning_rate": 0.0006506467481155048, + "loss": 1.4168, + "step": 4635 + }, + { + "epoch": 0.4206133188169116, + "grad_norm": 0.13997386975362613, + "learning_rate": 0.0006505066421182842, + "loss": 1.4314, + "step": 4636 + }, + { + "epoch": 0.42070404645254944, + "grad_norm": 0.13607773170664209, + "learning_rate": 0.0006503665231247928, + "loss": 1.4531, + "step": 4637 + }, + { + "epoch": 0.4207947740881873, + "grad_norm": 0.1618440059644793, + "learning_rate": 0.00065022639114713, + "loss": 1.4261, + "step": 4638 + }, + { + "epoch": 0.42088550172382505, + "grad_norm": 0.1502739055164192, + "learning_rate": 0.0006500862461973962, + "loss": 1.426, + "step": 4639 + }, + { + "epoch": 0.4209762293594629, + "grad_norm": 0.1374234499456936, + "learning_rate": 0.0006499460882876929, + "loss": 1.4244, + "step": 4640 + }, + { + "epoch": 0.4210669569951007, + "grad_norm": 0.15466832800432928, + "learning_rate": 0.0006498059174301227, + "loss": 1.4377, + "step": 4641 + }, + { + "epoch": 0.42115768463073855, + "grad_norm": 0.13733749483403734, + "learning_rate": 0.0006496657336367895, + "loss": 1.4388, + "step": 4642 + }, + { + "epoch": 0.4212484122663763, + "grad_norm": 0.1437688390463224, + "learning_rate": 0.0006495255369197982, + "loss": 1.4263, + "step": 4643 + }, + { + "epoch": 0.42133913990201416, + "grad_norm": 0.13546324987703576, + "learning_rate": 0.0006493853272912547, + "loss": 1.4511, + "step": 4644 + }, + { + "epoch": 0.421429867537652, + "grad_norm": 0.13766018115278467, + "learning_rate": 0.0006492451047632661, + "loss": 1.4471, + "step": 4645 + }, + { + "epoch": 0.42152059517328977, + "grad_norm": 0.1387340417547724, + "learning_rate": 0.0006491048693479408, + "loss": 1.4226, + "step": 4646 + }, + { + "epoch": 0.4216113228089276, + "grad_norm": 0.15261996013999013, + "learning_rate": 0.0006489646210573881, + "loss": 1.4165, + "step": 4647 + }, + { + "epoch": 0.42170205044456543, + "grad_norm": 0.15218748587187544, + "learning_rate": 0.0006488243599037184, + "loss": 1.4099, + "step": 4648 + }, + { + "epoch": 0.4217927780802032, + "grad_norm": 0.1494088668201815, + "learning_rate": 0.0006486840858990432, + "loss": 1.4504, + "step": 4649 + }, + { + "epoch": 0.42188350571584105, + "grad_norm": 0.14762541207282326, + "learning_rate": 0.0006485437990554752, + "loss": 1.4051, + "step": 4650 + }, + { + "epoch": 0.4219742333514789, + "grad_norm": 0.15015503817931156, + "learning_rate": 0.0006484034993851284, + "loss": 1.4148, + "step": 4651 + }, + { + "epoch": 0.42206496098711666, + "grad_norm": 0.14625008556934696, + "learning_rate": 0.0006482631869001175, + "loss": 1.3884, + "step": 4652 + }, + { + "epoch": 0.4221556886227545, + "grad_norm": 0.16406471719607477, + "learning_rate": 0.0006481228616125588, + "loss": 1.4253, + "step": 4653 + }, + { + "epoch": 0.4222464162583923, + "grad_norm": 0.14306298628347797, + "learning_rate": 0.0006479825235345686, + "loss": 1.4549, + "step": 4654 + }, + { + "epoch": 
0.4223371438940301, + "grad_norm": 0.15546931203650105, + "learning_rate": 0.0006478421726782663, + "loss": 1.3879, + "step": 4655 + }, + { + "epoch": 0.42242787152966793, + "grad_norm": 0.1416100339093936, + "learning_rate": 0.0006477018090557703, + "loss": 1.4081, + "step": 4656 + }, + { + "epoch": 0.42251859916530576, + "grad_norm": 0.14516602566607079, + "learning_rate": 0.0006475614326792012, + "loss": 1.417, + "step": 4657 + }, + { + "epoch": 0.42260932680094354, + "grad_norm": 0.14767192383283032, + "learning_rate": 0.0006474210435606809, + "loss": 1.4098, + "step": 4658 + }, + { + "epoch": 0.4227000544365814, + "grad_norm": 0.14604980120411964, + "learning_rate": 0.0006472806417123316, + "loss": 1.4312, + "step": 4659 + }, + { + "epoch": 0.4227907820722192, + "grad_norm": 0.1495051820783034, + "learning_rate": 0.0006471402271462773, + "loss": 1.4257, + "step": 4660 + }, + { + "epoch": 0.42288150970785704, + "grad_norm": 0.34442413297972957, + "learning_rate": 0.0006469997998746425, + "loss": 1.4289, + "step": 4661 + }, + { + "epoch": 0.4229722373434948, + "grad_norm": 0.13305839957212595, + "learning_rate": 0.0006468593599095535, + "loss": 1.3703, + "step": 4662 + }, + { + "epoch": 0.42306296497913265, + "grad_norm": 0.13945050979831192, + "learning_rate": 0.0006467189072631372, + "loss": 1.4101, + "step": 4663 + }, + { + "epoch": 0.4231536926147705, + "grad_norm": 0.14084489762021285, + "learning_rate": 0.0006465784419475214, + "loss": 1.4506, + "step": 4664 + }, + { + "epoch": 0.42324442025040826, + "grad_norm": 0.13425578093235582, + "learning_rate": 0.0006464379639748356, + "loss": 1.4161, + "step": 4665 + }, + { + "epoch": 0.4233351478860461, + "grad_norm": 0.14034432760899945, + "learning_rate": 0.0006462974733572101, + "loss": 1.4091, + "step": 4666 + }, + { + "epoch": 0.4234258755216839, + "grad_norm": 0.1419029745411268, + "learning_rate": 0.0006461569701067762, + "loss": 1.4354, + "step": 4667 + }, + { + "epoch": 0.4235166031573217, + "grad_norm": 0.13747573597628054, + "learning_rate": 0.0006460164542356665, + "loss": 1.3992, + "step": 4668 + }, + { + "epoch": 0.42360733079295954, + "grad_norm": 0.1481435582297619, + "learning_rate": 0.0006458759257560141, + "loss": 1.4052, + "step": 4669 + }, + { + "epoch": 0.42369805842859737, + "grad_norm": 0.14621947077710956, + "learning_rate": 0.0006457353846799544, + "loss": 1.4281, + "step": 4670 + }, + { + "epoch": 0.42378878606423515, + "grad_norm": 0.14488914564006386, + "learning_rate": 0.0006455948310196226, + "loss": 1.4571, + "step": 4671 + }, + { + "epoch": 0.423879513699873, + "grad_norm": 0.13724863310536597, + "learning_rate": 0.0006454542647871556, + "loss": 1.3762, + "step": 4672 + }, + { + "epoch": 0.4239702413355108, + "grad_norm": 0.15660918929645368, + "learning_rate": 0.0006453136859946915, + "loss": 1.4093, + "step": 4673 + }, + { + "epoch": 0.4240609689711486, + "grad_norm": 0.14780068775188213, + "learning_rate": 0.0006451730946543692, + "loss": 1.4004, + "step": 4674 + }, + { + "epoch": 0.4241516966067864, + "grad_norm": 0.15807651982298604, + "learning_rate": 0.0006450324907783288, + "loss": 1.435, + "step": 4675 + }, + { + "epoch": 0.42424242424242425, + "grad_norm": 0.14168189620634153, + "learning_rate": 0.0006448918743787113, + "loss": 1.415, + "step": 4676 + }, + { + "epoch": 0.42433315187806203, + "grad_norm": 0.13247285150493837, + "learning_rate": 0.0006447512454676593, + "loss": 1.4186, + "step": 4677 + }, + { + "epoch": 0.42442387951369986, + "grad_norm": 0.14018196977280775, + "learning_rate": 
0.0006446106040573158, + "loss": 1.389, + "step": 4678 + }, + { + "epoch": 0.4245146071493377, + "grad_norm": 0.14080639680664997, + "learning_rate": 0.0006444699501598252, + "loss": 1.4429, + "step": 4679 + }, + { + "epoch": 0.42460533478497553, + "grad_norm": 0.13220398577310388, + "learning_rate": 0.0006443292837873334, + "loss": 1.4379, + "step": 4680 + }, + { + "epoch": 0.4246960624206133, + "grad_norm": 0.15796497529479434, + "learning_rate": 0.0006441886049519864, + "loss": 1.4064, + "step": 4681 + }, + { + "epoch": 0.42478679005625114, + "grad_norm": 0.14167259479657582, + "learning_rate": 0.0006440479136659323, + "loss": 1.4183, + "step": 4682 + }, + { + "epoch": 0.424877517691889, + "grad_norm": 0.14302765108891874, + "learning_rate": 0.0006439072099413195, + "loss": 1.3941, + "step": 4683 + }, + { + "epoch": 0.42496824532752675, + "grad_norm": 0.17699831234829883, + "learning_rate": 0.0006437664937902981, + "loss": 1.3846, + "step": 4684 + }, + { + "epoch": 0.4250589729631646, + "grad_norm": 0.13218735140790117, + "learning_rate": 0.0006436257652250185, + "loss": 1.4092, + "step": 4685 + }, + { + "epoch": 0.4251497005988024, + "grad_norm": 0.17479750251259832, + "learning_rate": 0.0006434850242576331, + "loss": 1.4077, + "step": 4686 + }, + { + "epoch": 0.4252404282344402, + "grad_norm": 0.1355292768782275, + "learning_rate": 0.0006433442709002948, + "loss": 1.3899, + "step": 4687 + }, + { + "epoch": 0.425331155870078, + "grad_norm": 0.17310050869127802, + "learning_rate": 0.0006432035051651574, + "loss": 1.4335, + "step": 4688 + }, + { + "epoch": 0.42542188350571586, + "grad_norm": 0.12911305965143682, + "learning_rate": 0.0006430627270643762, + "loss": 1.4294, + "step": 4689 + }, + { + "epoch": 0.42551261114135364, + "grad_norm": 0.15879738959833897, + "learning_rate": 0.0006429219366101075, + "loss": 1.4571, + "step": 4690 + }, + { + "epoch": 0.42560333877699147, + "grad_norm": 0.13657973923477834, + "learning_rate": 0.0006427811338145084, + "loss": 1.4219, + "step": 4691 + }, + { + "epoch": 0.4256940664126293, + "grad_norm": 0.1484018975683406, + "learning_rate": 0.0006426403186897376, + "loss": 1.3947, + "step": 4692 + }, + { + "epoch": 0.4257847940482671, + "grad_norm": 0.13368940274061783, + "learning_rate": 0.000642499491247954, + "loss": 1.4224, + "step": 4693 + }, + { + "epoch": 0.4258755216839049, + "grad_norm": 0.13420965104684718, + "learning_rate": 0.0006423586515013185, + "loss": 1.4441, + "step": 4694 + }, + { + "epoch": 0.42596624931954274, + "grad_norm": 0.19410975782097525, + "learning_rate": 0.0006422177994619924, + "loss": 1.4447, + "step": 4695 + }, + { + "epoch": 0.4260569769551805, + "grad_norm": 0.13894583299307034, + "learning_rate": 0.0006420769351421383, + "loss": 1.3941, + "step": 4696 + }, + { + "epoch": 0.42614770459081835, + "grad_norm": 0.14259709096349882, + "learning_rate": 0.00064193605855392, + "loss": 1.4362, + "step": 4697 + }, + { + "epoch": 0.4262384322264562, + "grad_norm": 0.14263347792439024, + "learning_rate": 0.000641795169709502, + "loss": 1.4103, + "step": 4698 + }, + { + "epoch": 0.426329159862094, + "grad_norm": 0.14687912407981824, + "learning_rate": 0.0006416542686210505, + "loss": 1.3984, + "step": 4699 + }, + { + "epoch": 0.4264198874977318, + "grad_norm": 0.17249093156072048, + "learning_rate": 0.0006415133553007317, + "loss": 1.4367, + "step": 4700 + }, + { + "epoch": 0.42651061513336963, + "grad_norm": 0.14161189483553202, + "learning_rate": 0.0006413724297607139, + "loss": 1.4256, + "step": 4701 + }, + { + "epoch": 
0.42660134276900746, + "grad_norm": 0.16070055349065554, + "learning_rate": 0.000641231492013166, + "loss": 1.4122, + "step": 4702 + }, + { + "epoch": 0.42669207040464524, + "grad_norm": 0.157955370750823, + "learning_rate": 0.000641090542070258, + "loss": 1.4169, + "step": 4703 + }, + { + "epoch": 0.4267827980402831, + "grad_norm": 0.14365904285196304, + "learning_rate": 0.0006409495799441607, + "loss": 1.4161, + "step": 4704 + }, + { + "epoch": 0.4268735256759209, + "grad_norm": 0.13928364258451972, + "learning_rate": 0.0006408086056470466, + "loss": 1.41, + "step": 4705 + }, + { + "epoch": 0.4269642533115587, + "grad_norm": 0.13410457733926232, + "learning_rate": 0.0006406676191910885, + "loss": 1.3796, + "step": 4706 + }, + { + "epoch": 0.4270549809471965, + "grad_norm": 0.13896027335102717, + "learning_rate": 0.0006405266205884609, + "loss": 1.4438, + "step": 4707 + }, + { + "epoch": 0.42714570858283435, + "grad_norm": 0.1456255220150949, + "learning_rate": 0.0006403856098513387, + "loss": 1.3999, + "step": 4708 + }, + { + "epoch": 0.4272364362184721, + "grad_norm": 0.1339313790219614, + "learning_rate": 0.0006402445869918987, + "loss": 1.4271, + "step": 4709 + }, + { + "epoch": 0.42732716385410996, + "grad_norm": 0.14245946843591162, + "learning_rate": 0.0006401035520223178, + "loss": 1.4026, + "step": 4710 + }, + { + "epoch": 0.4274178914897478, + "grad_norm": 0.14977135604228595, + "learning_rate": 0.0006399625049547744, + "loss": 1.3984, + "step": 4711 + }, + { + "epoch": 0.42750861912538557, + "grad_norm": 0.1801112151993447, + "learning_rate": 0.0006398214458014482, + "loss": 1.3787, + "step": 4712 + }, + { + "epoch": 0.4275993467610234, + "grad_norm": 0.19694851459969356, + "learning_rate": 0.0006396803745745194, + "loss": 1.439, + "step": 4713 + }, + { + "epoch": 0.42769007439666124, + "grad_norm": 0.14848174258744115, + "learning_rate": 0.0006395392912861699, + "loss": 1.4112, + "step": 4714 + }, + { + "epoch": 0.427780802032299, + "grad_norm": 0.13820155826496744, + "learning_rate": 0.0006393981959485819, + "loss": 1.4284, + "step": 4715 + }, + { + "epoch": 0.42787152966793685, + "grad_norm": 0.13542950964973333, + "learning_rate": 0.0006392570885739392, + "loss": 1.4556, + "step": 4716 + }, + { + "epoch": 0.4279622573035747, + "grad_norm": 0.13189567711880576, + "learning_rate": 0.0006391159691744263, + "loss": 1.4072, + "step": 4717 + }, + { + "epoch": 0.4280529849392125, + "grad_norm": 0.14706080171551633, + "learning_rate": 0.000638974837762229, + "loss": 1.4443, + "step": 4718 + }, + { + "epoch": 0.4281437125748503, + "grad_norm": 0.15706864840403462, + "learning_rate": 0.0006388336943495339, + "loss": 1.4111, + "step": 4719 + }, + { + "epoch": 0.4282344402104881, + "grad_norm": 0.13749239084971696, + "learning_rate": 0.0006386925389485289, + "loss": 1.4172, + "step": 4720 + }, + { + "epoch": 0.42832516784612595, + "grad_norm": 0.12925722745745857, + "learning_rate": 0.0006385513715714028, + "loss": 1.4088, + "step": 4721 + }, + { + "epoch": 0.42841589548176373, + "grad_norm": 0.16892494096048172, + "learning_rate": 0.0006384101922303452, + "loss": 1.4464, + "step": 4722 + }, + { + "epoch": 0.42850662311740156, + "grad_norm": 0.1725321296013537, + "learning_rate": 0.0006382690009375471, + "loss": 1.3917, + "step": 4723 + }, + { + "epoch": 0.4285973507530394, + "grad_norm": 0.1375742600557294, + "learning_rate": 0.0006381277977052004, + "loss": 1.4286, + "step": 4724 + }, + { + "epoch": 0.4286880783886772, + "grad_norm": 0.13659331542733377, + "learning_rate": 
0.0006379865825454982, + "loss": 1.4347, + "step": 4725 + }, + { + "epoch": 0.428778806024315, + "grad_norm": 0.1476028378777982, + "learning_rate": 0.0006378453554706341, + "loss": 1.4246, + "step": 4726 + }, + { + "epoch": 0.42886953365995284, + "grad_norm": 0.1439639727192002, + "learning_rate": 0.0006377041164928033, + "loss": 1.426, + "step": 4727 + }, + { + "epoch": 0.4289602612955906, + "grad_norm": 0.13065666954699093, + "learning_rate": 0.0006375628656242017, + "loss": 1.4056, + "step": 4728 + }, + { + "epoch": 0.42905098893122845, + "grad_norm": 0.1265589514065612, + "learning_rate": 0.0006374216028770263, + "loss": 1.393, + "step": 4729 + }, + { + "epoch": 0.4291417165668663, + "grad_norm": 0.1337674114206482, + "learning_rate": 0.0006372803282634754, + "loss": 1.4178, + "step": 4730 + }, + { + "epoch": 0.42923244420250406, + "grad_norm": 0.13871465930517243, + "learning_rate": 0.0006371390417957477, + "loss": 1.4206, + "step": 4731 + }, + { + "epoch": 0.4293231718381419, + "grad_norm": 0.1319327516713598, + "learning_rate": 0.0006369977434860438, + "loss": 1.4295, + "step": 4732 + }, + { + "epoch": 0.4294138994737797, + "grad_norm": 0.12986580868005718, + "learning_rate": 0.0006368564333465645, + "loss": 1.4033, + "step": 4733 + }, + { + "epoch": 0.4295046271094175, + "grad_norm": 0.1403613268885277, + "learning_rate": 0.0006367151113895119, + "loss": 1.4045, + "step": 4734 + }, + { + "epoch": 0.42959535474505534, + "grad_norm": 0.13370612227351433, + "learning_rate": 0.0006365737776270892, + "loss": 1.4148, + "step": 4735 + }, + { + "epoch": 0.42968608238069317, + "grad_norm": 0.13486451895283758, + "learning_rate": 0.0006364324320715009, + "loss": 1.4129, + "step": 4736 + }, + { + "epoch": 0.429776810016331, + "grad_norm": 0.13424880028613942, + "learning_rate": 0.0006362910747349518, + "loss": 1.4163, + "step": 4737 + }, + { + "epoch": 0.4298675376519688, + "grad_norm": 0.13508022078641344, + "learning_rate": 0.0006361497056296486, + "loss": 1.4241, + "step": 4738 + }, + { + "epoch": 0.4299582652876066, + "grad_norm": 0.13749377139974991, + "learning_rate": 0.0006360083247677979, + "loss": 1.4699, + "step": 4739 + }, + { + "epoch": 0.43004899292324444, + "grad_norm": 0.12947389099155093, + "learning_rate": 0.0006358669321616084, + "loss": 1.4472, + "step": 4740 + }, + { + "epoch": 0.4301397205588822, + "grad_norm": 0.1291403672792497, + "learning_rate": 0.0006357255278232894, + "loss": 1.429, + "step": 4741 + }, + { + "epoch": 0.43023044819452005, + "grad_norm": 0.13094204010893074, + "learning_rate": 0.000635584111765051, + "loss": 1.411, + "step": 4742 + }, + { + "epoch": 0.4303211758301579, + "grad_norm": 0.14040793775468663, + "learning_rate": 0.0006354426839991046, + "loss": 1.4123, + "step": 4743 + }, + { + "epoch": 0.43041190346579566, + "grad_norm": 0.13912235441767454, + "learning_rate": 0.0006353012445376624, + "loss": 1.4172, + "step": 4744 + }, + { + "epoch": 0.4305026311014335, + "grad_norm": 0.17823341812325466, + "learning_rate": 0.0006351597933929379, + "loss": 1.3914, + "step": 4745 + }, + { + "epoch": 0.43059335873707133, + "grad_norm": 0.1369109871344046, + "learning_rate": 0.0006350183305771453, + "loss": 1.409, + "step": 4746 + }, + { + "epoch": 0.4306840863727091, + "grad_norm": 0.17102996405403095, + "learning_rate": 0.0006348768561024998, + "loss": 1.4436, + "step": 4747 + }, + { + "epoch": 0.43077481400834694, + "grad_norm": 0.1425613040225065, + "learning_rate": 0.0006347353699812181, + "loss": 1.4056, + "step": 4748 + }, + { + "epoch": 
0.4308655416439848, + "grad_norm": 0.1651342720610057, + "learning_rate": 0.0006345938722255171, + "loss": 1.4201, + "step": 4749 + }, + { + "epoch": 0.43095626927962255, + "grad_norm": 0.1319488962566797, + "learning_rate": 0.0006344523628476156, + "loss": 1.4751, + "step": 4750 + }, + { + "epoch": 0.4310469969152604, + "grad_norm": 0.16945542219964344, + "learning_rate": 0.0006343108418597328, + "loss": 1.413, + "step": 4751 + }, + { + "epoch": 0.4311377245508982, + "grad_norm": 0.1421855674523395, + "learning_rate": 0.0006341693092740887, + "loss": 1.4242, + "step": 4752 + }, + { + "epoch": 0.431228452186536, + "grad_norm": 0.1761687375246798, + "learning_rate": 0.0006340277651029054, + "loss": 1.3874, + "step": 4753 + }, + { + "epoch": 0.4313191798221738, + "grad_norm": 0.13298691523179537, + "learning_rate": 0.0006338862093584046, + "loss": 1.4201, + "step": 4754 + }, + { + "epoch": 0.43140990745781166, + "grad_norm": 0.12621343279813868, + "learning_rate": 0.0006337446420528099, + "loss": 1.3981, + "step": 4755 + }, + { + "epoch": 0.4315006350934495, + "grad_norm": 0.14625219647113613, + "learning_rate": 0.0006336030631983456, + "loss": 1.4311, + "step": 4756 + }, + { + "epoch": 0.43159136272908727, + "grad_norm": 0.1250499257080885, + "learning_rate": 0.0006334614728072371, + "loss": 1.373, + "step": 4757 + }, + { + "epoch": 0.4316820903647251, + "grad_norm": 0.13264629099723982, + "learning_rate": 0.000633319870891711, + "loss": 1.4375, + "step": 4758 + }, + { + "epoch": 0.43177281800036293, + "grad_norm": 0.1546360593083616, + "learning_rate": 0.0006331782574639939, + "loss": 1.3803, + "step": 4759 + }, + { + "epoch": 0.4318635456360007, + "grad_norm": 0.12992594495720702, + "learning_rate": 0.000633036632536315, + "loss": 1.4254, + "step": 4760 + }, + { + "epoch": 0.43195427327163854, + "grad_norm": 0.1374560945479263, + "learning_rate": 0.0006328949961209033, + "loss": 1.4468, + "step": 4761 + }, + { + "epoch": 0.4320450009072764, + "grad_norm": 0.15564274683757975, + "learning_rate": 0.000632753348229989, + "loss": 1.4175, + "step": 4762 + }, + { + "epoch": 0.43213572854291415, + "grad_norm": 0.13969320316044923, + "learning_rate": 0.0006326116888758036, + "loss": 1.4409, + "step": 4763 + }, + { + "epoch": 0.432226456178552, + "grad_norm": 0.12930800888442245, + "learning_rate": 0.0006324700180705791, + "loss": 1.4068, + "step": 4764 + }, + { + "epoch": 0.4323171838141898, + "grad_norm": 0.13324666350430145, + "learning_rate": 0.0006323283358265492, + "loss": 1.4364, + "step": 4765 + }, + { + "epoch": 0.4324079114498276, + "grad_norm": 0.13035797942785393, + "learning_rate": 0.000632186642155948, + "loss": 1.4277, + "step": 4766 + }, + { + "epoch": 0.43249863908546543, + "grad_norm": 0.14456138261316037, + "learning_rate": 0.0006320449370710107, + "loss": 1.4218, + "step": 4767 + }, + { + "epoch": 0.43258936672110326, + "grad_norm": 0.13798989874095502, + "learning_rate": 0.0006319032205839737, + "loss": 1.4234, + "step": 4768 + }, + { + "epoch": 0.43268009435674104, + "grad_norm": 0.1297103211009053, + "learning_rate": 0.000631761492707074, + "loss": 1.4136, + "step": 4769 + }, + { + "epoch": 0.4327708219923789, + "grad_norm": 0.12843395562312016, + "learning_rate": 0.0006316197534525502, + "loss": 1.3837, + "step": 4770 + }, + { + "epoch": 0.4328615496280167, + "grad_norm": 0.13774476727820656, + "learning_rate": 0.0006314780028326411, + "loss": 1.3815, + "step": 4771 + }, + { + "epoch": 0.4329522772636545, + "grad_norm": 0.14088094106406712, + "learning_rate": 
0.0006313362408595873, + "loss": 1.4227, + "step": 4772 + }, + { + "epoch": 0.4330430048992923, + "grad_norm": 0.12520946708428335, + "learning_rate": 0.0006311944675456296, + "loss": 1.4154, + "step": 4773 + }, + { + "epoch": 0.43313373253493015, + "grad_norm": 0.1419668637636904, + "learning_rate": 0.0006310526829030101, + "loss": 1.3282, + "step": 4774 + }, + { + "epoch": 0.433224460170568, + "grad_norm": 0.13523902568707236, + "learning_rate": 0.0006309108869439725, + "loss": 1.41, + "step": 4775 + }, + { + "epoch": 0.43331518780620576, + "grad_norm": 0.14620842324980543, + "learning_rate": 0.0006307690796807602, + "loss": 1.3987, + "step": 4776 + }, + { + "epoch": 0.4334059154418436, + "grad_norm": 0.14157472823044034, + "learning_rate": 0.0006306272611256188, + "loss": 1.38, + "step": 4777 + }, + { + "epoch": 0.4334966430774814, + "grad_norm": 0.13510829036767286, + "learning_rate": 0.0006304854312907941, + "loss": 1.4189, + "step": 4778 + }, + { + "epoch": 0.4335873707131192, + "grad_norm": 0.15887651075665774, + "learning_rate": 0.000630343590188533, + "loss": 1.4321, + "step": 4779 + }, + { + "epoch": 0.43367809834875704, + "grad_norm": 0.13526159730908247, + "learning_rate": 0.000630201737831084, + "loss": 1.3944, + "step": 4780 + }, + { + "epoch": 0.43376882598439487, + "grad_norm": 0.12577682373385335, + "learning_rate": 0.0006300598742306955, + "loss": 1.4449, + "step": 4781 + }, + { + "epoch": 0.43385955362003265, + "grad_norm": 0.12745546332585453, + "learning_rate": 0.0006299179993996178, + "loss": 1.4253, + "step": 4782 + }, + { + "epoch": 0.4339502812556705, + "grad_norm": 0.1322039937303052, + "learning_rate": 0.0006297761133501016, + "loss": 1.4196, + "step": 4783 + }, + { + "epoch": 0.4340410088913083, + "grad_norm": 0.14031172663225167, + "learning_rate": 0.0006296342160943991, + "loss": 1.4246, + "step": 4784 + }, + { + "epoch": 0.4341317365269461, + "grad_norm": 0.1282002598745056, + "learning_rate": 0.0006294923076447628, + "loss": 1.4425, + "step": 4785 + }, + { + "epoch": 0.4342224641625839, + "grad_norm": 0.13361917944530832, + "learning_rate": 0.0006293503880134466, + "loss": 1.4124, + "step": 4786 + }, + { + "epoch": 0.43431319179822175, + "grad_norm": 0.15360764602009203, + "learning_rate": 0.0006292084572127054, + "loss": 1.39, + "step": 4787 + }, + { + "epoch": 0.43440391943385953, + "grad_norm": 0.13371972482789587, + "learning_rate": 0.0006290665152547948, + "loss": 1.3843, + "step": 4788 + }, + { + "epoch": 0.43449464706949736, + "grad_norm": 0.14591610027741006, + "learning_rate": 0.0006289245621519717, + "loss": 1.4708, + "step": 4789 + }, + { + "epoch": 0.4345853747051352, + "grad_norm": 0.13830908561881444, + "learning_rate": 0.0006287825979164936, + "loss": 1.4114, + "step": 4790 + }, + { + "epoch": 0.434676102340773, + "grad_norm": 0.12681639823313995, + "learning_rate": 0.0006286406225606191, + "loss": 1.3682, + "step": 4791 + }, + { + "epoch": 0.4347668299764108, + "grad_norm": 0.12861692804004146, + "learning_rate": 0.000628498636096608, + "loss": 1.4016, + "step": 4792 + }, + { + "epoch": 0.43485755761204864, + "grad_norm": 0.12969012346394773, + "learning_rate": 0.0006283566385367207, + "loss": 1.4245, + "step": 4793 + }, + { + "epoch": 0.4349482852476865, + "grad_norm": 0.16080992953536344, + "learning_rate": 0.0006282146298932187, + "loss": 1.424, + "step": 4794 + }, + { + "epoch": 0.43503901288332425, + "grad_norm": 0.129850489293655, + "learning_rate": 0.0006280726101783647, + "loss": 1.4026, + "step": 4795 + }, + { + "epoch": 
0.4351297405189621, + "grad_norm": 0.12963696461317256, + "learning_rate": 0.0006279305794044218, + "loss": 1.401, + "step": 4796 + }, + { + "epoch": 0.4352204681545999, + "grad_norm": 0.13682435747927374, + "learning_rate": 0.0006277885375836545, + "loss": 1.4273, + "step": 4797 + }, + { + "epoch": 0.4353111957902377, + "grad_norm": 0.13071877232117712, + "learning_rate": 0.000627646484728328, + "loss": 1.457, + "step": 4798 + }, + { + "epoch": 0.4354019234258755, + "grad_norm": 0.1262448679557309, + "learning_rate": 0.0006275044208507091, + "loss": 1.4157, + "step": 4799 + }, + { + "epoch": 0.43549265106151336, + "grad_norm": 0.13146200892590315, + "learning_rate": 0.0006273623459630644, + "loss": 1.4302, + "step": 4800 + }, + { + "epoch": 0.43558337869715114, + "grad_norm": 0.12671991289828044, + "learning_rate": 0.0006272202600776624, + "loss": 1.4248, + "step": 4801 + }, + { + "epoch": 0.43567410633278897, + "grad_norm": 0.13595903510231241, + "learning_rate": 0.0006270781632067721, + "loss": 1.4107, + "step": 4802 + }, + { + "epoch": 0.4357648339684268, + "grad_norm": 0.13552254171024028, + "learning_rate": 0.0006269360553626637, + "loss": 1.3889, + "step": 4803 + }, + { + "epoch": 0.4358555616040646, + "grad_norm": 0.13416484334206308, + "learning_rate": 0.0006267939365576085, + "loss": 1.3893, + "step": 4804 + }, + { + "epoch": 0.4359462892397024, + "grad_norm": 0.12575696927716445, + "learning_rate": 0.000626651806803878, + "loss": 1.4253, + "step": 4805 + }, + { + "epoch": 0.43603701687534024, + "grad_norm": 0.1292198798239074, + "learning_rate": 0.0006265096661137452, + "loss": 1.4375, + "step": 4806 + }, + { + "epoch": 0.436127744510978, + "grad_norm": 0.12555107106938942, + "learning_rate": 0.0006263675144994841, + "loss": 1.4365, + "step": 4807 + }, + { + "epoch": 0.43621847214661585, + "grad_norm": 0.12301646506168415, + "learning_rate": 0.0006262253519733696, + "loss": 1.4103, + "step": 4808 + }, + { + "epoch": 0.4363091997822537, + "grad_norm": 0.13070716375464758, + "learning_rate": 0.0006260831785476773, + "loss": 1.4102, + "step": 4809 + }, + { + "epoch": 0.43639992741789146, + "grad_norm": 0.12572832745019571, + "learning_rate": 0.000625940994234684, + "loss": 1.452, + "step": 4810 + }, + { + "epoch": 0.4364906550535293, + "grad_norm": 0.1304622215945398, + "learning_rate": 0.0006257987990466674, + "loss": 1.3823, + "step": 4811 + }, + { + "epoch": 0.43658138268916713, + "grad_norm": 0.13128893936216623, + "learning_rate": 0.0006256565929959057, + "loss": 1.3929, + "step": 4812 + }, + { + "epoch": 0.43667211032480496, + "grad_norm": 0.12898362725655962, + "learning_rate": 0.0006255143760946788, + "loss": 1.4333, + "step": 4813 + }, + { + "epoch": 0.43676283796044274, + "grad_norm": 0.18971126310729505, + "learning_rate": 0.000625372148355267, + "loss": 1.4248, + "step": 4814 + }, + { + "epoch": 0.4368535655960806, + "grad_norm": 0.13199441066945342, + "learning_rate": 0.0006252299097899517, + "loss": 1.407, + "step": 4815 + }, + { + "epoch": 0.4369442932317184, + "grad_norm": 0.23205977560799276, + "learning_rate": 0.0006250876604110153, + "loss": 1.4326, + "step": 4816 + }, + { + "epoch": 0.4370350208673562, + "grad_norm": 0.13777833013489593, + "learning_rate": 0.000624945400230741, + "loss": 1.4029, + "step": 4817 + }, + { + "epoch": 0.437125748502994, + "grad_norm": 0.13824664494961225, + "learning_rate": 0.0006248031292614129, + "loss": 1.4218, + "step": 4818 + }, + { + "epoch": 0.43721647613863185, + "grad_norm": 0.2883695558244734, + "learning_rate": 
0.0006246608475153164, + "loss": 1.3791, + "step": 4819 + }, + { + "epoch": 0.4373072037742696, + "grad_norm": 0.13136643013637925, + "learning_rate": 0.0006245185550047371, + "loss": 1.3865, + "step": 4820 + }, + { + "epoch": 0.43739793140990746, + "grad_norm": 0.13166554967613014, + "learning_rate": 0.0006243762517419622, + "loss": 1.4108, + "step": 4821 + }, + { + "epoch": 0.4374886590455453, + "grad_norm": 0.14128933649655698, + "learning_rate": 0.0006242339377392799, + "loss": 1.3926, + "step": 4822 + }, + { + "epoch": 0.43757938668118307, + "grad_norm": 0.13436113097227886, + "learning_rate": 0.0006240916130089784, + "loss": 1.3853, + "step": 4823 + }, + { + "epoch": 0.4376701143168209, + "grad_norm": 0.21683665142679728, + "learning_rate": 0.0006239492775633482, + "loss": 1.4341, + "step": 4824 + }, + { + "epoch": 0.43776084195245873, + "grad_norm": 0.13482973606360496, + "learning_rate": 0.0006238069314146793, + "loss": 1.3868, + "step": 4825 + }, + { + "epoch": 0.4378515695880965, + "grad_norm": 0.13333557073265576, + "learning_rate": 0.0006236645745752638, + "loss": 1.4116, + "step": 4826 + }, + { + "epoch": 0.43794229722373434, + "grad_norm": 0.15229223504504646, + "learning_rate": 0.0006235222070573941, + "loss": 1.3724, + "step": 4827 + }, + { + "epoch": 0.4380330248593722, + "grad_norm": 0.14596388877445796, + "learning_rate": 0.0006233798288733635, + "loss": 1.4102, + "step": 4828 + }, + { + "epoch": 0.43812375249500995, + "grad_norm": 0.13717054719116537, + "learning_rate": 0.0006232374400354664, + "loss": 1.4772, + "step": 4829 + }, + { + "epoch": 0.4382144801306478, + "grad_norm": 0.12843440467525893, + "learning_rate": 0.0006230950405559983, + "loss": 1.4282, + "step": 4830 + }, + { + "epoch": 0.4383052077662856, + "grad_norm": 0.1355157105067466, + "learning_rate": 0.0006229526304472555, + "loss": 1.4402, + "step": 4831 + }, + { + "epoch": 0.43839593540192345, + "grad_norm": 0.13543077874951115, + "learning_rate": 0.0006228102097215346, + "loss": 1.3746, + "step": 4832 + }, + { + "epoch": 0.43848666303756123, + "grad_norm": 0.2117955933429162, + "learning_rate": 0.0006226677783911342, + "loss": 1.4174, + "step": 4833 + }, + { + "epoch": 0.43857739067319906, + "grad_norm": 0.1266092884250909, + "learning_rate": 0.0006225253364683529, + "loss": 1.4018, + "step": 4834 + }, + { + "epoch": 0.4386681183088369, + "grad_norm": 0.13605320061288634, + "learning_rate": 0.000622382883965491, + "loss": 1.4199, + "step": 4835 + }, + { + "epoch": 0.4387588459444747, + "grad_norm": 0.13122565380127305, + "learning_rate": 0.0006222404208948488, + "loss": 1.3738, + "step": 4836 + }, + { + "epoch": 0.4388495735801125, + "grad_norm": 0.14082749106308462, + "learning_rate": 0.0006220979472687283, + "loss": 1.4029, + "step": 4837 + }, + { + "epoch": 0.43894030121575034, + "grad_norm": 0.17184784948369095, + "learning_rate": 0.0006219554630994322, + "loss": 1.4194, + "step": 4838 + }, + { + "epoch": 0.4390310288513881, + "grad_norm": 0.15598588197393404, + "learning_rate": 0.0006218129683992637, + "loss": 1.4241, + "step": 4839 + }, + { + "epoch": 0.43912175648702595, + "grad_norm": 0.14117647216193302, + "learning_rate": 0.0006216704631805274, + "loss": 1.4109, + "step": 4840 + }, + { + "epoch": 0.4392124841226638, + "grad_norm": 0.13552439694720927, + "learning_rate": 0.0006215279474555288, + "loss": 1.4301, + "step": 4841 + }, + { + "epoch": 0.43930321175830156, + "grad_norm": 0.14921280770415887, + "learning_rate": 0.000621385421236574, + "loss": 1.4313, + "step": 4842 + }, + { + "epoch": 
0.4393939393939394, + "grad_norm": 0.12707035572490324, + "learning_rate": 0.0006212428845359702, + "loss": 1.3895, + "step": 4843 + }, + { + "epoch": 0.4394846670295772, + "grad_norm": 0.16411084822755775, + "learning_rate": 0.0006211003373660255, + "loss": 1.4047, + "step": 4844 + }, + { + "epoch": 0.439575394665215, + "grad_norm": 0.143144450583268, + "learning_rate": 0.0006209577797390486, + "loss": 1.443, + "step": 4845 + }, + { + "epoch": 0.43966612230085284, + "grad_norm": 0.14520932947303342, + "learning_rate": 0.0006208152116673498, + "loss": 1.408, + "step": 4846 + }, + { + "epoch": 0.43975684993649067, + "grad_norm": 0.14748995885179272, + "learning_rate": 0.0006206726331632397, + "loss": 1.4237, + "step": 4847 + }, + { + "epoch": 0.43984757757212845, + "grad_norm": 0.1373114767752759, + "learning_rate": 0.0006205300442390298, + "loss": 1.4095, + "step": 4848 + }, + { + "epoch": 0.4399383052077663, + "grad_norm": 0.1530408091182321, + "learning_rate": 0.0006203874449070328, + "loss": 1.4473, + "step": 4849 + }, + { + "epoch": 0.4400290328434041, + "grad_norm": 0.1365451055769037, + "learning_rate": 0.0006202448351795622, + "loss": 1.4064, + "step": 4850 + }, + { + "epoch": 0.44011976047904194, + "grad_norm": 0.2995534305453971, + "learning_rate": 0.0006201022150689325, + "loss": 1.4061, + "step": 4851 + }, + { + "epoch": 0.4402104881146797, + "grad_norm": 0.1323954840083028, + "learning_rate": 0.0006199595845874586, + "loss": 1.364, + "step": 4852 + }, + { + "epoch": 0.44030121575031755, + "grad_norm": 0.16723694818187385, + "learning_rate": 0.0006198169437474572, + "loss": 1.4125, + "step": 4853 + }, + { + "epoch": 0.4403919433859554, + "grad_norm": 0.14868163538177748, + "learning_rate": 0.0006196742925612447, + "loss": 1.411, + "step": 4854 + }, + { + "epoch": 0.44048267102159316, + "grad_norm": 0.14664198362531278, + "learning_rate": 0.0006195316310411396, + "loss": 1.4093, + "step": 4855 + }, + { + "epoch": 0.440573398657231, + "grad_norm": 0.16721115922356547, + "learning_rate": 0.0006193889591994604, + "loss": 1.4101, + "step": 4856 + }, + { + "epoch": 0.44066412629286883, + "grad_norm": 0.15464433417620013, + "learning_rate": 0.0006192462770485272, + "loss": 1.3949, + "step": 4857 + }, + { + "epoch": 0.4407548539285066, + "grad_norm": 0.14803745748136513, + "learning_rate": 0.0006191035846006603, + "loss": 1.4262, + "step": 4858 + }, + { + "epoch": 0.44084558156414444, + "grad_norm": 0.13625159850955593, + "learning_rate": 0.0006189608818681813, + "loss": 1.4577, + "step": 4859 + }, + { + "epoch": 0.4409363091997823, + "grad_norm": 0.13535388359169048, + "learning_rate": 0.0006188181688634127, + "loss": 1.3736, + "step": 4860 + }, + { + "epoch": 0.44102703683542005, + "grad_norm": 0.14966576602714343, + "learning_rate": 0.0006186754455986776, + "loss": 1.4314, + "step": 4861 + }, + { + "epoch": 0.4411177644710579, + "grad_norm": 0.14321403064569346, + "learning_rate": 0.0006185327120863003, + "loss": 1.3885, + "step": 4862 + }, + { + "epoch": 0.4412084921066957, + "grad_norm": 0.145205840482513, + "learning_rate": 0.0006183899683386059, + "loss": 1.4008, + "step": 4863 + }, + { + "epoch": 0.4412992197423335, + "grad_norm": 0.1573118963831874, + "learning_rate": 0.00061824721436792, + "loss": 1.3941, + "step": 4864 + }, + { + "epoch": 0.4413899473779713, + "grad_norm": 0.14076553512326093, + "learning_rate": 0.0006181044501865702, + "loss": 1.4102, + "step": 4865 + }, + { + "epoch": 0.44148067501360916, + "grad_norm": 0.13713475871117634, + "learning_rate": 
0.0006179616758068834, + "loss": 1.447, + "step": 4866 + }, + { + "epoch": 0.44157140264924694, + "grad_norm": 0.1374821351341478, + "learning_rate": 0.0006178188912411886, + "loss": 1.4345, + "step": 4867 + }, + { + "epoch": 0.44166213028488477, + "grad_norm": 0.1409568818444109, + "learning_rate": 0.0006176760965018151, + "loss": 1.455, + "step": 4868 + }, + { + "epoch": 0.4417528579205226, + "grad_norm": 0.14213777167164476, + "learning_rate": 0.0006175332916010934, + "loss": 1.412, + "step": 4869 + }, + { + "epoch": 0.44184358555616043, + "grad_norm": 0.13320705155248821, + "learning_rate": 0.0006173904765513547, + "loss": 1.4043, + "step": 4870 + }, + { + "epoch": 0.4419343131917982, + "grad_norm": 0.14699000214579375, + "learning_rate": 0.000617247651364931, + "loss": 1.386, + "step": 4871 + }, + { + "epoch": 0.44202504082743604, + "grad_norm": 0.14898039727032014, + "learning_rate": 0.0006171048160541553, + "loss": 1.418, + "step": 4872 + }, + { + "epoch": 0.4421157684630739, + "grad_norm": 0.14280707318733624, + "learning_rate": 0.0006169619706313614, + "loss": 1.4195, + "step": 4873 + }, + { + "epoch": 0.44220649609871165, + "grad_norm": 0.14234882048519942, + "learning_rate": 0.0006168191151088841, + "loss": 1.4182, + "step": 4874 + }, + { + "epoch": 0.4422972237343495, + "grad_norm": 0.12841106841655164, + "learning_rate": 0.0006166762494990592, + "loss": 1.4143, + "step": 4875 + }, + { + "epoch": 0.4423879513699873, + "grad_norm": 0.16300644709622084, + "learning_rate": 0.0006165333738142227, + "loss": 1.3788, + "step": 4876 + }, + { + "epoch": 0.4424786790056251, + "grad_norm": 0.14984806946937979, + "learning_rate": 0.0006163904880667123, + "loss": 1.4393, + "step": 4877 + }, + { + "epoch": 0.44256940664126293, + "grad_norm": 0.12682772350984584, + "learning_rate": 0.0006162475922688662, + "loss": 1.4046, + "step": 4878 + }, + { + "epoch": 0.44266013427690076, + "grad_norm": 0.13139800617385844, + "learning_rate": 0.0006161046864330233, + "loss": 1.3958, + "step": 4879 + }, + { + "epoch": 0.44275086191253854, + "grad_norm": 0.1294982848140813, + "learning_rate": 0.0006159617705715236, + "loss": 1.4256, + "step": 4880 + }, + { + "epoch": 0.4428415895481764, + "grad_norm": 0.13278794936148305, + "learning_rate": 0.0006158188446967079, + "loss": 1.396, + "step": 4881 + }, + { + "epoch": 0.4429323171838142, + "grad_norm": 0.14406460856067493, + "learning_rate": 0.000615675908820918, + "loss": 1.4607, + "step": 4882 + }, + { + "epoch": 0.443023044819452, + "grad_norm": 0.13885104534075104, + "learning_rate": 0.0006155329629564963, + "loss": 1.4353, + "step": 4883 + }, + { + "epoch": 0.4431137724550898, + "grad_norm": 0.13182147225434068, + "learning_rate": 0.0006153900071157861, + "loss": 1.4318, + "step": 4884 + }, + { + "epoch": 0.44320450009072765, + "grad_norm": 0.1647505362836999, + "learning_rate": 0.0006152470413111319, + "loss": 1.3994, + "step": 4885 + }, + { + "epoch": 0.4432952277263654, + "grad_norm": 0.14529426134961157, + "learning_rate": 0.0006151040655548785, + "loss": 1.3853, + "step": 4886 + }, + { + "epoch": 0.44338595536200326, + "grad_norm": 0.12982703374335414, + "learning_rate": 0.0006149610798593722, + "loss": 1.3936, + "step": 4887 + }, + { + "epoch": 0.4434766829976411, + "grad_norm": 0.1343165724236935, + "learning_rate": 0.0006148180842369597, + "loss": 1.4136, + "step": 4888 + }, + { + "epoch": 0.44356741063327887, + "grad_norm": 0.2283257761594063, + "learning_rate": 0.0006146750786999888, + "loss": 1.4265, + "step": 4889 + }, + { + "epoch": 
0.4436581382689167, + "grad_norm": 0.13889688932401634, + "learning_rate": 0.0006145320632608079, + "loss": 1.3998, + "step": 4890 + }, + { + "epoch": 0.44374886590455453, + "grad_norm": 0.14312663532416728, + "learning_rate": 0.0006143890379317664, + "loss": 1.4407, + "step": 4891 + }, + { + "epoch": 0.44383959354019237, + "grad_norm": 0.13993093782276286, + "learning_rate": 0.0006142460027252147, + "loss": 1.4266, + "step": 4892 + }, + { + "epoch": 0.44393032117583014, + "grad_norm": 0.14733451838391123, + "learning_rate": 0.0006141029576535037, + "loss": 1.4534, + "step": 4893 + }, + { + "epoch": 0.444021048811468, + "grad_norm": 0.13468756963902162, + "learning_rate": 0.0006139599027289856, + "loss": 1.3971, + "step": 4894 + }, + { + "epoch": 0.4441117764471058, + "grad_norm": 0.153201974596806, + "learning_rate": 0.0006138168379640131, + "loss": 1.4072, + "step": 4895 + }, + { + "epoch": 0.4442025040827436, + "grad_norm": 0.18689411539775616, + "learning_rate": 0.0006136737633709398, + "loss": 1.4074, + "step": 4896 + }, + { + "epoch": 0.4442932317183814, + "grad_norm": 0.17142996957902373, + "learning_rate": 0.0006135306789621204, + "loss": 1.3745, + "step": 4897 + }, + { + "epoch": 0.44438395935401925, + "grad_norm": 0.1266940438076139, + "learning_rate": 0.0006133875847499101, + "loss": 1.391, + "step": 4898 + }, + { + "epoch": 0.44447468698965703, + "grad_norm": 0.1437556501051644, + "learning_rate": 0.000613244480746665, + "loss": 1.4047, + "step": 4899 + }, + { + "epoch": 0.44456541462529486, + "grad_norm": 0.12991841267118032, + "learning_rate": 0.0006131013669647425, + "loss": 1.408, + "step": 4900 + }, + { + "epoch": 0.4446561422609327, + "grad_norm": 0.14596982881193404, + "learning_rate": 0.0006129582434165001, + "loss": 1.4241, + "step": 4901 + }, + { + "epoch": 0.4447468698965705, + "grad_norm": 0.13612081363376302, + "learning_rate": 0.0006128151101142969, + "loss": 1.4034, + "step": 4902 + }, + { + "epoch": 0.4448375975322083, + "grad_norm": 0.16328157121788206, + "learning_rate": 0.0006126719670704921, + "loss": 1.4227, + "step": 4903 + }, + { + "epoch": 0.44492832516784614, + "grad_norm": 0.16981674158277948, + "learning_rate": 0.0006125288142974466, + "loss": 1.4256, + "step": 4904 + }, + { + "epoch": 0.4450190528034839, + "grad_norm": 0.13294195452067534, + "learning_rate": 0.0006123856518075213, + "loss": 1.3988, + "step": 4905 + }, + { + "epoch": 0.44510978043912175, + "grad_norm": 0.1462582051600791, + "learning_rate": 0.0006122424796130783, + "loss": 1.4347, + "step": 4906 + }, + { + "epoch": 0.4452005080747596, + "grad_norm": 0.14848989702017415, + "learning_rate": 0.0006120992977264808, + "loss": 1.3719, + "step": 4907 + }, + { + "epoch": 0.44529123571039736, + "grad_norm": 0.1483934985654517, + "learning_rate": 0.0006119561061600923, + "loss": 1.4085, + "step": 4908 + }, + { + "epoch": 0.4453819633460352, + "grad_norm": 0.13284270924866687, + "learning_rate": 0.0006118129049262777, + "loss": 1.3973, + "step": 4909 + }, + { + "epoch": 0.445472690981673, + "grad_norm": 0.14637305581149945, + "learning_rate": 0.0006116696940374021, + "loss": 1.4343, + "step": 4910 + }, + { + "epoch": 0.44556341861731086, + "grad_norm": 0.1345627197957946, + "learning_rate": 0.000611526473505832, + "loss": 1.4024, + "step": 4911 + }, + { + "epoch": 0.44565414625294864, + "grad_norm": 0.15312290724732142, + "learning_rate": 0.0006113832433439345, + "loss": 1.4363, + "step": 4912 + }, + { + "epoch": 0.44574487388858647, + "grad_norm": 0.13987399863521502, + "learning_rate": 
0.0006112400035640775, + "loss": 1.4359, + "step": 4913 + }, + { + "epoch": 0.4458356015242243, + "grad_norm": 0.13890565180035083, + "learning_rate": 0.0006110967541786299, + "loss": 1.4612, + "step": 4914 + }, + { + "epoch": 0.4459263291598621, + "grad_norm": 0.1484408160402715, + "learning_rate": 0.000610953495199961, + "loss": 1.3959, + "step": 4915 + }, + { + "epoch": 0.4460170567954999, + "grad_norm": 0.14228961884129274, + "learning_rate": 0.0006108102266404416, + "loss": 1.4551, + "step": 4916 + }, + { + "epoch": 0.44610778443113774, + "grad_norm": 0.12624700557789997, + "learning_rate": 0.0006106669485124428, + "loss": 1.3863, + "step": 4917 + }, + { + "epoch": 0.4461985120667755, + "grad_norm": 0.1396465428394517, + "learning_rate": 0.0006105236608283366, + "loss": 1.4194, + "step": 4918 + }, + { + "epoch": 0.44628923970241335, + "grad_norm": 0.2164249573400028, + "learning_rate": 0.000610380363600496, + "loss": 1.4119, + "step": 4919 + }, + { + "epoch": 0.4463799673380512, + "grad_norm": 0.12650569568053552, + "learning_rate": 0.0006102370568412947, + "loss": 1.4263, + "step": 4920 + }, + { + "epoch": 0.44647069497368896, + "grad_norm": 0.13762510272847064, + "learning_rate": 0.0006100937405631075, + "loss": 1.3833, + "step": 4921 + }, + { + "epoch": 0.4465614226093268, + "grad_norm": 0.16567897693856726, + "learning_rate": 0.0006099504147783094, + "loss": 1.4362, + "step": 4922 + }, + { + "epoch": 0.44665215024496463, + "grad_norm": 0.1328421510817229, + "learning_rate": 0.0006098070794992768, + "loss": 1.3975, + "step": 4923 + }, + { + "epoch": 0.4467428778806024, + "grad_norm": 0.13125001967591543, + "learning_rate": 0.0006096637347383866, + "loss": 1.4405, + "step": 4924 + }, + { + "epoch": 0.44683360551624024, + "grad_norm": 0.1457264033343502, + "learning_rate": 0.0006095203805080169, + "loss": 1.4116, + "step": 4925 + }, + { + "epoch": 0.4469243331518781, + "grad_norm": 0.12882186903626797, + "learning_rate": 0.0006093770168205463, + "loss": 1.3828, + "step": 4926 + }, + { + "epoch": 0.44701506078751585, + "grad_norm": 0.13798387926809894, + "learning_rate": 0.0006092336436883539, + "loss": 1.412, + "step": 4927 + }, + { + "epoch": 0.4471057884231537, + "grad_norm": 0.12653837801040305, + "learning_rate": 0.0006090902611238203, + "loss": 1.4405, + "step": 4928 + }, + { + "epoch": 0.4471965160587915, + "grad_norm": 0.12709076127601227, + "learning_rate": 0.000608946869139327, + "loss": 1.4265, + "step": 4929 + }, + { + "epoch": 0.44728724369442935, + "grad_norm": 0.12611749750179127, + "learning_rate": 0.0006088034677472549, + "loss": 1.3592, + "step": 4930 + }, + { + "epoch": 0.4473779713300671, + "grad_norm": 0.13623869595277072, + "learning_rate": 0.0006086600569599877, + "loss": 1.4466, + "step": 4931 + }, + { + "epoch": 0.44746869896570496, + "grad_norm": 0.15036435877446055, + "learning_rate": 0.0006085166367899085, + "loss": 1.3742, + "step": 4932 + }, + { + "epoch": 0.4475594266013428, + "grad_norm": 0.1294302149936901, + "learning_rate": 0.0006083732072494017, + "loss": 1.3902, + "step": 4933 + }, + { + "epoch": 0.44765015423698057, + "grad_norm": 0.21105133469001833, + "learning_rate": 0.0006082297683508524, + "loss": 1.4129, + "step": 4934 + }, + { + "epoch": 0.4477408818726184, + "grad_norm": 0.13315377915527996, + "learning_rate": 0.0006080863201066467, + "loss": 1.4305, + "step": 4935 + }, + { + "epoch": 0.44783160950825623, + "grad_norm": 0.12807436200983005, + "learning_rate": 0.0006079428625291714, + "loss": 1.3706, + "step": 4936 + }, + { + "epoch": 
0.447922337143894, + "grad_norm": 0.1430486817699044, + "learning_rate": 0.000607799395630814, + "loss": 1.4064, + "step": 4937 + }, + { + "epoch": 0.44801306477953184, + "grad_norm": 0.12357878247730991, + "learning_rate": 0.0006076559194239628, + "loss": 1.3917, + "step": 4938 + }, + { + "epoch": 0.4481037924151697, + "grad_norm": 0.14511807208149252, + "learning_rate": 0.0006075124339210071, + "loss": 1.4395, + "step": 4939 + }, + { + "epoch": 0.44819452005080745, + "grad_norm": 0.15177884539952827, + "learning_rate": 0.000607368939134337, + "loss": 1.3853, + "step": 4940 + }, + { + "epoch": 0.4482852476864453, + "grad_norm": 0.13420038058384612, + "learning_rate": 0.0006072254350763432, + "loss": 1.404, + "step": 4941 + }, + { + "epoch": 0.4483759753220831, + "grad_norm": 0.13565089229911304, + "learning_rate": 0.000607081921759417, + "loss": 1.4354, + "step": 4942 + }, + { + "epoch": 0.4484667029577209, + "grad_norm": 0.13078787321002844, + "learning_rate": 0.0006069383991959514, + "loss": 1.4078, + "step": 4943 + }, + { + "epoch": 0.44855743059335873, + "grad_norm": 0.14432963646154373, + "learning_rate": 0.0006067948673983392, + "loss": 1.419, + "step": 4944 + }, + { + "epoch": 0.44864815822899656, + "grad_norm": 0.12501923562750866, + "learning_rate": 0.0006066513263789742, + "loss": 1.3525, + "step": 4945 + }, + { + "epoch": 0.44873888586463434, + "grad_norm": 0.14157648138479095, + "learning_rate": 0.0006065077761502518, + "loss": 1.4159, + "step": 4946 + }, + { + "epoch": 0.4488296135002722, + "grad_norm": 0.12876370557647107, + "learning_rate": 0.000606364216724567, + "loss": 1.4335, + "step": 4947 + }, + { + "epoch": 0.44892034113591, + "grad_norm": 0.1355637130651935, + "learning_rate": 0.0006062206481143165, + "loss": 1.46, + "step": 4948 + }, + { + "epoch": 0.44901106877154784, + "grad_norm": 0.1542728844635846, + "learning_rate": 0.0006060770703318974, + "loss": 1.3878, + "step": 4949 + }, + { + "epoch": 0.4491017964071856, + "grad_norm": 0.15789087825460255, + "learning_rate": 0.0006059334833897075, + "loss": 1.4044, + "step": 4950 + }, + { + "epoch": 0.44919252404282345, + "grad_norm": 0.1285590636504691, + "learning_rate": 0.0006057898873001458, + "loss": 1.4025, + "step": 4951 + }, + { + "epoch": 0.4492832516784613, + "grad_norm": 0.13777303902925833, + "learning_rate": 0.0006056462820756118, + "loss": 1.4114, + "step": 4952 + }, + { + "epoch": 0.44937397931409906, + "grad_norm": 0.1568093881889498, + "learning_rate": 0.0006055026677285058, + "loss": 1.4282, + "step": 4953 + }, + { + "epoch": 0.4494647069497369, + "grad_norm": 0.1310972337670305, + "learning_rate": 0.0006053590442712287, + "loss": 1.4379, + "step": 4954 + }, + { + "epoch": 0.4495554345853747, + "grad_norm": 0.151824094986408, + "learning_rate": 0.0006052154117161827, + "loss": 1.4309, + "step": 4955 + }, + { + "epoch": 0.4496461622210125, + "grad_norm": 0.13325043872308362, + "learning_rate": 0.0006050717700757704, + "loss": 1.4246, + "step": 4956 + }, + { + "epoch": 0.44973688985665033, + "grad_norm": 0.13442693487494659, + "learning_rate": 0.0006049281193623953, + "loss": 1.4025, + "step": 4957 + }, + { + "epoch": 0.44982761749228817, + "grad_norm": 0.13295470860115474, + "learning_rate": 0.0006047844595884616, + "loss": 1.3923, + "step": 4958 + }, + { + "epoch": 0.44991834512792594, + "grad_norm": 0.1318400238906031, + "learning_rate": 0.0006046407907663744, + "loss": 1.4116, + "step": 4959 + }, + { + "epoch": 0.4500090727635638, + "grad_norm": 0.12557696198422355, + "learning_rate": 
0.0006044971129085395, + "loss": 1.374, + "step": 4960 + }, + { + "epoch": 0.4500998003992016, + "grad_norm": 0.1414955170780938, + "learning_rate": 0.0006043534260273634, + "loss": 1.437, + "step": 4961 + }, + { + "epoch": 0.4501905280348394, + "grad_norm": 0.13369880029688638, + "learning_rate": 0.0006042097301352534, + "loss": 1.3862, + "step": 4962 + }, + { + "epoch": 0.4502812556704772, + "grad_norm": 0.13436833710817855, + "learning_rate": 0.0006040660252446182, + "loss": 1.3853, + "step": 4963 + }, + { + "epoch": 0.45037198330611505, + "grad_norm": 0.12846431590329002, + "learning_rate": 0.0006039223113678664, + "loss": 1.407, + "step": 4964 + }, + { + "epoch": 0.45046271094175283, + "grad_norm": 0.1287536844952981, + "learning_rate": 0.0006037785885174076, + "loss": 1.3958, + "step": 4965 + }, + { + "epoch": 0.45055343857739066, + "grad_norm": 0.13278580488667172, + "learning_rate": 0.0006036348567056522, + "loss": 1.3874, + "step": 4966 + }, + { + "epoch": 0.4506441662130285, + "grad_norm": 0.15384569762862466, + "learning_rate": 0.0006034911159450118, + "loss": 1.383, + "step": 4967 + }, + { + "epoch": 0.45073489384866633, + "grad_norm": 0.13565340817955296, + "learning_rate": 0.0006033473662478984, + "loss": 1.417, + "step": 4968 + }, + { + "epoch": 0.4508256214843041, + "grad_norm": 0.12830804162987455, + "learning_rate": 0.0006032036076267244, + "loss": 1.4124, + "step": 4969 + }, + { + "epoch": 0.45091634911994194, + "grad_norm": 0.2119925120054134, + "learning_rate": 0.000603059840093904, + "loss": 1.3827, + "step": 4970 + }, + { + "epoch": 0.45100707675557977, + "grad_norm": 0.13197860321734925, + "learning_rate": 0.0006029160636618511, + "loss": 1.4322, + "step": 4971 + }, + { + "epoch": 0.45109780439121755, + "grad_norm": 0.14554440992152548, + "learning_rate": 0.0006027722783429807, + "loss": 1.3783, + "step": 4972 + }, + { + "epoch": 0.4511885320268554, + "grad_norm": 0.13033326435497336, + "learning_rate": 0.0006026284841497092, + "loss": 1.3923, + "step": 4973 + }, + { + "epoch": 0.4512792596624932, + "grad_norm": 0.15229146330556828, + "learning_rate": 0.0006024846810944529, + "loss": 1.438, + "step": 4974 + }, + { + "epoch": 0.451369987298131, + "grad_norm": 0.14281454832106916, + "learning_rate": 0.0006023408691896294, + "loss": 1.4035, + "step": 4975 + }, + { + "epoch": 0.4514607149337688, + "grad_norm": 0.12821379073476338, + "learning_rate": 0.0006021970484476565, + "loss": 1.4254, + "step": 4976 + }, + { + "epoch": 0.45155144256940666, + "grad_norm": 0.12791470624667062, + "learning_rate": 0.0006020532188809536, + "loss": 1.4175, + "step": 4977 + }, + { + "epoch": 0.45164217020504444, + "grad_norm": 0.18023408305974906, + "learning_rate": 0.0006019093805019401, + "loss": 1.3992, + "step": 4978 + }, + { + "epoch": 0.45173289784068227, + "grad_norm": 0.14458087947735834, + "learning_rate": 0.0006017655333230366, + "loss": 1.3853, + "step": 4979 + }, + { + "epoch": 0.4518236254763201, + "grad_norm": 0.12786123653239928, + "learning_rate": 0.0006016216773566643, + "loss": 1.379, + "step": 4980 + }, + { + "epoch": 0.4519143531119579, + "grad_norm": 0.1295839198034823, + "learning_rate": 0.0006014778126152452, + "loss": 1.3987, + "step": 4981 + }, + { + "epoch": 0.4520050807475957, + "grad_norm": 0.14575607000035465, + "learning_rate": 0.000601333939111202, + "loss": 1.4044, + "step": 4982 + }, + { + "epoch": 0.45209580838323354, + "grad_norm": 0.13152706249864632, + "learning_rate": 0.0006011900568569584, + "loss": 1.4325, + "step": 4983 + }, + { + "epoch": 
0.4521865360188713, + "grad_norm": 0.15058620869949568, + "learning_rate": 0.0006010461658649383, + "loss": 1.4228, + "step": 4984 + }, + { + "epoch": 0.45227726365450915, + "grad_norm": 0.12670077521381437, + "learning_rate": 0.0006009022661475668, + "loss": 1.442, + "step": 4985 + }, + { + "epoch": 0.452367991290147, + "grad_norm": 0.13208461805825336, + "learning_rate": 0.0006007583577172698, + "loss": 1.3923, + "step": 4986 + }, + { + "epoch": 0.4524587189257848, + "grad_norm": 0.1339760549383497, + "learning_rate": 0.000600614440586474, + "loss": 1.4265, + "step": 4987 + }, + { + "epoch": 0.4525494465614226, + "grad_norm": 0.12420923126907814, + "learning_rate": 0.0006004705147676061, + "loss": 1.4072, + "step": 4988 + }, + { + "epoch": 0.45264017419706043, + "grad_norm": 0.13533564437419088, + "learning_rate": 0.0006003265802730946, + "loss": 1.4298, + "step": 4989 + }, + { + "epoch": 0.45273090183269826, + "grad_norm": 0.1417415263602733, + "learning_rate": 0.0006001826371153682, + "loss": 1.401, + "step": 4990 + }, + { + "epoch": 0.45282162946833604, + "grad_norm": 0.13597256108077282, + "learning_rate": 0.0006000386853068561, + "loss": 1.3829, + "step": 4991 + }, + { + "epoch": 0.4529123571039739, + "grad_norm": 0.1303461150342986, + "learning_rate": 0.000599894724859989, + "loss": 1.4086, + "step": 4992 + }, + { + "epoch": 0.4530030847396117, + "grad_norm": 0.12897670028796482, + "learning_rate": 0.0005997507557871975, + "loss": 1.4109, + "step": 4993 + }, + { + "epoch": 0.4530938123752495, + "grad_norm": 0.12848153748960872, + "learning_rate": 0.0005996067781009137, + "loss": 1.4023, + "step": 4994 + }, + { + "epoch": 0.4531845400108873, + "grad_norm": 0.12439952609590035, + "learning_rate": 0.0005994627918135697, + "loss": 1.4208, + "step": 4995 + }, + { + "epoch": 0.45327526764652515, + "grad_norm": 0.15237859884377372, + "learning_rate": 0.0005993187969375992, + "loss": 1.4218, + "step": 4996 + }, + { + "epoch": 0.4533659952821629, + "grad_norm": 0.1256427885510886, + "learning_rate": 0.0005991747934854358, + "loss": 1.3554, + "step": 4997 + }, + { + "epoch": 0.45345672291780076, + "grad_norm": 0.12601594600148233, + "learning_rate": 0.0005990307814695144, + "loss": 1.4421, + "step": 4998 + }, + { + "epoch": 0.4535474505534386, + "grad_norm": 0.12606041508482232, + "learning_rate": 0.0005988867609022705, + "loss": 1.4184, + "step": 4999 + }, + { + "epoch": 0.45363817818907637, + "grad_norm": 0.13288440171482804, + "learning_rate": 0.0005987427317961403, + "loss": 1.4001, + "step": 5000 + }, + { + "epoch": 0.4537289058247142, + "grad_norm": 0.13542363029768686, + "learning_rate": 0.0005985986941635603, + "loss": 1.4154, + "step": 5001 + }, + { + "epoch": 0.45381963346035203, + "grad_norm": 0.14611752068362444, + "learning_rate": 0.0005984546480169688, + "loss": 1.4313, + "step": 5002 + }, + { + "epoch": 0.4539103610959898, + "grad_norm": 0.13766209194426776, + "learning_rate": 0.0005983105933688039, + "loss": 1.4533, + "step": 5003 + }, + { + "epoch": 0.45400108873162764, + "grad_norm": 0.16400356773907496, + "learning_rate": 0.0005981665302315047, + "loss": 1.4554, + "step": 5004 + }, + { + "epoch": 0.4540918163672655, + "grad_norm": 0.1308285534325736, + "learning_rate": 0.0005980224586175113, + "loss": 1.3703, + "step": 5005 + }, + { + "epoch": 0.4541825440029033, + "grad_norm": 0.1265080126891533, + "learning_rate": 0.000597878378539264, + "loss": 1.3919, + "step": 5006 + }, + { + "epoch": 0.4542732716385411, + "grad_norm": 0.13332935793856004, + "learning_rate": 
0.0005977342900092044, + "loss": 1.4364, + "step": 5007 + }, + { + "epoch": 0.4543639992741789, + "grad_norm": 0.13308465843864245, + "learning_rate": 0.0005975901930397742, + "loss": 1.429, + "step": 5008 + }, + { + "epoch": 0.45445472690981675, + "grad_norm": 0.14369891014640376, + "learning_rate": 0.0005974460876434168, + "loss": 1.4014, + "step": 5009 + }, + { + "epoch": 0.45454545454545453, + "grad_norm": 0.1438933142152419, + "learning_rate": 0.0005973019738325752, + "loss": 1.4323, + "step": 5010 + }, + { + "epoch": 0.45463618218109236, + "grad_norm": 0.1429393127707071, + "learning_rate": 0.0005971578516196938, + "loss": 1.3932, + "step": 5011 + }, + { + "epoch": 0.4547269098167302, + "grad_norm": 0.13243020022844254, + "learning_rate": 0.0005970137210172176, + "loss": 1.4099, + "step": 5012 + }, + { + "epoch": 0.454817637452368, + "grad_norm": 0.13614369904758905, + "learning_rate": 0.0005968695820375924, + "loss": 1.4553, + "step": 5013 + }, + { + "epoch": 0.4549083650880058, + "grad_norm": 0.13033234446960146, + "learning_rate": 0.0005967254346932644, + "loss": 1.4108, + "step": 5014 + }, + { + "epoch": 0.45499909272364364, + "grad_norm": 0.13148949005527705, + "learning_rate": 0.0005965812789966809, + "loss": 1.3928, + "step": 5015 + }, + { + "epoch": 0.4550898203592814, + "grad_norm": 0.13488153378275902, + "learning_rate": 0.0005964371149602898, + "loss": 1.3877, + "step": 5016 + }, + { + "epoch": 0.45518054799491925, + "grad_norm": 0.1403435862290916, + "learning_rate": 0.0005962929425965395, + "loss": 1.4117, + "step": 5017 + }, + { + "epoch": 0.4552712756305571, + "grad_norm": 0.15650714513692088, + "learning_rate": 0.0005961487619178794, + "loss": 1.4201, + "step": 5018 + }, + { + "epoch": 0.45536200326619486, + "grad_norm": 0.13909001389717915, + "learning_rate": 0.0005960045729367596, + "loss": 1.3966, + "step": 5019 + }, + { + "epoch": 0.4554527309018327, + "grad_norm": 0.13556510975941638, + "learning_rate": 0.0005958603756656307, + "loss": 1.4055, + "step": 5020 + }, + { + "epoch": 0.4555434585374705, + "grad_norm": 0.13690126413798206, + "learning_rate": 0.0005957161701169442, + "loss": 1.3655, + "step": 5021 + }, + { + "epoch": 0.4556341861731083, + "grad_norm": 0.14299124888020948, + "learning_rate": 0.0005955719563031524, + "loss": 1.4448, + "step": 5022 + }, + { + "epoch": 0.45572491380874613, + "grad_norm": 0.13889986356255923, + "learning_rate": 0.0005954277342367082, + "loss": 1.4237, + "step": 5023 + }, + { + "epoch": 0.45581564144438397, + "grad_norm": 0.12685068154808796, + "learning_rate": 0.0005952835039300648, + "loss": 1.4205, + "step": 5024 + }, + { + "epoch": 0.4559063690800218, + "grad_norm": 0.12836660775150696, + "learning_rate": 0.000595139265395677, + "loss": 1.3954, + "step": 5025 + }, + { + "epoch": 0.4559970967156596, + "grad_norm": 0.13225952567665883, + "learning_rate": 0.0005949950186459996, + "loss": 1.4261, + "step": 5026 + }, + { + "epoch": 0.4560878243512974, + "grad_norm": 0.17332964317288652, + "learning_rate": 0.0005948507636934883, + "loss": 1.3751, + "step": 5027 + }, + { + "epoch": 0.45617855198693524, + "grad_norm": 0.1539390244046411, + "learning_rate": 0.0005947065005505996, + "loss": 1.4075, + "step": 5028 + }, + { + "epoch": 0.456269279622573, + "grad_norm": 0.13257182212638705, + "learning_rate": 0.0005945622292297905, + "loss": 1.4004, + "step": 5029 + }, + { + "epoch": 0.45636000725821085, + "grad_norm": 0.12674772937122622, + "learning_rate": 0.0005944179497435192, + "loss": 1.427, + "step": 5030 + }, + { + "epoch": 
0.4564507348938487, + "grad_norm": 0.12123709045798224, + "learning_rate": 0.0005942736621042439, + "loss": 1.4072, + "step": 5031 + }, + { + "epoch": 0.45654146252948646, + "grad_norm": 0.12676767958200294, + "learning_rate": 0.0005941293663244238, + "loss": 1.4213, + "step": 5032 + }, + { + "epoch": 0.4566321901651243, + "grad_norm": 0.13105234987043002, + "learning_rate": 0.0005939850624165193, + "loss": 1.416, + "step": 5033 + }, + { + "epoch": 0.45672291780076213, + "grad_norm": 0.13257375421121748, + "learning_rate": 0.0005938407503929908, + "loss": 1.4152, + "step": 5034 + }, + { + "epoch": 0.4568136454363999, + "grad_norm": 0.12969890266066292, + "learning_rate": 0.0005936964302662995, + "loss": 1.3976, + "step": 5035 + }, + { + "epoch": 0.45690437307203774, + "grad_norm": 0.12887759999906115, + "learning_rate": 0.0005935521020489077, + "loss": 1.4163, + "step": 5036 + }, + { + "epoch": 0.45699510070767557, + "grad_norm": 0.12345901778876661, + "learning_rate": 0.0005934077657532782, + "loss": 1.4428, + "step": 5037 + }, + { + "epoch": 0.45708582834331335, + "grad_norm": 0.15985594439299947, + "learning_rate": 0.0005932634213918744, + "loss": 1.4056, + "step": 5038 + }, + { + "epoch": 0.4571765559789512, + "grad_norm": 0.14040715414506133, + "learning_rate": 0.0005931190689771603, + "loss": 1.4142, + "step": 5039 + }, + { + "epoch": 0.457267283614589, + "grad_norm": 0.14173728164503777, + "learning_rate": 0.0005929747085216008, + "loss": 1.3904, + "step": 5040 + }, + { + "epoch": 0.4573580112502268, + "grad_norm": 0.13510027811005013, + "learning_rate": 0.0005928303400376616, + "loss": 1.4227, + "step": 5041 + }, + { + "epoch": 0.4574487388858646, + "grad_norm": 0.12868061307459908, + "learning_rate": 0.0005926859635378088, + "loss": 1.4316, + "step": 5042 + }, + { + "epoch": 0.45753946652150246, + "grad_norm": 0.12345017725736211, + "learning_rate": 0.0005925415790345095, + "loss": 1.4113, + "step": 5043 + }, + { + "epoch": 0.4576301941571403, + "grad_norm": 0.12689183279211183, + "learning_rate": 0.0005923971865402311, + "loss": 1.3982, + "step": 5044 + }, + { + "epoch": 0.45772092179277807, + "grad_norm": 0.1513066538740522, + "learning_rate": 0.0005922527860674421, + "loss": 1.385, + "step": 5045 + }, + { + "epoch": 0.4578116494284159, + "grad_norm": 0.12450152232960997, + "learning_rate": 0.0005921083776286115, + "loss": 1.3778, + "step": 5046 + }, + { + "epoch": 0.45790237706405373, + "grad_norm": 0.13036949638847037, + "learning_rate": 0.0005919639612362086, + "loss": 1.4325, + "step": 5047 + }, + { + "epoch": 0.4579931046996915, + "grad_norm": 0.24716717244839073, + "learning_rate": 0.0005918195369027044, + "loss": 1.4114, + "step": 5048 + }, + { + "epoch": 0.45808383233532934, + "grad_norm": 0.13641058152153493, + "learning_rate": 0.0005916751046405696, + "loss": 1.4015, + "step": 5049 + }, + { + "epoch": 0.4581745599709672, + "grad_norm": 0.12420246911057009, + "learning_rate": 0.0005915306644622759, + "loss": 1.4061, + "step": 5050 + }, + { + "epoch": 0.45826528760660495, + "grad_norm": 0.12178208250180579, + "learning_rate": 0.0005913862163802959, + "loss": 1.4002, + "step": 5051 + }, + { + "epoch": 0.4583560152422428, + "grad_norm": 0.13552285128769012, + "learning_rate": 0.0005912417604071027, + "loss": 1.401, + "step": 5052 + }, + { + "epoch": 0.4584467428778806, + "grad_norm": 0.12303255232863379, + "learning_rate": 0.0005910972965551701, + "loss": 1.42, + "step": 5053 + }, + { + "epoch": 0.4585374705135184, + "grad_norm": 0.11857582826524468, + "learning_rate": 
0.0005909528248369723, + "loss": 1.3871, + "step": 5054 + }, + { + "epoch": 0.45862819814915623, + "grad_norm": 0.11262712481742729, + "learning_rate": 0.0005908083452649849, + "loss": 1.3955, + "step": 5055 + }, + { + "epoch": 0.45871892578479406, + "grad_norm": 0.12815663729639168, + "learning_rate": 0.0005906638578516833, + "loss": 1.4153, + "step": 5056 + }, + { + "epoch": 0.45880965342043184, + "grad_norm": 0.1331974676952654, + "learning_rate": 0.0005905193626095442, + "loss": 1.4024, + "step": 5057 + }, + { + "epoch": 0.4589003810560697, + "grad_norm": 0.12649980735408628, + "learning_rate": 0.000590374859551045, + "loss": 1.3928, + "step": 5058 + }, + { + "epoch": 0.4589911086917075, + "grad_norm": 0.12219590635735171, + "learning_rate": 0.0005902303486886631, + "loss": 1.4418, + "step": 5059 + }, + { + "epoch": 0.4590818363273453, + "grad_norm": 0.11914568789400937, + "learning_rate": 0.0005900858300348776, + "loss": 1.423, + "step": 5060 + }, + { + "epoch": 0.4591725639629831, + "grad_norm": 0.11854849358071701, + "learning_rate": 0.0005899413036021672, + "loss": 1.4194, + "step": 5061 + }, + { + "epoch": 0.45926329159862095, + "grad_norm": 0.1260657298952758, + "learning_rate": 0.000589796769403012, + "loss": 1.4234, + "step": 5062 + }, + { + "epoch": 0.4593540192342588, + "grad_norm": 0.1210852664542334, + "learning_rate": 0.0005896522274498926, + "loss": 1.3939, + "step": 5063 + }, + { + "epoch": 0.45944474686989656, + "grad_norm": 0.13550572935648744, + "learning_rate": 0.00058950767775529, + "loss": 1.3815, + "step": 5064 + }, + { + "epoch": 0.4595354745055344, + "grad_norm": 0.1263480522014025, + "learning_rate": 0.0005893631203316865, + "loss": 1.4274, + "step": 5065 + }, + { + "epoch": 0.4596262021411722, + "grad_norm": 0.13293600241577574, + "learning_rate": 0.0005892185551915641, + "loss": 1.3971, + "step": 5066 + }, + { + "epoch": 0.45971692977681, + "grad_norm": 0.1213024196913656, + "learning_rate": 0.0005890739823474064, + "loss": 1.3852, + "step": 5067 + }, + { + "epoch": 0.45980765741244783, + "grad_norm": 0.14145615035654754, + "learning_rate": 0.0005889294018116972, + "loss": 1.3984, + "step": 5068 + }, + { + "epoch": 0.45989838504808567, + "grad_norm": 0.11884886747989543, + "learning_rate": 0.000588784813596921, + "loss": 1.3986, + "step": 5069 + }, + { + "epoch": 0.45998911268372344, + "grad_norm": 0.12972791452141466, + "learning_rate": 0.0005886402177155633, + "loss": 1.421, + "step": 5070 + }, + { + "epoch": 0.4600798403193613, + "grad_norm": 0.13182005467130348, + "learning_rate": 0.0005884956141801094, + "loss": 1.4095, + "step": 5071 + }, + { + "epoch": 0.4601705679549991, + "grad_norm": 0.13908445828198943, + "learning_rate": 0.0005883510030030465, + "loss": 1.4014, + "step": 5072 + }, + { + "epoch": 0.4602612955906369, + "grad_norm": 0.12536628245997458, + "learning_rate": 0.0005882063841968613, + "loss": 1.4156, + "step": 5073 + }, + { + "epoch": 0.4603520232262747, + "grad_norm": 0.1442016458399597, + "learning_rate": 0.0005880617577740418, + "loss": 1.4318, + "step": 5074 + }, + { + "epoch": 0.46044275086191255, + "grad_norm": 0.12136451341419839, + "learning_rate": 0.0005879171237470765, + "loss": 1.4021, + "step": 5075 + }, + { + "epoch": 0.46053347849755033, + "grad_norm": 0.12438710585284048, + "learning_rate": 0.0005877724821284546, + "loss": 1.406, + "step": 5076 + }, + { + "epoch": 0.46062420613318816, + "grad_norm": 0.12739378817746935, + "learning_rate": 0.0005876278329306661, + "loss": 1.4083, + "step": 5077 + }, + { + "epoch": 
0.460714933768826, + "grad_norm": 0.12656752603668164, + "learning_rate": 0.0005874831761662013, + "loss": 1.4219, + "step": 5078 + }, + { + "epoch": 0.4608056614044638, + "grad_norm": 0.13320471675783782, + "learning_rate": 0.0005873385118475511, + "loss": 1.3882, + "step": 5079 + }, + { + "epoch": 0.4608963890401016, + "grad_norm": 0.13762301268805122, + "learning_rate": 0.0005871938399872078, + "loss": 1.41, + "step": 5080 + }, + { + "epoch": 0.46098711667573944, + "grad_norm": 0.1311016755906381, + "learning_rate": 0.0005870491605976634, + "loss": 1.4153, + "step": 5081 + }, + { + "epoch": 0.46107784431137727, + "grad_norm": 0.13019929085140136, + "learning_rate": 0.0005869044736914113, + "loss": 1.368, + "step": 5082 + }, + { + "epoch": 0.46116857194701505, + "grad_norm": 0.13108140447329586, + "learning_rate": 0.000586759779280945, + "loss": 1.389, + "step": 5083 + }, + { + "epoch": 0.4612592995826529, + "grad_norm": 0.12798251127710467, + "learning_rate": 0.0005866150773787589, + "loss": 1.3989, + "step": 5084 + }, + { + "epoch": 0.4613500272182907, + "grad_norm": 0.13406950305336435, + "learning_rate": 0.0005864703679973482, + "loss": 1.3885, + "step": 5085 + }, + { + "epoch": 0.4614407548539285, + "grad_norm": 0.14550639802831272, + "learning_rate": 0.0005863256511492083, + "loss": 1.4243, + "step": 5086 + }, + { + "epoch": 0.4615314824895663, + "grad_norm": 0.12455853717020147, + "learning_rate": 0.000586180926846836, + "loss": 1.4497, + "step": 5087 + }, + { + "epoch": 0.46162221012520416, + "grad_norm": 0.13103812250579164, + "learning_rate": 0.0005860361951027278, + "loss": 1.3675, + "step": 5088 + }, + { + "epoch": 0.46171293776084193, + "grad_norm": 0.13010398482954452, + "learning_rate": 0.0005858914559293814, + "loss": 1.3944, + "step": 5089 + }, + { + "epoch": 0.46180366539647977, + "grad_norm": 0.13583235534060223, + "learning_rate": 0.0005857467093392951, + "loss": 1.4485, + "step": 5090 + }, + { + "epoch": 0.4618943930321176, + "grad_norm": 0.12698510867254775, + "learning_rate": 0.0005856019553449681, + "loss": 1.4084, + "step": 5091 + }, + { + "epoch": 0.4619851206677554, + "grad_norm": 0.12791410161642136, + "learning_rate": 0.0005854571939588996, + "loss": 1.4213, + "step": 5092 + }, + { + "epoch": 0.4620758483033932, + "grad_norm": 0.14002831177712036, + "learning_rate": 0.0005853124251935895, + "loss": 1.3726, + "step": 5093 + }, + { + "epoch": 0.46216657593903104, + "grad_norm": 0.1239739444607181, + "learning_rate": 0.000585167649061539, + "loss": 1.402, + "step": 5094 + }, + { + "epoch": 0.4622573035746688, + "grad_norm": 0.1332339528486337, + "learning_rate": 0.0005850228655752496, + "loss": 1.446, + "step": 5095 + }, + { + "epoch": 0.46234803121030665, + "grad_norm": 0.12915012235120016, + "learning_rate": 0.0005848780747472231, + "loss": 1.4072, + "step": 5096 + }, + { + "epoch": 0.4624387588459445, + "grad_norm": 0.15402289670620187, + "learning_rate": 0.0005847332765899626, + "loss": 1.4333, + "step": 5097 + }, + { + "epoch": 0.46252948648158226, + "grad_norm": 0.12934804096595723, + "learning_rate": 0.0005845884711159708, + "loss": 1.3646, + "step": 5098 + }, + { + "epoch": 0.4626202141172201, + "grad_norm": 0.13334302907429468, + "learning_rate": 0.0005844436583377523, + "loss": 1.4204, + "step": 5099 + }, + { + "epoch": 0.46271094175285793, + "grad_norm": 0.15585699114081675, + "learning_rate": 0.0005842988382678114, + "loss": 1.4153, + "step": 5100 + }, + { + "epoch": 0.46280166938849576, + "grad_norm": 0.11999301354354214, + "learning_rate": 
0.0005841540109186533, + "loss": 1.3797, + "step": 5101 + }, + { + "epoch": 0.46289239702413354, + "grad_norm": 0.1347035189782227, + "learning_rate": 0.000584009176302784, + "loss": 1.3833, + "step": 5102 + }, + { + "epoch": 0.4629831246597714, + "grad_norm": 0.1285504013440621, + "learning_rate": 0.00058386433443271, + "loss": 1.3606, + "step": 5103 + }, + { + "epoch": 0.4630738522954092, + "grad_norm": 0.12826947700691482, + "learning_rate": 0.0005837194853209384, + "loss": 1.4027, + "step": 5104 + }, + { + "epoch": 0.463164579931047, + "grad_norm": 0.14697226132974564, + "learning_rate": 0.0005835746289799768, + "loss": 1.3822, + "step": 5105 + }, + { + "epoch": 0.4632553075666848, + "grad_norm": 0.1397436683678652, + "learning_rate": 0.0005834297654223337, + "loss": 1.3673, + "step": 5106 + }, + { + "epoch": 0.46334603520232265, + "grad_norm": 0.12748043114028326, + "learning_rate": 0.0005832848946605181, + "loss": 1.3625, + "step": 5107 + }, + { + "epoch": 0.4634367628379604, + "grad_norm": 0.12955501627714466, + "learning_rate": 0.0005831400167070394, + "loss": 1.4036, + "step": 5108 + }, + { + "epoch": 0.46352749047359826, + "grad_norm": 0.13607588100399512, + "learning_rate": 0.0005829951315744083, + "loss": 1.4004, + "step": 5109 + }, + { + "epoch": 0.4636182181092361, + "grad_norm": 0.1348734188378506, + "learning_rate": 0.0005828502392751351, + "loss": 1.4081, + "step": 5110 + }, + { + "epoch": 0.46370894574487387, + "grad_norm": 0.12266609979552492, + "learning_rate": 0.0005827053398217317, + "loss": 1.4169, + "step": 5111 + }, + { + "epoch": 0.4637996733805117, + "grad_norm": 0.12268304702526855, + "learning_rate": 0.0005825604332267098, + "loss": 1.4144, + "step": 5112 + }, + { + "epoch": 0.46389040101614953, + "grad_norm": 0.1348779492745244, + "learning_rate": 0.0005824155195025825, + "loss": 1.4028, + "step": 5113 + }, + { + "epoch": 0.4639811286517873, + "grad_norm": 0.127447050672759, + "learning_rate": 0.0005822705986618629, + "loss": 1.407, + "step": 5114 + }, + { + "epoch": 0.46407185628742514, + "grad_norm": 0.1437839457458934, + "learning_rate": 0.000582125670717065, + "loss": 1.3918, + "step": 5115 + }, + { + "epoch": 0.464162583923063, + "grad_norm": 0.11692649088868802, + "learning_rate": 0.0005819807356807034, + "loss": 1.3726, + "step": 5116 + }, + { + "epoch": 0.46425331155870075, + "grad_norm": 0.11893976227465627, + "learning_rate": 0.000581835793565293, + "loss": 1.3704, + "step": 5117 + }, + { + "epoch": 0.4643440391943386, + "grad_norm": 0.13278339695749528, + "learning_rate": 0.0005816908443833499, + "loss": 1.4258, + "step": 5118 + }, + { + "epoch": 0.4644347668299764, + "grad_norm": 0.13111397802259786, + "learning_rate": 0.0005815458881473903, + "loss": 1.4053, + "step": 5119 + }, + { + "epoch": 0.46452549446561425, + "grad_norm": 0.14786909293223424, + "learning_rate": 0.0005814009248699312, + "loss": 1.3956, + "step": 5120 + }, + { + "epoch": 0.46461622210125203, + "grad_norm": 0.12858853286408167, + "learning_rate": 0.0005812559545634903, + "loss": 1.3765, + "step": 5121 + }, + { + "epoch": 0.46470694973688986, + "grad_norm": 0.12441783455476213, + "learning_rate": 0.0005811109772405858, + "loss": 1.4367, + "step": 5122 + }, + { + "epoch": 0.4647976773725277, + "grad_norm": 0.1262883008941277, + "learning_rate": 0.0005809659929137363, + "loss": 1.4049, + "step": 5123 + }, + { + "epoch": 0.4648884050081655, + "grad_norm": 0.12849843661365526, + "learning_rate": 0.0005808210015954616, + "loss": 1.4417, + "step": 5124 + }, + { + "epoch": 
0.4649791326438033, + "grad_norm": 0.1240243324708911, + "learning_rate": 0.0005806760032982813, + "loss": 1.3787, + "step": 5125 + }, + { + "epoch": 0.46506986027944114, + "grad_norm": 0.13254977094801676, + "learning_rate": 0.0005805309980347164, + "loss": 1.401, + "step": 5126 + }, + { + "epoch": 0.4651605879150789, + "grad_norm": 0.12146675872065996, + "learning_rate": 0.0005803859858172878, + "loss": 1.3619, + "step": 5127 + }, + { + "epoch": 0.46525131555071675, + "grad_norm": 0.11530341203819748, + "learning_rate": 0.0005802409666585175, + "loss": 1.3971, + "step": 5128 + }, + { + "epoch": 0.4653420431863546, + "grad_norm": 0.12300903560869242, + "learning_rate": 0.000580095940570928, + "loss": 1.4641, + "step": 5129 + }, + { + "epoch": 0.46543277082199236, + "grad_norm": 0.1361923827352692, + "learning_rate": 0.0005799509075670421, + "loss": 1.4188, + "step": 5130 + }, + { + "epoch": 0.4655234984576302, + "grad_norm": 0.12157508088319202, + "learning_rate": 0.0005798058676593837, + "loss": 1.426, + "step": 5131 + }, + { + "epoch": 0.465614226093268, + "grad_norm": 0.12047289725018662, + "learning_rate": 0.0005796608208604768, + "loss": 1.4476, + "step": 5132 + }, + { + "epoch": 0.4657049537289058, + "grad_norm": 0.16035075325924716, + "learning_rate": 0.0005795157671828463, + "loss": 1.4103, + "step": 5133 + }, + { + "epoch": 0.46579568136454363, + "grad_norm": 0.12386801521071415, + "learning_rate": 0.0005793707066390174, + "loss": 1.4223, + "step": 5134 + }, + { + "epoch": 0.46588640900018147, + "grad_norm": 0.1251617310828719, + "learning_rate": 0.0005792256392415165, + "loss": 1.374, + "step": 5135 + }, + { + "epoch": 0.46597713663581924, + "grad_norm": 0.13659768643822728, + "learning_rate": 0.00057908056500287, + "loss": 1.4478, + "step": 5136 + }, + { + "epoch": 0.4660678642714571, + "grad_norm": 0.12570177621335077, + "learning_rate": 0.0005789354839356048, + "loss": 1.4191, + "step": 5137 + }, + { + "epoch": 0.4661585919070949, + "grad_norm": 0.13492958940590943, + "learning_rate": 0.0005787903960522492, + "loss": 1.3734, + "step": 5138 + }, + { + "epoch": 0.46624931954273274, + "grad_norm": 0.12724773172913045, + "learning_rate": 0.0005786453013653312, + "loss": 1.4492, + "step": 5139 + }, + { + "epoch": 0.4663400471783705, + "grad_norm": 0.12591644696421814, + "learning_rate": 0.0005785001998873798, + "loss": 1.3921, + "step": 5140 + }, + { + "epoch": 0.46643077481400835, + "grad_norm": 0.1312597727961486, + "learning_rate": 0.0005783550916309244, + "loss": 1.3811, + "step": 5141 + }, + { + "epoch": 0.4665215024496462, + "grad_norm": 0.12348728918340289, + "learning_rate": 0.0005782099766084956, + "loss": 1.4841, + "step": 5142 + }, + { + "epoch": 0.46661223008528396, + "grad_norm": 0.11885347827051973, + "learning_rate": 0.0005780648548326237, + "loss": 1.3763, + "step": 5143 + }, + { + "epoch": 0.4667029577209218, + "grad_norm": 0.12137920869244327, + "learning_rate": 0.00057791972631584, + "loss": 1.3711, + "step": 5144 + }, + { + "epoch": 0.46679368535655963, + "grad_norm": 0.12843035238022094, + "learning_rate": 0.0005777745910706765, + "loss": 1.4322, + "step": 5145 + }, + { + "epoch": 0.4668844129921974, + "grad_norm": 0.12569161744468973, + "learning_rate": 0.0005776294491096657, + "loss": 1.421, + "step": 5146 + }, + { + "epoch": 0.46697514062783524, + "grad_norm": 0.12720318659490024, + "learning_rate": 0.0005774843004453403, + "loss": 1.4154, + "step": 5147 + }, + { + "epoch": 0.46706586826347307, + "grad_norm": 0.13224021179898202, + "learning_rate": 
0.0005773391450902344, + "loss": 1.4113, + "step": 5148 + }, + { + "epoch": 0.46715659589911085, + "grad_norm": 0.14577753978341437, + "learning_rate": 0.0005771939830568815, + "loss": 1.3779, + "step": 5149 + }, + { + "epoch": 0.4672473235347487, + "grad_norm": 0.12241934883228461, + "learning_rate": 0.0005770488143578172, + "loss": 1.3668, + "step": 5150 + }, + { + "epoch": 0.4673380511703865, + "grad_norm": 0.12270048797747374, + "learning_rate": 0.0005769036390055763, + "loss": 1.4165, + "step": 5151 + }, + { + "epoch": 0.4674287788060243, + "grad_norm": 0.12078893726461726, + "learning_rate": 0.0005767584570126949, + "loss": 1.4059, + "step": 5152 + }, + { + "epoch": 0.4675195064416621, + "grad_norm": 0.1257145877178639, + "learning_rate": 0.0005766132683917093, + "loss": 1.432, + "step": 5153 + }, + { + "epoch": 0.46761023407729996, + "grad_norm": 0.13137037799690057, + "learning_rate": 0.0005764680731551566, + "loss": 1.4051, + "step": 5154 + }, + { + "epoch": 0.46770096171293774, + "grad_norm": 0.13159852643250125, + "learning_rate": 0.0005763228713155748, + "loss": 1.4252, + "step": 5155 + }, + { + "epoch": 0.46779168934857557, + "grad_norm": 0.20186846030647673, + "learning_rate": 0.0005761776628855016, + "loss": 1.3803, + "step": 5156 + }, + { + "epoch": 0.4678824169842134, + "grad_norm": 0.1400408430787793, + "learning_rate": 0.0005760324478774759, + "loss": 1.3786, + "step": 5157 + }, + { + "epoch": 0.46797314461985123, + "grad_norm": 0.12873294734871443, + "learning_rate": 0.0005758872263040373, + "loss": 1.3707, + "step": 5158 + }, + { + "epoch": 0.468063872255489, + "grad_norm": 0.1309419986178896, + "learning_rate": 0.0005757419981777255, + "loss": 1.4067, + "step": 5159 + }, + { + "epoch": 0.46815459989112684, + "grad_norm": 0.14543857137562757, + "learning_rate": 0.000575596763511081, + "loss": 1.3834, + "step": 5160 + }, + { + "epoch": 0.4682453275267647, + "grad_norm": 0.1276932732369336, + "learning_rate": 0.0005754515223166447, + "loss": 1.3934, + "step": 5161 + }, + { + "epoch": 0.46833605516240245, + "grad_norm": 0.1253602079181896, + "learning_rate": 0.0005753062746069585, + "loss": 1.4302, + "step": 5162 + }, + { + "epoch": 0.4684267827980403, + "grad_norm": 0.1325735774206355, + "learning_rate": 0.0005751610203945644, + "loss": 1.3931, + "step": 5163 + }, + { + "epoch": 0.4685175104336781, + "grad_norm": 0.12419739587234466, + "learning_rate": 0.000575015759692005, + "loss": 1.4219, + "step": 5164 + }, + { + "epoch": 0.4686082380693159, + "grad_norm": 0.14705155523793645, + "learning_rate": 0.0005748704925118238, + "loss": 1.4155, + "step": 5165 + }, + { + "epoch": 0.46869896570495373, + "grad_norm": 0.1367161524745178, + "learning_rate": 0.0005747252188665644, + "loss": 1.4008, + "step": 5166 + }, + { + "epoch": 0.46878969334059156, + "grad_norm": 0.1259970527262851, + "learning_rate": 0.0005745799387687714, + "loss": 1.4219, + "step": 5167 + }, + { + "epoch": 0.46888042097622934, + "grad_norm": 0.13671469794468577, + "learning_rate": 0.0005744346522309897, + "loss": 1.409, + "step": 5168 + }, + { + "epoch": 0.4689711486118672, + "grad_norm": 0.1643690300339312, + "learning_rate": 0.0005742893592657648, + "loss": 1.4262, + "step": 5169 + }, + { + "epoch": 0.469061876247505, + "grad_norm": 0.13592640762904562, + "learning_rate": 0.0005741440598856428, + "loss": 1.4034, + "step": 5170 + }, + { + "epoch": 0.4691526038831428, + "grad_norm": 0.13553286747005416, + "learning_rate": 0.0005739987541031703, + "loss": 1.4307, + "step": 5171 + }, + { + "epoch": 
0.4692433315187806, + "grad_norm": 0.12323864744569639, + "learning_rate": 0.0005738534419308945, + "loss": 1.3818, + "step": 5172 + }, + { + "epoch": 0.46933405915441845, + "grad_norm": 0.12819711818788607, + "learning_rate": 0.000573708123381363, + "loss": 1.4342, + "step": 5173 + }, + { + "epoch": 0.4694247867900562, + "grad_norm": 0.13321945332588267, + "learning_rate": 0.0005735627984671242, + "loss": 1.4325, + "step": 5174 + }, + { + "epoch": 0.46951551442569406, + "grad_norm": 0.1347752981549419, + "learning_rate": 0.0005734174672007271, + "loss": 1.3984, + "step": 5175 + }, + { + "epoch": 0.4696062420613319, + "grad_norm": 0.14060677924771414, + "learning_rate": 0.0005732721295947206, + "loss": 1.4231, + "step": 5176 + }, + { + "epoch": 0.4696969696969697, + "grad_norm": 0.1367910295563376, + "learning_rate": 0.0005731267856616551, + "loss": 1.3917, + "step": 5177 + }, + { + "epoch": 0.4697876973326075, + "grad_norm": 0.14553133975013052, + "learning_rate": 0.0005729814354140808, + "loss": 1.4061, + "step": 5178 + }, + { + "epoch": 0.46987842496824533, + "grad_norm": 0.12629862556774765, + "learning_rate": 0.000572836078864549, + "loss": 1.4104, + "step": 5179 + }, + { + "epoch": 0.46996915260388317, + "grad_norm": 0.12572674532479505, + "learning_rate": 0.0005726907160256107, + "loss": 1.3922, + "step": 5180 + }, + { + "epoch": 0.47005988023952094, + "grad_norm": 0.1403579016091906, + "learning_rate": 0.0005725453469098186, + "loss": 1.393, + "step": 5181 + }, + { + "epoch": 0.4701506078751588, + "grad_norm": 0.14930299456567261, + "learning_rate": 0.0005723999715297251, + "loss": 1.3984, + "step": 5182 + }, + { + "epoch": 0.4702413355107966, + "grad_norm": 0.13009098390074048, + "learning_rate": 0.0005722545898978834, + "loss": 1.3828, + "step": 5183 + }, + { + "epoch": 0.4703320631464344, + "grad_norm": 0.17334545390923783, + "learning_rate": 0.0005721092020268471, + "loss": 1.3952, + "step": 5184 + }, + { + "epoch": 0.4704227907820722, + "grad_norm": 0.1347970836112881, + "learning_rate": 0.0005719638079291706, + "loss": 1.4016, + "step": 5185 + }, + { + "epoch": 0.47051351841771005, + "grad_norm": 0.14662283245188007, + "learning_rate": 0.0005718184076174087, + "loss": 1.4488, + "step": 5186 + }, + { + "epoch": 0.47060424605334783, + "grad_norm": 0.1461653068645382, + "learning_rate": 0.0005716730011041168, + "loss": 1.4085, + "step": 5187 + }, + { + "epoch": 0.47069497368898566, + "grad_norm": 0.13995100200428856, + "learning_rate": 0.0005715275884018505, + "loss": 1.4117, + "step": 5188 + }, + { + "epoch": 0.4707857013246235, + "grad_norm": 0.14095701189421145, + "learning_rate": 0.0005713821695231666, + "loss": 1.4218, + "step": 5189 + }, + { + "epoch": 0.4708764289602613, + "grad_norm": 0.12936614865305762, + "learning_rate": 0.0005712367444806217, + "loss": 1.4203, + "step": 5190 + }, + { + "epoch": 0.4709671565958991, + "grad_norm": 0.15932479750371953, + "learning_rate": 0.0005710913132867734, + "loss": 1.4323, + "step": 5191 + }, + { + "epoch": 0.47105788423153694, + "grad_norm": 0.17024290922277913, + "learning_rate": 0.0005709458759541799, + "loss": 1.3914, + "step": 5192 + }, + { + "epoch": 0.4711486118671747, + "grad_norm": 0.1311405767668322, + "learning_rate": 0.0005708004324953994, + "loss": 1.4203, + "step": 5193 + }, + { + "epoch": 0.47123933950281255, + "grad_norm": 0.21043588431324456, + "learning_rate": 0.0005706549829229912, + "loss": 1.3663, + "step": 5194 + }, + { + "epoch": 0.4713300671384504, + "grad_norm": 0.12289065189443078, + "learning_rate": 
0.0005705095272495146, + "loss": 1.4144, + "step": 5195 + }, + { + "epoch": 0.4714207947740882, + "grad_norm": 0.16239961934096572, + "learning_rate": 0.0005703640654875302, + "loss": 1.4027, + "step": 5196 + }, + { + "epoch": 0.471511522409726, + "grad_norm": 0.13518602503211757, + "learning_rate": 0.0005702185976495984, + "loss": 1.4228, + "step": 5197 + }, + { + "epoch": 0.4716022500453638, + "grad_norm": 0.12380519586813103, + "learning_rate": 0.0005700731237482801, + "loss": 1.384, + "step": 5198 + }, + { + "epoch": 0.47169297768100166, + "grad_norm": 0.13585605460609193, + "learning_rate": 0.0005699276437961374, + "loss": 1.377, + "step": 5199 + }, + { + "epoch": 0.47178370531663943, + "grad_norm": 0.13198060783730586, + "learning_rate": 0.0005697821578057323, + "loss": 1.4094, + "step": 5200 + }, + { + "epoch": 0.47187443295227727, + "grad_norm": 0.1327097429037541, + "learning_rate": 0.0005696366657896276, + "loss": 1.4208, + "step": 5201 + }, + { + "epoch": 0.4719651605879151, + "grad_norm": 0.14801135927451678, + "learning_rate": 0.0005694911677603865, + "loss": 1.4136, + "step": 5202 + }, + { + "epoch": 0.4720558882235529, + "grad_norm": 0.13342792835783057, + "learning_rate": 0.0005693456637305729, + "loss": 1.411, + "step": 5203 + }, + { + "epoch": 0.4721466158591907, + "grad_norm": 0.12461033520294208, + "learning_rate": 0.000569200153712751, + "loss": 1.4021, + "step": 5204 + }, + { + "epoch": 0.47223734349482854, + "grad_norm": 0.1246490439290599, + "learning_rate": 0.0005690546377194857, + "loss": 1.4107, + "step": 5205 + }, + { + "epoch": 0.4723280711304663, + "grad_norm": 0.14112030787249374, + "learning_rate": 0.000568909115763342, + "loss": 1.4111, + "step": 5206 + }, + { + "epoch": 0.47241879876610415, + "grad_norm": 0.12358118458422887, + "learning_rate": 0.0005687635878568862, + "loss": 1.3865, + "step": 5207 + }, + { + "epoch": 0.472509526401742, + "grad_norm": 0.12814112407302064, + "learning_rate": 0.0005686180540126844, + "loss": 1.4434, + "step": 5208 + }, + { + "epoch": 0.47260025403737976, + "grad_norm": 0.12490035920032452, + "learning_rate": 0.0005684725142433036, + "loss": 1.4007, + "step": 5209 + }, + { + "epoch": 0.4726909816730176, + "grad_norm": 0.12503569823470162, + "learning_rate": 0.0005683269685613111, + "loss": 1.443, + "step": 5210 + }, + { + "epoch": 0.47278170930865543, + "grad_norm": 0.12436517366895271, + "learning_rate": 0.0005681814169792747, + "loss": 1.3876, + "step": 5211 + }, + { + "epoch": 0.4728724369442932, + "grad_norm": 0.12191019477186814, + "learning_rate": 0.0005680358595097629, + "loss": 1.3956, + "step": 5212 + }, + { + "epoch": 0.47296316457993104, + "grad_norm": 0.12773614145561601, + "learning_rate": 0.0005678902961653446, + "loss": 1.4182, + "step": 5213 + }, + { + "epoch": 0.47305389221556887, + "grad_norm": 0.13184659686348263, + "learning_rate": 0.0005677447269585894, + "loss": 1.4048, + "step": 5214 + }, + { + "epoch": 0.4731446198512067, + "grad_norm": 0.12936038149632265, + "learning_rate": 0.0005675991519020668, + "loss": 1.4543, + "step": 5215 + }, + { + "epoch": 0.4732353474868445, + "grad_norm": 0.1625219257273675, + "learning_rate": 0.0005674535710083476, + "loss": 1.4258, + "step": 5216 + }, + { + "epoch": 0.4733260751224823, + "grad_norm": 0.12304746218523219, + "learning_rate": 0.0005673079842900027, + "loss": 1.3787, + "step": 5217 + }, + { + "epoch": 0.47341680275812015, + "grad_norm": 0.12083659844755408, + "learning_rate": 0.0005671623917596031, + "loss": 1.3797, + "step": 5218 + }, + { + "epoch": 
0.4735075303937579, + "grad_norm": 0.12320661732152574, + "learning_rate": 0.0005670167934297214, + "loss": 1.4062, + "step": 5219 + }, + { + "epoch": 0.47359825802939576, + "grad_norm": 0.12105955409542082, + "learning_rate": 0.0005668711893129295, + "loss": 1.45, + "step": 5220 + }, + { + "epoch": 0.4736889856650336, + "grad_norm": 0.15499673439578826, + "learning_rate": 0.0005667255794218007, + "loss": 1.3979, + "step": 5221 + }, + { + "epoch": 0.47377971330067137, + "grad_norm": 0.13858124530896526, + "learning_rate": 0.0005665799637689082, + "loss": 1.3646, + "step": 5222 + }, + { + "epoch": 0.4738704409363092, + "grad_norm": 0.12411891836808368, + "learning_rate": 0.0005664343423668261, + "loss": 1.4123, + "step": 5223 + }, + { + "epoch": 0.47396116857194703, + "grad_norm": 0.12218292735197259, + "learning_rate": 0.0005662887152281286, + "loss": 1.4104, + "step": 5224 + }, + { + "epoch": 0.4740518962075848, + "grad_norm": 0.13309136819678785, + "learning_rate": 0.0005661430823653908, + "loss": 1.4304, + "step": 5225 + }, + { + "epoch": 0.47414262384322264, + "grad_norm": 0.1360761260870404, + "learning_rate": 0.0005659974437911883, + "loss": 1.4333, + "step": 5226 + }, + { + "epoch": 0.4742333514788605, + "grad_norm": 0.1327271215107273, + "learning_rate": 0.0005658517995180965, + "loss": 1.3737, + "step": 5227 + }, + { + "epoch": 0.47432407911449825, + "grad_norm": 0.1384402259592983, + "learning_rate": 0.0005657061495586924, + "loss": 1.4248, + "step": 5228 + }, + { + "epoch": 0.4744148067501361, + "grad_norm": 0.12783100673366524, + "learning_rate": 0.0005655604939255525, + "loss": 1.3851, + "step": 5229 + }, + { + "epoch": 0.4745055343857739, + "grad_norm": 0.12928913589442434, + "learning_rate": 0.0005654148326312542, + "loss": 1.4575, + "step": 5230 + }, + { + "epoch": 0.4745962620214117, + "grad_norm": 0.1434543217516973, + "learning_rate": 0.0005652691656883754, + "loss": 1.3721, + "step": 5231 + }, + { + "epoch": 0.47468698965704953, + "grad_norm": 0.13336888184921114, + "learning_rate": 0.0005651234931094946, + "loss": 1.382, + "step": 5232 + }, + { + "epoch": 0.47477771729268736, + "grad_norm": 0.12135402271007435, + "learning_rate": 0.0005649778149071907, + "loss": 1.3852, + "step": 5233 + }, + { + "epoch": 0.4748684449283252, + "grad_norm": 0.13523803096358283, + "learning_rate": 0.0005648321310940427, + "loss": 1.4279, + "step": 5234 + }, + { + "epoch": 0.474959172563963, + "grad_norm": 0.12997556206831581, + "learning_rate": 0.0005646864416826306, + "loss": 1.3884, + "step": 5235 + }, + { + "epoch": 0.4750499001996008, + "grad_norm": 0.13510264382320866, + "learning_rate": 0.000564540746685535, + "loss": 1.4139, + "step": 5236 + }, + { + "epoch": 0.47514062783523864, + "grad_norm": 0.12919518219939605, + "learning_rate": 0.0005643950461153362, + "loss": 1.4106, + "step": 5237 + }, + { + "epoch": 0.4752313554708764, + "grad_norm": 0.13621988759983825, + "learning_rate": 0.0005642493399846158, + "loss": 1.3774, + "step": 5238 + }, + { + "epoch": 0.47532208310651425, + "grad_norm": 0.12873757423700866, + "learning_rate": 0.0005641036283059553, + "loss": 1.4051, + "step": 5239 + }, + { + "epoch": 0.4754128107421521, + "grad_norm": 0.12187285704964687, + "learning_rate": 0.000563957911091937, + "loss": 1.4111, + "step": 5240 + }, + { + "epoch": 0.47550353837778986, + "grad_norm": 0.1297604745780006, + "learning_rate": 0.0005638121883551439, + "loss": 1.4172, + "step": 5241 + }, + { + "epoch": 0.4755942660134277, + "grad_norm": 0.13211216578304652, + "learning_rate": 
0.0005636664601081587, + "loss": 1.4561, + "step": 5242 + }, + { + "epoch": 0.4756849936490655, + "grad_norm": 0.14106334649002364, + "learning_rate": 0.0005635207263635655, + "loss": 1.3994, + "step": 5243 + }, + { + "epoch": 0.4757757212847033, + "grad_norm": 0.12646962856340718, + "learning_rate": 0.0005633749871339481, + "loss": 1.4304, + "step": 5244 + }, + { + "epoch": 0.47586644892034113, + "grad_norm": 0.12550694831990775, + "learning_rate": 0.0005632292424318912, + "loss": 1.4218, + "step": 5245 + }, + { + "epoch": 0.47595717655597897, + "grad_norm": 0.13541296327042046, + "learning_rate": 0.0005630834922699799, + "loss": 1.4194, + "step": 5246 + }, + { + "epoch": 0.47604790419161674, + "grad_norm": 0.11795089389650863, + "learning_rate": 0.0005629377366607998, + "loss": 1.3755, + "step": 5247 + }, + { + "epoch": 0.4761386318272546, + "grad_norm": 0.14102675921985744, + "learning_rate": 0.0005627919756169368, + "loss": 1.4099, + "step": 5248 + }, + { + "epoch": 0.4762293594628924, + "grad_norm": 0.1470794214804082, + "learning_rate": 0.0005626462091509774, + "loss": 1.4166, + "step": 5249 + }, + { + "epoch": 0.4763200870985302, + "grad_norm": 0.12718055918511795, + "learning_rate": 0.0005625004372755087, + "loss": 1.4225, + "step": 5250 + }, + { + "epoch": 0.476410814734168, + "grad_norm": 0.16250733868484815, + "learning_rate": 0.000562354660003118, + "loss": 1.4156, + "step": 5251 + }, + { + "epoch": 0.47650154236980585, + "grad_norm": 0.13040789402852804, + "learning_rate": 0.0005622088773463933, + "loss": 1.3845, + "step": 5252 + }, + { + "epoch": 0.4765922700054437, + "grad_norm": 0.12763101381993455, + "learning_rate": 0.0005620630893179229, + "loss": 1.4088, + "step": 5253 + }, + { + "epoch": 0.47668299764108146, + "grad_norm": 0.12364323141496993, + "learning_rate": 0.0005619172959302952, + "loss": 1.436, + "step": 5254 + }, + { + "epoch": 0.4767737252767193, + "grad_norm": 0.12181027596413402, + "learning_rate": 0.0005617714971961003, + "loss": 1.4084, + "step": 5255 + }, + { + "epoch": 0.47686445291235713, + "grad_norm": 0.12935363333513233, + "learning_rate": 0.0005616256931279274, + "loss": 1.3706, + "step": 5256 + }, + { + "epoch": 0.4769551805479949, + "grad_norm": 0.12479537999452499, + "learning_rate": 0.0005614798837383668, + "loss": 1.432, + "step": 5257 + }, + { + "epoch": 0.47704590818363274, + "grad_norm": 0.11948118742520537, + "learning_rate": 0.0005613340690400091, + "loss": 1.4021, + "step": 5258 + }, + { + "epoch": 0.47713663581927057, + "grad_norm": 0.11243478333304621, + "learning_rate": 0.0005611882490454455, + "loss": 1.3826, + "step": 5259 + }, + { + "epoch": 0.47722736345490835, + "grad_norm": 0.24337140947244992, + "learning_rate": 0.0005610424237672678, + "loss": 1.4412, + "step": 5260 + }, + { + "epoch": 0.4773180910905462, + "grad_norm": 0.13283082747534639, + "learning_rate": 0.0005608965932180676, + "loss": 1.4194, + "step": 5261 + }, + { + "epoch": 0.477408818726184, + "grad_norm": 0.20634382719220482, + "learning_rate": 0.0005607507574104377, + "loss": 1.4363, + "step": 5262 + }, + { + "epoch": 0.4774995463618218, + "grad_norm": 0.1288235131700079, + "learning_rate": 0.0005606049163569709, + "loss": 1.4258, + "step": 5263 + }, + { + "epoch": 0.4775902739974596, + "grad_norm": 0.12240415127716422, + "learning_rate": 0.0005604590700702605, + "loss": 1.4286, + "step": 5264 + }, + { + "epoch": 0.47768100163309746, + "grad_norm": 0.11403431163163019, + "learning_rate": 0.0005603132185629007, + "loss": 1.4099, + "step": 5265 + }, + { + "epoch": 
0.47777172926873523, + "grad_norm": 0.11675445063885662, + "learning_rate": 0.0005601673618474855, + "loss": 1.3886, + "step": 5266 + }, + { + "epoch": 0.47786245690437307, + "grad_norm": 0.12430184326227618, + "learning_rate": 0.0005600214999366098, + "loss": 1.4181, + "step": 5267 + }, + { + "epoch": 0.4779531845400109, + "grad_norm": 0.11585580203451377, + "learning_rate": 0.0005598756328428686, + "loss": 1.3966, + "step": 5268 + }, + { + "epoch": 0.4780439121756487, + "grad_norm": 0.11626457105017293, + "learning_rate": 0.0005597297605788578, + "loss": 1.3797, + "step": 5269 + }, + { + "epoch": 0.4781346398112865, + "grad_norm": 0.12739479733228518, + "learning_rate": 0.0005595838831571734, + "loss": 1.3973, + "step": 5270 + }, + { + "epoch": 0.47822536744692434, + "grad_norm": 0.12714270596936836, + "learning_rate": 0.0005594380005904117, + "loss": 1.409, + "step": 5271 + }, + { + "epoch": 0.4783160950825622, + "grad_norm": 0.11370525210277711, + "learning_rate": 0.0005592921128911702, + "loss": 1.4103, + "step": 5272 + }, + { + "epoch": 0.47840682271819995, + "grad_norm": 0.1239646765578741, + "learning_rate": 0.0005591462200720457, + "loss": 1.4045, + "step": 5273 + }, + { + "epoch": 0.4784975503538378, + "grad_norm": 0.1200200034517753, + "learning_rate": 0.0005590003221456366, + "loss": 1.4125, + "step": 5274 + }, + { + "epoch": 0.4785882779894756, + "grad_norm": 0.12076197588280907, + "learning_rate": 0.000558854419124541, + "loss": 1.4218, + "step": 5275 + }, + { + "epoch": 0.4786790056251134, + "grad_norm": 0.12586201433630836, + "learning_rate": 0.0005587085110213575, + "loss": 1.3822, + "step": 5276 + }, + { + "epoch": 0.47876973326075123, + "grad_norm": 0.1301086716756529, + "learning_rate": 0.0005585625978486853, + "loss": 1.4099, + "step": 5277 + }, + { + "epoch": 0.47886046089638906, + "grad_norm": 0.31145745597278107, + "learning_rate": 0.0005584166796191244, + "loss": 1.4048, + "step": 5278 + }, + { + "epoch": 0.47895118853202684, + "grad_norm": 0.12105706651537934, + "learning_rate": 0.0005582707563452744, + "loss": 1.4182, + "step": 5279 + }, + { + "epoch": 0.47904191616766467, + "grad_norm": 0.14843336926057318, + "learning_rate": 0.0005581248280397363, + "loss": 1.4158, + "step": 5280 + }, + { + "epoch": 0.4791326438033025, + "grad_norm": 0.1231402534531842, + "learning_rate": 0.0005579788947151105, + "loss": 1.4293, + "step": 5281 + }, + { + "epoch": 0.4792233714389403, + "grad_norm": 0.12779633292823603, + "learning_rate": 0.0005578329563839987, + "loss": 1.3866, + "step": 5282 + }, + { + "epoch": 0.4793140990745781, + "grad_norm": 0.13971718507706263, + "learning_rate": 0.0005576870130590025, + "loss": 1.4113, + "step": 5283 + }, + { + "epoch": 0.47940482671021595, + "grad_norm": 0.12277231549803723, + "learning_rate": 0.0005575410647527242, + "loss": 1.4553, + "step": 5284 + }, + { + "epoch": 0.4794955543458537, + "grad_norm": 0.11829926995838506, + "learning_rate": 0.0005573951114777666, + "loss": 1.3933, + "step": 5285 + }, + { + "epoch": 0.47958628198149156, + "grad_norm": 0.12437022710660062, + "learning_rate": 0.0005572491532467326, + "loss": 1.36, + "step": 5286 + }, + { + "epoch": 0.4796770096171294, + "grad_norm": 0.12298734870750144, + "learning_rate": 0.0005571031900722257, + "loss": 1.4098, + "step": 5287 + }, + { + "epoch": 0.47976773725276717, + "grad_norm": 0.12097060149662088, + "learning_rate": 0.00055695722196685, + "loss": 1.4064, + "step": 5288 + }, + { + "epoch": 0.479858464888405, + "grad_norm": 0.12206487339265937, + "learning_rate": 
0.0005568112489432097, + "loss": 1.4122, + "step": 5289 + }, + { + "epoch": 0.47994919252404283, + "grad_norm": 0.13462421634152164, + "learning_rate": 0.0005566652710139098, + "loss": 1.445, + "step": 5290 + }, + { + "epoch": 0.48003992015968067, + "grad_norm": 0.15086862917505975, + "learning_rate": 0.0005565192881915554, + "loss": 1.4384, + "step": 5291 + }, + { + "epoch": 0.48013064779531844, + "grad_norm": 0.5984945914282086, + "learning_rate": 0.0005563733004887522, + "loss": 1.3837, + "step": 5292 + }, + { + "epoch": 0.4802213754309563, + "grad_norm": 0.1419807504509573, + "learning_rate": 0.0005562273079181059, + "loss": 1.4082, + "step": 5293 + }, + { + "epoch": 0.4803121030665941, + "grad_norm": 0.1257708060201954, + "learning_rate": 0.0005560813104922237, + "loss": 1.3977, + "step": 5294 + }, + { + "epoch": 0.4804028307022319, + "grad_norm": 0.13062883576176199, + "learning_rate": 0.0005559353082237118, + "loss": 1.4465, + "step": 5295 + }, + { + "epoch": 0.4804935583378697, + "grad_norm": 0.13299474606630762, + "learning_rate": 0.0005557893011251777, + "loss": 1.3997, + "step": 5296 + }, + { + "epoch": 0.48058428597350755, + "grad_norm": 0.12784612564841313, + "learning_rate": 0.0005556432892092295, + "loss": 1.4035, + "step": 5297 + }, + { + "epoch": 0.48067501360914533, + "grad_norm": 0.12193703356457837, + "learning_rate": 0.0005554972724884748, + "loss": 1.3996, + "step": 5298 + }, + { + "epoch": 0.48076574124478316, + "grad_norm": 0.12465821838037669, + "learning_rate": 0.0005553512509755227, + "loss": 1.4127, + "step": 5299 + }, + { + "epoch": 0.480856468880421, + "grad_norm": 0.13173703085395239, + "learning_rate": 0.0005552052246829819, + "loss": 1.411, + "step": 5300 + }, + { + "epoch": 0.4809471965160588, + "grad_norm": 0.1462204714064937, + "learning_rate": 0.0005550591936234616, + "loss": 1.4415, + "step": 5301 + }, + { + "epoch": 0.4810379241516966, + "grad_norm": 0.12811484133472018, + "learning_rate": 0.0005549131578095718, + "loss": 1.4162, + "step": 5302 + }, + { + "epoch": 0.48112865178733444, + "grad_norm": 0.1275150411043979, + "learning_rate": 0.0005547671172539229, + "loss": 1.4072, + "step": 5303 + }, + { + "epoch": 0.4812193794229722, + "grad_norm": 0.13517047500998064, + "learning_rate": 0.0005546210719691255, + "loss": 1.362, + "step": 5304 + }, + { + "epoch": 0.48131010705861005, + "grad_norm": 0.1324340980468798, + "learning_rate": 0.0005544750219677901, + "loss": 1.413, + "step": 5305 + }, + { + "epoch": 0.4814008346942479, + "grad_norm": 0.1294741991514196, + "learning_rate": 0.0005543289672625288, + "loss": 1.4357, + "step": 5306 + }, + { + "epoch": 0.48149156232988566, + "grad_norm": 0.14042494365833877, + "learning_rate": 0.0005541829078659531, + "loss": 1.4184, + "step": 5307 + }, + { + "epoch": 0.4815822899655235, + "grad_norm": 0.13274373407788453, + "learning_rate": 0.0005540368437906753, + "loss": 1.4401, + "step": 5308 + }, + { + "epoch": 0.4816730176011613, + "grad_norm": 0.13334407776034446, + "learning_rate": 0.0005538907750493081, + "loss": 1.3989, + "step": 5309 + }, + { + "epoch": 0.4817637452367991, + "grad_norm": 0.13816372457455983, + "learning_rate": 0.0005537447016544645, + "loss": 1.4168, + "step": 5310 + }, + { + "epoch": 0.48185447287243693, + "grad_norm": 0.14687427601261485, + "learning_rate": 0.000553598623618758, + "loss": 1.3719, + "step": 5311 + }, + { + "epoch": 0.48194520050807477, + "grad_norm": 0.12585867744481652, + "learning_rate": 0.0005534525409548024, + "loss": 1.4075, + "step": 5312 + }, + { + "epoch": 
0.4820359281437126, + "grad_norm": 0.13794800726428177, + "learning_rate": 0.000553306453675212, + "loss": 1.4354, + "step": 5313 + }, + { + "epoch": 0.4821266557793504, + "grad_norm": 0.1431222355084216, + "learning_rate": 0.0005531603617926017, + "loss": 1.3742, + "step": 5314 + }, + { + "epoch": 0.4822173834149882, + "grad_norm": 0.1491662876805703, + "learning_rate": 0.0005530142653195861, + "loss": 1.4086, + "step": 5315 + }, + { + "epoch": 0.48230811105062604, + "grad_norm": 0.15270460667189212, + "learning_rate": 0.0005528681642687808, + "loss": 1.3783, + "step": 5316 + }, + { + "epoch": 0.4823988386862638, + "grad_norm": 0.13590169022104437, + "learning_rate": 0.0005527220586528019, + "loss": 1.3994, + "step": 5317 + }, + { + "epoch": 0.48248956632190165, + "grad_norm": 0.140319911030737, + "learning_rate": 0.0005525759484842654, + "loss": 1.3742, + "step": 5318 + }, + { + "epoch": 0.4825802939575395, + "grad_norm": 0.1652330499229417, + "learning_rate": 0.0005524298337757881, + "loss": 1.3778, + "step": 5319 + }, + { + "epoch": 0.48267102159317726, + "grad_norm": 0.14320358300131508, + "learning_rate": 0.0005522837145399867, + "loss": 1.3991, + "step": 5320 + }, + { + "epoch": 0.4827617492288151, + "grad_norm": 0.1484726265418136, + "learning_rate": 0.0005521375907894791, + "loss": 1.4125, + "step": 5321 + }, + { + "epoch": 0.48285247686445293, + "grad_norm": 0.16091120252604058, + "learning_rate": 0.0005519914625368829, + "loss": 1.428, + "step": 5322 + }, + { + "epoch": 0.4829432045000907, + "grad_norm": 0.14937876291314242, + "learning_rate": 0.0005518453297948159, + "loss": 1.4467, + "step": 5323 + }, + { + "epoch": 0.48303393213572854, + "grad_norm": 0.13705098958630338, + "learning_rate": 0.0005516991925758973, + "loss": 1.3721, + "step": 5324 + }, + { + "epoch": 0.48312465977136637, + "grad_norm": 0.15723629394864538, + "learning_rate": 0.0005515530508927456, + "loss": 1.3803, + "step": 5325 + }, + { + "epoch": 0.48321538740700415, + "grad_norm": 0.14209825013301974, + "learning_rate": 0.0005514069047579806, + "loss": 1.4165, + "step": 5326 + }, + { + "epoch": 0.483306115042642, + "grad_norm": 0.8460314044548151, + "learning_rate": 0.0005512607541842217, + "loss": 1.381, + "step": 5327 + }, + { + "epoch": 0.4833968426782798, + "grad_norm": 0.13204836236686252, + "learning_rate": 0.000551114599184089, + "loss": 1.4331, + "step": 5328 + }, + { + "epoch": 0.4834875703139176, + "grad_norm": 0.141216416273939, + "learning_rate": 0.0005509684397702033, + "loss": 1.4293, + "step": 5329 + }, + { + "epoch": 0.4835782979495554, + "grad_norm": 0.1464996799658863, + "learning_rate": 0.0005508222759551852, + "loss": 1.3956, + "step": 5330 + }, + { + "epoch": 0.48366902558519326, + "grad_norm": 0.14900087298291725, + "learning_rate": 0.0005506761077516562, + "loss": 1.4463, + "step": 5331 + }, + { + "epoch": 0.4837597532208311, + "grad_norm": 0.14019534927684085, + "learning_rate": 0.0005505299351722376, + "loss": 1.3495, + "step": 5332 + }, + { + "epoch": 0.48385048085646887, + "grad_norm": 0.15000225720539961, + "learning_rate": 0.0005503837582295518, + "loss": 1.3972, + "step": 5333 + }, + { + "epoch": 0.4839412084921067, + "grad_norm": 0.14402622987613797, + "learning_rate": 0.0005502375769362211, + "loss": 1.441, + "step": 5334 + }, + { + "epoch": 0.48403193612774453, + "grad_norm": 0.14105793989860152, + "learning_rate": 0.0005500913913048682, + "loss": 1.4384, + "step": 5335 + }, + { + "epoch": 0.4841226637633823, + "grad_norm": 0.1423906020273037, + "learning_rate": 
0.0005499452013481162, + "loss": 1.4146, + "step": 5336 + }, + { + "epoch": 0.48421339139902014, + "grad_norm": 0.13899611027292558, + "learning_rate": 0.0005497990070785888, + "loss": 1.3986, + "step": 5337 + }, + { + "epoch": 0.484304119034658, + "grad_norm": 0.13571815031889858, + "learning_rate": 0.0005496528085089099, + "loss": 1.4273, + "step": 5338 + }, + { + "epoch": 0.48439484667029575, + "grad_norm": 0.149588608875016, + "learning_rate": 0.0005495066056517034, + "loss": 1.4056, + "step": 5339 + }, + { + "epoch": 0.4844855743059336, + "grad_norm": 0.13759787617855856, + "learning_rate": 0.0005493603985195943, + "loss": 1.4194, + "step": 5340 + }, + { + "epoch": 0.4845763019415714, + "grad_norm": 0.14510735247837714, + "learning_rate": 0.0005492141871252075, + "loss": 1.4485, + "step": 5341 + }, + { + "epoch": 0.4846670295772092, + "grad_norm": 0.15790598651383342, + "learning_rate": 0.0005490679714811685, + "loss": 1.4071, + "step": 5342 + }, + { + "epoch": 0.48475775721284703, + "grad_norm": 0.177412771198171, + "learning_rate": 0.000548921751600103, + "loss": 1.3925, + "step": 5343 + }, + { + "epoch": 0.48484848484848486, + "grad_norm": 0.1397632559124042, + "learning_rate": 0.0005487755274946367, + "loss": 1.414, + "step": 5344 + }, + { + "epoch": 0.48493921248412264, + "grad_norm": 0.14452340182590728, + "learning_rate": 0.0005486292991773966, + "loss": 1.4334, + "step": 5345 + }, + { + "epoch": 0.48502994011976047, + "grad_norm": 0.18145711575009224, + "learning_rate": 0.0005484830666610094, + "loss": 1.3907, + "step": 5346 + }, + { + "epoch": 0.4851206677553983, + "grad_norm": 0.13925483838691186, + "learning_rate": 0.0005483368299581022, + "loss": 1.3989, + "step": 5347 + }, + { + "epoch": 0.4852113953910361, + "grad_norm": 0.15106895592078734, + "learning_rate": 0.0005481905890813026, + "loss": 1.4022, + "step": 5348 + }, + { + "epoch": 0.4853021230266739, + "grad_norm": 0.14015737689346255, + "learning_rate": 0.0005480443440432386, + "loss": 1.3835, + "step": 5349 + }, + { + "epoch": 0.48539285066231175, + "grad_norm": 0.14243292242044556, + "learning_rate": 0.0005478980948565384, + "loss": 1.4417, + "step": 5350 + }, + { + "epoch": 0.4854835782979496, + "grad_norm": 0.14216934626133138, + "learning_rate": 0.0005477518415338305, + "loss": 1.3952, + "step": 5351 + }, + { + "epoch": 0.48557430593358736, + "grad_norm": 0.14224875267114184, + "learning_rate": 0.000547605584087744, + "loss": 1.3882, + "step": 5352 + }, + { + "epoch": 0.4856650335692252, + "grad_norm": 0.16687780527816107, + "learning_rate": 0.0005474593225309087, + "loss": 1.4097, + "step": 5353 + }, + { + "epoch": 0.485755761204863, + "grad_norm": 0.13639010577434166, + "learning_rate": 0.0005473130568759536, + "loss": 1.4045, + "step": 5354 + }, + { + "epoch": 0.4858464888405008, + "grad_norm": 0.17144874302465002, + "learning_rate": 0.0005471667871355091, + "loss": 1.4176, + "step": 5355 + }, + { + "epoch": 0.48593721647613863, + "grad_norm": 0.15657559222366663, + "learning_rate": 0.0005470205133222055, + "loss": 1.4209, + "step": 5356 + }, + { + "epoch": 0.48602794411177647, + "grad_norm": 0.13334288559162152, + "learning_rate": 0.0005468742354486737, + "loss": 1.432, + "step": 5357 + }, + { + "epoch": 0.48611867174741424, + "grad_norm": 0.1394662683073579, + "learning_rate": 0.000546727953527545, + "loss": 1.441, + "step": 5358 + }, + { + "epoch": 0.4862093993830521, + "grad_norm": 0.14926548506857468, + "learning_rate": 0.0005465816675714504, + "loss": 1.4568, + "step": 5359 + }, + { + "epoch": 
0.4863001270186899, + "grad_norm": 0.16290385767282903, + "learning_rate": 0.000546435377593022, + "loss": 1.3955, + "step": 5360 + }, + { + "epoch": 0.4863908546543277, + "grad_norm": 0.14126432191378552, + "learning_rate": 0.0005462890836048918, + "loss": 1.4393, + "step": 5361 + }, + { + "epoch": 0.4864815822899655, + "grad_norm": 0.3962855769967782, + "learning_rate": 0.0005461427856196925, + "loss": 1.4352, + "step": 5362 + }, + { + "epoch": 0.48657230992560335, + "grad_norm": 0.1343689614999429, + "learning_rate": 0.0005459964836500568, + "loss": 1.3787, + "step": 5363 + }, + { + "epoch": 0.48666303756124113, + "grad_norm": 0.14128765151695427, + "learning_rate": 0.000545850177708618, + "loss": 1.4148, + "step": 5364 + }, + { + "epoch": 0.48675376519687896, + "grad_norm": 0.1460855116445435, + "learning_rate": 0.0005457038678080097, + "loss": 1.4326, + "step": 5365 + }, + { + "epoch": 0.4868444928325168, + "grad_norm": 0.1371037079769657, + "learning_rate": 0.0005455575539608655, + "loss": 1.4121, + "step": 5366 + }, + { + "epoch": 0.4869352204681546, + "grad_norm": 0.13849989315360525, + "learning_rate": 0.0005454112361798199, + "loss": 1.3911, + "step": 5367 + }, + { + "epoch": 0.4870259481037924, + "grad_norm": 0.12627275594970852, + "learning_rate": 0.0005452649144775073, + "loss": 1.387, + "step": 5368 + }, + { + "epoch": 0.48711667573943024, + "grad_norm": 0.13958235719076234, + "learning_rate": 0.0005451185888665628, + "loss": 1.3918, + "step": 5369 + }, + { + "epoch": 0.48720740337506807, + "grad_norm": 0.2043478960826863, + "learning_rate": 0.0005449722593596214, + "loss": 1.4002, + "step": 5370 + }, + { + "epoch": 0.48729813101070585, + "grad_norm": 0.13510308713627597, + "learning_rate": 0.0005448259259693187, + "loss": 1.4337, + "step": 5371 + }, + { + "epoch": 0.4873888586463437, + "grad_norm": 0.13612013119729643, + "learning_rate": 0.0005446795887082908, + "loss": 1.3836, + "step": 5372 + }, + { + "epoch": 0.4874795862819815, + "grad_norm": 0.13025454362067587, + "learning_rate": 0.0005445332475891738, + "loss": 1.4111, + "step": 5373 + }, + { + "epoch": 0.4875703139176193, + "grad_norm": 0.13479268337825903, + "learning_rate": 0.0005443869026246042, + "loss": 1.4193, + "step": 5374 + }, + { + "epoch": 0.4876610415532571, + "grad_norm": 0.12411907888847784, + "learning_rate": 0.0005442405538272192, + "loss": 1.4098, + "step": 5375 + }, + { + "epoch": 0.48775176918889496, + "grad_norm": 0.13230093415564192, + "learning_rate": 0.0005440942012096557, + "loss": 1.4156, + "step": 5376 + }, + { + "epoch": 0.48784249682453273, + "grad_norm": 0.1307708736009048, + "learning_rate": 0.0005439478447845516, + "loss": 1.4088, + "step": 5377 + }, + { + "epoch": 0.48793322446017057, + "grad_norm": 0.15538552608773332, + "learning_rate": 0.0005438014845645446, + "loss": 1.3871, + "step": 5378 + }, + { + "epoch": 0.4880239520958084, + "grad_norm": 0.12592131652396057, + "learning_rate": 0.0005436551205622728, + "loss": 1.4202, + "step": 5379 + }, + { + "epoch": 0.4881146797314462, + "grad_norm": 0.13176546886903429, + "learning_rate": 0.0005435087527903749, + "loss": 1.3712, + "step": 5380 + }, + { + "epoch": 0.488205407367084, + "grad_norm": 0.12593175051348676, + "learning_rate": 0.00054336238126149, + "loss": 1.4235, + "step": 5381 + }, + { + "epoch": 0.48829613500272184, + "grad_norm": 0.12233495079051442, + "learning_rate": 0.0005432160059882569, + "loss": 1.4016, + "step": 5382 + }, + { + "epoch": 0.4883868626383596, + "grad_norm": 0.13192027529467704, + "learning_rate": 
0.0005430696269833153, + "loss": 1.4571, + "step": 5383 + }, + { + "epoch": 0.48847759027399745, + "grad_norm": 0.15636087265327533, + "learning_rate": 0.0005429232442593053, + "loss": 1.4108, + "step": 5384 + }, + { + "epoch": 0.4885683179096353, + "grad_norm": 0.13829740952719952, + "learning_rate": 0.0005427768578288666, + "loss": 1.4334, + "step": 5385 + }, + { + "epoch": 0.48865904554527306, + "grad_norm": 0.1283272542379822, + "learning_rate": 0.00054263046770464, + "loss": 1.3996, + "step": 5386 + }, + { + "epoch": 0.4887497731809109, + "grad_norm": 0.14035228482533754, + "learning_rate": 0.0005424840738992661, + "loss": 1.4452, + "step": 5387 + }, + { + "epoch": 0.48884050081654873, + "grad_norm": 0.1411194665511652, + "learning_rate": 0.0005423376764253863, + "loss": 1.3915, + "step": 5388 + }, + { + "epoch": 0.48893122845218656, + "grad_norm": 0.1308706119465418, + "learning_rate": 0.0005421912752956419, + "loss": 1.3665, + "step": 5389 + }, + { + "epoch": 0.48902195608782434, + "grad_norm": 0.1564393141253821, + "learning_rate": 0.0005420448705226746, + "loss": 1.3831, + "step": 5390 + }, + { + "epoch": 0.48911268372346217, + "grad_norm": 0.13000745728681784, + "learning_rate": 0.0005418984621191266, + "loss": 1.4218, + "step": 5391 + }, + { + "epoch": 0.4892034113591, + "grad_norm": 0.1343315816603372, + "learning_rate": 0.0005417520500976402, + "loss": 1.4041, + "step": 5392 + }, + { + "epoch": 0.4892941389947378, + "grad_norm": 0.13272245904230062, + "learning_rate": 0.0005416056344708581, + "loss": 1.4019, + "step": 5393 + }, + { + "epoch": 0.4893848666303756, + "grad_norm": 0.13435928774467307, + "learning_rate": 0.0005414592152514232, + "loss": 1.4362, + "step": 5394 + }, + { + "epoch": 0.48947559426601345, + "grad_norm": 0.13563598321515816, + "learning_rate": 0.0005413127924519792, + "loss": 1.4203, + "step": 5395 + }, + { + "epoch": 0.4895663219016512, + "grad_norm": 0.20032526861881914, + "learning_rate": 0.0005411663660851694, + "loss": 1.4392, + "step": 5396 + }, + { + "epoch": 0.48965704953728906, + "grad_norm": 0.14792866034300076, + "learning_rate": 0.0005410199361636378, + "loss": 1.3873, + "step": 5397 + }, + { + "epoch": 0.4897477771729269, + "grad_norm": 0.12598706427147283, + "learning_rate": 0.0005408735027000285, + "loss": 1.3817, + "step": 5398 + }, + { + "epoch": 0.48983850480856467, + "grad_norm": 0.1641587599915781, + "learning_rate": 0.0005407270657069866, + "loss": 1.3628, + "step": 5399 + }, + { + "epoch": 0.4899292324442025, + "grad_norm": 0.13618819035437182, + "learning_rate": 0.0005405806251971563, + "loss": 1.4214, + "step": 5400 + }, + { + "epoch": 0.49001996007984033, + "grad_norm": 0.1383617622865736, + "learning_rate": 0.0005404341811831832, + "loss": 1.4089, + "step": 5401 + }, + { + "epoch": 0.4901106877154781, + "grad_norm": 0.14177870781420188, + "learning_rate": 0.0005402877336777123, + "loss": 1.3936, + "step": 5402 + }, + { + "epoch": 0.49020141535111594, + "grad_norm": 0.1439054598850184, + "learning_rate": 0.0005401412826933899, + "loss": 1.4153, + "step": 5403 + }, + { + "epoch": 0.4902921429867538, + "grad_norm": 0.14365016883866927, + "learning_rate": 0.0005399948282428618, + "loss": 1.4406, + "step": 5404 + }, + { + "epoch": 0.49038287062239155, + "grad_norm": 0.1563459593228405, + "learning_rate": 0.0005398483703387743, + "loss": 1.3972, + "step": 5405 + }, + { + "epoch": 0.4904735982580294, + "grad_norm": 0.16399183013366306, + "learning_rate": 0.0005397019089937742, + "loss": 1.4233, + "step": 5406 + }, + { + "epoch": 
0.4905643258936672, + "grad_norm": 0.17646310097146417, + "learning_rate": 0.0005395554442205084, + "loss": 1.3861, + "step": 5407 + }, + { + "epoch": 0.49065505352930505, + "grad_norm": 0.15083004776447276, + "learning_rate": 0.0005394089760316242, + "loss": 1.3942, + "step": 5408 + }, + { + "epoch": 0.49074578116494283, + "grad_norm": 0.14603235809933243, + "learning_rate": 0.0005392625044397692, + "loss": 1.4045, + "step": 5409 + }, + { + "epoch": 0.49083650880058066, + "grad_norm": 0.13246277232653073, + "learning_rate": 0.0005391160294575908, + "loss": 1.3728, + "step": 5410 + }, + { + "epoch": 0.4909272364362185, + "grad_norm": 0.13090792217482097, + "learning_rate": 0.0005389695510977379, + "loss": 1.4093, + "step": 5411 + }, + { + "epoch": 0.49101796407185627, + "grad_norm": 0.13030788030868262, + "learning_rate": 0.0005388230693728583, + "loss": 1.3903, + "step": 5412 + }, + { + "epoch": 0.4911086917074941, + "grad_norm": 0.14210358516206154, + "learning_rate": 0.0005386765842956009, + "loss": 1.4124, + "step": 5413 + }, + { + "epoch": 0.49119941934313194, + "grad_norm": 0.145342085242602, + "learning_rate": 0.0005385300958786149, + "loss": 1.3384, + "step": 5414 + }, + { + "epoch": 0.4912901469787697, + "grad_norm": 0.24476822678299648, + "learning_rate": 0.0005383836041345494, + "loss": 1.4038, + "step": 5415 + }, + { + "epoch": 0.49138087461440755, + "grad_norm": 0.15799106305653174, + "learning_rate": 0.0005382371090760541, + "loss": 1.4205, + "step": 5416 + }, + { + "epoch": 0.4914716022500454, + "grad_norm": 0.14902725371110262, + "learning_rate": 0.0005380906107157786, + "loss": 1.3686, + "step": 5417 + }, + { + "epoch": 0.49156232988568316, + "grad_norm": 0.16165760569459472, + "learning_rate": 0.0005379441090663734, + "loss": 1.4047, + "step": 5418 + }, + { + "epoch": 0.491653057521321, + "grad_norm": 0.17505354946536342, + "learning_rate": 0.0005377976041404886, + "loss": 1.451, + "step": 5419 + }, + { + "epoch": 0.4917437851569588, + "grad_norm": 0.13178128123639238, + "learning_rate": 0.0005376510959507753, + "loss": 1.4212, + "step": 5420 + }, + { + "epoch": 0.4918345127925966, + "grad_norm": 0.15453521039142937, + "learning_rate": 0.0005375045845098844, + "loss": 1.4274, + "step": 5421 + }, + { + "epoch": 0.49192524042823443, + "grad_norm": 0.14184361050010774, + "learning_rate": 0.0005373580698304668, + "loss": 1.4359, + "step": 5422 + }, + { + "epoch": 0.49201596806387227, + "grad_norm": 0.132222519503548, + "learning_rate": 0.0005372115519251746, + "loss": 1.3864, + "step": 5423 + }, + { + "epoch": 0.49210669569951004, + "grad_norm": 0.15334132695207728, + "learning_rate": 0.0005370650308066594, + "loss": 1.3742, + "step": 5424 + }, + { + "epoch": 0.4921974233351479, + "grad_norm": 0.13846664295087635, + "learning_rate": 0.0005369185064875731, + "loss": 1.4179, + "step": 5425 + }, + { + "epoch": 0.4922881509707857, + "grad_norm": 0.12356435074105987, + "learning_rate": 0.0005367719789805685, + "loss": 1.3793, + "step": 5426 + }, + { + "epoch": 0.49237887860642354, + "grad_norm": 0.14144738016495567, + "learning_rate": 0.0005366254482982981, + "loss": 1.3413, + "step": 5427 + }, + { + "epoch": 0.4924696062420613, + "grad_norm": 0.13522674393069814, + "learning_rate": 0.0005364789144534149, + "loss": 1.4133, + "step": 5428 + }, + { + "epoch": 0.49256033387769915, + "grad_norm": 0.13716441504979213, + "learning_rate": 0.0005363323774585719, + "loss": 1.4154, + "step": 5429 + }, + { + "epoch": 0.492651061513337, + "grad_norm": 0.13415929241177724, + "learning_rate": 
0.0005361858373264228, + "loss": 1.4367, + "step": 5430 + }, + { + "epoch": 0.49274178914897476, + "grad_norm": 0.14278012853793434, + "learning_rate": 0.0005360392940696214, + "loss": 1.432, + "step": 5431 + }, + { + "epoch": 0.4928325167846126, + "grad_norm": 0.1417866600872272, + "learning_rate": 0.0005358927477008216, + "loss": 1.3779, + "step": 5432 + }, + { + "epoch": 0.49292324442025043, + "grad_norm": 0.14732737713633517, + "learning_rate": 0.0005357461982326777, + "loss": 1.4119, + "step": 5433 + }, + { + "epoch": 0.4930139720558882, + "grad_norm": 0.13335143015216983, + "learning_rate": 0.0005355996456778444, + "loss": 1.4071, + "step": 5434 + }, + { + "epoch": 0.49310469969152604, + "grad_norm": 0.1542354041011318, + "learning_rate": 0.0005354530900489764, + "loss": 1.3977, + "step": 5435 + }, + { + "epoch": 0.49319542732716387, + "grad_norm": 0.13774701443089, + "learning_rate": 0.0005353065313587289, + "loss": 1.4472, + "step": 5436 + }, + { + "epoch": 0.49328615496280165, + "grad_norm": 0.13479497734537788, + "learning_rate": 0.000535159969619757, + "loss": 1.4282, + "step": 5437 + }, + { + "epoch": 0.4933768825984395, + "grad_norm": 0.1456577093882054, + "learning_rate": 0.0005350134048447169, + "loss": 1.421, + "step": 5438 + }, + { + "epoch": 0.4934676102340773, + "grad_norm": 0.12749126639959954, + "learning_rate": 0.0005348668370462638, + "loss": 1.3994, + "step": 5439 + }, + { + "epoch": 0.4935583378697151, + "grad_norm": 0.16397444579820994, + "learning_rate": 0.0005347202662370542, + "loss": 1.401, + "step": 5440 + }, + { + "epoch": 0.4936490655053529, + "grad_norm": 0.12696195185801198, + "learning_rate": 0.0005345736924297447, + "loss": 1.4097, + "step": 5441 + }, + { + "epoch": 0.49373979314099076, + "grad_norm": 0.12224111559545978, + "learning_rate": 0.0005344271156369916, + "loss": 1.4094, + "step": 5442 + }, + { + "epoch": 0.49383052077662853, + "grad_norm": 0.13235815457209377, + "learning_rate": 0.0005342805358714522, + "loss": 1.3878, + "step": 5443 + }, + { + "epoch": 0.49392124841226637, + "grad_norm": 0.1372152532860639, + "learning_rate": 0.0005341339531457833, + "loss": 1.4155, + "step": 5444 + }, + { + "epoch": 0.4940119760479042, + "grad_norm": 0.1432816292022758, + "learning_rate": 0.0005339873674726426, + "loss": 1.4028, + "step": 5445 + }, + { + "epoch": 0.49410270368354203, + "grad_norm": 0.16393185798631954, + "learning_rate": 0.0005338407788646876, + "loss": 1.4276, + "step": 5446 + }, + { + "epoch": 0.4941934313191798, + "grad_norm": 0.17062204324442137, + "learning_rate": 0.0005336941873345764, + "loss": 1.3948, + "step": 5447 + }, + { + "epoch": 0.49428415895481764, + "grad_norm": 0.1204396148246598, + "learning_rate": 0.0005335475928949673, + "loss": 1.3948, + "step": 5448 + }, + { + "epoch": 0.4943748865904555, + "grad_norm": 0.1343187913059892, + "learning_rate": 0.0005334009955585184, + "loss": 1.3845, + "step": 5449 + }, + { + "epoch": 0.49446561422609325, + "grad_norm": 0.1308800682662619, + "learning_rate": 0.0005332543953378888, + "loss": 1.4084, + "step": 5450 + }, + { + "epoch": 0.4945563418617311, + "grad_norm": 0.1209298183106904, + "learning_rate": 0.000533107792245737, + "loss": 1.4022, + "step": 5451 + }, + { + "epoch": 0.4946470694973689, + "grad_norm": 0.12851259098188694, + "learning_rate": 0.0005329611862947227, + "loss": 1.4021, + "step": 5452 + }, + { + "epoch": 0.4947377971330067, + "grad_norm": 0.12284641530731433, + "learning_rate": 0.000532814577497505, + "loss": 1.373, + "step": 5453 + }, + { + "epoch": 
0.49482852476864453, + "grad_norm": 0.1283441645939703, + "learning_rate": 0.0005326679658667437, + "loss": 1.4006, + "step": 5454 + }, + { + "epoch": 0.49491925240428236, + "grad_norm": 0.13553421694997317, + "learning_rate": 0.0005325213514150988, + "loss": 1.3897, + "step": 5455 + }, + { + "epoch": 0.49500998003992014, + "grad_norm": 0.12936321757976468, + "learning_rate": 0.0005323747341552304, + "loss": 1.4076, + "step": 5456 + }, + { + "epoch": 0.49510070767555797, + "grad_norm": 0.14294812683418165, + "learning_rate": 0.0005322281140997989, + "loss": 1.41, + "step": 5457 + }, + { + "epoch": 0.4951914353111958, + "grad_norm": 0.13348094266978736, + "learning_rate": 0.000532081491261465, + "loss": 1.4228, + "step": 5458 + }, + { + "epoch": 0.4952821629468336, + "grad_norm": 0.12449185480469924, + "learning_rate": 0.0005319348656528895, + "loss": 1.3673, + "step": 5459 + }, + { + "epoch": 0.4953728905824714, + "grad_norm": 0.1224385801949936, + "learning_rate": 0.0005317882372867338, + "loss": 1.4224, + "step": 5460 + }, + { + "epoch": 0.49546361821810925, + "grad_norm": 0.11868003369138505, + "learning_rate": 0.000531641606175659, + "loss": 1.4035, + "step": 5461 + }, + { + "epoch": 0.495554345853747, + "grad_norm": 0.1375198141737469, + "learning_rate": 0.000531494972332327, + "loss": 1.3865, + "step": 5462 + }, + { + "epoch": 0.49564507348938486, + "grad_norm": 0.13868433076765163, + "learning_rate": 0.0005313483357693994, + "loss": 1.3605, + "step": 5463 + }, + { + "epoch": 0.4957358011250227, + "grad_norm": 0.16438153014895796, + "learning_rate": 0.0005312016964995384, + "loss": 1.457, + "step": 5464 + }, + { + "epoch": 0.4958265287606605, + "grad_norm": 0.14161495333691904, + "learning_rate": 0.0005310550545354063, + "loss": 1.3894, + "step": 5465 + }, + { + "epoch": 0.4959172563962983, + "grad_norm": 0.1223640954766885, + "learning_rate": 0.0005309084098896656, + "loss": 1.3888, + "step": 5466 + }, + { + "epoch": 0.49600798403193613, + "grad_norm": 0.14122008795404914, + "learning_rate": 0.0005307617625749793, + "loss": 1.41, + "step": 5467 + }, + { + "epoch": 0.49609871166757397, + "grad_norm": 0.12290782300980403, + "learning_rate": 0.0005306151126040101, + "loss": 1.3864, + "step": 5468 + }, + { + "epoch": 0.49618943930321174, + "grad_norm": 0.1306359394990822, + "learning_rate": 0.0005304684599894215, + "loss": 1.4072, + "step": 5469 + }, + { + "epoch": 0.4962801669388496, + "grad_norm": 0.13339684609625876, + "learning_rate": 0.0005303218047438769, + "loss": 1.3928, + "step": 5470 + }, + { + "epoch": 0.4963708945744874, + "grad_norm": 0.12316466689624124, + "learning_rate": 0.00053017514688004, + "loss": 1.3909, + "step": 5471 + }, + { + "epoch": 0.4964616222101252, + "grad_norm": 0.12420407406269912, + "learning_rate": 0.0005300284864105747, + "loss": 1.4213, + "step": 5472 + }, + { + "epoch": 0.496552349845763, + "grad_norm": 0.12199959615066253, + "learning_rate": 0.0005298818233481451, + "loss": 1.4225, + "step": 5473 + }, + { + "epoch": 0.49664307748140085, + "grad_norm": 0.14531433683081366, + "learning_rate": 0.0005297351577054158, + "loss": 1.4218, + "step": 5474 + }, + { + "epoch": 0.49673380511703863, + "grad_norm": 0.13091197240393315, + "learning_rate": 0.0005295884894950513, + "loss": 1.4298, + "step": 5475 + }, + { + "epoch": 0.49682453275267646, + "grad_norm": 0.1374980289972488, + "learning_rate": 0.000529441818729716, + "loss": 1.3932, + "step": 5476 + }, + { + "epoch": 0.4969152603883143, + "grad_norm": 0.12544842780194684, + "learning_rate": 
0.0005292951454220757, + "loss": 1.3892, + "step": 5477 + }, + { + "epoch": 0.49700598802395207, + "grad_norm": 0.18632695275059116, + "learning_rate": 0.0005291484695847951, + "loss": 1.4039, + "step": 5478 + }, + { + "epoch": 0.4970967156595899, + "grad_norm": 0.12285116346958058, + "learning_rate": 0.0005290017912305399, + "loss": 1.3764, + "step": 5479 + }, + { + "epoch": 0.49718744329522774, + "grad_norm": 0.14049451983583372, + "learning_rate": 0.0005288551103719757, + "loss": 1.4132, + "step": 5480 + }, + { + "epoch": 0.4972781709308655, + "grad_norm": 0.14006211428647516, + "learning_rate": 0.0005287084270217684, + "loss": 1.4076, + "step": 5481 + }, + { + "epoch": 0.49736889856650335, + "grad_norm": 0.1288256547801179, + "learning_rate": 0.0005285617411925845, + "loss": 1.3817, + "step": 5482 + }, + { + "epoch": 0.4974596262021412, + "grad_norm": 0.1207919081992118, + "learning_rate": 0.0005284150528970898, + "loss": 1.4066, + "step": 5483 + }, + { + "epoch": 0.497550353837779, + "grad_norm": 0.1177880842087512, + "learning_rate": 0.0005282683621479512, + "loss": 1.4169, + "step": 5484 + }, + { + "epoch": 0.4976410814734168, + "grad_norm": 0.1253833207558626, + "learning_rate": 0.0005281216689578352, + "loss": 1.4047, + "step": 5485 + }, + { + "epoch": 0.4977318091090546, + "grad_norm": 0.12176028944113973, + "learning_rate": 0.000527974973339409, + "loss": 1.4226, + "step": 5486 + }, + { + "epoch": 0.49782253674469246, + "grad_norm": 0.14110752741352645, + "learning_rate": 0.00052782827530534, + "loss": 1.4149, + "step": 5487 + }, + { + "epoch": 0.49791326438033023, + "grad_norm": 0.1362147657274379, + "learning_rate": 0.0005276815748682949, + "loss": 1.3846, + "step": 5488 + }, + { + "epoch": 0.49800399201596807, + "grad_norm": 0.12906628280461127, + "learning_rate": 0.0005275348720409419, + "loss": 1.3638, + "step": 5489 + }, + { + "epoch": 0.4980947196516059, + "grad_norm": 0.14818299054824519, + "learning_rate": 0.0005273881668359488, + "loss": 1.4203, + "step": 5490 + }, + { + "epoch": 0.4981854472872437, + "grad_norm": 0.13009191837494874, + "learning_rate": 0.0005272414592659832, + "loss": 1.4097, + "step": 5491 + }, + { + "epoch": 0.4982761749228815, + "grad_norm": 0.19089955907821032, + "learning_rate": 0.0005270947493437137, + "loss": 1.4008, + "step": 5492 + }, + { + "epoch": 0.49836690255851934, + "grad_norm": 0.12541089843533207, + "learning_rate": 0.0005269480370818086, + "loss": 1.411, + "step": 5493 + }, + { + "epoch": 0.4984576301941571, + "grad_norm": 0.12764029474529426, + "learning_rate": 0.0005268013224929367, + "loss": 1.394, + "step": 5494 + }, + { + "epoch": 0.49854835782979495, + "grad_norm": 0.13206325050816484, + "learning_rate": 0.0005266546055897664, + "loss": 1.4087, + "step": 5495 + }, + { + "epoch": 0.4986390854654328, + "grad_norm": 0.12365316269408666, + "learning_rate": 0.000526507886384967, + "loss": 1.361, + "step": 5496 + }, + { + "epoch": 0.49872981310107056, + "grad_norm": 0.1309384794495096, + "learning_rate": 0.0005263611648912078, + "loss": 1.437, + "step": 5497 + }, + { + "epoch": 0.4988205407367084, + "grad_norm": 0.12941588522044073, + "learning_rate": 0.000526214441121158, + "loss": 1.3784, + "step": 5498 + }, + { + "epoch": 0.49891126837234623, + "grad_norm": 0.1302055194703602, + "learning_rate": 0.0005260677150874877, + "loss": 1.3982, + "step": 5499 + }, + { + "epoch": 0.499001996007984, + "grad_norm": 0.1384403036886394, + "learning_rate": 0.0005259209868028658, + "loss": 1.4171, + "step": 5500 + }, + { + "epoch": 
0.49909272364362184, + "grad_norm": 0.12567420964471068, + "learning_rate": 0.0005257742562799633, + "loss": 1.3821, + "step": 5501 + }, + { + "epoch": 0.49918345127925967, + "grad_norm": 0.13958437461079418, + "learning_rate": 0.0005256275235314498, + "loss": 1.4034, + "step": 5502 + }, + { + "epoch": 0.4992741789148975, + "grad_norm": 0.14765331944311225, + "learning_rate": 0.0005254807885699958, + "loss": 1.4181, + "step": 5503 + }, + { + "epoch": 0.4993649065505353, + "grad_norm": 0.12526951579900644, + "learning_rate": 0.0005253340514082722, + "loss": 1.3935, + "step": 5504 + }, + { + "epoch": 0.4994556341861731, + "grad_norm": 0.13411207405312706, + "learning_rate": 0.0005251873120589493, + "loss": 1.4228, + "step": 5505 + }, + { + "epoch": 0.49954636182181095, + "grad_norm": 0.13137874814013126, + "learning_rate": 0.0005250405705346984, + "loss": 1.4104, + "step": 5506 + }, + { + "epoch": 0.4996370894574487, + "grad_norm": 0.11700664829889262, + "learning_rate": 0.0005248938268481904, + "loss": 1.3898, + "step": 5507 + }, + { + "epoch": 0.49972781709308656, + "grad_norm": 0.11838098374242396, + "learning_rate": 0.0005247470810120969, + "loss": 1.4241, + "step": 5508 + }, + { + "epoch": 0.4998185447287244, + "grad_norm": 0.12405618012672824, + "learning_rate": 0.0005246003330390894, + "loss": 1.4106, + "step": 5509 + }, + { + "epoch": 0.49990927236436217, + "grad_norm": 0.16263993607212685, + "learning_rate": 0.0005244535829418395, + "loss": 1.4061, + "step": 5510 + }, + { + "epoch": 0.5, + "grad_norm": 0.11160419571234305, + "learning_rate": 0.000524306830733019, + "loss": 1.4315, + "step": 5511 + }, + { + "epoch": 0.5000907276356378, + "grad_norm": 0.11703753712219812, + "learning_rate": 0.0005241600764253001, + "loss": 1.4037, + "step": 5512 + }, + { + "epoch": 0.5001814552712757, + "grad_norm": 0.13716485303258666, + "learning_rate": 0.000524013320031355, + "loss": 1.3943, + "step": 5513 + }, + { + "epoch": 0.5002721829069134, + "grad_norm": 0.13078353534687467, + "learning_rate": 0.0005238665615638563, + "loss": 1.4193, + "step": 5514 + }, + { + "epoch": 0.5003629105425512, + "grad_norm": 0.1381520843993569, + "learning_rate": 0.0005237198010354763, + "loss": 1.3882, + "step": 5515 + }, + { + "epoch": 0.5004536381781891, + "grad_norm": 0.1229757347980128, + "learning_rate": 0.0005235730384588881, + "loss": 1.3898, + "step": 5516 + }, + { + "epoch": 0.5005443658138269, + "grad_norm": 0.15807259791808992, + "learning_rate": 0.0005234262738467646, + "loss": 1.3739, + "step": 5517 + }, + { + "epoch": 0.5006350934494647, + "grad_norm": 0.12869351969332843, + "learning_rate": 0.0005232795072117788, + "loss": 1.4106, + "step": 5518 + }, + { + "epoch": 0.5007258210851026, + "grad_norm": 0.16438154483190928, + "learning_rate": 0.0005231327385666043, + "loss": 1.3814, + "step": 5519 + }, + { + "epoch": 0.5008165487207403, + "grad_norm": 0.1549530895648478, + "learning_rate": 0.0005229859679239142, + "loss": 1.4517, + "step": 5520 + }, + { + "epoch": 0.5009072763563781, + "grad_norm": 0.1199693606369538, + "learning_rate": 0.0005228391952963826, + "loss": 1.4715, + "step": 5521 + }, + { + "epoch": 0.500998003992016, + "grad_norm": 0.12044324021913587, + "learning_rate": 0.000522692420696683, + "loss": 1.3983, + "step": 5522 + }, + { + "epoch": 0.5010887316276538, + "grad_norm": 0.14237009981306792, + "learning_rate": 0.0005225456441374895, + "loss": 1.436, + "step": 5523 + }, + { + "epoch": 0.5011794592632915, + "grad_norm": 0.1213100678941179, + "learning_rate": 0.0005223988656314763, + 
"loss": 1.3817, + "step": 5524 + }, + { + "epoch": 0.5012701868989294, + "grad_norm": 0.13944796015851704, + "learning_rate": 0.0005222520851913177, + "loss": 1.3671, + "step": 5525 + }, + { + "epoch": 0.5013609145345672, + "grad_norm": 0.12824175163918364, + "learning_rate": 0.0005221053028296884, + "loss": 1.4109, + "step": 5526 + }, + { + "epoch": 0.501451642170205, + "grad_norm": 0.13040619361207012, + "learning_rate": 0.0005219585185592629, + "loss": 1.4205, + "step": 5527 + }, + { + "epoch": 0.5015423698058429, + "grad_norm": 0.12750720237242005, + "learning_rate": 0.0005218117323927162, + "loss": 1.394, + "step": 5528 + }, + { + "epoch": 0.5016330974414807, + "grad_norm": 0.11330987133275558, + "learning_rate": 0.0005216649443427231, + "loss": 1.4221, + "step": 5529 + }, + { + "epoch": 0.5017238250771184, + "grad_norm": 0.12500182224336573, + "learning_rate": 0.0005215181544219587, + "loss": 1.4018, + "step": 5530 + }, + { + "epoch": 0.5018145527127563, + "grad_norm": 0.19562947457194968, + "learning_rate": 0.0005213713626430986, + "loss": 1.3886, + "step": 5531 + }, + { + "epoch": 0.5019052803483941, + "grad_norm": 0.1326924821368834, + "learning_rate": 0.0005212245690188182, + "loss": 1.4003, + "step": 5532 + }, + { + "epoch": 0.501996007984032, + "grad_norm": 0.12629378289363516, + "learning_rate": 0.0005210777735617933, + "loss": 1.3791, + "step": 5533 + }, + { + "epoch": 0.5020867356196698, + "grad_norm": 0.12587699701330102, + "learning_rate": 0.0005209309762846993, + "loss": 1.3986, + "step": 5534 + }, + { + "epoch": 0.5021774632553075, + "grad_norm": 0.12873753936607166, + "learning_rate": 0.0005207841772002126, + "loss": 1.4198, + "step": 5535 + }, + { + "epoch": 0.5022681908909454, + "grad_norm": 0.12900183229760948, + "learning_rate": 0.000520637376321009, + "loss": 1.4129, + "step": 5536 + }, + { + "epoch": 0.5023589185265832, + "grad_norm": 0.13189676561149752, + "learning_rate": 0.000520490573659765, + "loss": 1.3826, + "step": 5537 + }, + { + "epoch": 0.502449646162221, + "grad_norm": 0.15754352350217893, + "learning_rate": 0.000520343769229157, + "loss": 1.365, + "step": 5538 + }, + { + "epoch": 0.5025403737978589, + "grad_norm": 0.12814231944495447, + "learning_rate": 0.0005201969630418612, + "loss": 1.4047, + "step": 5539 + }, + { + "epoch": 0.5026311014334967, + "grad_norm": 0.1255522108263818, + "learning_rate": 0.000520050155110555, + "loss": 1.3922, + "step": 5540 + }, + { + "epoch": 0.5027218290691344, + "grad_norm": 0.15993319242990314, + "learning_rate": 0.0005199033454479148, + "loss": 1.3958, + "step": 5541 + }, + { + "epoch": 0.5028125567047723, + "grad_norm": 0.13591721167001963, + "learning_rate": 0.0005197565340666177, + "loss": 1.4154, + "step": 5542 + }, + { + "epoch": 0.5029032843404101, + "grad_norm": 0.16078706128740303, + "learning_rate": 0.0005196097209793412, + "loss": 1.4591, + "step": 5543 + }, + { + "epoch": 0.5029940119760479, + "grad_norm": 0.15734391704184889, + "learning_rate": 0.0005194629061987623, + "loss": 1.3888, + "step": 5544 + }, + { + "epoch": 0.5030847396116858, + "grad_norm": 0.13319119370067656, + "learning_rate": 0.0005193160897375588, + "loss": 1.4068, + "step": 5545 + }, + { + "epoch": 0.5031754672473235, + "grad_norm": 0.12330934470939842, + "learning_rate": 0.0005191692716084076, + "loss": 1.416, + "step": 5546 + }, + { + "epoch": 0.5032661948829613, + "grad_norm": 0.1300263553903488, + "learning_rate": 0.0005190224518239874, + "loss": 1.4304, + "step": 5547 + }, + { + "epoch": 0.5033569225185992, + "grad_norm": 
0.1310970118614047, + "learning_rate": 0.0005188756303969756, + "loss": 1.3716, + "step": 5548 + }, + { + "epoch": 0.503447650154237, + "grad_norm": 0.13316699891222372, + "learning_rate": 0.0005187288073400503, + "loss": 1.4331, + "step": 5549 + }, + { + "epoch": 0.5035383777898748, + "grad_norm": 0.12824272347159732, + "learning_rate": 0.0005185819826658896, + "loss": 1.4105, + "step": 5550 + }, + { + "epoch": 0.5036291054255126, + "grad_norm": 0.16140839764223447, + "learning_rate": 0.0005184351563871721, + "loss": 1.3957, + "step": 5551 + }, + { + "epoch": 0.5037198330611504, + "grad_norm": 0.1631480546276107, + "learning_rate": 0.000518288328516576, + "loss": 1.4305, + "step": 5552 + }, + { + "epoch": 0.5038105606967882, + "grad_norm": 0.13494783632525667, + "learning_rate": 0.0005181414990667804, + "loss": 1.402, + "step": 5553 + }, + { + "epoch": 0.5039012883324261, + "grad_norm": 0.23109315697087157, + "learning_rate": 0.0005179946680504632, + "loss": 1.3573, + "step": 5554 + }, + { + "epoch": 0.5039920159680639, + "grad_norm": 0.12764500609591148, + "learning_rate": 0.000517847835480304, + "loss": 1.3859, + "step": 5555 + }, + { + "epoch": 0.5040827436037016, + "grad_norm": 0.1219601800639739, + "learning_rate": 0.0005177010013689816, + "loss": 1.3887, + "step": 5556 + }, + { + "epoch": 0.5041734712393395, + "grad_norm": 0.1277214730804106, + "learning_rate": 0.000517554165729175, + "loss": 1.4388, + "step": 5557 + }, + { + "epoch": 0.5042641988749773, + "grad_norm": 0.13415365197543228, + "learning_rate": 0.0005174073285735636, + "loss": 1.381, + "step": 5558 + }, + { + "epoch": 0.5043549265106151, + "grad_norm": 0.12754433283020725, + "learning_rate": 0.000517260489914827, + "loss": 1.4406, + "step": 5559 + }, + { + "epoch": 0.504445654146253, + "grad_norm": 0.14049581461683558, + "learning_rate": 0.0005171136497656444, + "loss": 1.4082, + "step": 5560 + }, + { + "epoch": 0.5045363817818908, + "grad_norm": 0.13457489858825047, + "learning_rate": 0.0005169668081386958, + "loss": 1.4009, + "step": 5561 + }, + { + "epoch": 0.5046271094175285, + "grad_norm": 0.13414647881070657, + "learning_rate": 0.0005168199650466606, + "loss": 1.4101, + "step": 5562 + }, + { + "epoch": 0.5047178370531664, + "grad_norm": 0.1441738755354857, + "learning_rate": 0.000516673120502219, + "loss": 1.412, + "step": 5563 + }, + { + "epoch": 0.5048085646888042, + "grad_norm": 0.13007846212787394, + "learning_rate": 0.000516526274518051, + "loss": 1.3672, + "step": 5564 + }, + { + "epoch": 0.504899292324442, + "grad_norm": 0.12565204945604228, + "learning_rate": 0.0005163794271068368, + "loss": 1.4289, + "step": 5565 + }, + { + "epoch": 0.5049900199600799, + "grad_norm": 0.12279408616914186, + "learning_rate": 0.0005162325782812565, + "loss": 1.4009, + "step": 5566 + }, + { + "epoch": 0.5050807475957176, + "grad_norm": 0.13003139615084763, + "learning_rate": 0.0005160857280539909, + "loss": 1.399, + "step": 5567 + }, + { + "epoch": 0.5051714752313554, + "grad_norm": 0.12927082883399135, + "learning_rate": 0.0005159388764377202, + "loss": 1.3918, + "step": 5568 + }, + { + "epoch": 0.5052622028669933, + "grad_norm": 0.12084639021662591, + "learning_rate": 0.0005157920234451251, + "loss": 1.4054, + "step": 5569 + }, + { + "epoch": 0.5053529305026311, + "grad_norm": 0.12384514798096447, + "learning_rate": 0.0005156451690888866, + "loss": 1.4039, + "step": 5570 + }, + { + "epoch": 0.505443658138269, + "grad_norm": 0.12755679079689605, + "learning_rate": 0.0005154983133816853, + "loss": 1.4282, + "step": 5571 + }, 
+ { + "epoch": 0.5055343857739067, + "grad_norm": 0.16831615221306448, + "learning_rate": 0.0005153514563362025, + "loss": 1.3755, + "step": 5572 + }, + { + "epoch": 0.5056251134095445, + "grad_norm": 0.1307710291340512, + "learning_rate": 0.000515204597965119, + "loss": 1.4012, + "step": 5573 + }, + { + "epoch": 0.5057158410451824, + "grad_norm": 0.11961922731894666, + "learning_rate": 0.0005150577382811164, + "loss": 1.3889, + "step": 5574 + }, + { + "epoch": 0.5058065686808202, + "grad_norm": 0.13918033430457608, + "learning_rate": 0.0005149108772968758, + "loss": 1.4078, + "step": 5575 + }, + { + "epoch": 0.505897296316458, + "grad_norm": 0.13068796926442985, + "learning_rate": 0.0005147640150250787, + "loss": 1.4226, + "step": 5576 + }, + { + "epoch": 0.5059880239520959, + "grad_norm": 0.12653629975633524, + "learning_rate": 0.000514617151478407, + "loss": 1.4259, + "step": 5577 + }, + { + "epoch": 0.5060787515877336, + "grad_norm": 0.12190180402180777, + "learning_rate": 0.0005144702866695417, + "loss": 1.3827, + "step": 5578 + }, + { + "epoch": 0.5061694792233714, + "grad_norm": 0.1162913956638854, + "learning_rate": 0.0005143234206111654, + "loss": 1.4318, + "step": 5579 + }, + { + "epoch": 0.5062602068590093, + "grad_norm": 0.13756110515828165, + "learning_rate": 0.0005141765533159592, + "loss": 1.4115, + "step": 5580 + }, + { + "epoch": 0.5063509344946471, + "grad_norm": 0.12108454958747807, + "learning_rate": 0.0005140296847966058, + "loss": 1.4225, + "step": 5581 + }, + { + "epoch": 0.5064416621302849, + "grad_norm": 0.11962879421654617, + "learning_rate": 0.000513882815065787, + "loss": 1.3743, + "step": 5582 + }, + { + "epoch": 0.5065323897659227, + "grad_norm": 0.13336445812598355, + "learning_rate": 0.0005137359441361851, + "loss": 1.3942, + "step": 5583 + }, + { + "epoch": 0.5066231174015605, + "grad_norm": 0.12235215537097385, + "learning_rate": 0.0005135890720204824, + "loss": 1.4001, + "step": 5584 + }, + { + "epoch": 0.5067138450371983, + "grad_norm": 0.1332951619749752, + "learning_rate": 0.0005134421987313613, + "loss": 1.4108, + "step": 5585 + }, + { + "epoch": 0.5068045726728362, + "grad_norm": 0.36934568026994996, + "learning_rate": 0.0005132953242815043, + "loss": 1.4118, + "step": 5586 + }, + { + "epoch": 0.506895300308474, + "grad_norm": 0.13229609728511205, + "learning_rate": 0.0005131484486835942, + "loss": 1.4066, + "step": 5587 + }, + { + "epoch": 0.5069860279441117, + "grad_norm": 0.13658092059719443, + "learning_rate": 0.0005130015719503136, + "loss": 1.4203, + "step": 5588 + }, + { + "epoch": 0.5070767555797496, + "grad_norm": 0.12664057511175864, + "learning_rate": 0.0005128546940943453, + "loss": 1.3826, + "step": 5589 + }, + { + "epoch": 0.5071674832153874, + "grad_norm": 0.12686456202677893, + "learning_rate": 0.0005127078151283723, + "loss": 1.4076, + "step": 5590 + }, + { + "epoch": 0.5072582108510252, + "grad_norm": 0.12121549285525252, + "learning_rate": 0.0005125609350650776, + "loss": 1.396, + "step": 5591 + }, + { + "epoch": 0.5073489384866631, + "grad_norm": 0.1212437585868492, + "learning_rate": 0.0005124140539171444, + "loss": 1.3799, + "step": 5592 + }, + { + "epoch": 0.5074396661223008, + "grad_norm": 0.12011536472182814, + "learning_rate": 0.0005122671716972555, + "loss": 1.4055, + "step": 5593 + }, + { + "epoch": 0.5075303937579386, + "grad_norm": 0.14074906791581793, + "learning_rate": 0.0005121202884180949, + "loss": 1.3881, + "step": 5594 + }, + { + "epoch": 0.5076211213935765, + "grad_norm": 0.12665292593032418, + "learning_rate": 
0.0005119734040923454, + "loss": 1.3946, + "step": 5595 + }, + { + "epoch": 0.5077118490292143, + "grad_norm": 0.15083170354339923, + "learning_rate": 0.0005118265187326908, + "loss": 1.4002, + "step": 5596 + }, + { + "epoch": 0.5078025766648521, + "grad_norm": 0.12296330756379105, + "learning_rate": 0.0005116796323518144, + "loss": 1.4272, + "step": 5597 + }, + { + "epoch": 0.50789330430049, + "grad_norm": 0.12135519244055276, + "learning_rate": 0.0005115327449624, + "loss": 1.3983, + "step": 5598 + }, + { + "epoch": 0.5079840319361277, + "grad_norm": 0.12008541161519466, + "learning_rate": 0.0005113858565771317, + "loss": 1.3874, + "step": 5599 + }, + { + "epoch": 0.5080747595717655, + "grad_norm": 0.15842032915148527, + "learning_rate": 0.0005112389672086927, + "loss": 1.411, + "step": 5600 + }, + { + "epoch": 0.5081654872074034, + "grad_norm": 0.1260425010380979, + "learning_rate": 0.0005110920768697674, + "loss": 1.426, + "step": 5601 + }, + { + "epoch": 0.5082562148430412, + "grad_norm": 0.1266696118298083, + "learning_rate": 0.0005109451855730394, + "loss": 1.4228, + "step": 5602 + }, + { + "epoch": 0.508346942478679, + "grad_norm": 0.1310712388845473, + "learning_rate": 0.0005107982933311931, + "loss": 1.4248, + "step": 5603 + }, + { + "epoch": 0.5084376701143168, + "grad_norm": 0.12915875915687947, + "learning_rate": 0.0005106514001569128, + "loss": 1.4278, + "step": 5604 + }, + { + "epoch": 0.5085283977499546, + "grad_norm": 0.1297664832398101, + "learning_rate": 0.0005105045060628822, + "loss": 1.3866, + "step": 5605 + }, + { + "epoch": 0.5086191253855924, + "grad_norm": 0.12159313958828732, + "learning_rate": 0.0005103576110617863, + "loss": 1.4455, + "step": 5606 + }, + { + "epoch": 0.5087098530212303, + "grad_norm": 0.14441223193065245, + "learning_rate": 0.0005102107151663089, + "loss": 1.3966, + "step": 5607 + }, + { + "epoch": 0.5088005806568681, + "grad_norm": 0.12712502408280107, + "learning_rate": 0.0005100638183891347, + "loss": 1.4362, + "step": 5608 + }, + { + "epoch": 0.508891308292506, + "grad_norm": 0.16438429301462007, + "learning_rate": 0.0005099169207429485, + "loss": 1.4043, + "step": 5609 + }, + { + "epoch": 0.5089820359281437, + "grad_norm": 0.14124781107871745, + "learning_rate": 0.0005097700222404345, + "loss": 1.4158, + "step": 5610 + }, + { + "epoch": 0.5090727635637815, + "grad_norm": 0.13158625438237875, + "learning_rate": 0.0005096231228942779, + "loss": 1.4357, + "step": 5611 + }, + { + "epoch": 0.5091634911994194, + "grad_norm": 0.13314751431838737, + "learning_rate": 0.0005094762227171632, + "loss": 1.435, + "step": 5612 + }, + { + "epoch": 0.5092542188350572, + "grad_norm": 0.12438505161800914, + "learning_rate": 0.0005093293217217752, + "loss": 1.3831, + "step": 5613 + }, + { + "epoch": 0.509344946470695, + "grad_norm": 0.11875237542574023, + "learning_rate": 0.0005091824199207987, + "loss": 1.3858, + "step": 5614 + }, + { + "epoch": 0.5094356741063328, + "grad_norm": 0.13234140070222383, + "learning_rate": 0.0005090355173269191, + "loss": 1.3922, + "step": 5615 + }, + { + "epoch": 0.5095264017419706, + "grad_norm": 0.14545135334922105, + "learning_rate": 0.0005088886139528213, + "loss": 1.3801, + "step": 5616 + }, + { + "epoch": 0.5096171293776084, + "grad_norm": 0.11769590341087946, + "learning_rate": 0.0005087417098111901, + "loss": 1.3794, + "step": 5617 + }, + { + "epoch": 0.5097078570132463, + "grad_norm": 0.12449181767361932, + "learning_rate": 0.0005085948049147113, + "loss": 1.3973, + "step": 5618 + }, + { + "epoch": 0.5097985846488841, 
+ "grad_norm": 0.1313460333068028, + "learning_rate": 0.0005084478992760697, + "loss": 1.4148, + "step": 5619 + }, + { + "epoch": 0.5098893122845218, + "grad_norm": 0.12817244512095347, + "learning_rate": 0.0005083009929079505, + "loss": 1.3939, + "step": 5620 + }, + { + "epoch": 0.5099800399201597, + "grad_norm": 0.12929634905066653, + "learning_rate": 0.0005081540858230397, + "loss": 1.4046, + "step": 5621 + }, + { + "epoch": 0.5100707675557975, + "grad_norm": 0.1220355299530685, + "learning_rate": 0.0005080071780340223, + "loss": 1.4105, + "step": 5622 + }, + { + "epoch": 0.5101614951914353, + "grad_norm": 0.14753561031214765, + "learning_rate": 0.0005078602695535837, + "loss": 1.4256, + "step": 5623 + }, + { + "epoch": 0.5102522228270732, + "grad_norm": 0.1206760541839519, + "learning_rate": 0.0005077133603944099, + "loss": 1.3868, + "step": 5624 + }, + { + "epoch": 0.5103429504627109, + "grad_norm": 0.13305607773814, + "learning_rate": 0.0005075664505691861, + "loss": 1.3881, + "step": 5625 + }, + { + "epoch": 0.5104336780983487, + "grad_norm": 0.12360697807933067, + "learning_rate": 0.0005074195400905984, + "loss": 1.3985, + "step": 5626 + }, + { + "epoch": 0.5105244057339866, + "grad_norm": 0.12241935448940083, + "learning_rate": 0.0005072726289713323, + "loss": 1.3828, + "step": 5627 + }, + { + "epoch": 0.5106151333696244, + "grad_norm": 0.1409183660035511, + "learning_rate": 0.0005071257172240735, + "loss": 1.3961, + "step": 5628 + }, + { + "epoch": 0.5107058610052622, + "grad_norm": 0.12443118416809491, + "learning_rate": 0.0005069788048615079, + "loss": 1.4196, + "step": 5629 + }, + { + "epoch": 0.5107965886409, + "grad_norm": 0.12943968538139333, + "learning_rate": 0.0005068318918963216, + "loss": 1.4124, + "step": 5630 + }, + { + "epoch": 0.5108873162765378, + "grad_norm": 0.12587977129829483, + "learning_rate": 0.0005066849783412006, + "loss": 1.4175, + "step": 5631 + }, + { + "epoch": 0.5109780439121756, + "grad_norm": 0.1288303101770681, + "learning_rate": 0.0005065380642088303, + "loss": 1.3699, + "step": 5632 + }, + { + "epoch": 0.5110687715478135, + "grad_norm": 0.123936580454712, + "learning_rate": 0.0005063911495118975, + "loss": 1.4262, + "step": 5633 + }, + { + "epoch": 0.5111594991834513, + "grad_norm": 0.13969945713460247, + "learning_rate": 0.0005062442342630879, + "loss": 1.4374, + "step": 5634 + }, + { + "epoch": 0.511250226819089, + "grad_norm": 0.12834112547896648, + "learning_rate": 0.0005060973184750877, + "loss": 1.3812, + "step": 5635 + }, + { + "epoch": 0.5113409544547269, + "grad_norm": 0.14642454007030128, + "learning_rate": 0.0005059504021605832, + "loss": 1.3877, + "step": 5636 + }, + { + "epoch": 0.5114316820903647, + "grad_norm": 0.12661237803695435, + "learning_rate": 0.0005058034853322606, + "loss": 1.3883, + "step": 5637 + }, + { + "epoch": 0.5115224097260025, + "grad_norm": 0.1337108782535366, + "learning_rate": 0.0005056565680028063, + "loss": 1.4051, + "step": 5638 + }, + { + "epoch": 0.5116131373616404, + "grad_norm": 0.12866944072756137, + "learning_rate": 0.0005055096501849065, + "loss": 1.3796, + "step": 5639 + }, + { + "epoch": 0.5117038649972782, + "grad_norm": 0.12298394236001284, + "learning_rate": 0.0005053627318912475, + "loss": 1.4248, + "step": 5640 + }, + { + "epoch": 0.5117945926329159, + "grad_norm": 0.12130299192970016, + "learning_rate": 0.0005052158131345157, + "loss": 1.3724, + "step": 5641 + }, + { + "epoch": 0.5118853202685538, + "grad_norm": 0.1275253772548249, + "learning_rate": 0.0005050688939273978, + "loss": 1.4408, + 
"step": 5642 + }, + { + "epoch": 0.5119760479041916, + "grad_norm": 0.1529045353065797, + "learning_rate": 0.0005049219742825803, + "loss": 1.4212, + "step": 5643 + }, + { + "epoch": 0.5120667755398294, + "grad_norm": 0.13318413588614572, + "learning_rate": 0.0005047750542127493, + "loss": 1.393, + "step": 5644 + }, + { + "epoch": 0.5121575031754673, + "grad_norm": 0.12562993446661924, + "learning_rate": 0.0005046281337305919, + "loss": 1.3977, + "step": 5645 + }, + { + "epoch": 0.512248230811105, + "grad_norm": 0.12795004456379086, + "learning_rate": 0.0005044812128487943, + "loss": 1.4042, + "step": 5646 + }, + { + "epoch": 0.5123389584467429, + "grad_norm": 0.14340463908970763, + "learning_rate": 0.0005043342915800434, + "loss": 1.4182, + "step": 5647 + }, + { + "epoch": 0.5124296860823807, + "grad_norm": 0.12707799377163814, + "learning_rate": 0.0005041873699370258, + "loss": 1.4173, + "step": 5648 + }, + { + "epoch": 0.5125204137180185, + "grad_norm": 0.15761652341926818, + "learning_rate": 0.0005040404479324282, + "loss": 1.4016, + "step": 5649 + }, + { + "epoch": 0.5126111413536564, + "grad_norm": 0.1318137934720492, + "learning_rate": 0.0005038935255789373, + "loss": 1.4242, + "step": 5650 + }, + { + "epoch": 0.5127018689892942, + "grad_norm": 0.12987240605970196, + "learning_rate": 0.00050374660288924, + "loss": 1.3941, + "step": 5651 + }, + { + "epoch": 0.5127925966249319, + "grad_norm": 0.13601555929027806, + "learning_rate": 0.0005035996798760227, + "loss": 1.3842, + "step": 5652 + }, + { + "epoch": 0.5128833242605698, + "grad_norm": 0.12800230992983355, + "learning_rate": 0.0005034527565519728, + "loss": 1.461, + "step": 5653 + }, + { + "epoch": 0.5129740518962076, + "grad_norm": 0.1847488353230749, + "learning_rate": 0.0005033058329297767, + "loss": 1.4339, + "step": 5654 + }, + { + "epoch": 0.5130647795318454, + "grad_norm": 0.14447323904031195, + "learning_rate": 0.0005031589090221215, + "loss": 1.3855, + "step": 5655 + }, + { + "epoch": 0.5131555071674833, + "grad_norm": 0.13725693073630058, + "learning_rate": 0.000503011984841694, + "loss": 1.4149, + "step": 5656 + }, + { + "epoch": 0.513246234803121, + "grad_norm": 0.1347366226700541, + "learning_rate": 0.0005028650604011813, + "loss": 1.4039, + "step": 5657 + }, + { + "epoch": 0.5133369624387588, + "grad_norm": 0.12672543445960857, + "learning_rate": 0.00050271813571327, + "loss": 1.4236, + "step": 5658 + }, + { + "epoch": 0.5134276900743967, + "grad_norm": 0.14923497758312532, + "learning_rate": 0.0005025712107906472, + "loss": 1.4224, + "step": 5659 + }, + { + "epoch": 0.5135184177100345, + "grad_norm": 0.1474665181185624, + "learning_rate": 0.0005024242856460002, + "loss": 1.387, + "step": 5660 + }, + { + "epoch": 0.5136091453456723, + "grad_norm": 0.2002505772733517, + "learning_rate": 0.0005022773602920159, + "loss": 1.4112, + "step": 5661 + }, + { + "epoch": 0.5136998729813101, + "grad_norm": 0.13521820179838173, + "learning_rate": 0.000502130434741381, + "loss": 1.3827, + "step": 5662 + }, + { + "epoch": 0.5137906006169479, + "grad_norm": 0.12749280524087722, + "learning_rate": 0.0005019835090067827, + "loss": 1.4004, + "step": 5663 + }, + { + "epoch": 0.5138813282525857, + "grad_norm": 0.16210997113974895, + "learning_rate": 0.0005018365831009081, + "loss": 1.3993, + "step": 5664 + }, + { + "epoch": 0.5139720558882236, + "grad_norm": 0.1374152470712655, + "learning_rate": 0.0005016896570364445, + "loss": 1.3985, + "step": 5665 + }, + { + "epoch": 0.5140627835238614, + "grad_norm": 0.12472682374742389, + 
"learning_rate": 0.0005015427308260787, + "loss": 1.3753, + "step": 5666 + }, + { + "epoch": 0.5141535111594991, + "grad_norm": 0.12813636866276265, + "learning_rate": 0.0005013958044824977, + "loss": 1.4271, + "step": 5667 + }, + { + "epoch": 0.514244238795137, + "grad_norm": 0.12239323722070879, + "learning_rate": 0.0005012488780183889, + "loss": 1.4389, + "step": 5668 + }, + { + "epoch": 0.5143349664307748, + "grad_norm": 0.12266695972356781, + "learning_rate": 0.0005011019514464393, + "loss": 1.3789, + "step": 5669 + }, + { + "epoch": 0.5144256940664126, + "grad_norm": 0.12103397576002518, + "learning_rate": 0.0005009550247793361, + "loss": 1.4186, + "step": 5670 + }, + { + "epoch": 0.5145164217020505, + "grad_norm": 0.13632396947613895, + "learning_rate": 0.0005008080980297661, + "loss": 1.4059, + "step": 5671 + }, + { + "epoch": 0.5146071493376883, + "grad_norm": 0.13931276250349278, + "learning_rate": 0.000500661171210417, + "loss": 1.4254, + "step": 5672 + }, + { + "epoch": 0.514697876973326, + "grad_norm": 0.1781398468803306, + "learning_rate": 0.0005005142443339755, + "loss": 1.3863, + "step": 5673 + }, + { + "epoch": 0.5147886046089639, + "grad_norm": 0.125101390108183, + "learning_rate": 0.000500367317413129, + "loss": 1.3994, + "step": 5674 + }, + { + "epoch": 0.5148793322446017, + "grad_norm": 0.13182294552727766, + "learning_rate": 0.0005002203904605646, + "loss": 1.4331, + "step": 5675 + }, + { + "epoch": 0.5149700598802395, + "grad_norm": 0.12193274695015989, + "learning_rate": 0.0005000734634889694, + "loss": 1.4124, + "step": 5676 + }, + { + "epoch": 0.5150607875158774, + "grad_norm": 0.13069552840008533, + "learning_rate": 0.0004999265365110306, + "loss": 1.426, + "step": 5677 + }, + { + "epoch": 0.5151515151515151, + "grad_norm": 0.13372308618187415, + "learning_rate": 0.0004997796095394354, + "loss": 1.4072, + "step": 5678 + }, + { + "epoch": 0.5152422427871529, + "grad_norm": 0.1286301727705654, + "learning_rate": 0.0004996326825868711, + "loss": 1.3748, + "step": 5679 + }, + { + "epoch": 0.5153329704227908, + "grad_norm": 0.134136290648615, + "learning_rate": 0.0004994857556660246, + "loss": 1.3658, + "step": 5680 + }, + { + "epoch": 0.5154236980584286, + "grad_norm": 0.12298414228197183, + "learning_rate": 0.0004993388287895831, + "loss": 1.4138, + "step": 5681 + }, + { + "epoch": 0.5155144256940664, + "grad_norm": 0.12155572476118097, + "learning_rate": 0.0004991919019702339, + "loss": 1.391, + "step": 5682 + }, + { + "epoch": 0.5156051533297042, + "grad_norm": 0.14614891257429682, + "learning_rate": 0.0004990449752206641, + "loss": 1.3662, + "step": 5683 + }, + { + "epoch": 0.515695880965342, + "grad_norm": 0.12115550067147958, + "learning_rate": 0.0004988980485535608, + "loss": 1.4079, + "step": 5684 + }, + { + "epoch": 0.5157866086009799, + "grad_norm": 0.1752715971842609, + "learning_rate": 0.0004987511219816111, + "loss": 1.3899, + "step": 5685 + }, + { + "epoch": 0.5158773362366177, + "grad_norm": 0.1294935721157307, + "learning_rate": 0.0004986041955175023, + "loss": 1.4019, + "step": 5686 + }, + { + "epoch": 0.5159680638722555, + "grad_norm": 0.12816288858001237, + "learning_rate": 0.0004984572691739213, + "loss": 1.4141, + "step": 5687 + }, + { + "epoch": 0.5160587915078934, + "grad_norm": 0.12404354423957378, + "learning_rate": 0.0004983103429635555, + "loss": 1.404, + "step": 5688 + }, + { + "epoch": 0.5161495191435311, + "grad_norm": 0.12102353466389609, + "learning_rate": 0.0004981634168990919, + "loss": 1.4378, + "step": 5689 + }, + { + "epoch": 
0.5162402467791689, + "grad_norm": 0.1292669728440287, + "learning_rate": 0.0004980164909932173, + "loss": 1.4571, + "step": 5690 + }, + { + "epoch": 0.5163309744148068, + "grad_norm": 0.13030394604073692, + "learning_rate": 0.0004978695652586191, + "loss": 1.3868, + "step": 5691 + }, + { + "epoch": 0.5164217020504446, + "grad_norm": 0.1275193248857212, + "learning_rate": 0.0004977226397079843, + "loss": 1.4312, + "step": 5692 + }, + { + "epoch": 0.5165124296860824, + "grad_norm": 0.12840681535496626, + "learning_rate": 0.0004975757143539997, + "loss": 1.3701, + "step": 5693 + }, + { + "epoch": 0.5166031573217202, + "grad_norm": 0.12813898424130205, + "learning_rate": 0.0004974287892093527, + "loss": 1.4248, + "step": 5694 + }, + { + "epoch": 0.516693884957358, + "grad_norm": 0.1342664340589111, + "learning_rate": 0.0004972818642867301, + "loss": 1.4295, + "step": 5695 + }, + { + "epoch": 0.5167846125929958, + "grad_norm": 0.12228929453119847, + "learning_rate": 0.0004971349395988189, + "loss": 1.3975, + "step": 5696 + }, + { + "epoch": 0.5168753402286337, + "grad_norm": 0.19798822777904487, + "learning_rate": 0.0004969880151583061, + "loss": 1.422, + "step": 5697 + }, + { + "epoch": 0.5169660678642715, + "grad_norm": 0.13113402029619958, + "learning_rate": 0.0004968410909778786, + "loss": 1.3898, + "step": 5698 + }, + { + "epoch": 0.5170567954999092, + "grad_norm": 0.12608926645959548, + "learning_rate": 0.0004966941670702235, + "loss": 1.389, + "step": 5699 + }, + { + "epoch": 0.5171475231355471, + "grad_norm": 0.12042279278061532, + "learning_rate": 0.0004965472434480272, + "loss": 1.4236, + "step": 5700 + }, + { + "epoch": 0.5172382507711849, + "grad_norm": 0.1170364261538629, + "learning_rate": 0.0004964003201239773, + "loss": 1.4057, + "step": 5701 + }, + { + "epoch": 0.5173289784068227, + "grad_norm": 0.1328613122347989, + "learning_rate": 0.0004962533971107602, + "loss": 1.3951, + "step": 5702 + }, + { + "epoch": 0.5174197060424606, + "grad_norm": 0.1342907873624565, + "learning_rate": 0.0004961064744210627, + "loss": 1.3909, + "step": 5703 + }, + { + "epoch": 0.5175104336780983, + "grad_norm": 0.1255470070014109, + "learning_rate": 0.0004959595520675719, + "loss": 1.3734, + "step": 5704 + }, + { + "epoch": 0.5176011613137361, + "grad_norm": 0.12728912543238782, + "learning_rate": 0.0004958126300629743, + "loss": 1.413, + "step": 5705 + }, + { + "epoch": 0.517691888949374, + "grad_norm": 0.1478588606951643, + "learning_rate": 0.0004956657084199567, + "loss": 1.3843, + "step": 5706 + }, + { + "epoch": 0.5177826165850118, + "grad_norm": 0.11451318822239362, + "learning_rate": 0.0004955187871512058, + "loss": 1.384, + "step": 5707 + }, + { + "epoch": 0.5178733442206496, + "grad_norm": 0.12163044405957954, + "learning_rate": 0.0004953718662694082, + "loss": 1.3791, + "step": 5708 + }, + { + "epoch": 0.5179640718562875, + "grad_norm": 0.12871249216252492, + "learning_rate": 0.0004952249457872507, + "loss": 1.421, + "step": 5709 + }, + { + "epoch": 0.5180547994919252, + "grad_norm": 0.12506172054588394, + "learning_rate": 0.0004950780257174199, + "loss": 1.4298, + "step": 5710 + }, + { + "epoch": 0.518145527127563, + "grad_norm": 0.241595204406681, + "learning_rate": 0.0004949311060726023, + "loss": 1.4063, + "step": 5711 + }, + { + "epoch": 0.5182362547632009, + "grad_norm": 0.1265433005783059, + "learning_rate": 0.0004947841868654843, + "loss": 1.3888, + "step": 5712 + }, + { + "epoch": 0.5183269823988387, + "grad_norm": 0.12007115404311748, + "learning_rate": 0.0004946372681087527, + 
"loss": 1.4072, + "step": 5713 + }, + { + "epoch": 0.5184177100344765, + "grad_norm": 0.13408800606179155, + "learning_rate": 0.0004944903498150938, + "loss": 1.3773, + "step": 5714 + }, + { + "epoch": 0.5185084376701143, + "grad_norm": 0.13665963414520044, + "learning_rate": 0.0004943434319971938, + "loss": 1.3879, + "step": 5715 + }, + { + "epoch": 0.5185991653057521, + "grad_norm": 0.12181328489358687, + "learning_rate": 0.0004941965146677393, + "loss": 1.3874, + "step": 5716 + }, + { + "epoch": 0.5186898929413899, + "grad_norm": 0.11665439622404469, + "learning_rate": 0.0004940495978394167, + "loss": 1.3938, + "step": 5717 + }, + { + "epoch": 0.5187806205770278, + "grad_norm": 0.12301135624533946, + "learning_rate": 0.0004939026815249124, + "loss": 1.402, + "step": 5718 + }, + { + "epoch": 0.5188713482126656, + "grad_norm": 0.16479785536578026, + "learning_rate": 0.0004937557657369122, + "loss": 1.3783, + "step": 5719 + }, + { + "epoch": 0.5189620758483033, + "grad_norm": 0.1221335777638926, + "learning_rate": 0.0004936088504881026, + "loss": 1.3809, + "step": 5720 + }, + { + "epoch": 0.5190528034839412, + "grad_norm": 0.11611626470250891, + "learning_rate": 0.0004934619357911699, + "loss": 1.4286, + "step": 5721 + }, + { + "epoch": 0.519143531119579, + "grad_norm": 0.13418783756592276, + "learning_rate": 0.0004933150216587997, + "loss": 1.3976, + "step": 5722 + }, + { + "epoch": 0.5192342587552169, + "grad_norm": 0.11704208366062868, + "learning_rate": 0.0004931681081036786, + "loss": 1.3976, + "step": 5723 + }, + { + "epoch": 0.5193249863908547, + "grad_norm": 0.14029154592341855, + "learning_rate": 0.000493021195138492, + "loss": 1.4126, + "step": 5724 + }, + { + "epoch": 0.5194157140264924, + "grad_norm": 0.1306740631866222, + "learning_rate": 0.0004928742827759266, + "loss": 1.3968, + "step": 5725 + }, + { + "epoch": 0.5195064416621303, + "grad_norm": 0.13694941107266545, + "learning_rate": 0.0004927273710286678, + "loss": 1.3847, + "step": 5726 + }, + { + "epoch": 0.5195971692977681, + "grad_norm": 0.122302846597522, + "learning_rate": 0.0004925804599094016, + "loss": 1.4279, + "step": 5727 + }, + { + "epoch": 0.5196878969334059, + "grad_norm": 0.12160985511972522, + "learning_rate": 0.0004924335494308139, + "loss": 1.4007, + "step": 5728 + }, + { + "epoch": 0.5197786245690438, + "grad_norm": 0.17599893004765432, + "learning_rate": 0.0004922866396055902, + "loss": 1.4166, + "step": 5729 + }, + { + "epoch": 0.5198693522046816, + "grad_norm": 0.11936924401745827, + "learning_rate": 0.0004921397304464163, + "loss": 1.3735, + "step": 5730 + }, + { + "epoch": 0.5199600798403193, + "grad_norm": 0.12579616929154516, + "learning_rate": 0.0004919928219659779, + "loss": 1.3842, + "step": 5731 + }, + { + "epoch": 0.5200508074759572, + "grad_norm": 0.12458424479548455, + "learning_rate": 0.0004918459141769602, + "loss": 1.4037, + "step": 5732 + }, + { + "epoch": 0.520141535111595, + "grad_norm": 0.12530366870334386, + "learning_rate": 0.0004916990070920495, + "loss": 1.4144, + "step": 5733 + }, + { + "epoch": 0.5202322627472328, + "grad_norm": 0.21276015102979048, + "learning_rate": 0.0004915521007239304, + "loss": 1.4164, + "step": 5734 + }, + { + "epoch": 0.5203229903828707, + "grad_norm": 0.12980304431773476, + "learning_rate": 0.0004914051950852888, + "loss": 1.4151, + "step": 5735 + }, + { + "epoch": 0.5204137180185084, + "grad_norm": 0.14122522490010508, + "learning_rate": 0.0004912582901888099, + "loss": 1.4032, + "step": 5736 + }, + { + "epoch": 0.5205044456541462, + "grad_norm": 
0.18000001605360907, + "learning_rate": 0.0004911113860471788, + "loss": 1.3744, + "step": 5737 + }, + { + "epoch": 0.5205951732897841, + "grad_norm": 0.12163963367450313, + "learning_rate": 0.0004909644826730811, + "loss": 1.3787, + "step": 5738 + }, + { + "epoch": 0.5206859009254219, + "grad_norm": 0.12451599710868899, + "learning_rate": 0.0004908175800792012, + "loss": 1.3964, + "step": 5739 + }, + { + "epoch": 0.5207766285610597, + "grad_norm": 0.1352533388854544, + "learning_rate": 0.0004906706782782249, + "loss": 1.3715, + "step": 5740 + }, + { + "epoch": 0.5208673561966976, + "grad_norm": 0.1274949028024298, + "learning_rate": 0.0004905237772828369, + "loss": 1.3852, + "step": 5741 + }, + { + "epoch": 0.5209580838323353, + "grad_norm": 0.12735013544221796, + "learning_rate": 0.0004903768771057221, + "loss": 1.4292, + "step": 5742 + }, + { + "epoch": 0.5210488114679731, + "grad_norm": 0.1333301320402461, + "learning_rate": 0.0004902299777595655, + "loss": 1.4084, + "step": 5743 + }, + { + "epoch": 0.521139539103611, + "grad_norm": 0.12317250267719076, + "learning_rate": 0.0004900830792570515, + "loss": 1.4134, + "step": 5744 + }, + { + "epoch": 0.5212302667392488, + "grad_norm": 0.14294998434325107, + "learning_rate": 0.0004899361816108653, + "loss": 1.3989, + "step": 5745 + }, + { + "epoch": 0.5213209943748865, + "grad_norm": 0.13453913942738077, + "learning_rate": 0.0004897892848336914, + "loss": 1.3979, + "step": 5746 + }, + { + "epoch": 0.5214117220105244, + "grad_norm": 0.12548558160121492, + "learning_rate": 0.0004896423889382139, + "loss": 1.4088, + "step": 5747 + }, + { + "epoch": 0.5215024496461622, + "grad_norm": 0.11813626173398402, + "learning_rate": 0.0004894954939371178, + "loss": 1.4235, + "step": 5748 + }, + { + "epoch": 0.5215931772818, + "grad_norm": 0.12224045606263757, + "learning_rate": 0.0004893485998430873, + "loss": 1.4109, + "step": 5749 + }, + { + "epoch": 0.5216839049174379, + "grad_norm": 0.12308379964925795, + "learning_rate": 0.0004892017066688069, + "loss": 1.3954, + "step": 5750 + }, + { + "epoch": 0.5217746325530757, + "grad_norm": 0.12895248411737056, + "learning_rate": 0.0004890548144269606, + "loss": 1.3698, + "step": 5751 + }, + { + "epoch": 0.5218653601887134, + "grad_norm": 0.11669378723776914, + "learning_rate": 0.0004889079231302328, + "loss": 1.3847, + "step": 5752 + }, + { + "epoch": 0.5219560878243513, + "grad_norm": 0.12538980525510945, + "learning_rate": 0.0004887610327913075, + "loss": 1.3439, + "step": 5753 + }, + { + "epoch": 0.5220468154599891, + "grad_norm": 0.12695272971093538, + "learning_rate": 0.0004886141434228686, + "loss": 1.4211, + "step": 5754 + }, + { + "epoch": 0.5221375430956269, + "grad_norm": 0.17157956015989223, + "learning_rate": 0.0004884672550375999, + "loss": 1.4058, + "step": 5755 + }, + { + "epoch": 0.5222282707312648, + "grad_norm": 0.13632299868727954, + "learning_rate": 0.0004883203676481856, + "loss": 1.3995, + "step": 5756 + }, + { + "epoch": 0.5223189983669025, + "grad_norm": 0.12347399956553778, + "learning_rate": 0.00048817348126730943, + "loss": 1.4176, + "step": 5757 + }, + { + "epoch": 0.5224097260025403, + "grad_norm": 0.11935482659446661, + "learning_rate": 0.0004880265959076547, + "loss": 1.3928, + "step": 5758 + }, + { + "epoch": 0.5225004536381782, + "grad_norm": 0.11868316963911296, + "learning_rate": 0.00048787971158190525, + "loss": 1.4303, + "step": 5759 + }, + { + "epoch": 0.522591181273816, + "grad_norm": 0.1490580861310047, + "learning_rate": 0.00048773282830274453, + "loss": 1.4149, + 
"step": 5760 + }, + { + "epoch": 0.5226819089094539, + "grad_norm": 0.12449018791970413, + "learning_rate": 0.00048758594608285584, + "loss": 1.3862, + "step": 5761 + }, + { + "epoch": 0.5227726365450917, + "grad_norm": 0.12192432665702423, + "learning_rate": 0.0004874390649349225, + "loss": 1.3986, + "step": 5762 + }, + { + "epoch": 0.5228633641807294, + "grad_norm": 0.1171334333031858, + "learning_rate": 0.0004872921848716276, + "loss": 1.3986, + "step": 5763 + }, + { + "epoch": 0.5229540918163673, + "grad_norm": 0.12150151178002619, + "learning_rate": 0.0004871453059056548, + "loss": 1.3971, + "step": 5764 + }, + { + "epoch": 0.5230448194520051, + "grad_norm": 0.12031622678537134, + "learning_rate": 0.00048699842804968645, + "loss": 1.3667, + "step": 5765 + }, + { + "epoch": 0.5231355470876429, + "grad_norm": 0.14436007822741023, + "learning_rate": 0.0004868515513164058, + "loss": 1.4123, + "step": 5766 + }, + { + "epoch": 0.5232262747232808, + "grad_norm": 0.12212239485158459, + "learning_rate": 0.00048670467571849577, + "loss": 1.4092, + "step": 5767 + }, + { + "epoch": 0.5233170023589185, + "grad_norm": 0.14220214035056808, + "learning_rate": 0.0004865578012686388, + "loss": 1.3798, + "step": 5768 + }, + { + "epoch": 0.5234077299945563, + "grad_norm": 0.12380712886565984, + "learning_rate": 0.0004864109279795177, + "loss": 1.4366, + "step": 5769 + }, + { + "epoch": 0.5234984576301942, + "grad_norm": 0.1344983307148703, + "learning_rate": 0.0004862640558638151, + "loss": 1.4218, + "step": 5770 + }, + { + "epoch": 0.523589185265832, + "grad_norm": 0.14719046543574887, + "learning_rate": 0.0004861171849342129, + "loss": 1.4213, + "step": 5771 + }, + { + "epoch": 0.5236799129014698, + "grad_norm": 0.12048331535259682, + "learning_rate": 0.00048597031520339427, + "loss": 1.3931, + "step": 5772 + }, + { + "epoch": 0.5237706405371076, + "grad_norm": 0.13066230656269576, + "learning_rate": 0.00048582344668404075, + "loss": 1.3932, + "step": 5773 + }, + { + "epoch": 0.5238613681727454, + "grad_norm": 0.1283015665609527, + "learning_rate": 0.0004856765793888348, + "loss": 1.4186, + "step": 5774 + }, + { + "epoch": 0.5239520958083832, + "grad_norm": 0.12102256023567823, + "learning_rate": 0.0004855297133304584, + "loss": 1.3767, + "step": 5775 + }, + { + "epoch": 0.5240428234440211, + "grad_norm": 0.12439105924380123, + "learning_rate": 0.00048538284852159323, + "loss": 1.38, + "step": 5776 + }, + { + "epoch": 0.5241335510796589, + "grad_norm": 0.12765579743140157, + "learning_rate": 0.00048523598497492143, + "loss": 1.399, + "step": 5777 + }, + { + "epoch": 0.5242242787152966, + "grad_norm": 0.13913028968088942, + "learning_rate": 0.00048508912270312415, + "loss": 1.4024, + "step": 5778 + }, + { + "epoch": 0.5243150063509345, + "grad_norm": 0.13361212057814434, + "learning_rate": 0.0004849422617188837, + "loss": 1.4262, + "step": 5779 + }, + { + "epoch": 0.5244057339865723, + "grad_norm": 0.12288329721653425, + "learning_rate": 0.00048479540203488097, + "loss": 1.3603, + "step": 5780 + }, + { + "epoch": 0.5244964616222101, + "grad_norm": 0.13463247599572198, + "learning_rate": 0.0004846485436637976, + "loss": 1.4052, + "step": 5781 + }, + { + "epoch": 0.524587189257848, + "grad_norm": 0.13308556864232454, + "learning_rate": 0.0004845016866183148, + "loss": 1.3791, + "step": 5782 + }, + { + "epoch": 0.5246779168934858, + "grad_norm": 0.1233419827954365, + "learning_rate": 0.00048435483091111356, + "loss": 1.395, + "step": 5783 + }, + { + "epoch": 0.5247686445291235, + "grad_norm": 
0.13616733644071477, + "learning_rate": 0.0004842079765548749, + "loss": 1.4241, + "step": 5784 + }, + { + "epoch": 0.5248593721647614, + "grad_norm": 0.1388595035739773, + "learning_rate": 0.00048406112356227997, + "loss": 1.3976, + "step": 5785 + }, + { + "epoch": 0.5249500998003992, + "grad_norm": 0.12593728032205057, + "learning_rate": 0.00048391427194600917, + "loss": 1.4101, + "step": 5786 + }, + { + "epoch": 0.525040827436037, + "grad_norm": 0.1274973701791802, + "learning_rate": 0.00048376742171874344, + "loss": 1.41, + "step": 5787 + }, + { + "epoch": 0.5251315550716749, + "grad_norm": 0.13009126416357406, + "learning_rate": 0.00048362057289316323, + "loss": 1.4123, + "step": 5788 + }, + { + "epoch": 0.5252222827073126, + "grad_norm": 0.33086963550042625, + "learning_rate": 0.00048347372548194914, + "loss": 1.4082, + "step": 5789 + }, + { + "epoch": 0.5253130103429504, + "grad_norm": 0.12835474725014204, + "learning_rate": 0.0004833268794977811, + "loss": 1.4031, + "step": 5790 + }, + { + "epoch": 0.5254037379785883, + "grad_norm": 0.11287912704254874, + "learning_rate": 0.00048318003495333953, + "loss": 1.3836, + "step": 5791 + }, + { + "epoch": 0.5254944656142261, + "grad_norm": 0.121226950762352, + "learning_rate": 0.00048303319186130447, + "loss": 1.3707, + "step": 5792 + }, + { + "epoch": 0.5255851932498639, + "grad_norm": 0.6569452214133785, + "learning_rate": 0.0004828863502343557, + "loss": 1.4206, + "step": 5793 + }, + { + "epoch": 0.5256759208855017, + "grad_norm": 0.11465103361803149, + "learning_rate": 0.00048273951008517303, + "loss": 1.3836, + "step": 5794 + }, + { + "epoch": 0.5257666485211395, + "grad_norm": 0.1229853261594194, + "learning_rate": 0.00048259267142643626, + "loss": 1.3979, + "step": 5795 + }, + { + "epoch": 0.5258573761567773, + "grad_norm": 0.11954269703415149, + "learning_rate": 0.0004824458342708251, + "loss": 1.4215, + "step": 5796 + }, + { + "epoch": 0.5259481037924152, + "grad_norm": 0.1265555018249302, + "learning_rate": 0.0004822989986310185, + "loss": 1.4454, + "step": 5797 + }, + { + "epoch": 0.526038831428053, + "grad_norm": 0.11891933587108555, + "learning_rate": 0.000482152164519696, + "loss": 1.4134, + "step": 5798 + }, + { + "epoch": 0.5261295590636909, + "grad_norm": 0.17003195146164865, + "learning_rate": 0.0004820053319495369, + "loss": 1.415, + "step": 5799 + }, + { + "epoch": 0.5262202866993286, + "grad_norm": 0.14067330998481725, + "learning_rate": 0.0004818585009332199, + "loss": 1.3917, + "step": 5800 + }, + { + "epoch": 0.5263110143349664, + "grad_norm": 0.11756027515081768, + "learning_rate": 0.00048171167148342405, + "loss": 1.4103, + "step": 5801 + }, + { + "epoch": 0.5264017419706043, + "grad_norm": 0.11888377661368796, + "learning_rate": 0.0004815648436128279, + "loss": 1.4239, + "step": 5802 + }, + { + "epoch": 0.5264924696062421, + "grad_norm": 0.12673384298347304, + "learning_rate": 0.00048141801733411044, + "loss": 1.4151, + "step": 5803 + }, + { + "epoch": 0.5265831972418799, + "grad_norm": 0.13240273305449846, + "learning_rate": 0.00048127119265994986, + "loss": 1.3762, + "step": 5804 + }, + { + "epoch": 0.5266739248775177, + "grad_norm": 0.17025345247632542, + "learning_rate": 0.00048112436960302455, + "loss": 1.4054, + "step": 5805 + }, + { + "epoch": 0.5267646525131555, + "grad_norm": 0.12981070711918236, + "learning_rate": 0.0004809775481760127, + "loss": 1.3916, + "step": 5806 + }, + { + "epoch": 0.5268553801487933, + "grad_norm": 0.19023428136021728, + "learning_rate": 0.00048083072839159234, + "loss": 1.3988, 
+ "step": 5807 + }, + { + "epoch": 0.5269461077844312, + "grad_norm": 0.13431765619736796, + "learning_rate": 0.00048068391026244146, + "loss": 1.376, + "step": 5808 + }, + { + "epoch": 0.527036835420069, + "grad_norm": 0.1246512344815615, + "learning_rate": 0.00048053709380123784, + "loss": 1.416, + "step": 5809 + }, + { + "epoch": 0.5271275630557067, + "grad_norm": 0.14037583525456696, + "learning_rate": 0.00048039027902065876, + "loss": 1.4112, + "step": 5810 + }, + { + "epoch": 0.5272182906913446, + "grad_norm": 0.1358563844348915, + "learning_rate": 0.0004802434659333822, + "loss": 1.4173, + "step": 5811 + }, + { + "epoch": 0.5273090183269824, + "grad_norm": 0.1358511037235025, + "learning_rate": 0.00048009665455208523, + "loss": 1.4023, + "step": 5812 + }, + { + "epoch": 0.5273997459626202, + "grad_norm": 0.14267017110960414, + "learning_rate": 0.00047994984488944505, + "loss": 1.4375, + "step": 5813 + }, + { + "epoch": 0.5274904735982581, + "grad_norm": 0.20077921865958523, + "learning_rate": 0.00047980303695813886, + "loss": 1.406, + "step": 5814 + }, + { + "epoch": 0.5275812012338958, + "grad_norm": 0.13936050051250715, + "learning_rate": 0.00047965623077084326, + "loss": 1.4143, + "step": 5815 + }, + { + "epoch": 0.5276719288695336, + "grad_norm": 0.1388417704564912, + "learning_rate": 0.00047950942634023523, + "loss": 1.3966, + "step": 5816 + }, + { + "epoch": 0.5277626565051715, + "grad_norm": 0.13524378309983826, + "learning_rate": 0.000479362623678991, + "loss": 1.4292, + "step": 5817 + }, + { + "epoch": 0.5278533841408093, + "grad_norm": 0.13111085037628403, + "learning_rate": 0.0004792158227997875, + "loss": 1.3995, + "step": 5818 + }, + { + "epoch": 0.5279441117764471, + "grad_norm": 0.13035953386096646, + "learning_rate": 0.0004790690237153007, + "loss": 1.3813, + "step": 5819 + }, + { + "epoch": 0.528034839412085, + "grad_norm": 0.1323549046853095, + "learning_rate": 0.00047892222643820684, + "loss": 1.4105, + "step": 5820 + }, + { + "epoch": 0.5281255670477227, + "grad_norm": 0.12885134172706433, + "learning_rate": 0.00047877543098118186, + "loss": 1.3932, + "step": 5821 + }, + { + "epoch": 0.5282162946833605, + "grad_norm": 0.1914586052529559, + "learning_rate": 0.00047862863735690145, + "loss": 1.391, + "step": 5822 + }, + { + "epoch": 0.5283070223189984, + "grad_norm": 0.13185575331736707, + "learning_rate": 0.00047848184557804134, + "loss": 1.4201, + "step": 5823 + }, + { + "epoch": 0.5283977499546362, + "grad_norm": 0.14700877833553358, + "learning_rate": 0.0004783350556572772, + "loss": 1.3795, + "step": 5824 + }, + { + "epoch": 0.528488477590274, + "grad_norm": 0.1314154360401111, + "learning_rate": 0.00047818826760728393, + "loss": 1.3666, + "step": 5825 + }, + { + "epoch": 0.5285792052259118, + "grad_norm": 0.1401479282102985, + "learning_rate": 0.0004780414814407371, + "loss": 1.4178, + "step": 5826 + }, + { + "epoch": 0.5286699328615496, + "grad_norm": 0.14366041620161218, + "learning_rate": 0.0004778946971703116, + "loss": 1.3595, + "step": 5827 + }, + { + "epoch": 0.5287606604971874, + "grad_norm": 0.11774834039496478, + "learning_rate": 0.0004777479148086823, + "loss": 1.3599, + "step": 5828 + }, + { + "epoch": 0.5288513881328253, + "grad_norm": 0.127815007338449, + "learning_rate": 0.00047760113436852376, + "loss": 1.4084, + "step": 5829 + }, + { + "epoch": 0.5289421157684631, + "grad_norm": 0.12876719785195717, + "learning_rate": 0.00047745435586251056, + "loss": 1.4042, + "step": 5830 + }, + { + "epoch": 0.5290328434041008, + "grad_norm": 
0.12726353702158136, + "learning_rate": 0.0004773075793033172, + "loss": 1.3366, + "step": 5831 + }, + { + "epoch": 0.5291235710397387, + "grad_norm": 0.1227379870040054, + "learning_rate": 0.0004771608047036176, + "loss": 1.3954, + "step": 5832 + }, + { + "epoch": 0.5292142986753765, + "grad_norm": 0.1261021061505254, + "learning_rate": 0.00047701403207608576, + "loss": 1.4025, + "step": 5833 + }, + { + "epoch": 0.5293050263110143, + "grad_norm": 0.14320973937103051, + "learning_rate": 0.0004768672614333957, + "loss": 1.4038, + "step": 5834 + }, + { + "epoch": 0.5293957539466522, + "grad_norm": 0.12450168787687035, + "learning_rate": 0.00047672049278822123, + "loss": 1.375, + "step": 5835 + }, + { + "epoch": 0.52948648158229, + "grad_norm": 0.12170053633770568, + "learning_rate": 0.00047657372615323546, + "loss": 1.4078, + "step": 5836 + }, + { + "epoch": 0.5295772092179278, + "grad_norm": 0.12617494189212988, + "learning_rate": 0.0004764269615411119, + "loss": 1.395, + "step": 5837 + }, + { + "epoch": 0.5296679368535656, + "grad_norm": 0.13508729628176322, + "learning_rate": 0.00047628019896452377, + "loss": 1.3613, + "step": 5838 + }, + { + "epoch": 0.5297586644892034, + "grad_norm": 0.13528588919053414, + "learning_rate": 0.0004761334384361439, + "loss": 1.4056, + "step": 5839 + }, + { + "epoch": 0.5298493921248413, + "grad_norm": 0.12499217590201486, + "learning_rate": 0.0004759866799686451, + "loss": 1.3872, + "step": 5840 + }, + { + "epoch": 0.5299401197604791, + "grad_norm": 0.14029974717142638, + "learning_rate": 0.0004758399235746999, + "loss": 1.3991, + "step": 5841 + }, + { + "epoch": 0.5300308473961168, + "grad_norm": 0.1499034026951687, + "learning_rate": 0.00047569316926698103, + "loss": 1.4025, + "step": 5842 + }, + { + "epoch": 0.5301215750317547, + "grad_norm": 0.14045403756160713, + "learning_rate": 0.00047554641705816066, + "loss": 1.403, + "step": 5843 + }, + { + "epoch": 0.5302123026673925, + "grad_norm": 0.1395530079599949, + "learning_rate": 0.00047539966696091065, + "loss": 1.4198, + "step": 5844 + }, + { + "epoch": 0.5303030303030303, + "grad_norm": 0.16791402624803436, + "learning_rate": 0.00047525291898790315, + "loss": 1.4084, + "step": 5845 + }, + { + "epoch": 0.5303937579386682, + "grad_norm": 0.12930101971768365, + "learning_rate": 0.00047510617315180966, + "loss": 1.3683, + "step": 5846 + }, + { + "epoch": 0.5304844855743059, + "grad_norm": 0.12629194165744434, + "learning_rate": 0.00047495942946530176, + "loss": 1.4074, + "step": 5847 + }, + { + "epoch": 0.5305752132099437, + "grad_norm": 0.14090609724421332, + "learning_rate": 0.0004748126879410509, + "loss": 1.3822, + "step": 5848 + }, + { + "epoch": 0.5306659408455816, + "grad_norm": 0.1254167736189815, + "learning_rate": 0.00047466594859172783, + "loss": 1.3507, + "step": 5849 + }, + { + "epoch": 0.5307566684812194, + "grad_norm": 0.12874475547434194, + "learning_rate": 0.0004745192114300042, + "loss": 1.4095, + "step": 5850 + }, + { + "epoch": 0.5308473961168572, + "grad_norm": 0.12739423687478352, + "learning_rate": 0.00047437247646855025, + "loss": 1.4084, + "step": 5851 + }, + { + "epoch": 0.530938123752495, + "grad_norm": 0.15908421132697115, + "learning_rate": 0.0004742257437200368, + "loss": 1.39, + "step": 5852 + }, + { + "epoch": 0.5310288513881328, + "grad_norm": 0.12103760072469659, + "learning_rate": 0.0004740790131971343, + "loss": 1.3598, + "step": 5853 + }, + { + "epoch": 0.5311195790237706, + "grad_norm": 0.1238075350435841, + "learning_rate": 0.0004739322849125126, + "loss": 1.4216, + 
"step": 5854 + }, + { + "epoch": 0.5312103066594085, + "grad_norm": 0.1484682843917771, + "learning_rate": 0.00047378555887884214, + "loss": 1.3744, + "step": 5855 + }, + { + "epoch": 0.5313010342950463, + "grad_norm": 0.12288336181917711, + "learning_rate": 0.0004736388351087922, + "loss": 1.4564, + "step": 5856 + }, + { + "epoch": 0.531391761930684, + "grad_norm": 0.1319597230512638, + "learning_rate": 0.000473492113615033, + "loss": 1.4155, + "step": 5857 + }, + { + "epoch": 0.5314824895663219, + "grad_norm": 0.13159875355106526, + "learning_rate": 0.00047334539441023366, + "loss": 1.3877, + "step": 5858 + }, + { + "epoch": 0.5315732172019597, + "grad_norm": 0.12734585120189046, + "learning_rate": 0.00047319867750706345, + "loss": 1.3711, + "step": 5859 + }, + { + "epoch": 0.5316639448375975, + "grad_norm": 0.12843726386382626, + "learning_rate": 0.00047305196291819147, + "loss": 1.3907, + "step": 5860 + }, + { + "epoch": 0.5317546724732354, + "grad_norm": 0.12434468963535164, + "learning_rate": 0.0004729052506562864, + "loss": 1.3943, + "step": 5861 + }, + { + "epoch": 0.5318454001088732, + "grad_norm": 0.1207121275597437, + "learning_rate": 0.0004727585407340168, + "loss": 1.3922, + "step": 5862 + }, + { + "epoch": 0.5319361277445109, + "grad_norm": 0.1209944764843776, + "learning_rate": 0.00047261183316405143, + "loss": 1.385, + "step": 5863 + }, + { + "epoch": 0.5320268553801488, + "grad_norm": 0.13076696567902163, + "learning_rate": 0.0004724651279590581, + "loss": 1.4196, + "step": 5864 + }, + { + "epoch": 0.5321175830157866, + "grad_norm": 0.13004562254862825, + "learning_rate": 0.00047231842513170507, + "loss": 1.3941, + "step": 5865 + }, + { + "epoch": 0.5322083106514244, + "grad_norm": 0.13834450721734073, + "learning_rate": 0.0004721717246946602, + "loss": 1.3986, + "step": 5866 + }, + { + "epoch": 0.5322990382870623, + "grad_norm": 0.15270682801317634, + "learning_rate": 0.00047202502666059105, + "loss": 1.4055, + "step": 5867 + }, + { + "epoch": 0.5323897659227, + "grad_norm": 0.12657035332257596, + "learning_rate": 0.0004718783310421649, + "loss": 1.3934, + "step": 5868 + }, + { + "epoch": 0.5324804935583378, + "grad_norm": 0.13324777158049436, + "learning_rate": 0.00047173163785204893, + "loss": 1.3811, + "step": 5869 + }, + { + "epoch": 0.5325712211939757, + "grad_norm": 0.1429218408022886, + "learning_rate": 0.00047158494710291036, + "loss": 1.4266, + "step": 5870 + }, + { + "epoch": 0.5326619488296135, + "grad_norm": 0.119975314437342, + "learning_rate": 0.0004714382588074157, + "loss": 1.373, + "step": 5871 + }, + { + "epoch": 0.5327526764652513, + "grad_norm": 0.12829197911146498, + "learning_rate": 0.00047129157297823147, + "loss": 1.4192, + "step": 5872 + }, + { + "epoch": 0.5328434041008892, + "grad_norm": 0.16057691962833193, + "learning_rate": 0.00047114488962802424, + "loss": 1.4055, + "step": 5873 + }, + { + "epoch": 0.5329341317365269, + "grad_norm": 0.13657410368305115, + "learning_rate": 0.0004709982087694602, + "loss": 1.4031, + "step": 5874 + }, + { + "epoch": 0.5330248593721648, + "grad_norm": 0.12361233359712641, + "learning_rate": 0.00047085153041520497, + "loss": 1.4154, + "step": 5875 + }, + { + "epoch": 0.5331155870078026, + "grad_norm": 0.12781160191728017, + "learning_rate": 0.0004707048545779244, + "loss": 1.4221, + "step": 5876 + }, + { + "epoch": 0.5332063146434404, + "grad_norm": 0.13367468396166848, + "learning_rate": 0.00047055818127028404, + "loss": 1.4173, + "step": 5877 + }, + { + "epoch": 0.5332970422790783, + "grad_norm": 
0.1275002147641984, + "learning_rate": 0.000470411510504949, + "loss": 1.4279, + "step": 5878 + }, + { + "epoch": 0.533387769914716, + "grad_norm": 0.14164067656059856, + "learning_rate": 0.0004702648422945843, + "loss": 1.3906, + "step": 5879 + }, + { + "epoch": 0.5334784975503538, + "grad_norm": 0.12971432317364998, + "learning_rate": 0.0004701181766518548, + "loss": 1.3843, + "step": 5880 + }, + { + "epoch": 0.5335692251859917, + "grad_norm": 0.12073629885035055, + "learning_rate": 0.00046997151358942524, + "loss": 1.3799, + "step": 5881 + }, + { + "epoch": 0.5336599528216295, + "grad_norm": 0.16118008458211472, + "learning_rate": 0.00046982485311996, + "loss": 1.4123, + "step": 5882 + }, + { + "epoch": 0.5337506804572673, + "grad_norm": 0.1228035069916711, + "learning_rate": 0.0004696781952561231, + "loss": 1.4287, + "step": 5883 + }, + { + "epoch": 0.5338414080929051, + "grad_norm": 0.1382496371231778, + "learning_rate": 0.00046953154001057865, + "loss": 1.3705, + "step": 5884 + }, + { + "epoch": 0.5339321357285429, + "grad_norm": 0.1273819855199993, + "learning_rate": 0.00046938488739599, + "loss": 1.3523, + "step": 5885 + }, + { + "epoch": 0.5340228633641807, + "grad_norm": 0.14095939907576066, + "learning_rate": 0.00046923823742502086, + "loss": 1.3936, + "step": 5886 + }, + { + "epoch": 0.5341135909998186, + "grad_norm": 0.1349418589777598, + "learning_rate": 0.0004690915901103346, + "loss": 1.3825, + "step": 5887 + }, + { + "epoch": 0.5342043186354564, + "grad_norm": 0.126969512757805, + "learning_rate": 0.0004689449454645937, + "loss": 1.3833, + "step": 5888 + }, + { + "epoch": 0.5342950462710941, + "grad_norm": 0.12798056279818648, + "learning_rate": 0.0004687983035004617, + "loss": 1.3826, + "step": 5889 + }, + { + "epoch": 0.534385773906732, + "grad_norm": 0.14222546679573966, + "learning_rate": 0.0004686516642306007, + "loss": 1.4317, + "step": 5890 + }, + { + "epoch": 0.5344765015423698, + "grad_norm": 0.141984281987018, + "learning_rate": 0.00046850502766767303, + "loss": 1.3921, + "step": 5891 + }, + { + "epoch": 0.5345672291780076, + "grad_norm": 0.13427998538625177, + "learning_rate": 0.0004683583938243411, + "loss": 1.3774, + "step": 5892 + }, + { + "epoch": 0.5346579568136455, + "grad_norm": 0.12821601911544497, + "learning_rate": 0.0004682117627132663, + "loss": 1.3662, + "step": 5893 + }, + { + "epoch": 0.5347486844492833, + "grad_norm": 0.13746764862045954, + "learning_rate": 0.0004680651343471107, + "loss": 1.3821, + "step": 5894 + }, + { + "epoch": 0.534839412084921, + "grad_norm": 0.1309343178815651, + "learning_rate": 0.000467918508738535, + "loss": 1.3846, + "step": 5895 + }, + { + "epoch": 0.5349301397205589, + "grad_norm": 0.12973061556805726, + "learning_rate": 0.0004677718859002012, + "loss": 1.3943, + "step": 5896 + }, + { + "epoch": 0.5350208673561967, + "grad_norm": 0.13022581018492804, + "learning_rate": 0.0004676252658447697, + "loss": 1.4281, + "step": 5897 + }, + { + "epoch": 0.5351115949918345, + "grad_norm": 0.13260729439453783, + "learning_rate": 0.00046747864858490124, + "loss": 1.4059, + "step": 5898 + }, + { + "epoch": 0.5352023226274724, + "grad_norm": 0.12870899411874015, + "learning_rate": 0.0004673320341332564, + "loss": 1.3821, + "step": 5899 + }, + { + "epoch": 0.5352930502631101, + "grad_norm": 0.14377576440017106, + "learning_rate": 0.0004671854225024951, + "loss": 1.4679, + "step": 5900 + }, + { + "epoch": 0.5353837778987479, + "grad_norm": 0.14035660837377326, + "learning_rate": 0.0004670388137052774, + "loss": 1.4112, + "step": 5901 
+ }, + { + "epoch": 0.5354745055343858, + "grad_norm": 0.1350152521814605, + "learning_rate": 0.0004668922077542631, + "loss": 1.3839, + "step": 5902 + }, + { + "epoch": 0.5355652331700236, + "grad_norm": 0.11883930634354462, + "learning_rate": 0.00046674560466211136, + "loss": 1.3749, + "step": 5903 + }, + { + "epoch": 0.5356559608056614, + "grad_norm": 0.12503719479056394, + "learning_rate": 0.00046659900444148163, + "loss": 1.4335, + "step": 5904 + }, + { + "epoch": 0.5357466884412992, + "grad_norm": 0.1959854896294752, + "learning_rate": 0.0004664524071050328, + "loss": 1.3936, + "step": 5905 + }, + { + "epoch": 0.535837416076937, + "grad_norm": 0.13168466963121694, + "learning_rate": 0.00046630581266542365, + "loss": 1.4342, + "step": 5906 + }, + { + "epoch": 0.5359281437125748, + "grad_norm": 0.1310374155056099, + "learning_rate": 0.00046615922113531243, + "loss": 1.4054, + "step": 5907 + }, + { + "epoch": 0.5360188713482127, + "grad_norm": 0.127612156786967, + "learning_rate": 0.00046601263252735755, + "loss": 1.4021, + "step": 5908 + }, + { + "epoch": 0.5361095989838505, + "grad_norm": 0.11770096961926144, + "learning_rate": 0.00046586604685421685, + "loss": 1.3747, + "step": 5909 + }, + { + "epoch": 0.5362003266194882, + "grad_norm": 0.12450083654105532, + "learning_rate": 0.000465719464128548, + "loss": 1.3927, + "step": 5910 + }, + { + "epoch": 0.5362910542551261, + "grad_norm": 0.11750040678838478, + "learning_rate": 0.0004655728843630083, + "loss": 1.3961, + "step": 5911 + }, + { + "epoch": 0.5363817818907639, + "grad_norm": 0.12669831041150406, + "learning_rate": 0.0004654263075702553, + "loss": 1.378, + "step": 5912 + }, + { + "epoch": 0.5364725095264018, + "grad_norm": 0.1304668225071117, + "learning_rate": 0.0004652797337629458, + "loss": 1.4104, + "step": 5913 + }, + { + "epoch": 0.5365632371620396, + "grad_norm": 0.12252810443477052, + "learning_rate": 0.0004651331629537363, + "loss": 1.4119, + "step": 5914 + }, + { + "epoch": 0.5366539647976774, + "grad_norm": 0.11425233130569011, + "learning_rate": 0.0004649865951552833, + "loss": 1.3903, + "step": 5915 + }, + { + "epoch": 0.5367446924333152, + "grad_norm": 0.11978522081646563, + "learning_rate": 0.0004648400303802431, + "loss": 1.3851, + "step": 5916 + }, + { + "epoch": 0.536835420068953, + "grad_norm": 0.15060551840071662, + "learning_rate": 0.0004646934686412713, + "loss": 1.4159, + "step": 5917 + }, + { + "epoch": 0.5369261477045908, + "grad_norm": 0.117793367018524, + "learning_rate": 0.0004645469099510238, + "loss": 1.4149, + "step": 5918 + }, + { + "epoch": 0.5370168753402287, + "grad_norm": 0.11503807266941293, + "learning_rate": 0.00046440035432215567, + "loss": 1.4094, + "step": 5919 + }, + { + "epoch": 0.5371076029758665, + "grad_norm": 0.12300306205051383, + "learning_rate": 0.00046425380176732227, + "loss": 1.3904, + "step": 5920 + }, + { + "epoch": 0.5371983306115042, + "grad_norm": 0.12052108420342371, + "learning_rate": 0.0004641072522991785, + "loss": 1.3978, + "step": 5921 + }, + { + "epoch": 0.5372890582471421, + "grad_norm": 0.13667348154741604, + "learning_rate": 0.0004639607059303787, + "loss": 1.3744, + "step": 5922 + }, + { + "epoch": 0.5373797858827799, + "grad_norm": 0.11496341782807873, + "learning_rate": 0.00046381416267357736, + "loss": 1.364, + "step": 5923 + }, + { + "epoch": 0.5374705135184177, + "grad_norm": 0.12062485190726578, + "learning_rate": 0.00046366762254142826, + "loss": 1.3921, + "step": 5924 + }, + { + "epoch": 0.5375612411540556, + "grad_norm": 0.11294216638275985, + 
"learning_rate": 0.0004635210855465853, + "loss": 1.3389, + "step": 5925 + }, + { + "epoch": 0.5376519687896933, + "grad_norm": 0.11754945087124881, + "learning_rate": 0.00046337455170170213, + "loss": 1.3914, + "step": 5926 + }, + { + "epoch": 0.5377426964253311, + "grad_norm": 0.1200997858625327, + "learning_rate": 0.00046322802101943147, + "loss": 1.4122, + "step": 5927 + }, + { + "epoch": 0.537833424060969, + "grad_norm": 0.11984647049883418, + "learning_rate": 0.0004630814935124269, + "loss": 1.3692, + "step": 5928 + }, + { + "epoch": 0.5379241516966068, + "grad_norm": 0.11415450448688523, + "learning_rate": 0.0004629349691933408, + "loss": 1.3906, + "step": 5929 + }, + { + "epoch": 0.5380148793322446, + "grad_norm": 0.12261607237789643, + "learning_rate": 0.00046278844807482545, + "loss": 1.4006, + "step": 5930 + }, + { + "epoch": 0.5381056069678825, + "grad_norm": 0.12006771158115057, + "learning_rate": 0.0004626419301695333, + "loss": 1.3828, + "step": 5931 + }, + { + "epoch": 0.5381963346035202, + "grad_norm": 0.1341132626056429, + "learning_rate": 0.0004624954154901159, + "loss": 1.4343, + "step": 5932 + }, + { + "epoch": 0.538287062239158, + "grad_norm": 0.11424720826707951, + "learning_rate": 0.0004623489040492249, + "loss": 1.3944, + "step": 5933 + }, + { + "epoch": 0.5383777898747959, + "grad_norm": 0.12294621428070547, + "learning_rate": 0.0004622023958595113, + "loss": 1.3701, + "step": 5934 + }, + { + "epoch": 0.5384685175104337, + "grad_norm": 0.11501479765284474, + "learning_rate": 0.0004620558909336267, + "loss": 1.3945, + "step": 5935 + }, + { + "epoch": 0.5385592451460715, + "grad_norm": 0.12079461791998328, + "learning_rate": 0.00046190938928422143, + "loss": 1.4689, + "step": 5936 + }, + { + "epoch": 0.5386499727817093, + "grad_norm": 0.11505413288582951, + "learning_rate": 0.000461762890923946, + "loss": 1.3915, + "step": 5937 + }, + { + "epoch": 0.5387407004173471, + "grad_norm": 0.11616915197803492, + "learning_rate": 0.00046161639586545075, + "loss": 1.4195, + "step": 5938 + }, + { + "epoch": 0.5388314280529849, + "grad_norm": 0.14195829737634744, + "learning_rate": 0.0004614699041213852, + "loss": 1.3818, + "step": 5939 + }, + { + "epoch": 0.5389221556886228, + "grad_norm": 0.117884690056483, + "learning_rate": 0.00046132341570439905, + "loss": 1.4223, + "step": 5940 + }, + { + "epoch": 0.5390128833242606, + "grad_norm": 0.13090172340233794, + "learning_rate": 0.0004611769306271419, + "loss": 1.4173, + "step": 5941 + }, + { + "epoch": 0.5391036109598983, + "grad_norm": 0.1500889334838667, + "learning_rate": 0.00046103044890226223, + "loss": 1.3921, + "step": 5942 + }, + { + "epoch": 0.5391943385955362, + "grad_norm": 0.11457697248284213, + "learning_rate": 0.0004608839705424091, + "loss": 1.3758, + "step": 5943 + }, + { + "epoch": 0.539285066231174, + "grad_norm": 0.11825291902773656, + "learning_rate": 0.00046073749556023094, + "loss": 1.412, + "step": 5944 + }, + { + "epoch": 0.5393757938668118, + "grad_norm": 0.12026488813594093, + "learning_rate": 0.0004605910239683759, + "loss": 1.3942, + "step": 5945 + }, + { + "epoch": 0.5394665215024497, + "grad_norm": 0.1223717909264393, + "learning_rate": 0.0004604445557794916, + "loss": 1.3845, + "step": 5946 + }, + { + "epoch": 0.5395572491380874, + "grad_norm": 0.12166318640906348, + "learning_rate": 0.0004602980910062258, + "loss": 1.3891, + "step": 5947 + }, + { + "epoch": 0.5396479767737252, + "grad_norm": 0.11898233470592025, + "learning_rate": 0.00046015162966122583, + "loss": 1.3905, + "step": 5948 + }, + { + 
"epoch": 0.5397387044093631, + "grad_norm": 0.12000849025983878, + "learning_rate": 0.00046000517175713834, + "loss": 1.3927, + "step": 5949 + }, + { + "epoch": 0.5398294320450009, + "grad_norm": 0.12180603871551997, + "learning_rate": 0.0004598587173066101, + "loss": 1.3889, + "step": 5950 + }, + { + "epoch": 0.5399201596806387, + "grad_norm": 0.11838563433977065, + "learning_rate": 0.0004597122663222876, + "loss": 1.3795, + "step": 5951 + }, + { + "epoch": 0.5400108873162766, + "grad_norm": 0.14587042978221426, + "learning_rate": 0.000459565818816817, + "loss": 1.3926, + "step": 5952 + }, + { + "epoch": 0.5401016149519143, + "grad_norm": 0.1181544703062225, + "learning_rate": 0.0004594193748028438, + "loss": 1.4168, + "step": 5953 + }, + { + "epoch": 0.5401923425875522, + "grad_norm": 0.11368399222830265, + "learning_rate": 0.00045927293429301355, + "loss": 1.3877, + "step": 5954 + }, + { + "epoch": 0.54028307022319, + "grad_norm": 0.11685903484474643, + "learning_rate": 0.00045912649729997146, + "loss": 1.3858, + "step": 5955 + }, + { + "epoch": 0.5403737978588278, + "grad_norm": 0.11833403698668787, + "learning_rate": 0.0004589800638363623, + "loss": 1.3837, + "step": 5956 + }, + { + "epoch": 0.5404645254944657, + "grad_norm": 0.11553135291954895, + "learning_rate": 0.00045883363391483075, + "loss": 1.4239, + "step": 5957 + }, + { + "epoch": 0.5405552531301034, + "grad_norm": 0.1746696077212527, + "learning_rate": 0.00045868720754802076, + "loss": 1.4143, + "step": 5958 + }, + { + "epoch": 0.5406459807657412, + "grad_norm": 0.11827503231230006, + "learning_rate": 0.0004585407847485766, + "loss": 1.4028, + "step": 5959 + }, + { + "epoch": 0.5407367084013791, + "grad_norm": 0.11720011061019328, + "learning_rate": 0.000458394365529142, + "loss": 1.3704, + "step": 5960 + }, + { + "epoch": 0.5408274360370169, + "grad_norm": 0.12012874876706414, + "learning_rate": 0.00045824794990235985, + "loss": 1.4009, + "step": 5961 + }, + { + "epoch": 0.5409181636726547, + "grad_norm": 0.17976850589118337, + "learning_rate": 0.00045810153788087356, + "loss": 1.4059, + "step": 5962 + }, + { + "epoch": 0.5410088913082925, + "grad_norm": 0.11462697585842647, + "learning_rate": 0.00045795512947732547, + "loss": 1.3858, + "step": 5963 + }, + { + "epoch": 0.5410996189439303, + "grad_norm": 0.11491881740080889, + "learning_rate": 0.0004578087247043582, + "loss": 1.3925, + "step": 5964 + }, + { + "epoch": 0.5411903465795681, + "grad_norm": 0.11639270894664264, + "learning_rate": 0.0004576623235746139, + "loss": 1.3728, + "step": 5965 + }, + { + "epoch": 0.541281074215206, + "grad_norm": 0.11813308648393481, + "learning_rate": 0.00045751592610073383, + "loss": 1.3733, + "step": 5966 + }, + { + "epoch": 0.5413718018508438, + "grad_norm": 0.12013238922155335, + "learning_rate": 0.00045736953229536014, + "loss": 1.3712, + "step": 5967 + }, + { + "epoch": 0.5414625294864815, + "grad_norm": 0.1305077892031267, + "learning_rate": 0.00045722314217113346, + "loss": 1.3596, + "step": 5968 + }, + { + "epoch": 0.5415532571221194, + "grad_norm": 0.1278815923517057, + "learning_rate": 0.00045707675574069485, + "loss": 1.3862, + "step": 5969 + }, + { + "epoch": 0.5416439847577572, + "grad_norm": 0.13071537356156318, + "learning_rate": 0.0004569303730166848, + "loss": 1.3975, + "step": 5970 + }, + { + "epoch": 0.541734712393395, + "grad_norm": 0.11623060629034254, + "learning_rate": 0.0004567839940117432, + "loss": 1.3782, + "step": 5971 + }, + { + "epoch": 0.5418254400290329, + "grad_norm": 0.12019321230232127, + 
"learning_rate": 0.00045663761873851027, + "loss": 1.4072, + "step": 5972 + }, + { + "epoch": 0.5419161676646707, + "grad_norm": 0.12146728575020972, + "learning_rate": 0.000456491247209625, + "loss": 1.3805, + "step": 5973 + }, + { + "epoch": 0.5420068953003084, + "grad_norm": 0.13497366140046504, + "learning_rate": 0.0004563448794377273, + "loss": 1.4045, + "step": 5974 + }, + { + "epoch": 0.5420976229359463, + "grad_norm": 0.11736355201895797, + "learning_rate": 0.00045619851543545555, + "loss": 1.3798, + "step": 5975 + }, + { + "epoch": 0.5421883505715841, + "grad_norm": 0.12215607996286913, + "learning_rate": 0.00045605215521544843, + "loss": 1.3781, + "step": 5976 + }, + { + "epoch": 0.5422790782072219, + "grad_norm": 0.1291703096365664, + "learning_rate": 0.00045590579879034436, + "loss": 1.3852, + "step": 5977 + }, + { + "epoch": 0.5423698058428598, + "grad_norm": 0.1140078365303465, + "learning_rate": 0.00045575944617278093, + "loss": 1.3888, + "step": 5978 + }, + { + "epoch": 0.5424605334784975, + "grad_norm": 0.10898290522742203, + "learning_rate": 0.0004556130973753958, + "loss": 1.3851, + "step": 5979 + }, + { + "epoch": 0.5425512611141353, + "grad_norm": 0.12110149620207293, + "learning_rate": 0.0004554667524108264, + "loss": 1.3692, + "step": 5980 + }, + { + "epoch": 0.5426419887497732, + "grad_norm": 0.11852857760981246, + "learning_rate": 0.00045532041129170924, + "loss": 1.3934, + "step": 5981 + }, + { + "epoch": 0.542732716385411, + "grad_norm": 0.1332899924933946, + "learning_rate": 0.00045517407403068135, + "loss": 1.4056, + "step": 5982 + }, + { + "epoch": 0.5428234440210488, + "grad_norm": 0.12292523820879689, + "learning_rate": 0.00045502774064037866, + "loss": 1.4104, + "step": 5983 + }, + { + "epoch": 0.5429141716566867, + "grad_norm": 0.13803468536660274, + "learning_rate": 0.0004548814111334374, + "loss": 1.4406, + "step": 5984 + }, + { + "epoch": 0.5430048992923244, + "grad_norm": 0.11965407117848255, + "learning_rate": 0.0004547350855224928, + "loss": 1.4329, + "step": 5985 + }, + { + "epoch": 0.5430956269279622, + "grad_norm": 0.12041818604902534, + "learning_rate": 0.00045458876382018014, + "loss": 1.3882, + "step": 5986 + }, + { + "epoch": 0.5431863545636001, + "grad_norm": 0.12010564874100972, + "learning_rate": 0.00045444244603913466, + "loss": 1.3691, + "step": 5987 + }, + { + "epoch": 0.5432770821992379, + "grad_norm": 0.12393020524964811, + "learning_rate": 0.0004542961321919905, + "loss": 1.3941, + "step": 5988 + }, + { + "epoch": 0.5433678098348756, + "grad_norm": 0.11504443078421171, + "learning_rate": 0.00045414982229138195, + "loss": 1.4549, + "step": 5989 + }, + { + "epoch": 0.5434585374705135, + "grad_norm": 0.11641117648239302, + "learning_rate": 0.00045400351634994313, + "loss": 1.4256, + "step": 5990 + }, + { + "epoch": 0.5435492651061513, + "grad_norm": 0.14646230983502637, + "learning_rate": 0.00045385721438030756, + "loss": 1.4341, + "step": 5991 + }, + { + "epoch": 0.5436399927417892, + "grad_norm": 0.1468891481731638, + "learning_rate": 0.00045371091639510825, + "loss": 1.3643, + "step": 5992 + }, + { + "epoch": 0.543730720377427, + "grad_norm": 0.2604538447211258, + "learning_rate": 0.0004535646224069781, + "loss": 1.4309, + "step": 5993 + }, + { + "epoch": 0.5438214480130648, + "grad_norm": 0.122117827055021, + "learning_rate": 0.0004534183324285498, + "loss": 1.4274, + "step": 5994 + }, + { + "epoch": 0.5439121756487026, + "grad_norm": 0.1290236138874457, + "learning_rate": 0.0004532720464724552, + "loss": 1.3891, + "step": 5995 + }, 
+ { + "epoch": 0.5440029032843404, + "grad_norm": 0.12525757548240377, + "learning_rate": 0.00045312576455132624, + "loss": 1.4065, + "step": 5996 + }, + { + "epoch": 0.5440936309199782, + "grad_norm": 0.12489278908408676, + "learning_rate": 0.00045297948667779437, + "loss": 1.3952, + "step": 5997 + }, + { + "epoch": 0.5441843585556161, + "grad_norm": 0.12125354066850824, + "learning_rate": 0.0004528332128644909, + "loss": 1.3571, + "step": 5998 + }, + { + "epoch": 0.5442750861912539, + "grad_norm": 0.12371731432041842, + "learning_rate": 0.00045268694312404647, + "loss": 1.3826, + "step": 5999 + }, + { + "epoch": 0.5443658138268916, + "grad_norm": 0.12209347041500795, + "learning_rate": 0.00045254067746909143, + "loss": 1.3971, + "step": 6000 + }, + { + "epoch": 0.5444565414625295, + "grad_norm": 0.12310040586934644, + "learning_rate": 0.0004523944159122559, + "loss": 1.4017, + "step": 6001 + }, + { + "epoch": 0.5445472690981673, + "grad_norm": 0.12105493185762949, + "learning_rate": 0.00045224815846616964, + "loss": 1.3822, + "step": 6002 + }, + { + "epoch": 0.5446379967338051, + "grad_norm": 0.13114597859036978, + "learning_rate": 0.00045210190514346176, + "loss": 1.3613, + "step": 6003 + }, + { + "epoch": 0.544728724369443, + "grad_norm": 0.12307788737769963, + "learning_rate": 0.00045195565595676165, + "loss": 1.3897, + "step": 6004 + }, + { + "epoch": 0.5448194520050808, + "grad_norm": 0.1349269297605182, + "learning_rate": 0.00045180941091869735, + "loss": 1.3711, + "step": 6005 + }, + { + "epoch": 0.5449101796407185, + "grad_norm": 0.12800993801754645, + "learning_rate": 0.0004516631700418978, + "loss": 1.3935, + "step": 6006 + }, + { + "epoch": 0.5450009072763564, + "grad_norm": 0.14658460722389638, + "learning_rate": 0.0004515169333389906, + "loss": 1.4279, + "step": 6007 + }, + { + "epoch": 0.5450916349119942, + "grad_norm": 0.1368471721591662, + "learning_rate": 0.0004513707008226034, + "loss": 1.3894, + "step": 6008 + }, + { + "epoch": 0.545182362547632, + "grad_norm": 0.1303763703039409, + "learning_rate": 0.0004512244725053634, + "loss": 1.3512, + "step": 6009 + }, + { + "epoch": 0.5452730901832699, + "grad_norm": 0.1243895659177656, + "learning_rate": 0.00045107824839989727, + "loss": 1.3775, + "step": 6010 + }, + { + "epoch": 0.5453638178189076, + "grad_norm": 0.12181851325401026, + "learning_rate": 0.00045093202851883174, + "loss": 1.4049, + "step": 6011 + }, + { + "epoch": 0.5454545454545454, + "grad_norm": 0.12446162183732139, + "learning_rate": 0.0004507858128747924, + "loss": 1.3667, + "step": 6012 + }, + { + "epoch": 0.5455452730901833, + "grad_norm": 0.1260371711442443, + "learning_rate": 0.0004506396014804057, + "loss": 1.4077, + "step": 6013 + }, + { + "epoch": 0.5456360007258211, + "grad_norm": 0.12507783953494547, + "learning_rate": 0.0004504933943482967, + "loss": 1.3908, + "step": 6014 + }, + { + "epoch": 0.5457267283614589, + "grad_norm": 0.12298188368562636, + "learning_rate": 0.00045034719149109027, + "loss": 1.4021, + "step": 6015 + }, + { + "epoch": 0.5458174559970967, + "grad_norm": 0.12745095148681174, + "learning_rate": 0.0004502009929214113, + "loss": 1.4197, + "step": 6016 + }, + { + "epoch": 0.5459081836327345, + "grad_norm": 0.12454080745891662, + "learning_rate": 0.00045005479865188385, + "loss": 1.3855, + "step": 6017 + }, + { + "epoch": 0.5459989112683723, + "grad_norm": 0.13040187677687334, + "learning_rate": 0.0004499086086951319, + "loss": 1.4223, + "step": 6018 + }, + { + "epoch": 0.5460896389040102, + "grad_norm": 0.11592900792099207, + 
"learning_rate": 0.0004497624230637791, + "loss": 1.3671, + "step": 6019 + }, + { + "epoch": 0.546180366539648, + "grad_norm": 0.12272781874158002, + "learning_rate": 0.0004496162417704482, + "loss": 1.4388, + "step": 6020 + }, + { + "epoch": 0.5462710941752857, + "grad_norm": 0.12213584812525986, + "learning_rate": 0.00044947006482776234, + "loss": 1.4155, + "step": 6021 + }, + { + "epoch": 0.5463618218109236, + "grad_norm": 0.11717884167422193, + "learning_rate": 0.0004493238922483439, + "loss": 1.4206, + "step": 6022 + }, + { + "epoch": 0.5464525494465614, + "grad_norm": 0.2345241809420175, + "learning_rate": 0.00044917772404481487, + "loss": 1.4061, + "step": 6023 + }, + { + "epoch": 0.5465432770821992, + "grad_norm": 0.15136408991517455, + "learning_rate": 0.0004490315602297968, + "loss": 1.4231, + "step": 6024 + }, + { + "epoch": 0.5466340047178371, + "grad_norm": 0.15099765979229166, + "learning_rate": 0.000448885400815911, + "loss": 1.3877, + "step": 6025 + }, + { + "epoch": 0.5467247323534749, + "grad_norm": 0.11559890731545025, + "learning_rate": 0.0004487392458157785, + "loss": 1.3573, + "step": 6026 + }, + { + "epoch": 0.5468154599891126, + "grad_norm": 0.12588443962705786, + "learning_rate": 0.00044859309524201957, + "loss": 1.3686, + "step": 6027 + }, + { + "epoch": 0.5469061876247505, + "grad_norm": 0.12927434949681385, + "learning_rate": 0.0004484469491072543, + "loss": 1.3982, + "step": 6028 + }, + { + "epoch": 0.5469969152603883, + "grad_norm": 0.12391650068446095, + "learning_rate": 0.00044830080742410273, + "loss": 1.4015, + "step": 6029 + }, + { + "epoch": 0.5470876428960262, + "grad_norm": 0.14583244860802097, + "learning_rate": 0.00044815467020518413, + "loss": 1.3822, + "step": 6030 + }, + { + "epoch": 0.547178370531664, + "grad_norm": 0.1330247267691364, + "learning_rate": 0.0004480085374631173, + "loss": 1.3918, + "step": 6031 + }, + { + "epoch": 0.5472690981673017, + "grad_norm": 0.12957990588154525, + "learning_rate": 0.000447862409210521, + "loss": 1.3536, + "step": 6032 + }, + { + "epoch": 0.5473598258029396, + "grad_norm": 0.12734466257241184, + "learning_rate": 0.0004477162854600133, + "loss": 1.3992, + "step": 6033 + }, + { + "epoch": 0.5474505534385774, + "grad_norm": 0.12581578342469335, + "learning_rate": 0.000447570166224212, + "loss": 1.4068, + "step": 6034 + }, + { + "epoch": 0.5475412810742152, + "grad_norm": 0.1437633196657503, + "learning_rate": 0.0004474240515157347, + "loss": 1.3873, + "step": 6035 + }, + { + "epoch": 0.5476320087098531, + "grad_norm": 0.20824277934480462, + "learning_rate": 0.00044727794134719805, + "loss": 1.3818, + "step": 6036 + }, + { + "epoch": 0.5477227363454908, + "grad_norm": 0.14477947965890361, + "learning_rate": 0.00044713183573121906, + "loss": 1.4101, + "step": 6037 + }, + { + "epoch": 0.5478134639811286, + "grad_norm": 0.13151705409598446, + "learning_rate": 0.00044698573468041406, + "loss": 1.3533, + "step": 6038 + }, + { + "epoch": 0.5479041916167665, + "grad_norm": 0.14036565723415645, + "learning_rate": 0.0004468396382073985, + "loss": 1.4021, + "step": 6039 + }, + { + "epoch": 0.5479949192524043, + "grad_norm": 0.1271113251203138, + "learning_rate": 0.0004466935463247881, + "loss": 1.3635, + "step": 6040 + }, + { + "epoch": 0.5480856468880421, + "grad_norm": 0.12501579122873818, + "learning_rate": 0.00044654745904519773, + "loss": 1.4241, + "step": 6041 + }, + { + "epoch": 0.54817637452368, + "grad_norm": 0.12272225301061197, + "learning_rate": 0.00044640137638124217, + "loss": 1.4051, + "step": 6042 + }, + { 
+ "epoch": 0.5482671021593177, + "grad_norm": 0.124036875348195, + "learning_rate": 0.0004462552983455357, + "loss": 1.372, + "step": 6043 + }, + { + "epoch": 0.5483578297949555, + "grad_norm": 0.12059974032403506, + "learning_rate": 0.00044610922495069195, + "loss": 1.3744, + "step": 6044 + }, + { + "epoch": 0.5484485574305934, + "grad_norm": 0.12459645058546993, + "learning_rate": 0.00044596315620932484, + "loss": 1.3469, + "step": 6045 + }, + { + "epoch": 0.5485392850662312, + "grad_norm": 0.12190334969239827, + "learning_rate": 0.00044581709213404696, + "loss": 1.3808, + "step": 6046 + }, + { + "epoch": 0.548630012701869, + "grad_norm": 0.12589379898668468, + "learning_rate": 0.0004456710327374713, + "loss": 1.4152, + "step": 6047 + }, + { + "epoch": 0.5487207403375068, + "grad_norm": 0.1399712440508969, + "learning_rate": 0.00044552497803221, + "loss": 1.4196, + "step": 6048 + }, + { + "epoch": 0.5488114679731446, + "grad_norm": 0.11398668123370741, + "learning_rate": 0.00044537892803087477, + "loss": 1.3944, + "step": 6049 + }, + { + "epoch": 0.5489021956087824, + "grad_norm": 0.12289434684724841, + "learning_rate": 0.00044523288274607724, + "loss": 1.3816, + "step": 6050 + }, + { + "epoch": 0.5489929232444203, + "grad_norm": 0.1555424942286588, + "learning_rate": 0.00044508684219042805, + "loss": 1.3984, + "step": 6051 + }, + { + "epoch": 0.5490836508800581, + "grad_norm": 0.13259386888777602, + "learning_rate": 0.0004449408063765385, + "loss": 1.3824, + "step": 6052 + }, + { + "epoch": 0.5491743785156958, + "grad_norm": 0.12684941849948808, + "learning_rate": 0.0004447947753170183, + "loss": 1.433, + "step": 6053 + }, + { + "epoch": 0.5492651061513337, + "grad_norm": 0.12795493945160719, + "learning_rate": 0.0004446487490244774, + "loss": 1.3928, + "step": 6054 + }, + { + "epoch": 0.5493558337869715, + "grad_norm": 0.12977678974362733, + "learning_rate": 0.0004445027275115252, + "loss": 1.4023, + "step": 6055 + }, + { + "epoch": 0.5494465614226093, + "grad_norm": 0.1383267631339196, + "learning_rate": 0.00044435671079077067, + "loss": 1.4203, + "step": 6056 + }, + { + "epoch": 0.5495372890582472, + "grad_norm": 0.1275363582297552, + "learning_rate": 0.0004442106988748223, + "loss": 1.423, + "step": 6057 + }, + { + "epoch": 0.549628016693885, + "grad_norm": 0.12571801925641612, + "learning_rate": 0.00044406469177628836, + "loss": 1.4084, + "step": 6058 + }, + { + "epoch": 0.5497187443295227, + "grad_norm": 0.12341045833308743, + "learning_rate": 0.0004439186895077764, + "loss": 1.4321, + "step": 6059 + }, + { + "epoch": 0.5498094719651606, + "grad_norm": 0.11972012796203214, + "learning_rate": 0.00044377269208189396, + "loss": 1.3937, + "step": 6060 + }, + { + "epoch": 0.5499001996007984, + "grad_norm": 0.1414873504326996, + "learning_rate": 0.00044362669951124783, + "loss": 1.3997, + "step": 6061 + }, + { + "epoch": 0.5499909272364362, + "grad_norm": 0.12299250362292592, + "learning_rate": 0.0004434807118084446, + "loss": 1.3952, + "step": 6062 + }, + { + "epoch": 0.550081654872074, + "grad_norm": 0.1204781784308993, + "learning_rate": 0.00044333472898609017, + "loss": 1.3776, + "step": 6063 + }, + { + "epoch": 0.5501723825077118, + "grad_norm": 0.12578428707386838, + "learning_rate": 0.00044318875105679023, + "loss": 1.4338, + "step": 6064 + }, + { + "epoch": 0.5502631101433496, + "grad_norm": 0.12206785849893606, + "learning_rate": 0.0004430427780331501, + "loss": 1.4095, + "step": 6065 + }, + { + "epoch": 0.5503538377789875, + "grad_norm": 0.12393420305164198, + "learning_rate": 
0.00044289680992777445, + "loss": 1.3832, + "step": 6066 + }, + { + "epoch": 0.5504445654146253, + "grad_norm": 0.13353179975523008, + "learning_rate": 0.0004427508467532675, + "loss": 1.4317, + "step": 6067 + }, + { + "epoch": 0.5505352930502632, + "grad_norm": 0.11880164770004083, + "learning_rate": 0.00044260488852223347, + "loss": 1.4247, + "step": 6068 + }, + { + "epoch": 0.5506260206859009, + "grad_norm": 0.11969990685069029, + "learning_rate": 0.00044245893524727585, + "loss": 1.3979, + "step": 6069 + }, + { + "epoch": 0.5507167483215387, + "grad_norm": 0.11832858419512803, + "learning_rate": 0.00044231298694099767, + "loss": 1.4202, + "step": 6070 + }, + { + "epoch": 0.5508074759571766, + "grad_norm": 0.1286686647751539, + "learning_rate": 0.0004421670436160015, + "loss": 1.4223, + "step": 6071 + }, + { + "epoch": 0.5508982035928144, + "grad_norm": 0.11602887926787901, + "learning_rate": 0.0004420211052848897, + "loss": 1.3695, + "step": 6072 + }, + { + "epoch": 0.5509889312284522, + "grad_norm": 0.11282713046834232, + "learning_rate": 0.0004418751719602639, + "loss": 1.4137, + "step": 6073 + }, + { + "epoch": 0.55107965886409, + "grad_norm": 0.11929163517084662, + "learning_rate": 0.0004417292436547256, + "loss": 1.3875, + "step": 6074 + }, + { + "epoch": 0.5511703864997278, + "grad_norm": 0.2959525900126487, + "learning_rate": 0.0004415833203808756, + "loss": 1.4028, + "step": 6075 + }, + { + "epoch": 0.5512611141353656, + "grad_norm": 0.11586070442322563, + "learning_rate": 0.00044143740215131456, + "loss": 1.4091, + "step": 6076 + }, + { + "epoch": 0.5513518417710035, + "grad_norm": 0.12531589897794043, + "learning_rate": 0.00044129148897864264, + "loss": 1.3792, + "step": 6077 + }, + { + "epoch": 0.5514425694066413, + "grad_norm": 0.12166758017438309, + "learning_rate": 0.0004411455808754592, + "loss": 1.4108, + "step": 6078 + }, + { + "epoch": 0.551533297042279, + "grad_norm": 0.1176971644404456, + "learning_rate": 0.0004409996778543636, + "loss": 1.4006, + "step": 6079 + }, + { + "epoch": 0.5516240246779169, + "grad_norm": 0.11593913855044735, + "learning_rate": 0.0004408537799279544, + "loss": 1.4127, + "step": 6080 + }, + { + "epoch": 0.5517147523135547, + "grad_norm": 0.15026070309789272, + "learning_rate": 0.00044070788710883, + "loss": 1.3928, + "step": 6081 + }, + { + "epoch": 0.5518054799491925, + "grad_norm": 0.12370605485371917, + "learning_rate": 0.0004405619994095882, + "loss": 1.3758, + "step": 6082 + }, + { + "epoch": 0.5518962075848304, + "grad_norm": 0.1257370991198228, + "learning_rate": 0.0004404161168428266, + "loss": 1.3915, + "step": 6083 + }, + { + "epoch": 0.5519869352204682, + "grad_norm": 0.12956102451961277, + "learning_rate": 0.00044027023942114224, + "loss": 1.3963, + "step": 6084 + }, + { + "epoch": 0.5520776628561059, + "grad_norm": 0.11816965238091394, + "learning_rate": 0.0004401243671571314, + "loss": 1.3552, + "step": 6085 + }, + { + "epoch": 0.5521683904917438, + "grad_norm": 0.12113446203934657, + "learning_rate": 0.0004399785000633903, + "loss": 1.3762, + "step": 6086 + }, + { + "epoch": 0.5522591181273816, + "grad_norm": 0.11786644232476476, + "learning_rate": 0.00043983263815251464, + "loss": 1.3994, + "step": 6087 + }, + { + "epoch": 0.5523498457630194, + "grad_norm": 0.1407452000344956, + "learning_rate": 0.0004396867814370994, + "loss": 1.3835, + "step": 6088 + }, + { + "epoch": 0.5524405733986573, + "grad_norm": 0.1300275921376475, + "learning_rate": 0.00043954092992973964, + "loss": 1.3871, + "step": 6089 + }, + { + "epoch": 
0.552531301034295, + "grad_norm": 0.13104678044821413, + "learning_rate": 0.00043939508364302917, + "loss": 1.3838, + "step": 6090 + }, + { + "epoch": 0.5526220286699328, + "grad_norm": 0.11491726787119108, + "learning_rate": 0.00043924924258956243, + "loss": 1.3802, + "step": 6091 + }, + { + "epoch": 0.5527127563055707, + "grad_norm": 0.12392405318923981, + "learning_rate": 0.00043910340678193245, + "loss": 1.3865, + "step": 6092 + }, + { + "epoch": 0.5528034839412085, + "grad_norm": 0.1259743449374873, + "learning_rate": 0.0004389575762327323, + "loss": 1.4168, + "step": 6093 + }, + { + "epoch": 0.5528942115768463, + "grad_norm": 0.12328996303696202, + "learning_rate": 0.0004388117509545545, + "loss": 1.3913, + "step": 6094 + }, + { + "epoch": 0.5529849392124842, + "grad_norm": 0.12742463161984782, + "learning_rate": 0.000438665930959991, + "loss": 1.4308, + "step": 6095 + }, + { + "epoch": 0.5530756668481219, + "grad_norm": 0.11501160863521559, + "learning_rate": 0.0004385201162616334, + "loss": 1.3687, + "step": 6096 + }, + { + "epoch": 0.5531663944837597, + "grad_norm": 0.11325490035039226, + "learning_rate": 0.0004383743068720728, + "loss": 1.3601, + "step": 6097 + }, + { + "epoch": 0.5532571221193976, + "grad_norm": 0.11309311456180961, + "learning_rate": 0.00043822850280389973, + "loss": 1.4357, + "step": 6098 + }, + { + "epoch": 0.5533478497550354, + "grad_norm": 0.1198085555946516, + "learning_rate": 0.0004380827040697047, + "loss": 1.3904, + "step": 6099 + }, + { + "epoch": 0.5534385773906731, + "grad_norm": 0.12038920057179929, + "learning_rate": 0.00043793691068207726, + "loss": 1.383, + "step": 6100 + }, + { + "epoch": 0.553529305026311, + "grad_norm": 0.12128090100989024, + "learning_rate": 0.00043779112265360687, + "loss": 1.3671, + "step": 6101 + }, + { + "epoch": 0.5536200326619488, + "grad_norm": 0.1256976881025764, + "learning_rate": 0.00043764533999688206, + "loss": 1.3799, + "step": 6102 + }, + { + "epoch": 0.5537107602975866, + "grad_norm": 0.1313695236504338, + "learning_rate": 0.00043749956272449135, + "loss": 1.3925, + "step": 6103 + }, + { + "epoch": 0.5538014879332245, + "grad_norm": 0.11775547100427439, + "learning_rate": 0.0004373537908490227, + "loss": 1.3755, + "step": 6104 + }, + { + "epoch": 0.5538922155688623, + "grad_norm": 0.11690026474255998, + "learning_rate": 0.0004372080243830634, + "loss": 1.4059, + "step": 6105 + }, + { + "epoch": 0.5539829432045001, + "grad_norm": 0.11775289891512354, + "learning_rate": 0.00043706226333920027, + "loss": 1.3851, + "step": 6106 + }, + { + "epoch": 0.5540736708401379, + "grad_norm": 0.13237892765399803, + "learning_rate": 0.0004369165077300201, + "loss": 1.365, + "step": 6107 + }, + { + "epoch": 0.5541643984757757, + "grad_norm": 0.13052895968085432, + "learning_rate": 0.0004367707575681089, + "loss": 1.4295, + "step": 6108 + }, + { + "epoch": 0.5542551261114136, + "grad_norm": 0.11794995591575165, + "learning_rate": 0.000436625012866052, + "loss": 1.4064, + "step": 6109 + }, + { + "epoch": 0.5543458537470514, + "grad_norm": 0.11625971592287808, + "learning_rate": 0.00043647927363643456, + "loss": 1.3864, + "step": 6110 + }, + { + "epoch": 0.5544365813826891, + "grad_norm": 0.11212969015024543, + "learning_rate": 0.00043633353989184137, + "loss": 1.3804, + "step": 6111 + }, + { + "epoch": 0.554527309018327, + "grad_norm": 0.1318304137792488, + "learning_rate": 0.00043618781164485624, + "loss": 1.4045, + "step": 6112 + }, + { + "epoch": 0.5546180366539648, + "grad_norm": 0.1145133546958959, + "learning_rate": 
0.00043604208890806297, + "loss": 1.3808, + "step": 6113 + }, + { + "epoch": 0.5547087642896026, + "grad_norm": 0.12652577876482862, + "learning_rate": 0.00043589637169404465, + "loss": 1.4088, + "step": 6114 + }, + { + "epoch": 0.5547994919252405, + "grad_norm": 0.12310978928040921, + "learning_rate": 0.0004357506600153842, + "loss": 1.4142, + "step": 6115 + }, + { + "epoch": 0.5548902195608783, + "grad_norm": 0.12312490521686095, + "learning_rate": 0.00043560495388466383, + "loss": 1.3699, + "step": 6116 + }, + { + "epoch": 0.554980947196516, + "grad_norm": 0.17239293331787722, + "learning_rate": 0.00043545925331446507, + "loss": 1.3893, + "step": 6117 + }, + { + "epoch": 0.5550716748321539, + "grad_norm": 0.1259716493545865, + "learning_rate": 0.0004353135583173694, + "loss": 1.4409, + "step": 6118 + }, + { + "epoch": 0.5551624024677917, + "grad_norm": 0.11851835435326546, + "learning_rate": 0.00043516786890595747, + "loss": 1.3874, + "step": 6119 + }, + { + "epoch": 0.5552531301034295, + "grad_norm": 0.12628373707942336, + "learning_rate": 0.0004350221850928095, + "loss": 1.4103, + "step": 6120 + }, + { + "epoch": 0.5553438577390674, + "grad_norm": 0.1188265252276024, + "learning_rate": 0.0004348765068905054, + "loss": 1.4347, + "step": 6121 + }, + { + "epoch": 0.5554345853747051, + "grad_norm": 0.11471976229494284, + "learning_rate": 0.0004347308343116246, + "loss": 1.3802, + "step": 6122 + }, + { + "epoch": 0.5555253130103429, + "grad_norm": 0.14300131313379547, + "learning_rate": 0.000434585167368746, + "loss": 1.3449, + "step": 6123 + }, + { + "epoch": 0.5556160406459808, + "grad_norm": 0.16674823587203874, + "learning_rate": 0.0004344395060744477, + "loss": 1.3818, + "step": 6124 + }, + { + "epoch": 0.5557067682816186, + "grad_norm": 0.1278126306245782, + "learning_rate": 0.00043429385044130775, + "loss": 1.4361, + "step": 6125 + }, + { + "epoch": 0.5557974959172564, + "grad_norm": 0.1210751909311456, + "learning_rate": 0.00043414820048190357, + "loss": 1.395, + "step": 6126 + }, + { + "epoch": 0.5558882235528942, + "grad_norm": 0.14747941965939892, + "learning_rate": 0.00043400255620881193, + "loss": 1.3617, + "step": 6127 + }, + { + "epoch": 0.555978951188532, + "grad_norm": 0.11859823004743962, + "learning_rate": 0.00043385691763460935, + "loss": 1.3983, + "step": 6128 + }, + { + "epoch": 0.5560696788241698, + "grad_norm": 0.11399242703022279, + "learning_rate": 0.0004337112847718714, + "loss": 1.3815, + "step": 6129 + }, + { + "epoch": 0.5561604064598077, + "grad_norm": 0.10908409818416168, + "learning_rate": 0.000433565657633174, + "loss": 1.3639, + "step": 6130 + }, + { + "epoch": 0.5562511340954455, + "grad_norm": 0.11813779010402228, + "learning_rate": 0.0004334200362310918, + "loss": 1.3757, + "step": 6131 + }, + { + "epoch": 0.5563418617310832, + "grad_norm": 0.12197257380229949, + "learning_rate": 0.0004332744205781993, + "loss": 1.4113, + "step": 6132 + }, + { + "epoch": 0.5564325893667211, + "grad_norm": 0.16563027817985956, + "learning_rate": 0.00043312881068707055, + "loss": 1.369, + "step": 6133 + }, + { + "epoch": 0.5565233170023589, + "grad_norm": 0.11858648825455603, + "learning_rate": 0.0004329832065702788, + "loss": 1.414, + "step": 6134 + }, + { + "epoch": 0.5566140446379967, + "grad_norm": 0.11603147608120187, + "learning_rate": 0.0004328376082403969, + "loss": 1.4075, + "step": 6135 + }, + { + "epoch": 0.5567047722736346, + "grad_norm": 0.11718809633509611, + "learning_rate": 0.00043269201570999763, + "loss": 1.3978, + "step": 6136 + }, + { + "epoch": 
0.5567954999092724, + "grad_norm": 0.12247315925703563, + "learning_rate": 0.0004325464289916525, + "loss": 1.3629, + "step": 6137 + }, + { + "epoch": 0.5568862275449101, + "grad_norm": 0.11383923564009282, + "learning_rate": 0.00043240084809793324, + "loss": 1.379, + "step": 6138 + }, + { + "epoch": 0.556976955180548, + "grad_norm": 0.11466461800548912, + "learning_rate": 0.0004322552730414107, + "loss": 1.4148, + "step": 6139 + }, + { + "epoch": 0.5570676828161858, + "grad_norm": 0.13333046713052252, + "learning_rate": 0.0004321097038346554, + "loss": 1.3637, + "step": 6140 + }, + { + "epoch": 0.5571584104518236, + "grad_norm": 0.11612474280003583, + "learning_rate": 0.0004319641404902372, + "loss": 1.3693, + "step": 6141 + }, + { + "epoch": 0.5572491380874615, + "grad_norm": 0.11804454665324945, + "learning_rate": 0.0004318185830207254, + "loss": 1.3939, + "step": 6142 + }, + { + "epoch": 0.5573398657230992, + "grad_norm": 0.11355452022607462, + "learning_rate": 0.00043167303143868915, + "loss": 1.3767, + "step": 6143 + }, + { + "epoch": 0.5574305933587371, + "grad_norm": 0.11823576745344941, + "learning_rate": 0.00043152748575669655, + "loss": 1.3507, + "step": 6144 + }, + { + "epoch": 0.5575213209943749, + "grad_norm": 0.11801141323415594, + "learning_rate": 0.00043138194598731555, + "loss": 1.3551, + "step": 6145 + }, + { + "epoch": 0.5576120486300127, + "grad_norm": 0.12082016616734165, + "learning_rate": 0.0004312364121431138, + "loss": 1.3739, + "step": 6146 + }, + { + "epoch": 0.5577027762656506, + "grad_norm": 0.13291135483782038, + "learning_rate": 0.000431090884236658, + "loss": 1.4125, + "step": 6147 + }, + { + "epoch": 0.5577935039012883, + "grad_norm": 0.11590381399313728, + "learning_rate": 0.00043094536228051447, + "loss": 1.3329, + "step": 6148 + }, + { + "epoch": 0.5578842315369261, + "grad_norm": 0.12443626833847642, + "learning_rate": 0.00043079984628724904, + "loss": 1.3942, + "step": 6149 + }, + { + "epoch": 0.557974959172564, + "grad_norm": 0.12227753029217846, + "learning_rate": 0.0004306543362694272, + "loss": 1.4612, + "step": 6150 + }, + { + "epoch": 0.5580656868082018, + "grad_norm": 0.1274608391060863, + "learning_rate": 0.00043050883223961356, + "loss": 1.3832, + "step": 6151 + }, + { + "epoch": 0.5581564144438396, + "grad_norm": 0.12510941960140423, + "learning_rate": 0.00043036333421037255, + "loss": 1.42, + "step": 6152 + }, + { + "epoch": 0.5582471420794775, + "grad_norm": 0.12257074570635516, + "learning_rate": 0.00043021784219426766, + "loss": 1.3779, + "step": 6153 + }, + { + "epoch": 0.5583378697151152, + "grad_norm": 0.13593703372433785, + "learning_rate": 0.0004300723562038626, + "loss": 1.3807, + "step": 6154 + }, + { + "epoch": 0.558428597350753, + "grad_norm": 0.12388246773973002, + "learning_rate": 0.00042992687625171996, + "loss": 1.439, + "step": 6155 + }, + { + "epoch": 0.5585193249863909, + "grad_norm": 0.12199380034825823, + "learning_rate": 0.00042978140235040176, + "loss": 1.3956, + "step": 6156 + }, + { + "epoch": 0.5586100526220287, + "grad_norm": 0.12346157516485587, + "learning_rate": 0.0004296359345124699, + "loss": 1.3842, + "step": 6157 + }, + { + "epoch": 0.5587007802576665, + "grad_norm": 0.13918458152634625, + "learning_rate": 0.00042949047275048544, + "loss": 1.3961, + "step": 6158 + }, + { + "epoch": 0.5587915078933043, + "grad_norm": 0.11918339608710125, + "learning_rate": 0.00042934501707700895, + "loss": 1.3704, + "step": 6159 + }, + { + "epoch": 0.5588822355289421, + "grad_norm": 0.14401456984039407, + "learning_rate": 
0.0004291995675046006, + "loss": 1.4134, + "step": 6160 + }, + { + "epoch": 0.5589729631645799, + "grad_norm": 0.12210701192389972, + "learning_rate": 0.0004290541240458201, + "loss": 1.4047, + "step": 6161 + }, + { + "epoch": 0.5590636908002178, + "grad_norm": 0.12226394912782654, + "learning_rate": 0.0004289086867132266, + "loss": 1.417, + "step": 6162 + }, + { + "epoch": 0.5591544184358556, + "grad_norm": 0.11906820310447888, + "learning_rate": 0.00042876325551937836, + "loss": 1.3966, + "step": 6163 + }, + { + "epoch": 0.5592451460714933, + "grad_norm": 0.13036544795409868, + "learning_rate": 0.0004286178304768335, + "loss": 1.4242, + "step": 6164 + }, + { + "epoch": 0.5593358737071312, + "grad_norm": 0.16201014143118395, + "learning_rate": 0.0004284724115981496, + "loss": 1.4092, + "step": 6165 + }, + { + "epoch": 0.559426601342769, + "grad_norm": 0.12447056818459032, + "learning_rate": 0.0004283269988958834, + "loss": 1.3922, + "step": 6166 + }, + { + "epoch": 0.5595173289784068, + "grad_norm": 0.12676244468364267, + "learning_rate": 0.0004281815923825915, + "loss": 1.4078, + "step": 6167 + }, + { + "epoch": 0.5596080566140447, + "grad_norm": 0.12534619252787085, + "learning_rate": 0.00042803619207082937, + "loss": 1.3844, + "step": 6168 + }, + { + "epoch": 0.5596987842496824, + "grad_norm": 0.1196861704869179, + "learning_rate": 0.000427890797973153, + "loss": 1.3808, + "step": 6169 + }, + { + "epoch": 0.5597895118853202, + "grad_norm": 0.12038481169705885, + "learning_rate": 0.0004277454101021167, + "loss": 1.4022, + "step": 6170 + }, + { + "epoch": 0.5598802395209581, + "grad_norm": 0.1276858037978477, + "learning_rate": 0.00042760002847027495, + "loss": 1.3699, + "step": 6171 + }, + { + "epoch": 0.5599709671565959, + "grad_norm": 0.11720140461915236, + "learning_rate": 0.0004274546530901815, + "loss": 1.3967, + "step": 6172 + }, + { + "epoch": 0.5600616947922337, + "grad_norm": 0.12106070235297929, + "learning_rate": 0.00042730928397438926, + "loss": 1.4116, + "step": 6173 + }, + { + "epoch": 0.5601524224278716, + "grad_norm": 0.11281766495793082, + "learning_rate": 0.0004271639211354512, + "loss": 1.4067, + "step": 6174 + }, + { + "epoch": 0.5602431500635093, + "grad_norm": 0.12078540195301563, + "learning_rate": 0.0004270185645859193, + "loss": 1.3884, + "step": 6175 + }, + { + "epoch": 0.5603338776991471, + "grad_norm": 0.12180305260263556, + "learning_rate": 0.0004268732143383449, + "loss": 1.3871, + "step": 6176 + }, + { + "epoch": 0.560424605334785, + "grad_norm": 0.11034140280603408, + "learning_rate": 0.0004267278704052794, + "loss": 1.3765, + "step": 6177 + }, + { + "epoch": 0.5605153329704228, + "grad_norm": 0.12078357994148387, + "learning_rate": 0.000426582532799273, + "loss": 1.3927, + "step": 6178 + }, + { + "epoch": 0.5606060606060606, + "grad_norm": 0.11940297908042359, + "learning_rate": 0.00042643720153287583, + "loss": 1.4016, + "step": 6179 + }, + { + "epoch": 0.5606967882416984, + "grad_norm": 0.11398328874444623, + "learning_rate": 0.0004262918766186371, + "loss": 1.3838, + "step": 6180 + }, + { + "epoch": 0.5607875158773362, + "grad_norm": 0.1211856152052273, + "learning_rate": 0.00042614655806910565, + "loss": 1.3922, + "step": 6181 + }, + { + "epoch": 0.5608782435129741, + "grad_norm": 0.1316565600749469, + "learning_rate": 0.0004260012458968299, + "loss": 1.4141, + "step": 6182 + }, + { + "epoch": 0.5609689711486119, + "grad_norm": 0.12253925538154807, + "learning_rate": 0.00042585594011435734, + "loss": 1.3924, + "step": 6183 + }, + { + "epoch": 
0.5610596987842497, + "grad_norm": 0.11442227579493193, + "learning_rate": 0.0004257106407342352, + "loss": 1.3613, + "step": 6184 + }, + { + "epoch": 0.5611504264198875, + "grad_norm": 0.12353663792190173, + "learning_rate": 0.0004255653477690103, + "loss": 1.3666, + "step": 6185 + }, + { + "epoch": 0.5612411540555253, + "grad_norm": 0.12108736169868534, + "learning_rate": 0.00042542006123122867, + "loss": 1.4407, + "step": 6186 + }, + { + "epoch": 0.5613318816911631, + "grad_norm": 0.11649955863106741, + "learning_rate": 0.00042527478113343567, + "loss": 1.3502, + "step": 6187 + }, + { + "epoch": 0.561422609326801, + "grad_norm": 0.1210609879413711, + "learning_rate": 0.0004251295074881763, + "loss": 1.4064, + "step": 6188 + }, + { + "epoch": 0.5615133369624388, + "grad_norm": 0.11981518512259096, + "learning_rate": 0.0004249842403079952, + "loss": 1.3781, + "step": 6189 + }, + { + "epoch": 0.5616040645980765, + "grad_norm": 0.12314590053007174, + "learning_rate": 0.00042483897960543575, + "loss": 1.4027, + "step": 6190 + }, + { + "epoch": 0.5616947922337144, + "grad_norm": 0.14342602940967417, + "learning_rate": 0.00042469372539304155, + "loss": 1.3784, + "step": 6191 + }, + { + "epoch": 0.5617855198693522, + "grad_norm": 0.12405653333240248, + "learning_rate": 0.0004245484776833551, + "loss": 1.3785, + "step": 6192 + }, + { + "epoch": 0.56187624750499, + "grad_norm": 0.12413413377205122, + "learning_rate": 0.000424403236488919, + "loss": 1.3393, + "step": 6193 + }, + { + "epoch": 0.5619669751406279, + "grad_norm": 0.12745096798565825, + "learning_rate": 0.00042425800182227454, + "loss": 1.4207, + "step": 6194 + }, + { + "epoch": 0.5620577027762657, + "grad_norm": 0.13824870028295877, + "learning_rate": 0.0004241127736959627, + "loss": 1.378, + "step": 6195 + }, + { + "epoch": 0.5621484304119034, + "grad_norm": 0.12025079333594363, + "learning_rate": 0.00042396755212252416, + "loss": 1.4131, + "step": 6196 + }, + { + "epoch": 0.5622391580475413, + "grad_norm": 0.11814923310067824, + "learning_rate": 0.00042382233711449856, + "loss": 1.3586, + "step": 6197 + }, + { + "epoch": 0.5623298856831791, + "grad_norm": 0.11491992908308855, + "learning_rate": 0.00042367712868442544, + "loss": 1.4029, + "step": 6198 + }, + { + "epoch": 0.5624206133188169, + "grad_norm": 0.1326081832887636, + "learning_rate": 0.00042353192684484333, + "loss": 1.401, + "step": 6199 + }, + { + "epoch": 0.5625113409544548, + "grad_norm": 0.11799382759138663, + "learning_rate": 0.0004233867316082907, + "loss": 1.3791, + "step": 6200 + }, + { + "epoch": 0.5626020685900925, + "grad_norm": 0.11329020739752205, + "learning_rate": 0.0004232415429873053, + "loss": 1.3562, + "step": 6201 + }, + { + "epoch": 0.5626927962257303, + "grad_norm": 0.12352457525146864, + "learning_rate": 0.0004230963609944238, + "loss": 1.4036, + "step": 6202 + }, + { + "epoch": 0.5627835238613682, + "grad_norm": 0.11417316632463137, + "learning_rate": 0.0004229511856421828, + "loss": 1.4019, + "step": 6203 + }, + { + "epoch": 0.562874251497006, + "grad_norm": 0.12429894941745379, + "learning_rate": 0.0004228060169431185, + "loss": 1.3766, + "step": 6204 + }, + { + "epoch": 0.5629649791326438, + "grad_norm": 0.12052873177744929, + "learning_rate": 0.00042266085490976586, + "loss": 1.415, + "step": 6205 + }, + { + "epoch": 0.5630557067682816, + "grad_norm": 0.11486962450310681, + "learning_rate": 0.0004225156995546599, + "loss": 1.3794, + "step": 6206 + }, + { + "epoch": 0.5631464344039194, + "grad_norm": 0.12071267799163239, + "learning_rate": 
0.00042237055089033436, + "loss": 1.418, + "step": 6207 + }, + { + "epoch": 0.5632371620395572, + "grad_norm": 0.11966054646061029, + "learning_rate": 0.0004222254089293236, + "loss": 1.3939, + "step": 6208 + }, + { + "epoch": 0.5633278896751951, + "grad_norm": 0.11748551893382521, + "learning_rate": 0.00042208027368416, + "loss": 1.3865, + "step": 6209 + }, + { + "epoch": 0.5634186173108329, + "grad_norm": 0.11714003495671328, + "learning_rate": 0.00042193514516737635, + "loss": 1.3754, + "step": 6210 + }, + { + "epoch": 0.5635093449464706, + "grad_norm": 0.12268994605237189, + "learning_rate": 0.0004217900233915045, + "loss": 1.3897, + "step": 6211 + }, + { + "epoch": 0.5636000725821085, + "grad_norm": 0.11730572344243577, + "learning_rate": 0.0004216449083690756, + "loss": 1.3783, + "step": 6212 + }, + { + "epoch": 0.5636908002177463, + "grad_norm": 0.12524136178769363, + "learning_rate": 0.00042149980011262035, + "loss": 1.4116, + "step": 6213 + }, + { + "epoch": 0.5637815278533841, + "grad_norm": 0.11772492119919059, + "learning_rate": 0.00042135469863466903, + "loss": 1.3444, + "step": 6214 + }, + { + "epoch": 0.563872255489022, + "grad_norm": 0.1316493604002412, + "learning_rate": 0.0004212096039477509, + "loss": 1.3967, + "step": 6215 + }, + { + "epoch": 0.5639629831246598, + "grad_norm": 0.11764495702500545, + "learning_rate": 0.0004210645160643952, + "loss": 1.3592, + "step": 6216 + }, + { + "epoch": 0.5640537107602975, + "grad_norm": 0.127708363802044, + "learning_rate": 0.00042091943499713007, + "loss": 1.3494, + "step": 6217 + }, + { + "epoch": 0.5641444383959354, + "grad_norm": 0.12265650724191146, + "learning_rate": 0.00042077436075848364, + "loss": 1.3929, + "step": 6218 + }, + { + "epoch": 0.5642351660315732, + "grad_norm": 0.12392568001262219, + "learning_rate": 0.0004206292933609826, + "loss": 1.3979, + "step": 6219 + }, + { + "epoch": 0.5643258936672111, + "grad_norm": 0.1186090345150607, + "learning_rate": 0.00042048423281715384, + "loss": 1.3916, + "step": 6220 + }, + { + "epoch": 0.5644166213028489, + "grad_norm": 0.11982807632419823, + "learning_rate": 0.00042033917913952335, + "loss": 1.3794, + "step": 6221 + }, + { + "epoch": 0.5645073489384866, + "grad_norm": 0.12025370692776212, + "learning_rate": 0.0004201941323406164, + "loss": 1.3796, + "step": 6222 + }, + { + "epoch": 0.5645980765741245, + "grad_norm": 0.1277301533136631, + "learning_rate": 0.00042004909243295786, + "loss": 1.3763, + "step": 6223 + }, + { + "epoch": 0.5646888042097623, + "grad_norm": 0.14048423713880653, + "learning_rate": 0.000419904059429072, + "loss": 1.4111, + "step": 6224 + }, + { + "epoch": 0.5647795318454001, + "grad_norm": 0.12993394085604537, + "learning_rate": 0.00041975903334148253, + "loss": 1.4064, + "step": 6225 + }, + { + "epoch": 0.564870259481038, + "grad_norm": 0.11822960808915514, + "learning_rate": 0.0004196140141827122, + "loss": 1.4224, + "step": 6226 + }, + { + "epoch": 0.5649609871166758, + "grad_norm": 0.11374075074040288, + "learning_rate": 0.0004194690019652837, + "loss": 1.4352, + "step": 6227 + }, + { + "epoch": 0.5650517147523135, + "grad_norm": 0.1228400451213573, + "learning_rate": 0.00041932399670171875, + "loss": 1.4228, + "step": 6228 + }, + { + "epoch": 0.5651424423879514, + "grad_norm": 0.1187746631348867, + "learning_rate": 0.00041917899840453855, + "loss": 1.402, + "step": 6229 + }, + { + "epoch": 0.5652331700235892, + "grad_norm": 0.13528732729852422, + "learning_rate": 0.0004190340070862637, + "loss": 1.3962, + "step": 6230 + }, + { + "epoch": 
0.565323897659227, + "grad_norm": 0.12779501320577136, + "learning_rate": 0.00041888902275941423, + "loss": 1.3519, + "step": 6231 + }, + { + "epoch": 0.5654146252948649, + "grad_norm": 0.12119633341801053, + "learning_rate": 0.0004187440454365096, + "loss": 1.3958, + "step": 6232 + }, + { + "epoch": 0.5655053529305026, + "grad_norm": 0.11866021624433448, + "learning_rate": 0.0004185990751300688, + "loss": 1.3768, + "step": 6233 + }, + { + "epoch": 0.5655960805661404, + "grad_norm": 0.1294684939605254, + "learning_rate": 0.0004184541118526097, + "loss": 1.373, + "step": 6234 + }, + { + "epoch": 0.5656868082017783, + "grad_norm": 0.1206262391100753, + "learning_rate": 0.00041830915561665027, + "loss": 1.3999, + "step": 6235 + }, + { + "epoch": 0.5657775358374161, + "grad_norm": 0.12072250751970028, + "learning_rate": 0.0004181642064347071, + "loss": 1.3842, + "step": 6236 + }, + { + "epoch": 0.5658682634730539, + "grad_norm": 0.1232669433620162, + "learning_rate": 0.0004180192643192968, + "loss": 1.3974, + "step": 6237 + }, + { + "epoch": 0.5659589911086917, + "grad_norm": 0.11754688124600049, + "learning_rate": 0.000417874329282935, + "loss": 1.3882, + "step": 6238 + }, + { + "epoch": 0.5660497187443295, + "grad_norm": 0.11478395635176217, + "learning_rate": 0.0004177294013381371, + "loss": 1.3582, + "step": 6239 + }, + { + "epoch": 0.5661404463799673, + "grad_norm": 0.12803049383960474, + "learning_rate": 0.0004175844804974176, + "loss": 1.3759, + "step": 6240 + }, + { + "epoch": 0.5662311740156052, + "grad_norm": 0.10889859542609139, + "learning_rate": 0.0004174395667732902, + "loss": 1.3701, + "step": 6241 + }, + { + "epoch": 0.566321901651243, + "grad_norm": 0.1466964283653909, + "learning_rate": 0.00041729466017826846, + "loss": 1.4022, + "step": 6242 + }, + { + "epoch": 0.5664126292868807, + "grad_norm": 0.11326055970635568, + "learning_rate": 0.00041714976072486506, + "loss": 1.4038, + "step": 6243 + }, + { + "epoch": 0.5665033569225186, + "grad_norm": 0.11433689816199859, + "learning_rate": 0.0004170048684255919, + "loss": 1.405, + "step": 6244 + }, + { + "epoch": 0.5665940845581564, + "grad_norm": 0.108513587916973, + "learning_rate": 0.0004168599832929608, + "loss": 1.3847, + "step": 6245 + }, + { + "epoch": 0.5666848121937942, + "grad_norm": 0.12416732043519901, + "learning_rate": 0.00041671510533948194, + "loss": 1.4178, + "step": 6246 + }, + { + "epoch": 0.5667755398294321, + "grad_norm": 0.12201796393684493, + "learning_rate": 0.00041657023457766643, + "loss": 1.3776, + "step": 6247 + }, + { + "epoch": 0.5668662674650699, + "grad_norm": 0.123861872204571, + "learning_rate": 0.0004164253710200233, + "loss": 1.3769, + "step": 6248 + }, + { + "epoch": 0.5669569951007076, + "grad_norm": 0.11383423713070671, + "learning_rate": 0.0004162805146790617, + "loss": 1.4124, + "step": 6249 + }, + { + "epoch": 0.5670477227363455, + "grad_norm": 0.11602657150850183, + "learning_rate": 0.0004161356655672901, + "loss": 1.3919, + "step": 6250 + }, + { + "epoch": 0.5671384503719833, + "grad_norm": 0.13275156506830946, + "learning_rate": 0.0004159908236972161, + "loss": 1.3932, + "step": 6251 + }, + { + "epoch": 0.5672291780076211, + "grad_norm": 0.11859044857893868, + "learning_rate": 0.00041584598908134675, + "loss": 1.3634, + "step": 6252 + }, + { + "epoch": 0.567319905643259, + "grad_norm": 0.11574112818209907, + "learning_rate": 0.0004157011617321889, + "loss": 1.3828, + "step": 6253 + }, + { + "epoch": 0.5674106332788967, + "grad_norm": 0.11465651874749219, + "learning_rate": 
0.0004155563416622478, + "loss": 1.394, + "step": 6254 + }, + { + "epoch": 0.5675013609145345, + "grad_norm": 0.12220700949862813, + "learning_rate": 0.0004154115288840292, + "loss": 1.3935, + "step": 6255 + }, + { + "epoch": 0.5675920885501724, + "grad_norm": 0.11545660674612138, + "learning_rate": 0.00041526672341003757, + "loss": 1.3794, + "step": 6256 + }, + { + "epoch": 0.5676828161858102, + "grad_norm": 0.1065600160026564, + "learning_rate": 0.0004151219252527769, + "loss": 1.3923, + "step": 6257 + }, + { + "epoch": 0.5677735438214481, + "grad_norm": 0.11462317540911335, + "learning_rate": 0.0004149771344247505, + "loss": 1.3842, + "step": 6258 + }, + { + "epoch": 0.5678642714570858, + "grad_norm": 0.1154983506297668, + "learning_rate": 0.0004148323509384609, + "loss": 1.3849, + "step": 6259 + }, + { + "epoch": 0.5679549990927236, + "grad_norm": 0.1307004964866455, + "learning_rate": 0.0004146875748064106, + "loss": 1.3711, + "step": 6260 + }, + { + "epoch": 0.5680457267283615, + "grad_norm": 0.11720832580609754, + "learning_rate": 0.00041454280604110073, + "loss": 1.377, + "step": 6261 + }, + { + "epoch": 0.5681364543639993, + "grad_norm": 0.11145254278899201, + "learning_rate": 0.00041439804465503196, + "loss": 1.399, + "step": 6262 + }, + { + "epoch": 0.5682271819996371, + "grad_norm": 0.11903201317395372, + "learning_rate": 0.0004142532906607047, + "loss": 1.3852, + "step": 6263 + }, + { + "epoch": 0.568317909635275, + "grad_norm": 0.2152269346631098, + "learning_rate": 0.00041410854407061864, + "loss": 1.385, + "step": 6264 + }, + { + "epoch": 0.5684086372709127, + "grad_norm": 0.1182856275882994, + "learning_rate": 0.0004139638048972723, + "loss": 1.3735, + "step": 6265 + }, + { + "epoch": 0.5684993649065505, + "grad_norm": 0.1180524144820677, + "learning_rate": 0.00041381907315316414, + "loss": 1.3587, + "step": 6266 + }, + { + "epoch": 0.5685900925421884, + "grad_norm": 0.11498997283921829, + "learning_rate": 0.00041367434885079173, + "loss": 1.3651, + "step": 6267 + }, + { + "epoch": 0.5686808201778262, + "grad_norm": 0.11387883053916015, + "learning_rate": 0.00041352963200265197, + "loss": 1.3846, + "step": 6268 + }, + { + "epoch": 0.568771547813464, + "grad_norm": 0.111336720765397, + "learning_rate": 0.0004133849226212412, + "loss": 1.3769, + "step": 6269 + }, + { + "epoch": 0.5688622754491018, + "grad_norm": 0.12295104924840057, + "learning_rate": 0.0004132402207190551, + "loss": 1.391, + "step": 6270 + }, + { + "epoch": 0.5689530030847396, + "grad_norm": 0.1246678891398195, + "learning_rate": 0.00041309552630858877, + "loss": 1.3786, + "step": 6271 + }, + { + "epoch": 0.5690437307203774, + "grad_norm": 0.11512925588744667, + "learning_rate": 0.0004129508394023367, + "loss": 1.3718, + "step": 6272 + }, + { + "epoch": 0.5691344583560153, + "grad_norm": 0.10936912499014058, + "learning_rate": 0.0004128061600127923, + "loss": 1.3848, + "step": 6273 + }, + { + "epoch": 0.5692251859916531, + "grad_norm": 0.12621369870117383, + "learning_rate": 0.00041266148815244895, + "loss": 1.3849, + "step": 6274 + }, + { + "epoch": 0.5693159136272908, + "grad_norm": 0.12083315218449048, + "learning_rate": 0.00041251682383379896, + "loss": 1.4031, + "step": 6275 + }, + { + "epoch": 0.5694066412629287, + "grad_norm": 0.11454269607439585, + "learning_rate": 0.000412372167069334, + "loss": 1.4265, + "step": 6276 + }, + { + "epoch": 0.5694973688985665, + "grad_norm": 0.11343943838785715, + "learning_rate": 0.0004122275178715453, + "loss": 1.401, + "step": 6277 + }, + { + "epoch": 
0.5695880965342043, + "grad_norm": 0.11293878524512274, + "learning_rate": 0.00041208287625292344, + "loss": 1.4098, + "step": 6278 + }, + { + "epoch": 0.5696788241698422, + "grad_norm": 0.1448174393542097, + "learning_rate": 0.00041193824222595835, + "loss": 1.3373, + "step": 6279 + }, + { + "epoch": 0.56976955180548, + "grad_norm": 0.11809669250536275, + "learning_rate": 0.0004117936158031388, + "loss": 1.3537, + "step": 6280 + }, + { + "epoch": 0.5698602794411177, + "grad_norm": 0.1363759188916545, + "learning_rate": 0.0004116489969969536, + "loss": 1.3721, + "step": 6281 + }, + { + "epoch": 0.5699510070767556, + "grad_norm": 0.12998314278648543, + "learning_rate": 0.0004115043858198906, + "loss": 1.4023, + "step": 6282 + }, + { + "epoch": 0.5700417347123934, + "grad_norm": 0.11951872925317646, + "learning_rate": 0.00041135978228443686, + "loss": 1.4084, + "step": 6283 + }, + { + "epoch": 0.5701324623480312, + "grad_norm": 0.10974965463550349, + "learning_rate": 0.000411215186403079, + "loss": 1.4154, + "step": 6284 + }, + { + "epoch": 0.570223189983669, + "grad_norm": 0.11868756882893668, + "learning_rate": 0.00041107059818830274, + "loss": 1.434, + "step": 6285 + }, + { + "epoch": 0.5703139176193068, + "grad_norm": 0.11662233176731371, + "learning_rate": 0.0004109260176525936, + "loss": 1.3599, + "step": 6286 + }, + { + "epoch": 0.5704046452549446, + "grad_norm": 0.11984940300051149, + "learning_rate": 0.00041078144480843594, + "loss": 1.3739, + "step": 6287 + }, + { + "epoch": 0.5704953728905825, + "grad_norm": 0.11958520583124206, + "learning_rate": 0.00041063687966831365, + "loss": 1.4306, + "step": 6288 + }, + { + "epoch": 0.5705861005262203, + "grad_norm": 0.12013815118871587, + "learning_rate": 0.00041049232224471007, + "loss": 1.3703, + "step": 6289 + }, + { + "epoch": 0.570676828161858, + "grad_norm": 0.11925807035536912, + "learning_rate": 0.0004103477725501076, + "loss": 1.4086, + "step": 6290 + }, + { + "epoch": 0.5707675557974959, + "grad_norm": 0.11748756791856237, + "learning_rate": 0.00041020323059698805, + "loss": 1.4172, + "step": 6291 + }, + { + "epoch": 0.5708582834331337, + "grad_norm": 0.12266175209962528, + "learning_rate": 0.000410058696397833, + "loss": 1.4002, + "step": 6292 + }, + { + "epoch": 0.5709490110687715, + "grad_norm": 0.1173309100104933, + "learning_rate": 0.0004099141699651225, + "loss": 1.3973, + "step": 6293 + }, + { + "epoch": 0.5710397387044094, + "grad_norm": 0.12273000448421355, + "learning_rate": 0.0004097696513113368, + "loss": 1.4248, + "step": 6294 + }, + { + "epoch": 0.5711304663400472, + "grad_norm": 0.12853498283666695, + "learning_rate": 0.00040962514044895503, + "loss": 1.4064, + "step": 6295 + }, + { + "epoch": 0.571221193975685, + "grad_norm": 0.2299313105962539, + "learning_rate": 0.00040948063739045583, + "loss": 1.3639, + "step": 6296 + }, + { + "epoch": 0.5713119216113228, + "grad_norm": 0.11564072428362136, + "learning_rate": 0.0004093361421483168, + "loss": 1.3484, + "step": 6297 + }, + { + "epoch": 0.5714026492469606, + "grad_norm": 0.1165180619894407, + "learning_rate": 0.0004091916547350153, + "loss": 1.3778, + "step": 6298 + }, + { + "epoch": 0.5714933768825985, + "grad_norm": 0.11313377173878886, + "learning_rate": 0.00040904717516302786, + "loss": 1.4182, + "step": 6299 + }, + { + "epoch": 0.5715841045182363, + "grad_norm": 0.11233512494478003, + "learning_rate": 0.00040890270344483016, + "loss": 1.3685, + "step": 6300 + }, + { + "epoch": 0.571674832153874, + "grad_norm": 0.11985699817469553, + "learning_rate": 
0.00040875823959289734, + "loss": 1.4079, + "step": 6301 + }, + { + "epoch": 0.5717655597895119, + "grad_norm": 0.18297085161478438, + "learning_rate": 0.000408613783619704, + "loss": 1.3862, + "step": 6302 + }, + { + "epoch": 0.5718562874251497, + "grad_norm": 0.11337640542595721, + "learning_rate": 0.00040846933553772413, + "loss": 1.421, + "step": 6303 + }, + { + "epoch": 0.5719470150607875, + "grad_norm": 0.11642867340951141, + "learning_rate": 0.00040832489535943053, + "loss": 1.3657, + "step": 6304 + }, + { + "epoch": 0.5720377426964254, + "grad_norm": 0.13521965451552675, + "learning_rate": 0.0004081804630972956, + "loss": 1.3963, + "step": 6305 + }, + { + "epoch": 0.5721284703320632, + "grad_norm": 0.13122310966638775, + "learning_rate": 0.00040803603876379146, + "loss": 1.3496, + "step": 6306 + }, + { + "epoch": 0.5722191979677009, + "grad_norm": 0.10911056543862152, + "learning_rate": 0.00040789162237138876, + "loss": 1.3764, + "step": 6307 + }, + { + "epoch": 0.5723099256033388, + "grad_norm": 0.11836912397593305, + "learning_rate": 0.00040774721393255805, + "loss": 1.4087, + "step": 6308 + }, + { + "epoch": 0.5724006532389766, + "grad_norm": 0.11360542625170679, + "learning_rate": 0.0004076028134597689, + "loss": 1.3989, + "step": 6309 + }, + { + "epoch": 0.5724913808746144, + "grad_norm": 0.12829341823280388, + "learning_rate": 0.0004074584209654905, + "loss": 1.4039, + "step": 6310 + }, + { + "epoch": 0.5725821085102523, + "grad_norm": 0.12893147822808884, + "learning_rate": 0.0004073140364621912, + "loss": 1.3682, + "step": 6311 + }, + { + "epoch": 0.57267283614589, + "grad_norm": 0.1211431116215653, + "learning_rate": 0.00040716965996233844, + "loss": 1.4375, + "step": 6312 + }, + { + "epoch": 0.5727635637815278, + "grad_norm": 0.12104415016017973, + "learning_rate": 0.0004070252914783994, + "loss": 1.386, + "step": 6313 + }, + { + "epoch": 0.5728542914171657, + "grad_norm": 0.11598021742631277, + "learning_rate": 0.00040688093102283997, + "loss": 1.3853, + "step": 6314 + }, + { + "epoch": 0.5729450190528035, + "grad_norm": 0.11151953830063494, + "learning_rate": 0.00040673657860812587, + "loss": 1.3551, + "step": 6315 + }, + { + "epoch": 0.5730357466884413, + "grad_norm": 0.12430073775800196, + "learning_rate": 0.0004065922342467218, + "loss": 1.4152, + "step": 6316 + }, + { + "epoch": 0.5731264743240791, + "grad_norm": 0.11100659349193841, + "learning_rate": 0.0004064478979510922, + "loss": 1.3993, + "step": 6317 + }, + { + "epoch": 0.5732172019597169, + "grad_norm": 0.12689363747410745, + "learning_rate": 0.00040630356973370054, + "loss": 1.3948, + "step": 6318 + }, + { + "epoch": 0.5733079295953547, + "grad_norm": 0.11593631283837164, + "learning_rate": 0.00040615924960700934, + "loss": 1.3926, + "step": 6319 + }, + { + "epoch": 0.5733986572309926, + "grad_norm": 0.11493555074648419, + "learning_rate": 0.00040601493758348075, + "loss": 1.4081, + "step": 6320 + }, + { + "epoch": 0.5734893848666304, + "grad_norm": 0.1338806954687129, + "learning_rate": 0.0004058706336755763, + "loss": 1.3827, + "step": 6321 + }, + { + "epoch": 0.5735801125022681, + "grad_norm": 0.11802972806348429, + "learning_rate": 0.0004057263378957563, + "loss": 1.4412, + "step": 6322 + }, + { + "epoch": 0.573670840137906, + "grad_norm": 0.12532075103751109, + "learning_rate": 0.000405582050256481, + "loss": 1.3736, + "step": 6323 + }, + { + "epoch": 0.5737615677735438, + "grad_norm": 0.1283590157649146, + "learning_rate": 0.0004054377707702094, + "loss": 1.3786, + "step": 6324 + }, + { + "epoch": 
0.5738522954091816, + "grad_norm": 0.12237447557721438, + "learning_rate": 0.00040529349944940055, + "loss": 1.3794, + "step": 6325 + }, + { + "epoch": 0.5739430230448195, + "grad_norm": 0.11536304381087308, + "learning_rate": 0.00040514923630651173, + "loss": 1.4067, + "step": 6326 + }, + { + "epoch": 0.5740337506804573, + "grad_norm": 0.11514807497112108, + "learning_rate": 0.00040500498135400044, + "loss": 1.3451, + "step": 6327 + }, + { + "epoch": 0.574124478316095, + "grad_norm": 0.12241170227348398, + "learning_rate": 0.0004048607346043231, + "loss": 1.3522, + "step": 6328 + }, + { + "epoch": 0.5742152059517329, + "grad_norm": 0.12418854478459325, + "learning_rate": 0.0004047164960699352, + "loss": 1.36, + "step": 6329 + }, + { + "epoch": 0.5743059335873707, + "grad_norm": 0.12793241871239325, + "learning_rate": 0.000404572265763292, + "loss": 1.4026, + "step": 6330 + }, + { + "epoch": 0.5743966612230085, + "grad_norm": 0.15850855991670654, + "learning_rate": 0.0004044280436968477, + "loss": 1.382, + "step": 6331 + }, + { + "epoch": 0.5744873888586464, + "grad_norm": 0.12075728252029817, + "learning_rate": 0.0004042838298830558, + "loss": 1.3699, + "step": 6332 + }, + { + "epoch": 0.5745781164942841, + "grad_norm": 0.15170984439859644, + "learning_rate": 0.00040413962433436933, + "loss": 1.4085, + "step": 6333 + }, + { + "epoch": 0.574668844129922, + "grad_norm": 0.15929042526238124, + "learning_rate": 0.0004039954270632405, + "loss": 1.4367, + "step": 6334 + }, + { + "epoch": 0.5747595717655598, + "grad_norm": 0.13847658778474348, + "learning_rate": 0.00040385123808212074, + "loss": 1.416, + "step": 6335 + }, + { + "epoch": 0.5748502994011976, + "grad_norm": 0.1360306145014738, + "learning_rate": 0.0004037070574034607, + "loss": 1.3879, + "step": 6336 + }, + { + "epoch": 0.5749410270368355, + "grad_norm": 0.1363825769939347, + "learning_rate": 0.0004035628850397104, + "loss": 1.3794, + "step": 6337 + }, + { + "epoch": 0.5750317546724732, + "grad_norm": 0.16694287514032272, + "learning_rate": 0.0004034187210033193, + "loss": 1.4063, + "step": 6338 + }, + { + "epoch": 0.575122482308111, + "grad_norm": 0.11699183270071592, + "learning_rate": 0.0004032745653067358, + "loss": 1.4027, + "step": 6339 + }, + { + "epoch": 0.5752132099437489, + "grad_norm": 0.13962263097414296, + "learning_rate": 0.00040313041796240766, + "loss": 1.3986, + "step": 6340 + }, + { + "epoch": 0.5753039375793867, + "grad_norm": 0.12748769789370787, + "learning_rate": 0.0004029862789827824, + "loss": 1.3761, + "step": 6341 + }, + { + "epoch": 0.5753946652150245, + "grad_norm": 0.11743532425890363, + "learning_rate": 0.0004028421483803063, + "loss": 1.3693, + "step": 6342 + }, + { + "epoch": 0.5754853928506624, + "grad_norm": 0.13024179484791323, + "learning_rate": 0.0004026980261674249, + "loss": 1.4077, + "step": 6343 + }, + { + "epoch": 0.5755761204863001, + "grad_norm": 0.12427671509903233, + "learning_rate": 0.0004025539123565833, + "loss": 1.3767, + "step": 6344 + }, + { + "epoch": 0.5756668481219379, + "grad_norm": 0.12236776544823388, + "learning_rate": 0.00040240980696022586, + "loss": 1.4167, + "step": 6345 + }, + { + "epoch": 0.5757575757575758, + "grad_norm": 0.1252696472007967, + "learning_rate": 0.0004022657099907958, + "loss": 1.37, + "step": 6346 + }, + { + "epoch": 0.5758483033932136, + "grad_norm": 0.1271759452411779, + "learning_rate": 0.0004021216214607361, + "loss": 1.3728, + "step": 6347 + }, + { + "epoch": 0.5759390310288514, + "grad_norm": 0.1271892621844263, + "learning_rate": 
0.0004019775413824888, + "loss": 1.3774, + "step": 6348 + }, + { + "epoch": 0.5760297586644892, + "grad_norm": 0.11565359728146199, + "learning_rate": 0.0004018334697684952, + "loss": 1.3955, + "step": 6349 + }, + { + "epoch": 0.576120486300127, + "grad_norm": 0.1315576527336598, + "learning_rate": 0.0004016894066311962, + "loss": 1.3852, + "step": 6350 + }, + { + "epoch": 0.5762112139357648, + "grad_norm": 0.12771668326083643, + "learning_rate": 0.0004015453519830312, + "loss": 1.4164, + "step": 6351 + }, + { + "epoch": 0.5763019415714027, + "grad_norm": 0.11975543779717114, + "learning_rate": 0.0004014013058364398, + "loss": 1.3889, + "step": 6352 + }, + { + "epoch": 0.5763926692070405, + "grad_norm": 0.11508739448323052, + "learning_rate": 0.00040125726820385997, + "loss": 1.3987, + "step": 6353 + }, + { + "epoch": 0.5764833968426782, + "grad_norm": 0.13018769494096707, + "learning_rate": 0.00040111323909772966, + "loss": 1.3818, + "step": 6354 + }, + { + "epoch": 0.5765741244783161, + "grad_norm": 0.11521834504433112, + "learning_rate": 0.0004009692185304856, + "loss": 1.3993, + "step": 6355 + }, + { + "epoch": 0.5766648521139539, + "grad_norm": 0.11663440799872855, + "learning_rate": 0.0004008252065145642, + "loss": 1.4217, + "step": 6356 + }, + { + "epoch": 0.5767555797495917, + "grad_norm": 0.1195952452751724, + "learning_rate": 0.0004006812030624009, + "loss": 1.4089, + "step": 6357 + }, + { + "epoch": 0.5768463073852296, + "grad_norm": 0.11435766176892119, + "learning_rate": 0.0004005372081864303, + "loss": 1.3967, + "step": 6358 + }, + { + "epoch": 0.5769370350208674, + "grad_norm": 0.11762367714068322, + "learning_rate": 0.0004003932218990864, + "loss": 1.3788, + "step": 6359 + }, + { + "epoch": 0.5770277626565051, + "grad_norm": 0.11977014069485244, + "learning_rate": 0.00040024924421280263, + "loss": 1.4133, + "step": 6360 + }, + { + "epoch": 0.577118490292143, + "grad_norm": 0.11192052256284112, + "learning_rate": 0.00040010527514001114, + "loss": 1.3829, + "step": 6361 + }, + { + "epoch": 0.5772092179277808, + "grad_norm": 0.1130028426829911, + "learning_rate": 0.00039996131469314394, + "loss": 1.4, + "step": 6362 + }, + { + "epoch": 0.5772999455634186, + "grad_norm": 0.1470136588021409, + "learning_rate": 0.0003998173628846318, + "loss": 1.4021, + "step": 6363 + }, + { + "epoch": 0.5773906731990565, + "grad_norm": 0.12665608678926144, + "learning_rate": 0.00039967341972690543, + "loss": 1.3727, + "step": 6364 + }, + { + "epoch": 0.5774814008346942, + "grad_norm": 0.11788555686789745, + "learning_rate": 0.0003995294852323939, + "loss": 1.4067, + "step": 6365 + }, + { + "epoch": 0.577572128470332, + "grad_norm": 0.1271145956123905, + "learning_rate": 0.0003993855594135261, + "loss": 1.3761, + "step": 6366 + }, + { + "epoch": 0.5776628561059699, + "grad_norm": 0.1180629387842581, + "learning_rate": 0.0003992416422827302, + "loss": 1.3911, + "step": 6367 + }, + { + "epoch": 0.5777535837416077, + "grad_norm": 0.11940447039755185, + "learning_rate": 0.00039909773385243325, + "loss": 1.3907, + "step": 6368 + }, + { + "epoch": 0.5778443113772455, + "grad_norm": 0.11498419849428423, + "learning_rate": 0.00039895383413506187, + "loss": 1.3691, + "step": 6369 + }, + { + "epoch": 0.5779350390128833, + "grad_norm": 0.11968438683212262, + "learning_rate": 0.0003988099431430419, + "loss": 1.3783, + "step": 6370 + }, + { + "epoch": 0.5780257666485211, + "grad_norm": 0.11956328115283152, + "learning_rate": 0.00039866606088879796, + "loss": 1.4171, + "step": 6371 + }, + { + "epoch": 
0.5781164942841589, + "grad_norm": 0.1147081246625279, + "learning_rate": 0.00039852218738475474, + "loss": 1.3489, + "step": 6372 + }, + { + "epoch": 0.5782072219197968, + "grad_norm": 0.2255975565936393, + "learning_rate": 0.0003983783226433357, + "loss": 1.3827, + "step": 6373 + }, + { + "epoch": 0.5782979495554346, + "grad_norm": 0.11663929462382579, + "learning_rate": 0.0003982344666769635, + "loss": 1.4249, + "step": 6374 + }, + { + "epoch": 0.5783886771910725, + "grad_norm": 0.13026545247815058, + "learning_rate": 0.00039809061949806, + "loss": 1.3834, + "step": 6375 + }, + { + "epoch": 0.5784794048267102, + "grad_norm": 0.12013588605474564, + "learning_rate": 0.0003979467811190465, + "loss": 1.4037, + "step": 6376 + }, + { + "epoch": 0.578570132462348, + "grad_norm": 0.12312376600758163, + "learning_rate": 0.00039780295155234364, + "loss": 1.4219, + "step": 6377 + }, + { + "epoch": 0.5786608600979859, + "grad_norm": 0.11574394663567175, + "learning_rate": 0.00039765913081037086, + "loss": 1.3733, + "step": 6378 + }, + { + "epoch": 0.5787515877336237, + "grad_norm": 0.11170700154168739, + "learning_rate": 0.0003975153189055471, + "loss": 1.39, + "step": 6379 + }, + { + "epoch": 0.5788423153692615, + "grad_norm": 0.11901656877442034, + "learning_rate": 0.00039737151585029075, + "loss": 1.3524, + "step": 6380 + }, + { + "epoch": 0.5789330430048993, + "grad_norm": 0.1214289695477794, + "learning_rate": 0.00039722772165701924, + "loss": 1.429, + "step": 6381 + }, + { + "epoch": 0.5790237706405371, + "grad_norm": 0.11611586640932649, + "learning_rate": 0.00039708393633814906, + "loss": 1.3532, + "step": 6382 + }, + { + "epoch": 0.5791144982761749, + "grad_norm": 0.15252369496404808, + "learning_rate": 0.0003969401599060961, + "loss": 1.3876, + "step": 6383 + }, + { + "epoch": 0.5792052259118128, + "grad_norm": 0.12344238618768774, + "learning_rate": 0.0003967963923732756, + "loss": 1.4027, + "step": 6384 + }, + { + "epoch": 0.5792959535474506, + "grad_norm": 0.11694458447675514, + "learning_rate": 0.0003966526337521018, + "loss": 1.4039, + "step": 6385 + }, + { + "epoch": 0.5793866811830883, + "grad_norm": 0.11274626583607641, + "learning_rate": 0.00039650888405498824, + "loss": 1.3908, + "step": 6386 + }, + { + "epoch": 0.5794774088187262, + "grad_norm": 0.1250232412992073, + "learning_rate": 0.00039636514329434777, + "loss": 1.3838, + "step": 6387 + }, + { + "epoch": 0.579568136454364, + "grad_norm": 0.12633379019162536, + "learning_rate": 0.0003962214114825925, + "loss": 1.3931, + "step": 6388 + }, + { + "epoch": 0.5796588640900018, + "grad_norm": 0.1168477206154575, + "learning_rate": 0.00039607768863213373, + "loss": 1.4, + "step": 6389 + }, + { + "epoch": 0.5797495917256397, + "grad_norm": 0.14366295775155707, + "learning_rate": 0.0003959339747553818, + "loss": 1.3902, + "step": 6390 + }, + { + "epoch": 0.5798403193612774, + "grad_norm": 0.12481841121543799, + "learning_rate": 0.0003957902698647465, + "loss": 1.4058, + "step": 6391 + }, + { + "epoch": 0.5799310469969152, + "grad_norm": 0.12257429585715858, + "learning_rate": 0.00039564657397263677, + "loss": 1.4072, + "step": 6392 + }, + { + "epoch": 0.5800217746325531, + "grad_norm": 0.11512760991503171, + "learning_rate": 0.0003955028870914607, + "loss": 1.3722, + "step": 6393 + }, + { + "epoch": 0.5801125022681909, + "grad_norm": 0.12411222982736471, + "learning_rate": 0.00039535920923362567, + "loss": 1.3645, + "step": 6394 + }, + { + "epoch": 0.5802032299038287, + "grad_norm": 0.12265919888244267, + "learning_rate": 
0.0003952155404115384, + "loss": 1.4138, + "step": 6395 + }, + { + "epoch": 0.5802939575394666, + "grad_norm": 0.11994122164028709, + "learning_rate": 0.0003950718806376048, + "loss": 1.3709, + "step": 6396 + }, + { + "epoch": 0.5803846851751043, + "grad_norm": 0.11796965934863173, + "learning_rate": 0.0003949282299242296, + "loss": 1.401, + "step": 6397 + }, + { + "epoch": 0.5804754128107421, + "grad_norm": 0.12296582549645099, + "learning_rate": 0.0003947845882838173, + "loss": 1.3902, + "step": 6398 + }, + { + "epoch": 0.58056614044638, + "grad_norm": 0.12920248229847528, + "learning_rate": 0.0003946409557287714, + "loss": 1.3958, + "step": 6399 + }, + { + "epoch": 0.5806568680820178, + "grad_norm": 0.11856249719052396, + "learning_rate": 0.00039449733227149445, + "loss": 1.3714, + "step": 6400 + }, + { + "epoch": 0.5807475957176556, + "grad_norm": 0.12685008055353592, + "learning_rate": 0.0003943537179243883, + "loss": 1.3905, + "step": 6401 + }, + { + "epoch": 0.5808383233532934, + "grad_norm": 0.12307393451144394, + "learning_rate": 0.0003942101126998541, + "loss": 1.3555, + "step": 6402 + }, + { + "epoch": 0.5809290509889312, + "grad_norm": 0.1211392176764358, + "learning_rate": 0.00039406651661029246, + "loss": 1.351, + "step": 6403 + }, + { + "epoch": 0.581019778624569, + "grad_norm": 0.12150050138500645, + "learning_rate": 0.00039392292966810265, + "loss": 1.3969, + "step": 6404 + }, + { + "epoch": 0.5811105062602069, + "grad_norm": 0.12386672658038603, + "learning_rate": 0.00039377935188568354, + "loss": 1.375, + "step": 6405 + }, + { + "epoch": 0.5812012338958447, + "grad_norm": 0.1481671113939756, + "learning_rate": 0.00039363578327543316, + "loss": 1.3818, + "step": 6406 + }, + { + "epoch": 0.5812919615314824, + "grad_norm": 0.2112686763926864, + "learning_rate": 0.00039349222384974837, + "loss": 1.3947, + "step": 6407 + }, + { + "epoch": 0.5813826891671203, + "grad_norm": 0.1229370930295611, + "learning_rate": 0.0003933486736210258, + "loss": 1.3752, + "step": 6408 + }, + { + "epoch": 0.5814734168027581, + "grad_norm": 0.11623435241798993, + "learning_rate": 0.0003932051326016611, + "loss": 1.3944, + "step": 6409 + }, + { + "epoch": 0.5815641444383959, + "grad_norm": 0.12386116470088579, + "learning_rate": 0.0003930616008040487, + "loss": 1.4181, + "step": 6410 + }, + { + "epoch": 0.5816548720740338, + "grad_norm": 0.12057485588026848, + "learning_rate": 0.000392918078240583, + "loss": 1.4204, + "step": 6411 + }, + { + "epoch": 0.5817455997096715, + "grad_norm": 0.12201671975016215, + "learning_rate": 0.00039277456492365693, + "loss": 1.4013, + "step": 6412 + }, + { + "epoch": 0.5818363273453094, + "grad_norm": 0.1183977169579713, + "learning_rate": 0.00039263106086566315, + "loss": 1.3856, + "step": 6413 + }, + { + "epoch": 0.5819270549809472, + "grad_norm": 0.11921338294541904, + "learning_rate": 0.00039248756607899294, + "loss": 1.3575, + "step": 6414 + }, + { + "epoch": 0.582017782616585, + "grad_norm": 0.12834857899502472, + "learning_rate": 0.00039234408057603725, + "loss": 1.3901, + "step": 6415 + }, + { + "epoch": 0.5821085102522229, + "grad_norm": 0.11709075259866358, + "learning_rate": 0.0003922006043691862, + "loss": 1.3868, + "step": 6416 + }, + { + "epoch": 0.5821992378878607, + "grad_norm": 0.14279765159531202, + "learning_rate": 0.0003920571374708288, + "loss": 1.3672, + "step": 6417 + }, + { + "epoch": 0.5822899655234984, + "grad_norm": 0.11773932562242852, + "learning_rate": 0.00039191367989335323, + "loss": 1.3983, + "step": 6418 + }, + { + "epoch": 
0.5823806931591363, + "grad_norm": 0.12062526766181625, + "learning_rate": 0.0003917702316491476, + "loss": 1.3487, + "step": 6419 + }, + { + "epoch": 0.5824714207947741, + "grad_norm": 0.12688036230650054, + "learning_rate": 0.0003916267927505984, + "loss": 1.3884, + "step": 6420 + }, + { + "epoch": 0.5825621484304119, + "grad_norm": 0.11411717074149012, + "learning_rate": 0.0003914833632100916, + "loss": 1.3973, + "step": 6421 + }, + { + "epoch": 0.5826528760660498, + "grad_norm": 0.12885107288770853, + "learning_rate": 0.00039133994304001233, + "loss": 1.4259, + "step": 6422 + }, + { + "epoch": 0.5827436037016875, + "grad_norm": 0.11498370128729381, + "learning_rate": 0.00039119653225274515, + "loss": 1.3963, + "step": 6423 + }, + { + "epoch": 0.5828343313373253, + "grad_norm": 0.13907170546626707, + "learning_rate": 0.0003910531308606733, + "loss": 1.3471, + "step": 6424 + }, + { + "epoch": 0.5829250589729632, + "grad_norm": 0.12415587290422117, + "learning_rate": 0.00039090973887617964, + "loss": 1.417, + "step": 6425 + }, + { + "epoch": 0.583015786608601, + "grad_norm": 0.12019783443826022, + "learning_rate": 0.000390766356311646, + "loss": 1.3903, + "step": 6426 + }, + { + "epoch": 0.5831065142442388, + "grad_norm": 0.128805519359616, + "learning_rate": 0.0003906229831794538, + "loss": 1.3891, + "step": 6427 + }, + { + "epoch": 0.5831972418798766, + "grad_norm": 0.18941057656544072, + "learning_rate": 0.00039047961949198316, + "loss": 1.378, + "step": 6428 + }, + { + "epoch": 0.5832879695155144, + "grad_norm": 0.14520769977538936, + "learning_rate": 0.00039033626526161337, + "loss": 1.399, + "step": 6429 + }, + { + "epoch": 0.5833786971511522, + "grad_norm": 0.1440903916404855, + "learning_rate": 0.00039019292050072326, + "loss": 1.3777, + "step": 6430 + }, + { + "epoch": 0.5834694247867901, + "grad_norm": 0.12218969079644423, + "learning_rate": 0.0003900495852216908, + "loss": 1.3801, + "step": 6431 + }, + { + "epoch": 0.5835601524224279, + "grad_norm": 0.13254030488248986, + "learning_rate": 0.00038990625943689274, + "loss": 1.3592, + "step": 6432 + }, + { + "epoch": 0.5836508800580656, + "grad_norm": 0.11746076236592412, + "learning_rate": 0.00038976294315870526, + "loss": 1.3272, + "step": 6433 + }, + { + "epoch": 0.5837416076937035, + "grad_norm": 0.12015829033434876, + "learning_rate": 0.00038961963639950396, + "loss": 1.3607, + "step": 6434 + }, + { + "epoch": 0.5838323353293413, + "grad_norm": 0.12086399224258992, + "learning_rate": 0.00038947633917166345, + "loss": 1.3585, + "step": 6435 + }, + { + "epoch": 0.5839230629649791, + "grad_norm": 0.11879162593166669, + "learning_rate": 0.00038933305148755734, + "loss": 1.3957, + "step": 6436 + }, + { + "epoch": 0.584013790600617, + "grad_norm": 0.11793724309199358, + "learning_rate": 0.0003891897733595585, + "loss": 1.387, + "step": 6437 + }, + { + "epoch": 0.5841045182362548, + "grad_norm": 0.13388276094032808, + "learning_rate": 0.0003890465048000391, + "loss": 1.3705, + "step": 6438 + }, + { + "epoch": 0.5841952458718925, + "grad_norm": 0.12825917931679148, + "learning_rate": 0.00038890324582137027, + "loss": 1.3583, + "step": 6439 + }, + { + "epoch": 0.5842859735075304, + "grad_norm": 0.11964351291107207, + "learning_rate": 0.00038875999643592257, + "loss": 1.3737, + "step": 6440 + }, + { + "epoch": 0.5843767011431682, + "grad_norm": 0.11909289101448611, + "learning_rate": 0.0003886167566560655, + "loss": 1.4111, + "step": 6441 + }, + { + "epoch": 0.584467428778806, + "grad_norm": 0.12458268227478356, + "learning_rate": 
0.00038847352649416805, + "loss": 1.4062, + "step": 6442 + }, + { + "epoch": 0.5845581564144439, + "grad_norm": 0.12457240927469346, + "learning_rate": 0.000388330305962598, + "loss": 1.3661, + "step": 6443 + }, + { + "epoch": 0.5846488840500816, + "grad_norm": 0.11568519712296348, + "learning_rate": 0.0003881870950737224, + "loss": 1.3658, + "step": 6444 + }, + { + "epoch": 0.5847396116857194, + "grad_norm": 0.12734713908505707, + "learning_rate": 0.00038804389383990777, + "loss": 1.4232, + "step": 6445 + }, + { + "epoch": 0.5848303393213573, + "grad_norm": 0.11009608507707788, + "learning_rate": 0.00038790070227351934, + "loss": 1.3623, + "step": 6446 + }, + { + "epoch": 0.5849210669569951, + "grad_norm": 0.11511945572080165, + "learning_rate": 0.00038775752038692167, + "loss": 1.3944, + "step": 6447 + }, + { + "epoch": 0.5850117945926329, + "grad_norm": 0.11987326693580937, + "learning_rate": 0.00038761434819247886, + "loss": 1.4083, + "step": 6448 + }, + { + "epoch": 0.5851025222282707, + "grad_norm": 0.12630297473116026, + "learning_rate": 0.00038747118570255345, + "loss": 1.4073, + "step": 6449 + }, + { + "epoch": 0.5851932498639085, + "grad_norm": 0.1191125962068772, + "learning_rate": 0.0003873280329295078, + "loss": 1.3677, + "step": 6450 + }, + { + "epoch": 0.5852839774995464, + "grad_norm": 0.11535343875809297, + "learning_rate": 0.0003871848898857031, + "loss": 1.3998, + "step": 6451 + }, + { + "epoch": 0.5853747051351842, + "grad_norm": 0.11334461537852475, + "learning_rate": 0.0003870417565834999, + "loss": 1.3551, + "step": 6452 + }, + { + "epoch": 0.585465432770822, + "grad_norm": 0.12570212727742183, + "learning_rate": 0.0003868986330352576, + "loss": 1.3967, + "step": 6453 + }, + { + "epoch": 0.5855561604064599, + "grad_norm": 0.11582204471271358, + "learning_rate": 0.000386755519253335, + "loss": 1.3908, + "step": 6454 + }, + { + "epoch": 0.5856468880420976, + "grad_norm": 0.11088717648292659, + "learning_rate": 0.0003866124152500901, + "loss": 1.358, + "step": 6455 + }, + { + "epoch": 0.5857376156777354, + "grad_norm": 0.12067538344991438, + "learning_rate": 0.0003864693210378798, + "loss": 1.4138, + "step": 6456 + }, + { + "epoch": 0.5858283433133733, + "grad_norm": 0.11856316347105006, + "learning_rate": 0.0003863262366290602, + "loss": 1.3436, + "step": 6457 + }, + { + "epoch": 0.5859190709490111, + "grad_norm": 0.12067179483439894, + "learning_rate": 0.0003861831620359869, + "loss": 1.3545, + "step": 6458 + }, + { + "epoch": 0.5860097985846489, + "grad_norm": 0.15288822764008986, + "learning_rate": 0.00038604009727101447, + "loss": 1.4375, + "step": 6459 + }, + { + "epoch": 0.5861005262202867, + "grad_norm": 0.15409319577406114, + "learning_rate": 0.0003858970423464963, + "loss": 1.4173, + "step": 6460 + }, + { + "epoch": 0.5861912538559245, + "grad_norm": 0.1153872526658302, + "learning_rate": 0.0003857539972747854, + "loss": 1.3161, + "step": 6461 + }, + { + "epoch": 0.5862819814915623, + "grad_norm": 0.11880942031368973, + "learning_rate": 0.0003856109620682337, + "loss": 1.3462, + "step": 6462 + }, + { + "epoch": 0.5863727091272002, + "grad_norm": 0.13072537348163646, + "learning_rate": 0.00038546793673919225, + "loss": 1.3984, + "step": 6463 + }, + { + "epoch": 0.586463436762838, + "grad_norm": 0.11962115286883387, + "learning_rate": 0.0003853249213000113, + "loss": 1.4156, + "step": 6464 + }, + { + "epoch": 0.5865541643984757, + "grad_norm": 0.12473852289194036, + "learning_rate": 0.00038518191576304017, + "loss": 1.3853, + "step": 6465 + }, + { + "epoch": 
0.5866448920341136, + "grad_norm": 0.12099607389321483, + "learning_rate": 0.0003850389201406277, + "loss": 1.4026, + "step": 6466 + }, + { + "epoch": 0.5867356196697514, + "grad_norm": 0.130676476707341, + "learning_rate": 0.0003848959344451215, + "loss": 1.4353, + "step": 6467 + }, + { + "epoch": 0.5868263473053892, + "grad_norm": 0.12459113769722215, + "learning_rate": 0.0003847529586888683, + "loss": 1.3912, + "step": 6468 + }, + { + "epoch": 0.5869170749410271, + "grad_norm": 0.12103672656124038, + "learning_rate": 0.000384609992884214, + "loss": 1.4023, + "step": 6469 + }, + { + "epoch": 0.5870078025766648, + "grad_norm": 0.13194121632459824, + "learning_rate": 0.00038446703704350394, + "loss": 1.341, + "step": 6470 + }, + { + "epoch": 0.5870985302123026, + "grad_norm": 0.11288730201585599, + "learning_rate": 0.0003843240911790822, + "loss": 1.3777, + "step": 6471 + }, + { + "epoch": 0.5871892578479405, + "grad_norm": 0.12436727117771414, + "learning_rate": 0.0003841811553032921, + "loss": 1.4105, + "step": 6472 + }, + { + "epoch": 0.5872799854835783, + "grad_norm": 0.16280885682890342, + "learning_rate": 0.0003840382294284764, + "loss": 1.4579, + "step": 6473 + }, + { + "epoch": 0.5873707131192161, + "grad_norm": 0.11291725391578425, + "learning_rate": 0.0003838953135669768, + "loss": 1.3694, + "step": 6474 + }, + { + "epoch": 0.587461440754854, + "grad_norm": 0.12083592148680723, + "learning_rate": 0.00038375240773113386, + "loss": 1.3963, + "step": 6475 + }, + { + "epoch": 0.5875521683904917, + "grad_norm": 0.12082673460599665, + "learning_rate": 0.0003836095119332876, + "loss": 1.3866, + "step": 6476 + }, + { + "epoch": 0.5876428960261295, + "grad_norm": 0.14177496822740734, + "learning_rate": 0.0003834666261857774, + "loss": 1.3758, + "step": 6477 + }, + { + "epoch": 0.5877336236617674, + "grad_norm": 0.13132988778260307, + "learning_rate": 0.000383323750500941, + "loss": 1.409, + "step": 6478 + }, + { + "epoch": 0.5878243512974052, + "grad_norm": 0.18637545489883783, + "learning_rate": 0.0003831808848911159, + "loss": 1.368, + "step": 6479 + }, + { + "epoch": 0.587915078933043, + "grad_norm": 0.1237439937084427, + "learning_rate": 0.0003830380293686386, + "loss": 1.3939, + "step": 6480 + }, + { + "epoch": 0.5880058065686808, + "grad_norm": 0.12034714631993725, + "learning_rate": 0.00038289518394584484, + "loss": 1.3732, + "step": 6481 + }, + { + "epoch": 0.5880965342043186, + "grad_norm": 0.1933727907191793, + "learning_rate": 0.00038275234863506915, + "loss": 1.4357, + "step": 6482 + }, + { + "epoch": 0.5881872618399564, + "grad_norm": 0.1175405396353176, + "learning_rate": 0.00038260952344864533, + "loss": 1.3981, + "step": 6483 + }, + { + "epoch": 0.5882779894755943, + "grad_norm": 0.5359470369064833, + "learning_rate": 0.0003824667083989067, + "loss": 1.4024, + "step": 6484 + }, + { + "epoch": 0.5883687171112321, + "grad_norm": 0.12208135889486856, + "learning_rate": 0.00038232390349818496, + "loss": 1.3962, + "step": 6485 + }, + { + "epoch": 0.5884594447468698, + "grad_norm": 0.12613507700670007, + "learning_rate": 0.0003821811087588115, + "loss": 1.3831, + "step": 6486 + }, + { + "epoch": 0.5885501723825077, + "grad_norm": 0.12388057952905177, + "learning_rate": 0.0003820383241931168, + "loss": 1.3701, + "step": 6487 + }, + { + "epoch": 0.5886409000181455, + "grad_norm": 0.14252539422310082, + "learning_rate": 0.0003818955498134299, + "loss": 1.4112, + "step": 6488 + }, + { + "epoch": 0.5887316276537834, + "grad_norm": 0.12032348294917224, + "learning_rate": 
0.0003817527856320798, + "loss": 1.3731, + "step": 6489 + }, + { + "epoch": 0.5888223552894212, + "grad_norm": 0.12416615887502118, + "learning_rate": 0.0003816100316613942, + "loss": 1.364, + "step": 6490 + }, + { + "epoch": 0.588913082925059, + "grad_norm": 0.12406679481324306, + "learning_rate": 0.00038146728791369985, + "loss": 1.3955, + "step": 6491 + }, + { + "epoch": 0.5890038105606968, + "grad_norm": 0.12550693103236815, + "learning_rate": 0.00038132455440132254, + "loss": 1.4019, + "step": 6492 + }, + { + "epoch": 0.5890945381963346, + "grad_norm": 0.1274284977870676, + "learning_rate": 0.0003811818311365874, + "loss": 1.3569, + "step": 6493 + }, + { + "epoch": 0.5891852658319724, + "grad_norm": 0.12884722801278062, + "learning_rate": 0.00038103911813181884, + "loss": 1.3804, + "step": 6494 + }, + { + "epoch": 0.5892759934676103, + "grad_norm": 0.1274648330124796, + "learning_rate": 0.00038089641539933985, + "loss": 1.373, + "step": 6495 + }, + { + "epoch": 0.5893667211032481, + "grad_norm": 0.12973410433126664, + "learning_rate": 0.00038075372295147283, + "loss": 1.4208, + "step": 6496 + }, + { + "epoch": 0.5894574487388858, + "grad_norm": 0.1210122302495049, + "learning_rate": 0.0003806110408005395, + "loss": 1.4009, + "step": 6497 + }, + { + "epoch": 0.5895481763745237, + "grad_norm": 0.11937808468021033, + "learning_rate": 0.0003804683689588605, + "loss": 1.3796, + "step": 6498 + }, + { + "epoch": 0.5896389040101615, + "grad_norm": 0.131647089014714, + "learning_rate": 0.00038032570743875535, + "loss": 1.3516, + "step": 6499 + }, + { + "epoch": 0.5897296316457993, + "grad_norm": 0.13988314995835618, + "learning_rate": 0.000380183056252543, + "loss": 1.3662, + "step": 6500 + }, + { + "epoch": 0.5898203592814372, + "grad_norm": 0.1176593547159085, + "learning_rate": 0.00038004041541254155, + "loss": 1.3967, + "step": 6501 + }, + { + "epoch": 0.589911086917075, + "grad_norm": 0.11721080328024096, + "learning_rate": 0.00037989778493106776, + "loss": 1.3853, + "step": 6502 + }, + { + "epoch": 0.5900018145527127, + "grad_norm": 0.12954035714082782, + "learning_rate": 0.0003797551648204379, + "loss": 1.369, + "step": 6503 + }, + { + "epoch": 0.5900925421883506, + "grad_norm": 0.11253704449110816, + "learning_rate": 0.00037961255509296723, + "loss": 1.376, + "step": 6504 + }, + { + "epoch": 0.5901832698239884, + "grad_norm": 0.12187314321028124, + "learning_rate": 0.00037946995576097027, + "loss": 1.367, + "step": 6505 + }, + { + "epoch": 0.5902739974596262, + "grad_norm": 0.11806569599852067, + "learning_rate": 0.0003793273668367605, + "loss": 1.3825, + "step": 6506 + }, + { + "epoch": 0.590364725095264, + "grad_norm": 0.12289768748466048, + "learning_rate": 0.0003791847883326503, + "loss": 1.4405, + "step": 6507 + }, + { + "epoch": 0.5904554527309018, + "grad_norm": 0.128180906192214, + "learning_rate": 0.00037904222026095136, + "loss": 1.3633, + "step": 6508 + }, + { + "epoch": 0.5905461803665396, + "grad_norm": 0.12107437409128212, + "learning_rate": 0.00037889966263397474, + "loss": 1.4134, + "step": 6509 + }, + { + "epoch": 0.5906369080021775, + "grad_norm": 0.12754948569009017, + "learning_rate": 0.0003787571154640299, + "loss": 1.3769, + "step": 6510 + }, + { + "epoch": 0.5907276356378153, + "grad_norm": 0.12944734519930814, + "learning_rate": 0.00037861457876342596, + "loss": 1.3905, + "step": 6511 + }, + { + "epoch": 0.590818363273453, + "grad_norm": 0.1326989666969558, + "learning_rate": 0.00037847205254447114, + "loss": 1.4126, + "step": 6512 + }, + { + "epoch": 
0.5909090909090909, + "grad_norm": 0.12407605413064936, + "learning_rate": 0.00037832953681947256, + "loss": 1.3803, + "step": 6513 + }, + { + "epoch": 0.5909998185447287, + "grad_norm": 0.12737438022820607, + "learning_rate": 0.00037818703160073634, + "loss": 1.3899, + "step": 6514 + }, + { + "epoch": 0.5910905461803665, + "grad_norm": 0.12188202496499326, + "learning_rate": 0.0003780445369005679, + "loss": 1.3956, + "step": 6515 + }, + { + "epoch": 0.5911812738160044, + "grad_norm": 0.1246975130587558, + "learning_rate": 0.00037790205273127177, + "loss": 1.3755, + "step": 6516 + }, + { + "epoch": 0.5912720014516422, + "grad_norm": 0.13668027248869655, + "learning_rate": 0.00037775957910515123, + "loss": 1.344, + "step": 6517 + }, + { + "epoch": 0.5913627290872799, + "grad_norm": 0.1417749654349124, + "learning_rate": 0.0003776171160345091, + "loss": 1.4057, + "step": 6518 + }, + { + "epoch": 0.5914534567229178, + "grad_norm": 0.12937358208343708, + "learning_rate": 0.00037747466353164693, + "loss": 1.3889, + "step": 6519 + }, + { + "epoch": 0.5915441843585556, + "grad_norm": 0.13627606029110256, + "learning_rate": 0.00037733222160886583, + "loss": 1.378, + "step": 6520 + }, + { + "epoch": 0.5916349119941934, + "grad_norm": 0.1554027306820303, + "learning_rate": 0.0003771897902784654, + "loss": 1.3642, + "step": 6521 + }, + { + "epoch": 0.5917256396298313, + "grad_norm": 0.12712077458841714, + "learning_rate": 0.00037704736955274467, + "loss": 1.3768, + "step": 6522 + }, + { + "epoch": 0.591816367265469, + "grad_norm": 0.12880933334201403, + "learning_rate": 0.00037690495944400174, + "loss": 1.4014, + "step": 6523 + }, + { + "epoch": 0.5919070949011068, + "grad_norm": 0.13030699707857038, + "learning_rate": 0.0003767625599645336, + "loss": 1.4181, + "step": 6524 + }, + { + "epoch": 0.5919978225367447, + "grad_norm": 0.13142524926781782, + "learning_rate": 0.00037662017112663666, + "loss": 1.4003, + "step": 6525 + }, + { + "epoch": 0.5920885501723825, + "grad_norm": 0.12201057502132746, + "learning_rate": 0.0003764777929426062, + "loss": 1.3713, + "step": 6526 + }, + { + "epoch": 0.5921792778080204, + "grad_norm": 0.1298206305111471, + "learning_rate": 0.00037633542542473623, + "loss": 1.3705, + "step": 6527 + }, + { + "epoch": 0.5922700054436582, + "grad_norm": 0.13517369324067496, + "learning_rate": 0.0003761930685853207, + "loss": 1.3994, + "step": 6528 + }, + { + "epoch": 0.5923607330792959, + "grad_norm": 0.121322024208105, + "learning_rate": 0.00037605072243665196, + "loss": 1.3674, + "step": 6529 + }, + { + "epoch": 0.5924514607149338, + "grad_norm": 0.15660474709070135, + "learning_rate": 0.00037590838699102163, + "loss": 1.3641, + "step": 6530 + }, + { + "epoch": 0.5925421883505716, + "grad_norm": 0.12413986574310945, + "learning_rate": 0.0003757660622607203, + "loss": 1.4062, + "step": 6531 + }, + { + "epoch": 0.5926329159862094, + "grad_norm": 0.1300980380690562, + "learning_rate": 0.0003756237482580378, + "loss": 1.3999, + "step": 6532 + }, + { + "epoch": 0.5927236436218473, + "grad_norm": 0.1403786650785009, + "learning_rate": 0.0003754814449952631, + "loss": 1.3635, + "step": 6533 + }, + { + "epoch": 0.592814371257485, + "grad_norm": 0.1212080440620873, + "learning_rate": 0.0003753391524846839, + "loss": 1.3007, + "step": 6534 + }, + { + "epoch": 0.5929050988931228, + "grad_norm": 0.12196124955421986, + "learning_rate": 0.00037519687073858703, + "loss": 1.3961, + "step": 6535 + }, + { + "epoch": 0.5929958265287607, + "grad_norm": 0.12977611364339853, + "learning_rate": 
0.000375054599769259, + "loss": 1.359, + "step": 6536 + }, + { + "epoch": 0.5930865541643985, + "grad_norm": 0.12735379456912924, + "learning_rate": 0.00037491233958898473, + "loss": 1.3987, + "step": 6537 + }, + { + "epoch": 0.5931772818000363, + "grad_norm": 0.13110188853445773, + "learning_rate": 0.00037477009021004827, + "loss": 1.4337, + "step": 6538 + }, + { + "epoch": 0.5932680094356741, + "grad_norm": 0.12525447099241632, + "learning_rate": 0.000374627851644733, + "loss": 1.4046, + "step": 6539 + }, + { + "epoch": 0.5933587370713119, + "grad_norm": 0.1250803920567995, + "learning_rate": 0.0003744856239053213, + "loss": 1.389, + "step": 6540 + }, + { + "epoch": 0.5934494647069497, + "grad_norm": 0.12344240443176036, + "learning_rate": 0.0003743434070040944, + "loss": 1.3833, + "step": 6541 + }, + { + "epoch": 0.5935401923425876, + "grad_norm": 0.12035383776347908, + "learning_rate": 0.0003742012009533328, + "loss": 1.3834, + "step": 6542 + }, + { + "epoch": 0.5936309199782254, + "grad_norm": 0.13755475833938471, + "learning_rate": 0.00037405900576531595, + "loss": 1.3975, + "step": 6543 + }, + { + "epoch": 0.5937216476138631, + "grad_norm": 0.12776386462991354, + "learning_rate": 0.00037391682145232263, + "loss": 1.4085, + "step": 6544 + }, + { + "epoch": 0.593812375249501, + "grad_norm": 0.11782908718777242, + "learning_rate": 0.0003737746480266304, + "loss": 1.3827, + "step": 6545 + }, + { + "epoch": 0.5939031028851388, + "grad_norm": 0.1258657091381014, + "learning_rate": 0.00037363248550051586, + "loss": 1.3861, + "step": 6546 + }, + { + "epoch": 0.5939938305207766, + "grad_norm": 0.12366265927887754, + "learning_rate": 0.0003734903338862549, + "loss": 1.3939, + "step": 6547 + }, + { + "epoch": 0.5940845581564145, + "grad_norm": 0.13313008972317708, + "learning_rate": 0.00037334819319612226, + "loss": 1.3971, + "step": 6548 + }, + { + "epoch": 0.5941752857920523, + "grad_norm": 0.12883593271490462, + "learning_rate": 0.0003732060634423917, + "loss": 1.4146, + "step": 6549 + }, + { + "epoch": 0.59426601342769, + "grad_norm": 0.12654542948935682, + "learning_rate": 0.00037306394463733616, + "loss": 1.3703, + "step": 6550 + }, + { + "epoch": 0.5943567410633279, + "grad_norm": 0.14661079173309607, + "learning_rate": 0.0003729218367932278, + "loss": 1.3846, + "step": 6551 + }, + { + "epoch": 0.5944474686989657, + "grad_norm": 0.12457480335919405, + "learning_rate": 0.0003727797399223377, + "loss": 1.382, + "step": 6552 + }, + { + "epoch": 0.5945381963346035, + "grad_norm": 0.12062827771830958, + "learning_rate": 0.0003726376540369357, + "loss": 1.3935, + "step": 6553 + }, + { + "epoch": 0.5946289239702414, + "grad_norm": 0.12502593475477924, + "learning_rate": 0.000372495579149291, + "loss": 1.3798, + "step": 6554 + }, + { + "epoch": 0.5947196516058791, + "grad_norm": 0.13599048368938416, + "learning_rate": 0.000372353515271672, + "loss": 1.3751, + "step": 6555 + }, + { + "epoch": 0.5948103792415169, + "grad_norm": 0.12981678798479257, + "learning_rate": 0.0003722114624163456, + "loss": 1.3803, + "step": 6556 + }, + { + "epoch": 0.5949011068771548, + "grad_norm": 0.13531199083896187, + "learning_rate": 0.0003720694205955784, + "loss": 1.3852, + "step": 6557 + }, + { + "epoch": 0.5949918345127926, + "grad_norm": 0.12515952194368338, + "learning_rate": 0.00037192738982163534, + "loss": 1.4209, + "step": 6558 + }, + { + "epoch": 0.5950825621484304, + "grad_norm": 0.12034038427152495, + "learning_rate": 0.0003717853701067813, + "loss": 1.4159, + "step": 6559 + }, + { + "epoch": 
0.5951732897840682, + "grad_norm": 0.12421831374014539, + "learning_rate": 0.00037164336146327924, + "loss": 1.366, + "step": 6560 + }, + { + "epoch": 0.595264017419706, + "grad_norm": 0.1135993240275309, + "learning_rate": 0.00037150136390339195, + "loss": 1.4054, + "step": 6561 + }, + { + "epoch": 0.5953547450553438, + "grad_norm": 0.12657180090234613, + "learning_rate": 0.00037135937743938094, + "loss": 1.4086, + "step": 6562 + }, + { + "epoch": 0.5954454726909817, + "grad_norm": 0.12343149118274618, + "learning_rate": 0.00037121740208350654, + "loss": 1.4267, + "step": 6563 + }, + { + "epoch": 0.5955362003266195, + "grad_norm": 0.11795582899779132, + "learning_rate": 0.0003710754378480284, + "loss": 1.3693, + "step": 6564 + }, + { + "epoch": 0.5956269279622574, + "grad_norm": 0.12708377248397112, + "learning_rate": 0.0003709334847452053, + "loss": 1.4097, + "step": 6565 + }, + { + "epoch": 0.5957176555978951, + "grad_norm": 0.12035560944153334, + "learning_rate": 0.00037079154278729464, + "loss": 1.4195, + "step": 6566 + }, + { + "epoch": 0.5958083832335329, + "grad_norm": 0.12418127787890658, + "learning_rate": 0.0003706496119865534, + "loss": 1.3977, + "step": 6567 + }, + { + "epoch": 0.5958991108691708, + "grad_norm": 0.12446658448440497, + "learning_rate": 0.0003705076923552372, + "loss": 1.4255, + "step": 6568 + }, + { + "epoch": 0.5959898385048086, + "grad_norm": 0.12040301905478232, + "learning_rate": 0.00037036578390560107, + "loss": 1.4055, + "step": 6569 + }, + { + "epoch": 0.5960805661404464, + "grad_norm": 0.12057896403544575, + "learning_rate": 0.00037022388664989834, + "loss": 1.3521, + "step": 6570 + }, + { + "epoch": 0.5961712937760842, + "grad_norm": 0.1370592741982362, + "learning_rate": 0.0003700820006003822, + "loss": 1.3675, + "step": 6571 + }, + { + "epoch": 0.596262021411722, + "grad_norm": 0.12514378461221026, + "learning_rate": 0.00036994012576930456, + "loss": 1.3637, + "step": 6572 + }, + { + "epoch": 0.5963527490473598, + "grad_norm": 0.12042517641628363, + "learning_rate": 0.00036979826216891615, + "loss": 1.3557, + "step": 6573 + }, + { + "epoch": 0.5964434766829977, + "grad_norm": 0.13113481142323125, + "learning_rate": 0.0003696564098114669, + "loss": 1.4126, + "step": 6574 + }, + { + "epoch": 0.5965342043186355, + "grad_norm": 0.12484030863407894, + "learning_rate": 0.0003695145687092059, + "loss": 1.3613, + "step": 6575 + }, + { + "epoch": 0.5966249319542732, + "grad_norm": 0.12723715327023277, + "learning_rate": 0.0003693727388743813, + "loss": 1.4227, + "step": 6576 + }, + { + "epoch": 0.5967156595899111, + "grad_norm": 0.12121069813033714, + "learning_rate": 0.0003692309203192398, + "loss": 1.4071, + "step": 6577 + }, + { + "epoch": 0.5968063872255489, + "grad_norm": 0.1479881937190374, + "learning_rate": 0.0003690891130560276, + "loss": 1.3803, + "step": 6578 + }, + { + "epoch": 0.5968971148611867, + "grad_norm": 0.18436519771598603, + "learning_rate": 0.00036894731709698995, + "loss": 1.3812, + "step": 6579 + }, + { + "epoch": 0.5969878424968246, + "grad_norm": 0.1170277698872446, + "learning_rate": 0.00036880553245437057, + "loss": 1.4133, + "step": 6580 + }, + { + "epoch": 0.5970785701324623, + "grad_norm": 0.12112954701280373, + "learning_rate": 0.0003686637591404129, + "loss": 1.3579, + "step": 6581 + }, + { + "epoch": 0.5971692977681001, + "grad_norm": 0.12189336587637424, + "learning_rate": 0.00036852199716735885, + "loss": 1.4054, + "step": 6582 + }, + { + "epoch": 0.597260025403738, + "grad_norm": 0.12564062747535132, + "learning_rate": 
0.00036838024654744985, + "loss": 1.3946, + "step": 6583 + }, + { + "epoch": 0.5973507530393758, + "grad_norm": 0.11602241683288811, + "learning_rate": 0.00036823850729292594, + "loss": 1.3938, + "step": 6584 + }, + { + "epoch": 0.5974414806750136, + "grad_norm": 0.12237078523185468, + "learning_rate": 0.0003680967794160264, + "loss": 1.371, + "step": 6585 + }, + { + "epoch": 0.5975322083106515, + "grad_norm": 0.11955906766540243, + "learning_rate": 0.00036795506292898933, + "loss": 1.3373, + "step": 6586 + }, + { + "epoch": 0.5976229359462892, + "grad_norm": 0.12999381609114263, + "learning_rate": 0.0003678133578440521, + "loss": 1.4135, + "step": 6587 + }, + { + "epoch": 0.597713663581927, + "grad_norm": 0.1519524652849733, + "learning_rate": 0.0003676716641734509, + "loss": 1.3581, + "step": 6588 + }, + { + "epoch": 0.5978043912175649, + "grad_norm": 0.12261860806686334, + "learning_rate": 0.0003675299819294208, + "loss": 1.3984, + "step": 6589 + }, + { + "epoch": 0.5978951188532027, + "grad_norm": 0.12321954143272879, + "learning_rate": 0.00036738831112419646, + "loss": 1.3618, + "step": 6590 + }, + { + "epoch": 0.5979858464888405, + "grad_norm": 0.1162188529439443, + "learning_rate": 0.00036724665177001106, + "loss": 1.3627, + "step": 6591 + }, + { + "epoch": 0.5980765741244783, + "grad_norm": 0.12304733962476427, + "learning_rate": 0.00036710500387909675, + "loss": 1.4326, + "step": 6592 + }, + { + "epoch": 0.5981673017601161, + "grad_norm": 0.11975103623792703, + "learning_rate": 0.00036696336746368496, + "loss": 1.3752, + "step": 6593 + }, + { + "epoch": 0.5982580293957539, + "grad_norm": 0.11607410762539849, + "learning_rate": 0.0003668217425360061, + "loss": 1.3983, + "step": 6594 + }, + { + "epoch": 0.5983487570313918, + "grad_norm": 0.12442035978228481, + "learning_rate": 0.0003666801291082893, + "loss": 1.3999, + "step": 6595 + }, + { + "epoch": 0.5984394846670296, + "grad_norm": 0.11859375166596715, + "learning_rate": 0.00036653852719276293, + "loss": 1.3983, + "step": 6596 + }, + { + "epoch": 0.5985302123026673, + "grad_norm": 0.11824217896796453, + "learning_rate": 0.00036639693680165443, + "loss": 1.3935, + "step": 6597 + }, + { + "epoch": 0.5986209399383052, + "grad_norm": 0.12385946981876089, + "learning_rate": 0.00036625535794719024, + "loss": 1.378, + "step": 6598 + }, + { + "epoch": 0.598711667573943, + "grad_norm": 0.12104672010640224, + "learning_rate": 0.0003661137906415955, + "loss": 1.4099, + "step": 6599 + }, + { + "epoch": 0.5988023952095808, + "grad_norm": 0.12126415485274725, + "learning_rate": 0.0003659722348970947, + "loss": 1.3816, + "step": 6600 + }, + { + "epoch": 0.5988931228452187, + "grad_norm": 0.11347662030904816, + "learning_rate": 0.00036583069072591124, + "loss": 1.4054, + "step": 6601 + }, + { + "epoch": 0.5989838504808565, + "grad_norm": 0.14208374179676175, + "learning_rate": 0.00036568915814026736, + "loss": 1.4023, + "step": 6602 + }, + { + "epoch": 0.5990745781164943, + "grad_norm": 0.11861214247166654, + "learning_rate": 0.0003655476371523845, + "loss": 1.3977, + "step": 6603 + }, + { + "epoch": 0.5991653057521321, + "grad_norm": 0.11667435370514462, + "learning_rate": 0.00036540612777448303, + "loss": 1.3928, + "step": 6604 + }, + { + "epoch": 0.5992560333877699, + "grad_norm": 0.12166476539473176, + "learning_rate": 0.000365264630018782, + "loss": 1.406, + "step": 6605 + }, + { + "epoch": 0.5993467610234078, + "grad_norm": 0.1154598846124042, + "learning_rate": 0.0003651231438975002, + "loss": 1.3906, + "step": 6606 + }, + { + "epoch": 
0.5994374886590456, + "grad_norm": 0.11510607962253094, + "learning_rate": 0.00036498166942285483, + "loss": 1.3819, + "step": 6607 + }, + { + "epoch": 0.5995282162946833, + "grad_norm": 0.11915701703436325, + "learning_rate": 0.0003648402066070623, + "loss": 1.3791, + "step": 6608 + }, + { + "epoch": 0.5996189439303212, + "grad_norm": 0.12516254117793407, + "learning_rate": 0.00036469875546233765, + "loss": 1.3869, + "step": 6609 + }, + { + "epoch": 0.599709671565959, + "grad_norm": 0.11140807749292919, + "learning_rate": 0.0003645573160008955, + "loss": 1.3732, + "step": 6610 + }, + { + "epoch": 0.5998003992015968, + "grad_norm": 0.11921100013055973, + "learning_rate": 0.0003644158882349492, + "loss": 1.3688, + "step": 6611 + }, + { + "epoch": 0.5998911268372347, + "grad_norm": 0.13824516807601472, + "learning_rate": 0.00036427447217671075, + "loss": 1.4234, + "step": 6612 + }, + { + "epoch": 0.5999818544728724, + "grad_norm": 0.11340052209455294, + "learning_rate": 0.0003641330678383916, + "loss": 1.3812, + "step": 6613 + }, + { + "epoch": 0.6000725821085102, + "grad_norm": 0.12682220188194454, + "learning_rate": 0.0003639916752322021, + "loss": 1.3933, + "step": 6614 + }, + { + "epoch": 0.6001633097441481, + "grad_norm": 0.12169692930806543, + "learning_rate": 0.00036385029437035153, + "loss": 1.3891, + "step": 6615 + }, + { + "epoch": 0.6002540373797859, + "grad_norm": 0.133665412533483, + "learning_rate": 0.00036370892526504825, + "loss": 1.3451, + "step": 6616 + }, + { + "epoch": 0.6003447650154237, + "grad_norm": 0.12271667762231847, + "learning_rate": 0.00036356756792849923, + "loss": 1.366, + "step": 6617 + }, + { + "epoch": 0.6004354926510616, + "grad_norm": 0.11163868547403848, + "learning_rate": 0.0003634262223729109, + "loss": 1.3706, + "step": 6618 + }, + { + "epoch": 0.6005262202866993, + "grad_norm": 0.12177211063282156, + "learning_rate": 0.00036328488861048827, + "loss": 1.4017, + "step": 6619 + }, + { + "epoch": 0.6006169479223371, + "grad_norm": 0.12318901259916398, + "learning_rate": 0.0003631435666534357, + "loss": 1.3498, + "step": 6620 + }, + { + "epoch": 0.600707675557975, + "grad_norm": 0.11480408873525835, + "learning_rate": 0.00036300225651395625, + "loss": 1.3322, + "step": 6621 + }, + { + "epoch": 0.6007984031936128, + "grad_norm": 0.12194360093994272, + "learning_rate": 0.0003628609582042521, + "loss": 1.3681, + "step": 6622 + }, + { + "epoch": 0.6008891308292506, + "grad_norm": 0.11621816387867744, + "learning_rate": 0.0003627196717365247, + "loss": 1.3762, + "step": 6623 + }, + { + "epoch": 0.6009798584648884, + "grad_norm": 0.12405815429746501, + "learning_rate": 0.0003625783971229737, + "loss": 1.371, + "step": 6624 + }, + { + "epoch": 0.6010705861005262, + "grad_norm": 0.11773186209351783, + "learning_rate": 0.00036243713437579837, + "loss": 1.3819, + "step": 6625 + }, + { + "epoch": 0.601161313736164, + "grad_norm": 0.12358208646142123, + "learning_rate": 0.0003622958835071969, + "loss": 1.3672, + "step": 6626 + }, + { + "epoch": 0.6012520413718019, + "grad_norm": 0.12722323035772645, + "learning_rate": 0.000362154644529366, + "loss": 1.3787, + "step": 6627 + }, + { + "epoch": 0.6013427690074397, + "grad_norm": 0.12229294101846377, + "learning_rate": 0.00036201341745450185, + "loss": 1.4005, + "step": 6628 + }, + { + "epoch": 0.6014334966430774, + "grad_norm": 0.12639770180249504, + "learning_rate": 0.00036187220229479946, + "loss": 1.3831, + "step": 6629 + }, + { + "epoch": 0.6015242242787153, + "grad_norm": 0.11247392557786984, + "learning_rate": 
0.0003617309990624529, + "loss": 1.4228, + "step": 6630 + }, + { + "epoch": 0.6016149519143531, + "grad_norm": 0.1138287751001282, + "learning_rate": 0.00036158980776965487, + "loss": 1.392, + "step": 6631 + }, + { + "epoch": 0.6017056795499909, + "grad_norm": 0.13956141186844673, + "learning_rate": 0.00036144862842859725, + "loss": 1.402, + "step": 6632 + }, + { + "epoch": 0.6017964071856288, + "grad_norm": 0.11938026748569638, + "learning_rate": 0.0003613074610514712, + "loss": 1.3997, + "step": 6633 + }, + { + "epoch": 0.6018871348212665, + "grad_norm": 0.14084193967664396, + "learning_rate": 0.00036116630565046615, + "loss": 1.3843, + "step": 6634 + }, + { + "epoch": 0.6019778624569043, + "grad_norm": 0.12110099942020923, + "learning_rate": 0.00036102516223777113, + "loss": 1.3911, + "step": 6635 + }, + { + "epoch": 0.6020685900925422, + "grad_norm": 0.11450655370761131, + "learning_rate": 0.0003608840308255737, + "loss": 1.3895, + "step": 6636 + }, + { + "epoch": 0.60215931772818, + "grad_norm": 0.10970440999210229, + "learning_rate": 0.0003607429114260609, + "loss": 1.3672, + "step": 6637 + }, + { + "epoch": 0.6022500453638178, + "grad_norm": 0.12132234614962834, + "learning_rate": 0.00036060180405141816, + "loss": 1.3912, + "step": 6638 + }, + { + "epoch": 0.6023407729994557, + "grad_norm": 0.12704580941119417, + "learning_rate": 0.0003604607087138302, + "loss": 1.3641, + "step": 6639 + }, + { + "epoch": 0.6024315006350934, + "grad_norm": 0.12355922823759949, + "learning_rate": 0.0003603196254254806, + "loss": 1.3658, + "step": 6640 + }, + { + "epoch": 0.6025222282707313, + "grad_norm": 0.11501458882578265, + "learning_rate": 0.00036017855419855196, + "loss": 1.428, + "step": 6641 + }, + { + "epoch": 0.6026129559063691, + "grad_norm": 0.12844268341395412, + "learning_rate": 0.0003600374950452257, + "loss": 1.3856, + "step": 6642 + }, + { + "epoch": 0.6027036835420069, + "grad_norm": 0.11467900093396732, + "learning_rate": 0.0003598964479776825, + "loss": 1.3972, + "step": 6643 + }, + { + "epoch": 0.6027944111776448, + "grad_norm": 0.12275719851128712, + "learning_rate": 0.0003597554130081014, + "loss": 1.3858, + "step": 6644 + }, + { + "epoch": 0.6028851388132825, + "grad_norm": 0.12012874438692069, + "learning_rate": 0.0003596143901486613, + "loss": 1.377, + "step": 6645 + }, + { + "epoch": 0.6029758664489203, + "grad_norm": 0.12047724310231693, + "learning_rate": 0.00035947337941153915, + "loss": 1.4134, + "step": 6646 + }, + { + "epoch": 0.6030665940845582, + "grad_norm": 0.11872347401451512, + "learning_rate": 0.0003593323808089116, + "loss": 1.3811, + "step": 6647 + }, + { + "epoch": 0.603157321720196, + "grad_norm": 0.11875043378376231, + "learning_rate": 0.0003591913943529536, + "loss": 1.3408, + "step": 6648 + }, + { + "epoch": 0.6032480493558338, + "grad_norm": 0.11737189447468281, + "learning_rate": 0.0003590504200558393, + "loss": 1.4095, + "step": 6649 + }, + { + "epoch": 0.6033387769914716, + "grad_norm": 0.1135267270874221, + "learning_rate": 0.0003589094579297423, + "loss": 1.3299, + "step": 6650 + }, + { + "epoch": 0.6034295046271094, + "grad_norm": 0.11823667462414364, + "learning_rate": 0.0003587685079868342, + "loss": 1.376, + "step": 6651 + }, + { + "epoch": 0.6035202322627472, + "grad_norm": 0.12480517259059978, + "learning_rate": 0.00035862757023928616, + "loss": 1.4044, + "step": 6652 + }, + { + "epoch": 0.6036109598983851, + "grad_norm": 0.1369893744528852, + "learning_rate": 0.00035848664469926835, + "loss": 1.416, + "step": 6653 + }, + { + "epoch": 
0.6037016875340229, + "grad_norm": 0.11589710960993382, + "learning_rate": 0.0003583457313789497, + "loss": 1.4292, + "step": 6654 + }, + { + "epoch": 0.6037924151696606, + "grad_norm": 0.11953878570576527, + "learning_rate": 0.000358204830290498, + "loss": 1.3451, + "step": 6655 + }, + { + "epoch": 0.6038831428052985, + "grad_norm": 0.11827852814782422, + "learning_rate": 0.0003580639414460801, + "loss": 1.3861, + "step": 6656 + }, + { + "epoch": 0.6039738704409363, + "grad_norm": 0.11344307962301553, + "learning_rate": 0.0003579230648578619, + "loss": 1.3773, + "step": 6657 + }, + { + "epoch": 0.6040645980765741, + "grad_norm": 0.11735453898513296, + "learning_rate": 0.0003577822005380078, + "loss": 1.4059, + "step": 6658 + }, + { + "epoch": 0.604155325712212, + "grad_norm": 0.12728669406712362, + "learning_rate": 0.0003576413484986817, + "loss": 1.4017, + "step": 6659 + }, + { + "epoch": 0.6042460533478498, + "grad_norm": 0.15035633516495456, + "learning_rate": 0.000357500508752046, + "loss": 1.4003, + "step": 6660 + }, + { + "epoch": 0.6043367809834875, + "grad_norm": 0.11467243762644438, + "learning_rate": 0.00035735968131026246, + "loss": 1.3654, + "step": 6661 + }, + { + "epoch": 0.6044275086191254, + "grad_norm": 0.12047277254176424, + "learning_rate": 0.00035721886618549157, + "loss": 1.3468, + "step": 6662 + }, + { + "epoch": 0.6045182362547632, + "grad_norm": 0.12131018306853412, + "learning_rate": 0.00035707806338989255, + "loss": 1.3875, + "step": 6663 + }, + { + "epoch": 0.604608963890401, + "grad_norm": 0.11724846386127201, + "learning_rate": 0.0003569372729356239, + "loss": 1.3553, + "step": 6664 + }, + { + "epoch": 0.6046996915260389, + "grad_norm": 0.11481830185658717, + "learning_rate": 0.00035679649483484285, + "loss": 1.3953, + "step": 6665 + }, + { + "epoch": 0.6047904191616766, + "grad_norm": 0.11078578873926129, + "learning_rate": 0.00035665572909970546, + "loss": 1.4075, + "step": 6666 + }, + { + "epoch": 0.6048811467973144, + "grad_norm": 0.12728879361350373, + "learning_rate": 0.0003565149757423669, + "loss": 1.4165, + "step": 6667 + }, + { + "epoch": 0.6049718744329523, + "grad_norm": 0.12068468329658581, + "learning_rate": 0.0003563742347749814, + "loss": 1.4123, + "step": 6668 + }, + { + "epoch": 0.6050626020685901, + "grad_norm": 0.11829958983875016, + "learning_rate": 0.00035623350620970205, + "loss": 1.391, + "step": 6669 + }, + { + "epoch": 0.6051533297042279, + "grad_norm": 0.11753297183571247, + "learning_rate": 0.0003560927900586805, + "loss": 1.3949, + "step": 6670 + }, + { + "epoch": 0.6052440573398657, + "grad_norm": 0.11660291036246778, + "learning_rate": 0.00035595208633406775, + "loss": 1.3959, + "step": 6671 + }, + { + "epoch": 0.6053347849755035, + "grad_norm": 0.12077188420307038, + "learning_rate": 0.00035581139504801366, + "loss": 1.3755, + "step": 6672 + }, + { + "epoch": 0.6054255126111413, + "grad_norm": 0.13939439991971153, + "learning_rate": 0.0003556707162126668, + "loss": 1.4029, + "step": 6673 + }, + { + "epoch": 0.6055162402467792, + "grad_norm": 0.12056153925520112, + "learning_rate": 0.0003555300498401748, + "loss": 1.3941, + "step": 6674 + }, + { + "epoch": 0.605606967882417, + "grad_norm": 0.11048760142886588, + "learning_rate": 0.0003553893959426842, + "loss": 1.3466, + "step": 6675 + }, + { + "epoch": 0.6056976955180547, + "grad_norm": 0.11638476584664308, + "learning_rate": 0.00035524875453234086, + "loss": 1.3224, + "step": 6676 + }, + { + "epoch": 0.6057884231536926, + "grad_norm": 0.12373937501628476, + "learning_rate": 
0.0003551081256212887, + "loss": 1.4113, + "step": 6677 + }, + { + "epoch": 0.6058791507893304, + "grad_norm": 0.1236040833782254, + "learning_rate": 0.0003549675092216713, + "loss": 1.393, + "step": 6678 + }, + { + "epoch": 0.6059698784249683, + "grad_norm": 0.10984855086938367, + "learning_rate": 0.0003548269053456309, + "loss": 1.3731, + "step": 6679 + }, + { + "epoch": 0.6060606060606061, + "grad_norm": 0.10815762271150711, + "learning_rate": 0.00035468631400530857, + "loss": 1.3678, + "step": 6680 + }, + { + "epoch": 0.6061513336962439, + "grad_norm": 0.1154722329992382, + "learning_rate": 0.00035454573521284444, + "loss": 1.361, + "step": 6681 + }, + { + "epoch": 0.6062420613318817, + "grad_norm": 0.11537855828218506, + "learning_rate": 0.0003544051689803776, + "loss": 1.3672, + "step": 6682 + }, + { + "epoch": 0.6063327889675195, + "grad_norm": 0.1260509874191176, + "learning_rate": 0.0003542646153200457, + "loss": 1.3775, + "step": 6683 + }, + { + "epoch": 0.6064235166031573, + "grad_norm": 0.24506208793973283, + "learning_rate": 0.00035412407424398584, + "loss": 1.4131, + "step": 6684 + }, + { + "epoch": 0.6065142442387952, + "grad_norm": 0.11162296915670124, + "learning_rate": 0.00035398354576433367, + "loss": 1.3705, + "step": 6685 + }, + { + "epoch": 0.606604971874433, + "grad_norm": 0.12015991930755789, + "learning_rate": 0.0003538430298932239, + "loss": 1.3571, + "step": 6686 + }, + { + "epoch": 0.6066956995100707, + "grad_norm": 0.11160726406489536, + "learning_rate": 0.00035370252664278993, + "loss": 1.3522, + "step": 6687 + }, + { + "epoch": 0.6067864271457086, + "grad_norm": 0.14711933931527635, + "learning_rate": 0.00035356203602516443, + "loss": 1.4148, + "step": 6688 + }, + { + "epoch": 0.6068771547813464, + "grad_norm": 0.12078750944001776, + "learning_rate": 0.00035342155805247877, + "loss": 1.4009, + "step": 6689 + }, + { + "epoch": 0.6069678824169842, + "grad_norm": 0.10928941333870192, + "learning_rate": 0.00035328109273686305, + "loss": 1.3785, + "step": 6690 + }, + { + "epoch": 0.6070586100526221, + "grad_norm": 0.11063209543160468, + "learning_rate": 0.0003531406400904465, + "loss": 1.3871, + "step": 6691 + }, + { + "epoch": 0.6071493376882598, + "grad_norm": 0.14197167864588617, + "learning_rate": 0.00035300020012535744, + "loss": 1.3934, + "step": 6692 + }, + { + "epoch": 0.6072400653238976, + "grad_norm": 0.11164474335682731, + "learning_rate": 0.0003528597728537227, + "loss": 1.3794, + "step": 6693 + }, + { + "epoch": 0.6073307929595355, + "grad_norm": 0.13775121666252232, + "learning_rate": 0.00035271935828766845, + "loss": 1.3893, + "step": 6694 + }, + { + "epoch": 0.6074215205951733, + "grad_norm": 0.11269339339892355, + "learning_rate": 0.0003525789564393192, + "loss": 1.3568, + "step": 6695 + }, + { + "epoch": 0.6075122482308111, + "grad_norm": 0.13663576817736892, + "learning_rate": 0.00035243856732079885, + "loss": 1.3693, + "step": 6696 + }, + { + "epoch": 0.607602975866449, + "grad_norm": 0.11957542358366025, + "learning_rate": 0.0003522981909442299, + "loss": 1.3874, + "step": 6697 + }, + { + "epoch": 0.6076937035020867, + "grad_norm": 0.1269323123012658, + "learning_rate": 0.000352157827321734, + "loss": 1.3942, + "step": 6698 + }, + { + "epoch": 0.6077844311377245, + "grad_norm": 0.11265097628440865, + "learning_rate": 0.0003520174764654313, + "loss": 1.414, + "step": 6699 + }, + { + "epoch": 0.6078751587733624, + "grad_norm": 0.1175643601771875, + "learning_rate": 0.0003518771383874414, + "loss": 1.3981, + "step": 6700 + }, + { + "epoch": 
0.6079658864090002, + "grad_norm": 0.11884336048299383, + "learning_rate": 0.00035173681309988257, + "loss": 1.3489, + "step": 6701 + }, + { + "epoch": 0.608056614044638, + "grad_norm": 0.12355423254693801, + "learning_rate": 0.0003515965006148717, + "loss": 1.4145, + "step": 6702 + }, + { + "epoch": 0.6081473416802758, + "grad_norm": 0.11534554931168892, + "learning_rate": 0.0003514562009445248, + "loss": 1.3741, + "step": 6703 + }, + { + "epoch": 0.6082380693159136, + "grad_norm": 0.11914108354637198, + "learning_rate": 0.000351315914100957, + "loss": 1.3722, + "step": 6704 + }, + { + "epoch": 0.6083287969515514, + "grad_norm": 0.11684881902571251, + "learning_rate": 0.0003511756400962818, + "loss": 1.4176, + "step": 6705 + }, + { + "epoch": 0.6084195245871893, + "grad_norm": 0.11463605134306262, + "learning_rate": 0.00035103537894261197, + "loss": 1.3627, + "step": 6706 + }, + { + "epoch": 0.6085102522228271, + "grad_norm": 0.11740584708429859, + "learning_rate": 0.0003508951306520592, + "loss": 1.3869, + "step": 6707 + }, + { + "epoch": 0.6086009798584648, + "grad_norm": 0.12083326799224343, + "learning_rate": 0.00035075489523673397, + "loss": 1.3794, + "step": 6708 + }, + { + "epoch": 0.6086917074941027, + "grad_norm": 0.1186179561446249, + "learning_rate": 0.0003506146727087454, + "loss": 1.3967, + "step": 6709 + }, + { + "epoch": 0.6087824351297405, + "grad_norm": 0.1094863483908673, + "learning_rate": 0.00035047446308020193, + "loss": 1.3658, + "step": 6710 + }, + { + "epoch": 0.6088731627653783, + "grad_norm": 0.11390011203650068, + "learning_rate": 0.0003503342663632106, + "loss": 1.3998, + "step": 6711 + }, + { + "epoch": 0.6089638904010162, + "grad_norm": 0.1144698910758076, + "learning_rate": 0.0003501940825698774, + "loss": 1.4152, + "step": 6712 + }, + { + "epoch": 0.609054618036654, + "grad_norm": 0.12745925886142268, + "learning_rate": 0.0003500539117123073, + "loss": 1.3659, + "step": 6713 + }, + { + "epoch": 0.6091453456722917, + "grad_norm": 0.12115695398049361, + "learning_rate": 0.0003499137538026038, + "loss": 1.4135, + "step": 6714 + }, + { + "epoch": 0.6092360733079296, + "grad_norm": 0.12356049089742671, + "learning_rate": 0.0003497736088528701, + "loss": 1.4136, + "step": 6715 + }, + { + "epoch": 0.6093268009435674, + "grad_norm": 0.1174573218813966, + "learning_rate": 0.00034963347687520726, + "loss": 1.4117, + "step": 6716 + }, + { + "epoch": 0.6094175285792053, + "grad_norm": 0.12157611969844466, + "learning_rate": 0.00034949335788171584, + "loss": 1.4076, + "step": 6717 + }, + { + "epoch": 0.6095082562148431, + "grad_norm": 0.10991640178185857, + "learning_rate": 0.0003493532518844953, + "loss": 1.4044, + "step": 6718 + }, + { + "epoch": 0.6095989838504808, + "grad_norm": 0.12142325130667009, + "learning_rate": 0.00034921315889564346, + "loss": 1.3625, + "step": 6719 + }, + { + "epoch": 0.6096897114861187, + "grad_norm": 0.13124816522914756, + "learning_rate": 0.0003490730789272577, + "loss": 1.3758, + "step": 6720 + }, + { + "epoch": 0.6097804391217565, + "grad_norm": 0.12580953369945294, + "learning_rate": 0.00034893301199143383, + "loss": 1.3604, + "step": 6721 + }, + { + "epoch": 0.6098711667573943, + "grad_norm": 0.111582059197165, + "learning_rate": 0.00034879295810026645, + "loss": 1.4001, + "step": 6722 + }, + { + "epoch": 0.6099618943930322, + "grad_norm": 0.11373497938881749, + "learning_rate": 0.00034865291726584956, + "loss": 1.3805, + "step": 6723 + }, + { + "epoch": 0.6100526220286699, + "grad_norm": 0.11384247842999734, + "learning_rate": 
0.00034851288950027556, + "loss": 1.3672, + "step": 6724 + }, + { + "epoch": 0.6101433496643077, + "grad_norm": 0.11643930984988227, + "learning_rate": 0.00034837287481563595, + "loss": 1.3901, + "step": 6725 + }, + { + "epoch": 0.6102340772999456, + "grad_norm": 0.11160835581799679, + "learning_rate": 0.00034823287322402084, + "loss": 1.3939, + "step": 6726 + }, + { + "epoch": 0.6103248049355834, + "grad_norm": 0.12526090016750654, + "learning_rate": 0.00034809288473751956, + "loss": 1.4235, + "step": 6727 + }, + { + "epoch": 0.6104155325712212, + "grad_norm": 0.12934167168791852, + "learning_rate": 0.00034795290936822016, + "loss": 1.3488, + "step": 6728 + }, + { + "epoch": 0.610506260206859, + "grad_norm": 0.11446590519692505, + "learning_rate": 0.0003478129471282093, + "loss": 1.3887, + "step": 6729 + }, + { + "epoch": 0.6105969878424968, + "grad_norm": 0.10981821985661863, + "learning_rate": 0.0003476729980295728, + "loss": 1.4286, + "step": 6730 + }, + { + "epoch": 0.6106877154781346, + "grad_norm": 0.11192916224718741, + "learning_rate": 0.0003475330620843955, + "loss": 1.4075, + "step": 6731 + }, + { + "epoch": 0.6107784431137725, + "grad_norm": 0.12888556174290058, + "learning_rate": 0.00034739313930476075, + "loss": 1.349, + "step": 6732 + }, + { + "epoch": 0.6108691707494103, + "grad_norm": 0.12312567294368242, + "learning_rate": 0.00034725322970275096, + "loss": 1.4119, + "step": 6733 + }, + { + "epoch": 0.610959898385048, + "grad_norm": 0.12809843537432822, + "learning_rate": 0.00034711333329044724, + "loss": 1.3712, + "step": 6734 + }, + { + "epoch": 0.6110506260206859, + "grad_norm": 0.12322534680850038, + "learning_rate": 0.00034697345007992985, + "loss": 1.4033, + "step": 6735 + }, + { + "epoch": 0.6111413536563237, + "grad_norm": 0.14393428057925076, + "learning_rate": 0.0003468335800832775, + "loss": 1.3623, + "step": 6736 + }, + { + "epoch": 0.6112320812919615, + "grad_norm": 0.12036346687180467, + "learning_rate": 0.00034669372331256807, + "loss": 1.3966, + "step": 6737 + }, + { + "epoch": 0.6113228089275994, + "grad_norm": 0.1255333322871532, + "learning_rate": 0.00034655387977987806, + "loss": 1.4038, + "step": 6738 + }, + { + "epoch": 0.6114135365632372, + "grad_norm": 0.12071528813128939, + "learning_rate": 0.0003464140494972833, + "loss": 1.4076, + "step": 6739 + }, + { + "epoch": 0.6115042641988749, + "grad_norm": 0.12619324949306646, + "learning_rate": 0.0003462742324768581, + "loss": 1.3745, + "step": 6740 + }, + { + "epoch": 0.6115949918345128, + "grad_norm": 0.11653586051201256, + "learning_rate": 0.00034613442873067553, + "loss": 1.3699, + "step": 6741 + }, + { + "epoch": 0.6116857194701506, + "grad_norm": 0.1114206369169985, + "learning_rate": 0.0003459946382708077, + "loss": 1.344, + "step": 6742 + }, + { + "epoch": 0.6117764471057884, + "grad_norm": 0.11848399523924918, + "learning_rate": 0.0003458548611093256, + "loss": 1.4131, + "step": 6743 + }, + { + "epoch": 0.6118671747414263, + "grad_norm": 0.11686597946010328, + "learning_rate": 0.0003457150972582989, + "loss": 1.3492, + "step": 6744 + }, + { + "epoch": 0.611957902377064, + "grad_norm": 0.11475882520157203, + "learning_rate": 0.0003455753467297961, + "loss": 1.3751, + "step": 6745 + }, + { + "epoch": 0.6120486300127018, + "grad_norm": 0.1221453932902135, + "learning_rate": 0.00034543560953588504, + "loss": 1.3409, + "step": 6746 + }, + { + "epoch": 0.6121393576483397, + "grad_norm": 0.1278148209366397, + "learning_rate": 0.000345295885688632, + "loss": 1.3939, + "step": 6747 + }, + { + "epoch": 
0.6122300852839775, + "grad_norm": 0.12527603956014915, + "learning_rate": 0.00034515617520010193, + "loss": 1.4004, + "step": 6748 + }, + { + "epoch": 0.6123208129196153, + "grad_norm": 0.13495626810686284, + "learning_rate": 0.00034501647808235893, + "loss": 1.3895, + "step": 6749 + }, + { + "epoch": 0.6124115405552532, + "grad_norm": 0.21094289663559554, + "learning_rate": 0.000344876794347466, + "loss": 1.3985, + "step": 6750 + }, + { + "epoch": 0.6125022681908909, + "grad_norm": 0.11686667619005241, + "learning_rate": 0.0003447371240074847, + "loss": 1.3936, + "step": 6751 + }, + { + "epoch": 0.6125929958265287, + "grad_norm": 0.11912999121287586, + "learning_rate": 0.00034459746707447554, + "loss": 1.4301, + "step": 6752 + }, + { + "epoch": 0.6126837234621666, + "grad_norm": 0.13348385457755044, + "learning_rate": 0.00034445782356049795, + "loss": 1.3763, + "step": 6753 + }, + { + "epoch": 0.6127744510978044, + "grad_norm": 0.11583140482558449, + "learning_rate": 0.0003443181934776104, + "loss": 1.3793, + "step": 6754 + }, + { + "epoch": 0.6128651787334423, + "grad_norm": 0.1173535040499328, + "learning_rate": 0.0003441785768378697, + "loss": 1.3639, + "step": 6755 + }, + { + "epoch": 0.61295590636908, + "grad_norm": 0.11742743323577473, + "learning_rate": 0.0003440389736533318, + "loss": 1.3443, + "step": 6756 + }, + { + "epoch": 0.6130466340047178, + "grad_norm": 0.12239752201316315, + "learning_rate": 0.0003438993839360517, + "loss": 1.3932, + "step": 6757 + }, + { + "epoch": 0.6131373616403557, + "grad_norm": 0.16284661882986412, + "learning_rate": 0.0003437598076980826, + "loss": 1.4104, + "step": 6758 + }, + { + "epoch": 0.6132280892759935, + "grad_norm": 0.11510865594782095, + "learning_rate": 0.0003436202449514772, + "loss": 1.3854, + "step": 6759 + }, + { + "epoch": 0.6133188169116313, + "grad_norm": 0.1221337696308055, + "learning_rate": 0.0003434806957082868, + "loss": 1.3745, + "step": 6760 + }, + { + "epoch": 0.6134095445472691, + "grad_norm": 0.10870847201854344, + "learning_rate": 0.000343341159980561, + "loss": 1.3366, + "step": 6761 + }, + { + "epoch": 0.6135002721829069, + "grad_norm": 0.11580421267639167, + "learning_rate": 0.0003432016377803496, + "loss": 1.3643, + "step": 6762 + }, + { + "epoch": 0.6135909998185447, + "grad_norm": 0.11326616642682498, + "learning_rate": 0.0003430621291196997, + "loss": 1.3689, + "step": 6763 + }, + { + "epoch": 0.6136817274541826, + "grad_norm": 0.12026870227861167, + "learning_rate": 0.00034292263401065816, + "loss": 1.4081, + "step": 6764 + }, + { + "epoch": 0.6137724550898204, + "grad_norm": 0.12176249281567665, + "learning_rate": 0.00034278315246527027, + "loss": 1.3612, + "step": 6765 + }, + { + "epoch": 0.6138631827254581, + "grad_norm": 0.11346968899234038, + "learning_rate": 0.00034264368449558023, + "loss": 1.3804, + "step": 6766 + }, + { + "epoch": 0.613953910361096, + "grad_norm": 0.11670823850411298, + "learning_rate": 0.0003425042301136314, + "loss": 1.3923, + "step": 6767 + }, + { + "epoch": 0.6140446379967338, + "grad_norm": 0.11735893461708322, + "learning_rate": 0.00034236478933146543, + "loss": 1.36, + "step": 6768 + }, + { + "epoch": 0.6141353656323716, + "grad_norm": 0.12286318670574452, + "learning_rate": 0.000342225362161123, + "loss": 1.3918, + "step": 6769 + }, + { + "epoch": 0.6142260932680095, + "grad_norm": 0.1207783347007082, + "learning_rate": 0.0003420859486146439, + "loss": 1.3835, + "step": 6770 + }, + { + "epoch": 0.6143168209036473, + "grad_norm": 0.11552041713306979, + "learning_rate": 
0.0003419465487040665, + "loss": 1.4054, + "step": 6771 + }, + { + "epoch": 0.614407548539285, + "grad_norm": 0.11673953033452993, + "learning_rate": 0.0003418071624414279, + "loss": 1.3668, + "step": 6772 + }, + { + "epoch": 0.6144982761749229, + "grad_norm": 0.1168885263041246, + "learning_rate": 0.00034166778983876414, + "loss": 1.3752, + "step": 6773 + }, + { + "epoch": 0.6145890038105607, + "grad_norm": 0.12527732087908008, + "learning_rate": 0.00034152843090811004, + "loss": 1.4042, + "step": 6774 + }, + { + "epoch": 0.6146797314461985, + "grad_norm": 0.13318262018760008, + "learning_rate": 0.00034138908566149926, + "loss": 1.4171, + "step": 6775 + }, + { + "epoch": 0.6147704590818364, + "grad_norm": 0.12153483056440405, + "learning_rate": 0.00034124975411096433, + "loss": 1.3858, + "step": 6776 + }, + { + "epoch": 0.6148611867174741, + "grad_norm": 0.11087063365993824, + "learning_rate": 0.00034111043626853637, + "loss": 1.3884, + "step": 6777 + }, + { + "epoch": 0.6149519143531119, + "grad_norm": 0.11000414725457604, + "learning_rate": 0.00034097113214624574, + "loss": 1.3544, + "step": 6778 + }, + { + "epoch": 0.6150426419887498, + "grad_norm": 0.11221544850716658, + "learning_rate": 0.0003408318417561215, + "loss": 1.3816, + "step": 6779 + }, + { + "epoch": 0.6151333696243876, + "grad_norm": 0.10855077444071878, + "learning_rate": 0.000340692565110191, + "loss": 1.3949, + "step": 6780 + }, + { + "epoch": 0.6152240972600254, + "grad_norm": 0.11465149645993027, + "learning_rate": 0.00034055330222048096, + "loss": 1.3993, + "step": 6781 + }, + { + "epoch": 0.6153148248956632, + "grad_norm": 0.1138342398023761, + "learning_rate": 0.000340414053099017, + "loss": 1.3518, + "step": 6782 + }, + { + "epoch": 0.615405552531301, + "grad_norm": 0.16707171977448634, + "learning_rate": 0.0003402748177578229, + "loss": 1.3878, + "step": 6783 + }, + { + "epoch": 0.6154962801669388, + "grad_norm": 0.11269249910481481, + "learning_rate": 0.0003401355962089216, + "loss": 1.3817, + "step": 6784 + }, + { + "epoch": 0.6155870078025767, + "grad_norm": 0.11044199018017559, + "learning_rate": 0.0003399963884643354, + "loss": 1.4051, + "step": 6785 + }, + { + "epoch": 0.6156777354382145, + "grad_norm": 0.11251875487288596, + "learning_rate": 0.0003398571945360847, + "loss": 1.4003, + "step": 6786 + }, + { + "epoch": 0.6157684630738522, + "grad_norm": 0.10590634605222068, + "learning_rate": 0.00033971801443618876, + "loss": 1.3573, + "step": 6787 + }, + { + "epoch": 0.6158591907094901, + "grad_norm": 0.11225543166334925, + "learning_rate": 0.00033957884817666583, + "loss": 1.3833, + "step": 6788 + }, + { + "epoch": 0.6159499183451279, + "grad_norm": 0.11849214872254733, + "learning_rate": 0.00033943969576953316, + "loss": 1.3716, + "step": 6789 + }, + { + "epoch": 0.6160406459807657, + "grad_norm": 0.11745247602981243, + "learning_rate": 0.0003393005572268063, + "loss": 1.3944, + "step": 6790 + }, + { + "epoch": 0.6161313736164036, + "grad_norm": 0.10477118092881028, + "learning_rate": 0.00033916143256050005, + "loss": 1.4129, + "step": 6791 + }, + { + "epoch": 0.6162221012520414, + "grad_norm": 0.11311477013749063, + "learning_rate": 0.0003390223217826277, + "loss": 1.3681, + "step": 6792 + }, + { + "epoch": 0.6163128288876791, + "grad_norm": 0.10472335664040601, + "learning_rate": 0.0003388832249052017, + "loss": 1.3673, + "step": 6793 + }, + { + "epoch": 0.616403556523317, + "grad_norm": 0.13587901115621515, + "learning_rate": 0.00033874414194023293, + "loss": 1.3815, + "step": 6794 + }, + { + "epoch": 
0.6164942841589548, + "grad_norm": 0.11152157085583425, + "learning_rate": 0.0003386050728997313, + "loss": 1.3605, + "step": 6795 + }, + { + "epoch": 0.6165850117945927, + "grad_norm": 0.11145757880880106, + "learning_rate": 0.0003384660177957055, + "loss": 1.4028, + "step": 6796 + }, + { + "epoch": 0.6166757394302305, + "grad_norm": 0.16763357877122634, + "learning_rate": 0.00033832697664016274, + "loss": 1.3801, + "step": 6797 + }, + { + "epoch": 0.6167664670658682, + "grad_norm": 0.11326635503049984, + "learning_rate": 0.00033818794944510944, + "loss": 1.3844, + "step": 6798 + }, + { + "epoch": 0.6168571947015061, + "grad_norm": 0.11825593406025833, + "learning_rate": 0.00033804893622255063, + "loss": 1.3717, + "step": 6799 + }, + { + "epoch": 0.6169479223371439, + "grad_norm": 0.12766018864036116, + "learning_rate": 0.0003379099369844897, + "loss": 1.3557, + "step": 6800 + }, + { + "epoch": 0.6170386499727817, + "grad_norm": 0.12270491167750255, + "learning_rate": 0.00033777095174292995, + "loss": 1.3662, + "step": 6801 + }, + { + "epoch": 0.6171293776084196, + "grad_norm": 0.1085834027171784, + "learning_rate": 0.0003376319805098723, + "loss": 1.3702, + "step": 6802 + }, + { + "epoch": 0.6172201052440573, + "grad_norm": 0.10886734999291166, + "learning_rate": 0.00033749302329731716, + "loss": 1.4066, + "step": 6803 + }, + { + "epoch": 0.6173108328796951, + "grad_norm": 0.12275492948851696, + "learning_rate": 0.0003373540801172633, + "loss": 1.406, + "step": 6804 + }, + { + "epoch": 0.617401560515333, + "grad_norm": 0.11129778248095225, + "learning_rate": 0.0003372151509817086, + "loss": 1.3625, + "step": 6805 + }, + { + "epoch": 0.6174922881509708, + "grad_norm": 0.10670318227181269, + "learning_rate": 0.0003370762359026498, + "loss": 1.4078, + "step": 6806 + }, + { + "epoch": 0.6175830157866086, + "grad_norm": 0.1078286974164438, + "learning_rate": 0.00033693733489208183, + "loss": 1.3719, + "step": 6807 + }, + { + "epoch": 0.6176737434222465, + "grad_norm": 0.1307979806109644, + "learning_rate": 0.000336798447961999, + "loss": 1.3538, + "step": 6808 + }, + { + "epoch": 0.6177644710578842, + "grad_norm": 0.11643371977230334, + "learning_rate": 0.0003366595751243943, + "loss": 1.3738, + "step": 6809 + }, + { + "epoch": 0.617855198693522, + "grad_norm": 0.1203842809754266, + "learning_rate": 0.0003365207163912593, + "loss": 1.3744, + "step": 6810 + }, + { + "epoch": 0.6179459263291599, + "grad_norm": 0.11346699097998847, + "learning_rate": 0.00033638187177458467, + "loss": 1.3497, + "step": 6811 + }, + { + "epoch": 0.6180366539647977, + "grad_norm": 0.11787512301833698, + "learning_rate": 0.00033624304128635954, + "loss": 1.3651, + "step": 6812 + }, + { + "epoch": 0.6181273816004355, + "grad_norm": 0.121382138453363, + "learning_rate": 0.0003361042249385719, + "loss": 1.3475, + "step": 6813 + }, + { + "epoch": 0.6182181092360733, + "grad_norm": 0.10794881689601939, + "learning_rate": 0.0003359654227432085, + "loss": 1.3548, + "step": 6814 + }, + { + "epoch": 0.6183088368717111, + "grad_norm": 0.11852384074587839, + "learning_rate": 0.00033582663471225504, + "loss": 1.3976, + "step": 6815 + }, + { + "epoch": 0.6183995645073489, + "grad_norm": 0.13133049967784066, + "learning_rate": 0.0003356878608576958, + "loss": 1.3699, + "step": 6816 + }, + { + "epoch": 0.6184902921429868, + "grad_norm": 0.10728320271723035, + "learning_rate": 0.0003355491011915141, + "loss": 1.38, + "step": 6817 + }, + { + "epoch": 0.6185810197786246, + "grad_norm": 0.11246201357092017, + "learning_rate": 
0.0003354103557256918, + "loss": 1.4011, + "step": 6818 + }, + { + "epoch": 0.6186717474142623, + "grad_norm": 0.10815144914392862, + "learning_rate": 0.0003352716244722095, + "loss": 1.3921, + "step": 6819 + }, + { + "epoch": 0.6187624750499002, + "grad_norm": 0.13166969301571763, + "learning_rate": 0.0003351329074430467, + "loss": 1.373, + "step": 6820 + }, + { + "epoch": 0.618853202685538, + "grad_norm": 0.11972025365731387, + "learning_rate": 0.0003349942046501817, + "loss": 1.3778, + "step": 6821 + }, + { + "epoch": 0.6189439303211758, + "grad_norm": 0.11927009752695995, + "learning_rate": 0.0003348555161055913, + "loss": 1.357, + "step": 6822 + }, + { + "epoch": 0.6190346579568137, + "grad_norm": 0.10942698913746798, + "learning_rate": 0.0003347168418212514, + "loss": 1.4027, + "step": 6823 + }, + { + "epoch": 0.6191253855924514, + "grad_norm": 0.11547537286139707, + "learning_rate": 0.0003345781818091367, + "loss": 1.3783, + "step": 6824 + }, + { + "epoch": 0.6192161132280892, + "grad_norm": 0.11754976530703316, + "learning_rate": 0.0003344395360812204, + "loss": 1.3578, + "step": 6825 + }, + { + "epoch": 0.6193068408637271, + "grad_norm": 0.10767059071750763, + "learning_rate": 0.00033430090464947447, + "loss": 1.3939, + "step": 6826 + }, + { + "epoch": 0.6193975684993649, + "grad_norm": 0.1123962163210147, + "learning_rate": 0.0003341622875258699, + "loss": 1.3556, + "step": 6827 + }, + { + "epoch": 0.6194882961350027, + "grad_norm": 0.11625001497793157, + "learning_rate": 0.0003340236847223763, + "loss": 1.3736, + "step": 6828 + }, + { + "epoch": 0.6195790237706406, + "grad_norm": 0.10848177467653596, + "learning_rate": 0.00033388509625096197, + "loss": 1.3787, + "step": 6829 + }, + { + "epoch": 0.6196697514062783, + "grad_norm": 0.10809009525148486, + "learning_rate": 0.000333746522123594, + "loss": 1.4121, + "step": 6830 + }, + { + "epoch": 0.6197604790419161, + "grad_norm": 0.13945865162006785, + "learning_rate": 0.00033360796235223824, + "loss": 1.394, + "step": 6831 + }, + { + "epoch": 0.619851206677554, + "grad_norm": 0.11355413664605371, + "learning_rate": 0.00033346941694885964, + "loss": 1.3612, + "step": 6832 + }, + { + "epoch": 0.6199419343131918, + "grad_norm": 0.11990262076676264, + "learning_rate": 0.0003333308859254213, + "loss": 1.3966, + "step": 6833 + }, + { + "epoch": 0.6200326619488297, + "grad_norm": 0.15268781938891315, + "learning_rate": 0.0003331923692938856, + "loss": 1.4038, + "step": 6834 + }, + { + "epoch": 0.6201233895844674, + "grad_norm": 0.143054689698957, + "learning_rate": 0.0003330538670662134, + "loss": 1.3671, + "step": 6835 + }, + { + "epoch": 0.6202141172201052, + "grad_norm": 0.12589419039747382, + "learning_rate": 0.00033291537925436444, + "loss": 1.3589, + "step": 6836 + }, + { + "epoch": 0.6203048448557431, + "grad_norm": 0.12208846803026226, + "learning_rate": 0.00033277690587029703, + "loss": 1.4033, + "step": 6837 + }, + { + "epoch": 0.6203955724913809, + "grad_norm": 0.13357713726317913, + "learning_rate": 0.0003326384469259685, + "loss": 1.3764, + "step": 6838 + }, + { + "epoch": 0.6204863001270187, + "grad_norm": 0.1265295937121276, + "learning_rate": 0.00033250000243333446, + "loss": 1.395, + "step": 6839 + }, + { + "epoch": 0.6205770277626566, + "grad_norm": 0.13365922506721908, + "learning_rate": 0.0003323615724043503, + "loss": 1.3681, + "step": 6840 + }, + { + "epoch": 0.6206677553982943, + "grad_norm": 0.12511919308276587, + "learning_rate": 0.0003322231568509688, + "loss": 1.3773, + "step": 6841 + }, + { + "epoch": 
0.6207584830339321, + "grad_norm": 0.11461250014019128, + "learning_rate": 0.00033208475578514265, + "loss": 1.3688, + "step": 6842 + }, + { + "epoch": 0.62084921066957, + "grad_norm": 0.10555351485159231, + "learning_rate": 0.00033194636921882246, + "loss": 1.3744, + "step": 6843 + }, + { + "epoch": 0.6209399383052078, + "grad_norm": 0.12107902965008986, + "learning_rate": 0.00033180799716395806, + "loss": 1.3996, + "step": 6844 + }, + { + "epoch": 0.6210306659408455, + "grad_norm": 0.1094623615391314, + "learning_rate": 0.00033166963963249794, + "loss": 1.3842, + "step": 6845 + }, + { + "epoch": 0.6211213935764834, + "grad_norm": 0.11713395079745505, + "learning_rate": 0.00033153129663638926, + "loss": 1.3951, + "step": 6846 + }, + { + "epoch": 0.6212121212121212, + "grad_norm": 0.1223656300975511, + "learning_rate": 0.00033139296818757776, + "loss": 1.385, + "step": 6847 + }, + { + "epoch": 0.621302848847759, + "grad_norm": 0.12300433572433542, + "learning_rate": 0.00033125465429800836, + "loss": 1.3751, + "step": 6848 + }, + { + "epoch": 0.6213935764833969, + "grad_norm": 0.11956946190471057, + "learning_rate": 0.00033111635497962453, + "loss": 1.3752, + "step": 6849 + }, + { + "epoch": 0.6214843041190347, + "grad_norm": 0.11482934651604257, + "learning_rate": 0.00033097807024436843, + "loss": 1.4187, + "step": 6850 + }, + { + "epoch": 0.6215750317546724, + "grad_norm": 0.3070642083538066, + "learning_rate": 0.00033083980010418075, + "loss": 1.3977, + "step": 6851 + }, + { + "epoch": 0.6216657593903103, + "grad_norm": 0.13018607329103554, + "learning_rate": 0.00033070154457100133, + "loss": 1.3345, + "step": 6852 + }, + { + "epoch": 0.6217564870259481, + "grad_norm": 0.11613908401315357, + "learning_rate": 0.00033056330365676843, + "loss": 1.3956, + "step": 6853 + }, + { + "epoch": 0.6218472146615859, + "grad_norm": 0.1189949174863175, + "learning_rate": 0.00033042507737341916, + "loss": 1.3785, + "step": 6854 + }, + { + "epoch": 0.6219379422972238, + "grad_norm": 0.11289272414815583, + "learning_rate": 0.00033028686573288946, + "loss": 1.3745, + "step": 6855 + }, + { + "epoch": 0.6220286699328615, + "grad_norm": 0.12352570864675506, + "learning_rate": 0.000330148668747114, + "loss": 1.4147, + "step": 6856 + }, + { + "epoch": 0.6221193975684993, + "grad_norm": 0.11610869597489115, + "learning_rate": 0.0003300104864280261, + "loss": 1.4094, + "step": 6857 + }, + { + "epoch": 0.6222101252041372, + "grad_norm": 0.1202704096423254, + "learning_rate": 0.00032987231878755765, + "loss": 1.4022, + "step": 6858 + }, + { + "epoch": 0.622300852839775, + "grad_norm": 0.1179575586443463, + "learning_rate": 0.0003297341658376396, + "loss": 1.3727, + "step": 6859 + }, + { + "epoch": 0.6223915804754128, + "grad_norm": 0.12999685507241354, + "learning_rate": 0.0003295960275902015, + "loss": 1.3924, + "step": 6860 + }, + { + "epoch": 0.6224823081110507, + "grad_norm": 0.12170160340011206, + "learning_rate": 0.00032945790405717137, + "loss": 1.398, + "step": 6861 + }, + { + "epoch": 0.6225730357466884, + "grad_norm": 0.11301330243830075, + "learning_rate": 0.00032931979525047627, + "loss": 1.3811, + "step": 6862 + }, + { + "epoch": 0.6226637633823262, + "grad_norm": 0.13240799724662297, + "learning_rate": 0.0003291817011820422, + "loss": 1.361, + "step": 6863 + }, + { + "epoch": 0.6227544910179641, + "grad_norm": 0.1285404949223935, + "learning_rate": 0.0003290436218637933, + "loss": 1.3685, + "step": 6864 + }, + { + "epoch": 0.6228452186536019, + "grad_norm": 0.20913257144821662, + "learning_rate": 
0.00032890555730765293, + "loss": 1.365, + "step": 6865 + }, + { + "epoch": 0.6229359462892397, + "grad_norm": 0.11569518059670515, + "learning_rate": 0.00032876750752554285, + "loss": 1.4019, + "step": 6866 + }, + { + "epoch": 0.6230266739248775, + "grad_norm": 0.12252736615094387, + "learning_rate": 0.00032862947252938394, + "loss": 1.3787, + "step": 6867 + }, + { + "epoch": 0.6231174015605153, + "grad_norm": 0.11148587313911434, + "learning_rate": 0.0003284914523310951, + "loss": 1.3699, + "step": 6868 + }, + { + "epoch": 0.6232081291961531, + "grad_norm": 0.12572798132575183, + "learning_rate": 0.0003283534469425946, + "loss": 1.3704, + "step": 6869 + }, + { + "epoch": 0.623298856831791, + "grad_norm": 0.11541286856362111, + "learning_rate": 0.00032821545637579927, + "loss": 1.3902, + "step": 6870 + }, + { + "epoch": 0.6233895844674288, + "grad_norm": 0.11183419535589981, + "learning_rate": 0.0003280774806426248, + "loss": 1.3339, + "step": 6871 + }, + { + "epoch": 0.6234803121030666, + "grad_norm": 0.11527179905599119, + "learning_rate": 0.00032793951975498506, + "loss": 1.3744, + "step": 6872 + }, + { + "epoch": 0.6235710397387044, + "grad_norm": 0.12033849880038663, + "learning_rate": 0.0003278015737247931, + "loss": 1.396, + "step": 6873 + }, + { + "epoch": 0.6236617673743422, + "grad_norm": 0.12163769417961355, + "learning_rate": 0.0003276636425639608, + "loss": 1.3722, + "step": 6874 + }, + { + "epoch": 0.6237524950099801, + "grad_norm": 0.1333512112311929, + "learning_rate": 0.00032752572628439824, + "loss": 1.3924, + "step": 6875 + }, + { + "epoch": 0.6238432226456179, + "grad_norm": 0.11730903033589048, + "learning_rate": 0.00032738782489801464, + "loss": 1.4137, + "step": 6876 + }, + { + "epoch": 0.6239339502812556, + "grad_norm": 0.11426453906141372, + "learning_rate": 0.0003272499384167179, + "loss": 1.3853, + "step": 6877 + }, + { + "epoch": 0.6240246779168935, + "grad_norm": 0.11410601507770432, + "learning_rate": 0.00032711206685241413, + "loss": 1.4249, + "step": 6878 + }, + { + "epoch": 0.6241154055525313, + "grad_norm": 0.11115373933910673, + "learning_rate": 0.0003269742102170092, + "loss": 1.3935, + "step": 6879 + }, + { + "epoch": 0.6242061331881691, + "grad_norm": 0.11764198396374925, + "learning_rate": 0.0003268363685224066, + "loss": 1.4, + "step": 6880 + }, + { + "epoch": 0.624296860823807, + "grad_norm": 0.11357514488305294, + "learning_rate": 0.0003266985417805092, + "loss": 1.3681, + "step": 6881 + }, + { + "epoch": 0.6243875884594448, + "grad_norm": 0.11667459087567793, + "learning_rate": 0.0003265607300032182, + "loss": 1.3664, + "step": 6882 + }, + { + "epoch": 0.6244783160950825, + "grad_norm": 0.10953282270763125, + "learning_rate": 0.00032642293320243366, + "loss": 1.3863, + "step": 6883 + }, + { + "epoch": 0.6245690437307204, + "grad_norm": 0.12016598918619838, + "learning_rate": 0.00032628515139005457, + "loss": 1.4069, + "step": 6884 + }, + { + "epoch": 0.6246597713663582, + "grad_norm": 0.11363715041721428, + "learning_rate": 0.0003261473845779781, + "loss": 1.4058, + "step": 6885 + }, + { + "epoch": 0.624750499001996, + "grad_norm": 0.13017121582387414, + "learning_rate": 0.0003260096327781005, + "loss": 1.3657, + "step": 6886 + }, + { + "epoch": 0.6248412266376339, + "grad_norm": 0.11745015621972368, + "learning_rate": 0.0003258718960023169, + "loss": 1.3785, + "step": 6887 + }, + { + "epoch": 0.6249319542732716, + "grad_norm": 0.11569274697928923, + "learning_rate": 0.0003257341742625207, + "loss": 1.3689, + "step": 6888 + }, + { + "epoch": 
0.6250226819089094, + "grad_norm": 0.11187771248843816, + "learning_rate": 0.0003255964675706044, + "loss": 1.363, + "step": 6889 + }, + { + "epoch": 0.6251134095445473, + "grad_norm": 0.11758099105182332, + "learning_rate": 0.00032545877593845876, + "loss": 1.3588, + "step": 6890 + }, + { + "epoch": 0.6252041371801851, + "grad_norm": 0.12206310777198336, + "learning_rate": 0.0003253210993779735, + "loss": 1.3561, + "step": 6891 + }, + { + "epoch": 0.6252948648158229, + "grad_norm": 0.11485229517724183, + "learning_rate": 0.00032518343790103704, + "loss": 1.413, + "step": 6892 + }, + { + "epoch": 0.6253855924514607, + "grad_norm": 0.11798004151663849, + "learning_rate": 0.00032504579151953637, + "loss": 1.3994, + "step": 6893 + }, + { + "epoch": 0.6254763200870985, + "grad_norm": 0.11627474975268809, + "learning_rate": 0.0003249081602453575, + "loss": 1.3497, + "step": 6894 + }, + { + "epoch": 0.6255670477227363, + "grad_norm": 0.10822855226149003, + "learning_rate": 0.0003247705440903848, + "loss": 1.3837, + "step": 6895 + }, + { + "epoch": 0.6256577753583742, + "grad_norm": 0.11951240722974045, + "learning_rate": 0.00032463294306650156, + "loss": 1.3408, + "step": 6896 + }, + { + "epoch": 0.625748502994012, + "grad_norm": 0.11811026743214718, + "learning_rate": 0.0003244953571855894, + "loss": 1.3436, + "step": 6897 + }, + { + "epoch": 0.6258392306296497, + "grad_norm": 0.1174836572096957, + "learning_rate": 0.000324357786459529, + "loss": 1.3532, + "step": 6898 + }, + { + "epoch": 0.6259299582652876, + "grad_norm": 0.11310107121702721, + "learning_rate": 0.00032422023090019974, + "loss": 1.3833, + "step": 6899 + }, + { + "epoch": 0.6260206859009254, + "grad_norm": 0.11709838770363125, + "learning_rate": 0.0003240826905194794, + "loss": 1.3365, + "step": 6900 + }, + { + "epoch": 0.6261114135365632, + "grad_norm": 0.10955024796024339, + "learning_rate": 0.00032394516532924445, + "loss": 1.3483, + "step": 6901 + }, + { + "epoch": 0.6262021411722011, + "grad_norm": 0.11290269326009034, + "learning_rate": 0.0003238076553413706, + "loss": 1.3839, + "step": 6902 + }, + { + "epoch": 0.6262928688078389, + "grad_norm": 0.11417622637715151, + "learning_rate": 0.0003236701605677318, + "loss": 1.3816, + "step": 6903 + }, + { + "epoch": 0.6263835964434766, + "grad_norm": 0.1257325207639902, + "learning_rate": 0.0003235326810202005, + "loss": 1.3753, + "step": 6904 + }, + { + "epoch": 0.6264743240791145, + "grad_norm": 0.11730843477110367, + "learning_rate": 0.0003233952167106482, + "loss": 1.3749, + "step": 6905 + }, + { + "epoch": 0.6265650517147523, + "grad_norm": 0.16138901928632657, + "learning_rate": 0.00032325776765094506, + "loss": 1.3453, + "step": 6906 + }, + { + "epoch": 0.6266557793503901, + "grad_norm": 0.1151142960041944, + "learning_rate": 0.0003231203338529596, + "loss": 1.3685, + "step": 6907 + }, + { + "epoch": 0.626746506986028, + "grad_norm": 0.12183946983591724, + "learning_rate": 0.0003229829153285594, + "loss": 1.4346, + "step": 6908 + }, + { + "epoch": 0.6268372346216657, + "grad_norm": 0.12108422428071706, + "learning_rate": 0.0003228455120896105, + "loss": 1.3833, + "step": 6909 + }, + { + "epoch": 0.6269279622573036, + "grad_norm": 0.12393009467916498, + "learning_rate": 0.0003227081241479779, + "loss": 1.3964, + "step": 6910 + }, + { + "epoch": 0.6270186898929414, + "grad_norm": 0.11412161604175318, + "learning_rate": 0.00032257075151552483, + "loss": 1.3791, + "step": 6911 + }, + { + "epoch": 0.6271094175285792, + "grad_norm": 0.1357390870625021, + "learning_rate": 
0.0003224333942041134, + "loss": 1.3745, + "step": 6912 + }, + { + "epoch": 0.6272001451642171, + "grad_norm": 0.1142629687772458, + "learning_rate": 0.0003222960522256049, + "loss": 1.4017, + "step": 6913 + }, + { + "epoch": 0.6272908727998548, + "grad_norm": 0.11530654763364998, + "learning_rate": 0.00032215872559185815, + "loss": 1.4113, + "step": 6914 + }, + { + "epoch": 0.6273816004354926, + "grad_norm": 0.11027292464002314, + "learning_rate": 0.00032202141431473175, + "loss": 1.398, + "step": 6915 + }, + { + "epoch": 0.6274723280711305, + "grad_norm": 0.11904174431264068, + "learning_rate": 0.00032188411840608256, + "loss": 1.3551, + "step": 6916 + }, + { + "epoch": 0.6275630557067683, + "grad_norm": 0.10758310591527777, + "learning_rate": 0.0003217468378777657, + "loss": 1.3529, + "step": 6917 + }, + { + "epoch": 0.6276537833424061, + "grad_norm": 0.11615519734407176, + "learning_rate": 0.00032160957274163595, + "loss": 1.3391, + "step": 6918 + }, + { + "epoch": 0.627744510978044, + "grad_norm": 0.1138656396068331, + "learning_rate": 0.00032147232300954576, + "loss": 1.4131, + "step": 6919 + }, + { + "epoch": 0.6278352386136817, + "grad_norm": 0.11467822007800968, + "learning_rate": 0.00032133508869334695, + "loss": 1.3805, + "step": 6920 + }, + { + "epoch": 0.6279259662493195, + "grad_norm": 0.12058586158496663, + "learning_rate": 0.0003211978698048894, + "loss": 1.3543, + "step": 6921 + }, + { + "epoch": 0.6280166938849574, + "grad_norm": 0.11143143621733824, + "learning_rate": 0.00032106066635602214, + "loss": 1.3993, + "step": 6922 + }, + { + "epoch": 0.6281074215205952, + "grad_norm": 0.123435207362041, + "learning_rate": 0.0003209234783585929, + "loss": 1.3706, + "step": 6923 + }, + { + "epoch": 0.628198149156233, + "grad_norm": 0.1249016512560424, + "learning_rate": 0.0003207863058244475, + "loss": 1.4082, + "step": 6924 + }, + { + "epoch": 0.6282888767918708, + "grad_norm": 0.11918829503014122, + "learning_rate": 0.00032064914876543096, + "loss": 1.3734, + "step": 6925 + }, + { + "epoch": 0.6283796044275086, + "grad_norm": 0.11507423099629291, + "learning_rate": 0.0003205120071933869, + "loss": 1.4, + "step": 6926 + }, + { + "epoch": 0.6284703320631464, + "grad_norm": 0.1255040643016208, + "learning_rate": 0.0003203748811201576, + "loss": 1.3712, + "step": 6927 + }, + { + "epoch": 0.6285610596987843, + "grad_norm": 0.12128097294608904, + "learning_rate": 0.00032023777055758374, + "loss": 1.3967, + "step": 6928 + }, + { + "epoch": 0.6286517873344221, + "grad_norm": 0.11936117421565232, + "learning_rate": 0.00032010067551750486, + "loss": 1.3563, + "step": 6929 + }, + { + "epoch": 0.6287425149700598, + "grad_norm": 0.11724440556582601, + "learning_rate": 0.0003199635960117593, + "loss": 1.3771, + "step": 6930 + }, + { + "epoch": 0.6288332426056977, + "grad_norm": 0.11699954691606615, + "learning_rate": 0.0003198265320521836, + "loss": 1.3964, + "step": 6931 + }, + { + "epoch": 0.6289239702413355, + "grad_norm": 0.11049406452310997, + "learning_rate": 0.0003196894836506133, + "loss": 1.3811, + "step": 6932 + }, + { + "epoch": 0.6290146978769733, + "grad_norm": 0.11906700679974608, + "learning_rate": 0.0003195524508188828, + "loss": 1.3709, + "step": 6933 + }, + { + "epoch": 0.6291054255126112, + "grad_norm": 0.13186505547345878, + "learning_rate": 0.00031941543356882475, + "loss": 1.4252, + "step": 6934 + }, + { + "epoch": 0.629196153148249, + "grad_norm": 0.11576728745512384, + "learning_rate": 0.0003192784319122707, + "loss": 1.3968, + "step": 6935 + }, + { + "epoch": 
0.6292868807838867, + "grad_norm": 0.11551909878658283, + "learning_rate": 0.00031914144586105066, + "loss": 1.4029, + "step": 6936 + }, + { + "epoch": 0.6293776084195246, + "grad_norm": 0.1191834439522111, + "learning_rate": 0.0003190044754269934, + "loss": 1.3926, + "step": 6937 + }, + { + "epoch": 0.6294683360551624, + "grad_norm": 0.10993017371297686, + "learning_rate": 0.0003188675206219264, + "loss": 1.3711, + "step": 6938 + }, + { + "epoch": 0.6295590636908002, + "grad_norm": 0.11317342092574381, + "learning_rate": 0.00031873058145767566, + "loss": 1.3792, + "step": 6939 + }, + { + "epoch": 0.6296497913264381, + "grad_norm": 0.11755661058594652, + "learning_rate": 0.0003185936579460658, + "loss": 1.3693, + "step": 6940 + }, + { + "epoch": 0.6297405189620758, + "grad_norm": 0.1095317019417499, + "learning_rate": 0.00031845675009892037, + "loss": 1.3991, + "step": 6941 + }, + { + "epoch": 0.6298312465977136, + "grad_norm": 0.1154284926528005, + "learning_rate": 0.0003183198579280615, + "loss": 1.3591, + "step": 6942 + }, + { + "epoch": 0.6299219742333515, + "grad_norm": 0.119783306259367, + "learning_rate": 0.00031818298144530954, + "loss": 1.3911, + "step": 6943 + }, + { + "epoch": 0.6300127018689893, + "grad_norm": 0.11563166202813491, + "learning_rate": 0.00031804612066248396, + "loss": 1.3968, + "step": 6944 + }, + { + "epoch": 0.630103429504627, + "grad_norm": 0.11794091278761418, + "learning_rate": 0.0003179092755914028, + "loss": 1.3551, + "step": 6945 + }, + { + "epoch": 0.6301941571402649, + "grad_norm": 0.13862238668139826, + "learning_rate": 0.00031777244624388236, + "loss": 1.3904, + "step": 6946 + }, + { + "epoch": 0.6302848847759027, + "grad_norm": 0.11382387813633789, + "learning_rate": 0.0003176356326317381, + "loss": 1.3471, + "step": 6947 + }, + { + "epoch": 0.6303756124115406, + "grad_norm": 0.12988888207790966, + "learning_rate": 0.0003174988347667837, + "loss": 1.3686, + "step": 6948 + }, + { + "epoch": 0.6304663400471784, + "grad_norm": 0.11850104453198268, + "learning_rate": 0.0003173620526608321, + "loss": 1.3715, + "step": 6949 + }, + { + "epoch": 0.6305570676828162, + "grad_norm": 0.11117608586916575, + "learning_rate": 0.0003172252863256939, + "loss": 1.3598, + "step": 6950 + }, + { + "epoch": 0.630647795318454, + "grad_norm": 0.11839556538149876, + "learning_rate": 0.00031708853577317933, + "loss": 1.3901, + "step": 6951 + }, + { + "epoch": 0.6307385229540918, + "grad_norm": 0.11577155796951434, + "learning_rate": 0.0003169518010150967, + "loss": 1.3673, + "step": 6952 + }, + { + "epoch": 0.6308292505897296, + "grad_norm": 0.1280811883163727, + "learning_rate": 0.00031681508206325286, + "loss": 1.4048, + "step": 6953 + }, + { + "epoch": 0.6309199782253675, + "grad_norm": 0.13358286807059833, + "learning_rate": 0.0003166783789294537, + "loss": 1.3973, + "step": 6954 + }, + { + "epoch": 0.6310107058610053, + "grad_norm": 0.12048796439744547, + "learning_rate": 0.0003165416916255037, + "loss": 1.343, + "step": 6955 + }, + { + "epoch": 0.631101433496643, + "grad_norm": 0.11666540798482138, + "learning_rate": 0.0003164050201632054, + "loss": 1.3616, + "step": 6956 + }, + { + "epoch": 0.6311921611322809, + "grad_norm": 0.11872915074377692, + "learning_rate": 0.0003162683645543609, + "loss": 1.4107, + "step": 6957 + }, + { + "epoch": 0.6312828887679187, + "grad_norm": 0.12239440716441859, + "learning_rate": 0.00031613172481077015, + "loss": 1.414, + "step": 6958 + }, + { + "epoch": 0.6313736164035565, + "grad_norm": 0.11657172220537804, + "learning_rate": 
0.0003159951009442321, + "loss": 1.3327, + "step": 6959 + }, + { + "epoch": 0.6314643440391944, + "grad_norm": 0.11894663947027762, + "learning_rate": 0.00031585849296654413, + "loss": 1.3634, + "step": 6960 + }, + { + "epoch": 0.6315550716748322, + "grad_norm": 0.13893115396526856, + "learning_rate": 0.0003157219008895024, + "loss": 1.414, + "step": 6961 + }, + { + "epoch": 0.6316457993104699, + "grad_norm": 0.11603345599088745, + "learning_rate": 0.00031558532472490187, + "loss": 1.3643, + "step": 6962 + }, + { + "epoch": 0.6317365269461078, + "grad_norm": 0.13501508823419753, + "learning_rate": 0.00031544876448453554, + "loss": 1.4257, + "step": 6963 + }, + { + "epoch": 0.6318272545817456, + "grad_norm": 0.11613852520781141, + "learning_rate": 0.0003153122201801956, + "loss": 1.3465, + "step": 6964 + }, + { + "epoch": 0.6319179822173834, + "grad_norm": 0.10969000157518927, + "learning_rate": 0.0003151756918236727, + "loss": 1.3809, + "step": 6965 + }, + { + "epoch": 0.6320087098530213, + "grad_norm": 0.12522738957876567, + "learning_rate": 0.000315039179426756, + "loss": 1.3666, + "step": 6966 + }, + { + "epoch": 0.632099437488659, + "grad_norm": 0.12641806135540648, + "learning_rate": 0.0003149026830012336, + "loss": 1.3794, + "step": 6967 + }, + { + "epoch": 0.6321901651242968, + "grad_norm": 0.12560536943437822, + "learning_rate": 0.00031476620255889166, + "loss": 1.4059, + "step": 6968 + }, + { + "epoch": 0.6322808927599347, + "grad_norm": 0.11876905056046891, + "learning_rate": 0.0003146297381115155, + "loss": 1.3553, + "step": 6969 + }, + { + "epoch": 0.6323716203955725, + "grad_norm": 0.11534049071524721, + "learning_rate": 0.00031449328967088873, + "loss": 1.3927, + "step": 6970 + }, + { + "epoch": 0.6324623480312103, + "grad_norm": 0.11880638545632996, + "learning_rate": 0.0003143568572487937, + "loss": 1.3869, + "step": 6971 + }, + { + "epoch": 0.6325530756668482, + "grad_norm": 0.11782883351164791, + "learning_rate": 0.0003142204408570114, + "loss": 1.423, + "step": 6972 + }, + { + "epoch": 0.6326438033024859, + "grad_norm": 0.11327018473829434, + "learning_rate": 0.0003140840405073214, + "loss": 1.3907, + "step": 6973 + }, + { + "epoch": 0.6327345309381237, + "grad_norm": 0.11322942556753367, + "learning_rate": 0.0003139476562115021, + "loss": 1.3754, + "step": 6974 + }, + { + "epoch": 0.6328252585737616, + "grad_norm": 0.11023845448810402, + "learning_rate": 0.00031381128798132997, + "loss": 1.3546, + "step": 6975 + }, + { + "epoch": 0.6329159862093994, + "grad_norm": 0.13173767499671246, + "learning_rate": 0.00031367493582858054, + "loss": 1.3681, + "step": 6976 + }, + { + "epoch": 0.6330067138450372, + "grad_norm": 0.11224581869916037, + "learning_rate": 0.000313538599765028, + "loss": 1.3972, + "step": 6977 + }, + { + "epoch": 0.633097441480675, + "grad_norm": 0.23504855811843167, + "learning_rate": 0.00031340227980244473, + "loss": 1.3651, + "step": 6978 + }, + { + "epoch": 0.6331881691163128, + "grad_norm": 0.11978525839063524, + "learning_rate": 0.00031326597595260195, + "loss": 1.3909, + "step": 6979 + }, + { + "epoch": 0.6332788967519506, + "grad_norm": 0.11192210990103511, + "learning_rate": 0.0003131296882272698, + "loss": 1.3812, + "step": 6980 + }, + { + "epoch": 0.6333696243875885, + "grad_norm": 0.12168959625596677, + "learning_rate": 0.0003129934166382168, + "loss": 1.3927, + "step": 6981 + }, + { + "epoch": 0.6334603520232263, + "grad_norm": 0.12139089653293232, + "learning_rate": 0.00031285716119720965, + "loss": 1.3554, + "step": 6982 + }, + { + "epoch": 
0.633551079658864, + "grad_norm": 0.1279813811405937, + "learning_rate": 0.0003127209219160142, + "loss": 1.3343, + "step": 6983 + }, + { + "epoch": 0.6336418072945019, + "grad_norm": 0.11640923104511024, + "learning_rate": 0.00031258469880639494, + "loss": 1.4056, + "step": 6984 + }, + { + "epoch": 0.6337325349301397, + "grad_norm": 0.12308819211207614, + "learning_rate": 0.0003124484918801144, + "loss": 1.3874, + "step": 6985 + }, + { + "epoch": 0.6338232625657776, + "grad_norm": 0.13085661815050714, + "learning_rate": 0.00031231230114893416, + "loss": 1.4005, + "step": 6986 + }, + { + "epoch": 0.6339139902014154, + "grad_norm": 0.11831001654444086, + "learning_rate": 0.0003121761266246144, + "loss": 1.3732, + "step": 6987 + }, + { + "epoch": 0.6340047178370531, + "grad_norm": 0.12611750339786357, + "learning_rate": 0.0003120399683189139, + "loss": 1.3661, + "step": 6988 + }, + { + "epoch": 0.634095445472691, + "grad_norm": 0.12052082783027562, + "learning_rate": 0.00031190382624358975, + "loss": 1.3826, + "step": 6989 + }, + { + "epoch": 0.6341861731083288, + "grad_norm": 0.11726831488456461, + "learning_rate": 0.00031176770041039793, + "loss": 1.3888, + "step": 6990 + }, + { + "epoch": 0.6342769007439666, + "grad_norm": 0.12462994977348346, + "learning_rate": 0.0003116315908310931, + "loss": 1.3907, + "step": 6991 + }, + { + "epoch": 0.6343676283796045, + "grad_norm": 0.11673519251014589, + "learning_rate": 0.000311495497517428, + "loss": 1.4109, + "step": 6992 + }, + { + "epoch": 0.6344583560152423, + "grad_norm": 0.147556567688212, + "learning_rate": 0.0003113594204811544, + "loss": 1.3628, + "step": 6993 + }, + { + "epoch": 0.63454908365088, + "grad_norm": 0.12286372186866039, + "learning_rate": 0.0003112233597340228, + "loss": 1.4073, + "step": 6994 + }, + { + "epoch": 0.6346398112865179, + "grad_norm": 0.115523808463929, + "learning_rate": 0.00031108731528778165, + "loss": 1.3511, + "step": 6995 + }, + { + "epoch": 0.6347305389221557, + "grad_norm": 0.1331147347477605, + "learning_rate": 0.000310951287154179, + "loss": 1.4095, + "step": 6996 + }, + { + "epoch": 0.6348212665577935, + "grad_norm": 0.1596676733746156, + "learning_rate": 0.00031081527534496036, + "loss": 1.4134, + "step": 6997 + }, + { + "epoch": 0.6349119941934314, + "grad_norm": 0.12720715044629416, + "learning_rate": 0.0003106792798718707, + "loss": 1.3575, + "step": 6998 + }, + { + "epoch": 0.6350027218290691, + "grad_norm": 0.1297106984465184, + "learning_rate": 0.0003105433007466531, + "loss": 1.3928, + "step": 6999 + }, + { + "epoch": 0.6350934494647069, + "grad_norm": 0.12247385234449083, + "learning_rate": 0.00031040733798104935, + "loss": 1.3965, + "step": 7000 + }, + { + "epoch": 0.6351841771003448, + "grad_norm": 0.11415382192383379, + "learning_rate": 0.0003102713915868001, + "loss": 1.3811, + "step": 7001 + }, + { + "epoch": 0.6352749047359826, + "grad_norm": 0.11223231189099485, + "learning_rate": 0.0003101354615756439, + "loss": 1.359, + "step": 7002 + }, + { + "epoch": 0.6353656323716204, + "grad_norm": 0.11312198425923943, + "learning_rate": 0.00030999954795931853, + "loss": 1.3597, + "step": 7003 + }, + { + "epoch": 0.6354563600072582, + "grad_norm": 0.11147961748633271, + "learning_rate": 0.00030986365074956037, + "loss": 1.3692, + "step": 7004 + }, + { + "epoch": 0.635547087642896, + "grad_norm": 0.11748851264967969, + "learning_rate": 0.0003097277699581039, + "loss": 1.3767, + "step": 7005 + }, + { + "epoch": 0.6356378152785338, + "grad_norm": 0.12088727581151894, + "learning_rate": 
0.00030959190559668267, + "loss": 1.3764, + "step": 7006 + }, + { + "epoch": 0.6357285429141717, + "grad_norm": 0.12117201646586503, + "learning_rate": 0.00030945605767702835, + "loss": 1.3726, + "step": 7007 + }, + { + "epoch": 0.6358192705498095, + "grad_norm": 0.15681223458515398, + "learning_rate": 0.0003093202262108716, + "loss": 1.3797, + "step": 7008 + }, + { + "epoch": 0.6359099981854472, + "grad_norm": 0.11402452138491025, + "learning_rate": 0.0003091844112099413, + "loss": 1.3789, + "step": 7009 + }, + { + "epoch": 0.6360007258210851, + "grad_norm": 0.112756742855413, + "learning_rate": 0.00030904861268596525, + "loss": 1.4277, + "step": 7010 + }, + { + "epoch": 0.6360914534567229, + "grad_norm": 0.1266275095096536, + "learning_rate": 0.00030891283065066967, + "loss": 1.3717, + "step": 7011 + }, + { + "epoch": 0.6361821810923607, + "grad_norm": 0.11626480289777195, + "learning_rate": 0.0003087770651157793, + "loss": 1.4026, + "step": 7012 + }, + { + "epoch": 0.6362729087279986, + "grad_norm": 0.13473366840510126, + "learning_rate": 0.00030864131609301766, + "loss": 1.3584, + "step": 7013 + }, + { + "epoch": 0.6363636363636364, + "grad_norm": 0.12354373124796952, + "learning_rate": 0.00030850558359410646, + "loss": 1.3809, + "step": 7014 + }, + { + "epoch": 0.6364543639992741, + "grad_norm": 0.1303933554852257, + "learning_rate": 0.00030836986763076635, + "loss": 1.3637, + "step": 7015 + }, + { + "epoch": 0.636545091634912, + "grad_norm": 0.11674347185738869, + "learning_rate": 0.0003082341682147165, + "loss": 1.3574, + "step": 7016 + }, + { + "epoch": 0.6366358192705498, + "grad_norm": 0.11873805680739002, + "learning_rate": 0.00030809848535767444, + "loss": 1.3587, + "step": 7017 + }, + { + "epoch": 0.6367265469061876, + "grad_norm": 0.12792361983742634, + "learning_rate": 0.0003079628190713563, + "loss": 1.3729, + "step": 7018 + }, + { + "epoch": 0.6368172745418255, + "grad_norm": 0.13701602974482333, + "learning_rate": 0.0003078271693674772, + "loss": 1.3913, + "step": 7019 + }, + { + "epoch": 0.6369080021774632, + "grad_norm": 0.12277193352888023, + "learning_rate": 0.00030769153625775036, + "loss": 1.3745, + "step": 7020 + }, + { + "epoch": 0.636998729813101, + "grad_norm": 0.13234987503722373, + "learning_rate": 0.0003075559197538876, + "loss": 1.404, + "step": 7021 + }, + { + "epoch": 0.6370894574487389, + "grad_norm": 0.11833807275058598, + "learning_rate": 0.0003074203198675997, + "loss": 1.3808, + "step": 7022 + }, + { + "epoch": 0.6371801850843767, + "grad_norm": 0.13242696541882312, + "learning_rate": 0.00030728473661059555, + "loss": 1.352, + "step": 7023 + }, + { + "epoch": 0.6372709127200146, + "grad_norm": 0.13064706763953404, + "learning_rate": 0.00030714916999458264, + "loss": 1.3854, + "step": 7024 + }, + { + "epoch": 0.6373616403556523, + "grad_norm": 0.15451394033663407, + "learning_rate": 0.00030701362003126735, + "loss": 1.4343, + "step": 7025 + }, + { + "epoch": 0.6374523679912901, + "grad_norm": 0.11175917591779234, + "learning_rate": 0.0003068780867323544, + "loss": 1.3954, + "step": 7026 + }, + { + "epoch": 0.637543095626928, + "grad_norm": 0.11108690382508363, + "learning_rate": 0.00030674257010954723, + "loss": 1.4017, + "step": 7027 + }, + { + "epoch": 0.6376338232625658, + "grad_norm": 0.13370223631744885, + "learning_rate": 0.00030660707017454757, + "loss": 1.3497, + "step": 7028 + }, + { + "epoch": 0.6377245508982036, + "grad_norm": 0.1196507632202059, + "learning_rate": 0.00030647158693905584, + "loss": 1.3903, + "step": 7029 + }, + { + "epoch": 
0.6378152785338415, + "grad_norm": 0.12328274259685203, + "learning_rate": 0.00030633612041477124, + "loss": 1.3716, + "step": 7030 + }, + { + "epoch": 0.6379060061694792, + "grad_norm": 0.32069302148075984, + "learning_rate": 0.0003062006706133911, + "loss": 1.3698, + "step": 7031 + }, + { + "epoch": 0.637996733805117, + "grad_norm": 0.1301499757234504, + "learning_rate": 0.0003060652375466116, + "loss": 1.4107, + "step": 7032 + }, + { + "epoch": 0.6380874614407549, + "grad_norm": 0.12159849735796999, + "learning_rate": 0.00030592982122612755, + "loss": 1.3681, + "step": 7033 + }, + { + "epoch": 0.6381781890763927, + "grad_norm": 0.12592205970822415, + "learning_rate": 0.0003057944216636317, + "loss": 1.3694, + "step": 7034 + }, + { + "epoch": 0.6382689167120305, + "grad_norm": 0.12960867437083698, + "learning_rate": 0.00030565903887081646, + "loss": 1.3918, + "step": 7035 + }, + { + "epoch": 0.6383596443476683, + "grad_norm": 0.14762033375671849, + "learning_rate": 0.00030552367285937177, + "loss": 1.4431, + "step": 7036 + }, + { + "epoch": 0.6384503719833061, + "grad_norm": 0.14256086873830504, + "learning_rate": 0.00030538832364098676, + "loss": 1.3663, + "step": 7037 + }, + { + "epoch": 0.6385410996189439, + "grad_norm": 0.12139215586881162, + "learning_rate": 0.0003052529912273485, + "loss": 1.3892, + "step": 7038 + }, + { + "epoch": 0.6386318272545818, + "grad_norm": 0.12496467059140634, + "learning_rate": 0.0003051176756301431, + "loss": 1.3559, + "step": 7039 + }, + { + "epoch": 0.6387225548902196, + "grad_norm": 0.13402168690858454, + "learning_rate": 0.00030498237686105536, + "loss": 1.3498, + "step": 7040 + }, + { + "epoch": 0.6388132825258573, + "grad_norm": 0.12566824974972504, + "learning_rate": 0.000304847094931768, + "loss": 1.3611, + "step": 7041 + }, + { + "epoch": 0.6389040101614952, + "grad_norm": 0.13393284701443434, + "learning_rate": 0.0003047118298539626, + "loss": 1.3764, + "step": 7042 + }, + { + "epoch": 0.638994737797133, + "grad_norm": 0.12067809524132769, + "learning_rate": 0.0003045765816393196, + "loss": 1.3373, + "step": 7043 + }, + { + "epoch": 0.6390854654327708, + "grad_norm": 0.12390931436687586, + "learning_rate": 0.0003044413502995176, + "loss": 1.3586, + "step": 7044 + }, + { + "epoch": 0.6391761930684087, + "grad_norm": 0.15396284134818208, + "learning_rate": 0.0003043061358462339, + "loss": 1.4025, + "step": 7045 + }, + { + "epoch": 0.6392669207040464, + "grad_norm": 0.13297859993460395, + "learning_rate": 0.00030417093829114404, + "loss": 1.3808, + "step": 7046 + }, + { + "epoch": 0.6393576483396842, + "grad_norm": 0.12343833455879888, + "learning_rate": 0.00030403575764592275, + "loss": 1.34, + "step": 7047 + }, + { + "epoch": 0.6394483759753221, + "grad_norm": 0.12109120894289063, + "learning_rate": 0.00030390059392224246, + "loss": 1.4083, + "step": 7048 + }, + { + "epoch": 0.6395391036109599, + "grad_norm": 0.12424030087637476, + "learning_rate": 0.0003037654471317748, + "loss": 1.3495, + "step": 7049 + }, + { + "epoch": 0.6396298312465977, + "grad_norm": 0.12451140808757734, + "learning_rate": 0.0003036303172861897, + "loss": 1.3863, + "step": 7050 + }, + { + "epoch": 0.6397205588822356, + "grad_norm": 0.11604787014841668, + "learning_rate": 0.0003034952043971557, + "loss": 1.3853, + "step": 7051 + }, + { + "epoch": 0.6398112865178733, + "grad_norm": 0.13094003200154053, + "learning_rate": 0.0003033601084763398, + "loss": 1.3936, + "step": 7052 + }, + { + "epoch": 0.6399020141535111, + "grad_norm": 0.11388367559685937, + "learning_rate": 
0.0003032250295354075, + "loss": 1.407, + "step": 7053 + }, + { + "epoch": 0.639992741789149, + "grad_norm": 0.11537037111967904, + "learning_rate": 0.00030308996758602284, + "loss": 1.401, + "step": 7054 + }, + { + "epoch": 0.6400834694247868, + "grad_norm": 0.12215155410336559, + "learning_rate": 0.00030295492263984866, + "loss": 1.3845, + "step": 7055 + }, + { + "epoch": 0.6401741970604246, + "grad_norm": 0.11456126431068626, + "learning_rate": 0.00030281989470854577, + "loss": 1.3848, + "step": 7056 + }, + { + "epoch": 0.6402649246960624, + "grad_norm": 0.12111688542083171, + "learning_rate": 0.00030268488380377404, + "loss": 1.3973, + "step": 7057 + }, + { + "epoch": 0.6403556523317002, + "grad_norm": 0.12319020059847617, + "learning_rate": 0.0003025498899371917, + "loss": 1.3647, + "step": 7058 + }, + { + "epoch": 0.640446379967338, + "grad_norm": 0.11296672646723963, + "learning_rate": 0.00030241491312045553, + "loss": 1.3779, + "step": 7059 + }, + { + "epoch": 0.6405371076029759, + "grad_norm": 0.11740024370520424, + "learning_rate": 0.00030227995336522067, + "loss": 1.3571, + "step": 7060 + }, + { + "epoch": 0.6406278352386137, + "grad_norm": 0.11696220733117806, + "learning_rate": 0.00030214501068314103, + "loss": 1.3654, + "step": 7061 + }, + { + "epoch": 0.6407185628742516, + "grad_norm": 0.11487278649660088, + "learning_rate": 0.0003020100850858689, + "loss": 1.3943, + "step": 7062 + }, + { + "epoch": 0.6408092905098893, + "grad_norm": 0.1220443100910577, + "learning_rate": 0.000301875176585055, + "loss": 1.3936, + "step": 7063 + }, + { + "epoch": 0.6409000181455271, + "grad_norm": 0.11367938338930056, + "learning_rate": 0.00030174028519234884, + "loss": 1.3739, + "step": 7064 + }, + { + "epoch": 0.640990745781165, + "grad_norm": 0.11984146765782362, + "learning_rate": 0.0003016054109193982, + "loss": 1.3869, + "step": 7065 + }, + { + "epoch": 0.6410814734168028, + "grad_norm": 0.1136637926883735, + "learning_rate": 0.00030147055377784983, + "loss": 1.371, + "step": 7066 + }, + { + "epoch": 0.6411722010524405, + "grad_norm": 0.12490810018970798, + "learning_rate": 0.00030133571377934814, + "loss": 1.4094, + "step": 7067 + }, + { + "epoch": 0.6412629286880784, + "grad_norm": 0.11047515267214565, + "learning_rate": 0.00030120089093553694, + "loss": 1.3876, + "step": 7068 + }, + { + "epoch": 0.6413536563237162, + "grad_norm": 0.1150151250510665, + "learning_rate": 0.0003010660852580582, + "loss": 1.3678, + "step": 7069 + }, + { + "epoch": 0.641444383959354, + "grad_norm": 0.1373384085847247, + "learning_rate": 0.0003009312967585522, + "loss": 1.3219, + "step": 7070 + }, + { + "epoch": 0.6415351115949919, + "grad_norm": 0.11417421477987204, + "learning_rate": 0.0003007965254486581, + "loss": 1.3683, + "step": 7071 + }, + { + "epoch": 0.6416258392306297, + "grad_norm": 0.11898270009977692, + "learning_rate": 0.00030066177134001343, + "loss": 1.4065, + "step": 7072 + }, + { + "epoch": 0.6417165668662674, + "grad_norm": 0.12042707615763108, + "learning_rate": 0.0003005270344442539, + "loss": 1.4044, + "step": 7073 + }, + { + "epoch": 0.6418072945019053, + "grad_norm": 0.12526798864204275, + "learning_rate": 0.00030039231477301464, + "loss": 1.3893, + "step": 7074 + }, + { + "epoch": 0.6418980221375431, + "grad_norm": 0.11595756133899056, + "learning_rate": 0.00030025761233792836, + "loss": 1.3919, + "step": 7075 + }, + { + "epoch": 0.6419887497731809, + "grad_norm": 0.11536961426397035, + "learning_rate": 0.0003001229271506268, + "loss": 1.4193, + "step": 7076 + }, + { + "epoch": 
0.6420794774088188, + "grad_norm": 0.16219641016978567, + "learning_rate": 0.00029998825922273974, + "loss": 1.384, + "step": 7077 + }, + { + "epoch": 0.6421702050444565, + "grad_norm": 0.1259957555963316, + "learning_rate": 0.00029985360856589607, + "loss": 1.3711, + "step": 7078 + }, + { + "epoch": 0.6422609326800943, + "grad_norm": 0.12578709834467805, + "learning_rate": 0.00029971897519172287, + "loss": 1.3675, + "step": 7079 + }, + { + "epoch": 0.6423516603157322, + "grad_norm": 0.11870115599453557, + "learning_rate": 0.00029958435911184555, + "loss": 1.3751, + "step": 7080 + }, + { + "epoch": 0.64244238795137, + "grad_norm": 0.11823143374334322, + "learning_rate": 0.0002994497603378883, + "loss": 1.3948, + "step": 7081 + }, + { + "epoch": 0.6425331155870078, + "grad_norm": 0.12022631429368202, + "learning_rate": 0.00029931517888147395, + "loss": 1.354, + "step": 7082 + }, + { + "epoch": 0.6426238432226457, + "grad_norm": 0.11611939776437744, + "learning_rate": 0.0002991806147542234, + "loss": 1.3996, + "step": 7083 + }, + { + "epoch": 0.6427145708582834, + "grad_norm": 0.11654814322864475, + "learning_rate": 0.00029904606796775645, + "loss": 1.3751, + "step": 7084 + }, + { + "epoch": 0.6428052984939212, + "grad_norm": 0.10951849020069981, + "learning_rate": 0.0002989115385336911, + "loss": 1.3718, + "step": 7085 + }, + { + "epoch": 0.6428960261295591, + "grad_norm": 0.11781651745242619, + "learning_rate": 0.0002987770264636441, + "loss": 1.406, + "step": 7086 + }, + { + "epoch": 0.6429867537651969, + "grad_norm": 0.1091265486375645, + "learning_rate": 0.0002986425317692305, + "loss": 1.418, + "step": 7087 + }, + { + "epoch": 0.6430774814008346, + "grad_norm": 0.11192163217381433, + "learning_rate": 0.00029850805446206383, + "loss": 1.4042, + "step": 7088 + }, + { + "epoch": 0.6431682090364725, + "grad_norm": 0.11968106016721441, + "learning_rate": 0.0002983735945537564, + "loss": 1.3939, + "step": 7089 + }, + { + "epoch": 0.6432589366721103, + "grad_norm": 0.11547854113144596, + "learning_rate": 0.00029823915205591886, + "loss": 1.3784, + "step": 7090 + }, + { + "epoch": 0.6433496643077481, + "grad_norm": 0.13331878222299587, + "learning_rate": 0.00029810472698016036, + "loss": 1.3821, + "step": 7091 + }, + { + "epoch": 0.643440391943386, + "grad_norm": 0.1131962736663654, + "learning_rate": 0.00029797031933808825, + "loss": 1.3685, + "step": 7092 + }, + { + "epoch": 0.6435311195790238, + "grad_norm": 0.12207510380930747, + "learning_rate": 0.00029783592914130896, + "loss": 1.3773, + "step": 7093 + }, + { + "epoch": 0.6436218472146615, + "grad_norm": 0.10862678117312445, + "learning_rate": 0.00029770155640142704, + "loss": 1.3903, + "step": 7094 + }, + { + "epoch": 0.6437125748502994, + "grad_norm": 0.11159473032234231, + "learning_rate": 0.00029756720113004544, + "loss": 1.3465, + "step": 7095 + }, + { + "epoch": 0.6438033024859372, + "grad_norm": 0.11835364239023569, + "learning_rate": 0.00029743286333876575, + "loss": 1.3457, + "step": 7096 + }, + { + "epoch": 0.643894030121575, + "grad_norm": 0.11762992709829956, + "learning_rate": 0.00029729854303918825, + "loss": 1.3909, + "step": 7097 + }, + { + "epoch": 0.6439847577572129, + "grad_norm": 0.12051436633744823, + "learning_rate": 0.00029716424024291155, + "loss": 1.3725, + "step": 7098 + }, + { + "epoch": 0.6440754853928506, + "grad_norm": 0.12314407312786746, + "learning_rate": 0.0002970299549615325, + "loss": 1.3673, + "step": 7099 + }, + { + "epoch": 0.6441662130284885, + "grad_norm": 0.11864079288420913, + "learning_rate": 
0.00029689568720664677, + "loss": 1.3763, + "step": 7100 + }, + { + "epoch": 0.6442569406641263, + "grad_norm": 0.11042195236728017, + "learning_rate": 0.0002967614369898485, + "loss": 1.3396, + "step": 7101 + }, + { + "epoch": 0.6443476682997641, + "grad_norm": 0.11382083712683529, + "learning_rate": 0.00029662720432272995, + "loss": 1.3964, + "step": 7102 + }, + { + "epoch": 0.644438395935402, + "grad_norm": 0.137557769658842, + "learning_rate": 0.00029649298921688227, + "loss": 1.3688, + "step": 7103 + }, + { + "epoch": 0.6445291235710398, + "grad_norm": 0.12455383133314873, + "learning_rate": 0.00029635879168389497, + "loss": 1.3771, + "step": 7104 + }, + { + "epoch": 0.6446198512066775, + "grad_norm": 0.11813656193083807, + "learning_rate": 0.00029622461173535615, + "loss": 1.3815, + "step": 7105 + }, + { + "epoch": 0.6447105788423154, + "grad_norm": 0.11127701640276673, + "learning_rate": 0.000296090449382852, + "loss": 1.3866, + "step": 7106 + }, + { + "epoch": 0.6448013064779532, + "grad_norm": 0.11292914877475813, + "learning_rate": 0.0002959563046379676, + "loss": 1.3605, + "step": 7107 + }, + { + "epoch": 0.644892034113591, + "grad_norm": 0.10964260986451331, + "learning_rate": 0.00029582217751228656, + "loss": 1.4044, + "step": 7108 + }, + { + "epoch": 0.6449827617492289, + "grad_norm": 0.114510288754317, + "learning_rate": 0.00029568806801739045, + "loss": 1.3619, + "step": 7109 + }, + { + "epoch": 0.6450734893848666, + "grad_norm": 0.14358275823860642, + "learning_rate": 0.00029555397616485977, + "loss": 1.4005, + "step": 7110 + }, + { + "epoch": 0.6451642170205044, + "grad_norm": 0.12004357893565315, + "learning_rate": 0.0002954199019662734, + "loss": 1.4409, + "step": 7111 + }, + { + "epoch": 0.6452549446561423, + "grad_norm": 0.11438637095859373, + "learning_rate": 0.00029528584543320847, + "loss": 1.3356, + "step": 7112 + }, + { + "epoch": 0.6453456722917801, + "grad_norm": 0.11706285886663031, + "learning_rate": 0.0002951518065772412, + "loss": 1.3668, + "step": 7113 + }, + { + "epoch": 0.6454363999274179, + "grad_norm": 0.1246973703352442, + "learning_rate": 0.00029501778540994553, + "loss": 1.3718, + "step": 7114 + }, + { + "epoch": 0.6455271275630557, + "grad_norm": 0.13755650679915674, + "learning_rate": 0.00029488378194289446, + "loss": 1.3898, + "step": 7115 + }, + { + "epoch": 0.6456178551986935, + "grad_norm": 0.13859521212433032, + "learning_rate": 0.00029474979618765885, + "loss": 1.3814, + "step": 7116 + }, + { + "epoch": 0.6457085828343313, + "grad_norm": 0.12105784761608392, + "learning_rate": 0.00029461582815580866, + "loss": 1.3953, + "step": 7117 + }, + { + "epoch": 0.6457993104699692, + "grad_norm": 0.13548539429228057, + "learning_rate": 0.00029448187785891206, + "loss": 1.3631, + "step": 7118 + }, + { + "epoch": 0.645890038105607, + "grad_norm": 0.11898552145675922, + "learning_rate": 0.0002943479453085355, + "loss": 1.4034, + "step": 7119 + }, + { + "epoch": 0.6459807657412447, + "grad_norm": 0.12397788590868182, + "learning_rate": 0.00029421403051624404, + "loss": 1.3574, + "step": 7120 + }, + { + "epoch": 0.6460714933768826, + "grad_norm": 0.1228350451987103, + "learning_rate": 0.00029408013349360154, + "loss": 1.3729, + "step": 7121 + }, + { + "epoch": 0.6461622210125204, + "grad_norm": 0.12778587487176335, + "learning_rate": 0.00029394625425216984, + "loss": 1.3722, + "step": 7122 + }, + { + "epoch": 0.6462529486481582, + "grad_norm": 0.12253843093384391, + "learning_rate": 0.00029381239280350946, + "loss": 1.4164, + "step": 7123 + }, + { + 
"epoch": 0.6463436762837961, + "grad_norm": 0.21276799798170737, + "learning_rate": 0.00029367854915917936, + "loss": 1.4098, + "step": 7124 + }, + { + "epoch": 0.6464344039194339, + "grad_norm": 0.134233222438005, + "learning_rate": 0.000293544723330737, + "loss": 1.3583, + "step": 7125 + }, + { + "epoch": 0.6465251315550716, + "grad_norm": 0.1204211983400856, + "learning_rate": 0.00029341091532973814, + "loss": 1.3748, + "step": 7126 + }, + { + "epoch": 0.6466158591907095, + "grad_norm": 0.13381381920350538, + "learning_rate": 0.00029327712516773706, + "loss": 1.336, + "step": 7127 + }, + { + "epoch": 0.6467065868263473, + "grad_norm": 0.11317320820047835, + "learning_rate": 0.0002931433528562868, + "loss": 1.364, + "step": 7128 + }, + { + "epoch": 0.6467973144619851, + "grad_norm": 0.13306918505372736, + "learning_rate": 0.00029300959840693853, + "loss": 1.4306, + "step": 7129 + }, + { + "epoch": 0.646888042097623, + "grad_norm": 0.1283360899963382, + "learning_rate": 0.000292875861831242, + "loss": 1.38, + "step": 7130 + }, + { + "epoch": 0.6469787697332607, + "grad_norm": 0.11763530573233873, + "learning_rate": 0.0002927421431407452, + "loss": 1.3469, + "step": 7131 + }, + { + "epoch": 0.6470694973688985, + "grad_norm": 0.12154352251076528, + "learning_rate": 0.000292608442346995, + "loss": 1.3851, + "step": 7132 + }, + { + "epoch": 0.6471602250045364, + "grad_norm": 0.12325501453190792, + "learning_rate": 0.0002924747594615364, + "loss": 1.3874, + "step": 7133 + }, + { + "epoch": 0.6472509526401742, + "grad_norm": 0.13075842176035213, + "learning_rate": 0.00029234109449591283, + "loss": 1.3913, + "step": 7134 + }, + { + "epoch": 0.647341680275812, + "grad_norm": 0.12615333520466315, + "learning_rate": 0.00029220744746166625, + "loss": 1.3791, + "step": 7135 + }, + { + "epoch": 0.6474324079114498, + "grad_norm": 0.4271485905408699, + "learning_rate": 0.0002920738183703373, + "loss": 1.384, + "step": 7136 + }, + { + "epoch": 0.6475231355470876, + "grad_norm": 0.11707521011530785, + "learning_rate": 0.0002919402072334648, + "loss": 1.3617, + "step": 7137 + }, + { + "epoch": 0.6476138631827255, + "grad_norm": 0.12452334590121099, + "learning_rate": 0.000291806614062586, + "loss": 1.4007, + "step": 7138 + }, + { + "epoch": 0.6477045908183633, + "grad_norm": 0.1309055148200258, + "learning_rate": 0.00029167303886923693, + "loss": 1.3749, + "step": 7139 + }, + { + "epoch": 0.6477953184540011, + "grad_norm": 0.1108483848946619, + "learning_rate": 0.0002915394816649516, + "loss": 1.3733, + "step": 7140 + }, + { + "epoch": 0.647886046089639, + "grad_norm": 0.1131683685696173, + "learning_rate": 0.0002914059424612628, + "loss": 1.3342, + "step": 7141 + }, + { + "epoch": 0.6479767737252767, + "grad_norm": 0.11774671204664722, + "learning_rate": 0.0002912724212697014, + "loss": 1.3704, + "step": 7142 + }, + { + "epoch": 0.6480675013609145, + "grad_norm": 0.13603532684252362, + "learning_rate": 0.00029113891810179715, + "loss": 1.3766, + "step": 7143 + }, + { + "epoch": 0.6481582289965524, + "grad_norm": 0.11953304252067323, + "learning_rate": 0.0002910054329690784, + "loss": 1.3652, + "step": 7144 + }, + { + "epoch": 0.6482489566321902, + "grad_norm": 0.17282410094515913, + "learning_rate": 0.000290871965883071, + "loss": 1.3629, + "step": 7145 + }, + { + "epoch": 0.648339684267828, + "grad_norm": 0.13041320620621621, + "learning_rate": 0.00029073851685530044, + "loss": 1.3611, + "step": 7146 + }, + { + "epoch": 0.6484304119034658, + "grad_norm": 0.11376077359771714, + "learning_rate": 
0.0002906050858972898, + "loss": 1.4097, + "step": 7147 + }, + { + "epoch": 0.6485211395391036, + "grad_norm": 0.12316954236289329, + "learning_rate": 0.00029047167302056066, + "loss": 1.4026, + "step": 7148 + }, + { + "epoch": 0.6486118671747414, + "grad_norm": 0.12814060329146204, + "learning_rate": 0.0002903382782366336, + "loss": 1.3676, + "step": 7149 + }, + { + "epoch": 0.6487025948103793, + "grad_norm": 0.12454042645005382, + "learning_rate": 0.0002902049015570271, + "loss": 1.3796, + "step": 7150 + }, + { + "epoch": 0.6487933224460171, + "grad_norm": 0.12830906369033007, + "learning_rate": 0.0002900715429932581, + "loss": 1.3641, + "step": 7151 + }, + { + "epoch": 0.6488840500816548, + "grad_norm": 0.11392746260520092, + "learning_rate": 0.0002899382025568423, + "loss": 1.3578, + "step": 7152 + }, + { + "epoch": 0.6489747777172927, + "grad_norm": 0.13735371604177854, + "learning_rate": 0.00028980488025929385, + "loss": 1.3822, + "step": 7153 + }, + { + "epoch": 0.6490655053529305, + "grad_norm": 0.11654296895581388, + "learning_rate": 0.00028967157611212505, + "loss": 1.3786, + "step": 7154 + }, + { + "epoch": 0.6491562329885683, + "grad_norm": 0.15674854033998314, + "learning_rate": 0.0002895382901268463, + "loss": 1.41, + "step": 7155 + }, + { + "epoch": 0.6492469606242062, + "grad_norm": 0.13009078908777807, + "learning_rate": 0.0002894050223149676, + "loss": 1.3962, + "step": 7156 + }, + { + "epoch": 0.649337688259844, + "grad_norm": 0.14760030226825238, + "learning_rate": 0.00028927177268799607, + "loss": 1.4016, + "step": 7157 + }, + { + "epoch": 0.6494284158954817, + "grad_norm": 0.11837570598888025, + "learning_rate": 0.00028913854125743794, + "loss": 1.3622, + "step": 7158 + }, + { + "epoch": 0.6495191435311196, + "grad_norm": 0.11463308907125873, + "learning_rate": 0.00028900532803479784, + "loss": 1.3824, + "step": 7159 + }, + { + "epoch": 0.6496098711667574, + "grad_norm": 0.11623047628836106, + "learning_rate": 0.00028887213303157894, + "loss": 1.3948, + "step": 7160 + }, + { + "epoch": 0.6497005988023952, + "grad_norm": 0.11260678640772745, + "learning_rate": 0.0002887389562592825, + "loss": 1.3533, + "step": 7161 + }, + { + "epoch": 0.6497913264380331, + "grad_norm": 0.11016569807002408, + "learning_rate": 0.0002886057977294081, + "loss": 1.3716, + "step": 7162 + }, + { + "epoch": 0.6498820540736708, + "grad_norm": 0.12436506130325069, + "learning_rate": 0.00028847265745345443, + "loss": 1.4021, + "step": 7163 + }, + { + "epoch": 0.6499727817093086, + "grad_norm": 0.11808933101909663, + "learning_rate": 0.00028833953544291796, + "loss": 1.4013, + "step": 7164 + }, + { + "epoch": 0.6500635093449465, + "grad_norm": 0.11574190816681199, + "learning_rate": 0.00028820643170929364, + "loss": 1.3792, + "step": 7165 + }, + { + "epoch": 0.6501542369805843, + "grad_norm": 0.12718331612856734, + "learning_rate": 0.0002880733462640751, + "loss": 1.3769, + "step": 7166 + }, + { + "epoch": 0.650244964616222, + "grad_norm": 0.1136848472834443, + "learning_rate": 0.0002879402791187545, + "loss": 1.3863, + "step": 7167 + }, + { + "epoch": 0.6503356922518599, + "grad_norm": 0.11880048058116337, + "learning_rate": 0.00028780723028482214, + "loss": 1.4027, + "step": 7168 + }, + { + "epoch": 0.6504264198874977, + "grad_norm": 0.1308645264228704, + "learning_rate": 0.0002876741997737665, + "loss": 1.3776, + "step": 7169 + }, + { + "epoch": 0.6505171475231355, + "grad_norm": 0.13260938236926398, + "learning_rate": 0.00028754118759707523, + "loss": 1.3697, + "step": 7170 + }, + { + "epoch": 
0.6506078751587734, + "grad_norm": 0.11712207881336427, + "learning_rate": 0.00028740819376623375, + "loss": 1.3868, + "step": 7171 + }, + { + "epoch": 0.6506986027944112, + "grad_norm": 0.11839446342639075, + "learning_rate": 0.0002872752182927257, + "loss": 1.411, + "step": 7172 + }, + { + "epoch": 0.6507893304300489, + "grad_norm": 0.11800631847328863, + "learning_rate": 0.00028714226118803425, + "loss": 1.3798, + "step": 7173 + }, + { + "epoch": 0.6508800580656868, + "grad_norm": 0.12195490570836148, + "learning_rate": 0.00028700932246363974, + "loss": 1.3999, + "step": 7174 + }, + { + "epoch": 0.6509707857013246, + "grad_norm": 0.11996115059319369, + "learning_rate": 0.0002868764021310217, + "loss": 1.3613, + "step": 7175 + }, + { + "epoch": 0.6510615133369625, + "grad_norm": 0.13052049015120093, + "learning_rate": 0.00028674350020165766, + "loss": 1.3332, + "step": 7176 + }, + { + "epoch": 0.6511522409726003, + "grad_norm": 0.11456554827686867, + "learning_rate": 0.000286610616687024, + "loss": 1.3742, + "step": 7177 + }, + { + "epoch": 0.651242968608238, + "grad_norm": 0.12722226271265713, + "learning_rate": 0.00028647775159859504, + "loss": 1.3699, + "step": 7178 + }, + { + "epoch": 0.6513336962438759, + "grad_norm": 0.13276271708250414, + "learning_rate": 0.00028634490494784345, + "loss": 1.3898, + "step": 7179 + }, + { + "epoch": 0.6514244238795137, + "grad_norm": 0.11562454922488828, + "learning_rate": 0.000286212076746241, + "loss": 1.3359, + "step": 7180 + }, + { + "epoch": 0.6515151515151515, + "grad_norm": 0.12229973767348623, + "learning_rate": 0.0002860792670052572, + "loss": 1.4194, + "step": 7181 + }, + { + "epoch": 0.6516058791507894, + "grad_norm": 0.13258296557276342, + "learning_rate": 0.0002859464757363601, + "loss": 1.3436, + "step": 7182 + }, + { + "epoch": 0.6516966067864272, + "grad_norm": 0.11836668734047653, + "learning_rate": 0.0002858137029510164, + "loss": 1.3622, + "step": 7183 + }, + { + "epoch": 0.6517873344220649, + "grad_norm": 0.16770742021295054, + "learning_rate": 0.00028568094866069114, + "loss": 1.3806, + "step": 7184 + }, + { + "epoch": 0.6518780620577028, + "grad_norm": 0.12219351472524019, + "learning_rate": 0.0002855482128768476, + "loss": 1.3873, + "step": 7185 + }, + { + "epoch": 0.6519687896933406, + "grad_norm": 0.1257369587223857, + "learning_rate": 0.00028541549561094726, + "loss": 1.3935, + "step": 7186 + }, + { + "epoch": 0.6520595173289784, + "grad_norm": 0.12267844181448982, + "learning_rate": 0.0002852827968744507, + "loss": 1.3745, + "step": 7187 + }, + { + "epoch": 0.6521502449646163, + "grad_norm": 0.11291260389610354, + "learning_rate": 0.0002851501166788163, + "loss": 1.3959, + "step": 7188 + }, + { + "epoch": 0.652240972600254, + "grad_norm": 0.11169983151130074, + "learning_rate": 0.00028501745503550075, + "loss": 1.402, + "step": 7189 + }, + { + "epoch": 0.6523317002358918, + "grad_norm": 0.13638610762221437, + "learning_rate": 0.00028488481195595967, + "loss": 1.3846, + "step": 7190 + }, + { + "epoch": 0.6524224278715297, + "grad_norm": 0.11336258042620616, + "learning_rate": 0.000284752187451647, + "loss": 1.3964, + "step": 7191 + }, + { + "epoch": 0.6525131555071675, + "grad_norm": 0.11630367977805933, + "learning_rate": 0.0002846195815340146, + "loss": 1.371, + "step": 7192 + }, + { + "epoch": 0.6526038831428053, + "grad_norm": 0.1232458306159223, + "learning_rate": 0.00028448699421451294, + "loss": 1.3298, + "step": 7193 + }, + { + "epoch": 0.6526946107784432, + "grad_norm": 0.11686166104241436, + "learning_rate": 
0.0002843544255045912, + "loss": 1.3974, + "step": 7194 + }, + { + "epoch": 0.6527853384140809, + "grad_norm": 0.11861714001162811, + "learning_rate": 0.0002842218754156968, + "loss": 1.357, + "step": 7195 + }, + { + "epoch": 0.6528760660497187, + "grad_norm": 0.12081147823592081, + "learning_rate": 0.00028408934395927486, + "loss": 1.3703, + "step": 7196 + }, + { + "epoch": 0.6529667936853566, + "grad_norm": 0.12098855164546703, + "learning_rate": 0.0002839568311467702, + "loss": 1.4063, + "step": 7197 + }, + { + "epoch": 0.6530575213209944, + "grad_norm": 0.11274898425889317, + "learning_rate": 0.00028382433698962475, + "loss": 1.3522, + "step": 7198 + }, + { + "epoch": 0.6531482489566321, + "grad_norm": 0.11247801278978367, + "learning_rate": 0.0002836918614992799, + "loss": 1.3837, + "step": 7199 + }, + { + "epoch": 0.65323897659227, + "grad_norm": 0.12911970129429043, + "learning_rate": 0.00028355940468717446, + "loss": 1.352, + "step": 7200 + }, + { + "epoch": 0.6533297042279078, + "grad_norm": 0.16470880697878712, + "learning_rate": 0.0002834269665647465, + "loss": 1.3785, + "step": 7201 + }, + { + "epoch": 0.6534204318635456, + "grad_norm": 0.6277702432409015, + "learning_rate": 0.000283294547143432, + "loss": 1.3727, + "step": 7202 + }, + { + "epoch": 0.6535111594991835, + "grad_norm": 0.11685025240500294, + "learning_rate": 0.000283162146434665, + "loss": 1.3651, + "step": 7203 + }, + { + "epoch": 0.6536018871348213, + "grad_norm": 0.11860470280283002, + "learning_rate": 0.0002830297644498789, + "loss": 1.3618, + "step": 7204 + }, + { + "epoch": 0.653692614770459, + "grad_norm": 0.12054715995147897, + "learning_rate": 0.00028289740120050435, + "loss": 1.3418, + "step": 7205 + }, + { + "epoch": 0.6537833424060969, + "grad_norm": 0.12607588575313258, + "learning_rate": 0.0002827650566979713, + "loss": 1.3699, + "step": 7206 + }, + { + "epoch": 0.6538740700417347, + "grad_norm": 0.13777489105738944, + "learning_rate": 0.00028263273095370766, + "loss": 1.3843, + "step": 7207 + }, + { + "epoch": 0.6539647976773725, + "grad_norm": 0.12065932407948178, + "learning_rate": 0.0002825004239791398, + "loss": 1.3823, + "step": 7208 + }, + { + "epoch": 0.6540555253130104, + "grad_norm": 0.12707543597773063, + "learning_rate": 0.00028236813578569244, + "loss": 1.3648, + "step": 7209 + }, + { + "epoch": 0.6541462529486481, + "grad_norm": 0.139039216581424, + "learning_rate": 0.0002822358663847884, + "loss": 1.4059, + "step": 7210 + }, + { + "epoch": 0.6542369805842859, + "grad_norm": 0.11731292675789952, + "learning_rate": 0.0002821036157878496, + "loss": 1.4072, + "step": 7211 + }, + { + "epoch": 0.6543277082199238, + "grad_norm": 0.12971786991035258, + "learning_rate": 0.0002819713840062956, + "loss": 1.3991, + "step": 7212 + }, + { + "epoch": 0.6544184358555616, + "grad_norm": 0.13183216076456214, + "learning_rate": 0.00028183917105154444, + "loss": 1.3822, + "step": 7213 + }, + { + "epoch": 0.6545091634911994, + "grad_norm": 0.12545107190927743, + "learning_rate": 0.0002817069769350134, + "loss": 1.3817, + "step": 7214 + }, + { + "epoch": 0.6545998911268373, + "grad_norm": 0.12299655153878182, + "learning_rate": 0.0002815748016681171, + "loss": 1.3439, + "step": 7215 + }, + { + "epoch": 0.654690618762475, + "grad_norm": 0.13008226043141619, + "learning_rate": 0.0002814426452622686, + "loss": 1.3735, + "step": 7216 + }, + { + "epoch": 0.6547813463981129, + "grad_norm": 0.14204129827098996, + "learning_rate": 0.0002813105077288801, + "loss": 1.406, + "step": 7217 + }, + { + "epoch": 
0.6548720740337507, + "grad_norm": 0.14785230908263314, + "learning_rate": 0.0002811783890793615, + "loss": 1.3941, + "step": 7218 + }, + { + "epoch": 0.6549628016693885, + "grad_norm": 0.1249185310496207, + "learning_rate": 0.0002810462893251213, + "loss": 1.3584, + "step": 7219 + }, + { + "epoch": 0.6550535293050264, + "grad_norm": 0.14125111993000958, + "learning_rate": 0.000280914208477566, + "loss": 1.3873, + "step": 7220 + }, + { + "epoch": 0.6551442569406641, + "grad_norm": 0.14494446775329983, + "learning_rate": 0.0002807821465481011, + "loss": 1.3766, + "step": 7221 + }, + { + "epoch": 0.6552349845763019, + "grad_norm": 0.13348239327606629, + "learning_rate": 0.0002806501035481305, + "loss": 1.4067, + "step": 7222 + }, + { + "epoch": 0.6553257122119398, + "grad_norm": 0.1270208504694051, + "learning_rate": 0.00028051807948905537, + "loss": 1.4226, + "step": 7223 + }, + { + "epoch": 0.6554164398475776, + "grad_norm": 0.13040466954011953, + "learning_rate": 0.0002803860743822768, + "loss": 1.3507, + "step": 7224 + }, + { + "epoch": 0.6555071674832154, + "grad_norm": 0.1359968914491045, + "learning_rate": 0.000280254088239193, + "loss": 1.3491, + "step": 7225 + }, + { + "epoch": 0.6555978951188532, + "grad_norm": 0.13756474979130756, + "learning_rate": 0.0002801221210712008, + "loss": 1.3696, + "step": 7226 + }, + { + "epoch": 0.655688622754491, + "grad_norm": 0.1264782067021046, + "learning_rate": 0.0002799901728896962, + "loss": 1.3895, + "step": 7227 + }, + { + "epoch": 0.6557793503901288, + "grad_norm": 0.12645691350143018, + "learning_rate": 0.0002798582437060725, + "loss": 1.442, + "step": 7228 + }, + { + "epoch": 0.6558700780257667, + "grad_norm": 0.1210164508023837, + "learning_rate": 0.0002797263335317217, + "loss": 1.359, + "step": 7229 + }, + { + "epoch": 0.6559608056614045, + "grad_norm": 0.14623342289625102, + "learning_rate": 0.0002795944423780346, + "loss": 1.3915, + "step": 7230 + }, + { + "epoch": 0.6560515332970422, + "grad_norm": 0.14635440991054713, + "learning_rate": 0.0002794625702563999, + "loss": 1.4271, + "step": 7231 + }, + { + "epoch": 0.6561422609326801, + "grad_norm": 0.13987458907861333, + "learning_rate": 0.0002793307171782048, + "loss": 1.3205, + "step": 7232 + }, + { + "epoch": 0.6562329885683179, + "grad_norm": 0.16968653817937626, + "learning_rate": 0.00027919888315483467, + "loss": 1.4109, + "step": 7233 + }, + { + "epoch": 0.6563237162039557, + "grad_norm": 0.13058065135434133, + "learning_rate": 0.00027906706819767367, + "loss": 1.395, + "step": 7234 + }, + { + "epoch": 0.6564144438395936, + "grad_norm": 0.13310884570792547, + "learning_rate": 0.0002789352723181039, + "loss": 1.4083, + "step": 7235 + }, + { + "epoch": 0.6565051714752314, + "grad_norm": 0.12324089987265481, + "learning_rate": 0.0002788034955275058, + "loss": 1.3651, + "step": 7236 + }, + { + "epoch": 0.6565958991108691, + "grad_norm": 0.13449193243367696, + "learning_rate": 0.0002786717378372584, + "loss": 1.3736, + "step": 7237 + }, + { + "epoch": 0.656686626746507, + "grad_norm": 0.1392899805281385, + "learning_rate": 0.0002785399992587393, + "loss": 1.4123, + "step": 7238 + }, + { + "epoch": 0.6567773543821448, + "grad_norm": 0.12317902608776439, + "learning_rate": 0.00027840827980332386, + "loss": 1.3929, + "step": 7239 + }, + { + "epoch": 0.6568680820177826, + "grad_norm": 0.1681162216299024, + "learning_rate": 0.00027827657948238595, + "loss": 1.3537, + "step": 7240 + }, + { + "epoch": 0.6569588096534205, + "grad_norm": 0.11882665552223069, + "learning_rate": 
0.00027814489830729826, + "loss": 1.3632, + "step": 7241 + }, + { + "epoch": 0.6570495372890582, + "grad_norm": 0.12437570446564564, + "learning_rate": 0.0002780132362894313, + "loss": 1.3717, + "step": 7242 + }, + { + "epoch": 0.657140264924696, + "grad_norm": 0.13771741234283755, + "learning_rate": 0.0002778815934401539, + "loss": 1.3542, + "step": 7243 + }, + { + "epoch": 0.6572309925603339, + "grad_norm": 0.13378277292119314, + "learning_rate": 0.00027774996977083366, + "loss": 1.4041, + "step": 7244 + }, + { + "epoch": 0.6573217201959717, + "grad_norm": 0.14529390356829408, + "learning_rate": 0.00027761836529283644, + "loss": 1.4245, + "step": 7245 + }, + { + "epoch": 0.6574124478316095, + "grad_norm": 0.12948857720029439, + "learning_rate": 0.00027748678001752623, + "loss": 1.3497, + "step": 7246 + }, + { + "epoch": 0.6575031754672473, + "grad_norm": 0.12174008150356845, + "learning_rate": 0.0002773552139562651, + "loss": 1.3684, + "step": 7247 + }, + { + "epoch": 0.6575939031028851, + "grad_norm": 0.12926746323008867, + "learning_rate": 0.0002772236671204143, + "loss": 1.3714, + "step": 7248 + }, + { + "epoch": 0.6576846307385229, + "grad_norm": 0.13705922770355705, + "learning_rate": 0.0002770921395213327, + "loss": 1.3716, + "step": 7249 + }, + { + "epoch": 0.6577753583741608, + "grad_norm": 0.12780918085169313, + "learning_rate": 0.0002769606311703774, + "loss": 1.3814, + "step": 7250 + }, + { + "epoch": 0.6578660860097986, + "grad_norm": 0.13460899271764343, + "learning_rate": 0.00027682914207890477, + "loss": 1.3553, + "step": 7251 + }, + { + "epoch": 0.6579568136454363, + "grad_norm": 0.13257905154790062, + "learning_rate": 0.0002766976722582684, + "loss": 1.3826, + "step": 7252 + }, + { + "epoch": 0.6580475412810742, + "grad_norm": 0.123167180947067, + "learning_rate": 0.0002765662217198211, + "loss": 1.4154, + "step": 7253 + }, + { + "epoch": 0.658138268916712, + "grad_norm": 0.13767077990191867, + "learning_rate": 0.0002764347904749133, + "loss": 1.3685, + "step": 7254 + }, + { + "epoch": 0.6582289965523499, + "grad_norm": 0.13663607351851437, + "learning_rate": 0.0002763033785348945, + "loss": 1.3676, + "step": 7255 + }, + { + "epoch": 0.6583197241879877, + "grad_norm": 0.1299743482316489, + "learning_rate": 0.00027617198591111194, + "loss": 1.3973, + "step": 7256 + }, + { + "epoch": 0.6584104518236255, + "grad_norm": 0.13630574634387277, + "learning_rate": 0.0002760406126149112, + "loss": 1.3699, + "step": 7257 + }, + { + "epoch": 0.6585011794592633, + "grad_norm": 0.12855613163412852, + "learning_rate": 0.0002759092586576367, + "loss": 1.3556, + "step": 7258 + }, + { + "epoch": 0.6585919070949011, + "grad_norm": 0.11626998517963294, + "learning_rate": 0.0002757779240506308, + "loss": 1.3875, + "step": 7259 + }, + { + "epoch": 0.6586826347305389, + "grad_norm": 0.12253254505281029, + "learning_rate": 0.00027564660880523403, + "loss": 1.3639, + "step": 7260 + }, + { + "epoch": 0.6587733623661768, + "grad_norm": 0.13306524402805084, + "learning_rate": 0.00027551531293278564, + "loss": 1.3435, + "step": 7261 + }, + { + "epoch": 0.6588640900018146, + "grad_norm": 0.12509727288442135, + "learning_rate": 0.0002753840364446232, + "loss": 1.3615, + "step": 7262 + }, + { + "epoch": 0.6589548176374523, + "grad_norm": 0.1127759956342755, + "learning_rate": 0.00027525277935208235, + "loss": 1.3567, + "step": 7263 + }, + { + "epoch": 0.6590455452730902, + "grad_norm": 0.12184495630665776, + "learning_rate": 0.00027512154166649695, + "loss": 1.37, + "step": 7264 + }, + { + "epoch": 
0.659136272908728, + "grad_norm": 0.12186348091089341, + "learning_rate": 0.00027499032339919975, + "loss": 1.3312, + "step": 7265 + }, + { + "epoch": 0.6592270005443658, + "grad_norm": 0.13490931300804554, + "learning_rate": 0.0002748591245615213, + "loss": 1.3665, + "step": 7266 + }, + { + "epoch": 0.6593177281800037, + "grad_norm": 0.12020599449802448, + "learning_rate": 0.0002747279451647905, + "loss": 1.4087, + "step": 7267 + }, + { + "epoch": 0.6594084558156414, + "grad_norm": 0.1153068077228643, + "learning_rate": 0.0002745967852203347, + "loss": 1.3552, + "step": 7268 + }, + { + "epoch": 0.6594991834512792, + "grad_norm": 0.16498166882591353, + "learning_rate": 0.00027446564473948, + "loss": 1.3976, + "step": 7269 + }, + { + "epoch": 0.6595899110869171, + "grad_norm": 0.13111771980407907, + "learning_rate": 0.00027433452373355007, + "loss": 1.3496, + "step": 7270 + }, + { + "epoch": 0.6596806387225549, + "grad_norm": 0.1340955653145463, + "learning_rate": 0.0002742034222138671, + "loss": 1.3784, + "step": 7271 + }, + { + "epoch": 0.6597713663581927, + "grad_norm": 0.13991454212965126, + "learning_rate": 0.00027407234019175214, + "loss": 1.386, + "step": 7272 + }, + { + "epoch": 0.6598620939938306, + "grad_norm": 0.11992032768776396, + "learning_rate": 0.0002739412776785238, + "loss": 1.3618, + "step": 7273 + }, + { + "epoch": 0.6599528216294683, + "grad_norm": 0.12057896296506847, + "learning_rate": 0.00027381023468549937, + "loss": 1.3631, + "step": 7274 + }, + { + "epoch": 0.6600435492651061, + "grad_norm": 0.31499739671816884, + "learning_rate": 0.00027367921122399465, + "loss": 1.3779, + "step": 7275 + }, + { + "epoch": 0.660134276900744, + "grad_norm": 0.12338947074754032, + "learning_rate": 0.0002735482073053233, + "loss": 1.3291, + "step": 7276 + }, + { + "epoch": 0.6602250045363818, + "grad_norm": 0.13907624085552234, + "learning_rate": 0.00027341722294079763, + "loss": 1.3532, + "step": 7277 + }, + { + "epoch": 0.6603157321720196, + "grad_norm": 0.11233912694698917, + "learning_rate": 0.00027328625814172807, + "loss": 1.3625, + "step": 7278 + }, + { + "epoch": 0.6604064598076574, + "grad_norm": 0.1260651304295861, + "learning_rate": 0.00027315531291942374, + "loss": 1.3512, + "step": 7279 + }, + { + "epoch": 0.6604971874432952, + "grad_norm": 0.1167862651697133, + "learning_rate": 0.0002730243872851915, + "loss": 1.4013, + "step": 7280 + }, + { + "epoch": 0.660587915078933, + "grad_norm": 0.11090813662164516, + "learning_rate": 0.00027289348125033675, + "loss": 1.3973, + "step": 7281 + }, + { + "epoch": 0.6606786427145709, + "grad_norm": 0.11339610141666082, + "learning_rate": 0.0002727625948261635, + "loss": 1.3667, + "step": 7282 + }, + { + "epoch": 0.6607693703502087, + "grad_norm": 0.2335996254945483, + "learning_rate": 0.00027263172802397353, + "loss": 1.3926, + "step": 7283 + }, + { + "epoch": 0.6608600979858464, + "grad_norm": 0.13114459264929298, + "learning_rate": 0.00027250088085506734, + "loss": 1.375, + "step": 7284 + }, + { + "epoch": 0.6609508256214843, + "grad_norm": 0.11659863462343154, + "learning_rate": 0.0002723700533307438, + "loss": 1.3939, + "step": 7285 + }, + { + "epoch": 0.6610415532571221, + "grad_norm": 0.1223873000344312, + "learning_rate": 0.00027223924546229976, + "loss": 1.3443, + "step": 7286 + }, + { + "epoch": 0.6611322808927599, + "grad_norm": 0.11991355443112649, + "learning_rate": 0.0002721084572610304, + "loss": 1.4002, + "step": 7287 + }, + { + "epoch": 0.6612230085283978, + "grad_norm": 0.1242809260209631, + "learning_rate": 
0.00027197768873822917, + "loss": 1.3891, + "step": 7288 + }, + { + "epoch": 0.6613137361640355, + "grad_norm": 0.13796404712416413, + "learning_rate": 0.00027184693990518825, + "loss": 1.3835, + "step": 7289 + }, + { + "epoch": 0.6614044637996733, + "grad_norm": 0.11816919616928952, + "learning_rate": 0.0002717162107731978, + "loss": 1.3875, + "step": 7290 + }, + { + "epoch": 0.6614951914353112, + "grad_norm": 0.11328103348149986, + "learning_rate": 0.0002715855013535458, + "loss": 1.4091, + "step": 7291 + }, + { + "epoch": 0.661585919070949, + "grad_norm": 0.11941884140597249, + "learning_rate": 0.00027145481165751975, + "loss": 1.3822, + "step": 7292 + }, + { + "epoch": 0.6616766467065869, + "grad_norm": 0.11093065160597616, + "learning_rate": 0.0002713241416964044, + "loss": 1.3608, + "step": 7293 + }, + { + "epoch": 0.6617673743422247, + "grad_norm": 0.11673661728157678, + "learning_rate": 0.0002711934914814829, + "loss": 1.3486, + "step": 7294 + }, + { + "epoch": 0.6618581019778624, + "grad_norm": 0.11354013307812139, + "learning_rate": 0.00027106286102403736, + "loss": 1.3765, + "step": 7295 + }, + { + "epoch": 0.6619488296135003, + "grad_norm": 0.12585902165060162, + "learning_rate": 0.00027093225033534754, + "loss": 1.3779, + "step": 7296 + }, + { + "epoch": 0.6620395572491381, + "grad_norm": 0.11934843872123646, + "learning_rate": 0.0002708016594266917, + "loss": 1.384, + "step": 7297 + }, + { + "epoch": 0.6621302848847759, + "grad_norm": 0.12386924464824753, + "learning_rate": 0.00027067108830934605, + "loss": 1.392, + "step": 7298 + }, + { + "epoch": 0.6622210125204138, + "grad_norm": 0.11449381683506124, + "learning_rate": 0.00027054053699458573, + "loss": 1.4126, + "step": 7299 + }, + { + "epoch": 0.6623117401560515, + "grad_norm": 0.1260848960098631, + "learning_rate": 0.00027041000549368413, + "loss": 1.4144, + "step": 7300 + }, + { + "epoch": 0.6624024677916893, + "grad_norm": 0.12014196358720941, + "learning_rate": 0.0002702794938179122, + "loss": 1.4014, + "step": 7301 + }, + { + "epoch": 0.6624931954273272, + "grad_norm": 0.11612185200321452, + "learning_rate": 0.0002701490019785399, + "loss": 1.3602, + "step": 7302 + }, + { + "epoch": 0.662583923062965, + "grad_norm": 0.12239543504669433, + "learning_rate": 0.0002700185299868353, + "loss": 1.371, + "step": 7303 + }, + { + "epoch": 0.6626746506986028, + "grad_norm": 0.1262444385122313, + "learning_rate": 0.00026988807785406426, + "loss": 1.3998, + "step": 7304 + }, + { + "epoch": 0.6627653783342407, + "grad_norm": 0.13080690709603982, + "learning_rate": 0.00026975764559149186, + "loss": 1.3637, + "step": 7305 + }, + { + "epoch": 0.6628561059698784, + "grad_norm": 0.1565455032313657, + "learning_rate": 0.0002696272332103806, + "loss": 1.3848, + "step": 7306 + }, + { + "epoch": 0.6629468336055162, + "grad_norm": 0.1186452849118037, + "learning_rate": 0.0002694968407219917, + "loss": 1.3442, + "step": 7307 + }, + { + "epoch": 0.6630375612411541, + "grad_norm": 0.12579965506120178, + "learning_rate": 0.00026936646813758436, + "loss": 1.3802, + "step": 7308 + }, + { + "epoch": 0.6631282888767919, + "grad_norm": 0.13378201456468253, + "learning_rate": 0.0002692361154684168, + "loss": 1.4075, + "step": 7309 + }, + { + "epoch": 0.6632190165124296, + "grad_norm": 0.1339438855250471, + "learning_rate": 0.00026910578272574463, + "loss": 1.3974, + "step": 7310 + }, + { + "epoch": 0.6633097441480675, + "grad_norm": 0.12535277514191967, + "learning_rate": 0.000268975469920822, + "loss": 1.4031, + "step": 7311 + }, + { + "epoch": 
0.6634004717837053, + "grad_norm": 0.13621642881933785, + "learning_rate": 0.0002688451770649018, + "loss": 1.3799, + "step": 7312 + }, + { + "epoch": 0.6634911994193431, + "grad_norm": 0.12448160229793535, + "learning_rate": 0.00026871490416923463, + "loss": 1.3844, + "step": 7313 + }, + { + "epoch": 0.663581927054981, + "grad_norm": 0.11309680366064753, + "learning_rate": 0.0002685846512450693, + "loss": 1.3602, + "step": 7314 + }, + { + "epoch": 0.6636726546906188, + "grad_norm": 0.11536406326002713, + "learning_rate": 0.00026845441830365354, + "loss": 1.3456, + "step": 7315 + }, + { + "epoch": 0.6637633823262565, + "grad_norm": 0.11976516261117565, + "learning_rate": 0.000268324205356233, + "loss": 1.364, + "step": 7316 + }, + { + "epoch": 0.6638541099618944, + "grad_norm": 0.12503882106245007, + "learning_rate": 0.0002681940124140515, + "loss": 1.3564, + "step": 7317 + }, + { + "epoch": 0.6639448375975322, + "grad_norm": 0.23192332078088218, + "learning_rate": 0.000268063839488351, + "loss": 1.4143, + "step": 7318 + }, + { + "epoch": 0.66403556523317, + "grad_norm": 0.12016368401952837, + "learning_rate": 0.0002679336865903724, + "loss": 1.4048, + "step": 7319 + }, + { + "epoch": 0.6641262928688079, + "grad_norm": 0.11596116266667787, + "learning_rate": 0.00026780355373135406, + "loss": 1.3766, + "step": 7320 + }, + { + "epoch": 0.6642170205044456, + "grad_norm": 0.12689114247972708, + "learning_rate": 0.000267673440922533, + "loss": 1.3568, + "step": 7321 + }, + { + "epoch": 0.6643077481400834, + "grad_norm": 0.12227325338072499, + "learning_rate": 0.0002675433481751445, + "loss": 1.3809, + "step": 7322 + }, + { + "epoch": 0.6643984757757213, + "grad_norm": 0.1185991982871066, + "learning_rate": 0.00026741327550042226, + "loss": 1.3329, + "step": 7323 + }, + { + "epoch": 0.6644892034113591, + "grad_norm": 0.11804437075582208, + "learning_rate": 0.00026728322290959806, + "loss": 1.3373, + "step": 7324 + }, + { + "epoch": 0.6645799310469969, + "grad_norm": 0.12122013706600089, + "learning_rate": 0.0002671531904139016, + "loss": 1.3693, + "step": 7325 + }, + { + "epoch": 0.6646706586826348, + "grad_norm": 0.12921928798244373, + "learning_rate": 0.0002670231780245617, + "loss": 1.3667, + "step": 7326 + }, + { + "epoch": 0.6647613863182725, + "grad_norm": 0.12318944993825749, + "learning_rate": 0.0002668931857528047, + "loss": 1.3586, + "step": 7327 + }, + { + "epoch": 0.6648521139539103, + "grad_norm": 0.12055250482802614, + "learning_rate": 0.00026676321360985533, + "loss": 1.3739, + "step": 7328 + }, + { + "epoch": 0.6649428415895482, + "grad_norm": 0.12249636367980656, + "learning_rate": 0.000266633261606937, + "loss": 1.3539, + "step": 7329 + }, + { + "epoch": 0.665033569225186, + "grad_norm": 0.13260287106035248, + "learning_rate": 0.0002665033297552707, + "loss": 1.3531, + "step": 7330 + }, + { + "epoch": 0.6651242968608239, + "grad_norm": 0.13235227550129314, + "learning_rate": 0.00026637341806607653, + "loss": 1.4084, + "step": 7331 + }, + { + "epoch": 0.6652150244964616, + "grad_norm": 0.12763258946249326, + "learning_rate": 0.00026624352655057185, + "loss": 1.3756, + "step": 7332 + }, + { + "epoch": 0.6653057521320994, + "grad_norm": 0.1292341052383407, + "learning_rate": 0.00026611365521997344, + "loss": 1.3797, + "step": 7333 + }, + { + "epoch": 0.6653964797677373, + "grad_norm": 0.14303791552980147, + "learning_rate": 0.00026598380408549525, + "loss": 1.3666, + "step": 7334 + }, + { + "epoch": 0.6654872074033751, + "grad_norm": 0.1402882976391437, + "learning_rate": 
0.00026585397315834994, + "loss": 1.345, + "step": 7335 + }, + { + "epoch": 0.6655779350390129, + "grad_norm": 0.13068197879292748, + "learning_rate": 0.00026572416244974875, + "loss": 1.364, + "step": 7336 + }, + { + "epoch": 0.6656686626746507, + "grad_norm": 0.12227194977694449, + "learning_rate": 0.00026559437197090066, + "loss": 1.4233, + "step": 7337 + }, + { + "epoch": 0.6657593903102885, + "grad_norm": 0.11104159980460115, + "learning_rate": 0.0002654646017330129, + "loss": 1.3926, + "step": 7338 + }, + { + "epoch": 0.6658501179459263, + "grad_norm": 0.12142125799673921, + "learning_rate": 0.00026533485174729134, + "loss": 1.3531, + "step": 7339 + }, + { + "epoch": 0.6659408455815642, + "grad_norm": 0.12243664426153426, + "learning_rate": 0.0002652051220249401, + "loss": 1.3808, + "step": 7340 + }, + { + "epoch": 0.666031573217202, + "grad_norm": 0.14050522609601754, + "learning_rate": 0.00026507541257716116, + "loss": 1.36, + "step": 7341 + }, + { + "epoch": 0.6661223008528397, + "grad_norm": 0.16136359358017685, + "learning_rate": 0.00026494572341515487, + "loss": 1.3672, + "step": 7342 + }, + { + "epoch": 0.6662130284884776, + "grad_norm": 0.13032635839358853, + "learning_rate": 0.00026481605455012014, + "loss": 1.4087, + "step": 7343 + }, + { + "epoch": 0.6663037561241154, + "grad_norm": 0.14320046425949304, + "learning_rate": 0.00026468640599325375, + "loss": 1.3385, + "step": 7344 + }, + { + "epoch": 0.6663944837597532, + "grad_norm": 0.12317789425294369, + "learning_rate": 0.0002645567777557507, + "loss": 1.339, + "step": 7345 + }, + { + "epoch": 0.6664852113953911, + "grad_norm": 0.1258391912814808, + "learning_rate": 0.00026442716984880453, + "loss": 1.4042, + "step": 7346 + }, + { + "epoch": 0.6665759390310289, + "grad_norm": 0.12016237016913825, + "learning_rate": 0.0002642975822836072, + "loss": 1.3649, + "step": 7347 + }, + { + "epoch": 0.6666666666666666, + "grad_norm": 0.11302510456475463, + "learning_rate": 0.0002641680150713485, + "loss": 1.3482, + "step": 7348 + }, + { + "epoch": 0.6667573943023045, + "grad_norm": 0.1427023485545906, + "learning_rate": 0.0002640384682232161, + "loss": 1.3652, + "step": 7349 + }, + { + "epoch": 0.6668481219379423, + "grad_norm": 0.1165463308089962, + "learning_rate": 0.00026390894175039705, + "loss": 1.368, + "step": 7350 + }, + { + "epoch": 0.6669388495735801, + "grad_norm": 0.16433302805795874, + "learning_rate": 0.00026377943566407557, + "loss": 1.3857, + "step": 7351 + }, + { + "epoch": 0.667029577209218, + "grad_norm": 0.11231071677004718, + "learning_rate": 0.0002636499499754345, + "loss": 1.4018, + "step": 7352 + }, + { + "epoch": 0.6671203048448557, + "grad_norm": 0.12351206295365104, + "learning_rate": 0.00026352048469565525, + "loss": 1.3704, + "step": 7353 + }, + { + "epoch": 0.6672110324804935, + "grad_norm": 0.12267253821581371, + "learning_rate": 0.00026339103983591687, + "loss": 1.4201, + "step": 7354 + }, + { + "epoch": 0.6673017601161314, + "grad_norm": 0.12053794042304153, + "learning_rate": 0.00026326161540739723, + "loss": 1.3634, + "step": 7355 + }, + { + "epoch": 0.6673924877517692, + "grad_norm": 0.11565689886369572, + "learning_rate": 0.0002631322114212718, + "loss": 1.3572, + "step": 7356 + }, + { + "epoch": 0.667483215387407, + "grad_norm": 0.12362315648262527, + "learning_rate": 0.00026300282788871497, + "loss": 1.415, + "step": 7357 + }, + { + "epoch": 0.6675739430230448, + "grad_norm": 0.1270658521763631, + "learning_rate": 0.00026287346482089895, + "loss": 1.3937, + "step": 7358 + }, + { + "epoch": 
0.6676646706586826, + "grad_norm": 0.12829367378990386, + "learning_rate": 0.00026274412222899405, + "loss": 1.411, + "step": 7359 + }, + { + "epoch": 0.6677553982943204, + "grad_norm": 0.11677981390700527, + "learning_rate": 0.0002626148001241693, + "loss": 1.4016, + "step": 7360 + }, + { + "epoch": 0.6678461259299583, + "grad_norm": 0.13005334581441555, + "learning_rate": 0.00026248549851759127, + "loss": 1.4049, + "step": 7361 + }, + { + "epoch": 0.6679368535655961, + "grad_norm": 0.12668358447437789, + "learning_rate": 0.00026235621742042546, + "loss": 1.3861, + "step": 7362 + }, + { + "epoch": 0.6680275812012338, + "grad_norm": 0.13341742430130582, + "learning_rate": 0.00026222695684383556, + "loss": 1.3766, + "step": 7363 + }, + { + "epoch": 0.6681183088368717, + "grad_norm": 0.11775822466057531, + "learning_rate": 0.000262097716798983, + "loss": 1.3814, + "step": 7364 + }, + { + "epoch": 0.6682090364725095, + "grad_norm": 0.15496827079962108, + "learning_rate": 0.0002619684972970277, + "loss": 1.405, + "step": 7365 + }, + { + "epoch": 0.6682997641081473, + "grad_norm": 0.11719471074442518, + "learning_rate": 0.0002618392983491274, + "loss": 1.4147, + "step": 7366 + }, + { + "epoch": 0.6683904917437852, + "grad_norm": 0.11791644931186976, + "learning_rate": 0.0002617101199664391, + "loss": 1.392, + "step": 7367 + }, + { + "epoch": 0.668481219379423, + "grad_norm": 0.12083557907887123, + "learning_rate": 0.00026158096216011696, + "loss": 1.3592, + "step": 7368 + }, + { + "epoch": 0.6685719470150608, + "grad_norm": 0.14534417183433362, + "learning_rate": 0.0002614518249413136, + "loss": 1.357, + "step": 7369 + }, + { + "epoch": 0.6686626746506986, + "grad_norm": 0.1226437964670025, + "learning_rate": 0.0002613227083211806, + "loss": 1.3472, + "step": 7370 + }, + { + "epoch": 0.6687534022863364, + "grad_norm": 0.13489966341029072, + "learning_rate": 0.00026119361231086694, + "loss": 1.3759, + "step": 7371 + }, + { + "epoch": 0.6688441299219743, + "grad_norm": 0.12005946560678452, + "learning_rate": 0.00026106453692151987, + "loss": 1.4038, + "step": 7372 + }, + { + "epoch": 0.6689348575576121, + "grad_norm": 0.13078948258300171, + "learning_rate": 0.00026093548216428556, + "loss": 1.3686, + "step": 7373 + }, + { + "epoch": 0.6690255851932498, + "grad_norm": 0.11627890378823091, + "learning_rate": 0.0002608064480503074, + "loss": 1.3489, + "step": 7374 + }, + { + "epoch": 0.6691163128288877, + "grad_norm": 0.11981601158772069, + "learning_rate": 0.0002606774345907278, + "loss": 1.3534, + "step": 7375 + }, + { + "epoch": 0.6692070404645255, + "grad_norm": 0.11165628399930667, + "learning_rate": 0.0002605484417966868, + "loss": 1.3912, + "step": 7376 + }, + { + "epoch": 0.6692977681001633, + "grad_norm": 0.12183862912809654, + "learning_rate": 0.00026041946967932317, + "loss": 1.394, + "step": 7377 + }, + { + "epoch": 0.6693884957358012, + "grad_norm": 0.12860300375562367, + "learning_rate": 0.0002602905182497738, + "loss": 1.3739, + "step": 7378 + }, + { + "epoch": 0.669479223371439, + "grad_norm": 0.11498752112512298, + "learning_rate": 0.00026016158751917336, + "loss": 1.3677, + "step": 7379 + }, + { + "epoch": 0.6695699510070767, + "grad_norm": 0.131735957254395, + "learning_rate": 0.0002600326774986553, + "loss": 1.3513, + "step": 7380 + }, + { + "epoch": 0.6696606786427146, + "grad_norm": 0.1344320903740826, + "learning_rate": 0.000259903788199351, + "loss": 1.3778, + "step": 7381 + }, + { + "epoch": 0.6697514062783524, + "grad_norm": 0.1794562984406027, + "learning_rate": 
0.00025977491963238977, + "loss": 1.4074, + "step": 7382 + }, + { + "epoch": 0.6698421339139902, + "grad_norm": 0.1263622420685828, + "learning_rate": 0.00025964607180889974, + "loss": 1.3711, + "step": 7383 + }, + { + "epoch": 0.6699328615496281, + "grad_norm": 0.12466304568272718, + "learning_rate": 0.0002595172447400069, + "loss": 1.3869, + "step": 7384 + }, + { + "epoch": 0.6700235891852658, + "grad_norm": 0.1308367726814947, + "learning_rate": 0.0002593884384368352, + "loss": 1.3693, + "step": 7385 + }, + { + "epoch": 0.6701143168209036, + "grad_norm": 0.13277823603112027, + "learning_rate": 0.00025925965291050736, + "loss": 1.344, + "step": 7386 + }, + { + "epoch": 0.6702050444565415, + "grad_norm": 0.12570019926968498, + "learning_rate": 0.00025913088817214406, + "loss": 1.3662, + "step": 7387 + }, + { + "epoch": 0.6702957720921793, + "grad_norm": 0.11628917801243077, + "learning_rate": 0.0002590021442328642, + "loss": 1.4006, + "step": 7388 + }, + { + "epoch": 0.670386499727817, + "grad_norm": 0.11779271766980318, + "learning_rate": 0.0002588734211037844, + "loss": 1.3912, + "step": 7389 + }, + { + "epoch": 0.6704772273634549, + "grad_norm": 0.2558320417225238, + "learning_rate": 0.00025874471879602047, + "loss": 1.3578, + "step": 7390 + }, + { + "epoch": 0.6705679549990927, + "grad_norm": 0.12184520169684313, + "learning_rate": 0.00025861603732068564, + "loss": 1.3463, + "step": 7391 + }, + { + "epoch": 0.6706586826347305, + "grad_norm": 0.11973723356543389, + "learning_rate": 0.00025848737668889135, + "loss": 1.3794, + "step": 7392 + }, + { + "epoch": 0.6707494102703684, + "grad_norm": 0.12105444975206846, + "learning_rate": 0.00025835873691174764, + "loss": 1.3814, + "step": 7393 + }, + { + "epoch": 0.6708401379060062, + "grad_norm": 0.12066689572620157, + "learning_rate": 0.00025823011800036287, + "loss": 1.4027, + "step": 7394 + }, + { + "epoch": 0.6709308655416439, + "grad_norm": 0.1315116844242234, + "learning_rate": 0.000258101519965843, + "loss": 1.3874, + "step": 7395 + }, + { + "epoch": 0.6710215931772818, + "grad_norm": 0.1722970295066902, + "learning_rate": 0.0002579729428192924, + "loss": 1.3828, + "step": 7396 + }, + { + "epoch": 0.6711123208129196, + "grad_norm": 0.12302333128864114, + "learning_rate": 0.00025784438657181397, + "loss": 1.4042, + "step": 7397 + }, + { + "epoch": 0.6712030484485574, + "grad_norm": 0.11776248567734952, + "learning_rate": 0.0002577158512345085, + "loss": 1.3835, + "step": 7398 + }, + { + "epoch": 0.6712937760841953, + "grad_norm": 0.11280682303602216, + "learning_rate": 0.0002575873368184748, + "loss": 1.3657, + "step": 7399 + }, + { + "epoch": 0.671384503719833, + "grad_norm": 0.11948563174936068, + "learning_rate": 0.0002574588433348103, + "loss": 1.4038, + "step": 7400 + }, + { + "epoch": 0.6714752313554708, + "grad_norm": 0.12362174092543621, + "learning_rate": 0.0002573303707946105, + "loss": 1.3786, + "step": 7401 + }, + { + "epoch": 0.6715659589911087, + "grad_norm": 0.12667443423151822, + "learning_rate": 0.00025720191920896907, + "loss": 1.3933, + "step": 7402 + }, + { + "epoch": 0.6716566866267465, + "grad_norm": 0.11251668478002673, + "learning_rate": 0.0002570734885889775, + "loss": 1.3752, + "step": 7403 + }, + { + "epoch": 0.6717474142623843, + "grad_norm": 0.20350612890855144, + "learning_rate": 0.00025694507894572616, + "loss": 1.3919, + "step": 7404 + }, + { + "epoch": 0.6718381418980222, + "grad_norm": 0.12665307629099623, + "learning_rate": 0.0002568166902903031, + "loss": 1.3905, + "step": 7405 + }, + { + "epoch": 
0.6719288695336599, + "grad_norm": 0.1187346494601668, + "learning_rate": 0.0002566883226337945, + "loss": 1.3806, + "step": 7406 + }, + { + "epoch": 0.6720195971692978, + "grad_norm": 0.1176910505364098, + "learning_rate": 0.0002565599759872852, + "loss": 1.4053, + "step": 7407 + }, + { + "epoch": 0.6721103248049356, + "grad_norm": 0.1160977691065673, + "learning_rate": 0.0002564316503618578, + "loss": 1.3881, + "step": 7408 + }, + { + "epoch": 0.6722010524405734, + "grad_norm": 0.11384427347178136, + "learning_rate": 0.0002563033457685934, + "loss": 1.3565, + "step": 7409 + }, + { + "epoch": 0.6722917800762113, + "grad_norm": 0.1292154178615952, + "learning_rate": 0.00025617506221857077, + "loss": 1.3968, + "step": 7410 + }, + { + "epoch": 0.672382507711849, + "grad_norm": 0.11215414765803415, + "learning_rate": 0.0002560467997228677, + "loss": 1.3873, + "step": 7411 + }, + { + "epoch": 0.6724732353474868, + "grad_norm": 0.11054217353366173, + "learning_rate": 0.00025591855829255937, + "loss": 1.3696, + "step": 7412 + }, + { + "epoch": 0.6725639629831247, + "grad_norm": 0.11106708463432038, + "learning_rate": 0.0002557903379387194, + "loss": 1.3847, + "step": 7413 + }, + { + "epoch": 0.6726546906187625, + "grad_norm": 0.11616605969860384, + "learning_rate": 0.00025566213867241993, + "loss": 1.3595, + "step": 7414 + }, + { + "epoch": 0.6727454182544003, + "grad_norm": 0.12339657550623861, + "learning_rate": 0.00025553396050473077, + "loss": 1.3523, + "step": 7415 + }, + { + "epoch": 0.6728361458900382, + "grad_norm": 0.1401410608942197, + "learning_rate": 0.0002554058034467199, + "loss": 1.4204, + "step": 7416 + }, + { + "epoch": 0.6729268735256759, + "grad_norm": 0.1486228969589031, + "learning_rate": 0.000255277667509454, + "loss": 1.342, + "step": 7417 + }, + { + "epoch": 0.6730176011613137, + "grad_norm": 0.1172458403826109, + "learning_rate": 0.00025514955270399784, + "loss": 1.3953, + "step": 7418 + }, + { + "epoch": 0.6731083287969516, + "grad_norm": 0.12090798195239943, + "learning_rate": 0.00025502145904141385, + "loss": 1.3664, + "step": 7419 + }, + { + "epoch": 0.6731990564325894, + "grad_norm": 0.12065200613922149, + "learning_rate": 0.00025489338653276273, + "loss": 1.3774, + "step": 7420 + }, + { + "epoch": 0.6732897840682271, + "grad_norm": 0.1465244416385722, + "learning_rate": 0.00025476533518910407, + "loss": 1.3779, + "step": 7421 + }, + { + "epoch": 0.673380511703865, + "grad_norm": 0.11978076182584439, + "learning_rate": 0.00025463730502149476, + "loss": 1.3898, + "step": 7422 + }, + { + "epoch": 0.6734712393395028, + "grad_norm": 0.11613428101980737, + "learning_rate": 0.00025450929604099026, + "loss": 1.348, + "step": 7423 + }, + { + "epoch": 0.6735619669751406, + "grad_norm": 0.12675248718307022, + "learning_rate": 0.0002543813082586441, + "loss": 1.3639, + "step": 7424 + }, + { + "epoch": 0.6736526946107785, + "grad_norm": 0.13303305203056, + "learning_rate": 0.00025425334168550847, + "loss": 1.4056, + "step": 7425 + }, + { + "epoch": 0.6737434222464163, + "grad_norm": 0.12623711868810059, + "learning_rate": 0.000254125396332633, + "loss": 1.426, + "step": 7426 + }, + { + "epoch": 0.673834149882054, + "grad_norm": 0.13337140962921648, + "learning_rate": 0.0002539974722110655, + "loss": 1.3264, + "step": 7427 + }, + { + "epoch": 0.6739248775176919, + "grad_norm": 0.1233327033721042, + "learning_rate": 0.0002538695693318528, + "loss": 1.3557, + "step": 7428 + }, + { + "epoch": 0.6740156051533297, + "grad_norm": 0.1158504738833683, + "learning_rate": 
0.0002537416877060391, + "loss": 1.39, + "step": 7429 + }, + { + "epoch": 0.6741063327889675, + "grad_norm": 0.16540202115467728, + "learning_rate": 0.0002536138273446667, + "loss": 1.3591, + "step": 7430 + }, + { + "epoch": 0.6741970604246054, + "grad_norm": 0.11980106056152773, + "learning_rate": 0.0002534859882587769, + "loss": 1.3872, + "step": 7431 + }, + { + "epoch": 0.6742877880602431, + "grad_norm": 0.14843904476195746, + "learning_rate": 0.000253358170459408, + "loss": 1.3686, + "step": 7432 + }, + { + "epoch": 0.6743785156958809, + "grad_norm": 0.20517139134959314, + "learning_rate": 0.00025323037395759776, + "loss": 1.3817, + "step": 7433 + }, + { + "epoch": 0.6744692433315188, + "grad_norm": 0.12451887938743471, + "learning_rate": 0.0002531025987643809, + "loss": 1.3665, + "step": 7434 + }, + { + "epoch": 0.6745599709671566, + "grad_norm": 0.12419806314036672, + "learning_rate": 0.00025297484489079117, + "loss": 1.4013, + "step": 7435 + }, + { + "epoch": 0.6746506986027944, + "grad_norm": 0.12410007884786066, + "learning_rate": 0.00025284711234785996, + "loss": 1.3636, + "step": 7436 + }, + { + "epoch": 0.6747414262384323, + "grad_norm": 0.14672038568264806, + "learning_rate": 0.0002527194011466169, + "loss": 1.3882, + "step": 7437 + }, + { + "epoch": 0.67483215387407, + "grad_norm": 0.12507056312488316, + "learning_rate": 0.0002525917112980902, + "loss": 1.3853, + "step": 7438 + }, + { + "epoch": 0.6749228815097078, + "grad_norm": 0.13614295131659368, + "learning_rate": 0.0002524640428133054, + "loss": 1.3905, + "step": 7439 + }, + { + "epoch": 0.6750136091453457, + "grad_norm": 0.192362533442351, + "learning_rate": 0.0002523363957032871, + "loss": 1.3536, + "step": 7440 + }, + { + "epoch": 0.6751043367809835, + "grad_norm": 0.19111215299124448, + "learning_rate": 0.0002522087699790577, + "loss": 1.3629, + "step": 7441 + }, + { + "epoch": 0.6751950644166212, + "grad_norm": 0.11847481659903743, + "learning_rate": 0.0002520811656516375, + "loss": 1.3776, + "step": 7442 + }, + { + "epoch": 0.6752857920522591, + "grad_norm": 0.20992353523643015, + "learning_rate": 0.00025195358273204515, + "loss": 1.3789, + "step": 7443 + }, + { + "epoch": 0.6753765196878969, + "grad_norm": 0.13043399664570646, + "learning_rate": 0.0002518260212312974, + "loss": 1.3641, + "step": 7444 + }, + { + "epoch": 0.6754672473235348, + "grad_norm": 0.14770838354875643, + "learning_rate": 0.0002516984811604094, + "loss": 1.3544, + "step": 7445 + }, + { + "epoch": 0.6755579749591726, + "grad_norm": 0.2663147196525203, + "learning_rate": 0.0002515709625303942, + "loss": 1.3875, + "step": 7446 + }, + { + "epoch": 0.6756487025948104, + "grad_norm": 0.14469857363239724, + "learning_rate": 0.0002514434653522626, + "loss": 1.3499, + "step": 7447 + }, + { + "epoch": 0.6757394302304482, + "grad_norm": 0.14676607724344926, + "learning_rate": 0.00025131598963702473, + "loss": 1.3881, + "step": 7448 + }, + { + "epoch": 0.675830157866086, + "grad_norm": 0.12340115968756787, + "learning_rate": 0.00025118853539568786, + "loss": 1.366, + "step": 7449 + }, + { + "epoch": 0.6759208855017238, + "grad_norm": 0.1191563665754085, + "learning_rate": 0.00025106110263925746, + "loss": 1.3722, + "step": 7450 + }, + { + "epoch": 0.6760116131373617, + "grad_norm": 0.1316718311675811, + "learning_rate": 0.0002509336913787377, + "loss": 1.3828, + "step": 7451 + }, + { + "epoch": 0.6761023407729995, + "grad_norm": 0.12124871541342525, + "learning_rate": 0.00025080630162513043, + "loss": 1.3933, + "step": 7452 + }, + { + "epoch": 
0.6761930684086372, + "grad_norm": 0.12217978150483014, + "learning_rate": 0.00025067893338943573, + "loss": 1.3822, + "step": 7453 + }, + { + "epoch": 0.6762837960442751, + "grad_norm": 0.1267069730905833, + "learning_rate": 0.00025055158668265163, + "loss": 1.3595, + "step": 7454 + }, + { + "epoch": 0.6763745236799129, + "grad_norm": 0.12186489572915025, + "learning_rate": 0.0002504242615157748, + "loss": 1.3825, + "step": 7455 + }, + { + "epoch": 0.6764652513155507, + "grad_norm": 0.12651778430378452, + "learning_rate": 0.0002502969578998, + "loss": 1.3915, + "step": 7456 + }, + { + "epoch": 0.6765559789511886, + "grad_norm": 0.12160954955930386, + "learning_rate": 0.00025016967584571946, + "loss": 1.3392, + "step": 7457 + }, + { + "epoch": 0.6766467065868264, + "grad_norm": 0.1194947911589354, + "learning_rate": 0.0002500424153645245, + "loss": 1.36, + "step": 7458 + }, + { + "epoch": 0.6767374342224641, + "grad_norm": 0.1352089362025491, + "learning_rate": 0.0002499151764672037, + "loss": 1.369, + "step": 7459 + }, + { + "epoch": 0.676828161858102, + "grad_norm": 0.12582494854071902, + "learning_rate": 0.00024978795916474417, + "loss": 1.3729, + "step": 7460 + }, + { + "epoch": 0.6769188894937398, + "grad_norm": 0.16437071598158828, + "learning_rate": 0.00024966076346813145, + "loss": 1.3792, + "step": 7461 + }, + { + "epoch": 0.6770096171293776, + "grad_norm": 0.12274487003608238, + "learning_rate": 0.00024953358938834864, + "loss": 1.3692, + "step": 7462 + }, + { + "epoch": 0.6771003447650155, + "grad_norm": 0.12143416063143579, + "learning_rate": 0.0002494064369363771, + "loss": 1.3892, + "step": 7463 + }, + { + "epoch": 0.6771910724006532, + "grad_norm": 0.120532611239315, + "learning_rate": 0.0002492793061231967, + "loss": 1.3554, + "step": 7464 + }, + { + "epoch": 0.677281800036291, + "grad_norm": 0.14147542184846967, + "learning_rate": 0.0002491521969597854, + "loss": 1.3772, + "step": 7465 + }, + { + "epoch": 0.6773725276719289, + "grad_norm": 0.1209226778698576, + "learning_rate": 0.00024902510945711877, + "loss": 1.3702, + "step": 7466 + }, + { + "epoch": 0.6774632553075667, + "grad_norm": 0.13300938491486397, + "learning_rate": 0.00024889804362617077, + "loss": 1.3881, + "step": 7467 + }, + { + "epoch": 0.6775539829432045, + "grad_norm": 0.12142791116606198, + "learning_rate": 0.00024877099947791394, + "loss": 1.3952, + "step": 7468 + }, + { + "epoch": 0.6776447105788423, + "grad_norm": 0.13760624304770241, + "learning_rate": 0.00024864397702331826, + "loss": 1.3635, + "step": 7469 + }, + { + "epoch": 0.6777354382144801, + "grad_norm": 0.13994411903593554, + "learning_rate": 0.0002485169762733521, + "loss": 1.3477, + "step": 7470 + }, + { + "epoch": 0.6778261658501179, + "grad_norm": 0.12523700592675346, + "learning_rate": 0.00024838999723898205, + "loss": 1.4023, + "step": 7471 + }, + { + "epoch": 0.6779168934857558, + "grad_norm": 0.13357567254155076, + "learning_rate": 0.00024826303993117304, + "loss": 1.358, + "step": 7472 + }, + { + "epoch": 0.6780076211213936, + "grad_norm": 0.1257143739890596, + "learning_rate": 0.00024813610436088765, + "loss": 1.3981, + "step": 7473 + }, + { + "epoch": 0.6780983487570313, + "grad_norm": 0.12603496619137605, + "learning_rate": 0.00024800919053908656, + "loss": 1.3969, + "step": 7474 + }, + { + "epoch": 0.6781890763926692, + "grad_norm": 0.1528316456010544, + "learning_rate": 0.00024788229847672917, + "loss": 1.3724, + "step": 7475 + }, + { + "epoch": 0.678279804028307, + "grad_norm": 0.13746496675635408, + "learning_rate": 
0.00024775542818477247, + "loss": 1.3884, + "step": 7476 + }, + { + "epoch": 0.6783705316639448, + "grad_norm": 0.12792777326693514, + "learning_rate": 0.0002476285796741716, + "loss": 1.3851, + "step": 7477 + }, + { + "epoch": 0.6784612592995827, + "grad_norm": 0.15046170567767314, + "learning_rate": 0.00024750175295587995, + "loss": 1.3619, + "step": 7478 + }, + { + "epoch": 0.6785519869352205, + "grad_norm": 0.1471715362700376, + "learning_rate": 0.0002473749480408494, + "loss": 1.3589, + "step": 7479 + }, + { + "epoch": 0.6786427145708582, + "grad_norm": 0.13748352320969098, + "learning_rate": 0.00024724816494002934, + "loss": 1.3558, + "step": 7480 + }, + { + "epoch": 0.6787334422064961, + "grad_norm": 0.13115222295312293, + "learning_rate": 0.0002471214036643673, + "loss": 1.3902, + "step": 7481 + }, + { + "epoch": 0.6788241698421339, + "grad_norm": 0.11361451575448624, + "learning_rate": 0.0002469946642248095, + "loss": 1.3939, + "step": 7482 + }, + { + "epoch": 0.6789148974777718, + "grad_norm": 0.1441400099017067, + "learning_rate": 0.00024686794663229974, + "loss": 1.4289, + "step": 7483 + }, + { + "epoch": 0.6790056251134096, + "grad_norm": 0.14515900400747087, + "learning_rate": 0.0002467412508977799, + "loss": 1.3396, + "step": 7484 + }, + { + "epoch": 0.6790963527490473, + "grad_norm": 0.11957101209226896, + "learning_rate": 0.0002466145770321905, + "loss": 1.3959, + "step": 7485 + }, + { + "epoch": 0.6791870803846852, + "grad_norm": 0.1284610007247161, + "learning_rate": 0.00024648792504646964, + "loss": 1.374, + "step": 7486 + }, + { + "epoch": 0.679277808020323, + "grad_norm": 0.1247858763542236, + "learning_rate": 0.00024636129495155397, + "loss": 1.3902, + "step": 7487 + }, + { + "epoch": 0.6793685356559608, + "grad_norm": 0.14562710709043286, + "learning_rate": 0.0002462346867583776, + "loss": 1.405, + "step": 7488 + }, + { + "epoch": 0.6794592632915987, + "grad_norm": 0.12132064569448003, + "learning_rate": 0.0002461081004778737, + "loss": 1.3982, + "step": 7489 + }, + { + "epoch": 0.6795499909272364, + "grad_norm": 0.13159405094327417, + "learning_rate": 0.00024598153612097275, + "loss": 1.3705, + "step": 7490 + }, + { + "epoch": 0.6796407185628742, + "grad_norm": 0.1297384150230457, + "learning_rate": 0.0002458549936986034, + "loss": 1.3826, + "step": 7491 + }, + { + "epoch": 0.6797314461985121, + "grad_norm": 0.47962794286654525, + "learning_rate": 0.00024572847322169297, + "loss": 1.391, + "step": 7492 + }, + { + "epoch": 0.6798221738341499, + "grad_norm": 0.14166347728669745, + "learning_rate": 0.0002456019747011663, + "loss": 1.3757, + "step": 7493 + }, + { + "epoch": 0.6799129014697877, + "grad_norm": 0.12223866771371147, + "learning_rate": 0.0002454754981479465, + "loss": 1.3743, + "step": 7494 + }, + { + "epoch": 0.6800036291054256, + "grad_norm": 0.1483859666610611, + "learning_rate": 0.0002453490435729549, + "loss": 1.4032, + "step": 7495 + }, + { + "epoch": 0.6800943567410633, + "grad_norm": 0.13327396311893278, + "learning_rate": 0.00024522261098711113, + "loss": 1.3617, + "step": 7496 + }, + { + "epoch": 0.6801850843767011, + "grad_norm": 0.13745168636116353, + "learning_rate": 0.00024509620040133245, + "loss": 1.3917, + "step": 7497 + }, + { + "epoch": 0.680275812012339, + "grad_norm": 0.1238619899715012, + "learning_rate": 0.0002449698118265342, + "loss": 1.3253, + "step": 7498 + }, + { + "epoch": 0.6803665396479768, + "grad_norm": 0.1432064596698943, + "learning_rate": 0.00024484344527363054, + "loss": 1.367, + "step": 7499 + }, + { + "epoch": 
0.6804572672836146, + "grad_norm": 0.1637830718627159, + "learning_rate": 0.000244717100753533, + "loss": 1.3816, + "step": 7500 + }, + { + "epoch": 0.6805479949192524, + "grad_norm": 0.12081520761468012, + "learning_rate": 0.0002445907782771512, + "loss": 1.3893, + "step": 7501 + }, + { + "epoch": 0.6806387225548902, + "grad_norm": 0.12304684267346974, + "learning_rate": 0.00024446447785539334, + "loss": 1.4294, + "step": 7502 + }, + { + "epoch": 0.680729450190528, + "grad_norm": 0.1274990839583408, + "learning_rate": 0.0002443381994991657, + "loss": 1.3618, + "step": 7503 + }, + { + "epoch": 0.6808201778261659, + "grad_norm": 0.1280076253500202, + "learning_rate": 0.00024421194321937216, + "loss": 1.3651, + "step": 7504 + }, + { + "epoch": 0.6809109054618037, + "grad_norm": 0.12167494891268035, + "learning_rate": 0.0002440857090269149, + "loss": 1.3641, + "step": 7505 + }, + { + "epoch": 0.6810016330974414, + "grad_norm": 0.13701700759136126, + "learning_rate": 0.00024395949693269454, + "loss": 1.3892, + "step": 7506 + }, + { + "epoch": 0.6810923607330793, + "grad_norm": 0.12699703648292576, + "learning_rate": 0.00024383330694760936, + "loss": 1.4118, + "step": 7507 + }, + { + "epoch": 0.6811830883687171, + "grad_norm": 0.12224024132356198, + "learning_rate": 0.00024370713908255566, + "loss": 1.3676, + "step": 7508 + }, + { + "epoch": 0.6812738160043549, + "grad_norm": 0.12220296902270615, + "learning_rate": 0.0002435809933484285, + "loss": 1.3503, + "step": 7509 + }, + { + "epoch": 0.6813645436399928, + "grad_norm": 0.1377977914740339, + "learning_rate": 0.00024345486975612014, + "loss": 1.3715, + "step": 7510 + }, + { + "epoch": 0.6814552712756305, + "grad_norm": 0.1368629270101624, + "learning_rate": 0.0002433287683165218, + "loss": 1.3671, + "step": 7511 + }, + { + "epoch": 0.6815459989112683, + "grad_norm": 0.13419943314575195, + "learning_rate": 0.00024320268904052194, + "loss": 1.3688, + "step": 7512 + }, + { + "epoch": 0.6816367265469062, + "grad_norm": 0.11925607109279336, + "learning_rate": 0.00024307663193900793, + "loss": 1.393, + "step": 7513 + }, + { + "epoch": 0.681727454182544, + "grad_norm": 0.14095599195789227, + "learning_rate": 0.0002429505970228646, + "loss": 1.3721, + "step": 7514 + }, + { + "epoch": 0.6818181818181818, + "grad_norm": 0.11840015867693951, + "learning_rate": 0.00024282458430297495, + "loss": 1.3822, + "step": 7515 + }, + { + "epoch": 0.6819089094538197, + "grad_norm": 0.12662639722219796, + "learning_rate": 0.0002426985937902205, + "loss": 1.3975, + "step": 7516 + }, + { + "epoch": 0.6819996370894574, + "grad_norm": 0.12929096371009433, + "learning_rate": 0.00024257262549548026, + "loss": 1.3742, + "step": 7517 + }, + { + "epoch": 0.6820903647250952, + "grad_norm": 0.11748029531230031, + "learning_rate": 0.00024244667942963173, + "loss": 1.3622, + "step": 7518 + }, + { + "epoch": 0.6821810923607331, + "grad_norm": 0.1292059257315077, + "learning_rate": 0.00024232075560355054, + "loss": 1.3291, + "step": 7519 + }, + { + "epoch": 0.6822718199963709, + "grad_norm": 0.1184089903026906, + "learning_rate": 0.00024219485402811008, + "loss": 1.3891, + "step": 7520 + }, + { + "epoch": 0.6823625476320088, + "grad_norm": 0.12255921428150723, + "learning_rate": 0.00024206897471418205, + "loss": 1.3524, + "step": 7521 + }, + { + "epoch": 0.6824532752676465, + "grad_norm": 0.13184067232256677, + "learning_rate": 0.00024194311767263582, + "loss": 1.4225, + "step": 7522 + }, + { + "epoch": 0.6825440029032843, + "grad_norm": 0.11281772519709478, + "learning_rate": 
0.00024181728291433957, + "loss": 1.3853, + "step": 7523 + }, + { + "epoch": 0.6826347305389222, + "grad_norm": 0.1316539047488467, + "learning_rate": 0.00024169147045015904, + "loss": 1.3294, + "step": 7524 + }, + { + "epoch": 0.68272545817456, + "grad_norm": 0.12615170504201778, + "learning_rate": 0.00024156568029095772, + "loss": 1.335, + "step": 7525 + }, + { + "epoch": 0.6828161858101978, + "grad_norm": 0.12739908627453586, + "learning_rate": 0.00024143991244759838, + "loss": 1.4136, + "step": 7526 + }, + { + "epoch": 0.6829069134458357, + "grad_norm": 0.1211776343428802, + "learning_rate": 0.0002413141669309407, + "loss": 1.3857, + "step": 7527 + }, + { + "epoch": 0.6829976410814734, + "grad_norm": 0.13081294758732326, + "learning_rate": 0.00024118844375184268, + "loss": 1.4049, + "step": 7528 + }, + { + "epoch": 0.6830883687171112, + "grad_norm": 0.11968068967496866, + "learning_rate": 0.00024106274292116082, + "loss": 1.3797, + "step": 7529 + }, + { + "epoch": 0.6831790963527491, + "grad_norm": 0.11774920082430652, + "learning_rate": 0.0002409370644497493, + "loss": 1.3484, + "step": 7530 + }, + { + "epoch": 0.6832698239883869, + "grad_norm": 0.1178164025453821, + "learning_rate": 0.00024081140834846044, + "loss": 1.3612, + "step": 7531 + }, + { + "epoch": 0.6833605516240246, + "grad_norm": 0.11245240177617595, + "learning_rate": 0.0002406857746281445, + "loss": 1.3805, + "step": 7532 + }, + { + "epoch": 0.6834512792596625, + "grad_norm": 0.11421405479416528, + "learning_rate": 0.00024056016329965008, + "loss": 1.3464, + "step": 7533 + }, + { + "epoch": 0.6835420068953003, + "grad_norm": 0.11447821105415089, + "learning_rate": 0.00024043457437382404, + "loss": 1.3637, + "step": 7534 + }, + { + "epoch": 0.6836327345309381, + "grad_norm": 0.13858068746619465, + "learning_rate": 0.00024030900786151056, + "loss": 1.3916, + "step": 7535 + }, + { + "epoch": 0.683723462166576, + "grad_norm": 0.1174976036558915, + "learning_rate": 0.00024018346377355272, + "loss": 1.3884, + "step": 7536 + }, + { + "epoch": 0.6838141898022138, + "grad_norm": 0.13653881934182152, + "learning_rate": 0.00024005794212079109, + "loss": 1.3693, + "step": 7537 + }, + { + "epoch": 0.6839049174378515, + "grad_norm": 0.12397612032927445, + "learning_rate": 0.00023993244291406425, + "loss": 1.3917, + "step": 7538 + }, + { + "epoch": 0.6839956450734894, + "grad_norm": 0.11775178075101503, + "learning_rate": 0.0002398069661642095, + "loss": 1.3985, + "step": 7539 + }, + { + "epoch": 0.6840863727091272, + "grad_norm": 0.11635585790536414, + "learning_rate": 0.00023968151188206156, + "loss": 1.3743, + "step": 7540 + }, + { + "epoch": 0.684177100344765, + "grad_norm": 0.11521608287740623, + "learning_rate": 0.00023955608007845326, + "loss": 1.3464, + "step": 7541 + }, + { + "epoch": 0.6842678279804029, + "grad_norm": 0.15141412542098256, + "learning_rate": 0.00023943067076421582, + "loss": 1.3935, + "step": 7542 + }, + { + "epoch": 0.6843585556160406, + "grad_norm": 0.13683038049805382, + "learning_rate": 0.00023930528395017854, + "loss": 1.4152, + "step": 7543 + }, + { + "epoch": 0.6844492832516784, + "grad_norm": 0.11658460237586724, + "learning_rate": 0.0002391799196471684, + "loss": 1.3428, + "step": 7544 + }, + { + "epoch": 0.6845400108873163, + "grad_norm": 0.14017246757786375, + "learning_rate": 0.0002390545778660105, + "loss": 1.3828, + "step": 7545 + }, + { + "epoch": 0.6846307385229541, + "grad_norm": 0.1381659270611142, + "learning_rate": 0.0002389292586175284, + "loss": 1.4148, + "step": 7546 + }, + { + 
"epoch": 0.6847214661585919, + "grad_norm": 0.11221753339669653, + "learning_rate": 0.00023880396191254332, + "loss": 1.3603, + "step": 7547 + }, + { + "epoch": 0.6848121937942298, + "grad_norm": 0.11355836227204935, + "learning_rate": 0.00023867868776187445, + "loss": 1.3502, + "step": 7548 + }, + { + "epoch": 0.6849029214298675, + "grad_norm": 0.11016821001957167, + "learning_rate": 0.0002385534361763394, + "loss": 1.382, + "step": 7549 + }, + { + "epoch": 0.6849936490655053, + "grad_norm": 0.11447216543508514, + "learning_rate": 0.00023842820716675383, + "loss": 1.4008, + "step": 7550 + }, + { + "epoch": 0.6850843767011432, + "grad_norm": 0.3152684668839204, + "learning_rate": 0.0002383030007439312, + "loss": 1.3584, + "step": 7551 + }, + { + "epoch": 0.685175104336781, + "grad_norm": 0.10645197788058829, + "learning_rate": 0.0002381778169186828, + "loss": 1.3617, + "step": 7552 + }, + { + "epoch": 0.6852658319724187, + "grad_norm": 0.12577556936468642, + "learning_rate": 0.0002380526557018186, + "loss": 1.3889, + "step": 7553 + }, + { + "epoch": 0.6853565596080566, + "grad_norm": 0.21083111248027467, + "learning_rate": 0.0002379275171041463, + "loss": 1.3604, + "step": 7554 + }, + { + "epoch": 0.6854472872436944, + "grad_norm": 0.1385294673338271, + "learning_rate": 0.00023780240113647127, + "loss": 1.3414, + "step": 7555 + }, + { + "epoch": 0.6855380148793322, + "grad_norm": 0.12134641248520497, + "learning_rate": 0.00023767730780959758, + "loss": 1.3815, + "step": 7556 + }, + { + "epoch": 0.6856287425149701, + "grad_norm": 0.12402872348644667, + "learning_rate": 0.0002375522371343272, + "loss": 1.3732, + "step": 7557 + }, + { + "epoch": 0.6857194701506079, + "grad_norm": 0.11915589830289888, + "learning_rate": 0.00023742718912145983, + "loss": 1.3595, + "step": 7558 + }, + { + "epoch": 0.6858101977862457, + "grad_norm": 0.11965178126388457, + "learning_rate": 0.0002373021637817932, + "loss": 1.3692, + "step": 7559 + }, + { + "epoch": 0.6859009254218835, + "grad_norm": 0.12876363280357428, + "learning_rate": 0.00023717716112612357, + "loss": 1.3502, + "step": 7560 + }, + { + "epoch": 0.6859916530575213, + "grad_norm": 0.13351922335228772, + "learning_rate": 0.0002370521811652449, + "loss": 1.3473, + "step": 7561 + }, + { + "epoch": 0.6860823806931592, + "grad_norm": 0.12172734615215698, + "learning_rate": 0.0002369272239099489, + "loss": 1.3745, + "step": 7562 + }, + { + "epoch": 0.686173108328797, + "grad_norm": 0.13289702733135192, + "learning_rate": 0.0002368022893710261, + "loss": 1.3906, + "step": 7563 + }, + { + "epoch": 0.6862638359644347, + "grad_norm": 0.1576901965282842, + "learning_rate": 0.0002366773775592642, + "loss": 1.3523, + "step": 7564 + }, + { + "epoch": 0.6863545636000726, + "grad_norm": 0.12052313111021902, + "learning_rate": 0.00023655248848544974, + "loss": 1.3457, + "step": 7565 + }, + { + "epoch": 0.6864452912357104, + "grad_norm": 0.12509813869706513, + "learning_rate": 0.00023642762216036657, + "loss": 1.3415, + "step": 7566 + }, + { + "epoch": 0.6865360188713482, + "grad_norm": 0.12186499050789064, + "learning_rate": 0.0002363027785947972, + "loss": 1.3662, + "step": 7567 + }, + { + "epoch": 0.6866267465069861, + "grad_norm": 0.1339124409244216, + "learning_rate": 0.00023617795779952184, + "loss": 1.3563, + "step": 7568 + }, + { + "epoch": 0.6867174741426239, + "grad_norm": 0.1283385372735569, + "learning_rate": 0.0002360531597853185, + "loss": 1.3771, + "step": 7569 + }, + { + "epoch": 0.6868082017782616, + "grad_norm": 0.15282269931352868, + 
"learning_rate": 0.0002359283845629639, + "loss": 1.4023, + "step": 7570 + }, + { + "epoch": 0.6868989294138995, + "grad_norm": 0.131829168449188, + "learning_rate": 0.0002358036321432322, + "loss": 1.3805, + "step": 7571 + }, + { + "epoch": 0.6869896570495373, + "grad_norm": 0.1234152474090281, + "learning_rate": 0.0002356789025368956, + "loss": 1.3872, + "step": 7572 + }, + { + "epoch": 0.6870803846851751, + "grad_norm": 0.13453462903586755, + "learning_rate": 0.0002355541957547247, + "loss": 1.3785, + "step": 7573 + }, + { + "epoch": 0.687171112320813, + "grad_norm": 0.1322549895292025, + "learning_rate": 0.00023542951180748823, + "loss": 1.3735, + "step": 7574 + }, + { + "epoch": 0.6872618399564507, + "grad_norm": 0.11895424011176742, + "learning_rate": 0.00023530485070595236, + "loss": 1.3317, + "step": 7575 + }, + { + "epoch": 0.6873525675920885, + "grad_norm": 0.12465701052251224, + "learning_rate": 0.0002351802124608815, + "loss": 1.3749, + "step": 7576 + }, + { + "epoch": 0.6874432952277264, + "grad_norm": 0.12334032490583557, + "learning_rate": 0.00023505559708303847, + "loss": 1.355, + "step": 7577 + }, + { + "epoch": 0.6875340228633642, + "grad_norm": 0.16326024945897427, + "learning_rate": 0.00023493100458318368, + "loss": 1.3814, + "step": 7578 + }, + { + "epoch": 0.687624750499002, + "grad_norm": 0.13313221620935792, + "learning_rate": 0.00023480643497207556, + "loss": 1.3887, + "step": 7579 + }, + { + "epoch": 0.6877154781346398, + "grad_norm": 0.1429011370740279, + "learning_rate": 0.00023468188826047083, + "loss": 1.3287, + "step": 7580 + }, + { + "epoch": 0.6878062057702776, + "grad_norm": 0.12025670371824926, + "learning_rate": 0.00023455736445912435, + "loss": 1.3904, + "step": 7581 + }, + { + "epoch": 0.6878969334059154, + "grad_norm": 0.12343732036061576, + "learning_rate": 0.00023443286357878858, + "loss": 1.4013, + "step": 7582 + }, + { + "epoch": 0.6879876610415533, + "grad_norm": 0.1155174283793404, + "learning_rate": 0.000234308385630214, + "loss": 1.4047, + "step": 7583 + }, + { + "epoch": 0.6880783886771911, + "grad_norm": 0.12401070298686144, + "learning_rate": 0.00023418393062414966, + "loss": 1.3519, + "step": 7584 + }, + { + "epoch": 0.6881691163128288, + "grad_norm": 0.1322013251714143, + "learning_rate": 0.00023405949857134206, + "loss": 1.3515, + "step": 7585 + }, + { + "epoch": 0.6882598439484667, + "grad_norm": 0.12367596960258587, + "learning_rate": 0.00023393508948253583, + "loss": 1.3801, + "step": 7586 + }, + { + "epoch": 0.6883505715841045, + "grad_norm": 0.12484206899851048, + "learning_rate": 0.00023381070336847377, + "loss": 1.3947, + "step": 7587 + }, + { + "epoch": 0.6884412992197423, + "grad_norm": 0.12623119241558067, + "learning_rate": 0.00023368634023989692, + "loss": 1.3597, + "step": 7588 + }, + { + "epoch": 0.6885320268553802, + "grad_norm": 0.12997753935182024, + "learning_rate": 0.00023356200010754387, + "loss": 1.3833, + "step": 7589 + }, + { + "epoch": 0.688622754491018, + "grad_norm": 0.13088845835925805, + "learning_rate": 0.00023343768298215114, + "loss": 1.367, + "step": 7590 + }, + { + "epoch": 0.6887134821266557, + "grad_norm": 0.13779278271410159, + "learning_rate": 0.00023331338887445398, + "loss": 1.3895, + "step": 7591 + }, + { + "epoch": 0.6888042097622936, + "grad_norm": 0.1277843454160392, + "learning_rate": 0.00023318911779518503, + "loss": 1.3651, + "step": 7592 + }, + { + "epoch": 0.6888949373979314, + "grad_norm": 0.12813960838371638, + "learning_rate": 0.00023306486975507485, + "loss": 1.3275, + "step": 7593 + }, 
+ { + "epoch": 0.6889856650335692, + "grad_norm": 0.3141247438661128, + "learning_rate": 0.0002329406447648527, + "loss": 1.3623, + "step": 7594 + }, + { + "epoch": 0.6890763926692071, + "grad_norm": 0.15319835317091998, + "learning_rate": 0.00023281644283524506, + "loss": 1.3579, + "step": 7595 + }, + { + "epoch": 0.6891671203048448, + "grad_norm": 0.311695376329152, + "learning_rate": 0.00023269226397697697, + "loss": 1.3458, + "step": 7596 + }, + { + "epoch": 0.6892578479404827, + "grad_norm": 0.13837578277992313, + "learning_rate": 0.00023256810820077145, + "loss": 1.3555, + "step": 7597 + }, + { + "epoch": 0.6893485755761205, + "grad_norm": 0.14020256118429356, + "learning_rate": 0.0002324439755173492, + "loss": 1.3464, + "step": 7598 + }, + { + "epoch": 0.6894393032117583, + "grad_norm": 0.1862106231665511, + "learning_rate": 0.00023231986593742917, + "loss": 1.3403, + "step": 7599 + }, + { + "epoch": 0.6895300308473962, + "grad_norm": 0.13023254546492372, + "learning_rate": 0.00023219577947172787, + "loss": 1.3611, + "step": 7600 + }, + { + "epoch": 0.689620758483034, + "grad_norm": 0.1394849989719577, + "learning_rate": 0.00023207171613096072, + "loss": 1.3805, + "step": 7601 + }, + { + "epoch": 0.6897114861186717, + "grad_norm": 0.14175831209537418, + "learning_rate": 0.00023194767592584037, + "loss": 1.3763, + "step": 7602 + }, + { + "epoch": 0.6898022137543096, + "grad_norm": 0.13873230619578772, + "learning_rate": 0.00023182365886707736, + "loss": 1.3632, + "step": 7603 + }, + { + "epoch": 0.6898929413899474, + "grad_norm": 0.14776735337108854, + "learning_rate": 0.00023169966496538124, + "loss": 1.3721, + "step": 7604 + }, + { + "epoch": 0.6899836690255852, + "grad_norm": 0.23297058438399035, + "learning_rate": 0.0002315756942314587, + "loss": 1.3861, + "step": 7605 + }, + { + "epoch": 0.690074396661223, + "grad_norm": 0.12650081926594306, + "learning_rate": 0.00023145174667601426, + "loss": 1.3268, + "step": 7606 + }, + { + "epoch": 0.6901651242968608, + "grad_norm": 0.20793603362685786, + "learning_rate": 0.00023132782230975125, + "loss": 1.3471, + "step": 7607 + }, + { + "epoch": 0.6902558519324986, + "grad_norm": 0.15108872052420863, + "learning_rate": 0.0002312039211433704, + "loss": 1.3887, + "step": 7608 + }, + { + "epoch": 0.6903465795681365, + "grad_norm": 0.13249350195272191, + "learning_rate": 0.00023108004318757043, + "loss": 1.392, + "step": 7609 + }, + { + "epoch": 0.6904373072037743, + "grad_norm": 0.13325932788143993, + "learning_rate": 0.0002309561884530485, + "loss": 1.3715, + "step": 7610 + }, + { + "epoch": 0.690528034839412, + "grad_norm": 0.13837238917574257, + "learning_rate": 0.0002308323569504992, + "loss": 1.3874, + "step": 7611 + }, + { + "epoch": 0.6906187624750499, + "grad_norm": 0.1380571452960532, + "learning_rate": 0.0002307085486906157, + "loss": 1.3774, + "step": 7612 + }, + { + "epoch": 0.6907094901106877, + "grad_norm": 0.1734162336941304, + "learning_rate": 0.00023058476368408853, + "loss": 1.3374, + "step": 7613 + }, + { + "epoch": 0.6908002177463255, + "grad_norm": 0.1408234702660723, + "learning_rate": 0.00023046100194160695, + "loss": 1.384, + "step": 7614 + }, + { + "epoch": 0.6908909453819634, + "grad_norm": 0.15109767773317714, + "learning_rate": 0.00023033726347385753, + "loss": 1.3593, + "step": 7615 + }, + { + "epoch": 0.6909816730176012, + "grad_norm": 0.12893646961707275, + "learning_rate": 0.00023021354829152503, + "loss": 1.3636, + "step": 7616 + }, + { + "epoch": 0.6910724006532389, + "grad_norm": 0.13361831604732607, + 
"learning_rate": 0.00023008985640529252, + "loss": 1.3598, + "step": 7617 + }, + { + "epoch": 0.6911631282888768, + "grad_norm": 0.1301863029496906, + "learning_rate": 0.00022996618782584072, + "loss": 1.3789, + "step": 7618 + }, + { + "epoch": 0.6912538559245146, + "grad_norm": 0.12402046459743876, + "learning_rate": 0.0002298425425638482, + "loss": 1.3401, + "step": 7619 + }, + { + "epoch": 0.6913445835601524, + "grad_norm": 0.13437609591516622, + "learning_rate": 0.00022971892062999196, + "loss": 1.3926, + "step": 7620 + }, + { + "epoch": 0.6914353111957903, + "grad_norm": 0.1308513528912612, + "learning_rate": 0.00022959532203494694, + "loss": 1.3488, + "step": 7621 + }, + { + "epoch": 0.691526038831428, + "grad_norm": 0.1441803901306455, + "learning_rate": 0.0002294717467893857, + "loss": 1.3785, + "step": 7622 + }, + { + "epoch": 0.6916167664670658, + "grad_norm": 0.1270710706340179, + "learning_rate": 0.0002293481949039788, + "loss": 1.3799, + "step": 7623 + }, + { + "epoch": 0.6917074941027037, + "grad_norm": 0.1264648411782302, + "learning_rate": 0.00022922466638939537, + "loss": 1.3236, + "step": 7624 + }, + { + "epoch": 0.6917982217383415, + "grad_norm": 0.39026942825991406, + "learning_rate": 0.00022910116125630193, + "loss": 1.3819, + "step": 7625 + }, + { + "epoch": 0.6918889493739793, + "grad_norm": 0.12628704730254756, + "learning_rate": 0.00022897767951536292, + "loss": 1.3886, + "step": 7626 + }, + { + "epoch": 0.6919796770096172, + "grad_norm": 0.18381722057803848, + "learning_rate": 0.00022885422117724125, + "loss": 1.3983, + "step": 7627 + }, + { + "epoch": 0.6920704046452549, + "grad_norm": 0.12885786262531282, + "learning_rate": 0.00022873078625259768, + "loss": 1.3353, + "step": 7628 + }, + { + "epoch": 0.6921611322808927, + "grad_norm": 0.12218402765477995, + "learning_rate": 0.00022860737475209077, + "loss": 1.3885, + "step": 7629 + }, + { + "epoch": 0.6922518599165306, + "grad_norm": 0.11899625204500312, + "learning_rate": 0.0002284839866863768, + "loss": 1.3889, + "step": 7630 + }, + { + "epoch": 0.6923425875521684, + "grad_norm": 0.1543956747737421, + "learning_rate": 0.00022836062206611085, + "loss": 1.3552, + "step": 7631 + }, + { + "epoch": 0.6924333151878062, + "grad_norm": 0.15119239130371415, + "learning_rate": 0.00022823728090194518, + "loss": 1.3709, + "step": 7632 + }, + { + "epoch": 0.692524042823444, + "grad_norm": 0.13090979877940545, + "learning_rate": 0.00022811396320453016, + "loss": 1.3507, + "step": 7633 + }, + { + "epoch": 0.6926147704590818, + "grad_norm": 0.13532087542690457, + "learning_rate": 0.00022799066898451448, + "loss": 1.3535, + "step": 7634 + }, + { + "epoch": 0.6927054980947196, + "grad_norm": 0.1350991483471075, + "learning_rate": 0.00022786739825254488, + "loss": 1.3348, + "step": 7635 + }, + { + "epoch": 0.6927962257303575, + "grad_norm": 0.14281234506540616, + "learning_rate": 0.0002277441510192655, + "loss": 1.3785, + "step": 7636 + }, + { + "epoch": 0.6928869533659953, + "grad_norm": 0.14419572330352276, + "learning_rate": 0.00022762092729531857, + "loss": 1.3634, + "step": 7637 + }, + { + "epoch": 0.6929776810016332, + "grad_norm": 0.14057204818118782, + "learning_rate": 0.00022749772709134498, + "loss": 1.3856, + "step": 7638 + }, + { + "epoch": 0.6930684086372709, + "grad_norm": 0.13913947843748464, + "learning_rate": 0.00022737455041798278, + "loss": 1.3531, + "step": 7639 + }, + { + "epoch": 0.6931591362729087, + "grad_norm": 0.1709169181528203, + "learning_rate": 0.0002272513972858682, + "loss": 1.3248, + "step": 7640 
+ }, + { + "epoch": 0.6932498639085466, + "grad_norm": 0.1420425185173855, + "learning_rate": 0.00022712826770563576, + "loss": 1.3677, + "step": 7641 + }, + { + "epoch": 0.6933405915441844, + "grad_norm": 0.13832226352955335, + "learning_rate": 0.00022700516168791752, + "loss": 1.3907, + "step": 7642 + }, + { + "epoch": 0.6934313191798221, + "grad_norm": 0.14199391779908713, + "learning_rate": 0.00022688207924334393, + "loss": 1.3725, + "step": 7643 + }, + { + "epoch": 0.69352204681546, + "grad_norm": 0.18944762293725356, + "learning_rate": 0.00022675902038254286, + "loss": 1.3553, + "step": 7644 + }, + { + "epoch": 0.6936127744510978, + "grad_norm": 0.1332019154909905, + "learning_rate": 0.00022663598511614086, + "loss": 1.4028, + "step": 7645 + }, + { + "epoch": 0.6937035020867356, + "grad_norm": 0.1365274041250882, + "learning_rate": 0.00022651297345476184, + "loss": 1.385, + "step": 7646 + }, + { + "epoch": 0.6937942297223735, + "grad_norm": 0.12916044711231492, + "learning_rate": 0.00022638998540902772, + "loss": 1.3976, + "step": 7647 + }, + { + "epoch": 0.6938849573580113, + "grad_norm": 0.13244164395286692, + "learning_rate": 0.00022626702098955883, + "loss": 1.3532, + "step": 7648 + }, + { + "epoch": 0.693975684993649, + "grad_norm": 0.12086365847698022, + "learning_rate": 0.00022614408020697302, + "loss": 1.3503, + "step": 7649 + }, + { + "epoch": 0.6940664126292869, + "grad_norm": 0.37590921108220615, + "learning_rate": 0.00022602116307188601, + "loss": 1.3495, + "step": 7650 + }, + { + "epoch": 0.6941571402649247, + "grad_norm": 0.1602937441501994, + "learning_rate": 0.00022589826959491205, + "loss": 1.3586, + "step": 7651 + }, + { + "epoch": 0.6942478679005625, + "grad_norm": 0.18218592250322618, + "learning_rate": 0.00022577539978666306, + "loss": 1.3662, + "step": 7652 + }, + { + "epoch": 0.6943385955362004, + "grad_norm": 0.1383163076541113, + "learning_rate": 0.00022565255365774868, + "loss": 1.3838, + "step": 7653 + }, + { + "epoch": 0.6944293231718381, + "grad_norm": 0.1357298724438315, + "learning_rate": 0.00022552973121877656, + "loss": 1.3768, + "step": 7654 + }, + { + "epoch": 0.6945200508074759, + "grad_norm": 0.17259121142966552, + "learning_rate": 0.00022540693248035282, + "loss": 1.3936, + "step": 7655 + }, + { + "epoch": 0.6946107784431138, + "grad_norm": 0.16188546838160345, + "learning_rate": 0.0002252841574530809, + "loss": 1.3649, + "step": 7656 + }, + { + "epoch": 0.6947015060787516, + "grad_norm": 0.16617617855767203, + "learning_rate": 0.00022516140614756226, + "loss": 1.3749, + "step": 7657 + }, + { + "epoch": 0.6947922337143894, + "grad_norm": 0.14762617853541976, + "learning_rate": 0.00022503867857439676, + "loss": 1.4031, + "step": 7658 + }, + { + "epoch": 0.6948829613500273, + "grad_norm": 0.15117715224770323, + "learning_rate": 0.00022491597474418203, + "loss": 1.3608, + "step": 7659 + }, + { + "epoch": 0.694973688985665, + "grad_norm": 0.15907615567214325, + "learning_rate": 0.00022479329466751347, + "loss": 1.3367, + "step": 7660 + }, + { + "epoch": 0.6950644166213028, + "grad_norm": 0.1695231530640918, + "learning_rate": 0.00022467063835498424, + "loss": 1.3841, + "step": 7661 + }, + { + "epoch": 0.6951551442569407, + "grad_norm": 0.18556197246033165, + "learning_rate": 0.0002245480058171862, + "loss": 1.3814, + "step": 7662 + }, + { + "epoch": 0.6952458718925785, + "grad_norm": 0.1479623162433623, + "learning_rate": 0.00022442539706470844, + "loss": 1.3719, + "step": 7663 + }, + { + "epoch": 0.6953365995282162, + "grad_norm": 
0.17904014318945818, + "learning_rate": 0.00022430281210813807, + "loss": 1.3871, + "step": 7664 + }, + { + "epoch": 0.6954273271638541, + "grad_norm": 0.1527884849635603, + "learning_rate": 0.0002241802509580605, + "loss": 1.3755, + "step": 7665 + }, + { + "epoch": 0.6955180547994919, + "grad_norm": 0.14394831371084954, + "learning_rate": 0.0002240577136250591, + "loss": 1.3794, + "step": 7666 + }, + { + "epoch": 0.6956087824351297, + "grad_norm": 0.1876433312220907, + "learning_rate": 0.0002239352001197148, + "loss": 1.4117, + "step": 7667 + }, + { + "epoch": 0.6956995100707676, + "grad_norm": 0.14503548549925316, + "learning_rate": 0.00022381271045260649, + "loss": 1.3682, + "step": 7668 + }, + { + "epoch": 0.6957902377064054, + "grad_norm": 0.13095313578882026, + "learning_rate": 0.00022369024463431147, + "loss": 1.3348, + "step": 7669 + }, + { + "epoch": 0.6958809653420431, + "grad_norm": 0.1998179416943737, + "learning_rate": 0.00022356780267540461, + "loss": 1.393, + "step": 7670 + }, + { + "epoch": 0.695971692977681, + "grad_norm": 0.1987575561559096, + "learning_rate": 0.00022344538458645853, + "loss": 1.3632, + "step": 7671 + }, + { + "epoch": 0.6960624206133188, + "grad_norm": 0.13835533147252568, + "learning_rate": 0.00022332299037804444, + "loss": 1.3984, + "step": 7672 + }, + { + "epoch": 0.6961531482489566, + "grad_norm": 0.12584037133520498, + "learning_rate": 0.0002232006200607307, + "loss": 1.3875, + "step": 7673 + }, + { + "epoch": 0.6962438758845945, + "grad_norm": 0.12972919511616932, + "learning_rate": 0.0002230782736450842, + "loss": 1.4014, + "step": 7674 + }, + { + "epoch": 0.6963346035202322, + "grad_norm": 0.13144854891820243, + "learning_rate": 0.00022295595114166984, + "loss": 1.3841, + "step": 7675 + }, + { + "epoch": 0.6964253311558701, + "grad_norm": 0.12327483818049795, + "learning_rate": 0.00022283365256104987, + "loss": 1.3884, + "step": 7676 + }, + { + "epoch": 0.6965160587915079, + "grad_norm": 0.11766008611475172, + "learning_rate": 0.00022271137791378488, + "loss": 1.3432, + "step": 7677 + }, + { + "epoch": 0.6966067864271457, + "grad_norm": 0.13713163791900818, + "learning_rate": 0.00022258912721043306, + "loss": 1.3563, + "step": 7678 + }, + { + "epoch": 0.6966975140627836, + "grad_norm": 0.13111713762994268, + "learning_rate": 0.0002224669004615512, + "loss": 1.3874, + "step": 7679 + }, + { + "epoch": 0.6967882416984214, + "grad_norm": 0.13214309753229744, + "learning_rate": 0.0002223446976776934, + "loss": 1.3357, + "step": 7680 + }, + { + "epoch": 0.6968789693340591, + "grad_norm": 0.11952304009504633, + "learning_rate": 0.00022222251886941147, + "loss": 1.3779, + "step": 7681 + }, + { + "epoch": 0.696969696969697, + "grad_norm": 0.12727182333427242, + "learning_rate": 0.00022210036404725638, + "loss": 1.3747, + "step": 7682 + }, + { + "epoch": 0.6970604246053348, + "grad_norm": 0.11195164186995447, + "learning_rate": 0.0002219782332217758, + "loss": 1.3769, + "step": 7683 + }, + { + "epoch": 0.6971511522409726, + "grad_norm": 0.11647515107225154, + "learning_rate": 0.0002218561264035156, + "loss": 1.3766, + "step": 7684 + }, + { + "epoch": 0.6972418798766105, + "grad_norm": 0.13201700356858553, + "learning_rate": 0.00022173404360302003, + "loss": 1.3965, + "step": 7685 + }, + { + "epoch": 0.6973326075122482, + "grad_norm": 0.12514012532191932, + "learning_rate": 0.0002216119848308308, + "loss": 1.3536, + "step": 7686 + }, + { + "epoch": 0.697423335147886, + "grad_norm": 0.12665085486882133, + "learning_rate": 0.00022148995009748756, + "loss": 
1.3539, + "step": 7687 + }, + { + "epoch": 0.6975140627835239, + "grad_norm": 0.12950540649361575, + "learning_rate": 0.00022136793941352834, + "loss": 1.3728, + "step": 7688 + }, + { + "epoch": 0.6976047904191617, + "grad_norm": 0.12160394480144744, + "learning_rate": 0.00022124595278948846, + "loss": 1.3721, + "step": 7689 + }, + { + "epoch": 0.6976955180547995, + "grad_norm": 0.14290062284072794, + "learning_rate": 0.00022112399023590178, + "loss": 1.3999, + "step": 7690 + }, + { + "epoch": 0.6977862456904373, + "grad_norm": 0.11876452002350392, + "learning_rate": 0.00022100205176329947, + "loss": 1.3746, + "step": 7691 + }, + { + "epoch": 0.6978769733260751, + "grad_norm": 0.11552353248880795, + "learning_rate": 0.0002208801373822113, + "loss": 1.3644, + "step": 7692 + }, + { + "epoch": 0.6979677009617129, + "grad_norm": 0.12459066707344181, + "learning_rate": 0.00022075824710316444, + "loss": 1.3598, + "step": 7693 + }, + { + "epoch": 0.6980584285973508, + "grad_norm": 0.12589839605024358, + "learning_rate": 0.00022063638093668386, + "loss": 1.376, + "step": 7694 + }, + { + "epoch": 0.6981491562329886, + "grad_norm": 0.1284672116865136, + "learning_rate": 0.00022051453889329316, + "loss": 1.4053, + "step": 7695 + }, + { + "epoch": 0.6982398838686263, + "grad_norm": 0.11831714617831385, + "learning_rate": 0.00022039272098351326, + "loss": 1.3636, + "step": 7696 + }, + { + "epoch": 0.6983306115042642, + "grad_norm": 0.13082237138109712, + "learning_rate": 0.0002202709272178629, + "loss": 1.3629, + "step": 7697 + }, + { + "epoch": 0.698421339139902, + "grad_norm": 0.11267585162426, + "learning_rate": 0.00022014915760685917, + "loss": 1.3767, + "step": 7698 + }, + { + "epoch": 0.6985120667755398, + "grad_norm": 0.1134474669813503, + "learning_rate": 0.00022002741216101712, + "loss": 1.3501, + "step": 7699 + }, + { + "epoch": 0.6986027944111777, + "grad_norm": 0.12185315547905452, + "learning_rate": 0.00021990569089084934, + "loss": 1.4001, + "step": 7700 + }, + { + "epoch": 0.6986935220468155, + "grad_norm": 0.1217948174888299, + "learning_rate": 0.00021978399380686626, + "loss": 1.34, + "step": 7701 + }, + { + "epoch": 0.6987842496824532, + "grad_norm": 0.12009642361239914, + "learning_rate": 0.00021966232091957672, + "loss": 1.3383, + "step": 7702 + }, + { + "epoch": 0.6988749773180911, + "grad_norm": 0.13339403708057412, + "learning_rate": 0.00021954067223948714, + "loss": 1.3623, + "step": 7703 + }, + { + "epoch": 0.6989657049537289, + "grad_norm": 0.11568653955485547, + "learning_rate": 0.00021941904777710163, + "loss": 1.3853, + "step": 7704 + }, + { + "epoch": 0.6990564325893667, + "grad_norm": 0.11214013423689893, + "learning_rate": 0.00021929744754292275, + "loss": 1.3963, + "step": 7705 + }, + { + "epoch": 0.6991471602250046, + "grad_norm": 0.11941403218307348, + "learning_rate": 0.00021917587154745077, + "loss": 1.3717, + "step": 7706 + }, + { + "epoch": 0.6992378878606423, + "grad_norm": 0.12714142981989715, + "learning_rate": 0.00021905431980118374, + "loss": 1.3832, + "step": 7707 + }, + { + "epoch": 0.6993286154962801, + "grad_norm": 0.1275594369351961, + "learning_rate": 0.00021893279231461737, + "loss": 1.3538, + "step": 7708 + }, + { + "epoch": 0.699419343131918, + "grad_norm": 0.1251135995532844, + "learning_rate": 0.00021881128909824598, + "loss": 1.3613, + "step": 7709 + }, + { + "epoch": 0.6995100707675558, + "grad_norm": 0.14939706949976975, + "learning_rate": 0.00021868981016256124, + "loss": 1.4075, + "step": 7710 + }, + { + "epoch": 0.6996007984031936, + 
"grad_norm": 0.15743558155252427, + "learning_rate": 0.0002185683555180527, + "loss": 1.3798, + "step": 7711 + }, + { + "epoch": 0.6996915260388314, + "grad_norm": 0.11015447010132332, + "learning_rate": 0.00021844692517520808, + "loss": 1.358, + "step": 7712 + }, + { + "epoch": 0.6997822536744692, + "grad_norm": 0.11851213795115162, + "learning_rate": 0.00021832551914451322, + "loss": 1.3639, + "step": 7713 + }, + { + "epoch": 0.6998729813101071, + "grad_norm": 0.13333781809529147, + "learning_rate": 0.00021820413743645124, + "loss": 1.3455, + "step": 7714 + }, + { + "epoch": 0.6999637089457449, + "grad_norm": 0.11805093651427594, + "learning_rate": 0.00021808278006150335, + "loss": 1.3546, + "step": 7715 + }, + { + "epoch": 0.7000544365813827, + "grad_norm": 0.11216024527939485, + "learning_rate": 0.00021796144703014914, + "loss": 1.36, + "step": 7716 + }, + { + "epoch": 0.7001451642170206, + "grad_norm": 0.11772474720542246, + "learning_rate": 0.00021784013835286553, + "loss": 1.3868, + "step": 7717 + }, + { + "epoch": 0.7002358918526583, + "grad_norm": 0.13926295890299745, + "learning_rate": 0.00021771885404012743, + "loss": 1.3663, + "step": 7718 + }, + { + "epoch": 0.7003266194882961, + "grad_norm": 0.12528094859164482, + "learning_rate": 0.00021759759410240804, + "loss": 1.3923, + "step": 7719 + }, + { + "epoch": 0.700417347123934, + "grad_norm": 0.13402813277397582, + "learning_rate": 0.00021747635855017783, + "loss": 1.3362, + "step": 7720 + }, + { + "epoch": 0.7005080747595718, + "grad_norm": 0.12191011148849502, + "learning_rate": 0.00021735514739390589, + "loss": 1.4018, + "step": 7721 + }, + { + "epoch": 0.7005988023952096, + "grad_norm": 0.22989798599081843, + "learning_rate": 0.0002172339606440584, + "loss": 1.3534, + "step": 7722 + }, + { + "epoch": 0.7006895300308474, + "grad_norm": 0.11876451555579384, + "learning_rate": 0.00021711279831110038, + "loss": 1.4031, + "step": 7723 + }, + { + "epoch": 0.7007802576664852, + "grad_norm": 0.13021933660246335, + "learning_rate": 0.00021699166040549384, + "loss": 1.3741, + "step": 7724 + }, + { + "epoch": 0.700870985302123, + "grad_norm": 0.13294200848618007, + "learning_rate": 0.00021687054693769896, + "loss": 1.3871, + "step": 7725 + }, + { + "epoch": 0.7009617129377609, + "grad_norm": 0.11949101734849496, + "learning_rate": 0.00021674945791817424, + "loss": 1.3665, + "step": 7726 + }, + { + "epoch": 0.7010524405733987, + "grad_norm": 0.1355978664374951, + "learning_rate": 0.0002166283933573756, + "loss": 1.4052, + "step": 7727 + }, + { + "epoch": 0.7011431682090364, + "grad_norm": 0.11606363086231757, + "learning_rate": 0.00021650735326575682, + "loss": 1.384, + "step": 7728 + }, + { + "epoch": 0.7012338958446743, + "grad_norm": 0.1217567988460237, + "learning_rate": 0.0002163863376537698, + "loss": 1.3354, + "step": 7729 + }, + { + "epoch": 0.7013246234803121, + "grad_norm": 0.12294490614444584, + "learning_rate": 0.00021626534653186453, + "loss": 1.3716, + "step": 7730 + }, + { + "epoch": 0.7014153511159499, + "grad_norm": 0.12233506861223357, + "learning_rate": 0.0002161443799104884, + "loss": 1.3659, + "step": 7731 + }, + { + "epoch": 0.7015060787515878, + "grad_norm": 0.1230316916612071, + "learning_rate": 0.00021602343780008677, + "loss": 1.3915, + "step": 7732 + }, + { + "epoch": 0.7015968063872255, + "grad_norm": 0.13122681391131016, + "learning_rate": 0.0002159025202111033, + "loss": 1.3765, + "step": 7733 + }, + { + "epoch": 0.7016875340228633, + "grad_norm": 0.1508614564556645, + "learning_rate": 
0.00021578162715397904, + "loss": 1.3882, + "step": 7734 + }, + { + "epoch": 0.7017782616585012, + "grad_norm": 0.12429079636007823, + "learning_rate": 0.00021566075863915297, + "loss": 1.3893, + "step": 7735 + }, + { + "epoch": 0.701868989294139, + "grad_norm": 0.1229114046659588, + "learning_rate": 0.00021553991467706236, + "loss": 1.365, + "step": 7736 + }, + { + "epoch": 0.7019597169297768, + "grad_norm": 0.11905601616150056, + "learning_rate": 0.00021541909527814218, + "loss": 1.3941, + "step": 7737 + }, + { + "epoch": 0.7020504445654147, + "grad_norm": 0.150416736891677, + "learning_rate": 0.0002152983004528251, + "loss": 1.3921, + "step": 7738 + }, + { + "epoch": 0.7021411722010524, + "grad_norm": 0.11868356217290203, + "learning_rate": 0.00021517753021154152, + "loss": 1.319, + "step": 7739 + }, + { + "epoch": 0.7022318998366902, + "grad_norm": 0.12618364052186082, + "learning_rate": 0.00021505678456472038, + "loss": 1.376, + "step": 7740 + }, + { + "epoch": 0.7023226274723281, + "grad_norm": 0.114172223389003, + "learning_rate": 0.00021493606352278788, + "loss": 1.3439, + "step": 7741 + }, + { + "epoch": 0.7024133551079659, + "grad_norm": 0.11978857440378501, + "learning_rate": 0.00021481536709616817, + "loss": 1.3718, + "step": 7742 + }, + { + "epoch": 0.7025040827436037, + "grad_norm": 0.11528265608466165, + "learning_rate": 0.0002146946952952835, + "loss": 1.3687, + "step": 7743 + }, + { + "epoch": 0.7025948103792415, + "grad_norm": 0.12693250900815492, + "learning_rate": 0.00021457404813055416, + "loss": 1.3642, + "step": 7744 + }, + { + "epoch": 0.7026855380148793, + "grad_norm": 0.12266208881016705, + "learning_rate": 0.00021445342561239778, + "loss": 1.3826, + "step": 7745 + }, + { + "epoch": 0.7027762656505171, + "grad_norm": 0.11945403531425801, + "learning_rate": 0.00021433282775123002, + "loss": 1.347, + "step": 7746 + }, + { + "epoch": 0.702866993286155, + "grad_norm": 0.12280067228111352, + "learning_rate": 0.0002142122545574649, + "loss": 1.3232, + "step": 7747 + }, + { + "epoch": 0.7029577209217928, + "grad_norm": 0.11820882400362685, + "learning_rate": 0.00021409170604151369, + "loss": 1.3399, + "step": 7748 + }, + { + "epoch": 0.7030484485574305, + "grad_norm": 0.12908217935643063, + "learning_rate": 0.00021397118221378565, + "loss": 1.4301, + "step": 7749 + }, + { + "epoch": 0.7031391761930684, + "grad_norm": 0.12473654184669274, + "learning_rate": 0.0002138506830846883, + "loss": 1.384, + "step": 7750 + }, + { + "epoch": 0.7032299038287062, + "grad_norm": 0.12860059066789475, + "learning_rate": 0.00021373020866462651, + "loss": 1.3695, + "step": 7751 + }, + { + "epoch": 0.7033206314643441, + "grad_norm": 0.11546117287453635, + "learning_rate": 0.00021360975896400343, + "loss": 1.358, + "step": 7752 + }, + { + "epoch": 0.7034113590999819, + "grad_norm": 0.11934080680366846, + "learning_rate": 0.00021348933399321996, + "loss": 1.3551, + "step": 7753 + }, + { + "epoch": 0.7035020867356196, + "grad_norm": 0.15059610742639018, + "learning_rate": 0.0002133689337626748, + "loss": 1.3817, + "step": 7754 + }, + { + "epoch": 0.7035928143712575, + "grad_norm": 0.13096434196453763, + "learning_rate": 0.00021324855828276423, + "loss": 1.3684, + "step": 7755 + }, + { + "epoch": 0.7036835420068953, + "grad_norm": 0.12126988870388251, + "learning_rate": 0.00021312820756388314, + "loss": 1.391, + "step": 7756 + }, + { + "epoch": 0.7037742696425331, + "grad_norm": 0.11906442693883666, + "learning_rate": 0.00021300788161642358, + "loss": 1.3795, + "step": 7757 + }, + { + "epoch": 
0.703864997278171, + "grad_norm": 0.11812176376587512, + "learning_rate": 0.00021288758045077572, + "loss": 1.3726, + "step": 7758 + }, + { + "epoch": 0.7039557249138088, + "grad_norm": 0.12408052550551175, + "learning_rate": 0.00021276730407732725, + "loss": 1.3905, + "step": 7759 + }, + { + "epoch": 0.7040464525494465, + "grad_norm": 0.11769471683331784, + "learning_rate": 0.00021264705250646483, + "loss": 1.3961, + "step": 7760 + }, + { + "epoch": 0.7041371801850844, + "grad_norm": 0.13457591698648805, + "learning_rate": 0.00021252682574857167, + "loss": 1.3657, + "step": 7761 + }, + { + "epoch": 0.7042279078207222, + "grad_norm": 0.12490573967947832, + "learning_rate": 0.00021240662381402943, + "loss": 1.3605, + "step": 7762 + }, + { + "epoch": 0.70431863545636, + "grad_norm": 0.12183945135332143, + "learning_rate": 0.00021228644671321767, + "loss": 1.3771, + "step": 7763 + }, + { + "epoch": 0.7044093630919979, + "grad_norm": 0.11945407360186029, + "learning_rate": 0.00021216629445651368, + "loss": 1.3737, + "step": 7764 + }, + { + "epoch": 0.7045000907276356, + "grad_norm": 0.1214369068589472, + "learning_rate": 0.0002120461670542924, + "loss": 1.3984, + "step": 7765 + }, + { + "epoch": 0.7045908183632734, + "grad_norm": 0.12065239465665484, + "learning_rate": 0.00021192606451692726, + "loss": 1.3864, + "step": 7766 + }, + { + "epoch": 0.7046815459989113, + "grad_norm": 0.14440955818239437, + "learning_rate": 0.0002118059868547887, + "loss": 1.3494, + "step": 7767 + }, + { + "epoch": 0.7047722736345491, + "grad_norm": 0.12093934439801064, + "learning_rate": 0.00021168593407824583, + "loss": 1.3907, + "step": 7768 + }, + { + "epoch": 0.7048630012701869, + "grad_norm": 0.119517163385744, + "learning_rate": 0.00021156590619766485, + "loss": 1.385, + "step": 7769 + }, + { + "epoch": 0.7049537289058248, + "grad_norm": 0.11763912831648549, + "learning_rate": 0.00021144590322341055, + "loss": 1.3939, + "step": 7770 + }, + { + "epoch": 0.7050444565414625, + "grad_norm": 0.12555514431493064, + "learning_rate": 0.00021132592516584508, + "loss": 1.386, + "step": 7771 + }, + { + "epoch": 0.7051351841771003, + "grad_norm": 0.12552246951731302, + "learning_rate": 0.00021120597203532832, + "loss": 1.3712, + "step": 7772 + }, + { + "epoch": 0.7052259118127382, + "grad_norm": 0.11543837211052188, + "learning_rate": 0.00021108604384221862, + "loss": 1.3618, + "step": 7773 + }, + { + "epoch": 0.705316639448376, + "grad_norm": 0.12516355645179433, + "learning_rate": 0.0002109661405968716, + "loss": 1.3568, + "step": 7774 + }, + { + "epoch": 0.7054073670840137, + "grad_norm": 0.125287271926098, + "learning_rate": 0.00021084626230964077, + "loss": 1.3801, + "step": 7775 + }, + { + "epoch": 0.7054980947196516, + "grad_norm": 0.12137329931005728, + "learning_rate": 0.00021072640899087781, + "loss": 1.353, + "step": 7776 + }, + { + "epoch": 0.7055888223552894, + "grad_norm": 0.12071068709496871, + "learning_rate": 0.00021060658065093224, + "loss": 1.3635, + "step": 7777 + }, + { + "epoch": 0.7056795499909272, + "grad_norm": 0.11396442520844045, + "learning_rate": 0.0002104867773001511, + "loss": 1.348, + "step": 7778 + }, + { + "epoch": 0.7057702776265651, + "grad_norm": 0.15185079588421077, + "learning_rate": 0.00021036699894887918, + "loss": 1.3605, + "step": 7779 + }, + { + "epoch": 0.7058610052622029, + "grad_norm": 0.11916572790306633, + "learning_rate": 0.00021024724560745985, + "loss": 1.3746, + "step": 7780 + }, + { + "epoch": 0.7059517328978406, + "grad_norm": 0.1325189194015109, + "learning_rate": 
0.00021012751728623346, + "loss": 1.3694, + "step": 7781 + }, + { + "epoch": 0.7060424605334785, + "grad_norm": 0.1429044748637299, + "learning_rate": 0.0002100078139955386, + "loss": 1.3815, + "step": 7782 + }, + { + "epoch": 0.7061331881691163, + "grad_norm": 0.11605307087188611, + "learning_rate": 0.0002098881357457117, + "loss": 1.3514, + "step": 7783 + }, + { + "epoch": 0.7062239158047541, + "grad_norm": 0.12761811571181766, + "learning_rate": 0.00020976848254708718, + "loss": 1.3752, + "step": 7784 + }, + { + "epoch": 0.706314643440392, + "grad_norm": 0.11904782670077138, + "learning_rate": 0.00020964885440999704, + "loss": 1.345, + "step": 7785 + }, + { + "epoch": 0.7064053710760297, + "grad_norm": 0.12434550357504598, + "learning_rate": 0.00020952925134477092, + "loss": 1.3525, + "step": 7786 + }, + { + "epoch": 0.7064960987116675, + "grad_norm": 0.13062367710146025, + "learning_rate": 0.0002094096733617369, + "loss": 1.3597, + "step": 7787 + }, + { + "epoch": 0.7065868263473054, + "grad_norm": 0.11476498038479623, + "learning_rate": 0.00020929012047122043, + "loss": 1.3889, + "step": 7788 + }, + { + "epoch": 0.7066775539829432, + "grad_norm": 0.18488972071631193, + "learning_rate": 0.00020917059268354472, + "loss": 1.4195, + "step": 7789 + }, + { + "epoch": 0.7067682816185811, + "grad_norm": 0.11383510870516382, + "learning_rate": 0.00020905109000903115, + "loss": 1.3784, + "step": 7790 + }, + { + "epoch": 0.7068590092542189, + "grad_norm": 0.12475782439566656, + "learning_rate": 0.00020893161245799907, + "loss": 1.3695, + "step": 7791 + }, + { + "epoch": 0.7069497368898566, + "grad_norm": 0.12413771072797584, + "learning_rate": 0.0002088121600407652, + "loss": 1.3436, + "step": 7792 + }, + { + "epoch": 0.7070404645254945, + "grad_norm": 0.13691399023558554, + "learning_rate": 0.000208692732767644, + "loss": 1.367, + "step": 7793 + }, + { + "epoch": 0.7071311921611323, + "grad_norm": 0.12514789895210351, + "learning_rate": 0.0002085733306489484, + "loss": 1.3414, + "step": 7794 + }, + { + "epoch": 0.7072219197967701, + "grad_norm": 0.12455915906436801, + "learning_rate": 0.0002084539536949887, + "loss": 1.4189, + "step": 7795 + }, + { + "epoch": 0.707312647432408, + "grad_norm": 0.15286288285523528, + "learning_rate": 0.0002083346019160729, + "loss": 1.359, + "step": 7796 + }, + { + "epoch": 0.7074033750680457, + "grad_norm": 0.13338277244291893, + "learning_rate": 0.00020821527532250734, + "loss": 1.3615, + "step": 7797 + }, + { + "epoch": 0.7074941027036835, + "grad_norm": 0.12223476628692906, + "learning_rate": 0.0002080959739245956, + "loss": 1.3632, + "step": 7798 + }, + { + "epoch": 0.7075848303393214, + "grad_norm": 0.12360470210224482, + "learning_rate": 0.00020797669773263972, + "loss": 1.3779, + "step": 7799 + }, + { + "epoch": 0.7076755579749592, + "grad_norm": 0.12269555279697615, + "learning_rate": 0.00020785744675693886, + "loss": 1.3586, + "step": 7800 + }, + { + "epoch": 0.707766285610597, + "grad_norm": 0.1486090695654875, + "learning_rate": 0.00020773822100779072, + "loss": 1.3962, + "step": 7801 + }, + { + "epoch": 0.7078570132462348, + "grad_norm": 0.12684270202077697, + "learning_rate": 0.00020761902049549024, + "loss": 1.3876, + "step": 7802 + }, + { + "epoch": 0.7079477408818726, + "grad_norm": 0.14303180685679767, + "learning_rate": 0.00020749984523033027, + "loss": 1.3485, + "step": 7803 + }, + { + "epoch": 0.7080384685175104, + "grad_norm": 0.11485619126773113, + "learning_rate": 0.00020738069522260194, + "loss": 1.3837, + "step": 7804 + }, + { + "epoch": 
0.7081291961531483, + "grad_norm": 0.11366386722993047, + "learning_rate": 0.0002072615704825937, + "loss": 1.3689, + "step": 7805 + }, + { + "epoch": 0.7082199237887861, + "grad_norm": 0.11391855025716713, + "learning_rate": 0.00020714247102059186, + "loss": 1.3815, + "step": 7806 + }, + { + "epoch": 0.7083106514244238, + "grad_norm": 0.13798925224994132, + "learning_rate": 0.00020702339684688077, + "loss": 1.374, + "step": 7807 + }, + { + "epoch": 0.7084013790600617, + "grad_norm": 0.1207739052417359, + "learning_rate": 0.00020690434797174273, + "loss": 1.3825, + "step": 7808 + }, + { + "epoch": 0.7084921066956995, + "grad_norm": 0.13574432653865331, + "learning_rate": 0.00020678532440545743, + "loss": 1.3976, + "step": 7809 + }, + { + "epoch": 0.7085828343313373, + "grad_norm": 0.11412738226985802, + "learning_rate": 0.00020666632615830245, + "loss": 1.4022, + "step": 7810 + }, + { + "epoch": 0.7086735619669752, + "grad_norm": 0.1208038654140403, + "learning_rate": 0.00020654735324055357, + "loss": 1.359, + "step": 7811 + }, + { + "epoch": 0.708764289602613, + "grad_norm": 0.17055886101372789, + "learning_rate": 0.000206428405662484, + "loss": 1.3343, + "step": 7812 + }, + { + "epoch": 0.7088550172382507, + "grad_norm": 0.13205848357106212, + "learning_rate": 0.00020630948343436468, + "loss": 1.398, + "step": 7813 + }, + { + "epoch": 0.7089457448738886, + "grad_norm": 0.11877481417314303, + "learning_rate": 0.00020619058656646478, + "loss": 1.36, + "step": 7814 + }, + { + "epoch": 0.7090364725095264, + "grad_norm": 0.13519057275421395, + "learning_rate": 0.0002060717150690512, + "loss": 1.3872, + "step": 7815 + }, + { + "epoch": 0.7091272001451642, + "grad_norm": 0.1314620021808548, + "learning_rate": 0.00020595286895238834, + "loss": 1.3727, + "step": 7816 + }, + { + "epoch": 0.7092179277808021, + "grad_norm": 0.11418753643045937, + "learning_rate": 0.0002058340482267384, + "loss": 1.3185, + "step": 7817 + }, + { + "epoch": 0.7093086554164398, + "grad_norm": 0.12780761213536845, + "learning_rate": 0.00020571525290236193, + "loss": 1.3408, + "step": 7818 + }, + { + "epoch": 0.7093993830520776, + "grad_norm": 0.12129987291040319, + "learning_rate": 0.00020559648298951676, + "loss": 1.3626, + "step": 7819 + }, + { + "epoch": 0.7094901106877155, + "grad_norm": 0.12387356806060815, + "learning_rate": 0.00020547773849845846, + "loss": 1.3783, + "step": 7820 + }, + { + "epoch": 0.7095808383233533, + "grad_norm": 0.12186874884000141, + "learning_rate": 0.0002053590194394409, + "loss": 1.3768, + "step": 7821 + }, + { + "epoch": 0.7096715659589911, + "grad_norm": 0.12054793437346055, + "learning_rate": 0.00020524032582271558, + "loss": 1.378, + "step": 7822 + }, + { + "epoch": 0.709762293594629, + "grad_norm": 0.11442272706618027, + "learning_rate": 0.00020512165765853159, + "loss": 1.3635, + "step": 7823 + }, + { + "epoch": 0.7098530212302667, + "grad_norm": 0.12719988647515626, + "learning_rate": 0.00020500301495713575, + "loss": 1.3759, + "step": 7824 + }, + { + "epoch": 0.7099437488659045, + "grad_norm": 0.12381324278848216, + "learning_rate": 0.00020488439772877321, + "loss": 1.3938, + "step": 7825 + }, + { + "epoch": 0.7100344765015424, + "grad_norm": 0.11825281161832626, + "learning_rate": 0.00020476580598368638, + "loss": 1.3672, + "step": 7826 + }, + { + "epoch": 0.7101252041371802, + "grad_norm": 0.14220673078555124, + "learning_rate": 0.00020464723973211558, + "loss": 1.3951, + "step": 7827 + }, + { + "epoch": 0.710215931772818, + "grad_norm": 0.1361179004509561, + "learning_rate": 
0.00020452869898429933, + "loss": 1.4052, + "step": 7828 + }, + { + "epoch": 0.7103066594084558, + "grad_norm": 0.12642774851247077, + "learning_rate": 0.0002044101837504733, + "loss": 1.4004, + "step": 7829 + }, + { + "epoch": 0.7103973870440936, + "grad_norm": 0.11754197070373373, + "learning_rate": 0.0002042916940408715, + "loss": 1.3929, + "step": 7830 + }, + { + "epoch": 0.7104881146797315, + "grad_norm": 0.11689166060269045, + "learning_rate": 0.00020417322986572561, + "loss": 1.3726, + "step": 7831 + }, + { + "epoch": 0.7105788423153693, + "grad_norm": 0.12150872924907152, + "learning_rate": 0.00020405479123526498, + "loss": 1.3501, + "step": 7832 + }, + { + "epoch": 0.710669569951007, + "grad_norm": 0.12607199958861767, + "learning_rate": 0.00020393637815971656, + "loss": 1.4116, + "step": 7833 + }, + { + "epoch": 0.7107602975866449, + "grad_norm": 0.1394299322483566, + "learning_rate": 0.00020381799064930566, + "loss": 1.3659, + "step": 7834 + }, + { + "epoch": 0.7108510252222827, + "grad_norm": 0.1229115774962087, + "learning_rate": 0.00020369962871425495, + "loss": 1.36, + "step": 7835 + }, + { + "epoch": 0.7109417528579205, + "grad_norm": 0.12129312252056214, + "learning_rate": 0.00020358129236478496, + "loss": 1.3786, + "step": 7836 + }, + { + "epoch": 0.7110324804935584, + "grad_norm": 0.14040993034745355, + "learning_rate": 0.0002034629816111137, + "loss": 1.3571, + "step": 7837 + }, + { + "epoch": 0.7111232081291962, + "grad_norm": 0.12292679619538326, + "learning_rate": 0.00020334469646345806, + "loss": 1.3457, + "step": 7838 + }, + { + "epoch": 0.7112139357648339, + "grad_norm": 0.1632695655854216, + "learning_rate": 0.00020322643693203158, + "loss": 1.3802, + "step": 7839 + }, + { + "epoch": 0.7113046634004718, + "grad_norm": 0.11743832519925976, + "learning_rate": 0.00020310820302704585, + "loss": 1.3639, + "step": 7840 + }, + { + "epoch": 0.7113953910361096, + "grad_norm": 0.12338413820342305, + "learning_rate": 0.00020298999475871072, + "loss": 1.3824, + "step": 7841 + }, + { + "epoch": 0.7114861186717474, + "grad_norm": 0.12209780954300967, + "learning_rate": 0.00020287181213723333, + "loss": 1.3835, + "step": 7842 + }, + { + "epoch": 0.7115768463073853, + "grad_norm": 0.12209653453426418, + "learning_rate": 0.00020275365517281857, + "loss": 1.3346, + "step": 7843 + }, + { + "epoch": 0.711667573943023, + "grad_norm": 0.12258105423264982, + "learning_rate": 0.00020263552387566962, + "loss": 1.383, + "step": 7844 + }, + { + "epoch": 0.7117583015786608, + "grad_norm": 0.16468730271449813, + "learning_rate": 0.0002025174182559869, + "loss": 1.3637, + "step": 7845 + }, + { + "epoch": 0.7118490292142987, + "grad_norm": 0.12123699926002457, + "learning_rate": 0.00020239933832396913, + "loss": 1.3267, + "step": 7846 + }, + { + "epoch": 0.7119397568499365, + "grad_norm": 0.3249123488564578, + "learning_rate": 0.00020228128408981218, + "loss": 1.3555, + "step": 7847 + }, + { + "epoch": 0.7120304844855743, + "grad_norm": 0.1291969534735206, + "learning_rate": 0.0002021632555637104, + "loss": 1.3879, + "step": 7848 + }, + { + "epoch": 0.7121212121212122, + "grad_norm": 0.1716260615569104, + "learning_rate": 0.0002020452527558554, + "loss": 1.3595, + "step": 7849 + }, + { + "epoch": 0.7122119397568499, + "grad_norm": 0.11985890230928309, + "learning_rate": 0.00020192727567643653, + "loss": 1.3489, + "step": 7850 + }, + { + "epoch": 0.7123026673924877, + "grad_norm": 0.12097511366444276, + "learning_rate": 0.00020180932433564152, + "loss": 1.4162, + "step": 7851 + }, + { + 
"epoch": 0.7123933950281256, + "grad_norm": 0.12377951365592396, + "learning_rate": 0.00020169139874365533, + "loss": 1.3658, + "step": 7852 + }, + { + "epoch": 0.7124841226637634, + "grad_norm": 0.13735113849331937, + "learning_rate": 0.00020157349891066062, + "loss": 1.3607, + "step": 7853 + }, + { + "epoch": 0.7125748502994012, + "grad_norm": 0.12722491047846377, + "learning_rate": 0.00020145562484683827, + "loss": 1.3797, + "step": 7854 + }, + { + "epoch": 0.712665577935039, + "grad_norm": 0.12383861025071051, + "learning_rate": 0.0002013377765623669, + "loss": 1.3645, + "step": 7855 + }, + { + "epoch": 0.7127563055706768, + "grad_norm": 0.1212761709449405, + "learning_rate": 0.00020121995406742254, + "loss": 1.3694, + "step": 7856 + }, + { + "epoch": 0.7128470332063146, + "grad_norm": 0.14115005596443855, + "learning_rate": 0.00020110215737217902, + "loss": 1.3535, + "step": 7857 + }, + { + "epoch": 0.7129377608419525, + "grad_norm": 0.12259926365978185, + "learning_rate": 0.00020098438648680846, + "loss": 1.3571, + "step": 7858 + }, + { + "epoch": 0.7130284884775903, + "grad_norm": 0.12744491024392393, + "learning_rate": 0.00020086664142148015, + "loss": 1.3606, + "step": 7859 + }, + { + "epoch": 0.713119216113228, + "grad_norm": 0.12475295438712997, + "learning_rate": 0.00020074892218636126, + "loss": 1.3648, + "step": 7860 + }, + { + "epoch": 0.7132099437488659, + "grad_norm": 0.12172246212599265, + "learning_rate": 0.00020063122879161705, + "loss": 1.3685, + "step": 7861 + }, + { + "epoch": 0.7133006713845037, + "grad_norm": 0.14811582438114884, + "learning_rate": 0.00020051356124741054, + "loss": 1.4024, + "step": 7862 + }, + { + "epoch": 0.7133913990201415, + "grad_norm": 0.13970643544167882, + "learning_rate": 0.00020039591956390218, + "loss": 1.4277, + "step": 7863 + }, + { + "epoch": 0.7134821266557794, + "grad_norm": 0.14278334304206888, + "learning_rate": 0.00020027830375125011, + "loss": 1.3186, + "step": 7864 + }, + { + "epoch": 0.7135728542914171, + "grad_norm": 0.13327822444800191, + "learning_rate": 0.00020016071381961088, + "loss": 1.3446, + "step": 7865 + }, + { + "epoch": 0.713663581927055, + "grad_norm": 0.13813206768716782, + "learning_rate": 0.00020004314977913824, + "loss": 1.3657, + "step": 7866 + }, + { + "epoch": 0.7137543095626928, + "grad_norm": 0.12392064679833424, + "learning_rate": 0.00019992561163998358, + "loss": 1.3645, + "step": 7867 + }, + { + "epoch": 0.7138450371983306, + "grad_norm": 0.12081775679531906, + "learning_rate": 0.00019980809941229661, + "loss": 1.3671, + "step": 7868 + }, + { + "epoch": 0.7139357648339685, + "grad_norm": 0.11985088863234566, + "learning_rate": 0.0001996906131062247, + "loss": 1.3586, + "step": 7869 + }, + { + "epoch": 0.7140264924696063, + "grad_norm": 0.14807421841055923, + "learning_rate": 0.00019957315273191257, + "loss": 1.4083, + "step": 7870 + }, + { + "epoch": 0.714117220105244, + "grad_norm": 0.1472308164212464, + "learning_rate": 0.00019945571829950282, + "loss": 1.3567, + "step": 7871 + }, + { + "epoch": 0.7142079477408819, + "grad_norm": 0.1217062673826216, + "learning_rate": 0.0001993383098191363, + "loss": 1.4314, + "step": 7872 + }, + { + "epoch": 0.7142986753765197, + "grad_norm": 0.12424728438652818, + "learning_rate": 0.00019922092730095098, + "loss": 1.3843, + "step": 7873 + }, + { + "epoch": 0.7143894030121575, + "grad_norm": 0.11920524293537446, + "learning_rate": 0.0001991035707550828, + "loss": 1.418, + "step": 7874 + }, + { + "epoch": 0.7144801306477954, + "grad_norm": 0.127143919124373, + 
"learning_rate": 0.00019898624019166577, + "loss": 1.3751, + "step": 7875 + }, + { + "epoch": 0.7145708582834331, + "grad_norm": 0.13251365958912278, + "learning_rate": 0.00019886893562083108, + "loss": 1.3932, + "step": 7876 + }, + { + "epoch": 0.7146615859190709, + "grad_norm": 0.20182144954122444, + "learning_rate": 0.00019875165705270838, + "loss": 1.4032, + "step": 7877 + }, + { + "epoch": 0.7147523135547088, + "grad_norm": 0.12800860845104972, + "learning_rate": 0.00019863440449742432, + "loss": 1.3951, + "step": 7878 + }, + { + "epoch": 0.7148430411903466, + "grad_norm": 0.13720911233631153, + "learning_rate": 0.00019851717796510405, + "loss": 1.3912, + "step": 7879 + }, + { + "epoch": 0.7149337688259844, + "grad_norm": 0.134520499440299, + "learning_rate": 0.00019839997746586986, + "loss": 1.4039, + "step": 7880 + }, + { + "epoch": 0.7150244964616222, + "grad_norm": 0.125831092007297, + "learning_rate": 0.00019828280300984187, + "loss": 1.3747, + "step": 7881 + }, + { + "epoch": 0.71511522409726, + "grad_norm": 0.13550527042265498, + "learning_rate": 0.00019816565460713853, + "loss": 1.3156, + "step": 7882 + }, + { + "epoch": 0.7152059517328978, + "grad_norm": 0.12171742195078383, + "learning_rate": 0.00019804853226787535, + "loss": 1.3609, + "step": 7883 + }, + { + "epoch": 0.7152966793685357, + "grad_norm": 0.1265258587794428, + "learning_rate": 0.00019793143600216578, + "loss": 1.3436, + "step": 7884 + }, + { + "epoch": 0.7153874070041735, + "grad_norm": 0.1226453836722049, + "learning_rate": 0.0001978143658201212, + "loss": 1.3586, + "step": 7885 + }, + { + "epoch": 0.7154781346398112, + "grad_norm": 0.1294885582904848, + "learning_rate": 0.0001976973217318509, + "loss": 1.3152, + "step": 7886 + }, + { + "epoch": 0.7155688622754491, + "grad_norm": 0.16965774281346724, + "learning_rate": 0.0001975803037474614, + "loss": 1.3481, + "step": 7887 + }, + { + "epoch": 0.7156595899110869, + "grad_norm": 0.1302663732155666, + "learning_rate": 0.00019746331187705702, + "loss": 1.3572, + "step": 7888 + }, + { + "epoch": 0.7157503175467247, + "grad_norm": 0.11859823832099069, + "learning_rate": 0.00019734634613074043, + "loss": 1.3093, + "step": 7889 + }, + { + "epoch": 0.7158410451823626, + "grad_norm": 0.1156409673699814, + "learning_rate": 0.0001972294065186115, + "loss": 1.3864, + "step": 7890 + }, + { + "epoch": 0.7159317728180004, + "grad_norm": 0.12650981974807907, + "learning_rate": 0.0001971124930507678, + "loss": 1.3747, + "step": 7891 + }, + { + "epoch": 0.7160225004536381, + "grad_norm": 0.11778959237591452, + "learning_rate": 0.00019699560573730485, + "loss": 1.3769, + "step": 7892 + }, + { + "epoch": 0.716113228089276, + "grad_norm": 0.12229447538550098, + "learning_rate": 0.0001968787445883163, + "loss": 1.3694, + "step": 7893 + }, + { + "epoch": 0.7162039557249138, + "grad_norm": 0.11697870421971057, + "learning_rate": 0.00019676190961389272, + "loss": 1.3384, + "step": 7894 + }, + { + "epoch": 0.7162946833605516, + "grad_norm": 0.11437709414742014, + "learning_rate": 0.00019664510082412285, + "loss": 1.371, + "step": 7895 + }, + { + "epoch": 0.7163854109961895, + "grad_norm": 0.12465348378059074, + "learning_rate": 0.0001965283182290934, + "loss": 1.3649, + "step": 7896 + }, + { + "epoch": 0.7164761386318272, + "grad_norm": 0.13630969622781122, + "learning_rate": 0.00019641156183888836, + "loss": 1.3794, + "step": 7897 + }, + { + "epoch": 0.716566866267465, + "grad_norm": 0.14399613928759994, + "learning_rate": 0.00019629483166358952, + "loss": 1.3747, + "step": 7898 + }, 
+ { + "epoch": 0.7166575939031029, + "grad_norm": 0.12129049982046922, + "learning_rate": 0.00019617812771327675, + "loss": 1.3319, + "step": 7899 + }, + { + "epoch": 0.7167483215387407, + "grad_norm": 0.1874599659011508, + "learning_rate": 0.00019606144999802762, + "loss": 1.3564, + "step": 7900 + }, + { + "epoch": 0.7168390491743785, + "grad_norm": 0.12215081726213468, + "learning_rate": 0.00019594479852791686, + "loss": 1.3969, + "step": 7901 + }, + { + "epoch": 0.7169297768100164, + "grad_norm": 0.1209664808219156, + "learning_rate": 0.00019582817331301777, + "loss": 1.3729, + "step": 7902 + }, + { + "epoch": 0.7170205044456541, + "grad_norm": 0.12333360676713792, + "learning_rate": 0.00019571157436340075, + "loss": 1.3807, + "step": 7903 + }, + { + "epoch": 0.717111232081292, + "grad_norm": 0.13224688201667215, + "learning_rate": 0.00019559500168913414, + "loss": 1.37, + "step": 7904 + }, + { + "epoch": 0.7172019597169298, + "grad_norm": 0.1148850576785801, + "learning_rate": 0.0001954784553002838, + "loss": 1.3491, + "step": 7905 + }, + { + "epoch": 0.7172926873525676, + "grad_norm": 0.1408927496134006, + "learning_rate": 0.00019536193520691388, + "loss": 1.3507, + "step": 7906 + }, + { + "epoch": 0.7173834149882055, + "grad_norm": 0.1235366430665468, + "learning_rate": 0.00019524544141908562, + "loss": 1.3852, + "step": 7907 + }, + { + "epoch": 0.7174741426238432, + "grad_norm": 0.12426888796728104, + "learning_rate": 0.00019512897394685836, + "loss": 1.3917, + "step": 7908 + }, + { + "epoch": 0.717564870259481, + "grad_norm": 0.119666986263546, + "learning_rate": 0.00019501253280028934, + "loss": 1.3543, + "step": 7909 + }, + { + "epoch": 0.7176555978951189, + "grad_norm": 0.1200823055718611, + "learning_rate": 0.00019489611798943307, + "loss": 1.3899, + "step": 7910 + }, + { + "epoch": 0.7177463255307567, + "grad_norm": 0.12310773552197833, + "learning_rate": 0.0001947797295243418, + "loss": 1.3416, + "step": 7911 + }, + { + "epoch": 0.7178370531663945, + "grad_norm": 0.12151532606547576, + "learning_rate": 0.00019466336741506607, + "loss": 1.3332, + "step": 7912 + }, + { + "epoch": 0.7179277808020323, + "grad_norm": 0.12263572878046414, + "learning_rate": 0.0001945470316716535, + "loss": 1.3672, + "step": 7913 + }, + { + "epoch": 0.7180185084376701, + "grad_norm": 0.13558256339246286, + "learning_rate": 0.0001944307223041498, + "loss": 1.3932, + "step": 7914 + }, + { + "epoch": 0.7181092360733079, + "grad_norm": 0.11642038055057087, + "learning_rate": 0.00019431443932259795, + "loss": 1.3539, + "step": 7915 + }, + { + "epoch": 0.7181999637089458, + "grad_norm": 0.12794147965871744, + "learning_rate": 0.00019419818273703967, + "loss": 1.3797, + "step": 7916 + }, + { + "epoch": 0.7182906913445836, + "grad_norm": 0.12976251591989615, + "learning_rate": 0.0001940819525575133, + "loss": 1.3715, + "step": 7917 + }, + { + "epoch": 0.7183814189802213, + "grad_norm": 0.14106966942923285, + "learning_rate": 0.00019396574879405526, + "loss": 1.3782, + "step": 7918 + }, + { + "epoch": 0.7184721466158592, + "grad_norm": 0.12806366470456026, + "learning_rate": 0.00019384957145670014, + "loss": 1.3997, + "step": 7919 + }, + { + "epoch": 0.718562874251497, + "grad_norm": 0.12622597797312002, + "learning_rate": 0.00019373342055547965, + "loss": 1.3741, + "step": 7920 + }, + { + "epoch": 0.7186536018871348, + "grad_norm": 0.15063760202383994, + "learning_rate": 0.00019361729610042322, + "loss": 1.3251, + "step": 7921 + }, + { + "epoch": 0.7187443295227727, + "grad_norm": 0.12493730714532994, + 
"learning_rate": 0.00019350119810155865, + "loss": 1.3573, + "step": 7922 + }, + { + "epoch": 0.7188350571584105, + "grad_norm": 0.11994206757726422, + "learning_rate": 0.00019338512656891056, + "loss": 1.4003, + "step": 7923 + }, + { + "epoch": 0.7189257847940482, + "grad_norm": 0.12469818439973372, + "learning_rate": 0.00019326908151250216, + "loss": 1.3955, + "step": 7924 + }, + { + "epoch": 0.7190165124296861, + "grad_norm": 0.1327520545331851, + "learning_rate": 0.00019315306294235364, + "loss": 1.378, + "step": 7925 + }, + { + "epoch": 0.7191072400653239, + "grad_norm": 0.1197176891545929, + "learning_rate": 0.00019303707086848354, + "loss": 1.3529, + "step": 7926 + }, + { + "epoch": 0.7191979677009617, + "grad_norm": 0.12616086578613428, + "learning_rate": 0.00019292110530090757, + "loss": 1.3665, + "step": 7927 + }, + { + "epoch": 0.7192886953365996, + "grad_norm": 0.1378349115201831, + "learning_rate": 0.0001928051662496393, + "loss": 1.3627, + "step": 7928 + }, + { + "epoch": 0.7193794229722373, + "grad_norm": 0.12864031389485198, + "learning_rate": 0.0001926892537246903, + "loss": 1.3856, + "step": 7929 + }, + { + "epoch": 0.7194701506078751, + "grad_norm": 0.11691435584190686, + "learning_rate": 0.0001925733677360696, + "loss": 1.3624, + "step": 7930 + }, + { + "epoch": 0.719560878243513, + "grad_norm": 0.12885221224805543, + "learning_rate": 0.00019245750829378374, + "loss": 1.3799, + "step": 7931 + }, + { + "epoch": 0.7196516058791508, + "grad_norm": 0.15755445796471454, + "learning_rate": 0.0001923416754078373, + "loss": 1.4056, + "step": 7932 + }, + { + "epoch": 0.7197423335147886, + "grad_norm": 0.14398039834884294, + "learning_rate": 0.00019222586908823275, + "loss": 1.3594, + "step": 7933 + }, + { + "epoch": 0.7198330611504264, + "grad_norm": 0.11451946147210205, + "learning_rate": 0.00019211008934496977, + "loss": 1.3645, + "step": 7934 + }, + { + "epoch": 0.7199237887860642, + "grad_norm": 0.12902252140660808, + "learning_rate": 0.00019199433618804574, + "loss": 1.4172, + "step": 7935 + }, + { + "epoch": 0.720014516421702, + "grad_norm": 0.12549747110889206, + "learning_rate": 0.0001918786096274564, + "loss": 1.3532, + "step": 7936 + }, + { + "epoch": 0.7201052440573399, + "grad_norm": 0.12889457276373908, + "learning_rate": 0.0001917629096731945, + "loss": 1.3778, + "step": 7937 + }, + { + "epoch": 0.7201959716929777, + "grad_norm": 0.12448505020688652, + "learning_rate": 0.0001916472363352506, + "loss": 1.3539, + "step": 7938 + }, + { + "epoch": 0.7202866993286154, + "grad_norm": 0.12251893384197497, + "learning_rate": 0.00019153158962361327, + "loss": 1.3777, + "step": 7939 + }, + { + "epoch": 0.7203774269642533, + "grad_norm": 0.13028437419329295, + "learning_rate": 0.0001914159695482689, + "loss": 1.3761, + "step": 7940 + }, + { + "epoch": 0.7204681545998911, + "grad_norm": 0.12385134069573071, + "learning_rate": 0.00019130037611920098, + "loss": 1.3509, + "step": 7941 + }, + { + "epoch": 0.720558882235529, + "grad_norm": 0.128888086088559, + "learning_rate": 0.00019118480934639088, + "loss": 1.3472, + "step": 7942 + }, + { + "epoch": 0.7206496098711668, + "grad_norm": 0.12001039850226512, + "learning_rate": 0.00019106926923981822, + "loss": 1.3774, + "step": 7943 + }, + { + "epoch": 0.7207403375068046, + "grad_norm": 0.11540293130846305, + "learning_rate": 0.00019095375580945967, + "loss": 1.3279, + "step": 7944 + }, + { + "epoch": 0.7208310651424424, + "grad_norm": 0.12251337967084387, + "learning_rate": 0.00019083826906528972, + "loss": 1.3626, + "step": 7945 + 
}, + { + "epoch": 0.7209217927780802, + "grad_norm": 0.11660585272037231, + "learning_rate": 0.0001907228090172808, + "loss": 1.3928, + "step": 7946 + }, + { + "epoch": 0.721012520413718, + "grad_norm": 0.12128741689838007, + "learning_rate": 0.00019060737567540303, + "loss": 1.4147, + "step": 7947 + }, + { + "epoch": 0.7211032480493559, + "grad_norm": 0.21292825099607798, + "learning_rate": 0.00019049196904962395, + "loss": 1.3981, + "step": 7948 + }, + { + "epoch": 0.7211939756849937, + "grad_norm": 0.12796356564950973, + "learning_rate": 0.00019037658914990884, + "loss": 1.3559, + "step": 7949 + }, + { + "epoch": 0.7212847033206314, + "grad_norm": 0.1129971925686145, + "learning_rate": 0.00019026123598622107, + "loss": 1.3968, + "step": 7950 + }, + { + "epoch": 0.7213754309562693, + "grad_norm": 0.1196701772868321, + "learning_rate": 0.00019014590956852124, + "loss": 1.3749, + "step": 7951 + }, + { + "epoch": 0.7214661585919071, + "grad_norm": 0.12084492201507326, + "learning_rate": 0.00019003060990676758, + "loss": 1.3825, + "step": 7952 + }, + { + "epoch": 0.7215568862275449, + "grad_norm": 0.11662115990915525, + "learning_rate": 0.00018991533701091657, + "loss": 1.3916, + "step": 7953 + }, + { + "epoch": 0.7216476138631828, + "grad_norm": 0.11403317954458811, + "learning_rate": 0.00018980009089092183, + "loss": 1.375, + "step": 7954 + }, + { + "epoch": 0.7217383414988205, + "grad_norm": 0.11488486209280695, + "learning_rate": 0.0001896848715567351, + "loss": 1.3826, + "step": 7955 + }, + { + "epoch": 0.7218290691344583, + "grad_norm": 0.12852453445479747, + "learning_rate": 0.00018956967901830523, + "loss": 1.3869, + "step": 7956 + }, + { + "epoch": 0.7219197967700962, + "grad_norm": 0.11094798440791646, + "learning_rate": 0.0001894545132855795, + "loss": 1.3947, + "step": 7957 + }, + { + "epoch": 0.722010524405734, + "grad_norm": 0.122740895082674, + "learning_rate": 0.00018933937436850236, + "loss": 1.3526, + "step": 7958 + }, + { + "epoch": 0.7221012520413718, + "grad_norm": 0.11782390210067631, + "learning_rate": 0.00018922426227701582, + "loss": 1.3531, + "step": 7959 + }, + { + "epoch": 0.7221919796770097, + "grad_norm": 0.13011009638407817, + "learning_rate": 0.0001891091770210603, + "loss": 1.3509, + "step": 7960 + }, + { + "epoch": 0.7222827073126474, + "grad_norm": 0.148135997400728, + "learning_rate": 0.00018899411861057308, + "loss": 1.3758, + "step": 7961 + }, + { + "epoch": 0.7223734349482852, + "grad_norm": 0.10792246092411949, + "learning_rate": 0.0001888790870554894, + "loss": 1.3474, + "step": 7962 + }, + { + "epoch": 0.7224641625839231, + "grad_norm": 0.12093782349816, + "learning_rate": 0.00018876408236574238, + "loss": 1.3775, + "step": 7963 + }, + { + "epoch": 0.7225548902195609, + "grad_norm": 0.11892497881672622, + "learning_rate": 0.00018864910455126293, + "loss": 1.353, + "step": 7964 + }, + { + "epoch": 0.7226456178551987, + "grad_norm": 0.11622938063565383, + "learning_rate": 0.00018853415362197917, + "loss": 1.3748, + "step": 7965 + }, + { + "epoch": 0.7227363454908365, + "grad_norm": 0.1395091360742658, + "learning_rate": 0.000188419229587817, + "loss": 1.3771, + "step": 7966 + }, + { + "epoch": 0.7228270731264743, + "grad_norm": 0.12404668069936281, + "learning_rate": 0.00018830433245870044, + "loss": 1.3878, + "step": 7967 + }, + { + "epoch": 0.7229178007621121, + "grad_norm": 0.11826520397998437, + "learning_rate": 0.00018818946224455075, + "loss": 1.3453, + "step": 7968 + }, + { + "epoch": 0.72300852839775, + "grad_norm": 0.13073573484922432, + 
"learning_rate": 0.00018807461895528673, + "loss": 1.3802, + "step": 7969 + }, + { + "epoch": 0.7230992560333878, + "grad_norm": 0.11192042725237474, + "learning_rate": 0.00018795980260082534, + "loss": 1.3839, + "step": 7970 + }, + { + "epoch": 0.7231899836690255, + "grad_norm": 0.11267685329298628, + "learning_rate": 0.00018784501319108116, + "loss": 1.3716, + "step": 7971 + }, + { + "epoch": 0.7232807113046634, + "grad_norm": 0.11742143835301636, + "learning_rate": 0.0001877302507359661, + "loss": 1.3908, + "step": 7972 + }, + { + "epoch": 0.7233714389403012, + "grad_norm": 0.12125937260065103, + "learning_rate": 0.00018761551524538973, + "loss": 1.3279, + "step": 7973 + }, + { + "epoch": 0.723462166575939, + "grad_norm": 0.12144865541976145, + "learning_rate": 0.00018750080672925984, + "loss": 1.3998, + "step": 7974 + }, + { + "epoch": 0.7235528942115769, + "grad_norm": 0.15748850914441878, + "learning_rate": 0.0001873861251974814, + "loss": 1.4051, + "step": 7975 + }, + { + "epoch": 0.7236436218472146, + "grad_norm": 0.11651741154925756, + "learning_rate": 0.00018727147065995687, + "loss": 1.3846, + "step": 7976 + }, + { + "epoch": 0.7237343494828524, + "grad_norm": 0.11481928117309527, + "learning_rate": 0.00018715684312658697, + "loss": 1.3959, + "step": 7977 + }, + { + "epoch": 0.7238250771184903, + "grad_norm": 0.19367935627938457, + "learning_rate": 0.00018704224260726994, + "loss": 1.3785, + "step": 7978 + }, + { + "epoch": 0.7239158047541281, + "grad_norm": 0.11913025167481021, + "learning_rate": 0.00018692766911190123, + "loss": 1.365, + "step": 7979 + }, + { + "epoch": 0.724006532389766, + "grad_norm": 0.1184443267877379, + "learning_rate": 0.00018681312265037459, + "loss": 1.3887, + "step": 7980 + }, + { + "epoch": 0.7240972600254038, + "grad_norm": 0.11147264742406351, + "learning_rate": 0.00018669860323258097, + "loss": 1.3538, + "step": 7981 + }, + { + "epoch": 0.7241879876610415, + "grad_norm": 0.12319044581259778, + "learning_rate": 0.00018658411086840915, + "loss": 1.3565, + "step": 7982 + }, + { + "epoch": 0.7242787152966794, + "grad_norm": 0.1169364423235019, + "learning_rate": 0.00018646964556774537, + "loss": 1.3262, + "step": 7983 + }, + { + "epoch": 0.7243694429323172, + "grad_norm": 0.11972589326205518, + "learning_rate": 0.00018635520734047405, + "loss": 1.3599, + "step": 7984 + }, + { + "epoch": 0.724460170567955, + "grad_norm": 0.11896742508854517, + "learning_rate": 0.0001862407961964767, + "loss": 1.3717, + "step": 7985 + }, + { + "epoch": 0.7245508982035929, + "grad_norm": 0.1170442405983051, + "learning_rate": 0.0001861264121456328, + "loss": 1.334, + "step": 7986 + }, + { + "epoch": 0.7246416258392306, + "grad_norm": 0.12717594282571745, + "learning_rate": 0.00018601205519781962, + "loss": 1.3853, + "step": 7987 + }, + { + "epoch": 0.7247323534748684, + "grad_norm": 0.12648351163171925, + "learning_rate": 0.00018589772536291177, + "loss": 1.3737, + "step": 7988 + }, + { + "epoch": 0.7248230811105063, + "grad_norm": 0.11493816839232436, + "learning_rate": 0.00018578342265078147, + "loss": 1.4187, + "step": 7989 + }, + { + "epoch": 0.7249138087461441, + "grad_norm": 0.20032602879956943, + "learning_rate": 0.0001856691470712991, + "loss": 1.3746, + "step": 7990 + }, + { + "epoch": 0.7250045363817819, + "grad_norm": 0.1411738352773776, + "learning_rate": 0.00018555489863433222, + "loss": 1.3514, + "step": 7991 + }, + { + "epoch": 0.7250952640174197, + "grad_norm": 0.11650960032061448, + "learning_rate": 0.00018544067734974618, + "loss": 1.3844, + "step": 
7992 + }, + { + "epoch": 0.7251859916530575, + "grad_norm": 0.1198524226806554, + "learning_rate": 0.00018532648322740363, + "loss": 1.3727, + "step": 7993 + }, + { + "epoch": 0.7252767192886953, + "grad_norm": 0.16198631886475717, + "learning_rate": 0.000185212316277166, + "loss": 1.3605, + "step": 7994 + }, + { + "epoch": 0.7253674469243332, + "grad_norm": 0.1138388340865409, + "learning_rate": 0.00018509817650889127, + "loss": 1.3181, + "step": 7995 + }, + { + "epoch": 0.725458174559971, + "grad_norm": 0.10967954295863941, + "learning_rate": 0.0001849840639324352, + "loss": 1.3201, + "step": 7996 + }, + { + "epoch": 0.7255489021956087, + "grad_norm": 0.11840499617296121, + "learning_rate": 0.00018486997855765187, + "loss": 1.3967, + "step": 7997 + }, + { + "epoch": 0.7256396298312466, + "grad_norm": 0.12187893845826453, + "learning_rate": 0.0001847559203943923, + "loss": 1.3456, + "step": 7998 + }, + { + "epoch": 0.7257303574668844, + "grad_norm": 0.11309552687723873, + "learning_rate": 0.0001846418894525053, + "loss": 1.3809, + "step": 7999 + }, + { + "epoch": 0.7258210851025222, + "grad_norm": 0.12136201526469605, + "learning_rate": 0.00018452788574183782, + "loss": 1.3537, + "step": 8000 + }, + { + "epoch": 0.7259118127381601, + "grad_norm": 0.12172555188284484, + "learning_rate": 0.00018441390927223373, + "loss": 1.3582, + "step": 8001 + }, + { + "epoch": 0.7260025403737979, + "grad_norm": 0.12122094149008744, + "learning_rate": 0.00018429996005353522, + "loss": 1.3828, + "step": 8002 + }, + { + "epoch": 0.7260932680094356, + "grad_norm": 0.13632068929082342, + "learning_rate": 0.00018418603809558154, + "loss": 1.3684, + "step": 8003 + }, + { + "epoch": 0.7261839956450735, + "grad_norm": 0.13645144582829022, + "learning_rate": 0.00018407214340821016, + "loss": 1.3621, + "step": 8004 + }, + { + "epoch": 0.7262747232807113, + "grad_norm": 0.12153408864101072, + "learning_rate": 0.0001839582760012558, + "loss": 1.381, + "step": 8005 + }, + { + "epoch": 0.7263654509163491, + "grad_norm": 0.11703600296082226, + "learning_rate": 0.00018384443588455069, + "loss": 1.3703, + "step": 8006 + }, + { + "epoch": 0.726456178551987, + "grad_norm": 0.1432723950713307, + "learning_rate": 0.00018373062306792533, + "loss": 1.4026, + "step": 8007 + }, + { + "epoch": 0.7265469061876247, + "grad_norm": 0.12723586400810172, + "learning_rate": 0.00018361683756120724, + "loss": 1.3861, + "step": 8008 + }, + { + "epoch": 0.7266376338232625, + "grad_norm": 0.1604624420049996, + "learning_rate": 0.00018350307937422172, + "loss": 1.3284, + "step": 8009 + }, + { + "epoch": 0.7267283614589004, + "grad_norm": 0.32377231429625886, + "learning_rate": 0.00018338934851679195, + "loss": 1.3728, + "step": 8010 + }, + { + "epoch": 0.7268190890945382, + "grad_norm": 0.13133357462897113, + "learning_rate": 0.00018327564499873871, + "loss": 1.3855, + "step": 8011 + }, + { + "epoch": 0.726909816730176, + "grad_norm": 0.11791208550841989, + "learning_rate": 0.0001831619688298803, + "loss": 1.3427, + "step": 8012 + }, + { + "epoch": 0.7270005443658138, + "grad_norm": 0.14391890780222058, + "learning_rate": 0.00018304832002003236, + "loss": 1.364, + "step": 8013 + }, + { + "epoch": 0.7270912720014516, + "grad_norm": 0.175058393913203, + "learning_rate": 0.00018293469857900884, + "loss": 1.3832, + "step": 8014 + }, + { + "epoch": 0.7271819996370894, + "grad_norm": 0.135972732704368, + "learning_rate": 0.00018282110451662087, + "loss": 1.3782, + "step": 8015 + }, + { + "epoch": 0.7272727272727273, + "grad_norm": 
0.11944084389741931, + "learning_rate": 0.00018270753784267708, + "loss": 1.342, + "step": 8016 + }, + { + "epoch": 0.7273634549083651, + "grad_norm": 0.12652200921147, + "learning_rate": 0.00018259399856698416, + "loss": 1.3428, + "step": 8017 + }, + { + "epoch": 0.727454182544003, + "grad_norm": 0.12408406302081279, + "learning_rate": 0.00018248048669934646, + "loss": 1.4025, + "step": 8018 + }, + { + "epoch": 0.7275449101796407, + "grad_norm": 0.12729016167588103, + "learning_rate": 0.00018236700224956548, + "loss": 1.3772, + "step": 8019 + }, + { + "epoch": 0.7276356378152785, + "grad_norm": 0.1285059267903377, + "learning_rate": 0.00018225354522744054, + "loss": 1.3984, + "step": 8020 + }, + { + "epoch": 0.7277263654509164, + "grad_norm": 0.12373049327615321, + "learning_rate": 0.00018214011564276895, + "loss": 1.3767, + "step": 8021 + }, + { + "epoch": 0.7278170930865542, + "grad_norm": 0.13093761803951393, + "learning_rate": 0.00018202671350534523, + "loss": 1.2969, + "step": 8022 + }, + { + "epoch": 0.727907820722192, + "grad_norm": 0.12403325533457286, + "learning_rate": 0.00018191333882496146, + "loss": 1.3849, + "step": 8023 + }, + { + "epoch": 0.7279985483578298, + "grad_norm": 0.13033844590567023, + "learning_rate": 0.0001817999916114078, + "loss": 1.3783, + "step": 8024 + }, + { + "epoch": 0.7280892759934676, + "grad_norm": 0.12200867025185874, + "learning_rate": 0.0001816866718744719, + "loss": 1.3255, + "step": 8025 + }, + { + "epoch": 0.7281800036291054, + "grad_norm": 0.12881611848564914, + "learning_rate": 0.00018157337962393882, + "loss": 1.3357, + "step": 8026 + }, + { + "epoch": 0.7282707312647433, + "grad_norm": 0.12205224516954426, + "learning_rate": 0.00018146011486959118, + "loss": 1.3654, + "step": 8027 + }, + { + "epoch": 0.7283614589003811, + "grad_norm": 0.12724966139362381, + "learning_rate": 0.0001813468776212097, + "loss": 1.3576, + "step": 8028 + }, + { + "epoch": 0.7284521865360188, + "grad_norm": 0.12036624571161955, + "learning_rate": 0.00018123366788857232, + "loss": 1.4017, + "step": 8029 + }, + { + "epoch": 0.7285429141716567, + "grad_norm": 0.1298843783774848, + "learning_rate": 0.00018112048568145455, + "loss": 1.3518, + "step": 8030 + }, + { + "epoch": 0.7286336418072945, + "grad_norm": 0.11861677787615676, + "learning_rate": 0.00018100733100963, + "loss": 1.3945, + "step": 8031 + }, + { + "epoch": 0.7287243694429323, + "grad_norm": 0.1474888010925239, + "learning_rate": 0.00018089420388286927, + "loss": 1.361, + "step": 8032 + }, + { + "epoch": 0.7288150970785702, + "grad_norm": 0.14652536351008558, + "learning_rate": 0.00018078110431094123, + "loss": 1.3696, + "step": 8033 + }, + { + "epoch": 0.728905824714208, + "grad_norm": 0.1643364133454942, + "learning_rate": 0.00018066803230361173, + "loss": 1.3758, + "step": 8034 + }, + { + "epoch": 0.7289965523498457, + "grad_norm": 0.12575165000330382, + "learning_rate": 0.00018055498787064483, + "loss": 1.377, + "step": 8035 + }, + { + "epoch": 0.7290872799854836, + "grad_norm": 0.11295514249345265, + "learning_rate": 0.00018044197102180183, + "loss": 1.3442, + "step": 8036 + }, + { + "epoch": 0.7291780076211214, + "grad_norm": 0.13062247498237398, + "learning_rate": 0.00018032898176684154, + "loss": 1.3344, + "step": 8037 + }, + { + "epoch": 0.7292687352567592, + "grad_norm": 0.12379287477192653, + "learning_rate": 0.000180216020115521, + "loss": 1.3767, + "step": 8038 + }, + { + "epoch": 0.7293594628923971, + "grad_norm": 0.17090928547020284, + "learning_rate": 0.00018010308607759422, + "loss": 
1.3818, + "step": 8039 + }, + { + "epoch": 0.7294501905280348, + "grad_norm": 0.12509171633170818, + "learning_rate": 0.00017999017966281294, + "loss": 1.3803, + "step": 8040 + }, + { + "epoch": 0.7295409181636726, + "grad_norm": 0.11973988489194864, + "learning_rate": 0.00017987730088092686, + "loss": 1.3806, + "step": 8041 + }, + { + "epoch": 0.7296316457993105, + "grad_norm": 0.1315583468131242, + "learning_rate": 0.00017976444974168317, + "loss": 1.3989, + "step": 8042 + }, + { + "epoch": 0.7297223734349483, + "grad_norm": 0.13007681683472186, + "learning_rate": 0.0001796516262548264, + "loss": 1.3548, + "step": 8043 + }, + { + "epoch": 0.7298131010705861, + "grad_norm": 0.12879640773262885, + "learning_rate": 0.00017953883043009876, + "loss": 1.3711, + "step": 8044 + }, + { + "epoch": 0.729903828706224, + "grad_norm": 0.14716306496261874, + "learning_rate": 0.00017942606227724057, + "loss": 1.3783, + "step": 8045 + }, + { + "epoch": 0.7299945563418617, + "grad_norm": 0.1300587443159047, + "learning_rate": 0.0001793133218059891, + "loss": 1.3743, + "step": 8046 + }, + { + "epoch": 0.7300852839774995, + "grad_norm": 0.14623694337281248, + "learning_rate": 0.00017920060902607937, + "loss": 1.3667, + "step": 8047 + }, + { + "epoch": 0.7301760116131374, + "grad_norm": 0.1256637840515656, + "learning_rate": 0.00017908792394724437, + "loss": 1.3684, + "step": 8048 + }, + { + "epoch": 0.7302667392487752, + "grad_norm": 0.12925155629662202, + "learning_rate": 0.0001789752665792146, + "loss": 1.3451, + "step": 8049 + }, + { + "epoch": 0.7303574668844129, + "grad_norm": 0.12973180102117352, + "learning_rate": 0.00017886263693171788, + "loss": 1.3672, + "step": 8050 + }, + { + "epoch": 0.7304481945200508, + "grad_norm": 0.16907974251686847, + "learning_rate": 0.0001787500350144796, + "loss": 1.3655, + "step": 8051 + }, + { + "epoch": 0.7305389221556886, + "grad_norm": 0.2715235434248744, + "learning_rate": 0.00017863746083722338, + "loss": 1.4118, + "step": 8052 + }, + { + "epoch": 0.7306296497913264, + "grad_norm": 0.12412798177094045, + "learning_rate": 0.00017852491440966973, + "loss": 1.3951, + "step": 8053 + }, + { + "epoch": 0.7307203774269643, + "grad_norm": 0.1272319045571887, + "learning_rate": 0.0001784123957415369, + "loss": 1.4193, + "step": 8054 + }, + { + "epoch": 0.730811105062602, + "grad_norm": 0.12489435143905696, + "learning_rate": 0.0001782999048425411, + "loss": 1.3833, + "step": 8055 + }, + { + "epoch": 0.7309018326982398, + "grad_norm": 0.12579213972360107, + "learning_rate": 0.00017818744172239616, + "loss": 1.3659, + "step": 8056 + }, + { + "epoch": 0.7309925603338777, + "grad_norm": 0.24379297202064437, + "learning_rate": 0.00017807500639081288, + "loss": 1.3085, + "step": 8057 + }, + { + "epoch": 0.7310832879695155, + "grad_norm": 0.11949905070700201, + "learning_rate": 0.00017796259885750033, + "loss": 1.3843, + "step": 8058 + }, + { + "epoch": 0.7311740156051534, + "grad_norm": 0.1302688347564087, + "learning_rate": 0.00017785021913216493, + "loss": 1.4121, + "step": 8059 + }, + { + "epoch": 0.7312647432407912, + "grad_norm": 0.1302889877651563, + "learning_rate": 0.00017773786722451052, + "loss": 1.4016, + "step": 8060 + }, + { + "epoch": 0.7313554708764289, + "grad_norm": 0.13839465432920672, + "learning_rate": 0.00017762554314423862, + "loss": 1.3695, + "step": 8061 + }, + { + "epoch": 0.7314461985120668, + "grad_norm": 0.13018114869493352, + "learning_rate": 0.00017751324690104875, + "loss": 1.3696, + "step": 8062 + }, + { + "epoch": 0.7315369261477046, + 
"grad_norm": 0.12022176786813918, + "learning_rate": 0.00017740097850463737, + "loss": 1.3695, + "step": 8063 + }, + { + "epoch": 0.7316276537833424, + "grad_norm": 0.12164315920406148, + "learning_rate": 0.0001772887379646991, + "loss": 1.3653, + "step": 8064 + }, + { + "epoch": 0.7317183814189803, + "grad_norm": 0.1352947890271567, + "learning_rate": 0.000177176525290926, + "loss": 1.343, + "step": 8065 + }, + { + "epoch": 0.731809109054618, + "grad_norm": 0.1204631327790889, + "learning_rate": 0.0001770643404930075, + "loss": 1.3653, + "step": 8066 + }, + { + "epoch": 0.7318998366902558, + "grad_norm": 0.12364592184437467, + "learning_rate": 0.00017695218358063065, + "loss": 1.3418, + "step": 8067 + }, + { + "epoch": 0.7319905643258937, + "grad_norm": 0.11785288480415763, + "learning_rate": 0.00017684005456348052, + "loss": 1.3963, + "step": 8068 + }, + { + "epoch": 0.7320812919615315, + "grad_norm": 0.13152484877930273, + "learning_rate": 0.00017672795345123927, + "loss": 1.3557, + "step": 8069 + }, + { + "epoch": 0.7321720195971693, + "grad_norm": 0.1187872573850378, + "learning_rate": 0.00017661588025358693, + "loss": 1.3939, + "step": 8070 + }, + { + "epoch": 0.7322627472328072, + "grad_norm": 0.13373810185156815, + "learning_rate": 0.00017650383498020068, + "loss": 1.3592, + "step": 8071 + }, + { + "epoch": 0.7323534748684449, + "grad_norm": 0.11964658186402259, + "learning_rate": 0.00017639181764075623, + "loss": 1.3655, + "step": 8072 + }, + { + "epoch": 0.7324442025040827, + "grad_norm": 0.12013144906215555, + "learning_rate": 0.00017627982824492606, + "loss": 1.3587, + "step": 8073 + }, + { + "epoch": 0.7325349301397206, + "grad_norm": 0.11976656510955487, + "learning_rate": 0.00017616786680238022, + "loss": 1.3593, + "step": 8074 + }, + { + "epoch": 0.7326256577753584, + "grad_norm": 0.11728435560042576, + "learning_rate": 0.00017605593332278702, + "loss": 1.4066, + "step": 8075 + }, + { + "epoch": 0.7327163854109962, + "grad_norm": 0.13677084389216826, + "learning_rate": 0.00017594402781581176, + "loss": 1.3672, + "step": 8076 + }, + { + "epoch": 0.732807113046634, + "grad_norm": 0.1317175010549208, + "learning_rate": 0.0001758321502911172, + "loss": 1.3134, + "step": 8077 + }, + { + "epoch": 0.7328978406822718, + "grad_norm": 0.11527875331257176, + "learning_rate": 0.00017572030075836448, + "loss": 1.3577, + "step": 8078 + }, + { + "epoch": 0.7329885683179096, + "grad_norm": 0.11292673515730675, + "learning_rate": 0.0001756084792272114, + "loss": 1.3543, + "step": 8079 + }, + { + "epoch": 0.7330792959535475, + "grad_norm": 0.11426126019280303, + "learning_rate": 0.0001754966857073141, + "loss": 1.3665, + "step": 8080 + }, + { + "epoch": 0.7331700235891853, + "grad_norm": 0.12027784100782907, + "learning_rate": 0.00017538492020832564, + "loss": 1.3793, + "step": 8081 + }, + { + "epoch": 0.733260751224823, + "grad_norm": 0.14309220636130537, + "learning_rate": 0.00017527318273989734, + "loss": 1.3336, + "step": 8082 + }, + { + "epoch": 0.7333514788604609, + "grad_norm": 0.1195906540274047, + "learning_rate": 0.0001751614733116776, + "loss": 1.3593, + "step": 8083 + }, + { + "epoch": 0.7334422064960987, + "grad_norm": 0.12383917352604967, + "learning_rate": 0.00017504979193331234, + "loss": 1.3779, + "step": 8084 + }, + { + "epoch": 0.7335329341317365, + "grad_norm": 0.11175167694169816, + "learning_rate": 0.00017493813861444558, + "loss": 1.3426, + "step": 8085 + }, + { + "epoch": 0.7336236617673744, + "grad_norm": 0.2269393860843884, + "learning_rate": 0.00017482651336471843, 
+ "loss": 1.3788, + "step": 8086 + }, + { + "epoch": 0.7337143894030121, + "grad_norm": 0.10975728925489792, + "learning_rate": 0.00017471491619376966, + "loss": 1.3771, + "step": 8087 + }, + { + "epoch": 0.7338051170386499, + "grad_norm": 0.1305340544182608, + "learning_rate": 0.0001746033471112358, + "loss": 1.3404, + "step": 8088 + }, + { + "epoch": 0.7338958446742878, + "grad_norm": 0.11486330744181866, + "learning_rate": 0.00017449180612675108, + "loss": 1.3181, + "step": 8089 + }, + { + "epoch": 0.7339865723099256, + "grad_norm": 0.11754254835017992, + "learning_rate": 0.00017438029324994686, + "loss": 1.3521, + "step": 8090 + }, + { + "epoch": 0.7340772999455634, + "grad_norm": 0.14633667945595924, + "learning_rate": 0.00017426880849045212, + "loss": 1.3683, + "step": 8091 + }, + { + "epoch": 0.7341680275812013, + "grad_norm": 0.11749095354533873, + "learning_rate": 0.00017415735185789393, + "loss": 1.4001, + "step": 8092 + }, + { + "epoch": 0.734258755216839, + "grad_norm": 0.1528467496726807, + "learning_rate": 0.00017404592336189644, + "loss": 1.401, + "step": 8093 + }, + { + "epoch": 0.7343494828524768, + "grad_norm": 0.11083959002330905, + "learning_rate": 0.0001739345230120814, + "loss": 1.3583, + "step": 8094 + }, + { + "epoch": 0.7344402104881147, + "grad_norm": 0.12083589585267289, + "learning_rate": 0.00017382315081806833, + "loss": 1.375, + "step": 8095 + }, + { + "epoch": 0.7345309381237525, + "grad_norm": 0.1559167154756895, + "learning_rate": 0.00017371180678947434, + "loss": 1.3675, + "step": 8096 + }, + { + "epoch": 0.7346216657593904, + "grad_norm": 0.11125943588344606, + "learning_rate": 0.000173600490935914, + "loss": 1.3554, + "step": 8097 + }, + { + "epoch": 0.7347123933950281, + "grad_norm": 0.11420297847760318, + "learning_rate": 0.00017348920326699924, + "loss": 1.3598, + "step": 8098 + }, + { + "epoch": 0.7348031210306659, + "grad_norm": 0.12034487562696308, + "learning_rate": 0.00017337794379234002, + "loss": 1.4032, + "step": 8099 + }, + { + "epoch": 0.7348938486663038, + "grad_norm": 0.12084557150050547, + "learning_rate": 0.00017326671252154347, + "loss": 1.3424, + "step": 8100 + }, + { + "epoch": 0.7349845763019416, + "grad_norm": 0.12497187569477641, + "learning_rate": 0.0001731555094642143, + "loss": 1.3823, + "step": 8101 + }, + { + "epoch": 0.7350753039375794, + "grad_norm": 0.11480639956293771, + "learning_rate": 0.0001730443346299551, + "loss": 1.3728, + "step": 8102 + }, + { + "epoch": 0.7351660315732172, + "grad_norm": 0.11242283106308708, + "learning_rate": 0.00017293318802836594, + "loss": 1.3525, + "step": 8103 + }, + { + "epoch": 0.735256759208855, + "grad_norm": 0.1713056101673244, + "learning_rate": 0.00017282206966904418, + "loss": 1.3634, + "step": 8104 + }, + { + "epoch": 0.7353474868444928, + "grad_norm": 0.11693300276056078, + "learning_rate": 0.00017271097956158478, + "loss": 1.4032, + "step": 8105 + }, + { + "epoch": 0.7354382144801307, + "grad_norm": 0.11334991291471076, + "learning_rate": 0.00017259991771558064, + "loss": 1.3669, + "step": 8106 + }, + { + "epoch": 0.7355289421157685, + "grad_norm": 0.11106563436390982, + "learning_rate": 0.00017248888414062193, + "loss": 1.3861, + "step": 8107 + }, + { + "epoch": 0.7356196697514062, + "grad_norm": 0.1162656941832259, + "learning_rate": 0.00017237787884629615, + "loss": 1.3902, + "step": 8108 + }, + { + "epoch": 0.7357103973870441, + "grad_norm": 0.1133683969133566, + "learning_rate": 0.00017226690184218897, + "loss": 1.3708, + "step": 8109 + }, + { + "epoch": 0.7358011250226819, + 
"grad_norm": 0.11366949873043435, + "learning_rate": 0.00017215595313788303, + "loss": 1.3486, + "step": 8110 + }, + { + "epoch": 0.7358918526583197, + "grad_norm": 0.1448469988046156, + "learning_rate": 0.0001720450327429589, + "loss": 1.3718, + "step": 8111 + }, + { + "epoch": 0.7359825802939576, + "grad_norm": 0.11442763046529612, + "learning_rate": 0.0001719341406669944, + "loss": 1.3419, + "step": 8112 + }, + { + "epoch": 0.7360733079295954, + "grad_norm": 0.11933794023237937, + "learning_rate": 0.00017182327691956544, + "loss": 1.3353, + "step": 8113 + }, + { + "epoch": 0.7361640355652331, + "grad_norm": 0.11855233404628149, + "learning_rate": 0.00017171244151024484, + "loss": 1.3645, + "step": 8114 + }, + { + "epoch": 0.736254763200871, + "grad_norm": 0.11028757533543394, + "learning_rate": 0.00017160163444860315, + "loss": 1.3418, + "step": 8115 + }, + { + "epoch": 0.7363454908365088, + "grad_norm": 0.11326186491065805, + "learning_rate": 0.00017149085574420887, + "loss": 1.366, + "step": 8116 + }, + { + "epoch": 0.7364362184721466, + "grad_norm": 0.11460654333525937, + "learning_rate": 0.00017138010540662763, + "loss": 1.3453, + "step": 8117 + }, + { + "epoch": 0.7365269461077845, + "grad_norm": 0.11667820555805977, + "learning_rate": 0.00017126938344542254, + "loss": 1.3755, + "step": 8118 + }, + { + "epoch": 0.7366176737434222, + "grad_norm": 0.10842578885758768, + "learning_rate": 0.00017115868987015466, + "loss": 1.3583, + "step": 8119 + }, + { + "epoch": 0.73670840137906, + "grad_norm": 0.11952153718755519, + "learning_rate": 0.0001710480246903825, + "loss": 1.3812, + "step": 8120 + }, + { + "epoch": 0.7367991290146979, + "grad_norm": 0.12335637183751542, + "learning_rate": 0.00017093738791566193, + "loss": 1.3412, + "step": 8121 + }, + { + "epoch": 0.7368898566503357, + "grad_norm": 0.12070935944000183, + "learning_rate": 0.00017082677955554626, + "loss": 1.3869, + "step": 8122 + }, + { + "epoch": 0.7369805842859735, + "grad_norm": 0.11365720921642274, + "learning_rate": 0.0001707161996195868, + "loss": 1.3797, + "step": 8123 + }, + { + "epoch": 0.7370713119216113, + "grad_norm": 0.12480033042210965, + "learning_rate": 0.0001706056481173321, + "loss": 1.3841, + "step": 8124 + }, + { + "epoch": 0.7371620395572491, + "grad_norm": 0.15315069612457566, + "learning_rate": 0.00017049512505832798, + "loss": 1.4206, + "step": 8125 + }, + { + "epoch": 0.7372527671928869, + "grad_norm": 0.12956297192783472, + "learning_rate": 0.00017038463045211845, + "loss": 1.3529, + "step": 8126 + }, + { + "epoch": 0.7373434948285248, + "grad_norm": 0.11578478242270443, + "learning_rate": 0.00017027416430824482, + "loss": 1.3596, + "step": 8127 + }, + { + "epoch": 0.7374342224641626, + "grad_norm": 0.11380711092911093, + "learning_rate": 0.0001701637266362457, + "loss": 1.3673, + "step": 8128 + }, + { + "epoch": 0.7375249500998003, + "grad_norm": 0.112242838742281, + "learning_rate": 0.00017005331744565722, + "loss": 1.3424, + "step": 8129 + }, + { + "epoch": 0.7376156777354382, + "grad_norm": 0.12151701452524144, + "learning_rate": 0.00016994293674601352, + "loss": 1.3555, + "step": 8130 + }, + { + "epoch": 0.737706405371076, + "grad_norm": 0.10731970734489978, + "learning_rate": 0.00016983258454684598, + "loss": 1.3533, + "step": 8131 + }, + { + "epoch": 0.7377971330067138, + "grad_norm": 0.11512281196065591, + "learning_rate": 0.00016972226085768316, + "loss": 1.353, + "step": 8132 + }, + { + "epoch": 0.7378878606423517, + "grad_norm": 0.11840432647102742, + "learning_rate": 
0.00016961196568805186, + "loss": 1.3648, + "step": 8133 + }, + { + "epoch": 0.7379785882779895, + "grad_norm": 0.1193426690304326, + "learning_rate": 0.00016950169904747614, + "loss": 1.3565, + "step": 8134 + }, + { + "epoch": 0.7380693159136273, + "grad_norm": 0.11135595519859968, + "learning_rate": 0.0001693914609454773, + "loss": 1.3195, + "step": 8135 + }, + { + "epoch": 0.7381600435492651, + "grad_norm": 0.12049585861250378, + "learning_rate": 0.00016928125139157468, + "loss": 1.3506, + "step": 8136 + }, + { + "epoch": 0.7382507711849029, + "grad_norm": 0.12440694711824787, + "learning_rate": 0.00016917107039528473, + "loss": 1.3891, + "step": 8137 + }, + { + "epoch": 0.7383414988205408, + "grad_norm": 0.13569130610331395, + "learning_rate": 0.0001690609179661216, + "loss": 1.3441, + "step": 8138 + }, + { + "epoch": 0.7384322264561786, + "grad_norm": 0.13010660582571892, + "learning_rate": 0.00016895079411359686, + "loss": 1.4007, + "step": 8139 + }, + { + "epoch": 0.7385229540918163, + "grad_norm": 0.11892160244373333, + "learning_rate": 0.00016884069884721996, + "loss": 1.3655, + "step": 8140 + }, + { + "epoch": 0.7386136817274542, + "grad_norm": 0.1178806951342685, + "learning_rate": 0.00016873063217649737, + "loss": 1.3453, + "step": 8141 + }, + { + "epoch": 0.738704409363092, + "grad_norm": 0.11801225017036032, + "learning_rate": 0.00016862059411093357, + "loss": 1.3705, + "step": 8142 + }, + { + "epoch": 0.7387951369987298, + "grad_norm": 0.11808898408703489, + "learning_rate": 0.00016851058466003038, + "loss": 1.3475, + "step": 8143 + }, + { + "epoch": 0.7388858646343677, + "grad_norm": 0.11339419607173223, + "learning_rate": 0.00016840060383328714, + "loss": 1.4027, + "step": 8144 + }, + { + "epoch": 0.7389765922700055, + "grad_norm": 0.12050633899819962, + "learning_rate": 0.0001682906516402004, + "loss": 1.3507, + "step": 8145 + }, + { + "epoch": 0.7390673199056432, + "grad_norm": 0.12276039090557697, + "learning_rate": 0.00016818072809026503, + "loss": 1.3609, + "step": 8146 + }, + { + "epoch": 0.7391580475412811, + "grad_norm": 0.11254156934707892, + "learning_rate": 0.0001680708331929726, + "loss": 1.3348, + "step": 8147 + }, + { + "epoch": 0.7392487751769189, + "grad_norm": 0.1131893420196048, + "learning_rate": 0.00016796096695781271, + "loss": 1.3529, + "step": 8148 + }, + { + "epoch": 0.7393395028125567, + "grad_norm": 0.11745563469320314, + "learning_rate": 0.00016785112939427188, + "loss": 1.3745, + "step": 8149 + }, + { + "epoch": 0.7394302304481946, + "grad_norm": 0.11825760230761474, + "learning_rate": 0.0001677413205118354, + "loss": 1.3626, + "step": 8150 + }, + { + "epoch": 0.7395209580838323, + "grad_norm": 0.1250279930885926, + "learning_rate": 0.00016763154031998478, + "loss": 1.3881, + "step": 8151 + }, + { + "epoch": 0.7396116857194701, + "grad_norm": 0.13301371187940514, + "learning_rate": 0.00016752178882819947, + "loss": 1.3565, + "step": 8152 + }, + { + "epoch": 0.739702413355108, + "grad_norm": 0.11971027996875433, + "learning_rate": 0.00016741206604595687, + "loss": 1.3722, + "step": 8153 + }, + { + "epoch": 0.7397931409907458, + "grad_norm": 0.11545771684816723, + "learning_rate": 0.00016730237198273146, + "loss": 1.3364, + "step": 8154 + }, + { + "epoch": 0.7398838686263836, + "grad_norm": 0.14486870321372824, + "learning_rate": 0.000167192706647995, + "loss": 1.3351, + "step": 8155 + }, + { + "epoch": 0.7399745962620214, + "grad_norm": 0.11928012731613034, + "learning_rate": 0.0001670830700512176, + "loss": 1.34, + "step": 8156 + }, + { + 
"epoch": 0.7400653238976592, + "grad_norm": 0.11569574740106955, + "learning_rate": 0.00016697346220186598, + "loss": 1.373, + "step": 8157 + }, + { + "epoch": 0.740156051533297, + "grad_norm": 0.14121577255058013, + "learning_rate": 0.00016686388310940513, + "loss": 1.3812, + "step": 8158 + }, + { + "epoch": 0.7402467791689349, + "grad_norm": 0.12682163396401863, + "learning_rate": 0.00016675433278329693, + "loss": 1.3633, + "step": 8159 + }, + { + "epoch": 0.7403375068045727, + "grad_norm": 0.1279409385187428, + "learning_rate": 0.0001666448112330014, + "loss": 1.3665, + "step": 8160 + }, + { + "epoch": 0.7404282344402104, + "grad_norm": 0.15606333426532432, + "learning_rate": 0.00016653531846797553, + "loss": 1.3901, + "step": 8161 + }, + { + "epoch": 0.7405189620758483, + "grad_norm": 0.12234637337383078, + "learning_rate": 0.0001664258544976739, + "loss": 1.3813, + "step": 8162 + }, + { + "epoch": 0.7406096897114861, + "grad_norm": 0.12378781404832259, + "learning_rate": 0.00016631641933154902, + "loss": 1.3543, + "step": 8163 + }, + { + "epoch": 0.7407004173471239, + "grad_norm": 0.11904356975530385, + "learning_rate": 0.00016620701297905056, + "loss": 1.3691, + "step": 8164 + }, + { + "epoch": 0.7407911449827618, + "grad_norm": 0.11949036619643152, + "learning_rate": 0.00016609763544962551, + "loss": 1.3875, + "step": 8165 + }, + { + "epoch": 0.7408818726183996, + "grad_norm": 0.1256104078307412, + "learning_rate": 0.00016598828675271892, + "loss": 1.3594, + "step": 8166 + }, + { + "epoch": 0.7409726002540373, + "grad_norm": 0.12479119606786425, + "learning_rate": 0.0001658789668977731, + "loss": 1.3918, + "step": 8167 + }, + { + "epoch": 0.7410633278896752, + "grad_norm": 0.11903783322650804, + "learning_rate": 0.00016576967589422776, + "loss": 1.3656, + "step": 8168 + }, + { + "epoch": 0.741154055525313, + "grad_norm": 0.12467017141856764, + "learning_rate": 0.00016566041375152003, + "loss": 1.3265, + "step": 8169 + }, + { + "epoch": 0.7412447831609508, + "grad_norm": 0.11252815787333431, + "learning_rate": 0.00016555118047908497, + "loss": 1.3414, + "step": 8170 + }, + { + "epoch": 0.7413355107965887, + "grad_norm": 0.11464805894888457, + "learning_rate": 0.00016544197608635476, + "loss": 1.3751, + "step": 8171 + }, + { + "epoch": 0.7414262384322264, + "grad_norm": 0.11849855895291628, + "learning_rate": 0.00016533280058275907, + "loss": 1.3767, + "step": 8172 + }, + { + "epoch": 0.7415169660678643, + "grad_norm": 0.1438504126526085, + "learning_rate": 0.00016522365397772542, + "loss": 1.3666, + "step": 8173 + }, + { + "epoch": 0.7416076937035021, + "grad_norm": 0.1756764598187242, + "learning_rate": 0.0001651145362806787, + "loss": 1.3467, + "step": 8174 + }, + { + "epoch": 0.7416984213391399, + "grad_norm": 0.12868427086657883, + "learning_rate": 0.00016500544750104118, + "loss": 1.3767, + "step": 8175 + }, + { + "epoch": 0.7417891489747778, + "grad_norm": 0.11742870379212818, + "learning_rate": 0.00016489638764823246, + "loss": 1.3866, + "step": 8176 + }, + { + "epoch": 0.7418798766104155, + "grad_norm": 0.11483215196919362, + "learning_rate": 0.00016478735673167017, + "loss": 1.3578, + "step": 8177 + }, + { + "epoch": 0.7419706042460533, + "grad_norm": 0.11332798137715928, + "learning_rate": 0.0001646783547607691, + "loss": 1.3437, + "step": 8178 + }, + { + "epoch": 0.7420613318816912, + "grad_norm": 0.11841717028890728, + "learning_rate": 0.00016456938174494128, + "loss": 1.3885, + "step": 8179 + }, + { + "epoch": 0.742152059517329, + "grad_norm": 0.10994627497663549, + 
"learning_rate": 0.00016446043769359681, + "loss": 1.3927, + "step": 8180 + }, + { + "epoch": 0.7422427871529668, + "grad_norm": 0.11317027452200124, + "learning_rate": 0.00016435152261614323, + "loss": 1.3601, + "step": 8181 + }, + { + "epoch": 0.7423335147886047, + "grad_norm": 0.1144815381331221, + "learning_rate": 0.00016424263652198507, + "loss": 1.3913, + "step": 8182 + }, + { + "epoch": 0.7424242424242424, + "grad_norm": 0.11512769910842201, + "learning_rate": 0.0001641337794205246, + "loss": 1.4191, + "step": 8183 + }, + { + "epoch": 0.7425149700598802, + "grad_norm": 0.10929393035606301, + "learning_rate": 0.0001640249513211619, + "loss": 1.3709, + "step": 8184 + }, + { + "epoch": 0.7426056976955181, + "grad_norm": 0.11702256049301667, + "learning_rate": 0.0001639161522332942, + "loss": 1.3559, + "step": 8185 + }, + { + "epoch": 0.7426964253311559, + "grad_norm": 0.12364491847696504, + "learning_rate": 0.00016380738216631614, + "loss": 1.3597, + "step": 8186 + }, + { + "epoch": 0.7427871529667937, + "grad_norm": 0.12032160746773127, + "learning_rate": 0.0001636986411296203, + "loss": 1.3519, + "step": 8187 + }, + { + "epoch": 0.7428778806024315, + "grad_norm": 0.11147912428987043, + "learning_rate": 0.00016358992913259623, + "loss": 1.3713, + "step": 8188 + }, + { + "epoch": 0.7429686082380693, + "grad_norm": 0.12260581472614383, + "learning_rate": 0.00016348124618463156, + "loss": 1.3509, + "step": 8189 + }, + { + "epoch": 0.7430593358737071, + "grad_norm": 0.12081474303712664, + "learning_rate": 0.00016337259229511064, + "loss": 1.3626, + "step": 8190 + }, + { + "epoch": 0.743150063509345, + "grad_norm": 0.11224552861404422, + "learning_rate": 0.0001632639674734162, + "loss": 1.3587, + "step": 8191 + }, + { + "epoch": 0.7432407911449828, + "grad_norm": 0.12393831824132129, + "learning_rate": 0.0001631553717289278, + "loss": 1.3705, + "step": 8192 + }, + { + "epoch": 0.7433315187806205, + "grad_norm": 0.11982375140704268, + "learning_rate": 0.00016304680507102254, + "loss": 1.3631, + "step": 8193 + }, + { + "epoch": 0.7434222464162584, + "grad_norm": 0.12938782429779233, + "learning_rate": 0.00016293826750907547, + "loss": 1.3566, + "step": 8194 + }, + { + "epoch": 0.7435129740518962, + "grad_norm": 0.1052562479669121, + "learning_rate": 0.00016282975905245866, + "loss": 1.3499, + "step": 8195 + }, + { + "epoch": 0.743603701687534, + "grad_norm": 0.11450244400163138, + "learning_rate": 0.00016272127971054173, + "loss": 1.37, + "step": 8196 + }, + { + "epoch": 0.7436944293231719, + "grad_norm": 0.11731910025343249, + "learning_rate": 0.00016261282949269195, + "loss": 1.345, + "step": 8197 + }, + { + "epoch": 0.7437851569588096, + "grad_norm": 0.12138883712703795, + "learning_rate": 0.0001625044084082743, + "loss": 1.3417, + "step": 8198 + }, + { + "epoch": 0.7438758845944474, + "grad_norm": 0.12087806946866939, + "learning_rate": 0.0001623960164666507, + "loss": 1.3409, + "step": 8199 + }, + { + "epoch": 0.7439666122300853, + "grad_norm": 0.11165477254861328, + "learning_rate": 0.00016228765367718063, + "loss": 1.3717, + "step": 8200 + }, + { + "epoch": 0.7440573398657231, + "grad_norm": 0.11739218339652356, + "learning_rate": 0.00016217932004922164, + "loss": 1.3891, + "step": 8201 + }, + { + "epoch": 0.7441480675013609, + "grad_norm": 0.11592845905663694, + "learning_rate": 0.00016207101559212816, + "loss": 1.3489, + "step": 8202 + }, + { + "epoch": 0.7442387951369988, + "grad_norm": 0.1140890545625888, + "learning_rate": 0.00016196274031525209, + "loss": 1.3881, + "step": 8203 
+ }, + { + "epoch": 0.7443295227726365, + "grad_norm": 0.12945620423443663, + "learning_rate": 0.0001618544942279433, + "loss": 1.3641, + "step": 8204 + }, + { + "epoch": 0.7444202504082743, + "grad_norm": 0.1249629679689283, + "learning_rate": 0.00016174627733954883, + "loss": 1.3781, + "step": 8205 + }, + { + "epoch": 0.7445109780439122, + "grad_norm": 0.11592428102253592, + "learning_rate": 0.00016163808965941322, + "loss": 1.3817, + "step": 8206 + }, + { + "epoch": 0.74460170567955, + "grad_norm": 0.111470115378035, + "learning_rate": 0.00016152993119687825, + "loss": 1.3434, + "step": 8207 + }, + { + "epoch": 0.7446924333151878, + "grad_norm": 0.1131284848585383, + "learning_rate": 0.0001614218019612838, + "loss": 1.384, + "step": 8208 + }, + { + "epoch": 0.7447831609508256, + "grad_norm": 0.12232277675243092, + "learning_rate": 0.00016131370196196664, + "loss": 1.3777, + "step": 8209 + }, + { + "epoch": 0.7448738885864634, + "grad_norm": 0.16094778008154434, + "learning_rate": 0.00016120563120826105, + "loss": 1.385, + "step": 8210 + }, + { + "epoch": 0.7449646162221013, + "grad_norm": 0.1164196949986277, + "learning_rate": 0.00016109758970949916, + "loss": 1.3726, + "step": 8211 + }, + { + "epoch": 0.7450553438577391, + "grad_norm": 0.11868777484223313, + "learning_rate": 0.00016098957747501052, + "loss": 1.4179, + "step": 8212 + }, + { + "epoch": 0.7451460714933769, + "grad_norm": 0.13001155200485684, + "learning_rate": 0.00016088159451412164, + "loss": 1.3483, + "step": 8213 + }, + { + "epoch": 0.7452367991290147, + "grad_norm": 0.1747573755774565, + "learning_rate": 0.00016077364083615727, + "loss": 1.3358, + "step": 8214 + }, + { + "epoch": 0.7453275267646525, + "grad_norm": 0.1252638601942285, + "learning_rate": 0.00016066571645043893, + "loss": 1.3518, + "step": 8215 + }, + { + "epoch": 0.7454182544002903, + "grad_norm": 0.12247580328928416, + "learning_rate": 0.00016055782136628605, + "loss": 1.3639, + "step": 8216 + }, + { + "epoch": 0.7455089820359282, + "grad_norm": 0.11741253030775457, + "learning_rate": 0.00016044995559301513, + "loss": 1.4054, + "step": 8217 + }, + { + "epoch": 0.745599709671566, + "grad_norm": 0.12128286398343456, + "learning_rate": 0.00016034211913994073, + "loss": 1.4115, + "step": 8218 + }, + { + "epoch": 0.7456904373072037, + "grad_norm": 0.11535703357931157, + "learning_rate": 0.00016023431201637428, + "loss": 1.3566, + "step": 8219 + }, + { + "epoch": 0.7457811649428416, + "grad_norm": 0.11526366249481389, + "learning_rate": 0.00016012653423162505, + "loss": 1.3615, + "step": 8220 + }, + { + "epoch": 0.7458718925784794, + "grad_norm": 0.1486896448364398, + "learning_rate": 0.00016001878579499984, + "loss": 1.3656, + "step": 8221 + }, + { + "epoch": 0.7459626202141172, + "grad_norm": 0.11671615390679994, + "learning_rate": 0.00015991106671580264, + "loss": 1.3843, + "step": 8222 + }, + { + "epoch": 0.7460533478497551, + "grad_norm": 0.15199784474346179, + "learning_rate": 0.0001598033770033347, + "loss": 1.3758, + "step": 8223 + }, + { + "epoch": 0.7461440754853929, + "grad_norm": 0.11783027421378389, + "learning_rate": 0.00015969571666689554, + "loss": 1.3632, + "step": 8224 + }, + { + "epoch": 0.7462348031210306, + "grad_norm": 0.10918972180579849, + "learning_rate": 0.00015958808571578132, + "loss": 1.379, + "step": 8225 + }, + { + "epoch": 0.7463255307566685, + "grad_norm": 0.1175220238184977, + "learning_rate": 0.00015948048415928613, + "loss": 1.3403, + "step": 8226 + }, + { + "epoch": 0.7464162583923063, + "grad_norm": 0.1141770529723067, + 
"learning_rate": 0.00015937291200670094, + "loss": 1.4047, + "step": 8227 + }, + { + "epoch": 0.7465069860279441, + "grad_norm": 0.11651006170674992, + "learning_rate": 0.0001592653692673154, + "loss": 1.3593, + "step": 8228 + }, + { + "epoch": 0.746597713663582, + "grad_norm": 0.11894582008116646, + "learning_rate": 0.00015915785595041544, + "loss": 1.3321, + "step": 8229 + }, + { + "epoch": 0.7466884412992197, + "grad_norm": 0.155054628585958, + "learning_rate": 0.0001590503720652846, + "loss": 1.3996, + "step": 8230 + }, + { + "epoch": 0.7467791689348575, + "grad_norm": 0.13728400913000716, + "learning_rate": 0.0001589429176212046, + "loss": 1.3846, + "step": 8231 + }, + { + "epoch": 0.7468698965704954, + "grad_norm": 0.11803636059819753, + "learning_rate": 0.00015883549262745396, + "loss": 1.3557, + "step": 8232 + }, + { + "epoch": 0.7469606242061332, + "grad_norm": 0.12294653347793069, + "learning_rate": 0.00015872809709330864, + "loss": 1.3799, + "step": 8233 + }, + { + "epoch": 0.747051351841771, + "grad_norm": 0.11823596809938856, + "learning_rate": 0.0001586207310280426, + "loss": 1.3503, + "step": 8234 + }, + { + "epoch": 0.7471420794774088, + "grad_norm": 0.11903675839168595, + "learning_rate": 0.00015851339444092655, + "loss": 1.3588, + "step": 8235 + }, + { + "epoch": 0.7472328071130466, + "grad_norm": 0.1103769179029489, + "learning_rate": 0.0001584060873412294, + "loss": 1.3828, + "step": 8236 + }, + { + "epoch": 0.7473235347486844, + "grad_norm": 0.11268177356673635, + "learning_rate": 0.00015829880973821682, + "loss": 1.3433, + "step": 8237 + }, + { + "epoch": 0.7474142623843223, + "grad_norm": 0.12207806258189728, + "learning_rate": 0.0001581915616411525, + "loss": 1.3744, + "step": 8238 + }, + { + "epoch": 0.7475049900199601, + "grad_norm": 0.11265557836894617, + "learning_rate": 0.0001580843430592972, + "loss": 1.3731, + "step": 8239 + }, + { + "epoch": 0.7475957176555978, + "grad_norm": 0.11344228865970434, + "learning_rate": 0.00015797715400190916, + "loss": 1.3736, + "step": 8240 + }, + { + "epoch": 0.7476864452912357, + "grad_norm": 0.12058324934134268, + "learning_rate": 0.0001578699944782444, + "loss": 1.3539, + "step": 8241 + }, + { + "epoch": 0.7477771729268735, + "grad_norm": 0.11476243880502016, + "learning_rate": 0.00015776286449755605, + "loss": 1.3549, + "step": 8242 + }, + { + "epoch": 0.7478679005625113, + "grad_norm": 0.11863853109643, + "learning_rate": 0.00015765576406909465, + "loss": 1.3477, + "step": 8243 + }, + { + "epoch": 0.7479586281981492, + "grad_norm": 0.1244637956767667, + "learning_rate": 0.00015754869320210846, + "loss": 1.3607, + "step": 8244 + }, + { + "epoch": 0.748049355833787, + "grad_norm": 0.11867355466351315, + "learning_rate": 0.00015744165190584324, + "loss": 1.3391, + "step": 8245 + }, + { + "epoch": 0.7481400834694247, + "grad_norm": 0.14512838250921692, + "learning_rate": 0.00015733464018954192, + "loss": 1.3735, + "step": 8246 + }, + { + "epoch": 0.7482308111050626, + "grad_norm": 0.359085807310474, + "learning_rate": 0.00015722765806244472, + "loss": 1.371, + "step": 8247 + }, + { + "epoch": 0.7483215387407004, + "grad_norm": 0.11960969517493707, + "learning_rate": 0.00015712070553378993, + "loss": 1.386, + "step": 8248 + }, + { + "epoch": 0.7484122663763383, + "grad_norm": 0.11757767250268526, + "learning_rate": 0.00015701378261281273, + "loss": 1.3424, + "step": 8249 + }, + { + "epoch": 0.7485029940119761, + "grad_norm": 0.1087006156716746, + "learning_rate": 0.00015690688930874582, + "loss": 1.3548, + "step": 8250 + }, + 
{ + "epoch": 0.7485937216476138, + "grad_norm": 0.11233849387412825, + "learning_rate": 0.00015680002563081959, + "loss": 1.3683, + "step": 8251 + }, + { + "epoch": 0.7486844492832517, + "grad_norm": 0.1363510097859445, + "learning_rate": 0.00015669319158826184, + "loss": 1.354, + "step": 8252 + }, + { + "epoch": 0.7487751769188895, + "grad_norm": 0.11760171426123855, + "learning_rate": 0.00015658638719029765, + "loss": 1.3318, + "step": 8253 + }, + { + "epoch": 0.7488659045545273, + "grad_norm": 0.12977827290537627, + "learning_rate": 0.00015647961244614933, + "loss": 1.3197, + "step": 8254 + }, + { + "epoch": 0.7489566321901652, + "grad_norm": 0.11641835485528189, + "learning_rate": 0.0001563728673650373, + "loss": 1.3212, + "step": 8255 + }, + { + "epoch": 0.749047359825803, + "grad_norm": 0.12016234425352251, + "learning_rate": 0.00015626615195617883, + "loss": 1.3823, + "step": 8256 + }, + { + "epoch": 0.7491380874614407, + "grad_norm": 0.11861403528381298, + "learning_rate": 0.00015615946622878863, + "loss": 1.3555, + "step": 8257 + }, + { + "epoch": 0.7492288150970786, + "grad_norm": 0.12124400798649963, + "learning_rate": 0.00015605281019207917, + "loss": 1.3408, + "step": 8258 + }, + { + "epoch": 0.7493195427327164, + "grad_norm": 0.11967487750480947, + "learning_rate": 0.00015594618385526043, + "loss": 1.3563, + "step": 8259 + }, + { + "epoch": 0.7494102703683542, + "grad_norm": 0.11699703331127212, + "learning_rate": 0.00015583958722753945, + "loss": 1.388, + "step": 8260 + }, + { + "epoch": 0.7495009980039921, + "grad_norm": 0.11730216475082231, + "learning_rate": 0.00015573302031812063, + "loss": 1.3892, + "step": 8261 + }, + { + "epoch": 0.7495917256396298, + "grad_norm": 0.12599731171312092, + "learning_rate": 0.00015562648313620643, + "loss": 1.3606, + "step": 8262 + }, + { + "epoch": 0.7496824532752676, + "grad_norm": 0.11401389773575622, + "learning_rate": 0.00015551997569099614, + "loss": 1.3335, + "step": 8263 + }, + { + "epoch": 0.7497731809109055, + "grad_norm": 0.12185598264584498, + "learning_rate": 0.00015541349799168657, + "loss": 1.4147, + "step": 8264 + }, + { + "epoch": 0.7498639085465433, + "grad_norm": 0.127396695483879, + "learning_rate": 0.00015530705004747241, + "loss": 1.3759, + "step": 8265 + }, + { + "epoch": 0.7499546361821811, + "grad_norm": 0.12185564395850601, + "learning_rate": 0.00015520063186754514, + "loss": 1.3526, + "step": 8266 + }, + { + "epoch": 0.7500453638178189, + "grad_norm": 0.11793240972287092, + "learning_rate": 0.00015509424346109424, + "loss": 1.3857, + "step": 8267 + }, + { + "epoch": 0.7501360914534567, + "grad_norm": 0.12402032149768472, + "learning_rate": 0.00015498788483730609, + "loss": 1.3616, + "step": 8268 + }, + { + "epoch": 0.7502268190890945, + "grad_norm": 0.116877117224718, + "learning_rate": 0.00015488155600536514, + "loss": 1.3636, + "step": 8269 + }, + { + "epoch": 0.7503175467247324, + "grad_norm": 0.11321994117096419, + "learning_rate": 0.00015477525697445266, + "loss": 1.3515, + "step": 8270 + }, + { + "epoch": 0.7504082743603702, + "grad_norm": 0.11432973937715174, + "learning_rate": 0.00015466898775374742, + "loss": 1.3322, + "step": 8271 + }, + { + "epoch": 0.7504990019960079, + "grad_norm": 0.12209326079152347, + "learning_rate": 0.00015456274835242624, + "loss": 1.3479, + "step": 8272 + }, + { + "epoch": 0.7505897296316458, + "grad_norm": 0.11819709961243464, + "learning_rate": 0.0001544565387796626, + "loss": 1.3455, + "step": 8273 + }, + { + "epoch": 0.7506804572672836, + "grad_norm": 0.11805565911733193, 
+ "learning_rate": 0.00015435035904462756, + "loss": 1.358, + "step": 8274 + }, + { + "epoch": 0.7507711849029214, + "grad_norm": 0.11462578116497563, + "learning_rate": 0.00015424420915649006, + "loss": 1.3168, + "step": 8275 + }, + { + "epoch": 0.7508619125385593, + "grad_norm": 0.1260882686375297, + "learning_rate": 0.00015413808912441613, + "loss": 1.3309, + "step": 8276 + }, + { + "epoch": 0.750952640174197, + "grad_norm": 0.11095573786665007, + "learning_rate": 0.00015403199895756926, + "loss": 1.3565, + "step": 8277 + }, + { + "epoch": 0.7510433678098348, + "grad_norm": 0.11454273145461942, + "learning_rate": 0.00015392593866511006, + "loss": 1.3658, + "step": 8278 + }, + { + "epoch": 0.7511340954454727, + "grad_norm": 0.1186625194827563, + "learning_rate": 0.00015381990825619725, + "loss": 1.3963, + "step": 8279 + }, + { + "epoch": 0.7512248230811105, + "grad_norm": 0.12381226952628198, + "learning_rate": 0.00015371390773998632, + "loss": 1.3596, + "step": 8280 + }, + { + "epoch": 0.7513155507167483, + "grad_norm": 0.12064449856808317, + "learning_rate": 0.00015360793712563037, + "loss": 1.3758, + "step": 8281 + }, + { + "epoch": 0.7514062783523862, + "grad_norm": 0.12321037339720874, + "learning_rate": 0.00015350199642228014, + "loss": 1.3755, + "step": 8282 + }, + { + "epoch": 0.7514970059880239, + "grad_norm": 0.12514004051576638, + "learning_rate": 0.0001533960856390837, + "loss": 1.3623, + "step": 8283 + }, + { + "epoch": 0.7515877336236617, + "grad_norm": 0.1185221674676068, + "learning_rate": 0.00015329020478518636, + "loss": 1.3517, + "step": 8284 + }, + { + "epoch": 0.7516784612592996, + "grad_norm": 0.12273094901717056, + "learning_rate": 0.00015318435386973078, + "loss": 1.3346, + "step": 8285 + }, + { + "epoch": 0.7517691888949374, + "grad_norm": 0.12702725773674048, + "learning_rate": 0.00015307853290185753, + "loss": 1.3594, + "step": 8286 + }, + { + "epoch": 0.7518599165305753, + "grad_norm": 0.1355586524379398, + "learning_rate": 0.0001529727418907041, + "loss": 1.3465, + "step": 8287 + }, + { + "epoch": 0.751950644166213, + "grad_norm": 0.14148355182711733, + "learning_rate": 0.00015286698084540534, + "loss": 1.3692, + "step": 8288 + }, + { + "epoch": 0.7520413718018508, + "grad_norm": 0.12387922958314697, + "learning_rate": 0.00015276124977509404, + "loss": 1.3442, + "step": 8289 + }, + { + "epoch": 0.7521320994374887, + "grad_norm": 0.11316490171088169, + "learning_rate": 0.00015265554868890008, + "loss": 1.3603, + "step": 8290 + }, + { + "epoch": 0.7522228270731265, + "grad_norm": 0.11572617374103589, + "learning_rate": 0.00015254987759595056, + "loss": 1.3564, + "step": 8291 + }, + { + "epoch": 0.7523135547087643, + "grad_norm": 0.12091903376330158, + "learning_rate": 0.00015244423650537047, + "loss": 1.3934, + "step": 8292 + }, + { + "epoch": 0.7524042823444022, + "grad_norm": 0.1295841524401297, + "learning_rate": 0.0001523386254262818, + "loss": 1.3639, + "step": 8293 + }, + { + "epoch": 0.7524950099800399, + "grad_norm": 0.11567007486634509, + "learning_rate": 0.000152233044367804, + "loss": 1.373, + "step": 8294 + }, + { + "epoch": 0.7525857376156777, + "grad_norm": 0.11847597392843015, + "learning_rate": 0.00015212749333905396, + "loss": 1.29, + "step": 8295 + }, + { + "epoch": 0.7526764652513156, + "grad_norm": 0.12584729309824513, + "learning_rate": 0.00015202197234914634, + "loss": 1.38, + "step": 8296 + }, + { + "epoch": 0.7527671928869534, + "grad_norm": 0.12231391744950271, + "learning_rate": 0.0001519164814071925, + "loss": 1.3519, + "step": 8297 + 
}, + { + "epoch": 0.7528579205225912, + "grad_norm": 0.11289670741697966, + "learning_rate": 0.00015181102052230178, + "loss": 1.3334, + "step": 8298 + }, + { + "epoch": 0.752948648158229, + "grad_norm": 0.12521255314135799, + "learning_rate": 0.0001517055897035809, + "loss": 1.3427, + "step": 8299 + }, + { + "epoch": 0.7530393757938668, + "grad_norm": 0.11512688694215516, + "learning_rate": 0.00015160018896013373, + "loss": 1.3779, + "step": 8300 + }, + { + "epoch": 0.7531301034295046, + "grad_norm": 0.13069215106265206, + "learning_rate": 0.0001514948183010614, + "loss": 1.36, + "step": 8301 + }, + { + "epoch": 0.7532208310651425, + "grad_norm": 0.12203806867675546, + "learning_rate": 0.00015138947773546302, + "loss": 1.3461, + "step": 8302 + }, + { + "epoch": 0.7533115587007803, + "grad_norm": 0.11890099204644597, + "learning_rate": 0.00015128416727243466, + "loss": 1.3658, + "step": 8303 + }, + { + "epoch": 0.753402286336418, + "grad_norm": 0.12226518151169297, + "learning_rate": 0.00015117888692106968, + "loss": 1.3627, + "step": 8304 + }, + { + "epoch": 0.7534930139720559, + "grad_norm": 0.12166836416987173, + "learning_rate": 0.0001510736366904592, + "loss": 1.3453, + "step": 8305 + }, + { + "epoch": 0.7535837416076937, + "grad_norm": 0.11854205838123202, + "learning_rate": 0.0001509684165896918, + "loss": 1.3481, + "step": 8306 + }, + { + "epoch": 0.7536744692433315, + "grad_norm": 0.12341194566039926, + "learning_rate": 0.00015086322662785306, + "loss": 1.3363, + "step": 8307 + }, + { + "epoch": 0.7537651968789694, + "grad_norm": 0.12198320334559284, + "learning_rate": 0.000150758066814026, + "loss": 1.3757, + "step": 8308 + }, + { + "epoch": 0.7538559245146071, + "grad_norm": 0.13540238967345916, + "learning_rate": 0.0001506529371572915, + "loss": 1.3703, + "step": 8309 + }, + { + "epoch": 0.7539466521502449, + "grad_norm": 0.12183859255767653, + "learning_rate": 0.00015054783766672736, + "loss": 1.3713, + "step": 8310 + }, + { + "epoch": 0.7540373797858828, + "grad_norm": 0.11010555992119493, + "learning_rate": 0.0001504427683514088, + "loss": 1.3408, + "step": 8311 + }, + { + "epoch": 0.7541281074215206, + "grad_norm": 0.11971420453146092, + "learning_rate": 0.0001503377292204089, + "loss": 1.36, + "step": 8312 + }, + { + "epoch": 0.7542188350571584, + "grad_norm": 0.11596136791541929, + "learning_rate": 0.0001502327202827974, + "loss": 1.3916, + "step": 8313 + }, + { + "epoch": 0.7543095626927963, + "grad_norm": 0.11910868319844682, + "learning_rate": 0.00015012774154764226, + "loss": 1.3741, + "step": 8314 + }, + { + "epoch": 0.754400290328434, + "grad_norm": 0.12362725649616355, + "learning_rate": 0.00015002279302400801, + "loss": 1.3388, + "step": 8315 + }, + { + "epoch": 0.7544910179640718, + "grad_norm": 0.12027330864067035, + "learning_rate": 0.0001499178747209573, + "loss": 1.3804, + "step": 8316 + }, + { + "epoch": 0.7545817455997097, + "grad_norm": 0.1173517569028704, + "learning_rate": 0.00014981298664754978, + "loss": 1.3869, + "step": 8317 + }, + { + "epoch": 0.7546724732353475, + "grad_norm": 0.1240464234626053, + "learning_rate": 0.00014970812881284223, + "loss": 1.3787, + "step": 8318 + }, + { + "epoch": 0.7547632008709853, + "grad_norm": 0.12454819507138895, + "learning_rate": 0.0001496033012258896, + "loss": 1.3357, + "step": 8319 + }, + { + "epoch": 0.7548539285066231, + "grad_norm": 0.12990732360270005, + "learning_rate": 0.00014949850389574354, + "loss": 1.3324, + "step": 8320 + }, + { + "epoch": 0.7549446561422609, + "grad_norm": 0.11117394701271023, + 
"learning_rate": 0.0001493937368314532, + "loss": 1.4119, + "step": 8321 + }, + { + "epoch": 0.7550353837778987, + "grad_norm": 0.12271269459028462, + "learning_rate": 0.0001492890000420653, + "loss": 1.3978, + "step": 8322 + }, + { + "epoch": 0.7551261114135366, + "grad_norm": 0.13372407424092086, + "learning_rate": 0.0001491842935366241, + "loss": 1.3518, + "step": 8323 + }, + { + "epoch": 0.7552168390491744, + "grad_norm": 0.12206439508464802, + "learning_rate": 0.0001490796173241709, + "loss": 1.3604, + "step": 8324 + }, + { + "epoch": 0.7553075666848122, + "grad_norm": 0.11682516105334512, + "learning_rate": 0.00014897497141374427, + "loss": 1.3872, + "step": 8325 + }, + { + "epoch": 0.75539829432045, + "grad_norm": 0.11787126058489521, + "learning_rate": 0.0001488703558143808, + "loss": 1.3709, + "step": 8326 + }, + { + "epoch": 0.7554890219560878, + "grad_norm": 0.11386327679894545, + "learning_rate": 0.0001487657705351138, + "loss": 1.4048, + "step": 8327 + }, + { + "epoch": 0.7555797495917257, + "grad_norm": 0.11422089977889807, + "learning_rate": 0.00014866121558497415, + "loss": 1.3709, + "step": 8328 + }, + { + "epoch": 0.7556704772273635, + "grad_norm": 0.10971510817692078, + "learning_rate": 0.0001485566909729903, + "loss": 1.3648, + "step": 8329 + }, + { + "epoch": 0.7557612048630012, + "grad_norm": 0.15370079862712246, + "learning_rate": 0.00014845219670818816, + "loss": 1.3898, + "step": 8330 + }, + { + "epoch": 0.7558519324986391, + "grad_norm": 0.11344104006926269, + "learning_rate": 0.00014834773279959063, + "loss": 1.3965, + "step": 8331 + }, + { + "epoch": 0.7559426601342769, + "grad_norm": 0.11537315236807032, + "learning_rate": 0.00014824329925621806, + "loss": 1.3429, + "step": 8332 + }, + { + "epoch": 0.7560333877699147, + "grad_norm": 0.11920665200795211, + "learning_rate": 0.0001481388960870886, + "loss": 1.3713, + "step": 8333 + }, + { + "epoch": 0.7561241154055526, + "grad_norm": 0.12102911817853355, + "learning_rate": 0.0001480345233012173, + "loss": 1.3616, + "step": 8334 + }, + { + "epoch": 0.7562148430411904, + "grad_norm": 0.11461363769687995, + "learning_rate": 0.00014793018090761666, + "loss": 1.3385, + "step": 8335 + }, + { + "epoch": 0.7563055706768281, + "grad_norm": 0.11085838609723073, + "learning_rate": 0.00014782586891529676, + "loss": 1.3354, + "step": 8336 + }, + { + "epoch": 0.756396298312466, + "grad_norm": 0.11838563985546446, + "learning_rate": 0.00014772158733326518, + "loss": 1.3517, + "step": 8337 + }, + { + "epoch": 0.7564870259481038, + "grad_norm": 0.12344188988264358, + "learning_rate": 0.00014761733617052643, + "loss": 1.3487, + "step": 8338 + }, + { + "epoch": 0.7565777535837416, + "grad_norm": 0.1152862468528558, + "learning_rate": 0.00014751311543608248, + "loss": 1.3519, + "step": 8339 + }, + { + "epoch": 0.7566684812193795, + "grad_norm": 0.1233845961440092, + "learning_rate": 0.00014740892513893312, + "loss": 1.3227, + "step": 8340 + }, + { + "epoch": 0.7567592088550172, + "grad_norm": 0.12817241535424415, + "learning_rate": 0.00014730476528807503, + "loss": 1.3609, + "step": 8341 + }, + { + "epoch": 0.756849936490655, + "grad_norm": 0.1324309693234332, + "learning_rate": 0.0001472006358925023, + "loss": 1.3489, + "step": 8342 + }, + { + "epoch": 0.7569406641262929, + "grad_norm": 0.11160962047353817, + "learning_rate": 0.0001470965369612068, + "loss": 1.4023, + "step": 8343 + }, + { + "epoch": 0.7570313917619307, + "grad_norm": 0.1177457312961043, + "learning_rate": 0.0001469924685031772, + "loss": 1.3625, + "step": 8344 + }, 
+ { + "epoch": 0.7571221193975685, + "grad_norm": 0.12206066080704159, + "learning_rate": 0.00014688843052740013, + "loss": 1.363, + "step": 8345 + }, + { + "epoch": 0.7572128470332063, + "grad_norm": 0.119611824432003, + "learning_rate": 0.00014678442304285895, + "loss": 1.3761, + "step": 8346 + }, + { + "epoch": 0.7573035746688441, + "grad_norm": 0.12747573618790722, + "learning_rate": 0.00014668044605853508, + "loss": 1.379, + "step": 8347 + }, + { + "epoch": 0.7573943023044819, + "grad_norm": 0.12075058499282157, + "learning_rate": 0.00014657649958340675, + "loss": 1.3592, + "step": 8348 + }, + { + "epoch": 0.7574850299401198, + "grad_norm": 0.11706199128535826, + "learning_rate": 0.00014647258362644966, + "loss": 1.3712, + "step": 8349 + }, + { + "epoch": 0.7575757575757576, + "grad_norm": 0.11790600630484696, + "learning_rate": 0.00014636869819663716, + "loss": 1.3711, + "step": 8350 + }, + { + "epoch": 0.7576664852113953, + "grad_norm": 0.12402856542278168, + "learning_rate": 0.00014626484330293978, + "loss": 1.3633, + "step": 8351 + }, + { + "epoch": 0.7577572128470332, + "grad_norm": 0.12388488084537913, + "learning_rate": 0.0001461610189543251, + "loss": 1.3535, + "step": 8352 + }, + { + "epoch": 0.757847940482671, + "grad_norm": 0.11362028772309843, + "learning_rate": 0.00014605722515975857, + "loss": 1.3446, + "step": 8353 + }, + { + "epoch": 0.7579386681183088, + "grad_norm": 0.1092605767062645, + "learning_rate": 0.00014595346192820298, + "loss": 1.3882, + "step": 8354 + }, + { + "epoch": 0.7580293957539467, + "grad_norm": 0.11875494276907088, + "learning_rate": 0.00014584972926861818, + "loss": 1.3716, + "step": 8355 + }, + { + "epoch": 0.7581201233895845, + "grad_norm": 0.12223308041251366, + "learning_rate": 0.0001457460271899612, + "loss": 1.3352, + "step": 8356 + }, + { + "epoch": 0.7582108510252222, + "grad_norm": 0.11863193195559506, + "learning_rate": 0.00014564235570118723, + "loss": 1.3513, + "step": 8357 + }, + { + "epoch": 0.7583015786608601, + "grad_norm": 0.13822733420578087, + "learning_rate": 0.00014553871481124808, + "loss": 1.3723, + "step": 8358 + }, + { + "epoch": 0.7583923062964979, + "grad_norm": 0.1266217531246681, + "learning_rate": 0.0001454351045290928, + "loss": 1.3672, + "step": 8359 + }, + { + "epoch": 0.7584830339321357, + "grad_norm": 0.12542269047373059, + "learning_rate": 0.00014533152486366878, + "loss": 1.3534, + "step": 8360 + }, + { + "epoch": 0.7585737615677736, + "grad_norm": 0.11590465309842696, + "learning_rate": 0.00014522797582391996, + "loss": 1.3703, + "step": 8361 + }, + { + "epoch": 0.7586644892034113, + "grad_norm": 0.12042169828884894, + "learning_rate": 0.0001451244574187876, + "loss": 1.3834, + "step": 8362 + }, + { + "epoch": 0.7587552168390492, + "grad_norm": 0.12310172546222807, + "learning_rate": 0.00014502096965721055, + "loss": 1.3214, + "step": 8363 + }, + { + "epoch": 0.758845944474687, + "grad_norm": 0.11342975387763941, + "learning_rate": 0.00014491751254812525, + "loss": 1.3816, + "step": 8364 + }, + { + "epoch": 0.7589366721103248, + "grad_norm": 0.10860522467834051, + "learning_rate": 0.00014481408610046503, + "loss": 1.3367, + "step": 8365 + }, + { + "epoch": 0.7590273997459627, + "grad_norm": 0.11832247123043374, + "learning_rate": 0.0001447106903231607, + "loss": 1.4125, + "step": 8366 + }, + { + "epoch": 0.7591181273816004, + "grad_norm": 0.11458771078227387, + "learning_rate": 0.00014460732522514065, + "loss": 1.3797, + "step": 8367 + }, + { + "epoch": 0.7592088550172382, + "grad_norm": 0.11597057910693931, 
+ "learning_rate": 0.00014450399081533056, + "loss": 1.3644, + "step": 8368 + }, + { + "epoch": 0.7592995826528761, + "grad_norm": 0.17430497881688675, + "learning_rate": 0.00014440068710265313, + "loss": 1.324, + "step": 8369 + }, + { + "epoch": 0.7593903102885139, + "grad_norm": 0.12207314980625399, + "learning_rate": 0.00014429741409602891, + "loss": 1.3727, + "step": 8370 + }, + { + "epoch": 0.7594810379241517, + "grad_norm": 0.12115657307086516, + "learning_rate": 0.00014419417180437544, + "loss": 1.4056, + "step": 8371 + }, + { + "epoch": 0.7595717655597896, + "grad_norm": 0.13273325419155976, + "learning_rate": 0.00014409096023660766, + "loss": 1.3716, + "step": 8372 + }, + { + "epoch": 0.7596624931954273, + "grad_norm": 0.1181015636154439, + "learning_rate": 0.00014398777940163775, + "loss": 1.3246, + "step": 8373 + }, + { + "epoch": 0.7597532208310651, + "grad_norm": 0.4162314763949864, + "learning_rate": 0.0001438846293083757, + "loss": 1.3408, + "step": 8374 + }, + { + "epoch": 0.759843948466703, + "grad_norm": 0.11892805725788931, + "learning_rate": 0.0001437815099657283, + "loss": 1.321, + "step": 8375 + }, + { + "epoch": 0.7599346761023408, + "grad_norm": 0.13396780235156855, + "learning_rate": 0.00014367842138259996, + "loss": 1.3185, + "step": 8376 + }, + { + "epoch": 0.7600254037379786, + "grad_norm": 0.11658264724802059, + "learning_rate": 0.00014357536356789265, + "loss": 1.373, + "step": 8377 + }, + { + "epoch": 0.7601161313736164, + "grad_norm": 0.12365314705202343, + "learning_rate": 0.0001434723365305052, + "loss": 1.3541, + "step": 8378 + }, + { + "epoch": 0.7602068590092542, + "grad_norm": 0.12184203304229989, + "learning_rate": 0.00014336934027933386, + "loss": 1.3832, + "step": 8379 + }, + { + "epoch": 0.760297586644892, + "grad_norm": 0.19457467789503632, + "learning_rate": 0.00014326637482327275, + "loss": 1.3837, + "step": 8380 + }, + { + "epoch": 0.7603883142805299, + "grad_norm": 0.11619994688786167, + "learning_rate": 0.00014316344017121268, + "loss": 1.3869, + "step": 8381 + }, + { + "epoch": 0.7604790419161677, + "grad_norm": 0.11887146969400815, + "learning_rate": 0.00014306053633204207, + "loss": 1.3492, + "step": 8382 + }, + { + "epoch": 0.7605697695518054, + "grad_norm": 0.13024914144693195, + "learning_rate": 0.0001429576633146467, + "loss": 1.3477, + "step": 8383 + }, + { + "epoch": 0.7606604971874433, + "grad_norm": 0.12459518952702556, + "learning_rate": 0.00014285482112790993, + "loss": 1.4067, + "step": 8384 + }, + { + "epoch": 0.7607512248230811, + "grad_norm": 0.1609957961855582, + "learning_rate": 0.00014275200978071194, + "loss": 1.371, + "step": 8385 + }, + { + "epoch": 0.7608419524587189, + "grad_norm": 0.12074625637315774, + "learning_rate": 0.00014264922928193047, + "loss": 1.3543, + "step": 8386 + }, + { + "epoch": 0.7609326800943568, + "grad_norm": 0.12023326432803072, + "learning_rate": 0.00014254647964044082, + "loss": 1.3744, + "step": 8387 + }, + { + "epoch": 0.7610234077299945, + "grad_norm": 0.15390915600038352, + "learning_rate": 0.00014244376086511534, + "loss": 1.3762, + "step": 8388 + }, + { + "epoch": 0.7611141353656323, + "grad_norm": 0.13807817005612466, + "learning_rate": 0.00014234107296482367, + "loss": 1.3659, + "step": 8389 + }, + { + "epoch": 0.7612048630012702, + "grad_norm": 0.11595352987674944, + "learning_rate": 0.00014223841594843324, + "loss": 1.3549, + "step": 8390 + }, + { + "epoch": 0.761295590636908, + "grad_norm": 0.12297988279894524, + "learning_rate": 0.00014213578982480812, + "loss": 1.382, + "step": 
8391 + }, + { + "epoch": 0.7613863182725458, + "grad_norm": 0.1318989219018802, + "learning_rate": 0.00014203319460281044, + "loss": 1.3771, + "step": 8392 + }, + { + "epoch": 0.7614770459081837, + "grad_norm": 0.12698415896428236, + "learning_rate": 0.00014193063029129904, + "loss": 1.3809, + "step": 8393 + }, + { + "epoch": 0.7615677735438214, + "grad_norm": 0.2492010836378639, + "learning_rate": 0.00014182809689913069, + "loss": 1.3461, + "step": 8394 + }, + { + "epoch": 0.7616585011794592, + "grad_norm": 0.11930300489895862, + "learning_rate": 0.00014172559443515885, + "loss": 1.3628, + "step": 8395 + }, + { + "epoch": 0.7617492288150971, + "grad_norm": 0.13350632599362677, + "learning_rate": 0.0001416231229082347, + "loss": 1.3718, + "step": 8396 + }, + { + "epoch": 0.7618399564507349, + "grad_norm": 0.13054906442634276, + "learning_rate": 0.00014152068232720677, + "loss": 1.3554, + "step": 8397 + }, + { + "epoch": 0.7619306840863727, + "grad_norm": 0.12728172479518773, + "learning_rate": 0.00014141827270092066, + "loss": 1.3792, + "step": 8398 + }, + { + "epoch": 0.7620214117220105, + "grad_norm": 0.12212901074941797, + "learning_rate": 0.00014131589403821966, + "loss": 1.3599, + "step": 8399 + }, + { + "epoch": 0.7621121393576483, + "grad_norm": 0.11564301278395113, + "learning_rate": 0.00014121354634794391, + "loss": 1.3239, + "step": 8400 + }, + { + "epoch": 0.7622028669932862, + "grad_norm": 0.11991451127674195, + "learning_rate": 0.00014111122963893154, + "loss": 1.3981, + "step": 8401 + }, + { + "epoch": 0.762293594628924, + "grad_norm": 0.12264639638877212, + "learning_rate": 0.00014100894392001735, + "loss": 1.3524, + "step": 8402 + }, + { + "epoch": 0.7623843222645618, + "grad_norm": 0.11895374951308788, + "learning_rate": 0.00014090668920003357, + "loss": 1.3364, + "step": 8403 + }, + { + "epoch": 0.7624750499001997, + "grad_norm": 0.11612875689507098, + "learning_rate": 0.0001408044654878103, + "loss": 1.3488, + "step": 8404 + }, + { + "epoch": 0.7625657775358374, + "grad_norm": 0.1271347478596221, + "learning_rate": 0.00014070227279217436, + "loss": 1.3814, + "step": 8405 + }, + { + "epoch": 0.7626565051714752, + "grad_norm": 0.12119636045315911, + "learning_rate": 0.00014060011112194997, + "loss": 1.3591, + "step": 8406 + }, + { + "epoch": 0.7627472328071131, + "grad_norm": 0.11553380001989022, + "learning_rate": 0.00014049798048595898, + "loss": 1.3601, + "step": 8407 + }, + { + "epoch": 0.7628379604427509, + "grad_norm": 0.13232765338221988, + "learning_rate": 0.00014039588089302047, + "loss": 1.3534, + "step": 8408 + }, + { + "epoch": 0.7629286880783887, + "grad_norm": 0.12726887432908726, + "learning_rate": 0.00014029381235195065, + "loss": 1.3616, + "step": 8409 + }, + { + "epoch": 0.7630194157140265, + "grad_norm": 0.12085337575004651, + "learning_rate": 0.00014019177487156298, + "loss": 1.3607, + "step": 8410 + }, + { + "epoch": 0.7631101433496643, + "grad_norm": 0.13633059790747157, + "learning_rate": 0.00014008976846066878, + "loss": 1.3836, + "step": 8411 + }, + { + "epoch": 0.7632008709853021, + "grad_norm": 0.1175194676707633, + "learning_rate": 0.00013998779312807607, + "loss": 1.3582, + "step": 8412 + }, + { + "epoch": 0.76329159862094, + "grad_norm": 0.1276657807331373, + "learning_rate": 0.00013988584888259038, + "loss": 1.336, + "step": 8413 + }, + { + "epoch": 0.7633823262565778, + "grad_norm": 0.11380410797368139, + "learning_rate": 0.0001397839357330147, + "loss": 1.3578, + "step": 8414 + }, + { + "epoch": 0.7634730538922155, + "grad_norm": 
0.12927223083242262, + "learning_rate": 0.0001396820536881494, + "loss": 1.3715, + "step": 8415 + }, + { + "epoch": 0.7635637815278534, + "grad_norm": 0.13604826059984454, + "learning_rate": 0.0001395802027567919, + "loss": 1.3822, + "step": 8416 + }, + { + "epoch": 0.7636545091634912, + "grad_norm": 0.19190264401092122, + "learning_rate": 0.0001394783829477368, + "loss": 1.3683, + "step": 8417 + }, + { + "epoch": 0.763745236799129, + "grad_norm": 0.11598640230903028, + "learning_rate": 0.00013937659426977667, + "loss": 1.3554, + "step": 8418 + }, + { + "epoch": 0.7638359644347669, + "grad_norm": 0.12054862451499744, + "learning_rate": 0.00013927483673170077, + "loss": 1.369, + "step": 8419 + }, + { + "epoch": 0.7639266920704046, + "grad_norm": 0.140375967281895, + "learning_rate": 0.00013917311034229574, + "loss": 1.3393, + "step": 8420 + }, + { + "epoch": 0.7640174197060424, + "grad_norm": 0.12908032902080505, + "learning_rate": 0.00013907141511034594, + "loss": 1.3804, + "step": 8421 + }, + { + "epoch": 0.7641081473416803, + "grad_norm": 0.13103905407505628, + "learning_rate": 0.0001389697510446325, + "loss": 1.3536, + "step": 8422 + }, + { + "epoch": 0.7641988749773181, + "grad_norm": 0.15254695075521368, + "learning_rate": 0.00013886811815393442, + "loss": 1.3802, + "step": 8423 + }, + { + "epoch": 0.7642896026129559, + "grad_norm": 0.11769439999777959, + "learning_rate": 0.00013876651644702736, + "loss": 1.3485, + "step": 8424 + }, + { + "epoch": 0.7643803302485938, + "grad_norm": 0.1572613881469441, + "learning_rate": 0.00013866494593268509, + "loss": 1.345, + "step": 8425 + }, + { + "epoch": 0.7644710578842315, + "grad_norm": 0.11926219532117656, + "learning_rate": 0.00013856340661967792, + "loss": 1.3628, + "step": 8426 + }, + { + "epoch": 0.7645617855198693, + "grad_norm": 0.12961479706814022, + "learning_rate": 0.00013846189851677372, + "loss": 1.3619, + "step": 8427 + }, + { + "epoch": 0.7646525131555072, + "grad_norm": 0.1245488545329894, + "learning_rate": 0.00013836042163273805, + "loss": 1.3375, + "step": 8428 + }, + { + "epoch": 0.764743240791145, + "grad_norm": 0.11853174539765902, + "learning_rate": 0.00013825897597633324, + "loss": 1.3166, + "step": 8429 + }, + { + "epoch": 0.7648339684267828, + "grad_norm": 0.11951816997082892, + "learning_rate": 0.00013815756155631902, + "loss": 1.3478, + "step": 8430 + }, + { + "epoch": 0.7649246960624206, + "grad_norm": 0.12026894298205892, + "learning_rate": 0.00013805617838145274, + "loss": 1.3372, + "step": 8431 + }, + { + "epoch": 0.7650154236980584, + "grad_norm": 0.11433584542165992, + "learning_rate": 0.0001379548264604889, + "loss": 1.3251, + "step": 8432 + }, + { + "epoch": 0.7651061513336962, + "grad_norm": 0.1183699631572948, + "learning_rate": 0.0001378535058021792, + "loss": 1.3732, + "step": 8433 + }, + { + "epoch": 0.7651968789693341, + "grad_norm": 0.11687224793108479, + "learning_rate": 0.00013775221641527247, + "loss": 1.3592, + "step": 8434 + }, + { + "epoch": 0.7652876066049719, + "grad_norm": 0.11241275807586792, + "learning_rate": 0.00013765095830851537, + "loss": 1.3433, + "step": 8435 + }, + { + "epoch": 0.7653783342406096, + "grad_norm": 0.11784594794709605, + "learning_rate": 0.00013754973149065146, + "loss": 1.3612, + "step": 8436 + }, + { + "epoch": 0.7654690618762475, + "grad_norm": 0.11362140336719022, + "learning_rate": 0.00013744853597042134, + "loss": 1.3786, + "step": 8437 + }, + { + "epoch": 0.7655597895118853, + "grad_norm": 0.11719010646183582, + "learning_rate": 0.00013734737175656386, + "loss": 
1.3516, + "step": 8438 + }, + { + "epoch": 0.7656505171475232, + "grad_norm": 0.1257224490574553, + "learning_rate": 0.0001372462388578143, + "loss": 1.3504, + "step": 8439 + }, + { + "epoch": 0.765741244783161, + "grad_norm": 0.12600374353684654, + "learning_rate": 0.0001371451372829055, + "loss": 1.3529, + "step": 8440 + }, + { + "epoch": 0.7658319724187987, + "grad_norm": 0.12205082580592803, + "learning_rate": 0.0001370440670405674, + "loss": 1.3573, + "step": 8441 + }, + { + "epoch": 0.7659227000544366, + "grad_norm": 0.12003126461484837, + "learning_rate": 0.00013694302813952776, + "loss": 1.367, + "step": 8442 + }, + { + "epoch": 0.7660134276900744, + "grad_norm": 0.12181239574028346, + "learning_rate": 0.00013684202058851115, + "loss": 1.3566, + "step": 8443 + }, + { + "epoch": 0.7661041553257122, + "grad_norm": 0.12240812761620459, + "learning_rate": 0.0001367410443962394, + "loss": 1.356, + "step": 8444 + }, + { + "epoch": 0.7661948829613501, + "grad_norm": 0.11865612479189845, + "learning_rate": 0.00013664009957143204, + "loss": 1.3769, + "step": 8445 + }, + { + "epoch": 0.7662856105969879, + "grad_norm": 0.11596470459242315, + "learning_rate": 0.00013653918612280575, + "loss": 1.3696, + "step": 8446 + }, + { + "epoch": 0.7663763382326256, + "grad_norm": 0.11822003467859442, + "learning_rate": 0.0001364383040590742, + "loss": 1.3674, + "step": 8447 + }, + { + "epoch": 0.7664670658682635, + "grad_norm": 0.12469101933993534, + "learning_rate": 0.00013633745338894883, + "loss": 1.3408, + "step": 8448 + }, + { + "epoch": 0.7665577935039013, + "grad_norm": 0.11471256690884206, + "learning_rate": 0.00013623663412113795, + "loss": 1.3468, + "step": 8449 + }, + { + "epoch": 0.7666485211395391, + "grad_norm": 0.13344894865236626, + "learning_rate": 0.00013613584626434732, + "loss": 1.3532, + "step": 8450 + }, + { + "epoch": 0.766739248775177, + "grad_norm": 0.1168047722823252, + "learning_rate": 0.00013603508982727985, + "loss": 1.3667, + "step": 8451 + }, + { + "epoch": 0.7668299764108147, + "grad_norm": 0.11611758922714042, + "learning_rate": 0.00013593436481863615, + "loss": 1.4099, + "step": 8452 + }, + { + "epoch": 0.7669207040464525, + "grad_norm": 0.12713554510208627, + "learning_rate": 0.00013583367124711356, + "loss": 1.3384, + "step": 8453 + }, + { + "epoch": 0.7670114316820904, + "grad_norm": 0.12403303779923981, + "learning_rate": 0.00013573300912140713, + "loss": 1.3392, + "step": 8454 + }, + { + "epoch": 0.7671021593177282, + "grad_norm": 0.12018460174104777, + "learning_rate": 0.0001356323784502092, + "loss": 1.3617, + "step": 8455 + }, + { + "epoch": 0.767192886953366, + "grad_norm": 0.13008041700909012, + "learning_rate": 0.0001355317792422091, + "loss": 1.3331, + "step": 8456 + }, + { + "epoch": 0.7672836145890038, + "grad_norm": 0.1188793095922753, + "learning_rate": 0.00013543121150609344, + "loss": 1.3689, + "step": 8457 + }, + { + "epoch": 0.7673743422246416, + "grad_norm": 0.11564570387156199, + "learning_rate": 0.00013533067525054655, + "loss": 1.3226, + "step": 8458 + }, + { + "epoch": 0.7674650698602794, + "grad_norm": 0.17816837185402998, + "learning_rate": 0.00013523017048424957, + "loss": 1.3619, + "step": 8459 + }, + { + "epoch": 0.7675557974959173, + "grad_norm": 0.1294476057391299, + "learning_rate": 0.00013512969721588098, + "loss": 1.3837, + "step": 8460 + }, + { + "epoch": 0.7676465251315551, + "grad_norm": 0.11051487558026389, + "learning_rate": 0.00013502925545411687, + "loss": 1.3546, + "step": 8461 + }, + { + "epoch": 0.7677372527671928, + 
"grad_norm": 0.20268474424021818, + "learning_rate": 0.00013492884520763043, + "loss": 1.4143, + "step": 8462 + }, + { + "epoch": 0.7678279804028307, + "grad_norm": 0.1274162380171135, + "learning_rate": 0.00013482846648509207, + "loss": 1.3645, + "step": 8463 + }, + { + "epoch": 0.7679187080384685, + "grad_norm": 0.1175571004924571, + "learning_rate": 0.0001347281192951692, + "loss": 1.3531, + "step": 8464 + }, + { + "epoch": 0.7680094356741063, + "grad_norm": 0.11196505972178863, + "learning_rate": 0.00013462780364652732, + "loss": 1.3146, + "step": 8465 + }, + { + "epoch": 0.7681001633097442, + "grad_norm": 0.1109089637278221, + "learning_rate": 0.00013452751954782838, + "loss": 1.3712, + "step": 8466 + }, + { + "epoch": 0.768190890945382, + "grad_norm": 0.11599512656029001, + "learning_rate": 0.0001344272670077319, + "loss": 1.3516, + "step": 8467 + }, + { + "epoch": 0.7682816185810197, + "grad_norm": 0.11784471846936463, + "learning_rate": 0.00013432704603489487, + "loss": 1.3654, + "step": 8468 + }, + { + "epoch": 0.7683723462166576, + "grad_norm": 0.12105032439514829, + "learning_rate": 0.00013422685663797118, + "loss": 1.3757, + "step": 8469 + }, + { + "epoch": 0.7684630738522954, + "grad_norm": 0.11608270253233965, + "learning_rate": 0.0001341266988256125, + "loss": 1.3975, + "step": 8470 + }, + { + "epoch": 0.7685538014879332, + "grad_norm": 0.14334943312639983, + "learning_rate": 0.00013402657260646716, + "loss": 1.3753, + "step": 8471 + }, + { + "epoch": 0.7686445291235711, + "grad_norm": 0.13310612993620463, + "learning_rate": 0.00013392647798918134, + "loss": 1.356, + "step": 8472 + }, + { + "epoch": 0.7687352567592088, + "grad_norm": 0.12000918445507358, + "learning_rate": 0.0001338264149823981, + "loss": 1.3781, + "step": 8473 + }, + { + "epoch": 0.7688259843948466, + "grad_norm": 0.11859290599344528, + "learning_rate": 0.00013372638359475782, + "loss": 1.3728, + "step": 8474 + }, + { + "epoch": 0.7689167120304845, + "grad_norm": 0.17141754072366952, + "learning_rate": 0.00013362638383489832, + "loss": 1.3779, + "step": 8475 + }, + { + "epoch": 0.7690074396661223, + "grad_norm": 0.1196625294542689, + "learning_rate": 0.0001335264157114545, + "loss": 1.3408, + "step": 8476 + }, + { + "epoch": 0.7690981673017602, + "grad_norm": 0.12760213967483852, + "learning_rate": 0.00013342647923305883, + "loss": 1.3988, + "step": 8477 + }, + { + "epoch": 0.769188894937398, + "grad_norm": 0.12364901270360196, + "learning_rate": 0.00013332657440834063, + "loss": 1.3497, + "step": 8478 + }, + { + "epoch": 0.7692796225730357, + "grad_norm": 0.12334802102586281, + "learning_rate": 0.00013322670124592685, + "loss": 1.3306, + "step": 8479 + }, + { + "epoch": 0.7693703502086736, + "grad_norm": 0.12636965631420144, + "learning_rate": 0.00013312685975444145, + "loss": 1.3561, + "step": 8480 + }, + { + "epoch": 0.7694610778443114, + "grad_norm": 0.11413311736667721, + "learning_rate": 0.00013302704994250564, + "loss": 1.3282, + "step": 8481 + }, + { + "epoch": 0.7695518054799492, + "grad_norm": 0.1153675807192851, + "learning_rate": 0.0001329272718187383, + "loss": 1.3462, + "step": 8482 + }, + { + "epoch": 0.7696425331155871, + "grad_norm": 0.12104595446274388, + "learning_rate": 0.0001328275253917552, + "loss": 1.363, + "step": 8483 + }, + { + "epoch": 0.7697332607512248, + "grad_norm": 0.13148694870494163, + "learning_rate": 0.0001327278106701692, + "loss": 1.3192, + "step": 8484 + }, + { + "epoch": 0.7698239883868626, + "grad_norm": 0.12294024782842398, + "learning_rate": 
0.00013262812766259093, + "loss": 1.343, + "step": 8485 + }, + { + "epoch": 0.7699147160225005, + "grad_norm": 0.12953024241753422, + "learning_rate": 0.00013252847637762805, + "loss": 1.3864, + "step": 8486 + }, + { + "epoch": 0.7700054436581383, + "grad_norm": 0.13009590568556276, + "learning_rate": 0.00013242885682388544, + "loss": 1.3338, + "step": 8487 + }, + { + "epoch": 0.770096171293776, + "grad_norm": 0.14056387524942054, + "learning_rate": 0.00013232926900996506, + "loss": 1.3859, + "step": 8488 + }, + { + "epoch": 0.7701868989294139, + "grad_norm": 0.11231566522048396, + "learning_rate": 0.00013222971294446668, + "loss": 1.3549, + "step": 8489 + }, + { + "epoch": 0.7702776265650517, + "grad_norm": 0.12393465683274439, + "learning_rate": 0.00013213018863598674, + "loss": 1.3598, + "step": 8490 + }, + { + "epoch": 0.7703683542006895, + "grad_norm": 0.1205296630034228, + "learning_rate": 0.00013203069609311907, + "loss": 1.3613, + "step": 8491 + }, + { + "epoch": 0.7704590818363274, + "grad_norm": 0.12313680429132833, + "learning_rate": 0.00013193123532445505, + "loss": 1.3353, + "step": 8492 + }, + { + "epoch": 0.7705498094719652, + "grad_norm": 0.13072744456787938, + "learning_rate": 0.00013183180633858328, + "loss": 1.4174, + "step": 8493 + }, + { + "epoch": 0.7706405371076029, + "grad_norm": 0.121785550904848, + "learning_rate": 0.00013173240914408923, + "loss": 1.3507, + "step": 8494 + }, + { + "epoch": 0.7707312647432408, + "grad_norm": 0.11837408015567438, + "learning_rate": 0.00013163304374955588, + "loss": 1.3269, + "step": 8495 + }, + { + "epoch": 0.7708219923788786, + "grad_norm": 0.12382282471050697, + "learning_rate": 0.00013153371016356354, + "loss": 1.3259, + "step": 8496 + }, + { + "epoch": 0.7709127200145164, + "grad_norm": 0.11813456632474663, + "learning_rate": 0.00013143440839468966, + "loss": 1.374, + "step": 8497 + }, + { + "epoch": 0.7710034476501543, + "grad_norm": 0.11637413185978537, + "learning_rate": 0.00013133513845150886, + "loss": 1.3641, + "step": 8498 + }, + { + "epoch": 0.771094175285792, + "grad_norm": 0.12032044166388203, + "learning_rate": 0.00013123590034259326, + "loss": 1.3664, + "step": 8499 + }, + { + "epoch": 0.7711849029214298, + "grad_norm": 0.11767518547948132, + "learning_rate": 0.00013113669407651185, + "loss": 1.3819, + "step": 8500 + }, + { + "epoch": 0.7712756305570677, + "grad_norm": 0.11789178415119803, + "learning_rate": 0.00013103751966183147, + "loss": 1.3571, + "step": 8501 + }, + { + "epoch": 0.7713663581927055, + "grad_norm": 0.11488950639652935, + "learning_rate": 0.00013093837710711549, + "loss": 1.3565, + "step": 8502 + }, + { + "epoch": 0.7714570858283433, + "grad_norm": 0.11538355987929963, + "learning_rate": 0.0001308392664209252, + "loss": 1.3335, + "step": 8503 + }, + { + "epoch": 0.7715478134639812, + "grad_norm": 0.11472341533973324, + "learning_rate": 0.00013074018761181867, + "loss": 1.357, + "step": 8504 + }, + { + "epoch": 0.7716385410996189, + "grad_norm": 0.12376146430426495, + "learning_rate": 0.0001306411406883512, + "loss": 1.3598, + "step": 8505 + }, + { + "epoch": 0.7717292687352567, + "grad_norm": 0.11322610877282062, + "learning_rate": 0.0001305421256590758, + "loss": 1.4103, + "step": 8506 + }, + { + "epoch": 0.7718199963708946, + "grad_norm": 0.11801537007281947, + "learning_rate": 0.0001304431425325423, + "loss": 1.3324, + "step": 8507 + }, + { + "epoch": 0.7719107240065324, + "grad_norm": 0.11991217149511434, + "learning_rate": 0.0001303441913172978, + "loss": 1.362, + "step": 8508 + }, + { + 
"epoch": 0.7720014516421702, + "grad_norm": 0.11229145576115704, + "learning_rate": 0.00013024527202188678, + "loss": 1.3851, + "step": 8509 + }, + { + "epoch": 0.772092179277808, + "grad_norm": 0.11861180381030123, + "learning_rate": 0.0001301463846548513, + "loss": 1.3752, + "step": 8510 + }, + { + "epoch": 0.7721829069134458, + "grad_norm": 0.11361199899688124, + "learning_rate": 0.0001300475292247299, + "loss": 1.3699, + "step": 8511 + }, + { + "epoch": 0.7722736345490836, + "grad_norm": 0.11694377545654612, + "learning_rate": 0.0001299487057400588, + "loss": 1.3699, + "step": 8512 + }, + { + "epoch": 0.7723643621847215, + "grad_norm": 0.11734942399772427, + "learning_rate": 0.00012984991420937159, + "loss": 1.3317, + "step": 8513 + }, + { + "epoch": 0.7724550898203593, + "grad_norm": 0.12085819724554389, + "learning_rate": 0.00012975115464119885, + "loss": 1.3678, + "step": 8514 + }, + { + "epoch": 0.772545817455997, + "grad_norm": 0.11262194070609567, + "learning_rate": 0.00012965242704406822, + "loss": 1.3451, + "step": 8515 + }, + { + "epoch": 0.7726365450916349, + "grad_norm": 0.12179923151421157, + "learning_rate": 0.00012955373142650535, + "loss": 1.3584, + "step": 8516 + }, + { + "epoch": 0.7727272727272727, + "grad_norm": 0.11565008305372515, + "learning_rate": 0.00012945506779703242, + "loss": 1.3536, + "step": 8517 + }, + { + "epoch": 0.7728180003629106, + "grad_norm": 0.675592977249366, + "learning_rate": 0.00012935643616416898, + "loss": 1.3732, + "step": 8518 + }, + { + "epoch": 0.7729087279985484, + "grad_norm": 0.12071494040429608, + "learning_rate": 0.0001292578365364317, + "loss": 1.356, + "step": 8519 + }, + { + "epoch": 0.7729994556341862, + "grad_norm": 0.12604515459141755, + "learning_rate": 0.00012915926892233509, + "loss": 1.3716, + "step": 8520 + }, + { + "epoch": 0.773090183269824, + "grad_norm": 0.11601812805132619, + "learning_rate": 0.00012906073333039027, + "loss": 1.3493, + "step": 8521 + }, + { + "epoch": 0.7731809109054618, + "grad_norm": 0.12397648595130868, + "learning_rate": 0.0001289622297691056, + "loss": 1.3549, + "step": 8522 + }, + { + "epoch": 0.7732716385410996, + "grad_norm": 0.11901449839576335, + "learning_rate": 0.0001288637582469871, + "loss": 1.3895, + "step": 8523 + }, + { + "epoch": 0.7733623661767375, + "grad_norm": 0.12769270036474561, + "learning_rate": 0.00012876531877253794, + "loss": 1.4006, + "step": 8524 + }, + { + "epoch": 0.7734530938123753, + "grad_norm": 0.11158568614740948, + "learning_rate": 0.000128666911354258, + "loss": 1.3886, + "step": 8525 + }, + { + "epoch": 0.773543821448013, + "grad_norm": 0.1255564692747795, + "learning_rate": 0.00012856853600064523, + "loss": 1.3717, + "step": 8526 + }, + { + "epoch": 0.7736345490836509, + "grad_norm": 0.1213873667405957, + "learning_rate": 0.0001284701927201941, + "loss": 1.3671, + "step": 8527 + }, + { + "epoch": 0.7737252767192887, + "grad_norm": 0.1167412869834852, + "learning_rate": 0.0001283718815213964, + "loss": 1.342, + "step": 8528 + }, + { + "epoch": 0.7738160043549265, + "grad_norm": 0.11722027992616252, + "learning_rate": 0.00012827360241274166, + "loss": 1.3727, + "step": 8529 + }, + { + "epoch": 0.7739067319905644, + "grad_norm": 0.1239719902247615, + "learning_rate": 0.0001281753554027162, + "loss": 1.3697, + "step": 8530 + }, + { + "epoch": 0.7739974596262021, + "grad_norm": 0.11040812626196861, + "learning_rate": 0.0001280771404998034, + "loss": 1.3363, + "step": 8531 + }, + { + "epoch": 0.7740881872618399, + "grad_norm": 0.12255152356700395, + 
"learning_rate": 0.0001279789577124843, + "loss": 1.3955, + "step": 8532 + }, + { + "epoch": 0.7741789148974778, + "grad_norm": 0.1576619707632767, + "learning_rate": 0.00012788080704923726, + "loss": 1.3663, + "step": 8533 + }, + { + "epoch": 0.7742696425331156, + "grad_norm": 0.12614906406775447, + "learning_rate": 0.0001277826885185373, + "loss": 1.3323, + "step": 8534 + }, + { + "epoch": 0.7743603701687534, + "grad_norm": 0.1290989335496953, + "learning_rate": 0.00012768460212885695, + "loss": 1.3676, + "step": 8535 + }, + { + "epoch": 0.7744510978043913, + "grad_norm": 0.15563073312761178, + "learning_rate": 0.0001275865478886662, + "loss": 1.3808, + "step": 8536 + }, + { + "epoch": 0.774541825440029, + "grad_norm": 0.11538520756682255, + "learning_rate": 0.00012748852580643193, + "loss": 1.3043, + "step": 8537 + }, + { + "epoch": 0.7746325530756668, + "grad_norm": 0.11800421788891959, + "learning_rate": 0.00012739053589061827, + "loss": 1.3531, + "step": 8538 + }, + { + "epoch": 0.7747232807113047, + "grad_norm": 0.13105081992864867, + "learning_rate": 0.0001272925781496867, + "loss": 1.3684, + "step": 8539 + }, + { + "epoch": 0.7748140083469425, + "grad_norm": 0.11383123159041035, + "learning_rate": 0.00012719465259209607, + "loss": 1.3587, + "step": 8540 + }, + { + "epoch": 0.7749047359825803, + "grad_norm": 0.12207653123023618, + "learning_rate": 0.00012709675922630215, + "loss": 1.3756, + "step": 8541 + }, + { + "epoch": 0.7749954636182181, + "grad_norm": 0.11930787541932314, + "learning_rate": 0.00012699889806075789, + "loss": 1.3949, + "step": 8542 + }, + { + "epoch": 0.7750861912538559, + "grad_norm": 0.1193004476844812, + "learning_rate": 0.00012690106910391393, + "loss": 1.4037, + "step": 8543 + }, + { + "epoch": 0.7751769188894937, + "grad_norm": 0.11423308131197865, + "learning_rate": 0.00012680327236421756, + "loss": 1.3303, + "step": 8544 + }, + { + "epoch": 0.7752676465251316, + "grad_norm": 0.1221533606382379, + "learning_rate": 0.0001267055078501136, + "loss": 1.3494, + "step": 8545 + }, + { + "epoch": 0.7753583741607694, + "grad_norm": 0.11047195133880933, + "learning_rate": 0.0001266077755700441, + "loss": 1.3571, + "step": 8546 + }, + { + "epoch": 0.7754491017964071, + "grad_norm": 0.12055082141646717, + "learning_rate": 0.0001265100755324481, + "loss": 1.3928, + "step": 8547 + }, + { + "epoch": 0.775539829432045, + "grad_norm": 0.11510090803410299, + "learning_rate": 0.0001264124077457623, + "loss": 1.3989, + "step": 8548 + }, + { + "epoch": 0.7756305570676828, + "grad_norm": 0.12033667485316268, + "learning_rate": 0.00012631477221841997, + "loss": 1.3435, + "step": 8549 + }, + { + "epoch": 0.7757212847033206, + "grad_norm": 0.11553950067326048, + "learning_rate": 0.00012621716895885228, + "loss": 1.3842, + "step": 8550 + }, + { + "epoch": 0.7758120123389585, + "grad_norm": 0.14445207936791243, + "learning_rate": 0.0001261195979754871, + "loss": 1.3807, + "step": 8551 + }, + { + "epoch": 0.7759027399745962, + "grad_norm": 0.11569273650037247, + "learning_rate": 0.0001260220592767497, + "loss": 1.372, + "step": 8552 + }, + { + "epoch": 0.775993467610234, + "grad_norm": 0.11497833468308273, + "learning_rate": 0.00012592455287106265, + "loss": 1.3256, + "step": 8553 + }, + { + "epoch": 0.7760841952458719, + "grad_norm": 0.11191654634612211, + "learning_rate": 0.00012582707876684552, + "loss": 1.3656, + "step": 8554 + }, + { + "epoch": 0.7761749228815097, + "grad_norm": 0.11417090874641272, + "learning_rate": 0.00012572963697251538, + "loss": 1.3583, + "step": 8555 + 
}, + { + "epoch": 0.7762656505171476, + "grad_norm": 0.11093455426467935, + "learning_rate": 0.00012563222749648612, + "loss": 1.354, + "step": 8556 + }, + { + "epoch": 0.7763563781527854, + "grad_norm": 0.11506026241854848, + "learning_rate": 0.00012553485034716931, + "loss": 1.362, + "step": 8557 + }, + { + "epoch": 0.7764471057884231, + "grad_norm": 0.12140539170889407, + "learning_rate": 0.0001254375055329734, + "loss": 1.3837, + "step": 8558 + }, + { + "epoch": 0.776537833424061, + "grad_norm": 0.11988316836160233, + "learning_rate": 0.00012534019306230393, + "loss": 1.3324, + "step": 8559 + }, + { + "epoch": 0.7766285610596988, + "grad_norm": 0.11818936349331757, + "learning_rate": 0.00012524291294356416, + "loss": 1.3641, + "step": 8560 + }, + { + "epoch": 0.7767192886953366, + "grad_norm": 0.11788218244680103, + "learning_rate": 0.0001251456651851541, + "loss": 1.3817, + "step": 8561 + }, + { + "epoch": 0.7768100163309745, + "grad_norm": 0.12122322986604697, + "learning_rate": 0.00012504844979547097, + "loss": 1.3427, + "step": 8562 + }, + { + "epoch": 0.7769007439666122, + "grad_norm": 0.11973180731858148, + "learning_rate": 0.00012495126678290942, + "loss": 1.331, + "step": 8563 + }, + { + "epoch": 0.77699147160225, + "grad_norm": 0.13219591186726307, + "learning_rate": 0.00012485411615586139, + "loss": 1.3392, + "step": 8564 + }, + { + "epoch": 0.7770821992378879, + "grad_norm": 0.1157204422948562, + "learning_rate": 0.00012475699792271577, + "loss": 1.3422, + "step": 8565 + }, + { + "epoch": 0.7771729268735257, + "grad_norm": 0.11874564803452908, + "learning_rate": 0.0001246599120918585, + "loss": 1.3845, + "step": 8566 + }, + { + "epoch": 0.7772636545091635, + "grad_norm": 0.11663484050248829, + "learning_rate": 0.00012456285867167333, + "loss": 1.3394, + "step": 8567 + }, + { + "epoch": 0.7773543821448013, + "grad_norm": 0.11539657166191962, + "learning_rate": 0.00012446583767054066, + "loss": 1.3243, + "step": 8568 + }, + { + "epoch": 0.7774451097804391, + "grad_norm": 0.125905766979607, + "learning_rate": 0.00012436884909683803, + "loss": 1.3556, + "step": 8569 + }, + { + "epoch": 0.7775358374160769, + "grad_norm": 0.11854499163545701, + "learning_rate": 0.0001242718929589407, + "loss": 1.3377, + "step": 8570 + }, + { + "epoch": 0.7776265650517148, + "grad_norm": 0.11507553646396687, + "learning_rate": 0.00012417496926522094, + "loss": 1.3803, + "step": 8571 + }, + { + "epoch": 0.7777172926873526, + "grad_norm": 0.11170640533449509, + "learning_rate": 0.00012407807802404796, + "loss": 1.3709, + "step": 8572 + }, + { + "epoch": 0.7778080203229903, + "grad_norm": 0.1912835886309127, + "learning_rate": 0.0001239812192437882, + "loss": 1.3272, + "step": 8573 + }, + { + "epoch": 0.7778987479586282, + "grad_norm": 0.1202887561145961, + "learning_rate": 0.00012388439293280573, + "loss": 1.3574, + "step": 8574 + }, + { + "epoch": 0.777989475594266, + "grad_norm": 0.12436606346677855, + "learning_rate": 0.00012378759909946135, + "loss": 1.3971, + "step": 8575 + }, + { + "epoch": 0.7780802032299038, + "grad_norm": 0.11357233288303493, + "learning_rate": 0.0001236908377521131, + "loss": 1.3946, + "step": 8576 + }, + { + "epoch": 0.7781709308655417, + "grad_norm": 0.12437772227601458, + "learning_rate": 0.0001235941088991166, + "loss": 1.3774, + "step": 8577 + }, + { + "epoch": 0.7782616585011795, + "grad_norm": 0.11300494096927047, + "learning_rate": 0.00012349741254882412, + "loss": 1.3582, + "step": 8578 + }, + { + "epoch": 0.7783523861368172, + "grad_norm": 0.11526759570942327, + 
"learning_rate": 0.00012340074870958572, + "loss": 1.3863, + "step": 8579 + }, + { + "epoch": 0.7784431137724551, + "grad_norm": 0.13630976067041564, + "learning_rate": 0.00012330411738974807, + "loss": 1.3531, + "step": 8580 + }, + { + "epoch": 0.7785338414080929, + "grad_norm": 0.12097046230324528, + "learning_rate": 0.0001232075185976555, + "loss": 1.3378, + "step": 8581 + }, + { + "epoch": 0.7786245690437307, + "grad_norm": 0.11744319128383235, + "learning_rate": 0.00012311095234164926, + "loss": 1.351, + "step": 8582 + }, + { + "epoch": 0.7787152966793686, + "grad_norm": 0.15684569894324027, + "learning_rate": 0.0001230144186300677, + "loss": 1.3641, + "step": 8583 + }, + { + "epoch": 0.7788060243150063, + "grad_norm": 0.11556591250094075, + "learning_rate": 0.00012291791747124676, + "loss": 1.3717, + "step": 8584 + }, + { + "epoch": 0.7788967519506441, + "grad_norm": 0.11442740423913142, + "learning_rate": 0.0001228214488735192, + "loss": 1.3473, + "step": 8585 + }, + { + "epoch": 0.778987479586282, + "grad_norm": 0.1302082338449462, + "learning_rate": 0.00012272501284521503, + "loss": 1.3661, + "step": 8586 + }, + { + "epoch": 0.7790782072219198, + "grad_norm": 0.17531378990191127, + "learning_rate": 0.00012262860939466158, + "loss": 1.3766, + "step": 8587 + }, + { + "epoch": 0.7791689348575576, + "grad_norm": 0.11349581506389671, + "learning_rate": 0.00012253223853018346, + "loss": 1.3673, + "step": 8588 + }, + { + "epoch": 0.7792596624931954, + "grad_norm": 0.13059831195596905, + "learning_rate": 0.0001224359002601022, + "loss": 1.3733, + "step": 8589 + }, + { + "epoch": 0.7793503901288332, + "grad_norm": 0.12984496946400453, + "learning_rate": 0.00012233959459273647, + "loss": 1.388, + "step": 8590 + }, + { + "epoch": 0.779441117764471, + "grad_norm": 0.11594025696112745, + "learning_rate": 0.0001222433215364025, + "loss": 1.383, + "step": 8591 + }, + { + "epoch": 0.7795318454001089, + "grad_norm": 0.13204644345586303, + "learning_rate": 0.00012214708109941342, + "loss": 1.3302, + "step": 8592 + }, + { + "epoch": 0.7796225730357467, + "grad_norm": 0.1204568696779321, + "learning_rate": 0.0001220508732900793, + "loss": 1.3767, + "step": 8593 + }, + { + "epoch": 0.7797133006713846, + "grad_norm": 0.11678968736021184, + "learning_rate": 0.00012195469811670823, + "loss": 1.3895, + "step": 8594 + }, + { + "epoch": 0.7798040283070223, + "grad_norm": 0.11888757596043613, + "learning_rate": 0.00012185855558760473, + "loss": 1.3299, + "step": 8595 + }, + { + "epoch": 0.7798947559426601, + "grad_norm": 0.12479748764479297, + "learning_rate": 0.0001217624457110707, + "loss": 1.3561, + "step": 8596 + }, + { + "epoch": 0.779985483578298, + "grad_norm": 0.11882538652328044, + "learning_rate": 0.00012166636849540502, + "loss": 1.3419, + "step": 8597 + }, + { + "epoch": 0.7800762112139358, + "grad_norm": 0.1258389815278811, + "learning_rate": 0.00012157032394890438, + "loss": 1.378, + "step": 8598 + }, + { + "epoch": 0.7801669388495736, + "grad_norm": 0.12626922254710776, + "learning_rate": 0.00012147431207986204, + "loss": 1.3743, + "step": 8599 + }, + { + "epoch": 0.7802576664852114, + "grad_norm": 0.1227128132792496, + "learning_rate": 0.00012137833289656847, + "loss": 1.3701, + "step": 8600 + }, + { + "epoch": 0.7803483941208492, + "grad_norm": 0.13731161531441685, + "learning_rate": 0.0001212823864073117, + "loss": 1.3812, + "step": 8601 + }, + { + "epoch": 0.780439121756487, + "grad_norm": 0.11836943096542184, + "learning_rate": 0.0001211864726203768, + "loss": 1.3563, + "step": 8602 + }, 
+ { + "epoch": 0.7805298493921249, + "grad_norm": 0.11702241207278166, + "learning_rate": 0.00012109059154404568, + "loss": 1.3601, + "step": 8603 + }, + { + "epoch": 0.7806205770277627, + "grad_norm": 0.1217586618551904, + "learning_rate": 0.00012099474318659798, + "loss": 1.3299, + "step": 8604 + }, + { + "epoch": 0.7807113046634004, + "grad_norm": 0.11385334927788723, + "learning_rate": 0.00012089892755631005, + "loss": 1.3809, + "step": 8605 + }, + { + "epoch": 0.7808020322990383, + "grad_norm": 0.11878058390665173, + "learning_rate": 0.00012080314466145542, + "loss": 1.3778, + "step": 8606 + }, + { + "epoch": 0.7808927599346761, + "grad_norm": 0.11860665848516541, + "learning_rate": 0.00012070739451030532, + "loss": 1.3711, + "step": 8607 + }, + { + "epoch": 0.7809834875703139, + "grad_norm": 0.12046493005556021, + "learning_rate": 0.00012061167711112758, + "loss": 1.3513, + "step": 8608 + }, + { + "epoch": 0.7810742152059518, + "grad_norm": 0.13567972041491813, + "learning_rate": 0.00012051599247218737, + "loss": 1.3869, + "step": 8609 + }, + { + "epoch": 0.7811649428415895, + "grad_norm": 0.11830425559970316, + "learning_rate": 0.00012042034060174706, + "loss": 1.3451, + "step": 8610 + }, + { + "epoch": 0.7812556704772273, + "grad_norm": 0.11735215885331869, + "learning_rate": 0.00012032472150806645, + "loss": 1.3593, + "step": 8611 + }, + { + "epoch": 0.7813463981128652, + "grad_norm": 0.11940073344263868, + "learning_rate": 0.00012022913519940209, + "loss": 1.3599, + "step": 8612 + }, + { + "epoch": 0.781437125748503, + "grad_norm": 0.1202127942230879, + "learning_rate": 0.00012013358168400773, + "loss": 1.3515, + "step": 8613 + }, + { + "epoch": 0.7815278533841408, + "grad_norm": 0.11518916665618263, + "learning_rate": 0.00012003806097013475, + "loss": 1.3429, + "step": 8614 + }, + { + "epoch": 0.7816185810197787, + "grad_norm": 0.11313712019809723, + "learning_rate": 0.00011994257306603118, + "loss": 1.3381, + "step": 8615 + }, + { + "epoch": 0.7817093086554164, + "grad_norm": 0.11619554777646296, + "learning_rate": 0.0001198471179799423, + "loss": 1.3821, + "step": 8616 + }, + { + "epoch": 0.7818000362910542, + "grad_norm": 0.12531499091191944, + "learning_rate": 0.00011975169572011085, + "loss": 1.3702, + "step": 8617 + }, + { + "epoch": 0.7818907639266921, + "grad_norm": 0.12444620151407727, + "learning_rate": 0.00011965630629477665, + "loss": 1.3408, + "step": 8618 + }, + { + "epoch": 0.7819814915623299, + "grad_norm": 0.13624238274598036, + "learning_rate": 0.0001195609497121764, + "loss": 1.3464, + "step": 8619 + }, + { + "epoch": 0.7820722191979677, + "grad_norm": 0.14895326730365147, + "learning_rate": 0.00011946562598054412, + "loss": 1.3298, + "step": 8620 + }, + { + "epoch": 0.7821629468336055, + "grad_norm": 0.1195227579990953, + "learning_rate": 0.00011937033510811124, + "loss": 1.3612, + "step": 8621 + }, + { + "epoch": 0.7822536744692433, + "grad_norm": 0.12833035380648186, + "learning_rate": 0.00011927507710310603, + "loss": 1.3684, + "step": 8622 + }, + { + "epoch": 0.7823444021048811, + "grad_norm": 0.22310359045387565, + "learning_rate": 0.00011917985197375392, + "loss": 1.3726, + "step": 8623 + }, + { + "epoch": 0.782435129740519, + "grad_norm": 0.13234565007054802, + "learning_rate": 0.00011908465972827781, + "loss": 1.4044, + "step": 8624 + }, + { + "epoch": 0.7825258573761568, + "grad_norm": 0.11396708222580496, + "learning_rate": 0.00011898950037489737, + "loss": 1.3696, + "step": 8625 + }, + { + "epoch": 0.7826165850117945, + "grad_norm": 
0.11839898198323041, + "learning_rate": 0.00011889437392182984, + "loss": 1.3366, + "step": 8626 + }, + { + "epoch": 0.7827073126474324, + "grad_norm": 0.11760631696217426, + "learning_rate": 0.00011879928037728916, + "loss": 1.3427, + "step": 8627 + }, + { + "epoch": 0.7827980402830702, + "grad_norm": 0.12227643138635562, + "learning_rate": 0.00011870421974948693, + "loss": 1.3245, + "step": 8628 + }, + { + "epoch": 0.782888767918708, + "grad_norm": 0.12270568520806328, + "learning_rate": 0.00011860919204663157, + "loss": 1.3749, + "step": 8629 + }, + { + "epoch": 0.7829794955543459, + "grad_norm": 0.11343740113889828, + "learning_rate": 0.00011851419727692847, + "loss": 1.3613, + "step": 8630 + }, + { + "epoch": 0.7830702231899836, + "grad_norm": 0.11914034416433045, + "learning_rate": 0.00011841923544858086, + "loss": 1.3542, + "step": 8631 + }, + { + "epoch": 0.7831609508256215, + "grad_norm": 0.12094499751634882, + "learning_rate": 0.00011832430656978838, + "loss": 1.3556, + "step": 8632 + }, + { + "epoch": 0.7832516784612593, + "grad_norm": 0.1420522364054583, + "learning_rate": 0.00011822941064874837, + "loss": 1.3417, + "step": 8633 + }, + { + "epoch": 0.7833424060968971, + "grad_norm": 0.1335023834423529, + "learning_rate": 0.00011813454769365489, + "loss": 1.3431, + "step": 8634 + }, + { + "epoch": 0.783433133732535, + "grad_norm": 0.12146634998739068, + "learning_rate": 0.00011803971771269967, + "loss": 1.3454, + "step": 8635 + }, + { + "epoch": 0.7835238613681728, + "grad_norm": 0.1306747452786876, + "learning_rate": 0.00011794492071407109, + "loss": 1.3572, + "step": 8636 + }, + { + "epoch": 0.7836145890038105, + "grad_norm": 0.12142104080231494, + "learning_rate": 0.00011785015670595478, + "loss": 1.3639, + "step": 8637 + }, + { + "epoch": 0.7837053166394484, + "grad_norm": 0.11967715647243665, + "learning_rate": 0.00011775542569653392, + "loss": 1.3214, + "step": 8638 + }, + { + "epoch": 0.7837960442750862, + "grad_norm": 0.11709935419704078, + "learning_rate": 0.0001176607276939884, + "loss": 1.3096, + "step": 8639 + }, + { + "epoch": 0.783886771910724, + "grad_norm": 0.12067840907670323, + "learning_rate": 0.00011756606270649517, + "loss": 1.3516, + "step": 8640 + }, + { + "epoch": 0.7839774995463619, + "grad_norm": 0.12029012100128203, + "learning_rate": 0.00011747143074222887, + "loss": 1.3602, + "step": 8641 + }, + { + "epoch": 0.7840682271819996, + "grad_norm": 0.1185856867671129, + "learning_rate": 0.00011737683180936104, + "loss": 1.3503, + "step": 8642 + }, + { + "epoch": 0.7841589548176374, + "grad_norm": 0.1233296431424669, + "learning_rate": 0.00011728226591606017, + "loss": 1.4116, + "step": 8643 + }, + { + "epoch": 0.7842496824532753, + "grad_norm": 0.11840118035707907, + "learning_rate": 0.00011718773307049186, + "loss": 1.349, + "step": 8644 + }, + { + "epoch": 0.7843404100889131, + "grad_norm": 0.1145476479431742, + "learning_rate": 0.00011709323328081938, + "loss": 1.3555, + "step": 8645 + }, + { + "epoch": 0.7844311377245509, + "grad_norm": 0.11698882028162605, + "learning_rate": 0.00011699876655520269, + "loss": 1.4176, + "step": 8646 + }, + { + "epoch": 0.7845218653601888, + "grad_norm": 0.12925497100795397, + "learning_rate": 0.00011690433290179874, + "loss": 1.406, + "step": 8647 + }, + { + "epoch": 0.7846125929958265, + "grad_norm": 0.12466039283583956, + "learning_rate": 0.00011680993232876219, + "loss": 1.3662, + "step": 8648 + }, + { + "epoch": 0.7847033206314643, + "grad_norm": 0.11881921066180466, + "learning_rate": 0.00011671556484424456, + 
"loss": 1.356, + "step": 8649 + }, + { + "epoch": 0.7847940482671022, + "grad_norm": 0.30964157195559094, + "learning_rate": 0.00011662123045639439, + "loss": 1.3703, + "step": 8650 + }, + { + "epoch": 0.78488477590274, + "grad_norm": 0.12537398350440132, + "learning_rate": 0.00011652692917335733, + "loss": 1.386, + "step": 8651 + }, + { + "epoch": 0.7849755035383778, + "grad_norm": 0.1191301273208533, + "learning_rate": 0.0001164326610032766, + "loss": 1.3559, + "step": 8652 + }, + { + "epoch": 0.7850662311740156, + "grad_norm": 0.12220283334738045, + "learning_rate": 0.00011633842595429211, + "loss": 1.3468, + "step": 8653 + }, + { + "epoch": 0.7851569588096534, + "grad_norm": 0.13243318562754913, + "learning_rate": 0.000116244224034541, + "loss": 1.3585, + "step": 8654 + }, + { + "epoch": 0.7852476864452912, + "grad_norm": 0.11413031650093462, + "learning_rate": 0.00011615005525215777, + "loss": 1.3904, + "step": 8655 + }, + { + "epoch": 0.7853384140809291, + "grad_norm": 0.12582387251124308, + "learning_rate": 0.00011605591961527378, + "loss": 1.3768, + "step": 8656 + }, + { + "epoch": 0.7854291417165669, + "grad_norm": 0.12178780185055751, + "learning_rate": 0.00011596181713201781, + "loss": 1.3829, + "step": 8657 + }, + { + "epoch": 0.7855198693522046, + "grad_norm": 0.1298039047691562, + "learning_rate": 0.0001158677478105154, + "loss": 1.3818, + "step": 8658 + }, + { + "epoch": 0.7856105969878425, + "grad_norm": 0.1159026660800084, + "learning_rate": 0.00011577371165888973, + "loss": 1.3526, + "step": 8659 + }, + { + "epoch": 0.7857013246234803, + "grad_norm": 0.11459527887063908, + "learning_rate": 0.00011567970868526068, + "loss": 1.313, + "step": 8660 + }, + { + "epoch": 0.7857920522591181, + "grad_norm": 0.11508465492166692, + "learning_rate": 0.00011558573889774526, + "loss": 1.3643, + "step": 8661 + }, + { + "epoch": 0.785882779894756, + "grad_norm": 0.11447680040604206, + "learning_rate": 0.00011549180230445811, + "loss": 1.3485, + "step": 8662 + }, + { + "epoch": 0.7859735075303937, + "grad_norm": 0.12398590282162933, + "learning_rate": 0.00011539789891351049, + "loss": 1.4, + "step": 8663 + }, + { + "epoch": 0.7860642351660315, + "grad_norm": 0.11766392090834596, + "learning_rate": 0.00011530402873301088, + "loss": 1.3599, + "step": 8664 + }, + { + "epoch": 0.7861549628016694, + "grad_norm": 0.11947407379347252, + "learning_rate": 0.00011521019177106507, + "loss": 1.3588, + "step": 8665 + }, + { + "epoch": 0.7862456904373072, + "grad_norm": 0.1270309132842703, + "learning_rate": 0.00011511638803577601, + "loss": 1.3821, + "step": 8666 + }, + { + "epoch": 0.786336418072945, + "grad_norm": 0.12151084844169253, + "learning_rate": 0.00011502261753524363, + "loss": 1.3802, + "step": 8667 + }, + { + "epoch": 0.7864271457085829, + "grad_norm": 0.11185353592468983, + "learning_rate": 0.00011492888027756481, + "loss": 1.3579, + "step": 8668 + }, + { + "epoch": 0.7865178733442206, + "grad_norm": 0.15295388155756717, + "learning_rate": 0.00011483517627083406, + "loss": 1.3664, + "step": 8669 + }, + { + "epoch": 0.7866086009798585, + "grad_norm": 0.1219907803004099, + "learning_rate": 0.00011474150552314261, + "loss": 1.3492, + "step": 8670 + }, + { + "epoch": 0.7866993286154963, + "grad_norm": 0.1146774978432732, + "learning_rate": 0.00011464786804257866, + "loss": 1.3435, + "step": 8671 + }, + { + "epoch": 0.7867900562511341, + "grad_norm": 0.1154430257977767, + "learning_rate": 0.00011455426383722834, + "loss": 1.361, + "step": 8672 + }, + { + "epoch": 0.786880783886772, + 
"grad_norm": 0.12435004709032503, + "learning_rate": 0.0001144606929151742, + "loss": 1.3561, + "step": 8673 + }, + { + "epoch": 0.7869715115224097, + "grad_norm": 0.11740851572886023, + "learning_rate": 0.00011436715528449587, + "loss": 1.328, + "step": 8674 + }, + { + "epoch": 0.7870622391580475, + "grad_norm": 0.11710185793503777, + "learning_rate": 0.00011427365095327069, + "loss": 1.3741, + "step": 8675 + }, + { + "epoch": 0.7871529667936854, + "grad_norm": 0.14470132228322588, + "learning_rate": 0.00011418017992957263, + "loss": 1.3942, + "step": 8676 + }, + { + "epoch": 0.7872436944293232, + "grad_norm": 0.11556163070763362, + "learning_rate": 0.00011408674222147286, + "loss": 1.3663, + "step": 8677 + }, + { + "epoch": 0.787334422064961, + "grad_norm": 0.1272755919964242, + "learning_rate": 0.00011399333783703964, + "loss": 1.3712, + "step": 8678 + }, + { + "epoch": 0.7874251497005988, + "grad_norm": 0.11741757975121402, + "learning_rate": 0.00011389996678433855, + "loss": 1.3544, + "step": 8679 + }, + { + "epoch": 0.7875158773362366, + "grad_norm": 0.1122573487548408, + "learning_rate": 0.00011380662907143241, + "loss": 1.3491, + "step": 8680 + }, + { + "epoch": 0.7876066049718744, + "grad_norm": 0.12110146726432586, + "learning_rate": 0.00011371332470638063, + "loss": 1.3782, + "step": 8681 + }, + { + "epoch": 0.7876973326075123, + "grad_norm": 0.12840698972834794, + "learning_rate": 0.00011362005369724033, + "loss": 1.3957, + "step": 8682 + }, + { + "epoch": 0.7877880602431501, + "grad_norm": 0.12230770080468957, + "learning_rate": 0.00011352681605206533, + "loss": 1.3956, + "step": 8683 + }, + { + "epoch": 0.7878787878787878, + "grad_norm": 0.11714353064717488, + "learning_rate": 0.0001134336117789066, + "loss": 1.3493, + "step": 8684 + }, + { + "epoch": 0.7879695155144257, + "grad_norm": 0.1240008645242348, + "learning_rate": 0.00011334044088581264, + "loss": 1.3271, + "step": 8685 + }, + { + "epoch": 0.7880602431500635, + "grad_norm": 0.1363080551103796, + "learning_rate": 0.00011324730338082856, + "loss": 1.3353, + "step": 8686 + }, + { + "epoch": 0.7881509707857013, + "grad_norm": 0.12565499108873218, + "learning_rate": 0.00011315419927199666, + "loss": 1.3661, + "step": 8687 + }, + { + "epoch": 0.7882416984213392, + "grad_norm": 0.11822062010744307, + "learning_rate": 0.00011306112856735673, + "loss": 1.3693, + "step": 8688 + }, + { + "epoch": 0.788332426056977, + "grad_norm": 0.12081882531269353, + "learning_rate": 0.00011296809127494545, + "loss": 1.3529, + "step": 8689 + }, + { + "epoch": 0.7884231536926147, + "grad_norm": 0.12488081542929252, + "learning_rate": 0.00011287508740279657, + "loss": 1.3652, + "step": 8690 + }, + { + "epoch": 0.7885138813282526, + "grad_norm": 0.1312482406894774, + "learning_rate": 0.0001127821169589408, + "loss": 1.3593, + "step": 8691 + }, + { + "epoch": 0.7886046089638904, + "grad_norm": 0.11713943050826822, + "learning_rate": 0.00011268917995140648, + "loss": 1.3933, + "step": 8692 + }, + { + "epoch": 0.7886953365995282, + "grad_norm": 0.11397054309950379, + "learning_rate": 0.00011259627638821851, + "loss": 1.3772, + "step": 8693 + }, + { + "epoch": 0.7887860642351661, + "grad_norm": 0.11399286557683955, + "learning_rate": 0.00011250340627739908, + "loss": 1.3485, + "step": 8694 + }, + { + "epoch": 0.7888767918708038, + "grad_norm": 0.13878542644257902, + "learning_rate": 0.00011241056962696755, + "loss": 1.3462, + "step": 8695 + }, + { + "epoch": 0.7889675195064416, + "grad_norm": 0.12261959112311341, + "learning_rate": 
0.00011231776644494068, + "loss": 1.3317, + "step": 8696 + }, + { + "epoch": 0.7890582471420795, + "grad_norm": 0.1260989159936741, + "learning_rate": 0.00011222499673933179, + "loss": 1.3664, + "step": 8697 + }, + { + "epoch": 0.7891489747777173, + "grad_norm": 0.12048026817700175, + "learning_rate": 0.0001121322605181514, + "loss": 1.4216, + "step": 8698 + }, + { + "epoch": 0.7892397024133551, + "grad_norm": 0.1158858483670653, + "learning_rate": 0.00011203955778940767, + "loss": 1.3445, + "step": 8699 + }, + { + "epoch": 0.789330430048993, + "grad_norm": 0.12112904312552654, + "learning_rate": 0.0001119468885611053, + "loss": 1.3813, + "step": 8700 + }, + { + "epoch": 0.7894211576846307, + "grad_norm": 0.11557173215471177, + "learning_rate": 0.00011185425284124611, + "loss": 1.3282, + "step": 8701 + }, + { + "epoch": 0.7895118853202685, + "grad_norm": 0.12058108549112842, + "learning_rate": 0.00011176165063782956, + "loss": 1.3484, + "step": 8702 + }, + { + "epoch": 0.7896026129559064, + "grad_norm": 0.11383342112240709, + "learning_rate": 0.00011166908195885156, + "loss": 1.3606, + "step": 8703 + }, + { + "epoch": 0.7896933405915442, + "grad_norm": 0.12385581944306541, + "learning_rate": 0.00011157654681230573, + "loss": 1.3862, + "step": 8704 + }, + { + "epoch": 0.789784068227182, + "grad_norm": 0.6031863833368938, + "learning_rate": 0.00011148404520618216, + "loss": 1.3639, + "step": 8705 + }, + { + "epoch": 0.7898747958628198, + "grad_norm": 0.11299542704879645, + "learning_rate": 0.0001113915771484687, + "loss": 1.3023, + "step": 8706 + }, + { + "epoch": 0.7899655234984576, + "grad_norm": 0.12076098844737775, + "learning_rate": 0.00011129914264714985, + "loss": 1.3472, + "step": 8707 + }, + { + "epoch": 0.7900562511340955, + "grad_norm": 0.11675482335111553, + "learning_rate": 0.00011120674171020717, + "loss": 1.3372, + "step": 8708 + }, + { + "epoch": 0.7901469787697333, + "grad_norm": 0.12329639561591571, + "learning_rate": 0.00011111437434561978, + "loss": 1.3576, + "step": 8709 + }, + { + "epoch": 0.790237706405371, + "grad_norm": 0.12034572212394126, + "learning_rate": 0.00011102204056136333, + "loss": 1.3356, + "step": 8710 + }, + { + "epoch": 0.7903284340410089, + "grad_norm": 0.13055422738727687, + "learning_rate": 0.00011092974036541114, + "loss": 1.3229, + "step": 8711 + }, + { + "epoch": 0.7904191616766467, + "grad_norm": 0.11632240495709352, + "learning_rate": 0.00011083747376573312, + "loss": 1.3746, + "step": 8712 + }, + { + "epoch": 0.7905098893122845, + "grad_norm": 0.11665186418344965, + "learning_rate": 0.00011074524077029668, + "loss": 1.3472, + "step": 8713 + }, + { + "epoch": 0.7906006169479224, + "grad_norm": 0.11390236491390457, + "learning_rate": 0.00011065304138706606, + "loss": 1.347, + "step": 8714 + }, + { + "epoch": 0.7906913445835602, + "grad_norm": 0.11514253269547407, + "learning_rate": 0.00011056087562400264, + "loss": 1.3889, + "step": 8715 + }, + { + "epoch": 0.7907820722191979, + "grad_norm": 0.12940642625589274, + "learning_rate": 0.00011046874348906505, + "loss": 1.3642, + "step": 8716 + }, + { + "epoch": 0.7908727998548358, + "grad_norm": 0.12902409678947027, + "learning_rate": 0.00011037664499020894, + "loss": 1.3246, + "step": 8717 + }, + { + "epoch": 0.7909635274904736, + "grad_norm": 0.11945911928050014, + "learning_rate": 0.00011028458013538678, + "loss": 1.3621, + "step": 8718 + }, + { + "epoch": 0.7910542551261114, + "grad_norm": 0.12184026817302036, + "learning_rate": 0.00011019254893254859, + "loss": 1.3585, + "step": 8719 + }, + { + 
"epoch": 0.7911449827617493, + "grad_norm": 0.11264798958088228, + "learning_rate": 0.00011010055138964142, + "loss": 1.3386, + "step": 8720 + }, + { + "epoch": 0.791235710397387, + "grad_norm": 0.11444050442353854, + "learning_rate": 0.00011000858751460907, + "loss": 1.3633, + "step": 8721 + }, + { + "epoch": 0.7913264380330248, + "grad_norm": 0.11802075578161993, + "learning_rate": 0.00010991665731539251, + "loss": 1.3243, + "step": 8722 + }, + { + "epoch": 0.7914171656686627, + "grad_norm": 0.12435237594412776, + "learning_rate": 0.00010982476079993025, + "loss": 1.3549, + "step": 8723 + }, + { + "epoch": 0.7915078933043005, + "grad_norm": 0.13048633314300204, + "learning_rate": 0.00010973289797615738, + "loss": 1.3583, + "step": 8724 + }, + { + "epoch": 0.7915986209399383, + "grad_norm": 0.11533527834545006, + "learning_rate": 0.00010964106885200614, + "loss": 1.3375, + "step": 8725 + }, + { + "epoch": 0.7916893485755762, + "grad_norm": 0.12481438655830081, + "learning_rate": 0.00010954927343540616, + "loss": 1.3535, + "step": 8726 + }, + { + "epoch": 0.7917800762112139, + "grad_norm": 0.11739220980832533, + "learning_rate": 0.00010945751173428409, + "loss": 1.3715, + "step": 8727 + }, + { + "epoch": 0.7918708038468517, + "grad_norm": 0.12246351004815337, + "learning_rate": 0.00010936578375656348, + "loss": 1.3619, + "step": 8728 + }, + { + "epoch": 0.7919615314824896, + "grad_norm": 0.12324509174516766, + "learning_rate": 0.00010927408951016482, + "loss": 1.407, + "step": 8729 + }, + { + "epoch": 0.7920522591181274, + "grad_norm": 0.13042665987063923, + "learning_rate": 0.0001091824290030063, + "loss": 1.3766, + "step": 8730 + }, + { + "epoch": 0.7921429867537652, + "grad_norm": 0.130552654841721, + "learning_rate": 0.00010909080224300261, + "loss": 1.3757, + "step": 8731 + }, + { + "epoch": 0.792233714389403, + "grad_norm": 0.11384060922646924, + "learning_rate": 0.00010899920923806567, + "loss": 1.3117, + "step": 8732 + }, + { + "epoch": 0.7923244420250408, + "grad_norm": 0.11489100942556366, + "learning_rate": 0.0001089076499961047, + "loss": 1.3593, + "step": 8733 + }, + { + "epoch": 0.7924151696606786, + "grad_norm": 0.11979676500104226, + "learning_rate": 0.0001088161245250257, + "loss": 1.3785, + "step": 8734 + }, + { + "epoch": 0.7925058972963165, + "grad_norm": 0.11616144567001686, + "learning_rate": 0.00010872463283273215, + "loss": 1.3697, + "step": 8735 + }, + { + "epoch": 0.7925966249319543, + "grad_norm": 0.11778146957982578, + "learning_rate": 0.0001086331749271241, + "loss": 1.3632, + "step": 8736 + }, + { + "epoch": 0.792687352567592, + "grad_norm": 0.11566220175724772, + "learning_rate": 0.00010854175081609918, + "loss": 1.3689, + "step": 8737 + }, + { + "epoch": 0.7927780802032299, + "grad_norm": 0.12706394809697766, + "learning_rate": 0.00010845036050755174, + "loss": 1.3526, + "step": 8738 + }, + { + "epoch": 0.7928688078388677, + "grad_norm": 0.11435519025865358, + "learning_rate": 0.00010835900400937332, + "loss": 1.3909, + "step": 8739 + }, + { + "epoch": 0.7929595354745055, + "grad_norm": 0.11683449443907487, + "learning_rate": 0.0001082676813294527, + "loss": 1.3456, + "step": 8740 + }, + { + "epoch": 0.7930502631101434, + "grad_norm": 0.12904149079896723, + "learning_rate": 0.00010817639247567556, + "loss": 1.3903, + "step": 8741 + }, + { + "epoch": 0.7931409907457811, + "grad_norm": 0.12277916465411956, + "learning_rate": 0.00010808513745592458, + "loss": 1.3998, + "step": 8742 + }, + { + "epoch": 0.7932317183814189, + "grad_norm": 0.12254485196354557, + 
"learning_rate": 0.00010799391627807969, + "loss": 1.3935, + "step": 8743 + }, + { + "epoch": 0.7933224460170568, + "grad_norm": 0.11948442639574004, + "learning_rate": 0.00010790272895001801, + "loss": 1.3602, + "step": 8744 + }, + { + "epoch": 0.7934131736526946, + "grad_norm": 0.11955804089343072, + "learning_rate": 0.00010781157547961356, + "loss": 1.3803, + "step": 8745 + }, + { + "epoch": 0.7935039012883325, + "grad_norm": 0.1152429060290202, + "learning_rate": 0.00010772045587473716, + "loss": 1.3675, + "step": 8746 + }, + { + "epoch": 0.7935946289239703, + "grad_norm": 0.1265710353820021, + "learning_rate": 0.00010762937014325742, + "loss": 1.3619, + "step": 8747 + }, + { + "epoch": 0.793685356559608, + "grad_norm": 0.12224544951627535, + "learning_rate": 0.00010753831829303934, + "loss": 1.384, + "step": 8748 + }, + { + "epoch": 0.7937760841952459, + "grad_norm": 0.12296005807513906, + "learning_rate": 0.00010744730033194505, + "loss": 1.3707, + "step": 8749 + }, + { + "epoch": 0.7938668118308837, + "grad_norm": 0.11444245071064381, + "learning_rate": 0.00010735631626783455, + "loss": 1.3565, + "step": 8750 + }, + { + "epoch": 0.7939575394665215, + "grad_norm": 0.18232979335531954, + "learning_rate": 0.00010726536610856397, + "loss": 1.3451, + "step": 8751 + }, + { + "epoch": 0.7940482671021594, + "grad_norm": 0.12682914306045875, + "learning_rate": 0.00010717444986198676, + "loss": 1.3234, + "step": 8752 + }, + { + "epoch": 0.7941389947377971, + "grad_norm": 0.1288000084056752, + "learning_rate": 0.00010708356753595382, + "loss": 1.3561, + "step": 8753 + }, + { + "epoch": 0.7942297223734349, + "grad_norm": 0.1276618085413782, + "learning_rate": 0.00010699271913831282, + "loss": 1.3767, + "step": 8754 + }, + { + "epoch": 0.7943204500090728, + "grad_norm": 0.11563288167506047, + "learning_rate": 0.00010690190467690836, + "loss": 1.394, + "step": 8755 + }, + { + "epoch": 0.7944111776447106, + "grad_norm": 0.11016709168963283, + "learning_rate": 0.00010681112415958228, + "loss": 1.3996, + "step": 8756 + }, + { + "epoch": 0.7945019052803484, + "grad_norm": 0.11686487682912211, + "learning_rate": 0.00010672037759417357, + "loss": 1.3781, + "step": 8757 + }, + { + "epoch": 0.7945926329159863, + "grad_norm": 0.12362520407775567, + "learning_rate": 0.00010662966498851834, + "loss": 1.3158, + "step": 8758 + }, + { + "epoch": 0.794683360551624, + "grad_norm": 0.11896672236555265, + "learning_rate": 0.0001065389863504494, + "loss": 1.3787, + "step": 8759 + }, + { + "epoch": 0.7947740881872618, + "grad_norm": 0.11944389618841388, + "learning_rate": 0.0001064483416877971, + "loss": 1.333, + "step": 8760 + }, + { + "epoch": 0.7948648158228997, + "grad_norm": 0.12769324415726016, + "learning_rate": 0.00010635773100838853, + "loss": 1.3525, + "step": 8761 + }, + { + "epoch": 0.7949555434585375, + "grad_norm": 0.12119286481113495, + "learning_rate": 0.00010626715432004774, + "loss": 1.3843, + "step": 8762 + }, + { + "epoch": 0.7950462710941752, + "grad_norm": 0.11326474840600508, + "learning_rate": 0.00010617661163059633, + "loss": 1.3828, + "step": 8763 + }, + { + "epoch": 0.7951369987298131, + "grad_norm": 0.12236168459828657, + "learning_rate": 0.0001060861029478526, + "loss": 1.3329, + "step": 8764 + }, + { + "epoch": 0.7952277263654509, + "grad_norm": 0.11271178180876626, + "learning_rate": 0.00010599562827963172, + "loss": 1.3613, + "step": 8765 + }, + { + "epoch": 0.7953184540010887, + "grad_norm": 0.1297991048338422, + "learning_rate": 0.00010590518763374645, + "loss": 1.3789, + "step": 
8766 + }, + { + "epoch": 0.7954091816367266, + "grad_norm": 0.11821863698061182, + "learning_rate": 0.00010581478101800645, + "loss": 1.3302, + "step": 8767 + }, + { + "epoch": 0.7954999092723644, + "grad_norm": 0.12067477010496735, + "learning_rate": 0.00010572440844021824, + "loss": 1.3774, + "step": 8768 + }, + { + "epoch": 0.7955906369080021, + "grad_norm": 0.12102365649380666, + "learning_rate": 0.0001056340699081853, + "loss": 1.3098, + "step": 8769 + }, + { + "epoch": 0.79568136454364, + "grad_norm": 0.13150106907542575, + "learning_rate": 0.00010554376542970867, + "loss": 1.3343, + "step": 8770 + }, + { + "epoch": 0.7957720921792778, + "grad_norm": 0.11919035938094481, + "learning_rate": 0.00010545349501258605, + "loss": 1.3597, + "step": 8771 + }, + { + "epoch": 0.7958628198149156, + "grad_norm": 0.13358256464059312, + "learning_rate": 0.00010536325866461216, + "loss": 1.3499, + "step": 8772 + }, + { + "epoch": 0.7959535474505535, + "grad_norm": 0.12287676305721752, + "learning_rate": 0.00010527305639357904, + "loss": 1.3294, + "step": 8773 + }, + { + "epoch": 0.7960442750861912, + "grad_norm": 0.11486048241094896, + "learning_rate": 0.00010518288820727578, + "loss": 1.3483, + "step": 8774 + }, + { + "epoch": 0.796135002721829, + "grad_norm": 0.11926523429268138, + "learning_rate": 0.00010509275411348835, + "loss": 1.3419, + "step": 8775 + }, + { + "epoch": 0.7962257303574669, + "grad_norm": 0.1413860978045919, + "learning_rate": 0.00010500265411999966, + "loss": 1.3587, + "step": 8776 + }, + { + "epoch": 0.7963164579931047, + "grad_norm": 0.1419606787986257, + "learning_rate": 0.0001049125882345901, + "loss": 1.3723, + "step": 8777 + }, + { + "epoch": 0.7964071856287425, + "grad_norm": 0.11453515074226603, + "learning_rate": 0.00010482255646503686, + "loss": 1.3806, + "step": 8778 + }, + { + "epoch": 0.7964979132643804, + "grad_norm": 0.2878105444655748, + "learning_rate": 0.00010473255881911392, + "loss": 1.3232, + "step": 8779 + }, + { + "epoch": 0.7965886409000181, + "grad_norm": 0.12218502028614413, + "learning_rate": 0.00010464259530459291, + "loss": 1.3559, + "step": 8780 + }, + { + "epoch": 0.7966793685356559, + "grad_norm": 0.13814550030487757, + "learning_rate": 0.00010455266592924195, + "loss": 1.4005, + "step": 8781 + }, + { + "epoch": 0.7967700961712938, + "grad_norm": 0.12097510813577754, + "learning_rate": 0.00010446277070082672, + "loss": 1.3626, + "step": 8782 + }, + { + "epoch": 0.7968608238069316, + "grad_norm": 0.11602692616587516, + "learning_rate": 0.00010437290962710938, + "loss": 1.3282, + "step": 8783 + }, + { + "epoch": 0.7969515514425695, + "grad_norm": 0.1304474371723329, + "learning_rate": 0.00010428308271584974, + "loss": 1.3648, + "step": 8784 + }, + { + "epoch": 0.7970422790782072, + "grad_norm": 0.17831991926758706, + "learning_rate": 0.00010419328997480426, + "loss": 1.3319, + "step": 8785 + }, + { + "epoch": 0.797133006713845, + "grad_norm": 0.12289211327634791, + "learning_rate": 0.00010410353141172635, + "loss": 1.3366, + "step": 8786 + }, + { + "epoch": 0.7972237343494829, + "grad_norm": 0.11895058180294592, + "learning_rate": 0.00010401380703436702, + "loss": 1.3664, + "step": 8787 + }, + { + "epoch": 0.7973144619851207, + "grad_norm": 0.11830985256534382, + "learning_rate": 0.00010392411685047365, + "loss": 1.3519, + "step": 8788 + }, + { + "epoch": 0.7974051896207585, + "grad_norm": 0.13019956390003543, + "learning_rate": 0.0001038344608677913, + "loss": 1.3691, + "step": 8789 + }, + { + "epoch": 0.7974959172563963, + "grad_norm": 
0.1178632446309529, + "learning_rate": 0.00010374483909406151, + "loss": 1.3527, + "step": 8790 + }, + { + "epoch": 0.7975866448920341, + "grad_norm": 0.11945584279720661, + "learning_rate": 0.00010365525153702344, + "loss": 1.3198, + "step": 8791 + }, + { + "epoch": 0.7976773725276719, + "grad_norm": 0.12077387253257817, + "learning_rate": 0.00010356569820441281, + "loss": 1.3534, + "step": 8792 + }, + { + "epoch": 0.7977681001633098, + "grad_norm": 0.12133363021692203, + "learning_rate": 0.0001034761791039624, + "loss": 1.3662, + "step": 8793 + }, + { + "epoch": 0.7978588277989476, + "grad_norm": 0.13264854598040507, + "learning_rate": 0.00010338669424340247, + "loss": 1.4153, + "step": 8794 + }, + { + "epoch": 0.7979495554345853, + "grad_norm": 0.12241286858741182, + "learning_rate": 0.00010329724363046, + "loss": 1.345, + "step": 8795 + }, + { + "epoch": 0.7980402830702232, + "grad_norm": 0.11547638672785411, + "learning_rate": 0.00010320782727285888, + "loss": 1.3382, + "step": 8796 + }, + { + "epoch": 0.798131010705861, + "grad_norm": 0.12504923729592832, + "learning_rate": 0.00010311844517832037, + "loss": 1.3804, + "step": 8797 + }, + { + "epoch": 0.7982217383414988, + "grad_norm": 0.19342540741974773, + "learning_rate": 0.00010302909735456268, + "loss": 1.3746, + "step": 8798 + }, + { + "epoch": 0.7983124659771367, + "grad_norm": 0.11700873760507449, + "learning_rate": 0.00010293978380930096, + "loss": 1.3851, + "step": 8799 + }, + { + "epoch": 0.7984031936127745, + "grad_norm": 0.12433892381507733, + "learning_rate": 0.00010285050455024736, + "loss": 1.3493, + "step": 8800 + }, + { + "epoch": 0.7984939212484122, + "grad_norm": 0.11751818831677009, + "learning_rate": 0.00010276125958511129, + "loss": 1.3761, + "step": 8801 + }, + { + "epoch": 0.7985846488840501, + "grad_norm": 0.11096285881289297, + "learning_rate": 0.00010267204892159904, + "loss": 1.3815, + "step": 8802 + }, + { + "epoch": 0.7986753765196879, + "grad_norm": 0.11989105147927463, + "learning_rate": 0.00010258287256741378, + "loss": 1.3405, + "step": 8803 + }, + { + "epoch": 0.7987661041553257, + "grad_norm": 0.12031412738125909, + "learning_rate": 0.00010249373053025601, + "loss": 1.399, + "step": 8804 + }, + { + "epoch": 0.7988568317909636, + "grad_norm": 0.11979096666527328, + "learning_rate": 0.00010240462281782337, + "loss": 1.3541, + "step": 8805 + }, + { + "epoch": 0.7989475594266013, + "grad_norm": 0.116080007738003, + "learning_rate": 0.00010231554943781019, + "loss": 1.3486, + "step": 8806 + }, + { + "epoch": 0.7990382870622391, + "grad_norm": 0.11539972477661518, + "learning_rate": 0.0001022265103979077, + "loss": 1.3485, + "step": 8807 + }, + { + "epoch": 0.799129014697877, + "grad_norm": 0.11601050369009658, + "learning_rate": 0.00010213750570580488, + "loss": 1.348, + "step": 8808 + }, + { + "epoch": 0.7992197423335148, + "grad_norm": 0.11761649266255243, + "learning_rate": 0.00010204853536918701, + "loss": 1.3401, + "step": 8809 + }, + { + "epoch": 0.7993104699691526, + "grad_norm": 0.11192230581811811, + "learning_rate": 0.00010195959939573668, + "loss": 1.3727, + "step": 8810 + }, + { + "epoch": 0.7994011976047904, + "grad_norm": 0.11777988013441851, + "learning_rate": 0.00010187069779313374, + "loss": 1.3489, + "step": 8811 + }, + { + "epoch": 0.7994919252404282, + "grad_norm": 0.14430511890848924, + "learning_rate": 0.00010178183056905454, + "loss": 1.3502, + "step": 8812 + }, + { + "epoch": 0.799582652876066, + "grad_norm": 0.15646740352873437, + "learning_rate": 0.00010169299773117313, + 
"loss": 1.349, + "step": 8813 + }, + { + "epoch": 0.7996733805117039, + "grad_norm": 0.11550332102970111, + "learning_rate": 0.00010160419928715997, + "loss": 1.3888, + "step": 8814 + }, + { + "epoch": 0.7997641081473417, + "grad_norm": 0.13201624723511932, + "learning_rate": 0.00010151543524468298, + "loss": 1.4129, + "step": 8815 + }, + { + "epoch": 0.7998548357829794, + "grad_norm": 0.11826937048095265, + "learning_rate": 0.00010142670561140694, + "loss": 1.3324, + "step": 8816 + }, + { + "epoch": 0.7999455634186173, + "grad_norm": 0.11996485927697895, + "learning_rate": 0.00010133801039499346, + "loss": 1.3498, + "step": 8817 + }, + { + "epoch": 0.8000362910542551, + "grad_norm": 0.11645945059421134, + "learning_rate": 0.00010124934960310172, + "loss": 1.3472, + "step": 8818 + }, + { + "epoch": 0.8001270186898929, + "grad_norm": 0.11037695602375146, + "learning_rate": 0.00010116072324338738, + "loss": 1.3596, + "step": 8819 + }, + { + "epoch": 0.8002177463255308, + "grad_norm": 0.11412992953185634, + "learning_rate": 0.00010107213132350307, + "loss": 1.3769, + "step": 8820 + }, + { + "epoch": 0.8003084739611686, + "grad_norm": 0.11061326930753153, + "learning_rate": 0.0001009835738510993, + "loss": 1.3098, + "step": 8821 + }, + { + "epoch": 0.8003992015968064, + "grad_norm": 0.12237619075162193, + "learning_rate": 0.00010089505083382278, + "loss": 1.3277, + "step": 8822 + }, + { + "epoch": 0.8004899292324442, + "grad_norm": 0.12934628076803473, + "learning_rate": 0.0001008065622793174, + "loss": 1.3811, + "step": 8823 + }, + { + "epoch": 0.800580656868082, + "grad_norm": 0.11045191549435439, + "learning_rate": 0.0001007181081952241, + "loss": 1.3612, + "step": 8824 + }, + { + "epoch": 0.8006713845037199, + "grad_norm": 0.1135721639121763, + "learning_rate": 0.00010062968858918109, + "loss": 1.3633, + "step": 8825 + }, + { + "epoch": 0.8007621121393577, + "grad_norm": 0.12245959937055433, + "learning_rate": 0.00010054130346882334, + "loss": 1.4044, + "step": 8826 + }, + { + "epoch": 0.8008528397749954, + "grad_norm": 0.13188118342940217, + "learning_rate": 0.00010045295284178268, + "loss": 1.3831, + "step": 8827 + }, + { + "epoch": 0.8009435674106333, + "grad_norm": 0.11132327025724692, + "learning_rate": 0.00010036463671568868, + "loss": 1.3354, + "step": 8828 + }, + { + "epoch": 0.8010342950462711, + "grad_norm": 0.12435880245310296, + "learning_rate": 0.00010027635509816718, + "loss": 1.3404, + "step": 8829 + }, + { + "epoch": 0.8011250226819089, + "grad_norm": 0.11751033113112315, + "learning_rate": 0.00010018810799684124, + "loss": 1.348, + "step": 8830 + }, + { + "epoch": 0.8012157503175468, + "grad_norm": 0.11076039744550385, + "learning_rate": 0.00010009989541933118, + "loss": 1.4082, + "step": 8831 + }, + { + "epoch": 0.8013064779531845, + "grad_norm": 0.11900225540329923, + "learning_rate": 0.00010001171737325416, + "loss": 1.3958, + "step": 8832 + }, + { + "epoch": 0.8013972055888223, + "grad_norm": 0.10945376005615251, + "learning_rate": 9.992357386622425e-05, + "loss": 1.3695, + "step": 8833 + }, + { + "epoch": 0.8014879332244602, + "grad_norm": 0.114068888031458, + "learning_rate": 9.983546490585255e-05, + "loss": 1.3588, + "step": 8834 + }, + { + "epoch": 0.801578660860098, + "grad_norm": 0.11185473152078261, + "learning_rate": 9.974739049974745e-05, + "loss": 1.347, + "step": 8835 + }, + { + "epoch": 0.8016693884957358, + "grad_norm": 0.12001229389483525, + "learning_rate": 9.965935065551428e-05, + "loss": 1.3459, + "step": 8836 + }, + { + "epoch": 0.8017601161313737, + 
"grad_norm": 0.13769525704423416, + "learning_rate": 9.957134538075512e-05, + "loss": 1.3276, + "step": 8837 + }, + { + "epoch": 0.8018508437670114, + "grad_norm": 0.17393654367917946, + "learning_rate": 9.94833746830694e-05, + "loss": 1.353, + "step": 8838 + }, + { + "epoch": 0.8019415714026492, + "grad_norm": 0.11368838436016872, + "learning_rate": 9.939543857005329e-05, + "loss": 1.346, + "step": 8839 + }, + { + "epoch": 0.8020322990382871, + "grad_norm": 0.12250417697365724, + "learning_rate": 9.930753704930001e-05, + "loss": 1.3277, + "step": 8840 + }, + { + "epoch": 0.8021230266739249, + "grad_norm": 0.11658008363168547, + "learning_rate": 9.921967012840005e-05, + "loss": 1.3611, + "step": 8841 + }, + { + "epoch": 0.8022137543095627, + "grad_norm": 0.11686614328118898, + "learning_rate": 9.913183781494067e-05, + "loss": 1.3915, + "step": 8842 + }, + { + "epoch": 0.8023044819452005, + "grad_norm": 0.11574498646038457, + "learning_rate": 9.904404011650603e-05, + "loss": 1.3736, + "step": 8843 + }, + { + "epoch": 0.8023952095808383, + "grad_norm": 0.11400787734521509, + "learning_rate": 9.895627704067766e-05, + "loss": 1.3571, + "step": 8844 + }, + { + "epoch": 0.8024859372164761, + "grad_norm": 0.11868276291211985, + "learning_rate": 9.886854859503397e-05, + "loss": 1.3392, + "step": 8845 + }, + { + "epoch": 0.802576664852114, + "grad_norm": 0.1181044579259869, + "learning_rate": 9.878085478715021e-05, + "loss": 1.3306, + "step": 8846 + }, + { + "epoch": 0.8026673924877518, + "grad_norm": 0.12276147777979277, + "learning_rate": 9.869319562459866e-05, + "loss": 1.3594, + "step": 8847 + }, + { + "epoch": 0.8027581201233895, + "grad_norm": 0.11616683620644679, + "learning_rate": 9.8605571114949e-05, + "loss": 1.3523, + "step": 8848 + }, + { + "epoch": 0.8028488477590274, + "grad_norm": 0.12430853192479621, + "learning_rate": 9.851798126576739e-05, + "loss": 1.3529, + "step": 8849 + }, + { + "epoch": 0.8029395753946652, + "grad_norm": 0.11755602574766451, + "learning_rate": 9.843042608461711e-05, + "loss": 1.3814, + "step": 8850 + }, + { + "epoch": 0.803030303030303, + "grad_norm": 0.11519462805666747, + "learning_rate": 9.834290557905872e-05, + "loss": 1.3538, + "step": 8851 + }, + { + "epoch": 0.8031210306659409, + "grad_norm": 0.12724182547213303, + "learning_rate": 9.825541975664981e-05, + "loss": 1.3701, + "step": 8852 + }, + { + "epoch": 0.8032117583015786, + "grad_norm": 0.11664709844443505, + "learning_rate": 9.816796862494459e-05, + "loss": 1.337, + "step": 8853 + }, + { + "epoch": 0.8033024859372164, + "grad_norm": 0.13266467918143476, + "learning_rate": 9.808055219149436e-05, + "loss": 1.3792, + "step": 8854 + }, + { + "epoch": 0.8033932135728543, + "grad_norm": 0.1634568633984663, + "learning_rate": 9.799317046384787e-05, + "loss": 1.3532, + "step": 8855 + }, + { + "epoch": 0.8034839412084921, + "grad_norm": 0.11365096793583733, + "learning_rate": 9.790582344955035e-05, + "loss": 1.3945, + "step": 8856 + }, + { + "epoch": 0.8035746688441299, + "grad_norm": 0.11332787736984018, + "learning_rate": 9.781851115614404e-05, + "loss": 1.3641, + "step": 8857 + }, + { + "epoch": 0.8036653964797678, + "grad_norm": 0.11483252443284708, + "learning_rate": 9.773123359116875e-05, + "loss": 1.3603, + "step": 8858 + }, + { + "epoch": 0.8037561241154055, + "grad_norm": 0.18565009124585458, + "learning_rate": 9.764399076216057e-05, + "loss": 1.3605, + "step": 8859 + }, + { + "epoch": 0.8038468517510434, + "grad_norm": 0.11612806119330406, + "learning_rate": 9.755678267665325e-05, + "loss": 
1.3628, + "step": 8860 + }, + { + "epoch": 0.8039375793866812, + "grad_norm": 0.11627257638671072, + "learning_rate": 9.74696093421769e-05, + "loss": 1.3581, + "step": 8861 + }, + { + "epoch": 0.804028307022319, + "grad_norm": 0.12747058784848367, + "learning_rate": 9.738247076625928e-05, + "loss": 1.3732, + "step": 8862 + }, + { + "epoch": 0.8041190346579569, + "grad_norm": 0.12014471875446779, + "learning_rate": 9.729536695642461e-05, + "loss": 1.3403, + "step": 8863 + }, + { + "epoch": 0.8042097622935946, + "grad_norm": 0.11671108083097402, + "learning_rate": 9.720829792019426e-05, + "loss": 1.3915, + "step": 8864 + }, + { + "epoch": 0.8043004899292324, + "grad_norm": 0.11640982969459551, + "learning_rate": 9.712126366508683e-05, + "loss": 1.3905, + "step": 8865 + }, + { + "epoch": 0.8043912175648703, + "grad_norm": 0.12298532501340093, + "learning_rate": 9.703426419861755e-05, + "loss": 1.3687, + "step": 8866 + }, + { + "epoch": 0.8044819452005081, + "grad_norm": 0.11784594112377944, + "learning_rate": 9.694729952829911e-05, + "loss": 1.3274, + "step": 8867 + }, + { + "epoch": 0.8045726728361459, + "grad_norm": 0.12751789170433217, + "learning_rate": 9.686036966164063e-05, + "loss": 1.3387, + "step": 8868 + }, + { + "epoch": 0.8046634004717838, + "grad_norm": 0.11693279849514704, + "learning_rate": 9.677347460614871e-05, + "loss": 1.3785, + "step": 8869 + }, + { + "epoch": 0.8047541281074215, + "grad_norm": 0.12813889670536352, + "learning_rate": 9.668661436932674e-05, + "loss": 1.3784, + "step": 8870 + }, + { + "epoch": 0.8048448557430593, + "grad_norm": 0.11852376537277547, + "learning_rate": 9.659978895867494e-05, + "loss": 1.3606, + "step": 8871 + }, + { + "epoch": 0.8049355833786972, + "grad_norm": 0.13951113035541032, + "learning_rate": 9.651299838169092e-05, + "loss": 1.331, + "step": 8872 + }, + { + "epoch": 0.805026311014335, + "grad_norm": 0.12116824016391449, + "learning_rate": 9.642624264586896e-05, + "loss": 1.3614, + "step": 8873 + }, + { + "epoch": 0.8051170386499727, + "grad_norm": 0.12288361405276137, + "learning_rate": 9.63395217587003e-05, + "loss": 1.3723, + "step": 8874 + }, + { + "epoch": 0.8052077662856106, + "grad_norm": 0.18735909901182002, + "learning_rate": 9.625283572767347e-05, + "loss": 1.3327, + "step": 8875 + }, + { + "epoch": 0.8052984939212484, + "grad_norm": 0.17324243017704882, + "learning_rate": 9.616618456027387e-05, + "loss": 1.3836, + "step": 8876 + }, + { + "epoch": 0.8053892215568862, + "grad_norm": 0.11608430815188382, + "learning_rate": 9.607956826398379e-05, + "loss": 1.3396, + "step": 8877 + }, + { + "epoch": 0.8054799491925241, + "grad_norm": 0.12228386052604502, + "learning_rate": 9.599298684628233e-05, + "loss": 1.3485, + "step": 8878 + }, + { + "epoch": 0.8055706768281619, + "grad_norm": 0.12515614655878812, + "learning_rate": 9.590644031464618e-05, + "loss": 1.3434, + "step": 8879 + }, + { + "epoch": 0.8056614044637996, + "grad_norm": 0.11812666169857708, + "learning_rate": 9.581992867654843e-05, + "loss": 1.3656, + "step": 8880 + }, + { + "epoch": 0.8057521320994375, + "grad_norm": 0.1185329794409334, + "learning_rate": 9.573345193945931e-05, + "loss": 1.308, + "step": 8881 + }, + { + "epoch": 0.8058428597350753, + "grad_norm": 0.11237743205140405, + "learning_rate": 9.56470101108462e-05, + "loss": 1.3758, + "step": 8882 + }, + { + "epoch": 0.8059335873707131, + "grad_norm": 0.12408907487407629, + "learning_rate": 9.556060319817344e-05, + "loss": 1.3285, + "step": 8883 + }, + { + "epoch": 0.806024315006351, + "grad_norm": 
0.1173869476283193, + "learning_rate": 9.547423120890224e-05, + "loss": 1.3517, + "step": 8884 + }, + { + "epoch": 0.8061150426419887, + "grad_norm": 0.12057418465486297, + "learning_rate": 9.538789415049071e-05, + "loss": 1.3346, + "step": 8885 + }, + { + "epoch": 0.8062057702776265, + "grad_norm": 0.11990862381190871, + "learning_rate": 9.530159203039423e-05, + "loss": 1.3411, + "step": 8886 + }, + { + "epoch": 0.8062964979132644, + "grad_norm": 0.11764042578542672, + "learning_rate": 9.521532485606493e-05, + "loss": 1.35, + "step": 8887 + }, + { + "epoch": 0.8063872255489022, + "grad_norm": 0.11963302630944236, + "learning_rate": 9.512909263495184e-05, + "loss": 1.3192, + "step": 8888 + }, + { + "epoch": 0.80647795318454, + "grad_norm": 0.2673506117189758, + "learning_rate": 9.504289537450145e-05, + "loss": 1.3865, + "step": 8889 + }, + { + "epoch": 0.8065686808201779, + "grad_norm": 0.12340095457505121, + "learning_rate": 9.495673308215657e-05, + "loss": 1.354, + "step": 8890 + }, + { + "epoch": 0.8066594084558156, + "grad_norm": 0.11353381547599717, + "learning_rate": 9.487060576535761e-05, + "loss": 1.3533, + "step": 8891 + }, + { + "epoch": 0.8067501360914534, + "grad_norm": 0.11777258955145989, + "learning_rate": 9.478451343154143e-05, + "loss": 1.3192, + "step": 8892 + }, + { + "epoch": 0.8068408637270913, + "grad_norm": 0.18764978031440435, + "learning_rate": 9.469845608814237e-05, + "loss": 1.3489, + "step": 8893 + }, + { + "epoch": 0.8069315913627291, + "grad_norm": 0.13284048681319544, + "learning_rate": 9.461243374259137e-05, + "loss": 1.3446, + "step": 8894 + }, + { + "epoch": 0.8070223189983668, + "grad_norm": 0.11545030552490806, + "learning_rate": 9.452644640231634e-05, + "loss": 1.3635, + "step": 8895 + }, + { + "epoch": 0.8071130466340047, + "grad_norm": 0.1269354909095149, + "learning_rate": 9.444049407474253e-05, + "loss": 1.3839, + "step": 8896 + }, + { + "epoch": 0.8072037742696425, + "grad_norm": 0.11942548053473552, + "learning_rate": 9.435457676729187e-05, + "loss": 1.3485, + "step": 8897 + }, + { + "epoch": 0.8072945019052804, + "grad_norm": 0.11684663811355872, + "learning_rate": 9.426869448738295e-05, + "loss": 1.413, + "step": 8898 + }, + { + "epoch": 0.8073852295409182, + "grad_norm": 0.12274716794758425, + "learning_rate": 9.418284724243237e-05, + "loss": 1.3756, + "step": 8899 + }, + { + "epoch": 0.807475957176556, + "grad_norm": 0.11780224048414514, + "learning_rate": 9.409703503985279e-05, + "loss": 1.3534, + "step": 8900 + }, + { + "epoch": 0.8075666848121938, + "grad_norm": 0.12149867551149539, + "learning_rate": 9.4011257887054e-05, + "loss": 1.4057, + "step": 8901 + }, + { + "epoch": 0.8076574124478316, + "grad_norm": 0.11796851069791497, + "learning_rate": 9.392551579144282e-05, + "loss": 1.3655, + "step": 8902 + }, + { + "epoch": 0.8077481400834694, + "grad_norm": 0.1281485817038632, + "learning_rate": 9.38398087604233e-05, + "loss": 1.4093, + "step": 8903 + }, + { + "epoch": 0.8078388677191073, + "grad_norm": 0.12114370566929117, + "learning_rate": 9.375413680139616e-05, + "loss": 1.3428, + "step": 8904 + }, + { + "epoch": 0.8079295953547451, + "grad_norm": 0.12046024543644664, + "learning_rate": 9.366849992175896e-05, + "loss": 1.3741, + "step": 8905 + }, + { + "epoch": 0.8080203229903828, + "grad_norm": 0.11976597797137588, + "learning_rate": 9.358289812890691e-05, + "loss": 1.3641, + "step": 8906 + }, + { + "epoch": 0.8081110506260207, + "grad_norm": 0.12033060312666553, + "learning_rate": 9.349733143023154e-05, + "loss": 1.3428, + "step": 8907 
+ }, + { + "epoch": 0.8082017782616585, + "grad_norm": 0.12355352015365785, + "learning_rate": 9.341179983312131e-05, + "loss": 1.3895, + "step": 8908 + }, + { + "epoch": 0.8082925058972963, + "grad_norm": 0.11701429964640148, + "learning_rate": 9.332630334496229e-05, + "loss": 1.3578, + "step": 8909 + }, + { + "epoch": 0.8083832335329342, + "grad_norm": 0.12203654588278756, + "learning_rate": 9.324084197313686e-05, + "loss": 1.3415, + "step": 8910 + }, + { + "epoch": 0.808473961168572, + "grad_norm": 0.12375966272221388, + "learning_rate": 9.315541572502473e-05, + "loss": 1.3296, + "step": 8911 + }, + { + "epoch": 0.8085646888042097, + "grad_norm": 0.1268788989806856, + "learning_rate": 9.307002460800223e-05, + "loss": 1.3382, + "step": 8912 + }, + { + "epoch": 0.8086554164398476, + "grad_norm": 0.12557976351378178, + "learning_rate": 9.298466862944316e-05, + "loss": 1.3255, + "step": 8913 + }, + { + "epoch": 0.8087461440754854, + "grad_norm": 0.17828305683147802, + "learning_rate": 9.289934779671799e-05, + "loss": 1.3787, + "step": 8914 + }, + { + "epoch": 0.8088368717111232, + "grad_norm": 0.13038707404515762, + "learning_rate": 9.281406211719406e-05, + "loss": 1.3568, + "step": 8915 + }, + { + "epoch": 0.8089275993467611, + "grad_norm": 0.13384872056486402, + "learning_rate": 9.272881159823599e-05, + "loss": 1.3301, + "step": 8916 + }, + { + "epoch": 0.8090183269823988, + "grad_norm": 0.12358717738293085, + "learning_rate": 9.264359624720509e-05, + "loss": 1.335, + "step": 8917 + }, + { + "epoch": 0.8091090546180366, + "grad_norm": 0.12583193410290913, + "learning_rate": 9.255841607145949e-05, + "loss": 1.3598, + "step": 8918 + }, + { + "epoch": 0.8091997822536745, + "grad_norm": 0.1164356419991848, + "learning_rate": 9.24732710783549e-05, + "loss": 1.3416, + "step": 8919 + }, + { + "epoch": 0.8092905098893123, + "grad_norm": 0.15325652397153391, + "learning_rate": 9.238816127524341e-05, + "loss": 1.3671, + "step": 8920 + }, + { + "epoch": 0.8093812375249501, + "grad_norm": 0.22186013699490384, + "learning_rate": 9.230308666947413e-05, + "loss": 1.37, + "step": 8921 + }, + { + "epoch": 0.809471965160588, + "grad_norm": 0.2533611095615178, + "learning_rate": 9.221804726839339e-05, + "loss": 1.3664, + "step": 8922 + }, + { + "epoch": 0.8095626927962257, + "grad_norm": 0.1305810567647746, + "learning_rate": 9.213304307934456e-05, + "loss": 1.3471, + "step": 8923 + }, + { + "epoch": 0.8096534204318635, + "grad_norm": 0.12412807050169972, + "learning_rate": 9.204807410966754e-05, + "loss": 1.3423, + "step": 8924 + }, + { + "epoch": 0.8097441480675014, + "grad_norm": 0.11714310142763333, + "learning_rate": 9.196314036669929e-05, + "loss": 1.3577, + "step": 8925 + }, + { + "epoch": 0.8098348757031392, + "grad_norm": 0.12709782483561408, + "learning_rate": 9.187824185777415e-05, + "loss": 1.3708, + "step": 8926 + }, + { + "epoch": 0.809925603338777, + "grad_norm": 0.12163305559074711, + "learning_rate": 9.179337859022297e-05, + "loss": 1.3908, + "step": 8927 + }, + { + "epoch": 0.8100163309744148, + "grad_norm": 0.12261659784167618, + "learning_rate": 9.170855057137362e-05, + "loss": 1.3553, + "step": 8928 + }, + { + "epoch": 0.8101070586100526, + "grad_norm": 0.11662219317465346, + "learning_rate": 9.162375780855109e-05, + "loss": 1.3706, + "step": 8929 + }, + { + "epoch": 0.8101977862456904, + "grad_norm": 0.1299236060717597, + "learning_rate": 9.153900030907735e-05, + "loss": 1.3607, + "step": 8930 + }, + { + "epoch": 0.8102885138813283, + "grad_norm": 0.13717157580533587, + 
"learning_rate": 9.145427808027118e-05, + "loss": 1.3536, + "step": 8931 + }, + { + "epoch": 0.810379241516966, + "grad_norm": 0.1663255376008074, + "learning_rate": 9.136959112944815e-05, + "loss": 1.3465, + "step": 8932 + }, + { + "epoch": 0.8104699691526038, + "grad_norm": 0.1202996733656865, + "learning_rate": 9.12849394639213e-05, + "loss": 1.3753, + "step": 8933 + }, + { + "epoch": 0.8105606967882417, + "grad_norm": 0.2087860361483655, + "learning_rate": 9.120032309100018e-05, + "loss": 1.3623, + "step": 8934 + }, + { + "epoch": 0.8106514244238795, + "grad_norm": 0.12308077171760523, + "learning_rate": 9.111574201799128e-05, + "loss": 1.3585, + "step": 8935 + }, + { + "epoch": 0.8107421520595173, + "grad_norm": 0.12097550996251917, + "learning_rate": 9.103119625219841e-05, + "loss": 1.3493, + "step": 8936 + }, + { + "epoch": 0.8108328796951552, + "grad_norm": 0.14276699648203875, + "learning_rate": 9.094668580092192e-05, + "loss": 1.3328, + "step": 8937 + }, + { + "epoch": 0.8109236073307929, + "grad_norm": 0.13063882778713412, + "learning_rate": 9.08622106714595e-05, + "loss": 1.3556, + "step": 8938 + }, + { + "epoch": 0.8110143349664308, + "grad_norm": 0.131329597380713, + "learning_rate": 9.07777708711054e-05, + "loss": 1.3773, + "step": 8939 + }, + { + "epoch": 0.8111050626020686, + "grad_norm": 0.12136960387352015, + "learning_rate": 9.06933664071512e-05, + "loss": 1.3466, + "step": 8940 + }, + { + "epoch": 0.8111957902377064, + "grad_norm": 0.14416231181729028, + "learning_rate": 9.060899728688515e-05, + "loss": 1.4004, + "step": 8941 + }, + { + "epoch": 0.8112865178733443, + "grad_norm": 0.11920196330429095, + "learning_rate": 9.052466351759242e-05, + "loss": 1.3276, + "step": 8942 + }, + { + "epoch": 0.811377245508982, + "grad_norm": 0.12407612025147233, + "learning_rate": 9.044036510655546e-05, + "loss": 1.3325, + "step": 8943 + }, + { + "epoch": 0.8114679731446198, + "grad_norm": 0.1285785336964737, + "learning_rate": 9.035610206105316e-05, + "loss": 1.3448, + "step": 8944 + }, + { + "epoch": 0.8115587007802577, + "grad_norm": 0.11685149446661489, + "learning_rate": 9.027187438836198e-05, + "loss": 1.3324, + "step": 8945 + }, + { + "epoch": 0.8116494284158955, + "grad_norm": 0.11773340358192982, + "learning_rate": 9.018768209575472e-05, + "loss": 1.3666, + "step": 8946 + }, + { + "epoch": 0.8117401560515333, + "grad_norm": 0.1416288265530007, + "learning_rate": 9.010352519050163e-05, + "loss": 1.3676, + "step": 8947 + }, + { + "epoch": 0.8118308836871712, + "grad_norm": 0.12419889822016911, + "learning_rate": 9.001940367986955e-05, + "loss": 1.3705, + "step": 8948 + }, + { + "epoch": 0.8119216113228089, + "grad_norm": 0.1233486214817043, + "learning_rate": 8.993531757112227e-05, + "loss": 1.3449, + "step": 8949 + }, + { + "epoch": 0.8120123389584467, + "grad_norm": 0.12442058414406847, + "learning_rate": 8.985126687152084e-05, + "loss": 1.35, + "step": 8950 + }, + { + "epoch": 0.8121030665940846, + "grad_norm": 0.11837284073836239, + "learning_rate": 8.976725158832305e-05, + "loss": 1.3626, + "step": 8951 + }, + { + "epoch": 0.8121937942297224, + "grad_norm": 0.123950844360878, + "learning_rate": 8.968327172878332e-05, + "loss": 1.382, + "step": 8952 + }, + { + "epoch": 0.8122845218653602, + "grad_norm": 0.1319733266166192, + "learning_rate": 8.959932730015363e-05, + "loss": 1.3624, + "step": 8953 + }, + { + "epoch": 0.812375249500998, + "grad_norm": 0.12315728875984162, + "learning_rate": 8.951541830968263e-05, + "loss": 1.3908, + "step": 8954 + }, + { + "epoch": 
0.8124659771366358, + "grad_norm": 0.11554366824498953, + "learning_rate": 8.943154476461573e-05, + "loss": 1.4022, + "step": 8955 + }, + { + "epoch": 0.8125567047722736, + "grad_norm": 0.11121088811996085, + "learning_rate": 8.934770667219533e-05, + "loss": 1.3523, + "step": 8956 + }, + { + "epoch": 0.8126474324079115, + "grad_norm": 0.11346158816391687, + "learning_rate": 8.926390403966111e-05, + "loss": 1.3918, + "step": 8957 + }, + { + "epoch": 0.8127381600435493, + "grad_norm": 0.11796273683298174, + "learning_rate": 8.91801368742493e-05, + "loss": 1.3605, + "step": 8958 + }, + { + "epoch": 0.812828887679187, + "grad_norm": 0.1817890278820212, + "learning_rate": 8.909640518319312e-05, + "loss": 1.3944, + "step": 8959 + }, + { + "epoch": 0.8129196153148249, + "grad_norm": 0.11616805234987733, + "learning_rate": 8.901270897372288e-05, + "loss": 1.3419, + "step": 8960 + }, + { + "epoch": 0.8130103429504627, + "grad_norm": 0.12244255246996179, + "learning_rate": 8.892904825306597e-05, + "loss": 1.3406, + "step": 8961 + }, + { + "epoch": 0.8131010705861005, + "grad_norm": 0.12481828717195716, + "learning_rate": 8.884542302844628e-05, + "loss": 1.3423, + "step": 8962 + }, + { + "epoch": 0.8131917982217384, + "grad_norm": 0.11439501122754858, + "learning_rate": 8.876183330708482e-05, + "loss": 1.3041, + "step": 8963 + }, + { + "epoch": 0.8132825258573761, + "grad_norm": 0.12078027750783547, + "learning_rate": 8.86782790961998e-05, + "loss": 1.3359, + "step": 8964 + }, + { + "epoch": 0.8133732534930139, + "grad_norm": 0.12044577682144826, + "learning_rate": 8.859476040300596e-05, + "loss": 1.3967, + "step": 8965 + }, + { + "epoch": 0.8134639811286518, + "grad_norm": 0.11351137170175055, + "learning_rate": 8.851127723471508e-05, + "loss": 1.3361, + "step": 8966 + }, + { + "epoch": 0.8135547087642896, + "grad_norm": 0.11724080984313986, + "learning_rate": 8.842782959853617e-05, + "loss": 1.3423, + "step": 8967 + }, + { + "epoch": 0.8136454363999274, + "grad_norm": 0.11571351360454171, + "learning_rate": 8.834441750167477e-05, + "loss": 1.3816, + "step": 8968 + }, + { + "epoch": 0.8137361640355653, + "grad_norm": 0.12220902979975895, + "learning_rate": 8.826104095133363e-05, + "loss": 1.3749, + "step": 8969 + }, + { + "epoch": 0.813826891671203, + "grad_norm": 0.1178842908503073, + "learning_rate": 8.81776999547122e-05, + "loss": 1.3535, + "step": 8970 + }, + { + "epoch": 0.8139176193068408, + "grad_norm": 0.13104541533218414, + "learning_rate": 8.809439451900725e-05, + "loss": 1.338, + "step": 8971 + }, + { + "epoch": 0.8140083469424787, + "grad_norm": 0.11501337986410584, + "learning_rate": 8.801112465141198e-05, + "loss": 1.3999, + "step": 8972 + }, + { + "epoch": 0.8140990745781165, + "grad_norm": 0.11678459030111622, + "learning_rate": 8.792789035911669e-05, + "loss": 1.3371, + "step": 8973 + }, + { + "epoch": 0.8141898022137543, + "grad_norm": 0.1268935219104004, + "learning_rate": 8.784469164930897e-05, + "loss": 1.3506, + "step": 8974 + }, + { + "epoch": 0.8142805298493921, + "grad_norm": 0.13119686174883088, + "learning_rate": 8.776152852917285e-05, + "loss": 1.3597, + "step": 8975 + }, + { + "epoch": 0.8143712574850299, + "grad_norm": 0.12574437483546827, + "learning_rate": 8.767840100588926e-05, + "loss": 1.3495, + "step": 8976 + }, + { + "epoch": 0.8144619851206678, + "grad_norm": 0.11799480022946764, + "learning_rate": 8.759530908663671e-05, + "loss": 1.3458, + "step": 8977 + }, + { + "epoch": 0.8145527127563056, + "grad_norm": 0.1212181470731246, + "learning_rate": 
8.751225277859004e-05, + "loss": 1.3204, + "step": 8978 + }, + { + "epoch": 0.8146434403919434, + "grad_norm": 0.12385034205232821, + "learning_rate": 8.742923208892117e-05, + "loss": 1.3441, + "step": 8979 + }, + { + "epoch": 0.8147341680275813, + "grad_norm": 0.11782740207714025, + "learning_rate": 8.734624702479877e-05, + "loss": 1.3344, + "step": 8980 + }, + { + "epoch": 0.814824895663219, + "grad_norm": 0.1354209571163626, + "learning_rate": 8.726329759338886e-05, + "loss": 1.3455, + "step": 8981 + }, + { + "epoch": 0.8149156232988568, + "grad_norm": 0.12000517822818312, + "learning_rate": 8.718038380185406e-05, + "loss": 1.3721, + "step": 8982 + }, + { + "epoch": 0.8150063509344947, + "grad_norm": 0.11822720297384309, + "learning_rate": 8.70975056573537e-05, + "loss": 1.3366, + "step": 8983 + }, + { + "epoch": 0.8150970785701325, + "grad_norm": 0.12076943175014117, + "learning_rate": 8.701466316704482e-05, + "loss": 1.3313, + "step": 8984 + }, + { + "epoch": 0.8151878062057702, + "grad_norm": 0.12570118080052192, + "learning_rate": 8.693185633808064e-05, + "loss": 1.3851, + "step": 8985 + }, + { + "epoch": 0.8152785338414081, + "grad_norm": 0.1301692989377024, + "learning_rate": 8.684908517761147e-05, + "loss": 1.3164, + "step": 8986 + }, + { + "epoch": 0.8153692614770459, + "grad_norm": 0.12091986691875946, + "learning_rate": 8.676634969278474e-05, + "loss": 1.3651, + "step": 8987 + }, + { + "epoch": 0.8154599891126837, + "grad_norm": 0.11231544511388859, + "learning_rate": 8.668364989074468e-05, + "loss": 1.3732, + "step": 8988 + }, + { + "epoch": 0.8155507167483216, + "grad_norm": 0.12769433743847042, + "learning_rate": 8.660098577863235e-05, + "loss": 1.3398, + "step": 8989 + }, + { + "epoch": 0.8156414443839594, + "grad_norm": 0.1319505879491962, + "learning_rate": 8.65183573635857e-05, + "loss": 1.345, + "step": 8990 + }, + { + "epoch": 0.8157321720195971, + "grad_norm": 0.12286994456234801, + "learning_rate": 8.643576465273983e-05, + "loss": 1.3729, + "step": 8991 + }, + { + "epoch": 0.815822899655235, + "grad_norm": 0.11784057848615914, + "learning_rate": 8.635320765322674e-05, + "loss": 1.3597, + "step": 8992 + }, + { + "epoch": 0.8159136272908728, + "grad_norm": 0.11803805885888023, + "learning_rate": 8.627068637217505e-05, + "loss": 1.3194, + "step": 8993 + }, + { + "epoch": 0.8160043549265106, + "grad_norm": 0.12041593212010483, + "learning_rate": 8.618820081671064e-05, + "loss": 1.3177, + "step": 8994 + }, + { + "epoch": 0.8160950825621485, + "grad_norm": 0.120004386757693, + "learning_rate": 8.610575099395613e-05, + "loss": 1.3688, + "step": 8995 + }, + { + "epoch": 0.8161858101977862, + "grad_norm": 0.11516914869169513, + "learning_rate": 8.602333691103087e-05, + "loss": 1.3287, + "step": 8996 + }, + { + "epoch": 0.816276537833424, + "grad_norm": 0.12182219036473735, + "learning_rate": 8.594095857505163e-05, + "loss": 1.3575, + "step": 8997 + }, + { + "epoch": 0.8163672654690619, + "grad_norm": 0.15287305162245052, + "learning_rate": 8.58586159931316e-05, + "loss": 1.3223, + "step": 8998 + }, + { + "epoch": 0.8164579931046997, + "grad_norm": 0.15388475696465886, + "learning_rate": 8.577630917238106e-05, + "loss": 1.3347, + "step": 8999 + }, + { + "epoch": 0.8165487207403375, + "grad_norm": 0.11713906167364353, + "learning_rate": 8.569403811990722e-05, + "loss": 1.3646, + "step": 9000 + }, + { + "epoch": 0.8166394483759754, + "grad_norm": 0.12014816340696993, + "learning_rate": 8.56118028428144e-05, + "loss": 1.362, + "step": 9001 + }, + { + "epoch": 0.8167301760116131, 
+ "grad_norm": 0.11797890265767147, + "learning_rate": 8.552960334820347e-05, + "loss": 1.3804, + "step": 9002 + }, + { + "epoch": 0.8168209036472509, + "grad_norm": 0.1385329851720377, + "learning_rate": 8.544743964317225e-05, + "loss": 1.3378, + "step": 9003 + }, + { + "epoch": 0.8169116312828888, + "grad_norm": 0.12140246352351776, + "learning_rate": 8.536531173481588e-05, + "loss": 1.3314, + "step": 9004 + }, + { + "epoch": 0.8170023589185266, + "grad_norm": 0.12178249405324544, + "learning_rate": 8.528321963022589e-05, + "loss": 1.3363, + "step": 9005 + }, + { + "epoch": 0.8170930865541643, + "grad_norm": 0.13551701411418993, + "learning_rate": 8.520116333649091e-05, + "loss": 1.36, + "step": 9006 + }, + { + "epoch": 0.8171838141898022, + "grad_norm": 0.12153665957488831, + "learning_rate": 8.51191428606966e-05, + "loss": 1.4129, + "step": 9007 + }, + { + "epoch": 0.81727454182544, + "grad_norm": 0.12249472055522012, + "learning_rate": 8.503715820992558e-05, + "loss": 1.3567, + "step": 9008 + }, + { + "epoch": 0.8173652694610778, + "grad_norm": 0.1373412672960753, + "learning_rate": 8.495520939125712e-05, + "loss": 1.3583, + "step": 9009 + }, + { + "epoch": 0.8174559970967157, + "grad_norm": 0.12072399172305133, + "learning_rate": 8.487329641176739e-05, + "loss": 1.3463, + "step": 9010 + }, + { + "epoch": 0.8175467247323535, + "grad_norm": 0.12891251164608591, + "learning_rate": 8.479141927852979e-05, + "loss": 1.3249, + "step": 9011 + }, + { + "epoch": 0.8176374523679912, + "grad_norm": 0.1387263584677907, + "learning_rate": 8.470957799861433e-05, + "loss": 1.3439, + "step": 9012 + }, + { + "epoch": 0.8177281800036291, + "grad_norm": 0.12331247844445667, + "learning_rate": 8.462777257908793e-05, + "loss": 1.304, + "step": 9013 + }, + { + "epoch": 0.8178189076392669, + "grad_norm": 0.11929639587052202, + "learning_rate": 8.454600302701465e-05, + "loss": 1.367, + "step": 9014 + }, + { + "epoch": 0.8179096352749048, + "grad_norm": 0.12381893209868694, + "learning_rate": 8.446426934945517e-05, + "loss": 1.3863, + "step": 9015 + }, + { + "epoch": 0.8180003629105426, + "grad_norm": 0.20798905379399313, + "learning_rate": 8.43825715534674e-05, + "loss": 1.3622, + "step": 9016 + }, + { + "epoch": 0.8180910905461803, + "grad_norm": 0.12334051034457995, + "learning_rate": 8.43009096461057e-05, + "loss": 1.3802, + "step": 9017 + }, + { + "epoch": 0.8181818181818182, + "grad_norm": 0.1255244022049514, + "learning_rate": 8.421928363442183e-05, + "loss": 1.3472, + "step": 9018 + }, + { + "epoch": 0.818272545817456, + "grad_norm": 0.14437016483947024, + "learning_rate": 8.413769352546408e-05, + "loss": 1.3935, + "step": 9019 + }, + { + "epoch": 0.8183632734530938, + "grad_norm": 0.13560438014322412, + "learning_rate": 8.40561393262777e-05, + "loss": 1.3597, + "step": 9020 + }, + { + "epoch": 0.8184540010887317, + "grad_norm": 0.12500876146784673, + "learning_rate": 8.397462104390507e-05, + "loss": 1.3253, + "step": 9021 + }, + { + "epoch": 0.8185447287243695, + "grad_norm": 0.13556518396866984, + "learning_rate": 8.389313868538517e-05, + "loss": 1.3335, + "step": 9022 + }, + { + "epoch": 0.8186354563600072, + "grad_norm": 0.15718636287182902, + "learning_rate": 8.381169225775415e-05, + "loss": 1.3696, + "step": 9023 + }, + { + "epoch": 0.8187261839956451, + "grad_norm": 0.14018330143844196, + "learning_rate": 8.373028176804475e-05, + "loss": 1.3397, + "step": 9024 + }, + { + "epoch": 0.8188169116312829, + "grad_norm": 0.12299710752645109, + "learning_rate": 8.3648907223287e-05, + "loss": 1.3578, + 
"step": 9025 + }, + { + "epoch": 0.8189076392669207, + "grad_norm": 0.146351548249403, + "learning_rate": 8.356756863050741e-05, + "loss": 1.3567, + "step": 9026 + }, + { + "epoch": 0.8189983669025586, + "grad_norm": 0.12521756394056963, + "learning_rate": 8.34862659967296e-05, + "loss": 1.354, + "step": 9027 + }, + { + "epoch": 0.8190890945381963, + "grad_norm": 0.11931630024155497, + "learning_rate": 8.340499932897416e-05, + "loss": 1.3625, + "step": 9028 + }, + { + "epoch": 0.8191798221738341, + "grad_norm": 0.11984187262095522, + "learning_rate": 8.33237686342585e-05, + "loss": 1.3254, + "step": 9029 + }, + { + "epoch": 0.819270549809472, + "grad_norm": 0.12810128583436312, + "learning_rate": 8.324257391959666e-05, + "loss": 1.3905, + "step": 9030 + }, + { + "epoch": 0.8193612774451098, + "grad_norm": 0.1219189875098515, + "learning_rate": 8.316141519199999e-05, + "loss": 1.3572, + "step": 9031 + }, + { + "epoch": 0.8194520050807476, + "grad_norm": 0.11966062978380279, + "learning_rate": 8.30802924584767e-05, + "loss": 1.3708, + "step": 9032 + }, + { + "epoch": 0.8195427327163854, + "grad_norm": 0.13213198502481727, + "learning_rate": 8.299920572603158e-05, + "loss": 1.3435, + "step": 9033 + }, + { + "epoch": 0.8196334603520232, + "grad_norm": 0.1283510658866632, + "learning_rate": 8.291815500166639e-05, + "loss": 1.3163, + "step": 9034 + }, + { + "epoch": 0.819724187987661, + "grad_norm": 0.12303938857623786, + "learning_rate": 8.28371402923801e-05, + "loss": 1.3667, + "step": 9035 + }, + { + "epoch": 0.8198149156232989, + "grad_norm": 0.1213622206441821, + "learning_rate": 8.275616160516824e-05, + "loss": 1.352, + "step": 9036 + }, + { + "epoch": 0.8199056432589367, + "grad_norm": 0.12788406326925236, + "learning_rate": 8.267521894702318e-05, + "loss": 1.3467, + "step": 9037 + }, + { + "epoch": 0.8199963708945744, + "grad_norm": 0.12437218123080215, + "learning_rate": 8.259431232493453e-05, + "loss": 1.3698, + "step": 9038 + }, + { + "epoch": 0.8200870985302123, + "grad_norm": 0.1224860116922976, + "learning_rate": 8.25134417458886e-05, + "loss": 1.364, + "step": 9039 + }, + { + "epoch": 0.8201778261658501, + "grad_norm": 0.11689919317938302, + "learning_rate": 8.243260721686852e-05, + "loss": 1.3507, + "step": 9040 + }, + { + "epoch": 0.8202685538014879, + "grad_norm": 0.15015378704162077, + "learning_rate": 8.23518087448542e-05, + "loss": 1.3488, + "step": 9041 + }, + { + "epoch": 0.8203592814371258, + "grad_norm": 0.12479692847622767, + "learning_rate": 8.227104633682291e-05, + "loss": 1.3213, + "step": 9042 + }, + { + "epoch": 0.8204500090727636, + "grad_norm": 0.11915023253323097, + "learning_rate": 8.21903199997483e-05, + "loss": 1.3588, + "step": 9043 + }, + { + "epoch": 0.8205407367084013, + "grad_norm": 0.4168656749230325, + "learning_rate": 8.210962974060104e-05, + "loss": 1.3211, + "step": 9044 + }, + { + "epoch": 0.8206314643440392, + "grad_norm": 0.13223721477586942, + "learning_rate": 8.202897556634897e-05, + "loss": 1.3252, + "step": 9045 + }, + { + "epoch": 0.820722191979677, + "grad_norm": 0.11532431217748458, + "learning_rate": 8.194835748395635e-05, + "loss": 1.3058, + "step": 9046 + }, + { + "epoch": 0.8208129196153148, + "grad_norm": 0.11811057596044217, + "learning_rate": 8.186777550038477e-05, + "loss": 1.3292, + "step": 9047 + }, + { + "epoch": 0.8209036472509527, + "grad_norm": 0.11628059038744551, + "learning_rate": 8.178722962259227e-05, + "loss": 1.352, + "step": 9048 + }, + { + "epoch": 0.8209943748865904, + "grad_norm": 0.11968142977914233, + 
"learning_rate": 8.17067198575343e-05, + "loss": 1.3713, + "step": 9049 + }, + { + "epoch": 0.8210851025222282, + "grad_norm": 0.12476321010168058, + "learning_rate": 8.162624621216269e-05, + "loss": 1.3342, + "step": 9050 + }, + { + "epoch": 0.8211758301578661, + "grad_norm": 0.12009684883934857, + "learning_rate": 8.154580869342626e-05, + "loss": 1.3614, + "step": 9051 + }, + { + "epoch": 0.8212665577935039, + "grad_norm": 0.138430631040767, + "learning_rate": 8.146540730827106e-05, + "loss": 1.4028, + "step": 9052 + }, + { + "epoch": 0.8213572854291418, + "grad_norm": 0.17860558653329156, + "learning_rate": 8.138504206363961e-05, + "loss": 1.3552, + "step": 9053 + }, + { + "epoch": 0.8214480130647795, + "grad_norm": 0.1246091975378527, + "learning_rate": 8.130471296647124e-05, + "loss": 1.3361, + "step": 9054 + }, + { + "epoch": 0.8215387407004173, + "grad_norm": 0.12585500086631352, + "learning_rate": 8.122442002370284e-05, + "loss": 1.3458, + "step": 9055 + }, + { + "epoch": 0.8216294683360552, + "grad_norm": 0.13327780729115948, + "learning_rate": 8.114416324226748e-05, + "loss": 1.3756, + "step": 9056 + }, + { + "epoch": 0.821720195971693, + "grad_norm": 0.12330120170530166, + "learning_rate": 8.106394262909539e-05, + "loss": 1.3594, + "step": 9057 + }, + { + "epoch": 0.8218109236073308, + "grad_norm": 0.13631796599965712, + "learning_rate": 8.098375819111348e-05, + "loss": 1.3658, + "step": 9058 + }, + { + "epoch": 0.8219016512429687, + "grad_norm": 0.12467851001380084, + "learning_rate": 8.090360993524593e-05, + "loss": 1.3692, + "step": 9059 + }, + { + "epoch": 0.8219923788786064, + "grad_norm": 0.1349207190947535, + "learning_rate": 8.082349786841342e-05, + "loss": 1.3803, + "step": 9060 + }, + { + "epoch": 0.8220831065142442, + "grad_norm": 0.12158512298321858, + "learning_rate": 8.074342199753338e-05, + "loss": 1.3462, + "step": 9061 + }, + { + "epoch": 0.8221738341498821, + "grad_norm": 0.12836960278078796, + "learning_rate": 8.066338232952092e-05, + "loss": 1.3242, + "step": 9062 + }, + { + "epoch": 0.8222645617855199, + "grad_norm": 0.12222532707090927, + "learning_rate": 8.058337887128719e-05, + "loss": 1.3827, + "step": 9063 + }, + { + "epoch": 0.8223552894211577, + "grad_norm": 0.12401816169456595, + "learning_rate": 8.05034116297404e-05, + "loss": 1.3452, + "step": 9064 + }, + { + "epoch": 0.8224460170567955, + "grad_norm": 0.14334987050216325, + "learning_rate": 8.042348061178595e-05, + "loss": 1.3745, + "step": 9065 + }, + { + "epoch": 0.8225367446924333, + "grad_norm": 0.1199814075646709, + "learning_rate": 8.034358582432578e-05, + "loss": 1.3651, + "step": 9066 + }, + { + "epoch": 0.8226274723280711, + "grad_norm": 0.1273748123960627, + "learning_rate": 8.026372727425884e-05, + "loss": 1.3372, + "step": 9067 + }, + { + "epoch": 0.822718199963709, + "grad_norm": 0.12241105025844067, + "learning_rate": 8.018390496848077e-05, + "loss": 1.351, + "step": 9068 + }, + { + "epoch": 0.8228089275993468, + "grad_norm": 0.12902939748569145, + "learning_rate": 8.010411891388441e-05, + "loss": 1.3544, + "step": 9069 + }, + { + "epoch": 0.8228996552349845, + "grad_norm": 0.1626400327415295, + "learning_rate": 8.002436911735939e-05, + "loss": 1.3188, + "step": 9070 + }, + { + "epoch": 0.8229903828706224, + "grad_norm": 0.12586429814987435, + "learning_rate": 7.994465558579184e-05, + "loss": 1.3121, + "step": 9071 + }, + { + "epoch": 0.8230811105062602, + "grad_norm": 0.11794141332836988, + "learning_rate": 7.986497832606532e-05, + "loss": 1.4095, + "step": 9072 + }, + { + "epoch": 
0.823171838141898, + "grad_norm": 0.1383087728369772, + "learning_rate": 7.978533734505988e-05, + "loss": 1.344, + "step": 9073 + }, + { + "epoch": 0.8232625657775359, + "grad_norm": 0.11956989184792327, + "learning_rate": 7.970573264965237e-05, + "loss": 1.3602, + "step": 9074 + }, + { + "epoch": 0.8233532934131736, + "grad_norm": 0.1310832965354881, + "learning_rate": 7.962616424671687e-05, + "loss": 1.399, + "step": 9075 + }, + { + "epoch": 0.8234440210488114, + "grad_norm": 0.11862757628291047, + "learning_rate": 7.954663214312407e-05, + "loss": 1.3641, + "step": 9076 + }, + { + "epoch": 0.8235347486844493, + "grad_norm": 0.1322059365364479, + "learning_rate": 7.946713634574144e-05, + "loss": 1.3386, + "step": 9077 + }, + { + "epoch": 0.8236254763200871, + "grad_norm": 0.11915505416993805, + "learning_rate": 7.938767686143355e-05, + "loss": 1.3815, + "step": 9078 + }, + { + "epoch": 0.8237162039557249, + "grad_norm": 0.13457371165555096, + "learning_rate": 7.930825369706185e-05, + "loss": 1.3898, + "step": 9079 + }, + { + "epoch": 0.8238069315913628, + "grad_norm": 0.16808947257141957, + "learning_rate": 7.92288668594845e-05, + "loss": 1.3563, + "step": 9080 + }, + { + "epoch": 0.8238976592270005, + "grad_norm": 0.13495191849045093, + "learning_rate": 7.914951635555635e-05, + "loss": 1.3658, + "step": 9081 + }, + { + "epoch": 0.8239883868626383, + "grad_norm": 0.12569503035903196, + "learning_rate": 7.907020219212963e-05, + "loss": 1.3408, + "step": 9082 + }, + { + "epoch": 0.8240791144982762, + "grad_norm": 0.11164210968943847, + "learning_rate": 7.899092437605298e-05, + "loss": 1.3129, + "step": 9083 + }, + { + "epoch": 0.824169842133914, + "grad_norm": 0.1339080313866877, + "learning_rate": 7.891168291417194e-05, + "loss": 1.3668, + "step": 9084 + }, + { + "epoch": 0.8242605697695518, + "grad_norm": 0.13272248697711433, + "learning_rate": 7.883247781332914e-05, + "loss": 1.3752, + "step": 9085 + }, + { + "epoch": 0.8243512974051896, + "grad_norm": 0.2552587000524499, + "learning_rate": 7.87533090803641e-05, + "loss": 1.3422, + "step": 9086 + }, + { + "epoch": 0.8244420250408274, + "grad_norm": 0.12488341479666219, + "learning_rate": 7.867417672211291e-05, + "loss": 1.3874, + "step": 9087 + }, + { + "epoch": 0.8245327526764652, + "grad_norm": 0.1264047577333365, + "learning_rate": 7.85950807454085e-05, + "loss": 1.3634, + "step": 9088 + }, + { + "epoch": 0.8246234803121031, + "grad_norm": 0.12626497147028368, + "learning_rate": 7.851602115708112e-05, + "loss": 1.3737, + "step": 9089 + }, + { + "epoch": 0.8247142079477409, + "grad_norm": 0.12694013130412582, + "learning_rate": 7.843699796395742e-05, + "loss": 1.3772, + "step": 9090 + }, + { + "epoch": 0.8248049355833788, + "grad_norm": 0.1219448766017687, + "learning_rate": 7.835801117286096e-05, + "loss": 1.3672, + "step": 9091 + }, + { + "epoch": 0.8248956632190165, + "grad_norm": 0.11734513419211137, + "learning_rate": 7.827906079061248e-05, + "loss": 1.3674, + "step": 9092 + }, + { + "epoch": 0.8249863908546543, + "grad_norm": 0.13565948504904435, + "learning_rate": 7.820014682402915e-05, + "loss": 1.3633, + "step": 9093 + }, + { + "epoch": 0.8250771184902922, + "grad_norm": 0.12663682054831416, + "learning_rate": 7.812126927992547e-05, + "loss": 1.315, + "step": 9094 + }, + { + "epoch": 0.82516784612593, + "grad_norm": 0.13061910090209122, + "learning_rate": 7.804242816511221e-05, + "loss": 1.3505, + "step": 9095 + }, + { + "epoch": 0.8252585737615677, + "grad_norm": 0.12633909425501974, + "learning_rate": 7.796362348639757e-05, 
+ "loss": 1.3157, + "step": 9096 + }, + { + "epoch": 0.8253493013972056, + "grad_norm": 0.12071361191759727, + "learning_rate": 7.788485525058631e-05, + "loss": 1.2966, + "step": 9097 + }, + { + "epoch": 0.8254400290328434, + "grad_norm": 0.11366257909642423, + "learning_rate": 7.780612346447979e-05, + "loss": 1.3729, + "step": 9098 + }, + { + "epoch": 0.8255307566684812, + "grad_norm": 0.15461686587173726, + "learning_rate": 7.772742813487693e-05, + "loss": 1.3823, + "step": 9099 + }, + { + "epoch": 0.8256214843041191, + "grad_norm": 0.1260669568870561, + "learning_rate": 7.764876926857278e-05, + "loss": 1.3255, + "step": 9100 + }, + { + "epoch": 0.8257122119397569, + "grad_norm": 0.13592753902314622, + "learning_rate": 7.757014687235975e-05, + "loss": 1.3248, + "step": 9101 + }, + { + "epoch": 0.8258029395753946, + "grad_norm": 0.12132008726631498, + "learning_rate": 7.749156095302662e-05, + "loss": 1.3652, + "step": 9102 + }, + { + "epoch": 0.8258936672110325, + "grad_norm": 0.12770909163580343, + "learning_rate": 7.741301151735968e-05, + "loss": 1.353, + "step": 9103 + }, + { + "epoch": 0.8259843948466703, + "grad_norm": 0.1188581158176519, + "learning_rate": 7.733449857214142e-05, + "loss": 1.3489, + "step": 9104 + }, + { + "epoch": 0.8260751224823081, + "grad_norm": 0.13003910923306058, + "learning_rate": 7.72560221241514e-05, + "loss": 1.3533, + "step": 9105 + }, + { + "epoch": 0.826165850117946, + "grad_norm": 0.11462150030591582, + "learning_rate": 7.71775821801663e-05, + "loss": 1.3314, + "step": 9106 + }, + { + "epoch": 0.8262565777535837, + "grad_norm": 0.12641879080941193, + "learning_rate": 7.709917874695932e-05, + "loss": 1.37, + "step": 9107 + }, + { + "epoch": 0.8263473053892215, + "grad_norm": 0.11982569587568533, + "learning_rate": 7.702081183130044e-05, + "loss": 1.3208, + "step": 9108 + }, + { + "epoch": 0.8264380330248594, + "grad_norm": 0.1279823685890968, + "learning_rate": 7.694248143995674e-05, + "loss": 1.3654, + "step": 9109 + }, + { + "epoch": 0.8265287606604972, + "grad_norm": 0.12104871055387893, + "learning_rate": 7.686418757969233e-05, + "loss": 1.3534, + "step": 9110 + }, + { + "epoch": 0.826619488296135, + "grad_norm": 0.13190789967292146, + "learning_rate": 7.678593025726765e-05, + "loss": 1.3511, + "step": 9111 + }, + { + "epoch": 0.8267102159317729, + "grad_norm": 0.13298459716673453, + "learning_rate": 7.670770947944017e-05, + "loss": 1.3798, + "step": 9112 + }, + { + "epoch": 0.8268009435674106, + "grad_norm": 0.1305002895298204, + "learning_rate": 7.662952525296446e-05, + "loss": 1.3456, + "step": 9113 + }, + { + "epoch": 0.8268916712030484, + "grad_norm": 0.12085220537519648, + "learning_rate": 7.655137758459162e-05, + "loss": 1.3696, + "step": 9114 + }, + { + "epoch": 0.8269823988386863, + "grad_norm": 0.2701070561338019, + "learning_rate": 7.647326648106967e-05, + "loss": 1.3744, + "step": 9115 + }, + { + "epoch": 0.8270731264743241, + "grad_norm": 0.11881194998988827, + "learning_rate": 7.639519194914357e-05, + "loss": 1.3648, + "step": 9116 + }, + { + "epoch": 0.8271638541099618, + "grad_norm": 0.1327604756298926, + "learning_rate": 7.631715399555517e-05, + "loss": 1.4103, + "step": 9117 + }, + { + "epoch": 0.8272545817455997, + "grad_norm": 0.12221570197948142, + "learning_rate": 7.623915262704295e-05, + "loss": 1.3724, + "step": 9118 + }, + { + "epoch": 0.8273453093812375, + "grad_norm": 0.11800117068775538, + "learning_rate": 7.616118785034226e-05, + "loss": 1.3522, + "step": 9119 + }, + { + "epoch": 0.8274360370168753, + "grad_norm": 
0.12633325384255265, + "learning_rate": 7.60832596721856e-05, + "loss": 1.3631, + "step": 9120 + }, + { + "epoch": 0.8275267646525132, + "grad_norm": 0.11885448463073214, + "learning_rate": 7.600536809930198e-05, + "loss": 1.3213, + "step": 9121 + }, + { + "epoch": 0.827617492288151, + "grad_norm": 0.12699984326773323, + "learning_rate": 7.592751313841711e-05, + "loss": 1.3683, + "step": 9122 + }, + { + "epoch": 0.8277082199237887, + "grad_norm": 0.12195381308903594, + "learning_rate": 7.584969479625414e-05, + "loss": 1.3278, + "step": 9123 + }, + { + "epoch": 0.8277989475594266, + "grad_norm": 0.12476169063791807, + "learning_rate": 7.577191307953246e-05, + "loss": 1.3261, + "step": 9124 + }, + { + "epoch": 0.8278896751950644, + "grad_norm": 0.12712907502289889, + "learning_rate": 7.56941679949687e-05, + "loss": 1.3719, + "step": 9125 + }, + { + "epoch": 0.8279804028307022, + "grad_norm": 0.12809040748676212, + "learning_rate": 7.561645954927593e-05, + "loss": 1.3875, + "step": 9126 + }, + { + "epoch": 0.8280711304663401, + "grad_norm": 0.12382797439793336, + "learning_rate": 7.553878774916456e-05, + "loss": 1.3354, + "step": 9127 + }, + { + "epoch": 0.8281618581019778, + "grad_norm": 0.12017488874260698, + "learning_rate": 7.54611526013414e-05, + "loss": 1.3834, + "step": 9128 + }, + { + "epoch": 0.8282525857376157, + "grad_norm": 0.12061664084722275, + "learning_rate": 7.538355411251018e-05, + "loss": 1.3426, + "step": 9129 + }, + { + "epoch": 0.8283433133732535, + "grad_norm": 0.14664617857439366, + "learning_rate": 7.530599228937179e-05, + "loss": 1.3682, + "step": 9130 + }, + { + "epoch": 0.8284340410088913, + "grad_norm": 0.11410181576722143, + "learning_rate": 7.522846713862342e-05, + "loss": 1.3575, + "step": 9131 + }, + { + "epoch": 0.8285247686445292, + "grad_norm": 0.11549699771891153, + "learning_rate": 7.515097866695952e-05, + "loss": 1.3387, + "step": 9132 + }, + { + "epoch": 0.828615496280167, + "grad_norm": 0.12479265107560658, + "learning_rate": 7.507352688107138e-05, + "loss": 1.3328, + "step": 9133 + }, + { + "epoch": 0.8287062239158047, + "grad_norm": 0.1248024806179662, + "learning_rate": 7.499611178764676e-05, + "loss": 1.368, + "step": 9134 + }, + { + "epoch": 0.8287969515514426, + "grad_norm": 0.1342334582141785, + "learning_rate": 7.491873339337057e-05, + "loss": 1.3753, + "step": 9135 + }, + { + "epoch": 0.8288876791870804, + "grad_norm": 0.13568559229961438, + "learning_rate": 7.484139170492427e-05, + "loss": 1.3351, + "step": 9136 + }, + { + "epoch": 0.8289784068227182, + "grad_norm": 0.13010766521883596, + "learning_rate": 7.47640867289866e-05, + "loss": 1.3204, + "step": 9137 + }, + { + "epoch": 0.8290691344583561, + "grad_norm": 0.1155769123618389, + "learning_rate": 7.468681847223269e-05, + "loss": 1.3678, + "step": 9138 + }, + { + "epoch": 0.8291598620939938, + "grad_norm": 0.11349684018227334, + "learning_rate": 7.460958694133446e-05, + "loss": 1.3299, + "step": 9139 + }, + { + "epoch": 0.8292505897296316, + "grad_norm": 0.16467183998183169, + "learning_rate": 7.453239214296132e-05, + "loss": 1.3921, + "step": 9140 + }, + { + "epoch": 0.8293413173652695, + "grad_norm": 0.12079149840169795, + "learning_rate": 7.44552340837788e-05, + "loss": 1.3493, + "step": 9141 + }, + { + "epoch": 0.8294320450009073, + "grad_norm": 0.12058311310613762, + "learning_rate": 7.437811277044948e-05, + "loss": 1.3127, + "step": 9142 + }, + { + "epoch": 0.8295227726365451, + "grad_norm": 0.11921657570733596, + "learning_rate": 7.430102820963292e-05, + "loss": 1.35, + "step": 
9143 + }, + { + "epoch": 0.829613500272183, + "grad_norm": 0.11846775571539812, + "learning_rate": 7.422398040798533e-05, + "loss": 1.3339, + "step": 9144 + }, + { + "epoch": 0.8297042279078207, + "grad_norm": 0.1196926491805721, + "learning_rate": 7.414696937215982e-05, + "loss": 1.3589, + "step": 9145 + }, + { + "epoch": 0.8297949555434585, + "grad_norm": 0.12333063355755475, + "learning_rate": 7.406999510880608e-05, + "loss": 1.3759, + "step": 9146 + }, + { + "epoch": 0.8298856831790964, + "grad_norm": 0.13376767136662884, + "learning_rate": 7.39930576245711e-05, + "loss": 1.3494, + "step": 9147 + }, + { + "epoch": 0.8299764108147342, + "grad_norm": 0.11637962439681045, + "learning_rate": 7.391615692609849e-05, + "loss": 1.3223, + "step": 9148 + }, + { + "epoch": 0.8300671384503719, + "grad_norm": 0.12158695662592664, + "learning_rate": 7.38392930200284e-05, + "loss": 1.3448, + "step": 9149 + }, + { + "epoch": 0.8301578660860098, + "grad_norm": 0.13722040060487992, + "learning_rate": 7.376246591299829e-05, + "loss": 1.3569, + "step": 9150 + }, + { + "epoch": 0.8302485937216476, + "grad_norm": 0.12124706754742855, + "learning_rate": 7.368567561164203e-05, + "loss": 1.3513, + "step": 9151 + }, + { + "epoch": 0.8303393213572854, + "grad_norm": 0.12337068789559462, + "learning_rate": 7.360892212259041e-05, + "loss": 1.3555, + "step": 9152 + }, + { + "epoch": 0.8304300489929233, + "grad_norm": 0.11847334517752629, + "learning_rate": 7.353220545247136e-05, + "loss": 1.3695, + "step": 9153 + }, + { + "epoch": 0.830520776628561, + "grad_norm": 0.12085225411491193, + "learning_rate": 7.345552560790919e-05, + "loss": 1.3431, + "step": 9154 + }, + { + "epoch": 0.8306115042641988, + "grad_norm": 0.11626861805754818, + "learning_rate": 7.337888259552516e-05, + "loss": 1.3778, + "step": 9155 + }, + { + "epoch": 0.8307022318998367, + "grad_norm": 0.1277395229769628, + "learning_rate": 7.330227642193749e-05, + "loss": 1.357, + "step": 9156 + }, + { + "epoch": 0.8307929595354745, + "grad_norm": 0.1398930362714478, + "learning_rate": 7.322570709376125e-05, + "loss": 1.3665, + "step": 9157 + }, + { + "epoch": 0.8308836871711123, + "grad_norm": 0.16384380637525034, + "learning_rate": 7.314917461760812e-05, + "loss": 1.3513, + "step": 9158 + }, + { + "epoch": 0.8309744148067502, + "grad_norm": 0.11859131014358885, + "learning_rate": 7.307267900008651e-05, + "loss": 1.3213, + "step": 9159 + }, + { + "epoch": 0.8310651424423879, + "grad_norm": 0.11743331222018111, + "learning_rate": 7.299622024780216e-05, + "loss": 1.3841, + "step": 9160 + }, + { + "epoch": 0.8311558700780257, + "grad_norm": 0.12243575497800528, + "learning_rate": 7.291979836735713e-05, + "loss": 1.3651, + "step": 9161 + }, + { + "epoch": 0.8312465977136636, + "grad_norm": 0.1206631360008365, + "learning_rate": 7.284341336535027e-05, + "loss": 1.335, + "step": 9162 + }, + { + "epoch": 0.8313373253493014, + "grad_norm": 0.11880096106207168, + "learning_rate": 7.276706524837767e-05, + "loss": 1.3437, + "step": 9163 + }, + { + "epoch": 0.8314280529849392, + "grad_norm": 0.11485024135702497, + "learning_rate": 7.26907540230321e-05, + "loss": 1.3927, + "step": 9164 + }, + { + "epoch": 0.831518780620577, + "grad_norm": 0.1590566855984486, + "learning_rate": 7.261447969590285e-05, + "loss": 1.3484, + "step": 9165 + }, + { + "epoch": 0.8316095082562148, + "grad_norm": 0.12788607380019765, + "learning_rate": 7.253824227357614e-05, + "loss": 1.3385, + "step": 9166 + }, + { + "epoch": 0.8317002358918527, + "grad_norm": 0.11946697719027416, + 
"learning_rate": 7.246204176263538e-05, + "loss": 1.3574, + "step": 9167 + }, + { + "epoch": 0.8317909635274905, + "grad_norm": 0.18269219002665055, + "learning_rate": 7.23858781696603e-05, + "loss": 1.3719, + "step": 9168 + }, + { + "epoch": 0.8318816911631283, + "grad_norm": 0.11537515208435146, + "learning_rate": 7.230975150122754e-05, + "loss": 1.3512, + "step": 9169 + }, + { + "epoch": 0.8319724187987662, + "grad_norm": 0.11658955698652648, + "learning_rate": 7.223366176391077e-05, + "loss": 1.3631, + "step": 9170 + }, + { + "epoch": 0.8320631464344039, + "grad_norm": 0.11450457058837683, + "learning_rate": 7.215760896428047e-05, + "loss": 1.3477, + "step": 9171 + }, + { + "epoch": 0.8321538740700417, + "grad_norm": 0.11332518911750143, + "learning_rate": 7.208159310890372e-05, + "loss": 1.3389, + "step": 9172 + }, + { + "epoch": 0.8322446017056796, + "grad_norm": 0.11558646543853499, + "learning_rate": 7.200561420434437e-05, + "loss": 1.3319, + "step": 9173 + }, + { + "epoch": 0.8323353293413174, + "grad_norm": 0.12245557981137034, + "learning_rate": 7.19296722571634e-05, + "loss": 1.3626, + "step": 9174 + }, + { + "epoch": 0.8324260569769552, + "grad_norm": 0.24698887730960462, + "learning_rate": 7.185376727391829e-05, + "loss": 1.3276, + "step": 9175 + }, + { + "epoch": 0.832516784612593, + "grad_norm": 0.12000062998923165, + "learning_rate": 7.17778992611634e-05, + "loss": 1.3315, + "step": 9176 + }, + { + "epoch": 0.8326075122482308, + "grad_norm": 0.12612854086032493, + "learning_rate": 7.170206822545016e-05, + "loss": 1.3605, + "step": 9177 + }, + { + "epoch": 0.8326982398838686, + "grad_norm": 0.11520760332251107, + "learning_rate": 7.162627417332629e-05, + "loss": 1.3315, + "step": 9178 + }, + { + "epoch": 0.8327889675195065, + "grad_norm": 0.11937517134433731, + "learning_rate": 7.155051711133692e-05, + "loss": 1.3408, + "step": 9179 + }, + { + "epoch": 0.8328796951551443, + "grad_norm": 0.12009046007061917, + "learning_rate": 7.147479704602345e-05, + "loss": 1.3445, + "step": 9180 + }, + { + "epoch": 0.832970422790782, + "grad_norm": 0.12070751017428252, + "learning_rate": 7.139911398392446e-05, + "loss": 1.348, + "step": 9181 + }, + { + "epoch": 0.8330611504264199, + "grad_norm": 0.13205229551539602, + "learning_rate": 7.132346793157524e-05, + "loss": 1.3365, + "step": 9182 + }, + { + "epoch": 0.8331518780620577, + "grad_norm": 0.12326792848656092, + "learning_rate": 7.124785889550755e-05, + "loss": 1.3412, + "step": 9183 + }, + { + "epoch": 0.8332426056976955, + "grad_norm": 0.12769403598219545, + "learning_rate": 7.117228688225058e-05, + "loss": 1.3721, + "step": 9184 + }, + { + "epoch": 0.8333333333333334, + "grad_norm": 0.11565551058744704, + "learning_rate": 7.109675189832986e-05, + "loss": 1.3699, + "step": 9185 + }, + { + "epoch": 0.8334240609689711, + "grad_norm": 0.12301022458835448, + "learning_rate": 7.102125395026771e-05, + "loss": 1.3621, + "step": 9186 + }, + { + "epoch": 0.8335147886046089, + "grad_norm": 0.11908245296136097, + "learning_rate": 7.094579304458349e-05, + "loss": 1.3829, + "step": 9187 + }, + { + "epoch": 0.8336055162402468, + "grad_norm": 0.11614456493299942, + "learning_rate": 7.087036918779338e-05, + "loss": 1.3411, + "step": 9188 + }, + { + "epoch": 0.8336962438758846, + "grad_norm": 0.12020180371318187, + "learning_rate": 7.079498238641014e-05, + "loss": 1.3333, + "step": 9189 + }, + { + "epoch": 0.8337869715115224, + "grad_norm": 0.12240748582853249, + "learning_rate": 7.071963264694331e-05, + "loss": 1.3335, + "step": 9190 + }, + { + 
"epoch": 0.8338776991471603, + "grad_norm": 0.3732945070787441, + "learning_rate": 7.06443199758996e-05, + "loss": 1.3442, + "step": 9191 + }, + { + "epoch": 0.833968426782798, + "grad_norm": 0.1297349184511812, + "learning_rate": 7.056904437978218e-05, + "loss": 1.3582, + "step": 9192 + }, + { + "epoch": 0.8340591544184358, + "grad_norm": 0.1165069699742866, + "learning_rate": 7.04938058650909e-05, + "loss": 1.3523, + "step": 9193 + }, + { + "epoch": 0.8341498820540737, + "grad_norm": 0.11603742548116479, + "learning_rate": 7.041860443832276e-05, + "loss": 1.3208, + "step": 9194 + }, + { + "epoch": 0.8342406096897115, + "grad_norm": 0.11757618848023299, + "learning_rate": 7.03434401059716e-05, + "loss": 1.3268, + "step": 9195 + }, + { + "epoch": 0.8343313373253493, + "grad_norm": 0.12841871753036496, + "learning_rate": 7.026831287452773e-05, + "loss": 1.3283, + "step": 9196 + }, + { + "epoch": 0.8344220649609871, + "grad_norm": 0.11365673556541336, + "learning_rate": 7.019322275047823e-05, + "loss": 1.3631, + "step": 9197 + }, + { + "epoch": 0.8345127925966249, + "grad_norm": 0.13575059759158203, + "learning_rate": 7.011816974030743e-05, + "loss": 1.3701, + "step": 9198 + }, + { + "epoch": 0.8346035202322627, + "grad_norm": 0.12031396082012284, + "learning_rate": 7.004315385049603e-05, + "loss": 1.3667, + "step": 9199 + }, + { + "epoch": 0.8346942478679006, + "grad_norm": 0.1217817433490109, + "learning_rate": 6.996817508752151e-05, + "loss": 1.3839, + "step": 9200 + }, + { + "epoch": 0.8347849755035384, + "grad_norm": 0.1286954001596256, + "learning_rate": 6.989323345785864e-05, + "loss": 1.3411, + "step": 9201 + }, + { + "epoch": 0.8348757031391761, + "grad_norm": 0.12345795812365926, + "learning_rate": 6.98183289679783e-05, + "loss": 1.3174, + "step": 9202 + }, + { + "epoch": 0.834966430774814, + "grad_norm": 0.24212425520154876, + "learning_rate": 6.974346162434874e-05, + "loss": 1.3209, + "step": 9203 + }, + { + "epoch": 0.8350571584104518, + "grad_norm": 0.1627575988947576, + "learning_rate": 6.96686314334346e-05, + "loss": 1.3524, + "step": 9204 + }, + { + "epoch": 0.8351478860460897, + "grad_norm": 0.13069060453089432, + "learning_rate": 6.959383840169765e-05, + "loss": 1.3259, + "step": 9205 + }, + { + "epoch": 0.8352386136817275, + "grad_norm": 0.11715101316123773, + "learning_rate": 6.951908253559625e-05, + "loss": 1.3479, + "step": 9206 + }, + { + "epoch": 0.8353293413173652, + "grad_norm": 0.12986725873512417, + "learning_rate": 6.944436384158531e-05, + "loss": 1.3376, + "step": 9207 + }, + { + "epoch": 0.8354200689530031, + "grad_norm": 0.12382930285798631, + "learning_rate": 6.936968232611718e-05, + "loss": 1.3837, + "step": 9208 + }, + { + "epoch": 0.8355107965886409, + "grad_norm": 0.12286204570968927, + "learning_rate": 6.929503799564036e-05, + "loss": 1.3609, + "step": 9209 + }, + { + "epoch": 0.8356015242242787, + "grad_norm": 0.11541289506785048, + "learning_rate": 6.922043085660046e-05, + "loss": 1.3547, + "step": 9210 + }, + { + "epoch": 0.8356922518599166, + "grad_norm": 0.12335411857747071, + "learning_rate": 6.914586091543995e-05, + "loss": 1.306, + "step": 9211 + }, + { + "epoch": 0.8357829794955544, + "grad_norm": 0.11944471344884033, + "learning_rate": 6.907132817859785e-05, + "loss": 1.3368, + "step": 9212 + }, + { + "epoch": 0.8358737071311921, + "grad_norm": 0.15232391063590545, + "learning_rate": 6.899683265251011e-05, + "loss": 1.2975, + "step": 9213 + }, + { + "epoch": 0.83596443476683, + "grad_norm": 0.12399478250477435, + "learning_rate": 
6.892237434360927e-05, + "loss": 1.3613, + "step": 9214 + }, + { + "epoch": 0.8360551624024678, + "grad_norm": 0.1227616414280765, + "learning_rate": 6.884795325832499e-05, + "loss": 1.3591, + "step": 9215 + }, + { + "epoch": 0.8361458900381056, + "grad_norm": 0.13095678580864248, + "learning_rate": 6.877356940308355e-05, + "loss": 1.3347, + "step": 9216 + }, + { + "epoch": 0.8362366176737435, + "grad_norm": 0.1203081937569434, + "learning_rate": 6.869922278430774e-05, + "loss": 1.34, + "step": 9217 + }, + { + "epoch": 0.8363273453093812, + "grad_norm": 0.11962100193994275, + "learning_rate": 6.862491340841775e-05, + "loss": 1.3611, + "step": 9218 + }, + { + "epoch": 0.836418072945019, + "grad_norm": 0.12032329822387001, + "learning_rate": 6.855064128183014e-05, + "loss": 1.3597, + "step": 9219 + }, + { + "epoch": 0.8365088005806569, + "grad_norm": 0.11369390729135827, + "learning_rate": 6.847640641095815e-05, + "loss": 1.3371, + "step": 9220 + }, + { + "epoch": 0.8365995282162947, + "grad_norm": 0.11493431257848165, + "learning_rate": 6.840220880221209e-05, + "loss": 1.3732, + "step": 9221 + }, + { + "epoch": 0.8366902558519325, + "grad_norm": 0.12606822219906444, + "learning_rate": 6.832804846199897e-05, + "loss": 1.3374, + "step": 9222 + }, + { + "epoch": 0.8367809834875704, + "grad_norm": 0.12854120110762537, + "learning_rate": 6.825392539672237e-05, + "loss": 1.3311, + "step": 9223 + }, + { + "epoch": 0.8368717111232081, + "grad_norm": 0.12677423416116132, + "learning_rate": 6.817983961278307e-05, + "loss": 1.3718, + "step": 9224 + }, + { + "epoch": 0.8369624387588459, + "grad_norm": 0.11727883294250217, + "learning_rate": 6.810579111657816e-05, + "loss": 1.3434, + "step": 9225 + }, + { + "epoch": 0.8370531663944838, + "grad_norm": 0.11923646685413461, + "learning_rate": 6.803177991450193e-05, + "loss": 1.3841, + "step": 9226 + }, + { + "epoch": 0.8371438940301216, + "grad_norm": 0.12321752986362906, + "learning_rate": 6.795780601294504e-05, + "loss": 1.3831, + "step": 9227 + }, + { + "epoch": 0.8372346216657593, + "grad_norm": 0.12294938782352349, + "learning_rate": 6.788386941829544e-05, + "loss": 1.3324, + "step": 9228 + }, + { + "epoch": 0.8373253493013972, + "grad_norm": 0.11829828670886905, + "learning_rate": 6.780997013693735e-05, + "loss": 1.3545, + "step": 9229 + }, + { + "epoch": 0.837416076937035, + "grad_norm": 0.11621992743965462, + "learning_rate": 6.773610817525195e-05, + "loss": 1.3814, + "step": 9230 + }, + { + "epoch": 0.8375068045726728, + "grad_norm": 0.1271910018703827, + "learning_rate": 6.76622835396174e-05, + "loss": 1.3603, + "step": 9231 + }, + { + "epoch": 0.8375975322083107, + "grad_norm": 0.13234406469297819, + "learning_rate": 6.758849623640844e-05, + "loss": 1.3735, + "step": 9232 + }, + { + "epoch": 0.8376882598439485, + "grad_norm": 0.12709528041367057, + "learning_rate": 6.751474627199638e-05, + "loss": 1.3373, + "step": 9233 + }, + { + "epoch": 0.8377789874795862, + "grad_norm": 0.12267042376667814, + "learning_rate": 6.744103365274978e-05, + "loss": 1.3917, + "step": 9234 + }, + { + "epoch": 0.8378697151152241, + "grad_norm": 0.1339649377270351, + "learning_rate": 6.736735838503377e-05, + "loss": 1.3489, + "step": 9235 + }, + { + "epoch": 0.8379604427508619, + "grad_norm": 0.12175438563759913, + "learning_rate": 6.729372047521015e-05, + "loss": 1.3201, + "step": 9236 + }, + { + "epoch": 0.8380511703864997, + "grad_norm": 0.12355259349118357, + "learning_rate": 6.722011992963738e-05, + "loss": 1.3498, + "step": 9237 + }, + { + "epoch": 
0.8381418980221376, + "grad_norm": 0.12655148739991895, + "learning_rate": 6.714655675467119e-05, + "loss": 1.3376, + "step": 9238 + }, + { + "epoch": 0.8382326256577753, + "grad_norm": 0.11145928245920277, + "learning_rate": 6.707303095666361e-05, + "loss": 1.3476, + "step": 9239 + }, + { + "epoch": 0.8383233532934131, + "grad_norm": 0.1220575201708626, + "learning_rate": 6.69995425419635e-05, + "loss": 1.3953, + "step": 9240 + }, + { + "epoch": 0.838414080929051, + "grad_norm": 0.6460251414978683, + "learning_rate": 6.69260915169167e-05, + "loss": 1.3571, + "step": 9241 + }, + { + "epoch": 0.8385048085646888, + "grad_norm": 0.11656236105124154, + "learning_rate": 6.685267788786587e-05, + "loss": 1.3605, + "step": 9242 + }, + { + "epoch": 0.8385955362003267, + "grad_norm": 0.11382576308056842, + "learning_rate": 6.677930166115015e-05, + "loss": 1.3732, + "step": 9243 + }, + { + "epoch": 0.8386862638359645, + "grad_norm": 0.11323530909272962, + "learning_rate": 6.670596284310542e-05, + "loss": 1.3361, + "step": 9244 + }, + { + "epoch": 0.8387769914716022, + "grad_norm": 0.15988282562813075, + "learning_rate": 6.66326614400648e-05, + "loss": 1.3822, + "step": 9245 + }, + { + "epoch": 0.8388677191072401, + "grad_norm": 0.12321748729616605, + "learning_rate": 6.65593974583577e-05, + "loss": 1.3369, + "step": 9246 + }, + { + "epoch": 0.8389584467428779, + "grad_norm": 0.11501224770403946, + "learning_rate": 6.648617090431048e-05, + "loss": 1.3706, + "step": 9247 + }, + { + "epoch": 0.8390491743785157, + "grad_norm": 0.1195728385556324, + "learning_rate": 6.64129817842462e-05, + "loss": 1.3642, + "step": 9248 + }, + { + "epoch": 0.8391399020141536, + "grad_norm": 0.1279107253864198, + "learning_rate": 6.633983010448502e-05, + "loss": 1.3816, + "step": 9249 + }, + { + "epoch": 0.8392306296497913, + "grad_norm": 0.11245466076949563, + "learning_rate": 6.626671587134342e-05, + "loss": 1.3424, + "step": 9250 + }, + { + "epoch": 0.8393213572854291, + "grad_norm": 0.12645432617948296, + "learning_rate": 6.619363909113469e-05, + "loss": 1.3437, + "step": 9251 + }, + { + "epoch": 0.839412084921067, + "grad_norm": 0.11414405123081367, + "learning_rate": 6.612059977016932e-05, + "loss": 1.3649, + "step": 9252 + }, + { + "epoch": 0.8395028125567048, + "grad_norm": 0.11286613119418536, + "learning_rate": 6.604759791475407e-05, + "loss": 1.3249, + "step": 9253 + }, + { + "epoch": 0.8395935401923426, + "grad_norm": 0.11869272124608518, + "learning_rate": 6.59746335311926e-05, + "loss": 1.3578, + "step": 9254 + }, + { + "epoch": 0.8396842678279804, + "grad_norm": 0.11658909777476074, + "learning_rate": 6.59017066257856e-05, + "loss": 1.3718, + "step": 9255 + }, + { + "epoch": 0.8397749954636182, + "grad_norm": 0.11704824864255113, + "learning_rate": 6.582881720483009e-05, + "loss": 1.3314, + "step": 9256 + }, + { + "epoch": 0.839865723099256, + "grad_norm": 0.11844077578618703, + "learning_rate": 6.57559652746203e-05, + "loss": 1.3079, + "step": 9257 + }, + { + "epoch": 0.8399564507348939, + "grad_norm": 0.12108948387449833, + "learning_rate": 6.568315084144682e-05, + "loss": 1.3515, + "step": 9258 + }, + { + "epoch": 0.8400471783705317, + "grad_norm": 0.1179153770662797, + "learning_rate": 6.561037391159741e-05, + "loss": 1.3678, + "step": 9259 + }, + { + "epoch": 0.8401379060061694, + "grad_norm": 0.11781652607789345, + "learning_rate": 6.553763449135624e-05, + "loss": 1.3452, + "step": 9260 + }, + { + "epoch": 0.8402286336418073, + "grad_norm": 0.12154678982356748, + "learning_rate": 6.54649325870042e-05, 
+ "loss": 1.3157, + "step": 9261 + }, + { + "epoch": 0.8403193612774451, + "grad_norm": 0.11991952453502686, + "learning_rate": 6.539226820481941e-05, + "loss": 1.3421, + "step": 9262 + }, + { + "epoch": 0.8404100889130829, + "grad_norm": 0.11919170531502268, + "learning_rate": 6.531964135107638e-05, + "loss": 1.3887, + "step": 9263 + }, + { + "epoch": 0.8405008165487208, + "grad_norm": 0.12750688625179185, + "learning_rate": 6.524705203204617e-05, + "loss": 1.3699, + "step": 9264 + }, + { + "epoch": 0.8405915441843586, + "grad_norm": 0.11638079590320201, + "learning_rate": 6.517450025399719e-05, + "loss": 1.3288, + "step": 9265 + }, + { + "epoch": 0.8406822718199963, + "grad_norm": 0.11430283126071784, + "learning_rate": 6.510198602319423e-05, + "loss": 1.3974, + "step": 9266 + }, + { + "epoch": 0.8407729994556342, + "grad_norm": 0.14053522890130687, + "learning_rate": 6.50295093458989e-05, + "loss": 1.3881, + "step": 9267 + }, + { + "epoch": 0.840863727091272, + "grad_norm": 0.11310379183361068, + "learning_rate": 6.495707022836945e-05, + "loss": 1.3336, + "step": 9268 + }, + { + "epoch": 0.8409544547269098, + "grad_norm": 0.11459469111606756, + "learning_rate": 6.488466867686121e-05, + "loss": 1.344, + "step": 9269 + }, + { + "epoch": 0.8410451823625477, + "grad_norm": 0.12426836980875522, + "learning_rate": 6.481230469762595e-05, + "loss": 1.3922, + "step": 9270 + }, + { + "epoch": 0.8411359099981854, + "grad_norm": 0.11895494175545687, + "learning_rate": 6.47399782969122e-05, + "loss": 1.3667, + "step": 9271 + }, + { + "epoch": 0.8412266376338232, + "grad_norm": 0.11687745107765737, + "learning_rate": 6.466768948096547e-05, + "loss": 1.399, + "step": 9272 + }, + { + "epoch": 0.8413173652694611, + "grad_norm": 0.13798143494223628, + "learning_rate": 6.459543825602804e-05, + "loss": 1.3514, + "step": 9273 + }, + { + "epoch": 0.8414080929050989, + "grad_norm": 0.13156022737589482, + "learning_rate": 6.452322462833871e-05, + "loss": 1.3657, + "step": 9274 + }, + { + "epoch": 0.8414988205407367, + "grad_norm": 0.12292158420981586, + "learning_rate": 6.445104860413298e-05, + "loss": 1.3999, + "step": 9275 + }, + { + "epoch": 0.8415895481763745, + "grad_norm": 0.12071214782736675, + "learning_rate": 6.437891018964353e-05, + "loss": 1.355, + "step": 9276 + }, + { + "epoch": 0.8416802758120123, + "grad_norm": 0.11861328938857263, + "learning_rate": 6.430680939109934e-05, + "loss": 1.3181, + "step": 9277 + }, + { + "epoch": 0.8417710034476501, + "grad_norm": 0.13811599808166997, + "learning_rate": 6.42347462147263e-05, + "loss": 1.3702, + "step": 9278 + }, + { + "epoch": 0.841861731083288, + "grad_norm": 0.12064391812576496, + "learning_rate": 6.416272066674728e-05, + "loss": 1.3469, + "step": 9279 + }, + { + "epoch": 0.8419524587189258, + "grad_norm": 0.11514055329605861, + "learning_rate": 6.409073275338145e-05, + "loss": 1.3656, + "step": 9280 + }, + { + "epoch": 0.8420431863545637, + "grad_norm": 0.11461661194278577, + "learning_rate": 6.40187824808452e-05, + "loss": 1.3299, + "step": 9281 + }, + { + "epoch": 0.8421339139902014, + "grad_norm": 0.12365232759244218, + "learning_rate": 6.39468698553512e-05, + "loss": 1.3834, + "step": 9282 + }, + { + "epoch": 0.8422246416258392, + "grad_norm": 0.1282171067349153, + "learning_rate": 6.387499488310939e-05, + "loss": 1.3541, + "step": 9283 + }, + { + "epoch": 0.8423153692614771, + "grad_norm": 0.12916379260707225, + "learning_rate": 6.380315757032601e-05, + "loss": 1.3685, + "step": 9284 + }, + { + "epoch": 0.8424060968971149, + "grad_norm": 
0.12499867615788048, + "learning_rate": 6.373135792320417e-05, + "loss": 1.3637, + "step": 9285 + }, + { + "epoch": 0.8424968245327527, + "grad_norm": 0.1324535509005655, + "learning_rate": 6.365959594794401e-05, + "loss": 1.3923, + "step": 9286 + }, + { + "epoch": 0.8425875521683905, + "grad_norm": 0.114250434297831, + "learning_rate": 6.358787165074193e-05, + "loss": 1.3563, + "step": 9287 + }, + { + "epoch": 0.8426782798040283, + "grad_norm": 0.1707079454820773, + "learning_rate": 6.351618503779144e-05, + "loss": 1.307, + "step": 9288 + }, + { + "epoch": 0.8427690074396661, + "grad_norm": 0.1242593451773709, + "learning_rate": 6.344453611528283e-05, + "loss": 1.3496, + "step": 9289 + }, + { + "epoch": 0.842859735075304, + "grad_norm": 0.11926601055146864, + "learning_rate": 6.337292488940289e-05, + "loss": 1.3572, + "step": 9290 + }, + { + "epoch": 0.8429504627109418, + "grad_norm": 0.11595440427872776, + "learning_rate": 6.330135136633519e-05, + "loss": 1.3645, + "step": 9291 + }, + { + "epoch": 0.8430411903465795, + "grad_norm": 0.13478278670898577, + "learning_rate": 6.32298155522601e-05, + "loss": 1.3283, + "step": 9292 + }, + { + "epoch": 0.8431319179822174, + "grad_norm": 0.11532806379016866, + "learning_rate": 6.315831745335487e-05, + "loss": 1.3915, + "step": 9293 + }, + { + "epoch": 0.8432226456178552, + "grad_norm": 0.11261088906100145, + "learning_rate": 6.308685707579331e-05, + "loss": 1.3063, + "step": 9294 + }, + { + "epoch": 0.843313373253493, + "grad_norm": 0.15844248666750757, + "learning_rate": 6.301543442574587e-05, + "loss": 1.3545, + "step": 9295 + }, + { + "epoch": 0.8434041008891309, + "grad_norm": 0.12148029578188808, + "learning_rate": 6.294404950938026e-05, + "loss": 1.323, + "step": 9296 + }, + { + "epoch": 0.8434948285247686, + "grad_norm": 0.1276206456452625, + "learning_rate": 6.287270233286036e-05, + "loss": 1.3641, + "step": 9297 + }, + { + "epoch": 0.8435855561604064, + "grad_norm": 0.12066014270026096, + "learning_rate": 6.2801392902347e-05, + "loss": 1.357, + "step": 9298 + }, + { + "epoch": 0.8436762837960443, + "grad_norm": 0.12347622282563703, + "learning_rate": 6.273012122399784e-05, + "loss": 1.3806, + "step": 9299 + }, + { + "epoch": 0.8437670114316821, + "grad_norm": 0.12010402107187657, + "learning_rate": 6.265888730396718e-05, + "loss": 1.3383, + "step": 9300 + }, + { + "epoch": 0.8438577390673199, + "grad_norm": 0.12060030772461285, + "learning_rate": 6.258769114840595e-05, + "loss": 1.3636, + "step": 9301 + }, + { + "epoch": 0.8439484667029578, + "grad_norm": 0.11534530952413144, + "learning_rate": 6.25165327634622e-05, + "loss": 1.3641, + "step": 9302 + }, + { + "epoch": 0.8440391943385955, + "grad_norm": 0.11645609858882859, + "learning_rate": 6.244541215528022e-05, + "loss": 1.3531, + "step": 9303 + }, + { + "epoch": 0.8441299219742333, + "grad_norm": 0.12587389633854776, + "learning_rate": 6.237432933000142e-05, + "loss": 1.3245, + "step": 9304 + }, + { + "epoch": 0.8442206496098712, + "grad_norm": 0.11563288067504088, + "learning_rate": 6.230328429376375e-05, + "loss": 1.3467, + "step": 9305 + }, + { + "epoch": 0.844311377245509, + "grad_norm": 0.13048695256986106, + "learning_rate": 6.223227705270207e-05, + "loss": 1.3463, + "step": 9306 + }, + { + "epoch": 0.8444021048811468, + "grad_norm": 0.1143955657149027, + "learning_rate": 6.21613076129478e-05, + "loss": 1.3487, + "step": 9307 + }, + { + "epoch": 0.8444928325167846, + "grad_norm": 0.11764168951973021, + "learning_rate": 6.2090375980629e-05, + "loss": 1.3351, + "step": 9308 + }, 
+ { + "epoch": 0.8445835601524224, + "grad_norm": 0.11667594096767787, + "learning_rate": 6.201948216187093e-05, + "loss": 1.3672, + "step": 9309 + }, + { + "epoch": 0.8446742877880602, + "grad_norm": 0.11423147462371297, + "learning_rate": 6.194862616279511e-05, + "loss": 1.3656, + "step": 9310 + }, + { + "epoch": 0.8447650154236981, + "grad_norm": 0.1357128061326456, + "learning_rate": 6.187780798951986e-05, + "loss": 1.3195, + "step": 9311 + }, + { + "epoch": 0.8448557430593359, + "grad_norm": 0.11858274604214358, + "learning_rate": 6.180702764816048e-05, + "loss": 1.3423, + "step": 9312 + }, + { + "epoch": 0.8449464706949736, + "grad_norm": 0.11102038921540354, + "learning_rate": 6.173628514482898e-05, + "loss": 1.3576, + "step": 9313 + }, + { + "epoch": 0.8450371983306115, + "grad_norm": 0.12314276893955985, + "learning_rate": 6.16655804856338e-05, + "loss": 1.3312, + "step": 9314 + }, + { + "epoch": 0.8451279259662493, + "grad_norm": 0.11674725780672865, + "learning_rate": 6.159491367668025e-05, + "loss": 1.36, + "step": 9315 + }, + { + "epoch": 0.8452186536018871, + "grad_norm": 0.12011057456361107, + "learning_rate": 6.152428472407068e-05, + "loss": 1.3517, + "step": 9316 + }, + { + "epoch": 0.845309381237525, + "grad_norm": 0.12193432025393514, + "learning_rate": 6.145369363390369e-05, + "loss": 1.3566, + "step": 9317 + }, + { + "epoch": 0.8454001088731627, + "grad_norm": 0.1185331622752038, + "learning_rate": 6.138314041227478e-05, + "loss": 1.3484, + "step": 9318 + }, + { + "epoch": 0.8454908365088006, + "grad_norm": 0.11827252145653741, + "learning_rate": 6.131262506527635e-05, + "loss": 1.3606, + "step": 9319 + }, + { + "epoch": 0.8455815641444384, + "grad_norm": 0.1247929289396767, + "learning_rate": 6.124214759899754e-05, + "loss": 1.346, + "step": 9320 + }, + { + "epoch": 0.8456722917800762, + "grad_norm": 0.10922864362179995, + "learning_rate": 6.117170801952392e-05, + "loss": 1.3408, + "step": 9321 + }, + { + "epoch": 0.8457630194157141, + "grad_norm": 0.12748974764459228, + "learning_rate": 6.110130633293792e-05, + "loss": 1.3921, + "step": 9322 + }, + { + "epoch": 0.8458537470513519, + "grad_norm": 0.11621248997189226, + "learning_rate": 6.103094254531888e-05, + "loss": 1.3553, + "step": 9323 + }, + { + "epoch": 0.8459444746869896, + "grad_norm": 0.11894179267419354, + "learning_rate": 6.0960616662742665e-05, + "loss": 1.3524, + "step": 9324 + }, + { + "epoch": 0.8460352023226275, + "grad_norm": 0.12380345850209643, + "learning_rate": 6.089032869128175e-05, + "loss": 1.3687, + "step": 9325 + }, + { + "epoch": 0.8461259299582653, + "grad_norm": 0.11996915676718989, + "learning_rate": 6.0820078637005724e-05, + "loss": 1.3538, + "step": 9326 + }, + { + "epoch": 0.8462166575939031, + "grad_norm": 0.11876816799595447, + "learning_rate": 6.074986650598074e-05, + "loss": 1.339, + "step": 9327 + }, + { + "epoch": 0.846307385229541, + "grad_norm": 0.2016981483707992, + "learning_rate": 6.067969230426951e-05, + "loss": 1.3631, + "step": 9328 + }, + { + "epoch": 0.8463981128651787, + "grad_norm": 0.11515446609219032, + "learning_rate": 6.06095560379315e-05, + "loss": 1.3315, + "step": 9329 + }, + { + "epoch": 0.8464888405008165, + "grad_norm": 0.11558412771201292, + "learning_rate": 6.053945771302316e-05, + "loss": 1.3387, + "step": 9330 + }, + { + "epoch": 0.8465795681364544, + "grad_norm": 0.12166783094308409, + "learning_rate": 6.0469397335597475e-05, + "loss": 1.3409, + "step": 9331 + }, + { + "epoch": 0.8466702957720922, + "grad_norm": 0.1156277636346744, + "learning_rate": 
6.039937491170394e-05, + "loss": 1.3144, + "step": 9332 + }, + { + "epoch": 0.84676102340773, + "grad_norm": 0.12136399884074263, + "learning_rate": 6.032939044738933e-05, + "loss": 1.3624, + "step": 9333 + }, + { + "epoch": 0.8468517510433679, + "grad_norm": 0.11229722674298148, + "learning_rate": 6.025944394869654e-05, + "loss": 1.3598, + "step": 9334 + }, + { + "epoch": 0.8469424786790056, + "grad_norm": 0.1301707455309548, + "learning_rate": 6.0189535421665665e-05, + "loss": 1.3447, + "step": 9335 + }, + { + "epoch": 0.8470332063146434, + "grad_norm": 0.12742627032450746, + "learning_rate": 6.011966487233311e-05, + "loss": 1.348, + "step": 9336 + }, + { + "epoch": 0.8471239339502813, + "grad_norm": 0.12096208886283313, + "learning_rate": 6.0049832306732446e-05, + "loss": 1.369, + "step": 9337 + }, + { + "epoch": 0.8472146615859191, + "grad_norm": 0.1297348134545049, + "learning_rate": 5.998003773089361e-05, + "loss": 1.3407, + "step": 9338 + }, + { + "epoch": 0.8473053892215568, + "grad_norm": 0.11968615106628136, + "learning_rate": 5.9910281150843296e-05, + "loss": 1.351, + "step": 9339 + }, + { + "epoch": 0.8473961168571947, + "grad_norm": 0.11853619727866214, + "learning_rate": 5.9840562572605115e-05, + "loss": 1.3639, + "step": 9340 + }, + { + "epoch": 0.8474868444928325, + "grad_norm": 0.11699268121896729, + "learning_rate": 5.977088200219927e-05, + "loss": 1.3234, + "step": 9341 + }, + { + "epoch": 0.8475775721284703, + "grad_norm": 0.21523956907277306, + "learning_rate": 5.970123944564249e-05, + "loss": 1.3252, + "step": 9342 + }, + { + "epoch": 0.8476682997641082, + "grad_norm": 0.12741518666254534, + "learning_rate": 5.963163490894863e-05, + "loss": 1.3427, + "step": 9343 + }, + { + "epoch": 0.847759027399746, + "grad_norm": 0.12184503285491828, + "learning_rate": 5.956206839812805e-05, + "loss": 1.3722, + "step": 9344 + }, + { + "epoch": 0.8478497550353837, + "grad_norm": 0.12223343666888456, + "learning_rate": 5.949253991918785e-05, + "loss": 1.3508, + "step": 9345 + }, + { + "epoch": 0.8479404826710216, + "grad_norm": 0.12304805696498869, + "learning_rate": 5.94230494781316e-05, + "loss": 1.3579, + "step": 9346 + }, + { + "epoch": 0.8480312103066594, + "grad_norm": 0.13016408874327584, + "learning_rate": 5.935359708096005e-05, + "loss": 1.3637, + "step": 9347 + }, + { + "epoch": 0.8481219379422972, + "grad_norm": 0.12463368325302547, + "learning_rate": 5.928418273367037e-05, + "loss": 1.3784, + "step": 9348 + }, + { + "epoch": 0.8482126655779351, + "grad_norm": 0.12690610329420313, + "learning_rate": 5.921480644225635e-05, + "loss": 1.3247, + "step": 9349 + }, + { + "epoch": 0.8483033932135728, + "grad_norm": 0.1243652116260325, + "learning_rate": 5.9145468212708774e-05, + "loss": 1.3206, + "step": 9350 + }, + { + "epoch": 0.8483941208492106, + "grad_norm": 0.182598865201554, + "learning_rate": 5.907616805101507e-05, + "loss": 1.361, + "step": 9351 + }, + { + "epoch": 0.8484848484848485, + "grad_norm": 0.12212537019713822, + "learning_rate": 5.900690596315927e-05, + "loss": 1.367, + "step": 9352 + }, + { + "epoch": 0.8485755761204863, + "grad_norm": 0.14327286709431528, + "learning_rate": 5.8937681955122e-05, + "loss": 1.3608, + "step": 9353 + }, + { + "epoch": 0.8486663037561241, + "grad_norm": 0.12541290718550752, + "learning_rate": 5.886849603288102e-05, + "loss": 1.3251, + "step": 9354 + }, + { + "epoch": 0.848757031391762, + "grad_norm": 0.13231406529541614, + "learning_rate": 5.879934820241045e-05, + "loss": 1.3074, + "step": 9355 + }, + { + "epoch": 
0.8488477590273997, + "grad_norm": 0.12313938129653072, + "learning_rate": 5.873023846968106e-05, + "loss": 1.3667, + "step": 9356 + }, + { + "epoch": 0.8489384866630375, + "grad_norm": 0.12755105734116287, + "learning_rate": 5.86611668406607e-05, + "loss": 1.3347, + "step": 9357 + }, + { + "epoch": 0.8490292142986754, + "grad_norm": 0.14424310307446284, + "learning_rate": 5.859213332131358e-05, + "loss": 1.3562, + "step": 9358 + }, + { + "epoch": 0.8491199419343132, + "grad_norm": 0.13774474805002349, + "learning_rate": 5.852313791760083e-05, + "loss": 1.3332, + "step": 9359 + }, + { + "epoch": 0.8492106695699511, + "grad_norm": 0.1172050300200765, + "learning_rate": 5.845418063548014e-05, + "loss": 1.3762, + "step": 9360 + }, + { + "epoch": 0.8493013972055888, + "grad_norm": 0.12166022033399328, + "learning_rate": 5.838526148090612e-05, + "loss": 1.3678, + "step": 9361 + }, + { + "epoch": 0.8493921248412266, + "grad_norm": 0.1285210782429413, + "learning_rate": 5.831638045982984e-05, + "loss": 1.3768, + "step": 9362 + }, + { + "epoch": 0.8494828524768645, + "grad_norm": 0.1290339755754683, + "learning_rate": 5.824753757819917e-05, + "loss": 1.3275, + "step": 9363 + }, + { + "epoch": 0.8495735801125023, + "grad_norm": 0.12769252356957392, + "learning_rate": 5.817873284195879e-05, + "loss": 1.3464, + "step": 9364 + }, + { + "epoch": 0.8496643077481401, + "grad_norm": 0.12454989006869793, + "learning_rate": 5.810996625704984e-05, + "loss": 1.3424, + "step": 9365 + }, + { + "epoch": 0.849755035383778, + "grad_norm": 0.1565434149674003, + "learning_rate": 5.80412378294104e-05, + "loss": 1.3768, + "step": 9366 + }, + { + "epoch": 0.8498457630194157, + "grad_norm": 0.11569881580906204, + "learning_rate": 5.797254756497538e-05, + "loss": 1.3673, + "step": 9367 + }, + { + "epoch": 0.8499364906550535, + "grad_norm": 0.1183713116952722, + "learning_rate": 5.790389546967601e-05, + "loss": 1.3604, + "step": 9368 + }, + { + "epoch": 0.8500272182906914, + "grad_norm": 0.13020959413022642, + "learning_rate": 5.783528154944029e-05, + "loss": 1.3589, + "step": 9369 + }, + { + "epoch": 0.8501179459263292, + "grad_norm": 0.12016947683831826, + "learning_rate": 5.776670581019328e-05, + "loss": 1.3373, + "step": 9370 + }, + { + "epoch": 0.8502086735619669, + "grad_norm": 0.11603075137675779, + "learning_rate": 5.7698168257856455e-05, + "loss": 1.3507, + "step": 9371 + }, + { + "epoch": 0.8502994011976048, + "grad_norm": 0.11428190030457726, + "learning_rate": 5.762966889834792e-05, + "loss": 1.3531, + "step": 9372 + }, + { + "epoch": 0.8503901288332426, + "grad_norm": 0.12155123623864394, + "learning_rate": 5.75612077375825e-05, + "loss": 1.3133, + "step": 9373 + }, + { + "epoch": 0.8504808564688804, + "grad_norm": 0.12549696456745327, + "learning_rate": 5.749278478147224e-05, + "loss": 1.335, + "step": 9374 + }, + { + "epoch": 0.8505715841045183, + "grad_norm": 0.12379224278659061, + "learning_rate": 5.7424400035925193e-05, + "loss": 1.3469, + "step": 9375 + }, + { + "epoch": 0.850662311740156, + "grad_norm": 0.11413891057835251, + "learning_rate": 5.7356053506846304e-05, + "loss": 1.3233, + "step": 9376 + }, + { + "epoch": 0.8507530393757938, + "grad_norm": 0.1201637322318542, + "learning_rate": 5.7287745200137575e-05, + "loss": 1.3725, + "step": 9377 + }, + { + "epoch": 0.8508437670114317, + "grad_norm": 0.11593404024804349, + "learning_rate": 5.7219475121697284e-05, + "loss": 1.3376, + "step": 9378 + }, + { + "epoch": 0.8509344946470695, + "grad_norm": 0.11768966268633561, + "learning_rate": 
5.7151243277420495e-05, + "loss": 1.3305, + "step": 9379 + }, + { + "epoch": 0.8510252222827073, + "grad_norm": 0.11704363181692641, + "learning_rate": 5.708304967319922e-05, + "loss": 1.298, + "step": 9380 + }, + { + "epoch": 0.8511159499183452, + "grad_norm": 0.1216158586317872, + "learning_rate": 5.701489431492174e-05, + "loss": 1.3512, + "step": 9381 + }, + { + "epoch": 0.8512066775539829, + "grad_norm": 0.12245574212818662, + "learning_rate": 5.694677720847358e-05, + "loss": 1.3683, + "step": 9382 + }, + { + "epoch": 0.8512974051896207, + "grad_norm": 0.15345465360933977, + "learning_rate": 5.687869835973636e-05, + "loss": 1.3617, + "step": 9383 + }, + { + "epoch": 0.8513881328252586, + "grad_norm": 0.11957320278109174, + "learning_rate": 5.6810657774588995e-05, + "loss": 1.384, + "step": 9384 + }, + { + "epoch": 0.8514788604608964, + "grad_norm": 0.12590977143033322, + "learning_rate": 5.674265545890661e-05, + "loss": 1.3358, + "step": 9385 + }, + { + "epoch": 0.8515695880965342, + "grad_norm": 0.12039665120793751, + "learning_rate": 5.6674691418561185e-05, + "loss": 1.3669, + "step": 9386 + }, + { + "epoch": 0.851660315732172, + "grad_norm": 0.1131290101424666, + "learning_rate": 5.660676565942158e-05, + "loss": 1.3418, + "step": 9387 + }, + { + "epoch": 0.8517510433678098, + "grad_norm": 0.130181114363082, + "learning_rate": 5.653887818735309e-05, + "loss": 1.3962, + "step": 9388 + }, + { + "epoch": 0.8518417710034476, + "grad_norm": 0.12955801710843737, + "learning_rate": 5.6471029008217766e-05, + "loss": 1.3455, + "step": 9389 + }, + { + "epoch": 0.8519324986390855, + "grad_norm": 0.11593990247661111, + "learning_rate": 5.6403218127874414e-05, + "loss": 1.366, + "step": 9390 + }, + { + "epoch": 0.8520232262747233, + "grad_norm": 0.4792128959049203, + "learning_rate": 5.633544555217873e-05, + "loss": 1.3726, + "step": 9391 + }, + { + "epoch": 0.852113953910361, + "grad_norm": 0.12551516159258205, + "learning_rate": 5.626771128698266e-05, + "loss": 1.3189, + "step": 9392 + }, + { + "epoch": 0.8522046815459989, + "grad_norm": 0.11703502294844902, + "learning_rate": 5.6200015338135016e-05, + "loss": 1.3693, + "step": 9393 + }, + { + "epoch": 0.8522954091816367, + "grad_norm": 0.12821565416193512, + "learning_rate": 5.613235771148156e-05, + "loss": 1.3563, + "step": 9394 + }, + { + "epoch": 0.8523861368172745, + "grad_norm": 0.12437177342451299, + "learning_rate": 5.6064738412864414e-05, + "loss": 1.3114, + "step": 9395 + }, + { + "epoch": 0.8524768644529124, + "grad_norm": 0.1302587111581288, + "learning_rate": 5.599715744812245e-05, + "loss": 1.3751, + "step": 9396 + }, + { + "epoch": 0.8525675920885502, + "grad_norm": 0.14182126932146316, + "learning_rate": 5.592961482309139e-05, + "loss": 1.3665, + "step": 9397 + }, + { + "epoch": 0.852658319724188, + "grad_norm": 0.11873004809770492, + "learning_rate": 5.5862110543603684e-05, + "loss": 1.3057, + "step": 9398 + }, + { + "epoch": 0.8527490473598258, + "grad_norm": 0.11998980824132076, + "learning_rate": 5.579464461548811e-05, + "loss": 1.3716, + "step": 9399 + }, + { + "epoch": 0.8528397749954636, + "grad_norm": 0.11000794637399593, + "learning_rate": 5.5727217044570396e-05, + "loss": 1.3125, + "step": 9400 + }, + { + "epoch": 0.8529305026311015, + "grad_norm": 0.13269544243110334, + "learning_rate": 5.565982783667306e-05, + "loss": 1.3403, + "step": 9401 + }, + { + "epoch": 0.8530212302667393, + "grad_norm": 0.121904713474291, + "learning_rate": 5.559247699761505e-05, + "loss": 1.3571, + "step": 9402 + }, + { + "epoch": 
0.853111957902377, + "grad_norm": 0.12578042857769284, + "learning_rate": 5.55251645332121e-05, + "loss": 1.3685, + "step": 9403 + }, + { + "epoch": 0.8532026855380149, + "grad_norm": 0.11635892621948253, + "learning_rate": 5.545789044927668e-05, + "loss": 1.3106, + "step": 9404 + }, + { + "epoch": 0.8532934131736527, + "grad_norm": 0.11460039719741268, + "learning_rate": 5.539065475161803e-05, + "loss": 1.3352, + "step": 9405 + }, + { + "epoch": 0.8533841408092905, + "grad_norm": 0.12532820042409615, + "learning_rate": 5.532345744604183e-05, + "loss": 1.3935, + "step": 9406 + }, + { + "epoch": 0.8534748684449284, + "grad_norm": 0.12124173412240691, + "learning_rate": 5.525629853835057e-05, + "loss": 1.3665, + "step": 9407 + }, + { + "epoch": 0.8535655960805661, + "grad_norm": 0.17642711914194037, + "learning_rate": 5.518917803434359e-05, + "loss": 1.3525, + "step": 9408 + }, + { + "epoch": 0.8536563237162039, + "grad_norm": 0.12286157767312278, + "learning_rate": 5.512209593981665e-05, + "loss": 1.3568, + "step": 9409 + }, + { + "epoch": 0.8537470513518418, + "grad_norm": 0.11429420536196475, + "learning_rate": 5.505505226056212e-05, + "loss": 1.3413, + "step": 9410 + }, + { + "epoch": 0.8538377789874796, + "grad_norm": 0.12789625987048284, + "learning_rate": 5.498804700236959e-05, + "loss": 1.3802, + "step": 9411 + }, + { + "epoch": 0.8539285066231174, + "grad_norm": 0.16237630383365176, + "learning_rate": 5.492108017102465e-05, + "loss": 1.3517, + "step": 9412 + }, + { + "epoch": 0.8540192342587553, + "grad_norm": 0.12793868375018588, + "learning_rate": 5.4854151772310127e-05, + "loss": 1.3438, + "step": 9413 + }, + { + "epoch": 0.854109961894393, + "grad_norm": 0.13009286912248888, + "learning_rate": 5.478726181200511e-05, + "loss": 1.3433, + "step": 9414 + }, + { + "epoch": 0.8542006895300308, + "grad_norm": 0.11176363739323, + "learning_rate": 5.472041029588576e-05, + "loss": 1.3199, + "step": 9415 + }, + { + "epoch": 0.8542914171656687, + "grad_norm": 0.11664963011322788, + "learning_rate": 5.4653597229724625e-05, + "loss": 1.3902, + "step": 9416 + }, + { + "epoch": 0.8543821448013065, + "grad_norm": 0.11879940005797753, + "learning_rate": 5.458682261929088e-05, + "loss": 1.3288, + "step": 9417 + }, + { + "epoch": 0.8544728724369443, + "grad_norm": 0.1275031719129341, + "learning_rate": 5.4520086470350784e-05, + "loss": 1.3521, + "step": 9418 + }, + { + "epoch": 0.8545636000725821, + "grad_norm": 0.12084858450972089, + "learning_rate": 5.4453388788666844e-05, + "loss": 1.384, + "step": 9419 + }, + { + "epoch": 0.8546543277082199, + "grad_norm": 0.12347711769069553, + "learning_rate": 5.438672957999835e-05, + "loss": 1.4082, + "step": 9420 + }, + { + "epoch": 0.8547450553438577, + "grad_norm": 0.11426355031088982, + "learning_rate": 5.432010885010147e-05, + "loss": 1.3602, + "step": 9421 + }, + { + "epoch": 0.8548357829794956, + "grad_norm": 0.11597448998466114, + "learning_rate": 5.425352660472893e-05, + "loss": 1.3569, + "step": 9422 + }, + { + "epoch": 0.8549265106151334, + "grad_norm": 0.1267660092721527, + "learning_rate": 5.4186982849630085e-05, + "loss": 1.3105, + "step": 9423 + }, + { + "epoch": 0.8550172382507711, + "grad_norm": 0.12077531338147303, + "learning_rate": 5.4120477590550885e-05, + "loss": 1.3451, + "step": 9424 + }, + { + "epoch": 0.855107965886409, + "grad_norm": 0.11702628518118664, + "learning_rate": 5.405401083323425e-05, + "loss": 1.375, + "step": 9425 + }, + { + "epoch": 0.8551986935220468, + "grad_norm": 0.119024946886797, + "learning_rate": 
5.398758258341951e-05, + "loss": 1.3218, + "step": 9426 + }, + { + "epoch": 0.8552894211576846, + "grad_norm": 0.11675634315134335, + "learning_rate": 5.392119284684266e-05, + "loss": 1.346, + "step": 9427 + }, + { + "epoch": 0.8553801487933225, + "grad_norm": 0.11547333956186868, + "learning_rate": 5.385484162923654e-05, + "loss": 1.3265, + "step": 9428 + }, + { + "epoch": 0.8554708764289602, + "grad_norm": 0.12440459937945254, + "learning_rate": 5.378852893633068e-05, + "loss": 1.3382, + "step": 9429 + }, + { + "epoch": 0.855561604064598, + "grad_norm": 0.12090260373454689, + "learning_rate": 5.372225477385112e-05, + "loss": 1.3677, + "step": 9430 + }, + { + "epoch": 0.8556523317002359, + "grad_norm": 0.1198468653449351, + "learning_rate": 5.3656019147520554e-05, + "loss": 1.3399, + "step": 9431 + }, + { + "epoch": 0.8557430593358737, + "grad_norm": 0.2180911385331309, + "learning_rate": 5.358982206305862e-05, + "loss": 1.3321, + "step": 9432 + }, + { + "epoch": 0.8558337869715115, + "grad_norm": 0.13654315364973932, + "learning_rate": 5.352366352618132e-05, + "loss": 1.3276, + "step": 9433 + }, + { + "epoch": 0.8559245146071494, + "grad_norm": 0.11814393683721572, + "learning_rate": 5.3457543542601404e-05, + "loss": 1.3491, + "step": 9434 + }, + { + "epoch": 0.8560152422427871, + "grad_norm": 0.12197118074264306, + "learning_rate": 5.3391462118028533e-05, + "loss": 1.3542, + "step": 9435 + }, + { + "epoch": 0.856105969878425, + "grad_norm": 0.12376015577146406, + "learning_rate": 5.332541925816858e-05, + "loss": 1.3511, + "step": 9436 + }, + { + "epoch": 0.8561966975140628, + "grad_norm": 0.11484115422020955, + "learning_rate": 5.3259414968724665e-05, + "loss": 1.3438, + "step": 9437 + }, + { + "epoch": 0.8562874251497006, + "grad_norm": 0.11664727182994772, + "learning_rate": 5.3193449255395984e-05, + "loss": 1.3642, + "step": 9438 + }, + { + "epoch": 0.8563781527853385, + "grad_norm": 0.11598462148020267, + "learning_rate": 5.312752212387889e-05, + "loss": 1.3221, + "step": 9439 + }, + { + "epoch": 0.8564688804209762, + "grad_norm": 0.12081793986652475, + "learning_rate": 5.3061633579866156e-05, + "loss": 1.3287, + "step": 9440 + }, + { + "epoch": 0.856559608056614, + "grad_norm": 0.15372389756208618, + "learning_rate": 5.299578362904711e-05, + "loss": 1.3538, + "step": 9441 + }, + { + "epoch": 0.8566503356922519, + "grad_norm": 0.12345823232404297, + "learning_rate": 5.292997227710811e-05, + "loss": 1.3427, + "step": 9442 + }, + { + "epoch": 0.8567410633278897, + "grad_norm": 0.12822026378743473, + "learning_rate": 5.2864199529731815e-05, + "loss": 1.3465, + "step": 9443 + }, + { + "epoch": 0.8568317909635275, + "grad_norm": 0.1294460335973004, + "learning_rate": 5.27984653925978e-05, + "loss": 1.3942, + "step": 9444 + }, + { + "epoch": 0.8569225185991654, + "grad_norm": 0.12815461486996296, + "learning_rate": 5.273276987138231e-05, + "loss": 1.3572, + "step": 9445 + }, + { + "epoch": 0.8570132462348031, + "grad_norm": 0.11773776436352107, + "learning_rate": 5.2667112971758077e-05, + "loss": 1.3149, + "step": 9446 + }, + { + "epoch": 0.8571039738704409, + "grad_norm": 0.13104229590623215, + "learning_rate": 5.260149469939446e-05, + "loss": 1.3743, + "step": 9447 + }, + { + "epoch": 0.8571947015060788, + "grad_norm": 0.1200628199169931, + "learning_rate": 5.2535915059957804e-05, + "loss": 1.3681, + "step": 9448 + }, + { + "epoch": 0.8572854291417166, + "grad_norm": 0.11567816406931315, + "learning_rate": 5.247037405911081e-05, + "loss": 1.3571, + "step": 9449 + }, + { + "epoch": 
0.8573761567773543, + "grad_norm": 0.12466346050634294, + "learning_rate": 5.240487170251301e-05, + "loss": 1.3595, + "step": 9450 + }, + { + "epoch": 0.8574668844129922, + "grad_norm": 0.12260804066845217, + "learning_rate": 5.2339407995820324e-05, + "loss": 1.3308, + "step": 9451 + }, + { + "epoch": 0.85755761204863, + "grad_norm": 0.11588874168548188, + "learning_rate": 5.2273982944685884e-05, + "loss": 1.3675, + "step": 9452 + }, + { + "epoch": 0.8576483396842678, + "grad_norm": 0.12977757181933527, + "learning_rate": 5.220859655475907e-05, + "loss": 1.3696, + "step": 9453 + }, + { + "epoch": 0.8577390673199057, + "grad_norm": 0.11675125054348609, + "learning_rate": 5.2143248831685805e-05, + "loss": 1.3777, + "step": 9454 + }, + { + "epoch": 0.8578297949555435, + "grad_norm": 0.11967506214145522, + "learning_rate": 5.207793978110908e-05, + "loss": 1.3418, + "step": 9455 + }, + { + "epoch": 0.8579205225911812, + "grad_norm": 0.12328079910515402, + "learning_rate": 5.201266940866833e-05, + "loss": 1.3458, + "step": 9456 + }, + { + "epoch": 0.8580112502268191, + "grad_norm": 0.11816968951421251, + "learning_rate": 5.1947437719999436e-05, + "loss": 1.3184, + "step": 9457 + }, + { + "epoch": 0.8581019778624569, + "grad_norm": 0.12770456182325157, + "learning_rate": 5.188224472073549e-05, + "loss": 1.3253, + "step": 9458 + }, + { + "epoch": 0.8581927054980947, + "grad_norm": 0.1215326405671706, + "learning_rate": 5.181709041650562e-05, + "loss": 1.3663, + "step": 9459 + }, + { + "epoch": 0.8582834331337326, + "grad_norm": 0.11516213763894789, + "learning_rate": 5.1751974812936196e-05, + "loss": 1.3326, + "step": 9460 + }, + { + "epoch": 0.8583741607693703, + "grad_norm": 0.1255306880610133, + "learning_rate": 5.168689791564968e-05, + "loss": 1.3363, + "step": 9461 + }, + { + "epoch": 0.8584648884050081, + "grad_norm": 0.11735251052423465, + "learning_rate": 5.162185973026568e-05, + "loss": 1.354, + "step": 9462 + }, + { + "epoch": 0.858555616040646, + "grad_norm": 0.12061239408540322, + "learning_rate": 5.1556860262400194e-05, + "loss": 1.3542, + "step": 9463 + }, + { + "epoch": 0.8586463436762838, + "grad_norm": 0.11490372859054519, + "learning_rate": 5.149189951766581e-05, + "loss": 1.3903, + "step": 9464 + }, + { + "epoch": 0.8587370713119216, + "grad_norm": 0.12222729822457548, + "learning_rate": 5.1426977501672134e-05, + "loss": 1.4136, + "step": 9465 + }, + { + "epoch": 0.8588277989475595, + "grad_norm": 0.18943710553224502, + "learning_rate": 5.136209422002508e-05, + "loss": 1.3432, + "step": 9466 + }, + { + "epoch": 0.8589185265831972, + "grad_norm": 0.11113223694792805, + "learning_rate": 5.129724967832716e-05, + "loss": 1.3518, + "step": 9467 + }, + { + "epoch": 0.859009254218835, + "grad_norm": 0.11221225235502581, + "learning_rate": 5.12324438821779e-05, + "loss": 1.3419, + "step": 9468 + }, + { + "epoch": 0.8590999818544729, + "grad_norm": 0.12819280496988347, + "learning_rate": 5.116767683717333e-05, + "loss": 1.3274, + "step": 9469 + }, + { + "epoch": 0.8591907094901107, + "grad_norm": 0.12715770938194904, + "learning_rate": 5.110294854890601e-05, + "loss": 1.3569, + "step": 9470 + }, + { + "epoch": 0.8592814371257484, + "grad_norm": 0.12058212638832679, + "learning_rate": 5.103825902296516e-05, + "loss": 1.3487, + "step": 9471 + }, + { + "epoch": 0.8593721647613863, + "grad_norm": 0.12164725735316248, + "learning_rate": 5.097360826493691e-05, + "loss": 1.3633, + "step": 9472 + }, + { + "epoch": 0.8594628923970241, + "grad_norm": 0.11023749291994371, + "learning_rate": 
5.0908996280403744e-05, + "loss": 1.3411, + "step": 9473 + }, + { + "epoch": 0.859553620032662, + "grad_norm": 0.12222761636219968, + "learning_rate": 5.084442307494486e-05, + "loss": 1.3483, + "step": 9474 + }, + { + "epoch": 0.8596443476682998, + "grad_norm": 0.13156379462649587, + "learning_rate": 5.077988865413624e-05, + "loss": 1.3347, + "step": 9475 + }, + { + "epoch": 0.8597350753039376, + "grad_norm": 0.11939140363724673, + "learning_rate": 5.071539302355055e-05, + "loss": 1.3397, + "step": 9476 + }, + { + "epoch": 0.8598258029395754, + "grad_norm": 0.1334906649490776, + "learning_rate": 5.065093618875688e-05, + "loss": 1.3658, + "step": 9477 + }, + { + "epoch": 0.8599165305752132, + "grad_norm": 0.12711440678025226, + "learning_rate": 5.0586518155321004e-05, + "loss": 1.3509, + "step": 9478 + }, + { + "epoch": 0.860007258210851, + "grad_norm": 0.12247559324236722, + "learning_rate": 5.0522138928805636e-05, + "loss": 1.3332, + "step": 9479 + }, + { + "epoch": 0.8600979858464889, + "grad_norm": 0.11564184058592249, + "learning_rate": 5.045779851476984e-05, + "loss": 1.3475, + "step": 9480 + }, + { + "epoch": 0.8601887134821267, + "grad_norm": 0.183884832835562, + "learning_rate": 5.039349691876921e-05, + "loss": 1.3314, + "step": 9481 + }, + { + "epoch": 0.8602794411177644, + "grad_norm": 0.12006528745058861, + "learning_rate": 5.0329234146356484e-05, + "loss": 1.3634, + "step": 9482 + }, + { + "epoch": 0.8603701687534023, + "grad_norm": 0.12463836966296726, + "learning_rate": 5.026501020308072e-05, + "loss": 1.3448, + "step": 9483 + }, + { + "epoch": 0.8604608963890401, + "grad_norm": 0.12004953177541436, + "learning_rate": 5.020082509448759e-05, + "loss": 1.3369, + "step": 9484 + }, + { + "epoch": 0.8605516240246779, + "grad_norm": 0.12376824322357734, + "learning_rate": 5.013667882611944e-05, + "loss": 1.3592, + "step": 9485 + }, + { + "epoch": 0.8606423516603158, + "grad_norm": 0.12651722779047203, + "learning_rate": 5.007257140351546e-05, + "loss": 1.342, + "step": 9486 + }, + { + "epoch": 0.8607330792959536, + "grad_norm": 0.13960575418033117, + "learning_rate": 5.00085028322112e-05, + "loss": 1.3546, + "step": 9487 + }, + { + "epoch": 0.8608238069315913, + "grad_norm": 0.11927113261073152, + "learning_rate": 4.994447311773903e-05, + "loss": 1.3365, + "step": 9488 + }, + { + "epoch": 0.8609145345672292, + "grad_norm": 0.12927616132875014, + "learning_rate": 4.988048226562797e-05, + "loss": 1.3533, + "step": 9489 + }, + { + "epoch": 0.861005262202867, + "grad_norm": 0.13712632616937068, + "learning_rate": 4.981653028140354e-05, + "loss": 1.3551, + "step": 9490 + }, + { + "epoch": 0.8610959898385048, + "grad_norm": 0.11888994837311555, + "learning_rate": 4.975261717058821e-05, + "loss": 1.3388, + "step": 9491 + }, + { + "epoch": 0.8611867174741427, + "grad_norm": 0.11932544642236775, + "learning_rate": 4.968874293870057e-05, + "loss": 1.3515, + "step": 9492 + }, + { + "epoch": 0.8612774451097804, + "grad_norm": 0.11670626816325948, + "learning_rate": 4.962490759125649e-05, + "loss": 1.3839, + "step": 9493 + }, + { + "epoch": 0.8613681727454182, + "grad_norm": 0.11695960794679021, + "learning_rate": 4.956111113376799e-05, + "loss": 1.3494, + "step": 9494 + }, + { + "epoch": 0.8614589003810561, + "grad_norm": 0.11921147402814625, + "learning_rate": 4.9497353571743895e-05, + "loss": 1.3201, + "step": 9495 + }, + { + "epoch": 0.8615496280166939, + "grad_norm": 0.11636268232746655, + "learning_rate": 4.943363491068975e-05, + "loss": 1.3309, + "step": 9496 + }, + { + "epoch": 
0.8616403556523317, + "grad_norm": 0.12688655718324787, + "learning_rate": 4.936995515610765e-05, + "loss": 1.3272, + "step": 9497 + }, + { + "epoch": 0.8617310832879695, + "grad_norm": 0.11486531213915911, + "learning_rate": 4.930631431349625e-05, + "loss": 1.3338, + "step": 9498 + }, + { + "epoch": 0.8618218109236073, + "grad_norm": 0.13467142033779567, + "learning_rate": 4.924271238835104e-05, + "loss": 1.342, + "step": 9499 + }, + { + "epoch": 0.8619125385592451, + "grad_norm": 0.21962979937140203, + "learning_rate": 4.9179149386164136e-05, + "loss": 1.3525, + "step": 9500 + }, + { + "epoch": 0.862003266194883, + "grad_norm": 0.11510566225265308, + "learning_rate": 4.911562531242414e-05, + "loss": 1.3868, + "step": 9501 + }, + { + "epoch": 0.8620939938305208, + "grad_norm": 0.1208432319314736, + "learning_rate": 4.9052140172616275e-05, + "loss": 1.3685, + "step": 9502 + }, + { + "epoch": 0.8621847214661585, + "grad_norm": 0.13637255735118797, + "learning_rate": 4.898869397222261e-05, + "loss": 1.3572, + "step": 9503 + }, + { + "epoch": 0.8622754491017964, + "grad_norm": 0.11402246798170076, + "learning_rate": 4.89252867167217e-05, + "loss": 1.3316, + "step": 9504 + }, + { + "epoch": 0.8623661767374342, + "grad_norm": 0.12452542153984676, + "learning_rate": 4.8861918411588715e-05, + "loss": 1.3854, + "step": 9505 + }, + { + "epoch": 0.862456904373072, + "grad_norm": 0.11483404538274117, + "learning_rate": 4.879858906229556e-05, + "loss": 1.3655, + "step": 9506 + }, + { + "epoch": 0.8625476320087099, + "grad_norm": 0.12196407438528623, + "learning_rate": 4.8735298674310815e-05, + "loss": 1.3795, + "step": 9507 + }, + { + "epoch": 0.8626383596443477, + "grad_norm": 0.11722951798485817, + "learning_rate": 4.8672047253099536e-05, + "loss": 1.3577, + "step": 9508 + }, + { + "epoch": 0.8627290872799854, + "grad_norm": 0.1225874000680098, + "learning_rate": 4.860883480412337e-05, + "loss": 1.3685, + "step": 9509 + }, + { + "epoch": 0.8628198149156233, + "grad_norm": 0.13001823344529323, + "learning_rate": 4.8545661332841e-05, + "loss": 1.3532, + "step": 9510 + }, + { + "epoch": 0.8629105425512611, + "grad_norm": 0.11540749348223694, + "learning_rate": 4.848252684470727e-05, + "loss": 1.4252, + "step": 9511 + }, + { + "epoch": 0.863001270186899, + "grad_norm": 0.1175380658169902, + "learning_rate": 4.841943134517379e-05, + "loss": 1.358, + "step": 9512 + }, + { + "epoch": 0.8630919978225368, + "grad_norm": 0.17756858320041968, + "learning_rate": 4.835637483968913e-05, + "loss": 1.387, + "step": 9513 + }, + { + "epoch": 0.8631827254581745, + "grad_norm": 0.11838217782966012, + "learning_rate": 4.829335733369794e-05, + "loss": 1.3448, + "step": 9514 + }, + { + "epoch": 0.8632734530938124, + "grad_norm": 0.12209628184791188, + "learning_rate": 4.823037883264198e-05, + "loss": 1.337, + "step": 9515 + }, + { + "epoch": 0.8633641807294502, + "grad_norm": 0.12871755804837876, + "learning_rate": 4.8167439341959326e-05, + "loss": 1.3383, + "step": 9516 + }, + { + "epoch": 0.863454908365088, + "grad_norm": 0.11102020479403624, + "learning_rate": 4.810453886708499e-05, + "loss": 1.3405, + "step": 9517 + }, + { + "epoch": 0.8635456360007259, + "grad_norm": 0.17327315817995495, + "learning_rate": 4.804167741345028e-05, + "loss": 1.3479, + "step": 9518 + }, + { + "epoch": 0.8636363636363636, + "grad_norm": 0.11862649601988687, + "learning_rate": 4.79788549864833e-05, + "loss": 1.3266, + "step": 9519 + }, + { + "epoch": 0.8637270912720014, + "grad_norm": 0.12373349101019453, + "learning_rate": 
4.7916071591608875e-05, + "loss": 1.3881, + "step": 9520 + }, + { + "epoch": 0.8638178189076393, + "grad_norm": 0.11963257014466816, + "learning_rate": 4.785332723424818e-05, + "loss": 1.3489, + "step": 9521 + }, + { + "epoch": 0.8639085465432771, + "grad_norm": 0.12499621545750379, + "learning_rate": 4.779062191981936e-05, + "loss": 1.3051, + "step": 9522 + }, + { + "epoch": 0.8639992741789149, + "grad_norm": 0.12320810452744455, + "learning_rate": 4.772795565373705e-05, + "loss": 1.3574, + "step": 9523 + }, + { + "epoch": 0.8640900018145528, + "grad_norm": 0.11391142027343752, + "learning_rate": 4.766532844141241e-05, + "loss": 1.3428, + "step": 9524 + }, + { + "epoch": 0.8641807294501905, + "grad_norm": 0.12368934038512425, + "learning_rate": 4.760274028825329e-05, + "loss": 1.3391, + "step": 9525 + }, + { + "epoch": 0.8642714570858283, + "grad_norm": 0.11636296697242836, + "learning_rate": 4.75401911996643e-05, + "loss": 1.334, + "step": 9526 + }, + { + "epoch": 0.8643621847214662, + "grad_norm": 0.11486640180814005, + "learning_rate": 4.747768118104645e-05, + "loss": 1.3418, + "step": 9527 + }, + { + "epoch": 0.864452912357104, + "grad_norm": 0.1138665485552664, + "learning_rate": 4.741521023779755e-05, + "loss": 1.3459, + "step": 9528 + }, + { + "epoch": 0.8645436399927418, + "grad_norm": 0.12955540116356987, + "learning_rate": 4.7352778375311766e-05, + "loss": 1.3613, + "step": 9529 + }, + { + "epoch": 0.8646343676283796, + "grad_norm": 0.12688029091484532, + "learning_rate": 4.729038559898047e-05, + "loss": 1.3676, + "step": 9530 + }, + { + "epoch": 0.8647250952640174, + "grad_norm": 0.14213507063801065, + "learning_rate": 4.7228031914191114e-05, + "loss": 1.3803, + "step": 9531 + }, + { + "epoch": 0.8648158228996552, + "grad_norm": 0.11164157349834186, + "learning_rate": 4.716571732632779e-05, + "loss": 1.3685, + "step": 9532 + }, + { + "epoch": 0.8649065505352931, + "grad_norm": 0.12096573325070045, + "learning_rate": 4.7103441840771645e-05, + "loss": 1.3342, + "step": 9533 + }, + { + "epoch": 0.8649972781709309, + "grad_norm": 0.12461824278881457, + "learning_rate": 4.7041205462900083e-05, + "loss": 1.4005, + "step": 9534 + }, + { + "epoch": 0.8650880058065686, + "grad_norm": 0.11196902467628606, + "learning_rate": 4.697900819808704e-05, + "loss": 1.3485, + "step": 9535 + }, + { + "epoch": 0.8651787334422065, + "grad_norm": 0.11670612925091473, + "learning_rate": 4.691685005170354e-05, + "loss": 1.3718, + "step": 9536 + }, + { + "epoch": 0.8652694610778443, + "grad_norm": 0.11657520928042713, + "learning_rate": 4.685473102911669e-05, + "loss": 1.3759, + "step": 9537 + }, + { + "epoch": 0.8653601887134821, + "grad_norm": 0.11967482174479217, + "learning_rate": 4.67926511356907e-05, + "loss": 1.3502, + "step": 9538 + }, + { + "epoch": 0.86545091634912, + "grad_norm": 0.11500976926845385, + "learning_rate": 4.6730610376785995e-05, + "loss": 1.3371, + "step": 9539 + }, + { + "epoch": 0.8655416439847577, + "grad_norm": 0.11790523981902425, + "learning_rate": 4.666860875775997e-05, + "loss": 1.3047, + "step": 9540 + }, + { + "epoch": 0.8656323716203955, + "grad_norm": 0.1201217787177603, + "learning_rate": 4.660664628396638e-05, + "loss": 1.3654, + "step": 9541 + }, + { + "epoch": 0.8657230992560334, + "grad_norm": 0.28671680144206013, + "learning_rate": 4.654472296075568e-05, + "loss": 1.3544, + "step": 9542 + }, + { + "epoch": 0.8658138268916712, + "grad_norm": 0.1393780819643746, + "learning_rate": 4.648283879347503e-05, + "loss": 1.391, + "step": 9543 + }, + { + "epoch": 
0.865904554527309, + "grad_norm": 0.11814130657044801, + "learning_rate": 4.6420993787468055e-05, + "loss": 1.3749, + "step": 9544 + }, + { + "epoch": 0.8659952821629469, + "grad_norm": 0.11426693080721544, + "learning_rate": 4.635918794807509e-05, + "loss": 1.3664, + "step": 9545 + }, + { + "epoch": 0.8660860097985846, + "grad_norm": 0.11681321152394149, + "learning_rate": 4.629742128063308e-05, + "loss": 1.3319, + "step": 9546 + }, + { + "epoch": 0.8661767374342224, + "grad_norm": 0.1143363157636168, + "learning_rate": 4.6235693790475706e-05, + "loss": 1.3506, + "step": 9547 + }, + { + "epoch": 0.8662674650698603, + "grad_norm": 0.12243489702477808, + "learning_rate": 4.6174005482933046e-05, + "loss": 1.3081, + "step": 9548 + }, + { + "epoch": 0.8663581927054981, + "grad_norm": 0.12353448665096271, + "learning_rate": 4.611235636333183e-05, + "loss": 1.3388, + "step": 9549 + }, + { + "epoch": 0.866448920341136, + "grad_norm": 0.12499374918928802, + "learning_rate": 4.605074643699558e-05, + "loss": 1.3335, + "step": 9550 + }, + { + "epoch": 0.8665396479767737, + "grad_norm": 0.13558967815735568, + "learning_rate": 4.5989175709244324e-05, + "loss": 1.4019, + "step": 9551 + }, + { + "epoch": 0.8666303756124115, + "grad_norm": 0.11834728833227166, + "learning_rate": 4.592764418539458e-05, + "loss": 1.3674, + "step": 9552 + }, + { + "epoch": 0.8667211032480494, + "grad_norm": 0.13022736797716727, + "learning_rate": 4.586615187075965e-05, + "loss": 1.3507, + "step": 9553 + }, + { + "epoch": 0.8668118308836872, + "grad_norm": 0.12316517163160412, + "learning_rate": 4.580469877064952e-05, + "loss": 1.3127, + "step": 9554 + }, + { + "epoch": 0.866902558519325, + "grad_norm": 0.12087892568733323, + "learning_rate": 4.5743284890370604e-05, + "loss": 1.3437, + "step": 9555 + }, + { + "epoch": 0.8669932861549628, + "grad_norm": 0.1217020286896708, + "learning_rate": 4.568191023522594e-05, + "loss": 1.378, + "step": 9556 + }, + { + "epoch": 0.8670840137906006, + "grad_norm": 0.12800248150347848, + "learning_rate": 4.562057481051535e-05, + "loss": 1.3512, + "step": 9557 + }, + { + "epoch": 0.8671747414262384, + "grad_norm": 0.11386468584593892, + "learning_rate": 4.555927862153508e-05, + "loss": 1.3593, + "step": 9558 + }, + { + "epoch": 0.8672654690618763, + "grad_norm": 0.11920716093634084, + "learning_rate": 4.549802167357797e-05, + "loss": 1.3814, + "step": 9559 + }, + { + "epoch": 0.8673561966975141, + "grad_norm": 0.11682752820737845, + "learning_rate": 4.543680397193373e-05, + "loss": 1.3286, + "step": 9560 + }, + { + "epoch": 0.8674469243331518, + "grad_norm": 0.1329317090248932, + "learning_rate": 4.537562552188851e-05, + "loss": 1.357, + "step": 9561 + }, + { + "epoch": 0.8675376519687897, + "grad_norm": 0.133833396644374, + "learning_rate": 4.531448632872503e-05, + "loss": 1.3107, + "step": 9562 + }, + { + "epoch": 0.8676283796044275, + "grad_norm": 0.11726224952840647, + "learning_rate": 4.5253386397722583e-05, + "loss": 1.3469, + "step": 9563 + }, + { + "epoch": 0.8677191072400653, + "grad_norm": 0.11456883205530055, + "learning_rate": 4.5192325734157383e-05, + "loss": 1.3171, + "step": 9564 + }, + { + "epoch": 0.8678098348757032, + "grad_norm": 0.12017935852065582, + "learning_rate": 4.5131304343301813e-05, + "loss": 1.3638, + "step": 9565 + }, + { + "epoch": 0.867900562511341, + "grad_norm": 0.12149732631325293, + "learning_rate": 4.507032223042512e-05, + "loss": 1.3093, + "step": 9566 + }, + { + "epoch": 0.8679912901469787, + "grad_norm": 0.12182084552047769, + "learning_rate": 
4.5009379400793194e-05, + "loss": 1.3509, + "step": 9567 + }, + { + "epoch": 0.8680820177826166, + "grad_norm": 0.11851556072606485, + "learning_rate": 4.494847585966838e-05, + "loss": 1.3594, + "step": 9568 + }, + { + "epoch": 0.8681727454182544, + "grad_norm": 0.11929081377229735, + "learning_rate": 4.488761161230981e-05, + "loss": 1.3561, + "step": 9569 + }, + { + "epoch": 0.8682634730538922, + "grad_norm": 0.12392209377114649, + "learning_rate": 4.482678666397294e-05, + "loss": 1.3724, + "step": 9570 + }, + { + "epoch": 0.8683542006895301, + "grad_norm": 0.1317298323887641, + "learning_rate": 4.476600101991024e-05, + "loss": 1.308, + "step": 9571 + }, + { + "epoch": 0.8684449283251678, + "grad_norm": 0.13168213027128045, + "learning_rate": 4.4705254685370456e-05, + "loss": 1.3466, + "step": 9572 + }, + { + "epoch": 0.8685356559608056, + "grad_norm": 0.12099710998309313, + "learning_rate": 4.464454766559894e-05, + "loss": 1.3353, + "step": 9573 + }, + { + "epoch": 0.8686263835964435, + "grad_norm": 0.14189813100031762, + "learning_rate": 4.4583879965837905e-05, + "loss": 1.3328, + "step": 9574 + }, + { + "epoch": 0.8687171112320813, + "grad_norm": 0.1467529459716514, + "learning_rate": 4.452325159132597e-05, + "loss": 1.3296, + "step": 9575 + }, + { + "epoch": 0.8688078388677191, + "grad_norm": 0.11614678479671904, + "learning_rate": 4.4462662547298304e-05, + "loss": 1.3163, + "step": 9576 + }, + { + "epoch": 0.868898566503357, + "grad_norm": 0.11837344052487596, + "learning_rate": 4.440211283898687e-05, + "loss": 1.3809, + "step": 9577 + }, + { + "epoch": 0.8689892941389947, + "grad_norm": 0.11397505607799795, + "learning_rate": 4.434160247162022e-05, + "loss": 1.3372, + "step": 9578 + }, + { + "epoch": 0.8690800217746325, + "grad_norm": 0.11953420525481909, + "learning_rate": 4.4281131450423393e-05, + "loss": 1.3552, + "step": 9579 + }, + { + "epoch": 0.8691707494102704, + "grad_norm": 0.11779517494192483, + "learning_rate": 4.422069978061793e-05, + "loss": 1.3597, + "step": 9580 + }, + { + "epoch": 0.8692614770459082, + "grad_norm": 0.12040235825537958, + "learning_rate": 4.416030746742228e-05, + "loss": 1.3747, + "step": 9581 + }, + { + "epoch": 0.869352204681546, + "grad_norm": 0.11838064637286373, + "learning_rate": 4.4099954516051254e-05, + "loss": 1.3848, + "step": 9582 + }, + { + "epoch": 0.8694429323171838, + "grad_norm": 0.1517267969814268, + "learning_rate": 4.4039640931716307e-05, + "loss": 1.3884, + "step": 9583 + }, + { + "epoch": 0.8695336599528216, + "grad_norm": 0.1184142965328012, + "learning_rate": 4.3979366719625546e-05, + "loss": 1.3518, + "step": 9584 + }, + { + "epoch": 0.8696243875884594, + "grad_norm": 0.12225938693297433, + "learning_rate": 4.391913188498375e-05, + "loss": 1.3231, + "step": 9585 + }, + { + "epoch": 0.8697151152240973, + "grad_norm": 0.12121564955065593, + "learning_rate": 4.3858936432992156e-05, + "loss": 1.3404, + "step": 9586 + }, + { + "epoch": 0.8698058428597351, + "grad_norm": 0.11904478030233334, + "learning_rate": 4.379878036884849e-05, + "loss": 1.3038, + "step": 9587 + }, + { + "epoch": 0.869896570495373, + "grad_norm": 0.12111940731156069, + "learning_rate": 4.373866369774754e-05, + "loss": 1.2905, + "step": 9588 + }, + { + "epoch": 0.8699872981310107, + "grad_norm": 0.11503756989261159, + "learning_rate": 4.3678586424880204e-05, + "loss": 1.3216, + "step": 9589 + }, + { + "epoch": 0.8700780257666485, + "grad_norm": 0.13610607182853257, + "learning_rate": 4.361854855543407e-05, + "loss": 1.3825, + "step": 9590 + }, + { + "epoch": 
0.8701687534022864, + "grad_norm": 0.11864877080513966, + "learning_rate": 4.355855009459364e-05, + "loss": 1.3389, + "step": 9591 + }, + { + "epoch": 0.8702594810379242, + "grad_norm": 0.12438294071438746, + "learning_rate": 4.349859104753956e-05, + "loss": 1.374, + "step": 9592 + }, + { + "epoch": 0.8703502086735619, + "grad_norm": 0.11968540203237497, + "learning_rate": 4.343867141944946e-05, + "loss": 1.3321, + "step": 9593 + }, + { + "epoch": 0.8704409363091998, + "grad_norm": 0.12099294110253479, + "learning_rate": 4.337879121549748e-05, + "loss": 1.3471, + "step": 9594 + }, + { + "epoch": 0.8705316639448376, + "grad_norm": 0.1451003757960807, + "learning_rate": 4.3318950440854154e-05, + "loss": 1.3526, + "step": 9595 + }, + { + "epoch": 0.8706223915804754, + "grad_norm": 0.11569049207316733, + "learning_rate": 4.325914910068673e-05, + "loss": 1.322, + "step": 9596 + }, + { + "epoch": 0.8707131192161133, + "grad_norm": 0.11624138574575779, + "learning_rate": 4.319938720015903e-05, + "loss": 1.3771, + "step": 9597 + }, + { + "epoch": 0.870803846851751, + "grad_norm": 0.11356691573920882, + "learning_rate": 4.313966474443171e-05, + "loss": 1.3776, + "step": 9598 + }, + { + "epoch": 0.8708945744873888, + "grad_norm": 0.11742074193969129, + "learning_rate": 4.307998173866151e-05, + "loss": 1.3508, + "step": 9599 + }, + { + "epoch": 0.8709853021230267, + "grad_norm": 0.1264288466602748, + "learning_rate": 4.302033818800227e-05, + "loss": 1.3715, + "step": 9600 + }, + { + "epoch": 0.8710760297586645, + "grad_norm": 0.1255307274516723, + "learning_rate": 4.296073409760426e-05, + "loss": 1.3463, + "step": 9601 + }, + { + "epoch": 0.8711667573943023, + "grad_norm": 0.1404680259660553, + "learning_rate": 4.290116947261424e-05, + "loss": 1.3779, + "step": 9602 + }, + { + "epoch": 0.8712574850299402, + "grad_norm": 0.11026155546025514, + "learning_rate": 4.28416443181755e-05, + "loss": 1.2984, + "step": 9603 + }, + { + "epoch": 0.8713482126655779, + "grad_norm": 0.12859323688512506, + "learning_rate": 4.2782158639428245e-05, + "loss": 1.3357, + "step": 9604 + }, + { + "epoch": 0.8714389403012157, + "grad_norm": 0.12862041594618892, + "learning_rate": 4.2722712441508995e-05, + "loss": 1.3466, + "step": 9605 + }, + { + "epoch": 0.8715296679368536, + "grad_norm": 0.12385236127054672, + "learning_rate": 4.26633057295509e-05, + "loss": 1.3748, + "step": 9606 + }, + { + "epoch": 0.8716203955724914, + "grad_norm": 0.13190377592982347, + "learning_rate": 4.260393850868355e-05, + "loss": 1.3465, + "step": 9607 + }, + { + "epoch": 0.8717111232081292, + "grad_norm": 0.11617079670882484, + "learning_rate": 4.254461078403377e-05, + "loss": 1.3183, + "step": 9608 + }, + { + "epoch": 0.871801850843767, + "grad_norm": 0.1499014404136955, + "learning_rate": 4.2485322560724235e-05, + "loss": 1.3849, + "step": 9609 + }, + { + "epoch": 0.8718925784794048, + "grad_norm": 0.11787833356365202, + "learning_rate": 4.2426073843874425e-05, + "loss": 1.3554, + "step": 9610 + }, + { + "epoch": 0.8719833061150426, + "grad_norm": 0.11963186145640793, + "learning_rate": 4.2366864638600674e-05, + "loss": 1.3288, + "step": 9611 + }, + { + "epoch": 0.8720740337506805, + "grad_norm": 0.11876642910321578, + "learning_rate": 4.230769495001563e-05, + "loss": 1.3645, + "step": 9612 + }, + { + "epoch": 0.8721647613863183, + "grad_norm": 0.1192281783818163, + "learning_rate": 4.224856478322847e-05, + "loss": 1.3611, + "step": 9613 + }, + { + "epoch": 0.872255489021956, + "grad_norm": 0.12320937654496697, + "learning_rate": 
4.218947414334534e-05, + "loss": 1.3952, + "step": 9614 + }, + { + "epoch": 0.8723462166575939, + "grad_norm": 0.12139067760470083, + "learning_rate": 4.2130423035468436e-05, + "loss": 1.335, + "step": 9615 + }, + { + "epoch": 0.8724369442932317, + "grad_norm": 0.12170653035477962, + "learning_rate": 4.207141146469712e-05, + "loss": 1.345, + "step": 9616 + }, + { + "epoch": 0.8725276719288695, + "grad_norm": 0.12595382246438117, + "learning_rate": 4.201243943612681e-05, + "loss": 1.3776, + "step": 9617 + }, + { + "epoch": 0.8726183995645074, + "grad_norm": 0.11822619023337647, + "learning_rate": 4.195350695484995e-05, + "loss": 1.3926, + "step": 9618 + }, + { + "epoch": 0.8727091272001452, + "grad_norm": 0.13817432808136487, + "learning_rate": 4.1894614025955294e-05, + "loss": 1.393, + "step": 9619 + }, + { + "epoch": 0.8727998548357829, + "grad_norm": 0.12312334833147086, + "learning_rate": 4.18357606545281e-05, + "loss": 1.3365, + "step": 9620 + }, + { + "epoch": 0.8728905824714208, + "grad_norm": 0.12061917821958659, + "learning_rate": 4.1776946845650646e-05, + "loss": 1.3502, + "step": 9621 + }, + { + "epoch": 0.8729813101070586, + "grad_norm": 0.12705021887316065, + "learning_rate": 4.171817260440131e-05, + "loss": 1.4237, + "step": 9622 + }, + { + "epoch": 0.8730720377426964, + "grad_norm": 0.1143340733085039, + "learning_rate": 4.165943793585525e-05, + "loss": 1.338, + "step": 9623 + }, + { + "epoch": 0.8731627653783343, + "grad_norm": 0.11837801426306567, + "learning_rate": 4.16007428450843e-05, + "loss": 1.3618, + "step": 9624 + }, + { + "epoch": 0.873253493013972, + "grad_norm": 0.13455332855993132, + "learning_rate": 4.154208733715681e-05, + "loss": 1.3501, + "step": 9625 + }, + { + "epoch": 0.8733442206496099, + "grad_norm": 0.16168218860711334, + "learning_rate": 4.148347141713771e-05, + "loss": 1.3502, + "step": 9626 + }, + { + "epoch": 0.8734349482852477, + "grad_norm": 0.12762324764148786, + "learning_rate": 4.1424895090088286e-05, + "loss": 1.3396, + "step": 9627 + }, + { + "epoch": 0.8735256759208855, + "grad_norm": 0.1316644275571781, + "learning_rate": 4.136635836106684e-05, + "loss": 1.3571, + "step": 9628 + }, + { + "epoch": 0.8736164035565234, + "grad_norm": 0.12460376255777876, + "learning_rate": 4.1307861235127975e-05, + "loss": 1.3576, + "step": 9629 + }, + { + "epoch": 0.8737071311921611, + "grad_norm": 0.11364235855729972, + "learning_rate": 4.124940371732283e-05, + "loss": 1.3675, + "step": 9630 + }, + { + "epoch": 0.8737978588277989, + "grad_norm": 0.1270847018905187, + "learning_rate": 4.119098581269926e-05, + "loss": 1.3672, + "step": 9631 + }, + { + "epoch": 0.8738885864634368, + "grad_norm": 0.13375032417818924, + "learning_rate": 4.1132607526301766e-05, + "loss": 1.3802, + "step": 9632 + }, + { + "epoch": 0.8739793140990746, + "grad_norm": 0.14256048214882477, + "learning_rate": 4.107426886317123e-05, + "loss": 1.3133, + "step": 9633 + }, + { + "epoch": 0.8740700417347124, + "grad_norm": 0.11609154067044837, + "learning_rate": 4.1015969828345154e-05, + "loss": 1.3292, + "step": 9634 + }, + { + "epoch": 0.8741607693703503, + "grad_norm": 0.13737366233987786, + "learning_rate": 4.0957710426857805e-05, + "loss": 1.3772, + "step": 9635 + }, + { + "epoch": 0.874251497005988, + "grad_norm": 0.12255984997609132, + "learning_rate": 4.0899490663739825e-05, + "loss": 1.3466, + "step": 9636 + }, + { + "epoch": 0.8743422246416258, + "grad_norm": 0.11895260342077722, + "learning_rate": 4.084131054401841e-05, + "loss": 1.3246, + "step": 9637 + }, + { + "epoch": 
0.8744329522772637, + "grad_norm": 0.21585752134570135, + "learning_rate": 4.078317007271753e-05, + "loss": 1.3345, + "step": 9638 + }, + { + "epoch": 0.8745236799129015, + "grad_norm": 0.11681602766964971, + "learning_rate": 4.072506925485764e-05, + "loss": 1.3185, + "step": 9639 + }, + { + "epoch": 0.8746144075485393, + "grad_norm": 0.13055586048997989, + "learning_rate": 4.0667008095455704e-05, + "loss": 1.3192, + "step": 9640 + }, + { + "epoch": 0.8747051351841771, + "grad_norm": 0.1203465147898536, + "learning_rate": 4.060898659952522e-05, + "loss": 1.3541, + "step": 9641 + }, + { + "epoch": 0.8747958628198149, + "grad_norm": 0.11682544023424611, + "learning_rate": 4.055100477207657e-05, + "loss": 1.3765, + "step": 9642 + }, + { + "epoch": 0.8748865904554527, + "grad_norm": 0.12854869477661351, + "learning_rate": 4.049306261811636e-05, + "loss": 1.333, + "step": 9643 + }, + { + "epoch": 0.8749773180910906, + "grad_norm": 0.11065028499357131, + "learning_rate": 4.043516014264786e-05, + "loss": 1.3281, + "step": 9644 + }, + { + "epoch": 0.8750680457267284, + "grad_norm": 0.11750171089048671, + "learning_rate": 4.037729735067108e-05, + "loss": 1.3253, + "step": 9645 + }, + { + "epoch": 0.8751587733623661, + "grad_norm": 0.11779099914592095, + "learning_rate": 4.0319474247182355e-05, + "loss": 1.3575, + "step": 9646 + }, + { + "epoch": 0.875249500998004, + "grad_norm": 0.11723737297070815, + "learning_rate": 4.026169083717479e-05, + "loss": 1.3198, + "step": 9647 + }, + { + "epoch": 0.8753402286336418, + "grad_norm": 0.14007916829960407, + "learning_rate": 4.020394712563796e-05, + "loss": 1.3324, + "step": 9648 + }, + { + "epoch": 0.8754309562692796, + "grad_norm": 0.14057142799133238, + "learning_rate": 4.0146243117558154e-05, + "loss": 1.3964, + "step": 9649 + }, + { + "epoch": 0.8755216839049175, + "grad_norm": 0.1279995282021224, + "learning_rate": 4.0088578817918e-05, + "loss": 1.3694, + "step": 9650 + }, + { + "epoch": 0.8756124115405552, + "grad_norm": 0.12297640720623716, + "learning_rate": 4.0030954231696726e-05, + "loss": 1.3627, + "step": 9651 + }, + { + "epoch": 0.875703139176193, + "grad_norm": 0.13582835220837558, + "learning_rate": 3.997336936387053e-05, + "loss": 1.3915, + "step": 9652 + }, + { + "epoch": 0.8757938668118309, + "grad_norm": 0.11681008097079854, + "learning_rate": 3.991582421941159e-05, + "loss": 1.3462, + "step": 9653 + }, + { + "epoch": 0.8758845944474687, + "grad_norm": 0.12494823228696109, + "learning_rate": 3.9858318803288995e-05, + "loss": 1.343, + "step": 9654 + }, + { + "epoch": 0.8759753220831065, + "grad_norm": 0.12499587745004495, + "learning_rate": 3.980085312046844e-05, + "loss": 1.3694, + "step": 9655 + }, + { + "epoch": 0.8760660497187444, + "grad_norm": 0.12457940946645046, + "learning_rate": 3.9743427175912064e-05, + "loss": 1.3777, + "step": 9656 + }, + { + "epoch": 0.8761567773543821, + "grad_norm": 0.12111964543745975, + "learning_rate": 3.968604097457862e-05, + "loss": 1.3604, + "step": 9657 + }, + { + "epoch": 0.8762475049900199, + "grad_norm": 0.12539457234040594, + "learning_rate": 3.9628694521423256e-05, + "loss": 1.3279, + "step": 9658 + }, + { + "epoch": 0.8763382326256578, + "grad_norm": 0.12061963220477419, + "learning_rate": 3.9571387821398074e-05, + "loss": 1.3475, + "step": 9659 + }, + { + "epoch": 0.8764289602612956, + "grad_norm": 0.1188217719073719, + "learning_rate": 3.951412087945144e-05, + "loss": 1.3686, + "step": 9660 + }, + { + "epoch": 0.8765196878969334, + "grad_norm": 0.11687529416531776, + "learning_rate": 
3.945689370052824e-05, + "loss": 1.3624, + "step": 9661 + }, + { + "epoch": 0.8766104155325712, + "grad_norm": 0.11572403840036789, + "learning_rate": 3.9399706289570124e-05, + "loss": 1.374, + "step": 9662 + }, + { + "epoch": 0.876701143168209, + "grad_norm": 0.11817787985340829, + "learning_rate": 3.934255865151537e-05, + "loss": 1.3554, + "step": 9663 + }, + { + "epoch": 0.8767918708038469, + "grad_norm": 0.11553941051786687, + "learning_rate": 3.928545079129853e-05, + "loss": 1.3163, + "step": 9664 + }, + { + "epoch": 0.8768825984394847, + "grad_norm": 0.12657598864977818, + "learning_rate": 3.9228382713850877e-05, + "loss": 1.342, + "step": 9665 + }, + { + "epoch": 0.8769733260751225, + "grad_norm": 0.1337416027996938, + "learning_rate": 3.917135442410036e-05, + "loss": 1.3935, + "step": 9666 + }, + { + "epoch": 0.8770640537107603, + "grad_norm": 0.12386048825555401, + "learning_rate": 3.9114365926971265e-05, + "loss": 1.3685, + "step": 9667 + }, + { + "epoch": 0.8771547813463981, + "grad_norm": 0.11919126008001073, + "learning_rate": 3.905741722738454e-05, + "loss": 1.353, + "step": 9668 + }, + { + "epoch": 0.8772455089820359, + "grad_norm": 0.12103606029488272, + "learning_rate": 3.9000508330257925e-05, + "loss": 1.3369, + "step": 9669 + }, + { + "epoch": 0.8773362366176738, + "grad_norm": 0.12019041947687255, + "learning_rate": 3.8943639240505205e-05, + "loss": 1.3765, + "step": 9670 + }, + { + "epoch": 0.8774269642533116, + "grad_norm": 0.11777819303748, + "learning_rate": 3.888680996303717e-05, + "loss": 1.317, + "step": 9671 + }, + { + "epoch": 0.8775176918889493, + "grad_norm": 0.11564290727265905, + "learning_rate": 3.883002050276119e-05, + "loss": 1.3198, + "step": 9672 + }, + { + "epoch": 0.8776084195245872, + "grad_norm": 0.12144166899732084, + "learning_rate": 3.8773270864580876e-05, + "loss": 1.3706, + "step": 9673 + }, + { + "epoch": 0.877699147160225, + "grad_norm": 0.12177137471216863, + "learning_rate": 3.871656105339666e-05, + "loss": 1.3573, + "step": 9674 + }, + { + "epoch": 0.8777898747958628, + "grad_norm": 0.11956856904589404, + "learning_rate": 3.8659891074105226e-05, + "loss": 1.3268, + "step": 9675 + }, + { + "epoch": 0.8778806024315007, + "grad_norm": 0.1201444184079372, + "learning_rate": 3.8603260931600324e-05, + "loss": 1.3402, + "step": 9676 + }, + { + "epoch": 0.8779713300671385, + "grad_norm": 0.10982961143489524, + "learning_rate": 3.854667063077172e-05, + "loss": 1.3672, + "step": 9677 + }, + { + "epoch": 0.8780620577027762, + "grad_norm": 0.11642783448515785, + "learning_rate": 3.849012017650616e-05, + "loss": 1.3858, + "step": 9678 + }, + { + "epoch": 0.8781527853384141, + "grad_norm": 0.14452749742735685, + "learning_rate": 3.84336095736868e-05, + "loss": 1.2879, + "step": 9679 + }, + { + "epoch": 0.8782435129740519, + "grad_norm": 0.11736326600415546, + "learning_rate": 3.8377138827193294e-05, + "loss": 1.3926, + "step": 9680 + }, + { + "epoch": 0.8783342406096897, + "grad_norm": 0.1620266524654086, + "learning_rate": 3.832070794190179e-05, + "loss": 1.3467, + "step": 9681 + }, + { + "epoch": 0.8784249682453276, + "grad_norm": 0.12048588817161872, + "learning_rate": 3.8264316922685284e-05, + "loss": 1.3687, + "step": 9682 + }, + { + "epoch": 0.8785156958809653, + "grad_norm": 0.21358194621765278, + "learning_rate": 3.820796577441305e-05, + "loss": 1.3579, + "step": 9683 + }, + { + "epoch": 0.8786064235166031, + "grad_norm": 0.12610182356744012, + "learning_rate": 3.815165450195107e-05, + "loss": 1.3408, + "step": 9684 + }, + { + "epoch": 
0.878697151152241, + "grad_norm": 0.11624552977812189, + "learning_rate": 3.809538311016158e-05, + "loss": 1.4024, + "step": 9685 + }, + { + "epoch": 0.8787878787878788, + "grad_norm": 0.11668934313018034, + "learning_rate": 3.8039151603904086e-05, + "loss": 1.3746, + "step": 9686 + }, + { + "epoch": 0.8788786064235166, + "grad_norm": 0.14576900153960057, + "learning_rate": 3.798295998803386e-05, + "loss": 1.3378, + "step": 9687 + }, + { + "epoch": 0.8789693340591545, + "grad_norm": 0.12603910049071704, + "learning_rate": 3.7926808267403076e-05, + "loss": 1.3585, + "step": 9688 + }, + { + "epoch": 0.8790600616947922, + "grad_norm": 0.1248541771730665, + "learning_rate": 3.7870696446860585e-05, + "loss": 1.3582, + "step": 9689 + }, + { + "epoch": 0.87915078933043, + "grad_norm": 0.12167710800932312, + "learning_rate": 3.7814624531251564e-05, + "loss": 1.3846, + "step": 9690 + }, + { + "epoch": 0.8792415169660679, + "grad_norm": 0.13804921455528094, + "learning_rate": 3.775859252541775e-05, + "loss": 1.3401, + "step": 9691 + }, + { + "epoch": 0.8793322446017057, + "grad_norm": 0.11457681397147192, + "learning_rate": 3.7702600434197654e-05, + "loss": 1.3671, + "step": 9692 + }, + { + "epoch": 0.8794229722373434, + "grad_norm": 0.13994378447007685, + "learning_rate": 3.7646648262426085e-05, + "loss": 1.4018, + "step": 9693 + }, + { + "epoch": 0.8795136998729813, + "grad_norm": 0.12507526918595954, + "learning_rate": 3.759073601493468e-05, + "loss": 1.3932, + "step": 9694 + }, + { + "epoch": 0.8796044275086191, + "grad_norm": 0.1299681405848728, + "learning_rate": 3.7534863696551294e-05, + "loss": 1.3347, + "step": 9695 + }, + { + "epoch": 0.8796951551442569, + "grad_norm": 0.11523261189906103, + "learning_rate": 3.7479031312100677e-05, + "loss": 1.3564, + "step": 9696 + }, + { + "epoch": 0.8797858827798948, + "grad_norm": 0.11716771881221111, + "learning_rate": 3.742323886640386e-05, + "loss": 1.3502, + "step": 9697 + }, + { + "epoch": 0.8798766104155326, + "grad_norm": 0.11950227516878419, + "learning_rate": 3.7367486364278444e-05, + "loss": 1.3248, + "step": 9698 + }, + { + "epoch": 0.8799673380511703, + "grad_norm": 0.11749223241849924, + "learning_rate": 3.7311773810538855e-05, + "loss": 1.3657, + "step": 9699 + }, + { + "epoch": 0.8800580656868082, + "grad_norm": 0.13491604595925555, + "learning_rate": 3.7256101209995785e-05, + "loss": 1.3671, + "step": 9700 + }, + { + "epoch": 0.880148793322446, + "grad_norm": 0.11335813529237683, + "learning_rate": 3.720046856745651e-05, + "loss": 1.3261, + "step": 9701 + }, + { + "epoch": 0.8802395209580839, + "grad_norm": 0.11617654650790853, + "learning_rate": 3.7144875887724975e-05, + "loss": 1.3323, + "step": 9702 + }, + { + "epoch": 0.8803302485937217, + "grad_norm": 0.11608676743756963, + "learning_rate": 3.708932317560171e-05, + "loss": 1.3468, + "step": 9703 + }, + { + "epoch": 0.8804209762293594, + "grad_norm": 0.1179987260692585, + "learning_rate": 3.703381043588361e-05, + "loss": 1.3379, + "step": 9704 + }, + { + "epoch": 0.8805117038649973, + "grad_norm": 0.12330695827710057, + "learning_rate": 3.697833767336417e-05, + "loss": 1.3082, + "step": 9705 + }, + { + "epoch": 0.8806024315006351, + "grad_norm": 0.11987650153968486, + "learning_rate": 3.6922904892833567e-05, + "loss": 1.351, + "step": 9706 + }, + { + "epoch": 0.8806931591362729, + "grad_norm": 0.12456876661086856, + "learning_rate": 3.686751209907835e-05, + "loss": 1.3858, + "step": 9707 + }, + { + "epoch": 0.8807838867719108, + "grad_norm": 0.12244863596545415, + "learning_rate": 
3.681215929688164e-05, + "loss": 1.3686, + "step": 9708 + }, + { + "epoch": 0.8808746144075486, + "grad_norm": 0.1270890516843696, + "learning_rate": 3.675684649102329e-05, + "loss": 1.3574, + "step": 9709 + }, + { + "epoch": 0.8809653420431863, + "grad_norm": 0.11956215642658434, + "learning_rate": 3.670157368627958e-05, + "loss": 1.3826, + "step": 9710 + }, + { + "epoch": 0.8810560696788242, + "grad_norm": 0.12248162943408306, + "learning_rate": 3.6646340887423244e-05, + "loss": 1.3294, + "step": 9711 + }, + { + "epoch": 0.881146797314462, + "grad_norm": 0.12152537054902614, + "learning_rate": 3.659114809922365e-05, + "loss": 1.3248, + "step": 9712 + }, + { + "epoch": 0.8812375249500998, + "grad_norm": 0.12172993653342809, + "learning_rate": 3.6535995326446745e-05, + "loss": 1.3497, + "step": 9713 + }, + { + "epoch": 0.8813282525857377, + "grad_norm": 0.14534430543019097, + "learning_rate": 3.648088257385496e-05, + "loss": 1.377, + "step": 9714 + }, + { + "epoch": 0.8814189802213754, + "grad_norm": 0.11899340809785589, + "learning_rate": 3.642580984620719e-05, + "loss": 1.3505, + "step": 9715 + }, + { + "epoch": 0.8815097078570132, + "grad_norm": 0.1185628700379543, + "learning_rate": 3.637077714825904e-05, + "loss": 1.3435, + "step": 9716 + }, + { + "epoch": 0.8816004354926511, + "grad_norm": 0.1228557424796281, + "learning_rate": 3.631578448476275e-05, + "loss": 1.3447, + "step": 9717 + }, + { + "epoch": 0.8816911631282889, + "grad_norm": 0.11969094045608045, + "learning_rate": 3.626083186046675e-05, + "loss": 1.3826, + "step": 9718 + }, + { + "epoch": 0.8817818907639267, + "grad_norm": 0.11986700124732529, + "learning_rate": 3.620591928011624e-05, + "loss": 1.3493, + "step": 9719 + }, + { + "epoch": 0.8818726183995645, + "grad_norm": 0.12129040094325254, + "learning_rate": 3.615104674845299e-05, + "loss": 1.364, + "step": 9720 + }, + { + "epoch": 0.8819633460352023, + "grad_norm": 0.12455001851807684, + "learning_rate": 3.6096214270215253e-05, + "loss": 1.3593, + "step": 9721 + }, + { + "epoch": 0.8820540736708401, + "grad_norm": 0.14062470710982583, + "learning_rate": 3.6041421850137703e-05, + "loss": 1.3361, + "step": 9722 + }, + { + "epoch": 0.882144801306478, + "grad_norm": 0.12786967460195403, + "learning_rate": 3.598666949295182e-05, + "loss": 1.351, + "step": 9723 + }, + { + "epoch": 0.8822355289421158, + "grad_norm": 0.11781483291992297, + "learning_rate": 3.593195720338538e-05, + "loss": 1.3532, + "step": 9724 + }, + { + "epoch": 0.8823262565777535, + "grad_norm": 0.11605086196430524, + "learning_rate": 3.5877284986162885e-05, + "loss": 1.3456, + "step": 9725 + }, + { + "epoch": 0.8824169842133914, + "grad_norm": 0.11871521156604896, + "learning_rate": 3.582265284600511e-05, + "loss": 1.3273, + "step": 9726 + }, + { + "epoch": 0.8825077118490292, + "grad_norm": 0.14559739717702497, + "learning_rate": 3.576806078762984e-05, + "loss": 1.3935, + "step": 9727 + }, + { + "epoch": 0.882598439484667, + "grad_norm": 0.1248500363781981, + "learning_rate": 3.571350881575086e-05, + "loss": 1.3685, + "step": 9728 + }, + { + "epoch": 0.8826891671203049, + "grad_norm": 0.12182870970168809, + "learning_rate": 3.565899693507879e-05, + "loss": 1.3144, + "step": 9729 + }, + { + "epoch": 0.8827798947559427, + "grad_norm": 0.1179570626471312, + "learning_rate": 3.5604525150320866e-05, + "loss": 1.3547, + "step": 9730 + }, + { + "epoch": 0.8828706223915804, + "grad_norm": 0.12225244509323174, + "learning_rate": 3.55500934661806e-05, + "loss": 1.3493, + "step": 9731 + }, + { + "epoch": 
0.8829613500272183, + "grad_norm": 0.12871676686899916, + "learning_rate": 3.549570188735812e-05, + "loss": 1.3023, + "step": 9732 + }, + { + "epoch": 0.8830520776628561, + "grad_norm": 0.11953233889960217, + "learning_rate": 3.544135041855029e-05, + "loss": 1.3598, + "step": 9733 + }, + { + "epoch": 0.8831428052984939, + "grad_norm": 0.12056893276303321, + "learning_rate": 3.53870390644504e-05, + "loss": 1.3409, + "step": 9734 + }, + { + "epoch": 0.8832335329341318, + "grad_norm": 0.1193628681996684, + "learning_rate": 3.533276782974815e-05, + "loss": 1.3192, + "step": 9735 + }, + { + "epoch": 0.8833242605697695, + "grad_norm": 0.12477222371063958, + "learning_rate": 3.527853671912978e-05, + "loss": 1.3638, + "step": 9736 + }, + { + "epoch": 0.8834149882054073, + "grad_norm": 0.12447458490267169, + "learning_rate": 3.522434573727839e-05, + "loss": 1.3618, + "step": 9737 + }, + { + "epoch": 0.8835057158410452, + "grad_norm": 0.12190844442709439, + "learning_rate": 3.5170194888873184e-05, + "loss": 1.345, + "step": 9738 + }, + { + "epoch": 0.883596443476683, + "grad_norm": 0.11048966811525605, + "learning_rate": 3.511608417859014e-05, + "loss": 1.3897, + "step": 9739 + }, + { + "epoch": 0.8836871711123209, + "grad_norm": 0.12079278969236945, + "learning_rate": 3.506201361110167e-05, + "loss": 1.3897, + "step": 9740 + }, + { + "epoch": 0.8837778987479586, + "grad_norm": 0.13601212943947555, + "learning_rate": 3.500798319107701e-05, + "loss": 1.3626, + "step": 9741 + }, + { + "epoch": 0.8838686263835964, + "grad_norm": 0.11941161029897845, + "learning_rate": 3.495399292318146e-05, + "loss": 1.3631, + "step": 9742 + }, + { + "epoch": 0.8839593540192343, + "grad_norm": 0.12018841337051787, + "learning_rate": 3.490004281207715e-05, + "loss": 1.3375, + "step": 9743 + }, + { + "epoch": 0.8840500816548721, + "grad_norm": 0.11955135429865335, + "learning_rate": 3.484613286242272e-05, + "loss": 1.3079, + "step": 9744 + }, + { + "epoch": 0.8841408092905099, + "grad_norm": 0.1275815061662086, + "learning_rate": 3.479226307887329e-05, + "loss": 1.3519, + "step": 9745 + }, + { + "epoch": 0.8842315369261478, + "grad_norm": 0.12419720681825701, + "learning_rate": 3.4738433466080465e-05, + "loss": 1.338, + "step": 9746 + }, + { + "epoch": 0.8843222645617855, + "grad_norm": 0.12033139377187836, + "learning_rate": 3.468464402869248e-05, + "loss": 1.3451, + "step": 9747 + }, + { + "epoch": 0.8844129921974233, + "grad_norm": 0.11839551847551888, + "learning_rate": 3.463089477135406e-05, + "loss": 1.3183, + "step": 9748 + }, + { + "epoch": 0.8845037198330612, + "grad_norm": 0.13161013273754674, + "learning_rate": 3.457718569870644e-05, + "loss": 1.3391, + "step": 9749 + }, + { + "epoch": 0.884594447468699, + "grad_norm": 0.11488207887270709, + "learning_rate": 3.452351681538751e-05, + "loss": 1.341, + "step": 9750 + }, + { + "epoch": 0.8846851751043368, + "grad_norm": 0.12184449504496386, + "learning_rate": 3.4469888126031524e-05, + "loss": 1.361, + "step": 9751 + }, + { + "epoch": 0.8847759027399746, + "grad_norm": 0.14123117509243827, + "learning_rate": 3.4416299635269264e-05, + "loss": 1.3322, + "step": 9752 + }, + { + "epoch": 0.8848666303756124, + "grad_norm": 0.15134950835516528, + "learning_rate": 3.4362751347728094e-05, + "loss": 1.3465, + "step": 9753 + }, + { + "epoch": 0.8849573580112502, + "grad_norm": 0.1120423346279045, + "learning_rate": 3.430924326803209e-05, + "loss": 1.3225, + "step": 9754 + }, + { + "epoch": 0.8850480856468881, + "grad_norm": 0.11905350449470778, + "learning_rate": 
3.425577540080144e-05, + "loss": 1.3333, + "step": 9755 + }, + { + "epoch": 0.8851388132825259, + "grad_norm": 0.1505402511208834, + "learning_rate": 3.420234775065323e-05, + "loss": 1.3938, + "step": 9756 + }, + { + "epoch": 0.8852295409181636, + "grad_norm": 0.12436904696101872, + "learning_rate": 3.414896032220105e-05, + "loss": 1.3402, + "step": 9757 + }, + { + "epoch": 0.8853202685538015, + "grad_norm": 0.12195340281209328, + "learning_rate": 3.409561312005477e-05, + "loss": 1.3427, + "step": 9758 + }, + { + "epoch": 0.8854109961894393, + "grad_norm": 0.11726890584141504, + "learning_rate": 3.4042306148820925e-05, + "loss": 1.3306, + "step": 9759 + }, + { + "epoch": 0.8855017238250771, + "grad_norm": 0.11729423487390789, + "learning_rate": 3.398903941310266e-05, + "loss": 1.3517, + "step": 9760 + }, + { + "epoch": 0.885592451460715, + "grad_norm": 0.12119953994916731, + "learning_rate": 3.393581291749953e-05, + "loss": 1.3375, + "step": 9761 + }, + { + "epoch": 0.8856831790963527, + "grad_norm": 0.11591178072349959, + "learning_rate": 3.388262666660768e-05, + "loss": 1.3372, + "step": 9762 + }, + { + "epoch": 0.8857739067319905, + "grad_norm": 0.1250882722089296, + "learning_rate": 3.382948066501951e-05, + "loss": 1.3346, + "step": 9763 + }, + { + "epoch": 0.8858646343676284, + "grad_norm": 0.11431176016104151, + "learning_rate": 3.3776374917324606e-05, + "loss": 1.3492, + "step": 9764 + }, + { + "epoch": 0.8859553620032662, + "grad_norm": 0.13227698305133856, + "learning_rate": 3.3723309428108416e-05, + "loss": 1.3645, + "step": 9765 + }, + { + "epoch": 0.886046089638904, + "grad_norm": 0.1446905621472708, + "learning_rate": 3.367028420195306e-05, + "loss": 1.364, + "step": 9766 + }, + { + "epoch": 0.8861368172745419, + "grad_norm": 0.13472222259668845, + "learning_rate": 3.361729924343754e-05, + "loss": 1.3714, + "step": 9767 + }, + { + "epoch": 0.8862275449101796, + "grad_norm": 0.12288760685551982, + "learning_rate": 3.356435455713691e-05, + "loss": 1.3827, + "step": 9768 + }, + { + "epoch": 0.8863182725458174, + "grad_norm": 0.11763226029002696, + "learning_rate": 3.351145014762297e-05, + "loss": 1.3201, + "step": 9769 + }, + { + "epoch": 0.8864090001814553, + "grad_norm": 0.118530000217427, + "learning_rate": 3.345858601946411e-05, + "loss": 1.3701, + "step": 9770 + }, + { + "epoch": 0.8864997278170931, + "grad_norm": 0.12141893245747647, + "learning_rate": 3.340576217722508e-05, + "loss": 1.3374, + "step": 9771 + }, + { + "epoch": 0.8865904554527309, + "grad_norm": 0.1273852989269849, + "learning_rate": 3.3352978625467265e-05, + "loss": 1.3552, + "step": 9772 + }, + { + "epoch": 0.8866811830883687, + "grad_norm": 0.12359163859933002, + "learning_rate": 3.330023536874849e-05, + "loss": 1.3476, + "step": 9773 + }, + { + "epoch": 0.8867719107240065, + "grad_norm": 0.11455099889125318, + "learning_rate": 3.324753241162326e-05, + "loss": 1.3646, + "step": 9774 + }, + { + "epoch": 0.8868626383596443, + "grad_norm": 0.12400414184754059, + "learning_rate": 3.319486975864239e-05, + "loss": 1.3519, + "step": 9775 + }, + { + "epoch": 0.8869533659952822, + "grad_norm": 0.13109451709523862, + "learning_rate": 3.3142247414353235e-05, + "loss": 1.3642, + "step": 9776 + }, + { + "epoch": 0.88704409363092, + "grad_norm": 0.12196045217457527, + "learning_rate": 3.3089665383299885e-05, + "loss": 1.3704, + "step": 9777 + }, + { + "epoch": 0.8871348212665577, + "grad_norm": 0.13467302239559684, + "learning_rate": 3.303712367002276e-05, + "loss": 1.347, + "step": 9778 + }, + { + "epoch": 
0.8872255489021956, + "grad_norm": 0.11950324767842807, + "learning_rate": 3.298462227905874e-05, + "loss": 1.3754, + "step": 9779 + }, + { + "epoch": 0.8873162765378334, + "grad_norm": 0.13226426960780582, + "learning_rate": 3.293216121494142e-05, + "loss": 1.3477, + "step": 9780 + }, + { + "epoch": 0.8874070041734713, + "grad_norm": 0.11809847670867633, + "learning_rate": 3.2879740482200935e-05, + "loss": 1.3463, + "step": 9781 + }, + { + "epoch": 0.8874977318091091, + "grad_norm": 0.12245971995519263, + "learning_rate": 3.2827360085363636e-05, + "loss": 1.3749, + "step": 9782 + }, + { + "epoch": 0.8875884594447468, + "grad_norm": 0.11951358795837227, + "learning_rate": 3.277502002895261e-05, + "loss": 1.3526, + "step": 9783 + }, + { + "epoch": 0.8876791870803847, + "grad_norm": 0.12386537709702809, + "learning_rate": 3.272272031748758e-05, + "loss": 1.4031, + "step": 9784 + }, + { + "epoch": 0.8877699147160225, + "grad_norm": 0.12683542257599995, + "learning_rate": 3.2670460955484426e-05, + "loss": 1.3531, + "step": 9785 + }, + { + "epoch": 0.8878606423516603, + "grad_norm": 0.1250633564711926, + "learning_rate": 3.261824194745583e-05, + "loss": 1.3634, + "step": 9786 + }, + { + "epoch": 0.8879513699872982, + "grad_norm": 0.11940196882373622, + "learning_rate": 3.256606329791095e-05, + "loss": 1.3523, + "step": 9787 + }, + { + "epoch": 0.888042097622936, + "grad_norm": 0.13187959317493067, + "learning_rate": 3.2513925011355414e-05, + "loss": 1.3663, + "step": 9788 + }, + { + "epoch": 0.8881328252585737, + "grad_norm": 0.11912263936975595, + "learning_rate": 3.24618270922914e-05, + "loss": 1.3237, + "step": 9789 + }, + { + "epoch": 0.8882235528942116, + "grad_norm": 0.13568223960309259, + "learning_rate": 3.2409769545217406e-05, + "loss": 1.3876, + "step": 9790 + }, + { + "epoch": 0.8883142805298494, + "grad_norm": 0.12054576079660233, + "learning_rate": 3.235775237462885e-05, + "loss": 1.3683, + "step": 9791 + }, + { + "epoch": 0.8884050081654872, + "grad_norm": 0.1286445496913973, + "learning_rate": 3.2305775585017294e-05, + "loss": 1.3522, + "step": 9792 + }, + { + "epoch": 0.8884957358011251, + "grad_norm": 0.1230405086205489, + "learning_rate": 3.225383918087083e-05, + "loss": 1.3331, + "step": 9793 + }, + { + "epoch": 0.8885864634367628, + "grad_norm": 0.11791429207376636, + "learning_rate": 3.220194316667435e-05, + "loss": 1.4071, + "step": 9794 + }, + { + "epoch": 0.8886771910724006, + "grad_norm": 0.15587565819168012, + "learning_rate": 3.215008754690907e-05, + "loss": 1.3553, + "step": 9795 + }, + { + "epoch": 0.8887679187080385, + "grad_norm": 0.11864946666577307, + "learning_rate": 3.209827232605267e-05, + "loss": 1.3528, + "step": 9796 + }, + { + "epoch": 0.8888586463436763, + "grad_norm": 0.11394931836058414, + "learning_rate": 3.2046497508579355e-05, + "loss": 1.3553, + "step": 9797 + }, + { + "epoch": 0.8889493739793141, + "grad_norm": 0.125900521087663, + "learning_rate": 3.199476309895999e-05, + "loss": 1.3295, + "step": 9798 + }, + { + "epoch": 0.889040101614952, + "grad_norm": 0.1360337839972989, + "learning_rate": 3.1943069101661834e-05, + "loss": 1.3198, + "step": 9799 + }, + { + "epoch": 0.8891308292505897, + "grad_norm": 0.11457352885308057, + "learning_rate": 3.18914155211486e-05, + "loss": 1.3194, + "step": 9800 + }, + { + "epoch": 0.8892215568862275, + "grad_norm": 0.12809363266607954, + "learning_rate": 3.183980236188066e-05, + "loss": 1.3265, + "step": 9801 + }, + { + "epoch": 0.8893122845218654, + "grad_norm": 0.12235185424677009, + "learning_rate": 
3.1788229628314737e-05, + "loss": 1.3256, + "step": 9802 + }, + { + "epoch": 0.8894030121575032, + "grad_norm": 0.13766980286793684, + "learning_rate": 3.1736697324904304e-05, + "loss": 1.3853, + "step": 9803 + }, + { + "epoch": 0.889493739793141, + "grad_norm": 0.12086953264590578, + "learning_rate": 3.168520545609893e-05, + "loss": 1.3723, + "step": 9804 + }, + { + "epoch": 0.8895844674287788, + "grad_norm": 0.13149111646859046, + "learning_rate": 3.163375402634516e-05, + "loss": 1.3723, + "step": 9805 + }, + { + "epoch": 0.8896751950644166, + "grad_norm": 0.1214256016096796, + "learning_rate": 3.1582343040085836e-05, + "loss": 1.3394, + "step": 9806 + }, + { + "epoch": 0.8897659227000544, + "grad_norm": 0.1383819489032147, + "learning_rate": 3.153097250176007e-05, + "loss": 1.3616, + "step": 9807 + }, + { + "epoch": 0.8898566503356923, + "grad_norm": 0.11543376855626213, + "learning_rate": 3.147964241580398e-05, + "loss": 1.3761, + "step": 9808 + }, + { + "epoch": 0.8899473779713301, + "grad_norm": 0.14907992176682705, + "learning_rate": 3.142835278664985e-05, + "loss": 1.3485, + "step": 9809 + }, + { + "epoch": 0.8900381056069678, + "grad_norm": 0.12393776040877776, + "learning_rate": 3.137710361872642e-05, + "loss": 1.3242, + "step": 9810 + }, + { + "epoch": 0.8901288332426057, + "grad_norm": 0.11603252133565976, + "learning_rate": 3.132589491645915e-05, + "loss": 1.3542, + "step": 9811 + }, + { + "epoch": 0.8902195608782435, + "grad_norm": 0.11668240821097689, + "learning_rate": 3.127472668427001e-05, + "loss": 1.3433, + "step": 9812 + }, + { + "epoch": 0.8903102885138813, + "grad_norm": 0.11957795694535908, + "learning_rate": 3.122359892657728e-05, + "loss": 1.3514, + "step": 9813 + }, + { + "epoch": 0.8904010161495192, + "grad_norm": 0.12060433143917923, + "learning_rate": 3.117251164779583e-05, + "loss": 1.3777, + "step": 9814 + }, + { + "epoch": 0.8904917437851569, + "grad_norm": 0.128972724215107, + "learning_rate": 3.112146485233719e-05, + "loss": 1.3684, + "step": 9815 + }, + { + "epoch": 0.8905824714207947, + "grad_norm": 0.13852514492527107, + "learning_rate": 3.10704585446091e-05, + "loss": 1.3809, + "step": 9816 + }, + { + "epoch": 0.8906731990564326, + "grad_norm": 0.13558901865283435, + "learning_rate": 3.1019492729016e-05, + "loss": 1.2918, + "step": 9817 + }, + { + "epoch": 0.8907639266920704, + "grad_norm": 0.15269013284409272, + "learning_rate": 3.096856740995885e-05, + "loss": 1.3338, + "step": 9818 + }, + { + "epoch": 0.8908546543277083, + "grad_norm": 0.12009873005368908, + "learning_rate": 3.091768259183503e-05, + "loss": 1.3715, + "step": 9819 + }, + { + "epoch": 0.890945381963346, + "grad_norm": 0.11705426130575325, + "learning_rate": 3.086683827903852e-05, + "loss": 1.3249, + "step": 9820 + }, + { + "epoch": 0.8910361095989838, + "grad_norm": 0.12518552874823066, + "learning_rate": 3.081603447595954e-05, + "loss": 1.3709, + "step": 9821 + }, + { + "epoch": 0.8911268372346217, + "grad_norm": 0.11980374118409427, + "learning_rate": 3.076527118698524e-05, + "loss": 1.3673, + "step": 9822 + }, + { + "epoch": 0.8912175648702595, + "grad_norm": 0.1293705265573188, + "learning_rate": 3.071454841649896e-05, + "loss": 1.3598, + "step": 9823 + }, + { + "epoch": 0.8913082925058973, + "grad_norm": 0.1213909355744274, + "learning_rate": 3.0663866168880504e-05, + "loss": 1.3322, + "step": 9824 + }, + { + "epoch": 0.8913990201415352, + "grad_norm": 0.11659573450509193, + "learning_rate": 3.061322444850639e-05, + "loss": 1.3306, + "step": 9825 + }, + { + "epoch": 
0.8914897477771729, + "grad_norm": 0.18510986143467517, + "learning_rate": 3.056262325974951e-05, + "loss": 1.3563, + "step": 9826 + }, + { + "epoch": 0.8915804754128107, + "grad_norm": 0.12267733535167365, + "learning_rate": 3.051206260697931e-05, + "loss": 1.3347, + "step": 9827 + }, + { + "epoch": 0.8916712030484486, + "grad_norm": 0.3865949210451871, + "learning_rate": 3.0461542494561733e-05, + "loss": 1.346, + "step": 9828 + }, + { + "epoch": 0.8917619306840864, + "grad_norm": 0.1229859759997696, + "learning_rate": 3.041106292685919e-05, + "loss": 1.343, + "step": 9829 + }, + { + "epoch": 0.8918526583197242, + "grad_norm": 0.1306273157474598, + "learning_rate": 3.0360623908230524e-05, + "loss": 1.4081, + "step": 9830 + }, + { + "epoch": 0.891943385955362, + "grad_norm": 0.12486756586899703, + "learning_rate": 3.0310225443031193e-05, + "loss": 1.3308, + "step": 9831 + }, + { + "epoch": 0.8920341135909998, + "grad_norm": 0.11572796522915738, + "learning_rate": 3.025986753561316e-05, + "loss": 1.3511, + "step": 9832 + }, + { + "epoch": 0.8921248412266376, + "grad_norm": 0.13816144998603314, + "learning_rate": 3.020955019032473e-05, + "loss": 1.3523, + "step": 9833 + }, + { + "epoch": 0.8922155688622755, + "grad_norm": 0.12382031254836569, + "learning_rate": 3.0159273411510866e-05, + "loss": 1.3586, + "step": 9834 + }, + { + "epoch": 0.8923062964979133, + "grad_norm": 0.1325561685091119, + "learning_rate": 3.01090372035131e-05, + "loss": 1.3892, + "step": 9835 + }, + { + "epoch": 0.892397024133551, + "grad_norm": 0.11193406150211348, + "learning_rate": 3.0058841570669183e-05, + "loss": 1.326, + "step": 9836 + }, + { + "epoch": 0.8924877517691889, + "grad_norm": 0.11893009102111843, + "learning_rate": 3.0008686517313532e-05, + "loss": 1.3169, + "step": 9837 + }, + { + "epoch": 0.8925784794048267, + "grad_norm": 0.11628161926888128, + "learning_rate": 2.9958572047777132e-05, + "loss": 1.3711, + "step": 9838 + }, + { + "epoch": 0.8926692070404645, + "grad_norm": 0.1203666074417914, + "learning_rate": 2.9908498166387298e-05, + "loss": 1.3765, + "step": 9839 + }, + { + "epoch": 0.8927599346761024, + "grad_norm": 0.1416923684679805, + "learning_rate": 2.9858464877467905e-05, + "loss": 1.3596, + "step": 9840 + }, + { + "epoch": 0.8928506623117402, + "grad_norm": 0.11399755715190842, + "learning_rate": 2.980847218533922e-05, + "loss": 1.3201, + "step": 9841 + }, + { + "epoch": 0.8929413899473779, + "grad_norm": 0.11421674807175977, + "learning_rate": 2.9758520094318455e-05, + "loss": 1.3457, + "step": 9842 + }, + { + "epoch": 0.8930321175830158, + "grad_norm": 0.12351231160394716, + "learning_rate": 2.9708608608718768e-05, + "loss": 1.3677, + "step": 9843 + }, + { + "epoch": 0.8931228452186536, + "grad_norm": 0.12142273984906578, + "learning_rate": 2.9658737732849937e-05, + "loss": 1.3452, + "step": 9844 + }, + { + "epoch": 0.8932135728542914, + "grad_norm": 0.11230519840262329, + "learning_rate": 2.960890747101852e-05, + "loss": 1.3376, + "step": 9845 + }, + { + "epoch": 0.8933043004899293, + "grad_norm": 0.11802806837177415, + "learning_rate": 2.9559117827527292e-05, + "loss": 1.3268, + "step": 9846 + }, + { + "epoch": 0.893395028125567, + "grad_norm": 0.12574866181426003, + "learning_rate": 2.9509368806675485e-05, + "loss": 1.3561, + "step": 9847 + }, + { + "epoch": 0.8934857557612048, + "grad_norm": 0.11743095730290318, + "learning_rate": 2.9459660412759103e-05, + "loss": 1.3746, + "step": 9848 + }, + { + "epoch": 0.8935764833968427, + "grad_norm": 0.17653717208370695, + "learning_rate": 
2.940999265007033e-05, + "loss": 1.3461, + "step": 9849 + }, + { + "epoch": 0.8936672110324805, + "grad_norm": 0.12514865147985726, + "learning_rate": 2.9360365522898113e-05, + "loss": 1.4025, + "step": 9850 + }, + { + "epoch": 0.8937579386681183, + "grad_norm": 0.12045711812859357, + "learning_rate": 2.931077903552759e-05, + "loss": 1.351, + "step": 9851 + }, + { + "epoch": 0.8938486663037561, + "grad_norm": 0.1242657722271409, + "learning_rate": 2.9261233192240775e-05, + "loss": 1.3539, + "step": 9852 + }, + { + "epoch": 0.8939393939393939, + "grad_norm": 0.1179752480404421, + "learning_rate": 2.921172799731586e-05, + "loss": 1.372, + "step": 9853 + }, + { + "epoch": 0.8940301215750317, + "grad_norm": 0.12231415358628991, + "learning_rate": 2.9162263455027538e-05, + "loss": 1.3669, + "step": 9854 + }, + { + "epoch": 0.8941208492106696, + "grad_norm": 0.12237105779095325, + "learning_rate": 2.911283956964722e-05, + "loss": 1.3809, + "step": 9855 + }, + { + "epoch": 0.8942115768463074, + "grad_norm": 0.12565860216103658, + "learning_rate": 2.9063456345442662e-05, + "loss": 1.3694, + "step": 9856 + }, + { + "epoch": 0.8943023044819453, + "grad_norm": 0.11255770446770161, + "learning_rate": 2.90141137866779e-05, + "loss": 1.2987, + "step": 9857 + }, + { + "epoch": 0.894393032117583, + "grad_norm": 0.11582868387237975, + "learning_rate": 2.8964811897613907e-05, + "loss": 1.3444, + "step": 9858 + }, + { + "epoch": 0.8944837597532208, + "grad_norm": 0.11913393083707963, + "learning_rate": 2.891555068250784e-05, + "loss": 1.3304, + "step": 9859 + }, + { + "epoch": 0.8945744873888587, + "grad_norm": 0.11891874560137915, + "learning_rate": 2.886633014561346e-05, + "loss": 1.3305, + "step": 9860 + }, + { + "epoch": 0.8946652150244965, + "grad_norm": 0.13693716376259615, + "learning_rate": 2.8817150291180815e-05, + "loss": 1.3475, + "step": 9861 + }, + { + "epoch": 0.8947559426601343, + "grad_norm": 0.1242494427444496, + "learning_rate": 2.876801112345673e-05, + "loss": 1.3656, + "step": 9862 + }, + { + "epoch": 0.8948466702957721, + "grad_norm": 0.11255296903288242, + "learning_rate": 2.871891264668436e-05, + "loss": 1.3341, + "step": 9863 + }, + { + "epoch": 0.8949373979314099, + "grad_norm": 0.1342427931197088, + "learning_rate": 2.8669854865103262e-05, + "loss": 1.3407, + "step": 9864 + }, + { + "epoch": 0.8950281255670477, + "grad_norm": 0.14487806146654655, + "learning_rate": 2.862083778294966e-05, + "loss": 1.3507, + "step": 9865 + }, + { + "epoch": 0.8951188532026856, + "grad_norm": 0.1131193699976683, + "learning_rate": 2.8571861404456277e-05, + "loss": 1.3557, + "step": 9866 + }, + { + "epoch": 0.8952095808383234, + "grad_norm": 0.12082562329588806, + "learning_rate": 2.8522925733852177e-05, + "loss": 1.3704, + "step": 9867 + }, + { + "epoch": 0.8953003084739611, + "grad_norm": 0.1344781968105983, + "learning_rate": 2.8474030775362814e-05, + "loss": 1.3407, + "step": 9868 + }, + { + "epoch": 0.895391036109599, + "grad_norm": 0.11875028668896553, + "learning_rate": 2.8425176533210474e-05, + "loss": 1.3617, + "step": 9869 + }, + { + "epoch": 0.8954817637452368, + "grad_norm": 0.12261215834953415, + "learning_rate": 2.8376363011613615e-05, + "loss": 1.3422, + "step": 9870 + }, + { + "epoch": 0.8955724913808746, + "grad_norm": 0.12631098659280832, + "learning_rate": 2.8327590214787256e-05, + "loss": 1.3151, + "step": 9871 + }, + { + "epoch": 0.8956632190165125, + "grad_norm": 0.12640013219535717, + "learning_rate": 2.827885814694303e-05, + "loss": 1.3589, + "step": 9872 + }, + { + "epoch": 
0.8957539466521502, + "grad_norm": 0.12061988426950417, + "learning_rate": 2.823016681228896e-05, + "loss": 1.3399, + "step": 9873 + }, + { + "epoch": 0.895844674287788, + "grad_norm": 0.11754075172469093, + "learning_rate": 2.8181516215029568e-05, + "loss": 1.292, + "step": 9874 + }, + { + "epoch": 0.8959354019234259, + "grad_norm": 0.12481146730654154, + "learning_rate": 2.8132906359365664e-05, + "loss": 1.3668, + "step": 9875 + }, + { + "epoch": 0.8960261295590637, + "grad_norm": 0.14118048861741597, + "learning_rate": 2.808433724949494e-05, + "loss": 1.3306, + "step": 9876 + }, + { + "epoch": 0.8961168571947015, + "grad_norm": 0.11715468536954403, + "learning_rate": 2.803580888961127e-05, + "loss": 1.3364, + "step": 9877 + }, + { + "epoch": 0.8962075848303394, + "grad_norm": 0.13391744255265578, + "learning_rate": 2.7987321283904966e-05, + "loss": 1.3892, + "step": 9878 + }, + { + "epoch": 0.8962983124659771, + "grad_norm": 0.12923776204590068, + "learning_rate": 2.7938874436563122e-05, + "loss": 1.3163, + "step": 9879 + }, + { + "epoch": 0.8963890401016149, + "grad_norm": 0.11694492603295288, + "learning_rate": 2.789046835176895e-05, + "loss": 1.3513, + "step": 9880 + }, + { + "epoch": 0.8964797677372528, + "grad_norm": 0.1347138905607355, + "learning_rate": 2.7842103033702493e-05, + "loss": 1.3415, + "step": 9881 + }, + { + "epoch": 0.8965704953728906, + "grad_norm": 0.11735107529554478, + "learning_rate": 2.7793778486539968e-05, + "loss": 1.3727, + "step": 9882 + }, + { + "epoch": 0.8966612230085284, + "grad_norm": 0.11381011404069802, + "learning_rate": 2.774549471445431e-05, + "loss": 1.336, + "step": 9883 + }, + { + "epoch": 0.8967519506441662, + "grad_norm": 0.12199197407666247, + "learning_rate": 2.7697251721614858e-05, + "loss": 1.3314, + "step": 9884 + }, + { + "epoch": 0.896842678279804, + "grad_norm": 0.13076167959448085, + "learning_rate": 2.7649049512187162e-05, + "loss": 1.3096, + "step": 9885 + }, + { + "epoch": 0.8969334059154418, + "grad_norm": 0.12177716802610085, + "learning_rate": 2.7600888090333786e-05, + "loss": 1.3432, + "step": 9886 + }, + { + "epoch": 0.8970241335510797, + "grad_norm": 0.12122769375662712, + "learning_rate": 2.7552767460213347e-05, + "loss": 1.3261, + "step": 9887 + }, + { + "epoch": 0.8971148611867175, + "grad_norm": 0.13823135199065717, + "learning_rate": 2.7504687625981016e-05, + "loss": 1.3263, + "step": 9888 + }, + { + "epoch": 0.8972055888223552, + "grad_norm": 0.12036764691687378, + "learning_rate": 2.7456648591788536e-05, + "loss": 1.328, + "step": 9889 + }, + { + "epoch": 0.8972963164579931, + "grad_norm": 0.1194724892223881, + "learning_rate": 2.7408650361784137e-05, + "loss": 1.3637, + "step": 9890 + }, + { + "epoch": 0.8973870440936309, + "grad_norm": 0.12342901378049782, + "learning_rate": 2.736069294011245e-05, + "loss": 1.3596, + "step": 9891 + }, + { + "epoch": 0.8974777717292687, + "grad_norm": 0.11860492804864448, + "learning_rate": 2.7312776330914503e-05, + "loss": 1.3443, + "step": 9892 + }, + { + "epoch": 0.8975684993649066, + "grad_norm": 0.1365162927919177, + "learning_rate": 2.7264900538328087e-05, + "loss": 1.3554, + "step": 9893 + }, + { + "epoch": 0.8976592270005443, + "grad_norm": 0.11525127260407886, + "learning_rate": 2.7217065566487177e-05, + "loss": 1.3386, + "step": 9894 + }, + { + "epoch": 0.8977499546361822, + "grad_norm": 0.11691209579713327, + "learning_rate": 2.7169271419522247e-05, + "loss": 1.3518, + "step": 9895 + }, + { + "epoch": 0.89784068227182, + "grad_norm": 0.1222227208649334, + "learning_rate": 
2.7121518101560382e-05, + "loss": 1.3154, + "step": 9896 + }, + { + "epoch": 0.8979314099074578, + "grad_norm": 0.1254016058192407, + "learning_rate": 2.707380561672529e-05, + "loss": 1.3972, + "step": 9897 + }, + { + "epoch": 0.8980221375430957, + "grad_norm": 0.14279144782971803, + "learning_rate": 2.7026133969136723e-05, + "loss": 1.3283, + "step": 9898 + }, + { + "epoch": 0.8981128651787335, + "grad_norm": 0.11652505727724495, + "learning_rate": 2.6978503162911118e-05, + "loss": 1.3957, + "step": 9899 + }, + { + "epoch": 0.8982035928143712, + "grad_norm": 0.11745406544489012, + "learning_rate": 2.6930913202161566e-05, + "loss": 1.3647, + "step": 9900 + }, + { + "epoch": 0.8982943204500091, + "grad_norm": 0.11854679957889835, + "learning_rate": 2.68833640909974e-05, + "loss": 1.3425, + "step": 9901 + }, + { + "epoch": 0.8983850480856469, + "grad_norm": 0.13374892620485168, + "learning_rate": 2.683585583352438e-05, + "loss": 1.3596, + "step": 9902 + }, + { + "epoch": 0.8984757757212847, + "grad_norm": 0.1215202383187937, + "learning_rate": 2.6788388433844956e-05, + "loss": 1.3901, + "step": 9903 + }, + { + "epoch": 0.8985665033569226, + "grad_norm": 0.12638322523510828, + "learning_rate": 2.6740961896058015e-05, + "loss": 1.357, + "step": 9904 + }, + { + "epoch": 0.8986572309925603, + "grad_norm": 0.14066903386666793, + "learning_rate": 2.669357622425872e-05, + "loss": 1.3438, + "step": 9905 + }, + { + "epoch": 0.8987479586281981, + "grad_norm": 0.11101597731784701, + "learning_rate": 2.6646231422538913e-05, + "loss": 1.3478, + "step": 9906 + }, + { + "epoch": 0.898838686263836, + "grad_norm": 0.12003558274545181, + "learning_rate": 2.659892749498677e-05, + "loss": 1.3341, + "step": 9907 + }, + { + "epoch": 0.8989294138994738, + "grad_norm": 0.11740346687802077, + "learning_rate": 2.655166444568702e-05, + "loss": 1.3619, + "step": 9908 + }, + { + "epoch": 0.8990201415351116, + "grad_norm": 0.1150646642085171, + "learning_rate": 2.6504442278720787e-05, + "loss": 1.3357, + "step": 9909 + }, + { + "epoch": 0.8991108691707494, + "grad_norm": 0.12011439578926801, + "learning_rate": 2.6457260998165755e-05, + "loss": 1.346, + "step": 9910 + }, + { + "epoch": 0.8992015968063872, + "grad_norm": 0.12410249485429316, + "learning_rate": 2.6410120608096e-05, + "loss": 1.3232, + "step": 9911 + }, + { + "epoch": 0.899292324442025, + "grad_norm": 0.12163671529714264, + "learning_rate": 2.6363021112582153e-05, + "loss": 1.3579, + "step": 9912 + }, + { + "epoch": 0.8993830520776629, + "grad_norm": 0.12501002663086996, + "learning_rate": 2.6315962515691293e-05, + "loss": 1.334, + "step": 9913 + }, + { + "epoch": 0.8994737797133007, + "grad_norm": 0.11830778983106445, + "learning_rate": 2.6268944821486897e-05, + "loss": 1.3691, + "step": 9914 + }, + { + "epoch": 0.8995645073489384, + "grad_norm": 0.12594301603935382, + "learning_rate": 2.622196803402882e-05, + "loss": 1.3837, + "step": 9915 + }, + { + "epoch": 0.8996552349845763, + "grad_norm": 0.1188734937566749, + "learning_rate": 2.6175032157373714e-05, + "loss": 1.3423, + "step": 9916 + }, + { + "epoch": 0.8997459626202141, + "grad_norm": 0.11802556343569373, + "learning_rate": 2.6128137195574442e-05, + "loss": 1.3547, + "step": 9917 + }, + { + "epoch": 0.8998366902558519, + "grad_norm": 0.1160946124135448, + "learning_rate": 2.608128315268038e-05, + "loss": 1.3482, + "step": 9918 + }, + { + "epoch": 0.8999274178914898, + "grad_norm": 0.11837801701956685, + "learning_rate": 2.6034470032737177e-05, + "loss": 1.3437, + "step": 9919 + }, + { + "epoch": 
0.9000181455271276, + "grad_norm": 0.12466106258240107, + "learning_rate": 2.5987697839787496e-05, + "loss": 1.374, + "step": 9920 + }, + { + "epoch": 0.9001088731627653, + "grad_norm": 0.11720867791703111, + "learning_rate": 2.5940966577869985e-05, + "loss": 1.3018, + "step": 9921 + }, + { + "epoch": 0.9001996007984032, + "grad_norm": 0.11749314805331643, + "learning_rate": 2.589427625101981e-05, + "loss": 1.3315, + "step": 9922 + }, + { + "epoch": 0.900290328434041, + "grad_norm": 0.11568695336527364, + "learning_rate": 2.5847626863268804e-05, + "loss": 1.3401, + "step": 9923 + }, + { + "epoch": 0.9003810560696788, + "grad_norm": 0.1215921381379428, + "learning_rate": 2.5801018418645128e-05, + "loss": 1.3553, + "step": 9924 + }, + { + "epoch": 0.9004717837053167, + "grad_norm": 0.12616393513626575, + "learning_rate": 2.575445092117329e-05, + "loss": 1.3353, + "step": 9925 + }, + { + "epoch": 0.9005625113409544, + "grad_norm": 0.11421409697461138, + "learning_rate": 2.5707924374874625e-05, + "loss": 1.365, + "step": 9926 + }, + { + "epoch": 0.9006532389765922, + "grad_norm": 0.11136122321757609, + "learning_rate": 2.5661438783766476e-05, + "loss": 1.3713, + "step": 9927 + }, + { + "epoch": 0.9007439666122301, + "grad_norm": 0.11790208233965495, + "learning_rate": 2.5614994151863136e-05, + "loss": 1.3645, + "step": 9928 + }, + { + "epoch": 0.9008346942478679, + "grad_norm": 0.12062527702764198, + "learning_rate": 2.5568590483174838e-05, + "loss": 1.3563, + "step": 9929 + }, + { + "epoch": 0.9009254218835057, + "grad_norm": 0.1260418878516838, + "learning_rate": 2.552222778170876e-05, + "loss": 1.3438, + "step": 9930 + }, + { + "epoch": 0.9010161495191435, + "grad_norm": 0.11657473761281027, + "learning_rate": 2.5475906051468267e-05, + "loss": 1.34, + "step": 9931 + }, + { + "epoch": 0.9011068771547813, + "grad_norm": 0.12393700892102176, + "learning_rate": 2.5429625296453152e-05, + "loss": 1.3008, + "step": 9932 + }, + { + "epoch": 0.9011976047904192, + "grad_norm": 0.11934153924254592, + "learning_rate": 2.5383385520659884e-05, + "loss": 1.3241, + "step": 9933 + }, + { + "epoch": 0.901288332426057, + "grad_norm": 0.1252747231680287, + "learning_rate": 2.5337186728081275e-05, + "loss": 1.32, + "step": 9934 + }, + { + "epoch": 0.9013790600616948, + "grad_norm": 0.12461875338315706, + "learning_rate": 2.5291028922706405e-05, + "loss": 1.3101, + "step": 9935 + }, + { + "epoch": 0.9014697876973327, + "grad_norm": 0.12466204673107037, + "learning_rate": 2.52449121085212e-05, + "loss": 1.3603, + "step": 9936 + }, + { + "epoch": 0.9015605153329704, + "grad_norm": 0.11995247284131286, + "learning_rate": 2.5198836289507864e-05, + "loss": 1.371, + "step": 9937 + }, + { + "epoch": 0.9016512429686082, + "grad_norm": 0.1523459347575432, + "learning_rate": 2.5152801469644994e-05, + "loss": 1.3189, + "step": 9938 + }, + { + "epoch": 0.9017419706042461, + "grad_norm": 0.1153940125617346, + "learning_rate": 2.510680765290768e-05, + "loss": 1.3219, + "step": 9939 + }, + { + "epoch": 0.9018326982398839, + "grad_norm": 0.11938955478122448, + "learning_rate": 2.5060854843267533e-05, + "loss": 1.3751, + "step": 9940 + }, + { + "epoch": 0.9019234258755217, + "grad_norm": 0.13199261713235946, + "learning_rate": 2.5014943044692597e-05, + "loss": 1.4087, + "step": 9941 + }, + { + "epoch": 0.9020141535111595, + "grad_norm": 0.12660058591008252, + "learning_rate": 2.4969072261147198e-05, + "loss": 1.3751, + "step": 9942 + }, + { + "epoch": 0.9021048811467973, + "grad_norm": 0.1164303286142808, + "learning_rate": 
2.4923242496592503e-05, + "loss": 1.3019, + "step": 9943 + }, + { + "epoch": 0.9021956087824351, + "grad_norm": 0.12442100248765432, + "learning_rate": 2.4877453754985845e-05, + "loss": 1.3336, + "step": 9944 + }, + { + "epoch": 0.902286336418073, + "grad_norm": 0.11601657141106135, + "learning_rate": 2.483170604028112e-05, + "loss": 1.3024, + "step": 9945 + }, + { + "epoch": 0.9023770640537108, + "grad_norm": 0.12450227479352757, + "learning_rate": 2.4785999356428557e-05, + "loss": 1.3071, + "step": 9946 + }, + { + "epoch": 0.9024677916893485, + "grad_norm": 0.12959220994524498, + "learning_rate": 2.4740333707374996e-05, + "loss": 1.3295, + "step": 9947 + }, + { + "epoch": 0.9025585193249864, + "grad_norm": 0.11819952468101064, + "learning_rate": 2.469470909706373e-05, + "loss": 1.3636, + "step": 9948 + }, + { + "epoch": 0.9026492469606242, + "grad_norm": 0.12116379989321326, + "learning_rate": 2.464912552943427e-05, + "loss": 1.3578, + "step": 9949 + }, + { + "epoch": 0.902739974596262, + "grad_norm": 0.12829218342561083, + "learning_rate": 2.4603583008422915e-05, + "loss": 1.3474, + "step": 9950 + }, + { + "epoch": 0.9028307022318999, + "grad_norm": 0.11817331356005537, + "learning_rate": 2.4558081537962296e-05, + "loss": 1.331, + "step": 9951 + }, + { + "epoch": 0.9029214298675377, + "grad_norm": 0.1165803070682975, + "learning_rate": 2.451262112198138e-05, + "loss": 1.3121, + "step": 9952 + }, + { + "epoch": 0.9030121575031754, + "grad_norm": 0.11856636252141158, + "learning_rate": 2.44672017644057e-05, + "loss": 1.3554, + "step": 9953 + }, + { + "epoch": 0.9031028851388133, + "grad_norm": 0.124922420986568, + "learning_rate": 2.442182346915722e-05, + "loss": 1.3355, + "step": 9954 + }, + { + "epoch": 0.9031936127744511, + "grad_norm": 0.12538377858359945, + "learning_rate": 2.4376486240154428e-05, + "loss": 1.3774, + "step": 9955 + }, + { + "epoch": 0.9032843404100889, + "grad_norm": 0.12075213902605478, + "learning_rate": 2.433119008131207e-05, + "loss": 1.3101, + "step": 9956 + }, + { + "epoch": 0.9033750680457268, + "grad_norm": 0.12568424066411563, + "learning_rate": 2.4285934996541635e-05, + "loss": 1.3614, + "step": 9957 + }, + { + "epoch": 0.9034657956813645, + "grad_norm": 0.1184638026702741, + "learning_rate": 2.4240720989750777e-05, + "loss": 1.3628, + "step": 9958 + }, + { + "epoch": 0.9035565233170023, + "grad_norm": 0.1272447258636944, + "learning_rate": 2.4195548064843808e-05, + "loss": 1.3767, + "step": 9959 + }, + { + "epoch": 0.9036472509526402, + "grad_norm": 0.12266013395482252, + "learning_rate": 2.415041622572134e-05, + "loss": 1.3283, + "step": 9960 + }, + { + "epoch": 0.903737978588278, + "grad_norm": 0.12256087164499475, + "learning_rate": 2.410532547628064e-05, + "loss": 1.3484, + "step": 9961 + }, + { + "epoch": 0.9038287062239158, + "grad_norm": 0.1275656240909804, + "learning_rate": 2.40602758204152e-05, + "loss": 1.3426, + "step": 9962 + }, + { + "epoch": 0.9039194338595536, + "grad_norm": 0.12593901775533564, + "learning_rate": 2.401526726201503e-05, + "loss": 1.3596, + "step": 9963 + }, + { + "epoch": 0.9040101614951914, + "grad_norm": 0.1179811036877432, + "learning_rate": 2.3970299804966734e-05, + "loss": 1.3338, + "step": 9964 + }, + { + "epoch": 0.9041008891308292, + "grad_norm": 0.12329965342409065, + "learning_rate": 2.3925373453153265e-05, + "loss": 1.3638, + "step": 9965 + }, + { + "epoch": 0.9041916167664671, + "grad_norm": 0.11470855966043629, + "learning_rate": 2.3880488210453853e-05, + "loss": 1.3099, + "step": 9966 + }, + { + "epoch": 
0.9042823444021049, + "grad_norm": 0.1307554127275303, + "learning_rate": 2.3835644080744455e-05, + "loss": 1.3521, + "step": 9967 + }, + { + "epoch": 0.9043730720377426, + "grad_norm": 0.11825918801766008, + "learning_rate": 2.379084106789747e-05, + "loss": 1.3251, + "step": 9968 + }, + { + "epoch": 0.9044637996733805, + "grad_norm": 0.11539317560306782, + "learning_rate": 2.374607917578153e-05, + "loss": 1.3313, + "step": 9969 + }, + { + "epoch": 0.9045545273090183, + "grad_norm": 0.12689366919090733, + "learning_rate": 2.3701358408261764e-05, + "loss": 1.3813, + "step": 9970 + }, + { + "epoch": 0.9046452549446562, + "grad_norm": 0.11950152898832975, + "learning_rate": 2.3656678769199967e-05, + "loss": 1.3211, + "step": 9971 + }, + { + "epoch": 0.904735982580294, + "grad_norm": 0.118766556419932, + "learning_rate": 2.361204026245417e-05, + "loss": 1.3572, + "step": 9972 + }, + { + "epoch": 0.9048267102159318, + "grad_norm": 0.12883527599384684, + "learning_rate": 2.3567442891878845e-05, + "loss": 1.3436, + "step": 9973 + }, + { + "epoch": 0.9049174378515696, + "grad_norm": 0.12151193378239829, + "learning_rate": 2.3522886661325073e-05, + "loss": 1.3356, + "step": 9974 + }, + { + "epoch": 0.9050081654872074, + "grad_norm": 0.12143641060815205, + "learning_rate": 2.347837157464028e-05, + "loss": 1.3841, + "step": 9975 + }, + { + "epoch": 0.9050988931228452, + "grad_norm": 0.11443744847519637, + "learning_rate": 2.343389763566839e-05, + "loss": 1.3633, + "step": 9976 + }, + { + "epoch": 0.9051896207584831, + "grad_norm": 0.12327041256537283, + "learning_rate": 2.3389464848249553e-05, + "loss": 1.3194, + "step": 9977 + }, + { + "epoch": 0.9052803483941209, + "grad_norm": 0.11734003581014377, + "learning_rate": 2.334507321622076e-05, + "loss": 1.3513, + "step": 9978 + }, + { + "epoch": 0.9053710760297586, + "grad_norm": 0.1170934270958194, + "learning_rate": 2.330072274341516e-05, + "loss": 1.3535, + "step": 9979 + }, + { + "epoch": 0.9054618036653965, + "grad_norm": 0.11800759648562194, + "learning_rate": 2.3256413433662304e-05, + "loss": 1.3481, + "step": 9980 + }, + { + "epoch": 0.9055525313010343, + "grad_norm": 0.11635657687304704, + "learning_rate": 2.3212145290788468e-05, + "loss": 1.3261, + "step": 9981 + }, + { + "epoch": 0.9056432589366721, + "grad_norm": 0.12619933217006396, + "learning_rate": 2.3167918318616198e-05, + "loss": 1.3479, + "step": 9982 + }, + { + "epoch": 0.90573398657231, + "grad_norm": 0.1203229997821628, + "learning_rate": 2.312373252096439e-05, + "loss": 1.3512, + "step": 9983 + }, + { + "epoch": 0.9058247142079477, + "grad_norm": 0.12429706311663813, + "learning_rate": 2.30795879016486e-05, + "loss": 1.3337, + "step": 9984 + }, + { + "epoch": 0.9059154418435855, + "grad_norm": 0.12201747935608931, + "learning_rate": 2.3035484464480728e-05, + "loss": 1.3125, + "step": 9985 + }, + { + "epoch": 0.9060061694792234, + "grad_norm": 0.11863786547592782, + "learning_rate": 2.2991422213269054e-05, + "loss": 1.3367, + "step": 9986 + }, + { + "epoch": 0.9060968971148612, + "grad_norm": 0.1365410075937304, + "learning_rate": 2.2947401151818315e-05, + "loss": 1.3303, + "step": 9987 + }, + { + "epoch": 0.906187624750499, + "grad_norm": 0.1219526733264196, + "learning_rate": 2.290342128392986e-05, + "loss": 1.3579, + "step": 9988 + }, + { + "epoch": 0.9062783523861369, + "grad_norm": 0.1739805064216427, + "learning_rate": 2.2859482613401207e-05, + "loss": 1.3551, + "step": 9989 + }, + { + "epoch": 0.9063690800217746, + "grad_norm": 0.12594740032529195, + "learning_rate": 
2.2815585144026597e-05, + "loss": 1.3284, + "step": 9990 + }, + { + "epoch": 0.9064598076574124, + "grad_norm": 0.12128937838523186, + "learning_rate": 2.2771728879596608e-05, + "loss": 1.3278, + "step": 9991 + }, + { + "epoch": 0.9065505352930503, + "grad_norm": 0.1219336976742645, + "learning_rate": 2.2727913823898104e-05, + "loss": 1.3655, + "step": 9992 + }, + { + "epoch": 0.9066412629286881, + "grad_norm": 0.12719512203026404, + "learning_rate": 2.268413998071456e-05, + "loss": 1.3563, + "step": 9993 + }, + { + "epoch": 0.9067319905643259, + "grad_norm": 0.11595764847876784, + "learning_rate": 2.2640407353825944e-05, + "loss": 1.3457, + "step": 9994 + }, + { + "epoch": 0.9068227181999637, + "grad_norm": 0.11607148296278381, + "learning_rate": 2.2596715947008518e-05, + "loss": 1.3459, + "step": 9995 + }, + { + "epoch": 0.9069134458356015, + "grad_norm": 0.12765712344401783, + "learning_rate": 2.255306576403493e-05, + "loss": 1.3892, + "step": 9996 + }, + { + "epoch": 0.9070041734712393, + "grad_norm": 0.21535695643650624, + "learning_rate": 2.25094568086745e-05, + "loss": 1.3197, + "step": 9997 + }, + { + "epoch": 0.9070949011068772, + "grad_norm": 0.12067797668359624, + "learning_rate": 2.2465889084692935e-05, + "loss": 1.33, + "step": 9998 + }, + { + "epoch": 0.907185628742515, + "grad_norm": 0.11857017144326314, + "learning_rate": 2.2422362595852232e-05, + "loss": 1.3383, + "step": 9999 + }, + { + "epoch": 0.9072763563781527, + "grad_norm": 0.13551221468778185, + "learning_rate": 2.237887734591082e-05, + "loss": 1.3425, + "step": 10000 + }, + { + "epoch": 0.9073670840137906, + "grad_norm": 0.12127066476993299, + "learning_rate": 2.2335433338623813e-05, + "loss": 1.3574, + "step": 10001 + }, + { + "epoch": 0.9074578116494284, + "grad_norm": 0.11773094690637438, + "learning_rate": 2.229203057774254e-05, + "loss": 1.3677, + "step": 10002 + }, + { + "epoch": 0.9075485392850662, + "grad_norm": 0.18907696894068243, + "learning_rate": 2.2248669067014727e-05, + "loss": 1.3537, + "step": 10003 + }, + { + "epoch": 0.9076392669207041, + "grad_norm": 0.13077457991484728, + "learning_rate": 2.2205348810184876e-05, + "loss": 1.3809, + "step": 10004 + }, + { + "epoch": 0.9077299945563418, + "grad_norm": 0.1987838642505101, + "learning_rate": 2.2162069810993502e-05, + "loss": 1.3235, + "step": 10005 + }, + { + "epoch": 0.9078207221919796, + "grad_norm": 0.11831437119526175, + "learning_rate": 2.2118832073177887e-05, + "loss": 1.3355, + "step": 10006 + }, + { + "epoch": 0.9079114498276175, + "grad_norm": 0.12357377653908973, + "learning_rate": 2.207563560047149e-05, + "loss": 1.3306, + "step": 10007 + }, + { + "epoch": 0.9080021774632553, + "grad_norm": 0.1202235628505854, + "learning_rate": 2.2032480396604436e-05, + "loss": 1.3299, + "step": 10008 + }, + { + "epoch": 0.9080929050988932, + "grad_norm": 0.12255154105551223, + "learning_rate": 2.198936646530314e-05, + "loss": 1.3164, + "step": 10009 + }, + { + "epoch": 0.908183632734531, + "grad_norm": 0.11944687322669201, + "learning_rate": 2.194629381029051e-05, + "loss": 1.3718, + "step": 10010 + }, + { + "epoch": 0.9082743603701687, + "grad_norm": 0.13779155402445917, + "learning_rate": 2.190326243528584e-05, + "loss": 1.3436, + "step": 10011 + }, + { + "epoch": 0.9083650880058066, + "grad_norm": 0.13471543864585345, + "learning_rate": 2.186027234400495e-05, + "loss": 1.3582, + "step": 10012 + }, + { + "epoch": 0.9084558156414444, + "grad_norm": 0.11930613140140596, + "learning_rate": 2.1817323540159973e-05, + "loss": 1.3375, + "step": 10013 + }, 
+ { + "epoch": 0.9085465432770822, + "grad_norm": 0.11432816777962432, + "learning_rate": 2.177441602745961e-05, + "loss": 1.3083, + "step": 10014 + }, + { + "epoch": 0.9086372709127201, + "grad_norm": 0.12090049039750053, + "learning_rate": 2.1731549809608898e-05, + "loss": 1.3705, + "step": 10015 + }, + { + "epoch": 0.9087279985483578, + "grad_norm": 0.1223387583564166, + "learning_rate": 2.168872489030943e-05, + "loss": 1.3398, + "step": 10016 + }, + { + "epoch": 0.9088187261839956, + "grad_norm": 0.12014497813904296, + "learning_rate": 2.164594127325892e-05, + "loss": 1.3685, + "step": 10017 + }, + { + "epoch": 0.9089094538196335, + "grad_norm": 0.12445797041388587, + "learning_rate": 2.1603198962152014e-05, + "loss": 1.3651, + "step": 10018 + }, + { + "epoch": 0.9090001814552713, + "grad_norm": 0.12916815108601207, + "learning_rate": 2.1560497960679327e-05, + "loss": 1.3543, + "step": 10019 + }, + { + "epoch": 0.9090909090909091, + "grad_norm": 0.13078859000083567, + "learning_rate": 2.1517838272528124e-05, + "loss": 1.3832, + "step": 10020 + }, + { + "epoch": 0.909181636726547, + "grad_norm": 0.119802023697953, + "learning_rate": 2.1475219901382124e-05, + "loss": 1.3417, + "step": 10021 + }, + { + "epoch": 0.9092723643621847, + "grad_norm": 0.13104165171057966, + "learning_rate": 2.1432642850921445e-05, + "loss": 1.3329, + "step": 10022 + }, + { + "epoch": 0.9093630919978225, + "grad_norm": 0.12522999234321133, + "learning_rate": 2.139010712482259e-05, + "loss": 1.3437, + "step": 10023 + }, + { + "epoch": 0.9094538196334604, + "grad_norm": 0.1305450636118074, + "learning_rate": 2.1347612726758502e-05, + "loss": 1.376, + "step": 10024 + }, + { + "epoch": 0.9095445472690982, + "grad_norm": 0.11644134622695151, + "learning_rate": 2.1305159660398645e-05, + "loss": 1.374, + "step": 10025 + }, + { + "epoch": 0.909635274904736, + "grad_norm": 0.11581714564437018, + "learning_rate": 2.12627479294088e-05, + "loss": 1.3203, + "step": 10026 + }, + { + "epoch": 0.9097260025403738, + "grad_norm": 0.1181844085175125, + "learning_rate": 2.122037753745115e-05, + "loss": 1.3515, + "step": 10027 + }, + { + "epoch": 0.9098167301760116, + "grad_norm": 0.1319856328398337, + "learning_rate": 2.117804848818444e-05, + "loss": 1.3401, + "step": 10028 + }, + { + "epoch": 0.9099074578116494, + "grad_norm": 0.12357346239509917, + "learning_rate": 2.1135760785263912e-05, + "loss": 1.3286, + "step": 10029 + }, + { + "epoch": 0.9099981854472873, + "grad_norm": 0.13340637670561675, + "learning_rate": 2.109351443234103e-05, + "loss": 1.3453, + "step": 10030 + }, + { + "epoch": 0.910088913082925, + "grad_norm": 0.14931895803139025, + "learning_rate": 2.1051309433063715e-05, + "loss": 1.3581, + "step": 10031 + }, + { + "epoch": 0.9101796407185628, + "grad_norm": 0.1205481273640021, + "learning_rate": 2.1009145791076434e-05, + "loss": 1.3248, + "step": 10032 + }, + { + "epoch": 0.9102703683542007, + "grad_norm": 0.12805527223170804, + "learning_rate": 2.0967023510020056e-05, + "loss": 1.365, + "step": 10033 + }, + { + "epoch": 0.9103610959898385, + "grad_norm": 0.12959023685659504, + "learning_rate": 2.092494259353167e-05, + "loss": 1.3737, + "step": 10034 + }, + { + "epoch": 0.9104518236254763, + "grad_norm": 0.11895943443759095, + "learning_rate": 2.0882903045245205e-05, + "loss": 1.3603, + "step": 10035 + }, + { + "epoch": 0.9105425512611142, + "grad_norm": 0.12098799228297805, + "learning_rate": 2.0840904868790645e-05, + "loss": 1.3329, + "step": 10036 + }, + { + "epoch": 0.9106332788967519, + "grad_norm": 
0.118441875493028, + "learning_rate": 2.079894806779459e-05, + "loss": 1.32, + "step": 10037 + }, + { + "epoch": 0.9107240065323897, + "grad_norm": 0.11546061718892589, + "learning_rate": 2.0757032645879914e-05, + "loss": 1.3354, + "step": 10038 + }, + { + "epoch": 0.9108147341680276, + "grad_norm": 0.1454735425726831, + "learning_rate": 2.071515860666623e-05, + "loss": 1.3037, + "step": 10039 + }, + { + "epoch": 0.9109054618036654, + "grad_norm": 0.1364440184321445, + "learning_rate": 2.0673325953769194e-05, + "loss": 1.3513, + "step": 10040 + }, + { + "epoch": 0.9109961894393032, + "grad_norm": 0.12812027926176908, + "learning_rate": 2.0631534690801034e-05, + "loss": 1.3731, + "step": 10041 + }, + { + "epoch": 0.911086917074941, + "grad_norm": 0.11853304454052838, + "learning_rate": 2.0589784821370638e-05, + "loss": 1.337, + "step": 10042 + }, + { + "epoch": 0.9111776447105788, + "grad_norm": 0.12207721290748921, + "learning_rate": 2.05480763490829e-05, + "loss": 1.3677, + "step": 10043 + }, + { + "epoch": 0.9112683723462166, + "grad_norm": 0.12045331616338051, + "learning_rate": 2.0506409277539383e-05, + "loss": 1.3546, + "step": 10044 + }, + { + "epoch": 0.9113590999818545, + "grad_norm": 0.17648352718377522, + "learning_rate": 2.0464783610338156e-05, + "loss": 1.3059, + "step": 10045 + }, + { + "epoch": 0.9114498276174923, + "grad_norm": 0.11516510936505912, + "learning_rate": 2.0423199351073563e-05, + "loss": 1.3337, + "step": 10046 + }, + { + "epoch": 0.9115405552531302, + "grad_norm": 0.12099383842912778, + "learning_rate": 2.0381656503336344e-05, + "loss": 1.3509, + "step": 10047 + }, + { + "epoch": 0.9116312828887679, + "grad_norm": 0.12293878310451598, + "learning_rate": 2.0340155070713796e-05, + "loss": 1.3537, + "step": 10048 + }, + { + "epoch": 0.9117220105244057, + "grad_norm": 0.11994675865671482, + "learning_rate": 2.029869505678955e-05, + "loss": 1.3545, + "step": 10049 + }, + { + "epoch": 0.9118127381600436, + "grad_norm": 0.12607878129626654, + "learning_rate": 2.0257276465143692e-05, + "loss": 1.3638, + "step": 10050 + }, + { + "epoch": 0.9119034657956814, + "grad_norm": 0.11781768291589383, + "learning_rate": 2.021589929935269e-05, + "loss": 1.3549, + "step": 10051 + }, + { + "epoch": 0.9119941934313192, + "grad_norm": 0.128466367370814, + "learning_rate": 2.0174563562989468e-05, + "loss": 1.3564, + "step": 10052 + }, + { + "epoch": 0.912084921066957, + "grad_norm": 0.12160126910181561, + "learning_rate": 2.01332692596235e-05, + "loss": 1.3697, + "step": 10053 + }, + { + "epoch": 0.9121756487025948, + "grad_norm": 0.1225949071273042, + "learning_rate": 2.0092016392820434e-05, + "loss": 1.3456, + "step": 10054 + }, + { + "epoch": 0.9122663763382326, + "grad_norm": 0.12164810954874923, + "learning_rate": 2.005080496614242e-05, + "loss": 1.3435, + "step": 10055 + }, + { + "epoch": 0.9123571039738705, + "grad_norm": 0.12735663628288557, + "learning_rate": 2.0009634983148173e-05, + "loss": 1.3298, + "step": 10056 + }, + { + "epoch": 0.9124478316095083, + "grad_norm": 0.11856154731312707, + "learning_rate": 1.996850644739273e-05, + "loss": 1.3213, + "step": 10057 + }, + { + "epoch": 0.912538559245146, + "grad_norm": 0.12026624875797891, + "learning_rate": 1.9927419362427424e-05, + "loss": 1.353, + "step": 10058 + }, + { + "epoch": 0.9126292868807839, + "grad_norm": 0.12316696180178563, + "learning_rate": 1.988637373180019e-05, + "loss": 1.3612, + "step": 10059 + }, + { + "epoch": 0.9127200145164217, + "grad_norm": 0.12314278890620761, + "learning_rate": 
1.984536955905547e-05, + "loss": 1.3301, + "step": 10060 + }, + { + "epoch": 0.9128107421520595, + "grad_norm": 0.13018872472926868, + "learning_rate": 1.9804406847733714e-05, + "loss": 1.3645, + "step": 10061 + }, + { + "epoch": 0.9129014697876974, + "grad_norm": 0.1180320689798619, + "learning_rate": 1.9763485601372312e-05, + "loss": 1.3337, + "step": 10062 + }, + { + "epoch": 0.9129921974233352, + "grad_norm": 0.11790688511191509, + "learning_rate": 1.9722605823504713e-05, + "loss": 1.3691, + "step": 10063 + }, + { + "epoch": 0.9130829250589729, + "grad_norm": 0.12361836656102033, + "learning_rate": 1.9681767517660877e-05, + "loss": 1.3669, + "step": 10064 + }, + { + "epoch": 0.9131736526946108, + "grad_norm": 0.1203156637392795, + "learning_rate": 1.9640970687367144e-05, + "loss": 1.3492, + "step": 10065 + }, + { + "epoch": 0.9132643803302486, + "grad_norm": 0.1191656955032675, + "learning_rate": 1.960021533614642e-05, + "loss": 1.3327, + "step": 10066 + }, + { + "epoch": 0.9133551079658864, + "grad_norm": 0.12183314991779659, + "learning_rate": 1.955950146751789e-05, + "loss": 1.3761, + "step": 10067 + }, + { + "epoch": 0.9134458356015243, + "grad_norm": 0.12651032810295176, + "learning_rate": 1.951882908499719e-05, + "loss": 1.3455, + "step": 10068 + }, + { + "epoch": 0.913536563237162, + "grad_norm": 0.1431015063861596, + "learning_rate": 1.947819819209651e-05, + "loss": 1.3775, + "step": 10069 + }, + { + "epoch": 0.9136272908727998, + "grad_norm": 0.11551497257438416, + "learning_rate": 1.94376087923242e-05, + "loss": 1.3673, + "step": 10070 + }, + { + "epoch": 0.9137180185084377, + "grad_norm": 0.12740623726052525, + "learning_rate": 1.9397060889185137e-05, + "loss": 1.3563, + "step": 10071 + }, + { + "epoch": 0.9138087461440755, + "grad_norm": 0.12104406592902142, + "learning_rate": 1.935655448618079e-05, + "loss": 1.3469, + "step": 10072 + }, + { + "epoch": 0.9138994737797133, + "grad_norm": 0.1172247113173238, + "learning_rate": 1.931608958680875e-05, + "loss": 1.3282, + "step": 10073 + }, + { + "epoch": 0.9139902014153511, + "grad_norm": 0.12464268842513848, + "learning_rate": 1.9275666194563223e-05, + "loss": 1.376, + "step": 10074 + }, + { + "epoch": 0.9140809290509889, + "grad_norm": 0.13506439009088136, + "learning_rate": 1.9235284312934752e-05, + "loss": 1.3667, + "step": 10075 + }, + { + "epoch": 0.9141716566866267, + "grad_norm": 0.12220881572028254, + "learning_rate": 1.9194943945410382e-05, + "loss": 1.3302, + "step": 10076 + }, + { + "epoch": 0.9142623843222646, + "grad_norm": 0.11336395540069258, + "learning_rate": 1.915464509547349e-05, + "loss": 1.3354, + "step": 10077 + }, + { + "epoch": 0.9143531119579024, + "grad_norm": 0.11525740762008858, + "learning_rate": 1.911438776660379e-05, + "loss": 1.3139, + "step": 10078 + }, + { + "epoch": 0.9144438395935401, + "grad_norm": 0.12739202410787182, + "learning_rate": 1.907417196227762e-05, + "loss": 1.3715, + "step": 10079 + }, + { + "epoch": 0.914534567229178, + "grad_norm": 0.1261367136569007, + "learning_rate": 1.9033997685967585e-05, + "loss": 1.3734, + "step": 10080 + }, + { + "epoch": 0.9146252948648158, + "grad_norm": 0.1245596313906453, + "learning_rate": 1.899386494114269e-05, + "loss": 1.3419, + "step": 10081 + }, + { + "epoch": 0.9147160225004536, + "grad_norm": 0.12530006992167048, + "learning_rate": 1.8953773731268552e-05, + "loss": 1.3389, + "step": 10082 + }, + { + "epoch": 0.9148067501360915, + "grad_norm": 0.12594507013815767, + "learning_rate": 1.891372405980685e-05, + "loss": 1.3371, + "step": 10083 + 
}, + { + "epoch": 0.9148974777717293, + "grad_norm": 0.12391490104771406, + "learning_rate": 1.8873715930216027e-05, + "loss": 1.304, + "step": 10084 + }, + { + "epoch": 0.9149882054073671, + "grad_norm": 0.1207593932238633, + "learning_rate": 1.8833749345950668e-05, + "loss": 1.3575, + "step": 10085 + }, + { + "epoch": 0.9150789330430049, + "grad_norm": 0.12123699774524131, + "learning_rate": 1.8793824310462114e-05, + "loss": 1.3406, + "step": 10086 + }, + { + "epoch": 0.9151696606786427, + "grad_norm": 0.11976571343719514, + "learning_rate": 1.8753940827197668e-05, + "loss": 1.3308, + "step": 10087 + }, + { + "epoch": 0.9152603883142806, + "grad_norm": 0.12096036642704469, + "learning_rate": 1.871409889960135e-05, + "loss": 1.3682, + "step": 10088 + }, + { + "epoch": 0.9153511159499184, + "grad_norm": 0.11408233564599674, + "learning_rate": 1.8674298531113574e-05, + "loss": 1.3361, + "step": 10089 + }, + { + "epoch": 0.9154418435855561, + "grad_norm": 0.12378329973859271, + "learning_rate": 1.863453972517104e-05, + "loss": 1.3486, + "step": 10090 + }, + { + "epoch": 0.915532571221194, + "grad_norm": 0.1147189898011062, + "learning_rate": 1.8594822485206886e-05, + "loss": 1.3138, + "step": 10091 + }, + { + "epoch": 0.9156232988568318, + "grad_norm": 0.12264154478976895, + "learning_rate": 1.8555146814650814e-05, + "loss": 1.3883, + "step": 10092 + }, + { + "epoch": 0.9157140264924696, + "grad_norm": 0.11596798927160913, + "learning_rate": 1.8515512716928807e-05, + "loss": 1.3142, + "step": 10093 + }, + { + "epoch": 0.9158047541281075, + "grad_norm": 0.12302206167412877, + "learning_rate": 1.847592019546318e-05, + "loss": 1.3455, + "step": 10094 + }, + { + "epoch": 0.9158954817637452, + "grad_norm": 0.12992268121763417, + "learning_rate": 1.8436369253672812e-05, + "loss": 1.3295, + "step": 10095 + }, + { + "epoch": 0.915986209399383, + "grad_norm": 0.13366186112272982, + "learning_rate": 1.8396859894972974e-05, + "loss": 1.3658, + "step": 10096 + }, + { + "epoch": 0.9160769370350209, + "grad_norm": 0.11437679580417706, + "learning_rate": 1.8357392122775318e-05, + "loss": 1.3642, + "step": 10097 + }, + { + "epoch": 0.9161676646706587, + "grad_norm": 0.12099730000956872, + "learning_rate": 1.8317965940487736e-05, + "loss": 1.3437, + "step": 10098 + }, + { + "epoch": 0.9162583923062965, + "grad_norm": 0.12731515305683636, + "learning_rate": 1.827858135151478e-05, + "loss": 1.3421, + "step": 10099 + }, + { + "epoch": 0.9163491199419344, + "grad_norm": 0.12985493954044422, + "learning_rate": 1.8239238359257394e-05, + "loss": 1.383, + "step": 10100 + }, + { + "epoch": 0.9164398475775721, + "grad_norm": 0.12285491595633807, + "learning_rate": 1.8199936967112753e-05, + "loss": 1.3396, + "step": 10101 + }, + { + "epoch": 0.9165305752132099, + "grad_norm": 0.11869786743884977, + "learning_rate": 1.816067717847453e-05, + "loss": 1.3524, + "step": 10102 + }, + { + "epoch": 0.9166213028488478, + "grad_norm": 0.1217723732010814, + "learning_rate": 1.8121458996732954e-05, + "loss": 1.3735, + "step": 10103 + }, + { + "epoch": 0.9167120304844856, + "grad_norm": 0.12272139123295879, + "learning_rate": 1.808228242527432e-05, + "loss": 1.3394, + "step": 10104 + }, + { + "epoch": 0.9168027581201234, + "grad_norm": 0.12014130699852522, + "learning_rate": 1.8043147467481646e-05, + "loss": 1.3587, + "step": 10105 + }, + { + "epoch": 0.9168934857557612, + "grad_norm": 0.12962382378554826, + "learning_rate": 1.800405412673417e-05, + "loss": 1.359, + "step": 10106 + }, + { + "epoch": 0.916984213391399, + 
"grad_norm": 0.12338395963134931, + "learning_rate": 1.796500240640775e-05, + "loss": 1.3513, + "step": 10107 + }, + { + "epoch": 0.9170749410270368, + "grad_norm": 0.12266676407639163, + "learning_rate": 1.7925992309874406e-05, + "loss": 1.3464, + "step": 10108 + }, + { + "epoch": 0.9171656686626747, + "grad_norm": 0.11283771103322929, + "learning_rate": 1.7887023840502614e-05, + "loss": 1.3031, + "step": 10109 + }, + { + "epoch": 0.9172563962983125, + "grad_norm": 0.12495364886974382, + "learning_rate": 1.784809700165746e-05, + "loss": 1.3255, + "step": 10110 + }, + { + "epoch": 0.9173471239339502, + "grad_norm": 0.1303925981868482, + "learning_rate": 1.780921179670014e-05, + "loss": 1.3568, + "step": 10111 + }, + { + "epoch": 0.9174378515695881, + "grad_norm": 0.12356408989386791, + "learning_rate": 1.777036822898842e-05, + "loss": 1.3574, + "step": 10112 + }, + { + "epoch": 0.9175285792052259, + "grad_norm": 0.12538886514953734, + "learning_rate": 1.7731566301876557e-05, + "loss": 1.3473, + "step": 10113 + }, + { + "epoch": 0.9176193068408637, + "grad_norm": 0.12556127298505362, + "learning_rate": 1.7692806018714923e-05, + "loss": 1.3604, + "step": 10114 + }, + { + "epoch": 0.9177100344765016, + "grad_norm": 0.12060733591957896, + "learning_rate": 1.7654087382850624e-05, + "loss": 1.3231, + "step": 10115 + }, + { + "epoch": 0.9178007621121393, + "grad_norm": 0.1152681636544885, + "learning_rate": 1.761541039762693e-05, + "loss": 1.3235, + "step": 10116 + }, + { + "epoch": 0.9178914897477771, + "grad_norm": 0.11844872269262502, + "learning_rate": 1.757677506638372e-05, + "loss": 1.3291, + "step": 10117 + }, + { + "epoch": 0.917982217383415, + "grad_norm": 0.11853041291544221, + "learning_rate": 1.753818139245705e-05, + "loss": 1.3288, + "step": 10118 + }, + { + "epoch": 0.9180729450190528, + "grad_norm": 0.1186177146369364, + "learning_rate": 1.7499629379179483e-05, + "loss": 1.3688, + "step": 10119 + }, + { + "epoch": 0.9181636726546906, + "grad_norm": 0.11478070822470164, + "learning_rate": 1.7461119029880124e-05, + "loss": 1.342, + "step": 10120 + }, + { + "epoch": 0.9182544002903285, + "grad_norm": 0.1153695638023266, + "learning_rate": 1.7422650347884205e-05, + "loss": 1.345, + "step": 10121 + }, + { + "epoch": 0.9183451279259662, + "grad_norm": 0.13913534509890535, + "learning_rate": 1.7384223336513517e-05, + "loss": 1.3145, + "step": 10122 + }, + { + "epoch": 0.9184358555616041, + "grad_norm": 0.12552937661874522, + "learning_rate": 1.734583799908629e-05, + "loss": 1.3422, + "step": 10123 + }, + { + "epoch": 0.9185265831972419, + "grad_norm": 0.1190670193439122, + "learning_rate": 1.730749433891715e-05, + "loss": 1.3986, + "step": 10124 + }, + { + "epoch": 0.9186173108328797, + "grad_norm": 0.14766208020903124, + "learning_rate": 1.7269192359316955e-05, + "loss": 1.3124, + "step": 10125 + }, + { + "epoch": 0.9187080384685176, + "grad_norm": 0.12906090749088606, + "learning_rate": 1.7230932063593162e-05, + "loss": 1.3169, + "step": 10126 + }, + { + "epoch": 0.9187987661041553, + "grad_norm": 0.12091849146664063, + "learning_rate": 1.719271345504958e-05, + "loss": 1.3626, + "step": 10127 + }, + { + "epoch": 0.9188894937397931, + "grad_norm": 0.12018564182800241, + "learning_rate": 1.7154536536986286e-05, + "loss": 1.3659, + "step": 10128 + }, + { + "epoch": 0.918980221375431, + "grad_norm": 0.15334390889153743, + "learning_rate": 1.7116401312699926e-05, + "loss": 1.3901, + "step": 10129 + }, + { + "epoch": 0.9190709490110688, + "grad_norm": 0.12029046814367089, + "learning_rate": 
1.7078307785483472e-05, + "loss": 1.3286, + "step": 10130 + }, + { + "epoch": 0.9191616766467066, + "grad_norm": 0.12445768247336284, + "learning_rate": 1.7040255958626404e-05, + "loss": 1.3464, + "step": 10131 + }, + { + "epoch": 0.9192524042823444, + "grad_norm": 0.12833663161682682, + "learning_rate": 1.7002245835414377e-05, + "loss": 1.3618, + "step": 10132 + }, + { + "epoch": 0.9193431319179822, + "grad_norm": 0.1252611578112144, + "learning_rate": 1.6964277419129537e-05, + "loss": 1.3133, + "step": 10133 + }, + { + "epoch": 0.91943385955362, + "grad_norm": 0.12702071595964212, + "learning_rate": 1.6926350713050652e-05, + "loss": 1.3815, + "step": 10134 + }, + { + "epoch": 0.9195245871892579, + "grad_norm": 0.11397146197605125, + "learning_rate": 1.6888465720452496e-05, + "loss": 1.3448, + "step": 10135 + }, + { + "epoch": 0.9196153148248957, + "grad_norm": 0.12669492100994073, + "learning_rate": 1.6850622444606555e-05, + "loss": 1.3189, + "step": 10136 + }, + { + "epoch": 0.9197060424605334, + "grad_norm": 0.1370011816148569, + "learning_rate": 1.68128208887805e-05, + "loss": 1.3708, + "step": 10137 + }, + { + "epoch": 0.9197967700961713, + "grad_norm": 0.13047902358009697, + "learning_rate": 1.6775061056238717e-05, + "loss": 1.3357, + "step": 10138 + }, + { + "epoch": 0.9198874977318091, + "grad_norm": 0.11985628768520362, + "learning_rate": 1.6737342950241484e-05, + "loss": 1.3374, + "step": 10139 + }, + { + "epoch": 0.9199782253674469, + "grad_norm": 0.12670705479558672, + "learning_rate": 1.6699666574046035e-05, + "loss": 1.3603, + "step": 10140 + }, + { + "epoch": 0.9200689530030848, + "grad_norm": 0.1302343845658717, + "learning_rate": 1.6662031930905596e-05, + "loss": 1.3456, + "step": 10141 + }, + { + "epoch": 0.9201596806387226, + "grad_norm": 0.5205041149282176, + "learning_rate": 1.6624439024069847e-05, + "loss": 1.3425, + "step": 10142 + }, + { + "epoch": 0.9202504082743603, + "grad_norm": 0.11730220097265073, + "learning_rate": 1.658688785678514e-05, + "loss": 1.366, + "step": 10143 + }, + { + "epoch": 0.9203411359099982, + "grad_norm": 0.14022926450661322, + "learning_rate": 1.654937843229387e-05, + "loss": 1.3625, + "step": 10144 + }, + { + "epoch": 0.920431863545636, + "grad_norm": 0.12311877648762487, + "learning_rate": 1.6511910753834957e-05, + "loss": 1.3418, + "step": 10145 + }, + { + "epoch": 0.9205225911812738, + "grad_norm": 0.12143845734992977, + "learning_rate": 1.6474484824643867e-05, + "loss": 1.3313, + "step": 10146 + }, + { + "epoch": 0.9206133188169117, + "grad_norm": 0.11181412171688421, + "learning_rate": 1.6437100647952285e-05, + "loss": 1.2882, + "step": 10147 + }, + { + "epoch": 0.9207040464525494, + "grad_norm": 0.1418098677834315, + "learning_rate": 1.639975822698836e-05, + "loss": 1.3561, + "step": 10148 + }, + { + "epoch": 0.9207947740881872, + "grad_norm": 0.1305947372952159, + "learning_rate": 1.636245756497651e-05, + "loss": 1.3704, + "step": 10149 + }, + { + "epoch": 0.9208855017238251, + "grad_norm": 0.12135779767068088, + "learning_rate": 1.6325198665137818e-05, + "loss": 1.3509, + "step": 10150 + }, + { + "epoch": 0.9209762293594629, + "grad_norm": 0.11443972496195672, + "learning_rate": 1.62879815306895e-05, + "loss": 1.3388, + "step": 10151 + }, + { + "epoch": 0.9210669569951007, + "grad_norm": 0.11919097542385991, + "learning_rate": 1.62508061648452e-05, + "loss": 1.3448, + "step": 10152 + }, + { + "epoch": 0.9211576846307385, + "grad_norm": 0.12317521618813071, + "learning_rate": 1.6213672570815073e-05, + "loss": 1.4171, + "step": 
10153 + }, + { + "epoch": 0.9212484122663763, + "grad_norm": 0.1248019920344492, + "learning_rate": 1.617658075180567e-05, + "loss": 1.372, + "step": 10154 + }, + { + "epoch": 0.9213391399020141, + "grad_norm": 0.12427274311130093, + "learning_rate": 1.613953071101987e-05, + "loss": 1.3348, + "step": 10155 + }, + { + "epoch": 0.921429867537652, + "grad_norm": 0.12151760432652343, + "learning_rate": 1.610252245165683e-05, + "loss": 1.3194, + "step": 10156 + }, + { + "epoch": 0.9215205951732898, + "grad_norm": 0.12534255880806597, + "learning_rate": 1.606555597691234e-05, + "loss": 1.3462, + "step": 10157 + }, + { + "epoch": 0.9216113228089275, + "grad_norm": 0.21681081390008275, + "learning_rate": 1.602863128997839e-05, + "loss": 1.3466, + "step": 10158 + }, + { + "epoch": 0.9217020504445654, + "grad_norm": 0.11351253351555328, + "learning_rate": 1.5991748394043436e-05, + "loss": 1.332, + "step": 10159 + }, + { + "epoch": 0.9217927780802032, + "grad_norm": 0.11660350050670291, + "learning_rate": 1.595490729229243e-05, + "loss": 1.3697, + "step": 10160 + }, + { + "epoch": 0.9218835057158411, + "grad_norm": 0.16054271826851751, + "learning_rate": 1.5918107987906437e-05, + "loss": 1.3561, + "step": 10161 + }, + { + "epoch": 0.9219742333514789, + "grad_norm": 0.1328590287441551, + "learning_rate": 1.5881350484063195e-05, + "loss": 1.3753, + "step": 10162 + }, + { + "epoch": 0.9220649609871167, + "grad_norm": 0.12178177692079938, + "learning_rate": 1.5844634783936664e-05, + "loss": 1.3706, + "step": 10163 + }, + { + "epoch": 0.9221556886227545, + "grad_norm": 0.1168842351860427, + "learning_rate": 1.580796089069736e-05, + "loss": 1.3264, + "step": 10164 + }, + { + "epoch": 0.9222464162583923, + "grad_norm": 0.12175204430275098, + "learning_rate": 1.577132880751192e-05, + "loss": 1.3807, + "step": 10165 + }, + { + "epoch": 0.9223371438940301, + "grad_norm": 0.126958767085049, + "learning_rate": 1.5734738537543593e-05, + "loss": 1.3356, + "step": 10166 + }, + { + "epoch": 0.922427871529668, + "grad_norm": 0.12314244178554658, + "learning_rate": 1.569819008395207e-05, + "loss": 1.3565, + "step": 10167 + }, + { + "epoch": 0.9225185991653058, + "grad_norm": 0.12335573074390392, + "learning_rate": 1.566168344989316e-05, + "loss": 1.3345, + "step": 10168 + }, + { + "epoch": 0.9226093268009435, + "grad_norm": 0.11511512830738169, + "learning_rate": 1.562521863851918e-05, + "loss": 1.3356, + "step": 10169 + }, + { + "epoch": 0.9227000544365814, + "grad_norm": 0.12143832654723545, + "learning_rate": 1.5588795652978992e-05, + "loss": 1.3608, + "step": 10170 + }, + { + "epoch": 0.9227907820722192, + "grad_norm": 0.11803310768370037, + "learning_rate": 1.5552414496417755e-05, + "loss": 1.3446, + "step": 10171 + }, + { + "epoch": 0.922881509707857, + "grad_norm": 0.11866593532086404, + "learning_rate": 1.5516075171976952e-05, + "loss": 1.3144, + "step": 10172 + }, + { + "epoch": 0.9229722373434949, + "grad_norm": 0.12278404966314113, + "learning_rate": 1.5479777682794405e-05, + "loss": 1.314, + "step": 10173 + }, + { + "epoch": 0.9230629649791326, + "grad_norm": 0.13022163604727022, + "learning_rate": 1.54435220320045e-05, + "loss": 1.3516, + "step": 10174 + }, + { + "epoch": 0.9231536926147704, + "grad_norm": 0.13241388000610352, + "learning_rate": 1.5407308222737836e-05, + "loss": 1.3814, + "step": 10175 + }, + { + "epoch": 0.9232444202504083, + "grad_norm": 0.12639781086650384, + "learning_rate": 1.537113625812153e-05, + "loss": 1.3582, + "step": 10176 + }, + { + "epoch": 0.9233351478860461, + "grad_norm": 
0.1275908102954983, + "learning_rate": 1.533500614127903e-05, + "loss": 1.3636, + "step": 10177 + }, + { + "epoch": 0.9234258755216839, + "grad_norm": 0.12098899865289828, + "learning_rate": 1.529891787533022e-05, + "loss": 1.3317, + "step": 10178 + }, + { + "epoch": 0.9235166031573218, + "grad_norm": 0.12347488098527518, + "learning_rate": 1.5262871463391336e-05, + "loss": 1.3624, + "step": 10179 + }, + { + "epoch": 0.9236073307929595, + "grad_norm": 0.1373426933817587, + "learning_rate": 1.5226866908574833e-05, + "loss": 1.3242, + "step": 10180 + }, + { + "epoch": 0.9236980584285973, + "grad_norm": 0.12008142831812495, + "learning_rate": 1.5190904213989887e-05, + "loss": 1.3774, + "step": 10181 + }, + { + "epoch": 0.9237887860642352, + "grad_norm": 0.14146053671730444, + "learning_rate": 1.5154983382741849e-05, + "loss": 1.355, + "step": 10182 + }, + { + "epoch": 0.923879513699873, + "grad_norm": 0.12934707194381534, + "learning_rate": 1.511910441793235e-05, + "loss": 1.3538, + "step": 10183 + }, + { + "epoch": 0.9239702413355108, + "grad_norm": 0.11750423956936439, + "learning_rate": 1.5083267322659688e-05, + "loss": 1.3526, + "step": 10184 + }, + { + "epoch": 0.9240609689711486, + "grad_norm": 0.13541549548035406, + "learning_rate": 1.5047472100018389e-05, + "loss": 1.376, + "step": 10185 + }, + { + "epoch": 0.9241516966067864, + "grad_norm": 0.12142488379704343, + "learning_rate": 1.5011718753099368e-05, + "loss": 1.3518, + "step": 10186 + }, + { + "epoch": 0.9242424242424242, + "grad_norm": 0.11827204270246215, + "learning_rate": 1.4976007284989879e-05, + "loss": 1.33, + "step": 10187 + }, + { + "epoch": 0.9243331518780621, + "grad_norm": 0.11687344318205332, + "learning_rate": 1.4940337698773677e-05, + "loss": 1.3061, + "step": 10188 + }, + { + "epoch": 0.9244238795136999, + "grad_norm": 0.11734148116840203, + "learning_rate": 1.4904709997530796e-05, + "loss": 1.366, + "step": 10189 + }, + { + "epoch": 0.9245146071493376, + "grad_norm": 0.12513206226545462, + "learning_rate": 1.4869124184337668e-05, + "loss": 1.3224, + "step": 10190 + }, + { + "epoch": 0.9246053347849755, + "grad_norm": 0.14815588886854808, + "learning_rate": 1.4833580262267221e-05, + "loss": 1.3783, + "step": 10191 + }, + { + "epoch": 0.9246960624206133, + "grad_norm": 0.13383688877503983, + "learning_rate": 1.4798078234388556e-05, + "loss": 1.3388, + "step": 10192 + }, + { + "epoch": 0.9247867900562511, + "grad_norm": 0.1300996782117922, + "learning_rate": 1.4762618103767388e-05, + "loss": 1.3515, + "step": 10193 + }, + { + "epoch": 0.924877517691889, + "grad_norm": 0.12959548355179765, + "learning_rate": 1.4727199873465602e-05, + "loss": 1.3417, + "step": 10194 + }, + { + "epoch": 0.9249682453275268, + "grad_norm": 0.11490142273512871, + "learning_rate": 1.4691823546541694e-05, + "loss": 1.3135, + "step": 10195 + }, + { + "epoch": 0.9250589729631645, + "grad_norm": 0.13750134314415424, + "learning_rate": 1.4656489126050387e-05, + "loss": 1.3283, + "step": 10196 + }, + { + "epoch": 0.9251497005988024, + "grad_norm": 0.12144515390915488, + "learning_rate": 1.4621196615042687e-05, + "loss": 1.3467, + "step": 10197 + }, + { + "epoch": 0.9252404282344402, + "grad_norm": 0.11953373685955956, + "learning_rate": 1.4585946016566265e-05, + "loss": 1.3503, + "step": 10198 + }, + { + "epoch": 0.925331155870078, + "grad_norm": 0.15074523230498063, + "learning_rate": 1.4550737333664966e-05, + "loss": 1.3201, + "step": 10199 + }, + { + "epoch": 0.9254218835057159, + "grad_norm": 0.13348400513207126, + "learning_rate": 
1.4515570569378966e-05, + "loss": 1.3507, + "step": 10200 + }, + { + "epoch": 0.9255126111413536, + "grad_norm": 0.1180594880071138, + "learning_rate": 1.4480445726745007e-05, + "loss": 1.3458, + "step": 10201 + }, + { + "epoch": 0.9256033387769915, + "grad_norm": 0.1242774659323955, + "learning_rate": 1.4445362808796159e-05, + "loss": 1.3321, + "step": 10202 + }, + { + "epoch": 0.9256940664126293, + "grad_norm": 0.12170390016505153, + "learning_rate": 1.4410321818561779e-05, + "loss": 1.3426, + "step": 10203 + }, + { + "epoch": 0.9257847940482671, + "grad_norm": 0.1261018255240555, + "learning_rate": 1.4375322759067666e-05, + "loss": 1.3837, + "step": 10204 + }, + { + "epoch": 0.925875521683905, + "grad_norm": 0.13913050191861528, + "learning_rate": 1.4340365633336072e-05, + "loss": 1.3253, + "step": 10205 + }, + { + "epoch": 0.9259662493195427, + "grad_norm": 0.12017702612245805, + "learning_rate": 1.4305450444385414e-05, + "loss": 1.3414, + "step": 10206 + }, + { + "epoch": 0.9260569769551805, + "grad_norm": 0.14193779226556888, + "learning_rate": 1.4270577195230727e-05, + "loss": 1.383, + "step": 10207 + }, + { + "epoch": 0.9261477045908184, + "grad_norm": 0.12538822577048506, + "learning_rate": 1.4235745888883212e-05, + "loss": 1.3572, + "step": 10208 + }, + { + "epoch": 0.9262384322264562, + "grad_norm": 0.12217125922205861, + "learning_rate": 1.420095652835074e-05, + "loss": 1.3454, + "step": 10209 + }, + { + "epoch": 0.926329159862094, + "grad_norm": 0.13148085593103367, + "learning_rate": 1.4166209116637241e-05, + "loss": 1.3371, + "step": 10210 + }, + { + "epoch": 0.9264198874977319, + "grad_norm": 0.12551987172574114, + "learning_rate": 1.413150365674315e-05, + "loss": 1.3457, + "step": 10211 + }, + { + "epoch": 0.9265106151333696, + "grad_norm": 0.12195264517475335, + "learning_rate": 1.4096840151665403e-05, + "loss": 1.3816, + "step": 10212 + }, + { + "epoch": 0.9266013427690074, + "grad_norm": 0.11410941904885744, + "learning_rate": 1.4062218604397047e-05, + "loss": 1.3405, + "step": 10213 + }, + { + "epoch": 0.9266920704046453, + "grad_norm": 0.1171282293604971, + "learning_rate": 1.4027639017927752e-05, + "loss": 1.3178, + "step": 10214 + }, + { + "epoch": 0.9267827980402831, + "grad_norm": 0.12672801292077213, + "learning_rate": 1.3993101395243457e-05, + "loss": 1.3576, + "step": 10215 + }, + { + "epoch": 0.9268735256759209, + "grad_norm": 0.11737094638833999, + "learning_rate": 1.3958605739326502e-05, + "loss": 1.3895, + "step": 10216 + }, + { + "epoch": 0.9269642533115587, + "grad_norm": 0.11595802388353928, + "learning_rate": 1.392415205315556e-05, + "loss": 1.3631, + "step": 10217 + }, + { + "epoch": 0.9270549809471965, + "grad_norm": 0.1172963321826509, + "learning_rate": 1.3889740339705748e-05, + "loss": 1.3399, + "step": 10218 + }, + { + "epoch": 0.9271457085828343, + "grad_norm": 0.14080184656989614, + "learning_rate": 1.3855370601948525e-05, + "loss": 1.3132, + "step": 10219 + }, + { + "epoch": 0.9272364362184722, + "grad_norm": 0.1653548521604877, + "learning_rate": 1.3821042842851627e-05, + "loss": 1.3633, + "step": 10220 + }, + { + "epoch": 0.92732716385411, + "grad_norm": 0.14504284562164294, + "learning_rate": 1.3786757065379408e-05, + "loss": 1.3074, + "step": 10221 + }, + { + "epoch": 0.9274178914897477, + "grad_norm": 0.12245087313678296, + "learning_rate": 1.3752513272492328e-05, + "loss": 1.3508, + "step": 10222 + }, + { + "epoch": 0.9275086191253856, + "grad_norm": 0.12337189744933834, + "learning_rate": 1.3718311467147416e-05, + "loss": 1.3494, + 
"step": 10223 + }, + { + "epoch": 0.9275993467610234, + "grad_norm": 0.12101038351535076, + "learning_rate": 1.3684151652297916e-05, + "loss": 1.3556, + "step": 10224 + }, + { + "epoch": 0.9276900743966612, + "grad_norm": 0.11900633029399137, + "learning_rate": 1.3650033830893693e-05, + "loss": 1.3243, + "step": 10225 + }, + { + "epoch": 0.9277808020322991, + "grad_norm": 0.1173229460577331, + "learning_rate": 1.3615958005880724e-05, + "loss": 1.3439, + "step": 10226 + }, + { + "epoch": 0.9278715296679368, + "grad_norm": 0.11556591420843496, + "learning_rate": 1.3581924180201377e-05, + "loss": 1.3499, + "step": 10227 + }, + { + "epoch": 0.9279622573035746, + "grad_norm": 0.11856836201236541, + "learning_rate": 1.3547932356794635e-05, + "loss": 1.3441, + "step": 10228 + }, + { + "epoch": 0.9280529849392125, + "grad_norm": 0.12019519278384312, + "learning_rate": 1.3513982538595648e-05, + "loss": 1.3431, + "step": 10229 + }, + { + "epoch": 0.9281437125748503, + "grad_norm": 0.12375106266414747, + "learning_rate": 1.3480074728535906e-05, + "loss": 1.352, + "step": 10230 + }, + { + "epoch": 0.9282344402104881, + "grad_norm": 0.13724674312014382, + "learning_rate": 1.3446208929543403e-05, + "loss": 1.3537, + "step": 10231 + }, + { + "epoch": 0.928325167846126, + "grad_norm": 0.11892016913403718, + "learning_rate": 1.3412385144542572e-05, + "loss": 1.3353, + "step": 10232 + }, + { + "epoch": 0.9284158954817637, + "grad_norm": 0.12371027798230268, + "learning_rate": 1.3378603376453969e-05, + "loss": 1.3698, + "step": 10233 + }, + { + "epoch": 0.9285066231174015, + "grad_norm": 0.11947954155483434, + "learning_rate": 1.3344863628194593e-05, + "loss": 1.303, + "step": 10234 + }, + { + "epoch": 0.9285973507530394, + "grad_norm": 0.11959583094444401, + "learning_rate": 1.3311165902678057e-05, + "loss": 1.3471, + "step": 10235 + }, + { + "epoch": 0.9286880783886772, + "grad_norm": 0.11886779968855707, + "learning_rate": 1.3277510202814091e-05, + "loss": 1.3548, + "step": 10236 + }, + { + "epoch": 0.928778806024315, + "grad_norm": 0.1272974820685666, + "learning_rate": 1.3243896531508815e-05, + "loss": 1.3473, + "step": 10237 + }, + { + "epoch": 0.9288695336599528, + "grad_norm": 0.11982003761339398, + "learning_rate": 1.3210324891664794e-05, + "loss": 1.3873, + "step": 10238 + }, + { + "epoch": 0.9289602612955906, + "grad_norm": 0.13255528319832066, + "learning_rate": 1.3176795286180986e-05, + "loss": 1.3012, + "step": 10239 + }, + { + "epoch": 0.9290509889312285, + "grad_norm": 0.14358047943236407, + "learning_rate": 1.3143307717952691e-05, + "loss": 1.3526, + "step": 10240 + }, + { + "epoch": 0.9291417165668663, + "grad_norm": 0.12148404380779784, + "learning_rate": 1.3109862189871425e-05, + "loss": 1.3062, + "step": 10241 + }, + { + "epoch": 0.9292324442025041, + "grad_norm": 0.14343998812053663, + "learning_rate": 1.3076458704825434e-05, + "loss": 1.3632, + "step": 10242 + }, + { + "epoch": 0.929323171838142, + "grad_norm": 0.12893707788639383, + "learning_rate": 1.3043097265698966e-05, + "loss": 1.3241, + "step": 10243 + }, + { + "epoch": 0.9294138994737797, + "grad_norm": 0.11970463200236658, + "learning_rate": 1.3009777875372775e-05, + "loss": 1.3127, + "step": 10244 + }, + { + "epoch": 0.9295046271094175, + "grad_norm": 0.12525263374929943, + "learning_rate": 1.2976500536724111e-05, + "loss": 1.3374, + "step": 10245 + }, + { + "epoch": 0.9295953547450554, + "grad_norm": 0.12613021434359728, + "learning_rate": 1.2943265252626402e-05, + "loss": 1.332, + "step": 10246 + }, + { + "epoch": 
0.9296860823806932, + "grad_norm": 0.11734151225236412, + "learning_rate": 1.291007202594946e-05, + "loss": 1.355, + "step": 10247 + }, + { + "epoch": 0.929776810016331, + "grad_norm": 0.1201079535118238, + "learning_rate": 1.2876920859559548e-05, + "loss": 1.3459, + "step": 10248 + }, + { + "epoch": 0.9298675376519688, + "grad_norm": 0.12624403580716556, + "learning_rate": 1.2843811756319434e-05, + "loss": 1.3651, + "step": 10249 + }, + { + "epoch": 0.9299582652876066, + "grad_norm": 0.1278783785629685, + "learning_rate": 1.281074471908794e-05, + "loss": 1.3774, + "step": 10250 + }, + { + "epoch": 0.9300489929232444, + "grad_norm": 0.12309404866049979, + "learning_rate": 1.2777719750720395e-05, + "loss": 1.3129, + "step": 10251 + }, + { + "epoch": 0.9301397205588823, + "grad_norm": 0.1251196101900343, + "learning_rate": 1.2744736854068629e-05, + "loss": 1.3136, + "step": 10252 + }, + { + "epoch": 0.93023044819452, + "grad_norm": 0.24157886541156323, + "learning_rate": 1.2711796031980583e-05, + "loss": 1.3935, + "step": 10253 + }, + { + "epoch": 0.9303211758301578, + "grad_norm": 0.12813198871572506, + "learning_rate": 1.2678897287300762e-05, + "loss": 1.3533, + "step": 10254 + }, + { + "epoch": 0.9304119034657957, + "grad_norm": 0.11667308908572122, + "learning_rate": 1.2646040622870003e-05, + "loss": 1.3551, + "step": 10255 + }, + { + "epoch": 0.9305026311014335, + "grad_norm": 0.11931830833281923, + "learning_rate": 1.2613226041525539e-05, + "loss": 1.3308, + "step": 10256 + }, + { + "epoch": 0.9305933587370713, + "grad_norm": 0.12446041823664379, + "learning_rate": 1.2580453546100768e-05, + "loss": 1.381, + "step": 10257 + }, + { + "epoch": 0.9306840863727092, + "grad_norm": 0.1195003287152191, + "learning_rate": 1.2547723139425648e-05, + "loss": 1.3745, + "step": 10258 + }, + { + "epoch": 0.9307748140083469, + "grad_norm": 0.11565391427092461, + "learning_rate": 1.2515034824326533e-05, + "loss": 1.3247, + "step": 10259 + }, + { + "epoch": 0.9308655416439847, + "grad_norm": 0.1440938048004279, + "learning_rate": 1.248238860362605e-05, + "loss": 1.3325, + "step": 10260 + }, + { + "epoch": 0.9309562692796226, + "grad_norm": 0.1321702303002305, + "learning_rate": 1.2449784480143056e-05, + "loss": 1.3578, + "step": 10261 + }, + { + "epoch": 0.9310469969152604, + "grad_norm": 0.11981666700689904, + "learning_rate": 1.2417222456693023e-05, + "loss": 1.3235, + "step": 10262 + }, + { + "epoch": 0.9311377245508982, + "grad_norm": 0.11569388128701791, + "learning_rate": 1.238470253608781e-05, + "loss": 1.3466, + "step": 10263 + }, + { + "epoch": 0.931228452186536, + "grad_norm": 0.1690886997342682, + "learning_rate": 1.2352224721135341e-05, + "loss": 1.3602, + "step": 10264 + }, + { + "epoch": 0.9313191798221738, + "grad_norm": 0.12322982487519824, + "learning_rate": 1.2319789014640092e-05, + "loss": 1.3491, + "step": 10265 + }, + { + "epoch": 0.9314099074578116, + "grad_norm": 0.11420390829909736, + "learning_rate": 1.2287395419402992e-05, + "loss": 1.3273, + "step": 10266 + }, + { + "epoch": 0.9315006350934495, + "grad_norm": 0.1251443016053533, + "learning_rate": 1.2255043938221244e-05, + "loss": 1.3373, + "step": 10267 + }, + { + "epoch": 0.9315913627290873, + "grad_norm": 0.11632301323850842, + "learning_rate": 1.2222734573888227e-05, + "loss": 1.3389, + "step": 10268 + }, + { + "epoch": 0.931682090364725, + "grad_norm": 0.12290842228799112, + "learning_rate": 1.2190467329193987e-05, + "loss": 1.3441, + "step": 10269 + }, + { + "epoch": 0.9317728180003629, + "grad_norm": 
0.12703830189968776, + "learning_rate": 1.2158242206924797e-05, + "loss": 1.3261, + "step": 10270 + }, + { + "epoch": 0.9318635456360007, + "grad_norm": 0.13645159064861279, + "learning_rate": 1.2126059209863316e-05, + "loss": 1.3475, + "step": 10271 + }, + { + "epoch": 0.9319542732716385, + "grad_norm": 0.12727041980802817, + "learning_rate": 1.209391834078849e-05, + "loss": 1.3601, + "step": 10272 + }, + { + "epoch": 0.9320450009072764, + "grad_norm": 0.12103704793134036, + "learning_rate": 1.2061819602475766e-05, + "loss": 1.3313, + "step": 10273 + }, + { + "epoch": 0.9321357285429142, + "grad_norm": 0.1219671518075458, + "learning_rate": 1.202976299769687e-05, + "loss": 1.3748, + "step": 10274 + }, + { + "epoch": 0.9322264561785519, + "grad_norm": 0.11571251266414043, + "learning_rate": 1.1997748529219755e-05, + "loss": 1.3473, + "step": 10275 + }, + { + "epoch": 0.9323171838141898, + "grad_norm": 0.12004870519180168, + "learning_rate": 1.1965776199809042e-05, + "loss": 1.3289, + "step": 10276 + }, + { + "epoch": 0.9324079114498276, + "grad_norm": 0.12812346728452778, + "learning_rate": 1.1933846012225525e-05, + "loss": 1.3098, + "step": 10277 + }, + { + "epoch": 0.9324986390854655, + "grad_norm": 0.12492760154246135, + "learning_rate": 1.1901957969226274e-05, + "loss": 1.3816, + "step": 10278 + }, + { + "epoch": 0.9325893667211033, + "grad_norm": 0.11701764047269334, + "learning_rate": 1.1870112073564865e-05, + "loss": 1.3654, + "step": 10279 + }, + { + "epoch": 0.932680094356741, + "grad_norm": 0.1321478133576473, + "learning_rate": 1.1838308327991266e-05, + "loss": 1.321, + "step": 10280 + }, + { + "epoch": 0.9327708219923789, + "grad_norm": 0.29052000036389547, + "learning_rate": 1.1806546735251722e-05, + "loss": 1.3546, + "step": 10281 + }, + { + "epoch": 0.9328615496280167, + "grad_norm": 0.12616526995234872, + "learning_rate": 1.1774827298088763e-05, + "loss": 1.3833, + "step": 10282 + }, + { + "epoch": 0.9329522772636545, + "grad_norm": 0.11919673982884951, + "learning_rate": 1.1743150019241422e-05, + "loss": 1.3594, + "step": 10283 + }, + { + "epoch": 0.9330430048992924, + "grad_norm": 0.11674697910054675, + "learning_rate": 1.171151490144512e-05, + "loss": 1.3032, + "step": 10284 + }, + { + "epoch": 0.9331337325349301, + "grad_norm": 0.11700879273593537, + "learning_rate": 1.1679921947431338e-05, + "loss": 1.3677, + "step": 10285 + }, + { + "epoch": 0.9332244601705679, + "grad_norm": 0.11939266581456574, + "learning_rate": 1.1648371159928339e-05, + "loss": 1.3049, + "step": 10286 + }, + { + "epoch": 0.9333151878062058, + "grad_norm": 0.11922784696416454, + "learning_rate": 1.16168625416605e-05, + "loss": 1.3009, + "step": 10287 + }, + { + "epoch": 0.9334059154418436, + "grad_norm": 0.11826975496971912, + "learning_rate": 1.1585396095348478e-05, + "loss": 1.3601, + "step": 10288 + }, + { + "epoch": 0.9334966430774814, + "grad_norm": 0.12195777923566538, + "learning_rate": 1.1553971823709542e-05, + "loss": 1.3485, + "step": 10289 + }, + { + "epoch": 0.9335873707131193, + "grad_norm": 0.11712391815336594, + "learning_rate": 1.1522589729457188e-05, + "loss": 1.3668, + "step": 10290 + }, + { + "epoch": 0.933678098348757, + "grad_norm": 0.11429049042432936, + "learning_rate": 1.1491249815301142e-05, + "loss": 1.3397, + "step": 10291 + }, + { + "epoch": 0.9337688259843948, + "grad_norm": 0.11697163902753491, + "learning_rate": 1.1459952083947622e-05, + "loss": 1.3566, + "step": 10292 + }, + { + "epoch": 0.9338595536200327, + "grad_norm": 0.12008678332219824, + "learning_rate": 
1.1428696538099303e-05, + "loss": 1.3659, + "step": 10293 + }, + { + "epoch": 0.9339502812556705, + "grad_norm": 0.11562691313841325, + "learning_rate": 1.1397483180455025e-05, + "loss": 1.3297, + "step": 10294 + }, + { + "epoch": 0.9340410088913083, + "grad_norm": 0.14656134763636974, + "learning_rate": 1.1366312013710078e-05, + "loss": 1.3771, + "step": 10295 + }, + { + "epoch": 0.9341317365269461, + "grad_norm": 0.11816440563379535, + "learning_rate": 1.133518304055614e-05, + "loss": 1.3695, + "step": 10296 + }, + { + "epoch": 0.9342224641625839, + "grad_norm": 0.11764476956850754, + "learning_rate": 1.1304096263681173e-05, + "loss": 1.38, + "step": 10297 + }, + { + "epoch": 0.9343131917982217, + "grad_norm": 0.11871622747342923, + "learning_rate": 1.1273051685769475e-05, + "loss": 1.2864, + "step": 10298 + }, + { + "epoch": 0.9344039194338596, + "grad_norm": 0.13355925944331015, + "learning_rate": 1.1242049309501845e-05, + "loss": 1.363, + "step": 10299 + }, + { + "epoch": 0.9344946470694974, + "grad_norm": 0.12473238982015383, + "learning_rate": 1.1211089137555308e-05, + "loss": 1.3413, + "step": 10300 + }, + { + "epoch": 0.9345853747051351, + "grad_norm": 0.12480052320635628, + "learning_rate": 1.118017117260317e-05, + "loss": 1.3537, + "step": 10301 + }, + { + "epoch": 0.934676102340773, + "grad_norm": 0.12380109591083506, + "learning_rate": 1.1149295417315352e-05, + "loss": 1.3366, + "step": 10302 + }, + { + "epoch": 0.9347668299764108, + "grad_norm": 0.1252870191973566, + "learning_rate": 1.1118461874357944e-05, + "loss": 1.3275, + "step": 10303 + }, + { + "epoch": 0.9348575576120486, + "grad_norm": 0.11791000609055356, + "learning_rate": 1.1087670546393424e-05, + "loss": 1.3065, + "step": 10304 + }, + { + "epoch": 0.9349482852476865, + "grad_norm": 0.16068035981012987, + "learning_rate": 1.1056921436080503e-05, + "loss": 1.3516, + "step": 10305 + }, + { + "epoch": 0.9350390128833242, + "grad_norm": 0.12023760241437452, + "learning_rate": 1.1026214546074609e-05, + "loss": 1.3333, + "step": 10306 + }, + { + "epoch": 0.935129740518962, + "grad_norm": 0.12472497732394923, + "learning_rate": 1.0995549879027066e-05, + "loss": 1.3756, + "step": 10307 + }, + { + "epoch": 0.9352204681545999, + "grad_norm": 0.12597887030732136, + "learning_rate": 1.096492743758587e-05, + "loss": 1.3277, + "step": 10308 + }, + { + "epoch": 0.9353111957902377, + "grad_norm": 0.11850864656988958, + "learning_rate": 1.093434722439529e-05, + "loss": 1.3675, + "step": 10309 + }, + { + "epoch": 0.9354019234258755, + "grad_norm": 0.12700910992797124, + "learning_rate": 1.0903809242095942e-05, + "loss": 1.3215, + "step": 10310 + }, + { + "epoch": 0.9354926510615134, + "grad_norm": 0.2074244585691812, + "learning_rate": 1.087331349332471e-05, + "loss": 1.3322, + "step": 10311 + }, + { + "epoch": 0.9355833786971511, + "grad_norm": 0.12256581534147604, + "learning_rate": 1.0842859980714937e-05, + "loss": 1.351, + "step": 10312 + }, + { + "epoch": 0.9356741063327889, + "grad_norm": 0.11308858790871659, + "learning_rate": 1.081244870689635e-05, + "loss": 1.3527, + "step": 10313 + }, + { + "epoch": 0.9357648339684268, + "grad_norm": 0.11619574239555634, + "learning_rate": 1.0782079674494905e-05, + "loss": 1.3069, + "step": 10314 + }, + { + "epoch": 0.9358555616040646, + "grad_norm": 0.12000979966372714, + "learning_rate": 1.0751752886132892e-05, + "loss": 1.3419, + "step": 10315 + }, + { + "epoch": 0.9359462892397025, + "grad_norm": 0.12043871335546728, + "learning_rate": 1.0721468344429219e-05, + "loss": 1.3192, + 
"step": 10316 + }, + { + "epoch": 0.9360370168753402, + "grad_norm": 0.12358537330564835, + "learning_rate": 1.0691226051998847e-05, + "loss": 1.3426, + "step": 10317 + }, + { + "epoch": 0.936127744510978, + "grad_norm": 0.12312725901930328, + "learning_rate": 1.0661026011453245e-05, + "loss": 1.3514, + "step": 10318 + }, + { + "epoch": 0.9362184721466159, + "grad_norm": 0.11679780749815821, + "learning_rate": 1.0630868225400103e-05, + "loss": 1.3015, + "step": 10319 + }, + { + "epoch": 0.9363091997822537, + "grad_norm": 0.116527396230071, + "learning_rate": 1.0600752696443671e-05, + "loss": 1.3301, + "step": 10320 + }, + { + "epoch": 0.9363999274178915, + "grad_norm": 0.21463841640877288, + "learning_rate": 1.0570679427184371e-05, + "loss": 1.3848, + "step": 10321 + }, + { + "epoch": 0.9364906550535294, + "grad_norm": 0.12363229557312053, + "learning_rate": 1.0540648420218957e-05, + "loss": 1.3982, + "step": 10322 + }, + { + "epoch": 0.9365813826891671, + "grad_norm": 0.12010937442182755, + "learning_rate": 1.0510659678140799e-05, + "loss": 1.355, + "step": 10323 + }, + { + "epoch": 0.9366721103248049, + "grad_norm": 0.11551174103363301, + "learning_rate": 1.0480713203539271e-05, + "loss": 1.3402, + "step": 10324 + }, + { + "epoch": 0.9367628379604428, + "grad_norm": 0.1224367112874941, + "learning_rate": 1.0450808999000306e-05, + "loss": 1.352, + "step": 10325 + }, + { + "epoch": 0.9368535655960806, + "grad_norm": 0.14866169581940553, + "learning_rate": 1.0420947067106113e-05, + "loss": 1.3416, + "step": 10326 + }, + { + "epoch": 0.9369442932317184, + "grad_norm": 0.12268028111294067, + "learning_rate": 1.0391127410435353e-05, + "loss": 1.3371, + "step": 10327 + }, + { + "epoch": 0.9370350208673562, + "grad_norm": 0.11385046987088229, + "learning_rate": 1.0361350031562911e-05, + "loss": 1.3397, + "step": 10328 + }, + { + "epoch": 0.937125748502994, + "grad_norm": 0.13407981133969965, + "learning_rate": 1.0331614933059952e-05, + "loss": 1.3574, + "step": 10329 + }, + { + "epoch": 0.9372164761386318, + "grad_norm": 0.12211818025870064, + "learning_rate": 1.030192211749431e-05, + "loss": 1.319, + "step": 10330 + }, + { + "epoch": 0.9373072037742697, + "grad_norm": 0.12348704305655733, + "learning_rate": 1.0272271587429882e-05, + "loss": 1.3371, + "step": 10331 + }, + { + "epoch": 0.9373979314099075, + "grad_norm": 0.12834894922709955, + "learning_rate": 1.0242663345426895e-05, + "loss": 1.344, + "step": 10332 + }, + { + "epoch": 0.9374886590455452, + "grad_norm": 0.11872549636364875, + "learning_rate": 1.0213097394042137e-05, + "loss": 1.3353, + "step": 10333 + }, + { + "epoch": 0.9375793866811831, + "grad_norm": 0.1330097591382414, + "learning_rate": 1.0183573735828622e-05, + "loss": 1.3563, + "step": 10334 + }, + { + "epoch": 0.9376701143168209, + "grad_norm": 0.1170302499192709, + "learning_rate": 1.0154092373335755e-05, + "loss": 1.3631, + "step": 10335 + }, + { + "epoch": 0.9377608419524587, + "grad_norm": 0.12857669619323064, + "learning_rate": 1.012465330910911e-05, + "loss": 1.3739, + "step": 10336 + }, + { + "epoch": 0.9378515695880966, + "grad_norm": 0.12281019939195316, + "learning_rate": 1.0095256545690933e-05, + "loss": 1.341, + "step": 10337 + }, + { + "epoch": 0.9379422972237343, + "grad_norm": 0.1192353875253703, + "learning_rate": 1.0065902085619583e-05, + "loss": 1.3453, + "step": 10338 + }, + { + "epoch": 0.9380330248593721, + "grad_norm": 0.11835242019676871, + "learning_rate": 1.0036589931429695e-05, + "loss": 1.335, + "step": 10339 + }, + { + "epoch": 0.93812375249501, 
+ "grad_norm": 0.11886724757355042, + "learning_rate": 1.0007320085652527e-05, + "loss": 1.3694, + "step": 10340 + }, + { + "epoch": 0.9382144801306478, + "grad_norm": 0.12056709367086313, + "learning_rate": 9.978092550815498e-06, + "loss": 1.3847, + "step": 10341 + }, + { + "epoch": 0.9383052077662856, + "grad_norm": 0.12556700607117954, + "learning_rate": 9.94890732944237e-06, + "loss": 1.3316, + "step": 10342 + }, + { + "epoch": 0.9383959354019235, + "grad_norm": 0.12610697430444107, + "learning_rate": 9.919764424053346e-06, + "loss": 1.3828, + "step": 10343 + }, + { + "epoch": 0.9384866630375612, + "grad_norm": 0.1613189380092912, + "learning_rate": 9.890663837164916e-06, + "loss": 1.3165, + "step": 10344 + }, + { + "epoch": 0.938577390673199, + "grad_norm": 0.12086237569556885, + "learning_rate": 9.861605571289844e-06, + "loss": 1.3888, + "step": 10345 + }, + { + "epoch": 0.9386681183088369, + "grad_norm": 0.13596601690353613, + "learning_rate": 9.832589628937404e-06, + "loss": 1.3276, + "step": 10346 + }, + { + "epoch": 0.9387588459444747, + "grad_norm": 0.12696955864377535, + "learning_rate": 9.803616012613092e-06, + "loss": 1.3502, + "step": 10347 + }, + { + "epoch": 0.9388495735801125, + "grad_norm": 0.11937787618087453, + "learning_rate": 9.774684724818738e-06, + "loss": 1.3213, + "step": 10348 + }, + { + "epoch": 0.9389403012157503, + "grad_norm": 0.11739550417514072, + "learning_rate": 9.745795768052678e-06, + "loss": 1.3176, + "step": 10349 + }, + { + "epoch": 0.9390310288513881, + "grad_norm": 0.1440476988747064, + "learning_rate": 9.716949144809362e-06, + "loss": 1.3616, + "step": 10350 + }, + { + "epoch": 0.9391217564870259, + "grad_norm": 0.11648410425360839, + "learning_rate": 9.688144857579795e-06, + "loss": 1.299, + "step": 10351 + }, + { + "epoch": 0.9392124841226638, + "grad_norm": 0.11707745575982761, + "learning_rate": 9.659382908851155e-06, + "loss": 1.346, + "step": 10352 + }, + { + "epoch": 0.9393032117583016, + "grad_norm": 0.12139864491456438, + "learning_rate": 9.630663301107068e-06, + "loss": 1.3194, + "step": 10353 + }, + { + "epoch": 0.9393939393939394, + "grad_norm": 0.12663762417133584, + "learning_rate": 9.601986036827492e-06, + "loss": 1.3369, + "step": 10354 + }, + { + "epoch": 0.9394846670295772, + "grad_norm": 0.11865322170140544, + "learning_rate": 9.57335111848867e-06, + "loss": 1.3671, + "step": 10355 + }, + { + "epoch": 0.939575394665215, + "grad_norm": 0.1159180879950521, + "learning_rate": 9.544758548563293e-06, + "loss": 1.3763, + "step": 10356 + }, + { + "epoch": 0.9396661223008529, + "grad_norm": 0.12489927618111764, + "learning_rate": 9.516208329520215e-06, + "loss": 1.3754, + "step": 10357 + }, + { + "epoch": 0.9397568499364907, + "grad_norm": 0.11994058381763922, + "learning_rate": 9.487700463824966e-06, + "loss": 1.3378, + "step": 10358 + }, + { + "epoch": 0.9398475775721284, + "grad_norm": 0.12674685850707215, + "learning_rate": 9.459234953939022e-06, + "loss": 1.293, + "step": 10359 + }, + { + "epoch": 0.9399383052077663, + "grad_norm": 0.12009546511392513, + "learning_rate": 9.430811802320415e-06, + "loss": 1.3785, + "step": 10360 + }, + { + "epoch": 0.9400290328434041, + "grad_norm": 0.12185500816549771, + "learning_rate": 9.402431011423518e-06, + "loss": 1.3328, + "step": 10361 + }, + { + "epoch": 0.9401197604790419, + "grad_norm": 0.11425078206318046, + "learning_rate": 9.374092583699034e-06, + "loss": 1.3384, + "step": 10362 + }, + { + "epoch": 0.9402104881146798, + "grad_norm": 0.1291740876422281, + "learning_rate": 
9.345796521593953e-06, + "loss": 1.3718, + "step": 10363 + }, + { + "epoch": 0.9403012157503176, + "grad_norm": 0.1194675476173377, + "learning_rate": 9.31754282755165e-06, + "loss": 1.3014, + "step": 10364 + }, + { + "epoch": 0.9403919433859553, + "grad_norm": 0.12242801739921116, + "learning_rate": 9.289331504011845e-06, + "loss": 1.3526, + "step": 10365 + }, + { + "epoch": 0.9404826710215932, + "grad_norm": 0.13268192041914537, + "learning_rate": 9.261162553410585e-06, + "loss": 1.3701, + "step": 10366 + }, + { + "epoch": 0.940573398657231, + "grad_norm": 0.16534371384978536, + "learning_rate": 9.23303597818026e-06, + "loss": 1.3737, + "step": 10367 + }, + { + "epoch": 0.9406641262928688, + "grad_norm": 0.12540866919561164, + "learning_rate": 9.204951780749648e-06, + "loss": 1.3558, + "step": 10368 + }, + { + "epoch": 0.9407548539285067, + "grad_norm": 0.12466346824350873, + "learning_rate": 9.176909963543811e-06, + "loss": 1.3349, + "step": 10369 + }, + { + "epoch": 0.9408455815641444, + "grad_norm": 0.1222657121315179, + "learning_rate": 9.148910528984033e-06, + "loss": 1.3079, + "step": 10370 + }, + { + "epoch": 0.9409363091997822, + "grad_norm": 0.14453608472893814, + "learning_rate": 9.120953479488214e-06, + "loss": 1.3532, + "step": 10371 + }, + { + "epoch": 0.9410270368354201, + "grad_norm": 0.1247933431401224, + "learning_rate": 9.093038817470477e-06, + "loss": 1.3168, + "step": 10372 + }, + { + "epoch": 0.9411177644710579, + "grad_norm": 0.11921819053219856, + "learning_rate": 9.065166545341119e-06, + "loss": 1.3314, + "step": 10373 + }, + { + "epoch": 0.9412084921066957, + "grad_norm": 0.1480724672311389, + "learning_rate": 9.03733666550699e-06, + "loss": 1.3175, + "step": 10374 + }, + { + "epoch": 0.9412992197423335, + "grad_norm": 0.1237217009556979, + "learning_rate": 9.00954918037128e-06, + "loss": 1.3766, + "step": 10375 + }, + { + "epoch": 0.9413899473779713, + "grad_norm": 0.12058471532513731, + "learning_rate": 8.981804092333234e-06, + "loss": 1.3335, + "step": 10376 + }, + { + "epoch": 0.9414806750136091, + "grad_norm": 0.11295281006712332, + "learning_rate": 8.954101403788884e-06, + "loss": 1.3282, + "step": 10377 + }, + { + "epoch": 0.941571402649247, + "grad_norm": 0.1217082134386643, + "learning_rate": 8.926441117130201e-06, + "loss": 1.3395, + "step": 10378 + }, + { + "epoch": 0.9416621302848848, + "grad_norm": 0.1281626002142225, + "learning_rate": 8.898823234745723e-06, + "loss": 1.3652, + "step": 10379 + }, + { + "epoch": 0.9417528579205225, + "grad_norm": 0.11773659141973167, + "learning_rate": 8.871247759020206e-06, + "loss": 1.3282, + "step": 10380 + }, + { + "epoch": 0.9418435855561604, + "grad_norm": 0.12471406799520829, + "learning_rate": 8.84371469233486e-06, + "loss": 1.3096, + "step": 10381 + }, + { + "epoch": 0.9419343131917982, + "grad_norm": 0.11851384370436868, + "learning_rate": 8.816224037067112e-06, + "loss": 1.3441, + "step": 10382 + }, + { + "epoch": 0.942025040827436, + "grad_norm": 0.12968742608588907, + "learning_rate": 8.788775795590842e-06, + "loss": 1.3509, + "step": 10383 + }, + { + "epoch": 0.9421157684630739, + "grad_norm": 0.1576022098471875, + "learning_rate": 8.761369970276155e-06, + "loss": 1.3719, + "step": 10384 + }, + { + "epoch": 0.9422064960987117, + "grad_norm": 0.14724349708766005, + "learning_rate": 8.734006563489604e-06, + "loss": 1.3382, + "step": 10385 + }, + { + "epoch": 0.9422972237343494, + "grad_norm": 0.11481151034114291, + "learning_rate": 8.706685577593965e-06, + "loss": 1.332, + "step": 10386 + }, + { + 
"epoch": 0.9423879513699873, + "grad_norm": 0.12470610214733135, + "learning_rate": 8.679407014948405e-06, + "loss": 1.3213, + "step": 10387 + }, + { + "epoch": 0.9424786790056251, + "grad_norm": 0.11770029073716867, + "learning_rate": 8.652170877908538e-06, + "loss": 1.3912, + "step": 10388 + }, + { + "epoch": 0.9425694066412629, + "grad_norm": 0.12294620649290831, + "learning_rate": 8.624977168826097e-06, + "loss": 1.3283, + "step": 10389 + }, + { + "epoch": 0.9426601342769008, + "grad_norm": 0.1418942364175844, + "learning_rate": 8.59782589004926e-06, + "loss": 1.3466, + "step": 10390 + }, + { + "epoch": 0.9427508619125385, + "grad_norm": 0.11787033714741098, + "learning_rate": 8.570717043922648e-06, + "loss": 1.3556, + "step": 10391 + }, + { + "epoch": 0.9428415895481764, + "grad_norm": 0.13633004721880032, + "learning_rate": 8.543650632787058e-06, + "loss": 1.368, + "step": 10392 + }, + { + "epoch": 0.9429323171838142, + "grad_norm": 0.11623196234100452, + "learning_rate": 8.51662665897962e-06, + "loss": 1.3274, + "step": 10393 + }, + { + "epoch": 0.943023044819452, + "grad_norm": 0.12645734206193926, + "learning_rate": 8.48964512483391e-06, + "loss": 1.3718, + "step": 10394 + }, + { + "epoch": 0.9431137724550899, + "grad_norm": 0.12299770549849288, + "learning_rate": 8.46270603267979e-06, + "loss": 1.3934, + "step": 10395 + }, + { + "epoch": 0.9432045000907276, + "grad_norm": 0.2880575020901579, + "learning_rate": 8.43580938484345e-06, + "loss": 1.3665, + "step": 10396 + }, + { + "epoch": 0.9432952277263654, + "grad_norm": 0.12644505019052243, + "learning_rate": 8.40895518364737e-06, + "loss": 1.3602, + "step": 10397 + }, + { + "epoch": 0.9433859553620033, + "grad_norm": 0.124277386583954, + "learning_rate": 8.382143431410583e-06, + "loss": 1.3422, + "step": 10398 + }, + { + "epoch": 0.9434766829976411, + "grad_norm": 0.11957789634595213, + "learning_rate": 8.355374130448068e-06, + "loss": 1.3415, + "step": 10399 + }, + { + "epoch": 0.9435674106332789, + "grad_norm": 0.1262986555976474, + "learning_rate": 8.328647283071478e-06, + "loss": 1.387, + "step": 10400 + }, + { + "epoch": 0.9436581382689168, + "grad_norm": 0.13239788525548585, + "learning_rate": 8.301962891588688e-06, + "loss": 1.3556, + "step": 10401 + }, + { + "epoch": 0.9437488659045545, + "grad_norm": 0.11709498187626662, + "learning_rate": 8.275320958303855e-06, + "loss": 1.3112, + "step": 10402 + }, + { + "epoch": 0.9438395935401923, + "grad_norm": 0.11802436170400463, + "learning_rate": 8.248721485517529e-06, + "loss": 1.2812, + "step": 10403 + }, + { + "epoch": 0.9439303211758302, + "grad_norm": 0.11422977812243566, + "learning_rate": 8.222164475526538e-06, + "loss": 1.3292, + "step": 10404 + }, + { + "epoch": 0.944021048811468, + "grad_norm": 0.11106623151910319, + "learning_rate": 8.195649930624215e-06, + "loss": 1.3423, + "step": 10405 + }, + { + "epoch": 0.9441117764471058, + "grad_norm": 0.11689085646070856, + "learning_rate": 8.169177853099951e-06, + "loss": 1.3465, + "step": 10406 + }, + { + "epoch": 0.9442025040827436, + "grad_norm": 0.11347344014637281, + "learning_rate": 8.142748245239639e-06, + "loss": 1.3437, + "step": 10407 + }, + { + "epoch": 0.9442932317183814, + "grad_norm": 0.11838587355588012, + "learning_rate": 8.116361109325566e-06, + "loss": 1.3273, + "step": 10408 + }, + { + "epoch": 0.9443839593540192, + "grad_norm": 0.12659937378384925, + "learning_rate": 8.09001644763624e-06, + "loss": 1.3574, + "step": 10409 + }, + { + "epoch": 0.9444746869896571, + "grad_norm": 0.12203869684289866, + 
"learning_rate": 8.063714262446398e-06, + "loss": 1.3461, + "step": 10410 + }, + { + "epoch": 0.9445654146252949, + "grad_norm": 0.12844178918589255, + "learning_rate": 8.037454556027334e-06, + "loss": 1.3679, + "step": 10411 + }, + { + "epoch": 0.9446561422609326, + "grad_norm": 0.11958481274350019, + "learning_rate": 8.011237330646626e-06, + "loss": 1.354, + "step": 10412 + }, + { + "epoch": 0.9447468698965705, + "grad_norm": 0.12110250165636074, + "learning_rate": 7.985062588568126e-06, + "loss": 1.3064, + "step": 10413 + }, + { + "epoch": 0.9448375975322083, + "grad_norm": 0.12194251642820535, + "learning_rate": 7.958930332051918e-06, + "loss": 1.3529, + "step": 10414 + }, + { + "epoch": 0.9449283251678461, + "grad_norm": 0.12130403180456631, + "learning_rate": 7.932840563354582e-06, + "loss": 1.3372, + "step": 10415 + }, + { + "epoch": 0.945019052803484, + "grad_norm": 0.11992112882665672, + "learning_rate": 7.906793284729042e-06, + "loss": 1.3311, + "step": 10416 + }, + { + "epoch": 0.9451097804391217, + "grad_norm": 0.15860927974785624, + "learning_rate": 7.880788498424384e-06, + "loss": 1.3314, + "step": 10417 + }, + { + "epoch": 0.9452005080747595, + "grad_norm": 0.11924406342978687, + "learning_rate": 7.854826206686206e-06, + "loss": 1.3501, + "step": 10418 + }, + { + "epoch": 0.9452912357103974, + "grad_norm": 0.12026416253022137, + "learning_rate": 7.828906411756321e-06, + "loss": 1.3673, + "step": 10419 + }, + { + "epoch": 0.9453819633460352, + "grad_norm": 0.12730430352839656, + "learning_rate": 7.803029115872883e-06, + "loss": 1.3109, + "step": 10420 + }, + { + "epoch": 0.945472690981673, + "grad_norm": 0.12646467719345728, + "learning_rate": 7.777194321270441e-06, + "loss": 1.3415, + "step": 10421 + }, + { + "epoch": 0.9455634186173109, + "grad_norm": 0.12646928344285197, + "learning_rate": 7.751402030179822e-06, + "loss": 1.3355, + "step": 10422 + }, + { + "epoch": 0.9456541462529486, + "grad_norm": 0.1209381912334345, + "learning_rate": 7.725652244828185e-06, + "loss": 1.3142, + "step": 10423 + }, + { + "epoch": 0.9457448738885864, + "grad_norm": 0.12016047698046985, + "learning_rate": 7.699944967438976e-06, + "loss": 1.3171, + "step": 10424 + }, + { + "epoch": 0.9458356015242243, + "grad_norm": 0.1212914483188228, + "learning_rate": 7.674280200232142e-06, + "loss": 1.3699, + "step": 10425 + }, + { + "epoch": 0.9459263291598621, + "grad_norm": 0.12248842954992388, + "learning_rate": 7.648657945423743e-06, + "loss": 1.3314, + "step": 10426 + }, + { + "epoch": 0.9460170567954999, + "grad_norm": 0.12033915796155542, + "learning_rate": 7.623078205226286e-06, + "loss": 1.2977, + "step": 10427 + }, + { + "epoch": 0.9461077844311377, + "grad_norm": 0.12121857607322407, + "learning_rate": 7.597540981848616e-06, + "loss": 1.3721, + "step": 10428 + }, + { + "epoch": 0.9461985120667755, + "grad_norm": 0.1628029519974939, + "learning_rate": 7.572046277495859e-06, + "loss": 1.3498, + "step": 10429 + }, + { + "epoch": 0.9462892397024134, + "grad_norm": 0.1154822198583858, + "learning_rate": 7.546594094369475e-06, + "loss": 1.3442, + "step": 10430 + }, + { + "epoch": 0.9463799673380512, + "grad_norm": 0.12323450333706305, + "learning_rate": 7.5211844346672605e-06, + "loss": 1.3532, + "step": 10431 + }, + { + "epoch": 0.946470694973689, + "grad_norm": 0.13961929228477485, + "learning_rate": 7.4958173005833515e-06, + "loss": 1.3336, + "step": 10432 + }, + { + "epoch": 0.9465614226093269, + "grad_norm": 0.11893869797861749, + "learning_rate": 7.470492694308273e-06, + "loss": 1.3393, + 
"step": 10433 + }, + { + "epoch": 0.9466521502449646, + "grad_norm": 0.12352919356329563, + "learning_rate": 7.445210618028664e-06, + "loss": 1.3508, + "step": 10434 + }, + { + "epoch": 0.9467428778806024, + "grad_norm": 0.12163098123839533, + "learning_rate": 7.419971073927667e-06, + "loss": 1.3598, + "step": 10435 + }, + { + "epoch": 0.9468336055162403, + "grad_norm": 0.11952348452746932, + "learning_rate": 7.394774064184872e-06, + "loss": 1.3619, + "step": 10436 + }, + { + "epoch": 0.9469243331518781, + "grad_norm": 0.12769156680703875, + "learning_rate": 7.369619590975929e-06, + "loss": 1.3531, + "step": 10437 + }, + { + "epoch": 0.9470150607875158, + "grad_norm": 0.11932894739673906, + "learning_rate": 7.344507656472876e-06, + "loss": 1.323, + "step": 10438 + }, + { + "epoch": 0.9471057884231537, + "grad_norm": 0.11436027300943719, + "learning_rate": 7.319438262844258e-06, + "loss": 1.311, + "step": 10439 + }, + { + "epoch": 0.9471965160587915, + "grad_norm": 0.12228580772139606, + "learning_rate": 7.294411412254731e-06, + "loss": 1.4023, + "step": 10440 + }, + { + "epoch": 0.9472872436944293, + "grad_norm": 0.11966889964856853, + "learning_rate": 7.269427106865401e-06, + "loss": 1.3154, + "step": 10441 + }, + { + "epoch": 0.9473779713300672, + "grad_norm": 0.11738597577285648, + "learning_rate": 7.244485348833651e-06, + "loss": 1.3446, + "step": 10442 + }, + { + "epoch": 0.947468698965705, + "grad_norm": 0.12171792437286084, + "learning_rate": 7.219586140313206e-06, + "loss": 1.3435, + "step": 10443 + }, + { + "epoch": 0.9475594266013427, + "grad_norm": 0.13043316161550408, + "learning_rate": 7.194729483454121e-06, + "loss": 1.3635, + "step": 10444 + }, + { + "epoch": 0.9476501542369806, + "grad_norm": 0.1150169375584064, + "learning_rate": 7.169915380402847e-06, + "loss": 1.3649, + "step": 10445 + }, + { + "epoch": 0.9477408818726184, + "grad_norm": 0.12442711676045098, + "learning_rate": 7.145143833302003e-06, + "loss": 1.3713, + "step": 10446 + }, + { + "epoch": 0.9478316095082562, + "grad_norm": 0.13489723846629972, + "learning_rate": 7.1204148442906014e-06, + "loss": 1.3637, + "step": 10447 + }, + { + "epoch": 0.9479223371438941, + "grad_norm": 0.12677875500213084, + "learning_rate": 7.095728415503988e-06, + "loss": 1.3294, + "step": 10448 + }, + { + "epoch": 0.9480130647795318, + "grad_norm": 0.1266234037804297, + "learning_rate": 7.0710845490739025e-06, + "loss": 1.3229, + "step": 10449 + }, + { + "epoch": 0.9481037924151696, + "grad_norm": 0.12004263484016668, + "learning_rate": 7.04648324712831e-06, + "loss": 1.3274, + "step": 10450 + }, + { + "epoch": 0.9481945200508075, + "grad_norm": 0.1182757391553664, + "learning_rate": 7.021924511791511e-06, + "loss": 1.3239, + "step": 10451 + }, + { + "epoch": 0.9482852476864453, + "grad_norm": 0.12205139538239128, + "learning_rate": 6.997408345184253e-06, + "loss": 1.3463, + "step": 10452 + }, + { + "epoch": 0.9483759753220831, + "grad_norm": 0.1562737989486353, + "learning_rate": 6.972934749423454e-06, + "loss": 1.3408, + "step": 10453 + }, + { + "epoch": 0.948466702957721, + "grad_norm": 0.14676799770107254, + "learning_rate": 6.948503726622312e-06, + "loss": 1.3637, + "step": 10454 + }, + { + "epoch": 0.9485574305933587, + "grad_norm": 0.1174251239014913, + "learning_rate": 6.924115278890642e-06, + "loss": 1.3183, + "step": 10455 + }, + { + "epoch": 0.9486481582289965, + "grad_norm": 0.12052695678885375, + "learning_rate": 6.899769408334256e-06, + "loss": 1.3775, + "step": 10456 + }, + { + "epoch": 0.9487388858646344, + 
"grad_norm": 0.13096679172628822, + "learning_rate": 6.8754661170554174e-06, + "loss": 1.3464, + "step": 10457 + }, + { + "epoch": 0.9488296135002722, + "grad_norm": 0.12979929248780592, + "learning_rate": 6.851205407152783e-06, + "loss": 1.3606, + "step": 10458 + }, + { + "epoch": 0.94892034113591, + "grad_norm": 0.11518222548850031, + "learning_rate": 6.82698728072123e-06, + "loss": 1.3401, + "step": 10459 + }, + { + "epoch": 0.9490110687715478, + "grad_norm": 0.12555770973153932, + "learning_rate": 6.802811739852033e-06, + "loss": 1.378, + "step": 10460 + }, + { + "epoch": 0.9491017964071856, + "grad_norm": 0.11919384070815084, + "learning_rate": 6.778678786632741e-06, + "loss": 1.3489, + "step": 10461 + }, + { + "epoch": 0.9491925240428234, + "grad_norm": 0.12354594293303098, + "learning_rate": 6.754588423147245e-06, + "loss": 1.3928, + "step": 10462 + }, + { + "epoch": 0.9492832516784613, + "grad_norm": 0.1257967933066605, + "learning_rate": 6.7305406514757136e-06, + "loss": 1.3609, + "step": 10463 + }, + { + "epoch": 0.9493739793140991, + "grad_norm": 0.1163441474434418, + "learning_rate": 6.706535473694653e-06, + "loss": 1.3413, + "step": 10464 + }, + { + "epoch": 0.9494647069497368, + "grad_norm": 0.11955714362064228, + "learning_rate": 6.682572891876959e-06, + "loss": 1.3371, + "step": 10465 + }, + { + "epoch": 0.9495554345853747, + "grad_norm": 0.12391514120363491, + "learning_rate": 6.658652908091867e-06, + "loss": 1.3528, + "step": 10466 + }, + { + "epoch": 0.9496461622210125, + "grad_norm": 0.12400878056635875, + "learning_rate": 6.634775524404779e-06, + "loss": 1.3468, + "step": 10467 + }, + { + "epoch": 0.9497368898566504, + "grad_norm": 0.1164720618608087, + "learning_rate": 6.610940742877547e-06, + "loss": 1.3513, + "step": 10468 + }, + { + "epoch": 0.9498276174922882, + "grad_norm": 0.11795702774211589, + "learning_rate": 6.587148565568301e-06, + "loss": 1.3427, + "step": 10469 + }, + { + "epoch": 0.949918345127926, + "grad_norm": 0.12320531215269842, + "learning_rate": 6.563398994531511e-06, + "loss": 1.3398, + "step": 10470 + }, + { + "epoch": 0.9500090727635638, + "grad_norm": 0.1240510412263065, + "learning_rate": 6.539692031817923e-06, + "loss": 1.3382, + "step": 10471 + }, + { + "epoch": 0.9500998003992016, + "grad_norm": 0.11715044091363502, + "learning_rate": 6.516027679474623e-06, + "loss": 1.3534, + "step": 10472 + }, + { + "epoch": 0.9501905280348394, + "grad_norm": 0.12303179140291866, + "learning_rate": 6.492405939545087e-06, + "loss": 1.3173, + "step": 10473 + }, + { + "epoch": 0.9502812556704773, + "grad_norm": 0.13017270302937659, + "learning_rate": 6.468826814069073e-06, + "loss": 1.3591, + "step": 10474 + }, + { + "epoch": 0.950371983306115, + "grad_norm": 0.12444519058988712, + "learning_rate": 6.445290305082563e-06, + "loss": 1.3491, + "step": 10475 + }, + { + "epoch": 0.9504627109417528, + "grad_norm": 0.12232717840298192, + "learning_rate": 6.421796414617987e-06, + "loss": 1.3587, + "step": 10476 + }, + { + "epoch": 0.9505534385773907, + "grad_norm": 0.1303666133734349, + "learning_rate": 6.398345144704054e-06, + "loss": 1.3623, + "step": 10477 + }, + { + "epoch": 0.9506441662130285, + "grad_norm": 0.1484430948064812, + "learning_rate": 6.374936497365758e-06, + "loss": 1.3447, + "step": 10478 + }, + { + "epoch": 0.9507348938486663, + "grad_norm": 0.13696251227864123, + "learning_rate": 6.351570474624424e-06, + "loss": 1.3485, + "step": 10479 + }, + { + "epoch": 0.9508256214843042, + "grad_norm": 0.12991744472010505, + "learning_rate": 
6.328247078497773e-06, + "loss": 1.3555, + "step": 10480 + }, + { + "epoch": 0.9509163491199419, + "grad_norm": 0.12471319696713576, + "learning_rate": 6.304966310999749e-06, + "loss": 1.3141, + "step": 10481 + }, + { + "epoch": 0.9510070767555797, + "grad_norm": 0.12325499324345457, + "learning_rate": 6.281728174140577e-06, + "loss": 1.3242, + "step": 10482 + }, + { + "epoch": 0.9510978043912176, + "grad_norm": 0.11438264051733839, + "learning_rate": 6.258532669927042e-06, + "loss": 1.3328, + "step": 10483 + }, + { + "epoch": 0.9511885320268554, + "grad_norm": 0.15822792045391978, + "learning_rate": 6.235379800361929e-06, + "loss": 1.3678, + "step": 10484 + }, + { + "epoch": 0.9512792596624932, + "grad_norm": 0.13797631667689889, + "learning_rate": 6.212269567444528e-06, + "loss": 1.3599, + "step": 10485 + }, + { + "epoch": 0.951369987298131, + "grad_norm": 0.11085238633660753, + "learning_rate": 6.1892019731704665e-06, + "loss": 1.3572, + "step": 10486 + }, + { + "epoch": 0.9514607149337688, + "grad_norm": 0.12805249624481918, + "learning_rate": 6.166177019531594e-06, + "loss": 1.3387, + "step": 10487 + }, + { + "epoch": 0.9515514425694066, + "grad_norm": 0.13962500718702558, + "learning_rate": 6.143194708516042e-06, + "loss": 1.3549, + "step": 10488 + }, + { + "epoch": 0.9516421702050445, + "grad_norm": 0.12390358665838813, + "learning_rate": 6.120255042108447e-06, + "loss": 1.3365, + "step": 10489 + }, + { + "epoch": 0.9517328978406823, + "grad_norm": 0.11907335831019768, + "learning_rate": 6.097358022289667e-06, + "loss": 1.3608, + "step": 10490 + }, + { + "epoch": 0.95182362547632, + "grad_norm": 0.12289910610258081, + "learning_rate": 6.074503651036789e-06, + "loss": 1.3678, + "step": 10491 + }, + { + "epoch": 0.9519143531119579, + "grad_norm": 0.13648349418502273, + "learning_rate": 6.051691930323344e-06, + "loss": 1.3604, + "step": 10492 + }, + { + "epoch": 0.9520050807475957, + "grad_norm": 0.11923652054932378, + "learning_rate": 6.02892286211909e-06, + "loss": 1.3584, + "step": 10493 + }, + { + "epoch": 0.9520958083832335, + "grad_norm": 0.11968190121743548, + "learning_rate": 6.006196448390122e-06, + "loss": 1.3777, + "step": 10494 + }, + { + "epoch": 0.9521865360188714, + "grad_norm": 0.11818738470234746, + "learning_rate": 5.983512691098925e-06, + "loss": 1.3252, + "step": 10495 + }, + { + "epoch": 0.9522772636545092, + "grad_norm": 0.11916385518874885, + "learning_rate": 5.960871592204154e-06, + "loss": 1.3362, + "step": 10496 + }, + { + "epoch": 0.9523679912901469, + "grad_norm": 0.12783131167415737, + "learning_rate": 5.938273153661023e-06, + "loss": 1.3386, + "step": 10497 + }, + { + "epoch": 0.9524587189257848, + "grad_norm": 0.12460248841766651, + "learning_rate": 5.915717377420804e-06, + "loss": 1.3416, + "step": 10498 + }, + { + "epoch": 0.9525494465614226, + "grad_norm": 0.12356435900679184, + "learning_rate": 5.893204265431162e-06, + "loss": 1.3379, + "step": 10499 + }, + { + "epoch": 0.9526401741970604, + "grad_norm": 0.12779966164964743, + "learning_rate": 5.870733819636209e-06, + "loss": 1.3054, + "step": 10500 + }, + { + "epoch": 0.9527309018326983, + "grad_norm": 0.11811981664675843, + "learning_rate": 5.8483060419762256e-06, + "loss": 1.348, + "step": 10501 + }, + { + "epoch": 0.952821629468336, + "grad_norm": 0.1328556778749164, + "learning_rate": 5.825920934387774e-06, + "loss": 1.3378, + "step": 10502 + }, + { + "epoch": 0.9529123571039738, + "grad_norm": 0.12494756764290063, + "learning_rate": 5.803578498803974e-06, + "loss": 1.3551, + "step": 10503 + }, + 
{ + "epoch": 0.9530030847396117, + "grad_norm": 0.12781763277974323, + "learning_rate": 5.781278737153894e-06, + "loss": 1.3726, + "step": 10504 + }, + { + "epoch": 0.9530938123752495, + "grad_norm": 0.11833412626326036, + "learning_rate": 5.759021651363327e-06, + "loss": 1.3593, + "step": 10505 + }, + { + "epoch": 0.9531845400108874, + "grad_norm": 0.1339714724488286, + "learning_rate": 5.73680724335407e-06, + "loss": 1.3632, + "step": 10506 + }, + { + "epoch": 0.9532752676465251, + "grad_norm": 0.12307274865289881, + "learning_rate": 5.714635515044364e-06, + "loss": 1.32, + "step": 10507 + }, + { + "epoch": 0.9533659952821629, + "grad_norm": 0.12210632894249757, + "learning_rate": 5.69250646834868e-06, + "loss": 1.3703, + "step": 10508 + }, + { + "epoch": 0.9534567229178008, + "grad_norm": 0.12319571480470025, + "learning_rate": 5.6704201051779315e-06, + "loss": 1.3544, + "step": 10509 + }, + { + "epoch": 0.9535474505534386, + "grad_norm": 0.11848743694205827, + "learning_rate": 5.648376427439317e-06, + "loss": 1.3622, + "step": 10510 + }, + { + "epoch": 0.9536381781890764, + "grad_norm": 0.12415941666574691, + "learning_rate": 5.626375437036202e-06, + "loss": 1.3373, + "step": 10511 + }, + { + "epoch": 0.9537289058247143, + "grad_norm": 0.12553509612071256, + "learning_rate": 5.6044171358684e-06, + "loss": 1.3357, + "step": 10512 + }, + { + "epoch": 0.953819633460352, + "grad_norm": 0.12834743669139226, + "learning_rate": 5.582501525832117e-06, + "loss": 1.3729, + "step": 10513 + }, + { + "epoch": 0.9539103610959898, + "grad_norm": 0.12057671133821754, + "learning_rate": 5.560628608819673e-06, + "loss": 1.3223, + "step": 10514 + }, + { + "epoch": 0.9540010887316277, + "grad_norm": 0.11513750392525761, + "learning_rate": 5.538798386719834e-06, + "loss": 1.3339, + "step": 10515 + }, + { + "epoch": 0.9540918163672655, + "grad_norm": 0.19686324116620127, + "learning_rate": 5.517010861417648e-06, + "loss": 1.3836, + "step": 10516 + }, + { + "epoch": 0.9541825440029033, + "grad_norm": 0.1193251647727687, + "learning_rate": 5.495266034794388e-06, + "loss": 1.3579, + "step": 10517 + }, + { + "epoch": 0.9542732716385411, + "grad_norm": 0.12134454664903188, + "learning_rate": 5.473563908727885e-06, + "loss": 1.3389, + "step": 10518 + }, + { + "epoch": 0.9543639992741789, + "grad_norm": 0.12641955387190862, + "learning_rate": 5.451904485091919e-06, + "loss": 1.352, + "step": 10519 + }, + { + "epoch": 0.9544547269098167, + "grad_norm": 0.1305217008092299, + "learning_rate": 5.430287765756936e-06, + "loss": 1.3772, + "step": 10520 + }, + { + "epoch": 0.9545454545454546, + "grad_norm": 0.13145043066617182, + "learning_rate": 5.4087137525895005e-06, + "loss": 1.3425, + "step": 10521 + }, + { + "epoch": 0.9546361821810924, + "grad_norm": 0.11943564160856336, + "learning_rate": 5.38718244745251e-06, + "loss": 1.3492, + "step": 10522 + }, + { + "epoch": 0.9547269098167301, + "grad_norm": 0.15309656689008463, + "learning_rate": 5.3656938522052e-06, + "loss": 1.3407, + "step": 10523 + }, + { + "epoch": 0.954817637452368, + "grad_norm": 0.12783608092944826, + "learning_rate": 5.344247968703197e-06, + "loss": 1.3699, + "step": 10524 + }, + { + "epoch": 0.9549083650880058, + "grad_norm": 0.1254895137004116, + "learning_rate": 5.322844798798243e-06, + "loss": 1.3795, + "step": 10525 + }, + { + "epoch": 0.9549990927236436, + "grad_norm": 0.1428721052134132, + "learning_rate": 5.301484344338525e-06, + "loss": 1.3528, + "step": 10526 + }, + { + "epoch": 0.9550898203592815, + "grad_norm": 0.12832146022357485, + 
"learning_rate": 5.280166607168568e-06, + "loss": 1.3595, + "step": 10527 + }, + { + "epoch": 0.9551805479949192, + "grad_norm": 0.11626929716350937, + "learning_rate": 5.258891589129122e-06, + "loss": 1.3502, + "step": 10528 + }, + { + "epoch": 0.955271275630557, + "grad_norm": 0.11509095605370813, + "learning_rate": 5.237659292057329e-06, + "loss": 1.3527, + "step": 10529 + }, + { + "epoch": 0.9553620032661949, + "grad_norm": 0.12441081430617217, + "learning_rate": 5.216469717786554e-06, + "loss": 1.3207, + "step": 10530 + }, + { + "epoch": 0.9554527309018327, + "grad_norm": 0.14923246530464468, + "learning_rate": 5.1953228681466125e-06, + "loss": 1.3941, + "step": 10531 + }, + { + "epoch": 0.9555434585374705, + "grad_norm": 0.11879220149460372, + "learning_rate": 5.174218744963377e-06, + "loss": 1.3346, + "step": 10532 + }, + { + "epoch": 0.9556341861731084, + "grad_norm": 0.11468452603309431, + "learning_rate": 5.153157350059334e-06, + "loss": 1.3212, + "step": 10533 + }, + { + "epoch": 0.9557249138087461, + "grad_norm": 0.12386282222409331, + "learning_rate": 5.132138685253085e-06, + "loss": 1.3518, + "step": 10534 + }, + { + "epoch": 0.9558156414443839, + "grad_norm": 0.11260302952986213, + "learning_rate": 5.111162752359566e-06, + "loss": 1.3275, + "step": 10535 + }, + { + "epoch": 0.9559063690800218, + "grad_norm": 0.11913946531375517, + "learning_rate": 5.090229553190051e-06, + "loss": 1.3629, + "step": 10536 + }, + { + "epoch": 0.9559970967156596, + "grad_norm": 0.1226704084600762, + "learning_rate": 5.069339089552261e-06, + "loss": 1.3673, + "step": 10537 + }, + { + "epoch": 0.9560878243512974, + "grad_norm": 0.1254871103452915, + "learning_rate": 5.048491363249919e-06, + "loss": 1.3352, + "step": 10538 + }, + { + "epoch": 0.9561785519869352, + "grad_norm": 0.12410680097201388, + "learning_rate": 5.027686376083307e-06, + "loss": 1.338, + "step": 10539 + }, + { + "epoch": 0.956269279622573, + "grad_norm": 0.12146760554108432, + "learning_rate": 5.006924129848933e-06, + "loss": 1.3533, + "step": 10540 + }, + { + "epoch": 0.9563600072582108, + "grad_norm": 0.11504336057033317, + "learning_rate": 4.986204626339585e-06, + "loss": 1.3463, + "step": 10541 + }, + { + "epoch": 0.9564507348938487, + "grad_norm": 0.11850302892130925, + "learning_rate": 4.965527867344444e-06, + "loss": 1.2873, + "step": 10542 + }, + { + "epoch": 0.9565414625294865, + "grad_norm": 0.11457176516885585, + "learning_rate": 4.9448938546489134e-06, + "loss": 1.39, + "step": 10543 + }, + { + "epoch": 0.9566321901651244, + "grad_norm": 0.12810823350874811, + "learning_rate": 4.924302590034846e-06, + "loss": 1.3343, + "step": 10544 + }, + { + "epoch": 0.9567229178007621, + "grad_norm": 0.11730691307817633, + "learning_rate": 4.9037540752801535e-06, + "loss": 1.326, + "step": 10545 + }, + { + "epoch": 0.9568136454363999, + "grad_norm": 0.12624951148902977, + "learning_rate": 4.88324831215925e-06, + "loss": 1.3461, + "step": 10546 + }, + { + "epoch": 0.9569043730720378, + "grad_norm": 0.1174140793980867, + "learning_rate": 4.862785302442829e-06, + "loss": 1.3412, + "step": 10547 + }, + { + "epoch": 0.9569951007076756, + "grad_norm": 0.12579406737883742, + "learning_rate": 4.842365047897923e-06, + "loss": 1.3594, + "step": 10548 + }, + { + "epoch": 0.9570858283433133, + "grad_norm": 0.12012119923957959, + "learning_rate": 4.821987550287732e-06, + "loss": 1.3304, + "step": 10549 + }, + { + "epoch": 0.9571765559789512, + "grad_norm": 0.11827446956573696, + "learning_rate": 4.8016528113719035e-06, + "loss": 1.3312, + 
"step": 10550 + }, + { + "epoch": 0.957267283614589, + "grad_norm": 0.12172050416013719, + "learning_rate": 4.781360832906312e-06, + "loss": 1.328, + "step": 10551 + }, + { + "epoch": 0.9573580112502268, + "grad_norm": 0.12707027581403588, + "learning_rate": 4.7611116166432765e-06, + "loss": 1.356, + "step": 10552 + }, + { + "epoch": 0.9574487388858647, + "grad_norm": 0.12184354161174989, + "learning_rate": 4.7409051643311775e-06, + "loss": 1.3553, + "step": 10553 + }, + { + "epoch": 0.9575394665215025, + "grad_norm": 0.12598892853938698, + "learning_rate": 4.720741477714952e-06, + "loss": 1.3341, + "step": 10554 + }, + { + "epoch": 0.9576301941571402, + "grad_norm": 0.26631239959516306, + "learning_rate": 4.700620558535707e-06, + "loss": 1.3756, + "step": 10555 + }, + { + "epoch": 0.9577209217927781, + "grad_norm": 0.12029978124149208, + "learning_rate": 4.6805424085308305e-06, + "loss": 1.41, + "step": 10556 + }, + { + "epoch": 0.9578116494284159, + "grad_norm": 0.1249319810095527, + "learning_rate": 4.660507029434102e-06, + "loss": 1.324, + "step": 10557 + }, + { + "epoch": 0.9579023770640537, + "grad_norm": 0.12314560561635952, + "learning_rate": 4.640514422975639e-06, + "loss": 1.3267, + "step": 10558 + }, + { + "epoch": 0.9579931046996916, + "grad_norm": 0.12048877113038936, + "learning_rate": 4.6205645908817816e-06, + "loss": 1.3272, + "step": 10559 + }, + { + "epoch": 0.9580838323353293, + "grad_norm": 0.11723556532944168, + "learning_rate": 4.600657534875097e-06, + "loss": 1.3109, + "step": 10560 + }, + { + "epoch": 0.9581745599709671, + "grad_norm": 0.13463842491358613, + "learning_rate": 4.580793256674765e-06, + "loss": 1.3475, + "step": 10561 + }, + { + "epoch": 0.958265287606605, + "grad_norm": 0.12965887402267928, + "learning_rate": 4.560971757995913e-06, + "loss": 1.3553, + "step": 10562 + }, + { + "epoch": 0.9583560152422428, + "grad_norm": 0.11828567292747096, + "learning_rate": 4.541193040550118e-06, + "loss": 1.338, + "step": 10563 + }, + { + "epoch": 0.9584467428778806, + "grad_norm": 0.1258400521585718, + "learning_rate": 4.521457106045346e-06, + "loss": 1.3396, + "step": 10564 + }, + { + "epoch": 0.9585374705135185, + "grad_norm": 0.11380985517121235, + "learning_rate": 4.501763956185845e-06, + "loss": 1.3632, + "step": 10565 + }, + { + "epoch": 0.9586281981491562, + "grad_norm": 0.11826347282437374, + "learning_rate": 4.482113592671977e-06, + "loss": 1.3311, + "step": 10566 + }, + { + "epoch": 0.958718925784794, + "grad_norm": 0.20185202400388527, + "learning_rate": 4.462506017200662e-06, + "loss": 1.3227, + "step": 10567 + }, + { + "epoch": 0.9588096534204319, + "grad_norm": 0.13599566653559597, + "learning_rate": 4.442941231464992e-06, + "loss": 1.3223, + "step": 10568 + }, + { + "epoch": 0.9589003810560697, + "grad_norm": 0.12050139401058817, + "learning_rate": 4.423419237154391e-06, + "loss": 1.3882, + "step": 10569 + }, + { + "epoch": 0.9589911086917075, + "grad_norm": 0.11729941642946023, + "learning_rate": 4.403940035954568e-06, + "loss": 1.35, + "step": 10570 + }, + { + "epoch": 0.9590818363273453, + "grad_norm": 0.12210702430208126, + "learning_rate": 4.384503629547565e-06, + "loss": 1.3265, + "step": 10571 + }, + { + "epoch": 0.9591725639629831, + "grad_norm": 0.1223697139562766, + "learning_rate": 4.365110019611707e-06, + "loss": 1.3299, + "step": 10572 + }, + { + "epoch": 0.9592632915986209, + "grad_norm": 0.12693234200272405, + "learning_rate": 4.345759207821653e-06, + "loss": 1.3789, + "step": 10573 + }, + { + "epoch": 0.9593540192342588, + "grad_norm": 
0.11668221544673271, + "learning_rate": 4.326451195848347e-06, + "loss": 1.3111, + "step": 10574 + }, + { + "epoch": 0.9594447468698966, + "grad_norm": 0.24463272867986383, + "learning_rate": 4.3071859853590635e-06, + "loss": 1.3397, + "step": 10575 + }, + { + "epoch": 0.9595354745055343, + "grad_norm": 0.12043961566551814, + "learning_rate": 4.287963578017307e-06, + "loss": 1.3524, + "step": 10576 + }, + { + "epoch": 0.9596262021411722, + "grad_norm": 0.1226343912054595, + "learning_rate": 4.268783975482915e-06, + "loss": 1.3459, + "step": 10577 + }, + { + "epoch": 0.95971692977681, + "grad_norm": 0.12994188801268905, + "learning_rate": 4.24964717941212e-06, + "loss": 1.3738, + "step": 10578 + }, + { + "epoch": 0.9598076574124478, + "grad_norm": 0.11697091819996179, + "learning_rate": 4.230553191457376e-06, + "loss": 1.3453, + "step": 10579 + }, + { + "epoch": 0.9598983850480857, + "grad_norm": 0.13015448241991182, + "learning_rate": 4.211502013267421e-06, + "loss": 1.3662, + "step": 10580 + }, + { + "epoch": 0.9599891126837234, + "grad_norm": 0.11499722503815403, + "learning_rate": 4.192493646487383e-06, + "loss": 1.3441, + "step": 10581 + }, + { + "epoch": 0.9600798403193613, + "grad_norm": 0.1341450610849737, + "learning_rate": 4.173528092758505e-06, + "loss": 1.3654, + "step": 10582 + }, + { + "epoch": 0.9601705679549991, + "grad_norm": 0.16178190858820993, + "learning_rate": 4.154605353718643e-06, + "loss": 1.3669, + "step": 10583 + }, + { + "epoch": 0.9602612955906369, + "grad_norm": 0.1286786667791494, + "learning_rate": 4.135725431001602e-06, + "loss": 1.367, + "step": 10584 + }, + { + "epoch": 0.9603520232262748, + "grad_norm": 0.1214862329906248, + "learning_rate": 4.1168883262378e-06, + "loss": 1.3588, + "step": 10585 + }, + { + "epoch": 0.9604427508619126, + "grad_norm": 0.1196261386136637, + "learning_rate": 4.0980940410537685e-06, + "loss": 1.3264, + "step": 10586 + }, + { + "epoch": 0.9605334784975503, + "grad_norm": 0.11614567342954064, + "learning_rate": 4.079342577072431e-06, + "loss": 1.3535, + "step": 10587 + }, + { + "epoch": 0.9606242061331882, + "grad_norm": 0.11935645487192807, + "learning_rate": 4.0606339359129386e-06, + "loss": 1.3313, + "step": 10588 + }, + { + "epoch": 0.960714933768826, + "grad_norm": 0.1231337123950403, + "learning_rate": 4.041968119190775e-06, + "loss": 1.3697, + "step": 10589 + }, + { + "epoch": 0.9608056614044638, + "grad_norm": 0.13338396225620294, + "learning_rate": 4.023345128517764e-06, + "loss": 1.3362, + "step": 10590 + }, + { + "epoch": 0.9608963890401017, + "grad_norm": 0.1286201762101508, + "learning_rate": 4.004764965502006e-06, + "loss": 1.3547, + "step": 10591 + }, + { + "epoch": 0.9609871166757394, + "grad_norm": 0.12926576009503163, + "learning_rate": 3.98622763174794e-06, + "loss": 1.407, + "step": 10592 + }, + { + "epoch": 0.9610778443113772, + "grad_norm": 0.11774794466113152, + "learning_rate": 3.967733128856177e-06, + "loss": 1.3489, + "step": 10593 + }, + { + "epoch": 0.9611685719470151, + "grad_norm": 0.12060392240341979, + "learning_rate": 3.9492814584237705e-06, + "loss": 1.3125, + "step": 10594 + }, + { + "epoch": 0.9612592995826529, + "grad_norm": 0.12094006513552563, + "learning_rate": 3.930872622044002e-06, + "loss": 1.3144, + "step": 10595 + }, + { + "epoch": 0.9613500272182907, + "grad_norm": 0.13961718804010237, + "learning_rate": 3.912506621306544e-06, + "loss": 1.3867, + "step": 10596 + }, + { + "epoch": 0.9614407548539285, + "grad_norm": 0.12096189847369068, + "learning_rate": 3.8941834577972405e-06, + 
"loss": 1.3532, + "step": 10597 + }, + { + "epoch": 0.9615314824895663, + "grad_norm": 0.12703482149329814, + "learning_rate": 3.875903133098268e-06, + "loss": 1.3305, + "step": 10598 + }, + { + "epoch": 0.9616222101252041, + "grad_norm": 0.1122627297590928, + "learning_rate": 3.857665648788256e-06, + "loss": 1.3383, + "step": 10599 + }, + { + "epoch": 0.961712937760842, + "grad_norm": 0.11567696835979525, + "learning_rate": 3.8394710064418305e-06, + "loss": 1.3306, + "step": 10600 + }, + { + "epoch": 0.9618036653964798, + "grad_norm": 0.13389097261169544, + "learning_rate": 3.821319207630292e-06, + "loss": 1.3469, + "step": 10601 + }, + { + "epoch": 0.9618943930321175, + "grad_norm": 0.1265377817368437, + "learning_rate": 3.803210253920997e-06, + "loss": 1.3024, + "step": 10602 + }, + { + "epoch": 0.9619851206677554, + "grad_norm": 0.1220136374920228, + "learning_rate": 3.785144146877584e-06, + "loss": 1.3281, + "step": 10603 + }, + { + "epoch": 0.9620758483033932, + "grad_norm": 0.12395821549942147, + "learning_rate": 3.767120888060083e-06, + "loss": 1.3722, + "step": 10604 + }, + { + "epoch": 0.962166575939031, + "grad_norm": 0.13044660520287496, + "learning_rate": 3.7491404790248594e-06, + "loss": 1.3162, + "step": 10605 + }, + { + "epoch": 0.9622573035746689, + "grad_norm": 0.11963363169444298, + "learning_rate": 3.731202921324506e-06, + "loss": 1.325, + "step": 10606 + }, + { + "epoch": 0.9623480312103067, + "grad_norm": 0.15410443261284315, + "learning_rate": 3.7133082165078935e-06, + "loss": 1.3205, + "step": 10607 + }, + { + "epoch": 0.9624387588459444, + "grad_norm": 0.11638859578831219, + "learning_rate": 3.6954563661202867e-06, + "loss": 1.3447, + "step": 10608 + }, + { + "epoch": 0.9625294864815823, + "grad_norm": 0.12011374104793714, + "learning_rate": 3.6776473717031743e-06, + "loss": 1.3328, + "step": 10609 + }, + { + "epoch": 0.9626202141172201, + "grad_norm": 0.12083813033111283, + "learning_rate": 3.659881234794382e-06, + "loss": 1.3342, + "step": 10610 + }, + { + "epoch": 0.9627109417528579, + "grad_norm": 0.1231100705287943, + "learning_rate": 3.6421579569279606e-06, + "loss": 1.3761, + "step": 10611 + }, + { + "epoch": 0.9628016693884958, + "grad_norm": 0.13047427879661874, + "learning_rate": 3.624477539634352e-06, + "loss": 1.3293, + "step": 10612 + }, + { + "epoch": 0.9628923970241335, + "grad_norm": 0.13210061568920847, + "learning_rate": 3.60683998444028e-06, + "loss": 1.3372, + "step": 10613 + }, + { + "epoch": 0.9629831246597713, + "grad_norm": 0.11966763526824738, + "learning_rate": 3.5892452928686927e-06, + "loss": 1.3877, + "step": 10614 + }, + { + "epoch": 0.9630738522954092, + "grad_norm": 0.13608644818536386, + "learning_rate": 3.5716934664389857e-06, + "loss": 1.3181, + "step": 10615 + }, + { + "epoch": 0.963164579931047, + "grad_norm": 0.12059440111791653, + "learning_rate": 3.5541845066667243e-06, + "loss": 1.3587, + "step": 10616 + }, + { + "epoch": 0.9632553075666848, + "grad_norm": 0.12400301274288134, + "learning_rate": 3.536718415063811e-06, + "loss": 1.3414, + "step": 10617 + }, + { + "epoch": 0.9633460352023226, + "grad_norm": 0.1186100422937183, + "learning_rate": 3.5192951931383723e-06, + "loss": 1.3758, + "step": 10618 + }, + { + "epoch": 0.9634367628379604, + "grad_norm": 0.12470926286866957, + "learning_rate": 3.5019148423950377e-06, + "loss": 1.3029, + "step": 10619 + }, + { + "epoch": 0.9635274904735982, + "grad_norm": 0.1143647940282262, + "learning_rate": 3.484577364334496e-06, + "loss": 1.345, + "step": 10620 + }, + { + "epoch": 
0.9636182181092361, + "grad_norm": 0.12142628012958298, + "learning_rate": 3.467282760453827e-06, + "loss": 1.3129, + "step": 10621 + }, + { + "epoch": 0.9637089457448739, + "grad_norm": 0.14232287811545583, + "learning_rate": 3.450031032246559e-06, + "loss": 1.3393, + "step": 10622 + }, + { + "epoch": 0.9637996733805118, + "grad_norm": 0.12325815670505415, + "learning_rate": 3.4328221812022777e-06, + "loss": 1.3601, + "step": 10623 + }, + { + "epoch": 0.9638904010161495, + "grad_norm": 0.11947843122833852, + "learning_rate": 3.4156562088069053e-06, + "loss": 1.3352, + "step": 10624 + }, + { + "epoch": 0.9639811286517873, + "grad_norm": 0.11658862981514437, + "learning_rate": 3.3985331165429236e-06, + "loss": 1.3604, + "step": 10625 + }, + { + "epoch": 0.9640718562874252, + "grad_norm": 0.1335436786449123, + "learning_rate": 3.38145290588876e-06, + "loss": 1.343, + "step": 10626 + }, + { + "epoch": 0.964162583923063, + "grad_norm": 0.11873695229198201, + "learning_rate": 3.364415578319291e-06, + "loss": 1.3697, + "step": 10627 + }, + { + "epoch": 0.9642533115587008, + "grad_norm": 0.12086977078042266, + "learning_rate": 3.3474211353057836e-06, + "loss": 1.3572, + "step": 10628 + }, + { + "epoch": 0.9643440391943386, + "grad_norm": 0.12520933922899147, + "learning_rate": 3.3304695783156202e-06, + "loss": 1.3401, + "step": 10629 + }, + { + "epoch": 0.9644347668299764, + "grad_norm": 0.12975840054542614, + "learning_rate": 3.3135609088126294e-06, + "loss": 1.349, + "step": 10630 + }, + { + "epoch": 0.9645254944656142, + "grad_norm": 0.13142447645732233, + "learning_rate": 3.2966951282568105e-06, + "loss": 1.3447, + "step": 10631 + }, + { + "epoch": 0.9646162221012521, + "grad_norm": 0.12171556607900033, + "learning_rate": 3.279872238104664e-06, + "loss": 1.3335, + "step": 10632 + }, + { + "epoch": 0.9647069497368899, + "grad_norm": 0.12111969025163523, + "learning_rate": 3.263092239808696e-06, + "loss": 1.318, + "step": 10633 + }, + { + "epoch": 0.9647976773725276, + "grad_norm": 0.11812801492637261, + "learning_rate": 3.2463551348179687e-06, + "loss": 1.3541, + "step": 10634 + }, + { + "epoch": 0.9648884050081655, + "grad_norm": 0.11745867489204972, + "learning_rate": 3.22966092457766e-06, + "loss": 1.3412, + "step": 10635 + }, + { + "epoch": 0.9649791326438033, + "grad_norm": 0.14174256597396526, + "learning_rate": 3.2130096105293383e-06, + "loss": 1.357, + "step": 10636 + }, + { + "epoch": 0.9650698602794411, + "grad_norm": 0.12386700942094885, + "learning_rate": 3.196401194110854e-06, + "loss": 1.3269, + "step": 10637 + }, + { + "epoch": 0.965160587915079, + "grad_norm": 0.1256628067201168, + "learning_rate": 3.1798356767563376e-06, + "loss": 1.3273, + "step": 10638 + }, + { + "epoch": 0.9652513155507167, + "grad_norm": 0.11693571517584576, + "learning_rate": 3.163313059896311e-06, + "loss": 1.3223, + "step": 10639 + }, + { + "epoch": 0.9653420431863545, + "grad_norm": 0.14530892383036034, + "learning_rate": 3.146833344957356e-06, + "loss": 1.3386, + "step": 10640 + }, + { + "epoch": 0.9654327708219924, + "grad_norm": 0.14139369161494658, + "learning_rate": 3.1303965333626116e-06, + "loss": 1.3792, + "step": 10641 + }, + { + "epoch": 0.9655234984576302, + "grad_norm": 0.12271735061963604, + "learning_rate": 3.1140026265313317e-06, + "loss": 1.3378, + "step": 10642 + }, + { + "epoch": 0.965614226093268, + "grad_norm": 0.1194685576558569, + "learning_rate": 3.097651625879161e-06, + "loss": 1.3589, + "step": 10643 + }, + { + "epoch": 0.9657049537289059, + "grad_norm": 0.12288303046321858, + 
"learning_rate": 3.081343532817971e-06, + "loss": 1.3624, + "step": 10644 + }, + { + "epoch": 0.9657956813645436, + "grad_norm": 0.13074906751683096, + "learning_rate": 3.0650783487560785e-06, + "loss": 1.3457, + "step": 10645 + }, + { + "epoch": 0.9658864090001814, + "grad_norm": 0.12997653614689741, + "learning_rate": 3.0488560750978613e-06, + "loss": 1.3709, + "step": 10646 + }, + { + "epoch": 0.9659771366358193, + "grad_norm": 0.12533834603137597, + "learning_rate": 3.032676713244198e-06, + "loss": 1.3232, + "step": 10647 + }, + { + "epoch": 0.9660678642714571, + "grad_norm": 0.12007721960212432, + "learning_rate": 3.016540264592138e-06, + "loss": 1.303, + "step": 10648 + }, + { + "epoch": 0.9661585919070949, + "grad_norm": 0.12352106986587928, + "learning_rate": 3.000446730535067e-06, + "loss": 1.282, + "step": 10649 + }, + { + "epoch": 0.9662493195427327, + "grad_norm": 0.13673947540995735, + "learning_rate": 2.9843961124626507e-06, + "loss": 1.3194, + "step": 10650 + }, + { + "epoch": 0.9663400471783705, + "grad_norm": 0.13300248693205946, + "learning_rate": 2.9683884117608916e-06, + "loss": 1.3534, + "step": 10651 + }, + { + "epoch": 0.9664307748140083, + "grad_norm": 0.11505972903825633, + "learning_rate": 2.9524236298120733e-06, + "loss": 1.365, + "step": 10652 + }, + { + "epoch": 0.9665215024496462, + "grad_norm": 0.1410262498337316, + "learning_rate": 2.9365017679947037e-06, + "loss": 1.3559, + "step": 10653 + }, + { + "epoch": 0.966612230085284, + "grad_norm": 0.12818632911099678, + "learning_rate": 2.9206228276836834e-06, + "loss": 1.3447, + "step": 10654 + }, + { + "epoch": 0.9667029577209217, + "grad_norm": 0.12272686887971186, + "learning_rate": 2.9047868102501372e-06, + "loss": 1.3105, + "step": 10655 + }, + { + "epoch": 0.9667936853565596, + "grad_norm": 0.12547635373938948, + "learning_rate": 2.888993717061528e-06, + "loss": 1.3311, + "step": 10656 + }, + { + "epoch": 0.9668844129921974, + "grad_norm": 0.1246868911428238, + "learning_rate": 2.8732435494815414e-06, + "loss": 1.3838, + "step": 10657 + }, + { + "epoch": 0.9669751406278352, + "grad_norm": 0.12724003333477935, + "learning_rate": 2.8575363088702566e-06, + "loss": 1.3487, + "step": 10658 + }, + { + "epoch": 0.9670658682634731, + "grad_norm": 0.11947673667287613, + "learning_rate": 2.8418719965840334e-06, + "loss": 1.36, + "step": 10659 + }, + { + "epoch": 0.9671565958991108, + "grad_norm": 0.14536951531739142, + "learning_rate": 2.826250613975345e-06, + "loss": 1.3573, + "step": 10660 + }, + { + "epoch": 0.9672473235347487, + "grad_norm": 0.1131398873125269, + "learning_rate": 2.8106721623932796e-06, + "loss": 1.3738, + "step": 10661 + }, + { + "epoch": 0.9673380511703865, + "grad_norm": 0.12013064435327125, + "learning_rate": 2.795136643182927e-06, + "loss": 1.317, + "step": 10662 + }, + { + "epoch": 0.9674287788060243, + "grad_norm": 0.15136011572893296, + "learning_rate": 2.7796440576857707e-06, + "loss": 1.3409, + "step": 10663 + }, + { + "epoch": 0.9675195064416622, + "grad_norm": 0.12998645254103772, + "learning_rate": 2.764194407239684e-06, + "loss": 1.3733, + "step": 10664 + }, + { + "epoch": 0.9676102340773, + "grad_norm": 0.13512675832129661, + "learning_rate": 2.7487876931786557e-06, + "loss": 1.3584, + "step": 10665 + }, + { + "epoch": 0.9677009617129377, + "grad_norm": 0.12367755452903083, + "learning_rate": 2.733423916833122e-06, + "loss": 1.3287, + "step": 10666 + }, + { + "epoch": 0.9677916893485756, + "grad_norm": 0.12398838408518505, + "learning_rate": 2.718103079529688e-06, + "loss": 1.3474, 
+ "step": 10667 + }, + { + "epoch": 0.9678824169842134, + "grad_norm": 0.13320096269674814, + "learning_rate": 2.702825182591351e-06, + "loss": 1.3316, + "step": 10668 + }, + { + "epoch": 0.9679731446198512, + "grad_norm": 0.11516994845906602, + "learning_rate": 2.6875902273373886e-06, + "loss": 1.3168, + "step": 10669 + }, + { + "epoch": 0.9680638722554891, + "grad_norm": 0.12259074313845064, + "learning_rate": 2.672398215083305e-06, + "loss": 1.3732, + "step": 10670 + }, + { + "epoch": 0.9681545998911268, + "grad_norm": 0.13931200248764464, + "learning_rate": 2.6572491471408834e-06, + "loss": 1.3584, + "step": 10671 + }, + { + "epoch": 0.9682453275267646, + "grad_norm": 0.11705708461801718, + "learning_rate": 2.6421430248183552e-06, + "loss": 1.3689, + "step": 10672 + }, + { + "epoch": 0.9683360551624025, + "grad_norm": 0.11540096332066765, + "learning_rate": 2.6270798494200664e-06, + "loss": 1.3637, + "step": 10673 + }, + { + "epoch": 0.9684267827980403, + "grad_norm": 0.1301787722577034, + "learning_rate": 2.612059622246754e-06, + "loss": 1.3455, + "step": 10674 + }, + { + "epoch": 0.9685175104336781, + "grad_norm": 0.13931761346159516, + "learning_rate": 2.5970823445953252e-06, + "loss": 1.3603, + "step": 10675 + }, + { + "epoch": 0.968608238069316, + "grad_norm": 0.12632798983057877, + "learning_rate": 2.5821480177591895e-06, + "loss": 1.3448, + "step": 10676 + }, + { + "epoch": 0.9686989657049537, + "grad_norm": 0.11760534567107934, + "learning_rate": 2.5672566430279266e-06, + "loss": 1.3778, + "step": 10677 + }, + { + "epoch": 0.9687896933405915, + "grad_norm": 0.11951194054845345, + "learning_rate": 2.552408221687286e-06, + "loss": 1.3724, + "step": 10678 + }, + { + "epoch": 0.9688804209762294, + "grad_norm": 0.11779539550730345, + "learning_rate": 2.5376027550195745e-06, + "loss": 1.3886, + "step": 10679 + }, + { + "epoch": 0.9689711486118672, + "grad_norm": 0.1306609558426599, + "learning_rate": 2.5228402443031595e-06, + "loss": 1.3482, + "step": 10680 + }, + { + "epoch": 0.969061876247505, + "grad_norm": 0.11864387791126577, + "learning_rate": 2.508120690812854e-06, + "loss": 1.3224, + "step": 10681 + }, + { + "epoch": 0.9691526038831428, + "grad_norm": 0.1212366102032812, + "learning_rate": 2.493444095819586e-06, + "loss": 1.3525, + "step": 10682 + }, + { + "epoch": 0.9692433315187806, + "grad_norm": 0.12052007318142625, + "learning_rate": 2.4788104605907302e-06, + "loss": 1.3548, + "step": 10683 + }, + { + "epoch": 0.9693340591544184, + "grad_norm": 0.11924891220742154, + "learning_rate": 2.4642197863899986e-06, + "loss": 1.3723, + "step": 10684 + }, + { + "epoch": 0.9694247867900563, + "grad_norm": 0.14581131791023, + "learning_rate": 2.4496720744771605e-06, + "loss": 1.3736, + "step": 10685 + }, + { + "epoch": 0.9695155144256941, + "grad_norm": 0.11669808700278436, + "learning_rate": 2.4351673261084895e-06, + "loss": 1.349, + "step": 10686 + }, + { + "epoch": 0.9696062420613318, + "grad_norm": 0.12107796512537873, + "learning_rate": 2.420705542536483e-06, + "loss": 1.3823, + "step": 10687 + }, + { + "epoch": 0.9696969696969697, + "grad_norm": 0.12485480154736789, + "learning_rate": 2.4062867250098097e-06, + "loss": 1.3509, + "step": 10688 + }, + { + "epoch": 0.9697876973326075, + "grad_norm": 0.13842356610660028, + "learning_rate": 2.391910874773695e-06, + "loss": 1.3213, + "step": 10689 + }, + { + "epoch": 0.9698784249682453, + "grad_norm": 0.16712224580976107, + "learning_rate": 2.3775779930693686e-06, + "loss": 1.3472, + "step": 10690 + }, + { + "epoch": 
0.9699691526038832, + "grad_norm": 0.12763131792971558, + "learning_rate": 2.3632880811345624e-06, + "loss": 1.3371, + "step": 10691 + }, + { + "epoch": 0.9700598802395209, + "grad_norm": 0.1226808501959324, + "learning_rate": 2.349041140203123e-06, + "loss": 1.3275, + "step": 10692 + }, + { + "epoch": 0.9701506078751587, + "grad_norm": 0.1220494604708677, + "learning_rate": 2.334837171505344e-06, + "loss": 1.3258, + "step": 10693 + }, + { + "epoch": 0.9702413355107966, + "grad_norm": 0.12371767414159174, + "learning_rate": 2.320676176267744e-06, + "loss": 1.3601, + "step": 10694 + }, + { + "epoch": 0.9703320631464344, + "grad_norm": 0.11216048174564604, + "learning_rate": 2.306558155713068e-06, + "loss": 1.3507, + "step": 10695 + }, + { + "epoch": 0.9704227907820722, + "grad_norm": 0.11812716764675882, + "learning_rate": 2.292483111060506e-06, + "loss": 1.3369, + "step": 10696 + }, + { + "epoch": 0.97051351841771, + "grad_norm": 0.1318754093386791, + "learning_rate": 2.278451043525365e-06, + "loss": 1.3412, + "step": 10697 + }, + { + "epoch": 0.9706042460533478, + "grad_norm": 0.11831591759056488, + "learning_rate": 2.264461954319341e-06, + "loss": 1.3575, + "step": 10698 + }, + { + "epoch": 0.9706949736889857, + "grad_norm": 0.12422283372540911, + "learning_rate": 2.2505158446503025e-06, + "loss": 1.3652, + "step": 10699 + }, + { + "epoch": 0.9707857013246235, + "grad_norm": 0.11847816703717706, + "learning_rate": 2.2366127157226745e-06, + "loss": 1.3345, + "step": 10700 + }, + { + "epoch": 0.9708764289602613, + "grad_norm": 0.1873108236629649, + "learning_rate": 2.2227525687368856e-06, + "loss": 1.3252, + "step": 10701 + }, + { + "epoch": 0.9709671565958992, + "grad_norm": 0.12814075100971054, + "learning_rate": 2.208935404889756e-06, + "loss": 1.3278, + "step": 10702 + }, + { + "epoch": 0.9710578842315369, + "grad_norm": 0.12237790020708719, + "learning_rate": 2.1951612253744424e-06, + "loss": 1.3489, + "step": 10703 + }, + { + "epoch": 0.9711486118671747, + "grad_norm": 0.12371323013172968, + "learning_rate": 2.181430031380327e-06, + "loss": 1.3764, + "step": 10704 + }, + { + "epoch": 0.9712393395028126, + "grad_norm": 0.12318610917616307, + "learning_rate": 2.1677418240930725e-06, + "loss": 1.3661, + "step": 10705 + }, + { + "epoch": 0.9713300671384504, + "grad_norm": 0.12313354194437905, + "learning_rate": 2.1540966046946774e-06, + "loss": 1.3302, + "step": 10706 + }, + { + "epoch": 0.9714207947740882, + "grad_norm": 0.1365737973042286, + "learning_rate": 2.140494374363422e-06, + "loss": 1.3483, + "step": 10707 + }, + { + "epoch": 0.971511522409726, + "grad_norm": 0.11975319223240423, + "learning_rate": 2.126935134273866e-06, + "loss": 1.3259, + "step": 10708 + }, + { + "epoch": 0.9716022500453638, + "grad_norm": 0.1178181823924696, + "learning_rate": 2.1134188855968518e-06, + "loss": 1.3254, + "step": 10709 + }, + { + "epoch": 0.9716929776810016, + "grad_norm": 0.12166293097133758, + "learning_rate": 2.0999456294995002e-06, + "loss": 1.3453, + "step": 10710 + }, + { + "epoch": 0.9717837053166395, + "grad_norm": 0.12409398381343438, + "learning_rate": 2.0865153671452143e-06, + "loss": 1.3288, + "step": 10711 + }, + { + "epoch": 0.9718744329522773, + "grad_norm": 0.13567679933099008, + "learning_rate": 2.0731280996936773e-06, + "loss": 1.3456, + "step": 10712 + }, + { + "epoch": 0.971965160587915, + "grad_norm": 0.13670417858165582, + "learning_rate": 2.059783828300965e-06, + "loss": 1.3426, + "step": 10713 + }, + { + "epoch": 0.9720558882235529, + "grad_norm": 0.1341359712059183, + 
"learning_rate": 2.0464825541193222e-06, + "loss": 1.352, + "step": 10714 + }, + { + "epoch": 0.9721466158591907, + "grad_norm": 0.12374331260521856, + "learning_rate": 2.0332242782972743e-06, + "loss": 1.3954, + "step": 10715 + }, + { + "epoch": 0.9722373434948285, + "grad_norm": 0.13352841090018513, + "learning_rate": 2.0200090019796835e-06, + "loss": 1.3673, + "step": 10716 + }, + { + "epoch": 0.9723280711304664, + "grad_norm": 0.12471519648967672, + "learning_rate": 2.0068367263076924e-06, + "loss": 1.3228, + "step": 10717 + }, + { + "epoch": 0.9724187987661042, + "grad_norm": 0.1258739239119795, + "learning_rate": 1.9937074524188358e-06, + "loss": 1.3522, + "step": 10718 + }, + { + "epoch": 0.9725095264017419, + "grad_norm": 0.14699652714665273, + "learning_rate": 1.9806211814466513e-06, + "loss": 1.333, + "step": 10719 + }, + { + "epoch": 0.9726002540373798, + "grad_norm": 0.1958554987207165, + "learning_rate": 1.96757791452129e-06, + "loss": 1.3512, + "step": 10720 + }, + { + "epoch": 0.9726909816730176, + "grad_norm": 0.19926580264783664, + "learning_rate": 1.954577652768963e-06, + "loss": 1.3202, + "step": 10721 + }, + { + "epoch": 0.9727817093086554, + "grad_norm": 0.1432123573238963, + "learning_rate": 1.941620397312216e-06, + "loss": 1.3488, + "step": 10722 + }, + { + "epoch": 0.9728724369442933, + "grad_norm": 0.13276855282004876, + "learning_rate": 1.9287061492699875e-06, + "loss": 1.3154, + "step": 10723 + }, + { + "epoch": 0.972963164579931, + "grad_norm": 0.12183062288043953, + "learning_rate": 1.9158349097573858e-06, + "loss": 1.341, + "step": 10724 + }, + { + "epoch": 0.9730538922155688, + "grad_norm": 0.12065530874741567, + "learning_rate": 1.9030066798858548e-06, + "loss": 1.3352, + "step": 10725 + }, + { + "epoch": 0.9731446198512067, + "grad_norm": 0.12256962338194877, + "learning_rate": 1.890221460763064e-06, + "loss": 1.3636, + "step": 10726 + }, + { + "epoch": 0.9732353474868445, + "grad_norm": 0.12253349381761611, + "learning_rate": 1.8774792534931306e-06, + "loss": 1.3607, + "step": 10727 + }, + { + "epoch": 0.9733260751224823, + "grad_norm": 0.12556047400549228, + "learning_rate": 1.8647800591762855e-06, + "loss": 1.3564, + "step": 10728 + }, + { + "epoch": 0.9734168027581201, + "grad_norm": 0.12098253850913081, + "learning_rate": 1.85212387890904e-06, + "loss": 1.3722, + "step": 10729 + }, + { + "epoch": 0.9735075303937579, + "grad_norm": 0.12193548333033542, + "learning_rate": 1.8395107137843537e-06, + "loss": 1.3762, + "step": 10730 + }, + { + "epoch": 0.9735982580293957, + "grad_norm": 0.12115987928985275, + "learning_rate": 1.8269405648912995e-06, + "loss": 1.3262, + "step": 10731 + }, + { + "epoch": 0.9736889856650336, + "grad_norm": 0.12313090683127281, + "learning_rate": 1.8144134333153983e-06, + "loss": 1.355, + "step": 10732 + }, + { + "epoch": 0.9737797133006714, + "grad_norm": 0.12090241771225047, + "learning_rate": 1.8019293201383403e-06, + "loss": 1.3361, + "step": 10733 + }, + { + "epoch": 0.9738704409363091, + "grad_norm": 0.1208518983385848, + "learning_rate": 1.7894882264380408e-06, + "loss": 1.338, + "step": 10734 + }, + { + "epoch": 0.973961168571947, + "grad_norm": 0.11773426805019345, + "learning_rate": 1.7770901532889183e-06, + "loss": 1.3707, + "step": 10735 + }, + { + "epoch": 0.9740518962075848, + "grad_norm": 0.12067398660285572, + "learning_rate": 1.76473510176145e-06, + "loss": 1.3678, + "step": 10736 + }, + { + "epoch": 0.9741426238432227, + "grad_norm": 0.12294417903797426, + "learning_rate": 1.75242307292256e-06, + "loss": 
1.3042, + "step": 10737 + }, + { + "epoch": 0.9742333514788605, + "grad_norm": 0.1272884125827376, + "learning_rate": 1.7401540678353977e-06, + "loss": 1.3735, + "step": 10738 + }, + { + "epoch": 0.9743240791144983, + "grad_norm": 0.11454961517580013, + "learning_rate": 1.7279280875593383e-06, + "loss": 1.3298, + "step": 10739 + }, + { + "epoch": 0.9744148067501361, + "grad_norm": 0.12352962675378786, + "learning_rate": 1.7157451331500928e-06, + "loss": 1.3492, + "step": 10740 + }, + { + "epoch": 0.9745055343857739, + "grad_norm": 0.135463766772337, + "learning_rate": 1.703605205659764e-06, + "loss": 1.332, + "step": 10741 + }, + { + "epoch": 0.9745962620214117, + "grad_norm": 0.11772658376751349, + "learning_rate": 1.691508306136569e-06, + "loss": 1.3383, + "step": 10742 + }, + { + "epoch": 0.9746869896570496, + "grad_norm": 0.12264068701563775, + "learning_rate": 1.6794544356250052e-06, + "loss": 1.3504, + "step": 10743 + }, + { + "epoch": 0.9747777172926874, + "grad_norm": 0.11670045309255746, + "learning_rate": 1.6674435951660182e-06, + "loss": 1.3644, + "step": 10744 + }, + { + "epoch": 0.9748684449283251, + "grad_norm": 0.11514862263915047, + "learning_rate": 1.6554757857967783e-06, + "loss": 1.3245, + "step": 10745 + }, + { + "epoch": 0.974959172563963, + "grad_norm": 0.12986837674926502, + "learning_rate": 1.6435510085505701e-06, + "loss": 1.3594, + "step": 10746 + }, + { + "epoch": 0.9750499001996008, + "grad_norm": 0.12105333541593727, + "learning_rate": 1.631669264457236e-06, + "loss": 1.3516, + "step": 10747 + }, + { + "epoch": 0.9751406278352386, + "grad_norm": 0.11922623419256019, + "learning_rate": 1.6198305545426784e-06, + "loss": 1.3916, + "step": 10748 + }, + { + "epoch": 0.9752313554708765, + "grad_norm": 0.12479751699429936, + "learning_rate": 1.6080348798292454e-06, + "loss": 1.3581, + "step": 10749 + }, + { + "epoch": 0.9753220831065142, + "grad_norm": 0.12251092525779148, + "learning_rate": 1.596282241335456e-06, + "loss": 1.3391, + "step": 10750 + }, + { + "epoch": 0.975412810742152, + "grad_norm": 0.11965193863434716, + "learning_rate": 1.5845726400761096e-06, + "loss": 1.342, + "step": 10751 + }, + { + "epoch": 0.9755035383777899, + "grad_norm": 0.12194022595124933, + "learning_rate": 1.5729060770624525e-06, + "loss": 1.3186, + "step": 10752 + }, + { + "epoch": 0.9755942660134277, + "grad_norm": 0.1266400321993326, + "learning_rate": 1.561282553301735e-06, + "loss": 1.3547, + "step": 10753 + }, + { + "epoch": 0.9756849936490655, + "grad_norm": 0.12012910997775589, + "learning_rate": 1.5497020697977648e-06, + "loss": 1.3721, + "step": 10754 + }, + { + "epoch": 0.9757757212847034, + "grad_norm": 0.11753699860536819, + "learning_rate": 1.5381646275504645e-06, + "loss": 1.3448, + "step": 10755 + }, + { + "epoch": 0.9758664489203411, + "grad_norm": 0.11748800149464293, + "learning_rate": 1.5266702275560928e-06, + "loss": 1.3637, + "step": 10756 + }, + { + "epoch": 0.9759571765559789, + "grad_norm": 0.11707445125864867, + "learning_rate": 1.5152188708073e-06, + "loss": 1.3459, + "step": 10757 + }, + { + "epoch": 0.9760479041916168, + "grad_norm": 0.12025976150355827, + "learning_rate": 1.5038105582927952e-06, + "loss": 1.3396, + "step": 10758 + }, + { + "epoch": 0.9761386318272546, + "grad_norm": 0.12301140871347442, + "learning_rate": 1.4924452909976793e-06, + "loss": 1.3782, + "step": 10759 + }, + { + "epoch": 0.9762293594628924, + "grad_norm": 0.12974775625110696, + "learning_rate": 1.4811230699033895e-06, + "loss": 1.3628, + "step": 10760 + }, + { + "epoch": 
0.9763200870985302, + "grad_norm": 0.11589165971905446, + "learning_rate": 1.4698438959876436e-06, + "loss": 1.3464, + "step": 10761 + }, + { + "epoch": 0.976410814734168, + "grad_norm": 0.11656453106813665, + "learning_rate": 1.4586077702243294e-06, + "loss": 1.3507, + "step": 10762 + }, + { + "epoch": 0.9765015423698058, + "grad_norm": 0.1155733872889947, + "learning_rate": 1.4474146935836706e-06, + "loss": 1.3055, + "step": 10763 + }, + { + "epoch": 0.9765922700054437, + "grad_norm": 0.12665103082586435, + "learning_rate": 1.4362646670322831e-06, + "loss": 1.4093, + "step": 10764 + }, + { + "epoch": 0.9766829976410815, + "grad_norm": 0.12462394040589489, + "learning_rate": 1.4251576915328967e-06, + "loss": 1.3929, + "step": 10765 + }, + { + "epoch": 0.9767737252767192, + "grad_norm": 0.11469779683170724, + "learning_rate": 1.4140937680446331e-06, + "loss": 1.3815, + "step": 10766 + }, + { + "epoch": 0.9768644529123571, + "grad_norm": 0.11525143718011159, + "learning_rate": 1.4030728975228946e-06, + "loss": 1.303, + "step": 10767 + }, + { + "epoch": 0.9769551805479949, + "grad_norm": 0.12024122563468741, + "learning_rate": 1.3920950809192534e-06, + "loss": 1.4011, + "step": 10768 + }, + { + "epoch": 0.9770459081836327, + "grad_norm": 0.1266272062977772, + "learning_rate": 1.3811603191816735e-06, + "loss": 1.3292, + "step": 10769 + }, + { + "epoch": 0.9771366358192706, + "grad_norm": 0.12068394924705103, + "learning_rate": 1.3702686132543996e-06, + "loss": 1.3434, + "step": 10770 + }, + { + "epoch": 0.9772273634549083, + "grad_norm": 0.12771655924762632, + "learning_rate": 1.359419964077957e-06, + "loss": 1.3891, + "step": 10771 + }, + { + "epoch": 0.9773180910905461, + "grad_norm": 0.12257612473869069, + "learning_rate": 1.3486143725890965e-06, + "loss": 1.3434, + "step": 10772 + }, + { + "epoch": 0.977408818726184, + "grad_norm": 0.13094975351004495, + "learning_rate": 1.3378518397208494e-06, + "loss": 1.3531, + "step": 10773 + }, + { + "epoch": 0.9774995463618218, + "grad_norm": 0.13920214297743605, + "learning_rate": 1.3271323664025836e-06, + "loss": 1.3607, + "step": 10774 + }, + { + "epoch": 0.9775902739974597, + "grad_norm": 0.1206816908486294, + "learning_rate": 1.3164559535599474e-06, + "loss": 1.352, + "step": 10775 + }, + { + "epoch": 0.9776810016330975, + "grad_norm": 0.12251263777651868, + "learning_rate": 1.3058226021148146e-06, + "loss": 1.3438, + "step": 10776 + }, + { + "epoch": 0.9777717292687352, + "grad_norm": 0.12724011879358005, + "learning_rate": 1.2952323129854504e-06, + "loss": 1.3434, + "step": 10777 + }, + { + "epoch": 0.9778624569043731, + "grad_norm": 0.11754434586042588, + "learning_rate": 1.2846850870862347e-06, + "loss": 1.3226, + "step": 10778 + }, + { + "epoch": 0.9779531845400109, + "grad_norm": 0.12779482098668002, + "learning_rate": 1.2741809253279946e-06, + "loss": 1.3152, + "step": 10779 + }, + { + "epoch": 0.9780439121756487, + "grad_norm": 0.13185745663843504, + "learning_rate": 1.2637198286177265e-06, + "loss": 1.3441, + "step": 10780 + }, + { + "epoch": 0.9781346398112866, + "grad_norm": 0.12732558871617025, + "learning_rate": 1.2533017978587635e-06, + "loss": 1.3223, + "step": 10781 + }, + { + "epoch": 0.9782253674469243, + "grad_norm": 0.1168127610124899, + "learning_rate": 1.2429268339506639e-06, + "loss": 1.3257, + "step": 10782 + }, + { + "epoch": 0.9783160950825621, + "grad_norm": 0.12228545124638428, + "learning_rate": 1.2325949377893775e-06, + "loss": 1.3369, + "step": 10783 + }, + { + "epoch": 0.9784068227182, + "grad_norm": 
2.0410012825008894, + "learning_rate": 1.222306110267024e-06, + "loss": 1.3699, + "step": 10784 + }, + { + "epoch": 0.9784975503538378, + "grad_norm": 0.11943886285478143, + "learning_rate": 1.2120603522720043e-06, + "loss": 1.3567, + "step": 10785 + }, + { + "epoch": 0.9785882779894756, + "grad_norm": 0.12858923616092122, + "learning_rate": 1.2018576646891655e-06, + "loss": 1.3618, + "step": 10786 + }, + { + "epoch": 0.9786790056251135, + "grad_norm": 0.13513512962365803, + "learning_rate": 1.191698048399359e-06, + "loss": 1.3889, + "step": 10787 + }, + { + "epoch": 0.9787697332607512, + "grad_norm": 0.14412825290775017, + "learning_rate": 1.1815815042799938e-06, + "loss": 1.3459, + "step": 10788 + }, + { + "epoch": 0.978860460896389, + "grad_norm": 0.13837436435368436, + "learning_rate": 1.1715080332045936e-06, + "loss": 1.3577, + "step": 10789 + }, + { + "epoch": 0.9789511885320269, + "grad_norm": 0.11607948878362999, + "learning_rate": 1.1614776360429624e-06, + "loss": 1.326, + "step": 10790 + }, + { + "epoch": 0.9790419161676647, + "grad_norm": 0.12486848706166558, + "learning_rate": 1.151490313661241e-06, + "loss": 1.3622, + "step": 10791 + }, + { + "epoch": 0.9791326438033024, + "grad_norm": 0.12363278826598653, + "learning_rate": 1.1415460669218503e-06, + "loss": 1.365, + "step": 10792 + }, + { + "epoch": 0.9792233714389403, + "grad_norm": 0.11722522605889797, + "learning_rate": 1.1316448966835479e-06, + "loss": 1.336, + "step": 10793 + }, + { + "epoch": 0.9793140990745781, + "grad_norm": 0.11666328775531652, + "learning_rate": 1.1217868038011503e-06, + "loss": 1.3505, + "step": 10794 + }, + { + "epoch": 0.9794048267102159, + "grad_norm": 0.12131704634549201, + "learning_rate": 1.1119717891260872e-06, + "loss": 1.3299, + "step": 10795 + }, + { + "epoch": 0.9794955543458538, + "grad_norm": 0.12792007996212693, + "learning_rate": 1.1021998535057365e-06, + "loss": 1.3562, + "step": 10796 + }, + { + "epoch": 0.9795862819814916, + "grad_norm": 0.11778795163619542, + "learning_rate": 1.0924709977839231e-06, + "loss": 1.3459, + "step": 10797 + }, + { + "epoch": 0.9796770096171293, + "grad_norm": 0.12448373397567182, + "learning_rate": 1.0827852228008085e-06, + "loss": 1.3133, + "step": 10798 + }, + { + "epoch": 0.9797677372527672, + "grad_norm": 0.12289284726614484, + "learning_rate": 1.0731425293927232e-06, + "loss": 1.3246, + "step": 10799 + }, + { + "epoch": 0.979858464888405, + "grad_norm": 0.12365865232312341, + "learning_rate": 1.0635429183922795e-06, + "loss": 1.3476, + "step": 10800 + }, + { + "epoch": 0.9799491925240428, + "grad_norm": 0.1184228025648867, + "learning_rate": 1.0539863906284808e-06, + "loss": 1.3322, + "step": 10801 + }, + { + "epoch": 0.9800399201596807, + "grad_norm": 0.1168080602637793, + "learning_rate": 1.0444729469265001e-06, + "loss": 1.3365, + "step": 10802 + }, + { + "epoch": 0.9801306477953184, + "grad_norm": 0.12122804323245073, + "learning_rate": 1.0350025881077918e-06, + "loss": 1.3347, + "step": 10803 + }, + { + "epoch": 0.9802213754309562, + "grad_norm": 0.12058990645926193, + "learning_rate": 1.025575314990146e-06, + "loss": 1.3374, + "step": 10804 + }, + { + "epoch": 0.9803121030665941, + "grad_norm": 0.11858397841645862, + "learning_rate": 1.0161911283876335e-06, + "loss": 1.333, + "step": 10805 + }, + { + "epoch": 0.9804028307022319, + "grad_norm": 0.12236088236814688, + "learning_rate": 1.0068500291105509e-06, + "loss": 1.3246, + "step": 10806 + }, + { + "epoch": 0.9804935583378697, + "grad_norm": 0.12389734905889704, + "learning_rate": 
9.975520179655306e-07, + "loss": 1.3494, + "step": 10807 + }, + { + "epoch": 0.9805842859735076, + "grad_norm": 0.12183088138067506, + "learning_rate": 9.882970957554304e-07, + "loss": 1.3607, + "step": 10808 + }, + { + "epoch": 0.9806750136091453, + "grad_norm": 0.12784465846015677, + "learning_rate": 9.79085263279389e-07, + "loss": 1.3473, + "step": 10809 + }, + { + "epoch": 0.9807657412447831, + "grad_norm": 0.11669843145699295, + "learning_rate": 9.699165213329364e-07, + "loss": 1.3407, + "step": 10810 + }, + { + "epoch": 0.980856468880421, + "grad_norm": 0.12575829221018095, + "learning_rate": 9.607908707077173e-07, + "loss": 1.2913, + "step": 10811 + }, + { + "epoch": 0.9809471965160588, + "grad_norm": 0.12878312300410477, + "learning_rate": 9.517083121917681e-07, + "loss": 1.3822, + "step": 10812 + }, + { + "epoch": 0.9810379241516967, + "grad_norm": 0.12176980796744862, + "learning_rate": 9.4266884656935e-07, + "loss": 1.3582, + "step": 10813 + }, + { + "epoch": 0.9811286517873344, + "grad_norm": 0.12298668286826085, + "learning_rate": 9.336724746210056e-07, + "loss": 1.3463, + "step": 10814 + }, + { + "epoch": 0.9812193794229722, + "grad_norm": 0.1344571261003437, + "learning_rate": 9.247191971236135e-07, + "loss": 1.3585, + "step": 10815 + }, + { + "epoch": 0.9813101070586101, + "grad_norm": 0.1359928607653353, + "learning_rate": 9.158090148502774e-07, + "loss": 1.3454, + "step": 10816 + }, + { + "epoch": 0.9814008346942479, + "grad_norm": 0.1211594497421441, + "learning_rate": 9.069419285703817e-07, + "loss": 1.3504, + "step": 10817 + }, + { + "epoch": 0.9814915623298857, + "grad_norm": 0.1253695866154722, + "learning_rate": 8.981179390496474e-07, + "loss": 1.3646, + "step": 10818 + }, + { + "epoch": 0.9815822899655235, + "grad_norm": 0.11848889122164698, + "learning_rate": 8.893370470499651e-07, + "loss": 1.3357, + "step": 10819 + }, + { + "epoch": 0.9816730176011613, + "grad_norm": 0.12595270506659928, + "learning_rate": 8.805992533295615e-07, + "loss": 1.3507, + "step": 10820 + }, + { + "epoch": 0.9817637452367991, + "grad_norm": 0.11621264520081603, + "learning_rate": 8.719045586429996e-07, + "loss": 1.3469, + "step": 10821 + }, + { + "epoch": 0.981854472872437, + "grad_norm": 0.1234730196504807, + "learning_rate": 8.63252963741068e-07, + "loss": 1.3445, + "step": 10822 + }, + { + "epoch": 0.9819452005080748, + "grad_norm": 0.11649943897907582, + "learning_rate": 8.5464446937078e-07, + "loss": 1.3434, + "step": 10823 + }, + { + "epoch": 0.9820359281437125, + "grad_norm": 0.12472708797336848, + "learning_rate": 8.460790762754856e-07, + "loss": 1.3216, + "step": 10824 + }, + { + "epoch": 0.9821266557793504, + "grad_norm": 0.12280305315367403, + "learning_rate": 8.375567851948707e-07, + "loss": 1.358, + "step": 10825 + }, + { + "epoch": 0.9822173834149882, + "grad_norm": 0.11951649713158513, + "learning_rate": 8.290775968647912e-07, + "loss": 1.3465, + "step": 10826 + }, + { + "epoch": 0.982308111050626, + "grad_norm": 0.12590599908531325, + "learning_rate": 8.206415120174393e-07, + "loss": 1.3616, + "step": 10827 + }, + { + "epoch": 0.9823988386862639, + "grad_norm": 0.11840533184586723, + "learning_rate": 8.122485313812323e-07, + "loss": 1.3189, + "step": 10828 + }, + { + "epoch": 0.9824895663219017, + "grad_norm": 0.12155241544665671, + "learning_rate": 8.038986556809236e-07, + "loss": 1.3129, + "step": 10829 + }, + { + "epoch": 0.9825802939575394, + "grad_norm": 0.1192772483205827, + "learning_rate": 7.955918856376033e-07, + "loss": 1.3524, + "step": 10830 + }, + { + 
"epoch": 0.9826710215931773, + "grad_norm": 0.11821629665535777, + "learning_rate": 7.873282219684197e-07, + "loss": 1.3336, + "step": 10831 + }, + { + "epoch": 0.9827617492288151, + "grad_norm": 0.12202616345732534, + "learning_rate": 7.791076653870799e-07, + "loss": 1.3706, + "step": 10832 + }, + { + "epoch": 0.9828524768644529, + "grad_norm": 0.1465502237331008, + "learning_rate": 7.709302166033494e-07, + "loss": 1.3551, + "step": 10833 + }, + { + "epoch": 0.9829432045000908, + "grad_norm": 0.1198119355501596, + "learning_rate": 7.627958763233855e-07, + "loss": 1.3798, + "step": 10834 + }, + { + "epoch": 0.9830339321357285, + "grad_norm": 0.12144324590259364, + "learning_rate": 7.547046452495709e-07, + "loss": 1.3691, + "step": 10835 + }, + { + "epoch": 0.9831246597713663, + "grad_norm": 0.12349340530802168, + "learning_rate": 7.466565240806244e-07, + "loss": 1.3624, + "step": 10836 + }, + { + "epoch": 0.9832153874070042, + "grad_norm": 0.12695921655709633, + "learning_rate": 7.386515135114347e-07, + "loss": 1.3291, + "step": 10837 + }, + { + "epoch": 0.983306115042642, + "grad_norm": 0.12260021179454668, + "learning_rate": 7.306896142332819e-07, + "loss": 1.3734, + "step": 10838 + }, + { + "epoch": 0.9833968426782798, + "grad_norm": 0.13651582371998594, + "learning_rate": 7.227708269336164e-07, + "loss": 1.3344, + "step": 10839 + }, + { + "epoch": 0.9834875703139176, + "grad_norm": 0.1171543450693776, + "learning_rate": 7.148951522963353e-07, + "loss": 1.366, + "step": 10840 + }, + { + "epoch": 0.9835782979495554, + "grad_norm": 0.11958818279879435, + "learning_rate": 7.070625910014506e-07, + "loss": 1.3245, + "step": 10841 + }, + { + "epoch": 0.9836690255851932, + "grad_norm": 0.12661361241028832, + "learning_rate": 6.992731437252542e-07, + "loss": 1.3352, + "step": 10842 + }, + { + "epoch": 0.9837597532208311, + "grad_norm": 0.13772514547454326, + "learning_rate": 6.915268111404305e-07, + "loss": 1.3165, + "step": 10843 + }, + { + "epoch": 0.9838504808564689, + "grad_norm": 0.12260024857724042, + "learning_rate": 6.838235939158887e-07, + "loss": 1.3379, + "step": 10844 + }, + { + "epoch": 0.9839412084921066, + "grad_norm": 0.12338226039551506, + "learning_rate": 6.761634927167081e-07, + "loss": 1.3501, + "step": 10845 + }, + { + "epoch": 0.9840319361277445, + "grad_norm": 0.11769244123748773, + "learning_rate": 6.685465082044707e-07, + "loss": 1.3736, + "step": 10846 + }, + { + "epoch": 0.9841226637633823, + "grad_norm": 0.11847644131447208, + "learning_rate": 6.609726410367611e-07, + "loss": 1.3528, + "step": 10847 + }, + { + "epoch": 0.9842133913990201, + "grad_norm": 0.12116253907446566, + "learning_rate": 6.534418918677232e-07, + "loss": 1.3478, + "step": 10848 + }, + { + "epoch": 0.984304119034658, + "grad_norm": 0.14768690123112052, + "learning_rate": 6.459542613475588e-07, + "loss": 1.3878, + "step": 10849 + }, + { + "epoch": 0.9843948466702958, + "grad_norm": 0.13600376059769917, + "learning_rate": 6.385097501228066e-07, + "loss": 1.3743, + "step": 10850 + }, + { + "epoch": 0.9844855743059336, + "grad_norm": 0.12316292656089806, + "learning_rate": 6.311083588363963e-07, + "loss": 1.3633, + "step": 10851 + }, + { + "epoch": 0.9845763019415714, + "grad_norm": 0.12351650010931114, + "learning_rate": 6.237500881273173e-07, + "loss": 1.3502, + "step": 10852 + }, + { + "epoch": 0.9846670295772092, + "grad_norm": 0.11606449827956604, + "learning_rate": 6.164349386310609e-07, + "loss": 1.3426, + "step": 10853 + }, + { + "epoch": 0.9847577572128471, + "grad_norm": 0.12744594466651768, 
+ "learning_rate": 6.091629109792329e-07, + "loss": 1.3659, + "step": 10854 + }, + { + "epoch": 0.9848484848484849, + "grad_norm": 0.11316083744522502, + "learning_rate": 6.019340057997757e-07, + "loss": 1.37, + "step": 10855 + }, + { + "epoch": 0.9849392124841226, + "grad_norm": 0.12561801929064423, + "learning_rate": 5.947482237169677e-07, + "loss": 1.3964, + "step": 10856 + }, + { + "epoch": 0.9850299401197605, + "grad_norm": 0.11981359185233757, + "learning_rate": 5.876055653512013e-07, + "loss": 1.3426, + "step": 10857 + }, + { + "epoch": 0.9851206677553983, + "grad_norm": 0.11652086760038936, + "learning_rate": 5.805060313193167e-07, + "loss": 1.3668, + "step": 10858 + }, + { + "epoch": 0.9852113953910361, + "grad_norm": 0.12299662551220736, + "learning_rate": 5.734496222343788e-07, + "loss": 1.3501, + "step": 10859 + }, + { + "epoch": 0.985302123026674, + "grad_norm": 0.1296065152391894, + "learning_rate": 5.664363387056226e-07, + "loss": 1.3395, + "step": 10860 + }, + { + "epoch": 0.9853928506623117, + "grad_norm": 0.12344589321716277, + "learning_rate": 5.594661813387303e-07, + "loss": 1.3351, + "step": 10861 + }, + { + "epoch": 0.9854835782979495, + "grad_norm": 0.11351301012985283, + "learning_rate": 5.525391507355537e-07, + "loss": 1.3358, + "step": 10862 + }, + { + "epoch": 0.9855743059335874, + "grad_norm": 0.13318398555792577, + "learning_rate": 5.4565524749417e-07, + "loss": 1.3286, + "step": 10863 + }, + { + "epoch": 0.9856650335692252, + "grad_norm": 0.15015866640495168, + "learning_rate": 5.388144722091592e-07, + "loss": 1.3369, + "step": 10864 + }, + { + "epoch": 0.985755761204863, + "grad_norm": 0.12404553693918242, + "learning_rate": 5.320168254710489e-07, + "loss": 1.3456, + "step": 10865 + }, + { + "epoch": 0.9858464888405009, + "grad_norm": 0.12037338841044663, + "learning_rate": 5.25262307866925e-07, + "loss": 1.3927, + "step": 10866 + }, + { + "epoch": 0.9859372164761386, + "grad_norm": 0.12060438209452001, + "learning_rate": 5.185509199800431e-07, + "loss": 1.3604, + "step": 10867 + }, + { + "epoch": 0.9860279441117764, + "grad_norm": 0.16997750715247847, + "learning_rate": 5.118826623898842e-07, + "loss": 1.3365, + "step": 10868 + }, + { + "epoch": 0.9861186717474143, + "grad_norm": 0.1270470760266966, + "learning_rate": 5.052575356722656e-07, + "loss": 1.3308, + "step": 10869 + }, + { + "epoch": 0.9862093993830521, + "grad_norm": 0.12597897140186295, + "learning_rate": 4.986755403992849e-07, + "loss": 1.3761, + "step": 10870 + }, + { + "epoch": 0.9863001270186899, + "grad_norm": 0.1232259577505349, + "learning_rate": 4.92136677139321e-07, + "loss": 1.3871, + "step": 10871 + }, + { + "epoch": 0.9863908546543277, + "grad_norm": 0.1634502074010743, + "learning_rate": 4.856409464569222e-07, + "loss": 1.3458, + "step": 10872 + }, + { + "epoch": 0.9864815822899655, + "grad_norm": 0.1249711444972125, + "learning_rate": 4.791883489130843e-07, + "loss": 1.3083, + "step": 10873 + }, + { + "epoch": 0.9865723099256033, + "grad_norm": 0.11949223126123822, + "learning_rate": 4.727788850649173e-07, + "loss": 1.3268, + "step": 10874 + }, + { + "epoch": 0.9866630375612412, + "grad_norm": 0.1227847100883984, + "learning_rate": 4.6641255546597814e-07, + "loss": 1.3856, + "step": 10875 + }, + { + "epoch": 0.986753765196879, + "grad_norm": 0.13387847818779883, + "learning_rate": 4.6008936066588293e-07, + "loss": 1.3171, + "step": 10876 + }, + { + "epoch": 0.9868444928325167, + "grad_norm": 0.11522800454226419, + "learning_rate": 4.5380930121075023e-07, + "loss": 1.3774, + "step": 
10877 + }, + { + "epoch": 0.9869352204681546, + "grad_norm": 0.11504745248857291, + "learning_rate": 4.475723776427576e-07, + "loss": 1.3597, + "step": 10878 + }, + { + "epoch": 0.9870259481037924, + "grad_norm": 0.14053922912237862, + "learning_rate": 4.4137859050052963e-07, + "loss": 1.3418, + "step": 10879 + }, + { + "epoch": 0.9871166757394302, + "grad_norm": 0.13006218151728757, + "learning_rate": 4.352279403188608e-07, + "loss": 1.3446, + "step": 10880 + }, + { + "epoch": 0.9872074033750681, + "grad_norm": 0.12382343494353874, + "learning_rate": 4.2912042762893735e-07, + "loss": 1.3584, + "step": 10881 + }, + { + "epoch": 0.9872981310107058, + "grad_norm": 0.15429562036133965, + "learning_rate": 4.230560529580596e-07, + "loss": 1.3525, + "step": 10882 + }, + { + "epoch": 0.9873888586463436, + "grad_norm": 0.18246083509076694, + "learning_rate": 4.1703481682997537e-07, + "loss": 1.3738, + "step": 10883 + }, + { + "epoch": 0.9874795862819815, + "grad_norm": 0.12118377629668439, + "learning_rate": 4.11056719764491e-07, + "loss": 1.3487, + "step": 10884 + }, + { + "epoch": 0.9875703139176193, + "grad_norm": 0.12536347025560368, + "learning_rate": 4.051217622779713e-07, + "loss": 1.3465, + "step": 10885 + }, + { + "epoch": 0.9876610415532571, + "grad_norm": 0.1171518377685885, + "learning_rate": 3.992299448827286e-07, + "loss": 1.3418, + "step": 10886 + }, + { + "epoch": 0.987751769188895, + "grad_norm": 0.11811979126217508, + "learning_rate": 3.933812680876891e-07, + "loss": 1.3137, + "step": 10887 + }, + { + "epoch": 0.9878424968245327, + "grad_norm": 0.15341458707838182, + "learning_rate": 3.8757573239778246e-07, + "loss": 1.3529, + "step": 10888 + }, + { + "epoch": 0.9879332244601706, + "grad_norm": 0.12036964462065786, + "learning_rate": 3.818133383143296e-07, + "loss": 1.3308, + "step": 10889 + }, + { + "epoch": 0.9880239520958084, + "grad_norm": 0.1263736010162287, + "learning_rate": 3.760940863349882e-07, + "loss": 1.3251, + "step": 10890 + }, + { + "epoch": 0.9881146797314462, + "grad_norm": 0.12400171176219514, + "learning_rate": 3.7041797695352984e-07, + "loss": 1.358, + "step": 10891 + }, + { + "epoch": 0.9882054073670841, + "grad_norm": 0.12203704349186738, + "learning_rate": 3.647850106600625e-07, + "loss": 1.3302, + "step": 10892 + }, + { + "epoch": 0.9882961350027218, + "grad_norm": 0.12581064312152307, + "learning_rate": 3.591951879411415e-07, + "loss": 1.3323, + "step": 10893 + }, + { + "epoch": 0.9883868626383596, + "grad_norm": 0.13262864352236362, + "learning_rate": 3.536485092792696e-07, + "loss": 1.3466, + "step": 10894 + }, + { + "epoch": 0.9884775902739975, + "grad_norm": 0.12321605810478783, + "learning_rate": 3.4814497515356367e-07, + "loss": 1.3418, + "step": 10895 + }, + { + "epoch": 0.9885683179096353, + "grad_norm": 0.11535050470831262, + "learning_rate": 3.4268458603908814e-07, + "loss": 1.3673, + "step": 10896 + }, + { + "epoch": 0.9886590455452731, + "grad_norm": 0.12111819351555368, + "learning_rate": 3.372673424075212e-07, + "loss": 1.3538, + "step": 10897 + }, + { + "epoch": 0.988749773180911, + "grad_norm": 0.136905801451768, + "learning_rate": 3.3189324472654437e-07, + "loss": 1.3204, + "step": 10898 + }, + { + "epoch": 0.9888405008165487, + "grad_norm": 0.1423461918829932, + "learning_rate": 3.265622934602308e-07, + "loss": 1.331, + "step": 10899 + }, + { + "epoch": 0.9889312284521865, + "grad_norm": 0.12034645498248034, + "learning_rate": 3.21274489068879e-07, + "loss": 1.3421, + "step": 10900 + }, + { + "epoch": 0.9890219560878244, + "grad_norm": 
0.13317561468720274, + "learning_rate": 3.160298320091792e-07, + "loss": 1.3347, + "step": 10901 + }, + { + "epoch": 0.9891126837234622, + "grad_norm": 0.11766937582448482, + "learning_rate": 3.108283227338804e-07, + "loss": 1.3035, + "step": 10902 + }, + { + "epoch": 0.9892034113591, + "grad_norm": 0.1236017142122136, + "learning_rate": 3.0566996169223426e-07, + "loss": 1.3454, + "step": 10903 + }, + { + "epoch": 0.9892941389947378, + "grad_norm": 0.1199515631479178, + "learning_rate": 3.0055474932960683e-07, + "loss": 1.373, + "step": 10904 + }, + { + "epoch": 0.9893848666303756, + "grad_norm": 0.11655917131901161, + "learning_rate": 2.9548268608775574e-07, + "loss": 1.3407, + "step": 10905 + }, + { + "epoch": 0.9894755942660134, + "grad_norm": 0.12251865892168368, + "learning_rate": 2.904537724046641e-07, + "loss": 1.3585, + "step": 10906 + }, + { + "epoch": 0.9895663219016513, + "grad_norm": 0.1267379044087828, + "learning_rate": 2.854680087144845e-07, + "loss": 1.3468, + "step": 10907 + }, + { + "epoch": 0.9896570495372891, + "grad_norm": 0.13327593312374572, + "learning_rate": 2.8052539544781706e-07, + "loss": 1.3373, + "step": 10908 + }, + { + "epoch": 0.9897477771729268, + "grad_norm": 0.12786310328628608, + "learning_rate": 2.7562593303137596e-07, + "loss": 1.3246, + "step": 10909 + }, + { + "epoch": 0.9898385048085647, + "grad_norm": 0.13791270571546763, + "learning_rate": 2.7076962188832266e-07, + "loss": 1.3406, + "step": 10910 + }, + { + "epoch": 0.9899292324442025, + "grad_norm": 0.14300943498507954, + "learning_rate": 2.659564624379884e-07, + "loss": 1.3602, + "step": 10911 + }, + { + "epoch": 0.9900199600798403, + "grad_norm": 0.11606869144573777, + "learning_rate": 2.6118645509592974e-07, + "loss": 1.3516, + "step": 10912 + }, + { + "epoch": 0.9901106877154782, + "grad_norm": 0.12851349128832992, + "learning_rate": 2.5645960027409487e-07, + "loss": 1.3295, + "step": 10913 + }, + { + "epoch": 0.9902014153511159, + "grad_norm": 0.12189671935316607, + "learning_rate": 2.517758983806573e-07, + "loss": 1.3391, + "step": 10914 + }, + { + "epoch": 0.9902921429867537, + "grad_norm": 0.18493261391960075, + "learning_rate": 2.4713534981996023e-07, + "loss": 1.3661, + "step": 10915 + }, + { + "epoch": 0.9903828706223916, + "grad_norm": 0.16033047591783936, + "learning_rate": 2.425379549928497e-07, + "loss": 1.3599, + "step": 10916 + }, + { + "epoch": 0.9904735982580294, + "grad_norm": 0.11939009882005977, + "learning_rate": 2.3798371429623044e-07, + "loss": 1.3553, + "step": 10917 + }, + { + "epoch": 0.9905643258936672, + "grad_norm": 0.11958995841212487, + "learning_rate": 2.3347262812334347e-07, + "loss": 1.3202, + "step": 10918 + }, + { + "epoch": 0.990655053529305, + "grad_norm": 0.11923482362493108, + "learning_rate": 2.2900469686376602e-07, + "loss": 1.3323, + "step": 10919 + }, + { + "epoch": 0.9907457811649428, + "grad_norm": 0.1711044750519547, + "learning_rate": 2.245799209033006e-07, + "loss": 1.3383, + "step": 10920 + }, + { + "epoch": 0.9908365088005806, + "grad_norm": 0.11792040633923372, + "learning_rate": 2.2019830062397495e-07, + "loss": 1.3584, + "step": 10921 + }, + { + "epoch": 0.9909272364362185, + "grad_norm": 0.12412749528929694, + "learning_rate": 2.1585983640420859e-07, + "loss": 1.3572, + "step": 10922 + }, + { + "epoch": 0.9910179640718563, + "grad_norm": 0.12928546439756194, + "learning_rate": 2.1156452861864627e-07, + "loss": 1.38, + "step": 10923 + }, + { + "epoch": 0.991108691707494, + "grad_norm": 0.18337506327108719, + "learning_rate": 
2.0731237763810252e-07, + "loss": 1.3352, + "step": 10924 + }, + { + "epoch": 0.9911994193431319, + "grad_norm": 0.39950945450543374, + "learning_rate": 2.0310338382983905e-07, + "loss": 1.327, + "step": 10925 + }, + { + "epoch": 0.9912901469787697, + "grad_norm": 0.12335777814265887, + "learning_rate": 1.9893754755723193e-07, + "loss": 1.3169, + "step": 10926 + }, + { + "epoch": 0.9913808746144076, + "grad_norm": 0.12073496144190879, + "learning_rate": 1.9481486918004886e-07, + "loss": 1.3526, + "step": 10927 + }, + { + "epoch": 0.9914716022500454, + "grad_norm": 0.13777005675226645, + "learning_rate": 1.907353490542274e-07, + "loss": 1.3482, + "step": 10928 + }, + { + "epoch": 0.9915623298856832, + "grad_norm": 0.1146434121660018, + "learning_rate": 1.866989875320968e-07, + "loss": 1.3261, + "step": 10929 + }, + { + "epoch": 0.991653057521321, + "grad_norm": 0.1231115870611982, + "learning_rate": 1.8270578496215606e-07, + "loss": 1.3279, + "step": 10930 + }, + { + "epoch": 0.9917437851569588, + "grad_norm": 0.12273425233588586, + "learning_rate": 1.7875574168929599e-07, + "loss": 1.3361, + "step": 10931 + }, + { + "epoch": 0.9918345127925966, + "grad_norm": 0.21050162530789965, + "learning_rate": 1.7484885805446604e-07, + "loss": 1.3224, + "step": 10932 + }, + { + "epoch": 0.9919252404282345, + "grad_norm": 0.12558266635242346, + "learning_rate": 1.7098513439517404e-07, + "loss": 1.3599, + "step": 10933 + }, + { + "epoch": 0.9920159680638723, + "grad_norm": 0.13157696907683492, + "learning_rate": 1.67164571044931e-07, + "loss": 1.2839, + "step": 10934 + }, + { + "epoch": 0.99210669569951, + "grad_norm": 0.14433059346311433, + "learning_rate": 1.6338716833369515e-07, + "loss": 1.3492, + "step": 10935 + }, + { + "epoch": 0.9921974233351479, + "grad_norm": 0.12906770911738588, + "learning_rate": 1.5965292658765007e-07, + "loss": 1.3541, + "step": 10936 + }, + { + "epoch": 0.9922881509707857, + "grad_norm": 0.12031828210702744, + "learning_rate": 1.5596184612926e-07, + "loss": 1.3893, + "step": 10937 + }, + { + "epoch": 0.9923788786064235, + "grad_norm": 0.23619232771642154, + "learning_rate": 1.5231392727727e-07, + "loss": 1.3404, + "step": 10938 + }, + { + "epoch": 0.9924696062420614, + "grad_norm": 0.13087612188346745, + "learning_rate": 1.487091703465948e-07, + "loss": 1.3503, + "step": 10939 + }, + { + "epoch": 0.9925603338776992, + "grad_norm": 0.12209416462996867, + "learning_rate": 1.4514757564854098e-07, + "loss": 1.3229, + "step": 10940 + }, + { + "epoch": 0.9926510615133369, + "grad_norm": 0.11800517819512232, + "learning_rate": 1.416291434906958e-07, + "loss": 1.3711, + "step": 10941 + }, + { + "epoch": 0.9927417891489748, + "grad_norm": 0.11299136520443125, + "learning_rate": 1.3815387417681625e-07, + "loss": 1.3162, + "step": 10942 + }, + { + "epoch": 0.9928325167846126, + "grad_norm": 0.12397899208187707, + "learning_rate": 1.3472176800705116e-07, + "loss": 1.3225, + "step": 10943 + }, + { + "epoch": 0.9929232444202504, + "grad_norm": 0.11969817624439096, + "learning_rate": 1.3133282527771905e-07, + "loss": 1.3298, + "step": 10944 + }, + { + "epoch": 0.9930139720558883, + "grad_norm": 0.11859402179154473, + "learning_rate": 1.2798704628147474e-07, + "loss": 1.3597, + "step": 10945 + }, + { + "epoch": 0.993104699691526, + "grad_norm": 0.1182337096047353, + "learning_rate": 1.2468443130725372e-07, + "loss": 1.339, + "step": 10946 + }, + { + "epoch": 0.9931954273271638, + "grad_norm": 0.12847067676625812, + "learning_rate": 1.2142498064016127e-07, + "loss": 1.3391, + "step": 
10947 + }, + { + "epoch": 0.9932861549628017, + "grad_norm": 0.11999355986995654, + "learning_rate": 1.1820869456169448e-07, + "loss": 1.3639, + "step": 10948 + }, + { + "epoch": 0.9933768825984395, + "grad_norm": 0.12457907520998261, + "learning_rate": 1.1503557334963111e-07, + "loss": 1.3478, + "step": 10949 + }, + { + "epoch": 0.9934676102340773, + "grad_norm": 0.13042160713068737, + "learning_rate": 1.1190561727786319e-07, + "loss": 1.3206, + "step": 10950 + }, + { + "epoch": 0.9935583378697151, + "grad_norm": 0.11629082319125608, + "learning_rate": 1.0881882661673004e-07, + "loss": 1.3209, + "step": 10951 + }, + { + "epoch": 0.9936490655053529, + "grad_norm": 0.12443654937747325, + "learning_rate": 1.0577520163279619e-07, + "loss": 1.3769, + "step": 10952 + }, + { + "epoch": 0.9937397931409907, + "grad_norm": 0.12015461864206392, + "learning_rate": 1.0277474258885144e-07, + "loss": 1.3481, + "step": 10953 + }, + { + "epoch": 0.9938305207766286, + "grad_norm": 0.12691276272244523, + "learning_rate": 9.981744974396634e-08, + "loss": 1.3725, + "step": 10954 + }, + { + "epoch": 0.9939212484122664, + "grad_norm": 0.11375531115739225, + "learning_rate": 9.690332335354767e-08, + "loss": 1.3402, + "step": 10955 + }, + { + "epoch": 0.9940119760479041, + "grad_norm": 0.12893606716763234, + "learning_rate": 9.4032363669172e-08, + "loss": 1.3596, + "step": 10956 + }, + { + "epoch": 0.994102703683542, + "grad_norm": 0.11457030239854692, + "learning_rate": 9.120457093875211e-08, + "loss": 1.3603, + "step": 10957 + }, + { + "epoch": 0.9941934313191798, + "grad_norm": 0.1223489098170321, + "learning_rate": 8.841994540659259e-08, + "loss": 1.3423, + "step": 10958 + }, + { + "epoch": 0.9942841589548176, + "grad_norm": 0.1214683875425154, + "learning_rate": 8.56784873129457e-08, + "loss": 1.3543, + "step": 10959 + }, + { + "epoch": 0.9943748865904555, + "grad_norm": 0.11907875295402669, + "learning_rate": 8.298019689473301e-08, + "loss": 1.3759, + "step": 10960 + }, + { + "epoch": 0.9944656142260933, + "grad_norm": 0.1181370353399603, + "learning_rate": 8.032507438482384e-08, + "loss": 1.3383, + "step": 10961 + }, + { + "epoch": 0.994556341861731, + "grad_norm": 0.11924360295228836, + "learning_rate": 7.77131200124792e-08, + "loss": 1.3387, + "step": 10962 + }, + { + "epoch": 0.9946470694973689, + "grad_norm": 0.12391030682290151, + "learning_rate": 7.514433400335197e-08, + "loss": 1.3602, + "step": 10963 + }, + { + "epoch": 0.9947377971330067, + "grad_norm": 0.1177552569392387, + "learning_rate": 7.261871657915364e-08, + "loss": 1.3292, + "step": 10964 + }, + { + "epoch": 0.9948285247686446, + "grad_norm": 0.11608330694985915, + "learning_rate": 7.013626795804306e-08, + "loss": 1.342, + "step": 10965 + }, + { + "epoch": 0.9949192524042824, + "grad_norm": 0.13514972215835808, + "learning_rate": 6.769698835429327e-08, + "loss": 1.3071, + "step": 10966 + }, + { + "epoch": 0.9950099800399201, + "grad_norm": 0.12466805231604032, + "learning_rate": 6.530087797862461e-08, + "loss": 1.3411, + "step": 10967 + }, + { + "epoch": 0.995100707675558, + "grad_norm": 0.11636889786432102, + "learning_rate": 6.294793703792712e-08, + "loss": 1.3662, + "step": 10968 + }, + { + "epoch": 0.9951914353111958, + "grad_norm": 0.11626947943767382, + "learning_rate": 6.063816573537162e-08, + "loss": 1.3251, + "step": 10969 + }, + { + "epoch": 0.9952821629468336, + "grad_norm": 0.12216634242511912, + "learning_rate": 5.837156427035417e-08, + "loss": 1.327, + "step": 10970 + }, + { + "epoch": 0.9953728905824715, + "grad_norm": 
0.11975844190709245, + "learning_rate": 5.6148132838662604e-08, + "loss": 1.3314, + "step": 10971 + }, + { + "epoch": 0.9954636182181092, + "grad_norm": 0.12120673189862907, + "learning_rate": 5.3967871632254475e-08, + "loss": 1.3893, + "step": 10972 + }, + { + "epoch": 0.995554345853747, + "grad_norm": 0.14365459274950237, + "learning_rate": 5.18307808393681e-08, + "loss": 1.3432, + "step": 10973 + }, + { + "epoch": 0.9956450734893849, + "grad_norm": 0.11921720185055384, + "learning_rate": 4.973686064463356e-08, + "loss": 1.3746, + "step": 10974 + }, + { + "epoch": 0.9957358011250227, + "grad_norm": 0.1388943009489665, + "learning_rate": 4.768611122873967e-08, + "loss": 1.3345, + "step": 10975 + }, + { + "epoch": 0.9958265287606605, + "grad_norm": 0.1895209121048533, + "learning_rate": 4.567853276887801e-08, + "loss": 1.3118, + "step": 10976 + }, + { + "epoch": 0.9959172563962984, + "grad_norm": 0.12636311251343388, + "learning_rate": 4.3714125438409914e-08, + "loss": 1.3606, + "step": 10977 + }, + { + "epoch": 0.9960079840319361, + "grad_norm": 0.1282219553841542, + "learning_rate": 4.1792889406810916e-08, + "loss": 1.3025, + "step": 10978 + }, + { + "epoch": 0.9960987116675739, + "grad_norm": 0.12667591477784293, + "learning_rate": 3.991482484017039e-08, + "loss": 1.356, + "step": 10979 + }, + { + "epoch": 0.9961894393032118, + "grad_norm": 0.11410209796498143, + "learning_rate": 3.807993190052539e-08, + "loss": 1.3337, + "step": 10980 + }, + { + "epoch": 0.9962801669388496, + "grad_norm": 0.12752571178221914, + "learning_rate": 3.628821074636024e-08, + "loss": 1.3559, + "step": 10981 + }, + { + "epoch": 0.9963708945744874, + "grad_norm": 0.12496137356855255, + "learning_rate": 3.453966153244004e-08, + "loss": 1.3517, + "step": 10982 + }, + { + "epoch": 0.9964616222101252, + "grad_norm": 0.12588913378234975, + "learning_rate": 3.283428440964409e-08, + "loss": 1.3195, + "step": 10983 + }, + { + "epoch": 0.996552349845763, + "grad_norm": 0.12340691051654465, + "learning_rate": 3.11720795253545e-08, + "loss": 1.3172, + "step": 10984 + }, + { + "epoch": 0.9966430774814008, + "grad_norm": 0.11845747559531085, + "learning_rate": 2.955304702301209e-08, + "loss": 1.3688, + "step": 10985 + }, + { + "epoch": 0.9967338051170387, + "grad_norm": 0.12459701368917363, + "learning_rate": 2.7977187042449446e-08, + "loss": 1.3311, + "step": 10986 + }, + { + "epoch": 0.9968245327526765, + "grad_norm": 0.13921679581257893, + "learning_rate": 2.6444499719779914e-08, + "loss": 1.3693, + "step": 10987 + }, + { + "epoch": 0.9969152603883142, + "grad_norm": 0.13265434820186706, + "learning_rate": 2.495498518728656e-08, + "loss": 1.3824, + "step": 10988 + }, + { + "epoch": 0.9970059880239521, + "grad_norm": 0.15563131818416145, + "learning_rate": 2.350864357358873e-08, + "loss": 1.3323, + "step": 10989 + }, + { + "epoch": 0.9970967156595899, + "grad_norm": 0.11729797194797267, + "learning_rate": 2.2105475003642018e-08, + "loss": 1.3678, + "step": 10990 + }, + { + "epoch": 0.9971874432952277, + "grad_norm": 0.12492446556989938, + "learning_rate": 2.074547959862727e-08, + "loss": 1.3647, + "step": 10991 + }, + { + "epoch": 0.9972781709308656, + "grad_norm": 0.12437297651974173, + "learning_rate": 1.942865747583955e-08, + "loss": 1.3043, + "step": 10992 + }, + { + "epoch": 0.9973688985665033, + "grad_norm": 0.1322978190657724, + "learning_rate": 1.815500874913223e-08, + "loss": 1.3119, + "step": 10993 + }, + { + "epoch": 0.9974596262021411, + "grad_norm": 0.12461316352304952, + "learning_rate": 
1.6924533528417386e-08, + "loss": 1.3327, + "step": 10994 + }, + { + "epoch": 0.997550353837779, + "grad_norm": 0.12714613584261025, + "learning_rate": 1.573723191994336e-08, + "loss": 1.3314, + "step": 10995 + }, + { + "epoch": 0.9976410814734168, + "grad_norm": 0.1179958522976882, + "learning_rate": 1.4593104026294768e-08, + "loss": 1.3731, + "step": 10996 + }, + { + "epoch": 0.9977318091090546, + "grad_norm": 0.1244488628550195, + "learning_rate": 1.3492149946170428e-08, + "loss": 1.3896, + "step": 10997 + }, + { + "epoch": 0.9978225367446925, + "grad_norm": 0.11854736322000073, + "learning_rate": 1.2434369774716458e-08, + "loss": 1.3088, + "step": 10998 + }, + { + "epoch": 0.9979132643803302, + "grad_norm": 0.12635545000231513, + "learning_rate": 1.14197636032487e-08, + "loss": 1.3713, + "step": 10999 + }, + { + "epoch": 0.998003992015968, + "grad_norm": 0.12033560288585753, + "learning_rate": 1.0448331519363752e-08, + "loss": 1.3807, + "step": 11000 + }, + { + "epoch": 0.9980947196516059, + "grad_norm": 0.1140866743746187, + "learning_rate": 9.520073606938962e-09, + "loss": 1.345, + "step": 11001 + }, + { + "epoch": 0.9981854472872437, + "grad_norm": 0.13117160092243044, + "learning_rate": 8.634989946132433e-09, + "loss": 1.3842, + "step": 11002 + }, + { + "epoch": 0.9982761749228816, + "grad_norm": 0.13946294897898037, + "learning_rate": 7.79308061343853e-09, + "loss": 1.3614, + "step": 11003 + }, + { + "epoch": 0.9983669025585193, + "grad_norm": 0.1238305114235542, + "learning_rate": 6.994345681465841e-09, + "loss": 1.3656, + "step": 11004 + }, + { + "epoch": 0.9984576301941571, + "grad_norm": 0.1188622133729442, + "learning_rate": 6.238785219270238e-09, + "loss": 1.3429, + "step": 11005 + }, + { + "epoch": 0.998548357829795, + "grad_norm": 0.12406436434249074, + "learning_rate": 5.526399292021811e-09, + "loss": 1.323, + "step": 11006 + }, + { + "epoch": 0.9986390854654328, + "grad_norm": 0.1442848059192041, + "learning_rate": 4.857187961226917e-09, + "loss": 1.3296, + "step": 11007 + }, + { + "epoch": 0.9987298131010706, + "grad_norm": 0.1276191920944935, + "learning_rate": 4.231151284728174e-09, + "loss": 1.3625, + "step": 11008 + }, + { + "epoch": 0.9988205407367085, + "grad_norm": 0.12370825643187572, + "learning_rate": 3.648289316593445e-09, + "loss": 1.3313, + "step": 11009 + }, + { + "epoch": 0.9989112683723462, + "grad_norm": 0.11353032487346468, + "learning_rate": 3.108602107060321e-09, + "loss": 1.3002, + "step": 11010 + }, + { + "epoch": 0.999001996007984, + "grad_norm": 0.14272735699220132, + "learning_rate": 2.6120897028691915e-09, + "loss": 1.3456, + "step": 11011 + }, + { + "epoch": 0.9990927236436219, + "grad_norm": 0.1172667450501014, + "learning_rate": 2.1587521467636427e-09, + "loss": 1.3645, + "step": 11012 + }, + { + "epoch": 0.9991834512792597, + "grad_norm": 0.1090335042611265, + "learning_rate": 1.748589477934548e-09, + "loss": 1.3351, + "step": 11013 + }, + { + "epoch": 0.9992741789148974, + "grad_norm": 0.11806547927437956, + "learning_rate": 1.3816017317980211e-09, + "loss": 1.3567, + "step": 11014 + }, + { + "epoch": 0.9993649065505353, + "grad_norm": 0.12792078644480392, + "learning_rate": 1.0577889401064411e-09, + "loss": 1.3365, + "step": 11015 + }, + { + "epoch": 0.9994556341861731, + "grad_norm": 0.11900406638075543, + "learning_rate": 7.771511307264056e-10, + "loss": 1.3612, + "step": 11016 + }, + { + "epoch": 0.9995463618218109, + "grad_norm": 0.11944495080715964, + "learning_rate": 5.39688327971799e-10, + "loss": 1.3201, + "step": 11017 + }, + { + 
"epoch": 0.9996370894574488, + "grad_norm": 0.1225738603470268, + "learning_rate": 3.4540055227072487e-10, + "loss": 1.3169, + "step": 11018 + }, + { + "epoch": 0.9997278170930866, + "grad_norm": 0.11972983058699295, + "learning_rate": 1.9428782044306204e-10, + "loss": 1.2806, + "step": 11019 + }, + { + "epoch": 0.9998185447287243, + "grad_norm": 0.12237192732469131, + "learning_rate": 8.635014553393105e-11, + "loss": 1.3124, + "step": 11020 + }, + { + "epoch": 0.9999092723643622, + "grad_norm": 0.12086356292591903, + "learning_rate": 2.158753686920534e-11, + "loss": 1.3489, + "step": 11021 + }, + { + "epoch": 1.0, + "grad_norm": 0.1246624550837589, + "learning_rate": 0.0, + "loss": 1.3146, + "step": 11022 + }, + { + "epoch": 1.0, + "step": 11022, + "total_flos": 5.328882153658778e+16, + "train_loss": 0.24744041566082148, + "train_runtime": 93720.3062, + "train_samples_per_second": 120.426, + "train_steps_per_second": 0.118 + } + ], + "logging_steps": 1.0, + "max_steps": 11022, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": false, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 5.328882153658778e+16, + "train_batch_size": 4, + "trial_name": null, + "trial_params": null +}