diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,257138 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 1.9999863865008543, + "eval_steps": 500, + "global_step": 36728, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 5.4453996583011714e-05, + "grad_norm": 0.6514295021273755, + "learning_rate": 1.8148820326678765e-07, + "loss": 12.2835, + "step": 1 + }, + { + "epoch": 0.00010890799316602343, + "grad_norm": 0.6445339767412759, + "learning_rate": 3.629764065335753e-07, + "loss": 12.0613, + "step": 2 + }, + { + "epoch": 0.00016336198974903514, + "grad_norm": 0.7426420695460909, + "learning_rate": 5.44464609800363e-07, + "loss": 12.4091, + "step": 3 + }, + { + "epoch": 0.00021781598633204685, + "grad_norm": 0.7481904256847658, + "learning_rate": 7.259528130671506e-07, + "loss": 12.3499, + "step": 4 + }, + { + "epoch": 0.0002722699829150586, + "grad_norm": 0.6345118387930209, + "learning_rate": 9.074410163339384e-07, + "loss": 12.1005, + "step": 5 + }, + { + "epoch": 0.0003267239794980703, + "grad_norm": 0.7830858567681372, + "learning_rate": 1.088929219600726e-06, + "loss": 12.3444, + "step": 6 + }, + { + "epoch": 0.000381177976081082, + "grad_norm": 0.6552438665550809, + "learning_rate": 1.2704174228675138e-06, + "loss": 12.0518, + "step": 7 + }, + { + "epoch": 0.0004356319726640937, + "grad_norm": 1.0230629177664952, + "learning_rate": 1.4519056261343012e-06, + "loss": 12.2645, + "step": 8 + }, + { + "epoch": 0.0004900859692471054, + "grad_norm": 0.736508733398066, + "learning_rate": 1.6333938294010888e-06, + "loss": 12.4657, + "step": 9 + }, + { + "epoch": 0.0005445399658301172, + "grad_norm": 0.7832204882876573, + "learning_rate": 1.8148820326678768e-06, + "loss": 12.2456, + "step": 10 + }, + { + "epoch": 0.0005989939624131288, + "grad_norm": 0.6406598213024922, + "learning_rate": 1.996370235934664e-06, + "loss": 12.2987, + "step": 11 + }, + { + "epoch": 0.0006534479589961406, + "grad_norm": 0.6616531858585198, + "learning_rate": 2.177858439201452e-06, + "loss": 12.2704, + "step": 12 + }, + { + "epoch": 0.0007079019555791523, + "grad_norm": 0.9041079613798669, + "learning_rate": 2.35934664246824e-06, + "loss": 12.219, + "step": 13 + }, + { + "epoch": 0.000762355952162164, + "grad_norm": 0.7350143336611246, + "learning_rate": 2.5408348457350276e-06, + "loss": 12.3781, + "step": 14 + }, + { + "epoch": 0.0008168099487451757, + "grad_norm": 0.650775455014501, + "learning_rate": 2.722323049001815e-06, + "loss": 12.3572, + "step": 15 + }, + { + "epoch": 0.0008712639453281874, + "grad_norm": 0.7993019309118847, + "learning_rate": 2.9038112522686024e-06, + "loss": 12.2421, + "step": 16 + }, + { + "epoch": 0.0009257179419111992, + "grad_norm": 0.7507798789960347, + "learning_rate": 3.0852994555353906e-06, + "loss": 12.3405, + "step": 17 + }, + { + "epoch": 0.000980171938494211, + "grad_norm": 0.7232033569227868, + "learning_rate": 3.2667876588021776e-06, + "loss": 12.2115, + "step": 18 + }, + { + "epoch": 0.0010346259350772226, + "grad_norm": 0.6730149594636531, + "learning_rate": 3.448275862068966e-06, + "loss": 12.3185, + "step": 19 + }, + { + "epoch": 0.0010890799316602344, + "grad_norm": 0.7695953679018918, + "learning_rate": 3.6297640653357536e-06, + "loss": 12.3159, + "step": 20 + }, + { + "epoch": 0.001143533928243246, + "grad_norm": 0.711349662965617, + "learning_rate": 3.8112522686025406e-06, + "loss": 12.2963, + "step": 21 + }, + { + "epoch": 0.0011979879248262576, + "grad_norm": 0.8016325530794184, + "learning_rate": 3.992740471869328e-06, + "loss": 12.3665, + "step": 22 + }, + { + "epoch": 0.0012524419214092694, + "grad_norm": 0.5886320970744596, + "learning_rate": 4.174228675136116e-06, + "loss": 12.1546, + "step": 23 + }, + { + "epoch": 0.0013068959179922811, + "grad_norm": 0.6537757775197935, + "learning_rate": 4.355716878402904e-06, + "loss": 12.4958, + "step": 24 + }, + { + "epoch": 0.0013613499145752929, + "grad_norm": 0.6751351224388435, + "learning_rate": 4.537205081669692e-06, + "loss": 12.3862, + "step": 25 + }, + { + "epoch": 0.0014158039111583046, + "grad_norm": 0.6901807715714254, + "learning_rate": 4.71869328493648e-06, + "loss": 12.1915, + "step": 26 + }, + { + "epoch": 0.0014702579077413163, + "grad_norm": 0.7439712686389042, + "learning_rate": 4.900181488203267e-06, + "loss": 12.3018, + "step": 27 + }, + { + "epoch": 0.001524711904324328, + "grad_norm": 0.6966653361776312, + "learning_rate": 5.081669691470055e-06, + "loss": 12.2465, + "step": 28 + }, + { + "epoch": 0.0015791659009073396, + "grad_norm": 0.6787479289536181, + "learning_rate": 5.263157894736842e-06, + "loss": 12.3024, + "step": 29 + }, + { + "epoch": 0.0016336198974903514, + "grad_norm": 0.7046649743230384, + "learning_rate": 5.44464609800363e-06, + "loss": 12.3373, + "step": 30 + }, + { + "epoch": 0.001688073894073363, + "grad_norm": 0.6384618515156677, + "learning_rate": 5.626134301270418e-06, + "loss": 12.2291, + "step": 31 + }, + { + "epoch": 0.0017425278906563748, + "grad_norm": 0.7312209897280082, + "learning_rate": 5.807622504537205e-06, + "loss": 12.2218, + "step": 32 + }, + { + "epoch": 0.0017969818872393866, + "grad_norm": 0.6929587260508587, + "learning_rate": 5.9891107078039935e-06, + "loss": 12.2801, + "step": 33 + }, + { + "epoch": 0.0018514358838223983, + "grad_norm": 0.6811907398958092, + "learning_rate": 6.170598911070781e-06, + "loss": 12.2846, + "step": 34 + }, + { + "epoch": 0.00190588988040541, + "grad_norm": 0.7114030300924737, + "learning_rate": 6.352087114337568e-06, + "loss": 12.2761, + "step": 35 + }, + { + "epoch": 0.001960343876988422, + "grad_norm": 0.7208253804183478, + "learning_rate": 6.533575317604355e-06, + "loss": 12.2223, + "step": 36 + }, + { + "epoch": 0.0020147978735714335, + "grad_norm": 0.7173103462219833, + "learning_rate": 6.715063520871144e-06, + "loss": 12.2833, + "step": 37 + }, + { + "epoch": 0.0020692518701544453, + "grad_norm": 0.7199801679873419, + "learning_rate": 6.896551724137932e-06, + "loss": 12.4004, + "step": 38 + }, + { + "epoch": 0.002123705866737457, + "grad_norm": 0.6283121742102948, + "learning_rate": 7.078039927404719e-06, + "loss": 12.2422, + "step": 39 + }, + { + "epoch": 0.0021781598633204688, + "grad_norm": 0.6438610301627259, + "learning_rate": 7.259528130671507e-06, + "loss": 12.2721, + "step": 40 + }, + { + "epoch": 0.0022326138599034805, + "grad_norm": 0.673710537418366, + "learning_rate": 7.441016333938294e-06, + "loss": 12.2338, + "step": 41 + }, + { + "epoch": 0.002287067856486492, + "grad_norm": 0.7046804607495716, + "learning_rate": 7.622504537205081e-06, + "loss": 12.3124, + "step": 42 + }, + { + "epoch": 0.0023415218530695035, + "grad_norm": 0.7040526020482769, + "learning_rate": 7.80399274047187e-06, + "loss": 12.2752, + "step": 43 + }, + { + "epoch": 0.0023959758496525153, + "grad_norm": 0.7306043247419787, + "learning_rate": 7.985480943738657e-06, + "loss": 12.4015, + "step": 44 + }, + { + "epoch": 0.002450429846235527, + "grad_norm": 0.664490461781453, + "learning_rate": 8.166969147005445e-06, + "loss": 12.3169, + "step": 45 + }, + { + "epoch": 0.0025048838428185388, + "grad_norm": 0.6548692389195887, + "learning_rate": 8.348457350272232e-06, + "loss": 12.2979, + "step": 46 + }, + { + "epoch": 0.0025593378394015505, + "grad_norm": 0.7790840513869893, + "learning_rate": 8.52994555353902e-06, + "loss": 12.3873, + "step": 47 + }, + { + "epoch": 0.0026137918359845622, + "grad_norm": 0.700770109715519, + "learning_rate": 8.711433756805808e-06, + "loss": 12.2902, + "step": 48 + }, + { + "epoch": 0.002668245832567574, + "grad_norm": 0.7284900184817878, + "learning_rate": 8.892921960072596e-06, + "loss": 12.2719, + "step": 49 + }, + { + "epoch": 0.0027226998291505857, + "grad_norm": 0.8332734018054926, + "learning_rate": 9.074410163339384e-06, + "loss": 12.2799, + "step": 50 + }, + { + "epoch": 0.0027771538257335975, + "grad_norm": 0.6965636980670914, + "learning_rate": 9.255898366606171e-06, + "loss": 12.2629, + "step": 51 + }, + { + "epoch": 0.002831607822316609, + "grad_norm": 0.6783391272444385, + "learning_rate": 9.43738656987296e-06, + "loss": 12.1928, + "step": 52 + }, + { + "epoch": 0.002886061818899621, + "grad_norm": 0.6768160560169009, + "learning_rate": 9.618874773139747e-06, + "loss": 12.2738, + "step": 53 + }, + { + "epoch": 0.0029405158154826327, + "grad_norm": 0.7038244532929006, + "learning_rate": 9.800362976406533e-06, + "loss": 12.2378, + "step": 54 + }, + { + "epoch": 0.0029949698120656444, + "grad_norm": 0.6932385380100631, + "learning_rate": 9.981851179673321e-06, + "loss": 12.3725, + "step": 55 + }, + { + "epoch": 0.003049423808648656, + "grad_norm": 0.6915084120243854, + "learning_rate": 1.016333938294011e-05, + "loss": 12.2021, + "step": 56 + }, + { + "epoch": 0.003103877805231668, + "grad_norm": 0.7148909867057305, + "learning_rate": 1.0344827586206897e-05, + "loss": 12.3215, + "step": 57 + }, + { + "epoch": 0.0031583318018146792, + "grad_norm": 0.6991958091345991, + "learning_rate": 1.0526315789473684e-05, + "loss": 12.2382, + "step": 58 + }, + { + "epoch": 0.003212785798397691, + "grad_norm": 0.6919246914121425, + "learning_rate": 1.0707803992740472e-05, + "loss": 12.183, + "step": 59 + }, + { + "epoch": 0.0032672397949807027, + "grad_norm": 0.8025321181544712, + "learning_rate": 1.088929219600726e-05, + "loss": 12.4441, + "step": 60 + }, + { + "epoch": 0.0033216937915637144, + "grad_norm": 0.6698039631860974, + "learning_rate": 1.1070780399274048e-05, + "loss": 12.2611, + "step": 61 + }, + { + "epoch": 0.003376147788146726, + "grad_norm": 0.7407394943261039, + "learning_rate": 1.1252268602540836e-05, + "loss": 12.2053, + "step": 62 + }, + { + "epoch": 0.003430601784729738, + "grad_norm": 0.6865613288316597, + "learning_rate": 1.1433756805807623e-05, + "loss": 12.3175, + "step": 63 + }, + { + "epoch": 0.0034850557813127497, + "grad_norm": 0.7041064521893132, + "learning_rate": 1.161524500907441e-05, + "loss": 12.2321, + "step": 64 + }, + { + "epoch": 0.0035395097778957614, + "grad_norm": 0.6723425810734814, + "learning_rate": 1.1796733212341199e-05, + "loss": 12.2558, + "step": 65 + }, + { + "epoch": 0.003593963774478773, + "grad_norm": 0.661956982839324, + "learning_rate": 1.1978221415607987e-05, + "loss": 12.2064, + "step": 66 + }, + { + "epoch": 0.003648417771061785, + "grad_norm": 0.7483218714408623, + "learning_rate": 1.2159709618874773e-05, + "loss": 12.3637, + "step": 67 + }, + { + "epoch": 0.0037028717676447966, + "grad_norm": 0.6719289161767829, + "learning_rate": 1.2341197822141563e-05, + "loss": 12.2866, + "step": 68 + }, + { + "epoch": 0.0037573257642278084, + "grad_norm": 0.7353482196298523, + "learning_rate": 1.2522686025408347e-05, + "loss": 12.3733, + "step": 69 + }, + { + "epoch": 0.00381177976081082, + "grad_norm": 0.7121839832467598, + "learning_rate": 1.2704174228675136e-05, + "loss": 12.3107, + "step": 70 + }, + { + "epoch": 0.003866233757393832, + "grad_norm": 0.7298666561899594, + "learning_rate": 1.2885662431941924e-05, + "loss": 12.2409, + "step": 71 + }, + { + "epoch": 0.003920687753976844, + "grad_norm": 0.673187183618948, + "learning_rate": 1.306715063520871e-05, + "loss": 12.2624, + "step": 72 + }, + { + "epoch": 0.003975141750559855, + "grad_norm": 0.6647579362296864, + "learning_rate": 1.32486388384755e-05, + "loss": 12.2228, + "step": 73 + }, + { + "epoch": 0.004029595747142867, + "grad_norm": 0.7126085308286554, + "learning_rate": 1.3430127041742288e-05, + "loss": 12.1887, + "step": 74 + }, + { + "epoch": 0.004084049743725879, + "grad_norm": 0.6508629608031615, + "learning_rate": 1.3611615245009074e-05, + "loss": 12.2574, + "step": 75 + }, + { + "epoch": 0.0041385037403088906, + "grad_norm": 0.7217793347228008, + "learning_rate": 1.3793103448275863e-05, + "loss": 12.3596, + "step": 76 + }, + { + "epoch": 0.004192957736891902, + "grad_norm": 0.7277886621255288, + "learning_rate": 1.3974591651542651e-05, + "loss": 12.1593, + "step": 77 + }, + { + "epoch": 0.004247411733474914, + "grad_norm": 0.7476740623784802, + "learning_rate": 1.4156079854809437e-05, + "loss": 12.2851, + "step": 78 + }, + { + "epoch": 0.004301865730057926, + "grad_norm": 0.670232339117414, + "learning_rate": 1.4337568058076225e-05, + "loss": 12.2627, + "step": 79 + }, + { + "epoch": 0.0043563197266409375, + "grad_norm": 0.6467570917998483, + "learning_rate": 1.4519056261343015e-05, + "loss": 12.2336, + "step": 80 + }, + { + "epoch": 0.004410773723223949, + "grad_norm": 0.7120208830515039, + "learning_rate": 1.47005444646098e-05, + "loss": 12.357, + "step": 81 + }, + { + "epoch": 0.004465227719806961, + "grad_norm": 0.7432640467149172, + "learning_rate": 1.4882032667876588e-05, + "loss": 12.3145, + "step": 82 + }, + { + "epoch": 0.004519681716389972, + "grad_norm": 0.8293675536060945, + "learning_rate": 1.5063520871143378e-05, + "loss": 12.6006, + "step": 83 + }, + { + "epoch": 0.004574135712972984, + "grad_norm": 0.7120304671169275, + "learning_rate": 1.5245009074410162e-05, + "loss": 12.3627, + "step": 84 + }, + { + "epoch": 0.004628589709555995, + "grad_norm": 0.6673914704270645, + "learning_rate": 1.542649727767695e-05, + "loss": 12.2417, + "step": 85 + }, + { + "epoch": 0.004683043706139007, + "grad_norm": 0.6868912483040882, + "learning_rate": 1.560798548094374e-05, + "loss": 12.2832, + "step": 86 + }, + { + "epoch": 0.004737497702722019, + "grad_norm": 0.6950953167328401, + "learning_rate": 1.5789473684210526e-05, + "loss": 12.2372, + "step": 87 + }, + { + "epoch": 0.004791951699305031, + "grad_norm": 0.7383575421215864, + "learning_rate": 1.5970961887477314e-05, + "loss": 12.2887, + "step": 88 + }, + { + "epoch": 0.004846405695888042, + "grad_norm": 0.703029378935624, + "learning_rate": 1.6152450090744105e-05, + "loss": 12.3149, + "step": 89 + }, + { + "epoch": 0.004900859692471054, + "grad_norm": 0.7021279833401868, + "learning_rate": 1.633393829401089e-05, + "loss": 12.2678, + "step": 90 + }, + { + "epoch": 0.004955313689054066, + "grad_norm": 0.7251509526578306, + "learning_rate": 1.6515426497277677e-05, + "loss": 12.3562, + "step": 91 + }, + { + "epoch": 0.0050097676856370775, + "grad_norm": 0.7797560038561253, + "learning_rate": 1.6696914700544465e-05, + "loss": 12.276, + "step": 92 + }, + { + "epoch": 0.005064221682220089, + "grad_norm": 0.7064592922388186, + "learning_rate": 1.6878402903811253e-05, + "loss": 12.3852, + "step": 93 + }, + { + "epoch": 0.005118675678803101, + "grad_norm": 0.7053365174441039, + "learning_rate": 1.705989110707804e-05, + "loss": 12.3483, + "step": 94 + }, + { + "epoch": 0.005173129675386113, + "grad_norm": 0.6738625360373299, + "learning_rate": 1.7241379310344828e-05, + "loss": 12.1678, + "step": 95 + }, + { + "epoch": 0.0052275836719691245, + "grad_norm": 0.5972838096238351, + "learning_rate": 1.7422867513611616e-05, + "loss": 12.1698, + "step": 96 + }, + { + "epoch": 0.005282037668552136, + "grad_norm": 0.694605357226162, + "learning_rate": 1.7604355716878404e-05, + "loss": 12.2028, + "step": 97 + }, + { + "epoch": 0.005336491665135148, + "grad_norm": 0.7235709308856128, + "learning_rate": 1.7785843920145192e-05, + "loss": 12.3249, + "step": 98 + }, + { + "epoch": 0.00539094566171816, + "grad_norm": 0.7116272658253867, + "learning_rate": 1.796733212341198e-05, + "loss": 12.1328, + "step": 99 + }, + { + "epoch": 0.0054453996583011715, + "grad_norm": 0.7960311740307418, + "learning_rate": 1.8148820326678767e-05, + "loss": 12.4241, + "step": 100 + }, + { + "epoch": 0.005499853654884183, + "grad_norm": 0.7607101112707406, + "learning_rate": 1.8330308529945555e-05, + "loss": 12.1386, + "step": 101 + }, + { + "epoch": 0.005554307651467195, + "grad_norm": 0.6989823266122064, + "learning_rate": 1.8511796733212343e-05, + "loss": 12.3272, + "step": 102 + }, + { + "epoch": 0.005608761648050207, + "grad_norm": 0.6771018070968361, + "learning_rate": 1.869328493647913e-05, + "loss": 12.2864, + "step": 103 + }, + { + "epoch": 0.005663215644633218, + "grad_norm": 0.6776164869457959, + "learning_rate": 1.887477313974592e-05, + "loss": 12.2483, + "step": 104 + }, + { + "epoch": 0.00571766964121623, + "grad_norm": 0.7161127268232864, + "learning_rate": 1.9056261343012703e-05, + "loss": 12.1731, + "step": 105 + }, + { + "epoch": 0.005772123637799242, + "grad_norm": 0.7248124927863123, + "learning_rate": 1.9237749546279494e-05, + "loss": 12.4591, + "step": 106 + }, + { + "epoch": 0.005826577634382254, + "grad_norm": 0.7591292383102465, + "learning_rate": 1.941923774954628e-05, + "loss": 12.1536, + "step": 107 + }, + { + "epoch": 0.005881031630965265, + "grad_norm": 0.6567533407138826, + "learning_rate": 1.9600725952813066e-05, + "loss": 12.2191, + "step": 108 + }, + { + "epoch": 0.005935485627548277, + "grad_norm": 0.7190643868028403, + "learning_rate": 1.9782214156079858e-05, + "loss": 12.2527, + "step": 109 + }, + { + "epoch": 0.005989939624131289, + "grad_norm": 0.6898991311631406, + "learning_rate": 1.9963702359346642e-05, + "loss": 12.2642, + "step": 110 + }, + { + "epoch": 0.006044393620714301, + "grad_norm": 0.7495187164171233, + "learning_rate": 2.014519056261343e-05, + "loss": 12.2646, + "step": 111 + }, + { + "epoch": 0.006098847617297312, + "grad_norm": 0.7170076277469752, + "learning_rate": 2.032667876588022e-05, + "loss": 12.3398, + "step": 112 + }, + { + "epoch": 0.006153301613880324, + "grad_norm": 0.731108164640672, + "learning_rate": 2.0508166969147005e-05, + "loss": 12.3531, + "step": 113 + }, + { + "epoch": 0.006207755610463336, + "grad_norm": 0.7825773788282819, + "learning_rate": 2.0689655172413793e-05, + "loss": 12.2599, + "step": 114 + }, + { + "epoch": 0.006262209607046348, + "grad_norm": 0.6717004811664388, + "learning_rate": 2.087114337568058e-05, + "loss": 12.3298, + "step": 115 + }, + { + "epoch": 0.0063166636036293584, + "grad_norm": 0.6969969828201115, + "learning_rate": 2.105263157894737e-05, + "loss": 12.2737, + "step": 116 + }, + { + "epoch": 0.00637111760021237, + "grad_norm": 0.7242188258016712, + "learning_rate": 2.1234119782214157e-05, + "loss": 12.3695, + "step": 117 + }, + { + "epoch": 0.006425571596795382, + "grad_norm": 0.8811248563175815, + "learning_rate": 2.1415607985480945e-05, + "loss": 12.3923, + "step": 118 + }, + { + "epoch": 0.006480025593378394, + "grad_norm": 0.6943426432171022, + "learning_rate": 2.1597096188747732e-05, + "loss": 12.2724, + "step": 119 + }, + { + "epoch": 0.006534479589961405, + "grad_norm": 0.8367672147901168, + "learning_rate": 2.177858439201452e-05, + "loss": 12.2846, + "step": 120 + }, + { + "epoch": 0.006588933586544417, + "grad_norm": 0.7169323963392524, + "learning_rate": 2.1960072595281308e-05, + "loss": 12.3054, + "step": 121 + }, + { + "epoch": 0.006643387583127429, + "grad_norm": 0.7328339351641384, + "learning_rate": 2.2141560798548096e-05, + "loss": 12.3401, + "step": 122 + }, + { + "epoch": 0.006697841579710441, + "grad_norm": 0.6950351401076931, + "learning_rate": 2.2323049001814884e-05, + "loss": 12.2651, + "step": 123 + }, + { + "epoch": 0.006752295576293452, + "grad_norm": 0.7135461876331347, + "learning_rate": 2.250453720508167e-05, + "loss": 12.3066, + "step": 124 + }, + { + "epoch": 0.006806749572876464, + "grad_norm": 0.6740158526405283, + "learning_rate": 2.2686025408348456e-05, + "loss": 12.2421, + "step": 125 + }, + { + "epoch": 0.006861203569459476, + "grad_norm": 0.7511554883846614, + "learning_rate": 2.2867513611615247e-05, + "loss": 12.2741, + "step": 126 + }, + { + "epoch": 0.006915657566042488, + "grad_norm": 0.6990531199482589, + "learning_rate": 2.3049001814882035e-05, + "loss": 12.384, + "step": 127 + }, + { + "epoch": 0.006970111562625499, + "grad_norm": 0.7135501305238853, + "learning_rate": 2.323049001814882e-05, + "loss": 12.3623, + "step": 128 + }, + { + "epoch": 0.007024565559208511, + "grad_norm": 0.7041623355500041, + "learning_rate": 2.341197822141561e-05, + "loss": 12.3004, + "step": 129 + }, + { + "epoch": 0.007079019555791523, + "grad_norm": 0.6661526549812437, + "learning_rate": 2.3593466424682398e-05, + "loss": 12.1402, + "step": 130 + }, + { + "epoch": 0.0071334735523745346, + "grad_norm": 0.7361476840420841, + "learning_rate": 2.3774954627949183e-05, + "loss": 12.1733, + "step": 131 + }, + { + "epoch": 0.007187927548957546, + "grad_norm": 0.7967543207030372, + "learning_rate": 2.3956442831215974e-05, + "loss": 12.4345, + "step": 132 + }, + { + "epoch": 0.007242381545540558, + "grad_norm": 0.77469629112749, + "learning_rate": 2.413793103448276e-05, + "loss": 12.3164, + "step": 133 + }, + { + "epoch": 0.00729683554212357, + "grad_norm": 0.8176680285748424, + "learning_rate": 2.4319419237749546e-05, + "loss": 12.3651, + "step": 134 + }, + { + "epoch": 0.0073512895387065815, + "grad_norm": 0.6969513054321705, + "learning_rate": 2.4500907441016334e-05, + "loss": 12.3023, + "step": 135 + }, + { + "epoch": 0.007405743535289593, + "grad_norm": 0.6874259890342583, + "learning_rate": 2.4682395644283125e-05, + "loss": 12.3488, + "step": 136 + }, + { + "epoch": 0.007460197531872605, + "grad_norm": 0.7848881195123376, + "learning_rate": 2.486388384754991e-05, + "loss": 12.3248, + "step": 137 + }, + { + "epoch": 0.007514651528455617, + "grad_norm": 0.7536572549155603, + "learning_rate": 2.5045372050816694e-05, + "loss": 12.3101, + "step": 138 + }, + { + "epoch": 0.0075691055250386285, + "grad_norm": 0.6779931950396582, + "learning_rate": 2.5226860254083485e-05, + "loss": 12.21, + "step": 139 + }, + { + "epoch": 0.00762355952162164, + "grad_norm": 0.7551152796581928, + "learning_rate": 2.5408348457350273e-05, + "loss": 12.4042, + "step": 140 + }, + { + "epoch": 0.007678013518204652, + "grad_norm": 0.9069979314667508, + "learning_rate": 2.558983666061706e-05, + "loss": 12.2847, + "step": 141 + }, + { + "epoch": 0.007732467514787664, + "grad_norm": 0.7202987526526302, + "learning_rate": 2.577132486388385e-05, + "loss": 12.363, + "step": 142 + }, + { + "epoch": 0.0077869215113706754, + "grad_norm": 0.7524333057239485, + "learning_rate": 2.595281306715064e-05, + "loss": 12.3406, + "step": 143 + }, + { + "epoch": 0.007841375507953687, + "grad_norm": 0.7164888079373964, + "learning_rate": 2.613430127041742e-05, + "loss": 12.3726, + "step": 144 + }, + { + "epoch": 0.007895829504536699, + "grad_norm": 0.7311551787417221, + "learning_rate": 2.6315789473684212e-05, + "loss": 12.2858, + "step": 145 + }, + { + "epoch": 0.00795028350111971, + "grad_norm": 0.6773134174743295, + "learning_rate": 2.6497277676951e-05, + "loss": 12.3468, + "step": 146 + }, + { + "epoch": 0.008004737497702722, + "grad_norm": 0.7174401575961286, + "learning_rate": 2.6678765880217788e-05, + "loss": 12.2457, + "step": 147 + }, + { + "epoch": 0.008059191494285734, + "grad_norm": 0.6863666889751157, + "learning_rate": 2.6860254083484575e-05, + "loss": 12.2749, + "step": 148 + }, + { + "epoch": 0.008113645490868746, + "grad_norm": 0.7262950182241283, + "learning_rate": 2.7041742286751363e-05, + "loss": 12.2609, + "step": 149 + }, + { + "epoch": 0.008168099487451758, + "grad_norm": 0.8015975463544986, + "learning_rate": 2.7223230490018148e-05, + "loss": 12.2737, + "step": 150 + }, + { + "epoch": 0.00822255348403477, + "grad_norm": 0.6974688735570929, + "learning_rate": 2.7404718693284935e-05, + "loss": 12.2764, + "step": 151 + }, + { + "epoch": 0.008277007480617781, + "grad_norm": 0.7612859811519558, + "learning_rate": 2.7586206896551727e-05, + "loss": 12.3173, + "step": 152 + }, + { + "epoch": 0.008331461477200793, + "grad_norm": 0.7535272542092525, + "learning_rate": 2.7767695099818514e-05, + "loss": 12.2599, + "step": 153 + }, + { + "epoch": 0.008385915473783805, + "grad_norm": 0.8024979560171636, + "learning_rate": 2.7949183303085302e-05, + "loss": 12.3722, + "step": 154 + }, + { + "epoch": 0.008440369470366816, + "grad_norm": 0.7531576898161162, + "learning_rate": 2.813067150635209e-05, + "loss": 12.3065, + "step": 155 + }, + { + "epoch": 0.008494823466949828, + "grad_norm": 0.7575320428987756, + "learning_rate": 2.8312159709618874e-05, + "loss": 12.356, + "step": 156 + }, + { + "epoch": 0.00854927746353284, + "grad_norm": 0.6921716231479544, + "learning_rate": 2.8493647912885662e-05, + "loss": 12.2568, + "step": 157 + }, + { + "epoch": 0.008603731460115852, + "grad_norm": 0.6867771987616564, + "learning_rate": 2.867513611615245e-05, + "loss": 12.2139, + "step": 158 + }, + { + "epoch": 0.008658185456698863, + "grad_norm": 0.7022464961805333, + "learning_rate": 2.885662431941924e-05, + "loss": 12.3242, + "step": 159 + }, + { + "epoch": 0.008712639453281875, + "grad_norm": 0.7220153309753276, + "learning_rate": 2.903811252268603e-05, + "loss": 12.3137, + "step": 160 + }, + { + "epoch": 0.008767093449864887, + "grad_norm": 0.7116207468224576, + "learning_rate": 2.9219600725952817e-05, + "loss": 12.4212, + "step": 161 + }, + { + "epoch": 0.008821547446447899, + "grad_norm": 0.7606595881201347, + "learning_rate": 2.94010889292196e-05, + "loss": 12.3533, + "step": 162 + }, + { + "epoch": 0.00887600144303091, + "grad_norm": 0.8010136363377759, + "learning_rate": 2.958257713248639e-05, + "loss": 12.384, + "step": 163 + }, + { + "epoch": 0.008930455439613922, + "grad_norm": 0.7641948223788549, + "learning_rate": 2.9764065335753177e-05, + "loss": 12.2878, + "step": 164 + }, + { + "epoch": 0.008984909436196934, + "grad_norm": 0.7313238653805536, + "learning_rate": 2.9945553539019965e-05, + "loss": 12.2797, + "step": 165 + }, + { + "epoch": 0.009039363432779944, + "grad_norm": 0.7280246174606982, + "learning_rate": 3.0127041742286756e-05, + "loss": 12.2048, + "step": 166 + }, + { + "epoch": 0.009093817429362955, + "grad_norm": 0.6735111329183364, + "learning_rate": 3.0308529945553544e-05, + "loss": 12.1298, + "step": 167 + }, + { + "epoch": 0.009148271425945967, + "grad_norm": 0.662515268105397, + "learning_rate": 3.0490018148820325e-05, + "loss": 12.2694, + "step": 168 + }, + { + "epoch": 0.009202725422528979, + "grad_norm": 0.6814067150138639, + "learning_rate": 3.0671506352087116e-05, + "loss": 12.3329, + "step": 169 + }, + { + "epoch": 0.00925717941911199, + "grad_norm": 0.6299645547657693, + "learning_rate": 3.08529945553539e-05, + "loss": 12.2323, + "step": 170 + }, + { + "epoch": 0.009311633415695002, + "grad_norm": 0.7046415195233939, + "learning_rate": 3.103448275862069e-05, + "loss": 12.1888, + "step": 171 + }, + { + "epoch": 0.009366087412278014, + "grad_norm": 0.6635786550238101, + "learning_rate": 3.121597096188748e-05, + "loss": 12.3145, + "step": 172 + }, + { + "epoch": 0.009420541408861026, + "grad_norm": 0.8405662525571916, + "learning_rate": 3.139745916515426e-05, + "loss": 12.3302, + "step": 173 + }, + { + "epoch": 0.009474995405444038, + "grad_norm": 0.7053223200658312, + "learning_rate": 3.157894736842105e-05, + "loss": 12.2905, + "step": 174 + }, + { + "epoch": 0.00952944940202705, + "grad_norm": 0.7179732734291909, + "learning_rate": 3.176043557168784e-05, + "loss": 12.3143, + "step": 175 + }, + { + "epoch": 0.009583903398610061, + "grad_norm": 0.6713806463041468, + "learning_rate": 3.194192377495463e-05, + "loss": 12.2889, + "step": 176 + }, + { + "epoch": 0.009638357395193073, + "grad_norm": 0.8537429522165395, + "learning_rate": 3.212341197822142e-05, + "loss": 12.3003, + "step": 177 + }, + { + "epoch": 0.009692811391776085, + "grad_norm": 0.7936643802651663, + "learning_rate": 3.230490018148821e-05, + "loss": 12.4024, + "step": 178 + }, + { + "epoch": 0.009747265388359096, + "grad_norm": 0.6673211953219038, + "learning_rate": 3.248638838475499e-05, + "loss": 12.3591, + "step": 179 + }, + { + "epoch": 0.009801719384942108, + "grad_norm": 0.7664690630438143, + "learning_rate": 3.266787658802178e-05, + "loss": 12.2, + "step": 180 + }, + { + "epoch": 0.00985617338152512, + "grad_norm": 0.7602742645809355, + "learning_rate": 3.284936479128857e-05, + "loss": 12.2861, + "step": 181 + }, + { + "epoch": 0.009910627378108132, + "grad_norm": 0.7223425709471121, + "learning_rate": 3.3030852994555354e-05, + "loss": 12.3773, + "step": 182 + }, + { + "epoch": 0.009965081374691143, + "grad_norm": 0.7272094124639393, + "learning_rate": 3.3212341197822145e-05, + "loss": 12.2859, + "step": 183 + }, + { + "epoch": 0.010019535371274155, + "grad_norm": 0.7226902937341132, + "learning_rate": 3.339382940108893e-05, + "loss": 12.3335, + "step": 184 + }, + { + "epoch": 0.010073989367857167, + "grad_norm": 0.7069144564518154, + "learning_rate": 3.3575317604355714e-05, + "loss": 12.3361, + "step": 185 + }, + { + "epoch": 0.010128443364440179, + "grad_norm": 0.7351369937399913, + "learning_rate": 3.3756805807622505e-05, + "loss": 12.3304, + "step": 186 + }, + { + "epoch": 0.01018289736102319, + "grad_norm": 0.7562449223296611, + "learning_rate": 3.3938294010889297e-05, + "loss": 12.4463, + "step": 187 + }, + { + "epoch": 0.010237351357606202, + "grad_norm": 0.7380133543278365, + "learning_rate": 3.411978221415608e-05, + "loss": 12.3487, + "step": 188 + }, + { + "epoch": 0.010291805354189214, + "grad_norm": 0.8018559952427972, + "learning_rate": 3.430127041742287e-05, + "loss": 12.4735, + "step": 189 + }, + { + "epoch": 0.010346259350772226, + "grad_norm": 0.6992645273661477, + "learning_rate": 3.4482758620689657e-05, + "loss": 12.2485, + "step": 190 + }, + { + "epoch": 0.010400713347355237, + "grad_norm": 0.7190265436442772, + "learning_rate": 3.466424682395644e-05, + "loss": 12.2231, + "step": 191 + }, + { + "epoch": 0.010455167343938249, + "grad_norm": 0.7728396372356257, + "learning_rate": 3.484573502722323e-05, + "loss": 12.4133, + "step": 192 + }, + { + "epoch": 0.01050962134052126, + "grad_norm": 0.8039774543145355, + "learning_rate": 3.502722323049002e-05, + "loss": 12.4807, + "step": 193 + }, + { + "epoch": 0.010564075337104272, + "grad_norm": 0.7275544168870578, + "learning_rate": 3.520871143375681e-05, + "loss": 12.3294, + "step": 194 + }, + { + "epoch": 0.010618529333687284, + "grad_norm": 0.8566875819319059, + "learning_rate": 3.53901996370236e-05, + "loss": 12.4185, + "step": 195 + }, + { + "epoch": 0.010672983330270296, + "grad_norm": 0.8055650218978196, + "learning_rate": 3.5571687840290383e-05, + "loss": 12.5118, + "step": 196 + }, + { + "epoch": 0.010727437326853308, + "grad_norm": 0.7349187170227696, + "learning_rate": 3.575317604355717e-05, + "loss": 12.2468, + "step": 197 + }, + { + "epoch": 0.01078189132343632, + "grad_norm": 0.7134840505631237, + "learning_rate": 3.593466424682396e-05, + "loss": 12.388, + "step": 198 + }, + { + "epoch": 0.010836345320019331, + "grad_norm": 0.6774397050532335, + "learning_rate": 3.6116152450090743e-05, + "loss": 12.2235, + "step": 199 + }, + { + "epoch": 0.010890799316602343, + "grad_norm": 0.7744176715025664, + "learning_rate": 3.6297640653357535e-05, + "loss": 12.293, + "step": 200 + }, + { + "epoch": 0.010945253313185355, + "grad_norm": 0.7348144637670879, + "learning_rate": 3.6479128856624326e-05, + "loss": 12.3172, + "step": 201 + }, + { + "epoch": 0.010999707309768366, + "grad_norm": 0.6968931217183741, + "learning_rate": 3.666061705989111e-05, + "loss": 12.1999, + "step": 202 + }, + { + "epoch": 0.011054161306351378, + "grad_norm": 0.847067144995256, + "learning_rate": 3.6842105263157895e-05, + "loss": 12.2706, + "step": 203 + }, + { + "epoch": 0.01110861530293439, + "grad_norm": 0.7530363546527266, + "learning_rate": 3.7023593466424686e-05, + "loss": 12.2301, + "step": 204 + }, + { + "epoch": 0.011163069299517402, + "grad_norm": 0.7564652633064927, + "learning_rate": 3.720508166969147e-05, + "loss": 12.3639, + "step": 205 + }, + { + "epoch": 0.011217523296100413, + "grad_norm": 0.7135173235708773, + "learning_rate": 3.738656987295826e-05, + "loss": 12.2338, + "step": 206 + }, + { + "epoch": 0.011271977292683425, + "grad_norm": 0.6858025792749569, + "learning_rate": 3.7568058076225046e-05, + "loss": 12.3793, + "step": 207 + }, + { + "epoch": 0.011326431289266437, + "grad_norm": 0.7824021027165805, + "learning_rate": 3.774954627949184e-05, + "loss": 12.339, + "step": 208 + }, + { + "epoch": 0.011380885285849449, + "grad_norm": 0.7250247555230146, + "learning_rate": 3.793103448275862e-05, + "loss": 12.2351, + "step": 209 + }, + { + "epoch": 0.01143533928243246, + "grad_norm": 0.6809803088998201, + "learning_rate": 3.8112522686025406e-05, + "loss": 12.4248, + "step": 210 + }, + { + "epoch": 0.011489793279015472, + "grad_norm": 0.7032058028279533, + "learning_rate": 3.82940108892922e-05, + "loss": 12.3732, + "step": 211 + }, + { + "epoch": 0.011544247275598484, + "grad_norm": 0.754428880383836, + "learning_rate": 3.847549909255899e-05, + "loss": 12.3197, + "step": 212 + }, + { + "epoch": 0.011598701272181496, + "grad_norm": 0.7504424399849499, + "learning_rate": 3.865698729582577e-05, + "loss": 12.3846, + "step": 213 + }, + { + "epoch": 0.011653155268764507, + "grad_norm": 0.8098844516869852, + "learning_rate": 3.883847549909256e-05, + "loss": 12.4009, + "step": 214 + }, + { + "epoch": 0.011707609265347519, + "grad_norm": 0.7434306698270551, + "learning_rate": 3.901996370235935e-05, + "loss": 12.3121, + "step": 215 + }, + { + "epoch": 0.01176206326193053, + "grad_norm": 0.835756221038369, + "learning_rate": 3.920145190562613e-05, + "loss": 12.3561, + "step": 216 + }, + { + "epoch": 0.011816517258513543, + "grad_norm": 0.7441859041127364, + "learning_rate": 3.9382940108892924e-05, + "loss": 12.3837, + "step": 217 + }, + { + "epoch": 0.011870971255096554, + "grad_norm": 0.6774272997258979, + "learning_rate": 3.9564428312159715e-05, + "loss": 12.321, + "step": 218 + }, + { + "epoch": 0.011925425251679566, + "grad_norm": 0.740858626909941, + "learning_rate": 3.97459165154265e-05, + "loss": 12.3377, + "step": 219 + }, + { + "epoch": 0.011979879248262578, + "grad_norm": 0.8029304956373081, + "learning_rate": 3.9927404718693284e-05, + "loss": 12.2668, + "step": 220 + }, + { + "epoch": 0.01203433324484559, + "grad_norm": 0.7255710443809265, + "learning_rate": 4.0108892921960075e-05, + "loss": 12.2052, + "step": 221 + }, + { + "epoch": 0.012088787241428601, + "grad_norm": 0.7467564299377789, + "learning_rate": 4.029038112522686e-05, + "loss": 12.3052, + "step": 222 + }, + { + "epoch": 0.012143241238011613, + "grad_norm": 0.7757543921998024, + "learning_rate": 4.047186932849365e-05, + "loss": 12.3313, + "step": 223 + }, + { + "epoch": 0.012197695234594625, + "grad_norm": 0.7663879408559824, + "learning_rate": 4.065335753176044e-05, + "loss": 12.3567, + "step": 224 + }, + { + "epoch": 0.012252149231177636, + "grad_norm": 0.830893424480459, + "learning_rate": 4.0834845735027227e-05, + "loss": 12.2888, + "step": 225 + }, + { + "epoch": 0.012306603227760648, + "grad_norm": 0.7659576414170449, + "learning_rate": 4.101633393829401e-05, + "loss": 12.2534, + "step": 226 + }, + { + "epoch": 0.01236105722434366, + "grad_norm": 0.8240503874061232, + "learning_rate": 4.11978221415608e-05, + "loss": 12.2796, + "step": 227 + }, + { + "epoch": 0.012415511220926672, + "grad_norm": 0.8055779272980373, + "learning_rate": 4.1379310344827587e-05, + "loss": 12.3734, + "step": 228 + }, + { + "epoch": 0.012469965217509683, + "grad_norm": 0.6990309940422604, + "learning_rate": 4.156079854809438e-05, + "loss": 12.2448, + "step": 229 + }, + { + "epoch": 0.012524419214092695, + "grad_norm": 0.7814896272818583, + "learning_rate": 4.174228675136116e-05, + "loss": 12.3718, + "step": 230 + }, + { + "epoch": 0.012578873210675707, + "grad_norm": 0.6456843025285693, + "learning_rate": 4.192377495462795e-05, + "loss": 12.2105, + "step": 231 + }, + { + "epoch": 0.012633327207258717, + "grad_norm": 0.7374802126161342, + "learning_rate": 4.210526315789474e-05, + "loss": 12.3398, + "step": 232 + }, + { + "epoch": 0.012687781203841729, + "grad_norm": 0.720484469540933, + "learning_rate": 4.228675136116152e-05, + "loss": 12.3939, + "step": 233 + }, + { + "epoch": 0.01274223520042474, + "grad_norm": 0.7966943666753264, + "learning_rate": 4.2468239564428313e-05, + "loss": 12.41, + "step": 234 + }, + { + "epoch": 0.012796689197007752, + "grad_norm": 0.6898793732291435, + "learning_rate": 4.2649727767695105e-05, + "loss": 12.1404, + "step": 235 + }, + { + "epoch": 0.012851143193590764, + "grad_norm": 0.7640340826855201, + "learning_rate": 4.283121597096189e-05, + "loss": 12.2975, + "step": 236 + }, + { + "epoch": 0.012905597190173776, + "grad_norm": 0.7512000166482308, + "learning_rate": 4.301270417422868e-05, + "loss": 12.483, + "step": 237 + }, + { + "epoch": 0.012960051186756787, + "grad_norm": 0.7378596174200351, + "learning_rate": 4.3194192377495465e-05, + "loss": 12.3202, + "step": 238 + }, + { + "epoch": 0.013014505183339799, + "grad_norm": 0.813840093552659, + "learning_rate": 4.337568058076225e-05, + "loss": 12.3155, + "step": 239 + }, + { + "epoch": 0.01306895917992281, + "grad_norm": 0.7077202129525905, + "learning_rate": 4.355716878402904e-05, + "loss": 12.331, + "step": 240 + }, + { + "epoch": 0.013123413176505823, + "grad_norm": 0.7432924329815834, + "learning_rate": 4.373865698729583e-05, + "loss": 12.4413, + "step": 241 + }, + { + "epoch": 0.013177867173088834, + "grad_norm": 0.8087370274823864, + "learning_rate": 4.3920145190562616e-05, + "loss": 12.352, + "step": 242 + }, + { + "epoch": 0.013232321169671846, + "grad_norm": 0.787315390919198, + "learning_rate": 4.410163339382941e-05, + "loss": 12.3767, + "step": 243 + }, + { + "epoch": 0.013286775166254858, + "grad_norm": 0.7186308136487785, + "learning_rate": 4.428312159709619e-05, + "loss": 12.3972, + "step": 244 + }, + { + "epoch": 0.01334122916283787, + "grad_norm": 0.6962881361865607, + "learning_rate": 4.4464609800362976e-05, + "loss": 12.2565, + "step": 245 + }, + { + "epoch": 0.013395683159420881, + "grad_norm": 0.6962482325758493, + "learning_rate": 4.464609800362977e-05, + "loss": 12.2827, + "step": 246 + }, + { + "epoch": 0.013450137156003893, + "grad_norm": 0.8134645016161036, + "learning_rate": 4.482758620689655e-05, + "loss": 12.3889, + "step": 247 + }, + { + "epoch": 0.013504591152586905, + "grad_norm": 0.7305915380958484, + "learning_rate": 4.500907441016334e-05, + "loss": 12.3915, + "step": 248 + }, + { + "epoch": 0.013559045149169916, + "grad_norm": 0.7618719998357277, + "learning_rate": 4.5190562613430134e-05, + "loss": 12.3237, + "step": 249 + }, + { + "epoch": 0.013613499145752928, + "grad_norm": 0.7662374563616653, + "learning_rate": 4.537205081669691e-05, + "loss": 12.3536, + "step": 250 + }, + { + "epoch": 0.01366795314233594, + "grad_norm": 0.7878655053271634, + "learning_rate": 4.55535390199637e-05, + "loss": 12.2811, + "step": 251 + }, + { + "epoch": 0.013722407138918952, + "grad_norm": 0.7351407959136845, + "learning_rate": 4.5735027223230494e-05, + "loss": 12.3271, + "step": 252 + }, + { + "epoch": 0.013776861135501963, + "grad_norm": 0.738532069890918, + "learning_rate": 4.591651542649728e-05, + "loss": 12.3281, + "step": 253 + }, + { + "epoch": 0.013831315132084975, + "grad_norm": 0.7684920721548087, + "learning_rate": 4.609800362976407e-05, + "loss": 12.174, + "step": 254 + }, + { + "epoch": 0.013885769128667987, + "grad_norm": 0.85709477932742, + "learning_rate": 4.6279491833030854e-05, + "loss": 12.3123, + "step": 255 + }, + { + "epoch": 0.013940223125250999, + "grad_norm": 0.7699276923675464, + "learning_rate": 4.646098003629764e-05, + "loss": 12.3149, + "step": 256 + }, + { + "epoch": 0.01399467712183401, + "grad_norm": 0.7252363551142195, + "learning_rate": 4.664246823956443e-05, + "loss": 12.2997, + "step": 257 + }, + { + "epoch": 0.014049131118417022, + "grad_norm": 0.7307344287538694, + "learning_rate": 4.682395644283122e-05, + "loss": 12.3743, + "step": 258 + }, + { + "epoch": 0.014103585115000034, + "grad_norm": 0.7688745278021739, + "learning_rate": 4.7005444646098005e-05, + "loss": 12.2888, + "step": 259 + }, + { + "epoch": 0.014158039111583046, + "grad_norm": 0.7519190643126366, + "learning_rate": 4.7186932849364796e-05, + "loss": 12.2947, + "step": 260 + }, + { + "epoch": 0.014212493108166057, + "grad_norm": 0.7811339132345416, + "learning_rate": 4.736842105263158e-05, + "loss": 12.462, + "step": 261 + }, + { + "epoch": 0.014266947104749069, + "grad_norm": 0.7115737251169268, + "learning_rate": 4.7549909255898365e-05, + "loss": 12.3242, + "step": 262 + }, + { + "epoch": 0.01432140110133208, + "grad_norm": 0.7507844491490189, + "learning_rate": 4.7731397459165156e-05, + "loss": 12.2234, + "step": 263 + }, + { + "epoch": 0.014375855097915093, + "grad_norm": 0.7610821598373387, + "learning_rate": 4.791288566243195e-05, + "loss": 12.4286, + "step": 264 + }, + { + "epoch": 0.014430309094498104, + "grad_norm": 0.8770692142481441, + "learning_rate": 4.809437386569873e-05, + "loss": 12.5179, + "step": 265 + }, + { + "epoch": 0.014484763091081116, + "grad_norm": 0.7994207367776012, + "learning_rate": 4.827586206896552e-05, + "loss": 12.3988, + "step": 266 + }, + { + "epoch": 0.014539217087664128, + "grad_norm": 0.7317523430466678, + "learning_rate": 4.845735027223231e-05, + "loss": 12.3898, + "step": 267 + }, + { + "epoch": 0.01459367108424714, + "grad_norm": 0.7656122484229141, + "learning_rate": 4.863883847549909e-05, + "loss": 12.0056, + "step": 268 + }, + { + "epoch": 0.014648125080830151, + "grad_norm": 0.7986586703788588, + "learning_rate": 4.882032667876588e-05, + "loss": 12.3948, + "step": 269 + }, + { + "epoch": 0.014702579077413163, + "grad_norm": 0.7657932505480529, + "learning_rate": 4.900181488203267e-05, + "loss": 12.3687, + "step": 270 + }, + { + "epoch": 0.014757033073996175, + "grad_norm": 0.7099680506665171, + "learning_rate": 4.918330308529946e-05, + "loss": 12.2355, + "step": 271 + }, + { + "epoch": 0.014811487070579187, + "grad_norm": 0.6931982785581737, + "learning_rate": 4.936479128856625e-05, + "loss": 12.3633, + "step": 272 + }, + { + "epoch": 0.014865941067162198, + "grad_norm": 0.7474303399881781, + "learning_rate": 4.954627949183303e-05, + "loss": 12.3746, + "step": 273 + }, + { + "epoch": 0.01492039506374521, + "grad_norm": 0.862481964480327, + "learning_rate": 4.972776769509982e-05, + "loss": 12.4141, + "step": 274 + }, + { + "epoch": 0.014974849060328222, + "grad_norm": 0.7318364097265114, + "learning_rate": 4.990925589836661e-05, + "loss": 12.2247, + "step": 275 + }, + { + "epoch": 0.015029303056911233, + "grad_norm": 0.734011719238498, + "learning_rate": 5.009074410163339e-05, + "loss": 12.3864, + "step": 276 + }, + { + "epoch": 0.015083757053494245, + "grad_norm": 0.7736991802041899, + "learning_rate": 5.027223230490018e-05, + "loss": 12.3802, + "step": 277 + }, + { + "epoch": 0.015138211050077257, + "grad_norm": 0.7148902208446675, + "learning_rate": 5.045372050816697e-05, + "loss": 12.3205, + "step": 278 + }, + { + "epoch": 0.015192665046660269, + "grad_norm": 0.7319645221512004, + "learning_rate": 5.0635208711433755e-05, + "loss": 12.3342, + "step": 279 + }, + { + "epoch": 0.01524711904324328, + "grad_norm": 0.7299722315856292, + "learning_rate": 5.0816696914700546e-05, + "loss": 12.3347, + "step": 280 + }, + { + "epoch": 0.015301573039826292, + "grad_norm": 0.7296940630372005, + "learning_rate": 5.099818511796734e-05, + "loss": 12.3668, + "step": 281 + }, + { + "epoch": 0.015356027036409304, + "grad_norm": 0.7073166461717706, + "learning_rate": 5.117967332123412e-05, + "loss": 12.3703, + "step": 282 + }, + { + "epoch": 0.015410481032992316, + "grad_norm": 0.7546793817999312, + "learning_rate": 5.136116152450091e-05, + "loss": 12.3491, + "step": 283 + }, + { + "epoch": 0.015464935029575327, + "grad_norm": 0.685017972082848, + "learning_rate": 5.15426497277677e-05, + "loss": 12.2707, + "step": 284 + }, + { + "epoch": 0.01551938902615834, + "grad_norm": 0.7562946426920689, + "learning_rate": 5.172413793103449e-05, + "loss": 12.3115, + "step": 285 + }, + { + "epoch": 0.015573843022741351, + "grad_norm": 0.7216084563358899, + "learning_rate": 5.190562613430128e-05, + "loss": 12.3921, + "step": 286 + }, + { + "epoch": 0.01562829701932436, + "grad_norm": 0.6939190861763674, + "learning_rate": 5.2087114337568064e-05, + "loss": 12.2032, + "step": 287 + }, + { + "epoch": 0.015682751015907374, + "grad_norm": 0.7239626701676075, + "learning_rate": 5.226860254083484e-05, + "loss": 12.2799, + "step": 288 + }, + { + "epoch": 0.015737205012490384, + "grad_norm": 0.7753868516243451, + "learning_rate": 5.245009074410163e-05, + "loss": 12.4619, + "step": 289 + }, + { + "epoch": 0.015791659009073398, + "grad_norm": 0.7514557376174034, + "learning_rate": 5.2631578947368424e-05, + "loss": 12.4112, + "step": 290 + }, + { + "epoch": 0.015846113005656408, + "grad_norm": 0.7377806576725301, + "learning_rate": 5.281306715063521e-05, + "loss": 12.3133, + "step": 291 + }, + { + "epoch": 0.01590056700223942, + "grad_norm": 0.7454703909220026, + "learning_rate": 5.2994555353902e-05, + "loss": 12.2604, + "step": 292 + }, + { + "epoch": 0.01595502099882243, + "grad_norm": 0.7425978328340237, + "learning_rate": 5.3176043557168784e-05, + "loss": 12.3337, + "step": 293 + }, + { + "epoch": 0.016009474995405445, + "grad_norm": 0.8121816159096134, + "learning_rate": 5.3357531760435575e-05, + "loss": 12.3189, + "step": 294 + }, + { + "epoch": 0.016063928991988455, + "grad_norm": 0.874090569506292, + "learning_rate": 5.3539019963702366e-05, + "loss": 12.4024, + "step": 295 + }, + { + "epoch": 0.01611838298857147, + "grad_norm": 0.766723150191294, + "learning_rate": 5.372050816696915e-05, + "loss": 12.3427, + "step": 296 + }, + { + "epoch": 0.01617283698515448, + "grad_norm": 0.7533549696517355, + "learning_rate": 5.390199637023594e-05, + "loss": 12.1945, + "step": 297 + }, + { + "epoch": 0.016227290981737492, + "grad_norm": 0.82295747879306, + "learning_rate": 5.4083484573502726e-05, + "loss": 12.3395, + "step": 298 + }, + { + "epoch": 0.016281744978320502, + "grad_norm": 0.7763144627979807, + "learning_rate": 5.4264972776769504e-05, + "loss": 12.4658, + "step": 299 + }, + { + "epoch": 0.016336198974903515, + "grad_norm": 0.7803622453108773, + "learning_rate": 5.4446460980036295e-05, + "loss": 12.1774, + "step": 300 + }, + { + "epoch": 0.016390652971486525, + "grad_norm": 0.7711036795355979, + "learning_rate": 5.4627949183303086e-05, + "loss": 12.3421, + "step": 301 + }, + { + "epoch": 0.01644510696806954, + "grad_norm": 0.7862864570197541, + "learning_rate": 5.480943738656987e-05, + "loss": 12.44, + "step": 302 + }, + { + "epoch": 0.01649956096465255, + "grad_norm": 0.7174141772095207, + "learning_rate": 5.499092558983666e-05, + "loss": 12.3887, + "step": 303 + }, + { + "epoch": 0.016554014961235562, + "grad_norm": 0.8079655442331966, + "learning_rate": 5.517241379310345e-05, + "loss": 12.3736, + "step": 304 + }, + { + "epoch": 0.016608468957818572, + "grad_norm": 0.7882892914456249, + "learning_rate": 5.535390199637024e-05, + "loss": 12.3937, + "step": 305 + }, + { + "epoch": 0.016662922954401586, + "grad_norm": 0.7739803428465745, + "learning_rate": 5.553539019963703e-05, + "loss": 12.3119, + "step": 306 + }, + { + "epoch": 0.016717376950984596, + "grad_norm": 0.8202440006814529, + "learning_rate": 5.571687840290381e-05, + "loss": 12.469, + "step": 307 + }, + { + "epoch": 0.01677183094756761, + "grad_norm": 0.6948421754884823, + "learning_rate": 5.5898366606170604e-05, + "loss": 12.38, + "step": 308 + }, + { + "epoch": 0.01682628494415062, + "grad_norm": 0.7278638745473617, + "learning_rate": 5.6079854809437396e-05, + "loss": 12.3829, + "step": 309 + }, + { + "epoch": 0.016880738940733633, + "grad_norm": 0.8078662521778953, + "learning_rate": 5.626134301270418e-05, + "loss": 12.4609, + "step": 310 + }, + { + "epoch": 0.016935192937316643, + "grad_norm": 0.8163381416660692, + "learning_rate": 5.644283121597096e-05, + "loss": 12.296, + "step": 311 + }, + { + "epoch": 0.016989646933899656, + "grad_norm": 0.7499116355114528, + "learning_rate": 5.662431941923775e-05, + "loss": 12.2917, + "step": 312 + }, + { + "epoch": 0.017044100930482666, + "grad_norm": 0.8559513637122415, + "learning_rate": 5.680580762250453e-05, + "loss": 12.3777, + "step": 313 + }, + { + "epoch": 0.01709855492706568, + "grad_norm": 0.8186258240775369, + "learning_rate": 5.6987295825771325e-05, + "loss": 12.449, + "step": 314 + }, + { + "epoch": 0.01715300892364869, + "grad_norm": 0.7314385570608495, + "learning_rate": 5.7168784029038116e-05, + "loss": 12.2357, + "step": 315 + }, + { + "epoch": 0.017207462920231703, + "grad_norm": 0.8568191729204323, + "learning_rate": 5.73502722323049e-05, + "loss": 12.5074, + "step": 316 + }, + { + "epoch": 0.017261916916814713, + "grad_norm": 0.8062065360577912, + "learning_rate": 5.753176043557169e-05, + "loss": 12.357, + "step": 317 + }, + { + "epoch": 0.017316370913397727, + "grad_norm": 0.7803665817022063, + "learning_rate": 5.771324863883848e-05, + "loss": 12.4644, + "step": 318 + }, + { + "epoch": 0.017370824909980737, + "grad_norm": 0.742566393860498, + "learning_rate": 5.789473684210527e-05, + "loss": 12.3906, + "step": 319 + }, + { + "epoch": 0.01742527890656375, + "grad_norm": 0.7289345098896403, + "learning_rate": 5.807622504537206e-05, + "loss": 12.3417, + "step": 320 + }, + { + "epoch": 0.01747973290314676, + "grad_norm": 0.7808326632555322, + "learning_rate": 5.825771324863884e-05, + "loss": 12.3817, + "step": 321 + }, + { + "epoch": 0.017534186899729774, + "grad_norm": 0.8249973000669343, + "learning_rate": 5.8439201451905634e-05, + "loss": 12.2332, + "step": 322 + }, + { + "epoch": 0.017588640896312784, + "grad_norm": 0.7574058668425114, + "learning_rate": 5.862068965517241e-05, + "loss": 12.2956, + "step": 323 + }, + { + "epoch": 0.017643094892895797, + "grad_norm": 0.8298624360645479, + "learning_rate": 5.88021778584392e-05, + "loss": 12.36, + "step": 324 + }, + { + "epoch": 0.017697548889478807, + "grad_norm": 0.8547904402189499, + "learning_rate": 5.898366606170599e-05, + "loss": 12.3879, + "step": 325 + }, + { + "epoch": 0.01775200288606182, + "grad_norm": 0.7968314498228707, + "learning_rate": 5.916515426497278e-05, + "loss": 12.46, + "step": 326 + }, + { + "epoch": 0.01780645688264483, + "grad_norm": 0.8736437654956223, + "learning_rate": 5.934664246823956e-05, + "loss": 12.4407, + "step": 327 + }, + { + "epoch": 0.017860910879227844, + "grad_norm": 0.8668907429370268, + "learning_rate": 5.9528130671506354e-05, + "loss": 12.4397, + "step": 328 + }, + { + "epoch": 0.017915364875810854, + "grad_norm": 0.7818621779531549, + "learning_rate": 5.9709618874773145e-05, + "loss": 12.4018, + "step": 329 + }, + { + "epoch": 0.017969818872393867, + "grad_norm": 0.9103192243585209, + "learning_rate": 5.989110707803993e-05, + "loss": 12.5196, + "step": 330 + }, + { + "epoch": 0.018024272868976877, + "grad_norm": 0.7862412367698574, + "learning_rate": 6.007259528130672e-05, + "loss": 12.3264, + "step": 331 + }, + { + "epoch": 0.018078726865559887, + "grad_norm": 0.7143113953678407, + "learning_rate": 6.025408348457351e-05, + "loss": 12.4047, + "step": 332 + }, + { + "epoch": 0.0181331808621429, + "grad_norm": 0.9052912116449027, + "learning_rate": 6.0435571687840296e-05, + "loss": 12.3172, + "step": 333 + }, + { + "epoch": 0.01818763485872591, + "grad_norm": 0.7947826092807085, + "learning_rate": 6.061705989110709e-05, + "loss": 12.3744, + "step": 334 + }, + { + "epoch": 0.018242088855308924, + "grad_norm": 0.7545460389607728, + "learning_rate": 6.0798548094373865e-05, + "loss": 12.1713, + "step": 335 + }, + { + "epoch": 0.018296542851891934, + "grad_norm": 0.7152244757749424, + "learning_rate": 6.098003629764065e-05, + "loss": 12.3317, + "step": 336 + }, + { + "epoch": 0.018350996848474948, + "grad_norm": 0.7996572062771121, + "learning_rate": 6.116152450090745e-05, + "loss": 12.4122, + "step": 337 + }, + { + "epoch": 0.018405450845057958, + "grad_norm": 0.8411178606002937, + "learning_rate": 6.134301270417423e-05, + "loss": 12.3052, + "step": 338 + }, + { + "epoch": 0.01845990484164097, + "grad_norm": 0.7975036289012045, + "learning_rate": 6.152450090744102e-05, + "loss": 12.4601, + "step": 339 + }, + { + "epoch": 0.01851435883822398, + "grad_norm": 0.9137429350223257, + "learning_rate": 6.17059891107078e-05, + "loss": 12.3111, + "step": 340 + }, + { + "epoch": 0.018568812834806995, + "grad_norm": 0.7148462961473729, + "learning_rate": 6.18874773139746e-05, + "loss": 12.4212, + "step": 341 + }, + { + "epoch": 0.018623266831390005, + "grad_norm": 0.6913217865933418, + "learning_rate": 6.206896551724138e-05, + "loss": 12.3248, + "step": 342 + }, + { + "epoch": 0.01867772082797302, + "grad_norm": 0.8838781205805315, + "learning_rate": 6.225045372050817e-05, + "loss": 12.3943, + "step": 343 + }, + { + "epoch": 0.01873217482455603, + "grad_norm": 0.7873429833167643, + "learning_rate": 6.243194192377497e-05, + "loss": 12.3052, + "step": 344 + }, + { + "epoch": 0.018786628821139042, + "grad_norm": 0.6852179867320145, + "learning_rate": 6.261343012704175e-05, + "loss": 12.3556, + "step": 345 + }, + { + "epoch": 0.018841082817722052, + "grad_norm": 0.8621452861175046, + "learning_rate": 6.279491833030852e-05, + "loss": 12.5934, + "step": 346 + }, + { + "epoch": 0.018895536814305065, + "grad_norm": 0.9120176242511238, + "learning_rate": 6.297640653357532e-05, + "loss": 12.4076, + "step": 347 + }, + { + "epoch": 0.018949990810888075, + "grad_norm": 0.7843427231761305, + "learning_rate": 6.31578947368421e-05, + "loss": 12.3899, + "step": 348 + }, + { + "epoch": 0.01900444480747109, + "grad_norm": 0.7893170140012902, + "learning_rate": 6.333938294010889e-05, + "loss": 12.2412, + "step": 349 + }, + { + "epoch": 0.0190588988040541, + "grad_norm": 0.8916646848287791, + "learning_rate": 6.352087114337569e-05, + "loss": 12.4694, + "step": 350 + }, + { + "epoch": 0.019113352800637112, + "grad_norm": 0.7258088717745435, + "learning_rate": 6.370235934664247e-05, + "loss": 12.2489, + "step": 351 + }, + { + "epoch": 0.019167806797220122, + "grad_norm": 0.8650617630444332, + "learning_rate": 6.388384754990925e-05, + "loss": 12.2905, + "step": 352 + }, + { + "epoch": 0.019222260793803136, + "grad_norm": 0.7667753049211509, + "learning_rate": 6.406533575317605e-05, + "loss": 12.3042, + "step": 353 + }, + { + "epoch": 0.019276714790386146, + "grad_norm": 0.7511282035657841, + "learning_rate": 6.424682395644284e-05, + "loss": 12.2579, + "step": 354 + }, + { + "epoch": 0.01933116878696916, + "grad_norm": 0.7735404530594452, + "learning_rate": 6.442831215970962e-05, + "loss": 12.3486, + "step": 355 + }, + { + "epoch": 0.01938562278355217, + "grad_norm": 0.8412619305074472, + "learning_rate": 6.460980036297642e-05, + "loss": 12.4623, + "step": 356 + }, + { + "epoch": 0.019440076780135183, + "grad_norm": 0.7301192343330496, + "learning_rate": 6.47912885662432e-05, + "loss": 12.3276, + "step": 357 + }, + { + "epoch": 0.019494530776718193, + "grad_norm": 0.7804652434665313, + "learning_rate": 6.497277676950997e-05, + "loss": 12.2476, + "step": 358 + }, + { + "epoch": 0.019548984773301206, + "grad_norm": 0.8556088812883522, + "learning_rate": 6.515426497277677e-05, + "loss": 12.529, + "step": 359 + }, + { + "epoch": 0.019603438769884216, + "grad_norm": 0.8768877684878126, + "learning_rate": 6.533575317604356e-05, + "loss": 12.4842, + "step": 360 + }, + { + "epoch": 0.01965789276646723, + "grad_norm": 0.7343479317743452, + "learning_rate": 6.551724137931034e-05, + "loss": 12.3482, + "step": 361 + }, + { + "epoch": 0.01971234676305024, + "grad_norm": 0.8436152095043282, + "learning_rate": 6.569872958257714e-05, + "loss": 12.422, + "step": 362 + }, + { + "epoch": 0.019766800759633253, + "grad_norm": 0.8293501645016546, + "learning_rate": 6.588021778584392e-05, + "loss": 12.3398, + "step": 363 + }, + { + "epoch": 0.019821254756216263, + "grad_norm": 0.757977291896263, + "learning_rate": 6.606170598911071e-05, + "loss": 12.3762, + "step": 364 + }, + { + "epoch": 0.019875708752799277, + "grad_norm": 0.9108534009229488, + "learning_rate": 6.62431941923775e-05, + "loss": 12.5203, + "step": 365 + }, + { + "epoch": 0.019930162749382287, + "grad_norm": 0.861404684253492, + "learning_rate": 6.642468239564429e-05, + "loss": 12.3693, + "step": 366 + }, + { + "epoch": 0.0199846167459653, + "grad_norm": 0.7442435951270349, + "learning_rate": 6.660617059891108e-05, + "loss": 12.3365, + "step": 367 + }, + { + "epoch": 0.02003907074254831, + "grad_norm": 0.7481686167086928, + "learning_rate": 6.678765880217786e-05, + "loss": 12.3823, + "step": 368 + }, + { + "epoch": 0.020093524739131324, + "grad_norm": 0.8866860856512322, + "learning_rate": 6.696914700544466e-05, + "loss": 12.3893, + "step": 369 + }, + { + "epoch": 0.020147978735714334, + "grad_norm": 0.7480340530756074, + "learning_rate": 6.715063520871143e-05, + "loss": 12.3733, + "step": 370 + }, + { + "epoch": 0.020202432732297347, + "grad_norm": 0.8429189686136943, + "learning_rate": 6.733212341197823e-05, + "loss": 12.3659, + "step": 371 + }, + { + "epoch": 0.020256886728880357, + "grad_norm": 0.927639442740603, + "learning_rate": 6.751361161524501e-05, + "loss": 12.287, + "step": 372 + }, + { + "epoch": 0.02031134072546337, + "grad_norm": 0.7857422926853126, + "learning_rate": 6.76950998185118e-05, + "loss": 12.412, + "step": 373 + }, + { + "epoch": 0.02036579472204638, + "grad_norm": 0.806848985078179, + "learning_rate": 6.787658802177859e-05, + "loss": 12.3987, + "step": 374 + }, + { + "epoch": 0.020420248718629394, + "grad_norm": 0.9004884915515702, + "learning_rate": 6.805807622504538e-05, + "loss": 12.3248, + "step": 375 + }, + { + "epoch": 0.020474702715212404, + "grad_norm": 0.8136907332825543, + "learning_rate": 6.823956442831216e-05, + "loss": 12.5514, + "step": 376 + }, + { + "epoch": 0.020529156711795418, + "grad_norm": 0.7947680138594297, + "learning_rate": 6.842105263157895e-05, + "loss": 12.3048, + "step": 377 + }, + { + "epoch": 0.020583610708378428, + "grad_norm": 0.787939095178154, + "learning_rate": 6.860254083484574e-05, + "loss": 12.5616, + "step": 378 + }, + { + "epoch": 0.02063806470496144, + "grad_norm": 0.8053644554897705, + "learning_rate": 6.878402903811253e-05, + "loss": 12.342, + "step": 379 + }, + { + "epoch": 0.02069251870154445, + "grad_norm": 0.7333377537038525, + "learning_rate": 6.896551724137931e-05, + "loss": 12.346, + "step": 380 + }, + { + "epoch": 0.020746972698127465, + "grad_norm": 0.8115592545530859, + "learning_rate": 6.914700544464611e-05, + "loss": 12.5671, + "step": 381 + }, + { + "epoch": 0.020801426694710475, + "grad_norm": 0.9025245777577485, + "learning_rate": 6.932849364791288e-05, + "loss": 12.2945, + "step": 382 + }, + { + "epoch": 0.020855880691293488, + "grad_norm": 0.7272441731724826, + "learning_rate": 6.950998185117967e-05, + "loss": 12.2068, + "step": 383 + }, + { + "epoch": 0.020910334687876498, + "grad_norm": 0.8508505713825312, + "learning_rate": 6.969147005444646e-05, + "loss": 12.4984, + "step": 384 + }, + { + "epoch": 0.02096478868445951, + "grad_norm": 0.7415051803154171, + "learning_rate": 6.987295825771325e-05, + "loss": 12.3752, + "step": 385 + }, + { + "epoch": 0.02101924268104252, + "grad_norm": 0.759163767114583, + "learning_rate": 7.005444646098003e-05, + "loss": 12.3624, + "step": 386 + }, + { + "epoch": 0.021073696677625535, + "grad_norm": 0.8489341011156437, + "learning_rate": 7.023593466424683e-05, + "loss": 12.3969, + "step": 387 + }, + { + "epoch": 0.021128150674208545, + "grad_norm": 0.8462665677785168, + "learning_rate": 7.041742286751362e-05, + "loss": 12.2932, + "step": 388 + }, + { + "epoch": 0.02118260467079156, + "grad_norm": 0.75552639338092, + "learning_rate": 7.05989110707804e-05, + "loss": 12.3267, + "step": 389 + }, + { + "epoch": 0.02123705866737457, + "grad_norm": 0.8161265444898805, + "learning_rate": 7.07803992740472e-05, + "loss": 12.5891, + "step": 390 + }, + { + "epoch": 0.021291512663957582, + "grad_norm": 0.8856078253587745, + "learning_rate": 7.096188747731398e-05, + "loss": 12.2956, + "step": 391 + }, + { + "epoch": 0.021345966660540592, + "grad_norm": 0.8152626351928155, + "learning_rate": 7.114337568058077e-05, + "loss": 12.3987, + "step": 392 + }, + { + "epoch": 0.021400420657123605, + "grad_norm": 0.8211638500815782, + "learning_rate": 7.132486388384755e-05, + "loss": 12.3112, + "step": 393 + }, + { + "epoch": 0.021454874653706615, + "grad_norm": 0.8404912916628079, + "learning_rate": 7.150635208711434e-05, + "loss": 12.2929, + "step": 394 + }, + { + "epoch": 0.02150932865028963, + "grad_norm": 0.7378950463318673, + "learning_rate": 7.168784029038112e-05, + "loss": 12.4709, + "step": 395 + }, + { + "epoch": 0.02156378264687264, + "grad_norm": 0.7621638941069262, + "learning_rate": 7.186932849364792e-05, + "loss": 12.4729, + "step": 396 + }, + { + "epoch": 0.021618236643455652, + "grad_norm": 0.8226698918779496, + "learning_rate": 7.20508166969147e-05, + "loss": 12.4705, + "step": 397 + }, + { + "epoch": 0.021672690640038662, + "grad_norm": 0.7052288911047748, + "learning_rate": 7.223230490018149e-05, + "loss": 12.4461, + "step": 398 + }, + { + "epoch": 0.021727144636621672, + "grad_norm": 0.7609734401203072, + "learning_rate": 7.241379310344828e-05, + "loss": 12.3506, + "step": 399 + }, + { + "epoch": 0.021781598633204686, + "grad_norm": 0.770636318894737, + "learning_rate": 7.259528130671507e-05, + "loss": 12.248, + "step": 400 + }, + { + "epoch": 0.021836052629787696, + "grad_norm": 0.7161390931014325, + "learning_rate": 7.277676950998185e-05, + "loss": 12.2974, + "step": 401 + }, + { + "epoch": 0.02189050662637071, + "grad_norm": 0.7093602343926348, + "learning_rate": 7.295825771324865e-05, + "loss": 12.3475, + "step": 402 + }, + { + "epoch": 0.02194496062295372, + "grad_norm": 0.7846838901586377, + "learning_rate": 7.313974591651544e-05, + "loss": 12.4812, + "step": 403 + }, + { + "epoch": 0.021999414619536733, + "grad_norm": 0.7610570252567146, + "learning_rate": 7.332123411978222e-05, + "loss": 12.3179, + "step": 404 + }, + { + "epoch": 0.022053868616119743, + "grad_norm": 0.7678646935604143, + "learning_rate": 7.3502722323049e-05, + "loss": 12.4395, + "step": 405 + }, + { + "epoch": 0.022108322612702756, + "grad_norm": 0.788227147067854, + "learning_rate": 7.368421052631579e-05, + "loss": 12.3643, + "step": 406 + }, + { + "epoch": 0.022162776609285766, + "grad_norm": 0.8555008491336252, + "learning_rate": 7.386569872958257e-05, + "loss": 12.5442, + "step": 407 + }, + { + "epoch": 0.02221723060586878, + "grad_norm": 0.7848374239784748, + "learning_rate": 7.404718693284937e-05, + "loss": 12.4897, + "step": 408 + }, + { + "epoch": 0.02227168460245179, + "grad_norm": 0.7986911623537468, + "learning_rate": 7.422867513611616e-05, + "loss": 12.4207, + "step": 409 + }, + { + "epoch": 0.022326138599034803, + "grad_norm": 0.8578821767689867, + "learning_rate": 7.441016333938294e-05, + "loss": 12.406, + "step": 410 + }, + { + "epoch": 0.022380592595617813, + "grad_norm": 0.8579401771922557, + "learning_rate": 7.459165154264974e-05, + "loss": 12.4753, + "step": 411 + }, + { + "epoch": 0.022435046592200827, + "grad_norm": 0.7942776463738789, + "learning_rate": 7.477313974591652e-05, + "loss": 12.3652, + "step": 412 + }, + { + "epoch": 0.022489500588783837, + "grad_norm": 0.7606530813783086, + "learning_rate": 7.495462794918331e-05, + "loss": 12.3301, + "step": 413 + }, + { + "epoch": 0.02254395458536685, + "grad_norm": 0.7820187282722689, + "learning_rate": 7.513611615245009e-05, + "loss": 12.2943, + "step": 414 + }, + { + "epoch": 0.02259840858194986, + "grad_norm": 0.8133279352459926, + "learning_rate": 7.531760435571689e-05, + "loss": 12.3255, + "step": 415 + }, + { + "epoch": 0.022652862578532874, + "grad_norm": 0.7202675473587253, + "learning_rate": 7.549909255898367e-05, + "loss": 12.4437, + "step": 416 + }, + { + "epoch": 0.022707316575115884, + "grad_norm": 0.7250526735890024, + "learning_rate": 7.568058076225046e-05, + "loss": 12.4672, + "step": 417 + }, + { + "epoch": 0.022761770571698897, + "grad_norm": 0.7829881797724314, + "learning_rate": 7.586206896551724e-05, + "loss": 12.4441, + "step": 418 + }, + { + "epoch": 0.022816224568281907, + "grad_norm": 0.8157466754769012, + "learning_rate": 7.604355716878403e-05, + "loss": 12.3637, + "step": 419 + }, + { + "epoch": 0.02287067856486492, + "grad_norm": 0.6878509475121516, + "learning_rate": 7.622504537205081e-05, + "loss": 12.265, + "step": 420 + }, + { + "epoch": 0.02292513256144793, + "grad_norm": 0.8133681369767054, + "learning_rate": 7.640653357531761e-05, + "loss": 12.4073, + "step": 421 + }, + { + "epoch": 0.022979586558030944, + "grad_norm": 0.7970282881764229, + "learning_rate": 7.65880217785844e-05, + "loss": 12.4782, + "step": 422 + }, + { + "epoch": 0.023034040554613954, + "grad_norm": 0.7306816228265234, + "learning_rate": 7.676950998185118e-05, + "loss": 12.2968, + "step": 423 + }, + { + "epoch": 0.023088494551196968, + "grad_norm": 0.6634878489767484, + "learning_rate": 7.695099818511798e-05, + "loss": 12.43, + "step": 424 + }, + { + "epoch": 0.023142948547779978, + "grad_norm": 0.7106521845447317, + "learning_rate": 7.713248638838476e-05, + "loss": 12.2943, + "step": 425 + }, + { + "epoch": 0.02319740254436299, + "grad_norm": 0.6994237071240959, + "learning_rate": 7.731397459165155e-05, + "loss": 12.3155, + "step": 426 + }, + { + "epoch": 0.023251856540946, + "grad_norm": 0.8721037875029398, + "learning_rate": 7.749546279491834e-05, + "loss": 12.3339, + "step": 427 + }, + { + "epoch": 0.023306310537529015, + "grad_norm": 0.7274699367575759, + "learning_rate": 7.767695099818511e-05, + "loss": 12.4143, + "step": 428 + }, + { + "epoch": 0.023360764534112025, + "grad_norm": 0.7122374265862785, + "learning_rate": 7.78584392014519e-05, + "loss": 12.1451, + "step": 429 + }, + { + "epoch": 0.023415218530695038, + "grad_norm": 0.8254803830668477, + "learning_rate": 7.80399274047187e-05, + "loss": 12.5343, + "step": 430 + }, + { + "epoch": 0.023469672527278048, + "grad_norm": 0.7387526625998517, + "learning_rate": 7.822141560798548e-05, + "loss": 12.351, + "step": 431 + }, + { + "epoch": 0.02352412652386106, + "grad_norm": 0.7816935354004675, + "learning_rate": 7.840290381125227e-05, + "loss": 12.3752, + "step": 432 + }, + { + "epoch": 0.02357858052044407, + "grad_norm": 0.8477449845456101, + "learning_rate": 7.858439201451906e-05, + "loss": 12.4905, + "step": 433 + }, + { + "epoch": 0.023633034517027085, + "grad_norm": 0.7892505674089381, + "learning_rate": 7.876588021778585e-05, + "loss": 12.4304, + "step": 434 + }, + { + "epoch": 0.023687488513610095, + "grad_norm": 0.7935738185972664, + "learning_rate": 7.894736842105263e-05, + "loss": 12.295, + "step": 435 + }, + { + "epoch": 0.02374194251019311, + "grad_norm": 0.7129426510545228, + "learning_rate": 7.912885662431943e-05, + "loss": 12.4046, + "step": 436 + }, + { + "epoch": 0.02379639650677612, + "grad_norm": 0.836938154929551, + "learning_rate": 7.931034482758621e-05, + "loss": 12.3877, + "step": 437 + }, + { + "epoch": 0.023850850503359132, + "grad_norm": 1.0221663146021225, + "learning_rate": 7.9491833030853e-05, + "loss": 12.3748, + "step": 438 + }, + { + "epoch": 0.023905304499942142, + "grad_norm": 0.9267503989787906, + "learning_rate": 7.96733212341198e-05, + "loss": 12.4954, + "step": 439 + }, + { + "epoch": 0.023959758496525155, + "grad_norm": 0.8434026048076346, + "learning_rate": 7.985480943738657e-05, + "loss": 12.4728, + "step": 440 + }, + { + "epoch": 0.024014212493108165, + "grad_norm": 0.7878062858231188, + "learning_rate": 8.003629764065335e-05, + "loss": 12.3983, + "step": 441 + }, + { + "epoch": 0.02406866648969118, + "grad_norm": 0.710973960203588, + "learning_rate": 8.021778584392015e-05, + "loss": 12.3385, + "step": 442 + }, + { + "epoch": 0.02412312048627419, + "grad_norm": 0.7938168455755067, + "learning_rate": 8.039927404718693e-05, + "loss": 12.3286, + "step": 443 + }, + { + "epoch": 0.024177574482857202, + "grad_norm": 0.7947193309206884, + "learning_rate": 8.058076225045372e-05, + "loss": 12.5106, + "step": 444 + }, + { + "epoch": 0.024232028479440212, + "grad_norm": 0.7584161983962521, + "learning_rate": 8.076225045372052e-05, + "loss": 12.3826, + "step": 445 + }, + { + "epoch": 0.024286482476023226, + "grad_norm": 0.8202312029520822, + "learning_rate": 8.09437386569873e-05, + "loss": 12.345, + "step": 446 + }, + { + "epoch": 0.024340936472606236, + "grad_norm": 0.7645543641740715, + "learning_rate": 8.112522686025409e-05, + "loss": 12.4588, + "step": 447 + }, + { + "epoch": 0.02439539046918925, + "grad_norm": 0.7730517169532368, + "learning_rate": 8.130671506352088e-05, + "loss": 12.421, + "step": 448 + }, + { + "epoch": 0.02444984446577226, + "grad_norm": 0.814708665964133, + "learning_rate": 8.148820326678767e-05, + "loss": 12.3183, + "step": 449 + }, + { + "epoch": 0.024504298462355273, + "grad_norm": 0.7711615040975552, + "learning_rate": 8.166969147005445e-05, + "loss": 12.4889, + "step": 450 + }, + { + "epoch": 0.024558752458938283, + "grad_norm": 0.7528070171512011, + "learning_rate": 8.185117967332124e-05, + "loss": 12.4343, + "step": 451 + }, + { + "epoch": 0.024613206455521296, + "grad_norm": 0.8249958258020976, + "learning_rate": 8.203266787658802e-05, + "loss": 12.4099, + "step": 452 + }, + { + "epoch": 0.024667660452104306, + "grad_norm": 0.8813431530918817, + "learning_rate": 8.22141560798548e-05, + "loss": 12.4739, + "step": 453 + }, + { + "epoch": 0.02472211444868732, + "grad_norm": 0.9494001566035871, + "learning_rate": 8.23956442831216e-05, + "loss": 12.4154, + "step": 454 + }, + { + "epoch": 0.02477656844527033, + "grad_norm": 0.7154527838559613, + "learning_rate": 8.257713248638839e-05, + "loss": 12.3046, + "step": 455 + }, + { + "epoch": 0.024831022441853343, + "grad_norm": 0.8426756172886806, + "learning_rate": 8.275862068965517e-05, + "loss": 12.3878, + "step": 456 + }, + { + "epoch": 0.024885476438436353, + "grad_norm": 0.7868340051489234, + "learning_rate": 8.294010889292196e-05, + "loss": 12.4081, + "step": 457 + }, + { + "epoch": 0.024939930435019367, + "grad_norm": 0.7614062874800365, + "learning_rate": 8.312159709618876e-05, + "loss": 12.3421, + "step": 458 + }, + { + "epoch": 0.024994384431602377, + "grad_norm": 0.7831978878652606, + "learning_rate": 8.330308529945554e-05, + "loss": 12.4271, + "step": 459 + }, + { + "epoch": 0.02504883842818539, + "grad_norm": 0.7742389342308593, + "learning_rate": 8.348457350272232e-05, + "loss": 12.6071, + "step": 460 + }, + { + "epoch": 0.0251032924247684, + "grad_norm": 0.7667575505932676, + "learning_rate": 8.366606170598912e-05, + "loss": 12.4484, + "step": 461 + }, + { + "epoch": 0.025157746421351414, + "grad_norm": 0.7949954459613233, + "learning_rate": 8.38475499092559e-05, + "loss": 12.3982, + "step": 462 + }, + { + "epoch": 0.025212200417934424, + "grad_norm": 0.8839761700709547, + "learning_rate": 8.402903811252269e-05, + "loss": 12.3902, + "step": 463 + }, + { + "epoch": 0.025266654414517434, + "grad_norm": 0.7436720063586588, + "learning_rate": 8.421052631578948e-05, + "loss": 12.3808, + "step": 464 + }, + { + "epoch": 0.025321108411100447, + "grad_norm": 0.8086165049759212, + "learning_rate": 8.439201451905626e-05, + "loss": 12.3742, + "step": 465 + }, + { + "epoch": 0.025375562407683457, + "grad_norm": 0.8217729326448585, + "learning_rate": 8.457350272232304e-05, + "loss": 12.3675, + "step": 466 + }, + { + "epoch": 0.02543001640426647, + "grad_norm": 0.8804919476658869, + "learning_rate": 8.475499092558984e-05, + "loss": 12.4588, + "step": 467 + }, + { + "epoch": 0.02548447040084948, + "grad_norm": 0.8021891332568722, + "learning_rate": 8.493647912885663e-05, + "loss": 12.443, + "step": 468 + }, + { + "epoch": 0.025538924397432494, + "grad_norm": 0.7325722424197364, + "learning_rate": 8.511796733212341e-05, + "loss": 12.4545, + "step": 469 + }, + { + "epoch": 0.025593378394015504, + "grad_norm": 0.8306668524452858, + "learning_rate": 8.529945553539021e-05, + "loss": 12.5469, + "step": 470 + }, + { + "epoch": 0.025647832390598518, + "grad_norm": 0.7943932754142018, + "learning_rate": 8.5480943738657e-05, + "loss": 12.2246, + "step": 471 + }, + { + "epoch": 0.025702286387181528, + "grad_norm": 0.7407462425158048, + "learning_rate": 8.566243194192378e-05, + "loss": 12.3846, + "step": 472 + }, + { + "epoch": 0.02575674038376454, + "grad_norm": 0.8711005010892444, + "learning_rate": 8.584392014519058e-05, + "loss": 12.5484, + "step": 473 + }, + { + "epoch": 0.02581119438034755, + "grad_norm": 0.7937737358003336, + "learning_rate": 8.602540834845736e-05, + "loss": 12.3675, + "step": 474 + }, + { + "epoch": 0.025865648376930565, + "grad_norm": 0.739494251610042, + "learning_rate": 8.620689655172413e-05, + "loss": 12.4181, + "step": 475 + }, + { + "epoch": 0.025920102373513575, + "grad_norm": 0.7494105764676788, + "learning_rate": 8.638838475499093e-05, + "loss": 12.4294, + "step": 476 + }, + { + "epoch": 0.025974556370096588, + "grad_norm": 0.8112658612116864, + "learning_rate": 8.656987295825771e-05, + "loss": 12.3863, + "step": 477 + }, + { + "epoch": 0.026029010366679598, + "grad_norm": 0.7951356603711836, + "learning_rate": 8.67513611615245e-05, + "loss": 12.509, + "step": 478 + }, + { + "epoch": 0.02608346436326261, + "grad_norm": 0.8050809097921263, + "learning_rate": 8.69328493647913e-05, + "loss": 12.3642, + "step": 479 + }, + { + "epoch": 0.02613791835984562, + "grad_norm": 0.8245382505558528, + "learning_rate": 8.711433756805808e-05, + "loss": 12.3835, + "step": 480 + }, + { + "epoch": 0.026192372356428635, + "grad_norm": 0.742494597149955, + "learning_rate": 8.729582577132486e-05, + "loss": 12.4518, + "step": 481 + }, + { + "epoch": 0.026246826353011645, + "grad_norm": 0.9027867580588818, + "learning_rate": 8.747731397459166e-05, + "loss": 12.3726, + "step": 482 + }, + { + "epoch": 0.02630128034959466, + "grad_norm": 0.7595129443596266, + "learning_rate": 8.765880217785845e-05, + "loss": 12.6085, + "step": 483 + }, + { + "epoch": 0.02635573434617767, + "grad_norm": 0.8600852902167964, + "learning_rate": 8.784029038112523e-05, + "loss": 12.5357, + "step": 484 + }, + { + "epoch": 0.026410188342760682, + "grad_norm": 0.9430561851745869, + "learning_rate": 8.802177858439202e-05, + "loss": 12.5052, + "step": 485 + }, + { + "epoch": 0.026464642339343692, + "grad_norm": 0.7608697999488, + "learning_rate": 8.820326678765881e-05, + "loss": 12.3988, + "step": 486 + }, + { + "epoch": 0.026519096335926706, + "grad_norm": 0.9102688809761854, + "learning_rate": 8.838475499092559e-05, + "loss": 12.4724, + "step": 487 + }, + { + "epoch": 0.026573550332509716, + "grad_norm": 0.7791000410634494, + "learning_rate": 8.856624319419238e-05, + "loss": 12.5351, + "step": 488 + }, + { + "epoch": 0.02662800432909273, + "grad_norm": 0.7489844670021688, + "learning_rate": 8.874773139745917e-05, + "loss": 12.2126, + "step": 489 + }, + { + "epoch": 0.02668245832567574, + "grad_norm": 0.8085753500287401, + "learning_rate": 8.892921960072595e-05, + "loss": 12.5421, + "step": 490 + }, + { + "epoch": 0.026736912322258753, + "grad_norm": 0.9563889035257984, + "learning_rate": 8.911070780399275e-05, + "loss": 12.6741, + "step": 491 + }, + { + "epoch": 0.026791366318841763, + "grad_norm": 0.7063448574074058, + "learning_rate": 8.929219600725953e-05, + "loss": 12.2971, + "step": 492 + }, + { + "epoch": 0.026845820315424776, + "grad_norm": 0.7813369753167644, + "learning_rate": 8.947368421052632e-05, + "loss": 12.2513, + "step": 493 + }, + { + "epoch": 0.026900274312007786, + "grad_norm": 0.8629014027216114, + "learning_rate": 8.96551724137931e-05, + "loss": 12.4148, + "step": 494 + }, + { + "epoch": 0.0269547283085908, + "grad_norm": 0.7481663513315681, + "learning_rate": 8.98366606170599e-05, + "loss": 12.5463, + "step": 495 + }, + { + "epoch": 0.02700918230517381, + "grad_norm": 0.8758771305014487, + "learning_rate": 9.001814882032669e-05, + "loss": 12.4045, + "step": 496 + }, + { + "epoch": 0.027063636301756823, + "grad_norm": 0.7480406054883233, + "learning_rate": 9.019963702359347e-05, + "loss": 12.4557, + "step": 497 + }, + { + "epoch": 0.027118090298339833, + "grad_norm": 0.8152346713462907, + "learning_rate": 9.038112522686027e-05, + "loss": 12.3973, + "step": 498 + }, + { + "epoch": 0.027172544294922846, + "grad_norm": 0.8415389004370808, + "learning_rate": 9.056261343012704e-05, + "loss": 12.6063, + "step": 499 + }, + { + "epoch": 0.027226998291505856, + "grad_norm": 0.8011366657793219, + "learning_rate": 9.074410163339382e-05, + "loss": 12.4955, + "step": 500 + }, + { + "epoch": 0.02728145228808887, + "grad_norm": 0.7380805437115405, + "learning_rate": 9.092558983666062e-05, + "loss": 12.3469, + "step": 501 + }, + { + "epoch": 0.02733590628467188, + "grad_norm": 0.7133968415768672, + "learning_rate": 9.11070780399274e-05, + "loss": 12.4881, + "step": 502 + }, + { + "epoch": 0.027390360281254893, + "grad_norm": 0.8125970947519109, + "learning_rate": 9.128856624319419e-05, + "loss": 12.3151, + "step": 503 + }, + { + "epoch": 0.027444814277837903, + "grad_norm": 0.7077249733189649, + "learning_rate": 9.147005444646099e-05, + "loss": 12.3738, + "step": 504 + }, + { + "epoch": 0.027499268274420917, + "grad_norm": 0.7809924557333555, + "learning_rate": 9.165154264972777e-05, + "loss": 12.4415, + "step": 505 + }, + { + "epoch": 0.027553722271003927, + "grad_norm": 0.7543662314141218, + "learning_rate": 9.183303085299456e-05, + "loss": 12.425, + "step": 506 + }, + { + "epoch": 0.02760817626758694, + "grad_norm": 0.740968620199914, + "learning_rate": 9.201451905626135e-05, + "loss": 12.4001, + "step": 507 + }, + { + "epoch": 0.02766263026416995, + "grad_norm": 0.715948782602618, + "learning_rate": 9.219600725952814e-05, + "loss": 12.4193, + "step": 508 + }, + { + "epoch": 0.027717084260752964, + "grad_norm": 0.7555285381975604, + "learning_rate": 9.237749546279492e-05, + "loss": 12.3583, + "step": 509 + }, + { + "epoch": 0.027771538257335974, + "grad_norm": 0.7926894903352032, + "learning_rate": 9.255898366606171e-05, + "loss": 12.5034, + "step": 510 + }, + { + "epoch": 0.027825992253918987, + "grad_norm": 0.722277348946715, + "learning_rate": 9.274047186932849e-05, + "loss": 12.4026, + "step": 511 + }, + { + "epoch": 0.027880446250501997, + "grad_norm": 0.8485310235578939, + "learning_rate": 9.292196007259528e-05, + "loss": 12.4843, + "step": 512 + }, + { + "epoch": 0.02793490024708501, + "grad_norm": 0.8246964012075222, + "learning_rate": 9.310344827586207e-05, + "loss": 12.3967, + "step": 513 + }, + { + "epoch": 0.02798935424366802, + "grad_norm": 0.8152543979538756, + "learning_rate": 9.328493647912886e-05, + "loss": 12.26, + "step": 514 + }, + { + "epoch": 0.028043808240251034, + "grad_norm": 0.8555264952532589, + "learning_rate": 9.346642468239564e-05, + "loss": 12.3481, + "step": 515 + }, + { + "epoch": 0.028098262236834044, + "grad_norm": 0.7301430730163417, + "learning_rate": 9.364791288566244e-05, + "loss": 12.3859, + "step": 516 + }, + { + "epoch": 0.028152716233417058, + "grad_norm": 0.8502343380693529, + "learning_rate": 9.382940108892923e-05, + "loss": 12.3946, + "step": 517 + }, + { + "epoch": 0.028207170230000068, + "grad_norm": 0.7616908996355757, + "learning_rate": 9.401088929219601e-05, + "loss": 12.4984, + "step": 518 + }, + { + "epoch": 0.02826162422658308, + "grad_norm": 0.848187178061048, + "learning_rate": 9.419237749546281e-05, + "loss": 12.5009, + "step": 519 + }, + { + "epoch": 0.02831607822316609, + "grad_norm": 0.7605162192554722, + "learning_rate": 9.437386569872959e-05, + "loss": 12.3757, + "step": 520 + }, + { + "epoch": 0.028370532219749105, + "grad_norm": 0.6941160676897854, + "learning_rate": 9.455535390199638e-05, + "loss": 12.497, + "step": 521 + }, + { + "epoch": 0.028424986216332115, + "grad_norm": 0.7856867296190392, + "learning_rate": 9.473684210526316e-05, + "loss": 12.4029, + "step": 522 + }, + { + "epoch": 0.028479440212915128, + "grad_norm": 0.7048400049802264, + "learning_rate": 9.491833030852995e-05, + "loss": 12.4511, + "step": 523 + }, + { + "epoch": 0.028533894209498138, + "grad_norm": 0.8013294724513513, + "learning_rate": 9.509981851179673e-05, + "loss": 12.4689, + "step": 524 + }, + { + "epoch": 0.02858834820608115, + "grad_norm": 0.7878969559755142, + "learning_rate": 9.528130671506353e-05, + "loss": 12.5229, + "step": 525 + }, + { + "epoch": 0.02864280220266416, + "grad_norm": 0.7987625743494264, + "learning_rate": 9.546279491833031e-05, + "loss": 12.3626, + "step": 526 + }, + { + "epoch": 0.028697256199247175, + "grad_norm": 0.8079525903340861, + "learning_rate": 9.56442831215971e-05, + "loss": 12.4762, + "step": 527 + }, + { + "epoch": 0.028751710195830185, + "grad_norm": 0.6640448779297942, + "learning_rate": 9.58257713248639e-05, + "loss": 12.3718, + "step": 528 + }, + { + "epoch": 0.0288061641924132, + "grad_norm": 0.8399913617521838, + "learning_rate": 9.600725952813068e-05, + "loss": 12.342, + "step": 529 + }, + { + "epoch": 0.02886061818899621, + "grad_norm": 0.8352947312719899, + "learning_rate": 9.618874773139746e-05, + "loss": 12.4117, + "step": 530 + }, + { + "epoch": 0.02891507218557922, + "grad_norm": 0.7357612574343283, + "learning_rate": 9.637023593466425e-05, + "loss": 12.3374, + "step": 531 + }, + { + "epoch": 0.028969526182162232, + "grad_norm": 0.8621291940802033, + "learning_rate": 9.655172413793105e-05, + "loss": 12.4102, + "step": 532 + }, + { + "epoch": 0.029023980178745242, + "grad_norm": 0.7496563648684155, + "learning_rate": 9.673321234119783e-05, + "loss": 12.553, + "step": 533 + }, + { + "epoch": 0.029078434175328256, + "grad_norm": 0.7178040502697846, + "learning_rate": 9.691470054446462e-05, + "loss": 12.3036, + "step": 534 + }, + { + "epoch": 0.029132888171911266, + "grad_norm": 0.7748952398348568, + "learning_rate": 9.70961887477314e-05, + "loss": 12.493, + "step": 535 + }, + { + "epoch": 0.02918734216849428, + "grad_norm": 0.7713180455385662, + "learning_rate": 9.727767695099818e-05, + "loss": 12.4102, + "step": 536 + }, + { + "epoch": 0.02924179616507729, + "grad_norm": 0.7740687521356002, + "learning_rate": 9.745916515426497e-05, + "loss": 12.5748, + "step": 537 + }, + { + "epoch": 0.029296250161660303, + "grad_norm": 0.7865595720640453, + "learning_rate": 9.764065335753177e-05, + "loss": 12.4367, + "step": 538 + }, + { + "epoch": 0.029350704158243313, + "grad_norm": 0.7560847556377283, + "learning_rate": 9.782214156079855e-05, + "loss": 12.3583, + "step": 539 + }, + { + "epoch": 0.029405158154826326, + "grad_norm": 0.7978359099601086, + "learning_rate": 9.800362976406534e-05, + "loss": 12.3999, + "step": 540 + }, + { + "epoch": 0.029459612151409336, + "grad_norm": 0.7535509446044629, + "learning_rate": 9.818511796733213e-05, + "loss": 12.3052, + "step": 541 + }, + { + "epoch": 0.02951406614799235, + "grad_norm": 0.7775521738235894, + "learning_rate": 9.836660617059892e-05, + "loss": 12.4467, + "step": 542 + }, + { + "epoch": 0.02956852014457536, + "grad_norm": 0.7469699238521446, + "learning_rate": 9.85480943738657e-05, + "loss": 12.456, + "step": 543 + }, + { + "epoch": 0.029622974141158373, + "grad_norm": 1.0563298749968926, + "learning_rate": 9.87295825771325e-05, + "loss": 12.4247, + "step": 544 + }, + { + "epoch": 0.029677428137741383, + "grad_norm": 0.8361142060905258, + "learning_rate": 9.891107078039928e-05, + "loss": 12.5043, + "step": 545 + }, + { + "epoch": 0.029731882134324396, + "grad_norm": 1.0096705276772675, + "learning_rate": 9.909255898366606e-05, + "loss": 12.3808, + "step": 546 + }, + { + "epoch": 0.029786336130907407, + "grad_norm": 0.7864328089464631, + "learning_rate": 9.927404718693285e-05, + "loss": 12.538, + "step": 547 + }, + { + "epoch": 0.02984079012749042, + "grad_norm": 0.8451585533599, + "learning_rate": 9.945553539019964e-05, + "loss": 12.5222, + "step": 548 + }, + { + "epoch": 0.02989524412407343, + "grad_norm": 0.8653338116417337, + "learning_rate": 9.963702359346642e-05, + "loss": 12.4985, + "step": 549 + }, + { + "epoch": 0.029949698120656443, + "grad_norm": 0.7662298878370504, + "learning_rate": 9.981851179673322e-05, + "loss": 12.4266, + "step": 550 + }, + { + "epoch": 0.030004152117239453, + "grad_norm": 0.7970444594455892, + "learning_rate": 0.0001, + "loss": 12.3258, + "step": 551 + }, + { + "epoch": 0.030058606113822467, + "grad_norm": 0.7349918099412, + "learning_rate": 0.00010018148820326678, + "loss": 12.4242, + "step": 552 + }, + { + "epoch": 0.030113060110405477, + "grad_norm": 0.854292237492196, + "learning_rate": 0.00010036297640653359, + "loss": 12.3636, + "step": 553 + }, + { + "epoch": 0.03016751410698849, + "grad_norm": 0.7395022781358609, + "learning_rate": 0.00010054446460980036, + "loss": 12.5779, + "step": 554 + }, + { + "epoch": 0.0302219681035715, + "grad_norm": 0.768148807115022, + "learning_rate": 0.00010072595281306716, + "loss": 12.5132, + "step": 555 + }, + { + "epoch": 0.030276422100154514, + "grad_norm": 0.8159993812941795, + "learning_rate": 0.00010090744101633394, + "loss": 12.4995, + "step": 556 + }, + { + "epoch": 0.030330876096737524, + "grad_norm": 0.8351323851458299, + "learning_rate": 0.00010108892921960074, + "loss": 12.5193, + "step": 557 + }, + { + "epoch": 0.030385330093320537, + "grad_norm": 0.8416946553162289, + "learning_rate": 0.00010127041742286751, + "loss": 12.5428, + "step": 558 + }, + { + "epoch": 0.030439784089903547, + "grad_norm": 0.9752655054331537, + "learning_rate": 0.00010145190562613431, + "loss": 12.6036, + "step": 559 + }, + { + "epoch": 0.03049423808648656, + "grad_norm": 0.7197724811301942, + "learning_rate": 0.00010163339382940109, + "loss": 12.4491, + "step": 560 + }, + { + "epoch": 0.03054869208306957, + "grad_norm": 0.7503723774413532, + "learning_rate": 0.00010181488203266789, + "loss": 12.3547, + "step": 561 + }, + { + "epoch": 0.030603146079652584, + "grad_norm": 0.8452417124749888, + "learning_rate": 0.00010199637023593467, + "loss": 12.5108, + "step": 562 + }, + { + "epoch": 0.030657600076235594, + "grad_norm": 1.0895902134284718, + "learning_rate": 0.00010217785843920144, + "loss": 12.5596, + "step": 563 + }, + { + "epoch": 0.030712054072818608, + "grad_norm": 0.7045960377212881, + "learning_rate": 0.00010235934664246824, + "loss": 12.5114, + "step": 564 + }, + { + "epoch": 0.030766508069401618, + "grad_norm": 0.7817959879503318, + "learning_rate": 0.00010254083484573503, + "loss": 12.4439, + "step": 565 + }, + { + "epoch": 0.03082096206598463, + "grad_norm": 0.7924652130755838, + "learning_rate": 0.00010272232304900183, + "loss": 12.5138, + "step": 566 + }, + { + "epoch": 0.03087541606256764, + "grad_norm": 0.8559510622634047, + "learning_rate": 0.0001029038112522686, + "loss": 12.4195, + "step": 567 + }, + { + "epoch": 0.030929870059150655, + "grad_norm": 0.7675767381139006, + "learning_rate": 0.0001030852994555354, + "loss": 12.4978, + "step": 568 + }, + { + "epoch": 0.030984324055733665, + "grad_norm": 0.7741050473466831, + "learning_rate": 0.00010326678765880218, + "loss": 12.2737, + "step": 569 + }, + { + "epoch": 0.03103877805231668, + "grad_norm": 0.7415387084328533, + "learning_rate": 0.00010344827586206898, + "loss": 12.4745, + "step": 570 + }, + { + "epoch": 0.03109323204889969, + "grad_norm": 0.7841681248843312, + "learning_rate": 0.00010362976406533576, + "loss": 12.4871, + "step": 571 + }, + { + "epoch": 0.031147686045482702, + "grad_norm": 0.7936691292245915, + "learning_rate": 0.00010381125226860256, + "loss": 12.652, + "step": 572 + }, + { + "epoch": 0.031202140042065712, + "grad_norm": 0.8113102552857533, + "learning_rate": 0.00010399274047186933, + "loss": 12.5519, + "step": 573 + }, + { + "epoch": 0.03125659403864872, + "grad_norm": 0.7347337505529872, + "learning_rate": 0.00010417422867513613, + "loss": 12.3806, + "step": 574 + }, + { + "epoch": 0.031311048035231735, + "grad_norm": 0.746487006322028, + "learning_rate": 0.00010435571687840291, + "loss": 12.4832, + "step": 575 + }, + { + "epoch": 0.03136550203181475, + "grad_norm": 0.8489077584446184, + "learning_rate": 0.00010453720508166968, + "loss": 12.2913, + "step": 576 + }, + { + "epoch": 0.03141995602839776, + "grad_norm": 0.7185282981457465, + "learning_rate": 0.00010471869328493648, + "loss": 12.3301, + "step": 577 + }, + { + "epoch": 0.03147441002498077, + "grad_norm": 0.8659215057338194, + "learning_rate": 0.00010490018148820327, + "loss": 12.601, + "step": 578 + }, + { + "epoch": 0.03152886402156378, + "grad_norm": 0.76054791141889, + "learning_rate": 0.00010508166969147006, + "loss": 12.3515, + "step": 579 + }, + { + "epoch": 0.031583318018146796, + "grad_norm": 0.8109048061455886, + "learning_rate": 0.00010526315789473685, + "loss": 12.5157, + "step": 580 + }, + { + "epoch": 0.03163777201472981, + "grad_norm": 0.7397461059924253, + "learning_rate": 0.00010544464609800365, + "loss": 12.4316, + "step": 581 + }, + { + "epoch": 0.031692226011312816, + "grad_norm": 0.7946491784910253, + "learning_rate": 0.00010562613430127042, + "loss": 12.3995, + "step": 582 + }, + { + "epoch": 0.03174668000789583, + "grad_norm": 0.8084263112882792, + "learning_rate": 0.00010580762250453721, + "loss": 12.4758, + "step": 583 + }, + { + "epoch": 0.03180113400447884, + "grad_norm": 0.7806289943327588, + "learning_rate": 0.000105989110707804, + "loss": 12.3552, + "step": 584 + }, + { + "epoch": 0.031855588001061856, + "grad_norm": 0.782911514302089, + "learning_rate": 0.0001061705989110708, + "loss": 12.5056, + "step": 585 + }, + { + "epoch": 0.03191004199764486, + "grad_norm": 0.8014304453370548, + "learning_rate": 0.00010635208711433757, + "loss": 12.5809, + "step": 586 + }, + { + "epoch": 0.031964495994227876, + "grad_norm": 0.8775607468840907, + "learning_rate": 0.00010653357531760435, + "loss": 12.5721, + "step": 587 + }, + { + "epoch": 0.03201894999081089, + "grad_norm": 0.7172825755544577, + "learning_rate": 0.00010671506352087115, + "loss": 12.4076, + "step": 588 + }, + { + "epoch": 0.0320734039873939, + "grad_norm": 0.7180909578238679, + "learning_rate": 0.00010689655172413792, + "loss": 12.4168, + "step": 589 + }, + { + "epoch": 0.03212785798397691, + "grad_norm": 0.8139633699732863, + "learning_rate": 0.00010707803992740473, + "loss": 12.4554, + "step": 590 + }, + { + "epoch": 0.03218231198055992, + "grad_norm": 0.8201333378367557, + "learning_rate": 0.0001072595281306715, + "loss": 12.5802, + "step": 591 + }, + { + "epoch": 0.03223676597714294, + "grad_norm": 0.7468982443082229, + "learning_rate": 0.0001074410163339383, + "loss": 12.4596, + "step": 592 + }, + { + "epoch": 0.03229121997372595, + "grad_norm": 0.8776024210025977, + "learning_rate": 0.00010762250453720509, + "loss": 12.4242, + "step": 593 + }, + { + "epoch": 0.03234567397030896, + "grad_norm": 0.7475469170008233, + "learning_rate": 0.00010780399274047188, + "loss": 12.2832, + "step": 594 + }, + { + "epoch": 0.03240012796689197, + "grad_norm": 0.7498393845005602, + "learning_rate": 0.00010798548094373865, + "loss": 12.4668, + "step": 595 + }, + { + "epoch": 0.032454581963474984, + "grad_norm": 0.8085189966961024, + "learning_rate": 0.00010816696914700545, + "loss": 12.5477, + "step": 596 + }, + { + "epoch": 0.03250903596005799, + "grad_norm": 0.781876240427016, + "learning_rate": 0.00010834845735027224, + "loss": 12.3383, + "step": 597 + }, + { + "epoch": 0.032563489956641004, + "grad_norm": 0.7894170574953352, + "learning_rate": 0.00010852994555353901, + "loss": 12.505, + "step": 598 + }, + { + "epoch": 0.03261794395322402, + "grad_norm": 0.7576096050152189, + "learning_rate": 0.00010871143375680582, + "loss": 12.3167, + "step": 599 + }, + { + "epoch": 0.03267239794980703, + "grad_norm": 0.8250448715677351, + "learning_rate": 0.00010889292196007259, + "loss": 12.4931, + "step": 600 + }, + { + "epoch": 0.03272685194639004, + "grad_norm": 0.7191331124925939, + "learning_rate": 0.00010907441016333939, + "loss": 12.343, + "step": 601 + }, + { + "epoch": 0.03278130594297305, + "grad_norm": 0.7096756844673696, + "learning_rate": 0.00010925589836660617, + "loss": 12.3706, + "step": 602 + }, + { + "epoch": 0.032835759939556064, + "grad_norm": 0.8855413287507164, + "learning_rate": 0.00010943738656987297, + "loss": 12.5008, + "step": 603 + }, + { + "epoch": 0.03289021393613908, + "grad_norm": 0.702816878198603, + "learning_rate": 0.00010961887477313974, + "loss": 12.4181, + "step": 604 + }, + { + "epoch": 0.032944667932722084, + "grad_norm": 0.7511863541614481, + "learning_rate": 0.00010980036297640654, + "loss": 12.4619, + "step": 605 + }, + { + "epoch": 0.0329991219293051, + "grad_norm": 0.7440166154853498, + "learning_rate": 0.00010998185117967332, + "loss": 12.5371, + "step": 606 + }, + { + "epoch": 0.03305357592588811, + "grad_norm": 0.7329152168316372, + "learning_rate": 0.00011016333938294012, + "loss": 12.4742, + "step": 607 + }, + { + "epoch": 0.033108029922471124, + "grad_norm": 0.736280532598944, + "learning_rate": 0.0001103448275862069, + "loss": 12.3711, + "step": 608 + }, + { + "epoch": 0.03316248391905413, + "grad_norm": 0.7413393159584099, + "learning_rate": 0.0001105263157894737, + "loss": 12.4609, + "step": 609 + }, + { + "epoch": 0.033216937915637144, + "grad_norm": 0.789987821650849, + "learning_rate": 0.00011070780399274048, + "loss": 12.5343, + "step": 610 + }, + { + "epoch": 0.03327139191222016, + "grad_norm": 0.7557116687034697, + "learning_rate": 0.00011088929219600726, + "loss": 12.5283, + "step": 611 + }, + { + "epoch": 0.03332584590880317, + "grad_norm": 0.7854327140332928, + "learning_rate": 0.00011107078039927406, + "loss": 12.4899, + "step": 612 + }, + { + "epoch": 0.03338029990538618, + "grad_norm": 0.8594935168696665, + "learning_rate": 0.00011125226860254083, + "loss": 12.4901, + "step": 613 + }, + { + "epoch": 0.03343475390196919, + "grad_norm": 0.880417761612294, + "learning_rate": 0.00011143375680580763, + "loss": 12.5552, + "step": 614 + }, + { + "epoch": 0.033489207898552205, + "grad_norm": 0.7806942770954527, + "learning_rate": 0.00011161524500907441, + "loss": 12.6158, + "step": 615 + }, + { + "epoch": 0.03354366189513522, + "grad_norm": 0.8841970711057281, + "learning_rate": 0.00011179673321234121, + "loss": 12.4989, + "step": 616 + }, + { + "epoch": 0.033598115891718225, + "grad_norm": 0.7713157169726781, + "learning_rate": 0.000111978221415608, + "loss": 12.4892, + "step": 617 + }, + { + "epoch": 0.03365256988830124, + "grad_norm": 0.8324243289561584, + "learning_rate": 0.00011215970961887479, + "loss": 12.4635, + "step": 618 + }, + { + "epoch": 0.03370702388488425, + "grad_norm": 0.7807991754630468, + "learning_rate": 0.00011234119782214156, + "loss": 12.5345, + "step": 619 + }, + { + "epoch": 0.033761477881467265, + "grad_norm": 0.7865945001071623, + "learning_rate": 0.00011252268602540836, + "loss": 12.3791, + "step": 620 + }, + { + "epoch": 0.03381593187805027, + "grad_norm": 0.7872398487857712, + "learning_rate": 0.00011270417422867514, + "loss": 12.5145, + "step": 621 + }, + { + "epoch": 0.033870385874633285, + "grad_norm": 0.7885667646810096, + "learning_rate": 0.00011288566243194192, + "loss": 12.5641, + "step": 622 + }, + { + "epoch": 0.0339248398712163, + "grad_norm": 0.7741959357450423, + "learning_rate": 0.00011306715063520871, + "loss": 12.4703, + "step": 623 + }, + { + "epoch": 0.03397929386779931, + "grad_norm": 0.7564638066325946, + "learning_rate": 0.0001132486388384755, + "loss": 12.3965, + "step": 624 + }, + { + "epoch": 0.03403374786438232, + "grad_norm": 0.9519171311859516, + "learning_rate": 0.0001134301270417423, + "loss": 12.4289, + "step": 625 + }, + { + "epoch": 0.03408820186096533, + "grad_norm": 0.8107340278897369, + "learning_rate": 0.00011361161524500907, + "loss": 12.3782, + "step": 626 + }, + { + "epoch": 0.034142655857548346, + "grad_norm": 0.8481212877491462, + "learning_rate": 0.00011379310344827588, + "loss": 12.4214, + "step": 627 + }, + { + "epoch": 0.03419710985413136, + "grad_norm": 0.8650098885796367, + "learning_rate": 0.00011397459165154265, + "loss": 12.4615, + "step": 628 + }, + { + "epoch": 0.034251563850714366, + "grad_norm": 0.7457653007753933, + "learning_rate": 0.00011415607985480945, + "loss": 12.596, + "step": 629 + }, + { + "epoch": 0.03430601784729738, + "grad_norm": 0.9104614202638347, + "learning_rate": 0.00011433756805807623, + "loss": 12.4861, + "step": 630 + }, + { + "epoch": 0.03436047184388039, + "grad_norm": 0.7999052962726083, + "learning_rate": 0.00011451905626134303, + "loss": 12.4831, + "step": 631 + }, + { + "epoch": 0.034414925840463406, + "grad_norm": 0.8776766817589621, + "learning_rate": 0.0001147005444646098, + "loss": 12.4767, + "step": 632 + }, + { + "epoch": 0.03446937983704641, + "grad_norm": 0.757055231958654, + "learning_rate": 0.0001148820326678766, + "loss": 12.4682, + "step": 633 + }, + { + "epoch": 0.034523833833629426, + "grad_norm": 0.7452271311596549, + "learning_rate": 0.00011506352087114338, + "loss": 12.4019, + "step": 634 + }, + { + "epoch": 0.03457828783021244, + "grad_norm": 0.8914212821035306, + "learning_rate": 0.00011524500907441015, + "loss": 12.5484, + "step": 635 + }, + { + "epoch": 0.03463274182679545, + "grad_norm": 0.8426837477394973, + "learning_rate": 0.00011542649727767697, + "loss": 12.6615, + "step": 636 + }, + { + "epoch": 0.03468719582337846, + "grad_norm": 0.8084210678468613, + "learning_rate": 0.00011560798548094374, + "loss": 12.3847, + "step": 637 + }, + { + "epoch": 0.03474164981996147, + "grad_norm": 0.8653995572984597, + "learning_rate": 0.00011578947368421053, + "loss": 12.5286, + "step": 638 + }, + { + "epoch": 0.03479610381654449, + "grad_norm": 0.7890053357866484, + "learning_rate": 0.00011597096188747732, + "loss": 12.4752, + "step": 639 + }, + { + "epoch": 0.0348505578131275, + "grad_norm": 0.8857773461034695, + "learning_rate": 0.00011615245009074412, + "loss": 12.537, + "step": 640 + }, + { + "epoch": 0.03490501180971051, + "grad_norm": 0.9416668002877222, + "learning_rate": 0.00011633393829401089, + "loss": 12.5413, + "step": 641 + }, + { + "epoch": 0.03495946580629352, + "grad_norm": 0.779793688730529, + "learning_rate": 0.00011651542649727769, + "loss": 12.5147, + "step": 642 + }, + { + "epoch": 0.035013919802876534, + "grad_norm": 0.951732854095737, + "learning_rate": 0.00011669691470054447, + "loss": 12.684, + "step": 643 + }, + { + "epoch": 0.03506837379945955, + "grad_norm": 0.7967477855943389, + "learning_rate": 0.00011687840290381127, + "loss": 12.5166, + "step": 644 + }, + { + "epoch": 0.035122827796042554, + "grad_norm": 0.9520128117212568, + "learning_rate": 0.00011705989110707805, + "loss": 12.4273, + "step": 645 + }, + { + "epoch": 0.03517728179262557, + "grad_norm": 0.7204273522718219, + "learning_rate": 0.00011724137931034482, + "loss": 12.452, + "step": 646 + }, + { + "epoch": 0.03523173578920858, + "grad_norm": 0.8316091332522132, + "learning_rate": 0.00011742286751361162, + "loss": 12.5175, + "step": 647 + }, + { + "epoch": 0.035286189785791594, + "grad_norm": 0.8106043550269486, + "learning_rate": 0.0001176043557168784, + "loss": 12.6276, + "step": 648 + }, + { + "epoch": 0.0353406437823746, + "grad_norm": 0.907645004050805, + "learning_rate": 0.0001177858439201452, + "loss": 12.5116, + "step": 649 + }, + { + "epoch": 0.035395097778957614, + "grad_norm": 0.8421133812031315, + "learning_rate": 0.00011796733212341197, + "loss": 12.5068, + "step": 650 + }, + { + "epoch": 0.03544955177554063, + "grad_norm": 0.8263559879788932, + "learning_rate": 0.00011814882032667877, + "loss": 12.5064, + "step": 651 + }, + { + "epoch": 0.03550400577212364, + "grad_norm": 0.8368348477203709, + "learning_rate": 0.00011833030852994556, + "loss": 12.6344, + "step": 652 + }, + { + "epoch": 0.03555845976870665, + "grad_norm": 0.8435898015434021, + "learning_rate": 0.00011851179673321235, + "loss": 12.5091, + "step": 653 + }, + { + "epoch": 0.03561291376528966, + "grad_norm": 0.8127838512211614, + "learning_rate": 0.00011869328493647913, + "loss": 12.5358, + "step": 654 + }, + { + "epoch": 0.035667367761872674, + "grad_norm": 0.844221855096733, + "learning_rate": 0.00011887477313974594, + "loss": 12.5023, + "step": 655 + }, + { + "epoch": 0.03572182175845569, + "grad_norm": 0.830609883523681, + "learning_rate": 0.00011905626134301271, + "loss": 12.5483, + "step": 656 + }, + { + "epoch": 0.035776275755038695, + "grad_norm": 0.7751682709968774, + "learning_rate": 0.00011923774954627949, + "loss": 12.5475, + "step": 657 + }, + { + "epoch": 0.03583072975162171, + "grad_norm": 0.7594651681846976, + "learning_rate": 0.00011941923774954629, + "loss": 12.4376, + "step": 658 + }, + { + "epoch": 0.03588518374820472, + "grad_norm": 0.8105747332842143, + "learning_rate": 0.00011960072595281306, + "loss": 12.5585, + "step": 659 + }, + { + "epoch": 0.035939637744787735, + "grad_norm": 0.7793088239529253, + "learning_rate": 0.00011978221415607986, + "loss": 12.5071, + "step": 660 + }, + { + "epoch": 0.03599409174137074, + "grad_norm": 0.8091456420082649, + "learning_rate": 0.00011996370235934664, + "loss": 12.4204, + "step": 661 + }, + { + "epoch": 0.036048545737953755, + "grad_norm": 0.7951004347492463, + "learning_rate": 0.00012014519056261344, + "loss": 12.4689, + "step": 662 + }, + { + "epoch": 0.03610299973453677, + "grad_norm": 0.8203582718986129, + "learning_rate": 0.00012032667876588021, + "loss": 12.4161, + "step": 663 + }, + { + "epoch": 0.036157453731119775, + "grad_norm": 0.9104629783176926, + "learning_rate": 0.00012050816696914702, + "loss": 12.4031, + "step": 664 + }, + { + "epoch": 0.03621190772770279, + "grad_norm": 0.8353893787785487, + "learning_rate": 0.0001206896551724138, + "loss": 12.4805, + "step": 665 + }, + { + "epoch": 0.0362663617242858, + "grad_norm": 0.8025972229636333, + "learning_rate": 0.00012087114337568059, + "loss": 12.5601, + "step": 666 + }, + { + "epoch": 0.036320815720868815, + "grad_norm": 0.7563712264723594, + "learning_rate": 0.00012105263157894738, + "loss": 12.4652, + "step": 667 + }, + { + "epoch": 0.03637526971745182, + "grad_norm": 0.8122372235020665, + "learning_rate": 0.00012123411978221418, + "loss": 12.4633, + "step": 668 + }, + { + "epoch": 0.036429723714034835, + "grad_norm": 0.8880903012742153, + "learning_rate": 0.00012141560798548095, + "loss": 12.4917, + "step": 669 + }, + { + "epoch": 0.03648417771061785, + "grad_norm": 0.7902079118875632, + "learning_rate": 0.00012159709618874773, + "loss": 12.5041, + "step": 670 + }, + { + "epoch": 0.03653863170720086, + "grad_norm": 0.7919532434646256, + "learning_rate": 0.00012177858439201453, + "loss": 12.4096, + "step": 671 + }, + { + "epoch": 0.03659308570378387, + "grad_norm": 0.786591506755876, + "learning_rate": 0.0001219600725952813, + "loss": 12.5606, + "step": 672 + }, + { + "epoch": 0.03664753970036688, + "grad_norm": 0.8038365184195617, + "learning_rate": 0.0001221415607985481, + "loss": 12.4756, + "step": 673 + }, + { + "epoch": 0.036701993696949896, + "grad_norm": 0.770757422573286, + "learning_rate": 0.0001223230490018149, + "loss": 12.4893, + "step": 674 + }, + { + "epoch": 0.03675644769353291, + "grad_norm": 0.761612989544494, + "learning_rate": 0.0001225045372050817, + "loss": 12.3772, + "step": 675 + }, + { + "epoch": 0.036810901690115916, + "grad_norm": 0.8670862349575074, + "learning_rate": 0.00012268602540834846, + "loss": 12.3902, + "step": 676 + }, + { + "epoch": 0.03686535568669893, + "grad_norm": 0.7716718708781327, + "learning_rate": 0.00012286751361161526, + "loss": 12.5174, + "step": 677 + }, + { + "epoch": 0.03691980968328194, + "grad_norm": 0.7626179805058609, + "learning_rate": 0.00012304900181488203, + "loss": 12.5441, + "step": 678 + }, + { + "epoch": 0.036974263679864956, + "grad_norm": 0.8312158992798632, + "learning_rate": 0.00012323049001814883, + "loss": 12.3531, + "step": 679 + }, + { + "epoch": 0.03702871767644796, + "grad_norm": 0.8564858249915114, + "learning_rate": 0.0001234119782214156, + "loss": 12.5319, + "step": 680 + }, + { + "epoch": 0.037083171673030976, + "grad_norm": 0.8648959668269681, + "learning_rate": 0.0001235934664246824, + "loss": 12.5591, + "step": 681 + }, + { + "epoch": 0.03713762566961399, + "grad_norm": 0.8916475653141226, + "learning_rate": 0.0001237749546279492, + "loss": 12.6703, + "step": 682 + }, + { + "epoch": 0.037192079666197, + "grad_norm": 0.9084031257077833, + "learning_rate": 0.00012395644283121597, + "loss": 12.5349, + "step": 683 + }, + { + "epoch": 0.03724653366278001, + "grad_norm": 0.8797221532380368, + "learning_rate": 0.00012413793103448277, + "loss": 12.4662, + "step": 684 + }, + { + "epoch": 0.03730098765936302, + "grad_norm": 0.8043936491667408, + "learning_rate": 0.00012431941923774954, + "loss": 12.5472, + "step": 685 + }, + { + "epoch": 0.03735544165594604, + "grad_norm": 0.817146472456262, + "learning_rate": 0.00012450090744101634, + "loss": 12.4383, + "step": 686 + }, + { + "epoch": 0.03740989565252905, + "grad_norm": 0.8114469583343945, + "learning_rate": 0.00012468239564428313, + "loss": 12.5172, + "step": 687 + }, + { + "epoch": 0.03746434964911206, + "grad_norm": 0.8326000918570917, + "learning_rate": 0.00012486388384754993, + "loss": 12.5178, + "step": 688 + }, + { + "epoch": 0.03751880364569507, + "grad_norm": 0.7723478809577488, + "learning_rate": 0.0001250453720508167, + "loss": 12.5362, + "step": 689 + }, + { + "epoch": 0.037573257642278084, + "grad_norm": 0.9410693950400154, + "learning_rate": 0.0001252268602540835, + "loss": 12.5571, + "step": 690 + }, + { + "epoch": 0.0376277116388611, + "grad_norm": 0.7561947345119391, + "learning_rate": 0.00012540834845735027, + "loss": 12.4258, + "step": 691 + }, + { + "epoch": 0.037682165635444104, + "grad_norm": 0.8418495240004382, + "learning_rate": 0.00012558983666061704, + "loss": 12.5777, + "step": 692 + }, + { + "epoch": 0.03773661963202712, + "grad_norm": 0.8746311873777267, + "learning_rate": 0.00012577132486388387, + "loss": 12.5951, + "step": 693 + }, + { + "epoch": 0.03779107362861013, + "grad_norm": 0.7625921236694155, + "learning_rate": 0.00012595281306715064, + "loss": 12.4046, + "step": 694 + }, + { + "epoch": 0.037845527625193144, + "grad_norm": 0.734823565054105, + "learning_rate": 0.00012613430127041744, + "loss": 12.5139, + "step": 695 + }, + { + "epoch": 0.03789998162177615, + "grad_norm": 0.7621770331854553, + "learning_rate": 0.0001263157894736842, + "loss": 12.6277, + "step": 696 + }, + { + "epoch": 0.037954435618359164, + "grad_norm": 0.7166086175672672, + "learning_rate": 0.000126497277676951, + "loss": 12.4117, + "step": 697 + }, + { + "epoch": 0.03800888961494218, + "grad_norm": 0.7971843612908569, + "learning_rate": 0.00012667876588021778, + "loss": 12.4814, + "step": 698 + }, + { + "epoch": 0.03806334361152519, + "grad_norm": 0.7481586297499027, + "learning_rate": 0.00012686025408348457, + "loss": 12.4657, + "step": 699 + }, + { + "epoch": 0.0381177976081082, + "grad_norm": 0.7143053642727076, + "learning_rate": 0.00012704174228675137, + "loss": 12.3673, + "step": 700 + }, + { + "epoch": 0.03817225160469121, + "grad_norm": 0.8593540767983657, + "learning_rate": 0.00012722323049001817, + "loss": 12.5246, + "step": 701 + }, + { + "epoch": 0.038226705601274225, + "grad_norm": 0.7833857321998503, + "learning_rate": 0.00012740471869328494, + "loss": 12.5754, + "step": 702 + }, + { + "epoch": 0.03828115959785724, + "grad_norm": 0.7740278489930515, + "learning_rate": 0.00012758620689655174, + "loss": 12.6015, + "step": 703 + }, + { + "epoch": 0.038335613594440245, + "grad_norm": 0.8410629541952023, + "learning_rate": 0.0001277676950998185, + "loss": 12.6456, + "step": 704 + }, + { + "epoch": 0.03839006759102326, + "grad_norm": 0.8093859296985529, + "learning_rate": 0.0001279491833030853, + "loss": 12.4783, + "step": 705 + }, + { + "epoch": 0.03844452158760627, + "grad_norm": 0.7619677944718481, + "learning_rate": 0.0001281306715063521, + "loss": 12.4919, + "step": 706 + }, + { + "epoch": 0.038498975584189285, + "grad_norm": 0.7627275451844259, + "learning_rate": 0.00012831215970961888, + "loss": 12.4902, + "step": 707 + }, + { + "epoch": 0.03855342958077229, + "grad_norm": 0.7874103603037563, + "learning_rate": 0.00012849364791288567, + "loss": 12.4671, + "step": 708 + }, + { + "epoch": 0.038607883577355305, + "grad_norm": 0.8572838287744005, + "learning_rate": 0.00012867513611615244, + "loss": 12.5543, + "step": 709 + }, + { + "epoch": 0.03866233757393832, + "grad_norm": 0.8810561414897864, + "learning_rate": 0.00012885662431941924, + "loss": 12.51, + "step": 710 + }, + { + "epoch": 0.03871679157052133, + "grad_norm": 0.7541640787422855, + "learning_rate": 0.00012903811252268604, + "loss": 12.5667, + "step": 711 + }, + { + "epoch": 0.03877124556710434, + "grad_norm": 0.773591066392057, + "learning_rate": 0.00012921960072595284, + "loss": 12.3739, + "step": 712 + }, + { + "epoch": 0.03882569956368735, + "grad_norm": 0.7308375378339267, + "learning_rate": 0.0001294010889292196, + "loss": 12.5431, + "step": 713 + }, + { + "epoch": 0.038880153560270365, + "grad_norm": 0.8362238065065555, + "learning_rate": 0.0001295825771324864, + "loss": 12.5286, + "step": 714 + }, + { + "epoch": 0.03893460755685338, + "grad_norm": 0.8560896444701315, + "learning_rate": 0.00012976406533575318, + "loss": 12.6039, + "step": 715 + }, + { + "epoch": 0.038989061553436385, + "grad_norm": 0.7718918195967964, + "learning_rate": 0.00012994555353901995, + "loss": 12.6776, + "step": 716 + }, + { + "epoch": 0.0390435155500194, + "grad_norm": 0.8479818318382927, + "learning_rate": 0.00013012704174228675, + "loss": 12.4291, + "step": 717 + }, + { + "epoch": 0.03909796954660241, + "grad_norm": 0.8189985279402068, + "learning_rate": 0.00013030852994555355, + "loss": 12.4984, + "step": 718 + }, + { + "epoch": 0.039152423543185426, + "grad_norm": 0.8252447601960348, + "learning_rate": 0.00013049001814882034, + "loss": 12.5111, + "step": 719 + }, + { + "epoch": 0.03920687753976843, + "grad_norm": 0.8136491572382893, + "learning_rate": 0.00013067150635208711, + "loss": 12.5477, + "step": 720 + }, + { + "epoch": 0.039261331536351446, + "grad_norm": 0.741260225544626, + "learning_rate": 0.0001308529945553539, + "loss": 12.5397, + "step": 721 + }, + { + "epoch": 0.03931578553293446, + "grad_norm": 1.0188705229204873, + "learning_rate": 0.00013103448275862068, + "loss": 12.5885, + "step": 722 + }, + { + "epoch": 0.03937023952951747, + "grad_norm": 0.8344084817037877, + "learning_rate": 0.00013121597096188748, + "loss": 12.6934, + "step": 723 + }, + { + "epoch": 0.03942469352610048, + "grad_norm": 0.8182160497160557, + "learning_rate": 0.00013139745916515428, + "loss": 12.5394, + "step": 724 + }, + { + "epoch": 0.03947914752268349, + "grad_norm": 0.754364334548919, + "learning_rate": 0.00013157894736842108, + "loss": 12.5191, + "step": 725 + }, + { + "epoch": 0.039533601519266506, + "grad_norm": 0.7740832117265269, + "learning_rate": 0.00013176043557168785, + "loss": 12.5467, + "step": 726 + }, + { + "epoch": 0.03958805551584952, + "grad_norm": 0.8335032571621467, + "learning_rate": 0.00013194192377495462, + "loss": 12.4449, + "step": 727 + }, + { + "epoch": 0.039642509512432526, + "grad_norm": 0.8051614632134781, + "learning_rate": 0.00013212341197822142, + "loss": 12.6346, + "step": 728 + }, + { + "epoch": 0.03969696350901554, + "grad_norm": 0.7995481068509528, + "learning_rate": 0.0001323049001814882, + "loss": 12.6264, + "step": 729 + }, + { + "epoch": 0.03975141750559855, + "grad_norm": 0.8623273622651532, + "learning_rate": 0.000132486388384755, + "loss": 12.5711, + "step": 730 + }, + { + "epoch": 0.03980587150218156, + "grad_norm": 0.8437033455684732, + "learning_rate": 0.00013266787658802178, + "loss": 12.5684, + "step": 731 + }, + { + "epoch": 0.03986032549876457, + "grad_norm": 0.9133141357437732, + "learning_rate": 0.00013284936479128858, + "loss": 12.602, + "step": 732 + }, + { + "epoch": 0.03991477949534759, + "grad_norm": 0.8189575723503095, + "learning_rate": 0.00013303085299455535, + "loss": 12.4223, + "step": 733 + }, + { + "epoch": 0.0399692334919306, + "grad_norm": 0.7768915209961624, + "learning_rate": 0.00013321234119782215, + "loss": 12.5371, + "step": 734 + }, + { + "epoch": 0.04002368748851361, + "grad_norm": 0.8949770926241271, + "learning_rate": 0.00013339382940108892, + "loss": 12.5584, + "step": 735 + }, + { + "epoch": 0.04007814148509662, + "grad_norm": 0.8251457768920586, + "learning_rate": 0.00013357531760435572, + "loss": 12.5211, + "step": 736 + }, + { + "epoch": 0.040132595481679634, + "grad_norm": 1.057308901725991, + "learning_rate": 0.00013375680580762252, + "loss": 12.7114, + "step": 737 + }, + { + "epoch": 0.04018704947826265, + "grad_norm": 0.7555355053742839, + "learning_rate": 0.00013393829401088931, + "loss": 12.4727, + "step": 738 + }, + { + "epoch": 0.040241503474845654, + "grad_norm": 0.9506351004654765, + "learning_rate": 0.00013411978221415609, + "loss": 12.5368, + "step": 739 + }, + { + "epoch": 0.04029595747142867, + "grad_norm": 0.8585509422496327, + "learning_rate": 0.00013430127041742286, + "loss": 12.59, + "step": 740 + }, + { + "epoch": 0.04035041146801168, + "grad_norm": 0.8650805842588604, + "learning_rate": 0.00013448275862068965, + "loss": 12.4794, + "step": 741 + }, + { + "epoch": 0.040404865464594694, + "grad_norm": 0.8002843337946893, + "learning_rate": 0.00013466424682395645, + "loss": 12.3748, + "step": 742 + }, + { + "epoch": 0.0404593194611777, + "grad_norm": 0.7811881804163763, + "learning_rate": 0.00013484573502722325, + "loss": 12.5016, + "step": 743 + }, + { + "epoch": 0.040513773457760714, + "grad_norm": 0.8033795374596253, + "learning_rate": 0.00013502722323049002, + "loss": 12.5511, + "step": 744 + }, + { + "epoch": 0.04056822745434373, + "grad_norm": 0.7797716186956701, + "learning_rate": 0.00013520871143375682, + "loss": 12.6652, + "step": 745 + }, + { + "epoch": 0.04062268145092674, + "grad_norm": 0.8900245517652725, + "learning_rate": 0.0001353901996370236, + "loss": 12.5599, + "step": 746 + }, + { + "epoch": 0.04067713544750975, + "grad_norm": 0.942216054700293, + "learning_rate": 0.0001355716878402904, + "loss": 12.5546, + "step": 747 + }, + { + "epoch": 0.04073158944409276, + "grad_norm": 1.0359805567146756, + "learning_rate": 0.00013575317604355719, + "loss": 12.5748, + "step": 748 + }, + { + "epoch": 0.040786043440675775, + "grad_norm": 1.0075629890341031, + "learning_rate": 0.00013593466424682398, + "loss": 12.5473, + "step": 749 + }, + { + "epoch": 0.04084049743725879, + "grad_norm": 0.7558468233619073, + "learning_rate": 0.00013611615245009076, + "loss": 12.4132, + "step": 750 + }, + { + "epoch": 0.040894951433841795, + "grad_norm": 0.778620606134502, + "learning_rate": 0.00013629764065335753, + "loss": 12.5153, + "step": 751 + }, + { + "epoch": 0.04094940543042481, + "grad_norm": 0.8510205117821609, + "learning_rate": 0.00013647912885662432, + "loss": 12.6411, + "step": 752 + }, + { + "epoch": 0.04100385942700782, + "grad_norm": 0.9177016995670307, + "learning_rate": 0.0001366606170598911, + "loss": 12.5692, + "step": 753 + }, + { + "epoch": 0.041058313423590835, + "grad_norm": 0.7745160379530536, + "learning_rate": 0.0001368421052631579, + "loss": 12.5341, + "step": 754 + }, + { + "epoch": 0.04111276742017384, + "grad_norm": 0.8140311412247214, + "learning_rate": 0.0001370235934664247, + "loss": 12.5958, + "step": 755 + }, + { + "epoch": 0.041167221416756855, + "grad_norm": 0.8551770393253671, + "learning_rate": 0.0001372050816696915, + "loss": 12.5419, + "step": 756 + }, + { + "epoch": 0.04122167541333987, + "grad_norm": 0.7937937119864299, + "learning_rate": 0.00013738656987295826, + "loss": 12.5787, + "step": 757 + }, + { + "epoch": 0.04127612940992288, + "grad_norm": 0.8020893372298973, + "learning_rate": 0.00013756805807622506, + "loss": 12.5617, + "step": 758 + }, + { + "epoch": 0.04133058340650589, + "grad_norm": 0.903708509231395, + "learning_rate": 0.00013774954627949183, + "loss": 12.5472, + "step": 759 + }, + { + "epoch": 0.0413850374030889, + "grad_norm": 0.8577396403644533, + "learning_rate": 0.00013793103448275863, + "loss": 12.5458, + "step": 760 + }, + { + "epoch": 0.041439491399671916, + "grad_norm": 1.091639804694237, + "learning_rate": 0.00013811252268602542, + "loss": 12.5178, + "step": 761 + }, + { + "epoch": 0.04149394539625493, + "grad_norm": 0.842781245898835, + "learning_rate": 0.00013829401088929222, + "loss": 12.662, + "step": 762 + }, + { + "epoch": 0.041548399392837936, + "grad_norm": 0.9767804515104263, + "learning_rate": 0.000138475499092559, + "loss": 12.5239, + "step": 763 + }, + { + "epoch": 0.04160285338942095, + "grad_norm": 0.8484900134788992, + "learning_rate": 0.00013865698729582576, + "loss": 12.5559, + "step": 764 + }, + { + "epoch": 0.04165730738600396, + "grad_norm": 0.8415112219861786, + "learning_rate": 0.00013883847549909256, + "loss": 12.527, + "step": 765 + }, + { + "epoch": 0.041711761382586976, + "grad_norm": 1.0104772032943936, + "learning_rate": 0.00013901996370235933, + "loss": 12.5081, + "step": 766 + }, + { + "epoch": 0.04176621537916998, + "grad_norm": 0.7789779394160212, + "learning_rate": 0.00013920145190562616, + "loss": 12.5802, + "step": 767 + }, + { + "epoch": 0.041820669375752996, + "grad_norm": 0.8828806284149939, + "learning_rate": 0.00013938294010889293, + "loss": 12.5638, + "step": 768 + }, + { + "epoch": 0.04187512337233601, + "grad_norm": 0.9095358751899403, + "learning_rate": 0.00013956442831215973, + "loss": 12.6153, + "step": 769 + }, + { + "epoch": 0.04192957736891902, + "grad_norm": 0.7915235702796843, + "learning_rate": 0.0001397459165154265, + "loss": 12.575, + "step": 770 + }, + { + "epoch": 0.04198403136550203, + "grad_norm": 0.9294936050905224, + "learning_rate": 0.0001399274047186933, + "loss": 12.4846, + "step": 771 + }, + { + "epoch": 0.04203848536208504, + "grad_norm": 0.749715657221197, + "learning_rate": 0.00014010889292196007, + "loss": 12.5267, + "step": 772 + }, + { + "epoch": 0.042092939358668056, + "grad_norm": 0.8036323955706655, + "learning_rate": 0.00014029038112522686, + "loss": 12.602, + "step": 773 + }, + { + "epoch": 0.04214739335525107, + "grad_norm": 0.8151381024538288, + "learning_rate": 0.00014047186932849366, + "loss": 12.5653, + "step": 774 + }, + { + "epoch": 0.042201847351834076, + "grad_norm": 0.9971274493801566, + "learning_rate": 0.00014065335753176043, + "loss": 12.6751, + "step": 775 + }, + { + "epoch": 0.04225630134841709, + "grad_norm": 0.8513572095537125, + "learning_rate": 0.00014083484573502723, + "loss": 12.3767, + "step": 776 + }, + { + "epoch": 0.0423107553450001, + "grad_norm": 1.0023920639584978, + "learning_rate": 0.000141016333938294, + "loss": 12.7133, + "step": 777 + }, + { + "epoch": 0.04236520934158312, + "grad_norm": 0.8628820897885917, + "learning_rate": 0.0001411978221415608, + "loss": 12.5328, + "step": 778 + }, + { + "epoch": 0.04241966333816612, + "grad_norm": 0.7938920881523879, + "learning_rate": 0.0001413793103448276, + "loss": 12.3853, + "step": 779 + }, + { + "epoch": 0.04247411733474914, + "grad_norm": 0.8646632555026025, + "learning_rate": 0.0001415607985480944, + "loss": 12.6662, + "step": 780 + }, + { + "epoch": 0.04252857133133215, + "grad_norm": 0.8439048939595939, + "learning_rate": 0.00014174228675136117, + "loss": 12.3081, + "step": 781 + }, + { + "epoch": 0.042583025327915164, + "grad_norm": 0.7599687354406947, + "learning_rate": 0.00014192377495462796, + "loss": 12.4467, + "step": 782 + }, + { + "epoch": 0.04263747932449817, + "grad_norm": 0.8070920519030709, + "learning_rate": 0.00014210526315789474, + "loss": 12.531, + "step": 783 + }, + { + "epoch": 0.042691933321081184, + "grad_norm": 0.873705703606423, + "learning_rate": 0.00014228675136116153, + "loss": 12.4829, + "step": 784 + }, + { + "epoch": 0.0427463873176642, + "grad_norm": 0.8746134110818439, + "learning_rate": 0.00014246823956442833, + "loss": 12.5633, + "step": 785 + }, + { + "epoch": 0.04280084131424721, + "grad_norm": 0.8003175609249429, + "learning_rate": 0.0001426497277676951, + "loss": 12.6093, + "step": 786 + }, + { + "epoch": 0.04285529531083022, + "grad_norm": 0.9226227234251844, + "learning_rate": 0.0001428312159709619, + "loss": 12.7582, + "step": 787 + }, + { + "epoch": 0.04290974930741323, + "grad_norm": 0.774081515279731, + "learning_rate": 0.00014301270417422867, + "loss": 12.4773, + "step": 788 + }, + { + "epoch": 0.042964203303996244, + "grad_norm": 0.7770717492469534, + "learning_rate": 0.00014319419237749547, + "loss": 12.5703, + "step": 789 + }, + { + "epoch": 0.04301865730057926, + "grad_norm": 0.8342543205146827, + "learning_rate": 0.00014337568058076224, + "loss": 12.596, + "step": 790 + }, + { + "epoch": 0.043073111297162264, + "grad_norm": 0.8516389169196312, + "learning_rate": 0.00014355716878402904, + "loss": 12.5668, + "step": 791 + }, + { + "epoch": 0.04312756529374528, + "grad_norm": 0.7896009308409137, + "learning_rate": 0.00014373865698729584, + "loss": 12.519, + "step": 792 + }, + { + "epoch": 0.04318201929032829, + "grad_norm": 0.8052339672016775, + "learning_rate": 0.00014392014519056263, + "loss": 12.5517, + "step": 793 + }, + { + "epoch": 0.043236473286911305, + "grad_norm": 0.7530572458315676, + "learning_rate": 0.0001441016333938294, + "loss": 12.541, + "step": 794 + }, + { + "epoch": 0.04329092728349431, + "grad_norm": 0.8662017245842865, + "learning_rate": 0.0001442831215970962, + "loss": 12.5754, + "step": 795 + }, + { + "epoch": 0.043345381280077325, + "grad_norm": 0.7645406815099872, + "learning_rate": 0.00014446460980036297, + "loss": 12.4066, + "step": 796 + }, + { + "epoch": 0.04339983527666034, + "grad_norm": 0.8330667873972226, + "learning_rate": 0.00014464609800362977, + "loss": 12.4053, + "step": 797 + }, + { + "epoch": 0.043454289273243345, + "grad_norm": 0.8205108353659064, + "learning_rate": 0.00014482758620689657, + "loss": 12.5986, + "step": 798 + }, + { + "epoch": 0.04350874326982636, + "grad_norm": 0.8520375890357685, + "learning_rate": 0.00014500907441016334, + "loss": 12.5274, + "step": 799 + }, + { + "epoch": 0.04356319726640937, + "grad_norm": 0.8080865183543499, + "learning_rate": 0.00014519056261343014, + "loss": 12.703, + "step": 800 + }, + { + "epoch": 0.043617651262992385, + "grad_norm": 0.8028752755990547, + "learning_rate": 0.0001453720508166969, + "loss": 12.4813, + "step": 801 + }, + { + "epoch": 0.04367210525957539, + "grad_norm": 0.7991778930135419, + "learning_rate": 0.0001455535390199637, + "loss": 12.4828, + "step": 802 + }, + { + "epoch": 0.043726559256158405, + "grad_norm": 0.7638276877678696, + "learning_rate": 0.00014573502722323048, + "loss": 12.4735, + "step": 803 + }, + { + "epoch": 0.04378101325274142, + "grad_norm": 0.818424687275988, + "learning_rate": 0.0001459165154264973, + "loss": 12.6986, + "step": 804 + }, + { + "epoch": 0.04383546724932443, + "grad_norm": 1.082495586362405, + "learning_rate": 0.00014609800362976407, + "loss": 12.5908, + "step": 805 + }, + { + "epoch": 0.04388992124590744, + "grad_norm": 1.325984013203311, + "learning_rate": 0.00014627949183303087, + "loss": 12.6857, + "step": 806 + }, + { + "epoch": 0.04394437524249045, + "grad_norm": 0.8686683284879365, + "learning_rate": 0.00014646098003629764, + "loss": 12.4566, + "step": 807 + }, + { + "epoch": 0.043998829239073466, + "grad_norm": 0.8177369162606946, + "learning_rate": 0.00014664246823956444, + "loss": 12.5558, + "step": 808 + }, + { + "epoch": 0.04405328323565648, + "grad_norm": 0.7760195423893618, + "learning_rate": 0.0001468239564428312, + "loss": 12.4437, + "step": 809 + }, + { + "epoch": 0.044107737232239486, + "grad_norm": 0.8298412976489291, + "learning_rate": 0.000147005444646098, + "loss": 12.4522, + "step": 810 + }, + { + "epoch": 0.0441621912288225, + "grad_norm": 0.7607087893488296, + "learning_rate": 0.0001471869328493648, + "loss": 12.5662, + "step": 811 + }, + { + "epoch": 0.04421664522540551, + "grad_norm": 0.8810991381778492, + "learning_rate": 0.00014736842105263158, + "loss": 12.6585, + "step": 812 + }, + { + "epoch": 0.044271099221988526, + "grad_norm": 0.8024073570168333, + "learning_rate": 0.00014754990925589838, + "loss": 12.5763, + "step": 813 + }, + { + "epoch": 0.04432555321857153, + "grad_norm": 0.7715357348992065, + "learning_rate": 0.00014773139745916515, + "loss": 12.6189, + "step": 814 + }, + { + "epoch": 0.044380007215154546, + "grad_norm": 0.8995056229545437, + "learning_rate": 0.00014791288566243195, + "loss": 12.6572, + "step": 815 + }, + { + "epoch": 0.04443446121173756, + "grad_norm": 0.8248284303171265, + "learning_rate": 0.00014809437386569874, + "loss": 12.5962, + "step": 816 + }, + { + "epoch": 0.04448891520832057, + "grad_norm": 0.866524307728336, + "learning_rate": 0.00014827586206896554, + "loss": 12.6484, + "step": 817 + }, + { + "epoch": 0.04454336920490358, + "grad_norm": 0.8879113215826534, + "learning_rate": 0.0001484573502722323, + "loss": 12.6412, + "step": 818 + }, + { + "epoch": 0.04459782320148659, + "grad_norm": 0.8758200396877941, + "learning_rate": 0.0001486388384754991, + "loss": 12.4537, + "step": 819 + }, + { + "epoch": 0.044652277198069606, + "grad_norm": 0.8436526664202744, + "learning_rate": 0.00014882032667876588, + "loss": 12.5899, + "step": 820 + }, + { + "epoch": 0.04470673119465262, + "grad_norm": 0.8545141478541453, + "learning_rate": 0.00014900181488203265, + "loss": 12.3574, + "step": 821 + }, + { + "epoch": 0.044761185191235627, + "grad_norm": 1.0267167465528357, + "learning_rate": 0.00014918330308529948, + "loss": 12.5645, + "step": 822 + }, + { + "epoch": 0.04481563918781864, + "grad_norm": 0.784780075284951, + "learning_rate": 0.00014936479128856625, + "loss": 12.4824, + "step": 823 + }, + { + "epoch": 0.04487009318440165, + "grad_norm": 0.9789745719784654, + "learning_rate": 0.00014954627949183305, + "loss": 12.8387, + "step": 824 + }, + { + "epoch": 0.04492454718098467, + "grad_norm": 0.8341195518544883, + "learning_rate": 0.00014972776769509982, + "loss": 12.5543, + "step": 825 + }, + { + "epoch": 0.04497900117756767, + "grad_norm": 0.869629747247344, + "learning_rate": 0.00014990925589836661, + "loss": 12.6488, + "step": 826 + }, + { + "epoch": 0.04503345517415069, + "grad_norm": 0.826578908620228, + "learning_rate": 0.00015009074410163339, + "loss": 12.7158, + "step": 827 + }, + { + "epoch": 0.0450879091707337, + "grad_norm": 0.8756522771620907, + "learning_rate": 0.00015027223230490018, + "loss": 12.6569, + "step": 828 + }, + { + "epoch": 0.045142363167316714, + "grad_norm": 0.9101434033316181, + "learning_rate": 0.00015045372050816698, + "loss": 12.6332, + "step": 829 + }, + { + "epoch": 0.04519681716389972, + "grad_norm": 0.8952132643866115, + "learning_rate": 0.00015063520871143378, + "loss": 12.5787, + "step": 830 + }, + { + "epoch": 0.045251271160482734, + "grad_norm": 0.8011479463268877, + "learning_rate": 0.00015081669691470055, + "loss": 12.608, + "step": 831 + }, + { + "epoch": 0.04530572515706575, + "grad_norm": 0.8206162307714958, + "learning_rate": 0.00015099818511796735, + "loss": 12.4505, + "step": 832 + }, + { + "epoch": 0.04536017915364876, + "grad_norm": 0.832128042031696, + "learning_rate": 0.00015117967332123412, + "loss": 12.6066, + "step": 833 + }, + { + "epoch": 0.04541463315023177, + "grad_norm": 0.8683483549303054, + "learning_rate": 0.00015136116152450092, + "loss": 12.606, + "step": 834 + }, + { + "epoch": 0.04546908714681478, + "grad_norm": 0.8436085523266798, + "learning_rate": 0.00015154264972776772, + "loss": 12.6469, + "step": 835 + }, + { + "epoch": 0.045523541143397794, + "grad_norm": 0.8528405540241153, + "learning_rate": 0.00015172413793103449, + "loss": 12.6791, + "step": 836 + }, + { + "epoch": 0.04557799513998081, + "grad_norm": 0.8141272474203417, + "learning_rate": 0.00015190562613430128, + "loss": 12.6382, + "step": 837 + }, + { + "epoch": 0.045632449136563814, + "grad_norm": 0.74720840520126, + "learning_rate": 0.00015208711433756806, + "loss": 12.5676, + "step": 838 + }, + { + "epoch": 0.04568690313314683, + "grad_norm": 0.7927722643407407, + "learning_rate": 0.00015226860254083485, + "loss": 12.5094, + "step": 839 + }, + { + "epoch": 0.04574135712972984, + "grad_norm": 0.8636739467475668, + "learning_rate": 0.00015245009074410162, + "loss": 12.6322, + "step": 840 + }, + { + "epoch": 0.045795811126312855, + "grad_norm": 0.7951696899986485, + "learning_rate": 0.00015263157894736845, + "loss": 12.5994, + "step": 841 + }, + { + "epoch": 0.04585026512289586, + "grad_norm": 0.8781451899122102, + "learning_rate": 0.00015281306715063522, + "loss": 12.7485, + "step": 842 + }, + { + "epoch": 0.045904719119478875, + "grad_norm": 0.7856930125310769, + "learning_rate": 0.00015299455535390202, + "loss": 12.6924, + "step": 843 + }, + { + "epoch": 0.04595917311606189, + "grad_norm": 0.8669106113156154, + "learning_rate": 0.0001531760435571688, + "loss": 12.756, + "step": 844 + }, + { + "epoch": 0.0460136271126449, + "grad_norm": 0.7645949228985675, + "learning_rate": 0.00015335753176043556, + "loss": 12.5389, + "step": 845 + }, + { + "epoch": 0.04606808110922791, + "grad_norm": 0.8798600529095105, + "learning_rate": 0.00015353901996370236, + "loss": 12.728, + "step": 846 + }, + { + "epoch": 0.04612253510581092, + "grad_norm": 0.741923863402535, + "learning_rate": 0.00015372050816696916, + "loss": 12.5727, + "step": 847 + }, + { + "epoch": 0.046176989102393935, + "grad_norm": 0.8260769631339705, + "learning_rate": 0.00015390199637023595, + "loss": 12.6141, + "step": 848 + }, + { + "epoch": 0.04623144309897695, + "grad_norm": 0.8194036460957305, + "learning_rate": 0.00015408348457350272, + "loss": 12.5255, + "step": 849 + }, + { + "epoch": 0.046285897095559955, + "grad_norm": 0.8691842016409564, + "learning_rate": 0.00015426497277676952, + "loss": 12.665, + "step": 850 + }, + { + "epoch": 0.04634035109214297, + "grad_norm": 0.8265824746983291, + "learning_rate": 0.0001544464609800363, + "loss": 12.6078, + "step": 851 + }, + { + "epoch": 0.04639480508872598, + "grad_norm": 0.7402054058609698, + "learning_rate": 0.0001546279491833031, + "loss": 12.6317, + "step": 852 + }, + { + "epoch": 0.046449259085308996, + "grad_norm": 0.8385811912548654, + "learning_rate": 0.0001548094373865699, + "loss": 12.5836, + "step": 853 + }, + { + "epoch": 0.046503713081892, + "grad_norm": 0.8237170640137944, + "learning_rate": 0.0001549909255898367, + "loss": 12.6291, + "step": 854 + }, + { + "epoch": 0.046558167078475016, + "grad_norm": 0.823899899114155, + "learning_rate": 0.00015517241379310346, + "loss": 12.4743, + "step": 855 + }, + { + "epoch": 0.04661262107505803, + "grad_norm": 0.7988258901543164, + "learning_rate": 0.00015535390199637023, + "loss": 12.4996, + "step": 856 + }, + { + "epoch": 0.04666707507164104, + "grad_norm": 0.809244770017492, + "learning_rate": 0.00015553539019963703, + "loss": 12.575, + "step": 857 + }, + { + "epoch": 0.04672152906822405, + "grad_norm": 0.7959350249409557, + "learning_rate": 0.0001557168784029038, + "loss": 12.5737, + "step": 858 + }, + { + "epoch": 0.04677598306480706, + "grad_norm": 1.0155065239576782, + "learning_rate": 0.00015589836660617062, + "loss": 12.55, + "step": 859 + }, + { + "epoch": 0.046830437061390076, + "grad_norm": 0.9194464079420726, + "learning_rate": 0.0001560798548094374, + "loss": 12.551, + "step": 860 + }, + { + "epoch": 0.04688489105797308, + "grad_norm": 0.9893867712993335, + "learning_rate": 0.0001562613430127042, + "loss": 12.5736, + "step": 861 + }, + { + "epoch": 0.046939345054556096, + "grad_norm": 0.862259978923311, + "learning_rate": 0.00015644283121597096, + "loss": 12.5943, + "step": 862 + }, + { + "epoch": 0.04699379905113911, + "grad_norm": 0.7750438451575176, + "learning_rate": 0.00015662431941923776, + "loss": 12.5714, + "step": 863 + }, + { + "epoch": 0.04704825304772212, + "grad_norm": 0.8220468097414089, + "learning_rate": 0.00015680580762250453, + "loss": 12.6296, + "step": 864 + }, + { + "epoch": 0.04710270704430513, + "grad_norm": 0.8713481011755143, + "learning_rate": 0.00015698729582577133, + "loss": 12.5893, + "step": 865 + }, + { + "epoch": 0.04715716104088814, + "grad_norm": 0.9801443084095905, + "learning_rate": 0.00015716878402903813, + "loss": 12.7519, + "step": 866 + }, + { + "epoch": 0.04721161503747116, + "grad_norm": 0.9026000187524962, + "learning_rate": 0.00015735027223230493, + "loss": 12.7312, + "step": 867 + }, + { + "epoch": 0.04726606903405417, + "grad_norm": 0.895500696590104, + "learning_rate": 0.0001575317604355717, + "loss": 12.5189, + "step": 868 + }, + { + "epoch": 0.04732052303063718, + "grad_norm": 0.8867758320775264, + "learning_rate": 0.00015771324863883847, + "loss": 12.5417, + "step": 869 + }, + { + "epoch": 0.04737497702722019, + "grad_norm": 0.8124655980552189, + "learning_rate": 0.00015789473684210527, + "loss": 12.6201, + "step": 870 + }, + { + "epoch": 0.047429431023803204, + "grad_norm": 0.9722011505333534, + "learning_rate": 0.00015807622504537206, + "loss": 12.5834, + "step": 871 + }, + { + "epoch": 0.04748388502038622, + "grad_norm": 0.982984567178049, + "learning_rate": 0.00015825771324863886, + "loss": 12.6461, + "step": 872 + }, + { + "epoch": 0.047538339016969224, + "grad_norm": 0.8381062655513668, + "learning_rate": 0.00015843920145190563, + "loss": 12.5809, + "step": 873 + }, + { + "epoch": 0.04759279301355224, + "grad_norm": 0.9650603897304773, + "learning_rate": 0.00015862068965517243, + "loss": 12.7033, + "step": 874 + }, + { + "epoch": 0.04764724701013525, + "grad_norm": 0.8854713870148178, + "learning_rate": 0.0001588021778584392, + "loss": 12.5962, + "step": 875 + }, + { + "epoch": 0.047701701006718264, + "grad_norm": 0.7761059432684236, + "learning_rate": 0.000158983666061706, + "loss": 12.6441, + "step": 876 + }, + { + "epoch": 0.04775615500330127, + "grad_norm": 0.8718350296220381, + "learning_rate": 0.00015916515426497277, + "loss": 12.6047, + "step": 877 + }, + { + "epoch": 0.047810608999884284, + "grad_norm": 0.82164764356862, + "learning_rate": 0.0001593466424682396, + "loss": 12.5409, + "step": 878 + }, + { + "epoch": 0.0478650629964673, + "grad_norm": 0.8196215669312907, + "learning_rate": 0.00015952813067150637, + "loss": 12.5776, + "step": 879 + }, + { + "epoch": 0.04791951699305031, + "grad_norm": 1.014476253091219, + "learning_rate": 0.00015970961887477314, + "loss": 12.6802, + "step": 880 + }, + { + "epoch": 0.04797397098963332, + "grad_norm": 0.8195727306565402, + "learning_rate": 0.00015989110707803993, + "loss": 12.5736, + "step": 881 + }, + { + "epoch": 0.04802842498621633, + "grad_norm": 0.862536103470985, + "learning_rate": 0.0001600725952813067, + "loss": 12.4775, + "step": 882 + }, + { + "epoch": 0.048082878982799344, + "grad_norm": 0.7498353482743876, + "learning_rate": 0.0001602540834845735, + "loss": 12.6313, + "step": 883 + }, + { + "epoch": 0.04813733297938236, + "grad_norm": 0.8062614637177298, + "learning_rate": 0.0001604355716878403, + "loss": 12.4909, + "step": 884 + }, + { + "epoch": 0.048191786975965364, + "grad_norm": 0.8415146193366887, + "learning_rate": 0.0001606170598911071, + "loss": 12.6701, + "step": 885 + }, + { + "epoch": 0.04824624097254838, + "grad_norm": 0.8429914823102769, + "learning_rate": 0.00016079854809437387, + "loss": 12.6284, + "step": 886 + }, + { + "epoch": 0.04830069496913139, + "grad_norm": 0.8222857362939741, + "learning_rate": 0.00016098003629764067, + "loss": 12.5522, + "step": 887 + }, + { + "epoch": 0.048355148965714405, + "grad_norm": 0.7890962523234009, + "learning_rate": 0.00016116152450090744, + "loss": 12.5168, + "step": 888 + }, + { + "epoch": 0.04840960296229741, + "grad_norm": 0.8912584208255049, + "learning_rate": 0.00016134301270417424, + "loss": 12.6498, + "step": 889 + }, + { + "epoch": 0.048464056958880425, + "grad_norm": 0.9530476347867869, + "learning_rate": 0.00016152450090744103, + "loss": 12.6872, + "step": 890 + }, + { + "epoch": 0.04851851095546344, + "grad_norm": 0.7724850359717667, + "learning_rate": 0.0001617059891107078, + "loss": 12.5858, + "step": 891 + }, + { + "epoch": 0.04857296495204645, + "grad_norm": 1.0184330473902963, + "learning_rate": 0.0001618874773139746, + "loss": 12.6251, + "step": 892 + }, + { + "epoch": 0.04862741894862946, + "grad_norm": 0.9254790514667174, + "learning_rate": 0.00016206896551724137, + "loss": 12.6039, + "step": 893 + }, + { + "epoch": 0.04868187294521247, + "grad_norm": 0.758029348988625, + "learning_rate": 0.00016225045372050817, + "loss": 12.4594, + "step": 894 + }, + { + "epoch": 0.048736326941795485, + "grad_norm": 0.7274927678606498, + "learning_rate": 0.00016243194192377494, + "loss": 12.5977, + "step": 895 + }, + { + "epoch": 0.0487907809383785, + "grad_norm": 0.8384075646167376, + "learning_rate": 0.00016261343012704177, + "loss": 12.6449, + "step": 896 + }, + { + "epoch": 0.048845234934961505, + "grad_norm": 0.7486479322661992, + "learning_rate": 0.00016279491833030854, + "loss": 12.6152, + "step": 897 + }, + { + "epoch": 0.04889968893154452, + "grad_norm": 0.9106636506573582, + "learning_rate": 0.00016297640653357534, + "loss": 12.6102, + "step": 898 + }, + { + "epoch": 0.04895414292812753, + "grad_norm": 0.8113137306900519, + "learning_rate": 0.0001631578947368421, + "loss": 12.5951, + "step": 899 + }, + { + "epoch": 0.049008596924710546, + "grad_norm": 0.8959399175330679, + "learning_rate": 0.0001633393829401089, + "loss": 12.7228, + "step": 900 + }, + { + "epoch": 0.04906305092129355, + "grad_norm": 1.0275683355568288, + "learning_rate": 0.00016352087114337568, + "loss": 12.7082, + "step": 901 + }, + { + "epoch": 0.049117504917876566, + "grad_norm": 0.889496269768351, + "learning_rate": 0.00016370235934664247, + "loss": 12.6843, + "step": 902 + }, + { + "epoch": 0.04917195891445958, + "grad_norm": 0.7581300503636256, + "learning_rate": 0.00016388384754990927, + "loss": 12.5467, + "step": 903 + }, + { + "epoch": 0.04922641291104259, + "grad_norm": 0.8974381502103214, + "learning_rate": 0.00016406533575317604, + "loss": 12.6412, + "step": 904 + }, + { + "epoch": 0.0492808669076256, + "grad_norm": 0.9315736025199278, + "learning_rate": 0.00016424682395644284, + "loss": 12.6383, + "step": 905 + }, + { + "epoch": 0.04933532090420861, + "grad_norm": 0.8746410644890605, + "learning_rate": 0.0001644283121597096, + "loss": 12.689, + "step": 906 + }, + { + "epoch": 0.049389774900791626, + "grad_norm": 0.8032178195748642, + "learning_rate": 0.0001646098003629764, + "loss": 12.5981, + "step": 907 + }, + { + "epoch": 0.04944422889737464, + "grad_norm": 1.055609092234511, + "learning_rate": 0.0001647912885662432, + "loss": 12.6674, + "step": 908 + }, + { + "epoch": 0.049498682893957646, + "grad_norm": 0.9329468352906664, + "learning_rate": 0.00016497277676951, + "loss": 12.6298, + "step": 909 + }, + { + "epoch": 0.04955313689054066, + "grad_norm": 0.9100919688447834, + "learning_rate": 0.00016515426497277678, + "loss": 12.6856, + "step": 910 + }, + { + "epoch": 0.04960759088712367, + "grad_norm": 0.8157553859700658, + "learning_rate": 0.00016533575317604358, + "loss": 12.6711, + "step": 911 + }, + { + "epoch": 0.04966204488370669, + "grad_norm": 0.8407602222280705, + "learning_rate": 0.00016551724137931035, + "loss": 12.5952, + "step": 912 + }, + { + "epoch": 0.04971649888028969, + "grad_norm": 0.8536838879477414, + "learning_rate": 0.00016569872958257714, + "loss": 12.4985, + "step": 913 + }, + { + "epoch": 0.04977095287687271, + "grad_norm": 0.818610133507912, + "learning_rate": 0.00016588021778584392, + "loss": 12.5662, + "step": 914 + }, + { + "epoch": 0.04982540687345572, + "grad_norm": 0.8852001994986728, + "learning_rate": 0.0001660617059891107, + "loss": 12.5936, + "step": 915 + }, + { + "epoch": 0.049879860870038734, + "grad_norm": 0.9487098832759733, + "learning_rate": 0.0001662431941923775, + "loss": 12.7971, + "step": 916 + }, + { + "epoch": 0.04993431486662174, + "grad_norm": 0.872066612504273, + "learning_rate": 0.00016642468239564428, + "loss": 12.7543, + "step": 917 + }, + { + "epoch": 0.049988768863204754, + "grad_norm": 0.7908053266666842, + "learning_rate": 0.00016660617059891108, + "loss": 12.6731, + "step": 918 + }, + { + "epoch": 0.05004322285978777, + "grad_norm": 0.8909361958279155, + "learning_rate": 0.00016678765880217785, + "loss": 12.6195, + "step": 919 + }, + { + "epoch": 0.05009767685637078, + "grad_norm": 0.8839930655370404, + "learning_rate": 0.00016696914700544465, + "loss": 12.6823, + "step": 920 + }, + { + "epoch": 0.05015213085295379, + "grad_norm": 0.8593282266531492, + "learning_rate": 0.00016715063520871145, + "loss": 12.5395, + "step": 921 + }, + { + "epoch": 0.0502065848495368, + "grad_norm": 0.7522025735405317, + "learning_rate": 0.00016733212341197824, + "loss": 12.5567, + "step": 922 + }, + { + "epoch": 0.050261038846119814, + "grad_norm": 1.0190661738060314, + "learning_rate": 0.00016751361161524502, + "loss": 12.6318, + "step": 923 + }, + { + "epoch": 0.05031549284270283, + "grad_norm": 0.7614223532031728, + "learning_rate": 0.0001676950998185118, + "loss": 12.6078, + "step": 924 + }, + { + "epoch": 0.050369946839285834, + "grad_norm": 0.9321891712221156, + "learning_rate": 0.00016787658802177858, + "loss": 12.6678, + "step": 925 + }, + { + "epoch": 0.05042440083586885, + "grad_norm": 0.8467437632932115, + "learning_rate": 0.00016805807622504538, + "loss": 12.6101, + "step": 926 + }, + { + "epoch": 0.05047885483245186, + "grad_norm": 1.0609254909084949, + "learning_rate": 0.00016823956442831218, + "loss": 12.7275, + "step": 927 + }, + { + "epoch": 0.05053330882903487, + "grad_norm": 0.9175270724116192, + "learning_rate": 0.00016842105263157895, + "loss": 12.7194, + "step": 928 + }, + { + "epoch": 0.05058776282561788, + "grad_norm": 0.9166307896802638, + "learning_rate": 0.00016860254083484575, + "loss": 12.6248, + "step": 929 + }, + { + "epoch": 0.050642216822200894, + "grad_norm": 0.8315736647865203, + "learning_rate": 0.00016878402903811252, + "loss": 12.6652, + "step": 930 + }, + { + "epoch": 0.05069667081878391, + "grad_norm": 0.779456315691902, + "learning_rate": 0.00016896551724137932, + "loss": 12.6193, + "step": 931 + }, + { + "epoch": 0.050751124815366915, + "grad_norm": 0.9332276655363869, + "learning_rate": 0.0001691470054446461, + "loss": 12.4871, + "step": 932 + }, + { + "epoch": 0.05080557881194993, + "grad_norm": 0.7483055003595003, + "learning_rate": 0.0001693284936479129, + "loss": 12.3438, + "step": 933 + }, + { + "epoch": 0.05086003280853294, + "grad_norm": 1.043133034199254, + "learning_rate": 0.00016950998185117968, + "loss": 12.5724, + "step": 934 + }, + { + "epoch": 0.050914486805115955, + "grad_norm": 0.9885656883730269, + "learning_rate": 0.00016969147005444648, + "loss": 12.5337, + "step": 935 + }, + { + "epoch": 0.05096894080169896, + "grad_norm": 0.8284948943642783, + "learning_rate": 0.00016987295825771325, + "loss": 12.7829, + "step": 936 + }, + { + "epoch": 0.051023394798281975, + "grad_norm": 0.8429753478015432, + "learning_rate": 0.00017005444646098005, + "loss": 12.671, + "step": 937 + }, + { + "epoch": 0.05107784879486499, + "grad_norm": 0.8150872100250276, + "learning_rate": 0.00017023593466424682, + "loss": 12.5752, + "step": 938 + }, + { + "epoch": 0.051132302791448, + "grad_norm": 0.975085573256272, + "learning_rate": 0.00017041742286751362, + "loss": 12.7308, + "step": 939 + }, + { + "epoch": 0.05118675678803101, + "grad_norm": 0.8226356283220444, + "learning_rate": 0.00017059891107078042, + "loss": 12.7762, + "step": 940 + }, + { + "epoch": 0.05124121078461402, + "grad_norm": 0.9192672341554673, + "learning_rate": 0.0001707803992740472, + "loss": 12.6354, + "step": 941 + }, + { + "epoch": 0.051295664781197035, + "grad_norm": 0.8457327315713598, + "learning_rate": 0.000170961887477314, + "loss": 12.665, + "step": 942 + }, + { + "epoch": 0.05135011877778005, + "grad_norm": 0.8549173394249183, + "learning_rate": 0.00017114337568058076, + "loss": 12.546, + "step": 943 + }, + { + "epoch": 0.051404572774363055, + "grad_norm": 0.9118763020226056, + "learning_rate": 0.00017132486388384756, + "loss": 12.6458, + "step": 944 + }, + { + "epoch": 0.05145902677094607, + "grad_norm": 0.8463997651394616, + "learning_rate": 0.00017150635208711435, + "loss": 12.7347, + "step": 945 + }, + { + "epoch": 0.05151348076752908, + "grad_norm": 0.8627001066806589, + "learning_rate": 0.00017168784029038115, + "loss": 12.5186, + "step": 946 + }, + { + "epoch": 0.051567934764112096, + "grad_norm": 0.8447051949506521, + "learning_rate": 0.00017186932849364792, + "loss": 12.6107, + "step": 947 + }, + { + "epoch": 0.0516223887606951, + "grad_norm": 0.8246470203738206, + "learning_rate": 0.00017205081669691472, + "loss": 12.6495, + "step": 948 + }, + { + "epoch": 0.051676842757278116, + "grad_norm": 0.8653752959166282, + "learning_rate": 0.0001722323049001815, + "loss": 12.5561, + "step": 949 + }, + { + "epoch": 0.05173129675386113, + "grad_norm": 0.8007262251384583, + "learning_rate": 0.00017241379310344826, + "loss": 12.6482, + "step": 950 + }, + { + "epoch": 0.05178575075044414, + "grad_norm": 0.8102077511779113, + "learning_rate": 0.00017259528130671506, + "loss": 12.6646, + "step": 951 + }, + { + "epoch": 0.05184020474702715, + "grad_norm": 0.904787409745801, + "learning_rate": 0.00017277676950998186, + "loss": 12.613, + "step": 952 + }, + { + "epoch": 0.05189465874361016, + "grad_norm": 0.8622064273999556, + "learning_rate": 0.00017295825771324866, + "loss": 12.6379, + "step": 953 + }, + { + "epoch": 0.051949112740193176, + "grad_norm": 0.9227253225257225, + "learning_rate": 0.00017313974591651543, + "loss": 12.4684, + "step": 954 + }, + { + "epoch": 0.05200356673677619, + "grad_norm": 0.9007687103506802, + "learning_rate": 0.00017332123411978223, + "loss": 12.6496, + "step": 955 + }, + { + "epoch": 0.052058020733359196, + "grad_norm": 0.9285057014615055, + "learning_rate": 0.000173502722323049, + "loss": 12.5537, + "step": 956 + }, + { + "epoch": 0.05211247472994221, + "grad_norm": 0.789205719752802, + "learning_rate": 0.0001736842105263158, + "loss": 12.5936, + "step": 957 + }, + { + "epoch": 0.05216692872652522, + "grad_norm": 0.8941984741380373, + "learning_rate": 0.0001738656987295826, + "loss": 12.5961, + "step": 958 + }, + { + "epoch": 0.05222138272310824, + "grad_norm": 0.9388036268594567, + "learning_rate": 0.0001740471869328494, + "loss": 12.609, + "step": 959 + }, + { + "epoch": 0.05227583671969124, + "grad_norm": 0.9588542625267311, + "learning_rate": 0.00017422867513611616, + "loss": 12.662, + "step": 960 + }, + { + "epoch": 0.05233029071627426, + "grad_norm": 0.9761067195333801, + "learning_rate": 0.00017441016333938296, + "loss": 12.7561, + "step": 961 + }, + { + "epoch": 0.05238474471285727, + "grad_norm": 0.7943628439855379, + "learning_rate": 0.00017459165154264973, + "loss": 12.661, + "step": 962 + }, + { + "epoch": 0.052439198709440284, + "grad_norm": 0.7609648264074853, + "learning_rate": 0.0001747731397459165, + "loss": 12.6289, + "step": 963 + }, + { + "epoch": 0.05249365270602329, + "grad_norm": 0.9609887685299381, + "learning_rate": 0.00017495462794918333, + "loss": 12.7005, + "step": 964 + }, + { + "epoch": 0.052548106702606304, + "grad_norm": 0.7698262991659199, + "learning_rate": 0.0001751361161524501, + "loss": 12.5711, + "step": 965 + }, + { + "epoch": 0.05260256069918932, + "grad_norm": 0.8387736314013369, + "learning_rate": 0.0001753176043557169, + "loss": 12.6618, + "step": 966 + }, + { + "epoch": 0.05265701469577233, + "grad_norm": 0.8919145099367959, + "learning_rate": 0.00017549909255898367, + "loss": 12.6663, + "step": 967 + }, + { + "epoch": 0.05271146869235534, + "grad_norm": 0.9217171184647402, + "learning_rate": 0.00017568058076225046, + "loss": 12.6593, + "step": 968 + }, + { + "epoch": 0.05276592268893835, + "grad_norm": 0.8225240714439287, + "learning_rate": 0.00017586206896551723, + "loss": 12.7085, + "step": 969 + }, + { + "epoch": 0.052820376685521364, + "grad_norm": 0.8413886223855485, + "learning_rate": 0.00017604355716878403, + "loss": 12.5573, + "step": 970 + }, + { + "epoch": 0.05287483068210438, + "grad_norm": 0.7944122346053981, + "learning_rate": 0.00017622504537205083, + "loss": 12.649, + "step": 971 + }, + { + "epoch": 0.052929284678687384, + "grad_norm": 0.9027497161747975, + "learning_rate": 0.00017640653357531763, + "loss": 12.6321, + "step": 972 + }, + { + "epoch": 0.0529837386752704, + "grad_norm": 0.7864322574182031, + "learning_rate": 0.0001765880217785844, + "loss": 12.7023, + "step": 973 + }, + { + "epoch": 0.05303819267185341, + "grad_norm": 0.8749044448504391, + "learning_rate": 0.00017676950998185117, + "loss": 12.6465, + "step": 974 + }, + { + "epoch": 0.053092646668436425, + "grad_norm": 0.733046163574283, + "learning_rate": 0.00017695099818511797, + "loss": 12.5625, + "step": 975 + }, + { + "epoch": 0.05314710066501943, + "grad_norm": 0.850086651338118, + "learning_rate": 0.00017713248638838477, + "loss": 12.6172, + "step": 976 + }, + { + "epoch": 0.053201554661602445, + "grad_norm": 0.9315575438714128, + "learning_rate": 0.00017731397459165156, + "loss": 12.7023, + "step": 977 + }, + { + "epoch": 0.05325600865818546, + "grad_norm": 0.8522769717057842, + "learning_rate": 0.00017749546279491833, + "loss": 12.7316, + "step": 978 + }, + { + "epoch": 0.05331046265476847, + "grad_norm": 0.7952996316417338, + "learning_rate": 0.00017767695099818513, + "loss": 12.5978, + "step": 979 + }, + { + "epoch": 0.05336491665135148, + "grad_norm": 0.8405135630987488, + "learning_rate": 0.0001778584392014519, + "loss": 12.8137, + "step": 980 + }, + { + "epoch": 0.05341937064793449, + "grad_norm": 0.7651645706970682, + "learning_rate": 0.0001780399274047187, + "loss": 12.5507, + "step": 981 + }, + { + "epoch": 0.053473824644517505, + "grad_norm": 0.8459505872252346, + "learning_rate": 0.0001782214156079855, + "loss": 12.5907, + "step": 982 + }, + { + "epoch": 0.05352827864110052, + "grad_norm": 0.7574463269859852, + "learning_rate": 0.0001784029038112523, + "loss": 12.4691, + "step": 983 + }, + { + "epoch": 0.053582732637683525, + "grad_norm": 0.8922042819317508, + "learning_rate": 0.00017858439201451907, + "loss": 12.7275, + "step": 984 + }, + { + "epoch": 0.05363718663426654, + "grad_norm": 0.7755357628949647, + "learning_rate": 0.00017876588021778584, + "loss": 12.6564, + "step": 985 + }, + { + "epoch": 0.05369164063084955, + "grad_norm": 0.888720018295191, + "learning_rate": 0.00017894736842105264, + "loss": 12.4645, + "step": 986 + }, + { + "epoch": 0.053746094627432565, + "grad_norm": 0.8258444684364085, + "learning_rate": 0.0001791288566243194, + "loss": 12.7308, + "step": 987 + }, + { + "epoch": 0.05380054862401557, + "grad_norm": 0.8517525708787187, + "learning_rate": 0.0001793103448275862, + "loss": 12.6665, + "step": 988 + }, + { + "epoch": 0.053855002620598585, + "grad_norm": 0.9308572420303274, + "learning_rate": 0.000179491833030853, + "loss": 12.6683, + "step": 989 + }, + { + "epoch": 0.0539094566171816, + "grad_norm": 0.947701272403253, + "learning_rate": 0.0001796733212341198, + "loss": 12.7302, + "step": 990 + }, + { + "epoch": 0.05396391061376461, + "grad_norm": 0.8412669138813667, + "learning_rate": 0.00017985480943738657, + "loss": 12.4787, + "step": 991 + }, + { + "epoch": 0.05401836461034762, + "grad_norm": 0.9849026278127962, + "learning_rate": 0.00018003629764065337, + "loss": 12.5891, + "step": 992 + }, + { + "epoch": 0.05407281860693063, + "grad_norm": 0.9591250834751449, + "learning_rate": 0.00018021778584392014, + "loss": 12.6623, + "step": 993 + }, + { + "epoch": 0.054127272603513646, + "grad_norm": 0.9657335153684073, + "learning_rate": 0.00018039927404718694, + "loss": 12.6658, + "step": 994 + }, + { + "epoch": 0.05418172660009665, + "grad_norm": 0.7862727136981313, + "learning_rate": 0.00018058076225045374, + "loss": 12.6252, + "step": 995 + }, + { + "epoch": 0.054236180596679666, + "grad_norm": 0.8994392012305145, + "learning_rate": 0.00018076225045372054, + "loss": 12.6117, + "step": 996 + }, + { + "epoch": 0.05429063459326268, + "grad_norm": 0.9888859609422734, + "learning_rate": 0.0001809437386569873, + "loss": 12.542, + "step": 997 + }, + { + "epoch": 0.05434508858984569, + "grad_norm": 0.8085407049387496, + "learning_rate": 0.00018112522686025408, + "loss": 12.511, + "step": 998 + }, + { + "epoch": 0.0543995425864287, + "grad_norm": 0.8706037750909749, + "learning_rate": 0.00018130671506352088, + "loss": 12.6018, + "step": 999 + }, + { + "epoch": 0.05445399658301171, + "grad_norm": 0.9259418524451194, + "learning_rate": 0.00018148820326678765, + "loss": 12.8483, + "step": 1000 + }, + { + "epoch": 0.054508450579594726, + "grad_norm": 0.7746764072383874, + "learning_rate": 0.00018166969147005447, + "loss": 12.5888, + "step": 1001 + }, + { + "epoch": 0.05456290457617774, + "grad_norm": 0.9766058753944424, + "learning_rate": 0.00018185117967332124, + "loss": 12.5827, + "step": 1002 + }, + { + "epoch": 0.054617358572760746, + "grad_norm": 0.8856003767869254, + "learning_rate": 0.00018203266787658804, + "loss": 12.4993, + "step": 1003 + }, + { + "epoch": 0.05467181256934376, + "grad_norm": 0.9485411740557078, + "learning_rate": 0.0001822141560798548, + "loss": 12.677, + "step": 1004 + }, + { + "epoch": 0.05472626656592677, + "grad_norm": 0.7672312949692742, + "learning_rate": 0.0001823956442831216, + "loss": 12.6134, + "step": 1005 + }, + { + "epoch": 0.05478072056250979, + "grad_norm": 0.9172643751252387, + "learning_rate": 0.00018257713248638838, + "loss": 12.7224, + "step": 1006 + }, + { + "epoch": 0.05483517455909279, + "grad_norm": 0.8420022646409505, + "learning_rate": 0.00018275862068965518, + "loss": 12.6461, + "step": 1007 + }, + { + "epoch": 0.05488962855567581, + "grad_norm": 0.8333960383700091, + "learning_rate": 0.00018294010889292198, + "loss": 12.6246, + "step": 1008 + }, + { + "epoch": 0.05494408255225882, + "grad_norm": 1.0096244098506704, + "learning_rate": 0.00018312159709618875, + "loss": 12.7259, + "step": 1009 + }, + { + "epoch": 0.054998536548841834, + "grad_norm": 0.7871373395387745, + "learning_rate": 0.00018330308529945554, + "loss": 12.7275, + "step": 1010 + }, + { + "epoch": 0.05505299054542484, + "grad_norm": 1.032343747821382, + "learning_rate": 0.00018348457350272232, + "loss": 12.6628, + "step": 1011 + }, + { + "epoch": 0.055107444542007854, + "grad_norm": 0.7083079315534552, + "learning_rate": 0.00018366606170598911, + "loss": 12.4748, + "step": 1012 + }, + { + "epoch": 0.05516189853859087, + "grad_norm": 1.1907392275444992, + "learning_rate": 0.0001838475499092559, + "loss": 12.5132, + "step": 1013 + }, + { + "epoch": 0.05521635253517388, + "grad_norm": 0.8442442447866033, + "learning_rate": 0.0001840290381125227, + "loss": 12.6859, + "step": 1014 + }, + { + "epoch": 0.05527080653175689, + "grad_norm": 0.9375890211793874, + "learning_rate": 0.00018421052631578948, + "loss": 12.6139, + "step": 1015 + }, + { + "epoch": 0.0553252605283399, + "grad_norm": 0.7957705162002837, + "learning_rate": 0.00018439201451905628, + "loss": 12.6227, + "step": 1016 + }, + { + "epoch": 0.055379714524922914, + "grad_norm": 0.8572364025943078, + "learning_rate": 0.00018457350272232305, + "loss": 12.4793, + "step": 1017 + }, + { + "epoch": 0.05543416852150593, + "grad_norm": 0.8594165745763462, + "learning_rate": 0.00018475499092558985, + "loss": 12.7353, + "step": 1018 + }, + { + "epoch": 0.055488622518088934, + "grad_norm": 0.8947901476734842, + "learning_rate": 0.00018493647912885665, + "loss": 12.5524, + "step": 1019 + }, + { + "epoch": 0.05554307651467195, + "grad_norm": 0.8317346016577231, + "learning_rate": 0.00018511796733212342, + "loss": 12.5814, + "step": 1020 + }, + { + "epoch": 0.05559753051125496, + "grad_norm": 0.7741287564610906, + "learning_rate": 0.00018529945553539021, + "loss": 12.7118, + "step": 1021 + }, + { + "epoch": 0.055651984507837975, + "grad_norm": 0.9229904453397703, + "learning_rate": 0.00018548094373865698, + "loss": 12.7203, + "step": 1022 + }, + { + "epoch": 0.05570643850442098, + "grad_norm": 0.7189120710008146, + "learning_rate": 0.00018566243194192378, + "loss": 12.7316, + "step": 1023 + }, + { + "epoch": 0.055760892501003995, + "grad_norm": 0.7996729763717355, + "learning_rate": 0.00018584392014519055, + "loss": 12.6907, + "step": 1024 + }, + { + "epoch": 0.05581534649758701, + "grad_norm": 0.8282088203606525, + "learning_rate": 0.00018602540834845735, + "loss": 12.6942, + "step": 1025 + }, + { + "epoch": 0.05586980049417002, + "grad_norm": 0.8101927238311741, + "learning_rate": 0.00018620689655172415, + "loss": 12.6429, + "step": 1026 + }, + { + "epoch": 0.05592425449075303, + "grad_norm": 0.8723302771571373, + "learning_rate": 0.00018638838475499095, + "loss": 12.715, + "step": 1027 + }, + { + "epoch": 0.05597870848733604, + "grad_norm": 0.8993457263793324, + "learning_rate": 0.00018656987295825772, + "loss": 12.7494, + "step": 1028 + }, + { + "epoch": 0.056033162483919055, + "grad_norm": 0.8956846970014478, + "learning_rate": 0.00018675136116152452, + "loss": 12.7523, + "step": 1029 + }, + { + "epoch": 0.05608761648050207, + "grad_norm": 0.848471231997586, + "learning_rate": 0.0001869328493647913, + "loss": 12.7121, + "step": 1030 + }, + { + "epoch": 0.056142070477085075, + "grad_norm": 0.7681001667107862, + "learning_rate": 0.00018711433756805809, + "loss": 12.6024, + "step": 1031 + }, + { + "epoch": 0.05619652447366809, + "grad_norm": 0.7924794070425718, + "learning_rate": 0.00018729582577132488, + "loss": 12.7012, + "step": 1032 + }, + { + "epoch": 0.0562509784702511, + "grad_norm": 0.8349770749508899, + "learning_rate": 0.00018747731397459165, + "loss": 12.6386, + "step": 1033 + }, + { + "epoch": 0.056305432466834116, + "grad_norm": 0.8191879420778659, + "learning_rate": 0.00018765880217785845, + "loss": 12.6633, + "step": 1034 + }, + { + "epoch": 0.05635988646341712, + "grad_norm": 0.9108979371089774, + "learning_rate": 0.00018784029038112522, + "loss": 12.7074, + "step": 1035 + }, + { + "epoch": 0.056414340460000136, + "grad_norm": 0.9995188221538889, + "learning_rate": 0.00018802177858439202, + "loss": 12.7757, + "step": 1036 + }, + { + "epoch": 0.05646879445658315, + "grad_norm": 0.8331304603254887, + "learning_rate": 0.0001882032667876588, + "loss": 12.7268, + "step": 1037 + }, + { + "epoch": 0.05652324845316616, + "grad_norm": 0.8421314134400244, + "learning_rate": 0.00018838475499092562, + "loss": 12.6786, + "step": 1038 + }, + { + "epoch": 0.05657770244974917, + "grad_norm": 0.9249062144746272, + "learning_rate": 0.0001885662431941924, + "loss": 12.6439, + "step": 1039 + }, + { + "epoch": 0.05663215644633218, + "grad_norm": 1.0740033540400613, + "learning_rate": 0.00018874773139745919, + "loss": 12.6613, + "step": 1040 + }, + { + "epoch": 0.056686610442915196, + "grad_norm": 0.9515879897654931, + "learning_rate": 0.00018892921960072596, + "loss": 12.7961, + "step": 1041 + }, + { + "epoch": 0.05674106443949821, + "grad_norm": 1.0687652941035266, + "learning_rate": 0.00018911070780399275, + "loss": 12.7123, + "step": 1042 + }, + { + "epoch": 0.056795518436081216, + "grad_norm": 0.9446694097624536, + "learning_rate": 0.00018929219600725953, + "loss": 12.7562, + "step": 1043 + }, + { + "epoch": 0.05684997243266423, + "grad_norm": 0.9237974447352099, + "learning_rate": 0.00018947368421052632, + "loss": 12.8357, + "step": 1044 + }, + { + "epoch": 0.05690442642924724, + "grad_norm": 0.8936942172764373, + "learning_rate": 0.00018965517241379312, + "loss": 12.7792, + "step": 1045 + }, + { + "epoch": 0.056958880425830256, + "grad_norm": 0.9961818489725659, + "learning_rate": 0.0001898366606170599, + "loss": 12.6665, + "step": 1046 + }, + { + "epoch": 0.05701333442241326, + "grad_norm": 0.8390675260109299, + "learning_rate": 0.0001900181488203267, + "loss": 12.6463, + "step": 1047 + }, + { + "epoch": 0.057067788418996276, + "grad_norm": 0.9127513873491976, + "learning_rate": 0.00019019963702359346, + "loss": 12.6908, + "step": 1048 + }, + { + "epoch": 0.05712224241557929, + "grad_norm": 0.7822521272987107, + "learning_rate": 0.00019038112522686026, + "loss": 12.6916, + "step": 1049 + }, + { + "epoch": 0.0571766964121623, + "grad_norm": 0.9316417618579463, + "learning_rate": 0.00019056261343012706, + "loss": 12.6148, + "step": 1050 + }, + { + "epoch": 0.05723115040874531, + "grad_norm": 0.8267893855283287, + "learning_rate": 0.00019074410163339386, + "loss": 12.6444, + "step": 1051 + }, + { + "epoch": 0.05728560440532832, + "grad_norm": 0.8756184417618663, + "learning_rate": 0.00019092558983666063, + "loss": 12.6479, + "step": 1052 + }, + { + "epoch": 0.05734005840191134, + "grad_norm": 0.8537715754578956, + "learning_rate": 0.00019110707803992742, + "loss": 12.8705, + "step": 1053 + }, + { + "epoch": 0.05739451239849435, + "grad_norm": 0.8793465083226172, + "learning_rate": 0.0001912885662431942, + "loss": 12.6592, + "step": 1054 + }, + { + "epoch": 0.05744896639507736, + "grad_norm": 0.914347190314965, + "learning_rate": 0.000191470054446461, + "loss": 12.6625, + "step": 1055 + }, + { + "epoch": 0.05750342039166037, + "grad_norm": 0.8856692561805453, + "learning_rate": 0.0001916515426497278, + "loss": 12.692, + "step": 1056 + }, + { + "epoch": 0.057557874388243384, + "grad_norm": 0.8458952197724661, + "learning_rate": 0.00019183303085299456, + "loss": 12.7096, + "step": 1057 + }, + { + "epoch": 0.0576123283848264, + "grad_norm": 0.7757824745642351, + "learning_rate": 0.00019201451905626136, + "loss": 12.7721, + "step": 1058 + }, + { + "epoch": 0.057666782381409404, + "grad_norm": 0.975994260166484, + "learning_rate": 0.00019219600725952813, + "loss": 12.719, + "step": 1059 + }, + { + "epoch": 0.05772123637799242, + "grad_norm": 0.8585148684375852, + "learning_rate": 0.00019237749546279493, + "loss": 12.7498, + "step": 1060 + }, + { + "epoch": 0.05777569037457543, + "grad_norm": 0.8545141793726162, + "learning_rate": 0.0001925589836660617, + "loss": 12.7521, + "step": 1061 + }, + { + "epoch": 0.05783014437115844, + "grad_norm": 1.0351696982338359, + "learning_rate": 0.0001927404718693285, + "loss": 12.6858, + "step": 1062 + }, + { + "epoch": 0.05788459836774145, + "grad_norm": 0.8932909538412702, + "learning_rate": 0.0001929219600725953, + "loss": 12.6172, + "step": 1063 + }, + { + "epoch": 0.057939052364324464, + "grad_norm": 0.8033540231795703, + "learning_rate": 0.0001931034482758621, + "loss": 12.7224, + "step": 1064 + }, + { + "epoch": 0.05799350636090748, + "grad_norm": 0.9647008699499642, + "learning_rate": 0.00019328493647912886, + "loss": 12.8202, + "step": 1065 + }, + { + "epoch": 0.058047960357490484, + "grad_norm": 0.8107858754859842, + "learning_rate": 0.00019346642468239566, + "loss": 12.6884, + "step": 1066 + }, + { + "epoch": 0.0581024143540735, + "grad_norm": 0.8763919745645723, + "learning_rate": 0.00019364791288566243, + "loss": 12.5539, + "step": 1067 + }, + { + "epoch": 0.05815686835065651, + "grad_norm": 1.0992738520488685, + "learning_rate": 0.00019382940108892923, + "loss": 12.6674, + "step": 1068 + }, + { + "epoch": 0.058211322347239525, + "grad_norm": 0.7850194207297764, + "learning_rate": 0.00019401088929219603, + "loss": 12.597, + "step": 1069 + }, + { + "epoch": 0.05826577634382253, + "grad_norm": 0.9852227560575982, + "learning_rate": 0.0001941923774954628, + "loss": 12.6947, + "step": 1070 + }, + { + "epoch": 0.058320230340405545, + "grad_norm": 1.0687907738165234, + "learning_rate": 0.0001943738656987296, + "loss": 12.8078, + "step": 1071 + }, + { + "epoch": 0.05837468433698856, + "grad_norm": 0.8874665780436435, + "learning_rate": 0.00019455535390199637, + "loss": 12.6525, + "step": 1072 + }, + { + "epoch": 0.05842913833357157, + "grad_norm": 0.8938001131329727, + "learning_rate": 0.00019473684210526317, + "loss": 12.7372, + "step": 1073 + }, + { + "epoch": 0.05848359233015458, + "grad_norm": 0.8296677243268067, + "learning_rate": 0.00019491833030852994, + "loss": 12.6129, + "step": 1074 + }, + { + "epoch": 0.05853804632673759, + "grad_norm": 0.9888068104357823, + "learning_rate": 0.00019509981851179676, + "loss": 12.8547, + "step": 1075 + }, + { + "epoch": 0.058592500323320605, + "grad_norm": 1.028019721476723, + "learning_rate": 0.00019528130671506353, + "loss": 12.7056, + "step": 1076 + }, + { + "epoch": 0.05864695431990362, + "grad_norm": 0.8361357407910752, + "learning_rate": 0.00019546279491833033, + "loss": 12.6857, + "step": 1077 + }, + { + "epoch": 0.058701408316486625, + "grad_norm": 0.8973490125437626, + "learning_rate": 0.0001956442831215971, + "loss": 12.6222, + "step": 1078 + }, + { + "epoch": 0.05875586231306964, + "grad_norm": 0.7877561882710619, + "learning_rate": 0.00019582577132486387, + "loss": 12.7484, + "step": 1079 + }, + { + "epoch": 0.05881031630965265, + "grad_norm": 1.346493036428632, + "learning_rate": 0.00019600725952813067, + "loss": 12.7032, + "step": 1080 + }, + { + "epoch": 0.058864770306235666, + "grad_norm": 0.9387631520562696, + "learning_rate": 0.00019618874773139747, + "loss": 12.6168, + "step": 1081 + }, + { + "epoch": 0.05891922430281867, + "grad_norm": 0.8426429211134219, + "learning_rate": 0.00019637023593466427, + "loss": 12.7582, + "step": 1082 + }, + { + "epoch": 0.058973678299401686, + "grad_norm": 0.8821282555993558, + "learning_rate": 0.00019655172413793104, + "loss": 12.6952, + "step": 1083 + }, + { + "epoch": 0.0590281322959847, + "grad_norm": 0.9413833624250499, + "learning_rate": 0.00019673321234119784, + "loss": 12.8895, + "step": 1084 + }, + { + "epoch": 0.05908258629256771, + "grad_norm": 1.0551890250213953, + "learning_rate": 0.0001969147005444646, + "loss": 12.7802, + "step": 1085 + }, + { + "epoch": 0.05913704028915072, + "grad_norm": 0.826483127495897, + "learning_rate": 0.0001970961887477314, + "loss": 12.7205, + "step": 1086 + }, + { + "epoch": 0.05919149428573373, + "grad_norm": 0.953767015147213, + "learning_rate": 0.0001972776769509982, + "loss": 12.6614, + "step": 1087 + }, + { + "epoch": 0.059245948282316746, + "grad_norm": 0.8755683812872677, + "learning_rate": 0.000197459165154265, + "loss": 12.7046, + "step": 1088 + }, + { + "epoch": 0.05930040227889976, + "grad_norm": 0.8242367546212285, + "learning_rate": 0.00019764065335753177, + "loss": 12.5865, + "step": 1089 + }, + { + "epoch": 0.059354856275482766, + "grad_norm": 0.8118383627116624, + "learning_rate": 0.00019782214156079857, + "loss": 12.7108, + "step": 1090 + }, + { + "epoch": 0.05940931027206578, + "grad_norm": 0.8309785056550677, + "learning_rate": 0.00019800362976406534, + "loss": 12.7555, + "step": 1091 + }, + { + "epoch": 0.05946376426864879, + "grad_norm": 0.8702493286280903, + "learning_rate": 0.0001981851179673321, + "loss": 12.7094, + "step": 1092 + }, + { + "epoch": 0.059518218265231806, + "grad_norm": 0.8935609387918308, + "learning_rate": 0.00019836660617059894, + "loss": 12.8853, + "step": 1093 + }, + { + "epoch": 0.05957267226181481, + "grad_norm": 0.8176063965684935, + "learning_rate": 0.0001985480943738657, + "loss": 12.7473, + "step": 1094 + }, + { + "epoch": 0.059627126258397826, + "grad_norm": 0.9382886609351457, + "learning_rate": 0.0001987295825771325, + "loss": 12.6793, + "step": 1095 + }, + { + "epoch": 0.05968158025498084, + "grad_norm": 0.8780758861923207, + "learning_rate": 0.00019891107078039928, + "loss": 12.6795, + "step": 1096 + }, + { + "epoch": 0.05973603425156385, + "grad_norm": 0.9060522534957417, + "learning_rate": 0.00019909255898366607, + "loss": 12.6617, + "step": 1097 + }, + { + "epoch": 0.05979048824814686, + "grad_norm": 0.8515080058550354, + "learning_rate": 0.00019927404718693284, + "loss": 12.6893, + "step": 1098 + }, + { + "epoch": 0.05984494224472987, + "grad_norm": 0.8872822863676731, + "learning_rate": 0.00019945553539019964, + "loss": 12.7036, + "step": 1099 + }, + { + "epoch": 0.05989939624131289, + "grad_norm": 0.838829157388531, + "learning_rate": 0.00019963702359346644, + "loss": 12.6507, + "step": 1100 + }, + { + "epoch": 0.0599538502378959, + "grad_norm": 0.8884415432987369, + "learning_rate": 0.00019981851179673324, + "loss": 12.8487, + "step": 1101 + }, + { + "epoch": 0.06000830423447891, + "grad_norm": 0.9740412298051592, + "learning_rate": 0.0002, + "loss": 12.6881, + "step": 1102 + }, + { + "epoch": 0.06006275823106192, + "grad_norm": 0.9241866593303006, + "learning_rate": 0.0001999999996111916, + "loss": 12.7269, + "step": 1103 + }, + { + "epoch": 0.060117212227644934, + "grad_norm": 0.9278070088616288, + "learning_rate": 0.00019999999844476647, + "loss": 12.7711, + "step": 1104 + }, + { + "epoch": 0.06017166622422795, + "grad_norm": 0.8756109875155865, + "learning_rate": 0.00019999999650072457, + "loss": 12.6504, + "step": 1105 + }, + { + "epoch": 0.060226120220810954, + "grad_norm": 0.9555653657587291, + "learning_rate": 0.0001999999937790659, + "loss": 12.5121, + "step": 1106 + }, + { + "epoch": 0.06028057421739397, + "grad_norm": 0.8096454821951842, + "learning_rate": 0.00019999999027979054, + "loss": 12.6582, + "step": 1107 + }, + { + "epoch": 0.06033502821397698, + "grad_norm": 0.9296910383062582, + "learning_rate": 0.00019999998600289846, + "loss": 12.6657, + "step": 1108 + }, + { + "epoch": 0.060389482210559994, + "grad_norm": 0.8466965321383215, + "learning_rate": 0.00019999998094838973, + "loss": 12.6374, + "step": 1109 + }, + { + "epoch": 0.060443936207143, + "grad_norm": 0.8264775366654458, + "learning_rate": 0.0001999999751162644, + "loss": 12.7052, + "step": 1110 + }, + { + "epoch": 0.060498390203726014, + "grad_norm": 0.9400289147748783, + "learning_rate": 0.00019999996850652245, + "loss": 12.7985, + "step": 1111 + }, + { + "epoch": 0.06055284420030903, + "grad_norm": 0.940019638160756, + "learning_rate": 0.00019999996111916399, + "loss": 12.7924, + "step": 1112 + }, + { + "epoch": 0.06060729819689204, + "grad_norm": 0.7509979058254492, + "learning_rate": 0.00019999995295418908, + "loss": 12.6273, + "step": 1113 + }, + { + "epoch": 0.06066175219347505, + "grad_norm": 0.8144515839177705, + "learning_rate": 0.00019999994401159775, + "loss": 12.7125, + "step": 1114 + }, + { + "epoch": 0.06071620619005806, + "grad_norm": 0.7871352320666705, + "learning_rate": 0.0001999999342913901, + "loss": 12.6371, + "step": 1115 + }, + { + "epoch": 0.060770660186641075, + "grad_norm": 0.8128165695381447, + "learning_rate": 0.0001999999237935662, + "loss": 12.7798, + "step": 1116 + }, + { + "epoch": 0.06082511418322409, + "grad_norm": 0.9145599160586337, + "learning_rate": 0.00019999991251812608, + "loss": 12.7207, + "step": 1117 + }, + { + "epoch": 0.060879568179807095, + "grad_norm": 0.8061894716041041, + "learning_rate": 0.00019999990046506988, + "loss": 12.6761, + "step": 1118 + }, + { + "epoch": 0.06093402217639011, + "grad_norm": 0.84955895633201, + "learning_rate": 0.00019999988763439773, + "loss": 12.6494, + "step": 1119 + }, + { + "epoch": 0.06098847617297312, + "grad_norm": 0.8119447540792254, + "learning_rate": 0.00019999987402610962, + "loss": 12.7347, + "step": 1120 + }, + { + "epoch": 0.061042930169556135, + "grad_norm": 0.8194282221432447, + "learning_rate": 0.00019999985964020577, + "loss": 12.7614, + "step": 1121 + }, + { + "epoch": 0.06109738416613914, + "grad_norm": 0.8014853175758311, + "learning_rate": 0.00019999984447668622, + "loss": 12.7345, + "step": 1122 + }, + { + "epoch": 0.061151838162722155, + "grad_norm": 0.9225218876342778, + "learning_rate": 0.00019999982853555111, + "loss": 12.751, + "step": 1123 + }, + { + "epoch": 0.06120629215930517, + "grad_norm": 0.8304127287116028, + "learning_rate": 0.00019999981181680057, + "loss": 12.7568, + "step": 1124 + }, + { + "epoch": 0.06126074615588818, + "grad_norm": 0.8598278997579997, + "learning_rate": 0.00019999979432043472, + "loss": 12.5819, + "step": 1125 + }, + { + "epoch": 0.06131520015247119, + "grad_norm": 0.8826576027775591, + "learning_rate": 0.00019999977604645368, + "loss": 12.7199, + "step": 1126 + }, + { + "epoch": 0.0613696541490542, + "grad_norm": 0.804636809658474, + "learning_rate": 0.00019999975699485763, + "loss": 12.5365, + "step": 1127 + }, + { + "epoch": 0.061424108145637216, + "grad_norm": 0.8346707911072948, + "learning_rate": 0.00019999973716564672, + "loss": 12.7627, + "step": 1128 + }, + { + "epoch": 0.06147856214222022, + "grad_norm": 0.8549089778900794, + "learning_rate": 0.00019999971655882106, + "loss": 12.6977, + "step": 1129 + }, + { + "epoch": 0.061533016138803236, + "grad_norm": 0.8472158811477527, + "learning_rate": 0.0001999996951743808, + "loss": 12.785, + "step": 1130 + }, + { + "epoch": 0.06158747013538625, + "grad_norm": 0.7968932169209947, + "learning_rate": 0.00019999967301232623, + "loss": 12.6148, + "step": 1131 + }, + { + "epoch": 0.06164192413196926, + "grad_norm": 0.8026579849549752, + "learning_rate": 0.00019999965007265735, + "loss": 12.6353, + "step": 1132 + }, + { + "epoch": 0.06169637812855227, + "grad_norm": 0.793885187224609, + "learning_rate": 0.00019999962635537446, + "loss": 12.6965, + "step": 1133 + }, + { + "epoch": 0.06175083212513528, + "grad_norm": 0.8813812746509808, + "learning_rate": 0.0001999996018604777, + "loss": 12.6435, + "step": 1134 + }, + { + "epoch": 0.061805286121718296, + "grad_norm": 0.7731513844126909, + "learning_rate": 0.00019999957658796725, + "loss": 12.6346, + "step": 1135 + }, + { + "epoch": 0.06185974011830131, + "grad_norm": 0.8927790695605196, + "learning_rate": 0.00019999955053784336, + "loss": 12.8121, + "step": 1136 + }, + { + "epoch": 0.061914194114884316, + "grad_norm": 0.8686464281487228, + "learning_rate": 0.00019999952371010617, + "loss": 12.7914, + "step": 1137 + }, + { + "epoch": 0.06196864811146733, + "grad_norm": 0.8148901326895414, + "learning_rate": 0.0001999994961047559, + "loss": 12.7512, + "step": 1138 + }, + { + "epoch": 0.06202310210805034, + "grad_norm": 0.8212055968907241, + "learning_rate": 0.00019999946772179282, + "loss": 12.7898, + "step": 1139 + }, + { + "epoch": 0.06207755610463336, + "grad_norm": 0.859935862765019, + "learning_rate": 0.00019999943856121707, + "loss": 12.7415, + "step": 1140 + }, + { + "epoch": 0.06213201010121636, + "grad_norm": 0.8271274292210274, + "learning_rate": 0.00019999940862302893, + "loss": 12.5209, + "step": 1141 + }, + { + "epoch": 0.06218646409779938, + "grad_norm": 0.9194182466076581, + "learning_rate": 0.0001999993779072286, + "loss": 12.8127, + "step": 1142 + }, + { + "epoch": 0.06224091809438239, + "grad_norm": 0.7947629670968179, + "learning_rate": 0.00019999934641381635, + "loss": 12.5794, + "step": 1143 + }, + { + "epoch": 0.062295372090965404, + "grad_norm": 0.8589934847949596, + "learning_rate": 0.0001999993141427924, + "loss": 12.862, + "step": 1144 + }, + { + "epoch": 0.06234982608754841, + "grad_norm": 0.9370285835777666, + "learning_rate": 0.00019999928109415706, + "loss": 12.6531, + "step": 1145 + }, + { + "epoch": 0.062404280084131424, + "grad_norm": 0.8650870724910286, + "learning_rate": 0.00019999924726791051, + "loss": 12.7084, + "step": 1146 + }, + { + "epoch": 0.06245873408071444, + "grad_norm": 0.8105626060240043, + "learning_rate": 0.00019999921266405303, + "loss": 12.6377, + "step": 1147 + }, + { + "epoch": 0.06251318807729744, + "grad_norm": 0.8396580432219618, + "learning_rate": 0.00019999917728258493, + "loss": 12.626, + "step": 1148 + }, + { + "epoch": 0.06256764207388046, + "grad_norm": 0.8616677580171016, + "learning_rate": 0.00019999914112350643, + "loss": 12.7841, + "step": 1149 + }, + { + "epoch": 0.06262209607046347, + "grad_norm": 0.8225606805167363, + "learning_rate": 0.00019999910418681783, + "loss": 12.6528, + "step": 1150 + }, + { + "epoch": 0.06267655006704648, + "grad_norm": 0.8853599634712473, + "learning_rate": 0.00019999906647251946, + "loss": 12.9739, + "step": 1151 + }, + { + "epoch": 0.0627310040636295, + "grad_norm": 0.8499949719325622, + "learning_rate": 0.00019999902798061156, + "loss": 12.7154, + "step": 1152 + }, + { + "epoch": 0.0627854580602125, + "grad_norm": 0.7984338515402579, + "learning_rate": 0.00019999898871109445, + "loss": 12.7525, + "step": 1153 + }, + { + "epoch": 0.06283991205679552, + "grad_norm": 0.8270221917898859, + "learning_rate": 0.00019999894866396846, + "loss": 12.6632, + "step": 1154 + }, + { + "epoch": 0.06289436605337853, + "grad_norm": 0.8417958776114451, + "learning_rate": 0.00019999890783923386, + "loss": 12.6455, + "step": 1155 + }, + { + "epoch": 0.06294882004996154, + "grad_norm": 0.9328871763734922, + "learning_rate": 0.00019999886623689098, + "loss": 12.8024, + "step": 1156 + }, + { + "epoch": 0.06300327404654456, + "grad_norm": 1.0856841110950344, + "learning_rate": 0.00019999882385694014, + "loss": 12.6466, + "step": 1157 + }, + { + "epoch": 0.06305772804312756, + "grad_norm": 0.7988820949419523, + "learning_rate": 0.00019999878069938167, + "loss": 12.8665, + "step": 1158 + }, + { + "epoch": 0.06311218203971057, + "grad_norm": 0.906338199416811, + "learning_rate": 0.00019999873676421594, + "loss": 12.826, + "step": 1159 + }, + { + "epoch": 0.06316663603629359, + "grad_norm": 0.9190725587084829, + "learning_rate": 0.00019999869205144323, + "loss": 12.8269, + "step": 1160 + }, + { + "epoch": 0.0632210900328766, + "grad_norm": 0.8270321207992188, + "learning_rate": 0.00019999864656106392, + "loss": 12.7298, + "step": 1161 + }, + { + "epoch": 0.06327554402945962, + "grad_norm": 0.7746905689590335, + "learning_rate": 0.0001999986002930784, + "loss": 12.559, + "step": 1162 + }, + { + "epoch": 0.06332999802604262, + "grad_norm": 0.8378946907958874, + "learning_rate": 0.00019999855324748697, + "loss": 12.8177, + "step": 1163 + }, + { + "epoch": 0.06338445202262563, + "grad_norm": 0.7859625331062614, + "learning_rate": 0.00019999850542429002, + "loss": 12.7153, + "step": 1164 + }, + { + "epoch": 0.06343890601920865, + "grad_norm": 0.8331880794666584, + "learning_rate": 0.00019999845682348792, + "loss": 12.6974, + "step": 1165 + }, + { + "epoch": 0.06349336001579166, + "grad_norm": 0.962658613381509, + "learning_rate": 0.00019999840744508107, + "loss": 12.6871, + "step": 1166 + }, + { + "epoch": 0.06354781401237466, + "grad_norm": 0.819743012172273, + "learning_rate": 0.00019999835728906984, + "loss": 12.5648, + "step": 1167 + }, + { + "epoch": 0.06360226800895769, + "grad_norm": 0.8145608023687774, + "learning_rate": 0.00019999830635545457, + "loss": 12.7575, + "step": 1168 + }, + { + "epoch": 0.06365672200554069, + "grad_norm": 0.9500062161145167, + "learning_rate": 0.00019999825464423574, + "loss": 12.7535, + "step": 1169 + }, + { + "epoch": 0.06371117600212371, + "grad_norm": 0.8339180428418321, + "learning_rate": 0.0001999982021554137, + "loss": 12.7517, + "step": 1170 + }, + { + "epoch": 0.06376562999870672, + "grad_norm": 0.8859042989078137, + "learning_rate": 0.00019999814888898887, + "loss": 12.7061, + "step": 1171 + }, + { + "epoch": 0.06382008399528973, + "grad_norm": 0.9593859826719203, + "learning_rate": 0.00019999809484496167, + "loss": 12.769, + "step": 1172 + }, + { + "epoch": 0.06387453799187275, + "grad_norm": 0.9254926975424156, + "learning_rate": 0.0001999980400233325, + "loss": 12.7988, + "step": 1173 + }, + { + "epoch": 0.06392899198845575, + "grad_norm": 0.9218656660623168, + "learning_rate": 0.00019999798442410177, + "loss": 12.6074, + "step": 1174 + }, + { + "epoch": 0.06398344598503876, + "grad_norm": 0.9195323497831199, + "learning_rate": 0.00019999792804727, + "loss": 12.7693, + "step": 1175 + }, + { + "epoch": 0.06403789998162178, + "grad_norm": 0.8757789350337974, + "learning_rate": 0.00019999787089283757, + "loss": 12.666, + "step": 1176 + }, + { + "epoch": 0.06409235397820479, + "grad_norm": 0.896203375036291, + "learning_rate": 0.0001999978129608049, + "loss": 12.8525, + "step": 1177 + }, + { + "epoch": 0.0641468079747878, + "grad_norm": 0.9466387119592451, + "learning_rate": 0.0001999977542511725, + "loss": 12.7746, + "step": 1178 + }, + { + "epoch": 0.06420126197137081, + "grad_norm": 0.9107062802083105, + "learning_rate": 0.00019999769476394076, + "loss": 12.7753, + "step": 1179 + }, + { + "epoch": 0.06425571596795382, + "grad_norm": 0.8633923015901338, + "learning_rate": 0.00019999763449911017, + "loss": 12.7459, + "step": 1180 + }, + { + "epoch": 0.06431016996453684, + "grad_norm": 0.8523060925255115, + "learning_rate": 0.00019999757345668122, + "loss": 12.7009, + "step": 1181 + }, + { + "epoch": 0.06436462396111985, + "grad_norm": 0.8732343539755694, + "learning_rate": 0.00019999751163665437, + "loss": 12.8505, + "step": 1182 + }, + { + "epoch": 0.06441907795770285, + "grad_norm": 0.8955887659856802, + "learning_rate": 0.00019999744903903007, + "loss": 12.6876, + "step": 1183 + }, + { + "epoch": 0.06447353195428587, + "grad_norm": 0.8869982370172741, + "learning_rate": 0.00019999738566380887, + "loss": 12.6886, + "step": 1184 + }, + { + "epoch": 0.06452798595086888, + "grad_norm": 0.8861061504796613, + "learning_rate": 0.00019999732151099124, + "loss": 12.892, + "step": 1185 + }, + { + "epoch": 0.0645824399474519, + "grad_norm": 0.8650163616204314, + "learning_rate": 0.00019999725658057766, + "loss": 12.6152, + "step": 1186 + }, + { + "epoch": 0.0646368939440349, + "grad_norm": 0.7317105713251505, + "learning_rate": 0.00019999719087256864, + "loss": 12.6659, + "step": 1187 + }, + { + "epoch": 0.06469134794061791, + "grad_norm": 0.9458481135838617, + "learning_rate": 0.00019999712438696467, + "loss": 12.9147, + "step": 1188 + }, + { + "epoch": 0.06474580193720093, + "grad_norm": 0.7615024396322205, + "learning_rate": 0.00019999705712376632, + "loss": 12.7225, + "step": 1189 + }, + { + "epoch": 0.06480025593378394, + "grad_norm": 0.8568408849038813, + "learning_rate": 0.00019999698908297408, + "loss": 12.5842, + "step": 1190 + }, + { + "epoch": 0.06485470993036695, + "grad_norm": 0.814394862279894, + "learning_rate": 0.00019999692026458847, + "loss": 12.8761, + "step": 1191 + }, + { + "epoch": 0.06490916392694997, + "grad_norm": 0.8714145177343299, + "learning_rate": 0.00019999685066861007, + "loss": 12.8574, + "step": 1192 + }, + { + "epoch": 0.06496361792353297, + "grad_norm": 0.8264656686816955, + "learning_rate": 0.00019999678029503936, + "loss": 12.7122, + "step": 1193 + }, + { + "epoch": 0.06501807192011598, + "grad_norm": 0.8104604521918491, + "learning_rate": 0.00019999670914387695, + "loss": 12.6977, + "step": 1194 + }, + { + "epoch": 0.065072525916699, + "grad_norm": 0.8354849449147481, + "learning_rate": 0.0001999966372151233, + "loss": 12.7495, + "step": 1195 + }, + { + "epoch": 0.06512697991328201, + "grad_norm": 0.8670770016037759, + "learning_rate": 0.00019999656450877908, + "loss": 12.8445, + "step": 1196 + }, + { + "epoch": 0.06518143390986503, + "grad_norm": 0.761947793544306, + "learning_rate": 0.00019999649102484475, + "loss": 12.7165, + "step": 1197 + }, + { + "epoch": 0.06523588790644803, + "grad_norm": 0.8575494256497836, + "learning_rate": 0.00019999641676332098, + "loss": 12.7475, + "step": 1198 + }, + { + "epoch": 0.06529034190303104, + "grad_norm": 0.9446326318271256, + "learning_rate": 0.00019999634172420834, + "loss": 12.7995, + "step": 1199 + }, + { + "epoch": 0.06534479589961406, + "grad_norm": 0.8926293577591496, + "learning_rate": 0.00019999626590750733, + "loss": 12.8856, + "step": 1200 + }, + { + "epoch": 0.06539924989619707, + "grad_norm": 0.8785564167937711, + "learning_rate": 0.00019999618931321859, + "loss": 12.4816, + "step": 1201 + }, + { + "epoch": 0.06545370389278007, + "grad_norm": 0.8769465556308361, + "learning_rate": 0.00019999611194134272, + "loss": 12.7773, + "step": 1202 + }, + { + "epoch": 0.0655081578893631, + "grad_norm": 0.8078239982110514, + "learning_rate": 0.0001999960337918803, + "loss": 12.6893, + "step": 1203 + }, + { + "epoch": 0.0655626118859461, + "grad_norm": 0.8764038524011952, + "learning_rate": 0.000199995954864832, + "loss": 12.6881, + "step": 1204 + }, + { + "epoch": 0.06561706588252912, + "grad_norm": 1.0046378226101262, + "learning_rate": 0.00019999587516019834, + "loss": 12.6412, + "step": 1205 + }, + { + "epoch": 0.06567151987911213, + "grad_norm": 0.8499068641220393, + "learning_rate": 0.00019999579467797998, + "loss": 12.8251, + "step": 1206 + }, + { + "epoch": 0.06572597387569513, + "grad_norm": 0.8114198595859603, + "learning_rate": 0.00019999571341817755, + "loss": 12.7518, + "step": 1207 + }, + { + "epoch": 0.06578042787227815, + "grad_norm": 0.7826828314807258, + "learning_rate": 0.0001999956313807917, + "loss": 12.773, + "step": 1208 + }, + { + "epoch": 0.06583488186886116, + "grad_norm": 0.9134842134989745, + "learning_rate": 0.00019999554856582304, + "loss": 12.6953, + "step": 1209 + }, + { + "epoch": 0.06588933586544417, + "grad_norm": 0.8648511371370636, + "learning_rate": 0.00019999546497327227, + "loss": 12.6783, + "step": 1210 + }, + { + "epoch": 0.06594378986202719, + "grad_norm": 0.937972103939476, + "learning_rate": 0.00019999538060313995, + "loss": 12.7148, + "step": 1211 + }, + { + "epoch": 0.0659982438586102, + "grad_norm": 0.7882262998747774, + "learning_rate": 0.00019999529545542677, + "loss": 12.5461, + "step": 1212 + }, + { + "epoch": 0.06605269785519322, + "grad_norm": 0.8935868580453479, + "learning_rate": 0.00019999520953013344, + "loss": 12.8204, + "step": 1213 + }, + { + "epoch": 0.06610715185177622, + "grad_norm": 0.8836712629861396, + "learning_rate": 0.00019999512282726055, + "loss": 12.6649, + "step": 1214 + }, + { + "epoch": 0.06616160584835923, + "grad_norm": 0.7804057926443057, + "learning_rate": 0.00019999503534680888, + "loss": 12.7262, + "step": 1215 + }, + { + "epoch": 0.06621605984494225, + "grad_norm": 0.8700122862674238, + "learning_rate": 0.000199994947088779, + "loss": 12.8055, + "step": 1216 + }, + { + "epoch": 0.06627051384152526, + "grad_norm": 0.8834004841790215, + "learning_rate": 0.00019999485805317164, + "loss": 12.762, + "step": 1217 + }, + { + "epoch": 0.06632496783810826, + "grad_norm": 0.8343108775202628, + "learning_rate": 0.00019999476823998752, + "loss": 12.6515, + "step": 1218 + }, + { + "epoch": 0.06637942183469128, + "grad_norm": 0.8579726093320554, + "learning_rate": 0.00019999467764922728, + "loss": 12.7467, + "step": 1219 + }, + { + "epoch": 0.06643387583127429, + "grad_norm": 0.8654133283698366, + "learning_rate": 0.00019999458628089167, + "loss": 12.7794, + "step": 1220 + }, + { + "epoch": 0.06648832982785731, + "grad_norm": 0.8325988034952059, + "learning_rate": 0.00019999449413498138, + "loss": 12.6766, + "step": 1221 + }, + { + "epoch": 0.06654278382444032, + "grad_norm": 0.7726895332022075, + "learning_rate": 0.00019999440121149715, + "loss": 12.7095, + "step": 1222 + }, + { + "epoch": 0.06659723782102332, + "grad_norm": 0.8422111747797787, + "learning_rate": 0.00019999430751043972, + "loss": 12.7101, + "step": 1223 + }, + { + "epoch": 0.06665169181760634, + "grad_norm": 0.8713740577866859, + "learning_rate": 0.00019999421303180972, + "loss": 12.7866, + "step": 1224 + }, + { + "epoch": 0.06670614581418935, + "grad_norm": 0.7861309399978694, + "learning_rate": 0.000199994117775608, + "loss": 12.5477, + "step": 1225 + }, + { + "epoch": 0.06676059981077236, + "grad_norm": 0.7909438744345703, + "learning_rate": 0.00019999402174183524, + "loss": 12.6011, + "step": 1226 + }, + { + "epoch": 0.06681505380735538, + "grad_norm": 0.805144263404179, + "learning_rate": 0.00019999392493049215, + "loss": 12.6455, + "step": 1227 + }, + { + "epoch": 0.06686950780393838, + "grad_norm": 0.9161247444164494, + "learning_rate": 0.0001999938273415796, + "loss": 12.9511, + "step": 1228 + }, + { + "epoch": 0.0669239618005214, + "grad_norm": 0.8007339932273023, + "learning_rate": 0.00019999372897509826, + "loss": 12.4325, + "step": 1229 + }, + { + "epoch": 0.06697841579710441, + "grad_norm": 0.8580605244250328, + "learning_rate": 0.00019999362983104887, + "loss": 12.7317, + "step": 1230 + }, + { + "epoch": 0.06703286979368742, + "grad_norm": 0.9518365532593344, + "learning_rate": 0.0001999935299094323, + "loss": 12.8734, + "step": 1231 + }, + { + "epoch": 0.06708732379027044, + "grad_norm": 0.8239272304844115, + "learning_rate": 0.00019999342921024927, + "loss": 12.7499, + "step": 1232 + }, + { + "epoch": 0.06714177778685344, + "grad_norm": 0.8968558651975471, + "learning_rate": 0.00019999332773350053, + "loss": 12.8772, + "step": 1233 + }, + { + "epoch": 0.06719623178343645, + "grad_norm": 0.9650739082159776, + "learning_rate": 0.00019999322547918692, + "loss": 12.864, + "step": 1234 + }, + { + "epoch": 0.06725068578001947, + "grad_norm": 0.8170253300214879, + "learning_rate": 0.00019999312244730924, + "loss": 12.8578, + "step": 1235 + }, + { + "epoch": 0.06730513977660248, + "grad_norm": 0.8107661672269623, + "learning_rate": 0.00019999301863786825, + "loss": 12.6231, + "step": 1236 + }, + { + "epoch": 0.0673595937731855, + "grad_norm": 0.8584076187816582, + "learning_rate": 0.00019999291405086477, + "loss": 12.8479, + "step": 1237 + }, + { + "epoch": 0.0674140477697685, + "grad_norm": 0.8387218124859667, + "learning_rate": 0.00019999280868629964, + "loss": 12.8035, + "step": 1238 + }, + { + "epoch": 0.06746850176635151, + "grad_norm": 0.833310340002492, + "learning_rate": 0.00019999270254417363, + "loss": 12.813, + "step": 1239 + }, + { + "epoch": 0.06752295576293453, + "grad_norm": 0.7848080365904714, + "learning_rate": 0.00019999259562448766, + "loss": 12.5327, + "step": 1240 + }, + { + "epoch": 0.06757740975951754, + "grad_norm": 0.8079709147478482, + "learning_rate": 0.00019999248792724244, + "loss": 12.7529, + "step": 1241 + }, + { + "epoch": 0.06763186375610054, + "grad_norm": 0.797562748653487, + "learning_rate": 0.0001999923794524389, + "loss": 12.8353, + "step": 1242 + }, + { + "epoch": 0.06768631775268356, + "grad_norm": 0.7568940240260569, + "learning_rate": 0.00019999227020007783, + "loss": 12.5846, + "step": 1243 + }, + { + "epoch": 0.06774077174926657, + "grad_norm": 0.9021563734495636, + "learning_rate": 0.00019999216017016006, + "loss": 12.7845, + "step": 1244 + }, + { + "epoch": 0.06779522574584959, + "grad_norm": 0.7172017086809768, + "learning_rate": 0.00019999204936268656, + "loss": 12.3948, + "step": 1245 + }, + { + "epoch": 0.0678496797424326, + "grad_norm": 0.899680702607057, + "learning_rate": 0.00019999193777765805, + "loss": 12.6401, + "step": 1246 + }, + { + "epoch": 0.0679041337390156, + "grad_norm": 0.8631946648104156, + "learning_rate": 0.0001999918254150755, + "loss": 12.7803, + "step": 1247 + }, + { + "epoch": 0.06795858773559862, + "grad_norm": 0.8901750091644829, + "learning_rate": 0.00019999171227493974, + "loss": 12.7478, + "step": 1248 + }, + { + "epoch": 0.06801304173218163, + "grad_norm": 0.8298302381975877, + "learning_rate": 0.00019999159835725166, + "loss": 12.6882, + "step": 1249 + }, + { + "epoch": 0.06806749572876464, + "grad_norm": 0.8391719325869385, + "learning_rate": 0.00019999148366201214, + "loss": 12.7615, + "step": 1250 + }, + { + "epoch": 0.06812194972534766, + "grad_norm": 0.9542640810854296, + "learning_rate": 0.0001999913681892221, + "loss": 12.8767, + "step": 1251 + }, + { + "epoch": 0.06817640372193066, + "grad_norm": 0.7886309533998018, + "learning_rate": 0.00019999125193888238, + "loss": 12.6535, + "step": 1252 + }, + { + "epoch": 0.06823085771851368, + "grad_norm": 0.8938502848779079, + "learning_rate": 0.0001999911349109939, + "loss": 12.7479, + "step": 1253 + }, + { + "epoch": 0.06828531171509669, + "grad_norm": 1.01894846212262, + "learning_rate": 0.00019999101710555762, + "loss": 12.7659, + "step": 1254 + }, + { + "epoch": 0.0683397657116797, + "grad_norm": 0.8200912021011945, + "learning_rate": 0.0001999908985225744, + "loss": 12.7416, + "step": 1255 + }, + { + "epoch": 0.06839421970826272, + "grad_norm": 0.7572550774775388, + "learning_rate": 0.00019999077916204517, + "loss": 12.7203, + "step": 1256 + }, + { + "epoch": 0.06844867370484572, + "grad_norm": 0.9595690781436814, + "learning_rate": 0.00019999065902397093, + "loss": 12.9055, + "step": 1257 + }, + { + "epoch": 0.06850312770142873, + "grad_norm": 0.9216699050913904, + "learning_rate": 0.00019999053810835254, + "loss": 12.9216, + "step": 1258 + }, + { + "epoch": 0.06855758169801175, + "grad_norm": 0.7783220350551281, + "learning_rate": 0.00019999041641519095, + "loss": 12.7104, + "step": 1259 + }, + { + "epoch": 0.06861203569459476, + "grad_norm": 0.7964697170367746, + "learning_rate": 0.0001999902939444871, + "loss": 12.5829, + "step": 1260 + }, + { + "epoch": 0.06866648969117777, + "grad_norm": 0.947797767672639, + "learning_rate": 0.00019999017069624193, + "loss": 12.8261, + "step": 1261 + }, + { + "epoch": 0.06872094368776079, + "grad_norm": 0.9990216177536839, + "learning_rate": 0.00019999004667045647, + "loss": 12.6765, + "step": 1262 + }, + { + "epoch": 0.06877539768434379, + "grad_norm": 0.9003760583623563, + "learning_rate": 0.00019998992186713165, + "loss": 12.7849, + "step": 1263 + }, + { + "epoch": 0.06882985168092681, + "grad_norm": 0.8353313742840284, + "learning_rate": 0.00019998979628626837, + "loss": 12.6698, + "step": 1264 + }, + { + "epoch": 0.06888430567750982, + "grad_norm": 0.8651149020580153, + "learning_rate": 0.00019998966992786768, + "loss": 12.6586, + "step": 1265 + }, + { + "epoch": 0.06893875967409283, + "grad_norm": 0.9382816369907496, + "learning_rate": 0.0001999895427919306, + "loss": 12.9221, + "step": 1266 + }, + { + "epoch": 0.06899321367067585, + "grad_norm": 0.7998177833076818, + "learning_rate": 0.00019998941487845803, + "loss": 12.6578, + "step": 1267 + }, + { + "epoch": 0.06904766766725885, + "grad_norm": 0.8184372156049693, + "learning_rate": 0.00019998928618745102, + "loss": 12.7793, + "step": 1268 + }, + { + "epoch": 0.06910212166384186, + "grad_norm": 0.8071861459286777, + "learning_rate": 0.00019998915671891055, + "loss": 12.3847, + "step": 1269 + }, + { + "epoch": 0.06915657566042488, + "grad_norm": 0.8132926016895157, + "learning_rate": 0.0001999890264728376, + "loss": 12.8209, + "step": 1270 + }, + { + "epoch": 0.06921102965700789, + "grad_norm": 0.8625809229455872, + "learning_rate": 0.00019998889544923322, + "loss": 12.6077, + "step": 1271 + }, + { + "epoch": 0.0692654836535909, + "grad_norm": 0.877250661583724, + "learning_rate": 0.00019998876364809843, + "loss": 12.888, + "step": 1272 + }, + { + "epoch": 0.06931993765017391, + "grad_norm": 0.9665899385261744, + "learning_rate": 0.00019998863106943427, + "loss": 12.8492, + "step": 1273 + }, + { + "epoch": 0.06937439164675692, + "grad_norm": 0.8873863859176445, + "learning_rate": 0.0001999884977132417, + "loss": 12.7165, + "step": 1274 + }, + { + "epoch": 0.06942884564333994, + "grad_norm": 0.8998207159482029, + "learning_rate": 0.00019998836357952183, + "loss": 12.7232, + "step": 1275 + }, + { + "epoch": 0.06948329963992295, + "grad_norm": 0.9357481029675431, + "learning_rate": 0.00019998822866827568, + "loss": 12.7706, + "step": 1276 + }, + { + "epoch": 0.06953775363650595, + "grad_norm": 0.9390340910364658, + "learning_rate": 0.0001999880929795043, + "loss": 12.8162, + "step": 1277 + }, + { + "epoch": 0.06959220763308897, + "grad_norm": 0.8788392373945151, + "learning_rate": 0.00019998795651320875, + "loss": 12.7985, + "step": 1278 + }, + { + "epoch": 0.06964666162967198, + "grad_norm": 0.9910757288059426, + "learning_rate": 0.00019998781926939004, + "loss": 12.7973, + "step": 1279 + }, + { + "epoch": 0.069701115626255, + "grad_norm": 0.7825317381332015, + "learning_rate": 0.00019998768124804931, + "loss": 12.7743, + "step": 1280 + }, + { + "epoch": 0.069755569622838, + "grad_norm": 0.9466682022844718, + "learning_rate": 0.0001999875424491876, + "loss": 12.8456, + "step": 1281 + }, + { + "epoch": 0.06981002361942101, + "grad_norm": 0.8392171551646117, + "learning_rate": 0.00019998740287280597, + "loss": 12.7803, + "step": 1282 + }, + { + "epoch": 0.06986447761600403, + "grad_norm": 0.9682561669853998, + "learning_rate": 0.00019998726251890556, + "loss": 12.7737, + "step": 1283 + }, + { + "epoch": 0.06991893161258704, + "grad_norm": 1.0730646542580322, + "learning_rate": 0.0001999871213874874, + "loss": 12.6476, + "step": 1284 + }, + { + "epoch": 0.06997338560917005, + "grad_norm": 0.8438483865520613, + "learning_rate": 0.00019998697947855263, + "loss": 12.8169, + "step": 1285 + }, + { + "epoch": 0.07002783960575307, + "grad_norm": 0.883713442906129, + "learning_rate": 0.00019998683679210236, + "loss": 12.7374, + "step": 1286 + }, + { + "epoch": 0.07008229360233607, + "grad_norm": 0.940886253494083, + "learning_rate": 0.00019998669332813764, + "loss": 12.7748, + "step": 1287 + }, + { + "epoch": 0.0701367475989191, + "grad_norm": 0.7718253307822253, + "learning_rate": 0.00019998654908665966, + "loss": 12.665, + "step": 1288 + }, + { + "epoch": 0.0701912015955021, + "grad_norm": 0.9335549581515296, + "learning_rate": 0.00019998640406766947, + "loss": 12.728, + "step": 1289 + }, + { + "epoch": 0.07024565559208511, + "grad_norm": 0.7689425613584284, + "learning_rate": 0.00019998625827116827, + "loss": 12.7429, + "step": 1290 + }, + { + "epoch": 0.07030010958866813, + "grad_norm": 0.8506528158595922, + "learning_rate": 0.00019998611169715712, + "loss": 12.8497, + "step": 1291 + }, + { + "epoch": 0.07035456358525113, + "grad_norm": 0.7830197778249123, + "learning_rate": 0.00019998596434563724, + "loss": 12.8308, + "step": 1292 + }, + { + "epoch": 0.07040901758183414, + "grad_norm": 0.7674751880153996, + "learning_rate": 0.00019998581621660973, + "loss": 12.6265, + "step": 1293 + }, + { + "epoch": 0.07046347157841716, + "grad_norm": 0.8792718237656185, + "learning_rate": 0.0001999856673100757, + "loss": 12.6221, + "step": 1294 + }, + { + "epoch": 0.07051792557500017, + "grad_norm": 0.888901523985518, + "learning_rate": 0.0001999855176260364, + "loss": 12.6924, + "step": 1295 + }, + { + "epoch": 0.07057237957158319, + "grad_norm": 0.8764202643621799, + "learning_rate": 0.00019998536716449292, + "loss": 12.7861, + "step": 1296 + }, + { + "epoch": 0.0706268335681662, + "grad_norm": 1.0514054139949702, + "learning_rate": 0.00019998521592544646, + "loss": 12.738, + "step": 1297 + }, + { + "epoch": 0.0706812875647492, + "grad_norm": 0.9078005384364782, + "learning_rate": 0.0001999850639088982, + "loss": 12.6706, + "step": 1298 + }, + { + "epoch": 0.07073574156133222, + "grad_norm": 1.0033928702575605, + "learning_rate": 0.00019998491111484934, + "loss": 12.7786, + "step": 1299 + }, + { + "epoch": 0.07079019555791523, + "grad_norm": 0.9070619957929708, + "learning_rate": 0.000199984757543301, + "loss": 12.7996, + "step": 1300 + }, + { + "epoch": 0.07084464955449823, + "grad_norm": 0.8637899760457025, + "learning_rate": 0.00019998460319425445, + "loss": 12.6295, + "step": 1301 + }, + { + "epoch": 0.07089910355108126, + "grad_norm": 0.9270227663127247, + "learning_rate": 0.00019998444806771084, + "loss": 12.693, + "step": 1302 + }, + { + "epoch": 0.07095355754766426, + "grad_norm": 0.8678327197234903, + "learning_rate": 0.0001999842921636714, + "loss": 12.6845, + "step": 1303 + }, + { + "epoch": 0.07100801154424728, + "grad_norm": 0.9123627237275738, + "learning_rate": 0.00019998413548213734, + "loss": 12.738, + "step": 1304 + }, + { + "epoch": 0.07106246554083029, + "grad_norm": 0.7763699733385914, + "learning_rate": 0.00019998397802310986, + "loss": 12.6883, + "step": 1305 + }, + { + "epoch": 0.0711169195374133, + "grad_norm": 1.0035685910073238, + "learning_rate": 0.00019998381978659024, + "loss": 12.889, + "step": 1306 + }, + { + "epoch": 0.07117137353399632, + "grad_norm": 0.890221905170563, + "learning_rate": 0.00019998366077257962, + "loss": 12.8601, + "step": 1307 + }, + { + "epoch": 0.07122582753057932, + "grad_norm": 0.8082291689304159, + "learning_rate": 0.0001999835009810793, + "loss": 12.7015, + "step": 1308 + }, + { + "epoch": 0.07128028152716233, + "grad_norm": 0.8360757752561405, + "learning_rate": 0.00019998334041209054, + "loss": 12.7729, + "step": 1309 + }, + { + "epoch": 0.07133473552374535, + "grad_norm": 0.8218382034287375, + "learning_rate": 0.00019998317906561454, + "loss": 12.8634, + "step": 1310 + }, + { + "epoch": 0.07138918952032836, + "grad_norm": 0.924703496644343, + "learning_rate": 0.00019998301694165255, + "loss": 12.7538, + "step": 1311 + }, + { + "epoch": 0.07144364351691138, + "grad_norm": 0.8849010458774731, + "learning_rate": 0.00019998285404020588, + "loss": 12.7549, + "step": 1312 + }, + { + "epoch": 0.07149809751349438, + "grad_norm": 0.7943815027792276, + "learning_rate": 0.00019998269036127577, + "loss": 12.6778, + "step": 1313 + }, + { + "epoch": 0.07155255151007739, + "grad_norm": 0.7558448698202527, + "learning_rate": 0.00019998252590486346, + "loss": 12.7548, + "step": 1314 + }, + { + "epoch": 0.07160700550666041, + "grad_norm": 0.8911125068611316, + "learning_rate": 0.00019998236067097033, + "loss": 12.7131, + "step": 1315 + }, + { + "epoch": 0.07166145950324342, + "grad_norm": 0.8561803781699435, + "learning_rate": 0.00019998219465959752, + "loss": 12.8283, + "step": 1316 + }, + { + "epoch": 0.07171591349982642, + "grad_norm": 0.8473732850401972, + "learning_rate": 0.00019998202787074645, + "loss": 12.8933, + "step": 1317 + }, + { + "epoch": 0.07177036749640944, + "grad_norm": 0.727468497024035, + "learning_rate": 0.00019998186030441832, + "loss": 12.6992, + "step": 1318 + }, + { + "epoch": 0.07182482149299245, + "grad_norm": 0.8180382041248009, + "learning_rate": 0.00019998169196061452, + "loss": 12.6889, + "step": 1319 + }, + { + "epoch": 0.07187927548957547, + "grad_norm": 0.7852274604537989, + "learning_rate": 0.0001999815228393363, + "loss": 12.7212, + "step": 1320 + }, + { + "epoch": 0.07193372948615848, + "grad_norm": 0.9769106293559628, + "learning_rate": 0.00019998135294058497, + "loss": 12.8875, + "step": 1321 + }, + { + "epoch": 0.07198818348274148, + "grad_norm": 0.8503420912576485, + "learning_rate": 0.0001999811822643619, + "loss": 12.7531, + "step": 1322 + }, + { + "epoch": 0.0720426374793245, + "grad_norm": 0.7948220930979146, + "learning_rate": 0.00019998101081066837, + "loss": 12.5656, + "step": 1323 + }, + { + "epoch": 0.07209709147590751, + "grad_norm": 0.8488679894413691, + "learning_rate": 0.00019998083857950577, + "loss": 12.8234, + "step": 1324 + }, + { + "epoch": 0.07215154547249052, + "grad_norm": 0.9413227053637365, + "learning_rate": 0.00019998066557087537, + "loss": 12.7785, + "step": 1325 + }, + { + "epoch": 0.07220599946907354, + "grad_norm": 0.7645388464604446, + "learning_rate": 0.00019998049178477853, + "loss": 12.6192, + "step": 1326 + }, + { + "epoch": 0.07226045346565654, + "grad_norm": 0.9032030787263513, + "learning_rate": 0.00019998031722121663, + "loss": 12.8004, + "step": 1327 + }, + { + "epoch": 0.07231490746223955, + "grad_norm": 0.9060112901526959, + "learning_rate": 0.00019998014188019105, + "loss": 12.8844, + "step": 1328 + }, + { + "epoch": 0.07236936145882257, + "grad_norm": 0.9041037289993008, + "learning_rate": 0.00019997996576170312, + "loss": 12.6895, + "step": 1329 + }, + { + "epoch": 0.07242381545540558, + "grad_norm": 0.8757460620636227, + "learning_rate": 0.00019997978886575416, + "loss": 12.7804, + "step": 1330 + }, + { + "epoch": 0.0724782694519886, + "grad_norm": 0.8629108827418805, + "learning_rate": 0.00019997961119234563, + "loss": 12.6409, + "step": 1331 + }, + { + "epoch": 0.0725327234485716, + "grad_norm": 0.8312445852962548, + "learning_rate": 0.00019997943274147889, + "loss": 12.7261, + "step": 1332 + }, + { + "epoch": 0.07258717744515461, + "grad_norm": 0.8520995556695088, + "learning_rate": 0.00019997925351315527, + "loss": 12.6756, + "step": 1333 + }, + { + "epoch": 0.07264163144173763, + "grad_norm": 0.9686838517002854, + "learning_rate": 0.00019997907350737624, + "loss": 12.8004, + "step": 1334 + }, + { + "epoch": 0.07269608543832064, + "grad_norm": 0.9052860715073252, + "learning_rate": 0.0001999788927241432, + "loss": 12.8299, + "step": 1335 + }, + { + "epoch": 0.07275053943490364, + "grad_norm": 0.7694932364124429, + "learning_rate": 0.00019997871116345746, + "loss": 12.5894, + "step": 1336 + }, + { + "epoch": 0.07280499343148666, + "grad_norm": 0.8697208727797224, + "learning_rate": 0.00019997852882532052, + "loss": 12.7327, + "step": 1337 + }, + { + "epoch": 0.07285944742806967, + "grad_norm": 0.7950392651172854, + "learning_rate": 0.00019997834570973378, + "loss": 12.6369, + "step": 1338 + }, + { + "epoch": 0.07291390142465269, + "grad_norm": 0.9149058956676394, + "learning_rate": 0.00019997816181669865, + "loss": 12.6927, + "step": 1339 + }, + { + "epoch": 0.0729683554212357, + "grad_norm": 0.7917477900076075, + "learning_rate": 0.00019997797714621656, + "loss": 12.6357, + "step": 1340 + }, + { + "epoch": 0.0730228094178187, + "grad_norm": 0.8974376761470795, + "learning_rate": 0.00019997779169828896, + "loss": 12.5907, + "step": 1341 + }, + { + "epoch": 0.07307726341440172, + "grad_norm": 0.9046560938278421, + "learning_rate": 0.0001999776054729173, + "loss": 12.63, + "step": 1342 + }, + { + "epoch": 0.07313171741098473, + "grad_norm": 0.8687122090517582, + "learning_rate": 0.000199977418470103, + "loss": 12.6174, + "step": 1343 + }, + { + "epoch": 0.07318617140756774, + "grad_norm": 0.8219971025694719, + "learning_rate": 0.00019997723068984754, + "loss": 12.7169, + "step": 1344 + }, + { + "epoch": 0.07324062540415076, + "grad_norm": 0.9499603674891286, + "learning_rate": 0.00019997704213215234, + "loss": 12.7951, + "step": 1345 + }, + { + "epoch": 0.07329507940073376, + "grad_norm": 0.8403131431498408, + "learning_rate": 0.00019997685279701889, + "loss": 12.6047, + "step": 1346 + }, + { + "epoch": 0.07334953339731679, + "grad_norm": 1.0116847450199355, + "learning_rate": 0.00019997666268444872, + "loss": 12.7429, + "step": 1347 + }, + { + "epoch": 0.07340398739389979, + "grad_norm": 0.8687286331334413, + "learning_rate": 0.00019997647179444323, + "loss": 12.7573, + "step": 1348 + }, + { + "epoch": 0.0734584413904828, + "grad_norm": 0.9707217570791576, + "learning_rate": 0.0001999762801270039, + "loss": 12.6194, + "step": 1349 + }, + { + "epoch": 0.07351289538706582, + "grad_norm": 0.913308078894722, + "learning_rate": 0.0001999760876821323, + "loss": 12.791, + "step": 1350 + }, + { + "epoch": 0.07356734938364883, + "grad_norm": 0.870495234403029, + "learning_rate": 0.00019997589445982982, + "loss": 12.6617, + "step": 1351 + }, + { + "epoch": 0.07362180338023183, + "grad_norm": 0.9077511397743955, + "learning_rate": 0.00019997570046009807, + "loss": 12.7826, + "step": 1352 + }, + { + "epoch": 0.07367625737681485, + "grad_norm": 0.865216146586963, + "learning_rate": 0.00019997550568293847, + "loss": 12.6991, + "step": 1353 + }, + { + "epoch": 0.07373071137339786, + "grad_norm": 0.8039294697903542, + "learning_rate": 0.00019997531012835257, + "loss": 12.718, + "step": 1354 + }, + { + "epoch": 0.07378516536998088, + "grad_norm": 0.8036977410464696, + "learning_rate": 0.00019997511379634192, + "loss": 12.6545, + "step": 1355 + }, + { + "epoch": 0.07383961936656389, + "grad_norm": 0.9204505253050527, + "learning_rate": 0.00019997491668690803, + "loss": 12.6631, + "step": 1356 + }, + { + "epoch": 0.07389407336314689, + "grad_norm": 0.8244381847188533, + "learning_rate": 0.0001999747188000524, + "loss": 12.7989, + "step": 1357 + }, + { + "epoch": 0.07394852735972991, + "grad_norm": 0.794070214572969, + "learning_rate": 0.00019997452013577658, + "loss": 12.6709, + "step": 1358 + }, + { + "epoch": 0.07400298135631292, + "grad_norm": 0.8115791022284864, + "learning_rate": 0.00019997432069408214, + "loss": 12.7089, + "step": 1359 + }, + { + "epoch": 0.07405743535289593, + "grad_norm": 0.8152891881701941, + "learning_rate": 0.00019997412047497058, + "loss": 12.6603, + "step": 1360 + }, + { + "epoch": 0.07411188934947895, + "grad_norm": 0.8932045405545638, + "learning_rate": 0.00019997391947844354, + "loss": 12.6571, + "step": 1361 + }, + { + "epoch": 0.07416634334606195, + "grad_norm": 0.8189990385705692, + "learning_rate": 0.00019997371770450256, + "loss": 12.7136, + "step": 1362 + }, + { + "epoch": 0.07422079734264497, + "grad_norm": 0.8729171047097313, + "learning_rate": 0.00019997351515314913, + "loss": 12.744, + "step": 1363 + }, + { + "epoch": 0.07427525133922798, + "grad_norm": 0.9197472633047581, + "learning_rate": 0.0001999733118243849, + "loss": 12.6353, + "step": 1364 + }, + { + "epoch": 0.07432970533581099, + "grad_norm": 0.8243862529909959, + "learning_rate": 0.00019997310771821143, + "loss": 12.8734, + "step": 1365 + }, + { + "epoch": 0.074384159332394, + "grad_norm": 0.7922404837557487, + "learning_rate": 0.0001999729028346303, + "loss": 12.7866, + "step": 1366 + }, + { + "epoch": 0.07443861332897701, + "grad_norm": 0.8674908828154206, + "learning_rate": 0.00019997269717364312, + "loss": 12.632, + "step": 1367 + }, + { + "epoch": 0.07449306732556002, + "grad_norm": 0.8979652665766763, + "learning_rate": 0.0001999724907352515, + "loss": 12.6652, + "step": 1368 + }, + { + "epoch": 0.07454752132214304, + "grad_norm": 0.8543331532248469, + "learning_rate": 0.000199972283519457, + "loss": 12.7693, + "step": 1369 + }, + { + "epoch": 0.07460197531872605, + "grad_norm": 0.8961519185426239, + "learning_rate": 0.00019997207552626127, + "loss": 12.7032, + "step": 1370 + }, + { + "epoch": 0.07465642931530907, + "grad_norm": 0.8094646158907232, + "learning_rate": 0.0001999718667556659, + "loss": 12.7456, + "step": 1371 + }, + { + "epoch": 0.07471088331189207, + "grad_norm": 1.108898840885292, + "learning_rate": 0.00019997165720767255, + "loss": 12.7149, + "step": 1372 + }, + { + "epoch": 0.07476533730847508, + "grad_norm": 0.8660662469532117, + "learning_rate": 0.00019997144688228282, + "loss": 12.765, + "step": 1373 + }, + { + "epoch": 0.0748197913050581, + "grad_norm": 0.8121742729568635, + "learning_rate": 0.00019997123577949837, + "loss": 12.7631, + "step": 1374 + }, + { + "epoch": 0.0748742453016411, + "grad_norm": 0.9143360210967595, + "learning_rate": 0.0001999710238993208, + "loss": 12.7528, + "step": 1375 + }, + { + "epoch": 0.07492869929822411, + "grad_norm": 0.8538749301886916, + "learning_rate": 0.0001999708112417518, + "loss": 12.7464, + "step": 1376 + }, + { + "epoch": 0.07498315329480713, + "grad_norm": 0.8345594107229769, + "learning_rate": 0.000199970597806793, + "loss": 12.7875, + "step": 1377 + }, + { + "epoch": 0.07503760729139014, + "grad_norm": 0.799878598540185, + "learning_rate": 0.00019997038359444605, + "loss": 12.6608, + "step": 1378 + }, + { + "epoch": 0.07509206128797316, + "grad_norm": 0.7981199107212789, + "learning_rate": 0.00019997016860471268, + "loss": 12.7673, + "step": 1379 + }, + { + "epoch": 0.07514651528455617, + "grad_norm": 0.8155962023244945, + "learning_rate": 0.00019996995283759445, + "loss": 12.6999, + "step": 1380 + }, + { + "epoch": 0.07520096928113917, + "grad_norm": 0.8283187517960794, + "learning_rate": 0.00019996973629309316, + "loss": 12.7195, + "step": 1381 + }, + { + "epoch": 0.0752554232777222, + "grad_norm": 0.8160714565733528, + "learning_rate": 0.0001999695189712104, + "loss": 12.5985, + "step": 1382 + }, + { + "epoch": 0.0753098772743052, + "grad_norm": 0.7522090236557449, + "learning_rate": 0.0001999693008719479, + "loss": 12.6562, + "step": 1383 + }, + { + "epoch": 0.07536433127088821, + "grad_norm": 0.8246447349504396, + "learning_rate": 0.00019996908199530736, + "loss": 12.7008, + "step": 1384 + }, + { + "epoch": 0.07541878526747123, + "grad_norm": 0.7906471474347675, + "learning_rate": 0.00019996886234129046, + "loss": 12.6111, + "step": 1385 + }, + { + "epoch": 0.07547323926405423, + "grad_norm": 0.8896552341485157, + "learning_rate": 0.00019996864190989895, + "loss": 12.6309, + "step": 1386 + }, + { + "epoch": 0.07552769326063725, + "grad_norm": 0.8374448428066728, + "learning_rate": 0.00019996842070113449, + "loss": 12.8523, + "step": 1387 + }, + { + "epoch": 0.07558214725722026, + "grad_norm": 0.8911290392829221, + "learning_rate": 0.00019996819871499882, + "loss": 12.7662, + "step": 1388 + }, + { + "epoch": 0.07563660125380327, + "grad_norm": 0.8776686874771998, + "learning_rate": 0.00019996797595149367, + "loss": 12.6586, + "step": 1389 + }, + { + "epoch": 0.07569105525038629, + "grad_norm": 0.7757694249013276, + "learning_rate": 0.0001999677524106208, + "loss": 12.7784, + "step": 1390 + }, + { + "epoch": 0.0757455092469693, + "grad_norm": 0.8463963270041872, + "learning_rate": 0.00019996752809238192, + "loss": 12.7496, + "step": 1391 + }, + { + "epoch": 0.0757999632435523, + "grad_norm": 0.9666247978951433, + "learning_rate": 0.0001999673029967788, + "loss": 12.6967, + "step": 1392 + }, + { + "epoch": 0.07585441724013532, + "grad_norm": 0.911715433375072, + "learning_rate": 0.00019996707712381312, + "loss": 12.7605, + "step": 1393 + }, + { + "epoch": 0.07590887123671833, + "grad_norm": 0.896211013201681, + "learning_rate": 0.0001999668504734867, + "loss": 12.7567, + "step": 1394 + }, + { + "epoch": 0.07596332523330133, + "grad_norm": 0.7875639968187476, + "learning_rate": 0.00019996662304580127, + "loss": 12.7262, + "step": 1395 + }, + { + "epoch": 0.07601777922988436, + "grad_norm": 0.8028651920579518, + "learning_rate": 0.00019996639484075863, + "loss": 12.6632, + "step": 1396 + }, + { + "epoch": 0.07607223322646736, + "grad_norm": 0.8290344892272751, + "learning_rate": 0.00019996616585836056, + "loss": 12.7777, + "step": 1397 + }, + { + "epoch": 0.07612668722305038, + "grad_norm": 0.808369979827406, + "learning_rate": 0.0001999659360986088, + "loss": 12.6464, + "step": 1398 + }, + { + "epoch": 0.07618114121963339, + "grad_norm": 0.8593156464362707, + "learning_rate": 0.00019996570556150516, + "loss": 12.8318, + "step": 1399 + }, + { + "epoch": 0.0762355952162164, + "grad_norm": 0.8476622102536071, + "learning_rate": 0.0001999654742470514, + "loss": 12.5885, + "step": 1400 + }, + { + "epoch": 0.07629004921279942, + "grad_norm": 1.0197744142411682, + "learning_rate": 0.0001999652421552494, + "loss": 12.8219, + "step": 1401 + }, + { + "epoch": 0.07634450320938242, + "grad_norm": 0.8796520538054026, + "learning_rate": 0.0001999650092861009, + "loss": 12.7627, + "step": 1402 + }, + { + "epoch": 0.07639895720596543, + "grad_norm": 0.8767347301844162, + "learning_rate": 0.0001999647756396077, + "loss": 12.6461, + "step": 1403 + }, + { + "epoch": 0.07645341120254845, + "grad_norm": 0.8968474287614663, + "learning_rate": 0.00019996454121577167, + "loss": 12.7267, + "step": 1404 + }, + { + "epoch": 0.07650786519913146, + "grad_norm": 0.8363621234719137, + "learning_rate": 0.00019996430601459454, + "loss": 12.627, + "step": 1405 + }, + { + "epoch": 0.07656231919571448, + "grad_norm": 0.8442098672535624, + "learning_rate": 0.00019996407003607827, + "loss": 12.7895, + "step": 1406 + }, + { + "epoch": 0.07661677319229748, + "grad_norm": 0.895294258933168, + "learning_rate": 0.0001999638332802246, + "loss": 12.8037, + "step": 1407 + }, + { + "epoch": 0.07667122718888049, + "grad_norm": 0.8150125735904853, + "learning_rate": 0.0001999635957470354, + "loss": 12.8318, + "step": 1408 + }, + { + "epoch": 0.07672568118546351, + "grad_norm": 0.8107636017840302, + "learning_rate": 0.00019996335743651254, + "loss": 12.7149, + "step": 1409 + }, + { + "epoch": 0.07678013518204652, + "grad_norm": 0.8164062918816573, + "learning_rate": 0.00019996311834865783, + "loss": 12.7795, + "step": 1410 + }, + { + "epoch": 0.07683458917862952, + "grad_norm": 0.7278267164603288, + "learning_rate": 0.00019996287848347315, + "loss": 12.7514, + "step": 1411 + }, + { + "epoch": 0.07688904317521254, + "grad_norm": 0.8448076956301601, + "learning_rate": 0.00019996263784096034, + "loss": 12.8286, + "step": 1412 + }, + { + "epoch": 0.07694349717179555, + "grad_norm": 0.8097564652529006, + "learning_rate": 0.00019996239642112133, + "loss": 12.5989, + "step": 1413 + }, + { + "epoch": 0.07699795116837857, + "grad_norm": 0.8107001306743004, + "learning_rate": 0.00019996215422395794, + "loss": 12.7783, + "step": 1414 + }, + { + "epoch": 0.07705240516496158, + "grad_norm": 0.8086448265638508, + "learning_rate": 0.00019996191124947208, + "loss": 12.6879, + "step": 1415 + }, + { + "epoch": 0.07710685916154458, + "grad_norm": 0.7665013421105268, + "learning_rate": 0.00019996166749766564, + "loss": 12.6297, + "step": 1416 + }, + { + "epoch": 0.0771613131581276, + "grad_norm": 0.7943762132967175, + "learning_rate": 0.00019996142296854047, + "loss": 12.7468, + "step": 1417 + }, + { + "epoch": 0.07721576715471061, + "grad_norm": 0.8981478243989491, + "learning_rate": 0.00019996117766209857, + "loss": 12.8986, + "step": 1418 + }, + { + "epoch": 0.07727022115129362, + "grad_norm": 0.8143901857096051, + "learning_rate": 0.00019996093157834176, + "loss": 12.78, + "step": 1419 + }, + { + "epoch": 0.07732467514787664, + "grad_norm": 0.8756714785828305, + "learning_rate": 0.000199960684717272, + "loss": 12.785, + "step": 1420 + }, + { + "epoch": 0.07737912914445964, + "grad_norm": 0.8496567372331549, + "learning_rate": 0.00019996043707889118, + "loss": 12.7625, + "step": 1421 + }, + { + "epoch": 0.07743358314104266, + "grad_norm": 0.8563016596814594, + "learning_rate": 0.00019996018866320122, + "loss": 12.818, + "step": 1422 + }, + { + "epoch": 0.07748803713762567, + "grad_norm": 0.7546198210158892, + "learning_rate": 0.0001999599394702041, + "loss": 12.6427, + "step": 1423 + }, + { + "epoch": 0.07754249113420868, + "grad_norm": 1.0243034781564102, + "learning_rate": 0.00019995968949990171, + "loss": 12.7954, + "step": 1424 + }, + { + "epoch": 0.0775969451307917, + "grad_norm": 0.8101715088607566, + "learning_rate": 0.000199959438752296, + "loss": 12.7001, + "step": 1425 + }, + { + "epoch": 0.0776513991273747, + "grad_norm": 0.8761233776508695, + "learning_rate": 0.000199959187227389, + "loss": 12.8675, + "step": 1426 + }, + { + "epoch": 0.07770585312395771, + "grad_norm": 0.9545124675812537, + "learning_rate": 0.00019995893492518252, + "loss": 12.7311, + "step": 1427 + }, + { + "epoch": 0.07776030712054073, + "grad_norm": 1.0653521779212813, + "learning_rate": 0.00019995868184567863, + "loss": 12.837, + "step": 1428 + }, + { + "epoch": 0.07781476111712374, + "grad_norm": 0.7823863372608583, + "learning_rate": 0.00019995842798887925, + "loss": 12.646, + "step": 1429 + }, + { + "epoch": 0.07786921511370676, + "grad_norm": 0.9468592289955438, + "learning_rate": 0.0001999581733547864, + "loss": 12.5773, + "step": 1430 + }, + { + "epoch": 0.07792366911028976, + "grad_norm": 0.8859844301333014, + "learning_rate": 0.000199957917943402, + "loss": 12.6881, + "step": 1431 + }, + { + "epoch": 0.07797812310687277, + "grad_norm": 0.7958834591179493, + "learning_rate": 0.00019995766175472807, + "loss": 12.7938, + "step": 1432 + }, + { + "epoch": 0.07803257710345579, + "grad_norm": 0.9045806271425504, + "learning_rate": 0.00019995740478876662, + "loss": 12.8938, + "step": 1433 + }, + { + "epoch": 0.0780870311000388, + "grad_norm": 0.9066374152311184, + "learning_rate": 0.0001999571470455196, + "loss": 12.8734, + "step": 1434 + }, + { + "epoch": 0.0781414850966218, + "grad_norm": 1.105244040197418, + "learning_rate": 0.00019995688852498907, + "loss": 12.8091, + "step": 1435 + }, + { + "epoch": 0.07819593909320482, + "grad_norm": 0.9163991440480068, + "learning_rate": 0.000199956629227177, + "loss": 12.8646, + "step": 1436 + }, + { + "epoch": 0.07825039308978783, + "grad_norm": 0.9264755380848951, + "learning_rate": 0.0001999563691520854, + "loss": 12.6942, + "step": 1437 + }, + { + "epoch": 0.07830484708637085, + "grad_norm": 0.8982000129129306, + "learning_rate": 0.00019995610829971633, + "loss": 12.7016, + "step": 1438 + }, + { + "epoch": 0.07835930108295386, + "grad_norm": 0.8429653568543607, + "learning_rate": 0.0001999558466700718, + "loss": 12.7855, + "step": 1439 + }, + { + "epoch": 0.07841375507953686, + "grad_norm": 1.02653089378891, + "learning_rate": 0.00019995558426315384, + "loss": 12.699, + "step": 1440 + }, + { + "epoch": 0.07846820907611989, + "grad_norm": 0.8931709995895065, + "learning_rate": 0.0001999553210789645, + "loss": 12.8778, + "step": 1441 + }, + { + "epoch": 0.07852266307270289, + "grad_norm": 1.077265824329578, + "learning_rate": 0.00019995505711750583, + "loss": 12.7602, + "step": 1442 + }, + { + "epoch": 0.0785771170692859, + "grad_norm": 0.8471373686576927, + "learning_rate": 0.00019995479237877985, + "loss": 12.7904, + "step": 1443 + }, + { + "epoch": 0.07863157106586892, + "grad_norm": 1.0498061251024426, + "learning_rate": 0.00019995452686278866, + "loss": 12.9102, + "step": 1444 + }, + { + "epoch": 0.07868602506245193, + "grad_norm": 0.9447132721604138, + "learning_rate": 0.00019995426056953428, + "loss": 12.7107, + "step": 1445 + }, + { + "epoch": 0.07874047905903495, + "grad_norm": 0.963176706304791, + "learning_rate": 0.00019995399349901884, + "loss": 12.9, + "step": 1446 + }, + { + "epoch": 0.07879493305561795, + "grad_norm": 0.9043190032368775, + "learning_rate": 0.00019995372565124436, + "loss": 12.8733, + "step": 1447 + }, + { + "epoch": 0.07884938705220096, + "grad_norm": 0.895109525929898, + "learning_rate": 0.00019995345702621296, + "loss": 12.7444, + "step": 1448 + }, + { + "epoch": 0.07890384104878398, + "grad_norm": 1.0157615390283048, + "learning_rate": 0.00019995318762392673, + "loss": 12.7979, + "step": 1449 + }, + { + "epoch": 0.07895829504536699, + "grad_norm": 0.9177562381958061, + "learning_rate": 0.0001999529174443877, + "loss": 12.7878, + "step": 1450 + }, + { + "epoch": 0.07901274904194999, + "grad_norm": 0.8671856971809547, + "learning_rate": 0.0001999526464875981, + "loss": 12.8859, + "step": 1451 + }, + { + "epoch": 0.07906720303853301, + "grad_norm": 0.7667322554377042, + "learning_rate": 0.0001999523747535599, + "loss": 12.7204, + "step": 1452 + }, + { + "epoch": 0.07912165703511602, + "grad_norm": 0.8422632651938327, + "learning_rate": 0.0001999521022422753, + "loss": 12.7557, + "step": 1453 + }, + { + "epoch": 0.07917611103169904, + "grad_norm": 0.9579270469578615, + "learning_rate": 0.00019995182895374635, + "loss": 12.6818, + "step": 1454 + }, + { + "epoch": 0.07923056502828205, + "grad_norm": 0.8821151364436076, + "learning_rate": 0.00019995155488797525, + "loss": 12.6819, + "step": 1455 + }, + { + "epoch": 0.07928501902486505, + "grad_norm": 0.8051216332043438, + "learning_rate": 0.0001999512800449641, + "loss": 12.7483, + "step": 1456 + }, + { + "epoch": 0.07933947302144807, + "grad_norm": 0.8706203100578653, + "learning_rate": 0.00019995100442471504, + "loss": 12.7462, + "step": 1457 + }, + { + "epoch": 0.07939392701803108, + "grad_norm": 0.8392511128264428, + "learning_rate": 0.00019995072802723017, + "loss": 12.7625, + "step": 1458 + }, + { + "epoch": 0.07944838101461409, + "grad_norm": 0.7159408022254062, + "learning_rate": 0.00019995045085251172, + "loss": 12.6584, + "step": 1459 + }, + { + "epoch": 0.0795028350111971, + "grad_norm": 0.8182234292300353, + "learning_rate": 0.00019995017290056177, + "loss": 12.7386, + "step": 1460 + }, + { + "epoch": 0.07955728900778011, + "grad_norm": 0.8776394827159079, + "learning_rate": 0.00019994989417138252, + "loss": 12.84, + "step": 1461 + }, + { + "epoch": 0.07961174300436312, + "grad_norm": 0.7774015631160571, + "learning_rate": 0.00019994961466497614, + "loss": 12.6897, + "step": 1462 + }, + { + "epoch": 0.07966619700094614, + "grad_norm": 0.7153588530373372, + "learning_rate": 0.0001999493343813448, + "loss": 12.696, + "step": 1463 + }, + { + "epoch": 0.07972065099752915, + "grad_norm": 0.8526001778470991, + "learning_rate": 0.00019994905332049067, + "loss": 12.6799, + "step": 1464 + }, + { + "epoch": 0.07977510499411217, + "grad_norm": 0.8395065430273043, + "learning_rate": 0.00019994877148241593, + "loss": 12.7107, + "step": 1465 + }, + { + "epoch": 0.07982955899069517, + "grad_norm": 0.7910115963722115, + "learning_rate": 0.0001999484888671228, + "loss": 12.7074, + "step": 1466 + }, + { + "epoch": 0.07988401298727818, + "grad_norm": 0.8736867046566489, + "learning_rate": 0.00019994820547461343, + "loss": 12.7821, + "step": 1467 + }, + { + "epoch": 0.0799384669838612, + "grad_norm": 0.7909169429327706, + "learning_rate": 0.0001999479213048901, + "loss": 12.7732, + "step": 1468 + }, + { + "epoch": 0.07999292098044421, + "grad_norm": 0.712764262632445, + "learning_rate": 0.00019994763635795493, + "loss": 12.5218, + "step": 1469 + }, + { + "epoch": 0.08004737497702721, + "grad_norm": 0.7489196228051317, + "learning_rate": 0.00019994735063381017, + "loss": 12.7551, + "step": 1470 + }, + { + "epoch": 0.08010182897361023, + "grad_norm": 0.8125103235341982, + "learning_rate": 0.0001999470641324581, + "loss": 12.7572, + "step": 1471 + }, + { + "epoch": 0.08015628297019324, + "grad_norm": 0.8649553566871733, + "learning_rate": 0.00019994677685390087, + "loss": 12.7905, + "step": 1472 + }, + { + "epoch": 0.08021073696677626, + "grad_norm": 0.8974811724427579, + "learning_rate": 0.00019994648879814074, + "loss": 12.7806, + "step": 1473 + }, + { + "epoch": 0.08026519096335927, + "grad_norm": 0.7657442868661224, + "learning_rate": 0.00019994619996517997, + "loss": 12.7352, + "step": 1474 + }, + { + "epoch": 0.08031964495994227, + "grad_norm": 0.8360366489275451, + "learning_rate": 0.00019994591035502076, + "loss": 12.7474, + "step": 1475 + }, + { + "epoch": 0.0803740989565253, + "grad_norm": 0.7867683365999412, + "learning_rate": 0.0001999456199676654, + "loss": 12.5452, + "step": 1476 + }, + { + "epoch": 0.0804285529531083, + "grad_norm": 0.8163665133622418, + "learning_rate": 0.00019994532880311617, + "loss": 12.6865, + "step": 1477 + }, + { + "epoch": 0.08048300694969131, + "grad_norm": 0.8767810719540375, + "learning_rate": 0.00019994503686137524, + "loss": 12.7188, + "step": 1478 + }, + { + "epoch": 0.08053746094627433, + "grad_norm": 0.7954817734841462, + "learning_rate": 0.000199944744142445, + "loss": 12.7262, + "step": 1479 + }, + { + "epoch": 0.08059191494285733, + "grad_norm": 0.7956972360870302, + "learning_rate": 0.00019994445064632762, + "loss": 12.81, + "step": 1480 + }, + { + "epoch": 0.08064636893944035, + "grad_norm": 0.9637155793432769, + "learning_rate": 0.00019994415637302547, + "loss": 12.6966, + "step": 1481 + }, + { + "epoch": 0.08070082293602336, + "grad_norm": 0.8875279185921682, + "learning_rate": 0.0001999438613225408, + "loss": 12.6457, + "step": 1482 + }, + { + "epoch": 0.08075527693260637, + "grad_norm": 0.8889887759230662, + "learning_rate": 0.00019994356549487587, + "loss": 12.7212, + "step": 1483 + }, + { + "epoch": 0.08080973092918939, + "grad_norm": 0.8712764841724392, + "learning_rate": 0.00019994326889003302, + "loss": 12.709, + "step": 1484 + }, + { + "epoch": 0.0808641849257724, + "grad_norm": 0.8618471723921115, + "learning_rate": 0.0001999429715080146, + "loss": 12.6181, + "step": 1485 + }, + { + "epoch": 0.0809186389223554, + "grad_norm": 0.9185534941177032, + "learning_rate": 0.00019994267334882282, + "loss": 12.7084, + "step": 1486 + }, + { + "epoch": 0.08097309291893842, + "grad_norm": 0.8517384145727733, + "learning_rate": 0.0001999423744124601, + "loss": 12.7456, + "step": 1487 + }, + { + "epoch": 0.08102754691552143, + "grad_norm": 0.7427396610603257, + "learning_rate": 0.00019994207469892867, + "loss": 12.6107, + "step": 1488 + }, + { + "epoch": 0.08108200091210445, + "grad_norm": 0.80926578166257, + "learning_rate": 0.00019994177420823092, + "loss": 12.7398, + "step": 1489 + }, + { + "epoch": 0.08113645490868746, + "grad_norm": 0.9206143365338525, + "learning_rate": 0.00019994147294036916, + "loss": 12.7456, + "step": 1490 + }, + { + "epoch": 0.08119090890527046, + "grad_norm": 0.8103141575787716, + "learning_rate": 0.00019994117089534576, + "loss": 12.7612, + "step": 1491 + }, + { + "epoch": 0.08124536290185348, + "grad_norm": 0.8039408920478894, + "learning_rate": 0.00019994086807316306, + "loss": 12.6768, + "step": 1492 + }, + { + "epoch": 0.08129981689843649, + "grad_norm": 0.9364347767261306, + "learning_rate": 0.0001999405644738234, + "loss": 12.7028, + "step": 1493 + }, + { + "epoch": 0.0813542708950195, + "grad_norm": 0.7758284076712086, + "learning_rate": 0.00019994026009732916, + "loss": 12.7189, + "step": 1494 + }, + { + "epoch": 0.08140872489160252, + "grad_norm": 0.7628202869652245, + "learning_rate": 0.00019993995494368272, + "loss": 12.7589, + "step": 1495 + }, + { + "epoch": 0.08146317888818552, + "grad_norm": 0.8970131980378315, + "learning_rate": 0.00019993964901288637, + "loss": 12.7318, + "step": 1496 + }, + { + "epoch": 0.08151763288476854, + "grad_norm": 0.9069018572400294, + "learning_rate": 0.0001999393423049426, + "loss": 12.8936, + "step": 1497 + }, + { + "epoch": 0.08157208688135155, + "grad_norm": 0.876220223123507, + "learning_rate": 0.00019993903481985373, + "loss": 12.812, + "step": 1498 + }, + { + "epoch": 0.08162654087793456, + "grad_norm": 0.8535583445255587, + "learning_rate": 0.00019993872655762215, + "loss": 12.698, + "step": 1499 + }, + { + "epoch": 0.08168099487451758, + "grad_norm": 0.8631114608236895, + "learning_rate": 0.00019993841751825032, + "loss": 12.8378, + "step": 1500 + }, + { + "epoch": 0.08173544887110058, + "grad_norm": 0.9638776193483425, + "learning_rate": 0.00019993810770174055, + "loss": 12.7563, + "step": 1501 + }, + { + "epoch": 0.08178990286768359, + "grad_norm": 1.1874397730440935, + "learning_rate": 0.00019993779710809532, + "loss": 12.4875, + "step": 1502 + }, + { + "epoch": 0.08184435686426661, + "grad_norm": 0.7936581239575987, + "learning_rate": 0.00019993748573731698, + "loss": 12.714, + "step": 1503 + }, + { + "epoch": 0.08189881086084962, + "grad_norm": 0.7976386685488491, + "learning_rate": 0.00019993717358940803, + "loss": 12.7046, + "step": 1504 + }, + { + "epoch": 0.08195326485743264, + "grad_norm": 0.9118200058009364, + "learning_rate": 0.00019993686066437086, + "loss": 12.7126, + "step": 1505 + }, + { + "epoch": 0.08200771885401564, + "grad_norm": 0.8982301897740976, + "learning_rate": 0.00019993654696220787, + "loss": 12.7474, + "step": 1506 + }, + { + "epoch": 0.08206217285059865, + "grad_norm": 0.8340940733070505, + "learning_rate": 0.00019993623248292156, + "loss": 12.6143, + "step": 1507 + }, + { + "epoch": 0.08211662684718167, + "grad_norm": 0.7856628911786119, + "learning_rate": 0.00019993591722651432, + "loss": 12.6957, + "step": 1508 + }, + { + "epoch": 0.08217108084376468, + "grad_norm": 0.8563602762901024, + "learning_rate": 0.00019993560119298866, + "loss": 12.7298, + "step": 1509 + }, + { + "epoch": 0.08222553484034768, + "grad_norm": 0.9064468196397422, + "learning_rate": 0.00019993528438234698, + "loss": 12.7308, + "step": 1510 + }, + { + "epoch": 0.0822799888369307, + "grad_norm": 0.7929295665041554, + "learning_rate": 0.0001999349667945918, + "loss": 12.6453, + "step": 1511 + }, + { + "epoch": 0.08233444283351371, + "grad_norm": 0.7409195103316495, + "learning_rate": 0.00019993464842972552, + "loss": 12.7021, + "step": 1512 + }, + { + "epoch": 0.08238889683009673, + "grad_norm": 1.0174626901426223, + "learning_rate": 0.00019993432928775069, + "loss": 12.654, + "step": 1513 + }, + { + "epoch": 0.08244335082667974, + "grad_norm": 0.8385420276571517, + "learning_rate": 0.00019993400936866974, + "loss": 12.7606, + "step": 1514 + }, + { + "epoch": 0.08249780482326274, + "grad_norm": 0.7773434141608896, + "learning_rate": 0.00019993368867248518, + "loss": 12.6926, + "step": 1515 + }, + { + "epoch": 0.08255225881984576, + "grad_norm": 0.948935487487568, + "learning_rate": 0.0001999333671991995, + "loss": 12.7262, + "step": 1516 + }, + { + "epoch": 0.08260671281642877, + "grad_norm": 0.7845942917925295, + "learning_rate": 0.0001999330449488152, + "loss": 12.7695, + "step": 1517 + }, + { + "epoch": 0.08266116681301178, + "grad_norm": 0.796696184682058, + "learning_rate": 0.00019993272192133477, + "loss": 12.7048, + "step": 1518 + }, + { + "epoch": 0.0827156208095948, + "grad_norm": 0.8115038277668638, + "learning_rate": 0.00019993239811676075, + "loss": 12.7077, + "step": 1519 + }, + { + "epoch": 0.0827700748061778, + "grad_norm": 0.8598970799174874, + "learning_rate": 0.00019993207353509562, + "loss": 12.7663, + "step": 1520 + }, + { + "epoch": 0.08282452880276082, + "grad_norm": 0.8329175862926992, + "learning_rate": 0.00019993174817634196, + "loss": 12.6002, + "step": 1521 + }, + { + "epoch": 0.08287898279934383, + "grad_norm": 0.9493548901124595, + "learning_rate": 0.00019993142204050224, + "loss": 12.8797, + "step": 1522 + }, + { + "epoch": 0.08293343679592684, + "grad_norm": 0.7844916289661439, + "learning_rate": 0.00019993109512757903, + "loss": 12.7075, + "step": 1523 + }, + { + "epoch": 0.08298789079250986, + "grad_norm": 0.7485570224627376, + "learning_rate": 0.00019993076743757485, + "loss": 12.7142, + "step": 1524 + }, + { + "epoch": 0.08304234478909286, + "grad_norm": 0.8230611251453327, + "learning_rate": 0.0001999304389704923, + "loss": 12.6775, + "step": 1525 + }, + { + "epoch": 0.08309679878567587, + "grad_norm": 0.7802141642317684, + "learning_rate": 0.00019993010972633389, + "loss": 12.7862, + "step": 1526 + }, + { + "epoch": 0.08315125278225889, + "grad_norm": 0.804109574654945, + "learning_rate": 0.0001999297797051022, + "loss": 12.7384, + "step": 1527 + }, + { + "epoch": 0.0832057067788419, + "grad_norm": 0.7549845829476725, + "learning_rate": 0.00019992944890679976, + "loss": 12.6077, + "step": 1528 + }, + { + "epoch": 0.0832601607754249, + "grad_norm": 0.8158889763260669, + "learning_rate": 0.00019992911733142916, + "loss": 12.7293, + "step": 1529 + }, + { + "epoch": 0.08331461477200792, + "grad_norm": 0.8322818323951467, + "learning_rate": 0.00019992878497899298, + "loss": 12.7941, + "step": 1530 + }, + { + "epoch": 0.08336906876859093, + "grad_norm": 0.7974712778159135, + "learning_rate": 0.00019992845184949384, + "loss": 12.6936, + "step": 1531 + }, + { + "epoch": 0.08342352276517395, + "grad_norm": 0.789866577373669, + "learning_rate": 0.0001999281179429343, + "loss": 12.684, + "step": 1532 + }, + { + "epoch": 0.08347797676175696, + "grad_norm": 0.8534707399357608, + "learning_rate": 0.00019992778325931694, + "loss": 12.6751, + "step": 1533 + }, + { + "epoch": 0.08353243075833997, + "grad_norm": 0.9367458502912297, + "learning_rate": 0.00019992744779864438, + "loss": 12.7332, + "step": 1534 + }, + { + "epoch": 0.08358688475492299, + "grad_norm": 0.9117832539518956, + "learning_rate": 0.00019992711156091925, + "loss": 12.6615, + "step": 1535 + }, + { + "epoch": 0.08364133875150599, + "grad_norm": 0.8034947883057498, + "learning_rate": 0.00019992677454614414, + "loss": 12.6091, + "step": 1536 + }, + { + "epoch": 0.083695792748089, + "grad_norm": 0.8610815668935233, + "learning_rate": 0.00019992643675432163, + "loss": 12.6785, + "step": 1537 + }, + { + "epoch": 0.08375024674467202, + "grad_norm": 0.7634725773013094, + "learning_rate": 0.00019992609818545443, + "loss": 12.6252, + "step": 1538 + }, + { + "epoch": 0.08380470074125503, + "grad_norm": 0.773853435934736, + "learning_rate": 0.00019992575883954512, + "loss": 12.6299, + "step": 1539 + }, + { + "epoch": 0.08385915473783805, + "grad_norm": 0.83449596965377, + "learning_rate": 0.00019992541871659636, + "loss": 12.7319, + "step": 1540 + }, + { + "epoch": 0.08391360873442105, + "grad_norm": 0.7817968068976223, + "learning_rate": 0.00019992507781661076, + "loss": 12.6707, + "step": 1541 + }, + { + "epoch": 0.08396806273100406, + "grad_norm": 0.8437153661982023, + "learning_rate": 0.00019992473613959102, + "loss": 12.7417, + "step": 1542 + }, + { + "epoch": 0.08402251672758708, + "grad_norm": 0.7850470107731357, + "learning_rate": 0.00019992439368553977, + "loss": 12.7926, + "step": 1543 + }, + { + "epoch": 0.08407697072417009, + "grad_norm": 0.8146054061971123, + "learning_rate": 0.0001999240504544597, + "loss": 12.8017, + "step": 1544 + }, + { + "epoch": 0.08413142472075309, + "grad_norm": 0.871613332041884, + "learning_rate": 0.00019992370644635342, + "loss": 12.8044, + "step": 1545 + }, + { + "epoch": 0.08418587871733611, + "grad_norm": 0.8555636063343115, + "learning_rate": 0.00019992336166122366, + "loss": 12.7711, + "step": 1546 + }, + { + "epoch": 0.08424033271391912, + "grad_norm": 0.9492028973039249, + "learning_rate": 0.0001999230160990731, + "loss": 12.8954, + "step": 1547 + }, + { + "epoch": 0.08429478671050214, + "grad_norm": 0.8221100770793226, + "learning_rate": 0.00019992266975990436, + "loss": 12.8115, + "step": 1548 + }, + { + "epoch": 0.08434924070708515, + "grad_norm": 0.8717001137503292, + "learning_rate": 0.00019992232264372023, + "loss": 12.7496, + "step": 1549 + }, + { + "epoch": 0.08440369470366815, + "grad_norm": 0.8772885860945219, + "learning_rate": 0.00019992197475052334, + "loss": 12.6613, + "step": 1550 + }, + { + "epoch": 0.08445814870025117, + "grad_norm": 0.7647801603416208, + "learning_rate": 0.00019992162608031643, + "loss": 12.605, + "step": 1551 + }, + { + "epoch": 0.08451260269683418, + "grad_norm": 0.8249989839894225, + "learning_rate": 0.00019992127663310218, + "loss": 12.713, + "step": 1552 + }, + { + "epoch": 0.08456705669341719, + "grad_norm": 0.8004163246461077, + "learning_rate": 0.00019992092640888336, + "loss": 12.8538, + "step": 1553 + }, + { + "epoch": 0.0846215106900002, + "grad_norm": 0.7652912812630724, + "learning_rate": 0.00019992057540766262, + "loss": 12.6518, + "step": 1554 + }, + { + "epoch": 0.08467596468658321, + "grad_norm": 0.7636164105422508, + "learning_rate": 0.00019992022362944276, + "loss": 12.7801, + "step": 1555 + }, + { + "epoch": 0.08473041868316623, + "grad_norm": 0.7770954475005418, + "learning_rate": 0.00019991987107422646, + "loss": 12.7198, + "step": 1556 + }, + { + "epoch": 0.08478487267974924, + "grad_norm": 0.7876480663521178, + "learning_rate": 0.0001999195177420165, + "loss": 12.7493, + "step": 1557 + }, + { + "epoch": 0.08483932667633225, + "grad_norm": 0.7950431925010902, + "learning_rate": 0.0001999191636328156, + "loss": 12.7189, + "step": 1558 + }, + { + "epoch": 0.08489378067291527, + "grad_norm": 0.8317608535674442, + "learning_rate": 0.00019991880874662655, + "loss": 12.8068, + "step": 1559 + }, + { + "epoch": 0.08494823466949827, + "grad_norm": 0.9077379398287838, + "learning_rate": 0.00019991845308345204, + "loss": 12.9445, + "step": 1560 + }, + { + "epoch": 0.08500268866608128, + "grad_norm": 1.2586595675046723, + "learning_rate": 0.00019991809664329492, + "loss": 12.7941, + "step": 1561 + }, + { + "epoch": 0.0850571426626643, + "grad_norm": 0.934969280480126, + "learning_rate": 0.00019991773942615795, + "loss": 12.7453, + "step": 1562 + }, + { + "epoch": 0.08511159665924731, + "grad_norm": 0.8689759469261831, + "learning_rate": 0.00019991738143204383, + "loss": 12.716, + "step": 1563 + }, + { + "epoch": 0.08516605065583033, + "grad_norm": 0.9048636278651105, + "learning_rate": 0.00019991702266095542, + "loss": 12.6813, + "step": 1564 + }, + { + "epoch": 0.08522050465241333, + "grad_norm": 0.9309895881664149, + "learning_rate": 0.0001999166631128955, + "loss": 12.8226, + "step": 1565 + }, + { + "epoch": 0.08527495864899634, + "grad_norm": 1.085437499269038, + "learning_rate": 0.00019991630278786682, + "loss": 12.7647, + "step": 1566 + }, + { + "epoch": 0.08532941264557936, + "grad_norm": 0.9087661866473474, + "learning_rate": 0.00019991594168587224, + "loss": 12.8076, + "step": 1567 + }, + { + "epoch": 0.08538386664216237, + "grad_norm": 0.9896982322788401, + "learning_rate": 0.00019991557980691453, + "loss": 12.7972, + "step": 1568 + }, + { + "epoch": 0.08543832063874537, + "grad_norm": 0.832692895973604, + "learning_rate": 0.0001999152171509965, + "loss": 12.7677, + "step": 1569 + }, + { + "epoch": 0.0854927746353284, + "grad_norm": 0.9248339271879307, + "learning_rate": 0.00019991485371812103, + "loss": 12.789, + "step": 1570 + }, + { + "epoch": 0.0855472286319114, + "grad_norm": 0.767994607022334, + "learning_rate": 0.00019991448950829085, + "loss": 12.6611, + "step": 1571 + }, + { + "epoch": 0.08560168262849442, + "grad_norm": 0.8421196864236498, + "learning_rate": 0.0001999141245215089, + "loss": 12.7529, + "step": 1572 + }, + { + "epoch": 0.08565613662507743, + "grad_norm": 0.9014060216445708, + "learning_rate": 0.0001999137587577779, + "loss": 12.7877, + "step": 1573 + }, + { + "epoch": 0.08571059062166043, + "grad_norm": 0.866376171224473, + "learning_rate": 0.00019991339221710078, + "loss": 12.7103, + "step": 1574 + }, + { + "epoch": 0.08576504461824346, + "grad_norm": 0.721762025104229, + "learning_rate": 0.0001999130248994804, + "loss": 12.7141, + "step": 1575 + }, + { + "epoch": 0.08581949861482646, + "grad_norm": 0.7766649078647998, + "learning_rate": 0.00019991265680491954, + "loss": 12.6779, + "step": 1576 + }, + { + "epoch": 0.08587395261140947, + "grad_norm": 0.809840598959927, + "learning_rate": 0.00019991228793342112, + "loss": 12.8861, + "step": 1577 + }, + { + "epoch": 0.08592840660799249, + "grad_norm": 0.8275739487902802, + "learning_rate": 0.000199911918284988, + "loss": 12.5711, + "step": 1578 + }, + { + "epoch": 0.0859828606045755, + "grad_norm": 0.8132028689679895, + "learning_rate": 0.00019991154785962306, + "loss": 12.8249, + "step": 1579 + }, + { + "epoch": 0.08603731460115852, + "grad_norm": 1.3463942717961974, + "learning_rate": 0.00019991117665732914, + "loss": 12.8725, + "step": 1580 + }, + { + "epoch": 0.08609176859774152, + "grad_norm": 0.7744003920615815, + "learning_rate": 0.00019991080467810917, + "loss": 12.733, + "step": 1581 + }, + { + "epoch": 0.08614622259432453, + "grad_norm": 0.8797671865198825, + "learning_rate": 0.00019991043192196602, + "loss": 12.7362, + "step": 1582 + }, + { + "epoch": 0.08620067659090755, + "grad_norm": 0.8685316217744672, + "learning_rate": 0.0001999100583889026, + "loss": 12.8417, + "step": 1583 + }, + { + "epoch": 0.08625513058749056, + "grad_norm": 0.8634938681153937, + "learning_rate": 0.0001999096840789218, + "loss": 12.8137, + "step": 1584 + }, + { + "epoch": 0.08630958458407356, + "grad_norm": 0.9845889721614345, + "learning_rate": 0.00019990930899202656, + "loss": 12.8088, + "step": 1585 + }, + { + "epoch": 0.08636403858065658, + "grad_norm": 0.9203403556282245, + "learning_rate": 0.00019990893312821976, + "loss": 12.8132, + "step": 1586 + }, + { + "epoch": 0.08641849257723959, + "grad_norm": 0.8059258371430177, + "learning_rate": 0.00019990855648750438, + "loss": 12.6877, + "step": 1587 + }, + { + "epoch": 0.08647294657382261, + "grad_norm": 0.9359421414283161, + "learning_rate": 0.00019990817906988327, + "loss": 12.7852, + "step": 1588 + }, + { + "epoch": 0.08652740057040562, + "grad_norm": 0.828140212206367, + "learning_rate": 0.00019990780087535942, + "loss": 12.7481, + "step": 1589 + }, + { + "epoch": 0.08658185456698862, + "grad_norm": 0.8451668669812522, + "learning_rate": 0.00019990742190393573, + "loss": 12.7679, + "step": 1590 + }, + { + "epoch": 0.08663630856357164, + "grad_norm": 0.9171546488032268, + "learning_rate": 0.0001999070421556152, + "loss": 12.7237, + "step": 1591 + }, + { + "epoch": 0.08669076256015465, + "grad_norm": 0.7928322850405436, + "learning_rate": 0.00019990666163040077, + "loss": 12.831, + "step": 1592 + }, + { + "epoch": 0.08674521655673766, + "grad_norm": 0.7971408312985833, + "learning_rate": 0.00019990628032829537, + "loss": 12.6751, + "step": 1593 + }, + { + "epoch": 0.08679967055332068, + "grad_norm": 0.7932785961720318, + "learning_rate": 0.00019990589824930198, + "loss": 12.732, + "step": 1594 + }, + { + "epoch": 0.08685412454990368, + "grad_norm": 0.8749998597118366, + "learning_rate": 0.00019990551539342355, + "loss": 12.7074, + "step": 1595 + }, + { + "epoch": 0.08690857854648669, + "grad_norm": 0.81543285620202, + "learning_rate": 0.0001999051317606631, + "loss": 12.7376, + "step": 1596 + }, + { + "epoch": 0.08696303254306971, + "grad_norm": 0.9261825740964793, + "learning_rate": 0.0001999047473510236, + "loss": 12.8214, + "step": 1597 + }, + { + "epoch": 0.08701748653965272, + "grad_norm": 0.8622490108458055, + "learning_rate": 0.00019990436216450803, + "loss": 12.7227, + "step": 1598 + }, + { + "epoch": 0.08707194053623574, + "grad_norm": 0.8550542644779822, + "learning_rate": 0.00019990397620111937, + "loss": 12.7084, + "step": 1599 + }, + { + "epoch": 0.08712639453281874, + "grad_norm": 0.8381618824955894, + "learning_rate": 0.00019990358946086063, + "loss": 12.6713, + "step": 1600 + }, + { + "epoch": 0.08718084852940175, + "grad_norm": 0.9796927090666376, + "learning_rate": 0.00019990320194373485, + "loss": 12.7857, + "step": 1601 + }, + { + "epoch": 0.08723530252598477, + "grad_norm": 0.9539240134811797, + "learning_rate": 0.000199902813649745, + "loss": 12.7795, + "step": 1602 + }, + { + "epoch": 0.08728975652256778, + "grad_norm": 0.8105784092854924, + "learning_rate": 0.0001999024245788941, + "loss": 12.6445, + "step": 1603 + }, + { + "epoch": 0.08734421051915078, + "grad_norm": 0.7883626144517288, + "learning_rate": 0.00019990203473118522, + "loss": 12.6751, + "step": 1604 + }, + { + "epoch": 0.0873986645157338, + "grad_norm": 0.8767347617569384, + "learning_rate": 0.00019990164410662136, + "loss": 12.6106, + "step": 1605 + }, + { + "epoch": 0.08745311851231681, + "grad_norm": 0.9084313776833122, + "learning_rate": 0.00019990125270520558, + "loss": 12.7858, + "step": 1606 + }, + { + "epoch": 0.08750757250889983, + "grad_norm": 0.8747776366189766, + "learning_rate": 0.0001999008605269409, + "loss": 12.9065, + "step": 1607 + }, + { + "epoch": 0.08756202650548284, + "grad_norm": 0.8489601291243307, + "learning_rate": 0.00019990046757183033, + "loss": 12.8027, + "step": 1608 + }, + { + "epoch": 0.08761648050206584, + "grad_norm": 0.9162027477048691, + "learning_rate": 0.00019990007383987698, + "loss": 12.7236, + "step": 1609 + }, + { + "epoch": 0.08767093449864886, + "grad_norm": 0.8386437850394985, + "learning_rate": 0.00019989967933108394, + "loss": 12.7247, + "step": 1610 + }, + { + "epoch": 0.08772538849523187, + "grad_norm": 0.8829733589907728, + "learning_rate": 0.00019989928404545425, + "loss": 12.7478, + "step": 1611 + }, + { + "epoch": 0.08777984249181488, + "grad_norm": 0.8150772135220602, + "learning_rate": 0.00019989888798299093, + "loss": 12.6919, + "step": 1612 + }, + { + "epoch": 0.0878342964883979, + "grad_norm": 0.7878375599715177, + "learning_rate": 0.0001998984911436971, + "loss": 12.6992, + "step": 1613 + }, + { + "epoch": 0.0878887504849809, + "grad_norm": 0.8416899312712404, + "learning_rate": 0.0001998980935275759, + "loss": 12.7049, + "step": 1614 + }, + { + "epoch": 0.08794320448156392, + "grad_norm": 0.9133401631169028, + "learning_rate": 0.00019989769513463035, + "loss": 12.8479, + "step": 1615 + }, + { + "epoch": 0.08799765847814693, + "grad_norm": 0.8952845893190265, + "learning_rate": 0.00019989729596486355, + "loss": 12.7254, + "step": 1616 + }, + { + "epoch": 0.08805211247472994, + "grad_norm": 0.8407850339653142, + "learning_rate": 0.00019989689601827864, + "loss": 12.7772, + "step": 1617 + }, + { + "epoch": 0.08810656647131296, + "grad_norm": 0.8027908080124432, + "learning_rate": 0.0001998964952948787, + "loss": 12.793, + "step": 1618 + }, + { + "epoch": 0.08816102046789596, + "grad_norm": 0.8720305737368109, + "learning_rate": 0.00019989609379466688, + "loss": 12.8101, + "step": 1619 + }, + { + "epoch": 0.08821547446447897, + "grad_norm": 0.8423254972283053, + "learning_rate": 0.0001998956915176463, + "loss": 12.7649, + "step": 1620 + }, + { + "epoch": 0.08826992846106199, + "grad_norm": 0.8906736340443412, + "learning_rate": 0.00019989528846382, + "loss": 12.6544, + "step": 1621 + }, + { + "epoch": 0.088324382457645, + "grad_norm": 0.8685979598804452, + "learning_rate": 0.00019989488463319127, + "loss": 12.7049, + "step": 1622 + }, + { + "epoch": 0.08837883645422802, + "grad_norm": 0.9201523573659224, + "learning_rate": 0.0001998944800257631, + "loss": 12.7724, + "step": 1623 + }, + { + "epoch": 0.08843329045081103, + "grad_norm": 0.7950617489837553, + "learning_rate": 0.00019989407464153874, + "loss": 12.6128, + "step": 1624 + }, + { + "epoch": 0.08848774444739403, + "grad_norm": 0.8431903484712208, + "learning_rate": 0.00019989366848052127, + "loss": 12.6943, + "step": 1625 + }, + { + "epoch": 0.08854219844397705, + "grad_norm": 0.792854903121601, + "learning_rate": 0.0001998932615427139, + "loss": 12.7372, + "step": 1626 + }, + { + "epoch": 0.08859665244056006, + "grad_norm": 0.919581588175274, + "learning_rate": 0.00019989285382811977, + "loss": 12.8211, + "step": 1627 + }, + { + "epoch": 0.08865110643714307, + "grad_norm": 1.0225144687152612, + "learning_rate": 0.00019989244533674208, + "loss": 12.9169, + "step": 1628 + }, + { + "epoch": 0.08870556043372609, + "grad_norm": 0.9628652507615445, + "learning_rate": 0.00019989203606858395, + "loss": 12.7378, + "step": 1629 + }, + { + "epoch": 0.08876001443030909, + "grad_norm": 0.9482872431610143, + "learning_rate": 0.0001998916260236486, + "loss": 12.7291, + "step": 1630 + }, + { + "epoch": 0.08881446842689211, + "grad_norm": 0.8991062958487321, + "learning_rate": 0.00019989121520193925, + "loss": 12.817, + "step": 1631 + }, + { + "epoch": 0.08886892242347512, + "grad_norm": 0.9023899125767259, + "learning_rate": 0.00019989080360345902, + "loss": 12.7704, + "step": 1632 + }, + { + "epoch": 0.08892337642005813, + "grad_norm": 0.7729477313833728, + "learning_rate": 0.00019989039122821116, + "loss": 12.6245, + "step": 1633 + }, + { + "epoch": 0.08897783041664115, + "grad_norm": 1.0417855932195912, + "learning_rate": 0.00019988997807619886, + "loss": 12.7979, + "step": 1634 + }, + { + "epoch": 0.08903228441322415, + "grad_norm": 0.7937034858967675, + "learning_rate": 0.00019988956414742536, + "loss": 12.7046, + "step": 1635 + }, + { + "epoch": 0.08908673840980716, + "grad_norm": 0.8001933910250998, + "learning_rate": 0.00019988914944189386, + "loss": 12.7312, + "step": 1636 + }, + { + "epoch": 0.08914119240639018, + "grad_norm": 0.8558583761877688, + "learning_rate": 0.00019988873395960756, + "loss": 12.7965, + "step": 1637 + }, + { + "epoch": 0.08919564640297319, + "grad_norm": 0.7599770895603871, + "learning_rate": 0.00019988831770056972, + "loss": 12.7082, + "step": 1638 + }, + { + "epoch": 0.0892501003995562, + "grad_norm": 0.7960868786708084, + "learning_rate": 0.00019988790066478358, + "loss": 12.6888, + "step": 1639 + }, + { + "epoch": 0.08930455439613921, + "grad_norm": 0.7261475247206844, + "learning_rate": 0.00019988748285225237, + "loss": 12.5913, + "step": 1640 + }, + { + "epoch": 0.08935900839272222, + "grad_norm": 0.7993605836884233, + "learning_rate": 0.00019988706426297932, + "loss": 12.717, + "step": 1641 + }, + { + "epoch": 0.08941346238930524, + "grad_norm": 0.748685997004754, + "learning_rate": 0.00019988664489696773, + "loss": 12.794, + "step": 1642 + }, + { + "epoch": 0.08946791638588825, + "grad_norm": 0.8745510045470801, + "learning_rate": 0.00019988622475422085, + "loss": 12.7614, + "step": 1643 + }, + { + "epoch": 0.08952237038247125, + "grad_norm": 0.8887894206010839, + "learning_rate": 0.00019988580383474192, + "loss": 12.8313, + "step": 1644 + }, + { + "epoch": 0.08957682437905427, + "grad_norm": 0.7790629439842283, + "learning_rate": 0.0001998853821385342, + "loss": 12.6754, + "step": 1645 + }, + { + "epoch": 0.08963127837563728, + "grad_norm": 0.7548331983129816, + "learning_rate": 0.00019988495966560103, + "loss": 12.6716, + "step": 1646 + }, + { + "epoch": 0.0896857323722203, + "grad_norm": 0.8683759239019688, + "learning_rate": 0.00019988453641594568, + "loss": 12.7828, + "step": 1647 + }, + { + "epoch": 0.0897401863688033, + "grad_norm": 0.8333977638249082, + "learning_rate": 0.0001998841123895714, + "loss": 12.7488, + "step": 1648 + }, + { + "epoch": 0.08979464036538631, + "grad_norm": 0.7821755083844757, + "learning_rate": 0.0001998836875864815, + "loss": 12.803, + "step": 1649 + }, + { + "epoch": 0.08984909436196933, + "grad_norm": 0.8495799064371253, + "learning_rate": 0.00019988326200667933, + "loss": 12.614, + "step": 1650 + }, + { + "epoch": 0.08990354835855234, + "grad_norm": 0.8922029842895346, + "learning_rate": 0.00019988283565016812, + "loss": 12.7251, + "step": 1651 + }, + { + "epoch": 0.08995800235513535, + "grad_norm": 0.8249423440125538, + "learning_rate": 0.00019988240851695125, + "loss": 12.7249, + "step": 1652 + }, + { + "epoch": 0.09001245635171837, + "grad_norm": 0.7715976194103749, + "learning_rate": 0.00019988198060703205, + "loss": 12.7746, + "step": 1653 + }, + { + "epoch": 0.09006691034830137, + "grad_norm": 0.8493084075161741, + "learning_rate": 0.00019988155192041378, + "loss": 12.6983, + "step": 1654 + }, + { + "epoch": 0.0901213643448844, + "grad_norm": 0.8833809799334171, + "learning_rate": 0.00019988112245709983, + "loss": 12.6754, + "step": 1655 + }, + { + "epoch": 0.0901758183414674, + "grad_norm": 0.813960208688227, + "learning_rate": 0.00019988069221709348, + "loss": 12.7352, + "step": 1656 + }, + { + "epoch": 0.09023027233805041, + "grad_norm": 0.7960213057660804, + "learning_rate": 0.0001998802612003982, + "loss": 12.8401, + "step": 1657 + }, + { + "epoch": 0.09028472633463343, + "grad_norm": 0.7846256038589693, + "learning_rate": 0.0001998798294070172, + "loss": 12.8505, + "step": 1658 + }, + { + "epoch": 0.09033918033121643, + "grad_norm": 0.911526256365673, + "learning_rate": 0.0001998793968369539, + "loss": 12.8778, + "step": 1659 + }, + { + "epoch": 0.09039363432779944, + "grad_norm": 0.8647590123578177, + "learning_rate": 0.00019987896349021167, + "loss": 12.6736, + "step": 1660 + }, + { + "epoch": 0.09044808832438246, + "grad_norm": 0.9390747596810028, + "learning_rate": 0.00019987852936679388, + "loss": 12.7272, + "step": 1661 + }, + { + "epoch": 0.09050254232096547, + "grad_norm": 0.9456846322227296, + "learning_rate": 0.00019987809446670387, + "loss": 12.7014, + "step": 1662 + }, + { + "epoch": 0.09055699631754847, + "grad_norm": 0.9041210520828457, + "learning_rate": 0.00019987765878994507, + "loss": 12.6473, + "step": 1663 + }, + { + "epoch": 0.0906114503141315, + "grad_norm": 0.7426745800172005, + "learning_rate": 0.00019987722233652086, + "loss": 12.7287, + "step": 1664 + }, + { + "epoch": 0.0906659043107145, + "grad_norm": 1.0350350328989921, + "learning_rate": 0.00019987678510643457, + "loss": 12.7622, + "step": 1665 + }, + { + "epoch": 0.09072035830729752, + "grad_norm": 0.7795116162583914, + "learning_rate": 0.0001998763470996897, + "loss": 12.644, + "step": 1666 + }, + { + "epoch": 0.09077481230388053, + "grad_norm": 1.0945356128978212, + "learning_rate": 0.00019987590831628955, + "loss": 12.7748, + "step": 1667 + }, + { + "epoch": 0.09082926630046353, + "grad_norm": 0.9341997157336556, + "learning_rate": 0.00019987546875623765, + "loss": 12.6985, + "step": 1668 + }, + { + "epoch": 0.09088372029704656, + "grad_norm": 1.0754647338306391, + "learning_rate": 0.0001998750284195373, + "loss": 12.743, + "step": 1669 + }, + { + "epoch": 0.09093817429362956, + "grad_norm": 0.8777951178194842, + "learning_rate": 0.00019987458730619202, + "loss": 12.6949, + "step": 1670 + }, + { + "epoch": 0.09099262829021257, + "grad_norm": 0.7755947654045108, + "learning_rate": 0.0001998741454162052, + "loss": 12.7449, + "step": 1671 + }, + { + "epoch": 0.09104708228679559, + "grad_norm": 0.9579356976820738, + "learning_rate": 0.00019987370274958025, + "loss": 12.6444, + "step": 1672 + }, + { + "epoch": 0.0911015362833786, + "grad_norm": 0.8891914081952949, + "learning_rate": 0.00019987325930632065, + "loss": 12.878, + "step": 1673 + }, + { + "epoch": 0.09115599027996162, + "grad_norm": 0.9225125348962756, + "learning_rate": 0.00019987281508642983, + "loss": 12.5637, + "step": 1674 + }, + { + "epoch": 0.09121044427654462, + "grad_norm": 0.7929658741370371, + "learning_rate": 0.00019987237008991127, + "loss": 12.5481, + "step": 1675 + }, + { + "epoch": 0.09126489827312763, + "grad_norm": 0.8286918511585734, + "learning_rate": 0.00019987192431676843, + "loss": 12.7087, + "step": 1676 + }, + { + "epoch": 0.09131935226971065, + "grad_norm": 0.8387876060507591, + "learning_rate": 0.00019987147776700473, + "loss": 12.6977, + "step": 1677 + }, + { + "epoch": 0.09137380626629366, + "grad_norm": 0.8037519330448577, + "learning_rate": 0.00019987103044062364, + "loss": 12.7798, + "step": 1678 + }, + { + "epoch": 0.09142826026287666, + "grad_norm": 0.8990989280319842, + "learning_rate": 0.0001998705823376287, + "loss": 12.7042, + "step": 1679 + }, + { + "epoch": 0.09148271425945968, + "grad_norm": 0.801847772818663, + "learning_rate": 0.00019987013345802336, + "loss": 12.8197, + "step": 1680 + }, + { + "epoch": 0.09153716825604269, + "grad_norm": 0.8304429597862878, + "learning_rate": 0.00019986968380181113, + "loss": 12.682, + "step": 1681 + }, + { + "epoch": 0.09159162225262571, + "grad_norm": 0.886711334594854, + "learning_rate": 0.00019986923336899547, + "loss": 12.5952, + "step": 1682 + }, + { + "epoch": 0.09164607624920872, + "grad_norm": 0.7732843373918673, + "learning_rate": 0.0001998687821595799, + "loss": 12.6595, + "step": 1683 + }, + { + "epoch": 0.09170053024579172, + "grad_norm": 0.9707310684742881, + "learning_rate": 0.00019986833017356797, + "loss": 12.7915, + "step": 1684 + }, + { + "epoch": 0.09175498424237474, + "grad_norm": 0.7450978514787958, + "learning_rate": 0.00019986787741096311, + "loss": 12.6324, + "step": 1685 + }, + { + "epoch": 0.09180943823895775, + "grad_norm": 0.8637568515593369, + "learning_rate": 0.0001998674238717689, + "loss": 12.729, + "step": 1686 + }, + { + "epoch": 0.09186389223554076, + "grad_norm": 0.7476349343697685, + "learning_rate": 0.0001998669695559889, + "loss": 12.6907, + "step": 1687 + }, + { + "epoch": 0.09191834623212378, + "grad_norm": 0.7931163580976857, + "learning_rate": 0.00019986651446362653, + "loss": 12.8222, + "step": 1688 + }, + { + "epoch": 0.09197280022870678, + "grad_norm": 0.7635941349745159, + "learning_rate": 0.00019986605859468543, + "loss": 12.6869, + "step": 1689 + }, + { + "epoch": 0.0920272542252898, + "grad_norm": 0.7402233735864691, + "learning_rate": 0.0001998656019491691, + "loss": 12.6336, + "step": 1690 + }, + { + "epoch": 0.09208170822187281, + "grad_norm": 0.7765142940286349, + "learning_rate": 0.0001998651445270811, + "loss": 12.6721, + "step": 1691 + }, + { + "epoch": 0.09213616221845582, + "grad_norm": 0.8834448067628252, + "learning_rate": 0.000199864686328425, + "loss": 12.763, + "step": 1692 + }, + { + "epoch": 0.09219061621503884, + "grad_norm": 0.7297738087025569, + "learning_rate": 0.00019986422735320436, + "loss": 12.7451, + "step": 1693 + }, + { + "epoch": 0.09224507021162184, + "grad_norm": 0.7648076695333648, + "learning_rate": 0.00019986376760142274, + "loss": 12.8448, + "step": 1694 + }, + { + "epoch": 0.09229952420820485, + "grad_norm": 0.7793752591738599, + "learning_rate": 0.00019986330707308367, + "loss": 12.9426, + "step": 1695 + }, + { + "epoch": 0.09235397820478787, + "grad_norm": 0.7778053571595327, + "learning_rate": 0.00019986284576819084, + "loss": 12.6704, + "step": 1696 + }, + { + "epoch": 0.09240843220137088, + "grad_norm": 0.7202035826372535, + "learning_rate": 0.00019986238368674774, + "loss": 12.7045, + "step": 1697 + }, + { + "epoch": 0.0924628861979539, + "grad_norm": 0.7528090876710528, + "learning_rate": 0.000199861920828758, + "loss": 12.7146, + "step": 1698 + }, + { + "epoch": 0.0925173401945369, + "grad_norm": 0.7248727230127, + "learning_rate": 0.00019986145719422523, + "loss": 12.6796, + "step": 1699 + }, + { + "epoch": 0.09257179419111991, + "grad_norm": 0.7771517513852315, + "learning_rate": 0.000199860992783153, + "loss": 12.7017, + "step": 1700 + }, + { + "epoch": 0.09262624818770293, + "grad_norm": 0.7915660755684254, + "learning_rate": 0.00019986052759554497, + "loss": 12.6844, + "step": 1701 + }, + { + "epoch": 0.09268070218428594, + "grad_norm": 0.8352445974040995, + "learning_rate": 0.0001998600616314047, + "loss": 12.7781, + "step": 1702 + }, + { + "epoch": 0.09273515618086894, + "grad_norm": 0.7784112929413094, + "learning_rate": 0.00019985959489073586, + "loss": 12.6843, + "step": 1703 + }, + { + "epoch": 0.09278961017745196, + "grad_norm": 0.9289410263110156, + "learning_rate": 0.00019985912737354206, + "loss": 12.7159, + "step": 1704 + }, + { + "epoch": 0.09284406417403497, + "grad_norm": 0.7633534931473822, + "learning_rate": 0.00019985865907982695, + "loss": 12.6632, + "step": 1705 + }, + { + "epoch": 0.09289851817061799, + "grad_norm": 0.7276428028327624, + "learning_rate": 0.00019985819000959416, + "loss": 12.7251, + "step": 1706 + }, + { + "epoch": 0.092952972167201, + "grad_norm": 0.819345051368717, + "learning_rate": 0.0001998577201628473, + "loss": 12.667, + "step": 1707 + }, + { + "epoch": 0.093007426163784, + "grad_norm": 0.7978315345932591, + "learning_rate": 0.00019985724953959012, + "loss": 12.7466, + "step": 1708 + }, + { + "epoch": 0.09306188016036702, + "grad_norm": 0.7802810303182017, + "learning_rate": 0.0001998567781398262, + "loss": 12.6264, + "step": 1709 + }, + { + "epoch": 0.09311633415695003, + "grad_norm": 0.8275515696845572, + "learning_rate": 0.0001998563059635592, + "loss": 12.6937, + "step": 1710 + }, + { + "epoch": 0.09317078815353304, + "grad_norm": 0.8539871682918575, + "learning_rate": 0.00019985583301079286, + "loss": 12.7544, + "step": 1711 + }, + { + "epoch": 0.09322524215011606, + "grad_norm": 0.7965901249303493, + "learning_rate": 0.00019985535928153077, + "loss": 12.743, + "step": 1712 + }, + { + "epoch": 0.09327969614669906, + "grad_norm": 1.0469523307408424, + "learning_rate": 0.00019985488477577672, + "loss": 12.835, + "step": 1713 + }, + { + "epoch": 0.09333415014328209, + "grad_norm": 1.0200281444576251, + "learning_rate": 0.0001998544094935343, + "loss": 12.7051, + "step": 1714 + }, + { + "epoch": 0.09338860413986509, + "grad_norm": 0.9224592123697221, + "learning_rate": 0.00019985393343480726, + "loss": 12.7287, + "step": 1715 + }, + { + "epoch": 0.0934430581364481, + "grad_norm": 0.8094401652320615, + "learning_rate": 0.00019985345659959927, + "loss": 12.7504, + "step": 1716 + }, + { + "epoch": 0.09349751213303112, + "grad_norm": 0.8806549796355795, + "learning_rate": 0.00019985297898791407, + "loss": 12.7822, + "step": 1717 + }, + { + "epoch": 0.09355196612961413, + "grad_norm": 0.8017319253189918, + "learning_rate": 0.00019985250059975534, + "loss": 12.7851, + "step": 1718 + }, + { + "epoch": 0.09360642012619713, + "grad_norm": 0.7970534334621997, + "learning_rate": 0.00019985202143512688, + "loss": 12.727, + "step": 1719 + }, + { + "epoch": 0.09366087412278015, + "grad_norm": 0.8444063092172269, + "learning_rate": 0.00019985154149403228, + "loss": 12.6435, + "step": 1720 + }, + { + "epoch": 0.09371532811936316, + "grad_norm": 0.7854418054238925, + "learning_rate": 0.00019985106077647543, + "loss": 12.575, + "step": 1721 + }, + { + "epoch": 0.09376978211594617, + "grad_norm": 0.9488296508732591, + "learning_rate": 0.00019985057928245992, + "loss": 12.689, + "step": 1722 + }, + { + "epoch": 0.09382423611252919, + "grad_norm": 0.9478793448789539, + "learning_rate": 0.00019985009701198957, + "loss": 12.8562, + "step": 1723 + }, + { + "epoch": 0.09387869010911219, + "grad_norm": 1.3421603271758058, + "learning_rate": 0.00019984961396506815, + "loss": 12.7497, + "step": 1724 + }, + { + "epoch": 0.09393314410569521, + "grad_norm": 1.0297410530163407, + "learning_rate": 0.00019984913014169938, + "loss": 12.7798, + "step": 1725 + }, + { + "epoch": 0.09398759810227822, + "grad_norm": 1.1291195574382247, + "learning_rate": 0.000199848645541887, + "loss": 12.648, + "step": 1726 + }, + { + "epoch": 0.09404205209886123, + "grad_norm": 0.9849925329312679, + "learning_rate": 0.00019984816016563483, + "loss": 12.5487, + "step": 1727 + }, + { + "epoch": 0.09409650609544425, + "grad_norm": 0.8181122519141973, + "learning_rate": 0.0001998476740129466, + "loss": 12.6672, + "step": 1728 + }, + { + "epoch": 0.09415096009202725, + "grad_norm": 0.9015974890306336, + "learning_rate": 0.00019984718708382615, + "loss": 12.771, + "step": 1729 + }, + { + "epoch": 0.09420541408861026, + "grad_norm": 0.8528008739132253, + "learning_rate": 0.00019984669937827719, + "loss": 12.7735, + "step": 1730 + }, + { + "epoch": 0.09425986808519328, + "grad_norm": 0.8831817396018113, + "learning_rate": 0.00019984621089630356, + "loss": 12.696, + "step": 1731 + }, + { + "epoch": 0.09431432208177629, + "grad_norm": 0.834736105594587, + "learning_rate": 0.00019984572163790908, + "loss": 12.7276, + "step": 1732 + }, + { + "epoch": 0.0943687760783593, + "grad_norm": 0.9257863305824491, + "learning_rate": 0.00019984523160309752, + "loss": 12.8682, + "step": 1733 + }, + { + "epoch": 0.09442323007494231, + "grad_norm": 0.8519996446656369, + "learning_rate": 0.00019984474079187266, + "loss": 12.7941, + "step": 1734 + }, + { + "epoch": 0.09447768407152532, + "grad_norm": 0.8464983573387057, + "learning_rate": 0.00019984424920423837, + "loss": 12.7706, + "step": 1735 + }, + { + "epoch": 0.09453213806810834, + "grad_norm": 0.8616173053640237, + "learning_rate": 0.00019984375684019848, + "loss": 12.7734, + "step": 1736 + }, + { + "epoch": 0.09458659206469135, + "grad_norm": 0.8584513670065499, + "learning_rate": 0.00019984326369975675, + "loss": 12.8588, + "step": 1737 + }, + { + "epoch": 0.09464104606127435, + "grad_norm": 0.7275419976228897, + "learning_rate": 0.00019984276978291709, + "loss": 12.6037, + "step": 1738 + }, + { + "epoch": 0.09469550005785737, + "grad_norm": 0.90704252581912, + "learning_rate": 0.00019984227508968328, + "loss": 12.8174, + "step": 1739 + }, + { + "epoch": 0.09474995405444038, + "grad_norm": 0.7890718965669796, + "learning_rate": 0.0001998417796200592, + "loss": 12.7361, + "step": 1740 + }, + { + "epoch": 0.0948044080510234, + "grad_norm": 0.8700908686746459, + "learning_rate": 0.0001998412833740487, + "loss": 12.7187, + "step": 1741 + }, + { + "epoch": 0.09485886204760641, + "grad_norm": 0.8820330592465839, + "learning_rate": 0.00019984078635165565, + "loss": 12.7653, + "step": 1742 + }, + { + "epoch": 0.09491331604418941, + "grad_norm": 0.8480096022522114, + "learning_rate": 0.0001998402885528839, + "loss": 12.7305, + "step": 1743 + }, + { + "epoch": 0.09496777004077243, + "grad_norm": 0.844964622166045, + "learning_rate": 0.00019983978997773733, + "loss": 12.6504, + "step": 1744 + }, + { + "epoch": 0.09502222403735544, + "grad_norm": 0.7337818119185138, + "learning_rate": 0.0001998392906262198, + "loss": 12.6787, + "step": 1745 + }, + { + "epoch": 0.09507667803393845, + "grad_norm": 0.8951156428654814, + "learning_rate": 0.0001998387904983352, + "loss": 12.629, + "step": 1746 + }, + { + "epoch": 0.09513113203052147, + "grad_norm": 0.7417498529985256, + "learning_rate": 0.00019983828959408743, + "loss": 12.6818, + "step": 1747 + }, + { + "epoch": 0.09518558602710447, + "grad_norm": 0.8106578184852027, + "learning_rate": 0.00019983778791348038, + "loss": 12.6571, + "step": 1748 + }, + { + "epoch": 0.0952400400236875, + "grad_norm": 0.8339705961484438, + "learning_rate": 0.00019983728545651795, + "loss": 12.7501, + "step": 1749 + }, + { + "epoch": 0.0952944940202705, + "grad_norm": 0.7656032487484683, + "learning_rate": 0.00019983678222320402, + "loss": 12.6431, + "step": 1750 + }, + { + "epoch": 0.09534894801685351, + "grad_norm": 0.7678729913466037, + "learning_rate": 0.00019983627821354254, + "loss": 12.7806, + "step": 1751 + }, + { + "epoch": 0.09540340201343653, + "grad_norm": 0.7412100270850847, + "learning_rate": 0.00019983577342753744, + "loss": 12.8495, + "step": 1752 + }, + { + "epoch": 0.09545785601001953, + "grad_norm": 0.8329923403455379, + "learning_rate": 0.0001998352678651926, + "loss": 12.6991, + "step": 1753 + }, + { + "epoch": 0.09551231000660254, + "grad_norm": 0.6852399910270895, + "learning_rate": 0.00019983476152651196, + "loss": 12.6196, + "step": 1754 + }, + { + "epoch": 0.09556676400318556, + "grad_norm": 0.7473720505363164, + "learning_rate": 0.0001998342544114995, + "loss": 12.641, + "step": 1755 + }, + { + "epoch": 0.09562121799976857, + "grad_norm": 0.8072951108918488, + "learning_rate": 0.00019983374652015915, + "loss": 12.6861, + "step": 1756 + }, + { + "epoch": 0.09567567199635159, + "grad_norm": 0.8901914678671833, + "learning_rate": 0.0001998332378524948, + "loss": 12.7401, + "step": 1757 + }, + { + "epoch": 0.0957301259929346, + "grad_norm": 0.7381858416182421, + "learning_rate": 0.00019983272840851048, + "loss": 12.6173, + "step": 1758 + }, + { + "epoch": 0.0957845799895176, + "grad_norm": 0.7780328757663166, + "learning_rate": 0.00019983221818821011, + "loss": 12.6119, + "step": 1759 + }, + { + "epoch": 0.09583903398610062, + "grad_norm": 0.8312609985976824, + "learning_rate": 0.00019983170719159769, + "loss": 12.7331, + "step": 1760 + }, + { + "epoch": 0.09589348798268363, + "grad_norm": 0.8830377568322108, + "learning_rate": 0.00019983119541867718, + "loss": 12.9335, + "step": 1761 + }, + { + "epoch": 0.09594794197926663, + "grad_norm": 1.0031691115579495, + "learning_rate": 0.0001998306828694525, + "loss": 12.6282, + "step": 1762 + }, + { + "epoch": 0.09600239597584966, + "grad_norm": 0.8776848123359052, + "learning_rate": 0.00019983016954392771, + "loss": 12.5339, + "step": 1763 + }, + { + "epoch": 0.09605684997243266, + "grad_norm": 0.9444493402268572, + "learning_rate": 0.0001998296554421068, + "loss": 12.797, + "step": 1764 + }, + { + "epoch": 0.09611130396901568, + "grad_norm": 0.8453358988930049, + "learning_rate": 0.00019982914056399374, + "loss": 12.7444, + "step": 1765 + }, + { + "epoch": 0.09616575796559869, + "grad_norm": 1.0057800902831977, + "learning_rate": 0.00019982862490959256, + "loss": 12.615, + "step": 1766 + }, + { + "epoch": 0.0962202119621817, + "grad_norm": 0.8227321761404245, + "learning_rate": 0.0001998281084789072, + "loss": 12.7163, + "step": 1767 + }, + { + "epoch": 0.09627466595876472, + "grad_norm": 0.9078437574448277, + "learning_rate": 0.00019982759127194178, + "loss": 12.8353, + "step": 1768 + }, + { + "epoch": 0.09632911995534772, + "grad_norm": 0.8607841260012521, + "learning_rate": 0.00019982707328870025, + "loss": 12.7217, + "step": 1769 + }, + { + "epoch": 0.09638357395193073, + "grad_norm": 0.806857734594643, + "learning_rate": 0.00019982655452918663, + "loss": 12.8509, + "step": 1770 + }, + { + "epoch": 0.09643802794851375, + "grad_norm": 0.9016798428985953, + "learning_rate": 0.00019982603499340502, + "loss": 12.7054, + "step": 1771 + }, + { + "epoch": 0.09649248194509676, + "grad_norm": 0.8270136349810251, + "learning_rate": 0.00019982551468135943, + "loss": 12.7688, + "step": 1772 + }, + { + "epoch": 0.09654693594167978, + "grad_norm": 0.7586339597744727, + "learning_rate": 0.00019982499359305384, + "loss": 12.6027, + "step": 1773 + }, + { + "epoch": 0.09660138993826278, + "grad_norm": 0.8039689280170982, + "learning_rate": 0.00019982447172849243, + "loss": 12.6849, + "step": 1774 + }, + { + "epoch": 0.09665584393484579, + "grad_norm": 0.864875110019144, + "learning_rate": 0.00019982394908767912, + "loss": 12.8059, + "step": 1775 + }, + { + "epoch": 0.09671029793142881, + "grad_norm": 0.7660481506729493, + "learning_rate": 0.00019982342567061807, + "loss": 12.6182, + "step": 1776 + }, + { + "epoch": 0.09676475192801182, + "grad_norm": 0.8262935586777914, + "learning_rate": 0.00019982290147731334, + "loss": 12.6749, + "step": 1777 + }, + { + "epoch": 0.09681920592459482, + "grad_norm": 0.8597501934936786, + "learning_rate": 0.00019982237650776897, + "loss": 12.8307, + "step": 1778 + }, + { + "epoch": 0.09687365992117784, + "grad_norm": 0.9670302026180937, + "learning_rate": 0.00019982185076198905, + "loss": 12.6684, + "step": 1779 + }, + { + "epoch": 0.09692811391776085, + "grad_norm": 1.0795139067045474, + "learning_rate": 0.0001998213242399777, + "loss": 12.7065, + "step": 1780 + }, + { + "epoch": 0.09698256791434387, + "grad_norm": 0.8816692193509686, + "learning_rate": 0.00019982079694173897, + "loss": 12.6734, + "step": 1781 + }, + { + "epoch": 0.09703702191092688, + "grad_norm": 0.7591381912463186, + "learning_rate": 0.00019982026886727702, + "loss": 12.5892, + "step": 1782 + }, + { + "epoch": 0.09709147590750988, + "grad_norm": 0.831267387648959, + "learning_rate": 0.00019981974001659586, + "loss": 12.687, + "step": 1783 + }, + { + "epoch": 0.0971459299040929, + "grad_norm": 0.8723663496857355, + "learning_rate": 0.0001998192103896997, + "loss": 12.7374, + "step": 1784 + }, + { + "epoch": 0.09720038390067591, + "grad_norm": 0.7613084972321603, + "learning_rate": 0.0001998186799865926, + "loss": 12.8568, + "step": 1785 + }, + { + "epoch": 0.09725483789725892, + "grad_norm": 0.9693531135750679, + "learning_rate": 0.00019981814880727875, + "loss": 12.7736, + "step": 1786 + }, + { + "epoch": 0.09730929189384194, + "grad_norm": 0.8070502979590324, + "learning_rate": 0.00019981761685176222, + "loss": 12.7267, + "step": 1787 + }, + { + "epoch": 0.09736374589042494, + "grad_norm": 0.7585035282744843, + "learning_rate": 0.0001998170841200471, + "loss": 12.6695, + "step": 1788 + }, + { + "epoch": 0.09741819988700795, + "grad_norm": 0.8562878599847409, + "learning_rate": 0.00019981655061213766, + "loss": 12.7517, + "step": 1789 + }, + { + "epoch": 0.09747265388359097, + "grad_norm": 0.8076468198038834, + "learning_rate": 0.000199816016328038, + "loss": 12.6599, + "step": 1790 + }, + { + "epoch": 0.09752710788017398, + "grad_norm": 0.8624308020640535, + "learning_rate": 0.0001998154812677522, + "loss": 12.8433, + "step": 1791 + }, + { + "epoch": 0.097581561876757, + "grad_norm": 0.8124216890531085, + "learning_rate": 0.00019981494543128448, + "loss": 12.7618, + "step": 1792 + }, + { + "epoch": 0.09763601587334, + "grad_norm": 0.9145054786820154, + "learning_rate": 0.00019981440881863905, + "loss": 12.6682, + "step": 1793 + }, + { + "epoch": 0.09769046986992301, + "grad_norm": 0.8483562639490578, + "learning_rate": 0.00019981387142982003, + "loss": 12.8717, + "step": 1794 + }, + { + "epoch": 0.09774492386650603, + "grad_norm": 0.8409069976588321, + "learning_rate": 0.00019981333326483158, + "loss": 12.7535, + "step": 1795 + }, + { + "epoch": 0.09779937786308904, + "grad_norm": 0.8193328017494412, + "learning_rate": 0.0001998127943236779, + "loss": 12.8387, + "step": 1796 + }, + { + "epoch": 0.09785383185967204, + "grad_norm": 0.7893282197935113, + "learning_rate": 0.00019981225460636326, + "loss": 12.6631, + "step": 1797 + }, + { + "epoch": 0.09790828585625506, + "grad_norm": 0.9814262516729859, + "learning_rate": 0.00019981171411289172, + "loss": 12.7136, + "step": 1798 + }, + { + "epoch": 0.09796273985283807, + "grad_norm": 0.8465352965252302, + "learning_rate": 0.00019981117284326757, + "loss": 12.7594, + "step": 1799 + }, + { + "epoch": 0.09801719384942109, + "grad_norm": 0.8189406451797356, + "learning_rate": 0.00019981063079749505, + "loss": 12.6765, + "step": 1800 + }, + { + "epoch": 0.0980716478460041, + "grad_norm": 0.8344960848336206, + "learning_rate": 0.00019981008797557827, + "loss": 12.7143, + "step": 1801 + }, + { + "epoch": 0.0981261018425871, + "grad_norm": 0.793799228357131, + "learning_rate": 0.00019980954437752153, + "loss": 12.6669, + "step": 1802 + }, + { + "epoch": 0.09818055583917012, + "grad_norm": 0.8231053066247628, + "learning_rate": 0.00019980900000332903, + "loss": 12.7411, + "step": 1803 + }, + { + "epoch": 0.09823500983575313, + "grad_norm": 0.8060131383467297, + "learning_rate": 0.000199808454853005, + "loss": 12.7213, + "step": 1804 + }, + { + "epoch": 0.09828946383233614, + "grad_norm": 0.7542976640094788, + "learning_rate": 0.0001998079089265537, + "loss": 12.7119, + "step": 1805 + }, + { + "epoch": 0.09834391782891916, + "grad_norm": 0.8811074779603901, + "learning_rate": 0.0001998073622239794, + "loss": 12.7143, + "step": 1806 + }, + { + "epoch": 0.09839837182550216, + "grad_norm": 0.7443022336030499, + "learning_rate": 0.00019980681474528623, + "loss": 12.5875, + "step": 1807 + }, + { + "epoch": 0.09845282582208519, + "grad_norm": 0.8255299157055661, + "learning_rate": 0.0001998062664904786, + "loss": 12.777, + "step": 1808 + }, + { + "epoch": 0.09850727981866819, + "grad_norm": 0.7914682742250475, + "learning_rate": 0.00019980571745956068, + "loss": 12.6854, + "step": 1809 + }, + { + "epoch": 0.0985617338152512, + "grad_norm": 0.9831540550159134, + "learning_rate": 0.00019980516765253674, + "loss": 12.8013, + "step": 1810 + }, + { + "epoch": 0.09861618781183422, + "grad_norm": 0.7181653532335807, + "learning_rate": 0.0001998046170694111, + "loss": 12.6832, + "step": 1811 + }, + { + "epoch": 0.09867064180841723, + "grad_norm": 0.8613942793337783, + "learning_rate": 0.000199804065710188, + "loss": 12.7333, + "step": 1812 + }, + { + "epoch": 0.09872509580500023, + "grad_norm": 0.7765773016663301, + "learning_rate": 0.00019980351357487178, + "loss": 12.7315, + "step": 1813 + }, + { + "epoch": 0.09877954980158325, + "grad_norm": 0.6808776096578668, + "learning_rate": 0.0001998029606634667, + "loss": 12.7699, + "step": 1814 + }, + { + "epoch": 0.09883400379816626, + "grad_norm": 0.8799736462156778, + "learning_rate": 0.00019980240697597704, + "loss": 12.7738, + "step": 1815 + }, + { + "epoch": 0.09888845779474928, + "grad_norm": 0.8627772359823945, + "learning_rate": 0.00019980185251240715, + "loss": 12.7422, + "step": 1816 + }, + { + "epoch": 0.09894291179133229, + "grad_norm": 0.9523158797483452, + "learning_rate": 0.00019980129727276128, + "loss": 12.6161, + "step": 1817 + }, + { + "epoch": 0.09899736578791529, + "grad_norm": 0.8316339425431188, + "learning_rate": 0.00019980074125704381, + "loss": 12.7533, + "step": 1818 + }, + { + "epoch": 0.09905181978449831, + "grad_norm": 0.7555606839824176, + "learning_rate": 0.00019980018446525904, + "loss": 12.8045, + "step": 1819 + }, + { + "epoch": 0.09910627378108132, + "grad_norm": 0.8736839244670254, + "learning_rate": 0.00019979962689741133, + "loss": 12.8051, + "step": 1820 + }, + { + "epoch": 0.09916072777766433, + "grad_norm": 0.8298289371961125, + "learning_rate": 0.00019979906855350493, + "loss": 12.7442, + "step": 1821 + }, + { + "epoch": 0.09921518177424735, + "grad_norm": 0.7359470330154858, + "learning_rate": 0.00019979850943354429, + "loss": 12.6789, + "step": 1822 + }, + { + "epoch": 0.09926963577083035, + "grad_norm": 0.7984255387074274, + "learning_rate": 0.00019979794953753368, + "loss": 12.6387, + "step": 1823 + }, + { + "epoch": 0.09932408976741337, + "grad_norm": 0.7948787479597286, + "learning_rate": 0.00019979738886547748, + "loss": 12.7372, + "step": 1824 + }, + { + "epoch": 0.09937854376399638, + "grad_norm": 0.7525955093049125, + "learning_rate": 0.00019979682741738005, + "loss": 12.5815, + "step": 1825 + }, + { + "epoch": 0.09943299776057939, + "grad_norm": 0.8045976161147929, + "learning_rate": 0.00019979626519324572, + "loss": 12.6558, + "step": 1826 + }, + { + "epoch": 0.0994874517571624, + "grad_norm": 0.868763583150912, + "learning_rate": 0.00019979570219307892, + "loss": 12.9435, + "step": 1827 + }, + { + "epoch": 0.09954190575374541, + "grad_norm": 0.8254280429266189, + "learning_rate": 0.000199795138416884, + "loss": 12.7005, + "step": 1828 + }, + { + "epoch": 0.09959635975032842, + "grad_norm": 0.8215715474865103, + "learning_rate": 0.00019979457386466536, + "loss": 12.6894, + "step": 1829 + }, + { + "epoch": 0.09965081374691144, + "grad_norm": 0.8617949069115154, + "learning_rate": 0.0001997940085364274, + "loss": 12.8104, + "step": 1830 + }, + { + "epoch": 0.09970526774349445, + "grad_norm": 0.7084497799870297, + "learning_rate": 0.00019979344243217445, + "loss": 12.5418, + "step": 1831 + }, + { + "epoch": 0.09975972174007747, + "grad_norm": 0.7914428213251324, + "learning_rate": 0.00019979287555191096, + "loss": 12.7525, + "step": 1832 + }, + { + "epoch": 0.09981417573666047, + "grad_norm": 0.8036534649424923, + "learning_rate": 0.00019979230789564137, + "loss": 12.8148, + "step": 1833 + }, + { + "epoch": 0.09986862973324348, + "grad_norm": 0.783771502898549, + "learning_rate": 0.00019979173946337, + "loss": 12.6985, + "step": 1834 + }, + { + "epoch": 0.0999230837298265, + "grad_norm": 0.8433229115010147, + "learning_rate": 0.00019979117025510136, + "loss": 12.8037, + "step": 1835 + }, + { + "epoch": 0.09997753772640951, + "grad_norm": 0.7781085291475797, + "learning_rate": 0.00019979060027083988, + "loss": 12.6502, + "step": 1836 + }, + { + "epoch": 0.10003199172299251, + "grad_norm": 0.7396520761371812, + "learning_rate": 0.00019979002951058992, + "loss": 12.6156, + "step": 1837 + }, + { + "epoch": 0.10008644571957553, + "grad_norm": 0.7502516601071137, + "learning_rate": 0.00019978945797435594, + "loss": 12.6214, + "step": 1838 + }, + { + "epoch": 0.10014089971615854, + "grad_norm": 0.7945760298938975, + "learning_rate": 0.00019978888566214245, + "loss": 12.7648, + "step": 1839 + }, + { + "epoch": 0.10019535371274156, + "grad_norm": 0.805994217809907, + "learning_rate": 0.00019978831257395384, + "loss": 12.6749, + "step": 1840 + }, + { + "epoch": 0.10024980770932457, + "grad_norm": 0.7585193198751154, + "learning_rate": 0.00019978773870979452, + "loss": 12.6268, + "step": 1841 + }, + { + "epoch": 0.10030426170590757, + "grad_norm": 0.8305633847794065, + "learning_rate": 0.00019978716406966905, + "loss": 12.7269, + "step": 1842 + }, + { + "epoch": 0.1003587157024906, + "grad_norm": 0.7422976190993655, + "learning_rate": 0.00019978658865358185, + "loss": 12.7719, + "step": 1843 + }, + { + "epoch": 0.1004131696990736, + "grad_norm": 0.740572357468796, + "learning_rate": 0.00019978601246153742, + "loss": 12.7352, + "step": 1844 + }, + { + "epoch": 0.10046762369565661, + "grad_norm": 0.7707640168766531, + "learning_rate": 0.00019978543549354022, + "loss": 12.4927, + "step": 1845 + }, + { + "epoch": 0.10052207769223963, + "grad_norm": 0.815737711358069, + "learning_rate": 0.00019978485774959474, + "loss": 12.6872, + "step": 1846 + }, + { + "epoch": 0.10057653168882263, + "grad_norm": 0.7948585836889104, + "learning_rate": 0.00019978427922970546, + "loss": 12.6934, + "step": 1847 + }, + { + "epoch": 0.10063098568540566, + "grad_norm": 0.7752338166617883, + "learning_rate": 0.0001997836999338769, + "loss": 12.7227, + "step": 1848 + }, + { + "epoch": 0.10068543968198866, + "grad_norm": 0.7549758786488137, + "learning_rate": 0.00019978311986211354, + "loss": 12.7107, + "step": 1849 + }, + { + "epoch": 0.10073989367857167, + "grad_norm": 0.7228918115206341, + "learning_rate": 0.00019978253901441992, + "loss": 12.7228, + "step": 1850 + }, + { + "epoch": 0.10079434767515469, + "grad_norm": 0.9869084616467323, + "learning_rate": 0.00019978195739080054, + "loss": 12.691, + "step": 1851 + }, + { + "epoch": 0.1008488016717377, + "grad_norm": 0.7541298740075157, + "learning_rate": 0.00019978137499125994, + "loss": 12.6926, + "step": 1852 + }, + { + "epoch": 0.1009032556683207, + "grad_norm": 0.8353392921949634, + "learning_rate": 0.0001997807918158026, + "loss": 12.693, + "step": 1853 + }, + { + "epoch": 0.10095770966490372, + "grad_norm": 0.9022504957175957, + "learning_rate": 0.00019978020786443312, + "loss": 12.8224, + "step": 1854 + }, + { + "epoch": 0.10101216366148673, + "grad_norm": 0.7754694247261438, + "learning_rate": 0.00019977962313715602, + "loss": 12.6533, + "step": 1855 + }, + { + "epoch": 0.10106661765806974, + "grad_norm": 0.9496107528055869, + "learning_rate": 0.0001997790376339758, + "loss": 12.6253, + "step": 1856 + }, + { + "epoch": 0.10112107165465276, + "grad_norm": 0.8295457108745374, + "learning_rate": 0.00019977845135489707, + "loss": 12.759, + "step": 1857 + }, + { + "epoch": 0.10117552565123576, + "grad_norm": 0.9132320161804102, + "learning_rate": 0.00019977786429992438, + "loss": 12.7149, + "step": 1858 + }, + { + "epoch": 0.10122997964781878, + "grad_norm": 0.9099955640502901, + "learning_rate": 0.0001997772764690623, + "loss": 12.7591, + "step": 1859 + }, + { + "epoch": 0.10128443364440179, + "grad_norm": 0.9136455396020913, + "learning_rate": 0.00019977668786231534, + "loss": 12.6479, + "step": 1860 + }, + { + "epoch": 0.1013388876409848, + "grad_norm": 1.0443368465604608, + "learning_rate": 0.00019977609847968812, + "loss": 12.7831, + "step": 1861 + }, + { + "epoch": 0.10139334163756782, + "grad_norm": 0.8544793584331826, + "learning_rate": 0.00019977550832118526, + "loss": 12.8603, + "step": 1862 + }, + { + "epoch": 0.10144779563415082, + "grad_norm": 0.9758123126638216, + "learning_rate": 0.00019977491738681132, + "loss": 12.8229, + "step": 1863 + }, + { + "epoch": 0.10150224963073383, + "grad_norm": 1.0450927875618519, + "learning_rate": 0.00019977432567657086, + "loss": 12.785, + "step": 1864 + }, + { + "epoch": 0.10155670362731685, + "grad_norm": 0.8321896909389046, + "learning_rate": 0.0001997737331904685, + "loss": 12.7621, + "step": 1865 + }, + { + "epoch": 0.10161115762389986, + "grad_norm": 1.086697442673203, + "learning_rate": 0.0001997731399285089, + "loss": 12.7405, + "step": 1866 + }, + { + "epoch": 0.10166561162048288, + "grad_norm": 0.8795583094614052, + "learning_rate": 0.0001997725458906966, + "loss": 12.6583, + "step": 1867 + }, + { + "epoch": 0.10172006561706588, + "grad_norm": 0.9093661938885378, + "learning_rate": 0.00019977195107703625, + "loss": 12.6382, + "step": 1868 + }, + { + "epoch": 0.10177451961364889, + "grad_norm": 0.8206637853045706, + "learning_rate": 0.0001997713554875325, + "loss": 12.7288, + "step": 1869 + }, + { + "epoch": 0.10182897361023191, + "grad_norm": 0.7959548053380612, + "learning_rate": 0.00019977075912218996, + "loss": 12.5389, + "step": 1870 + }, + { + "epoch": 0.10188342760681492, + "grad_norm": 0.8940054567632745, + "learning_rate": 0.00019977016198101326, + "loss": 12.7477, + "step": 1871 + }, + { + "epoch": 0.10193788160339792, + "grad_norm": 0.7983708631762738, + "learning_rate": 0.00019976956406400704, + "loss": 12.6809, + "step": 1872 + }, + { + "epoch": 0.10199233559998094, + "grad_norm": 0.8947777592247796, + "learning_rate": 0.00019976896537117597, + "loss": 12.7298, + "step": 1873 + }, + { + "epoch": 0.10204678959656395, + "grad_norm": 0.7639695051807167, + "learning_rate": 0.00019976836590252469, + "loss": 12.8679, + "step": 1874 + }, + { + "epoch": 0.10210124359314697, + "grad_norm": 0.8503436782329791, + "learning_rate": 0.00019976776565805787, + "loss": 12.7481, + "step": 1875 + }, + { + "epoch": 0.10215569758972998, + "grad_norm": 0.7299504612088192, + "learning_rate": 0.00019976716463778016, + "loss": 12.6675, + "step": 1876 + }, + { + "epoch": 0.10221015158631298, + "grad_norm": 0.8034290287601704, + "learning_rate": 0.00019976656284169625, + "loss": 12.7463, + "step": 1877 + }, + { + "epoch": 0.102264605582896, + "grad_norm": 0.9378384856057707, + "learning_rate": 0.0001997659602698108, + "loss": 12.7667, + "step": 1878 + }, + { + "epoch": 0.10231905957947901, + "grad_norm": 0.8112566177222267, + "learning_rate": 0.00019976535692212854, + "loss": 12.6914, + "step": 1879 + }, + { + "epoch": 0.10237351357606202, + "grad_norm": 0.8836096715216687, + "learning_rate": 0.00019976475279865415, + "loss": 12.5431, + "step": 1880 + }, + { + "epoch": 0.10242796757264504, + "grad_norm": 0.7678332184149458, + "learning_rate": 0.00019976414789939226, + "loss": 12.68, + "step": 1881 + }, + { + "epoch": 0.10248242156922804, + "grad_norm": 0.9820205706063538, + "learning_rate": 0.00019976354222434766, + "loss": 12.5327, + "step": 1882 + }, + { + "epoch": 0.10253687556581106, + "grad_norm": 0.839689545120463, + "learning_rate": 0.00019976293577352502, + "loss": 12.7281, + "step": 1883 + }, + { + "epoch": 0.10259132956239407, + "grad_norm": 0.7763360763117715, + "learning_rate": 0.00019976232854692903, + "loss": 12.7423, + "step": 1884 + }, + { + "epoch": 0.10264578355897708, + "grad_norm": 0.8263614999513452, + "learning_rate": 0.0001997617205445645, + "loss": 12.7349, + "step": 1885 + }, + { + "epoch": 0.1027002375555601, + "grad_norm": 0.8544881902889455, + "learning_rate": 0.00019976111176643607, + "loss": 12.8311, + "step": 1886 + }, + { + "epoch": 0.1027546915521431, + "grad_norm": 0.803396672866949, + "learning_rate": 0.0001997605022125485, + "loss": 12.75, + "step": 1887 + }, + { + "epoch": 0.10280914554872611, + "grad_norm": 0.7346963496646222, + "learning_rate": 0.00019975989188290654, + "loss": 12.6567, + "step": 1888 + }, + { + "epoch": 0.10286359954530913, + "grad_norm": 0.8449617676754794, + "learning_rate": 0.00019975928077751496, + "loss": 12.6853, + "step": 1889 + }, + { + "epoch": 0.10291805354189214, + "grad_norm": 0.7721976538792544, + "learning_rate": 0.00019975866889637844, + "loss": 12.657, + "step": 1890 + }, + { + "epoch": 0.10297250753847516, + "grad_norm": 0.8370477843159757, + "learning_rate": 0.0001997580562395018, + "loss": 12.7145, + "step": 1891 + }, + { + "epoch": 0.10302696153505816, + "grad_norm": 0.786273334397099, + "learning_rate": 0.0001997574428068898, + "loss": 12.7155, + "step": 1892 + }, + { + "epoch": 0.10308141553164117, + "grad_norm": 0.7773834981716732, + "learning_rate": 0.00019975682859854716, + "loss": 12.6723, + "step": 1893 + }, + { + "epoch": 0.10313586952822419, + "grad_norm": 0.7131770678235348, + "learning_rate": 0.0001997562136144787, + "loss": 12.7067, + "step": 1894 + }, + { + "epoch": 0.1031903235248072, + "grad_norm": 0.751203222047551, + "learning_rate": 0.00019975559785468923, + "loss": 12.7865, + "step": 1895 + }, + { + "epoch": 0.1032447775213902, + "grad_norm": 0.8029767491515284, + "learning_rate": 0.00019975498131918348, + "loss": 12.8447, + "step": 1896 + }, + { + "epoch": 0.10329923151797323, + "grad_norm": 0.8498280896911178, + "learning_rate": 0.00019975436400796625, + "loss": 12.6351, + "step": 1897 + }, + { + "epoch": 0.10335368551455623, + "grad_norm": 0.8046931953621171, + "learning_rate": 0.00019975374592104235, + "loss": 12.814, + "step": 1898 + }, + { + "epoch": 0.10340813951113925, + "grad_norm": 0.8911829559360334, + "learning_rate": 0.00019975312705841663, + "loss": 12.833, + "step": 1899 + }, + { + "epoch": 0.10346259350772226, + "grad_norm": 0.9755131176201265, + "learning_rate": 0.00019975250742009382, + "loss": 12.7744, + "step": 1900 + }, + { + "epoch": 0.10351704750430527, + "grad_norm": 0.7652603388028444, + "learning_rate": 0.00019975188700607882, + "loss": 12.6047, + "step": 1901 + }, + { + "epoch": 0.10357150150088829, + "grad_norm": 0.7308159389641743, + "learning_rate": 0.00019975126581637642, + "loss": 12.54, + "step": 1902 + }, + { + "epoch": 0.10362595549747129, + "grad_norm": 0.8841537311014933, + "learning_rate": 0.00019975064385099143, + "loss": 12.8176, + "step": 1903 + }, + { + "epoch": 0.1036804094940543, + "grad_norm": 0.7959266490780836, + "learning_rate": 0.0001997500211099287, + "loss": 12.8648, + "step": 1904 + }, + { + "epoch": 0.10373486349063732, + "grad_norm": 0.7932715396835438, + "learning_rate": 0.0001997493975931931, + "loss": 12.7503, + "step": 1905 + }, + { + "epoch": 0.10378931748722033, + "grad_norm": 0.7059892133233987, + "learning_rate": 0.00019974877330078945, + "loss": 12.6694, + "step": 1906 + }, + { + "epoch": 0.10384377148380335, + "grad_norm": 0.7647967486749511, + "learning_rate": 0.00019974814823272265, + "loss": 12.7022, + "step": 1907 + }, + { + "epoch": 0.10389822548038635, + "grad_norm": 0.7149325329573871, + "learning_rate": 0.00019974752238899744, + "loss": 12.7661, + "step": 1908 + }, + { + "epoch": 0.10395267947696936, + "grad_norm": 0.8302278048238623, + "learning_rate": 0.00019974689576961882, + "loss": 12.7787, + "step": 1909 + }, + { + "epoch": 0.10400713347355238, + "grad_norm": 0.7650845662765747, + "learning_rate": 0.00019974626837459161, + "loss": 12.7433, + "step": 1910 + }, + { + "epoch": 0.10406158747013539, + "grad_norm": 0.7263652929619784, + "learning_rate": 0.00019974564020392067, + "loss": 12.8098, + "step": 1911 + }, + { + "epoch": 0.10411604146671839, + "grad_norm": 0.75783854526348, + "learning_rate": 0.00019974501125761092, + "loss": 12.6675, + "step": 1912 + }, + { + "epoch": 0.10417049546330141, + "grad_norm": 0.9010729595464053, + "learning_rate": 0.00019974438153566723, + "loss": 12.7539, + "step": 1913 + }, + { + "epoch": 0.10422494945988442, + "grad_norm": 0.8126529024951368, + "learning_rate": 0.00019974375103809448, + "loss": 12.8125, + "step": 1914 + }, + { + "epoch": 0.10427940345646744, + "grad_norm": 0.941997554515698, + "learning_rate": 0.0001997431197648976, + "loss": 12.7994, + "step": 1915 + }, + { + "epoch": 0.10433385745305045, + "grad_norm": 0.9094933655259666, + "learning_rate": 0.00019974248771608154, + "loss": 12.6271, + "step": 1916 + }, + { + "epoch": 0.10438831144963345, + "grad_norm": 0.8626037953851565, + "learning_rate": 0.00019974185489165112, + "loss": 12.6484, + "step": 1917 + }, + { + "epoch": 0.10444276544621647, + "grad_norm": 0.8310424783833855, + "learning_rate": 0.00019974122129161133, + "loss": 12.8424, + "step": 1918 + }, + { + "epoch": 0.10449721944279948, + "grad_norm": 0.8262241343181942, + "learning_rate": 0.00019974058691596706, + "loss": 12.7997, + "step": 1919 + }, + { + "epoch": 0.10455167343938249, + "grad_norm": 0.8746503937477267, + "learning_rate": 0.0001997399517647233, + "loss": 12.8625, + "step": 1920 + }, + { + "epoch": 0.1046061274359655, + "grad_norm": 0.7630310464515302, + "learning_rate": 0.0001997393158378849, + "loss": 12.7733, + "step": 1921 + }, + { + "epoch": 0.10466058143254851, + "grad_norm": 0.9309510895312205, + "learning_rate": 0.0001997386791354569, + "loss": 12.6717, + "step": 1922 + }, + { + "epoch": 0.10471503542913152, + "grad_norm": 0.7652281673953589, + "learning_rate": 0.00019973804165744418, + "loss": 12.6441, + "step": 1923 + }, + { + "epoch": 0.10476948942571454, + "grad_norm": 0.9094132289341906, + "learning_rate": 0.0001997374034038517, + "loss": 12.8794, + "step": 1924 + }, + { + "epoch": 0.10482394342229755, + "grad_norm": 0.9693633912024958, + "learning_rate": 0.0001997367643746845, + "loss": 12.6089, + "step": 1925 + }, + { + "epoch": 0.10487839741888057, + "grad_norm": 0.7918707360267294, + "learning_rate": 0.00019973612456994743, + "loss": 12.7389, + "step": 1926 + }, + { + "epoch": 0.10493285141546357, + "grad_norm": 0.8084363062606581, + "learning_rate": 0.00019973548398964557, + "loss": 12.7046, + "step": 1927 + }, + { + "epoch": 0.10498730541204658, + "grad_norm": 0.9597969610620458, + "learning_rate": 0.00019973484263378387, + "loss": 12.865, + "step": 1928 + }, + { + "epoch": 0.1050417594086296, + "grad_norm": 0.8646966847427582, + "learning_rate": 0.00019973420050236728, + "loss": 12.7711, + "step": 1929 + }, + { + "epoch": 0.10509621340521261, + "grad_norm": 0.8524972206331253, + "learning_rate": 0.00019973355759540082, + "loss": 12.7456, + "step": 1930 + }, + { + "epoch": 0.10515066740179561, + "grad_norm": 0.7654825536901725, + "learning_rate": 0.00019973291391288953, + "loss": 12.7896, + "step": 1931 + }, + { + "epoch": 0.10520512139837863, + "grad_norm": 0.8152675935793777, + "learning_rate": 0.00019973226945483834, + "loss": 12.7541, + "step": 1932 + }, + { + "epoch": 0.10525957539496164, + "grad_norm": 0.8117049500801464, + "learning_rate": 0.0001997316242212523, + "loss": 12.8113, + "step": 1933 + }, + { + "epoch": 0.10531402939154466, + "grad_norm": 0.8842097075775519, + "learning_rate": 0.00019973097821213642, + "loss": 12.7517, + "step": 1934 + }, + { + "epoch": 0.10536848338812767, + "grad_norm": 0.7566468144569976, + "learning_rate": 0.00019973033142749576, + "loss": 12.7313, + "step": 1935 + }, + { + "epoch": 0.10542293738471067, + "grad_norm": 0.7788525110569708, + "learning_rate": 0.00019972968386733532, + "loss": 12.8033, + "step": 1936 + }, + { + "epoch": 0.1054773913812937, + "grad_norm": 0.8621093718166254, + "learning_rate": 0.0001997290355316601, + "loss": 12.9192, + "step": 1937 + }, + { + "epoch": 0.1055318453778767, + "grad_norm": 0.732165906619715, + "learning_rate": 0.0001997283864204752, + "loss": 12.6428, + "step": 1938 + }, + { + "epoch": 0.10558629937445971, + "grad_norm": 0.7521490290685846, + "learning_rate": 0.00019972773653378562, + "loss": 12.6545, + "step": 1939 + }, + { + "epoch": 0.10564075337104273, + "grad_norm": 0.9478641791415195, + "learning_rate": 0.00019972708587159642, + "loss": 12.7364, + "step": 1940 + }, + { + "epoch": 0.10569520736762573, + "grad_norm": 0.7784312137825782, + "learning_rate": 0.0001997264344339127, + "loss": 12.664, + "step": 1941 + }, + { + "epoch": 0.10574966136420876, + "grad_norm": 0.7947087795298028, + "learning_rate": 0.00019972578222073953, + "loss": 12.7374, + "step": 1942 + }, + { + "epoch": 0.10580411536079176, + "grad_norm": 0.7660446958449624, + "learning_rate": 0.00019972512923208192, + "loss": 12.6068, + "step": 1943 + }, + { + "epoch": 0.10585856935737477, + "grad_norm": 0.9478451868901601, + "learning_rate": 0.000199724475467945, + "loss": 12.7313, + "step": 1944 + }, + { + "epoch": 0.10591302335395779, + "grad_norm": 0.8443286080664616, + "learning_rate": 0.00019972382092833381, + "loss": 12.7797, + "step": 1945 + }, + { + "epoch": 0.1059674773505408, + "grad_norm": 0.7757027982670757, + "learning_rate": 0.00019972316561325348, + "loss": 12.7802, + "step": 1946 + }, + { + "epoch": 0.1060219313471238, + "grad_norm": 0.7323537505609665, + "learning_rate": 0.0001997225095227091, + "loss": 12.7715, + "step": 1947 + }, + { + "epoch": 0.10607638534370682, + "grad_norm": 0.7258173389061959, + "learning_rate": 0.00019972185265670572, + "loss": 12.7192, + "step": 1948 + }, + { + "epoch": 0.10613083934028983, + "grad_norm": 0.9886967263255692, + "learning_rate": 0.00019972119501524853, + "loss": 12.948, + "step": 1949 + }, + { + "epoch": 0.10618529333687285, + "grad_norm": 0.779434641578507, + "learning_rate": 0.0001997205365983426, + "loss": 12.6265, + "step": 1950 + }, + { + "epoch": 0.10623974733345586, + "grad_norm": 0.7378281448418083, + "learning_rate": 0.00019971987740599305, + "loss": 12.6904, + "step": 1951 + }, + { + "epoch": 0.10629420133003886, + "grad_norm": 0.8763514796537106, + "learning_rate": 0.00019971921743820503, + "loss": 12.7581, + "step": 1952 + }, + { + "epoch": 0.10634865532662188, + "grad_norm": 0.7598026179643976, + "learning_rate": 0.00019971855669498364, + "loss": 12.6264, + "step": 1953 + }, + { + "epoch": 0.10640310932320489, + "grad_norm": 0.7677533412606428, + "learning_rate": 0.00019971789517633402, + "loss": 12.5165, + "step": 1954 + }, + { + "epoch": 0.1064575633197879, + "grad_norm": 0.8014143428907597, + "learning_rate": 0.00019971723288226133, + "loss": 12.8213, + "step": 1955 + }, + { + "epoch": 0.10651201731637092, + "grad_norm": 1.0166532682991876, + "learning_rate": 0.0001997165698127707, + "loss": 12.6291, + "step": 1956 + }, + { + "epoch": 0.10656647131295392, + "grad_norm": 0.7397763869887779, + "learning_rate": 0.00019971590596786732, + "loss": 12.7036, + "step": 1957 + }, + { + "epoch": 0.10662092530953694, + "grad_norm": 0.8243836863860697, + "learning_rate": 0.0001997152413475563, + "loss": 12.7095, + "step": 1958 + }, + { + "epoch": 0.10667537930611995, + "grad_norm": 0.7607529958960203, + "learning_rate": 0.0001997145759518429, + "loss": 12.7681, + "step": 1959 + }, + { + "epoch": 0.10672983330270296, + "grad_norm": 0.8500255738770411, + "learning_rate": 0.00019971390978073219, + "loss": 12.8752, + "step": 1960 + }, + { + "epoch": 0.10678428729928598, + "grad_norm": 0.8429022005007072, + "learning_rate": 0.0001997132428342294, + "loss": 12.7818, + "step": 1961 + }, + { + "epoch": 0.10683874129586898, + "grad_norm": 0.8973653852038352, + "learning_rate": 0.00019971257511233975, + "loss": 12.829, + "step": 1962 + }, + { + "epoch": 0.10689319529245199, + "grad_norm": 0.9450106747699376, + "learning_rate": 0.00019971190661506832, + "loss": 12.7089, + "step": 1963 + }, + { + "epoch": 0.10694764928903501, + "grad_norm": 0.8972410673730398, + "learning_rate": 0.00019971123734242044, + "loss": 12.469, + "step": 1964 + }, + { + "epoch": 0.10700210328561802, + "grad_norm": 0.781966142046788, + "learning_rate": 0.00019971056729440126, + "loss": 12.5841, + "step": 1965 + }, + { + "epoch": 0.10705655728220104, + "grad_norm": 0.8277637906824876, + "learning_rate": 0.00019970989647101597, + "loss": 12.6589, + "step": 1966 + }, + { + "epoch": 0.10711101127878404, + "grad_norm": 0.8526458345610508, + "learning_rate": 0.0001997092248722698, + "loss": 12.7865, + "step": 1967 + }, + { + "epoch": 0.10716546527536705, + "grad_norm": 0.7697247304481947, + "learning_rate": 0.00019970855249816798, + "loss": 12.7212, + "step": 1968 + }, + { + "epoch": 0.10721991927195007, + "grad_norm": 0.8853362692299411, + "learning_rate": 0.00019970787934871573, + "loss": 12.7994, + "step": 1969 + }, + { + "epoch": 0.10727437326853308, + "grad_norm": 0.7867428966469842, + "learning_rate": 0.0001997072054239183, + "loss": 12.5694, + "step": 1970 + }, + { + "epoch": 0.10732882726511608, + "grad_norm": 0.7358861967218404, + "learning_rate": 0.0001997065307237809, + "loss": 12.5993, + "step": 1971 + }, + { + "epoch": 0.1073832812616991, + "grad_norm": 0.7950033939733909, + "learning_rate": 0.00019970585524830883, + "loss": 12.6651, + "step": 1972 + }, + { + "epoch": 0.10743773525828211, + "grad_norm": 0.8758062462747864, + "learning_rate": 0.0001997051789975073, + "loss": 12.7358, + "step": 1973 + }, + { + "epoch": 0.10749218925486513, + "grad_norm": 0.8597195443725842, + "learning_rate": 0.00019970450197138155, + "loss": 12.7349, + "step": 1974 + }, + { + "epoch": 0.10754664325144814, + "grad_norm": 0.790691546135162, + "learning_rate": 0.00019970382416993688, + "loss": 12.8114, + "step": 1975 + }, + { + "epoch": 0.10760109724803114, + "grad_norm": 0.9024575737104155, + "learning_rate": 0.00019970314559317854, + "loss": 12.8282, + "step": 1976 + }, + { + "epoch": 0.10765555124461416, + "grad_norm": 0.7525079128703486, + "learning_rate": 0.00019970246624111186, + "loss": 12.7042, + "step": 1977 + }, + { + "epoch": 0.10771000524119717, + "grad_norm": 0.7830885538690692, + "learning_rate": 0.00019970178611374207, + "loss": 12.7363, + "step": 1978 + }, + { + "epoch": 0.10776445923778018, + "grad_norm": 0.7758410027452511, + "learning_rate": 0.00019970110521107446, + "loss": 12.711, + "step": 1979 + }, + { + "epoch": 0.1078189132343632, + "grad_norm": 0.786509695907984, + "learning_rate": 0.00019970042353311434, + "loss": 12.6893, + "step": 1980 + }, + { + "epoch": 0.1078733672309462, + "grad_norm": 0.7910151871874543, + "learning_rate": 0.00019969974107986703, + "loss": 12.6859, + "step": 1981 + }, + { + "epoch": 0.10792782122752922, + "grad_norm": 0.7356011892291389, + "learning_rate": 0.00019969905785133775, + "loss": 12.6665, + "step": 1982 + }, + { + "epoch": 0.10798227522411223, + "grad_norm": 0.791857100119298, + "learning_rate": 0.00019969837384753195, + "loss": 12.6574, + "step": 1983 + }, + { + "epoch": 0.10803672922069524, + "grad_norm": 0.9453268512645497, + "learning_rate": 0.00019969768906845484, + "loss": 12.9026, + "step": 1984 + }, + { + "epoch": 0.10809118321727826, + "grad_norm": 0.7369807898916599, + "learning_rate": 0.00019969700351411178, + "loss": 12.4894, + "step": 1985 + }, + { + "epoch": 0.10814563721386126, + "grad_norm": 0.7844522057126512, + "learning_rate": 0.0001996963171845081, + "loss": 12.6189, + "step": 1986 + }, + { + "epoch": 0.10820009121044427, + "grad_norm": 0.8845416386732657, + "learning_rate": 0.00019969563007964913, + "loss": 12.8518, + "step": 1987 + }, + { + "epoch": 0.10825454520702729, + "grad_norm": 0.8037598314151181, + "learning_rate": 0.00019969494219954025, + "loss": 12.7211, + "step": 1988 + }, + { + "epoch": 0.1083089992036103, + "grad_norm": 0.9389458047150615, + "learning_rate": 0.00019969425354418675, + "loss": 12.7243, + "step": 1989 + }, + { + "epoch": 0.1083634532001933, + "grad_norm": 0.7320127631185521, + "learning_rate": 0.00019969356411359405, + "loss": 12.6032, + "step": 1990 + }, + { + "epoch": 0.10841790719677633, + "grad_norm": 0.8657140221983711, + "learning_rate": 0.00019969287390776748, + "loss": 12.7469, + "step": 1991 + }, + { + "epoch": 0.10847236119335933, + "grad_norm": 0.7649625558778893, + "learning_rate": 0.00019969218292671234, + "loss": 12.7505, + "step": 1992 + }, + { + "epoch": 0.10852681518994235, + "grad_norm": 0.8960064777973007, + "learning_rate": 0.00019969149117043413, + "loss": 12.6676, + "step": 1993 + }, + { + "epoch": 0.10858126918652536, + "grad_norm": 1.017277538626505, + "learning_rate": 0.00019969079863893817, + "loss": 12.7488, + "step": 1994 + }, + { + "epoch": 0.10863572318310837, + "grad_norm": 0.8590189883083232, + "learning_rate": 0.00019969010533222982, + "loss": 12.742, + "step": 1995 + }, + { + "epoch": 0.10869017717969139, + "grad_norm": 0.8725955485441322, + "learning_rate": 0.00019968941125031447, + "loss": 12.7131, + "step": 1996 + }, + { + "epoch": 0.10874463117627439, + "grad_norm": 0.9533012684001186, + "learning_rate": 0.00019968871639319756, + "loss": 12.8064, + "step": 1997 + }, + { + "epoch": 0.1087990851728574, + "grad_norm": 0.8515093684993654, + "learning_rate": 0.0001996880207608845, + "loss": 12.7675, + "step": 1998 + }, + { + "epoch": 0.10885353916944042, + "grad_norm": 0.9449399025658869, + "learning_rate": 0.00019968732435338062, + "loss": 12.7264, + "step": 1999 + }, + { + "epoch": 0.10890799316602343, + "grad_norm": 0.8716562621301317, + "learning_rate": 0.0001996866271706914, + "loss": 12.7379, + "step": 2000 + }, + { + "epoch": 0.10896244716260645, + "grad_norm": 0.7605747741023537, + "learning_rate": 0.00019968592921282228, + "loss": 12.6486, + "step": 2001 + }, + { + "epoch": 0.10901690115918945, + "grad_norm": 0.8690279429757708, + "learning_rate": 0.00019968523047977864, + "loss": 12.727, + "step": 2002 + }, + { + "epoch": 0.10907135515577246, + "grad_norm": 0.7740230628936833, + "learning_rate": 0.00019968453097156594, + "loss": 12.7432, + "step": 2003 + }, + { + "epoch": 0.10912580915235548, + "grad_norm": 0.8301607370104841, + "learning_rate": 0.0001996838306881896, + "loss": 12.7203, + "step": 2004 + }, + { + "epoch": 0.10918026314893849, + "grad_norm": 0.7939352424523266, + "learning_rate": 0.00019968312962965508, + "loss": 12.695, + "step": 2005 + }, + { + "epoch": 0.10923471714552149, + "grad_norm": 0.880346087195917, + "learning_rate": 0.00019968242779596783, + "loss": 12.8456, + "step": 2006 + }, + { + "epoch": 0.10928917114210451, + "grad_norm": 0.7813320479992395, + "learning_rate": 0.00019968172518713327, + "loss": 12.7121, + "step": 2007 + }, + { + "epoch": 0.10934362513868752, + "grad_norm": 0.8484785526716306, + "learning_rate": 0.00019968102180315696, + "loss": 12.7572, + "step": 2008 + }, + { + "epoch": 0.10939807913527054, + "grad_norm": 0.898153371419266, + "learning_rate": 0.00019968031764404427, + "loss": 12.7857, + "step": 2009 + }, + { + "epoch": 0.10945253313185355, + "grad_norm": 0.741570262515584, + "learning_rate": 0.0001996796127098007, + "loss": 12.8034, + "step": 2010 + }, + { + "epoch": 0.10950698712843655, + "grad_norm": 0.9657599315611324, + "learning_rate": 0.00019967890700043177, + "loss": 12.5358, + "step": 2011 + }, + { + "epoch": 0.10956144112501957, + "grad_norm": 0.7940396053085513, + "learning_rate": 0.00019967820051594294, + "loss": 12.7363, + "step": 2012 + }, + { + "epoch": 0.10961589512160258, + "grad_norm": 1.0561796034970556, + "learning_rate": 0.0001996774932563397, + "loss": 12.7496, + "step": 2013 + }, + { + "epoch": 0.10967034911818559, + "grad_norm": 0.8155241579906948, + "learning_rate": 0.00019967678522162758, + "loss": 12.8103, + "step": 2014 + }, + { + "epoch": 0.10972480311476861, + "grad_norm": 0.7664867207293443, + "learning_rate": 0.00019967607641181205, + "loss": 12.7256, + "step": 2015 + }, + { + "epoch": 0.10977925711135161, + "grad_norm": 0.7413458849382605, + "learning_rate": 0.00019967536682689862, + "loss": 12.6486, + "step": 2016 + }, + { + "epoch": 0.10983371110793463, + "grad_norm": 0.7641870409209854, + "learning_rate": 0.00019967465646689284, + "loss": 12.6495, + "step": 2017 + }, + { + "epoch": 0.10988816510451764, + "grad_norm": 0.7952154969764736, + "learning_rate": 0.0001996739453318002, + "loss": 12.7041, + "step": 2018 + }, + { + "epoch": 0.10994261910110065, + "grad_norm": 0.8621410230361722, + "learning_rate": 0.00019967323342162625, + "loss": 12.7359, + "step": 2019 + }, + { + "epoch": 0.10999707309768367, + "grad_norm": 0.7941226482770086, + "learning_rate": 0.0001996725207363765, + "loss": 12.7605, + "step": 2020 + }, + { + "epoch": 0.11005152709426667, + "grad_norm": 0.8875273838164063, + "learning_rate": 0.00019967180727605656, + "loss": 12.8796, + "step": 2021 + }, + { + "epoch": 0.11010598109084968, + "grad_norm": 0.9317449387044903, + "learning_rate": 0.0001996710930406719, + "loss": 12.7256, + "step": 2022 + }, + { + "epoch": 0.1101604350874327, + "grad_norm": 0.7733791956195418, + "learning_rate": 0.00019967037803022812, + "loss": 12.7287, + "step": 2023 + }, + { + "epoch": 0.11021488908401571, + "grad_norm": 0.839253425039756, + "learning_rate": 0.00019966966224473076, + "loss": 12.8589, + "step": 2024 + }, + { + "epoch": 0.11026934308059873, + "grad_norm": 0.8383895898304654, + "learning_rate": 0.0001996689456841854, + "loss": 12.6426, + "step": 2025 + }, + { + "epoch": 0.11032379707718173, + "grad_norm": 0.80800769816932, + "learning_rate": 0.00019966822834859759, + "loss": 12.7299, + "step": 2026 + }, + { + "epoch": 0.11037825107376474, + "grad_norm": 0.9158105841488882, + "learning_rate": 0.00019966751023797294, + "loss": 12.8265, + "step": 2027 + }, + { + "epoch": 0.11043270507034776, + "grad_norm": 0.869986832787026, + "learning_rate": 0.00019966679135231702, + "loss": 12.7817, + "step": 2028 + }, + { + "epoch": 0.11048715906693077, + "grad_norm": 0.8128647516680227, + "learning_rate": 0.00019966607169163538, + "loss": 12.7022, + "step": 2029 + }, + { + "epoch": 0.11054161306351377, + "grad_norm": 0.8051905540730242, + "learning_rate": 0.00019966535125593368, + "loss": 12.8388, + "step": 2030 + }, + { + "epoch": 0.1105960670600968, + "grad_norm": 0.7719518429212509, + "learning_rate": 0.0001996646300452175, + "loss": 12.7637, + "step": 2031 + }, + { + "epoch": 0.1106505210566798, + "grad_norm": 0.8971123617842752, + "learning_rate": 0.00019966390805949242, + "loss": 12.8074, + "step": 2032 + }, + { + "epoch": 0.11070497505326282, + "grad_norm": 0.8091233612763397, + "learning_rate": 0.0001996631852987641, + "loss": 12.6037, + "step": 2033 + }, + { + "epoch": 0.11075942904984583, + "grad_norm": 0.8335833189649199, + "learning_rate": 0.0001996624617630381, + "loss": 12.5976, + "step": 2034 + }, + { + "epoch": 0.11081388304642883, + "grad_norm": 0.7532461021786356, + "learning_rate": 0.00019966173745232011, + "loss": 12.7016, + "step": 2035 + }, + { + "epoch": 0.11086833704301186, + "grad_norm": 0.9038619999472759, + "learning_rate": 0.00019966101236661575, + "loss": 12.5796, + "step": 2036 + }, + { + "epoch": 0.11092279103959486, + "grad_norm": 0.9020584325889739, + "learning_rate": 0.00019966028650593063, + "loss": 12.8904, + "step": 2037 + }, + { + "epoch": 0.11097724503617787, + "grad_norm": 0.8719156748425685, + "learning_rate": 0.0001996595598702704, + "loss": 12.8904, + "step": 2038 + }, + { + "epoch": 0.11103169903276089, + "grad_norm": 0.8031254838524301, + "learning_rate": 0.0001996588324596407, + "loss": 12.7358, + "step": 2039 + }, + { + "epoch": 0.1110861530293439, + "grad_norm": 0.8312455018258538, + "learning_rate": 0.00019965810427404726, + "loss": 12.7067, + "step": 2040 + }, + { + "epoch": 0.11114060702592692, + "grad_norm": 0.6682438162781495, + "learning_rate": 0.00019965737531349567, + "loss": 12.5912, + "step": 2041 + }, + { + "epoch": 0.11119506102250992, + "grad_norm": 0.9103969344133255, + "learning_rate": 0.00019965664557799163, + "loss": 12.9311, + "step": 2042 + }, + { + "epoch": 0.11124951501909293, + "grad_norm": 0.7586090916444304, + "learning_rate": 0.00019965591506754076, + "loss": 12.7437, + "step": 2043 + }, + { + "epoch": 0.11130396901567595, + "grad_norm": 0.7919965882278415, + "learning_rate": 0.0001996551837821488, + "loss": 12.7225, + "step": 2044 + }, + { + "epoch": 0.11135842301225896, + "grad_norm": 0.7519166731231169, + "learning_rate": 0.00019965445172182142, + "loss": 12.729, + "step": 2045 + }, + { + "epoch": 0.11141287700884196, + "grad_norm": 0.8124237368088452, + "learning_rate": 0.0001996537188865643, + "loss": 12.6507, + "step": 2046 + }, + { + "epoch": 0.11146733100542498, + "grad_norm": 0.8276721559395761, + "learning_rate": 0.0001996529852763832, + "loss": 12.849, + "step": 2047 + }, + { + "epoch": 0.11152178500200799, + "grad_norm": 0.7338358319650686, + "learning_rate": 0.00019965225089128372, + "loss": 12.7346, + "step": 2048 + }, + { + "epoch": 0.11157623899859101, + "grad_norm": 0.7879128849080722, + "learning_rate": 0.00019965151573127164, + "loss": 12.7471, + "step": 2049 + }, + { + "epoch": 0.11163069299517402, + "grad_norm": 0.7036905561446233, + "learning_rate": 0.00019965077979635268, + "loss": 12.691, + "step": 2050 + }, + { + "epoch": 0.11168514699175702, + "grad_norm": 0.7659057759800266, + "learning_rate": 0.00019965004308653253, + "loss": 12.7796, + "step": 2051 + }, + { + "epoch": 0.11173960098834004, + "grad_norm": 0.9112024972953926, + "learning_rate": 0.00019964930560181695, + "loss": 12.6461, + "step": 2052 + }, + { + "epoch": 0.11179405498492305, + "grad_norm": 0.815708476482424, + "learning_rate": 0.00019964856734221162, + "loss": 12.662, + "step": 2053 + }, + { + "epoch": 0.11184850898150606, + "grad_norm": 0.7861656973503325, + "learning_rate": 0.00019964782830772236, + "loss": 12.6978, + "step": 2054 + }, + { + "epoch": 0.11190296297808908, + "grad_norm": 0.7568981781640535, + "learning_rate": 0.00019964708849835484, + "loss": 12.8035, + "step": 2055 + }, + { + "epoch": 0.11195741697467208, + "grad_norm": 0.7885340729467678, + "learning_rate": 0.00019964634791411488, + "loss": 12.8172, + "step": 2056 + }, + { + "epoch": 0.11201187097125509, + "grad_norm": 0.7049502508081198, + "learning_rate": 0.00019964560655500818, + "loss": 12.6953, + "step": 2057 + }, + { + "epoch": 0.11206632496783811, + "grad_norm": 0.7515432089172787, + "learning_rate": 0.00019964486442104057, + "loss": 12.6345, + "step": 2058 + }, + { + "epoch": 0.11212077896442112, + "grad_norm": 0.810079398805017, + "learning_rate": 0.00019964412151221773, + "loss": 12.8406, + "step": 2059 + }, + { + "epoch": 0.11217523296100414, + "grad_norm": 0.8140262106072949, + "learning_rate": 0.00019964337782854555, + "loss": 12.5698, + "step": 2060 + }, + { + "epoch": 0.11222968695758714, + "grad_norm": 0.7161724218226323, + "learning_rate": 0.00019964263337002972, + "loss": 12.6271, + "step": 2061 + }, + { + "epoch": 0.11228414095417015, + "grad_norm": 0.753009588216702, + "learning_rate": 0.00019964188813667607, + "loss": 12.6349, + "step": 2062 + }, + { + "epoch": 0.11233859495075317, + "grad_norm": 0.7505899607318048, + "learning_rate": 0.0001996411421284904, + "loss": 12.5799, + "step": 2063 + }, + { + "epoch": 0.11239304894733618, + "grad_norm": 0.8278026206975396, + "learning_rate": 0.00019964039534547847, + "loss": 12.7407, + "step": 2064 + }, + { + "epoch": 0.11244750294391918, + "grad_norm": 0.8331622863627176, + "learning_rate": 0.00019963964778764613, + "loss": 12.6814, + "step": 2065 + }, + { + "epoch": 0.1125019569405022, + "grad_norm": 0.7699158187386606, + "learning_rate": 0.00019963889945499917, + "loss": 12.6718, + "step": 2066 + }, + { + "epoch": 0.11255641093708521, + "grad_norm": 0.7747270710988973, + "learning_rate": 0.00019963815034754344, + "loss": 12.7746, + "step": 2067 + }, + { + "epoch": 0.11261086493366823, + "grad_norm": 0.7494007434705581, + "learning_rate": 0.00019963740046528475, + "loss": 12.6735, + "step": 2068 + }, + { + "epoch": 0.11266531893025124, + "grad_norm": 0.746165912951711, + "learning_rate": 0.0001996366498082289, + "loss": 12.766, + "step": 2069 + }, + { + "epoch": 0.11271977292683424, + "grad_norm": 0.8449008773597039, + "learning_rate": 0.00019963589837638176, + "loss": 12.7827, + "step": 2070 + }, + { + "epoch": 0.11277422692341726, + "grad_norm": 0.7303376725361176, + "learning_rate": 0.00019963514616974916, + "loss": 12.738, + "step": 2071 + }, + { + "epoch": 0.11282868092000027, + "grad_norm": 0.7473221651838649, + "learning_rate": 0.00019963439318833697, + "loss": 12.7155, + "step": 2072 + }, + { + "epoch": 0.11288313491658328, + "grad_norm": 0.825100614677872, + "learning_rate": 0.000199633639432151, + "loss": 12.7331, + "step": 2073 + }, + { + "epoch": 0.1129375889131663, + "grad_norm": 0.7861781030471263, + "learning_rate": 0.00019963288490119717, + "loss": 12.7108, + "step": 2074 + }, + { + "epoch": 0.1129920429097493, + "grad_norm": 0.7526941015138744, + "learning_rate": 0.0001996321295954813, + "loss": 12.7691, + "step": 2075 + }, + { + "epoch": 0.11304649690633232, + "grad_norm": 0.8979029163830556, + "learning_rate": 0.00019963137351500931, + "loss": 12.6463, + "step": 2076 + }, + { + "epoch": 0.11310095090291533, + "grad_norm": 0.905851466359616, + "learning_rate": 0.00019963061665978705, + "loss": 12.8106, + "step": 2077 + }, + { + "epoch": 0.11315540489949834, + "grad_norm": 0.8874248846001883, + "learning_rate": 0.00019962985902982036, + "loss": 12.8702, + "step": 2078 + }, + { + "epoch": 0.11320985889608136, + "grad_norm": 0.7718457677154574, + "learning_rate": 0.00019962910062511525, + "loss": 12.7201, + "step": 2079 + }, + { + "epoch": 0.11326431289266436, + "grad_norm": 0.7637357437072458, + "learning_rate": 0.00019962834144567748, + "loss": 12.7155, + "step": 2080 + }, + { + "epoch": 0.11331876688924737, + "grad_norm": 0.8570310071456682, + "learning_rate": 0.00019962758149151303, + "loss": 12.8059, + "step": 2081 + }, + { + "epoch": 0.11337322088583039, + "grad_norm": 0.8012256975464873, + "learning_rate": 0.00019962682076262781, + "loss": 12.7262, + "step": 2082 + }, + { + "epoch": 0.1134276748824134, + "grad_norm": 0.7009570353759675, + "learning_rate": 0.00019962605925902775, + "loss": 12.6513, + "step": 2083 + }, + { + "epoch": 0.11348212887899642, + "grad_norm": 0.8049734064740511, + "learning_rate": 0.00019962529698071873, + "loss": 12.8019, + "step": 2084 + }, + { + "epoch": 0.11353658287557943, + "grad_norm": 0.7857397204885804, + "learning_rate": 0.00019962453392770668, + "loss": 12.7982, + "step": 2085 + }, + { + "epoch": 0.11359103687216243, + "grad_norm": 0.7731529643017948, + "learning_rate": 0.00019962377009999756, + "loss": 12.7888, + "step": 2086 + }, + { + "epoch": 0.11364549086874545, + "grad_norm": 0.741120737210802, + "learning_rate": 0.0001996230054975973, + "loss": 12.5751, + "step": 2087 + }, + { + "epoch": 0.11369994486532846, + "grad_norm": 0.7620596420969351, + "learning_rate": 0.0001996222401205118, + "loss": 12.7663, + "step": 2088 + }, + { + "epoch": 0.11375439886191147, + "grad_norm": 0.7492434504656282, + "learning_rate": 0.0001996214739687471, + "loss": 12.7384, + "step": 2089 + }, + { + "epoch": 0.11380885285849449, + "grad_norm": 0.7840620448864547, + "learning_rate": 0.0001996207070423091, + "loss": 12.6857, + "step": 2090 + }, + { + "epoch": 0.11386330685507749, + "grad_norm": 0.8200841690305539, + "learning_rate": 0.00019961993934120378, + "loss": 12.7391, + "step": 2091 + }, + { + "epoch": 0.11391776085166051, + "grad_norm": 0.8040405029865603, + "learning_rate": 0.0001996191708654371, + "loss": 12.7379, + "step": 2092 + }, + { + "epoch": 0.11397221484824352, + "grad_norm": 0.7587519143034004, + "learning_rate": 0.00019961840161501505, + "loss": 12.6308, + "step": 2093 + }, + { + "epoch": 0.11402666884482653, + "grad_norm": 0.784489810754187, + "learning_rate": 0.0001996176315899436, + "loss": 12.7887, + "step": 2094 + }, + { + "epoch": 0.11408112284140955, + "grad_norm": 0.8164944082645404, + "learning_rate": 0.0001996168607902287, + "loss": 12.7018, + "step": 2095 + }, + { + "epoch": 0.11413557683799255, + "grad_norm": 0.835692331915501, + "learning_rate": 0.00019961608921587645, + "loss": 12.7757, + "step": 2096 + }, + { + "epoch": 0.11419003083457556, + "grad_norm": 0.7558616368210048, + "learning_rate": 0.00019961531686689274, + "loss": 12.5786, + "step": 2097 + }, + { + "epoch": 0.11424448483115858, + "grad_norm": 0.7079213163165838, + "learning_rate": 0.00019961454374328364, + "loss": 12.6923, + "step": 2098 + }, + { + "epoch": 0.11429893882774159, + "grad_norm": 0.7888926831701923, + "learning_rate": 0.00019961376984505512, + "loss": 12.825, + "step": 2099 + }, + { + "epoch": 0.1143533928243246, + "grad_norm": 0.774072981035808, + "learning_rate": 0.00019961299517221324, + "loss": 12.7196, + "step": 2100 + }, + { + "epoch": 0.11440784682090761, + "grad_norm": 0.6690168651460398, + "learning_rate": 0.00019961221972476402, + "loss": 12.5567, + "step": 2101 + }, + { + "epoch": 0.11446230081749062, + "grad_norm": 0.8531511807742205, + "learning_rate": 0.00019961144350271343, + "loss": 12.7074, + "step": 2102 + }, + { + "epoch": 0.11451675481407364, + "grad_norm": 0.7090196312756721, + "learning_rate": 0.0001996106665060676, + "loss": 12.6886, + "step": 2103 + }, + { + "epoch": 0.11457120881065665, + "grad_norm": 0.7685376179371318, + "learning_rate": 0.0001996098887348325, + "loss": 12.7699, + "step": 2104 + }, + { + "epoch": 0.11462566280723965, + "grad_norm": 0.740224080424988, + "learning_rate": 0.0001996091101890142, + "loss": 12.6349, + "step": 2105 + }, + { + "epoch": 0.11468011680382267, + "grad_norm": 0.9454100900329399, + "learning_rate": 0.00019960833086861873, + "loss": 12.8987, + "step": 2106 + }, + { + "epoch": 0.11473457080040568, + "grad_norm": 0.7763313314121196, + "learning_rate": 0.00019960755077365222, + "loss": 12.6243, + "step": 2107 + }, + { + "epoch": 0.1147890247969887, + "grad_norm": 0.7103990252977854, + "learning_rate": 0.0001996067699041207, + "loss": 12.6774, + "step": 2108 + }, + { + "epoch": 0.11484347879357171, + "grad_norm": 0.7405021657321451, + "learning_rate": 0.00019960598826003018, + "loss": 12.6477, + "step": 2109 + }, + { + "epoch": 0.11489793279015471, + "grad_norm": 0.8219760829652516, + "learning_rate": 0.00019960520584138682, + "loss": 12.7941, + "step": 2110 + }, + { + "epoch": 0.11495238678673773, + "grad_norm": 0.9959298108093985, + "learning_rate": 0.00019960442264819664, + "loss": 12.7902, + "step": 2111 + }, + { + "epoch": 0.11500684078332074, + "grad_norm": 0.6913064619941739, + "learning_rate": 0.00019960363868046582, + "loss": 12.5739, + "step": 2112 + }, + { + "epoch": 0.11506129477990375, + "grad_norm": 0.7620375931662848, + "learning_rate": 0.00019960285393820037, + "loss": 12.653, + "step": 2113 + }, + { + "epoch": 0.11511574877648677, + "grad_norm": 0.8280792459478163, + "learning_rate": 0.00019960206842140643, + "loss": 12.6963, + "step": 2114 + }, + { + "epoch": 0.11517020277306977, + "grad_norm": 0.7494683299858886, + "learning_rate": 0.0001996012821300901, + "loss": 12.6244, + "step": 2115 + }, + { + "epoch": 0.1152246567696528, + "grad_norm": 0.706202371763292, + "learning_rate": 0.00019960049506425752, + "loss": 12.6273, + "step": 2116 + }, + { + "epoch": 0.1152791107662358, + "grad_norm": 0.8221177533629022, + "learning_rate": 0.00019959970722391478, + "loss": 12.8283, + "step": 2117 + }, + { + "epoch": 0.11533356476281881, + "grad_norm": 0.831064960973766, + "learning_rate": 0.00019959891860906799, + "loss": 12.6878, + "step": 2118 + }, + { + "epoch": 0.11538801875940183, + "grad_norm": 0.881850146847089, + "learning_rate": 0.00019959812921972332, + "loss": 12.8005, + "step": 2119 + }, + { + "epoch": 0.11544247275598483, + "grad_norm": 0.8511223229165487, + "learning_rate": 0.0001995973390558869, + "loss": 12.7423, + "step": 2120 + }, + { + "epoch": 0.11549692675256784, + "grad_norm": 0.8376433573079606, + "learning_rate": 0.00019959654811756486, + "loss": 12.8614, + "step": 2121 + }, + { + "epoch": 0.11555138074915086, + "grad_norm": 0.8510792016650545, + "learning_rate": 0.00019959575640476334, + "loss": 12.6251, + "step": 2122 + }, + { + "epoch": 0.11560583474573387, + "grad_norm": 0.8033614865041665, + "learning_rate": 0.00019959496391748857, + "loss": 12.7687, + "step": 2123 + }, + { + "epoch": 0.11566028874231687, + "grad_norm": 0.8177322004397564, + "learning_rate": 0.00019959417065574663, + "loss": 12.5712, + "step": 2124 + }, + { + "epoch": 0.1157147427388999, + "grad_norm": 0.8300869860612424, + "learning_rate": 0.00019959337661954368, + "loss": 12.8096, + "step": 2125 + }, + { + "epoch": 0.1157691967354829, + "grad_norm": 0.7413698447803599, + "learning_rate": 0.000199592581808886, + "loss": 12.6082, + "step": 2126 + }, + { + "epoch": 0.11582365073206592, + "grad_norm": 0.7823550575389198, + "learning_rate": 0.00019959178622377965, + "loss": 12.6772, + "step": 2127 + }, + { + "epoch": 0.11587810472864893, + "grad_norm": 0.8882803396640111, + "learning_rate": 0.00019959098986423087, + "loss": 12.8569, + "step": 2128 + }, + { + "epoch": 0.11593255872523194, + "grad_norm": 0.826671747324549, + "learning_rate": 0.00019959019273024588, + "loss": 12.7348, + "step": 2129 + }, + { + "epoch": 0.11598701272181496, + "grad_norm": 0.7841105201498944, + "learning_rate": 0.00019958939482183084, + "loss": 12.7031, + "step": 2130 + }, + { + "epoch": 0.11604146671839796, + "grad_norm": 0.7900489427215688, + "learning_rate": 0.00019958859613899196, + "loss": 12.5509, + "step": 2131 + }, + { + "epoch": 0.11609592071498097, + "grad_norm": 0.723579613398403, + "learning_rate": 0.0001995877966817355, + "loss": 12.6316, + "step": 2132 + }, + { + "epoch": 0.11615037471156399, + "grad_norm": 1.030053378973563, + "learning_rate": 0.00019958699645006758, + "loss": 12.9159, + "step": 2133 + }, + { + "epoch": 0.116204828708147, + "grad_norm": 0.857899856222025, + "learning_rate": 0.00019958619544399445, + "loss": 12.744, + "step": 2134 + }, + { + "epoch": 0.11625928270473002, + "grad_norm": 0.7480106693179064, + "learning_rate": 0.00019958539366352241, + "loss": 12.8448, + "step": 2135 + }, + { + "epoch": 0.11631373670131302, + "grad_norm": 0.7751551904107694, + "learning_rate": 0.00019958459110865765, + "loss": 12.6368, + "step": 2136 + }, + { + "epoch": 0.11636819069789603, + "grad_norm": 0.7603853151602749, + "learning_rate": 0.0001995837877794064, + "loss": 12.7677, + "step": 2137 + }, + { + "epoch": 0.11642264469447905, + "grad_norm": 0.881158263097917, + "learning_rate": 0.0001995829836757749, + "loss": 12.7606, + "step": 2138 + }, + { + "epoch": 0.11647709869106206, + "grad_norm": 0.7802242944553751, + "learning_rate": 0.00019958217879776944, + "loss": 12.6844, + "step": 2139 + }, + { + "epoch": 0.11653155268764506, + "grad_norm": 0.9848652595670689, + "learning_rate": 0.00019958137314539625, + "loss": 12.5426, + "step": 2140 + }, + { + "epoch": 0.11658600668422808, + "grad_norm": 0.7941566177687074, + "learning_rate": 0.00019958056671866162, + "loss": 12.8575, + "step": 2141 + }, + { + "epoch": 0.11664046068081109, + "grad_norm": 0.7449880770493477, + "learning_rate": 0.00019957975951757177, + "loss": 12.6769, + "step": 2142 + }, + { + "epoch": 0.11669491467739411, + "grad_norm": 0.7723677676886868, + "learning_rate": 0.00019957895154213302, + "loss": 12.6147, + "step": 2143 + }, + { + "epoch": 0.11674936867397712, + "grad_norm": 0.7912735436279764, + "learning_rate": 0.00019957814279235165, + "loss": 12.6037, + "step": 2144 + }, + { + "epoch": 0.11680382267056012, + "grad_norm": 0.7972391102056795, + "learning_rate": 0.00019957733326823394, + "loss": 12.7607, + "step": 2145 + }, + { + "epoch": 0.11685827666714314, + "grad_norm": 0.8313684183370367, + "learning_rate": 0.00019957652296978618, + "loss": 12.6734, + "step": 2146 + }, + { + "epoch": 0.11691273066372615, + "grad_norm": 0.792697041691428, + "learning_rate": 0.00019957571189701469, + "loss": 12.5648, + "step": 2147 + }, + { + "epoch": 0.11696718466030916, + "grad_norm": 0.9063581429753217, + "learning_rate": 0.00019957490004992575, + "loss": 12.7155, + "step": 2148 + }, + { + "epoch": 0.11702163865689218, + "grad_norm": 0.8434538546620092, + "learning_rate": 0.00019957408742852573, + "loss": 12.6305, + "step": 2149 + }, + { + "epoch": 0.11707609265347518, + "grad_norm": 0.87158769555442, + "learning_rate": 0.00019957327403282088, + "loss": 12.6689, + "step": 2150 + }, + { + "epoch": 0.1171305466500582, + "grad_norm": 0.8126919819323308, + "learning_rate": 0.00019957245986281755, + "loss": 12.6945, + "step": 2151 + }, + { + "epoch": 0.11718500064664121, + "grad_norm": 0.8233288581078374, + "learning_rate": 0.00019957164491852207, + "loss": 12.7854, + "step": 2152 + }, + { + "epoch": 0.11723945464322422, + "grad_norm": 0.7885447903071905, + "learning_rate": 0.0001995708291999408, + "loss": 12.5977, + "step": 2153 + }, + { + "epoch": 0.11729390863980724, + "grad_norm": 0.8239947426294945, + "learning_rate": 0.00019957001270708003, + "loss": 12.6609, + "step": 2154 + }, + { + "epoch": 0.11734836263639024, + "grad_norm": 0.7073707699017527, + "learning_rate": 0.00019956919543994615, + "loss": 12.6269, + "step": 2155 + }, + { + "epoch": 0.11740281663297325, + "grad_norm": 0.8425301365131279, + "learning_rate": 0.00019956837739854556, + "loss": 12.8125, + "step": 2156 + }, + { + "epoch": 0.11745727062955627, + "grad_norm": 0.7271759383651856, + "learning_rate": 0.0001995675585828845, + "loss": 12.6695, + "step": 2157 + }, + { + "epoch": 0.11751172462613928, + "grad_norm": 0.8336817554060225, + "learning_rate": 0.00019956673899296944, + "loss": 12.7262, + "step": 2158 + }, + { + "epoch": 0.1175661786227223, + "grad_norm": 0.8076930572191388, + "learning_rate": 0.00019956591862880675, + "loss": 12.7619, + "step": 2159 + }, + { + "epoch": 0.1176206326193053, + "grad_norm": 0.7989577147925232, + "learning_rate": 0.00019956509749040273, + "loss": 12.6479, + "step": 2160 + }, + { + "epoch": 0.11767508661588831, + "grad_norm": 0.8037586122289786, + "learning_rate": 0.00019956427557776384, + "loss": 12.6592, + "step": 2161 + }, + { + "epoch": 0.11772954061247133, + "grad_norm": 0.7611174663838151, + "learning_rate": 0.00019956345289089643, + "loss": 12.6262, + "step": 2162 + }, + { + "epoch": 0.11778399460905434, + "grad_norm": 0.8435221901079686, + "learning_rate": 0.0001995626294298069, + "loss": 12.72, + "step": 2163 + }, + { + "epoch": 0.11783844860563734, + "grad_norm": 0.8431094578143535, + "learning_rate": 0.00019956180519450171, + "loss": 12.7281, + "step": 2164 + }, + { + "epoch": 0.11789290260222036, + "grad_norm": 1.0147865888423293, + "learning_rate": 0.00019956098018498723, + "loss": 12.7111, + "step": 2165 + }, + { + "epoch": 0.11794735659880337, + "grad_norm": 0.7481049960190732, + "learning_rate": 0.00019956015440126984, + "loss": 12.751, + "step": 2166 + }, + { + "epoch": 0.11800181059538639, + "grad_norm": 1.0240643900963289, + "learning_rate": 0.000199559327843356, + "loss": 12.8251, + "step": 2167 + }, + { + "epoch": 0.1180562645919694, + "grad_norm": 0.8048856773837297, + "learning_rate": 0.00019955850051125214, + "loss": 12.7436, + "step": 2168 + }, + { + "epoch": 0.1181107185885524, + "grad_norm": 0.7799355673998167, + "learning_rate": 0.0001995576724049647, + "loss": 12.7746, + "step": 2169 + }, + { + "epoch": 0.11816517258513543, + "grad_norm": 0.898073428935087, + "learning_rate": 0.00019955684352450007, + "loss": 12.6445, + "step": 2170 + }, + { + "epoch": 0.11821962658171843, + "grad_norm": 0.8455278039776454, + "learning_rate": 0.00019955601386986471, + "loss": 12.9123, + "step": 2171 + }, + { + "epoch": 0.11827408057830144, + "grad_norm": 0.8917322796775545, + "learning_rate": 0.00019955518344106512, + "loss": 12.63, + "step": 2172 + }, + { + "epoch": 0.11832853457488446, + "grad_norm": 0.8617661704418199, + "learning_rate": 0.00019955435223810772, + "loss": 12.8358, + "step": 2173 + }, + { + "epoch": 0.11838298857146747, + "grad_norm": 0.915808254151994, + "learning_rate": 0.00019955352026099901, + "loss": 12.5354, + "step": 2174 + }, + { + "epoch": 0.11843744256805049, + "grad_norm": 1.055264887510652, + "learning_rate": 0.0001995526875097454, + "loss": 12.6227, + "step": 2175 + }, + { + "epoch": 0.11849189656463349, + "grad_norm": 0.9186264289083459, + "learning_rate": 0.0001995518539843534, + "loss": 12.617, + "step": 2176 + }, + { + "epoch": 0.1185463505612165, + "grad_norm": 1.0774450319609858, + "learning_rate": 0.0001995510196848295, + "loss": 12.8788, + "step": 2177 + }, + { + "epoch": 0.11860080455779952, + "grad_norm": 0.7694483552357013, + "learning_rate": 0.00019955018461118018, + "loss": 12.7367, + "step": 2178 + }, + { + "epoch": 0.11865525855438253, + "grad_norm": 1.2952253320732638, + "learning_rate": 0.0001995493487634119, + "loss": 12.7683, + "step": 2179 + }, + { + "epoch": 0.11870971255096553, + "grad_norm": 0.8391668405024848, + "learning_rate": 0.00019954851214153124, + "loss": 12.5749, + "step": 2180 + }, + { + "epoch": 0.11876416654754855, + "grad_norm": 0.9121798107531427, + "learning_rate": 0.0001995476747455446, + "loss": 12.7191, + "step": 2181 + }, + { + "epoch": 0.11881862054413156, + "grad_norm": 0.8153326608698361, + "learning_rate": 0.0001995468365754586, + "loss": 12.6872, + "step": 2182 + }, + { + "epoch": 0.11887307454071458, + "grad_norm": 0.7900303177250834, + "learning_rate": 0.00019954599763127967, + "loss": 12.6046, + "step": 2183 + }, + { + "epoch": 0.11892752853729759, + "grad_norm": 0.7569958139616827, + "learning_rate": 0.00019954515791301437, + "loss": 12.6619, + "step": 2184 + }, + { + "epoch": 0.11898198253388059, + "grad_norm": 0.8371377500237001, + "learning_rate": 0.00019954431742066927, + "loss": 12.6206, + "step": 2185 + }, + { + "epoch": 0.11903643653046361, + "grad_norm": 0.853149233833002, + "learning_rate": 0.00019954347615425082, + "loss": 12.6786, + "step": 2186 + }, + { + "epoch": 0.11909089052704662, + "grad_norm": 0.7817696062905213, + "learning_rate": 0.0001995426341137656, + "loss": 12.7055, + "step": 2187 + }, + { + "epoch": 0.11914534452362963, + "grad_norm": 0.7871792577118936, + "learning_rate": 0.00019954179129922018, + "loss": 12.6681, + "step": 2188 + }, + { + "epoch": 0.11919979852021265, + "grad_norm": 0.8416226746173138, + "learning_rate": 0.0001995409477106211, + "loss": 12.8002, + "step": 2189 + }, + { + "epoch": 0.11925425251679565, + "grad_norm": 0.7951692811202323, + "learning_rate": 0.0001995401033479749, + "loss": 12.8236, + "step": 2190 + }, + { + "epoch": 0.11930870651337866, + "grad_norm": 0.8553946427982531, + "learning_rate": 0.0001995392582112882, + "loss": 12.6695, + "step": 2191 + }, + { + "epoch": 0.11936316050996168, + "grad_norm": 0.7860369946268659, + "learning_rate": 0.00019953841230056752, + "loss": 12.5614, + "step": 2192 + }, + { + "epoch": 0.11941761450654469, + "grad_norm": 0.7248201454741638, + "learning_rate": 0.0001995375656158194, + "loss": 12.5566, + "step": 2193 + }, + { + "epoch": 0.1194720685031277, + "grad_norm": 0.7820765623167737, + "learning_rate": 0.00019953671815705056, + "loss": 12.7546, + "step": 2194 + }, + { + "epoch": 0.11952652249971071, + "grad_norm": 0.7850126988810134, + "learning_rate": 0.00019953586992426747, + "loss": 12.589, + "step": 2195 + }, + { + "epoch": 0.11958097649629372, + "grad_norm": 0.7990527946487643, + "learning_rate": 0.00019953502091747677, + "loss": 12.6696, + "step": 2196 + }, + { + "epoch": 0.11963543049287674, + "grad_norm": 0.8866117370502644, + "learning_rate": 0.00019953417113668505, + "loss": 12.8036, + "step": 2197 + }, + { + "epoch": 0.11968988448945975, + "grad_norm": 0.7082177203292337, + "learning_rate": 0.00019953332058189892, + "loss": 12.6838, + "step": 2198 + }, + { + "epoch": 0.11974433848604275, + "grad_norm": 0.8070197457949011, + "learning_rate": 0.000199532469253125, + "loss": 12.7203, + "step": 2199 + }, + { + "epoch": 0.11979879248262577, + "grad_norm": 0.8369868055876668, + "learning_rate": 0.00019953161715036992, + "loss": 12.6899, + "step": 2200 + }, + { + "epoch": 0.11985324647920878, + "grad_norm": 0.8999597921541851, + "learning_rate": 0.0001995307642736403, + "loss": 12.6945, + "step": 2201 + }, + { + "epoch": 0.1199077004757918, + "grad_norm": 0.7692650720140853, + "learning_rate": 0.00019952991062294274, + "loss": 12.7636, + "step": 2202 + }, + { + "epoch": 0.11996215447237481, + "grad_norm": 0.8510153607320751, + "learning_rate": 0.0001995290561982839, + "loss": 12.7078, + "step": 2203 + }, + { + "epoch": 0.12001660846895781, + "grad_norm": 0.783743892081945, + "learning_rate": 0.00019952820099967044, + "loss": 12.7044, + "step": 2204 + }, + { + "epoch": 0.12007106246554083, + "grad_norm": 0.7653316210747961, + "learning_rate": 0.000199527345027109, + "loss": 12.7126, + "step": 2205 + }, + { + "epoch": 0.12012551646212384, + "grad_norm": 0.8536028075609401, + "learning_rate": 0.00019952648828060622, + "loss": 12.7369, + "step": 2206 + }, + { + "epoch": 0.12017997045870685, + "grad_norm": 0.7582973421226827, + "learning_rate": 0.0001995256307601688, + "loss": 12.7263, + "step": 2207 + }, + { + "epoch": 0.12023442445528987, + "grad_norm": 0.9344165946344044, + "learning_rate": 0.00019952477246580337, + "loss": 12.6734, + "step": 2208 + }, + { + "epoch": 0.12028887845187287, + "grad_norm": 0.7814808828459261, + "learning_rate": 0.00019952391339751665, + "loss": 12.7027, + "step": 2209 + }, + { + "epoch": 0.1203433324484559, + "grad_norm": 0.8403333966762496, + "learning_rate": 0.00019952305355531525, + "loss": 12.7408, + "step": 2210 + }, + { + "epoch": 0.1203977864450389, + "grad_norm": 0.8319704410053225, + "learning_rate": 0.0001995221929392059, + "loss": 12.7889, + "step": 2211 + }, + { + "epoch": 0.12045224044162191, + "grad_norm": 0.7947473187321004, + "learning_rate": 0.00019952133154919527, + "loss": 12.7497, + "step": 2212 + }, + { + "epoch": 0.12050669443820493, + "grad_norm": 0.7839426718385665, + "learning_rate": 0.00019952046938529012, + "loss": 12.7165, + "step": 2213 + }, + { + "epoch": 0.12056114843478793, + "grad_norm": 0.7638925777090524, + "learning_rate": 0.00019951960644749706, + "loss": 12.6444, + "step": 2214 + }, + { + "epoch": 0.12061560243137094, + "grad_norm": 0.7897163234635378, + "learning_rate": 0.00019951874273582291, + "loss": 12.6192, + "step": 2215 + }, + { + "epoch": 0.12067005642795396, + "grad_norm": 0.7563778420238759, + "learning_rate": 0.0001995178782502743, + "loss": 12.6769, + "step": 2216 + }, + { + "epoch": 0.12072451042453697, + "grad_norm": 0.9390704289738171, + "learning_rate": 0.00019951701299085798, + "loss": 12.8826, + "step": 2217 + }, + { + "epoch": 0.12077896442111999, + "grad_norm": 0.8678848428558147, + "learning_rate": 0.00019951614695758069, + "loss": 12.8051, + "step": 2218 + }, + { + "epoch": 0.120833418417703, + "grad_norm": 0.9110099563247891, + "learning_rate": 0.00019951528015044913, + "loss": 12.7489, + "step": 2219 + }, + { + "epoch": 0.120887872414286, + "grad_norm": 0.8804716108031068, + "learning_rate": 0.00019951441256947007, + "loss": 12.6187, + "step": 2220 + }, + { + "epoch": 0.12094232641086902, + "grad_norm": 0.7909037529558258, + "learning_rate": 0.00019951354421465024, + "loss": 12.688, + "step": 2221 + }, + { + "epoch": 0.12099678040745203, + "grad_norm": 0.8543656619867279, + "learning_rate": 0.00019951267508599643, + "loss": 12.7867, + "step": 2222 + }, + { + "epoch": 0.12105123440403504, + "grad_norm": 0.6982517176985668, + "learning_rate": 0.00019951180518351536, + "loss": 12.6393, + "step": 2223 + }, + { + "epoch": 0.12110568840061806, + "grad_norm": 0.9337327935082366, + "learning_rate": 0.0001995109345072138, + "loss": 12.8072, + "step": 2224 + }, + { + "epoch": 0.12116014239720106, + "grad_norm": 0.9208777629968543, + "learning_rate": 0.00019951006305709852, + "loss": 12.6814, + "step": 2225 + }, + { + "epoch": 0.12121459639378408, + "grad_norm": 0.7661250813181665, + "learning_rate": 0.0001995091908331763, + "loss": 12.7313, + "step": 2226 + }, + { + "epoch": 0.12126905039036709, + "grad_norm": 0.8286799047391251, + "learning_rate": 0.00019950831783545393, + "loss": 12.6088, + "step": 2227 + }, + { + "epoch": 0.1213235043869501, + "grad_norm": 0.9007022814790493, + "learning_rate": 0.00019950744406393818, + "loss": 12.8683, + "step": 2228 + }, + { + "epoch": 0.12137795838353312, + "grad_norm": 0.7584611212657733, + "learning_rate": 0.00019950656951863588, + "loss": 12.6202, + "step": 2229 + }, + { + "epoch": 0.12143241238011612, + "grad_norm": 0.852208382943685, + "learning_rate": 0.0001995056941995538, + "loss": 12.8656, + "step": 2230 + }, + { + "epoch": 0.12148686637669913, + "grad_norm": 0.6967002768684686, + "learning_rate": 0.00019950481810669874, + "loss": 12.6892, + "step": 2231 + }, + { + "epoch": 0.12154132037328215, + "grad_norm": 0.8661162169371703, + "learning_rate": 0.00019950394124007757, + "loss": 12.5622, + "step": 2232 + }, + { + "epoch": 0.12159577436986516, + "grad_norm": 0.7339221495289832, + "learning_rate": 0.000199503063599697, + "loss": 12.6426, + "step": 2233 + }, + { + "epoch": 0.12165022836644818, + "grad_norm": 0.8185295105254345, + "learning_rate": 0.000199502185185564, + "loss": 12.76, + "step": 2234 + }, + { + "epoch": 0.12170468236303118, + "grad_norm": 0.7841512755708057, + "learning_rate": 0.00019950130599768527, + "loss": 12.8623, + "step": 2235 + }, + { + "epoch": 0.12175913635961419, + "grad_norm": 0.8584145810028976, + "learning_rate": 0.0001995004260360677, + "loss": 12.7982, + "step": 2236 + }, + { + "epoch": 0.12181359035619721, + "grad_norm": 0.8331097929496447, + "learning_rate": 0.00019949954530071814, + "loss": 12.7162, + "step": 2237 + }, + { + "epoch": 0.12186804435278022, + "grad_norm": 0.7656528692633603, + "learning_rate": 0.00019949866379164344, + "loss": 12.6292, + "step": 2238 + }, + { + "epoch": 0.12192249834936322, + "grad_norm": 0.8247940047932069, + "learning_rate": 0.00019949778150885042, + "loss": 12.5262, + "step": 2239 + }, + { + "epoch": 0.12197695234594624, + "grad_norm": 0.7842828299270601, + "learning_rate": 0.00019949689845234598, + "loss": 12.7852, + "step": 2240 + }, + { + "epoch": 0.12203140634252925, + "grad_norm": 0.8133244862462937, + "learning_rate": 0.00019949601462213696, + "loss": 12.7031, + "step": 2241 + }, + { + "epoch": 0.12208586033911227, + "grad_norm": 0.713675822257346, + "learning_rate": 0.00019949513001823026, + "loss": 12.592, + "step": 2242 + }, + { + "epoch": 0.12214031433569528, + "grad_norm": 0.8027291483722292, + "learning_rate": 0.00019949424464063274, + "loss": 12.7108, + "step": 2243 + }, + { + "epoch": 0.12219476833227828, + "grad_norm": 0.8099594121442285, + "learning_rate": 0.0001994933584893513, + "loss": 12.5928, + "step": 2244 + }, + { + "epoch": 0.1222492223288613, + "grad_norm": 0.772771167685069, + "learning_rate": 0.0001994924715643928, + "loss": 12.5073, + "step": 2245 + }, + { + "epoch": 0.12230367632544431, + "grad_norm": 0.7813372535261173, + "learning_rate": 0.00019949158386576413, + "loss": 12.4561, + "step": 2246 + }, + { + "epoch": 0.12235813032202732, + "grad_norm": 0.789388708629161, + "learning_rate": 0.00019949069539347227, + "loss": 12.7137, + "step": 2247 + }, + { + "epoch": 0.12241258431861034, + "grad_norm": 0.7318944286969588, + "learning_rate": 0.00019948980614752405, + "loss": 12.5061, + "step": 2248 + }, + { + "epoch": 0.12246703831519334, + "grad_norm": 0.8059878366303711, + "learning_rate": 0.00019948891612792645, + "loss": 12.6272, + "step": 2249 + }, + { + "epoch": 0.12252149231177636, + "grad_norm": 0.8177049921666123, + "learning_rate": 0.0001994880253346863, + "loss": 12.6948, + "step": 2250 + }, + { + "epoch": 0.12257594630835937, + "grad_norm": 0.760296658253682, + "learning_rate": 0.00019948713376781064, + "loss": 12.6752, + "step": 2251 + }, + { + "epoch": 0.12263040030494238, + "grad_norm": 0.781897864488266, + "learning_rate": 0.0001994862414273063, + "loss": 12.6803, + "step": 2252 + }, + { + "epoch": 0.1226848543015254, + "grad_norm": 0.7677497443346096, + "learning_rate": 0.00019948534831318025, + "loss": 12.7446, + "step": 2253 + }, + { + "epoch": 0.1227393082981084, + "grad_norm": 0.7855649193744076, + "learning_rate": 0.0001994844544254395, + "loss": 12.7643, + "step": 2254 + }, + { + "epoch": 0.12279376229469141, + "grad_norm": 0.8211053950537586, + "learning_rate": 0.0001994835597640909, + "loss": 12.7967, + "step": 2255 + }, + { + "epoch": 0.12284821629127443, + "grad_norm": 0.8089345545600491, + "learning_rate": 0.00019948266432914146, + "loss": 12.8287, + "step": 2256 + }, + { + "epoch": 0.12290267028785744, + "grad_norm": 0.9045745536594514, + "learning_rate": 0.00019948176812059818, + "loss": 12.7589, + "step": 2257 + }, + { + "epoch": 0.12295712428444044, + "grad_norm": 0.8362285356153368, + "learning_rate": 0.00019948087113846796, + "loss": 12.9202, + "step": 2258 + }, + { + "epoch": 0.12301157828102346, + "grad_norm": 0.7167029837106066, + "learning_rate": 0.0001994799733827578, + "loss": 12.6848, + "step": 2259 + }, + { + "epoch": 0.12306603227760647, + "grad_norm": 0.9069713997609858, + "learning_rate": 0.0001994790748534747, + "loss": 12.6965, + "step": 2260 + }, + { + "epoch": 0.12312048627418949, + "grad_norm": 0.8544585569223123, + "learning_rate": 0.00019947817555062563, + "loss": 12.774, + "step": 2261 + }, + { + "epoch": 0.1231749402707725, + "grad_norm": 0.7448066164304611, + "learning_rate": 0.00019947727547421756, + "loss": 12.6796, + "step": 2262 + }, + { + "epoch": 0.1232293942673555, + "grad_norm": 0.7350628811540768, + "learning_rate": 0.00019947637462425753, + "loss": 12.7487, + "step": 2263 + }, + { + "epoch": 0.12328384826393853, + "grad_norm": 0.8324612782436918, + "learning_rate": 0.00019947547300075254, + "loss": 12.6417, + "step": 2264 + }, + { + "epoch": 0.12333830226052153, + "grad_norm": 0.8118064750379741, + "learning_rate": 0.00019947457060370957, + "loss": 12.6985, + "step": 2265 + }, + { + "epoch": 0.12339275625710454, + "grad_norm": 0.7497361623227577, + "learning_rate": 0.00019947366743313568, + "loss": 12.6324, + "step": 2266 + }, + { + "epoch": 0.12344721025368756, + "grad_norm": 0.8414180110506619, + "learning_rate": 0.00019947276348903782, + "loss": 12.6279, + "step": 2267 + }, + { + "epoch": 0.12350166425027057, + "grad_norm": 0.7324293159778035, + "learning_rate": 0.00019947185877142314, + "loss": 12.7224, + "step": 2268 + }, + { + "epoch": 0.12355611824685359, + "grad_norm": 0.8854429184534627, + "learning_rate": 0.00019947095328029857, + "loss": 12.7163, + "step": 2269 + }, + { + "epoch": 0.12361057224343659, + "grad_norm": 0.9119324234099492, + "learning_rate": 0.00019947004701567118, + "loss": 12.8091, + "step": 2270 + }, + { + "epoch": 0.1236650262400196, + "grad_norm": 0.8905496427432454, + "learning_rate": 0.00019946913997754802, + "loss": 12.7258, + "step": 2271 + }, + { + "epoch": 0.12371948023660262, + "grad_norm": 0.9118610556427518, + "learning_rate": 0.00019946823216593614, + "loss": 12.7396, + "step": 2272 + }, + { + "epoch": 0.12377393423318563, + "grad_norm": 0.7735477988456516, + "learning_rate": 0.00019946732358084264, + "loss": 12.8204, + "step": 2273 + }, + { + "epoch": 0.12382838822976863, + "grad_norm": 0.9140441735532921, + "learning_rate": 0.00019946641422227454, + "loss": 12.6388, + "step": 2274 + }, + { + "epoch": 0.12388284222635165, + "grad_norm": 0.7906245809149918, + "learning_rate": 0.00019946550409023889, + "loss": 12.6582, + "step": 2275 + }, + { + "epoch": 0.12393729622293466, + "grad_norm": 0.9227199554541721, + "learning_rate": 0.00019946459318474283, + "loss": 12.7565, + "step": 2276 + }, + { + "epoch": 0.12399175021951768, + "grad_norm": 0.8634134768192193, + "learning_rate": 0.00019946368150579343, + "loss": 12.7553, + "step": 2277 + }, + { + "epoch": 0.12404620421610069, + "grad_norm": 0.8388803922129331, + "learning_rate": 0.00019946276905339773, + "loss": 12.872, + "step": 2278 + }, + { + "epoch": 0.12410065821268369, + "grad_norm": 0.8528527510163847, + "learning_rate": 0.00019946185582756288, + "loss": 12.8716, + "step": 2279 + }, + { + "epoch": 0.12415511220926671, + "grad_norm": 0.9367156268725985, + "learning_rate": 0.00019946094182829595, + "loss": 12.7734, + "step": 2280 + }, + { + "epoch": 0.12420956620584972, + "grad_norm": 0.7778224462800217, + "learning_rate": 0.00019946002705560406, + "loss": 12.6889, + "step": 2281 + }, + { + "epoch": 0.12426402020243273, + "grad_norm": 0.7491322077833296, + "learning_rate": 0.0001994591115094943, + "loss": 12.6262, + "step": 2282 + }, + { + "epoch": 0.12431847419901575, + "grad_norm": 0.8402926697141994, + "learning_rate": 0.00019945819518997384, + "loss": 12.7234, + "step": 2283 + }, + { + "epoch": 0.12437292819559875, + "grad_norm": 0.8636335512953711, + "learning_rate": 0.00019945727809704975, + "loss": 12.608, + "step": 2284 + }, + { + "epoch": 0.12442738219218177, + "grad_norm": 0.7371257107997498, + "learning_rate": 0.0001994563602307292, + "loss": 12.5674, + "step": 2285 + }, + { + "epoch": 0.12448183618876478, + "grad_norm": 0.7708748365298516, + "learning_rate": 0.0001994554415910193, + "loss": 12.6949, + "step": 2286 + }, + { + "epoch": 0.12453629018534779, + "grad_norm": 0.8447973867023547, + "learning_rate": 0.0001994545221779272, + "loss": 12.688, + "step": 2287 + }, + { + "epoch": 0.12459074418193081, + "grad_norm": 1.108122665821199, + "learning_rate": 0.0001994536019914601, + "loss": 12.9816, + "step": 2288 + }, + { + "epoch": 0.12464519817851381, + "grad_norm": 0.8541583693169076, + "learning_rate": 0.00019945268103162506, + "loss": 12.6994, + "step": 2289 + }, + { + "epoch": 0.12469965217509682, + "grad_norm": 0.8429082152082202, + "learning_rate": 0.00019945175929842935, + "loss": 12.7596, + "step": 2290 + }, + { + "epoch": 0.12475410617167984, + "grad_norm": 1.0515128515445635, + "learning_rate": 0.00019945083679188006, + "loss": 12.8312, + "step": 2291 + }, + { + "epoch": 0.12480856016826285, + "grad_norm": 0.8531126268343717, + "learning_rate": 0.00019944991351198435, + "loss": 12.7377, + "step": 2292 + }, + { + "epoch": 0.12486301416484587, + "grad_norm": 0.7856715624294648, + "learning_rate": 0.0001994489894587495, + "loss": 12.6877, + "step": 2293 + }, + { + "epoch": 0.12491746816142887, + "grad_norm": 1.0596739665009962, + "learning_rate": 0.00019944806463218257, + "loss": 12.6832, + "step": 2294 + }, + { + "epoch": 0.12497192215801188, + "grad_norm": 0.8027737513539224, + "learning_rate": 0.0001994471390322908, + "loss": 12.7159, + "step": 2295 + }, + { + "epoch": 0.1250263761545949, + "grad_norm": 0.9906726771062633, + "learning_rate": 0.00019944621265908147, + "loss": 12.6231, + "step": 2296 + }, + { + "epoch": 0.1250808301511779, + "grad_norm": 0.8841377858178393, + "learning_rate": 0.00019944528551256167, + "loss": 12.8105, + "step": 2297 + }, + { + "epoch": 0.12513528414776093, + "grad_norm": 0.9171916869214579, + "learning_rate": 0.00019944435759273866, + "loss": 12.7818, + "step": 2298 + }, + { + "epoch": 0.12518973814434392, + "grad_norm": 0.8517112317085067, + "learning_rate": 0.00019944342889961964, + "loss": 12.6482, + "step": 2299 + }, + { + "epoch": 0.12524419214092694, + "grad_norm": 0.8005258077612063, + "learning_rate": 0.00019944249943321186, + "loss": 12.7086, + "step": 2300 + }, + { + "epoch": 0.12529864613750996, + "grad_norm": 0.7865738100260907, + "learning_rate": 0.00019944156919352252, + "loss": 12.6759, + "step": 2301 + }, + { + "epoch": 0.12535310013409295, + "grad_norm": 0.8039482381347336, + "learning_rate": 0.00019944063818055888, + "loss": 12.6076, + "step": 2302 + }, + { + "epoch": 0.12540755413067597, + "grad_norm": 0.7764854255852931, + "learning_rate": 0.0001994397063943281, + "loss": 12.7396, + "step": 2303 + }, + { + "epoch": 0.125462008127259, + "grad_norm": 0.837158284917924, + "learning_rate": 0.00019943877383483757, + "loss": 12.5878, + "step": 2304 + }, + { + "epoch": 0.12551646212384202, + "grad_norm": 0.8371200249719656, + "learning_rate": 0.0001994378405020944, + "loss": 12.7985, + "step": 2305 + }, + { + "epoch": 0.125570916120425, + "grad_norm": 0.8137839369614454, + "learning_rate": 0.00019943690639610592, + "loss": 12.8366, + "step": 2306 + }, + { + "epoch": 0.12562537011700803, + "grad_norm": 0.8556346327405181, + "learning_rate": 0.00019943597151687937, + "loss": 12.7881, + "step": 2307 + }, + { + "epoch": 0.12567982411359105, + "grad_norm": 0.7709814149035997, + "learning_rate": 0.00019943503586442206, + "loss": 12.7591, + "step": 2308 + }, + { + "epoch": 0.12573427811017404, + "grad_norm": 0.7898640756861948, + "learning_rate": 0.00019943409943874122, + "loss": 12.6926, + "step": 2309 + }, + { + "epoch": 0.12578873210675706, + "grad_norm": 0.7367569366625164, + "learning_rate": 0.00019943316223984414, + "loss": 12.7746, + "step": 2310 + }, + { + "epoch": 0.12584318610334008, + "grad_norm": 0.7454982746348482, + "learning_rate": 0.00019943222426773813, + "loss": 12.7185, + "step": 2311 + }, + { + "epoch": 0.12589764009992307, + "grad_norm": 0.7967554594677826, + "learning_rate": 0.00019943128552243046, + "loss": 12.6746, + "step": 2312 + }, + { + "epoch": 0.1259520940965061, + "grad_norm": 0.7758457456666698, + "learning_rate": 0.00019943034600392845, + "loss": 12.783, + "step": 2313 + }, + { + "epoch": 0.12600654809308912, + "grad_norm": 0.8175081193982631, + "learning_rate": 0.00019942940571223935, + "loss": 12.7546, + "step": 2314 + }, + { + "epoch": 0.1260610020896721, + "grad_norm": 0.7223717731557533, + "learning_rate": 0.00019942846464737058, + "loss": 12.6316, + "step": 2315 + }, + { + "epoch": 0.12611545608625513, + "grad_norm": 0.7201639049114413, + "learning_rate": 0.00019942752280932937, + "loss": 12.6579, + "step": 2316 + }, + { + "epoch": 0.12616991008283815, + "grad_norm": 0.6723939651791366, + "learning_rate": 0.00019942658019812305, + "loss": 12.7038, + "step": 2317 + }, + { + "epoch": 0.12622436407942114, + "grad_norm": 0.756418238434827, + "learning_rate": 0.000199425636813759, + "loss": 12.7699, + "step": 2318 + }, + { + "epoch": 0.12627881807600416, + "grad_norm": 0.7184296523805659, + "learning_rate": 0.0001994246926562445, + "loss": 12.6331, + "step": 2319 + }, + { + "epoch": 0.12633327207258718, + "grad_norm": 0.8591570290913682, + "learning_rate": 0.0001994237477255869, + "loss": 12.7931, + "step": 2320 + }, + { + "epoch": 0.12638772606917018, + "grad_norm": 0.7893364107713335, + "learning_rate": 0.0001994228020217936, + "loss": 12.7506, + "step": 2321 + }, + { + "epoch": 0.1264421800657532, + "grad_norm": 0.8951456925670717, + "learning_rate": 0.00019942185554487193, + "loss": 12.7711, + "step": 2322 + }, + { + "epoch": 0.12649663406233622, + "grad_norm": 0.9598649738838202, + "learning_rate": 0.0001994209082948292, + "loss": 12.616, + "step": 2323 + }, + { + "epoch": 0.12655108805891924, + "grad_norm": 0.7527902561752199, + "learning_rate": 0.00019941996027167286, + "loss": 12.5971, + "step": 2324 + }, + { + "epoch": 0.12660554205550223, + "grad_norm": 0.768501031065377, + "learning_rate": 0.0001994190114754102, + "loss": 12.6549, + "step": 2325 + }, + { + "epoch": 0.12665999605208525, + "grad_norm": 0.817273213089173, + "learning_rate": 0.00019941806190604863, + "loss": 12.5479, + "step": 2326 + }, + { + "epoch": 0.12671445004866827, + "grad_norm": 0.8645238416141345, + "learning_rate": 0.00019941711156359554, + "loss": 12.8375, + "step": 2327 + }, + { + "epoch": 0.12676890404525126, + "grad_norm": 1.278825422993769, + "learning_rate": 0.00019941616044805833, + "loss": 12.6886, + "step": 2328 + }, + { + "epoch": 0.12682335804183428, + "grad_norm": 0.7926814342834742, + "learning_rate": 0.0001994152085594444, + "loss": 12.8379, + "step": 2329 + }, + { + "epoch": 0.1268778120384173, + "grad_norm": 0.8287504474355063, + "learning_rate": 0.0001994142558977611, + "loss": 12.6508, + "step": 2330 + }, + { + "epoch": 0.1269322660350003, + "grad_norm": 1.0534286243495066, + "learning_rate": 0.0001994133024630159, + "loss": 12.7352, + "step": 2331 + }, + { + "epoch": 0.12698672003158332, + "grad_norm": 0.7493620548408583, + "learning_rate": 0.00019941234825521616, + "loss": 12.7401, + "step": 2332 + }, + { + "epoch": 0.12704117402816634, + "grad_norm": 0.8318357837689145, + "learning_rate": 0.00019941139327436935, + "loss": 12.652, + "step": 2333 + }, + { + "epoch": 0.12709562802474933, + "grad_norm": 0.7715913785784512, + "learning_rate": 0.0001994104375204829, + "loss": 12.7926, + "step": 2334 + }, + { + "epoch": 0.12715008202133235, + "grad_norm": 0.7542855191671053, + "learning_rate": 0.00019940948099356418, + "loss": 12.6063, + "step": 2335 + }, + { + "epoch": 0.12720453601791537, + "grad_norm": 0.9658985909567381, + "learning_rate": 0.00019940852369362068, + "loss": 12.6083, + "step": 2336 + }, + { + "epoch": 0.12725899001449836, + "grad_norm": 0.6851974060541999, + "learning_rate": 0.00019940756562065982, + "loss": 12.5249, + "step": 2337 + }, + { + "epoch": 0.12731344401108138, + "grad_norm": 0.7785530678240814, + "learning_rate": 0.00019940660677468904, + "loss": 12.7989, + "step": 2338 + }, + { + "epoch": 0.1273678980076644, + "grad_norm": 0.7379846325689866, + "learning_rate": 0.00019940564715571586, + "loss": 12.6929, + "step": 2339 + }, + { + "epoch": 0.12742235200424742, + "grad_norm": 0.792901212371482, + "learning_rate": 0.00019940468676374765, + "loss": 12.779, + "step": 2340 + }, + { + "epoch": 0.12747680600083042, + "grad_norm": 0.7337004708678737, + "learning_rate": 0.000199403725598792, + "loss": 12.6837, + "step": 2341 + }, + { + "epoch": 0.12753125999741344, + "grad_norm": 0.7368460634198486, + "learning_rate": 0.00019940276366085623, + "loss": 12.6762, + "step": 2342 + }, + { + "epoch": 0.12758571399399646, + "grad_norm": 0.7485113299926779, + "learning_rate": 0.00019940180094994792, + "loss": 12.6945, + "step": 2343 + }, + { + "epoch": 0.12764016799057945, + "grad_norm": 0.7121957433105658, + "learning_rate": 0.00019940083746607456, + "loss": 12.6726, + "step": 2344 + }, + { + "epoch": 0.12769462198716247, + "grad_norm": 1.5233087180492444, + "learning_rate": 0.0001993998732092436, + "loss": 12.6615, + "step": 2345 + }, + { + "epoch": 0.1277490759837455, + "grad_norm": 0.7962138637784169, + "learning_rate": 0.00019939890817946259, + "loss": 12.6639, + "step": 2346 + }, + { + "epoch": 0.12780352998032848, + "grad_norm": 0.7149436169520963, + "learning_rate": 0.00019939794237673896, + "loss": 12.7055, + "step": 2347 + }, + { + "epoch": 0.1278579839769115, + "grad_norm": 0.7602346657839104, + "learning_rate": 0.00019939697580108025, + "loss": 12.7092, + "step": 2348 + }, + { + "epoch": 0.12791243797349452, + "grad_norm": 0.751533657680629, + "learning_rate": 0.00019939600845249403, + "loss": 12.6796, + "step": 2349 + }, + { + "epoch": 0.12796689197007752, + "grad_norm": 0.7980077399207712, + "learning_rate": 0.00019939504033098776, + "loss": 12.6877, + "step": 2350 + }, + { + "epoch": 0.12802134596666054, + "grad_norm": 0.7532150415596107, + "learning_rate": 0.000199394071436569, + "loss": 12.7105, + "step": 2351 + }, + { + "epoch": 0.12807579996324356, + "grad_norm": 0.7315030759058961, + "learning_rate": 0.00019939310176924523, + "loss": 12.646, + "step": 2352 + }, + { + "epoch": 0.12813025395982655, + "grad_norm": 0.7864302503318793, + "learning_rate": 0.00019939213132902408, + "loss": 12.6704, + "step": 2353 + }, + { + "epoch": 0.12818470795640957, + "grad_norm": 0.7913586628644312, + "learning_rate": 0.00019939116011591303, + "loss": 12.7471, + "step": 2354 + }, + { + "epoch": 0.1282391619529926, + "grad_norm": 0.8119181526158408, + "learning_rate": 0.00019939018812991966, + "loss": 12.7335, + "step": 2355 + }, + { + "epoch": 0.1282936159495756, + "grad_norm": 0.7735205615622864, + "learning_rate": 0.00019938921537105152, + "loss": 12.7511, + "step": 2356 + }, + { + "epoch": 0.1283480699461586, + "grad_norm": 0.7968665962275414, + "learning_rate": 0.00019938824183931617, + "loss": 12.7516, + "step": 2357 + }, + { + "epoch": 0.12840252394274163, + "grad_norm": 0.7338392610553128, + "learning_rate": 0.00019938726753472116, + "loss": 12.6123, + "step": 2358 + }, + { + "epoch": 0.12845697793932465, + "grad_norm": 0.7827847883010737, + "learning_rate": 0.00019938629245727413, + "loss": 12.7633, + "step": 2359 + }, + { + "epoch": 0.12851143193590764, + "grad_norm": 0.7450380140451616, + "learning_rate": 0.00019938531660698258, + "loss": 12.773, + "step": 2360 + }, + { + "epoch": 0.12856588593249066, + "grad_norm": 0.7989051885239961, + "learning_rate": 0.00019938433998385418, + "loss": 12.6847, + "step": 2361 + }, + { + "epoch": 0.12862033992907368, + "grad_norm": 0.7356763210194389, + "learning_rate": 0.00019938336258789647, + "loss": 12.655, + "step": 2362 + }, + { + "epoch": 0.12867479392565667, + "grad_norm": 0.8986725123346193, + "learning_rate": 0.00019938238441911705, + "loss": 12.9576, + "step": 2363 + }, + { + "epoch": 0.1287292479222397, + "grad_norm": 0.7481316704368125, + "learning_rate": 0.00019938140547752354, + "loss": 12.737, + "step": 2364 + }, + { + "epoch": 0.1287837019188227, + "grad_norm": 0.7894228636204875, + "learning_rate": 0.0001993804257631236, + "loss": 12.7876, + "step": 2365 + }, + { + "epoch": 0.1288381559154057, + "grad_norm": 0.8122936643074704, + "learning_rate": 0.0001993794452759248, + "loss": 12.7443, + "step": 2366 + }, + { + "epoch": 0.12889260991198873, + "grad_norm": 0.7765834068020498, + "learning_rate": 0.00019937846401593473, + "loss": 12.8403, + "step": 2367 + }, + { + "epoch": 0.12894706390857175, + "grad_norm": 0.6921187212435841, + "learning_rate": 0.00019937748198316105, + "loss": 12.6311, + "step": 2368 + }, + { + "epoch": 0.12900151790515474, + "grad_norm": 0.7502571830149722, + "learning_rate": 0.00019937649917761143, + "loss": 12.6613, + "step": 2369 + }, + { + "epoch": 0.12905597190173776, + "grad_norm": 0.7821551845777908, + "learning_rate": 0.0001993755155992935, + "loss": 12.8988, + "step": 2370 + }, + { + "epoch": 0.12911042589832078, + "grad_norm": 0.7418220682205031, + "learning_rate": 0.00019937453124821487, + "loss": 12.6488, + "step": 2371 + }, + { + "epoch": 0.1291648798949038, + "grad_norm": 0.8117136365721119, + "learning_rate": 0.00019937354612438321, + "loss": 12.6928, + "step": 2372 + }, + { + "epoch": 0.1292193338914868, + "grad_norm": 0.7981778766751755, + "learning_rate": 0.00019937256022780622, + "loss": 12.8224, + "step": 2373 + }, + { + "epoch": 0.1292737878880698, + "grad_norm": 0.7479712328362667, + "learning_rate": 0.00019937157355849153, + "loss": 12.6721, + "step": 2374 + }, + { + "epoch": 0.12932824188465283, + "grad_norm": 0.7884826639891351, + "learning_rate": 0.0001993705861164468, + "loss": 12.8831, + "step": 2375 + }, + { + "epoch": 0.12938269588123583, + "grad_norm": 0.7255531485760002, + "learning_rate": 0.00019936959790167974, + "loss": 12.6682, + "step": 2376 + }, + { + "epoch": 0.12943714987781885, + "grad_norm": 0.7100455346830856, + "learning_rate": 0.00019936860891419804, + "loss": 12.669, + "step": 2377 + }, + { + "epoch": 0.12949160387440187, + "grad_norm": 0.7806212789623882, + "learning_rate": 0.00019936761915400936, + "loss": 12.532, + "step": 2378 + }, + { + "epoch": 0.12954605787098486, + "grad_norm": 0.7431760599900052, + "learning_rate": 0.0001993666286211214, + "loss": 12.6516, + "step": 2379 + }, + { + "epoch": 0.12960051186756788, + "grad_norm": 0.7213877200770814, + "learning_rate": 0.00019936563731554188, + "loss": 12.6298, + "step": 2380 + }, + { + "epoch": 0.1296549658641509, + "grad_norm": 0.8736262624845132, + "learning_rate": 0.0001993646452372785, + "loss": 12.8446, + "step": 2381 + }, + { + "epoch": 0.1297094198607339, + "grad_norm": 0.7855509988563504, + "learning_rate": 0.000199363652386339, + "loss": 12.7075, + "step": 2382 + }, + { + "epoch": 0.1297638738573169, + "grad_norm": 0.7299958322594098, + "learning_rate": 0.00019936265876273104, + "loss": 12.6899, + "step": 2383 + }, + { + "epoch": 0.12981832785389993, + "grad_norm": 0.8168217259134454, + "learning_rate": 0.00019936166436646237, + "loss": 12.7358, + "step": 2384 + }, + { + "epoch": 0.12987278185048293, + "grad_norm": 0.7768691061368659, + "learning_rate": 0.00019936066919754077, + "loss": 12.705, + "step": 2385 + }, + { + "epoch": 0.12992723584706595, + "grad_norm": 0.8847332828074436, + "learning_rate": 0.00019935967325597392, + "loss": 12.7719, + "step": 2386 + }, + { + "epoch": 0.12998168984364897, + "grad_norm": 0.7083908366634801, + "learning_rate": 0.0001993586765417696, + "loss": 12.7458, + "step": 2387 + }, + { + "epoch": 0.13003614384023196, + "grad_norm": 0.7875102036153997, + "learning_rate": 0.00019935767905493557, + "loss": 12.6281, + "step": 2388 + }, + { + "epoch": 0.13009059783681498, + "grad_norm": 0.7395792011876929, + "learning_rate": 0.00019935668079547957, + "loss": 12.6863, + "step": 2389 + }, + { + "epoch": 0.130145051833398, + "grad_norm": 0.7621670186428621, + "learning_rate": 0.00019935568176340928, + "loss": 12.7617, + "step": 2390 + }, + { + "epoch": 0.13019950582998102, + "grad_norm": 0.7847645774595151, + "learning_rate": 0.00019935468195873262, + "loss": 12.7119, + "step": 2391 + }, + { + "epoch": 0.13025395982656401, + "grad_norm": 0.7375661269533667, + "learning_rate": 0.00019935368138145727, + "loss": 12.7158, + "step": 2392 + }, + { + "epoch": 0.13030841382314703, + "grad_norm": 0.8126269955672756, + "learning_rate": 0.00019935268003159101, + "loss": 12.6302, + "step": 2393 + }, + { + "epoch": 0.13036286781973005, + "grad_norm": 0.8268543789843175, + "learning_rate": 0.0001993516779091417, + "loss": 12.7678, + "step": 2394 + }, + { + "epoch": 0.13041732181631305, + "grad_norm": 0.8205187707857627, + "learning_rate": 0.00019935067501411705, + "loss": 12.6628, + "step": 2395 + }, + { + "epoch": 0.13047177581289607, + "grad_norm": 0.7378866568993926, + "learning_rate": 0.0001993496713465249, + "loss": 12.7025, + "step": 2396 + }, + { + "epoch": 0.1305262298094791, + "grad_norm": 0.877940758716135, + "learning_rate": 0.00019934866690637302, + "loss": 12.72, + "step": 2397 + }, + { + "epoch": 0.13058068380606208, + "grad_norm": 0.7267513693465012, + "learning_rate": 0.00019934766169366929, + "loss": 12.6185, + "step": 2398 + }, + { + "epoch": 0.1306351378026451, + "grad_norm": 0.6846617254702555, + "learning_rate": 0.0001993466557084214, + "loss": 12.6491, + "step": 2399 + }, + { + "epoch": 0.13068959179922812, + "grad_norm": 0.7491981620181035, + "learning_rate": 0.00019934564895063734, + "loss": 12.6363, + "step": 2400 + }, + { + "epoch": 0.13074404579581111, + "grad_norm": 0.7751118812572666, + "learning_rate": 0.00019934464142032482, + "loss": 12.7001, + "step": 2401 + }, + { + "epoch": 0.13079849979239414, + "grad_norm": 0.8264496185916214, + "learning_rate": 0.00019934363311749172, + "loss": 12.73, + "step": 2402 + }, + { + "epoch": 0.13085295378897716, + "grad_norm": 0.8321637617252079, + "learning_rate": 0.00019934262404214584, + "loss": 12.6028, + "step": 2403 + }, + { + "epoch": 0.13090740778556015, + "grad_norm": 0.7848027006827053, + "learning_rate": 0.0001993416141942951, + "loss": 12.9119, + "step": 2404 + }, + { + "epoch": 0.13096186178214317, + "grad_norm": 0.857679708710815, + "learning_rate": 0.00019934060357394725, + "loss": 12.6421, + "step": 2405 + }, + { + "epoch": 0.1310163157787262, + "grad_norm": 0.8055289289746956, + "learning_rate": 0.00019933959218111026, + "loss": 12.6846, + "step": 2406 + }, + { + "epoch": 0.1310707697753092, + "grad_norm": 0.7747666351779556, + "learning_rate": 0.0001993385800157919, + "loss": 12.7859, + "step": 2407 + }, + { + "epoch": 0.1311252237718922, + "grad_norm": 0.756568540020912, + "learning_rate": 0.00019933756707800012, + "loss": 12.8728, + "step": 2408 + }, + { + "epoch": 0.13117967776847522, + "grad_norm": 0.7997659508246298, + "learning_rate": 0.00019933655336774276, + "loss": 12.7844, + "step": 2409 + }, + { + "epoch": 0.13123413176505824, + "grad_norm": 0.8394640526431785, + "learning_rate": 0.00019933553888502767, + "loss": 12.7839, + "step": 2410 + }, + { + "epoch": 0.13128858576164124, + "grad_norm": 0.8828481353034261, + "learning_rate": 0.0001993345236298628, + "loss": 12.6463, + "step": 2411 + }, + { + "epoch": 0.13134303975822426, + "grad_norm": 0.7343617366920848, + "learning_rate": 0.000199333507602256, + "loss": 12.7517, + "step": 2412 + }, + { + "epoch": 0.13139749375480728, + "grad_norm": 0.8459936383316501, + "learning_rate": 0.0001993324908022152, + "loss": 12.7808, + "step": 2413 + }, + { + "epoch": 0.13145194775139027, + "grad_norm": 0.7608363034537843, + "learning_rate": 0.00019933147322974827, + "loss": 12.7196, + "step": 2414 + }, + { + "epoch": 0.1315064017479733, + "grad_norm": 0.6668086521669773, + "learning_rate": 0.00019933045488486318, + "loss": 12.5531, + "step": 2415 + }, + { + "epoch": 0.1315608557445563, + "grad_norm": 0.7002068468454027, + "learning_rate": 0.00019932943576756777, + "loss": 12.7032, + "step": 2416 + }, + { + "epoch": 0.1316153097411393, + "grad_norm": 0.9000736480641097, + "learning_rate": 0.00019932841587787006, + "loss": 12.6522, + "step": 2417 + }, + { + "epoch": 0.13166976373772232, + "grad_norm": 0.7721331574493356, + "learning_rate": 0.0001993273952157779, + "loss": 12.6671, + "step": 2418 + }, + { + "epoch": 0.13172421773430534, + "grad_norm": 0.8308437806311205, + "learning_rate": 0.00019932637378129926, + "loss": 12.5842, + "step": 2419 + }, + { + "epoch": 0.13177867173088834, + "grad_norm": 0.8219447869064825, + "learning_rate": 0.00019932535157444206, + "loss": 12.9223, + "step": 2420 + }, + { + "epoch": 0.13183312572747136, + "grad_norm": 0.8560251642378786, + "learning_rate": 0.00019932432859521432, + "loss": 12.7095, + "step": 2421 + }, + { + "epoch": 0.13188757972405438, + "grad_norm": 0.7308984604514989, + "learning_rate": 0.00019932330484362392, + "loss": 12.7873, + "step": 2422 + }, + { + "epoch": 0.1319420337206374, + "grad_norm": 0.810039047049506, + "learning_rate": 0.00019932228031967886, + "loss": 12.6626, + "step": 2423 + }, + { + "epoch": 0.1319964877172204, + "grad_norm": 0.7508769945880449, + "learning_rate": 0.00019932125502338706, + "loss": 12.7308, + "step": 2424 + }, + { + "epoch": 0.1320509417138034, + "grad_norm": 0.756496840443002, + "learning_rate": 0.00019932022895475653, + "loss": 12.7622, + "step": 2425 + }, + { + "epoch": 0.13210539571038643, + "grad_norm": 0.8579954679459876, + "learning_rate": 0.00019931920211379526, + "loss": 12.6203, + "step": 2426 + }, + { + "epoch": 0.13215984970696942, + "grad_norm": 0.8188439639621717, + "learning_rate": 0.00019931817450051121, + "loss": 12.777, + "step": 2427 + }, + { + "epoch": 0.13221430370355244, + "grad_norm": 0.7532868906855189, + "learning_rate": 0.00019931714611491237, + "loss": 12.8348, + "step": 2428 + }, + { + "epoch": 0.13226875770013546, + "grad_norm": 0.7958028147926751, + "learning_rate": 0.00019931611695700677, + "loss": 12.694, + "step": 2429 + }, + { + "epoch": 0.13232321169671846, + "grad_norm": 0.7814894757386551, + "learning_rate": 0.00019931508702680236, + "loss": 12.6925, + "step": 2430 + }, + { + "epoch": 0.13237766569330148, + "grad_norm": 0.8324479778200059, + "learning_rate": 0.00019931405632430722, + "loss": 12.6425, + "step": 2431 + }, + { + "epoch": 0.1324321196898845, + "grad_norm": 0.8092206373047455, + "learning_rate": 0.00019931302484952926, + "loss": 12.8163, + "step": 2432 + }, + { + "epoch": 0.1324865736864675, + "grad_norm": 0.7377978092615135, + "learning_rate": 0.00019931199260247664, + "loss": 12.5553, + "step": 2433 + }, + { + "epoch": 0.1325410276830505, + "grad_norm": 0.755796514648496, + "learning_rate": 0.00019931095958315725, + "loss": 12.7638, + "step": 2434 + }, + { + "epoch": 0.13259548167963353, + "grad_norm": 0.8040857355178929, + "learning_rate": 0.00019930992579157922, + "loss": 12.7484, + "step": 2435 + }, + { + "epoch": 0.13264993567621652, + "grad_norm": 0.7939845278937419, + "learning_rate": 0.00019930889122775054, + "loss": 12.5594, + "step": 2436 + }, + { + "epoch": 0.13270438967279954, + "grad_norm": 0.7699514168539291, + "learning_rate": 0.00019930785589167927, + "loss": 12.7626, + "step": 2437 + }, + { + "epoch": 0.13275884366938256, + "grad_norm": 1.033416481849656, + "learning_rate": 0.00019930681978337343, + "loss": 12.8065, + "step": 2438 + }, + { + "epoch": 0.13281329766596559, + "grad_norm": 0.7417687328317938, + "learning_rate": 0.00019930578290284115, + "loss": 12.6928, + "step": 2439 + }, + { + "epoch": 0.13286775166254858, + "grad_norm": 0.8562815513817471, + "learning_rate": 0.00019930474525009043, + "loss": 12.8132, + "step": 2440 + }, + { + "epoch": 0.1329222056591316, + "grad_norm": 0.8832737109439536, + "learning_rate": 0.00019930370682512936, + "loss": 12.7486, + "step": 2441 + }, + { + "epoch": 0.13297665965571462, + "grad_norm": 0.8655967771917551, + "learning_rate": 0.000199302667627966, + "loss": 12.7677, + "step": 2442 + }, + { + "epoch": 0.1330311136522976, + "grad_norm": 0.8352025276927129, + "learning_rate": 0.00019930162765860847, + "loss": 12.6883, + "step": 2443 + }, + { + "epoch": 0.13308556764888063, + "grad_norm": 0.7657269519474376, + "learning_rate": 0.0001993005869170648, + "loss": 12.639, + "step": 2444 + }, + { + "epoch": 0.13314002164546365, + "grad_norm": 1.0200413938057031, + "learning_rate": 0.00019929954540334315, + "loss": 12.6656, + "step": 2445 + }, + { + "epoch": 0.13319447564204664, + "grad_norm": 0.7208134883027584, + "learning_rate": 0.00019929850311745155, + "loss": 12.7271, + "step": 2446 + }, + { + "epoch": 0.13324892963862967, + "grad_norm": 0.8273881026402172, + "learning_rate": 0.00019929746005939813, + "loss": 12.7771, + "step": 2447 + }, + { + "epoch": 0.13330338363521269, + "grad_norm": 0.7493445587390899, + "learning_rate": 0.00019929641622919104, + "loss": 12.6896, + "step": 2448 + }, + { + "epoch": 0.13335783763179568, + "grad_norm": 0.742375521459718, + "learning_rate": 0.00019929537162683835, + "loss": 12.6528, + "step": 2449 + }, + { + "epoch": 0.1334122916283787, + "grad_norm": 0.9208561020185162, + "learning_rate": 0.0001992943262523482, + "loss": 12.7464, + "step": 2450 + }, + { + "epoch": 0.13346674562496172, + "grad_norm": 0.7293788209577327, + "learning_rate": 0.0001992932801057287, + "loss": 12.6756, + "step": 2451 + }, + { + "epoch": 0.1335211996215447, + "grad_norm": 0.9920197065465052, + "learning_rate": 0.00019929223318698803, + "loss": 12.7579, + "step": 2452 + }, + { + "epoch": 0.13357565361812773, + "grad_norm": 0.9922516429306217, + "learning_rate": 0.0001992911854961343, + "loss": 12.6291, + "step": 2453 + }, + { + "epoch": 0.13363010761471075, + "grad_norm": 0.8123075917723392, + "learning_rate": 0.00019929013703317563, + "loss": 12.7533, + "step": 2454 + }, + { + "epoch": 0.13368456161129375, + "grad_norm": 0.8599656764223896, + "learning_rate": 0.00019928908779812026, + "loss": 12.6901, + "step": 2455 + }, + { + "epoch": 0.13373901560787677, + "grad_norm": 0.8907052736792533, + "learning_rate": 0.00019928803779097623, + "loss": 12.7852, + "step": 2456 + }, + { + "epoch": 0.13379346960445979, + "grad_norm": 0.8338265561480035, + "learning_rate": 0.00019928698701175178, + "loss": 12.6099, + "step": 2457 + }, + { + "epoch": 0.1338479236010428, + "grad_norm": 0.9079405685140904, + "learning_rate": 0.0001992859354604551, + "loss": 12.6389, + "step": 2458 + }, + { + "epoch": 0.1339023775976258, + "grad_norm": 0.8278803604932522, + "learning_rate": 0.00019928488313709434, + "loss": 12.7932, + "step": 2459 + }, + { + "epoch": 0.13395683159420882, + "grad_norm": 0.8003952204297358, + "learning_rate": 0.00019928383004167764, + "loss": 12.7772, + "step": 2460 + }, + { + "epoch": 0.13401128559079184, + "grad_norm": 0.672183505197408, + "learning_rate": 0.00019928277617421326, + "loss": 12.5482, + "step": 2461 + }, + { + "epoch": 0.13406573958737483, + "grad_norm": 0.7396340985238629, + "learning_rate": 0.00019928172153470933, + "loss": 12.7081, + "step": 2462 + }, + { + "epoch": 0.13412019358395785, + "grad_norm": 0.735938888362427, + "learning_rate": 0.00019928066612317412, + "loss": 12.5721, + "step": 2463 + }, + { + "epoch": 0.13417464758054087, + "grad_norm": 0.7331259242018381, + "learning_rate": 0.00019927960993961578, + "loss": 12.5909, + "step": 2464 + }, + { + "epoch": 0.13422910157712387, + "grad_norm": 0.8850945952602003, + "learning_rate": 0.00019927855298404253, + "loss": 12.7318, + "step": 2465 + }, + { + "epoch": 0.1342835555737069, + "grad_norm": 0.7471688438849723, + "learning_rate": 0.00019927749525646264, + "loss": 12.6139, + "step": 2466 + }, + { + "epoch": 0.1343380095702899, + "grad_norm": 0.8094440813264298, + "learning_rate": 0.0001992764367568843, + "loss": 12.8425, + "step": 2467 + }, + { + "epoch": 0.1343924635668729, + "grad_norm": 0.8310365902118073, + "learning_rate": 0.0001992753774853157, + "loss": 12.652, + "step": 2468 + }, + { + "epoch": 0.13444691756345592, + "grad_norm": 0.7309480519652607, + "learning_rate": 0.00019927431744176514, + "loss": 12.6415, + "step": 2469 + }, + { + "epoch": 0.13450137156003894, + "grad_norm": 0.8196147699360571, + "learning_rate": 0.00019927325662624082, + "loss": 12.7008, + "step": 2470 + }, + { + "epoch": 0.13455582555662193, + "grad_norm": 0.8298741408117133, + "learning_rate": 0.00019927219503875103, + "loss": 12.7277, + "step": 2471 + }, + { + "epoch": 0.13461027955320495, + "grad_norm": 0.6554252989687059, + "learning_rate": 0.00019927113267930398, + "loss": 12.6143, + "step": 2472 + }, + { + "epoch": 0.13466473354978797, + "grad_norm": 0.7473824587961554, + "learning_rate": 0.000199270069547908, + "loss": 12.6095, + "step": 2473 + }, + { + "epoch": 0.134719187546371, + "grad_norm": 0.8160647428862018, + "learning_rate": 0.00019926900564457128, + "loss": 12.692, + "step": 2474 + }, + { + "epoch": 0.134773641542954, + "grad_norm": 0.7492014235360235, + "learning_rate": 0.0001992679409693021, + "loss": 12.7103, + "step": 2475 + }, + { + "epoch": 0.134828095539537, + "grad_norm": 0.839607875095279, + "learning_rate": 0.0001992668755221088, + "loss": 12.8071, + "step": 2476 + }, + { + "epoch": 0.13488254953612003, + "grad_norm": 0.801927344699619, + "learning_rate": 0.00019926580930299963, + "loss": 12.7339, + "step": 2477 + }, + { + "epoch": 0.13493700353270302, + "grad_norm": 0.8960165647196202, + "learning_rate": 0.00019926474231198285, + "loss": 12.6735, + "step": 2478 + }, + { + "epoch": 0.13499145752928604, + "grad_norm": 0.7933722921821766, + "learning_rate": 0.0001992636745490668, + "loss": 12.7974, + "step": 2479 + }, + { + "epoch": 0.13504591152586906, + "grad_norm": 0.9451088853717208, + "learning_rate": 0.0001992626060142598, + "loss": 12.5382, + "step": 2480 + }, + { + "epoch": 0.13510036552245205, + "grad_norm": 0.8345545262101434, + "learning_rate": 0.00019926153670757007, + "loss": 12.7088, + "step": 2481 + }, + { + "epoch": 0.13515481951903507, + "grad_norm": 0.7211833903776345, + "learning_rate": 0.00019926046662900606, + "loss": 12.6001, + "step": 2482 + }, + { + "epoch": 0.1352092735156181, + "grad_norm": 0.8170867246642093, + "learning_rate": 0.00019925939577857595, + "loss": 12.7586, + "step": 2483 + }, + { + "epoch": 0.1352637275122011, + "grad_norm": 0.8766455026502107, + "learning_rate": 0.00019925832415628817, + "loss": 12.8397, + "step": 2484 + }, + { + "epoch": 0.1353181815087841, + "grad_norm": 0.8304795223103192, + "learning_rate": 0.000199257251762151, + "loss": 12.8194, + "step": 2485 + }, + { + "epoch": 0.13537263550536713, + "grad_norm": 0.8258332528487458, + "learning_rate": 0.0001992561785961728, + "loss": 12.8221, + "step": 2486 + }, + { + "epoch": 0.13542708950195012, + "grad_norm": 0.7722041721631009, + "learning_rate": 0.0001992551046583619, + "loss": 12.6855, + "step": 2487 + }, + { + "epoch": 0.13548154349853314, + "grad_norm": 0.8196913899421888, + "learning_rate": 0.00019925402994872666, + "loss": 12.6272, + "step": 2488 + }, + { + "epoch": 0.13553599749511616, + "grad_norm": 0.8220844770061915, + "learning_rate": 0.00019925295446727548, + "loss": 12.7254, + "step": 2489 + }, + { + "epoch": 0.13559045149169918, + "grad_norm": 0.7441266743639062, + "learning_rate": 0.00019925187821401663, + "loss": 12.6433, + "step": 2490 + }, + { + "epoch": 0.13564490548828217, + "grad_norm": 0.7922384453675508, + "learning_rate": 0.00019925080118895857, + "loss": 12.7242, + "step": 2491 + }, + { + "epoch": 0.1356993594848652, + "grad_norm": 0.7615577137990498, + "learning_rate": 0.00019924972339210962, + "loss": 12.6124, + "step": 2492 + }, + { + "epoch": 0.13575381348144822, + "grad_norm": 0.8763841639212409, + "learning_rate": 0.00019924864482347813, + "loss": 12.7149, + "step": 2493 + }, + { + "epoch": 0.1358082674780312, + "grad_norm": 0.755861381321944, + "learning_rate": 0.00019924756548307258, + "loss": 12.7621, + "step": 2494 + }, + { + "epoch": 0.13586272147461423, + "grad_norm": 0.7151226118739447, + "learning_rate": 0.00019924648537090133, + "loss": 12.5913, + "step": 2495 + }, + { + "epoch": 0.13591717547119725, + "grad_norm": 0.7134477879040643, + "learning_rate": 0.00019924540448697275, + "loss": 12.7356, + "step": 2496 + }, + { + "epoch": 0.13597162946778024, + "grad_norm": 0.7416597457191845, + "learning_rate": 0.00019924432283129526, + "loss": 12.6654, + "step": 2497 + }, + { + "epoch": 0.13602608346436326, + "grad_norm": 0.8960485887635284, + "learning_rate": 0.00019924324040387727, + "loss": 12.6783, + "step": 2498 + }, + { + "epoch": 0.13608053746094628, + "grad_norm": 0.8043344559108634, + "learning_rate": 0.0001992421572047272, + "loss": 12.6638, + "step": 2499 + }, + { + "epoch": 0.13613499145752928, + "grad_norm": 0.8477672467383391, + "learning_rate": 0.00019924107323385348, + "loss": 12.6786, + "step": 2500 + }, + { + "epoch": 0.1361894454541123, + "grad_norm": 0.8350506299141978, + "learning_rate": 0.00019923998849126452, + "loss": 12.7616, + "step": 2501 + }, + { + "epoch": 0.13624389945069532, + "grad_norm": 0.755115387664609, + "learning_rate": 0.0001992389029769688, + "loss": 12.5811, + "step": 2502 + }, + { + "epoch": 0.1362983534472783, + "grad_norm": 0.8296141248216201, + "learning_rate": 0.00019923781669097467, + "loss": 12.697, + "step": 2503 + }, + { + "epoch": 0.13635280744386133, + "grad_norm": 0.777487444662502, + "learning_rate": 0.00019923672963329068, + "loss": 12.9226, + "step": 2504 + }, + { + "epoch": 0.13640726144044435, + "grad_norm": 0.8009633102281982, + "learning_rate": 0.00019923564180392522, + "loss": 12.7052, + "step": 2505 + }, + { + "epoch": 0.13646171543702737, + "grad_norm": 0.8660245138597285, + "learning_rate": 0.00019923455320288678, + "loss": 12.6371, + "step": 2506 + }, + { + "epoch": 0.13651616943361036, + "grad_norm": 0.799333483782979, + "learning_rate": 0.0001992334638301838, + "loss": 12.5216, + "step": 2507 + }, + { + "epoch": 0.13657062343019338, + "grad_norm": 0.7890530468567644, + "learning_rate": 0.00019923237368582478, + "loss": 12.7124, + "step": 2508 + }, + { + "epoch": 0.1366250774267764, + "grad_norm": 0.8051322273979854, + "learning_rate": 0.00019923128276981816, + "loss": 12.7529, + "step": 2509 + }, + { + "epoch": 0.1366795314233594, + "grad_norm": 0.7613188323047255, + "learning_rate": 0.00019923019108217244, + "loss": 12.6688, + "step": 2510 + }, + { + "epoch": 0.13673398541994242, + "grad_norm": 0.7175554055582198, + "learning_rate": 0.00019922909862289613, + "loss": 12.6997, + "step": 2511 + }, + { + "epoch": 0.13678843941652544, + "grad_norm": 0.7617772690976709, + "learning_rate": 0.00019922800539199772, + "loss": 12.7001, + "step": 2512 + }, + { + "epoch": 0.13684289341310843, + "grad_norm": 0.8815313899645151, + "learning_rate": 0.00019922691138948566, + "loss": 12.7137, + "step": 2513 + }, + { + "epoch": 0.13689734740969145, + "grad_norm": 0.6953988947363619, + "learning_rate": 0.0001992258166153685, + "loss": 12.6242, + "step": 2514 + }, + { + "epoch": 0.13695180140627447, + "grad_norm": 0.7211808446872298, + "learning_rate": 0.00019922472106965482, + "loss": 12.6297, + "step": 2515 + }, + { + "epoch": 0.13700625540285746, + "grad_norm": 0.7805289346970223, + "learning_rate": 0.00019922362475235301, + "loss": 12.7185, + "step": 2516 + }, + { + "epoch": 0.13706070939944048, + "grad_norm": 0.7435082825109165, + "learning_rate": 0.00019922252766347166, + "loss": 12.6931, + "step": 2517 + }, + { + "epoch": 0.1371151633960235, + "grad_norm": 0.7593964045770272, + "learning_rate": 0.0001992214298030193, + "loss": 12.7587, + "step": 2518 + }, + { + "epoch": 0.1371696173926065, + "grad_norm": 1.0711067900247304, + "learning_rate": 0.00019922033117100447, + "loss": 12.7403, + "step": 2519 + }, + { + "epoch": 0.13722407138918952, + "grad_norm": 0.8442713964578787, + "learning_rate": 0.0001992192317674357, + "loss": 12.6896, + "step": 2520 + }, + { + "epoch": 0.13727852538577254, + "grad_norm": 0.7379985680047709, + "learning_rate": 0.00019921813159232151, + "loss": 12.5902, + "step": 2521 + }, + { + "epoch": 0.13733297938235553, + "grad_norm": 0.641792785472628, + "learning_rate": 0.00019921703064567056, + "loss": 12.5831, + "step": 2522 + }, + { + "epoch": 0.13738743337893855, + "grad_norm": 0.7085982767459995, + "learning_rate": 0.0001992159289274913, + "loss": 12.5308, + "step": 2523 + }, + { + "epoch": 0.13744188737552157, + "grad_norm": 0.8689263472913269, + "learning_rate": 0.00019921482643779235, + "loss": 12.6573, + "step": 2524 + }, + { + "epoch": 0.1374963413721046, + "grad_norm": 0.8102736269735316, + "learning_rate": 0.00019921372317658224, + "loss": 12.7661, + "step": 2525 + }, + { + "epoch": 0.13755079536868758, + "grad_norm": 0.9204667558381068, + "learning_rate": 0.00019921261914386963, + "loss": 12.6912, + "step": 2526 + }, + { + "epoch": 0.1376052493652706, + "grad_norm": 0.7654727932975922, + "learning_rate": 0.000199211514339663, + "loss": 12.7644, + "step": 2527 + }, + { + "epoch": 0.13765970336185362, + "grad_norm": 1.0129557668524476, + "learning_rate": 0.00019921040876397103, + "loss": 12.7797, + "step": 2528 + }, + { + "epoch": 0.13771415735843662, + "grad_norm": 0.7456783582541698, + "learning_rate": 0.0001992093024168023, + "loss": 12.6864, + "step": 2529 + }, + { + "epoch": 0.13776861135501964, + "grad_norm": 0.7895332665858877, + "learning_rate": 0.00019920819529816534, + "loss": 12.686, + "step": 2530 + }, + { + "epoch": 0.13782306535160266, + "grad_norm": 0.7372087027065335, + "learning_rate": 0.0001992070874080689, + "loss": 12.7634, + "step": 2531 + }, + { + "epoch": 0.13787751934818565, + "grad_norm": 0.8003564961950486, + "learning_rate": 0.00019920597874652143, + "loss": 12.6631, + "step": 2532 + }, + { + "epoch": 0.13793197334476867, + "grad_norm": 0.7813288190540522, + "learning_rate": 0.0001992048693135317, + "loss": 12.6787, + "step": 2533 + }, + { + "epoch": 0.1379864273413517, + "grad_norm": 0.7680793879946148, + "learning_rate": 0.0001992037591091082, + "loss": 12.772, + "step": 2534 + }, + { + "epoch": 0.13804088133793468, + "grad_norm": 0.7710086521748657, + "learning_rate": 0.00019920264813325965, + "loss": 12.6736, + "step": 2535 + }, + { + "epoch": 0.1380953353345177, + "grad_norm": 0.7414826352875737, + "learning_rate": 0.0001992015363859947, + "loss": 12.7004, + "step": 2536 + }, + { + "epoch": 0.13814978933110073, + "grad_norm": 0.8210043929211484, + "learning_rate": 0.00019920042386732196, + "loss": 12.6852, + "step": 2537 + }, + { + "epoch": 0.13820424332768372, + "grad_norm": 0.7534293641843519, + "learning_rate": 0.0001991993105772501, + "loss": 12.6567, + "step": 2538 + }, + { + "epoch": 0.13825869732426674, + "grad_norm": 0.8809000111534994, + "learning_rate": 0.00019919819651578776, + "loss": 12.7414, + "step": 2539 + }, + { + "epoch": 0.13831315132084976, + "grad_norm": 0.7685448480984839, + "learning_rate": 0.0001991970816829436, + "loss": 12.686, + "step": 2540 + }, + { + "epoch": 0.13836760531743278, + "grad_norm": 0.7667664978318831, + "learning_rate": 0.0001991959660787263, + "loss": 12.6296, + "step": 2541 + }, + { + "epoch": 0.13842205931401577, + "grad_norm": 0.810234524969098, + "learning_rate": 0.00019919484970314452, + "loss": 12.7798, + "step": 2542 + }, + { + "epoch": 0.1384765133105988, + "grad_norm": 0.7663707870937774, + "learning_rate": 0.00019919373255620695, + "loss": 12.6499, + "step": 2543 + }, + { + "epoch": 0.1385309673071818, + "grad_norm": 0.7614050036552538, + "learning_rate": 0.0001991926146379223, + "loss": 12.7253, + "step": 2544 + }, + { + "epoch": 0.1385854213037648, + "grad_norm": 0.7614062286590654, + "learning_rate": 0.00019919149594829926, + "loss": 12.6519, + "step": 2545 + }, + { + "epoch": 0.13863987530034783, + "grad_norm": 0.7781884061851773, + "learning_rate": 0.00019919037648734647, + "loss": 12.7314, + "step": 2546 + }, + { + "epoch": 0.13869432929693085, + "grad_norm": 0.7678996328527208, + "learning_rate": 0.0001991892562550727, + "loss": 12.628, + "step": 2547 + }, + { + "epoch": 0.13874878329351384, + "grad_norm": 0.8359242351768541, + "learning_rate": 0.00019918813525148665, + "loss": 12.6445, + "step": 2548 + }, + { + "epoch": 0.13880323729009686, + "grad_norm": 0.772133255022356, + "learning_rate": 0.000199187013476597, + "loss": 12.7278, + "step": 2549 + }, + { + "epoch": 0.13885769128667988, + "grad_norm": 0.9285555529661113, + "learning_rate": 0.0001991858909304125, + "loss": 12.6475, + "step": 2550 + }, + { + "epoch": 0.13891214528326287, + "grad_norm": 0.748614685498673, + "learning_rate": 0.00019918476761294191, + "loss": 12.6584, + "step": 2551 + }, + { + "epoch": 0.1389665992798459, + "grad_norm": 0.9228432707306571, + "learning_rate": 0.00019918364352419391, + "loss": 12.6811, + "step": 2552 + }, + { + "epoch": 0.1390210532764289, + "grad_norm": 0.8046804415097579, + "learning_rate": 0.00019918251866417729, + "loss": 12.5896, + "step": 2553 + }, + { + "epoch": 0.1390755072730119, + "grad_norm": 0.841468945990471, + "learning_rate": 0.00019918139303290073, + "loss": 12.5972, + "step": 2554 + }, + { + "epoch": 0.13912996126959493, + "grad_norm": 1.0219679946081544, + "learning_rate": 0.00019918026663037305, + "loss": 12.4939, + "step": 2555 + }, + { + "epoch": 0.13918441526617795, + "grad_norm": 0.7614948596107192, + "learning_rate": 0.000199179139456603, + "loss": 12.7957, + "step": 2556 + }, + { + "epoch": 0.13923886926276097, + "grad_norm": 1.125763541075716, + "learning_rate": 0.00019917801151159931, + "loss": 12.6848, + "step": 2557 + }, + { + "epoch": 0.13929332325934396, + "grad_norm": 0.7167558919471144, + "learning_rate": 0.00019917688279537076, + "loss": 12.6508, + "step": 2558 + }, + { + "epoch": 0.13934777725592698, + "grad_norm": 1.0221540450131401, + "learning_rate": 0.00019917575330792616, + "loss": 12.6141, + "step": 2559 + }, + { + "epoch": 0.13940223125251, + "grad_norm": 0.863351781333219, + "learning_rate": 0.00019917462304927424, + "loss": 12.7706, + "step": 2560 + }, + { + "epoch": 0.139456685249093, + "grad_norm": 0.845744534563339, + "learning_rate": 0.00019917349201942385, + "loss": 12.8127, + "step": 2561 + }, + { + "epoch": 0.139511139245676, + "grad_norm": 0.8801687590672354, + "learning_rate": 0.00019917236021838375, + "loss": 12.7274, + "step": 2562 + }, + { + "epoch": 0.13956559324225903, + "grad_norm": 0.7298449140222769, + "learning_rate": 0.00019917122764616274, + "loss": 12.779, + "step": 2563 + }, + { + "epoch": 0.13962004723884203, + "grad_norm": 0.7832122595156031, + "learning_rate": 0.00019917009430276962, + "loss": 12.5244, + "step": 2564 + }, + { + "epoch": 0.13967450123542505, + "grad_norm": 0.7527806598249291, + "learning_rate": 0.00019916896018821323, + "loss": 12.5743, + "step": 2565 + }, + { + "epoch": 0.13972895523200807, + "grad_norm": 0.7631874231541864, + "learning_rate": 0.0001991678253025024, + "loss": 12.6855, + "step": 2566 + }, + { + "epoch": 0.13978340922859106, + "grad_norm": 0.7998953837645839, + "learning_rate": 0.0001991666896456459, + "loss": 12.6305, + "step": 2567 + }, + { + "epoch": 0.13983786322517408, + "grad_norm": 0.7844095551718671, + "learning_rate": 0.00019916555321765258, + "loss": 12.7029, + "step": 2568 + }, + { + "epoch": 0.1398923172217571, + "grad_norm": 0.761532138143132, + "learning_rate": 0.0001991644160185313, + "loss": 12.6633, + "step": 2569 + }, + { + "epoch": 0.1399467712183401, + "grad_norm": 0.7509558404118452, + "learning_rate": 0.0001991632780482909, + "loss": 12.7437, + "step": 2570 + }, + { + "epoch": 0.14000122521492311, + "grad_norm": 0.7299977982300054, + "learning_rate": 0.0001991621393069402, + "loss": 12.5211, + "step": 2571 + }, + { + "epoch": 0.14005567921150613, + "grad_norm": 0.8211312042072629, + "learning_rate": 0.0001991609997944881, + "loss": 12.7196, + "step": 2572 + }, + { + "epoch": 0.14011013320808915, + "grad_norm": 0.735678027919518, + "learning_rate": 0.00019915985951094342, + "loss": 12.6854, + "step": 2573 + }, + { + "epoch": 0.14016458720467215, + "grad_norm": 0.7765696874719373, + "learning_rate": 0.00019915871845631506, + "loss": 12.6538, + "step": 2574 + }, + { + "epoch": 0.14021904120125517, + "grad_norm": 0.7117218844333242, + "learning_rate": 0.00019915757663061188, + "loss": 12.6685, + "step": 2575 + }, + { + "epoch": 0.1402734951978382, + "grad_norm": 0.8627660601894691, + "learning_rate": 0.00019915643403384272, + "loss": 12.7439, + "step": 2576 + }, + { + "epoch": 0.14032794919442118, + "grad_norm": 0.7356120518935028, + "learning_rate": 0.00019915529066601652, + "loss": 12.8082, + "step": 2577 + }, + { + "epoch": 0.1403824031910042, + "grad_norm": 0.7130187369102607, + "learning_rate": 0.00019915414652714217, + "loss": 12.7221, + "step": 2578 + }, + { + "epoch": 0.14043685718758722, + "grad_norm": 0.7332688257439224, + "learning_rate": 0.00019915300161722852, + "loss": 12.7258, + "step": 2579 + }, + { + "epoch": 0.14049131118417021, + "grad_norm": 0.7630906052367811, + "learning_rate": 0.00019915185593628453, + "loss": 12.6905, + "step": 2580 + }, + { + "epoch": 0.14054576518075323, + "grad_norm": 0.9138139283846225, + "learning_rate": 0.00019915070948431905, + "loss": 12.6803, + "step": 2581 + }, + { + "epoch": 0.14060021917733626, + "grad_norm": 0.7234398462032869, + "learning_rate": 0.00019914956226134103, + "loss": 12.669, + "step": 2582 + }, + { + "epoch": 0.14065467317391925, + "grad_norm": 0.7252327764689702, + "learning_rate": 0.0001991484142673594, + "loss": 12.6976, + "step": 2583 + }, + { + "epoch": 0.14070912717050227, + "grad_norm": 0.7898203683406123, + "learning_rate": 0.00019914726550238307, + "loss": 12.8086, + "step": 2584 + }, + { + "epoch": 0.1407635811670853, + "grad_norm": 0.752966658793422, + "learning_rate": 0.00019914611596642096, + "loss": 12.6919, + "step": 2585 + }, + { + "epoch": 0.14081803516366828, + "grad_norm": 0.9067939979456567, + "learning_rate": 0.00019914496565948207, + "loss": 12.6859, + "step": 2586 + }, + { + "epoch": 0.1408724891602513, + "grad_norm": 0.715271796852438, + "learning_rate": 0.00019914381458157525, + "loss": 12.6533, + "step": 2587 + }, + { + "epoch": 0.14092694315683432, + "grad_norm": 0.763482898726477, + "learning_rate": 0.00019914266273270953, + "loss": 12.5664, + "step": 2588 + }, + { + "epoch": 0.14098139715341731, + "grad_norm": 0.7376861678712301, + "learning_rate": 0.0001991415101128938, + "loss": 12.6123, + "step": 2589 + }, + { + "epoch": 0.14103585115000034, + "grad_norm": 0.8084686482048893, + "learning_rate": 0.00019914035672213712, + "loss": 12.6054, + "step": 2590 + }, + { + "epoch": 0.14109030514658336, + "grad_norm": 0.8438341417341487, + "learning_rate": 0.00019913920256044837, + "loss": 12.7978, + "step": 2591 + }, + { + "epoch": 0.14114475914316638, + "grad_norm": 0.7656913542268167, + "learning_rate": 0.00019913804762783656, + "loss": 12.6378, + "step": 2592 + }, + { + "epoch": 0.14119921313974937, + "grad_norm": 0.7226525503659902, + "learning_rate": 0.00019913689192431065, + "loss": 12.6956, + "step": 2593 + }, + { + "epoch": 0.1412536671363324, + "grad_norm": 0.7566159616987096, + "learning_rate": 0.00019913573544987968, + "loss": 12.7507, + "step": 2594 + }, + { + "epoch": 0.1413081211329154, + "grad_norm": 0.9359513791113954, + "learning_rate": 0.00019913457820455258, + "loss": 12.7758, + "step": 2595 + }, + { + "epoch": 0.1413625751294984, + "grad_norm": 0.7716699789726437, + "learning_rate": 0.00019913342018833835, + "loss": 12.5928, + "step": 2596 + }, + { + "epoch": 0.14141702912608142, + "grad_norm": 0.855705424725422, + "learning_rate": 0.00019913226140124608, + "loss": 12.6545, + "step": 2597 + }, + { + "epoch": 0.14147148312266444, + "grad_norm": 0.7477286757754348, + "learning_rate": 0.00019913110184328466, + "loss": 12.6477, + "step": 2598 + }, + { + "epoch": 0.14152593711924744, + "grad_norm": 0.7656128177871533, + "learning_rate": 0.0001991299415144632, + "loss": 12.6694, + "step": 2599 + }, + { + "epoch": 0.14158039111583046, + "grad_norm": 0.8201103526357508, + "learning_rate": 0.00019912878041479067, + "loss": 12.7311, + "step": 2600 + }, + { + "epoch": 0.14163484511241348, + "grad_norm": 0.7757724450627341, + "learning_rate": 0.00019912761854427616, + "loss": 12.7812, + "step": 2601 + }, + { + "epoch": 0.14168929910899647, + "grad_norm": 0.7652702562645756, + "learning_rate": 0.00019912645590292865, + "loss": 12.8233, + "step": 2602 + }, + { + "epoch": 0.1417437531055795, + "grad_norm": 0.8294807319246501, + "learning_rate": 0.00019912529249075718, + "loss": 12.606, + "step": 2603 + }, + { + "epoch": 0.1417982071021625, + "grad_norm": 0.7085809011050793, + "learning_rate": 0.0001991241283077708, + "loss": 12.6265, + "step": 2604 + }, + { + "epoch": 0.1418526610987455, + "grad_norm": 0.7706834778729315, + "learning_rate": 0.00019912296335397863, + "loss": 12.7192, + "step": 2605 + }, + { + "epoch": 0.14190711509532852, + "grad_norm": 0.6831567962305096, + "learning_rate": 0.00019912179762938964, + "loss": 12.5742, + "step": 2606 + }, + { + "epoch": 0.14196156909191154, + "grad_norm": 0.7245976056716881, + "learning_rate": 0.0001991206311340129, + "loss": 12.7203, + "step": 2607 + }, + { + "epoch": 0.14201602308849456, + "grad_norm": 0.7092967526585179, + "learning_rate": 0.00019911946386785755, + "loss": 12.7227, + "step": 2608 + }, + { + "epoch": 0.14207047708507756, + "grad_norm": 0.8299215978940133, + "learning_rate": 0.0001991182958309326, + "loss": 12.6424, + "step": 2609 + }, + { + "epoch": 0.14212493108166058, + "grad_norm": 0.7427920778837738, + "learning_rate": 0.00019911712702324716, + "loss": 12.7091, + "step": 2610 + }, + { + "epoch": 0.1421793850782436, + "grad_norm": 0.8184591293143213, + "learning_rate": 0.00019911595744481034, + "loss": 12.7721, + "step": 2611 + }, + { + "epoch": 0.1422338390748266, + "grad_norm": 0.8594294714337313, + "learning_rate": 0.00019911478709563123, + "loss": 12.7675, + "step": 2612 + }, + { + "epoch": 0.1422882930714096, + "grad_norm": 0.7905177187484469, + "learning_rate": 0.00019911361597571887, + "loss": 12.7313, + "step": 2613 + }, + { + "epoch": 0.14234274706799263, + "grad_norm": 0.6542339698773333, + "learning_rate": 0.00019911244408508241, + "loss": 12.6825, + "step": 2614 + }, + { + "epoch": 0.14239720106457562, + "grad_norm": 0.6864872742719939, + "learning_rate": 0.000199111271423731, + "loss": 12.7059, + "step": 2615 + }, + { + "epoch": 0.14245165506115864, + "grad_norm": 0.8509600880961752, + "learning_rate": 0.0001991100979916737, + "loss": 12.825, + "step": 2616 + }, + { + "epoch": 0.14250610905774166, + "grad_norm": 0.8027983418211375, + "learning_rate": 0.00019910892378891966, + "loss": 12.7105, + "step": 2617 + }, + { + "epoch": 0.14256056305432466, + "grad_norm": 0.7234317331840034, + "learning_rate": 0.000199107748815478, + "loss": 12.6505, + "step": 2618 + }, + { + "epoch": 0.14261501705090768, + "grad_norm": 0.7664867974063245, + "learning_rate": 0.0001991065730713579, + "loss": 12.7388, + "step": 2619 + }, + { + "epoch": 0.1426694710474907, + "grad_norm": 0.7636742822664422, + "learning_rate": 0.00019910539655656844, + "loss": 12.6586, + "step": 2620 + }, + { + "epoch": 0.1427239250440737, + "grad_norm": 0.785599042442663, + "learning_rate": 0.0001991042192711188, + "loss": 12.6612, + "step": 2621 + }, + { + "epoch": 0.1427783790406567, + "grad_norm": 0.6841416305877701, + "learning_rate": 0.00019910304121501811, + "loss": 12.5579, + "step": 2622 + }, + { + "epoch": 0.14283283303723973, + "grad_norm": 0.7405822791815138, + "learning_rate": 0.00019910186238827557, + "loss": 12.7123, + "step": 2623 + }, + { + "epoch": 0.14288728703382275, + "grad_norm": 0.7209279711717004, + "learning_rate": 0.00019910068279090036, + "loss": 12.6044, + "step": 2624 + }, + { + "epoch": 0.14294174103040574, + "grad_norm": 0.8261997294453759, + "learning_rate": 0.0001990995024229016, + "loss": 12.6075, + "step": 2625 + }, + { + "epoch": 0.14299619502698876, + "grad_norm": 0.7613534265460365, + "learning_rate": 0.00019909832128428846, + "loss": 12.6447, + "step": 2626 + }, + { + "epoch": 0.14305064902357179, + "grad_norm": 0.7563704893104031, + "learning_rate": 0.0001990971393750702, + "loss": 12.5855, + "step": 2627 + }, + { + "epoch": 0.14310510302015478, + "grad_norm": 0.7399588497910308, + "learning_rate": 0.00019909595669525594, + "loss": 12.6363, + "step": 2628 + }, + { + "epoch": 0.1431595570167378, + "grad_norm": 0.7423730889180716, + "learning_rate": 0.0001990947732448549, + "loss": 12.623, + "step": 2629 + }, + { + "epoch": 0.14321401101332082, + "grad_norm": 0.7780993727686799, + "learning_rate": 0.00019909358902387626, + "loss": 12.6195, + "step": 2630 + }, + { + "epoch": 0.1432684650099038, + "grad_norm": 0.695759998723745, + "learning_rate": 0.0001990924040323293, + "loss": 12.6837, + "step": 2631 + }, + { + "epoch": 0.14332291900648683, + "grad_norm": 0.7818291616707426, + "learning_rate": 0.0001990912182702232, + "loss": 12.7589, + "step": 2632 + }, + { + "epoch": 0.14337737300306985, + "grad_norm": 0.728177780364699, + "learning_rate": 0.0001990900317375671, + "loss": 12.6438, + "step": 2633 + }, + { + "epoch": 0.14343182699965284, + "grad_norm": 0.7192789143055213, + "learning_rate": 0.00019908884443437037, + "loss": 12.7923, + "step": 2634 + }, + { + "epoch": 0.14348628099623587, + "grad_norm": 0.7560552245070992, + "learning_rate": 0.00019908765636064213, + "loss": 12.5752, + "step": 2635 + }, + { + "epoch": 0.14354073499281889, + "grad_norm": 0.7917742140949682, + "learning_rate": 0.00019908646751639166, + "loss": 12.7673, + "step": 2636 + }, + { + "epoch": 0.14359518898940188, + "grad_norm": 0.7889192817451162, + "learning_rate": 0.00019908527790162822, + "loss": 12.7232, + "step": 2637 + }, + { + "epoch": 0.1436496429859849, + "grad_norm": 0.7590576115205909, + "learning_rate": 0.000199084087516361, + "loss": 12.629, + "step": 2638 + }, + { + "epoch": 0.14370409698256792, + "grad_norm": 0.7968942636474549, + "learning_rate": 0.00019908289636059933, + "loss": 12.7596, + "step": 2639 + }, + { + "epoch": 0.14375855097915094, + "grad_norm": 0.7118388249750108, + "learning_rate": 0.00019908170443435244, + "loss": 12.6103, + "step": 2640 + }, + { + "epoch": 0.14381300497573393, + "grad_norm": 0.7676927176741307, + "learning_rate": 0.0001990805117376296, + "loss": 12.659, + "step": 2641 + }, + { + "epoch": 0.14386745897231695, + "grad_norm": 0.74628843718605, + "learning_rate": 0.0001990793182704401, + "loss": 12.6684, + "step": 2642 + }, + { + "epoch": 0.14392191296889997, + "grad_norm": 0.765142973884477, + "learning_rate": 0.00019907812403279314, + "loss": 12.6766, + "step": 2643 + }, + { + "epoch": 0.14397636696548297, + "grad_norm": 1.120038400913752, + "learning_rate": 0.00019907692902469814, + "loss": 12.7644, + "step": 2644 + }, + { + "epoch": 0.144030820962066, + "grad_norm": 0.8613421582516949, + "learning_rate": 0.0001990757332461643, + "loss": 12.7654, + "step": 2645 + }, + { + "epoch": 0.144085274958649, + "grad_norm": 0.7597908331512231, + "learning_rate": 0.00019907453669720096, + "loss": 12.732, + "step": 2646 + }, + { + "epoch": 0.144139728955232, + "grad_norm": 0.9086438951971259, + "learning_rate": 0.0001990733393778174, + "loss": 12.6952, + "step": 2647 + }, + { + "epoch": 0.14419418295181502, + "grad_norm": 0.735107309446148, + "learning_rate": 0.00019907214128802293, + "loss": 12.6305, + "step": 2648 + }, + { + "epoch": 0.14424863694839804, + "grad_norm": 0.8510749009659634, + "learning_rate": 0.0001990709424278269, + "loss": 12.6497, + "step": 2649 + }, + { + "epoch": 0.14430309094498103, + "grad_norm": 2.553487336315287, + "learning_rate": 0.00019906974279723858, + "loss": 12.859, + "step": 2650 + }, + { + "epoch": 0.14435754494156405, + "grad_norm": 0.8497281216272348, + "learning_rate": 0.00019906854239626733, + "loss": 12.6034, + "step": 2651 + }, + { + "epoch": 0.14441199893814707, + "grad_norm": 0.7741178450896294, + "learning_rate": 0.0001990673412249225, + "loss": 12.7448, + "step": 2652 + }, + { + "epoch": 0.14446645293473007, + "grad_norm": 0.8050047338057631, + "learning_rate": 0.00019906613928321338, + "loss": 12.5269, + "step": 2653 + }, + { + "epoch": 0.1445209069313131, + "grad_norm": 0.764721188085812, + "learning_rate": 0.00019906493657114935, + "loss": 12.8101, + "step": 2654 + }, + { + "epoch": 0.1445753609278961, + "grad_norm": 0.9205425051456191, + "learning_rate": 0.0001990637330887398, + "loss": 12.6569, + "step": 2655 + }, + { + "epoch": 0.1446298149244791, + "grad_norm": 0.7721467979457826, + "learning_rate": 0.00019906252883599402, + "loss": 12.7271, + "step": 2656 + }, + { + "epoch": 0.14468426892106212, + "grad_norm": 0.7858920669134861, + "learning_rate": 0.0001990613238129214, + "loss": 12.8252, + "step": 2657 + }, + { + "epoch": 0.14473872291764514, + "grad_norm": 0.765720003106791, + "learning_rate": 0.0001990601180195313, + "loss": 12.7792, + "step": 2658 + }, + { + "epoch": 0.14479317691422816, + "grad_norm": 0.7591642681701061, + "learning_rate": 0.00019905891145583312, + "loss": 12.6342, + "step": 2659 + }, + { + "epoch": 0.14484763091081115, + "grad_norm": 0.7738866144071866, + "learning_rate": 0.00019905770412183626, + "loss": 12.7038, + "step": 2660 + }, + { + "epoch": 0.14490208490739417, + "grad_norm": 0.7751819170403272, + "learning_rate": 0.00019905649601755006, + "loss": 12.7029, + "step": 2661 + }, + { + "epoch": 0.1449565389039772, + "grad_norm": 1.037253868229172, + "learning_rate": 0.00019905528714298393, + "loss": 12.6876, + "step": 2662 + }, + { + "epoch": 0.1450109929005602, + "grad_norm": 0.8228831140879636, + "learning_rate": 0.00019905407749814727, + "loss": 12.6693, + "step": 2663 + }, + { + "epoch": 0.1450654468971432, + "grad_norm": 0.7472181487251265, + "learning_rate": 0.0001990528670830495, + "loss": 12.5575, + "step": 2664 + }, + { + "epoch": 0.14511990089372623, + "grad_norm": 0.8676633248694123, + "learning_rate": 0.00019905165589770003, + "loss": 12.6504, + "step": 2665 + }, + { + "epoch": 0.14517435489030922, + "grad_norm": 0.833166831577227, + "learning_rate": 0.00019905044394210827, + "loss": 12.8559, + "step": 2666 + }, + { + "epoch": 0.14522880888689224, + "grad_norm": 0.7431818143738881, + "learning_rate": 0.00019904923121628367, + "loss": 12.599, + "step": 2667 + }, + { + "epoch": 0.14528326288347526, + "grad_norm": 0.9159990154597104, + "learning_rate": 0.0001990480177202356, + "loss": 12.7133, + "step": 2668 + }, + { + "epoch": 0.14533771688005825, + "grad_norm": 0.7690112766314912, + "learning_rate": 0.00019904680345397355, + "loss": 12.7107, + "step": 2669 + }, + { + "epoch": 0.14539217087664127, + "grad_norm": 0.7763570974348, + "learning_rate": 0.00019904558841750696, + "loss": 12.797, + "step": 2670 + }, + { + "epoch": 0.1454466248732243, + "grad_norm": 0.885475603482203, + "learning_rate": 0.00019904437261084526, + "loss": 12.648, + "step": 2671 + }, + { + "epoch": 0.1455010788698073, + "grad_norm": 0.7985814589081149, + "learning_rate": 0.00019904315603399788, + "loss": 12.9738, + "step": 2672 + }, + { + "epoch": 0.1455555328663903, + "grad_norm": 0.937349046219652, + "learning_rate": 0.00019904193868697432, + "loss": 12.7084, + "step": 2673 + }, + { + "epoch": 0.14560998686297333, + "grad_norm": 0.7784287617162705, + "learning_rate": 0.00019904072056978404, + "loss": 12.6306, + "step": 2674 + }, + { + "epoch": 0.14566444085955635, + "grad_norm": 0.6986676816712994, + "learning_rate": 0.00019903950168243654, + "loss": 12.6791, + "step": 2675 + }, + { + "epoch": 0.14571889485613934, + "grad_norm": 0.7982549576873136, + "learning_rate": 0.0001990382820249412, + "loss": 12.5907, + "step": 2676 + }, + { + "epoch": 0.14577334885272236, + "grad_norm": 0.7865765018957946, + "learning_rate": 0.00019903706159730763, + "loss": 12.683, + "step": 2677 + }, + { + "epoch": 0.14582780284930538, + "grad_norm": 0.7727965652112779, + "learning_rate": 0.00019903584039954525, + "loss": 12.73, + "step": 2678 + }, + { + "epoch": 0.14588225684588838, + "grad_norm": 0.7201995605130986, + "learning_rate": 0.00019903461843166352, + "loss": 12.5938, + "step": 2679 + }, + { + "epoch": 0.1459367108424714, + "grad_norm": 0.776229206051119, + "learning_rate": 0.00019903339569367202, + "loss": 12.6367, + "step": 2680 + }, + { + "epoch": 0.14599116483905442, + "grad_norm": 0.7445473575011761, + "learning_rate": 0.00019903217218558025, + "loss": 12.7577, + "step": 2681 + }, + { + "epoch": 0.1460456188356374, + "grad_norm": 0.7552656467499095, + "learning_rate": 0.0001990309479073977, + "loss": 12.7018, + "step": 2682 + }, + { + "epoch": 0.14610007283222043, + "grad_norm": 0.7021799060374931, + "learning_rate": 0.00019902972285913386, + "loss": 12.5492, + "step": 2683 + }, + { + "epoch": 0.14615452682880345, + "grad_norm": 0.7101796742359953, + "learning_rate": 0.0001990284970407983, + "loss": 12.6654, + "step": 2684 + }, + { + "epoch": 0.14620898082538644, + "grad_norm": 0.7770953225119109, + "learning_rate": 0.00019902727045240055, + "loss": 12.7807, + "step": 2685 + }, + { + "epoch": 0.14626343482196946, + "grad_norm": 0.8205011594444994, + "learning_rate": 0.0001990260430939501, + "loss": 12.7381, + "step": 2686 + }, + { + "epoch": 0.14631788881855248, + "grad_norm": 0.8011777697339707, + "learning_rate": 0.00019902481496545657, + "loss": 12.6545, + "step": 2687 + }, + { + "epoch": 0.14637234281513548, + "grad_norm": 0.7561686468407363, + "learning_rate": 0.0001990235860669295, + "loss": 12.7167, + "step": 2688 + }, + { + "epoch": 0.1464267968117185, + "grad_norm": 0.7239528910960851, + "learning_rate": 0.00019902235639837837, + "loss": 12.7289, + "step": 2689 + }, + { + "epoch": 0.14648125080830152, + "grad_norm": 0.7739801417470364, + "learning_rate": 0.0001990211259598128, + "loss": 12.7053, + "step": 2690 + }, + { + "epoch": 0.14653570480488454, + "grad_norm": 0.8893441290926933, + "learning_rate": 0.00019901989475124235, + "loss": 12.7693, + "step": 2691 + }, + { + "epoch": 0.14659015880146753, + "grad_norm": 0.7881993519858137, + "learning_rate": 0.00019901866277267658, + "loss": 12.7999, + "step": 2692 + }, + { + "epoch": 0.14664461279805055, + "grad_norm": 0.7960471770020839, + "learning_rate": 0.0001990174300241251, + "loss": 12.7491, + "step": 2693 + }, + { + "epoch": 0.14669906679463357, + "grad_norm": 0.7753083056275317, + "learning_rate": 0.00019901619650559749, + "loss": 12.7001, + "step": 2694 + }, + { + "epoch": 0.14675352079121656, + "grad_norm": 0.7696722499154163, + "learning_rate": 0.00019901496221710332, + "loss": 12.6874, + "step": 2695 + }, + { + "epoch": 0.14680797478779958, + "grad_norm": 0.767633402938913, + "learning_rate": 0.00019901372715865217, + "loss": 12.6353, + "step": 2696 + }, + { + "epoch": 0.1468624287843826, + "grad_norm": 0.7438423585527448, + "learning_rate": 0.0001990124913302537, + "loss": 12.7425, + "step": 2697 + }, + { + "epoch": 0.1469168827809656, + "grad_norm": 0.7757341838499866, + "learning_rate": 0.0001990112547319175, + "loss": 12.8279, + "step": 2698 + }, + { + "epoch": 0.14697133677754862, + "grad_norm": 0.8258565883469542, + "learning_rate": 0.00019901001736365317, + "loss": 12.6837, + "step": 2699 + }, + { + "epoch": 0.14702579077413164, + "grad_norm": 0.8004203175040449, + "learning_rate": 0.00019900877922547034, + "loss": 12.7775, + "step": 2700 + }, + { + "epoch": 0.14708024477071463, + "grad_norm": 0.7947067824549463, + "learning_rate": 0.00019900754031737866, + "loss": 12.7518, + "step": 2701 + }, + { + "epoch": 0.14713469876729765, + "grad_norm": 0.9069636423116467, + "learning_rate": 0.00019900630063938773, + "loss": 12.7515, + "step": 2702 + }, + { + "epoch": 0.14718915276388067, + "grad_norm": 0.7338969251876433, + "learning_rate": 0.00019900506019150717, + "loss": 12.6569, + "step": 2703 + }, + { + "epoch": 0.14724360676046366, + "grad_norm": 0.747056261107894, + "learning_rate": 0.00019900381897374668, + "loss": 12.6114, + "step": 2704 + }, + { + "epoch": 0.14729806075704668, + "grad_norm": 0.792429734575613, + "learning_rate": 0.0001990025769861159, + "loss": 12.7098, + "step": 2705 + }, + { + "epoch": 0.1473525147536297, + "grad_norm": 0.7958552148528896, + "learning_rate": 0.0001990013342286245, + "loss": 12.6546, + "step": 2706 + }, + { + "epoch": 0.14740696875021272, + "grad_norm": 0.7635541867716035, + "learning_rate": 0.00019900009070128208, + "loss": 12.6584, + "step": 2707 + }, + { + "epoch": 0.14746142274679572, + "grad_norm": 0.820738536195993, + "learning_rate": 0.00019899884640409837, + "loss": 12.8891, + "step": 2708 + }, + { + "epoch": 0.14751587674337874, + "grad_norm": 0.678436148866974, + "learning_rate": 0.00019899760133708304, + "loss": 12.7063, + "step": 2709 + }, + { + "epoch": 0.14757033073996176, + "grad_norm": 0.7338891818564695, + "learning_rate": 0.00019899635550024573, + "loss": 12.6804, + "step": 2710 + }, + { + "epoch": 0.14762478473654475, + "grad_norm": 0.7677957679961459, + "learning_rate": 0.0001989951088935962, + "loss": 12.5545, + "step": 2711 + }, + { + "epoch": 0.14767923873312777, + "grad_norm": 0.7455124550308035, + "learning_rate": 0.00019899386151714407, + "loss": 12.7571, + "step": 2712 + }, + { + "epoch": 0.1477336927297108, + "grad_norm": 0.7554064266083648, + "learning_rate": 0.00019899261337089907, + "loss": 12.7199, + "step": 2713 + }, + { + "epoch": 0.14778814672629378, + "grad_norm": 0.7000666033089813, + "learning_rate": 0.0001989913644548709, + "loss": 12.7881, + "step": 2714 + }, + { + "epoch": 0.1478426007228768, + "grad_norm": 0.7409220000487262, + "learning_rate": 0.00019899011476906932, + "loss": 12.6374, + "step": 2715 + }, + { + "epoch": 0.14789705471945983, + "grad_norm": 0.7127590442514861, + "learning_rate": 0.00019898886431350397, + "loss": 12.6687, + "step": 2716 + }, + { + "epoch": 0.14795150871604282, + "grad_norm": 0.8103966832252935, + "learning_rate": 0.0001989876130881846, + "loss": 12.6737, + "step": 2717 + }, + { + "epoch": 0.14800596271262584, + "grad_norm": 0.7294411546682413, + "learning_rate": 0.000198986361093121, + "loss": 12.6861, + "step": 2718 + }, + { + "epoch": 0.14806041670920886, + "grad_norm": 0.7964709794829142, + "learning_rate": 0.0001989851083283228, + "loss": 12.5385, + "step": 2719 + }, + { + "epoch": 0.14811487070579185, + "grad_norm": 0.7113450423677704, + "learning_rate": 0.00019898385479379986, + "loss": 12.601, + "step": 2720 + }, + { + "epoch": 0.14816932470237487, + "grad_norm": 0.7822473870313964, + "learning_rate": 0.00019898260048956183, + "loss": 12.603, + "step": 2721 + }, + { + "epoch": 0.1482237786989579, + "grad_norm": 0.8135540599153726, + "learning_rate": 0.0001989813454156185, + "loss": 12.6545, + "step": 2722 + }, + { + "epoch": 0.14827823269554088, + "grad_norm": 0.8233043530388209, + "learning_rate": 0.00019898008957197966, + "loss": 12.7226, + "step": 2723 + }, + { + "epoch": 0.1483326866921239, + "grad_norm": 0.9156143678477184, + "learning_rate": 0.000198978832958655, + "loss": 12.7056, + "step": 2724 + }, + { + "epoch": 0.14838714068870693, + "grad_norm": 0.810162685105311, + "learning_rate": 0.00019897757557565436, + "loss": 12.682, + "step": 2725 + }, + { + "epoch": 0.14844159468528995, + "grad_norm": 0.8342449403585397, + "learning_rate": 0.00019897631742298746, + "loss": 12.7605, + "step": 2726 + }, + { + "epoch": 0.14849604868187294, + "grad_norm": 0.7489492357339741, + "learning_rate": 0.00019897505850066414, + "loss": 12.6675, + "step": 2727 + }, + { + "epoch": 0.14855050267845596, + "grad_norm": 0.8945862280358247, + "learning_rate": 0.00019897379880869418, + "loss": 12.7221, + "step": 2728 + }, + { + "epoch": 0.14860495667503898, + "grad_norm": 0.8578394375204317, + "learning_rate": 0.00019897253834708735, + "loss": 12.6227, + "step": 2729 + }, + { + "epoch": 0.14865941067162197, + "grad_norm": 0.8320179117170514, + "learning_rate": 0.00019897127711585347, + "loss": 12.7534, + "step": 2730 + }, + { + "epoch": 0.148713864668205, + "grad_norm": 0.913571833439365, + "learning_rate": 0.00019897001511500232, + "loss": 12.6558, + "step": 2731 + }, + { + "epoch": 0.148768318664788, + "grad_norm": 0.7032303064815117, + "learning_rate": 0.00019896875234454378, + "loss": 12.6742, + "step": 2732 + }, + { + "epoch": 0.148822772661371, + "grad_norm": 0.863979360450013, + "learning_rate": 0.0001989674888044876, + "loss": 12.6833, + "step": 2733 + }, + { + "epoch": 0.14887722665795403, + "grad_norm": 0.8477605359647903, + "learning_rate": 0.00019896622449484363, + "loss": 12.724, + "step": 2734 + }, + { + "epoch": 0.14893168065453705, + "grad_norm": 0.7009812417053154, + "learning_rate": 0.00019896495941562167, + "loss": 12.477, + "step": 2735 + }, + { + "epoch": 0.14898613465112004, + "grad_norm": 1.1028258025088578, + "learning_rate": 0.00019896369356683165, + "loss": 12.7799, + "step": 2736 + }, + { + "epoch": 0.14904058864770306, + "grad_norm": 0.823652454643868, + "learning_rate": 0.00019896242694848333, + "loss": 12.7377, + "step": 2737 + }, + { + "epoch": 0.14909504264428608, + "grad_norm": 0.9733281948709586, + "learning_rate": 0.00019896115956058655, + "loss": 12.7747, + "step": 2738 + }, + { + "epoch": 0.14914949664086907, + "grad_norm": 0.7976004474814257, + "learning_rate": 0.00019895989140315123, + "loss": 12.8456, + "step": 2739 + }, + { + "epoch": 0.1492039506374521, + "grad_norm": 0.9644150577357924, + "learning_rate": 0.0001989586224761872, + "loss": 12.7945, + "step": 2740 + }, + { + "epoch": 0.1492584046340351, + "grad_norm": 0.8759892203051738, + "learning_rate": 0.0001989573527797043, + "loss": 12.7189, + "step": 2741 + }, + { + "epoch": 0.14931285863061813, + "grad_norm": 0.8336760294714186, + "learning_rate": 0.00019895608231371246, + "loss": 12.6849, + "step": 2742 + }, + { + "epoch": 0.14936731262720113, + "grad_norm": 0.8626112894874401, + "learning_rate": 0.0001989548110782215, + "loss": 12.733, + "step": 2743 + }, + { + "epoch": 0.14942176662378415, + "grad_norm": 0.7318897808235327, + "learning_rate": 0.00019895353907324137, + "loss": 12.6482, + "step": 2744 + }, + { + "epoch": 0.14947622062036717, + "grad_norm": 0.9435418862621813, + "learning_rate": 0.0001989522662987819, + "loss": 12.788, + "step": 2745 + }, + { + "epoch": 0.14953067461695016, + "grad_norm": 0.8022696319211658, + "learning_rate": 0.000198950992754853, + "loss": 12.438, + "step": 2746 + }, + { + "epoch": 0.14958512861353318, + "grad_norm": 0.7575661421408524, + "learning_rate": 0.00019894971844146463, + "loss": 12.7393, + "step": 2747 + }, + { + "epoch": 0.1496395826101162, + "grad_norm": 0.7805792001603048, + "learning_rate": 0.00019894844335862662, + "loss": 12.657, + "step": 2748 + }, + { + "epoch": 0.1496940366066992, + "grad_norm": 0.8126920101810677, + "learning_rate": 0.00019894716750634892, + "loss": 12.8064, + "step": 2749 + }, + { + "epoch": 0.1497484906032822, + "grad_norm": 0.8474201711084006, + "learning_rate": 0.00019894589088464146, + "loss": 12.754, + "step": 2750 + }, + { + "epoch": 0.14980294459986523, + "grad_norm": 0.7755472736544625, + "learning_rate": 0.00019894461349351415, + "loss": 12.822, + "step": 2751 + }, + { + "epoch": 0.14985739859644823, + "grad_norm": 0.7593805950832129, + "learning_rate": 0.00019894333533297694, + "loss": 12.6108, + "step": 2752 + }, + { + "epoch": 0.14991185259303125, + "grad_norm": 0.7520301260416404, + "learning_rate": 0.0001989420564030398, + "loss": 12.6183, + "step": 2753 + }, + { + "epoch": 0.14996630658961427, + "grad_norm": 0.749234270126392, + "learning_rate": 0.0001989407767037126, + "loss": 12.6935, + "step": 2754 + }, + { + "epoch": 0.15002076058619726, + "grad_norm": 0.8756663060744522, + "learning_rate": 0.00019893949623500534, + "loss": 12.7225, + "step": 2755 + }, + { + "epoch": 0.15007521458278028, + "grad_norm": 0.7676286733455003, + "learning_rate": 0.00019893821499692793, + "loss": 12.7744, + "step": 2756 + }, + { + "epoch": 0.1501296685793633, + "grad_norm": 0.7605119624902635, + "learning_rate": 0.0001989369329894904, + "loss": 12.6357, + "step": 2757 + }, + { + "epoch": 0.15018412257594632, + "grad_norm": 0.8580930249417181, + "learning_rate": 0.00019893565021270268, + "loss": 12.6745, + "step": 2758 + }, + { + "epoch": 0.15023857657252931, + "grad_norm": 0.7583813012738259, + "learning_rate": 0.00019893436666657474, + "loss": 12.7093, + "step": 2759 + }, + { + "epoch": 0.15029303056911233, + "grad_norm": 0.8563488809263123, + "learning_rate": 0.0001989330823511166, + "loss": 12.7741, + "step": 2760 + }, + { + "epoch": 0.15034748456569536, + "grad_norm": 0.7543344960375925, + "learning_rate": 0.00019893179726633822, + "loss": 12.6322, + "step": 2761 + }, + { + "epoch": 0.15040193856227835, + "grad_norm": 0.8365295252292132, + "learning_rate": 0.0001989305114122496, + "loss": 12.6755, + "step": 2762 + }, + { + "epoch": 0.15045639255886137, + "grad_norm": 0.717356381989756, + "learning_rate": 0.00019892922478886068, + "loss": 12.6843, + "step": 2763 + }, + { + "epoch": 0.1505108465554444, + "grad_norm": 0.7970472395126367, + "learning_rate": 0.00019892793739618157, + "loss": 12.7954, + "step": 2764 + }, + { + "epoch": 0.15056530055202738, + "grad_norm": 0.8229387020294725, + "learning_rate": 0.0001989266492342222, + "loss": 12.6806, + "step": 2765 + }, + { + "epoch": 0.1506197545486104, + "grad_norm": 0.7299889919023793, + "learning_rate": 0.00019892536030299262, + "loss": 12.7584, + "step": 2766 + }, + { + "epoch": 0.15067420854519342, + "grad_norm": 0.8202907454446939, + "learning_rate": 0.00019892407060250286, + "loss": 12.8171, + "step": 2767 + }, + { + "epoch": 0.15072866254177641, + "grad_norm": 0.7692958776460064, + "learning_rate": 0.00019892278013276292, + "loss": 12.7063, + "step": 2768 + }, + { + "epoch": 0.15078311653835944, + "grad_norm": 0.7407039841520368, + "learning_rate": 0.0001989214888937829, + "loss": 12.6678, + "step": 2769 + }, + { + "epoch": 0.15083757053494246, + "grad_norm": 0.6944323470913601, + "learning_rate": 0.00019892019688557273, + "loss": 12.6697, + "step": 2770 + }, + { + "epoch": 0.15089202453152545, + "grad_norm": 0.9503595753704032, + "learning_rate": 0.00019891890410814257, + "loss": 12.6659, + "step": 2771 + }, + { + "epoch": 0.15094647852810847, + "grad_norm": 0.7002113714765726, + "learning_rate": 0.0001989176105615024, + "loss": 12.5938, + "step": 2772 + }, + { + "epoch": 0.1510009325246915, + "grad_norm": 0.7207990649669148, + "learning_rate": 0.0001989163162456623, + "loss": 12.6238, + "step": 2773 + }, + { + "epoch": 0.1510553865212745, + "grad_norm": 0.7609110684132062, + "learning_rate": 0.00019891502116063233, + "loss": 12.5131, + "step": 2774 + }, + { + "epoch": 0.1511098405178575, + "grad_norm": 0.7841808488718762, + "learning_rate": 0.00019891372530642256, + "loss": 12.7415, + "step": 2775 + }, + { + "epoch": 0.15116429451444052, + "grad_norm": 0.803554084916481, + "learning_rate": 0.0001989124286830431, + "loss": 12.848, + "step": 2776 + }, + { + "epoch": 0.15121874851102354, + "grad_norm": 0.7090519416935699, + "learning_rate": 0.00019891113129050402, + "loss": 12.6934, + "step": 2777 + }, + { + "epoch": 0.15127320250760654, + "grad_norm": 0.9183418134038173, + "learning_rate": 0.00019890983312881538, + "loss": 12.6013, + "step": 2778 + }, + { + "epoch": 0.15132765650418956, + "grad_norm": 0.6772283912015566, + "learning_rate": 0.00019890853419798728, + "loss": 12.6931, + "step": 2779 + }, + { + "epoch": 0.15138211050077258, + "grad_norm": 0.7876816429288338, + "learning_rate": 0.0001989072344980298, + "loss": 12.753, + "step": 2780 + }, + { + "epoch": 0.15143656449735557, + "grad_norm": 0.6960728512524404, + "learning_rate": 0.00019890593402895312, + "loss": 12.606, + "step": 2781 + }, + { + "epoch": 0.1514910184939386, + "grad_norm": 0.7555357427749649, + "learning_rate": 0.00019890463279076731, + "loss": 12.5668, + "step": 2782 + }, + { + "epoch": 0.1515454724905216, + "grad_norm": 0.7010977415108224, + "learning_rate": 0.00019890333078348248, + "loss": 12.6256, + "step": 2783 + }, + { + "epoch": 0.1515999264871046, + "grad_norm": 0.7232246106573289, + "learning_rate": 0.00019890202800710877, + "loss": 12.7359, + "step": 2784 + }, + { + "epoch": 0.15165438048368762, + "grad_norm": 0.809163964335571, + "learning_rate": 0.0001989007244616563, + "loss": 12.8173, + "step": 2785 + }, + { + "epoch": 0.15170883448027064, + "grad_norm": 0.790540688648389, + "learning_rate": 0.0001988994201471352, + "loss": 12.5647, + "step": 2786 + }, + { + "epoch": 0.15176328847685364, + "grad_norm": 0.6886477145769496, + "learning_rate": 0.00019889811506355564, + "loss": 12.5209, + "step": 2787 + }, + { + "epoch": 0.15181774247343666, + "grad_norm": 0.6892518048460101, + "learning_rate": 0.00019889680921092776, + "loss": 12.5624, + "step": 2788 + }, + { + "epoch": 0.15187219647001968, + "grad_norm": 0.7063700559267442, + "learning_rate": 0.0001988955025892617, + "loss": 12.6218, + "step": 2789 + }, + { + "epoch": 0.15192665046660267, + "grad_norm": 0.7893300858011191, + "learning_rate": 0.0001988941951985676, + "loss": 12.641, + "step": 2790 + }, + { + "epoch": 0.1519811044631857, + "grad_norm": 0.7314835383366215, + "learning_rate": 0.0001988928870388557, + "loss": 12.5973, + "step": 2791 + }, + { + "epoch": 0.1520355584597687, + "grad_norm": 0.7727702514239536, + "learning_rate": 0.00019889157811013607, + "loss": 12.6172, + "step": 2792 + }, + { + "epoch": 0.15209001245635173, + "grad_norm": 0.7357199420721806, + "learning_rate": 0.000198890268412419, + "loss": 12.6283, + "step": 2793 + }, + { + "epoch": 0.15214446645293472, + "grad_norm": 0.7029293740093456, + "learning_rate": 0.00019888895794571457, + "loss": 12.7138, + "step": 2794 + }, + { + "epoch": 0.15219892044951774, + "grad_norm": 0.8140800206572754, + "learning_rate": 0.00019888764671003304, + "loss": 12.7915, + "step": 2795 + }, + { + "epoch": 0.15225337444610076, + "grad_norm": 0.7189172652761978, + "learning_rate": 0.0001988863347053846, + "loss": 12.6233, + "step": 2796 + }, + { + "epoch": 0.15230782844268376, + "grad_norm": 0.6867836990626499, + "learning_rate": 0.00019888502193177944, + "loss": 12.671, + "step": 2797 + }, + { + "epoch": 0.15236228243926678, + "grad_norm": 0.7207526699098658, + "learning_rate": 0.00019888370838922774, + "loss": 12.6913, + "step": 2798 + }, + { + "epoch": 0.1524167364358498, + "grad_norm": 0.834167090031831, + "learning_rate": 0.00019888239407773973, + "loss": 12.5075, + "step": 2799 + }, + { + "epoch": 0.1524711904324328, + "grad_norm": 0.7527501201976892, + "learning_rate": 0.00019888107899732567, + "loss": 12.5776, + "step": 2800 + }, + { + "epoch": 0.1525256444290158, + "grad_norm": 0.705797037985442, + "learning_rate": 0.00019887976314799576, + "loss": 12.5589, + "step": 2801 + }, + { + "epoch": 0.15258009842559883, + "grad_norm": 0.8131126845625408, + "learning_rate": 0.00019887844652976023, + "loss": 12.9504, + "step": 2802 + }, + { + "epoch": 0.15263455242218182, + "grad_norm": 0.7197483465561019, + "learning_rate": 0.00019887712914262932, + "loss": 12.6358, + "step": 2803 + }, + { + "epoch": 0.15268900641876484, + "grad_norm": 0.7211535406910872, + "learning_rate": 0.00019887581098661326, + "loss": 12.6082, + "step": 2804 + }, + { + "epoch": 0.15274346041534786, + "grad_norm": 0.7153783931890983, + "learning_rate": 0.0001988744920617223, + "loss": 12.6324, + "step": 2805 + }, + { + "epoch": 0.15279791441193086, + "grad_norm": 0.8625198065469974, + "learning_rate": 0.00019887317236796673, + "loss": 12.6345, + "step": 2806 + }, + { + "epoch": 0.15285236840851388, + "grad_norm": 0.7268672211214469, + "learning_rate": 0.00019887185190535676, + "loss": 12.5933, + "step": 2807 + }, + { + "epoch": 0.1529068224050969, + "grad_norm": 0.7191003964132267, + "learning_rate": 0.00019887053067390271, + "loss": 12.718, + "step": 2808 + }, + { + "epoch": 0.15296127640167992, + "grad_norm": 0.9156558752399968, + "learning_rate": 0.00019886920867361486, + "loss": 12.5851, + "step": 2809 + }, + { + "epoch": 0.1530157303982629, + "grad_norm": 0.7778164323577543, + "learning_rate": 0.00019886788590450343, + "loss": 12.5485, + "step": 2810 + }, + { + "epoch": 0.15307018439484593, + "grad_norm": 0.7567328985770456, + "learning_rate": 0.00019886656236657875, + "loss": 12.7041, + "step": 2811 + }, + { + "epoch": 0.15312463839142895, + "grad_norm": 0.7678411182063787, + "learning_rate": 0.00019886523805985108, + "loss": 12.6737, + "step": 2812 + }, + { + "epoch": 0.15317909238801194, + "grad_norm": 0.8627798690144588, + "learning_rate": 0.0001988639129843308, + "loss": 12.6411, + "step": 2813 + }, + { + "epoch": 0.15323354638459497, + "grad_norm": 0.735304833237844, + "learning_rate": 0.00019886258714002807, + "loss": 12.6742, + "step": 2814 + }, + { + "epoch": 0.15328800038117799, + "grad_norm": 0.7593330629316116, + "learning_rate": 0.00019886126052695333, + "loss": 12.6313, + "step": 2815 + }, + { + "epoch": 0.15334245437776098, + "grad_norm": 0.6571157408790251, + "learning_rate": 0.00019885993314511686, + "loss": 12.6673, + "step": 2816 + }, + { + "epoch": 0.153396908374344, + "grad_norm": 0.7836367975743452, + "learning_rate": 0.00019885860499452895, + "loss": 12.6894, + "step": 2817 + }, + { + "epoch": 0.15345136237092702, + "grad_norm": 0.7174222449230672, + "learning_rate": 0.00019885727607519993, + "loss": 12.5857, + "step": 2818 + }, + { + "epoch": 0.15350581636751, + "grad_norm": 0.8054844096099334, + "learning_rate": 0.00019885594638714018, + "loss": 12.7942, + "step": 2819 + }, + { + "epoch": 0.15356027036409303, + "grad_norm": 0.7531539053210494, + "learning_rate": 0.00019885461593036, + "loss": 12.7152, + "step": 2820 + }, + { + "epoch": 0.15361472436067605, + "grad_norm": 0.8023736412652699, + "learning_rate": 0.00019885328470486976, + "loss": 12.8768, + "step": 2821 + }, + { + "epoch": 0.15366917835725905, + "grad_norm": 0.7543875214548945, + "learning_rate": 0.0001988519527106798, + "loss": 12.5622, + "step": 2822 + }, + { + "epoch": 0.15372363235384207, + "grad_norm": 0.7248015081840921, + "learning_rate": 0.0001988506199478005, + "loss": 12.6281, + "step": 2823 + }, + { + "epoch": 0.15377808635042509, + "grad_norm": 0.7369565113377408, + "learning_rate": 0.00019884928641624217, + "loss": 12.5889, + "step": 2824 + }, + { + "epoch": 0.1538325403470081, + "grad_norm": 0.755269816391, + "learning_rate": 0.00019884795211601522, + "loss": 12.659, + "step": 2825 + }, + { + "epoch": 0.1538869943435911, + "grad_norm": 0.8702353886072459, + "learning_rate": 0.00019884661704713003, + "loss": 12.6518, + "step": 2826 + }, + { + "epoch": 0.15394144834017412, + "grad_norm": 0.8471764604812616, + "learning_rate": 0.00019884528120959693, + "loss": 12.6379, + "step": 2827 + }, + { + "epoch": 0.15399590233675714, + "grad_norm": 0.7872512739321531, + "learning_rate": 0.00019884394460342636, + "loss": 12.509, + "step": 2828 + }, + { + "epoch": 0.15405035633334013, + "grad_norm": 0.8235514382992309, + "learning_rate": 0.00019884260722862873, + "loss": 12.6816, + "step": 2829 + }, + { + "epoch": 0.15410481032992315, + "grad_norm": 1.414682065801912, + "learning_rate": 0.0001988412690852144, + "loss": 12.7264, + "step": 2830 + }, + { + "epoch": 0.15415926432650617, + "grad_norm": 0.7508374862892704, + "learning_rate": 0.00019883993017319376, + "loss": 12.7554, + "step": 2831 + }, + { + "epoch": 0.15421371832308917, + "grad_norm": 0.7707106426936741, + "learning_rate": 0.00019883859049257726, + "loss": 12.6894, + "step": 2832 + }, + { + "epoch": 0.1542681723196722, + "grad_norm": 0.7810345921420442, + "learning_rate": 0.00019883725004337532, + "loss": 12.6557, + "step": 2833 + }, + { + "epoch": 0.1543226263162552, + "grad_norm": 0.6986731637448959, + "learning_rate": 0.00019883590882559834, + "loss": 12.7629, + "step": 2834 + }, + { + "epoch": 0.1543770803128382, + "grad_norm": 0.8182701064880522, + "learning_rate": 0.0001988345668392568, + "loss": 12.7564, + "step": 2835 + }, + { + "epoch": 0.15443153430942122, + "grad_norm": 0.7583269846678126, + "learning_rate": 0.00019883322408436102, + "loss": 12.5635, + "step": 2836 + }, + { + "epoch": 0.15448598830600424, + "grad_norm": 0.721085359058006, + "learning_rate": 0.00019883188056092155, + "loss": 12.5803, + "step": 2837 + }, + { + "epoch": 0.15454044230258723, + "grad_norm": 0.8972990116057311, + "learning_rate": 0.00019883053626894878, + "loss": 12.721, + "step": 2838 + }, + { + "epoch": 0.15459489629917025, + "grad_norm": 0.7696583704912467, + "learning_rate": 0.00019882919120845324, + "loss": 12.6906, + "step": 2839 + }, + { + "epoch": 0.15464935029575327, + "grad_norm": 0.8683192989979782, + "learning_rate": 0.0001988278453794453, + "loss": 12.7031, + "step": 2840 + }, + { + "epoch": 0.1547038042923363, + "grad_norm": 0.7580129590742768, + "learning_rate": 0.00019882649878193544, + "loss": 12.66, + "step": 2841 + }, + { + "epoch": 0.1547582582889193, + "grad_norm": 0.8619060801915377, + "learning_rate": 0.00019882515141593417, + "loss": 12.7066, + "step": 2842 + }, + { + "epoch": 0.1548127122855023, + "grad_norm": 0.8092195760188854, + "learning_rate": 0.00019882380328145195, + "loss": 12.7587, + "step": 2843 + }, + { + "epoch": 0.15486716628208533, + "grad_norm": 0.7464848175934536, + "learning_rate": 0.0001988224543784993, + "loss": 12.6538, + "step": 2844 + }, + { + "epoch": 0.15492162027866832, + "grad_norm": 0.8128263278093952, + "learning_rate": 0.0001988211047070866, + "loss": 12.6386, + "step": 2845 + }, + { + "epoch": 0.15497607427525134, + "grad_norm": 0.7961419281399771, + "learning_rate": 0.0001988197542672245, + "loss": 12.6517, + "step": 2846 + }, + { + "epoch": 0.15503052827183436, + "grad_norm": 0.8514430955903212, + "learning_rate": 0.00019881840305892336, + "loss": 12.6183, + "step": 2847 + }, + { + "epoch": 0.15508498226841735, + "grad_norm": 0.7891271901770917, + "learning_rate": 0.00019881705108219376, + "loss": 12.6508, + "step": 2848 + }, + { + "epoch": 0.15513943626500037, + "grad_norm": 0.8303345806312178, + "learning_rate": 0.0001988156983370462, + "loss": 12.604, + "step": 2849 + }, + { + "epoch": 0.1551938902615834, + "grad_norm": 0.8482866449361063, + "learning_rate": 0.0001988143448234912, + "loss": 12.6734, + "step": 2850 + }, + { + "epoch": 0.1552483442581664, + "grad_norm": 0.7796472708220716, + "learning_rate": 0.0001988129905415393, + "loss": 12.4922, + "step": 2851 + }, + { + "epoch": 0.1553027982547494, + "grad_norm": 0.9465182394861125, + "learning_rate": 0.000198811635491201, + "loss": 12.7001, + "step": 2852 + }, + { + "epoch": 0.15535725225133243, + "grad_norm": 0.7281216127951547, + "learning_rate": 0.00019881027967248683, + "loss": 12.6122, + "step": 2853 + }, + { + "epoch": 0.15541170624791542, + "grad_norm": 0.9286753088079683, + "learning_rate": 0.00019880892308540737, + "loss": 12.8227, + "step": 2854 + }, + { + "epoch": 0.15546616024449844, + "grad_norm": 0.7763893618414947, + "learning_rate": 0.00019880756572997316, + "loss": 12.7216, + "step": 2855 + }, + { + "epoch": 0.15552061424108146, + "grad_norm": 0.7916498937728372, + "learning_rate": 0.00019880620760619476, + "loss": 12.6203, + "step": 2856 + }, + { + "epoch": 0.15557506823766445, + "grad_norm": 0.767571823535825, + "learning_rate": 0.0001988048487140827, + "loss": 12.5859, + "step": 2857 + }, + { + "epoch": 0.15562952223424747, + "grad_norm": 0.7645270956833853, + "learning_rate": 0.00019880348905364757, + "loss": 12.7418, + "step": 2858 + }, + { + "epoch": 0.1556839762308305, + "grad_norm": 0.9638854534597893, + "learning_rate": 0.00019880212862489994, + "loss": 12.6037, + "step": 2859 + }, + { + "epoch": 0.15573843022741352, + "grad_norm": 0.7436913641362939, + "learning_rate": 0.0001988007674278504, + "loss": 12.6679, + "step": 2860 + }, + { + "epoch": 0.1557928842239965, + "grad_norm": 0.8370044615080934, + "learning_rate": 0.00019879940546250953, + "loss": 12.623, + "step": 2861 + }, + { + "epoch": 0.15584733822057953, + "grad_norm": 0.6821081371810797, + "learning_rate": 0.0001987980427288879, + "loss": 12.5769, + "step": 2862 + }, + { + "epoch": 0.15590179221716255, + "grad_norm": 0.7644536575584995, + "learning_rate": 0.0001987966792269961, + "loss": 12.503, + "step": 2863 + }, + { + "epoch": 0.15595624621374554, + "grad_norm": 0.7734674709924113, + "learning_rate": 0.00019879531495684477, + "loss": 12.6612, + "step": 2864 + }, + { + "epoch": 0.15601070021032856, + "grad_norm": 0.7339254537144414, + "learning_rate": 0.00019879394991844453, + "loss": 12.5314, + "step": 2865 + }, + { + "epoch": 0.15606515420691158, + "grad_norm": 0.754806928649199, + "learning_rate": 0.00019879258411180595, + "loss": 12.6488, + "step": 2866 + }, + { + "epoch": 0.15611960820349458, + "grad_norm": 0.7838025302628356, + "learning_rate": 0.00019879121753693966, + "loss": 12.624, + "step": 2867 + }, + { + "epoch": 0.1561740622000776, + "grad_norm": 0.9476835270509917, + "learning_rate": 0.00019878985019385629, + "loss": 12.7108, + "step": 2868 + }, + { + "epoch": 0.15622851619666062, + "grad_norm": 0.7778603460412541, + "learning_rate": 0.0001987884820825665, + "loss": 12.7608, + "step": 2869 + }, + { + "epoch": 0.1562829701932436, + "grad_norm": 0.8146019433611156, + "learning_rate": 0.00019878711320308088, + "loss": 12.5931, + "step": 2870 + }, + { + "epoch": 0.15633742418982663, + "grad_norm": 0.8538501969506916, + "learning_rate": 0.00019878574355541013, + "loss": 12.7687, + "step": 2871 + }, + { + "epoch": 0.15639187818640965, + "grad_norm": 0.7748395850487926, + "learning_rate": 0.00019878437313956485, + "loss": 12.5725, + "step": 2872 + }, + { + "epoch": 0.15644633218299264, + "grad_norm": 0.9468791007489141, + "learning_rate": 0.00019878300195555574, + "loss": 12.7611, + "step": 2873 + }, + { + "epoch": 0.15650078617957566, + "grad_norm": 0.8541710954520909, + "learning_rate": 0.0001987816300033934, + "loss": 12.757, + "step": 2874 + }, + { + "epoch": 0.15655524017615868, + "grad_norm": 0.8001891904172195, + "learning_rate": 0.00019878025728308857, + "loss": 12.5571, + "step": 2875 + }, + { + "epoch": 0.1566096941727417, + "grad_norm": 0.7420531715125143, + "learning_rate": 0.0001987788837946519, + "loss": 12.8099, + "step": 2876 + }, + { + "epoch": 0.1566641481693247, + "grad_norm": 0.8668620926535715, + "learning_rate": 0.00019877750953809403, + "loss": 12.8152, + "step": 2877 + }, + { + "epoch": 0.15671860216590772, + "grad_norm": 0.7066024735778503, + "learning_rate": 0.00019877613451342572, + "loss": 12.6544, + "step": 2878 + }, + { + "epoch": 0.15677305616249074, + "grad_norm": 0.7301587483127819, + "learning_rate": 0.0001987747587206576, + "loss": 12.6587, + "step": 2879 + }, + { + "epoch": 0.15682751015907373, + "grad_norm": 0.8221851904995162, + "learning_rate": 0.0001987733821598004, + "loss": 12.614, + "step": 2880 + }, + { + "epoch": 0.15688196415565675, + "grad_norm": 0.7800050428325034, + "learning_rate": 0.00019877200483086482, + "loss": 12.7346, + "step": 2881 + }, + { + "epoch": 0.15693641815223977, + "grad_norm": 0.7538371294006235, + "learning_rate": 0.00019877062673386155, + "loss": 12.7771, + "step": 2882 + }, + { + "epoch": 0.15699087214882276, + "grad_norm": 0.7737338005678903, + "learning_rate": 0.00019876924786880136, + "loss": 12.6477, + "step": 2883 + }, + { + "epoch": 0.15704532614540578, + "grad_norm": 0.7794075495170949, + "learning_rate": 0.0001987678682356949, + "loss": 12.8022, + "step": 2884 + }, + { + "epoch": 0.1570997801419888, + "grad_norm": 0.8710909802578073, + "learning_rate": 0.00019876648783455293, + "loss": 12.659, + "step": 2885 + }, + { + "epoch": 0.1571542341385718, + "grad_norm": 0.668760916103844, + "learning_rate": 0.0001987651066653862, + "loss": 12.686, + "step": 2886 + }, + { + "epoch": 0.15720868813515482, + "grad_norm": 0.7562285864065703, + "learning_rate": 0.00019876372472820545, + "loss": 12.6519, + "step": 2887 + }, + { + "epoch": 0.15726314213173784, + "grad_norm": 0.7185663162783882, + "learning_rate": 0.0001987623420230214, + "loss": 12.6415, + "step": 2888 + }, + { + "epoch": 0.15731759612832083, + "grad_norm": 0.6729839607124669, + "learning_rate": 0.00019876095854984483, + "loss": 12.6349, + "step": 2889 + }, + { + "epoch": 0.15737205012490385, + "grad_norm": 0.7545275225102389, + "learning_rate": 0.00019875957430868645, + "loss": 12.7718, + "step": 2890 + }, + { + "epoch": 0.15742650412148687, + "grad_norm": 0.7892422343726105, + "learning_rate": 0.0001987581892995571, + "loss": 12.566, + "step": 2891 + }, + { + "epoch": 0.1574809581180699, + "grad_norm": 0.800924414122386, + "learning_rate": 0.0001987568035224675, + "loss": 12.7672, + "step": 2892 + }, + { + "epoch": 0.15753541211465288, + "grad_norm": 0.7405628682209897, + "learning_rate": 0.0001987554169774284, + "loss": 12.6177, + "step": 2893 + }, + { + "epoch": 0.1575898661112359, + "grad_norm": 0.8188446538091821, + "learning_rate": 0.00019875402966445065, + "loss": 12.8018, + "step": 2894 + }, + { + "epoch": 0.15764432010781892, + "grad_norm": 0.6775815545337175, + "learning_rate": 0.00019875264158354498, + "loss": 12.9058, + "step": 2895 + }, + { + "epoch": 0.15769877410440192, + "grad_norm": 0.7745018906486485, + "learning_rate": 0.00019875125273472222, + "loss": 12.6926, + "step": 2896 + }, + { + "epoch": 0.15775322810098494, + "grad_norm": 0.6856967013924328, + "learning_rate": 0.00019874986311799316, + "loss": 12.6651, + "step": 2897 + }, + { + "epoch": 0.15780768209756796, + "grad_norm": 0.8311969803551889, + "learning_rate": 0.00019874847273336862, + "loss": 12.8152, + "step": 2898 + }, + { + "epoch": 0.15786213609415095, + "grad_norm": 0.7484518446515533, + "learning_rate": 0.00019874708158085938, + "loss": 12.7422, + "step": 2899 + }, + { + "epoch": 0.15791659009073397, + "grad_norm": 0.9253746140131502, + "learning_rate": 0.00019874568966047625, + "loss": 12.8505, + "step": 2900 + }, + { + "epoch": 0.157971044087317, + "grad_norm": 0.7642243120959947, + "learning_rate": 0.0001987442969722301, + "loss": 12.6175, + "step": 2901 + }, + { + "epoch": 0.15802549808389998, + "grad_norm": 0.7379359930600351, + "learning_rate": 0.00019874290351613177, + "loss": 12.7439, + "step": 2902 + }, + { + "epoch": 0.158079952080483, + "grad_norm": 0.7361247160464598, + "learning_rate": 0.000198741509292192, + "loss": 12.5702, + "step": 2903 + }, + { + "epoch": 0.15813440607706603, + "grad_norm": 0.7632560096569945, + "learning_rate": 0.00019874011430042173, + "loss": 12.5885, + "step": 2904 + }, + { + "epoch": 0.15818886007364902, + "grad_norm": 0.7121434411200048, + "learning_rate": 0.00019873871854083177, + "loss": 12.6405, + "step": 2905 + }, + { + "epoch": 0.15824331407023204, + "grad_norm": 0.6974010357137779, + "learning_rate": 0.00019873732201343297, + "loss": 12.6232, + "step": 2906 + }, + { + "epoch": 0.15829776806681506, + "grad_norm": 0.6934937196933434, + "learning_rate": 0.00019873592471823622, + "loss": 12.5748, + "step": 2907 + }, + { + "epoch": 0.15835222206339808, + "grad_norm": 0.8136084193231012, + "learning_rate": 0.0001987345266552523, + "loss": 12.705, + "step": 2908 + }, + { + "epoch": 0.15840667605998107, + "grad_norm": 0.8153925561329763, + "learning_rate": 0.0001987331278244922, + "loss": 12.6548, + "step": 2909 + }, + { + "epoch": 0.1584611300565641, + "grad_norm": 0.7461307062300498, + "learning_rate": 0.00019873172822596673, + "loss": 12.6724, + "step": 2910 + }, + { + "epoch": 0.1585155840531471, + "grad_norm": 0.7591005159059452, + "learning_rate": 0.0001987303278596868, + "loss": 12.7604, + "step": 2911 + }, + { + "epoch": 0.1585700380497301, + "grad_norm": 0.7719825542635549, + "learning_rate": 0.00019872892672566326, + "loss": 12.755, + "step": 2912 + }, + { + "epoch": 0.15862449204631313, + "grad_norm": 0.7409777070497231, + "learning_rate": 0.000198727524823907, + "loss": 12.7466, + "step": 2913 + }, + { + "epoch": 0.15867894604289615, + "grad_norm": 0.8675143707913091, + "learning_rate": 0.000198726122154429, + "loss": 12.642, + "step": 2914 + }, + { + "epoch": 0.15873340003947914, + "grad_norm": 0.8154491006459972, + "learning_rate": 0.0001987247187172401, + "loss": 12.749, + "step": 2915 + }, + { + "epoch": 0.15878785403606216, + "grad_norm": 0.8774062749892383, + "learning_rate": 0.0001987233145123512, + "loss": 12.744, + "step": 2916 + }, + { + "epoch": 0.15884230803264518, + "grad_norm": 0.7171864769206286, + "learning_rate": 0.00019872190953977331, + "loss": 12.6279, + "step": 2917 + }, + { + "epoch": 0.15889676202922817, + "grad_norm": 0.8983152653112244, + "learning_rate": 0.0001987205037995173, + "loss": 12.6355, + "step": 2918 + }, + { + "epoch": 0.1589512160258112, + "grad_norm": 0.8848065508819202, + "learning_rate": 0.00019871909729159403, + "loss": 12.8295, + "step": 2919 + }, + { + "epoch": 0.1590056700223942, + "grad_norm": 0.8467832737921628, + "learning_rate": 0.00019871769001601454, + "loss": 12.5814, + "step": 2920 + }, + { + "epoch": 0.1590601240189772, + "grad_norm": 0.84887487191556, + "learning_rate": 0.00019871628197278972, + "loss": 12.8861, + "step": 2921 + }, + { + "epoch": 0.15911457801556023, + "grad_norm": 0.8061164552631324, + "learning_rate": 0.00019871487316193057, + "loss": 12.7834, + "step": 2922 + }, + { + "epoch": 0.15916903201214325, + "grad_norm": 0.7107138305932041, + "learning_rate": 0.000198713463583448, + "loss": 12.5469, + "step": 2923 + }, + { + "epoch": 0.15922348600872624, + "grad_norm": 0.6743243016861461, + "learning_rate": 0.00019871205323735298, + "loss": 12.6979, + "step": 2924 + }, + { + "epoch": 0.15927794000530926, + "grad_norm": 0.7720761548238595, + "learning_rate": 0.00019871064212365647, + "loss": 12.5449, + "step": 2925 + }, + { + "epoch": 0.15933239400189228, + "grad_norm": 0.7922355008399666, + "learning_rate": 0.00019870923024236948, + "loss": 12.6402, + "step": 2926 + }, + { + "epoch": 0.1593868479984753, + "grad_norm": 0.8070662326895096, + "learning_rate": 0.00019870781759350292, + "loss": 12.6296, + "step": 2927 + }, + { + "epoch": 0.1594413019950583, + "grad_norm": 0.6803895819196606, + "learning_rate": 0.00019870640417706784, + "loss": 12.5325, + "step": 2928 + }, + { + "epoch": 0.1594957559916413, + "grad_norm": 0.7126599815491631, + "learning_rate": 0.00019870498999307522, + "loss": 12.6576, + "step": 2929 + }, + { + "epoch": 0.15955020998822433, + "grad_norm": 0.8448286288702788, + "learning_rate": 0.00019870357504153603, + "loss": 12.7188, + "step": 2930 + }, + { + "epoch": 0.15960466398480733, + "grad_norm": 0.7214711269702817, + "learning_rate": 0.00019870215932246127, + "loss": 12.6394, + "step": 2931 + }, + { + "epoch": 0.15965911798139035, + "grad_norm": 0.829531978726736, + "learning_rate": 0.000198700742835862, + "loss": 12.7261, + "step": 2932 + }, + { + "epoch": 0.15971357197797337, + "grad_norm": 0.7058088612808299, + "learning_rate": 0.00019869932558174919, + "loss": 12.678, + "step": 2933 + }, + { + "epoch": 0.15976802597455636, + "grad_norm": 0.7317555508306628, + "learning_rate": 0.00019869790756013385, + "loss": 12.5755, + "step": 2934 + }, + { + "epoch": 0.15982247997113938, + "grad_norm": 0.6647603557904068, + "learning_rate": 0.00019869648877102707, + "loss": 12.6979, + "step": 2935 + }, + { + "epoch": 0.1598769339677224, + "grad_norm": 0.8407777414091054, + "learning_rate": 0.00019869506921443982, + "loss": 12.7168, + "step": 2936 + }, + { + "epoch": 0.1599313879643054, + "grad_norm": 0.743608301092564, + "learning_rate": 0.0001986936488903832, + "loss": 12.756, + "step": 2937 + }, + { + "epoch": 0.15998584196088841, + "grad_norm": 0.8049838269368632, + "learning_rate": 0.00019869222779886814, + "loss": 12.6823, + "step": 2938 + }, + { + "epoch": 0.16004029595747143, + "grad_norm": 0.7606150998743767, + "learning_rate": 0.00019869080593990578, + "loss": 12.6691, + "step": 2939 + }, + { + "epoch": 0.16009474995405443, + "grad_norm": 0.6970909985961943, + "learning_rate": 0.0001986893833135072, + "loss": 12.7718, + "step": 2940 + }, + { + "epoch": 0.16014920395063745, + "grad_norm": 0.7769916732202724, + "learning_rate": 0.00019868795991968342, + "loss": 12.6826, + "step": 2941 + }, + { + "epoch": 0.16020365794722047, + "grad_norm": 0.7659872792765248, + "learning_rate": 0.0001986865357584455, + "loss": 12.7583, + "step": 2942 + }, + { + "epoch": 0.1602581119438035, + "grad_norm": 0.8445126991147179, + "learning_rate": 0.00019868511082980455, + "loss": 12.8178, + "step": 2943 + }, + { + "epoch": 0.16031256594038648, + "grad_norm": 0.763277351074957, + "learning_rate": 0.0001986836851337716, + "loss": 12.6646, + "step": 2944 + }, + { + "epoch": 0.1603670199369695, + "grad_norm": 1.0276730264139622, + "learning_rate": 0.00019868225867035778, + "loss": 12.837, + "step": 2945 + }, + { + "epoch": 0.16042147393355252, + "grad_norm": 1.0233540700530248, + "learning_rate": 0.00019868083143957416, + "loss": 12.7812, + "step": 2946 + }, + { + "epoch": 0.16047592793013551, + "grad_norm": 0.7844462560812835, + "learning_rate": 0.00019867940344143185, + "loss": 12.6027, + "step": 2947 + }, + { + "epoch": 0.16053038192671854, + "grad_norm": 0.8081790004592988, + "learning_rate": 0.00019867797467594195, + "loss": 12.5724, + "step": 2948 + }, + { + "epoch": 0.16058483592330156, + "grad_norm": 0.7615334540876233, + "learning_rate": 0.00019867654514311558, + "loss": 12.6042, + "step": 2949 + }, + { + "epoch": 0.16063928991988455, + "grad_norm": 0.7693417927056728, + "learning_rate": 0.00019867511484296385, + "loss": 12.7132, + "step": 2950 + }, + { + "epoch": 0.16069374391646757, + "grad_norm": 0.7432777721279483, + "learning_rate": 0.00019867368377549785, + "loss": 12.6545, + "step": 2951 + }, + { + "epoch": 0.1607481979130506, + "grad_norm": 0.7256121338548468, + "learning_rate": 0.00019867225194072875, + "loss": 12.6685, + "step": 2952 + }, + { + "epoch": 0.16080265190963358, + "grad_norm": 0.7448417158991302, + "learning_rate": 0.00019867081933866768, + "loss": 12.711, + "step": 2953 + }, + { + "epoch": 0.1608571059062166, + "grad_norm": 0.8333907442536371, + "learning_rate": 0.0001986693859693258, + "loss": 12.6638, + "step": 2954 + }, + { + "epoch": 0.16091155990279962, + "grad_norm": 0.813839754003139, + "learning_rate": 0.00019866795183271418, + "loss": 12.7326, + "step": 2955 + }, + { + "epoch": 0.16096601389938262, + "grad_norm": 0.8254595639514176, + "learning_rate": 0.000198666516928844, + "loss": 12.8003, + "step": 2956 + }, + { + "epoch": 0.16102046789596564, + "grad_norm": 0.7082473755766474, + "learning_rate": 0.0001986650812577265, + "loss": 12.7089, + "step": 2957 + }, + { + "epoch": 0.16107492189254866, + "grad_norm": 0.8686973365777078, + "learning_rate": 0.00019866364481937275, + "loss": 12.8251, + "step": 2958 + }, + { + "epoch": 0.16112937588913168, + "grad_norm": 0.8270284101134693, + "learning_rate": 0.00019866220761379397, + "loss": 12.8409, + "step": 2959 + }, + { + "epoch": 0.16118382988571467, + "grad_norm": 0.7209478339761264, + "learning_rate": 0.00019866076964100132, + "loss": 12.7463, + "step": 2960 + }, + { + "epoch": 0.1612382838822977, + "grad_norm": 0.9011747411779443, + "learning_rate": 0.00019865933090100593, + "loss": 12.7234, + "step": 2961 + }, + { + "epoch": 0.1612927378788807, + "grad_norm": 0.7302090317140539, + "learning_rate": 0.00019865789139381906, + "loss": 12.6734, + "step": 2962 + }, + { + "epoch": 0.1613471918754637, + "grad_norm": 0.8439839869720785, + "learning_rate": 0.00019865645111945192, + "loss": 12.7653, + "step": 2963 + }, + { + "epoch": 0.16140164587204672, + "grad_norm": 0.7831387332858022, + "learning_rate": 0.00019865501007791564, + "loss": 12.6617, + "step": 2964 + }, + { + "epoch": 0.16145609986862974, + "grad_norm": 0.751759533220739, + "learning_rate": 0.00019865356826922147, + "loss": 12.6347, + "step": 2965 + }, + { + "epoch": 0.16151055386521274, + "grad_norm": 0.784713141210689, + "learning_rate": 0.0001986521256933806, + "loss": 12.6752, + "step": 2966 + }, + { + "epoch": 0.16156500786179576, + "grad_norm": 0.9349312628518618, + "learning_rate": 0.00019865068235040427, + "loss": 12.8224, + "step": 2967 + }, + { + "epoch": 0.16161946185837878, + "grad_norm": 0.796775119153196, + "learning_rate": 0.00019864923824030367, + "loss": 12.6793, + "step": 2968 + }, + { + "epoch": 0.16167391585496177, + "grad_norm": 0.9158652239242536, + "learning_rate": 0.00019864779336309005, + "loss": 12.752, + "step": 2969 + }, + { + "epoch": 0.1617283698515448, + "grad_norm": 0.7597183903240505, + "learning_rate": 0.0001986463477187747, + "loss": 12.7695, + "step": 2970 + }, + { + "epoch": 0.1617828238481278, + "grad_norm": 0.9777933769595428, + "learning_rate": 0.00019864490130736874, + "loss": 12.7579, + "step": 2971 + }, + { + "epoch": 0.1618372778447108, + "grad_norm": 0.6985247110538119, + "learning_rate": 0.0001986434541288835, + "loss": 12.6177, + "step": 2972 + }, + { + "epoch": 0.16189173184129382, + "grad_norm": 0.7980997965207286, + "learning_rate": 0.00019864200618333023, + "loss": 12.7239, + "step": 2973 + }, + { + "epoch": 0.16194618583787684, + "grad_norm": 0.8623726372473786, + "learning_rate": 0.00019864055747072018, + "loss": 12.7306, + "step": 2974 + }, + { + "epoch": 0.16200063983445986, + "grad_norm": 0.7770705515852193, + "learning_rate": 0.0001986391079910646, + "loss": 12.8426, + "step": 2975 + }, + { + "epoch": 0.16205509383104286, + "grad_norm": 0.7478972825442104, + "learning_rate": 0.00019863765774437477, + "loss": 12.6541, + "step": 2976 + }, + { + "epoch": 0.16210954782762588, + "grad_norm": 0.7143585535799926, + "learning_rate": 0.000198636206730662, + "loss": 12.7268, + "step": 2977 + }, + { + "epoch": 0.1621640018242089, + "grad_norm": 0.8776883368027423, + "learning_rate": 0.0001986347549499375, + "loss": 12.831, + "step": 2978 + }, + { + "epoch": 0.1622184558207919, + "grad_norm": 0.7508476394175179, + "learning_rate": 0.00019863330240221263, + "loss": 12.833, + "step": 2979 + }, + { + "epoch": 0.1622729098173749, + "grad_norm": 0.8346959291607994, + "learning_rate": 0.00019863184908749866, + "loss": 12.6451, + "step": 2980 + }, + { + "epoch": 0.16232736381395793, + "grad_norm": 0.7329729295070289, + "learning_rate": 0.0001986303950058069, + "loss": 12.743, + "step": 2981 + }, + { + "epoch": 0.16238181781054092, + "grad_norm": 0.7357999171068264, + "learning_rate": 0.00019862894015714865, + "loss": 12.6297, + "step": 2982 + }, + { + "epoch": 0.16243627180712394, + "grad_norm": 0.7464982470254528, + "learning_rate": 0.00019862748454153523, + "loss": 12.7155, + "step": 2983 + }, + { + "epoch": 0.16249072580370696, + "grad_norm": 0.7276796561806051, + "learning_rate": 0.00019862602815897792, + "loss": 12.7921, + "step": 2984 + }, + { + "epoch": 0.16254517980028996, + "grad_norm": 0.7468609325498567, + "learning_rate": 0.0001986245710094881, + "loss": 12.7216, + "step": 2985 + }, + { + "epoch": 0.16259963379687298, + "grad_norm": 0.7581697225369602, + "learning_rate": 0.00019862311309307704, + "loss": 12.6114, + "step": 2986 + }, + { + "epoch": 0.162654087793456, + "grad_norm": 0.7915931346131438, + "learning_rate": 0.00019862165440975616, + "loss": 12.745, + "step": 2987 + }, + { + "epoch": 0.162708541790039, + "grad_norm": 0.775253020930781, + "learning_rate": 0.00019862019495953673, + "loss": 12.6253, + "step": 2988 + }, + { + "epoch": 0.162762995786622, + "grad_norm": 0.7540019789596962, + "learning_rate": 0.0001986187347424301, + "loss": 12.6637, + "step": 2989 + }, + { + "epoch": 0.16281744978320503, + "grad_norm": 0.7955802651375394, + "learning_rate": 0.00019861727375844769, + "loss": 12.6314, + "step": 2990 + }, + { + "epoch": 0.16287190377978802, + "grad_norm": 0.704076718526564, + "learning_rate": 0.00019861581200760083, + "loss": 12.654, + "step": 2991 + }, + { + "epoch": 0.16292635777637104, + "grad_norm": 0.752020939332643, + "learning_rate": 0.00019861434948990084, + "loss": 12.7554, + "step": 2992 + }, + { + "epoch": 0.16298081177295407, + "grad_norm": 0.8500661663949628, + "learning_rate": 0.00019861288620535915, + "loss": 12.7574, + "step": 2993 + }, + { + "epoch": 0.16303526576953709, + "grad_norm": 0.7151514384071872, + "learning_rate": 0.00019861142215398713, + "loss": 12.6612, + "step": 2994 + }, + { + "epoch": 0.16308971976612008, + "grad_norm": 0.8014697789286036, + "learning_rate": 0.00019860995733579615, + "loss": 12.6775, + "step": 2995 + }, + { + "epoch": 0.1631441737627031, + "grad_norm": 0.7902169978055278, + "learning_rate": 0.0001986084917507976, + "loss": 12.8139, + "step": 2996 + }, + { + "epoch": 0.16319862775928612, + "grad_norm": 0.7667894415875761, + "learning_rate": 0.00019860702539900287, + "loss": 12.5831, + "step": 2997 + }, + { + "epoch": 0.1632530817558691, + "grad_norm": 0.7606419870046596, + "learning_rate": 0.00019860555828042338, + "loss": 12.5801, + "step": 2998 + }, + { + "epoch": 0.16330753575245213, + "grad_norm": 0.7268333122039216, + "learning_rate": 0.00019860409039507054, + "loss": 12.6711, + "step": 2999 + }, + { + "epoch": 0.16336198974903515, + "grad_norm": 0.8293781927525129, + "learning_rate": 0.00019860262174295574, + "loss": 12.667, + "step": 3000 + }, + { + "epoch": 0.16341644374561815, + "grad_norm": 0.6821855115896598, + "learning_rate": 0.00019860115232409045, + "loss": 12.646, + "step": 3001 + }, + { + "epoch": 0.16347089774220117, + "grad_norm": 0.8003391933674313, + "learning_rate": 0.00019859968213848604, + "loss": 12.5705, + "step": 3002 + }, + { + "epoch": 0.16352535173878419, + "grad_norm": 0.8315845013677727, + "learning_rate": 0.00019859821118615396, + "loss": 12.7818, + "step": 3003 + }, + { + "epoch": 0.16357980573536718, + "grad_norm": 0.8041144709489726, + "learning_rate": 0.00019859673946710568, + "loss": 12.6929, + "step": 3004 + }, + { + "epoch": 0.1636342597319502, + "grad_norm": 0.7491663856126463, + "learning_rate": 0.0001985952669813526, + "loss": 12.6536, + "step": 3005 + }, + { + "epoch": 0.16368871372853322, + "grad_norm": 0.7213642535047672, + "learning_rate": 0.0001985937937289062, + "loss": 12.6771, + "step": 3006 + }, + { + "epoch": 0.1637431677251162, + "grad_norm": 0.7064321339419524, + "learning_rate": 0.00019859231970977792, + "loss": 12.7313, + "step": 3007 + }, + { + "epoch": 0.16379762172169923, + "grad_norm": 0.7118779395124208, + "learning_rate": 0.00019859084492397923, + "loss": 12.6198, + "step": 3008 + }, + { + "epoch": 0.16385207571828225, + "grad_norm": 0.776431535884499, + "learning_rate": 0.0001985893693715216, + "loss": 12.6581, + "step": 3009 + }, + { + "epoch": 0.16390652971486527, + "grad_norm": 0.7060943938035943, + "learning_rate": 0.00019858789305241648, + "loss": 12.6482, + "step": 3010 + }, + { + "epoch": 0.16396098371144827, + "grad_norm": 0.7462608617503164, + "learning_rate": 0.0001985864159666754, + "loss": 12.7181, + "step": 3011 + }, + { + "epoch": 0.1640154377080313, + "grad_norm": 0.7949141644882773, + "learning_rate": 0.0001985849381143098, + "loss": 12.7098, + "step": 3012 + }, + { + "epoch": 0.1640698917046143, + "grad_norm": 0.7700097507545192, + "learning_rate": 0.00019858345949533117, + "loss": 12.6542, + "step": 3013 + }, + { + "epoch": 0.1641243457011973, + "grad_norm": 0.7223619675646541, + "learning_rate": 0.00019858198010975106, + "loss": 12.7549, + "step": 3014 + }, + { + "epoch": 0.16417879969778032, + "grad_norm": 0.6751448073396646, + "learning_rate": 0.00019858049995758094, + "loss": 12.6315, + "step": 3015 + }, + { + "epoch": 0.16423325369436334, + "grad_norm": 0.7771226973080849, + "learning_rate": 0.0001985790190388323, + "loss": 12.6497, + "step": 3016 + }, + { + "epoch": 0.16428770769094633, + "grad_norm": 0.6711637065491515, + "learning_rate": 0.00019857753735351668, + "loss": 12.5573, + "step": 3017 + }, + { + "epoch": 0.16434216168752935, + "grad_norm": 0.712214129858097, + "learning_rate": 0.0001985760549016456, + "loss": 12.7238, + "step": 3018 + }, + { + "epoch": 0.16439661568411237, + "grad_norm": 0.7331697045163766, + "learning_rate": 0.0001985745716832306, + "loss": 12.5742, + "step": 3019 + }, + { + "epoch": 0.16445106968069537, + "grad_norm": 0.7119826674527502, + "learning_rate": 0.0001985730876982832, + "loss": 12.6483, + "step": 3020 + }, + { + "epoch": 0.1645055236772784, + "grad_norm": 0.6974600839247409, + "learning_rate": 0.0001985716029468149, + "loss": 12.6442, + "step": 3021 + }, + { + "epoch": 0.1645599776738614, + "grad_norm": 0.8670722829202118, + "learning_rate": 0.00019857011742883734, + "loss": 12.8303, + "step": 3022 + }, + { + "epoch": 0.1646144316704444, + "grad_norm": 0.7635478127989597, + "learning_rate": 0.00019856863114436197, + "loss": 12.5336, + "step": 3023 + }, + { + "epoch": 0.16466888566702742, + "grad_norm": 0.7542722269262729, + "learning_rate": 0.0001985671440934004, + "loss": 12.6527, + "step": 3024 + }, + { + "epoch": 0.16472333966361044, + "grad_norm": 0.8416074307555329, + "learning_rate": 0.00019856565627596423, + "loss": 12.7418, + "step": 3025 + }, + { + "epoch": 0.16477779366019346, + "grad_norm": 0.7287356370803173, + "learning_rate": 0.00019856416769206496, + "loss": 12.7479, + "step": 3026 + }, + { + "epoch": 0.16483224765677645, + "grad_norm": 0.8005722489828518, + "learning_rate": 0.00019856267834171418, + "loss": 12.6369, + "step": 3027 + }, + { + "epoch": 0.16488670165335947, + "grad_norm": 0.8305008235817432, + "learning_rate": 0.00019856118822492348, + "loss": 12.6217, + "step": 3028 + }, + { + "epoch": 0.1649411556499425, + "grad_norm": 0.7033320894025007, + "learning_rate": 0.0001985596973417045, + "loss": 12.6356, + "step": 3029 + }, + { + "epoch": 0.1649956096465255, + "grad_norm": 0.7793475423768594, + "learning_rate": 0.00019855820569206873, + "loss": 12.5976, + "step": 3030 + }, + { + "epoch": 0.1650500636431085, + "grad_norm": 0.7107485974300436, + "learning_rate": 0.00019855671327602786, + "loss": 12.6629, + "step": 3031 + }, + { + "epoch": 0.16510451763969153, + "grad_norm": 0.8246162928773088, + "learning_rate": 0.00019855522009359344, + "loss": 12.7947, + "step": 3032 + }, + { + "epoch": 0.16515897163627452, + "grad_norm": 0.8131775899827965, + "learning_rate": 0.0001985537261447771, + "loss": 12.7613, + "step": 3033 + }, + { + "epoch": 0.16521342563285754, + "grad_norm": 0.7587550524533624, + "learning_rate": 0.00019855223142959045, + "loss": 12.6487, + "step": 3034 + }, + { + "epoch": 0.16526787962944056, + "grad_norm": 0.9463315526005036, + "learning_rate": 0.00019855073594804513, + "loss": 12.8261, + "step": 3035 + }, + { + "epoch": 0.16532233362602355, + "grad_norm": 0.6858694256651935, + "learning_rate": 0.0001985492397001528, + "loss": 12.6521, + "step": 3036 + }, + { + "epoch": 0.16537678762260657, + "grad_norm": 0.9290574372859011, + "learning_rate": 0.00019854774268592503, + "loss": 12.7764, + "step": 3037 + }, + { + "epoch": 0.1654312416191896, + "grad_norm": 0.7226513185149467, + "learning_rate": 0.00019854624490537345, + "loss": 12.5919, + "step": 3038 + }, + { + "epoch": 0.1654856956157726, + "grad_norm": 0.8268556528120298, + "learning_rate": 0.00019854474635850976, + "loss": 12.5752, + "step": 3039 + }, + { + "epoch": 0.1655401496123556, + "grad_norm": 0.6935520582092968, + "learning_rate": 0.0001985432470453456, + "loss": 12.624, + "step": 3040 + }, + { + "epoch": 0.16559460360893863, + "grad_norm": 0.7905359262554176, + "learning_rate": 0.00019854174696589265, + "loss": 12.6458, + "step": 3041 + }, + { + "epoch": 0.16564905760552165, + "grad_norm": 0.8584511171853852, + "learning_rate": 0.00019854024612016256, + "loss": 12.787, + "step": 3042 + }, + { + "epoch": 0.16570351160210464, + "grad_norm": 0.7234888567257534, + "learning_rate": 0.00019853874450816695, + "loss": 12.7128, + "step": 3043 + }, + { + "epoch": 0.16575796559868766, + "grad_norm": 0.8364271048465408, + "learning_rate": 0.00019853724212991756, + "loss": 12.7128, + "step": 3044 + }, + { + "epoch": 0.16581241959527068, + "grad_norm": 0.7495999449297123, + "learning_rate": 0.00019853573898542603, + "loss": 12.7308, + "step": 3045 + }, + { + "epoch": 0.16586687359185368, + "grad_norm": 0.7474271252052067, + "learning_rate": 0.0001985342350747041, + "loss": 12.5932, + "step": 3046 + }, + { + "epoch": 0.1659213275884367, + "grad_norm": 0.7774008075239626, + "learning_rate": 0.00019853273039776342, + "loss": 12.7495, + "step": 3047 + }, + { + "epoch": 0.16597578158501972, + "grad_norm": 0.7127847304079973, + "learning_rate": 0.0001985312249546157, + "loss": 12.8242, + "step": 3048 + }, + { + "epoch": 0.1660302355816027, + "grad_norm": 0.8142909144845354, + "learning_rate": 0.00019852971874527263, + "loss": 12.7933, + "step": 3049 + }, + { + "epoch": 0.16608468957818573, + "grad_norm": 0.8571399956290684, + "learning_rate": 0.00019852821176974595, + "loss": 12.7599, + "step": 3050 + }, + { + "epoch": 0.16613914357476875, + "grad_norm": 0.7255126471391168, + "learning_rate": 0.0001985267040280474, + "loss": 12.6355, + "step": 3051 + }, + { + "epoch": 0.16619359757135174, + "grad_norm": 0.8548735300335262, + "learning_rate": 0.00019852519552018868, + "loss": 12.7333, + "step": 3052 + }, + { + "epoch": 0.16624805156793476, + "grad_norm": 0.9585000229029373, + "learning_rate": 0.0001985236862461815, + "loss": 12.6209, + "step": 3053 + }, + { + "epoch": 0.16630250556451778, + "grad_norm": 0.8346856967389, + "learning_rate": 0.0001985221762060376, + "loss": 12.8314, + "step": 3054 + }, + { + "epoch": 0.16635695956110078, + "grad_norm": 0.781151732816868, + "learning_rate": 0.00019852066539976875, + "loss": 12.6076, + "step": 3055 + }, + { + "epoch": 0.1664114135576838, + "grad_norm": 0.8753576736840428, + "learning_rate": 0.00019851915382738668, + "loss": 12.7246, + "step": 3056 + }, + { + "epoch": 0.16646586755426682, + "grad_norm": 0.7551967445969222, + "learning_rate": 0.00019851764148890317, + "loss": 12.672, + "step": 3057 + }, + { + "epoch": 0.1665203215508498, + "grad_norm": 0.7104804983466242, + "learning_rate": 0.00019851612838432992, + "loss": 12.5429, + "step": 3058 + }, + { + "epoch": 0.16657477554743283, + "grad_norm": 0.810574169985711, + "learning_rate": 0.00019851461451367877, + "loss": 12.6264, + "step": 3059 + }, + { + "epoch": 0.16662922954401585, + "grad_norm": 0.7584177948039127, + "learning_rate": 0.00019851309987696145, + "loss": 12.7277, + "step": 3060 + }, + { + "epoch": 0.16668368354059887, + "grad_norm": 0.7381274284204892, + "learning_rate": 0.00019851158447418973, + "loss": 12.7544, + "step": 3061 + }, + { + "epoch": 0.16673813753718186, + "grad_norm": 0.7600336760368321, + "learning_rate": 0.00019851006830537543, + "loss": 12.6901, + "step": 3062 + }, + { + "epoch": 0.16679259153376488, + "grad_norm": 0.8158717737436204, + "learning_rate": 0.00019850855137053028, + "loss": 12.7702, + "step": 3063 + }, + { + "epoch": 0.1668470455303479, + "grad_norm": 0.74919777778861, + "learning_rate": 0.00019850703366966615, + "loss": 12.605, + "step": 3064 + }, + { + "epoch": 0.1669014995269309, + "grad_norm": 0.7882382250155103, + "learning_rate": 0.00019850551520279478, + "loss": 12.806, + "step": 3065 + }, + { + "epoch": 0.16695595352351392, + "grad_norm": 0.7955354153451767, + "learning_rate": 0.00019850399596992802, + "loss": 12.6832, + "step": 3066 + }, + { + "epoch": 0.16701040752009694, + "grad_norm": 0.7322099613336235, + "learning_rate": 0.00019850247597107766, + "loss": 12.6294, + "step": 3067 + }, + { + "epoch": 0.16706486151667993, + "grad_norm": 0.7805140231750468, + "learning_rate": 0.00019850095520625556, + "loss": 12.7636, + "step": 3068 + }, + { + "epoch": 0.16711931551326295, + "grad_norm": 0.7115740455768171, + "learning_rate": 0.00019849943367547347, + "loss": 12.5487, + "step": 3069 + }, + { + "epoch": 0.16717376950984597, + "grad_norm": 0.7621711099837365, + "learning_rate": 0.0001984979113787433, + "loss": 12.7246, + "step": 3070 + }, + { + "epoch": 0.16722822350642896, + "grad_norm": 0.7449652831037106, + "learning_rate": 0.00019849638831607682, + "loss": 12.6964, + "step": 3071 + }, + { + "epoch": 0.16728267750301198, + "grad_norm": 0.7552484504379975, + "learning_rate": 0.00019849486448748592, + "loss": 12.73, + "step": 3072 + }, + { + "epoch": 0.167337131499595, + "grad_norm": 0.7649404312476997, + "learning_rate": 0.00019849333989298246, + "loss": 12.6314, + "step": 3073 + }, + { + "epoch": 0.167391585496178, + "grad_norm": 0.8408831313771824, + "learning_rate": 0.00019849181453257826, + "loss": 12.7805, + "step": 3074 + }, + { + "epoch": 0.16744603949276102, + "grad_norm": 0.7302841277403656, + "learning_rate": 0.00019849028840628516, + "loss": 12.5063, + "step": 3075 + }, + { + "epoch": 0.16750049348934404, + "grad_norm": 0.7544122102561781, + "learning_rate": 0.00019848876151411511, + "loss": 12.7941, + "step": 3076 + }, + { + "epoch": 0.16755494748592706, + "grad_norm": 0.7918574476711356, + "learning_rate": 0.00019848723385607989, + "loss": 12.6725, + "step": 3077 + }, + { + "epoch": 0.16760940148251005, + "grad_norm": 0.751101721987298, + "learning_rate": 0.00019848570543219146, + "loss": 12.6924, + "step": 3078 + }, + { + "epoch": 0.16766385547909307, + "grad_norm": 0.6838336506900237, + "learning_rate": 0.00019848417624246162, + "loss": 12.6729, + "step": 3079 + }, + { + "epoch": 0.1677183094756761, + "grad_norm": 0.7408676965449393, + "learning_rate": 0.00019848264628690237, + "loss": 12.7387, + "step": 3080 + }, + { + "epoch": 0.16777276347225908, + "grad_norm": 0.7696366140174506, + "learning_rate": 0.00019848111556552552, + "loss": 12.7856, + "step": 3081 + }, + { + "epoch": 0.1678272174688421, + "grad_norm": 0.8200316785096359, + "learning_rate": 0.00019847958407834298, + "loss": 12.6265, + "step": 3082 + }, + { + "epoch": 0.16788167146542513, + "grad_norm": 0.681234990809023, + "learning_rate": 0.0001984780518253667, + "loss": 12.6042, + "step": 3083 + }, + { + "epoch": 0.16793612546200812, + "grad_norm": 0.8458155782816993, + "learning_rate": 0.00019847651880660857, + "loss": 12.6334, + "step": 3084 + }, + { + "epoch": 0.16799057945859114, + "grad_norm": 0.8541159959566602, + "learning_rate": 0.00019847498502208053, + "loss": 12.6321, + "step": 3085 + }, + { + "epoch": 0.16804503345517416, + "grad_norm": 0.7817962520050384, + "learning_rate": 0.00019847345047179445, + "loss": 12.6858, + "step": 3086 + }, + { + "epoch": 0.16809948745175715, + "grad_norm": 0.740149125300525, + "learning_rate": 0.00019847191515576235, + "loss": 12.6561, + "step": 3087 + }, + { + "epoch": 0.16815394144834017, + "grad_norm": 0.7569091163765901, + "learning_rate": 0.0001984703790739961, + "loss": 12.6237, + "step": 3088 + }, + { + "epoch": 0.1682083954449232, + "grad_norm": 0.8628887769130403, + "learning_rate": 0.00019846884222650768, + "loss": 12.7761, + "step": 3089 + }, + { + "epoch": 0.16826284944150618, + "grad_norm": 0.8704712641782385, + "learning_rate": 0.00019846730461330902, + "loss": 12.76, + "step": 3090 + }, + { + "epoch": 0.1683173034380892, + "grad_norm": 0.7408081447934876, + "learning_rate": 0.0001984657662344121, + "loss": 12.6615, + "step": 3091 + }, + { + "epoch": 0.16837175743467223, + "grad_norm": 0.710881601350767, + "learning_rate": 0.00019846422708982885, + "loss": 12.5017, + "step": 3092 + }, + { + "epoch": 0.16842621143125525, + "grad_norm": 0.8098661766532628, + "learning_rate": 0.00019846268717957127, + "loss": 12.6211, + "step": 3093 + }, + { + "epoch": 0.16848066542783824, + "grad_norm": 0.775012426784901, + "learning_rate": 0.00019846114650365134, + "loss": 12.6198, + "step": 3094 + }, + { + "epoch": 0.16853511942442126, + "grad_norm": 0.80326601496314, + "learning_rate": 0.000198459605062081, + "loss": 12.6613, + "step": 3095 + }, + { + "epoch": 0.16858957342100428, + "grad_norm": 0.7354652502997332, + "learning_rate": 0.00019845806285487228, + "loss": 12.615, + "step": 3096 + }, + { + "epoch": 0.16864402741758727, + "grad_norm": 0.7450886072732027, + "learning_rate": 0.00019845651988203712, + "loss": 12.54, + "step": 3097 + }, + { + "epoch": 0.1686984814141703, + "grad_norm": 0.7933552614013386, + "learning_rate": 0.00019845497614358757, + "loss": 12.5981, + "step": 3098 + }, + { + "epoch": 0.1687529354107533, + "grad_norm": 0.7968877726596918, + "learning_rate": 0.0001984534316395356, + "loss": 12.6495, + "step": 3099 + }, + { + "epoch": 0.1688073894073363, + "grad_norm": 0.7612335336482233, + "learning_rate": 0.00019845188636989324, + "loss": 12.7011, + "step": 3100 + }, + { + "epoch": 0.16886184340391933, + "grad_norm": 0.7379465764326891, + "learning_rate": 0.00019845034033467253, + "loss": 12.647, + "step": 3101 + }, + { + "epoch": 0.16891629740050235, + "grad_norm": 0.6342380056369946, + "learning_rate": 0.0001984487935338854, + "loss": 12.3162, + "step": 3102 + }, + { + "epoch": 0.16897075139708534, + "grad_norm": 0.7992548336863188, + "learning_rate": 0.000198447245967544, + "loss": 12.6207, + "step": 3103 + }, + { + "epoch": 0.16902520539366836, + "grad_norm": 0.7347690982602108, + "learning_rate": 0.0001984456976356603, + "loss": 12.6189, + "step": 3104 + }, + { + "epoch": 0.16907965939025138, + "grad_norm": 0.7411609138616736, + "learning_rate": 0.0001984441485382463, + "loss": 12.6495, + "step": 3105 + }, + { + "epoch": 0.16913411338683437, + "grad_norm": 0.8311106556317931, + "learning_rate": 0.00019844259867531414, + "loss": 12.6338, + "step": 3106 + }, + { + "epoch": 0.1691885673834174, + "grad_norm": 0.766495636804695, + "learning_rate": 0.00019844104804687582, + "loss": 12.7428, + "step": 3107 + }, + { + "epoch": 0.1692430213800004, + "grad_norm": 0.8369928669689569, + "learning_rate": 0.0001984394966529434, + "loss": 12.7464, + "step": 3108 + }, + { + "epoch": 0.16929747537658343, + "grad_norm": 0.8005274422552616, + "learning_rate": 0.00019843794449352892, + "loss": 12.5939, + "step": 3109 + }, + { + "epoch": 0.16935192937316643, + "grad_norm": 0.6739084256004799, + "learning_rate": 0.0001984363915686445, + "loss": 12.4955, + "step": 3110 + }, + { + "epoch": 0.16940638336974945, + "grad_norm": 0.8504633667447858, + "learning_rate": 0.00019843483787830216, + "loss": 12.6238, + "step": 3111 + }, + { + "epoch": 0.16946083736633247, + "grad_norm": 0.8152522438802968, + "learning_rate": 0.00019843328342251406, + "loss": 12.8807, + "step": 3112 + }, + { + "epoch": 0.16951529136291546, + "grad_norm": 0.7834123281067574, + "learning_rate": 0.0001984317282012922, + "loss": 12.7353, + "step": 3113 + }, + { + "epoch": 0.16956974535949848, + "grad_norm": 0.7587210169588774, + "learning_rate": 0.00019843017221464875, + "loss": 12.6709, + "step": 3114 + }, + { + "epoch": 0.1696241993560815, + "grad_norm": 0.8104141569775231, + "learning_rate": 0.00019842861546259574, + "loss": 12.6332, + "step": 3115 + }, + { + "epoch": 0.1696786533526645, + "grad_norm": 0.959464087999072, + "learning_rate": 0.00019842705794514533, + "loss": 12.6767, + "step": 3116 + }, + { + "epoch": 0.16973310734924751, + "grad_norm": 0.7914692125288751, + "learning_rate": 0.00019842549966230961, + "loss": 12.8013, + "step": 3117 + }, + { + "epoch": 0.16978756134583053, + "grad_norm": 0.6492714647380093, + "learning_rate": 0.0001984239406141007, + "loss": 12.5668, + "step": 3118 + }, + { + "epoch": 0.16984201534241353, + "grad_norm": 0.9351858633515623, + "learning_rate": 0.00019842238080053072, + "loss": 12.64, + "step": 3119 + }, + { + "epoch": 0.16989646933899655, + "grad_norm": 0.8610702832263224, + "learning_rate": 0.00019842082022161178, + "loss": 12.7856, + "step": 3120 + }, + { + "epoch": 0.16995092333557957, + "grad_norm": 0.7780332218014057, + "learning_rate": 0.00019841925887735607, + "loss": 12.6922, + "step": 3121 + }, + { + "epoch": 0.17000537733216256, + "grad_norm": 0.8377695759243816, + "learning_rate": 0.0001984176967677757, + "loss": 12.7686, + "step": 3122 + }, + { + "epoch": 0.17005983132874558, + "grad_norm": 0.8259157927171062, + "learning_rate": 0.0001984161338928828, + "loss": 12.6289, + "step": 3123 + }, + { + "epoch": 0.1701142853253286, + "grad_norm": 0.797915984011378, + "learning_rate": 0.00019841457025268953, + "loss": 12.6509, + "step": 3124 + }, + { + "epoch": 0.1701687393219116, + "grad_norm": 0.7578127442759067, + "learning_rate": 0.00019841300584720807, + "loss": 12.6557, + "step": 3125 + }, + { + "epoch": 0.17022319331849461, + "grad_norm": 0.8260447831694434, + "learning_rate": 0.00019841144067645058, + "loss": 12.6095, + "step": 3126 + }, + { + "epoch": 0.17027764731507763, + "grad_norm": 0.9170401923117737, + "learning_rate": 0.00019840987474042918, + "loss": 12.6943, + "step": 3127 + }, + { + "epoch": 0.17033210131166066, + "grad_norm": 0.6838717466562656, + "learning_rate": 0.0001984083080391561, + "loss": 12.5913, + "step": 3128 + }, + { + "epoch": 0.17038655530824365, + "grad_norm": 0.6734878857682747, + "learning_rate": 0.00019840674057264356, + "loss": 12.5914, + "step": 3129 + }, + { + "epoch": 0.17044100930482667, + "grad_norm": 0.8851280009559688, + "learning_rate": 0.00019840517234090367, + "loss": 12.7171, + "step": 3130 + }, + { + "epoch": 0.1704954633014097, + "grad_norm": 0.8336628423928786, + "learning_rate": 0.00019840360334394869, + "loss": 12.5769, + "step": 3131 + }, + { + "epoch": 0.17054991729799268, + "grad_norm": 0.8233460415229792, + "learning_rate": 0.00019840203358179075, + "loss": 12.6803, + "step": 3132 + }, + { + "epoch": 0.1706043712945757, + "grad_norm": 0.8575100075433871, + "learning_rate": 0.00019840046305444212, + "loss": 12.6334, + "step": 3133 + }, + { + "epoch": 0.17065882529115872, + "grad_norm": 0.8402671498996254, + "learning_rate": 0.00019839889176191498, + "loss": 12.7051, + "step": 3134 + }, + { + "epoch": 0.17071327928774171, + "grad_norm": 0.8442756476235956, + "learning_rate": 0.00019839731970422152, + "loss": 12.598, + "step": 3135 + }, + { + "epoch": 0.17076773328432474, + "grad_norm": 0.7790665163489613, + "learning_rate": 0.00019839574688137405, + "loss": 12.673, + "step": 3136 + }, + { + "epoch": 0.17082218728090776, + "grad_norm": 0.6776444996263663, + "learning_rate": 0.00019839417329338473, + "loss": 12.5547, + "step": 3137 + }, + { + "epoch": 0.17087664127749075, + "grad_norm": 0.7170712222884784, + "learning_rate": 0.00019839259894026583, + "loss": 12.6168, + "step": 3138 + }, + { + "epoch": 0.17093109527407377, + "grad_norm": 0.8250292301702511, + "learning_rate": 0.00019839102382202957, + "loss": 12.6966, + "step": 3139 + }, + { + "epoch": 0.1709855492706568, + "grad_norm": 0.776378688447562, + "learning_rate": 0.00019838944793868822, + "loss": 12.6753, + "step": 3140 + }, + { + "epoch": 0.17104000326723978, + "grad_norm": 0.7560489984821958, + "learning_rate": 0.000198387871290254, + "loss": 12.5989, + "step": 3141 + }, + { + "epoch": 0.1710944572638228, + "grad_norm": 0.7974557424591885, + "learning_rate": 0.0001983862938767392, + "loss": 12.5405, + "step": 3142 + }, + { + "epoch": 0.17114891126040582, + "grad_norm": 0.7014349054792098, + "learning_rate": 0.0001983847156981561, + "loss": 12.5353, + "step": 3143 + }, + { + "epoch": 0.17120336525698884, + "grad_norm": 0.723315929267737, + "learning_rate": 0.00019838313675451695, + "loss": 12.6474, + "step": 3144 + }, + { + "epoch": 0.17125781925357184, + "grad_norm": 0.9779302802477249, + "learning_rate": 0.00019838155704583402, + "loss": 12.8977, + "step": 3145 + }, + { + "epoch": 0.17131227325015486, + "grad_norm": 0.7365367954267793, + "learning_rate": 0.00019837997657211958, + "loss": 12.68, + "step": 3146 + }, + { + "epoch": 0.17136672724673788, + "grad_norm": 0.7836697915049639, + "learning_rate": 0.00019837839533338599, + "loss": 12.5582, + "step": 3147 + }, + { + "epoch": 0.17142118124332087, + "grad_norm": 0.694905046743744, + "learning_rate": 0.00019837681332964544, + "loss": 12.6245, + "step": 3148 + }, + { + "epoch": 0.1714756352399039, + "grad_norm": 0.7345872077956427, + "learning_rate": 0.00019837523056091035, + "loss": 12.6434, + "step": 3149 + }, + { + "epoch": 0.1715300892364869, + "grad_norm": 0.8676001382683689, + "learning_rate": 0.00019837364702719295, + "loss": 12.7774, + "step": 3150 + }, + { + "epoch": 0.1715845432330699, + "grad_norm": 0.749859401437326, + "learning_rate": 0.00019837206272850554, + "loss": 12.7502, + "step": 3151 + }, + { + "epoch": 0.17163899722965292, + "grad_norm": 0.7496417092437363, + "learning_rate": 0.00019837047766486053, + "loss": 12.7435, + "step": 3152 + }, + { + "epoch": 0.17169345122623594, + "grad_norm": 0.7099237668025427, + "learning_rate": 0.00019836889183627015, + "loss": 12.495, + "step": 3153 + }, + { + "epoch": 0.17174790522281894, + "grad_norm": 0.7285641308612487, + "learning_rate": 0.0001983673052427468, + "loss": 12.5386, + "step": 3154 + }, + { + "epoch": 0.17180235921940196, + "grad_norm": 0.7009655388375987, + "learning_rate": 0.00019836571788430275, + "loss": 12.7553, + "step": 3155 + }, + { + "epoch": 0.17185681321598498, + "grad_norm": 0.7463493574611223, + "learning_rate": 0.0001983641297609504, + "loss": 12.636, + "step": 3156 + }, + { + "epoch": 0.17191126721256797, + "grad_norm": 0.787048181874437, + "learning_rate": 0.00019836254087270206, + "loss": 12.7262, + "step": 3157 + }, + { + "epoch": 0.171965721209151, + "grad_norm": 0.814045454316862, + "learning_rate": 0.00019836095121957014, + "loss": 12.6681, + "step": 3158 + }, + { + "epoch": 0.172020175205734, + "grad_norm": 0.8867722235844003, + "learning_rate": 0.00019835936080156698, + "loss": 12.5865, + "step": 3159 + }, + { + "epoch": 0.17207462920231703, + "grad_norm": 0.7674062863483792, + "learning_rate": 0.00019835776961870492, + "loss": 12.5926, + "step": 3160 + }, + { + "epoch": 0.17212908319890002, + "grad_norm": 0.877774539505825, + "learning_rate": 0.00019835617767099633, + "loss": 12.604, + "step": 3161 + }, + { + "epoch": 0.17218353719548304, + "grad_norm": 0.7515196326795087, + "learning_rate": 0.00019835458495845362, + "loss": 12.683, + "step": 3162 + }, + { + "epoch": 0.17223799119206606, + "grad_norm": 0.7810814803217557, + "learning_rate": 0.00019835299148108918, + "loss": 12.5438, + "step": 3163 + }, + { + "epoch": 0.17229244518864906, + "grad_norm": 0.8523834949136776, + "learning_rate": 0.00019835139723891536, + "loss": 12.7393, + "step": 3164 + }, + { + "epoch": 0.17234689918523208, + "grad_norm": 0.7607407245315153, + "learning_rate": 0.0001983498022319446, + "loss": 12.7176, + "step": 3165 + }, + { + "epoch": 0.1724013531818151, + "grad_norm": 0.76191705067529, + "learning_rate": 0.00019834820646018927, + "loss": 12.5316, + "step": 3166 + }, + { + "epoch": 0.1724558071783981, + "grad_norm": 0.7260989726107212, + "learning_rate": 0.0001983466099236618, + "loss": 12.7281, + "step": 3167 + }, + { + "epoch": 0.1725102611749811, + "grad_norm": 0.7763558291867316, + "learning_rate": 0.00019834501262237458, + "loss": 12.6782, + "step": 3168 + }, + { + "epoch": 0.17256471517156413, + "grad_norm": 0.7983010911998738, + "learning_rate": 0.0001983434145563401, + "loss": 12.7117, + "step": 3169 + }, + { + "epoch": 0.17261916916814712, + "grad_norm": 0.7605819883968827, + "learning_rate": 0.00019834181572557066, + "loss": 12.7948, + "step": 3170 + }, + { + "epoch": 0.17267362316473014, + "grad_norm": 0.8374294116547367, + "learning_rate": 0.00019834021613007882, + "loss": 12.7513, + "step": 3171 + }, + { + "epoch": 0.17272807716131316, + "grad_norm": 0.6585356583533877, + "learning_rate": 0.00019833861576987698, + "loss": 12.5781, + "step": 3172 + }, + { + "epoch": 0.17278253115789616, + "grad_norm": 0.9425149602084731, + "learning_rate": 0.00019833701464497754, + "loss": 12.7727, + "step": 3173 + }, + { + "epoch": 0.17283698515447918, + "grad_norm": 0.7464082223439957, + "learning_rate": 0.000198335412755393, + "loss": 12.7494, + "step": 3174 + }, + { + "epoch": 0.1728914391510622, + "grad_norm": 0.8587155233360285, + "learning_rate": 0.00019833381010113578, + "loss": 12.5224, + "step": 3175 + }, + { + "epoch": 0.17294589314764522, + "grad_norm": 0.747952703741679, + "learning_rate": 0.0001983322066822184, + "loss": 12.4944, + "step": 3176 + }, + { + "epoch": 0.1730003471442282, + "grad_norm": 0.7827107540027283, + "learning_rate": 0.00019833060249865327, + "loss": 12.6914, + "step": 3177 + }, + { + "epoch": 0.17305480114081123, + "grad_norm": 0.7433316310571197, + "learning_rate": 0.0001983289975504529, + "loss": 12.4551, + "step": 3178 + }, + { + "epoch": 0.17310925513739425, + "grad_norm": 0.8347754411825807, + "learning_rate": 0.00019832739183762969, + "loss": 12.7181, + "step": 3179 + }, + { + "epoch": 0.17316370913397724, + "grad_norm": 0.7751754459911796, + "learning_rate": 0.00019832578536019626, + "loss": 12.6883, + "step": 3180 + }, + { + "epoch": 0.17321816313056027, + "grad_norm": 0.8741734159288691, + "learning_rate": 0.00019832417811816501, + "loss": 12.7347, + "step": 3181 + }, + { + "epoch": 0.17327261712714329, + "grad_norm": 0.8318468746826131, + "learning_rate": 0.00019832257011154845, + "loss": 12.6819, + "step": 3182 + }, + { + "epoch": 0.17332707112372628, + "grad_norm": 0.7039626157093785, + "learning_rate": 0.0001983209613403591, + "loss": 12.674, + "step": 3183 + }, + { + "epoch": 0.1733815251203093, + "grad_norm": 0.9025010506269673, + "learning_rate": 0.0001983193518046095, + "loss": 12.5735, + "step": 3184 + }, + { + "epoch": 0.17343597911689232, + "grad_norm": 0.7546511643836435, + "learning_rate": 0.00019831774150431213, + "loss": 12.6793, + "step": 3185 + }, + { + "epoch": 0.1734904331134753, + "grad_norm": 0.8394354751748568, + "learning_rate": 0.00019831613043947948, + "loss": 12.6723, + "step": 3186 + }, + { + "epoch": 0.17354488711005833, + "grad_norm": 0.7574481668785278, + "learning_rate": 0.0001983145186101241, + "loss": 12.5683, + "step": 3187 + }, + { + "epoch": 0.17359934110664135, + "grad_norm": 0.9026138172851492, + "learning_rate": 0.0001983129060162586, + "loss": 12.638, + "step": 3188 + }, + { + "epoch": 0.17365379510322435, + "grad_norm": 0.6765043031700043, + "learning_rate": 0.00019831129265789537, + "loss": 12.6092, + "step": 3189 + }, + { + "epoch": 0.17370824909980737, + "grad_norm": 0.7886541026462125, + "learning_rate": 0.0001983096785350471, + "loss": 12.7237, + "step": 3190 + }, + { + "epoch": 0.1737627030963904, + "grad_norm": 0.7844971851503274, + "learning_rate": 0.0001983080636477263, + "loss": 12.7892, + "step": 3191 + }, + { + "epoch": 0.17381715709297338, + "grad_norm": 0.6930078606339654, + "learning_rate": 0.00019830644799594547, + "loss": 12.5887, + "step": 3192 + }, + { + "epoch": 0.1738716110895564, + "grad_norm": 0.7923600954734632, + "learning_rate": 0.00019830483157971723, + "loss": 12.6886, + "step": 3193 + }, + { + "epoch": 0.17392606508613942, + "grad_norm": 0.8338987623626151, + "learning_rate": 0.00019830321439905414, + "loss": 12.7361, + "step": 3194 + }, + { + "epoch": 0.17398051908272244, + "grad_norm": 0.7749506167348895, + "learning_rate": 0.00019830159645396875, + "loss": 12.849, + "step": 3195 + }, + { + "epoch": 0.17403497307930543, + "grad_norm": 0.7390801603555475, + "learning_rate": 0.00019829997774447366, + "loss": 12.5025, + "step": 3196 + }, + { + "epoch": 0.17408942707588845, + "grad_norm": 0.6948756460703287, + "learning_rate": 0.0001982983582705815, + "loss": 12.6153, + "step": 3197 + }, + { + "epoch": 0.17414388107247147, + "grad_norm": 0.7011689581732615, + "learning_rate": 0.00019829673803230477, + "loss": 12.7191, + "step": 3198 + }, + { + "epoch": 0.17419833506905447, + "grad_norm": 0.6850588411098556, + "learning_rate": 0.00019829511702965612, + "loss": 12.5695, + "step": 3199 + }, + { + "epoch": 0.1742527890656375, + "grad_norm": 0.7668874363449892, + "learning_rate": 0.0001982934952626482, + "loss": 12.7662, + "step": 3200 + }, + { + "epoch": 0.1743072430622205, + "grad_norm": 0.7504905218030471, + "learning_rate": 0.00019829187273129355, + "loss": 12.6012, + "step": 3201 + }, + { + "epoch": 0.1743616970588035, + "grad_norm": 0.7171816702004552, + "learning_rate": 0.00019829024943560485, + "loss": 12.6831, + "step": 3202 + }, + { + "epoch": 0.17441615105538652, + "grad_norm": 0.698158237637559, + "learning_rate": 0.00019828862537559462, + "loss": 12.5953, + "step": 3203 + }, + { + "epoch": 0.17447060505196954, + "grad_norm": 0.8732362510339932, + "learning_rate": 0.0001982870005512756, + "loss": 12.6716, + "step": 3204 + }, + { + "epoch": 0.17452505904855253, + "grad_norm": 0.8154029138565516, + "learning_rate": 0.00019828537496266037, + "loss": 12.6764, + "step": 3205 + }, + { + "epoch": 0.17457951304513555, + "grad_norm": 0.7699923056388698, + "learning_rate": 0.0001982837486097616, + "loss": 12.6628, + "step": 3206 + }, + { + "epoch": 0.17463396704171857, + "grad_norm": 0.7861215643164291, + "learning_rate": 0.00019828212149259188, + "loss": 12.6717, + "step": 3207 + }, + { + "epoch": 0.17468842103830157, + "grad_norm": 0.6983338883252923, + "learning_rate": 0.0001982804936111639, + "loss": 12.6699, + "step": 3208 + }, + { + "epoch": 0.1747428750348846, + "grad_norm": 0.8298596274818021, + "learning_rate": 0.00019827886496549037, + "loss": 12.5887, + "step": 3209 + }, + { + "epoch": 0.1747973290314676, + "grad_norm": 0.7776100334544354, + "learning_rate": 0.0001982772355555839, + "loss": 12.6827, + "step": 3210 + }, + { + "epoch": 0.17485178302805063, + "grad_norm": 0.7710869955834241, + "learning_rate": 0.00019827560538145713, + "loss": 12.6449, + "step": 3211 + }, + { + "epoch": 0.17490623702463362, + "grad_norm": 0.7444032286300318, + "learning_rate": 0.0001982739744431228, + "loss": 12.6241, + "step": 3212 + }, + { + "epoch": 0.17496069102121664, + "grad_norm": 0.8584217284851458, + "learning_rate": 0.00019827234274059352, + "loss": 12.6233, + "step": 3213 + }, + { + "epoch": 0.17501514501779966, + "grad_norm": 0.7472171044808118, + "learning_rate": 0.00019827071027388207, + "loss": 12.7727, + "step": 3214 + }, + { + "epoch": 0.17506959901438265, + "grad_norm": 0.7241449526372566, + "learning_rate": 0.00019826907704300107, + "loss": 12.6059, + "step": 3215 + }, + { + "epoch": 0.17512405301096567, + "grad_norm": 0.6725069804670222, + "learning_rate": 0.00019826744304796327, + "loss": 12.5751, + "step": 3216 + }, + { + "epoch": 0.1751785070075487, + "grad_norm": 0.6894594253663173, + "learning_rate": 0.00019826580828878135, + "loss": 12.6274, + "step": 3217 + }, + { + "epoch": 0.1752329610041317, + "grad_norm": 0.7304834406583183, + "learning_rate": 0.000198264172765468, + "loss": 12.7152, + "step": 3218 + }, + { + "epoch": 0.1752874150007147, + "grad_norm": 0.8386256205995041, + "learning_rate": 0.00019826253647803598, + "loss": 12.6445, + "step": 3219 + }, + { + "epoch": 0.17534186899729773, + "grad_norm": 1.270172642023828, + "learning_rate": 0.000198260899426498, + "loss": 12.5894, + "step": 3220 + }, + { + "epoch": 0.17539632299388072, + "grad_norm": 0.7344319767853419, + "learning_rate": 0.00019825926161086679, + "loss": 12.7958, + "step": 3221 + }, + { + "epoch": 0.17545077699046374, + "grad_norm": 0.734680231052794, + "learning_rate": 0.00019825762303115506, + "loss": 12.6942, + "step": 3222 + }, + { + "epoch": 0.17550523098704676, + "grad_norm": 0.8336006775340267, + "learning_rate": 0.00019825598368737558, + "loss": 12.8308, + "step": 3223 + }, + { + "epoch": 0.17555968498362975, + "grad_norm": 0.7262882153581018, + "learning_rate": 0.00019825434357954112, + "loss": 12.6866, + "step": 3224 + }, + { + "epoch": 0.17561413898021278, + "grad_norm": 0.7392613877216176, + "learning_rate": 0.0001982527027076644, + "loss": 12.6137, + "step": 3225 + }, + { + "epoch": 0.1756685929767958, + "grad_norm": 0.6975694053545508, + "learning_rate": 0.0001982510610717582, + "loss": 12.6543, + "step": 3226 + }, + { + "epoch": 0.17572304697337882, + "grad_norm": 0.7534491680096834, + "learning_rate": 0.00019824941867183524, + "loss": 12.6165, + "step": 3227 + }, + { + "epoch": 0.1757775009699618, + "grad_norm": 0.7788563975747334, + "learning_rate": 0.00019824777550790835, + "loss": 12.5636, + "step": 3228 + }, + { + "epoch": 0.17583195496654483, + "grad_norm": 0.7506448121279704, + "learning_rate": 0.00019824613157999027, + "loss": 12.5972, + "step": 3229 + }, + { + "epoch": 0.17588640896312785, + "grad_norm": 0.6771530701488644, + "learning_rate": 0.0001982444868880938, + "loss": 12.6503, + "step": 3230 + }, + { + "epoch": 0.17594086295971084, + "grad_norm": 0.8799280336028142, + "learning_rate": 0.0001982428414322317, + "loss": 12.7639, + "step": 3231 + }, + { + "epoch": 0.17599531695629386, + "grad_norm": 0.7902749117766416, + "learning_rate": 0.0001982411952124168, + "loss": 12.768, + "step": 3232 + }, + { + "epoch": 0.17604977095287688, + "grad_norm": 0.6973865579783748, + "learning_rate": 0.00019823954822866191, + "loss": 12.5541, + "step": 3233 + }, + { + "epoch": 0.17610422494945988, + "grad_norm": 0.6975249553608402, + "learning_rate": 0.0001982379004809798, + "loss": 12.6062, + "step": 3234 + }, + { + "epoch": 0.1761586789460429, + "grad_norm": 0.7168173724680111, + "learning_rate": 0.0001982362519693833, + "loss": 12.7842, + "step": 3235 + }, + { + "epoch": 0.17621313294262592, + "grad_norm": 0.7565153526890861, + "learning_rate": 0.00019823460269388526, + "loss": 12.7059, + "step": 3236 + }, + { + "epoch": 0.1762675869392089, + "grad_norm": 0.8209036578975398, + "learning_rate": 0.00019823295265449843, + "loss": 12.6638, + "step": 3237 + }, + { + "epoch": 0.17632204093579193, + "grad_norm": 0.7981299201729861, + "learning_rate": 0.00019823130185123573, + "loss": 12.7187, + "step": 3238 + }, + { + "epoch": 0.17637649493237495, + "grad_norm": 0.7747231650804178, + "learning_rate": 0.00019822965028410994, + "loss": 12.6225, + "step": 3239 + }, + { + "epoch": 0.17643094892895794, + "grad_norm": 0.7333857540750321, + "learning_rate": 0.0001982279979531339, + "loss": 12.6742, + "step": 3240 + }, + { + "epoch": 0.17648540292554096, + "grad_norm": 0.8397253043859492, + "learning_rate": 0.0001982263448583205, + "loss": 12.7436, + "step": 3241 + }, + { + "epoch": 0.17653985692212398, + "grad_norm": 0.7815864103858401, + "learning_rate": 0.00019822469099968254, + "loss": 12.6333, + "step": 3242 + }, + { + "epoch": 0.176594310918707, + "grad_norm": 0.8209808262548153, + "learning_rate": 0.0001982230363772329, + "loss": 12.7196, + "step": 3243 + }, + { + "epoch": 0.17664876491529, + "grad_norm": 0.7262043789621151, + "learning_rate": 0.0001982213809909845, + "loss": 12.6364, + "step": 3244 + }, + { + "epoch": 0.17670321891187302, + "grad_norm": 0.8607138924882243, + "learning_rate": 0.00019821972484095017, + "loss": 12.7051, + "step": 3245 + }, + { + "epoch": 0.17675767290845604, + "grad_norm": 0.8057809652765885, + "learning_rate": 0.00019821806792714276, + "loss": 12.5776, + "step": 3246 + }, + { + "epoch": 0.17681212690503903, + "grad_norm": 0.7042559054435703, + "learning_rate": 0.0001982164102495752, + "loss": 12.4916, + "step": 3247 + }, + { + "epoch": 0.17686658090162205, + "grad_norm": 1.057714095909052, + "learning_rate": 0.00019821475180826034, + "loss": 12.668, + "step": 3248 + }, + { + "epoch": 0.17692103489820507, + "grad_norm": 0.7899993707377523, + "learning_rate": 0.00019821309260321113, + "loss": 12.5721, + "step": 3249 + }, + { + "epoch": 0.17697548889478806, + "grad_norm": 0.8471101192669516, + "learning_rate": 0.00019821143263444043, + "loss": 12.5475, + "step": 3250 + }, + { + "epoch": 0.17702994289137108, + "grad_norm": 0.6735795320454601, + "learning_rate": 0.00019820977190196116, + "loss": 12.5648, + "step": 3251 + }, + { + "epoch": 0.1770843968879541, + "grad_norm": 0.9204359196176464, + "learning_rate": 0.00019820811040578625, + "loss": 12.9584, + "step": 3252 + }, + { + "epoch": 0.1771388508845371, + "grad_norm": 0.9138396791378408, + "learning_rate": 0.00019820644814592858, + "loss": 12.6978, + "step": 3253 + }, + { + "epoch": 0.17719330488112012, + "grad_norm": 0.7301330238149067, + "learning_rate": 0.0001982047851224011, + "loss": 12.545, + "step": 3254 + }, + { + "epoch": 0.17724775887770314, + "grad_norm": 0.9013668175356127, + "learning_rate": 0.00019820312133521674, + "loss": 12.6496, + "step": 3255 + }, + { + "epoch": 0.17730221287428613, + "grad_norm": 0.9341817253221993, + "learning_rate": 0.00019820145678438845, + "loss": 12.6747, + "step": 3256 + }, + { + "epoch": 0.17735666687086915, + "grad_norm": 0.7502328588779547, + "learning_rate": 0.00019819979146992915, + "loss": 12.6536, + "step": 3257 + }, + { + "epoch": 0.17741112086745217, + "grad_norm": 0.7681904747639378, + "learning_rate": 0.0001981981253918518, + "loss": 12.7236, + "step": 3258 + }, + { + "epoch": 0.17746557486403516, + "grad_norm": 0.8920610418799703, + "learning_rate": 0.0001981964585501694, + "loss": 12.6539, + "step": 3259 + }, + { + "epoch": 0.17752002886061818, + "grad_norm": 0.7628111397992703, + "learning_rate": 0.00019819479094489482, + "loss": 12.714, + "step": 3260 + }, + { + "epoch": 0.1775744828572012, + "grad_norm": 0.7071311753164595, + "learning_rate": 0.00019819312257604113, + "loss": 12.6685, + "step": 3261 + }, + { + "epoch": 0.17762893685378423, + "grad_norm": 0.8942352428065906, + "learning_rate": 0.0001981914534436212, + "loss": 12.7463, + "step": 3262 + }, + { + "epoch": 0.17768339085036722, + "grad_norm": 0.8637475451289109, + "learning_rate": 0.0001981897835476481, + "loss": 12.5307, + "step": 3263 + }, + { + "epoch": 0.17773784484695024, + "grad_norm": 0.9470761933823619, + "learning_rate": 0.00019818811288813476, + "loss": 12.7053, + "step": 3264 + }, + { + "epoch": 0.17779229884353326, + "grad_norm": 0.7384704398906923, + "learning_rate": 0.0001981864414650942, + "loss": 12.6304, + "step": 3265 + }, + { + "epoch": 0.17784675284011625, + "grad_norm": 0.7974334541792593, + "learning_rate": 0.00019818476927853937, + "loss": 12.7698, + "step": 3266 + }, + { + "epoch": 0.17790120683669927, + "grad_norm": 0.728317539288044, + "learning_rate": 0.00019818309632848333, + "loss": 12.6083, + "step": 3267 + }, + { + "epoch": 0.1779556608332823, + "grad_norm": 0.6856666037687628, + "learning_rate": 0.00019818142261493908, + "loss": 12.6041, + "step": 3268 + }, + { + "epoch": 0.17801011482986528, + "grad_norm": 0.8080880119981516, + "learning_rate": 0.00019817974813791963, + "loss": 12.7661, + "step": 3269 + }, + { + "epoch": 0.1780645688264483, + "grad_norm": 0.7107996562679639, + "learning_rate": 0.00019817807289743798, + "loss": 12.6347, + "step": 3270 + }, + { + "epoch": 0.17811902282303133, + "grad_norm": 0.790679300367646, + "learning_rate": 0.00019817639689350715, + "loss": 12.6127, + "step": 3271 + }, + { + "epoch": 0.17817347681961432, + "grad_norm": 0.7567804862897068, + "learning_rate": 0.00019817472012614021, + "loss": 12.7102, + "step": 3272 + }, + { + "epoch": 0.17822793081619734, + "grad_norm": 0.8069231009329779, + "learning_rate": 0.00019817304259535018, + "loss": 12.7045, + "step": 3273 + }, + { + "epoch": 0.17828238481278036, + "grad_norm": 0.7595982136123148, + "learning_rate": 0.00019817136430115012, + "loss": 12.6997, + "step": 3274 + }, + { + "epoch": 0.17833683880936335, + "grad_norm": 0.7309321844207629, + "learning_rate": 0.00019816968524355306, + "loss": 12.6139, + "step": 3275 + }, + { + "epoch": 0.17839129280594637, + "grad_norm": 0.7192650353233917, + "learning_rate": 0.0001981680054225721, + "loss": 12.6906, + "step": 3276 + }, + { + "epoch": 0.1784457468025294, + "grad_norm": 0.7683459462222892, + "learning_rate": 0.0001981663248382202, + "loss": 12.758, + "step": 3277 + }, + { + "epoch": 0.1785002007991124, + "grad_norm": 0.7604309449895051, + "learning_rate": 0.00019816464349051055, + "loss": 12.708, + "step": 3278 + }, + { + "epoch": 0.1785546547956954, + "grad_norm": 0.6915703638272156, + "learning_rate": 0.00019816296137945612, + "loss": 12.638, + "step": 3279 + }, + { + "epoch": 0.17860910879227843, + "grad_norm": 1.1253518154266444, + "learning_rate": 0.00019816127850507008, + "loss": 12.7063, + "step": 3280 + }, + { + "epoch": 0.17866356278886145, + "grad_norm": 0.7630225540272567, + "learning_rate": 0.00019815959486736543, + "loss": 12.627, + "step": 3281 + }, + { + "epoch": 0.17871801678544444, + "grad_norm": 0.9935145198532191, + "learning_rate": 0.00019815791046635538, + "loss": 12.7206, + "step": 3282 + }, + { + "epoch": 0.17877247078202746, + "grad_norm": 0.736334650149649, + "learning_rate": 0.0001981562253020529, + "loss": 12.8125, + "step": 3283 + }, + { + "epoch": 0.17882692477861048, + "grad_norm": 0.7808149186506571, + "learning_rate": 0.00019815453937447117, + "loss": 12.6405, + "step": 3284 + }, + { + "epoch": 0.17888137877519347, + "grad_norm": 0.8210946819500441, + "learning_rate": 0.0001981528526836233, + "loss": 12.6571, + "step": 3285 + }, + { + "epoch": 0.1789358327717765, + "grad_norm": 0.86965004642108, + "learning_rate": 0.00019815116522952235, + "loss": 12.7401, + "step": 3286 + }, + { + "epoch": 0.1789902867683595, + "grad_norm": 0.7868117017342933, + "learning_rate": 0.0001981494770121815, + "loss": 12.5413, + "step": 3287 + }, + { + "epoch": 0.1790447407649425, + "grad_norm": 0.7415827071978902, + "learning_rate": 0.00019814778803161387, + "loss": 12.6923, + "step": 3288 + }, + { + "epoch": 0.17909919476152553, + "grad_norm": 0.6851120650197461, + "learning_rate": 0.00019814609828783258, + "loss": 12.56, + "step": 3289 + }, + { + "epoch": 0.17915364875810855, + "grad_norm": 0.8225961915115911, + "learning_rate": 0.00019814440778085076, + "loss": 12.6163, + "step": 3290 + }, + { + "epoch": 0.17920810275469154, + "grad_norm": 0.7409751466179318, + "learning_rate": 0.00019814271651068154, + "loss": 12.6108, + "step": 3291 + }, + { + "epoch": 0.17926255675127456, + "grad_norm": 0.7492258593142511, + "learning_rate": 0.0001981410244773381, + "loss": 12.5378, + "step": 3292 + }, + { + "epoch": 0.17931701074785758, + "grad_norm": 0.7253161388755572, + "learning_rate": 0.00019813933168083363, + "loss": 12.7377, + "step": 3293 + }, + { + "epoch": 0.1793714647444406, + "grad_norm": 0.8344633552358383, + "learning_rate": 0.00019813763812118125, + "loss": 12.7151, + "step": 3294 + }, + { + "epoch": 0.1794259187410236, + "grad_norm": 0.7765629199089185, + "learning_rate": 0.00019813594379839415, + "loss": 12.5645, + "step": 3295 + }, + { + "epoch": 0.1794803727376066, + "grad_norm": 0.8082931219937859, + "learning_rate": 0.00019813424871248546, + "loss": 12.7144, + "step": 3296 + }, + { + "epoch": 0.17953482673418963, + "grad_norm": 0.7589009678676194, + "learning_rate": 0.00019813255286346842, + "loss": 12.7288, + "step": 3297 + }, + { + "epoch": 0.17958928073077263, + "grad_norm": 0.790735241959672, + "learning_rate": 0.0001981308562513562, + "loss": 12.6141, + "step": 3298 + }, + { + "epoch": 0.17964373472735565, + "grad_norm": 0.7073433938535153, + "learning_rate": 0.00019812915887616194, + "loss": 12.5315, + "step": 3299 + }, + { + "epoch": 0.17969818872393867, + "grad_norm": 0.7446488567697064, + "learning_rate": 0.00019812746073789893, + "loss": 12.6907, + "step": 3300 + }, + { + "epoch": 0.17975264272052166, + "grad_norm": 0.7491534452924706, + "learning_rate": 0.0001981257618365803, + "loss": 12.6864, + "step": 3301 + }, + { + "epoch": 0.17980709671710468, + "grad_norm": 0.821475530739584, + "learning_rate": 0.0001981240621722193, + "loss": 12.6571, + "step": 3302 + }, + { + "epoch": 0.1798615507136877, + "grad_norm": 0.8260413049475901, + "learning_rate": 0.00019812236174482917, + "loss": 12.7932, + "step": 3303 + }, + { + "epoch": 0.1799160047102707, + "grad_norm": 0.8552027146811687, + "learning_rate": 0.00019812066055442307, + "loss": 12.7166, + "step": 3304 + }, + { + "epoch": 0.17997045870685371, + "grad_norm": 0.691431305896408, + "learning_rate": 0.00019811895860101424, + "loss": 12.7565, + "step": 3305 + }, + { + "epoch": 0.18002491270343673, + "grad_norm": 0.7541206039195266, + "learning_rate": 0.00019811725588461595, + "loss": 12.7832, + "step": 3306 + }, + { + "epoch": 0.18007936670001973, + "grad_norm": 0.7329115364844615, + "learning_rate": 0.00019811555240524143, + "loss": 12.6535, + "step": 3307 + }, + { + "epoch": 0.18013382069660275, + "grad_norm": 0.7848715954841441, + "learning_rate": 0.00019811384816290393, + "loss": 12.7121, + "step": 3308 + }, + { + "epoch": 0.18018827469318577, + "grad_norm": 0.7401571327497739, + "learning_rate": 0.0001981121431576167, + "loss": 12.6728, + "step": 3309 + }, + { + "epoch": 0.1802427286897688, + "grad_norm": 0.7385315373003121, + "learning_rate": 0.00019811043738939295, + "loss": 12.6642, + "step": 3310 + }, + { + "epoch": 0.18029718268635178, + "grad_norm": 0.795431374670674, + "learning_rate": 0.00019810873085824603, + "loss": 12.5564, + "step": 3311 + }, + { + "epoch": 0.1803516366829348, + "grad_norm": 0.7905420016747329, + "learning_rate": 0.00019810702356418914, + "loss": 12.7727, + "step": 3312 + }, + { + "epoch": 0.18040609067951782, + "grad_norm": 0.856463722382322, + "learning_rate": 0.00019810531550723556, + "loss": 12.7771, + "step": 3313 + }, + { + "epoch": 0.18046054467610081, + "grad_norm": 0.663682842099032, + "learning_rate": 0.00019810360668739863, + "loss": 12.5482, + "step": 3314 + }, + { + "epoch": 0.18051499867268384, + "grad_norm": 0.7304808709709537, + "learning_rate": 0.0001981018971046916, + "loss": 12.6683, + "step": 3315 + }, + { + "epoch": 0.18056945266926686, + "grad_norm": 0.8716973827013581, + "learning_rate": 0.00019810018675912774, + "loss": 12.6943, + "step": 3316 + }, + { + "epoch": 0.18062390666584985, + "grad_norm": 0.8052038652161817, + "learning_rate": 0.0001980984756507204, + "loss": 12.7423, + "step": 3317 + }, + { + "epoch": 0.18067836066243287, + "grad_norm": 0.6865640501576407, + "learning_rate": 0.00019809676377948287, + "loss": 12.5838, + "step": 3318 + }, + { + "epoch": 0.1807328146590159, + "grad_norm": 0.7787199776120453, + "learning_rate": 0.00019809505114542843, + "loss": 12.7606, + "step": 3319 + }, + { + "epoch": 0.18078726865559888, + "grad_norm": 0.6657871497753294, + "learning_rate": 0.00019809333774857045, + "loss": 12.619, + "step": 3320 + }, + { + "epoch": 0.1808417226521819, + "grad_norm": 0.7472678299687542, + "learning_rate": 0.00019809162358892218, + "loss": 12.778, + "step": 3321 + }, + { + "epoch": 0.18089617664876492, + "grad_norm": 0.7521286002919139, + "learning_rate": 0.00019808990866649703, + "loss": 12.5975, + "step": 3322 + }, + { + "epoch": 0.18095063064534792, + "grad_norm": 0.7485712192091213, + "learning_rate": 0.0001980881929813083, + "loss": 12.5831, + "step": 3323 + }, + { + "epoch": 0.18100508464193094, + "grad_norm": 0.7729968986430557, + "learning_rate": 0.00019808647653336934, + "loss": 12.6564, + "step": 3324 + }, + { + "epoch": 0.18105953863851396, + "grad_norm": 0.7335308303636128, + "learning_rate": 0.00019808475932269346, + "loss": 12.628, + "step": 3325 + }, + { + "epoch": 0.18111399263509695, + "grad_norm": 0.8883852418423221, + "learning_rate": 0.00019808304134929405, + "loss": 12.7038, + "step": 3326 + }, + { + "epoch": 0.18116844663167997, + "grad_norm": 0.8086481974347323, + "learning_rate": 0.00019808132261318447, + "loss": 12.5788, + "step": 3327 + }, + { + "epoch": 0.181222900628263, + "grad_norm": 0.7483757045666115, + "learning_rate": 0.00019807960311437809, + "loss": 12.7052, + "step": 3328 + }, + { + "epoch": 0.181277354624846, + "grad_norm": 0.7724274921487599, + "learning_rate": 0.00019807788285288824, + "loss": 12.6693, + "step": 3329 + }, + { + "epoch": 0.181331808621429, + "grad_norm": 0.741722193902972, + "learning_rate": 0.00019807616182872833, + "loss": 12.5813, + "step": 3330 + }, + { + "epoch": 0.18138626261801202, + "grad_norm": 0.7272278492201305, + "learning_rate": 0.00019807444004191175, + "loss": 12.5707, + "step": 3331 + }, + { + "epoch": 0.18144071661459504, + "grad_norm": 0.7854612846382387, + "learning_rate": 0.00019807271749245188, + "loss": 12.8055, + "step": 3332 + }, + { + "epoch": 0.18149517061117804, + "grad_norm": 0.823783487606722, + "learning_rate": 0.00019807099418036212, + "loss": 12.6623, + "step": 3333 + }, + { + "epoch": 0.18154962460776106, + "grad_norm": 0.7936635697631835, + "learning_rate": 0.00019806927010565585, + "loss": 12.5691, + "step": 3334 + }, + { + "epoch": 0.18160407860434408, + "grad_norm": 0.7383972750382136, + "learning_rate": 0.0001980675452683465, + "loss": 12.4969, + "step": 3335 + }, + { + "epoch": 0.18165853260092707, + "grad_norm": 0.8732299978656494, + "learning_rate": 0.00019806581966844746, + "loss": 12.6761, + "step": 3336 + }, + { + "epoch": 0.1817129865975101, + "grad_norm": 0.8064179318291677, + "learning_rate": 0.00019806409330597218, + "loss": 12.7272, + "step": 3337 + }, + { + "epoch": 0.1817674405940931, + "grad_norm": 0.7422343085071372, + "learning_rate": 0.00019806236618093403, + "loss": 12.6766, + "step": 3338 + }, + { + "epoch": 0.1818218945906761, + "grad_norm": 0.8175942277868645, + "learning_rate": 0.0001980606382933465, + "loss": 12.6871, + "step": 3339 + }, + { + "epoch": 0.18187634858725912, + "grad_norm": 0.7404435885300982, + "learning_rate": 0.000198058909643223, + "loss": 12.5923, + "step": 3340 + }, + { + "epoch": 0.18193080258384214, + "grad_norm": 0.8017199061331057, + "learning_rate": 0.00019805718023057695, + "loss": 12.6781, + "step": 3341 + }, + { + "epoch": 0.18198525658042514, + "grad_norm": 0.6885014995804388, + "learning_rate": 0.00019805545005542184, + "loss": 12.7684, + "step": 3342 + }, + { + "epoch": 0.18203971057700816, + "grad_norm": 0.7907680166328611, + "learning_rate": 0.00019805371911777112, + "loss": 12.8546, + "step": 3343 + }, + { + "epoch": 0.18209416457359118, + "grad_norm": 0.7060355670276862, + "learning_rate": 0.00019805198741763822, + "loss": 12.4949, + "step": 3344 + }, + { + "epoch": 0.1821486185701742, + "grad_norm": 0.9516032000572783, + "learning_rate": 0.00019805025495503662, + "loss": 12.7103, + "step": 3345 + }, + { + "epoch": 0.1822030725667572, + "grad_norm": 0.7449253439223767, + "learning_rate": 0.00019804852172997981, + "loss": 12.5891, + "step": 3346 + }, + { + "epoch": 0.1822575265633402, + "grad_norm": 0.729946216620088, + "learning_rate": 0.00019804678774248125, + "loss": 12.8393, + "step": 3347 + }, + { + "epoch": 0.18231198055992323, + "grad_norm": 0.9178332662915702, + "learning_rate": 0.00019804505299255443, + "loss": 12.6575, + "step": 3348 + }, + { + "epoch": 0.18236643455650622, + "grad_norm": 0.7749283115288399, + "learning_rate": 0.00019804331748021282, + "loss": 12.6209, + "step": 3349 + }, + { + "epoch": 0.18242088855308924, + "grad_norm": 0.9139733963471919, + "learning_rate": 0.00019804158120546994, + "loss": 12.7139, + "step": 3350 + }, + { + "epoch": 0.18247534254967226, + "grad_norm": 0.7763906942557518, + "learning_rate": 0.00019803984416833927, + "loss": 12.5467, + "step": 3351 + }, + { + "epoch": 0.18252979654625526, + "grad_norm": 0.7252198432139424, + "learning_rate": 0.00019803810636883436, + "loss": 12.6515, + "step": 3352 + }, + { + "epoch": 0.18258425054283828, + "grad_norm": 0.9978346602920308, + "learning_rate": 0.00019803636780696865, + "loss": 12.6464, + "step": 3353 + }, + { + "epoch": 0.1826387045394213, + "grad_norm": 0.7093383174056861, + "learning_rate": 0.00019803462848275573, + "loss": 12.7392, + "step": 3354 + }, + { + "epoch": 0.1826931585360043, + "grad_norm": 0.9341940050524234, + "learning_rate": 0.00019803288839620911, + "loss": 12.7433, + "step": 3355 + }, + { + "epoch": 0.1827476125325873, + "grad_norm": 0.7787445813299527, + "learning_rate": 0.00019803114754734228, + "loss": 12.6275, + "step": 3356 + }, + { + "epoch": 0.18280206652917033, + "grad_norm": 0.7238973947276285, + "learning_rate": 0.00019802940593616883, + "loss": 12.5868, + "step": 3357 + }, + { + "epoch": 0.18285652052575332, + "grad_norm": 0.8909256720217067, + "learning_rate": 0.00019802766356270227, + "loss": 12.6952, + "step": 3358 + }, + { + "epoch": 0.18291097452233634, + "grad_norm": 0.9060266000302456, + "learning_rate": 0.00019802592042695614, + "loss": 12.6288, + "step": 3359 + }, + { + "epoch": 0.18296542851891937, + "grad_norm": 0.8317096102878461, + "learning_rate": 0.00019802417652894406, + "loss": 12.8364, + "step": 3360 + }, + { + "epoch": 0.18301988251550239, + "grad_norm": 0.7046624142474951, + "learning_rate": 0.00019802243186867953, + "loss": 12.6454, + "step": 3361 + }, + { + "epoch": 0.18307433651208538, + "grad_norm": 0.7724845162821498, + "learning_rate": 0.0001980206864461761, + "loss": 12.5423, + "step": 3362 + }, + { + "epoch": 0.1831287905086684, + "grad_norm": 0.7007103141816504, + "learning_rate": 0.0001980189402614474, + "loss": 12.5998, + "step": 3363 + }, + { + "epoch": 0.18318324450525142, + "grad_norm": 0.7136727791052012, + "learning_rate": 0.000198017193314507, + "loss": 12.5667, + "step": 3364 + }, + { + "epoch": 0.1832376985018344, + "grad_norm": 0.7296237009613211, + "learning_rate": 0.00019801544560536845, + "loss": 12.6287, + "step": 3365 + }, + { + "epoch": 0.18329215249841743, + "grad_norm": 0.8388171730170142, + "learning_rate": 0.00019801369713404535, + "loss": 12.6726, + "step": 3366 + }, + { + "epoch": 0.18334660649500045, + "grad_norm": 0.7969860000049219, + "learning_rate": 0.0001980119479005513, + "loss": 12.6525, + "step": 3367 + }, + { + "epoch": 0.18340106049158345, + "grad_norm": 0.8607464188262216, + "learning_rate": 0.00019801019790489992, + "loss": 12.8035, + "step": 3368 + }, + { + "epoch": 0.18345551448816647, + "grad_norm": 0.8417130114551891, + "learning_rate": 0.00019800844714710478, + "loss": 12.6438, + "step": 3369 + }, + { + "epoch": 0.18350996848474949, + "grad_norm": 0.7454307447608005, + "learning_rate": 0.00019800669562717956, + "loss": 12.6587, + "step": 3370 + }, + { + "epoch": 0.18356442248133248, + "grad_norm": 0.7564182643950726, + "learning_rate": 0.00019800494334513781, + "loss": 12.7091, + "step": 3371 + }, + { + "epoch": 0.1836188764779155, + "grad_norm": 0.751046314032216, + "learning_rate": 0.00019800319030099318, + "loss": 12.5357, + "step": 3372 + }, + { + "epoch": 0.18367333047449852, + "grad_norm": 0.6986462694900774, + "learning_rate": 0.00019800143649475934, + "loss": 12.6685, + "step": 3373 + }, + { + "epoch": 0.1837277844710815, + "grad_norm": 0.7858772949677835, + "learning_rate": 0.00019799968192644985, + "loss": 12.7326, + "step": 3374 + }, + { + "epoch": 0.18378223846766453, + "grad_norm": 0.6825472324506604, + "learning_rate": 0.00019799792659607842, + "loss": 12.6151, + "step": 3375 + }, + { + "epoch": 0.18383669246424755, + "grad_norm": 0.7411115867200715, + "learning_rate": 0.0001979961705036587, + "loss": 12.5658, + "step": 3376 + }, + { + "epoch": 0.18389114646083055, + "grad_norm": 0.8472704769040416, + "learning_rate": 0.0001979944136492043, + "loss": 12.7508, + "step": 3377 + }, + { + "epoch": 0.18394560045741357, + "grad_norm": 0.756717561964962, + "learning_rate": 0.0001979926560327289, + "loss": 12.6634, + "step": 3378 + }, + { + "epoch": 0.1840000544539966, + "grad_norm": 0.7606214364101355, + "learning_rate": 0.00019799089765424618, + "loss": 12.5667, + "step": 3379 + }, + { + "epoch": 0.1840545084505796, + "grad_norm": 0.7270193995130935, + "learning_rate": 0.00019798913851376982, + "loss": 12.6351, + "step": 3380 + }, + { + "epoch": 0.1841089624471626, + "grad_norm": 0.6805493559802995, + "learning_rate": 0.00019798737861131347, + "loss": 12.6746, + "step": 3381 + }, + { + "epoch": 0.18416341644374562, + "grad_norm": 0.7056558366013929, + "learning_rate": 0.00019798561794689086, + "loss": 12.6943, + "step": 3382 + }, + { + "epoch": 0.18421787044032864, + "grad_norm": 0.7657220496359567, + "learning_rate": 0.0001979838565205156, + "loss": 12.6128, + "step": 3383 + }, + { + "epoch": 0.18427232443691163, + "grad_norm": 0.9039108815858633, + "learning_rate": 0.0001979820943322015, + "loss": 12.6562, + "step": 3384 + }, + { + "epoch": 0.18432677843349465, + "grad_norm": 0.86946493008817, + "learning_rate": 0.00019798033138196218, + "loss": 12.5324, + "step": 3385 + }, + { + "epoch": 0.18438123243007767, + "grad_norm": 0.7912167800969766, + "learning_rate": 0.00019797856766981135, + "loss": 12.6925, + "step": 3386 + }, + { + "epoch": 0.18443568642666067, + "grad_norm": 0.7999489254168872, + "learning_rate": 0.00019797680319576276, + "loss": 12.6894, + "step": 3387 + }, + { + "epoch": 0.1844901404232437, + "grad_norm": 0.7324214346710298, + "learning_rate": 0.00019797503795983016, + "loss": 12.6803, + "step": 3388 + }, + { + "epoch": 0.1845445944198267, + "grad_norm": 0.756609872779989, + "learning_rate": 0.00019797327196202717, + "loss": 12.6434, + "step": 3389 + }, + { + "epoch": 0.1845990484164097, + "grad_norm": 0.7015324007008875, + "learning_rate": 0.00019797150520236762, + "loss": 12.6552, + "step": 3390 + }, + { + "epoch": 0.18465350241299272, + "grad_norm": 0.7122255001024863, + "learning_rate": 0.0001979697376808652, + "loss": 12.5259, + "step": 3391 + }, + { + "epoch": 0.18470795640957574, + "grad_norm": 0.9522221558042216, + "learning_rate": 0.0001979679693975337, + "loss": 12.673, + "step": 3392 + }, + { + "epoch": 0.18476241040615873, + "grad_norm": 0.7110517403519583, + "learning_rate": 0.00019796620035238678, + "loss": 12.6476, + "step": 3393 + }, + { + "epoch": 0.18481686440274175, + "grad_norm": 0.7595909509678056, + "learning_rate": 0.00019796443054543832, + "loss": 12.6126, + "step": 3394 + }, + { + "epoch": 0.18487131839932477, + "grad_norm": 0.687617268889062, + "learning_rate": 0.000197962659976702, + "loss": 12.5402, + "step": 3395 + }, + { + "epoch": 0.1849257723959078, + "grad_norm": 0.7039148224733114, + "learning_rate": 0.00019796088864619158, + "loss": 12.7266, + "step": 3396 + }, + { + "epoch": 0.1849802263924908, + "grad_norm": 0.7788472692865894, + "learning_rate": 0.00019795911655392089, + "loss": 12.5487, + "step": 3397 + }, + { + "epoch": 0.1850346803890738, + "grad_norm": 0.7193934435877664, + "learning_rate": 0.00019795734369990368, + "loss": 12.6761, + "step": 3398 + }, + { + "epoch": 0.18508913438565683, + "grad_norm": 0.7146683469628885, + "learning_rate": 0.00019795557008415372, + "loss": 12.4951, + "step": 3399 + }, + { + "epoch": 0.18514358838223982, + "grad_norm": 0.781946581429394, + "learning_rate": 0.00019795379570668487, + "loss": 12.5245, + "step": 3400 + }, + { + "epoch": 0.18519804237882284, + "grad_norm": 0.6925827278733167, + "learning_rate": 0.00019795202056751082, + "loss": 12.6528, + "step": 3401 + }, + { + "epoch": 0.18525249637540586, + "grad_norm": 0.7960254736251621, + "learning_rate": 0.00019795024466664546, + "loss": 12.8152, + "step": 3402 + }, + { + "epoch": 0.18530695037198885, + "grad_norm": 0.7310673158847482, + "learning_rate": 0.00019794846800410255, + "loss": 12.6397, + "step": 3403 + }, + { + "epoch": 0.18536140436857187, + "grad_norm": 0.73383372208105, + "learning_rate": 0.00019794669057989595, + "loss": 12.5688, + "step": 3404 + }, + { + "epoch": 0.1854158583651549, + "grad_norm": 0.836279865981678, + "learning_rate": 0.00019794491239403944, + "loss": 12.5942, + "step": 3405 + }, + { + "epoch": 0.1854703123617379, + "grad_norm": 0.7712799032342063, + "learning_rate": 0.0001979431334465469, + "loss": 12.6425, + "step": 3406 + }, + { + "epoch": 0.1855247663583209, + "grad_norm": 0.6909617201882076, + "learning_rate": 0.0001979413537374321, + "loss": 12.588, + "step": 3407 + }, + { + "epoch": 0.18557922035490393, + "grad_norm": 0.7048457518660487, + "learning_rate": 0.0001979395732667089, + "loss": 12.6108, + "step": 3408 + }, + { + "epoch": 0.18563367435148692, + "grad_norm": 0.7466368679226278, + "learning_rate": 0.00019793779203439117, + "loss": 12.6437, + "step": 3409 + }, + { + "epoch": 0.18568812834806994, + "grad_norm": 0.7581910439112242, + "learning_rate": 0.00019793601004049273, + "loss": 12.6338, + "step": 3410 + }, + { + "epoch": 0.18574258234465296, + "grad_norm": 0.6889479667097653, + "learning_rate": 0.00019793422728502747, + "loss": 12.6501, + "step": 3411 + }, + { + "epoch": 0.18579703634123598, + "grad_norm": 0.7053095188967593, + "learning_rate": 0.00019793244376800924, + "loss": 12.6259, + "step": 3412 + }, + { + "epoch": 0.18585149033781898, + "grad_norm": 0.9028178062423818, + "learning_rate": 0.0001979306594894519, + "loss": 12.6088, + "step": 3413 + }, + { + "epoch": 0.185905944334402, + "grad_norm": 0.7099854745460383, + "learning_rate": 0.00019792887444936933, + "loss": 12.6394, + "step": 3414 + }, + { + "epoch": 0.18596039833098502, + "grad_norm": 0.6928213414862964, + "learning_rate": 0.0001979270886477754, + "loss": 12.4603, + "step": 3415 + }, + { + "epoch": 0.186014852327568, + "grad_norm": 0.6587384490980607, + "learning_rate": 0.000197925302084684, + "loss": 12.6143, + "step": 3416 + }, + { + "epoch": 0.18606930632415103, + "grad_norm": 0.7064330914174174, + "learning_rate": 0.00019792351476010905, + "loss": 12.5708, + "step": 3417 + }, + { + "epoch": 0.18612376032073405, + "grad_norm": 0.7396116906610933, + "learning_rate": 0.00019792172667406442, + "loss": 12.6673, + "step": 3418 + }, + { + "epoch": 0.18617821431731704, + "grad_norm": 0.7228600413696578, + "learning_rate": 0.000197919937826564, + "loss": 12.5551, + "step": 3419 + }, + { + "epoch": 0.18623266831390006, + "grad_norm": 0.8579167739390514, + "learning_rate": 0.00019791814821762174, + "loss": 12.6024, + "step": 3420 + }, + { + "epoch": 0.18628712231048308, + "grad_norm": 0.8120542124616528, + "learning_rate": 0.00019791635784725155, + "loss": 12.8604, + "step": 3421 + }, + { + "epoch": 0.18634157630706608, + "grad_norm": 0.7673698957399567, + "learning_rate": 0.00019791456671546732, + "loss": 12.6082, + "step": 3422 + }, + { + "epoch": 0.1863960303036491, + "grad_norm": 0.785400566174054, + "learning_rate": 0.000197912774822283, + "loss": 12.6288, + "step": 3423 + }, + { + "epoch": 0.18645048430023212, + "grad_norm": 0.7907865406949788, + "learning_rate": 0.00019791098216771254, + "loss": 12.6943, + "step": 3424 + }, + { + "epoch": 0.1865049382968151, + "grad_norm": 0.8620753392475704, + "learning_rate": 0.00019790918875176985, + "loss": 12.8335, + "step": 3425 + }, + { + "epoch": 0.18655939229339813, + "grad_norm": 0.7712543156805853, + "learning_rate": 0.0001979073945744689, + "loss": 12.6657, + "step": 3426 + }, + { + "epoch": 0.18661384628998115, + "grad_norm": 0.8981380922682508, + "learning_rate": 0.00019790559963582362, + "loss": 12.7522, + "step": 3427 + }, + { + "epoch": 0.18666830028656417, + "grad_norm": 0.7264599236766378, + "learning_rate": 0.00019790380393584797, + "loss": 12.6306, + "step": 3428 + }, + { + "epoch": 0.18672275428314716, + "grad_norm": 0.7341988030503047, + "learning_rate": 0.00019790200747455593, + "loss": 12.6287, + "step": 3429 + }, + { + "epoch": 0.18677720827973018, + "grad_norm": 0.6866438472063808, + "learning_rate": 0.00019790021025196147, + "loss": 12.6171, + "step": 3430 + }, + { + "epoch": 0.1868316622763132, + "grad_norm": 0.8609424467053118, + "learning_rate": 0.00019789841226807857, + "loss": 12.6022, + "step": 3431 + }, + { + "epoch": 0.1868861162728962, + "grad_norm": 0.735664009505058, + "learning_rate": 0.00019789661352292116, + "loss": 12.7115, + "step": 3432 + }, + { + "epoch": 0.18694057026947922, + "grad_norm": 0.8061194136233919, + "learning_rate": 0.0001978948140165033, + "loss": 12.6571, + "step": 3433 + }, + { + "epoch": 0.18699502426606224, + "grad_norm": 0.7001891909756394, + "learning_rate": 0.00019789301374883894, + "loss": 12.6699, + "step": 3434 + }, + { + "epoch": 0.18704947826264523, + "grad_norm": 0.6471394560553497, + "learning_rate": 0.00019789121271994206, + "loss": 12.6167, + "step": 3435 + }, + { + "epoch": 0.18710393225922825, + "grad_norm": 0.8114243930971807, + "learning_rate": 0.00019788941092982674, + "loss": 12.5519, + "step": 3436 + }, + { + "epoch": 0.18715838625581127, + "grad_norm": 0.7556975256688439, + "learning_rate": 0.00019788760837850694, + "loss": 12.6453, + "step": 3437 + }, + { + "epoch": 0.18721284025239426, + "grad_norm": 0.8111179653669358, + "learning_rate": 0.00019788580506599664, + "loss": 12.7268, + "step": 3438 + }, + { + "epoch": 0.18726729424897728, + "grad_norm": 0.7792236041663045, + "learning_rate": 0.00019788400099230992, + "loss": 12.471, + "step": 3439 + }, + { + "epoch": 0.1873217482455603, + "grad_norm": 0.7636202013337339, + "learning_rate": 0.00019788219615746083, + "loss": 12.7225, + "step": 3440 + }, + { + "epoch": 0.1873762022421433, + "grad_norm": 0.7703390768900522, + "learning_rate": 0.00019788039056146332, + "loss": 12.77, + "step": 3441 + }, + { + "epoch": 0.18743065623872632, + "grad_norm": 0.6989497989715391, + "learning_rate": 0.0001978785842043315, + "loss": 12.572, + "step": 3442 + }, + { + "epoch": 0.18748511023530934, + "grad_norm": 0.7531763129520263, + "learning_rate": 0.0001978767770860794, + "loss": 12.6281, + "step": 3443 + }, + { + "epoch": 0.18753956423189233, + "grad_norm": 0.7656064978206467, + "learning_rate": 0.00019787496920672107, + "loss": 12.683, + "step": 3444 + }, + { + "epoch": 0.18759401822847535, + "grad_norm": 0.6959064866014217, + "learning_rate": 0.00019787316056627053, + "loss": 12.6926, + "step": 3445 + }, + { + "epoch": 0.18764847222505837, + "grad_norm": 0.7310517195944778, + "learning_rate": 0.00019787135116474191, + "loss": 12.6944, + "step": 3446 + }, + { + "epoch": 0.1877029262216414, + "grad_norm": 0.7050888293219245, + "learning_rate": 0.00019786954100214926, + "loss": 12.6899, + "step": 3447 + }, + { + "epoch": 0.18775738021822438, + "grad_norm": 0.7331840748186392, + "learning_rate": 0.00019786773007850664, + "loss": 12.7043, + "step": 3448 + }, + { + "epoch": 0.1878118342148074, + "grad_norm": 0.8760959354346283, + "learning_rate": 0.0001978659183938281, + "loss": 12.5749, + "step": 3449 + }, + { + "epoch": 0.18786628821139043, + "grad_norm": 0.6531939284147806, + "learning_rate": 0.00019786410594812784, + "loss": 12.533, + "step": 3450 + }, + { + "epoch": 0.18792074220797342, + "grad_norm": 0.7905262008052769, + "learning_rate": 0.00019786229274141982, + "loss": 12.6418, + "step": 3451 + }, + { + "epoch": 0.18797519620455644, + "grad_norm": 0.7837407214029306, + "learning_rate": 0.00019786047877371821, + "loss": 12.4968, + "step": 3452 + }, + { + "epoch": 0.18802965020113946, + "grad_norm": 0.6829892747844226, + "learning_rate": 0.0001978586640450371, + "loss": 12.5865, + "step": 3453 + }, + { + "epoch": 0.18808410419772245, + "grad_norm": 0.7905919519547869, + "learning_rate": 0.00019785684855539066, + "loss": 12.7048, + "step": 3454 + }, + { + "epoch": 0.18813855819430547, + "grad_norm": 0.7202467701468647, + "learning_rate": 0.0001978550323047929, + "loss": 12.6159, + "step": 3455 + }, + { + "epoch": 0.1881930121908885, + "grad_norm": 0.7070151712650198, + "learning_rate": 0.00019785321529325803, + "loss": 12.654, + "step": 3456 + }, + { + "epoch": 0.18824746618747148, + "grad_norm": 0.7513944704198623, + "learning_rate": 0.0001978513975208001, + "loss": 12.6776, + "step": 3457 + }, + { + "epoch": 0.1883019201840545, + "grad_norm": 0.7102718478964493, + "learning_rate": 0.00019784957898743335, + "loss": 12.4791, + "step": 3458 + }, + { + "epoch": 0.18835637418063753, + "grad_norm": 0.6849626063754578, + "learning_rate": 0.00019784775969317183, + "loss": 12.6471, + "step": 3459 + }, + { + "epoch": 0.18841082817722052, + "grad_norm": 0.7217393791466998, + "learning_rate": 0.00019784593963802975, + "loss": 12.6078, + "step": 3460 + }, + { + "epoch": 0.18846528217380354, + "grad_norm": 0.9547534304507546, + "learning_rate": 0.0001978441188220212, + "loss": 12.6762, + "step": 3461 + }, + { + "epoch": 0.18851973617038656, + "grad_norm": 0.7400562576011662, + "learning_rate": 0.0001978422972451604, + "loss": 12.5686, + "step": 3462 + }, + { + "epoch": 0.18857419016696958, + "grad_norm": 0.7819096937512829, + "learning_rate": 0.00019784047490746146, + "loss": 12.7465, + "step": 3463 + }, + { + "epoch": 0.18862864416355257, + "grad_norm": 0.6978935409395687, + "learning_rate": 0.00019783865180893862, + "loss": 12.6414, + "step": 3464 + }, + { + "epoch": 0.1886830981601356, + "grad_norm": 0.745264747694646, + "learning_rate": 0.000197836827949606, + "loss": 12.6252, + "step": 3465 + }, + { + "epoch": 0.1887375521567186, + "grad_norm": 0.7120841832106395, + "learning_rate": 0.0001978350033294778, + "loss": 12.5873, + "step": 3466 + }, + { + "epoch": 0.1887920061533016, + "grad_norm": 0.7915147314262276, + "learning_rate": 0.00019783317794856817, + "loss": 12.4969, + "step": 3467 + }, + { + "epoch": 0.18884646014988463, + "grad_norm": 0.7766806085597008, + "learning_rate": 0.00019783135180689138, + "loss": 12.5835, + "step": 3468 + }, + { + "epoch": 0.18890091414646765, + "grad_norm": 0.8489177098808214, + "learning_rate": 0.0001978295249044616, + "loss": 12.7021, + "step": 3469 + }, + { + "epoch": 0.18895536814305064, + "grad_norm": 0.7755385665913826, + "learning_rate": 0.000197827697241293, + "loss": 12.656, + "step": 3470 + }, + { + "epoch": 0.18900982213963366, + "grad_norm": 0.7276129651295314, + "learning_rate": 0.00019782586881739983, + "loss": 12.6165, + "step": 3471 + }, + { + "epoch": 0.18906427613621668, + "grad_norm": 0.7669102297638349, + "learning_rate": 0.0001978240396327963, + "loss": 12.6595, + "step": 3472 + }, + { + "epoch": 0.18911873013279967, + "grad_norm": 0.7420823944772779, + "learning_rate": 0.00019782220968749665, + "loss": 12.5876, + "step": 3473 + }, + { + "epoch": 0.1891731841293827, + "grad_norm": 0.7468557020621868, + "learning_rate": 0.0001978203789815151, + "loss": 12.6916, + "step": 3474 + }, + { + "epoch": 0.1892276381259657, + "grad_norm": 0.7106718007187686, + "learning_rate": 0.00019781854751486582, + "loss": 12.5696, + "step": 3475 + }, + { + "epoch": 0.1892820921225487, + "grad_norm": 0.7332562200016313, + "learning_rate": 0.00019781671528756314, + "loss": 12.7741, + "step": 3476 + }, + { + "epoch": 0.18933654611913173, + "grad_norm": 0.7833748917159779, + "learning_rate": 0.00019781488229962132, + "loss": 12.689, + "step": 3477 + }, + { + "epoch": 0.18939100011571475, + "grad_norm": 0.7741583004705924, + "learning_rate": 0.0001978130485510545, + "loss": 12.7469, + "step": 3478 + }, + { + "epoch": 0.18944545411229777, + "grad_norm": 0.7177523289604667, + "learning_rate": 0.00019781121404187707, + "loss": 12.4962, + "step": 3479 + }, + { + "epoch": 0.18949990810888076, + "grad_norm": 0.809544019401128, + "learning_rate": 0.0001978093787721032, + "loss": 12.7359, + "step": 3480 + }, + { + "epoch": 0.18955436210546378, + "grad_norm": 0.8553144004000792, + "learning_rate": 0.00019780754274174723, + "loss": 12.818, + "step": 3481 + }, + { + "epoch": 0.1896088161020468, + "grad_norm": 0.7081591841393877, + "learning_rate": 0.00019780570595082336, + "loss": 12.5759, + "step": 3482 + }, + { + "epoch": 0.1896632700986298, + "grad_norm": 0.8695116509656418, + "learning_rate": 0.00019780386839934595, + "loss": 12.6974, + "step": 3483 + }, + { + "epoch": 0.18971772409521281, + "grad_norm": 0.8223589842834956, + "learning_rate": 0.00019780203008732924, + "loss": 12.6353, + "step": 3484 + }, + { + "epoch": 0.18977217809179583, + "grad_norm": 0.7446285543581759, + "learning_rate": 0.00019780019101478758, + "loss": 12.6826, + "step": 3485 + }, + { + "epoch": 0.18982663208837883, + "grad_norm": 0.6896485167003834, + "learning_rate": 0.00019779835118173523, + "loss": 12.4591, + "step": 3486 + }, + { + "epoch": 0.18988108608496185, + "grad_norm": 0.7866481478000416, + "learning_rate": 0.00019779651058818645, + "loss": 12.7327, + "step": 3487 + }, + { + "epoch": 0.18993554008154487, + "grad_norm": 0.7697027786000047, + "learning_rate": 0.00019779466923415564, + "loss": 12.5679, + "step": 3488 + }, + { + "epoch": 0.18998999407812786, + "grad_norm": 0.779594290786733, + "learning_rate": 0.00019779282711965705, + "loss": 12.643, + "step": 3489 + }, + { + "epoch": 0.19004444807471088, + "grad_norm": 0.8184831223384603, + "learning_rate": 0.00019779098424470507, + "loss": 12.6132, + "step": 3490 + }, + { + "epoch": 0.1900989020712939, + "grad_norm": 0.7787519451121007, + "learning_rate": 0.000197789140609314, + "loss": 12.7536, + "step": 3491 + }, + { + "epoch": 0.1901533560678769, + "grad_norm": 0.7319492566811413, + "learning_rate": 0.00019778729621349817, + "loss": 12.6267, + "step": 3492 + }, + { + "epoch": 0.19020781006445991, + "grad_norm": 0.9438662989100811, + "learning_rate": 0.0001977854510572719, + "loss": 12.7885, + "step": 3493 + }, + { + "epoch": 0.19026226406104293, + "grad_norm": 0.7472861900488181, + "learning_rate": 0.0001977836051406496, + "loss": 12.5697, + "step": 3494 + }, + { + "epoch": 0.19031671805762596, + "grad_norm": 0.7530749349726638, + "learning_rate": 0.00019778175846364558, + "loss": 12.7655, + "step": 3495 + }, + { + "epoch": 0.19037117205420895, + "grad_norm": 0.717289086680717, + "learning_rate": 0.00019777991102627417, + "loss": 12.7446, + "step": 3496 + }, + { + "epoch": 0.19042562605079197, + "grad_norm": 0.7927481726756973, + "learning_rate": 0.0001977780628285498, + "loss": 12.6446, + "step": 3497 + }, + { + "epoch": 0.190480080047375, + "grad_norm": 0.7975134354075496, + "learning_rate": 0.00019777621387048684, + "loss": 12.6087, + "step": 3498 + }, + { + "epoch": 0.19053453404395798, + "grad_norm": 0.7714088806598007, + "learning_rate": 0.0001977743641520996, + "loss": 12.6812, + "step": 3499 + }, + { + "epoch": 0.190588988040541, + "grad_norm": 0.7761793014499357, + "learning_rate": 0.00019777251367340254, + "loss": 12.4691, + "step": 3500 + }, + { + "epoch": 0.19064344203712402, + "grad_norm": 0.7577645649898556, + "learning_rate": 0.00019777066243441, + "loss": 12.7787, + "step": 3501 + }, + { + "epoch": 0.19069789603370702, + "grad_norm": 0.8089804177245533, + "learning_rate": 0.0001977688104351364, + "loss": 12.6915, + "step": 3502 + }, + { + "epoch": 0.19075235003029004, + "grad_norm": 0.7639733935034829, + "learning_rate": 0.00019776695767559615, + "loss": 12.6868, + "step": 3503 + }, + { + "epoch": 0.19080680402687306, + "grad_norm": 0.8485259089297601, + "learning_rate": 0.0001977651041558036, + "loss": 12.6876, + "step": 3504 + }, + { + "epoch": 0.19086125802345605, + "grad_norm": 0.7641466460154173, + "learning_rate": 0.00019776324987577323, + "loss": 12.7128, + "step": 3505 + }, + { + "epoch": 0.19091571202003907, + "grad_norm": 1.031112607794016, + "learning_rate": 0.00019776139483551944, + "loss": 12.7414, + "step": 3506 + }, + { + "epoch": 0.1909701660166221, + "grad_norm": 0.7165868709934979, + "learning_rate": 0.00019775953903505665, + "loss": 12.5893, + "step": 3507 + }, + { + "epoch": 0.19102462001320508, + "grad_norm": 0.6812113548886141, + "learning_rate": 0.00019775768247439927, + "loss": 12.5462, + "step": 3508 + }, + { + "epoch": 0.1910790740097881, + "grad_norm": 0.8234924571777671, + "learning_rate": 0.0001977558251535618, + "loss": 12.5584, + "step": 3509 + }, + { + "epoch": 0.19113352800637112, + "grad_norm": 0.7551867552780744, + "learning_rate": 0.0001977539670725586, + "loss": 12.651, + "step": 3510 + }, + { + "epoch": 0.19118798200295412, + "grad_norm": 0.8335507001913836, + "learning_rate": 0.00019775210823140416, + "loss": 12.7434, + "step": 3511 + }, + { + "epoch": 0.19124243599953714, + "grad_norm": 0.9366970779404721, + "learning_rate": 0.00019775024863011293, + "loss": 12.7378, + "step": 3512 + }, + { + "epoch": 0.19129688999612016, + "grad_norm": 0.788139924247736, + "learning_rate": 0.0001977483882686994, + "loss": 12.6645, + "step": 3513 + }, + { + "epoch": 0.19135134399270318, + "grad_norm": 1.008796680829356, + "learning_rate": 0.000197746527147178, + "loss": 12.79, + "step": 3514 + }, + { + "epoch": 0.19140579798928617, + "grad_norm": 0.7957919244741967, + "learning_rate": 0.0001977446652655632, + "loss": 12.727, + "step": 3515 + }, + { + "epoch": 0.1914602519858692, + "grad_norm": 0.7216544125020525, + "learning_rate": 0.0001977428026238695, + "loss": 12.7369, + "step": 3516 + }, + { + "epoch": 0.1915147059824522, + "grad_norm": 0.7966202844772511, + "learning_rate": 0.00019774093922211137, + "loss": 12.6249, + "step": 3517 + }, + { + "epoch": 0.1915691599790352, + "grad_norm": 0.761783076179875, + "learning_rate": 0.00019773907506030332, + "loss": 12.6354, + "step": 3518 + }, + { + "epoch": 0.19162361397561822, + "grad_norm": 0.7296691536804387, + "learning_rate": 0.0001977372101384598, + "loss": 12.6756, + "step": 3519 + }, + { + "epoch": 0.19167806797220124, + "grad_norm": 0.7538202468078059, + "learning_rate": 0.00019773534445659537, + "loss": 12.7087, + "step": 3520 + }, + { + "epoch": 0.19173252196878424, + "grad_norm": 0.8182761392904533, + "learning_rate": 0.00019773347801472452, + "loss": 12.6986, + "step": 3521 + }, + { + "epoch": 0.19178697596536726, + "grad_norm": 0.6941479890687625, + "learning_rate": 0.00019773161081286172, + "loss": 12.7346, + "step": 3522 + }, + { + "epoch": 0.19184142996195028, + "grad_norm": 0.7328857273974302, + "learning_rate": 0.00019772974285102156, + "loss": 12.707, + "step": 3523 + }, + { + "epoch": 0.19189588395853327, + "grad_norm": 0.8779759648629172, + "learning_rate": 0.00019772787412921853, + "loss": 12.6799, + "step": 3524 + }, + { + "epoch": 0.1919503379551163, + "grad_norm": 0.8102918945732309, + "learning_rate": 0.00019772600464746715, + "loss": 12.7078, + "step": 3525 + }, + { + "epoch": 0.1920047919516993, + "grad_norm": 0.7432385246336675, + "learning_rate": 0.00019772413440578197, + "loss": 12.576, + "step": 3526 + }, + { + "epoch": 0.1920592459482823, + "grad_norm": 0.7526038693013306, + "learning_rate": 0.00019772226340417754, + "loss": 12.6807, + "step": 3527 + }, + { + "epoch": 0.19211369994486532, + "grad_norm": 0.8225261794530042, + "learning_rate": 0.00019772039164266838, + "loss": 12.5654, + "step": 3528 + }, + { + "epoch": 0.19216815394144834, + "grad_norm": 0.725848010653964, + "learning_rate": 0.00019771851912126908, + "loss": 12.6766, + "step": 3529 + }, + { + "epoch": 0.19222260793803136, + "grad_norm": 0.7908395411886413, + "learning_rate": 0.00019771664583999418, + "loss": 12.5595, + "step": 3530 + }, + { + "epoch": 0.19227706193461436, + "grad_norm": 0.745581408980533, + "learning_rate": 0.00019771477179885826, + "loss": 12.7509, + "step": 3531 + }, + { + "epoch": 0.19233151593119738, + "grad_norm": 0.8018799413676838, + "learning_rate": 0.00019771289699787589, + "loss": 12.695, + "step": 3532 + }, + { + "epoch": 0.1923859699277804, + "grad_norm": 0.810353107267892, + "learning_rate": 0.00019771102143706167, + "loss": 12.6875, + "step": 3533 + }, + { + "epoch": 0.1924404239243634, + "grad_norm": 0.7348926531608867, + "learning_rate": 0.00019770914511643012, + "loss": 12.6932, + "step": 3534 + }, + { + "epoch": 0.1924948779209464, + "grad_norm": 0.8973143644014755, + "learning_rate": 0.0001977072680359959, + "loss": 12.6792, + "step": 3535 + }, + { + "epoch": 0.19254933191752943, + "grad_norm": 0.7632452197424912, + "learning_rate": 0.00019770539019577357, + "loss": 12.7185, + "step": 3536 + }, + { + "epoch": 0.19260378591411242, + "grad_norm": 0.7927514340501035, + "learning_rate": 0.00019770351159577773, + "loss": 12.5981, + "step": 3537 + }, + { + "epoch": 0.19265823991069544, + "grad_norm": 0.7068230142827054, + "learning_rate": 0.000197701632236023, + "loss": 12.8504, + "step": 3538 + }, + { + "epoch": 0.19271269390727847, + "grad_norm": 0.7134631121518779, + "learning_rate": 0.000197699752116524, + "loss": 12.6706, + "step": 3539 + }, + { + "epoch": 0.19276714790386146, + "grad_norm": 0.8083137902876478, + "learning_rate": 0.00019769787123729535, + "loss": 12.6535, + "step": 3540 + }, + { + "epoch": 0.19282160190044448, + "grad_norm": 0.7201291945896695, + "learning_rate": 0.00019769598959835168, + "loss": 12.6102, + "step": 3541 + }, + { + "epoch": 0.1928760558970275, + "grad_norm": 0.7248858192915315, + "learning_rate": 0.00019769410719970757, + "loss": 12.5687, + "step": 3542 + }, + { + "epoch": 0.1929305098936105, + "grad_norm": 0.8362500648989917, + "learning_rate": 0.00019769222404137773, + "loss": 12.7465, + "step": 3543 + }, + { + "epoch": 0.1929849638901935, + "grad_norm": 0.705968235830725, + "learning_rate": 0.00019769034012337677, + "loss": 12.5646, + "step": 3544 + }, + { + "epoch": 0.19303941788677653, + "grad_norm": 0.7222014227029238, + "learning_rate": 0.00019768845544571931, + "loss": 12.6912, + "step": 3545 + }, + { + "epoch": 0.19309387188335955, + "grad_norm": 0.8222074073217709, + "learning_rate": 0.0001976865700084201, + "loss": 12.558, + "step": 3546 + }, + { + "epoch": 0.19314832587994255, + "grad_norm": 0.803715808297221, + "learning_rate": 0.0001976846838114937, + "loss": 12.765, + "step": 3547 + }, + { + "epoch": 0.19320277987652557, + "grad_norm": 0.9054215650732284, + "learning_rate": 0.00019768279685495482, + "loss": 12.6645, + "step": 3548 + }, + { + "epoch": 0.19325723387310859, + "grad_norm": 0.7785454014057418, + "learning_rate": 0.00019768090913881815, + "loss": 12.7655, + "step": 3549 + }, + { + "epoch": 0.19331168786969158, + "grad_norm": 0.734109054191284, + "learning_rate": 0.00019767902066309832, + "loss": 12.6601, + "step": 3550 + }, + { + "epoch": 0.1933661418662746, + "grad_norm": 0.838383336586159, + "learning_rate": 0.00019767713142781007, + "loss": 12.6776, + "step": 3551 + }, + { + "epoch": 0.19342059586285762, + "grad_norm": 0.7860534851895626, + "learning_rate": 0.00019767524143296804, + "loss": 12.6996, + "step": 3552 + }, + { + "epoch": 0.1934750498594406, + "grad_norm": 0.7393473966552592, + "learning_rate": 0.00019767335067858696, + "loss": 12.7366, + "step": 3553 + }, + { + "epoch": 0.19352950385602363, + "grad_norm": 0.7676063068921294, + "learning_rate": 0.00019767145916468155, + "loss": 12.7784, + "step": 3554 + }, + { + "epoch": 0.19358395785260665, + "grad_norm": 0.6940158010728265, + "learning_rate": 0.00019766956689126647, + "loss": 12.7075, + "step": 3555 + }, + { + "epoch": 0.19363841184918965, + "grad_norm": 0.8176845403633383, + "learning_rate": 0.00019766767385835646, + "loss": 12.6988, + "step": 3556 + }, + { + "epoch": 0.19369286584577267, + "grad_norm": 0.8256601427666457, + "learning_rate": 0.00019766578006596625, + "loss": 12.705, + "step": 3557 + }, + { + "epoch": 0.1937473198423557, + "grad_norm": 0.7528657863935839, + "learning_rate": 0.00019766388551411055, + "loss": 12.5481, + "step": 3558 + }, + { + "epoch": 0.19380177383893868, + "grad_norm": 0.7907231207947625, + "learning_rate": 0.00019766199020280407, + "loss": 12.5655, + "step": 3559 + }, + { + "epoch": 0.1938562278355217, + "grad_norm": 0.8433905921188369, + "learning_rate": 0.0001976600941320616, + "loss": 12.6689, + "step": 3560 + }, + { + "epoch": 0.19391068183210472, + "grad_norm": 0.7864133499683497, + "learning_rate": 0.00019765819730189788, + "loss": 12.5944, + "step": 3561 + }, + { + "epoch": 0.19396513582868774, + "grad_norm": 0.7921089125035029, + "learning_rate": 0.00019765629971232762, + "loss": 12.7421, + "step": 3562 + }, + { + "epoch": 0.19401958982527073, + "grad_norm": 0.769961242712185, + "learning_rate": 0.00019765440136336563, + "loss": 12.5976, + "step": 3563 + }, + { + "epoch": 0.19407404382185375, + "grad_norm": 0.7854683420579901, + "learning_rate": 0.0001976525022550266, + "loss": 12.7104, + "step": 3564 + }, + { + "epoch": 0.19412849781843677, + "grad_norm": 0.7055786570242942, + "learning_rate": 0.00019765060238732533, + "loss": 12.6875, + "step": 3565 + }, + { + "epoch": 0.19418295181501977, + "grad_norm": 0.6676731987699449, + "learning_rate": 0.0001976487017602766, + "loss": 12.6477, + "step": 3566 + }, + { + "epoch": 0.1942374058116028, + "grad_norm": 0.7538115788203865, + "learning_rate": 0.0001976468003738952, + "loss": 12.7183, + "step": 3567 + }, + { + "epoch": 0.1942918598081858, + "grad_norm": 0.75439765158688, + "learning_rate": 0.00019764489822819594, + "loss": 12.6667, + "step": 3568 + }, + { + "epoch": 0.1943463138047688, + "grad_norm": 0.7020514027822288, + "learning_rate": 0.00019764299532319354, + "loss": 12.6256, + "step": 3569 + }, + { + "epoch": 0.19440076780135182, + "grad_norm": 0.7834423558108528, + "learning_rate": 0.00019764109165890283, + "loss": 12.6853, + "step": 3570 + }, + { + "epoch": 0.19445522179793484, + "grad_norm": 0.8136321623655445, + "learning_rate": 0.00019763918723533864, + "loss": 12.6909, + "step": 3571 + }, + { + "epoch": 0.19450967579451783, + "grad_norm": 0.8091687627500386, + "learning_rate": 0.00019763728205251572, + "loss": 12.8608, + "step": 3572 + }, + { + "epoch": 0.19456412979110085, + "grad_norm": 0.8657728047541944, + "learning_rate": 0.00019763537611044892, + "loss": 12.6988, + "step": 3573 + }, + { + "epoch": 0.19461858378768387, + "grad_norm": 0.7915471591130552, + "learning_rate": 0.0001976334694091531, + "loss": 12.491, + "step": 3574 + }, + { + "epoch": 0.19467303778426687, + "grad_norm": 0.7494389684777423, + "learning_rate": 0.00019763156194864306, + "loss": 12.721, + "step": 3575 + }, + { + "epoch": 0.1947274917808499, + "grad_norm": 0.73901543888061, + "learning_rate": 0.0001976296537289336, + "loss": 12.7059, + "step": 3576 + }, + { + "epoch": 0.1947819457774329, + "grad_norm": 0.7546077616744777, + "learning_rate": 0.00019762774475003955, + "loss": 12.6357, + "step": 3577 + }, + { + "epoch": 0.1948363997740159, + "grad_norm": 0.7994713205953073, + "learning_rate": 0.00019762583501197582, + "loss": 12.7197, + "step": 3578 + }, + { + "epoch": 0.19489085377059892, + "grad_norm": 0.7869063678294197, + "learning_rate": 0.00019762392451475722, + "loss": 12.6343, + "step": 3579 + }, + { + "epoch": 0.19494530776718194, + "grad_norm": 0.7443898321946537, + "learning_rate": 0.0001976220132583986, + "loss": 12.5181, + "step": 3580 + }, + { + "epoch": 0.19499976176376496, + "grad_norm": 0.6598295189631749, + "learning_rate": 0.00019762010124291484, + "loss": 12.4168, + "step": 3581 + }, + { + "epoch": 0.19505421576034795, + "grad_norm": 0.6901751080579652, + "learning_rate": 0.0001976181884683208, + "loss": 12.7246, + "step": 3582 + }, + { + "epoch": 0.19510866975693097, + "grad_norm": 0.8222129196616311, + "learning_rate": 0.00019761627493463136, + "loss": 12.6252, + "step": 3583 + }, + { + "epoch": 0.195163123753514, + "grad_norm": 0.694561116471691, + "learning_rate": 0.00019761436064186138, + "loss": 12.6051, + "step": 3584 + }, + { + "epoch": 0.195217577750097, + "grad_norm": 0.7677200945317284, + "learning_rate": 0.0001976124455900258, + "loss": 12.7399, + "step": 3585 + }, + { + "epoch": 0.19527203174668, + "grad_norm": 0.7603252493444852, + "learning_rate": 0.00019761052977913942, + "loss": 12.5219, + "step": 3586 + }, + { + "epoch": 0.19532648574326303, + "grad_norm": 0.7413408765375056, + "learning_rate": 0.00019760861320921723, + "loss": 12.7147, + "step": 3587 + }, + { + "epoch": 0.19538093973984602, + "grad_norm": 0.7076795749184328, + "learning_rate": 0.00019760669588027408, + "loss": 12.5845, + "step": 3588 + }, + { + "epoch": 0.19543539373642904, + "grad_norm": 0.7097969871713746, + "learning_rate": 0.0001976047777923249, + "loss": 12.5254, + "step": 3589 + }, + { + "epoch": 0.19548984773301206, + "grad_norm": 0.6822010258754557, + "learning_rate": 0.0001976028589453846, + "loss": 12.5417, + "step": 3590 + }, + { + "epoch": 0.19554430172959505, + "grad_norm": 0.7519241744027907, + "learning_rate": 0.00019760093933946809, + "loss": 12.5946, + "step": 3591 + }, + { + "epoch": 0.19559875572617808, + "grad_norm": 0.7397640484556057, + "learning_rate": 0.00019759901897459033, + "loss": 12.6998, + "step": 3592 + }, + { + "epoch": 0.1956532097227611, + "grad_norm": 0.7444813318525035, + "learning_rate": 0.0001975970978507662, + "loss": 12.6904, + "step": 3593 + }, + { + "epoch": 0.1957076637193441, + "grad_norm": 0.6485345517894925, + "learning_rate": 0.0001975951759680107, + "loss": 12.5317, + "step": 3594 + }, + { + "epoch": 0.1957621177159271, + "grad_norm": 0.7359840794178918, + "learning_rate": 0.00019759325332633872, + "loss": 12.683, + "step": 3595 + }, + { + "epoch": 0.19581657171251013, + "grad_norm": 0.7701808720342013, + "learning_rate": 0.00019759132992576528, + "loss": 12.7741, + "step": 3596 + }, + { + "epoch": 0.19587102570909315, + "grad_norm": 0.7356083817207066, + "learning_rate": 0.00019758940576630524, + "loss": 12.445, + "step": 3597 + }, + { + "epoch": 0.19592547970567614, + "grad_norm": 0.7055571693231458, + "learning_rate": 0.00019758748084797363, + "loss": 12.6499, + "step": 3598 + }, + { + "epoch": 0.19597993370225916, + "grad_norm": 0.7139018855523392, + "learning_rate": 0.00019758555517078544, + "loss": 12.5885, + "step": 3599 + }, + { + "epoch": 0.19603438769884218, + "grad_norm": 0.6686289752594662, + "learning_rate": 0.00019758362873475557, + "loss": 12.6985, + "step": 3600 + }, + { + "epoch": 0.19608884169542518, + "grad_norm": 0.7238026367304898, + "learning_rate": 0.00019758170153989904, + "loss": 12.6118, + "step": 3601 + }, + { + "epoch": 0.1961432956920082, + "grad_norm": 0.7231655713054286, + "learning_rate": 0.00019757977358623083, + "loss": 12.5382, + "step": 3602 + }, + { + "epoch": 0.19619774968859122, + "grad_norm": 0.7240620390875896, + "learning_rate": 0.00019757784487376597, + "loss": 12.6104, + "step": 3603 + }, + { + "epoch": 0.1962522036851742, + "grad_norm": 0.8309518403394776, + "learning_rate": 0.00019757591540251937, + "loss": 12.786, + "step": 3604 + }, + { + "epoch": 0.19630665768175723, + "grad_norm": 0.7182033651055718, + "learning_rate": 0.00019757398517250612, + "loss": 12.6781, + "step": 3605 + }, + { + "epoch": 0.19636111167834025, + "grad_norm": 0.6624124827710981, + "learning_rate": 0.0001975720541837412, + "loss": 12.6035, + "step": 3606 + }, + { + "epoch": 0.19641556567492324, + "grad_norm": 0.8103886947473927, + "learning_rate": 0.00019757012243623963, + "loss": 12.5608, + "step": 3607 + }, + { + "epoch": 0.19647001967150626, + "grad_norm": 0.6924633612516438, + "learning_rate": 0.0001975681899300164, + "loss": 12.6219, + "step": 3608 + }, + { + "epoch": 0.19652447366808928, + "grad_norm": 0.7250157464819681, + "learning_rate": 0.0001975662566650866, + "loss": 12.6682, + "step": 3609 + }, + { + "epoch": 0.19657892766467228, + "grad_norm": 0.8019069046132882, + "learning_rate": 0.0001975643226414652, + "loss": 12.5714, + "step": 3610 + }, + { + "epoch": 0.1966333816612553, + "grad_norm": 0.8310541251566375, + "learning_rate": 0.00019756238785916729, + "loss": 12.6828, + "step": 3611 + }, + { + "epoch": 0.19668783565783832, + "grad_norm": 0.6882459675921699, + "learning_rate": 0.00019756045231820784, + "loss": 12.6901, + "step": 3612 + }, + { + "epoch": 0.19674228965442134, + "grad_norm": 0.8262341699163686, + "learning_rate": 0.000197558516018602, + "loss": 12.7507, + "step": 3613 + }, + { + "epoch": 0.19679674365100433, + "grad_norm": 0.7657928247465275, + "learning_rate": 0.00019755657896036475, + "loss": 12.6165, + "step": 3614 + }, + { + "epoch": 0.19685119764758735, + "grad_norm": 0.7631725744086381, + "learning_rate": 0.0001975546411435112, + "loss": 12.5987, + "step": 3615 + }, + { + "epoch": 0.19690565164417037, + "grad_norm": 0.7345946704374849, + "learning_rate": 0.0001975527025680564, + "loss": 12.594, + "step": 3616 + }, + { + "epoch": 0.19696010564075336, + "grad_norm": 0.7681731733906019, + "learning_rate": 0.00019755076323401543, + "loss": 12.6932, + "step": 3617 + }, + { + "epoch": 0.19701455963733638, + "grad_norm": 0.8080127465641304, + "learning_rate": 0.00019754882314140335, + "loss": 12.7142, + "step": 3618 + }, + { + "epoch": 0.1970690136339194, + "grad_norm": 0.8400502525702276, + "learning_rate": 0.00019754688229023528, + "loss": 12.6429, + "step": 3619 + }, + { + "epoch": 0.1971234676305024, + "grad_norm": 0.7696932357860664, + "learning_rate": 0.00019754494068052628, + "loss": 12.6779, + "step": 3620 + }, + { + "epoch": 0.19717792162708542, + "grad_norm": 0.7362750272307614, + "learning_rate": 0.00019754299831229146, + "loss": 12.6583, + "step": 3621 + }, + { + "epoch": 0.19723237562366844, + "grad_norm": 0.9548179295323732, + "learning_rate": 0.00019754105518554594, + "loss": 12.7402, + "step": 3622 + }, + { + "epoch": 0.19728682962025143, + "grad_norm": 0.7760525845636607, + "learning_rate": 0.0001975391113003048, + "loss": 12.7086, + "step": 3623 + }, + { + "epoch": 0.19734128361683445, + "grad_norm": 0.9345824260359603, + "learning_rate": 0.0001975371666565832, + "loss": 12.653, + "step": 3624 + }, + { + "epoch": 0.19739573761341747, + "grad_norm": 0.8025717083886728, + "learning_rate": 0.00019753522125439622, + "loss": 12.6398, + "step": 3625 + }, + { + "epoch": 0.19745019161000046, + "grad_norm": 0.7705338143926113, + "learning_rate": 0.00019753327509375898, + "loss": 12.5353, + "step": 3626 + }, + { + "epoch": 0.19750464560658348, + "grad_norm": 0.9081350449246679, + "learning_rate": 0.00019753132817468667, + "loss": 12.6306, + "step": 3627 + }, + { + "epoch": 0.1975590996031665, + "grad_norm": 0.7150794542815331, + "learning_rate": 0.00019752938049719438, + "loss": 12.6129, + "step": 3628 + }, + { + "epoch": 0.19761355359974953, + "grad_norm": 0.7866917662670726, + "learning_rate": 0.0001975274320612973, + "loss": 12.7477, + "step": 3629 + }, + { + "epoch": 0.19766800759633252, + "grad_norm": 0.7704851762300791, + "learning_rate": 0.00019752548286701053, + "loss": 12.6923, + "step": 3630 + }, + { + "epoch": 0.19772246159291554, + "grad_norm": 0.830753690438923, + "learning_rate": 0.00019752353291434922, + "loss": 12.7148, + "step": 3631 + }, + { + "epoch": 0.19777691558949856, + "grad_norm": 0.9358040617724722, + "learning_rate": 0.00019752158220332858, + "loss": 12.8388, + "step": 3632 + }, + { + "epoch": 0.19783136958608155, + "grad_norm": 0.7476298552772048, + "learning_rate": 0.0001975196307339638, + "loss": 12.6567, + "step": 3633 + }, + { + "epoch": 0.19788582358266457, + "grad_norm": 0.7800103956608793, + "learning_rate": 0.00019751767850627, + "loss": 12.6094, + "step": 3634 + }, + { + "epoch": 0.1979402775792476, + "grad_norm": 0.7566188148483881, + "learning_rate": 0.00019751572552026235, + "loss": 12.6221, + "step": 3635 + }, + { + "epoch": 0.19799473157583058, + "grad_norm": 0.6787264592319261, + "learning_rate": 0.0001975137717759561, + "loss": 12.7796, + "step": 3636 + }, + { + "epoch": 0.1980491855724136, + "grad_norm": 0.7822758510904512, + "learning_rate": 0.00019751181727336637, + "loss": 12.6914, + "step": 3637 + }, + { + "epoch": 0.19810363956899663, + "grad_norm": 0.710249806995049, + "learning_rate": 0.00019750986201250842, + "loss": 12.7077, + "step": 3638 + }, + { + "epoch": 0.19815809356557962, + "grad_norm": 0.7490688178996289, + "learning_rate": 0.00019750790599339744, + "loss": 12.7766, + "step": 3639 + }, + { + "epoch": 0.19821254756216264, + "grad_norm": 0.7043863922849712, + "learning_rate": 0.00019750594921604862, + "loss": 12.6596, + "step": 3640 + }, + { + "epoch": 0.19826700155874566, + "grad_norm": 0.8152263313748491, + "learning_rate": 0.0001975039916804772, + "loss": 12.7449, + "step": 3641 + }, + { + "epoch": 0.19832145555532865, + "grad_norm": 0.7191860510140848, + "learning_rate": 0.00019750203338669836, + "loss": 12.7179, + "step": 3642 + }, + { + "epoch": 0.19837590955191167, + "grad_norm": 0.7064986821688003, + "learning_rate": 0.00019750007433472737, + "loss": 12.6284, + "step": 3643 + }, + { + "epoch": 0.1984303635484947, + "grad_norm": 1.002927774669751, + "learning_rate": 0.00019749811452457946, + "loss": 12.4528, + "step": 3644 + }, + { + "epoch": 0.19848481754507769, + "grad_norm": 0.7787618250614149, + "learning_rate": 0.00019749615395626985, + "loss": 12.6582, + "step": 3645 + }, + { + "epoch": 0.1985392715416607, + "grad_norm": 0.8308895758175826, + "learning_rate": 0.0001974941926298138, + "loss": 12.7564, + "step": 3646 + }, + { + "epoch": 0.19859372553824373, + "grad_norm": 0.697413418831933, + "learning_rate": 0.00019749223054522656, + "loss": 12.6837, + "step": 3647 + }, + { + "epoch": 0.19864817953482675, + "grad_norm": 0.6846133126736759, + "learning_rate": 0.0001974902677025234, + "loss": 12.4405, + "step": 3648 + }, + { + "epoch": 0.19870263353140974, + "grad_norm": 0.7054346767013996, + "learning_rate": 0.00019748830410171956, + "loss": 12.6491, + "step": 3649 + }, + { + "epoch": 0.19875708752799276, + "grad_norm": 0.8471597612637782, + "learning_rate": 0.00019748633974283033, + "loss": 12.7337, + "step": 3650 + }, + { + "epoch": 0.19881154152457578, + "grad_norm": 0.6812288046264803, + "learning_rate": 0.00019748437462587096, + "loss": 12.5475, + "step": 3651 + }, + { + "epoch": 0.19886599552115877, + "grad_norm": 0.8246124260906363, + "learning_rate": 0.00019748240875085672, + "loss": 12.6354, + "step": 3652 + }, + { + "epoch": 0.1989204495177418, + "grad_norm": 0.7300175077201014, + "learning_rate": 0.00019748044211780297, + "loss": 12.5718, + "step": 3653 + }, + { + "epoch": 0.1989749035143248, + "grad_norm": 0.7580912082168959, + "learning_rate": 0.00019747847472672493, + "loss": 12.6165, + "step": 3654 + }, + { + "epoch": 0.1990293575109078, + "grad_norm": 0.6839688816562365, + "learning_rate": 0.00019747650657763792, + "loss": 12.6779, + "step": 3655 + }, + { + "epoch": 0.19908381150749083, + "grad_norm": 0.7581402696591821, + "learning_rate": 0.00019747453767055725, + "loss": 12.5625, + "step": 3656 + }, + { + "epoch": 0.19913826550407385, + "grad_norm": 0.7254368507070799, + "learning_rate": 0.00019747256800549824, + "loss": 12.529, + "step": 3657 + }, + { + "epoch": 0.19919271950065684, + "grad_norm": 0.6925063557379147, + "learning_rate": 0.00019747059758247617, + "loss": 12.56, + "step": 3658 + }, + { + "epoch": 0.19924717349723986, + "grad_norm": 0.7606285519649787, + "learning_rate": 0.00019746862640150642, + "loss": 12.5587, + "step": 3659 + }, + { + "epoch": 0.19930162749382288, + "grad_norm": 0.726365152862436, + "learning_rate": 0.00019746665446260426, + "loss": 12.6198, + "step": 3660 + }, + { + "epoch": 0.19935608149040587, + "grad_norm": 0.6861309571156384, + "learning_rate": 0.00019746468176578503, + "loss": 12.7161, + "step": 3661 + }, + { + "epoch": 0.1994105354869889, + "grad_norm": 0.8451387517182866, + "learning_rate": 0.00019746270831106415, + "loss": 12.6925, + "step": 3662 + }, + { + "epoch": 0.19946498948357191, + "grad_norm": 0.8460799924334934, + "learning_rate": 0.00019746073409845685, + "loss": 12.5613, + "step": 3663 + }, + { + "epoch": 0.19951944348015493, + "grad_norm": 0.8082933138547622, + "learning_rate": 0.00019745875912797857, + "loss": 12.4263, + "step": 3664 + }, + { + "epoch": 0.19957389747673793, + "grad_norm": 0.6649570550012347, + "learning_rate": 0.00019745678339964462, + "loss": 12.512, + "step": 3665 + }, + { + "epoch": 0.19962835147332095, + "grad_norm": 0.7607425799101774, + "learning_rate": 0.00019745480691347038, + "loss": 12.7488, + "step": 3666 + }, + { + "epoch": 0.19968280546990397, + "grad_norm": 0.7754492850947158, + "learning_rate": 0.00019745282966947123, + "loss": 12.7272, + "step": 3667 + }, + { + "epoch": 0.19973725946648696, + "grad_norm": 0.7840978078878431, + "learning_rate": 0.00019745085166766253, + "loss": 12.6944, + "step": 3668 + }, + { + "epoch": 0.19979171346306998, + "grad_norm": 0.6741926222868774, + "learning_rate": 0.00019744887290805963, + "loss": 12.5978, + "step": 3669 + }, + { + "epoch": 0.199846167459653, + "grad_norm": 0.7342454477300046, + "learning_rate": 0.000197446893390678, + "loss": 12.6297, + "step": 3670 + }, + { + "epoch": 0.199900621456236, + "grad_norm": 0.8151529838473607, + "learning_rate": 0.00019744491311553296, + "loss": 12.8349, + "step": 3671 + }, + { + "epoch": 0.19995507545281901, + "grad_norm": 0.738421652122142, + "learning_rate": 0.00019744293208263995, + "loss": 12.7038, + "step": 3672 + }, + { + "epoch": 0.20000952944940203, + "grad_norm": 0.7168652829178292, + "learning_rate": 0.00019744095029201438, + "loss": 12.6497, + "step": 3673 + }, + { + "epoch": 0.20006398344598503, + "grad_norm": 0.8215008669818747, + "learning_rate": 0.0001974389677436716, + "loss": 12.6954, + "step": 3674 + }, + { + "epoch": 0.20011843744256805, + "grad_norm": 0.8097171866747358, + "learning_rate": 0.0001974369844376271, + "loss": 12.5313, + "step": 3675 + }, + { + "epoch": 0.20017289143915107, + "grad_norm": 0.7084186130888639, + "learning_rate": 0.00019743500037389624, + "loss": 12.6974, + "step": 3676 + }, + { + "epoch": 0.20022734543573406, + "grad_norm": 0.7524209666082665, + "learning_rate": 0.00019743301555249446, + "loss": 12.5611, + "step": 3677 + }, + { + "epoch": 0.20028179943231708, + "grad_norm": 0.7798370366657434, + "learning_rate": 0.00019743102997343725, + "loss": 12.5747, + "step": 3678 + }, + { + "epoch": 0.2003362534289001, + "grad_norm": 0.8022141403799491, + "learning_rate": 0.00019742904363674, + "loss": 12.7231, + "step": 3679 + }, + { + "epoch": 0.20039070742548312, + "grad_norm": 0.8906558553915508, + "learning_rate": 0.00019742705654241815, + "loss": 12.6655, + "step": 3680 + }, + { + "epoch": 0.20044516142206611, + "grad_norm": 0.6849201171068462, + "learning_rate": 0.00019742506869048718, + "loss": 12.6287, + "step": 3681 + }, + { + "epoch": 0.20049961541864914, + "grad_norm": 0.7380211492839521, + "learning_rate": 0.00019742308008096254, + "loss": 12.806, + "step": 3682 + }, + { + "epoch": 0.20055406941523216, + "grad_norm": 0.7440520912943274, + "learning_rate": 0.00019742109071385972, + "loss": 12.6976, + "step": 3683 + }, + { + "epoch": 0.20060852341181515, + "grad_norm": 0.8094904770848835, + "learning_rate": 0.0001974191005891941, + "loss": 12.7635, + "step": 3684 + }, + { + "epoch": 0.20066297740839817, + "grad_norm": 1.0039098394519197, + "learning_rate": 0.0001974171097069813, + "loss": 12.8509, + "step": 3685 + }, + { + "epoch": 0.2007174314049812, + "grad_norm": 0.720703183546859, + "learning_rate": 0.00019741511806723664, + "loss": 12.6221, + "step": 3686 + }, + { + "epoch": 0.20077188540156418, + "grad_norm": 0.8402311327294334, + "learning_rate": 0.00019741312566997572, + "loss": 12.6864, + "step": 3687 + }, + { + "epoch": 0.2008263393981472, + "grad_norm": 0.6559263382773919, + "learning_rate": 0.00019741113251521398, + "loss": 12.4228, + "step": 3688 + }, + { + "epoch": 0.20088079339473022, + "grad_norm": 0.721615468741822, + "learning_rate": 0.00019740913860296697, + "loss": 12.6649, + "step": 3689 + }, + { + "epoch": 0.20093524739131322, + "grad_norm": 0.7573016725666538, + "learning_rate": 0.00019740714393325014, + "loss": 12.5624, + "step": 3690 + }, + { + "epoch": 0.20098970138789624, + "grad_norm": 0.8287294148461015, + "learning_rate": 0.00019740514850607904, + "loss": 12.7974, + "step": 3691 + }, + { + "epoch": 0.20104415538447926, + "grad_norm": 0.7486278885824178, + "learning_rate": 0.00019740315232146913, + "loss": 12.6035, + "step": 3692 + }, + { + "epoch": 0.20109860938106225, + "grad_norm": 0.7321991888679197, + "learning_rate": 0.00019740115537943603, + "loss": 12.7047, + "step": 3693 + }, + { + "epoch": 0.20115306337764527, + "grad_norm": 0.7230726632935411, + "learning_rate": 0.00019739915767999518, + "loss": 12.6969, + "step": 3694 + }, + { + "epoch": 0.2012075173742283, + "grad_norm": 0.8336807145808396, + "learning_rate": 0.00019739715922316214, + "loss": 12.7886, + "step": 3695 + }, + { + "epoch": 0.2012619713708113, + "grad_norm": 0.8261360393578282, + "learning_rate": 0.00019739516000895246, + "loss": 12.7838, + "step": 3696 + }, + { + "epoch": 0.2013164253673943, + "grad_norm": 0.7161373279083758, + "learning_rate": 0.00019739316003738167, + "loss": 12.6303, + "step": 3697 + }, + { + "epoch": 0.20137087936397732, + "grad_norm": 0.6807051699186523, + "learning_rate": 0.00019739115930846537, + "loss": 12.626, + "step": 3698 + }, + { + "epoch": 0.20142533336056034, + "grad_norm": 0.6295910457426575, + "learning_rate": 0.00019738915782221907, + "loss": 12.5255, + "step": 3699 + }, + { + "epoch": 0.20147978735714334, + "grad_norm": 0.7683727689702888, + "learning_rate": 0.00019738715557865834, + "loss": 12.7087, + "step": 3700 + }, + { + "epoch": 0.20153424135372636, + "grad_norm": 0.744167704136972, + "learning_rate": 0.00019738515257779877, + "loss": 12.5179, + "step": 3701 + }, + { + "epoch": 0.20158869535030938, + "grad_norm": 0.69197717897407, + "learning_rate": 0.0001973831488196559, + "loss": 12.4385, + "step": 3702 + }, + { + "epoch": 0.20164314934689237, + "grad_norm": 0.7643508370180074, + "learning_rate": 0.00019738114430424534, + "loss": 12.5915, + "step": 3703 + }, + { + "epoch": 0.2016976033434754, + "grad_norm": 0.7776169332587081, + "learning_rate": 0.00019737913903158268, + "loss": 12.7894, + "step": 3704 + }, + { + "epoch": 0.2017520573400584, + "grad_norm": 0.7447411020234989, + "learning_rate": 0.0001973771330016835, + "loss": 12.5682, + "step": 3705 + }, + { + "epoch": 0.2018065113366414, + "grad_norm": 0.7246836967348779, + "learning_rate": 0.0001973751262145634, + "loss": 12.6121, + "step": 3706 + }, + { + "epoch": 0.20186096533322442, + "grad_norm": 0.7535649752440313, + "learning_rate": 0.00019737311867023798, + "loss": 12.6519, + "step": 3707 + }, + { + "epoch": 0.20191541932980744, + "grad_norm": 0.7390233694094689, + "learning_rate": 0.0001973711103687229, + "loss": 12.8212, + "step": 3708 + }, + { + "epoch": 0.20196987332639044, + "grad_norm": 0.8148239607128785, + "learning_rate": 0.00019736910131003369, + "loss": 12.6742, + "step": 3709 + }, + { + "epoch": 0.20202432732297346, + "grad_norm": 0.7451607895808458, + "learning_rate": 0.00019736709149418603, + "loss": 12.7395, + "step": 3710 + }, + { + "epoch": 0.20207878131955648, + "grad_norm": 0.7093270748909235, + "learning_rate": 0.00019736508092119554, + "loss": 12.644, + "step": 3711 + }, + { + "epoch": 0.20213323531613947, + "grad_norm": 0.8758348356705866, + "learning_rate": 0.00019736306959107787, + "loss": 12.8661, + "step": 3712 + }, + { + "epoch": 0.2021876893127225, + "grad_norm": 0.6816408808300222, + "learning_rate": 0.00019736105750384864, + "loss": 12.6664, + "step": 3713 + }, + { + "epoch": 0.2022421433093055, + "grad_norm": 0.7525269361120305, + "learning_rate": 0.00019735904465952348, + "loss": 12.6835, + "step": 3714 + }, + { + "epoch": 0.20229659730588853, + "grad_norm": 0.8220374711376354, + "learning_rate": 0.00019735703105811807, + "loss": 12.6514, + "step": 3715 + }, + { + "epoch": 0.20235105130247152, + "grad_norm": 0.6714431280177346, + "learning_rate": 0.00019735501669964806, + "loss": 12.6893, + "step": 3716 + }, + { + "epoch": 0.20240550529905454, + "grad_norm": 0.9556940251485908, + "learning_rate": 0.00019735300158412911, + "loss": 12.7626, + "step": 3717 + }, + { + "epoch": 0.20245995929563756, + "grad_norm": 0.7312910563842567, + "learning_rate": 0.0001973509857115769, + "loss": 12.5876, + "step": 3718 + }, + { + "epoch": 0.20251441329222056, + "grad_norm": 0.7475186407951433, + "learning_rate": 0.0001973489690820071, + "loss": 12.6019, + "step": 3719 + }, + { + "epoch": 0.20256886728880358, + "grad_norm": 0.7941381101364623, + "learning_rate": 0.0001973469516954354, + "loss": 12.648, + "step": 3720 + }, + { + "epoch": 0.2026233212853866, + "grad_norm": 0.7493618173219636, + "learning_rate": 0.00019734493355187747, + "loss": 12.5324, + "step": 3721 + }, + { + "epoch": 0.2026777752819696, + "grad_norm": 0.7908043493982647, + "learning_rate": 0.00019734291465134903, + "loss": 12.6918, + "step": 3722 + }, + { + "epoch": 0.2027322292785526, + "grad_norm": 0.9627774794856256, + "learning_rate": 0.00019734089499386573, + "loss": 12.6197, + "step": 3723 + }, + { + "epoch": 0.20278668327513563, + "grad_norm": 0.7098037134646387, + "learning_rate": 0.0001973388745794433, + "loss": 12.6087, + "step": 3724 + }, + { + "epoch": 0.20284113727171862, + "grad_norm": 0.7636725436853691, + "learning_rate": 0.0001973368534080975, + "loss": 12.6153, + "step": 3725 + }, + { + "epoch": 0.20289559126830164, + "grad_norm": 0.7626349542601414, + "learning_rate": 0.00019733483147984395, + "loss": 12.6811, + "step": 3726 + }, + { + "epoch": 0.20295004526488467, + "grad_norm": 0.74736286062578, + "learning_rate": 0.00019733280879469847, + "loss": 12.6376, + "step": 3727 + }, + { + "epoch": 0.20300449926146766, + "grad_norm": 0.7195779098222057, + "learning_rate": 0.00019733078535267673, + "loss": 12.6022, + "step": 3728 + }, + { + "epoch": 0.20305895325805068, + "grad_norm": 0.7584375166225391, + "learning_rate": 0.00019732876115379449, + "loss": 12.6474, + "step": 3729 + }, + { + "epoch": 0.2031134072546337, + "grad_norm": 0.6761342916077878, + "learning_rate": 0.00019732673619806746, + "loss": 12.5903, + "step": 3730 + }, + { + "epoch": 0.20316786125121672, + "grad_norm": 0.7292770150961099, + "learning_rate": 0.00019732471048551143, + "loss": 12.5185, + "step": 3731 + }, + { + "epoch": 0.2032223152477997, + "grad_norm": 0.6717013120758528, + "learning_rate": 0.00019732268401614214, + "loss": 12.7058, + "step": 3732 + }, + { + "epoch": 0.20327676924438273, + "grad_norm": 0.7336554411162386, + "learning_rate": 0.00019732065678997529, + "loss": 12.6187, + "step": 3733 + }, + { + "epoch": 0.20333122324096575, + "grad_norm": 0.6693441989255688, + "learning_rate": 0.00019731862880702675, + "loss": 12.5719, + "step": 3734 + }, + { + "epoch": 0.20338567723754875, + "grad_norm": 0.7673519285327757, + "learning_rate": 0.0001973166000673122, + "loss": 12.6662, + "step": 3735 + }, + { + "epoch": 0.20344013123413177, + "grad_norm": 0.6610446427454186, + "learning_rate": 0.00019731457057084746, + "loss": 12.6042, + "step": 3736 + }, + { + "epoch": 0.2034945852307148, + "grad_norm": 0.7575110207153776, + "learning_rate": 0.0001973125403176483, + "loss": 12.6657, + "step": 3737 + }, + { + "epoch": 0.20354903922729778, + "grad_norm": 0.7236274014841737, + "learning_rate": 0.00019731050930773048, + "loss": 12.665, + "step": 3738 + }, + { + "epoch": 0.2036034932238808, + "grad_norm": 0.7145867625466173, + "learning_rate": 0.00019730847754110983, + "loss": 12.6669, + "step": 3739 + }, + { + "epoch": 0.20365794722046382, + "grad_norm": 0.7723555587156568, + "learning_rate": 0.00019730644501780216, + "loss": 12.5307, + "step": 3740 + }, + { + "epoch": 0.2037124012170468, + "grad_norm": 0.7367181865589261, + "learning_rate": 0.00019730441173782323, + "loss": 12.5939, + "step": 3741 + }, + { + "epoch": 0.20376685521362983, + "grad_norm": 0.7185443582889142, + "learning_rate": 0.0001973023777011889, + "loss": 12.5633, + "step": 3742 + }, + { + "epoch": 0.20382130921021285, + "grad_norm": 0.9380030962463133, + "learning_rate": 0.00019730034290791495, + "loss": 12.8148, + "step": 3743 + }, + { + "epoch": 0.20387576320679585, + "grad_norm": 0.7097460165179456, + "learning_rate": 0.00019729830735801723, + "loss": 12.5936, + "step": 3744 + }, + { + "epoch": 0.20393021720337887, + "grad_norm": 0.7630980977859622, + "learning_rate": 0.00019729627105151157, + "loss": 12.5326, + "step": 3745 + }, + { + "epoch": 0.2039846711999619, + "grad_norm": 0.7874843041862235, + "learning_rate": 0.00019729423398841375, + "loss": 12.7463, + "step": 3746 + }, + { + "epoch": 0.2040391251965449, + "grad_norm": 0.7706619050603203, + "learning_rate": 0.00019729219616873965, + "loss": 12.5942, + "step": 3747 + }, + { + "epoch": 0.2040935791931279, + "grad_norm": 0.8042598954022532, + "learning_rate": 0.00019729015759250516, + "loss": 12.6013, + "step": 3748 + }, + { + "epoch": 0.20414803318971092, + "grad_norm": 0.7745473764242377, + "learning_rate": 0.00019728811825972604, + "loss": 12.5237, + "step": 3749 + }, + { + "epoch": 0.20420248718629394, + "grad_norm": 0.7852220734332034, + "learning_rate": 0.0001972860781704182, + "loss": 12.652, + "step": 3750 + }, + { + "epoch": 0.20425694118287693, + "grad_norm": 0.7271940145806121, + "learning_rate": 0.00019728403732459756, + "loss": 12.6442, + "step": 3751 + }, + { + "epoch": 0.20431139517945995, + "grad_norm": 0.796056460138645, + "learning_rate": 0.00019728199572227988, + "loss": 12.4352, + "step": 3752 + }, + { + "epoch": 0.20436584917604297, + "grad_norm": 0.9419382278691907, + "learning_rate": 0.0001972799533634811, + "loss": 12.5889, + "step": 3753 + }, + { + "epoch": 0.20442030317262597, + "grad_norm": 0.687316538090516, + "learning_rate": 0.0001972779102482171, + "loss": 12.5692, + "step": 3754 + }, + { + "epoch": 0.204474757169209, + "grad_norm": 0.7396866618011716, + "learning_rate": 0.00019727586637650373, + "loss": 12.5719, + "step": 3755 + }, + { + "epoch": 0.204529211165792, + "grad_norm": 0.8042947033928121, + "learning_rate": 0.00019727382174835692, + "loss": 12.6433, + "step": 3756 + }, + { + "epoch": 0.204583665162375, + "grad_norm": 0.783706359978815, + "learning_rate": 0.00019727177636379257, + "loss": 12.6691, + "step": 3757 + }, + { + "epoch": 0.20463811915895802, + "grad_norm": 0.8823304070658445, + "learning_rate": 0.00019726973022282657, + "loss": 12.6565, + "step": 3758 + }, + { + "epoch": 0.20469257315554104, + "grad_norm": 0.7065231139049295, + "learning_rate": 0.00019726768332547484, + "loss": 12.642, + "step": 3759 + }, + { + "epoch": 0.20474702715212403, + "grad_norm": 0.8411424177529961, + "learning_rate": 0.00019726563567175326, + "loss": 12.6264, + "step": 3760 + }, + { + "epoch": 0.20480148114870705, + "grad_norm": 0.7193541612205515, + "learning_rate": 0.00019726358726167783, + "loss": 12.7119, + "step": 3761 + }, + { + "epoch": 0.20485593514529007, + "grad_norm": 0.826046012175022, + "learning_rate": 0.0001972615380952644, + "loss": 12.6619, + "step": 3762 + }, + { + "epoch": 0.2049103891418731, + "grad_norm": 0.8003066900229244, + "learning_rate": 0.00019725948817252896, + "loss": 12.6848, + "step": 3763 + }, + { + "epoch": 0.2049648431384561, + "grad_norm": 0.707534024551744, + "learning_rate": 0.00019725743749348743, + "loss": 12.6277, + "step": 3764 + }, + { + "epoch": 0.2050192971350391, + "grad_norm": 0.7530481312430138, + "learning_rate": 0.00019725538605815573, + "loss": 12.6478, + "step": 3765 + }, + { + "epoch": 0.20507375113162213, + "grad_norm": 0.7858723007595092, + "learning_rate": 0.00019725333386654987, + "loss": 12.5266, + "step": 3766 + }, + { + "epoch": 0.20512820512820512, + "grad_norm": 0.7056438702746797, + "learning_rate": 0.00019725128091868576, + "loss": 12.5997, + "step": 3767 + }, + { + "epoch": 0.20518265912478814, + "grad_norm": 0.8041505336620463, + "learning_rate": 0.00019724922721457938, + "loss": 12.7159, + "step": 3768 + }, + { + "epoch": 0.20523711312137116, + "grad_norm": 0.7398059661675201, + "learning_rate": 0.00019724717275424673, + "loss": 12.6444, + "step": 3769 + }, + { + "epoch": 0.20529156711795415, + "grad_norm": 0.6868492068938551, + "learning_rate": 0.00019724511753770374, + "loss": 12.6264, + "step": 3770 + }, + { + "epoch": 0.20534602111453717, + "grad_norm": 0.7634898216311271, + "learning_rate": 0.0001972430615649664, + "loss": 12.629, + "step": 3771 + }, + { + "epoch": 0.2054004751111202, + "grad_norm": 0.8049756446724106, + "learning_rate": 0.00019724100483605069, + "loss": 12.7396, + "step": 3772 + }, + { + "epoch": 0.2054549291077032, + "grad_norm": 0.7387584826388791, + "learning_rate": 0.00019723894735097262, + "loss": 12.5023, + "step": 3773 + }, + { + "epoch": 0.2055093831042862, + "grad_norm": 0.6968592334321247, + "learning_rate": 0.00019723688910974822, + "loss": 12.6262, + "step": 3774 + }, + { + "epoch": 0.20556383710086923, + "grad_norm": 0.7328486248615598, + "learning_rate": 0.00019723483011239345, + "loss": 12.6452, + "step": 3775 + }, + { + "epoch": 0.20561829109745222, + "grad_norm": 0.8873669973092565, + "learning_rate": 0.00019723277035892434, + "loss": 12.4639, + "step": 3776 + }, + { + "epoch": 0.20567274509403524, + "grad_norm": 0.800906708179299, + "learning_rate": 0.00019723070984935687, + "loss": 12.6349, + "step": 3777 + }, + { + "epoch": 0.20572719909061826, + "grad_norm": 0.7353268544823438, + "learning_rate": 0.00019722864858370714, + "loss": 12.6264, + "step": 3778 + }, + { + "epoch": 0.20578165308720126, + "grad_norm": 0.7185208506391326, + "learning_rate": 0.00019722658656199112, + "loss": 12.7086, + "step": 3779 + }, + { + "epoch": 0.20583610708378428, + "grad_norm": 0.7791559699979477, + "learning_rate": 0.00019722452378422484, + "loss": 12.6312, + "step": 3780 + }, + { + "epoch": 0.2058905610803673, + "grad_norm": 0.7679050894103957, + "learning_rate": 0.00019722246025042438, + "loss": 12.6771, + "step": 3781 + }, + { + "epoch": 0.20594501507695032, + "grad_norm": 0.7874917981449171, + "learning_rate": 0.00019722039596060573, + "loss": 12.5139, + "step": 3782 + }, + { + "epoch": 0.2059994690735333, + "grad_norm": 0.7991378424352726, + "learning_rate": 0.00019721833091478498, + "loss": 12.7016, + "step": 3783 + }, + { + "epoch": 0.20605392307011633, + "grad_norm": 0.6959229181476858, + "learning_rate": 0.0001972162651129782, + "loss": 12.746, + "step": 3784 + }, + { + "epoch": 0.20610837706669935, + "grad_norm": 0.8788473918123754, + "learning_rate": 0.0001972141985552015, + "loss": 12.6087, + "step": 3785 + }, + { + "epoch": 0.20616283106328234, + "grad_norm": 0.7331671459525739, + "learning_rate": 0.0001972121312414708, + "loss": 12.3949, + "step": 3786 + }, + { + "epoch": 0.20621728505986536, + "grad_norm": 0.7483671298906623, + "learning_rate": 0.0001972100631718023, + "loss": 12.6329, + "step": 3787 + }, + { + "epoch": 0.20627173905644838, + "grad_norm": 0.7858065652580846, + "learning_rate": 0.00019720799434621206, + "loss": 12.512, + "step": 3788 + }, + { + "epoch": 0.20632619305303138, + "grad_norm": 0.8633490504031776, + "learning_rate": 0.00019720592476471613, + "loss": 12.7427, + "step": 3789 + }, + { + "epoch": 0.2063806470496144, + "grad_norm": 0.7954996022566327, + "learning_rate": 0.00019720385442733063, + "loss": 12.7359, + "step": 3790 + }, + { + "epoch": 0.20643510104619742, + "grad_norm": 0.7359411612203388, + "learning_rate": 0.00019720178333407166, + "loss": 12.5897, + "step": 3791 + }, + { + "epoch": 0.2064895550427804, + "grad_norm": 0.834858116614354, + "learning_rate": 0.00019719971148495535, + "loss": 12.7141, + "step": 3792 + }, + { + "epoch": 0.20654400903936343, + "grad_norm": 0.8172693244285171, + "learning_rate": 0.00019719763887999774, + "loss": 12.6519, + "step": 3793 + }, + { + "epoch": 0.20659846303594645, + "grad_norm": 0.7447324971926574, + "learning_rate": 0.00019719556551921503, + "loss": 12.7726, + "step": 3794 + }, + { + "epoch": 0.20665291703252944, + "grad_norm": 0.790954552773982, + "learning_rate": 0.00019719349140262326, + "loss": 12.648, + "step": 3795 + }, + { + "epoch": 0.20670737102911246, + "grad_norm": 0.7456368748812312, + "learning_rate": 0.00019719141653023865, + "loss": 12.5901, + "step": 3796 + }, + { + "epoch": 0.20676182502569548, + "grad_norm": 0.7694652585477508, + "learning_rate": 0.00019718934090207725, + "loss": 12.5993, + "step": 3797 + }, + { + "epoch": 0.2068162790222785, + "grad_norm": 0.6836342209484225, + "learning_rate": 0.00019718726451815524, + "loss": 12.6304, + "step": 3798 + }, + { + "epoch": 0.2068707330188615, + "grad_norm": 0.7862179588431599, + "learning_rate": 0.00019718518737848876, + "loss": 12.5865, + "step": 3799 + }, + { + "epoch": 0.20692518701544452, + "grad_norm": 0.7866118126659232, + "learning_rate": 0.000197183109483094, + "loss": 12.7151, + "step": 3800 + }, + { + "epoch": 0.20697964101202754, + "grad_norm": 0.7387733279981449, + "learning_rate": 0.00019718103083198705, + "loss": 12.5917, + "step": 3801 + }, + { + "epoch": 0.20703409500861053, + "grad_norm": 0.7673889903132298, + "learning_rate": 0.0001971789514251841, + "loss": 12.6231, + "step": 3802 + }, + { + "epoch": 0.20708854900519355, + "grad_norm": 0.7998345043574764, + "learning_rate": 0.00019717687126270133, + "loss": 12.556, + "step": 3803 + }, + { + "epoch": 0.20714300300177657, + "grad_norm": 0.6636026554154101, + "learning_rate": 0.00019717479034455493, + "loss": 12.5632, + "step": 3804 + }, + { + "epoch": 0.20719745699835956, + "grad_norm": 0.7661527649226619, + "learning_rate": 0.00019717270867076106, + "loss": 12.5358, + "step": 3805 + }, + { + "epoch": 0.20725191099494258, + "grad_norm": 0.8041290947239476, + "learning_rate": 0.00019717062624133593, + "loss": 12.779, + "step": 3806 + }, + { + "epoch": 0.2073063649915256, + "grad_norm": 0.8016300510471224, + "learning_rate": 0.00019716854305629569, + "loss": 12.8196, + "step": 3807 + }, + { + "epoch": 0.2073608189881086, + "grad_norm": 0.7844723455880096, + "learning_rate": 0.00019716645911565657, + "loss": 12.7492, + "step": 3808 + }, + { + "epoch": 0.20741527298469162, + "grad_norm": 0.646050380941143, + "learning_rate": 0.00019716437441943477, + "loss": 12.6152, + "step": 3809 + }, + { + "epoch": 0.20746972698127464, + "grad_norm": 0.7273011653324999, + "learning_rate": 0.0001971622889676465, + "loss": 12.5066, + "step": 3810 + }, + { + "epoch": 0.20752418097785763, + "grad_norm": 0.6956035958581436, + "learning_rate": 0.00019716020276030796, + "loss": 12.5895, + "step": 3811 + }, + { + "epoch": 0.20757863497444065, + "grad_norm": 0.68685569219118, + "learning_rate": 0.00019715811579743543, + "loss": 12.613, + "step": 3812 + }, + { + "epoch": 0.20763308897102367, + "grad_norm": 0.6921888162595382, + "learning_rate": 0.00019715602807904504, + "loss": 12.6016, + "step": 3813 + }, + { + "epoch": 0.2076875429676067, + "grad_norm": 0.7068906851597578, + "learning_rate": 0.0001971539396051531, + "loss": 12.5869, + "step": 3814 + }, + { + "epoch": 0.20774199696418968, + "grad_norm": 0.6871538704273896, + "learning_rate": 0.00019715185037577586, + "loss": 12.5944, + "step": 3815 + }, + { + "epoch": 0.2077964509607727, + "grad_norm": 0.7600040985296437, + "learning_rate": 0.00019714976039092954, + "loss": 12.5654, + "step": 3816 + }, + { + "epoch": 0.20785090495735573, + "grad_norm": 0.6744308421520104, + "learning_rate": 0.00019714766965063036, + "loss": 12.5306, + "step": 3817 + }, + { + "epoch": 0.20790535895393872, + "grad_norm": 0.7878816728043071, + "learning_rate": 0.00019714557815489462, + "loss": 12.793, + "step": 3818 + }, + { + "epoch": 0.20795981295052174, + "grad_norm": 0.8099819105548215, + "learning_rate": 0.0001971434859037386, + "loss": 12.7286, + "step": 3819 + }, + { + "epoch": 0.20801426694710476, + "grad_norm": 0.6456719660154859, + "learning_rate": 0.0001971413928971785, + "loss": 12.6326, + "step": 3820 + }, + { + "epoch": 0.20806872094368775, + "grad_norm": 0.8364155602205413, + "learning_rate": 0.00019713929913523068, + "loss": 12.6638, + "step": 3821 + }, + { + "epoch": 0.20812317494027077, + "grad_norm": 0.6785665463512941, + "learning_rate": 0.00019713720461791135, + "loss": 12.6289, + "step": 3822 + }, + { + "epoch": 0.2081776289368538, + "grad_norm": 0.7327840517026947, + "learning_rate": 0.00019713510934523683, + "loss": 12.6189, + "step": 3823 + }, + { + "epoch": 0.20823208293343679, + "grad_norm": 0.7269210860075299, + "learning_rate": 0.00019713301331722343, + "loss": 12.749, + "step": 3824 + }, + { + "epoch": 0.2082865369300198, + "grad_norm": 0.7482373158985721, + "learning_rate": 0.0001971309165338874, + "loss": 12.6211, + "step": 3825 + }, + { + "epoch": 0.20834099092660283, + "grad_norm": 0.6911904425702748, + "learning_rate": 0.0001971288189952451, + "loss": 12.5941, + "step": 3826 + }, + { + "epoch": 0.20839544492318582, + "grad_norm": 0.8193143300411501, + "learning_rate": 0.0001971267207013128, + "loss": 12.6969, + "step": 3827 + }, + { + "epoch": 0.20844989891976884, + "grad_norm": 0.711827074497607, + "learning_rate": 0.00019712462165210684, + "loss": 12.6972, + "step": 3828 + }, + { + "epoch": 0.20850435291635186, + "grad_norm": 0.7307796661654593, + "learning_rate": 0.00019712252184764354, + "loss": 12.6634, + "step": 3829 + }, + { + "epoch": 0.20855880691293488, + "grad_norm": 0.750460068624912, + "learning_rate": 0.00019712042128793922, + "loss": 12.6106, + "step": 3830 + }, + { + "epoch": 0.20861326090951787, + "grad_norm": 0.7586157633791372, + "learning_rate": 0.0001971183199730102, + "loss": 12.8095, + "step": 3831 + }, + { + "epoch": 0.2086677149061009, + "grad_norm": 0.7641058986350736, + "learning_rate": 0.00019711621790287286, + "loss": 12.6426, + "step": 3832 + }, + { + "epoch": 0.2087221689026839, + "grad_norm": 0.7337260905000521, + "learning_rate": 0.00019711411507754352, + "loss": 12.4951, + "step": 3833 + }, + { + "epoch": 0.2087766228992669, + "grad_norm": 0.7356825958053724, + "learning_rate": 0.0001971120114970385, + "loss": 12.4995, + "step": 3834 + }, + { + "epoch": 0.20883107689584993, + "grad_norm": 0.6250276991757239, + "learning_rate": 0.00019710990716137423, + "loss": 12.5067, + "step": 3835 + }, + { + "epoch": 0.20888553089243295, + "grad_norm": 0.7157164003208205, + "learning_rate": 0.00019710780207056702, + "loss": 12.5821, + "step": 3836 + }, + { + "epoch": 0.20893998488901594, + "grad_norm": 0.781156813133304, + "learning_rate": 0.00019710569622463327, + "loss": 12.7614, + "step": 3837 + }, + { + "epoch": 0.20899443888559896, + "grad_norm": 0.8273223237189452, + "learning_rate": 0.00019710358962358933, + "loss": 12.6423, + "step": 3838 + }, + { + "epoch": 0.20904889288218198, + "grad_norm": 0.6768823229002183, + "learning_rate": 0.0001971014822674516, + "loss": 12.6379, + "step": 3839 + }, + { + "epoch": 0.20910334687876497, + "grad_norm": 0.7061533212592964, + "learning_rate": 0.00019709937415623646, + "loss": 12.7136, + "step": 3840 + }, + { + "epoch": 0.209157800875348, + "grad_norm": 0.686532646992562, + "learning_rate": 0.00019709726528996027, + "loss": 12.6175, + "step": 3841 + }, + { + "epoch": 0.209212254871931, + "grad_norm": 0.7922752222688225, + "learning_rate": 0.00019709515566863951, + "loss": 12.7968, + "step": 3842 + }, + { + "epoch": 0.209266708868514, + "grad_norm": 0.6801245682410086, + "learning_rate": 0.00019709304529229053, + "loss": 12.7147, + "step": 3843 + }, + { + "epoch": 0.20932116286509703, + "grad_norm": 0.6555963750355286, + "learning_rate": 0.0001970909341609297, + "loss": 12.5695, + "step": 3844 + }, + { + "epoch": 0.20937561686168005, + "grad_norm": 0.7913863987878447, + "learning_rate": 0.00019708882227457354, + "loss": 12.6226, + "step": 3845 + }, + { + "epoch": 0.20943007085826304, + "grad_norm": 0.7239585315891942, + "learning_rate": 0.00019708670963323842, + "loss": 12.6389, + "step": 3846 + }, + { + "epoch": 0.20948452485484606, + "grad_norm": 0.7520411193774028, + "learning_rate": 0.00019708459623694072, + "loss": 12.7562, + "step": 3847 + }, + { + "epoch": 0.20953897885142908, + "grad_norm": 0.6390701288621873, + "learning_rate": 0.00019708248208569695, + "loss": 12.536, + "step": 3848 + }, + { + "epoch": 0.2095934328480121, + "grad_norm": 0.7632552432870136, + "learning_rate": 0.0001970803671795235, + "loss": 12.7752, + "step": 3849 + }, + { + "epoch": 0.2096478868445951, + "grad_norm": 0.6980795126129343, + "learning_rate": 0.00019707825151843683, + "loss": 12.6178, + "step": 3850 + }, + { + "epoch": 0.20970234084117811, + "grad_norm": 0.6297619428404707, + "learning_rate": 0.0001970761351024534, + "loss": 12.6154, + "step": 3851 + }, + { + "epoch": 0.20975679483776113, + "grad_norm": 0.716503403711023, + "learning_rate": 0.0001970740179315897, + "loss": 12.6761, + "step": 3852 + }, + { + "epoch": 0.20981124883434413, + "grad_norm": 1.0613728402924163, + "learning_rate": 0.0001970719000058621, + "loss": 12.6331, + "step": 3853 + }, + { + "epoch": 0.20986570283092715, + "grad_norm": 0.8591171392678476, + "learning_rate": 0.00019706978132528718, + "loss": 12.5784, + "step": 3854 + }, + { + "epoch": 0.20992015682751017, + "grad_norm": 0.714119371116891, + "learning_rate": 0.00019706766188988133, + "loss": 12.6617, + "step": 3855 + }, + { + "epoch": 0.20997461082409316, + "grad_norm": 0.7094014273501245, + "learning_rate": 0.00019706554169966105, + "loss": 12.6061, + "step": 3856 + }, + { + "epoch": 0.21002906482067618, + "grad_norm": 0.9055915065824638, + "learning_rate": 0.00019706342075464286, + "loss": 12.688, + "step": 3857 + }, + { + "epoch": 0.2100835188172592, + "grad_norm": 0.7266838551953319, + "learning_rate": 0.00019706129905484323, + "loss": 12.6648, + "step": 3858 + }, + { + "epoch": 0.2101379728138422, + "grad_norm": 0.7897653004405011, + "learning_rate": 0.00019705917660027867, + "loss": 12.5621, + "step": 3859 + }, + { + "epoch": 0.21019242681042521, + "grad_norm": 0.6577066357431925, + "learning_rate": 0.00019705705339096566, + "loss": 12.6753, + "step": 3860 + }, + { + "epoch": 0.21024688080700824, + "grad_norm": 0.7631655878819147, + "learning_rate": 0.00019705492942692074, + "loss": 12.5231, + "step": 3861 + }, + { + "epoch": 0.21030133480359123, + "grad_norm": 0.6823330943710011, + "learning_rate": 0.00019705280470816043, + "loss": 12.6195, + "step": 3862 + }, + { + "epoch": 0.21035578880017425, + "grad_norm": 0.6802375107907133, + "learning_rate": 0.0001970506792347012, + "loss": 12.5155, + "step": 3863 + }, + { + "epoch": 0.21041024279675727, + "grad_norm": 0.7288057731179113, + "learning_rate": 0.00019704855300655964, + "loss": 12.7704, + "step": 3864 + }, + { + "epoch": 0.2104646967933403, + "grad_norm": 0.7067370312565961, + "learning_rate": 0.00019704642602375223, + "loss": 12.7041, + "step": 3865 + }, + { + "epoch": 0.21051915078992328, + "grad_norm": 0.7328334988150504, + "learning_rate": 0.00019704429828629554, + "loss": 12.793, + "step": 3866 + }, + { + "epoch": 0.2105736047865063, + "grad_norm": 0.743035582800159, + "learning_rate": 0.00019704216979420612, + "loss": 12.6079, + "step": 3867 + }, + { + "epoch": 0.21062805878308932, + "grad_norm": 0.6952611456974587, + "learning_rate": 0.00019704004054750055, + "loss": 12.6263, + "step": 3868 + }, + { + "epoch": 0.21068251277967232, + "grad_norm": 0.6967072401084503, + "learning_rate": 0.0001970379105461953, + "loss": 12.7435, + "step": 3869 + }, + { + "epoch": 0.21073696677625534, + "grad_norm": 0.6810544243648045, + "learning_rate": 0.00019703577979030698, + "loss": 12.6692, + "step": 3870 + }, + { + "epoch": 0.21079142077283836, + "grad_norm": 0.7173889510000903, + "learning_rate": 0.0001970336482798522, + "loss": 12.4846, + "step": 3871 + }, + { + "epoch": 0.21084587476942135, + "grad_norm": 0.6766072230821139, + "learning_rate": 0.0001970315160148475, + "loss": 12.6103, + "step": 3872 + }, + { + "epoch": 0.21090032876600437, + "grad_norm": 0.7340416998263765, + "learning_rate": 0.00019702938299530942, + "loss": 12.5992, + "step": 3873 + }, + { + "epoch": 0.2109547827625874, + "grad_norm": 0.7250266860347498, + "learning_rate": 0.00019702724922125462, + "loss": 12.596, + "step": 3874 + }, + { + "epoch": 0.21100923675917038, + "grad_norm": 0.7231440439405903, + "learning_rate": 0.00019702511469269965, + "loss": 12.6429, + "step": 3875 + }, + { + "epoch": 0.2110636907557534, + "grad_norm": 0.7575607296218159, + "learning_rate": 0.0001970229794096611, + "loss": 12.637, + "step": 3876 + }, + { + "epoch": 0.21111814475233642, + "grad_norm": 0.7560500121927132, + "learning_rate": 0.0001970208433721556, + "loss": 12.5314, + "step": 3877 + }, + { + "epoch": 0.21117259874891942, + "grad_norm": 0.7222307087053694, + "learning_rate": 0.00019701870658019976, + "loss": 12.6902, + "step": 3878 + }, + { + "epoch": 0.21122705274550244, + "grad_norm": 0.6647439342103886, + "learning_rate": 0.0001970165690338102, + "loss": 12.5535, + "step": 3879 + }, + { + "epoch": 0.21128150674208546, + "grad_norm": 0.6745547574619641, + "learning_rate": 0.00019701443073300349, + "loss": 12.3277, + "step": 3880 + }, + { + "epoch": 0.21133596073866848, + "grad_norm": 0.683311103597084, + "learning_rate": 0.00019701229167779633, + "loss": 12.6541, + "step": 3881 + }, + { + "epoch": 0.21139041473525147, + "grad_norm": 0.7031825756420972, + "learning_rate": 0.0001970101518682053, + "loss": 12.3421, + "step": 3882 + }, + { + "epoch": 0.2114448687318345, + "grad_norm": 0.6709281810865525, + "learning_rate": 0.0001970080113042471, + "loss": 12.6411, + "step": 3883 + }, + { + "epoch": 0.2114993227284175, + "grad_norm": 0.7013708067172494, + "learning_rate": 0.00019700586998593829, + "loss": 12.622, + "step": 3884 + }, + { + "epoch": 0.2115537767250005, + "grad_norm": 0.6970959113609223, + "learning_rate": 0.0001970037279132956, + "loss": 12.6072, + "step": 3885 + }, + { + "epoch": 0.21160823072158352, + "grad_norm": 0.7172589159645827, + "learning_rate": 0.00019700158508633564, + "loss": 12.6771, + "step": 3886 + }, + { + "epoch": 0.21166268471816654, + "grad_norm": 0.6968691606849642, + "learning_rate": 0.00019699944150507507, + "loss": 12.6998, + "step": 3887 + }, + { + "epoch": 0.21171713871474954, + "grad_norm": 0.7427929804273878, + "learning_rate": 0.0001969972971695306, + "loss": 12.6571, + "step": 3888 + }, + { + "epoch": 0.21177159271133256, + "grad_norm": 0.7081419720758875, + "learning_rate": 0.00019699515207971885, + "loss": 12.7094, + "step": 3889 + }, + { + "epoch": 0.21182604670791558, + "grad_norm": 0.826287521696027, + "learning_rate": 0.00019699300623565657, + "loss": 12.5314, + "step": 3890 + }, + { + "epoch": 0.21188050070449857, + "grad_norm": 0.7260506698265387, + "learning_rate": 0.00019699085963736042, + "loss": 12.6649, + "step": 3891 + }, + { + "epoch": 0.2119349547010816, + "grad_norm": 0.8452203006842348, + "learning_rate": 0.00019698871228484704, + "loss": 12.6858, + "step": 3892 + }, + { + "epoch": 0.2119894086976646, + "grad_norm": 0.7775963563602492, + "learning_rate": 0.00019698656417813318, + "loss": 12.5169, + "step": 3893 + }, + { + "epoch": 0.2120438626942476, + "grad_norm": 0.7891925501823804, + "learning_rate": 0.00019698441531723553, + "loss": 12.7274, + "step": 3894 + }, + { + "epoch": 0.21209831669083062, + "grad_norm": 0.6844535277171849, + "learning_rate": 0.0001969822657021708, + "loss": 12.494, + "step": 3895 + }, + { + "epoch": 0.21215277068741364, + "grad_norm": 0.7594849342870383, + "learning_rate": 0.0001969801153329557, + "loss": 12.6393, + "step": 3896 + }, + { + "epoch": 0.21220722468399666, + "grad_norm": 0.9077815788828331, + "learning_rate": 0.000196977964209607, + "loss": 12.6181, + "step": 3897 + }, + { + "epoch": 0.21226167868057966, + "grad_norm": 0.7621298337666311, + "learning_rate": 0.00019697581233214134, + "loss": 12.6645, + "step": 3898 + }, + { + "epoch": 0.21231613267716268, + "grad_norm": 0.8366099030579992, + "learning_rate": 0.00019697365970057553, + "loss": 12.6583, + "step": 3899 + }, + { + "epoch": 0.2123705866737457, + "grad_norm": 0.7217507276712453, + "learning_rate": 0.00019697150631492626, + "loss": 12.6469, + "step": 3900 + }, + { + "epoch": 0.2124250406703287, + "grad_norm": 0.8339395971536379, + "learning_rate": 0.00019696935217521032, + "loss": 12.7546, + "step": 3901 + }, + { + "epoch": 0.2124794946669117, + "grad_norm": 0.7181593477883866, + "learning_rate": 0.00019696719728144442, + "loss": 12.5382, + "step": 3902 + }, + { + "epoch": 0.21253394866349473, + "grad_norm": 0.7617599944487994, + "learning_rate": 0.0001969650416336453, + "loss": 12.4544, + "step": 3903 + }, + { + "epoch": 0.21258840266007772, + "grad_norm": 0.7392836979080841, + "learning_rate": 0.0001969628852318298, + "loss": 12.5215, + "step": 3904 + }, + { + "epoch": 0.21264285665666074, + "grad_norm": 0.7660976492049502, + "learning_rate": 0.00019696072807601464, + "loss": 12.5948, + "step": 3905 + }, + { + "epoch": 0.21269731065324377, + "grad_norm": 0.7933437707513886, + "learning_rate": 0.0001969585701662166, + "loss": 12.7446, + "step": 3906 + }, + { + "epoch": 0.21275176464982676, + "grad_norm": 0.7222756885902184, + "learning_rate": 0.00019695641150245242, + "loss": 12.5878, + "step": 3907 + }, + { + "epoch": 0.21280621864640978, + "grad_norm": 0.7396778956759119, + "learning_rate": 0.000196954252084739, + "loss": 12.6364, + "step": 3908 + }, + { + "epoch": 0.2128606726429928, + "grad_norm": 0.7406513542494016, + "learning_rate": 0.000196952091913093, + "loss": 12.4174, + "step": 3909 + }, + { + "epoch": 0.2129151266395758, + "grad_norm": 0.801324775162043, + "learning_rate": 0.00019694993098753126, + "loss": 12.6281, + "step": 3910 + }, + { + "epoch": 0.2129695806361588, + "grad_norm": 0.7922004514295725, + "learning_rate": 0.0001969477693080706, + "loss": 12.711, + "step": 3911 + }, + { + "epoch": 0.21302403463274183, + "grad_norm": 0.7192166126667284, + "learning_rate": 0.00019694560687472787, + "loss": 12.713, + "step": 3912 + }, + { + "epoch": 0.21307848862932482, + "grad_norm": 0.8280048325074828, + "learning_rate": 0.00019694344368751984, + "loss": 12.7116, + "step": 3913 + }, + { + "epoch": 0.21313294262590785, + "grad_norm": 0.7244318785527158, + "learning_rate": 0.00019694127974646334, + "loss": 12.6594, + "step": 3914 + }, + { + "epoch": 0.21318739662249087, + "grad_norm": 0.8168065098975937, + "learning_rate": 0.00019693911505157515, + "loss": 12.6092, + "step": 3915 + }, + { + "epoch": 0.21324185061907389, + "grad_norm": 0.6701580431033446, + "learning_rate": 0.00019693694960287218, + "loss": 12.5596, + "step": 3916 + }, + { + "epoch": 0.21329630461565688, + "grad_norm": 0.7177138044919541, + "learning_rate": 0.0001969347834003712, + "loss": 12.5675, + "step": 3917 + }, + { + "epoch": 0.2133507586122399, + "grad_norm": 0.820418801242099, + "learning_rate": 0.00019693261644408908, + "loss": 12.5892, + "step": 3918 + }, + { + "epoch": 0.21340521260882292, + "grad_norm": 0.8034108120593468, + "learning_rate": 0.00019693044873404274, + "loss": 12.7667, + "step": 3919 + }, + { + "epoch": 0.2134596666054059, + "grad_norm": 0.7710641017405725, + "learning_rate": 0.00019692828027024893, + "loss": 12.7673, + "step": 3920 + }, + { + "epoch": 0.21351412060198893, + "grad_norm": 0.6792728233039044, + "learning_rate": 0.00019692611105272457, + "loss": 12.6426, + "step": 3921 + }, + { + "epoch": 0.21356857459857195, + "grad_norm": 0.8051947786505732, + "learning_rate": 0.0001969239410814865, + "loss": 12.6935, + "step": 3922 + }, + { + "epoch": 0.21362302859515495, + "grad_norm": 0.8199349179966275, + "learning_rate": 0.00019692177035655163, + "loss": 12.5231, + "step": 3923 + }, + { + "epoch": 0.21367748259173797, + "grad_norm": 0.6997119387345971, + "learning_rate": 0.00019691959887793684, + "loss": 12.6468, + "step": 3924 + }, + { + "epoch": 0.213731936588321, + "grad_norm": 0.8372167910644972, + "learning_rate": 0.00019691742664565895, + "loss": 12.6043, + "step": 3925 + }, + { + "epoch": 0.21378639058490398, + "grad_norm": 0.6962495433547871, + "learning_rate": 0.0001969152536597349, + "loss": 12.684, + "step": 3926 + }, + { + "epoch": 0.213840844581487, + "grad_norm": 0.7266854442111711, + "learning_rate": 0.00019691307992018161, + "loss": 12.4952, + "step": 3927 + }, + { + "epoch": 0.21389529857807002, + "grad_norm": 0.7655950665136835, + "learning_rate": 0.00019691090542701595, + "loss": 12.4292, + "step": 3928 + }, + { + "epoch": 0.213949752574653, + "grad_norm": 0.7277789599591761, + "learning_rate": 0.00019690873018025483, + "loss": 12.6611, + "step": 3929 + }, + { + "epoch": 0.21400420657123603, + "grad_norm": 0.8271872279766458, + "learning_rate": 0.0001969065541799152, + "loss": 12.6395, + "step": 3930 + }, + { + "epoch": 0.21405866056781905, + "grad_norm": 0.661706779368455, + "learning_rate": 0.00019690437742601394, + "loss": 12.6383, + "step": 3931 + }, + { + "epoch": 0.21411311456440207, + "grad_norm": 0.7064738109764515, + "learning_rate": 0.00019690219991856797, + "loss": 12.6467, + "step": 3932 + }, + { + "epoch": 0.21416756856098507, + "grad_norm": 0.7982907102311182, + "learning_rate": 0.00019690002165759424, + "loss": 12.5564, + "step": 3933 + }, + { + "epoch": 0.2142220225575681, + "grad_norm": 0.7074857690670541, + "learning_rate": 0.00019689784264310972, + "loss": 12.5441, + "step": 3934 + }, + { + "epoch": 0.2142764765541511, + "grad_norm": 0.7952675669800652, + "learning_rate": 0.00019689566287513132, + "loss": 12.7703, + "step": 3935 + }, + { + "epoch": 0.2143309305507341, + "grad_norm": 0.7824586864295231, + "learning_rate": 0.00019689348235367598, + "loss": 12.7611, + "step": 3936 + }, + { + "epoch": 0.21438538454731712, + "grad_norm": 0.7943644437390582, + "learning_rate": 0.00019689130107876067, + "loss": 12.6341, + "step": 3937 + }, + { + "epoch": 0.21443983854390014, + "grad_norm": 0.7623897601744621, + "learning_rate": 0.00019688911905040238, + "loss": 12.6317, + "step": 3938 + }, + { + "epoch": 0.21449429254048313, + "grad_norm": 0.728279967862069, + "learning_rate": 0.00019688693626861804, + "loss": 12.5935, + "step": 3939 + }, + { + "epoch": 0.21454874653706615, + "grad_norm": 0.7511841158265858, + "learning_rate": 0.00019688475273342464, + "loss": 12.505, + "step": 3940 + }, + { + "epoch": 0.21460320053364917, + "grad_norm": 0.7418238611000724, + "learning_rate": 0.00019688256844483914, + "loss": 12.565, + "step": 3941 + }, + { + "epoch": 0.21465765453023217, + "grad_norm": 0.718602048206933, + "learning_rate": 0.00019688038340287856, + "loss": 12.7313, + "step": 3942 + }, + { + "epoch": 0.2147121085268152, + "grad_norm": 0.6239293268350437, + "learning_rate": 0.00019687819760755987, + "loss": 12.6087, + "step": 3943 + }, + { + "epoch": 0.2147665625233982, + "grad_norm": 0.7081825399431875, + "learning_rate": 0.00019687601105890004, + "loss": 12.5982, + "step": 3944 + }, + { + "epoch": 0.2148210165199812, + "grad_norm": 0.7000307810121966, + "learning_rate": 0.0001968738237569161, + "loss": 12.5143, + "step": 3945 + }, + { + "epoch": 0.21487547051656422, + "grad_norm": 0.7241333911326127, + "learning_rate": 0.0001968716357016251, + "loss": 12.6107, + "step": 3946 + }, + { + "epoch": 0.21492992451314724, + "grad_norm": 0.7073929167858216, + "learning_rate": 0.00019686944689304402, + "loss": 12.6784, + "step": 3947 + }, + { + "epoch": 0.21498437850973026, + "grad_norm": 0.6963246525209683, + "learning_rate": 0.00019686725733118982, + "loss": 12.6063, + "step": 3948 + }, + { + "epoch": 0.21503883250631325, + "grad_norm": 0.696928836554792, + "learning_rate": 0.00019686506701607965, + "loss": 12.5421, + "step": 3949 + }, + { + "epoch": 0.21509328650289627, + "grad_norm": 0.7947151519515174, + "learning_rate": 0.00019686287594773043, + "loss": 12.7203, + "step": 3950 + }, + { + "epoch": 0.2151477404994793, + "grad_norm": 0.6941464137226102, + "learning_rate": 0.00019686068412615927, + "loss": 12.651, + "step": 3951 + }, + { + "epoch": 0.2152021944960623, + "grad_norm": 0.7422280709843154, + "learning_rate": 0.00019685849155138315, + "loss": 12.6142, + "step": 3952 + }, + { + "epoch": 0.2152566484926453, + "grad_norm": 0.6568159068976495, + "learning_rate": 0.00019685629822341919, + "loss": 12.4753, + "step": 3953 + }, + { + "epoch": 0.21531110248922833, + "grad_norm": 0.7266034209404038, + "learning_rate": 0.0001968541041422844, + "loss": 12.7158, + "step": 3954 + }, + { + "epoch": 0.21536555648581132, + "grad_norm": 0.6945221783504558, + "learning_rate": 0.00019685190930799585, + "loss": 12.6453, + "step": 3955 + }, + { + "epoch": 0.21542001048239434, + "grad_norm": 0.7526163319281612, + "learning_rate": 0.00019684971372057063, + "loss": 12.6438, + "step": 3956 + }, + { + "epoch": 0.21547446447897736, + "grad_norm": 0.7752381893997414, + "learning_rate": 0.00019684751738002575, + "loss": 12.6173, + "step": 3957 + }, + { + "epoch": 0.21552891847556035, + "grad_norm": 0.6891814177801274, + "learning_rate": 0.00019684532028637836, + "loss": 12.5664, + "step": 3958 + }, + { + "epoch": 0.21558337247214338, + "grad_norm": 0.7040966702280157, + "learning_rate": 0.0001968431224396455, + "loss": 12.6211, + "step": 3959 + }, + { + "epoch": 0.2156378264687264, + "grad_norm": 0.6086700373400811, + "learning_rate": 0.00019684092383984433, + "loss": 12.5274, + "step": 3960 + }, + { + "epoch": 0.2156922804653094, + "grad_norm": 0.7380800816251231, + "learning_rate": 0.00019683872448699184, + "loss": 12.683, + "step": 3961 + }, + { + "epoch": 0.2157467344618924, + "grad_norm": 0.7400584232257849, + "learning_rate": 0.00019683652438110523, + "loss": 12.6166, + "step": 3962 + }, + { + "epoch": 0.21580118845847543, + "grad_norm": 0.756629350492766, + "learning_rate": 0.00019683432352220158, + "loss": 12.6753, + "step": 3963 + }, + { + "epoch": 0.21585564245505845, + "grad_norm": 0.6501914484379949, + "learning_rate": 0.00019683212191029794, + "loss": 12.4404, + "step": 3964 + }, + { + "epoch": 0.21591009645164144, + "grad_norm": 0.7284031309302922, + "learning_rate": 0.0001968299195454115, + "loss": 12.5702, + "step": 3965 + }, + { + "epoch": 0.21596455044822446, + "grad_norm": 0.7840331234544401, + "learning_rate": 0.00019682771642755937, + "loss": 12.8293, + "step": 3966 + }, + { + "epoch": 0.21601900444480748, + "grad_norm": 0.7101177614963934, + "learning_rate": 0.00019682551255675867, + "loss": 12.7148, + "step": 3967 + }, + { + "epoch": 0.21607345844139048, + "grad_norm": 0.7470213722881334, + "learning_rate": 0.00019682330793302657, + "loss": 12.7036, + "step": 3968 + }, + { + "epoch": 0.2161279124379735, + "grad_norm": 1.004377002918978, + "learning_rate": 0.00019682110255638018, + "loss": 12.6216, + "step": 3969 + }, + { + "epoch": 0.21618236643455652, + "grad_norm": 0.7040489342355073, + "learning_rate": 0.00019681889642683668, + "loss": 12.648, + "step": 3970 + }, + { + "epoch": 0.2162368204311395, + "grad_norm": 0.7142888356405561, + "learning_rate": 0.00019681668954441315, + "loss": 12.5634, + "step": 3971 + }, + { + "epoch": 0.21629127442772253, + "grad_norm": 0.7280247286864808, + "learning_rate": 0.00019681448190912682, + "loss": 12.5871, + "step": 3972 + }, + { + "epoch": 0.21634572842430555, + "grad_norm": 0.7804693598806084, + "learning_rate": 0.00019681227352099487, + "loss": 12.7291, + "step": 3973 + }, + { + "epoch": 0.21640018242088854, + "grad_norm": 0.7894803374028877, + "learning_rate": 0.00019681006438003443, + "loss": 12.6476, + "step": 3974 + }, + { + "epoch": 0.21645463641747156, + "grad_norm": 0.6976484929957576, + "learning_rate": 0.00019680785448626274, + "loss": 12.6018, + "step": 3975 + }, + { + "epoch": 0.21650909041405458, + "grad_norm": 0.6978787462272699, + "learning_rate": 0.00019680564383969687, + "loss": 12.651, + "step": 3976 + }, + { + "epoch": 0.21656354441063758, + "grad_norm": 0.7023747519995679, + "learning_rate": 0.00019680343244035412, + "loss": 12.7279, + "step": 3977 + }, + { + "epoch": 0.2166179984072206, + "grad_norm": 0.7581605914443555, + "learning_rate": 0.00019680122028825162, + "loss": 12.5454, + "step": 3978 + }, + { + "epoch": 0.21667245240380362, + "grad_norm": 0.8027276031510364, + "learning_rate": 0.00019679900738340663, + "loss": 12.6674, + "step": 3979 + }, + { + "epoch": 0.2167269064003866, + "grad_norm": 0.7443417644929946, + "learning_rate": 0.0001967967937258363, + "loss": 12.694, + "step": 3980 + }, + { + "epoch": 0.21678136039696963, + "grad_norm": 0.7614531973040706, + "learning_rate": 0.00019679457931555787, + "loss": 12.7878, + "step": 3981 + }, + { + "epoch": 0.21683581439355265, + "grad_norm": 0.7739638001554833, + "learning_rate": 0.00019679236415258856, + "loss": 12.672, + "step": 3982 + }, + { + "epoch": 0.21689026839013567, + "grad_norm": 0.7819656619703798, + "learning_rate": 0.0001967901482369456, + "loss": 12.5756, + "step": 3983 + }, + { + "epoch": 0.21694472238671866, + "grad_norm": 0.7461819005859127, + "learning_rate": 0.00019678793156864622, + "loss": 12.7048, + "step": 3984 + }, + { + "epoch": 0.21699917638330168, + "grad_norm": 0.7125263632988991, + "learning_rate": 0.00019678571414770763, + "loss": 12.5498, + "step": 3985 + }, + { + "epoch": 0.2170536303798847, + "grad_norm": 0.7633302060511443, + "learning_rate": 0.0001967834959741471, + "loss": 12.5501, + "step": 3986 + }, + { + "epoch": 0.2171080843764677, + "grad_norm": 0.6537109819979228, + "learning_rate": 0.0001967812770479819, + "loss": 12.5498, + "step": 3987 + }, + { + "epoch": 0.21716253837305072, + "grad_norm": 0.6956757640626371, + "learning_rate": 0.00019677905736922928, + "loss": 12.6805, + "step": 3988 + }, + { + "epoch": 0.21721699236963374, + "grad_norm": 0.9038073036106121, + "learning_rate": 0.00019677683693790642, + "loss": 12.5616, + "step": 3989 + }, + { + "epoch": 0.21727144636621673, + "grad_norm": 0.7845108311106693, + "learning_rate": 0.0001967746157540307, + "loss": 12.6692, + "step": 3990 + }, + { + "epoch": 0.21732590036279975, + "grad_norm": 0.865577954805613, + "learning_rate": 0.00019677239381761928, + "loss": 12.5767, + "step": 3991 + }, + { + "epoch": 0.21738035435938277, + "grad_norm": 0.6931754652824569, + "learning_rate": 0.00019677017112868956, + "loss": 12.5629, + "step": 3992 + }, + { + "epoch": 0.21743480835596576, + "grad_norm": 0.7771496190857541, + "learning_rate": 0.00019676794768725873, + "loss": 12.6854, + "step": 3993 + }, + { + "epoch": 0.21748926235254878, + "grad_norm": 0.7092020476329501, + "learning_rate": 0.0001967657234933441, + "loss": 12.6091, + "step": 3994 + }, + { + "epoch": 0.2175437163491318, + "grad_norm": 0.693179443968465, + "learning_rate": 0.000196763498546963, + "loss": 12.7026, + "step": 3995 + }, + { + "epoch": 0.2175981703457148, + "grad_norm": 0.731434066275842, + "learning_rate": 0.00019676127284813267, + "loss": 12.6795, + "step": 3996 + }, + { + "epoch": 0.21765262434229782, + "grad_norm": 0.7032078557389057, + "learning_rate": 0.0001967590463968705, + "loss": 12.6302, + "step": 3997 + }, + { + "epoch": 0.21770707833888084, + "grad_norm": 0.635970585227238, + "learning_rate": 0.00019675681919319372, + "loss": 12.6628, + "step": 3998 + }, + { + "epoch": 0.21776153233546386, + "grad_norm": 0.8211180272514582, + "learning_rate": 0.00019675459123711972, + "loss": 12.8123, + "step": 3999 + }, + { + "epoch": 0.21781598633204685, + "grad_norm": 0.7603117513168859, + "learning_rate": 0.00019675236252866577, + "loss": 12.578, + "step": 4000 + }, + { + "epoch": 0.21787044032862987, + "grad_norm": 0.7594642151567548, + "learning_rate": 0.00019675013306784923, + "loss": 12.5305, + "step": 4001 + }, + { + "epoch": 0.2179248943252129, + "grad_norm": 0.7699914028970279, + "learning_rate": 0.00019674790285468746, + "loss": 12.7227, + "step": 4002 + }, + { + "epoch": 0.21797934832179588, + "grad_norm": 0.7397614162642909, + "learning_rate": 0.00019674567188919775, + "loss": 12.678, + "step": 4003 + }, + { + "epoch": 0.2180338023183789, + "grad_norm": 0.7175728299201021, + "learning_rate": 0.00019674344017139744, + "loss": 12.6919, + "step": 4004 + }, + { + "epoch": 0.21808825631496193, + "grad_norm": 0.7697025967962892, + "learning_rate": 0.00019674120770130394, + "loss": 12.7325, + "step": 4005 + }, + { + "epoch": 0.21814271031154492, + "grad_norm": 0.7006179379055654, + "learning_rate": 0.00019673897447893462, + "loss": 12.5337, + "step": 4006 + }, + { + "epoch": 0.21819716430812794, + "grad_norm": 0.7105004628449897, + "learning_rate": 0.00019673674050430678, + "loss": 12.677, + "step": 4007 + }, + { + "epoch": 0.21825161830471096, + "grad_norm": 0.6571087553060003, + "learning_rate": 0.0001967345057774378, + "loss": 12.6363, + "step": 4008 + }, + { + "epoch": 0.21830607230129395, + "grad_norm": 0.7093641040329042, + "learning_rate": 0.00019673227029834512, + "loss": 12.5658, + "step": 4009 + }, + { + "epoch": 0.21836052629787697, + "grad_norm": 0.6380887380541235, + "learning_rate": 0.00019673003406704605, + "loss": 12.5858, + "step": 4010 + }, + { + "epoch": 0.21841498029446, + "grad_norm": 0.7916333527955144, + "learning_rate": 0.00019672779708355804, + "loss": 12.7207, + "step": 4011 + }, + { + "epoch": 0.21846943429104299, + "grad_norm": 0.6850591486656107, + "learning_rate": 0.00019672555934789845, + "loss": 12.6482, + "step": 4012 + }, + { + "epoch": 0.218523888287626, + "grad_norm": 0.7198343448962357, + "learning_rate": 0.0001967233208600847, + "loss": 12.6282, + "step": 4013 + }, + { + "epoch": 0.21857834228420903, + "grad_norm": 0.7496108215300044, + "learning_rate": 0.0001967210816201342, + "loss": 12.6532, + "step": 4014 + }, + { + "epoch": 0.21863279628079205, + "grad_norm": 0.7301845943741914, + "learning_rate": 0.0001967188416280643, + "loss": 12.5878, + "step": 4015 + }, + { + "epoch": 0.21868725027737504, + "grad_norm": 0.7480537564589492, + "learning_rate": 0.0001967166008838925, + "loss": 12.7344, + "step": 4016 + }, + { + "epoch": 0.21874170427395806, + "grad_norm": 0.7040530822679284, + "learning_rate": 0.00019671435938763615, + "loss": 12.6365, + "step": 4017 + }, + { + "epoch": 0.21879615827054108, + "grad_norm": 0.729618764939438, + "learning_rate": 0.0001967121171393128, + "loss": 12.6541, + "step": 4018 + }, + { + "epoch": 0.21885061226712407, + "grad_norm": 0.7578991783646115, + "learning_rate": 0.00019670987413893974, + "loss": 12.7008, + "step": 4019 + }, + { + "epoch": 0.2189050662637071, + "grad_norm": 0.7139122569181902, + "learning_rate": 0.00019670763038653452, + "loss": 12.6024, + "step": 4020 + }, + { + "epoch": 0.2189595202602901, + "grad_norm": 0.7190580028658345, + "learning_rate": 0.00019670538588211455, + "loss": 12.6473, + "step": 4021 + }, + { + "epoch": 0.2190139742568731, + "grad_norm": 0.7332441857723779, + "learning_rate": 0.00019670314062569726, + "loss": 12.6767, + "step": 4022 + }, + { + "epoch": 0.21906842825345613, + "grad_norm": 0.7084478878468786, + "learning_rate": 0.00019670089461730012, + "loss": 12.657, + "step": 4023 + }, + { + "epoch": 0.21912288225003915, + "grad_norm": 0.7499768445803511, + "learning_rate": 0.00019669864785694063, + "loss": 12.5707, + "step": 4024 + }, + { + "epoch": 0.21917733624662214, + "grad_norm": 0.8299453060159917, + "learning_rate": 0.00019669640034463624, + "loss": 12.8349, + "step": 4025 + }, + { + "epoch": 0.21923179024320516, + "grad_norm": 0.7774346746945877, + "learning_rate": 0.00019669415208040442, + "loss": 12.5669, + "step": 4026 + }, + { + "epoch": 0.21928624423978818, + "grad_norm": 0.7413977284793508, + "learning_rate": 0.00019669190306426264, + "loss": 12.407, + "step": 4027 + }, + { + "epoch": 0.21934069823637117, + "grad_norm": 0.7179477776342924, + "learning_rate": 0.00019668965329622845, + "loss": 12.6059, + "step": 4028 + }, + { + "epoch": 0.2193951522329542, + "grad_norm": 0.7544261333086814, + "learning_rate": 0.00019668740277631926, + "loss": 12.5511, + "step": 4029 + }, + { + "epoch": 0.21944960622953721, + "grad_norm": 0.6788519137175846, + "learning_rate": 0.0001966851515045526, + "loss": 12.6608, + "step": 4030 + }, + { + "epoch": 0.21950406022612023, + "grad_norm": 0.7230530691838619, + "learning_rate": 0.00019668289948094602, + "loss": 12.6335, + "step": 4031 + }, + { + "epoch": 0.21955851422270323, + "grad_norm": 0.7056961235171023, + "learning_rate": 0.000196680646705517, + "loss": 12.673, + "step": 4032 + }, + { + "epoch": 0.21961296821928625, + "grad_norm": 0.7378250013137789, + "learning_rate": 0.00019667839317828305, + "loss": 12.7235, + "step": 4033 + }, + { + "epoch": 0.21966742221586927, + "grad_norm": 0.7101142061368517, + "learning_rate": 0.00019667613889926168, + "loss": 12.6265, + "step": 4034 + }, + { + "epoch": 0.21972187621245226, + "grad_norm": 0.708434740030311, + "learning_rate": 0.00019667388386847048, + "loss": 12.6863, + "step": 4035 + }, + { + "epoch": 0.21977633020903528, + "grad_norm": 0.7302184396771872, + "learning_rate": 0.00019667162808592695, + "loss": 12.4923, + "step": 4036 + }, + { + "epoch": 0.2198307842056183, + "grad_norm": 0.9297099830656682, + "learning_rate": 0.0001966693715516486, + "loss": 12.5281, + "step": 4037 + }, + { + "epoch": 0.2198852382022013, + "grad_norm": 0.7556286735752703, + "learning_rate": 0.00019666711426565302, + "loss": 12.6344, + "step": 4038 + }, + { + "epoch": 0.21993969219878431, + "grad_norm": 0.7837233340630051, + "learning_rate": 0.00019666485622795774, + "loss": 12.5989, + "step": 4039 + }, + { + "epoch": 0.21999414619536733, + "grad_norm": 0.7453415181216037, + "learning_rate": 0.00019666259743858033, + "loss": 12.6815, + "step": 4040 + }, + { + "epoch": 0.22004860019195033, + "grad_norm": 0.800459321357, + "learning_rate": 0.00019666033789753832, + "loss": 12.6644, + "step": 4041 + }, + { + "epoch": 0.22010305418853335, + "grad_norm": 0.7277918752524827, + "learning_rate": 0.00019665807760484936, + "loss": 12.7014, + "step": 4042 + }, + { + "epoch": 0.22015750818511637, + "grad_norm": 0.7666164066093426, + "learning_rate": 0.00019665581656053095, + "loss": 12.5904, + "step": 4043 + }, + { + "epoch": 0.22021196218169936, + "grad_norm": 0.8457032281116588, + "learning_rate": 0.00019665355476460072, + "loss": 12.6473, + "step": 4044 + }, + { + "epoch": 0.22026641617828238, + "grad_norm": 0.8591039778091485, + "learning_rate": 0.00019665129221707625, + "loss": 12.6753, + "step": 4045 + }, + { + "epoch": 0.2203208701748654, + "grad_norm": 0.8961209104006046, + "learning_rate": 0.0001966490289179751, + "loss": 12.3711, + "step": 4046 + }, + { + "epoch": 0.2203753241714484, + "grad_norm": 0.703577480574146, + "learning_rate": 0.0001966467648673149, + "loss": 12.4597, + "step": 4047 + }, + { + "epoch": 0.22042977816803141, + "grad_norm": 0.7176771691091773, + "learning_rate": 0.00019664450006511323, + "loss": 12.4849, + "step": 4048 + }, + { + "epoch": 0.22048423216461444, + "grad_norm": 0.9445294769580541, + "learning_rate": 0.00019664223451138776, + "loss": 12.6456, + "step": 4049 + }, + { + "epoch": 0.22053868616119746, + "grad_norm": 0.79355294350249, + "learning_rate": 0.00019663996820615604, + "loss": 12.5788, + "step": 4050 + }, + { + "epoch": 0.22059314015778045, + "grad_norm": 0.7021683884325197, + "learning_rate": 0.0001966377011494357, + "loss": 12.628, + "step": 4051 + }, + { + "epoch": 0.22064759415436347, + "grad_norm": 0.8065361759591646, + "learning_rate": 0.00019663543334124444, + "loss": 12.6926, + "step": 4052 + }, + { + "epoch": 0.2207020481509465, + "grad_norm": 0.7289818176356541, + "learning_rate": 0.00019663316478159984, + "loss": 12.627, + "step": 4053 + }, + { + "epoch": 0.22075650214752948, + "grad_norm": 0.6840374913718438, + "learning_rate": 0.0001966308954705195, + "loss": 12.639, + "step": 4054 + }, + { + "epoch": 0.2208109561441125, + "grad_norm": 0.7007821582356654, + "learning_rate": 0.00019662862540802115, + "loss": 12.6939, + "step": 4055 + }, + { + "epoch": 0.22086541014069552, + "grad_norm": 0.820489336660405, + "learning_rate": 0.00019662635459412239, + "loss": 12.6103, + "step": 4056 + }, + { + "epoch": 0.22091986413727852, + "grad_norm": 0.6884190201784677, + "learning_rate": 0.0001966240830288409, + "loss": 12.6194, + "step": 4057 + }, + { + "epoch": 0.22097431813386154, + "grad_norm": 0.7121741594391826, + "learning_rate": 0.00019662181071219433, + "loss": 12.6443, + "step": 4058 + }, + { + "epoch": 0.22102877213044456, + "grad_norm": 0.8198708361122542, + "learning_rate": 0.00019661953764420036, + "loss": 12.7249, + "step": 4059 + }, + { + "epoch": 0.22108322612702755, + "grad_norm": 0.7579769411330018, + "learning_rate": 0.00019661726382487666, + "loss": 12.5446, + "step": 4060 + }, + { + "epoch": 0.22113768012361057, + "grad_norm": 0.7321724612096204, + "learning_rate": 0.0001966149892542409, + "loss": 12.6478, + "step": 4061 + }, + { + "epoch": 0.2211921341201936, + "grad_norm": 0.7611930700464427, + "learning_rate": 0.0001966127139323108, + "loss": 12.6653, + "step": 4062 + }, + { + "epoch": 0.22124658811677658, + "grad_norm": 0.7843978002683786, + "learning_rate": 0.00019661043785910404, + "loss": 12.6038, + "step": 4063 + }, + { + "epoch": 0.2213010421133596, + "grad_norm": 0.6158107207239923, + "learning_rate": 0.00019660816103463831, + "loss": 12.5457, + "step": 4064 + }, + { + "epoch": 0.22135549610994262, + "grad_norm": 0.7349160639773851, + "learning_rate": 0.00019660588345893132, + "loss": 12.7054, + "step": 4065 + }, + { + "epoch": 0.22140995010652564, + "grad_norm": 0.8064390119251176, + "learning_rate": 0.00019660360513200074, + "loss": 12.7049, + "step": 4066 + }, + { + "epoch": 0.22146440410310864, + "grad_norm": 0.8285649273192288, + "learning_rate": 0.00019660132605386438, + "loss": 12.7298, + "step": 4067 + }, + { + "epoch": 0.22151885809969166, + "grad_norm": 0.7237315038079152, + "learning_rate": 0.00019659904622453987, + "loss": 12.6079, + "step": 4068 + }, + { + "epoch": 0.22157331209627468, + "grad_norm": 0.6493306300297004, + "learning_rate": 0.00019659676564404502, + "loss": 12.6085, + "step": 4069 + }, + { + "epoch": 0.22162776609285767, + "grad_norm": 0.6758583142842389, + "learning_rate": 0.00019659448431239747, + "loss": 12.6432, + "step": 4070 + }, + { + "epoch": 0.2216822200894407, + "grad_norm": 0.7787709224078277, + "learning_rate": 0.00019659220222961504, + "loss": 12.7466, + "step": 4071 + }, + { + "epoch": 0.2217366740860237, + "grad_norm": 0.7290007115312267, + "learning_rate": 0.00019658991939571543, + "loss": 12.8019, + "step": 4072 + }, + { + "epoch": 0.2217911280826067, + "grad_norm": 0.8270168433996183, + "learning_rate": 0.0001965876358107164, + "loss": 12.5678, + "step": 4073 + }, + { + "epoch": 0.22184558207918972, + "grad_norm": 0.679111176559926, + "learning_rate": 0.00019658535147463576, + "loss": 12.5322, + "step": 4074 + }, + { + "epoch": 0.22190003607577274, + "grad_norm": 0.7672042597534164, + "learning_rate": 0.0001965830663874912, + "loss": 12.4992, + "step": 4075 + }, + { + "epoch": 0.22195449007235574, + "grad_norm": 0.7822125277954906, + "learning_rate": 0.00019658078054930053, + "loss": 12.6805, + "step": 4076 + }, + { + "epoch": 0.22200894406893876, + "grad_norm": 0.708356713729573, + "learning_rate": 0.0001965784939600815, + "loss": 12.7529, + "step": 4077 + }, + { + "epoch": 0.22206339806552178, + "grad_norm": 0.9155739250602781, + "learning_rate": 0.00019657620661985193, + "loss": 12.8248, + "step": 4078 + }, + { + "epoch": 0.22211785206210477, + "grad_norm": 0.7570284536971716, + "learning_rate": 0.00019657391852862956, + "loss": 12.7426, + "step": 4079 + }, + { + "epoch": 0.2221723060586878, + "grad_norm": 0.7462136050334757, + "learning_rate": 0.0001965716296864322, + "loss": 12.7032, + "step": 4080 + }, + { + "epoch": 0.2222267600552708, + "grad_norm": 0.7066401415807251, + "learning_rate": 0.00019656934009327769, + "loss": 12.587, + "step": 4081 + }, + { + "epoch": 0.22228121405185383, + "grad_norm": 0.7253805256660538, + "learning_rate": 0.00019656704974918375, + "loss": 12.511, + "step": 4082 + }, + { + "epoch": 0.22233566804843682, + "grad_norm": 0.7647726895218305, + "learning_rate": 0.00019656475865416825, + "loss": 12.6477, + "step": 4083 + }, + { + "epoch": 0.22239012204501984, + "grad_norm": 0.7215912435196542, + "learning_rate": 0.00019656246680824902, + "loss": 12.623, + "step": 4084 + }, + { + "epoch": 0.22244457604160287, + "grad_norm": 0.7155861894393772, + "learning_rate": 0.00019656017421144383, + "loss": 12.6334, + "step": 4085 + }, + { + "epoch": 0.22249903003818586, + "grad_norm": 0.7067711985235319, + "learning_rate": 0.00019655788086377052, + "loss": 12.6266, + "step": 4086 + }, + { + "epoch": 0.22255348403476888, + "grad_norm": 0.7096197888718686, + "learning_rate": 0.00019655558676524696, + "loss": 12.6845, + "step": 4087 + }, + { + "epoch": 0.2226079380313519, + "grad_norm": 0.7265919662042608, + "learning_rate": 0.00019655329191589096, + "loss": 12.5453, + "step": 4088 + }, + { + "epoch": 0.2226623920279349, + "grad_norm": 0.7522879143900745, + "learning_rate": 0.00019655099631572036, + "loss": 12.6943, + "step": 4089 + }, + { + "epoch": 0.2227168460245179, + "grad_norm": 0.6771672916143178, + "learning_rate": 0.00019654869996475302, + "loss": 12.5641, + "step": 4090 + }, + { + "epoch": 0.22277130002110093, + "grad_norm": 0.7851229816759847, + "learning_rate": 0.00019654640286300681, + "loss": 12.6618, + "step": 4091 + }, + { + "epoch": 0.22282575401768392, + "grad_norm": 0.72448806730371, + "learning_rate": 0.00019654410501049956, + "loss": 12.5751, + "step": 4092 + }, + { + "epoch": 0.22288020801426695, + "grad_norm": 0.7173123686937297, + "learning_rate": 0.0001965418064072492, + "loss": 12.6472, + "step": 4093 + }, + { + "epoch": 0.22293466201084997, + "grad_norm": 0.6722797988171498, + "learning_rate": 0.00019653950705327352, + "loss": 12.6206, + "step": 4094 + }, + { + "epoch": 0.22298911600743296, + "grad_norm": 0.7023256897730658, + "learning_rate": 0.00019653720694859045, + "loss": 12.5819, + "step": 4095 + }, + { + "epoch": 0.22304357000401598, + "grad_norm": 0.6768174394192674, + "learning_rate": 0.0001965349060932179, + "loss": 12.6696, + "step": 4096 + }, + { + "epoch": 0.223098024000599, + "grad_norm": 0.6975337948647515, + "learning_rate": 0.0001965326044871737, + "loss": 12.5153, + "step": 4097 + }, + { + "epoch": 0.22315247799718202, + "grad_norm": 0.6670804020560877, + "learning_rate": 0.00019653030213047582, + "loss": 12.5818, + "step": 4098 + }, + { + "epoch": 0.223206931993765, + "grad_norm": 0.7435266309532783, + "learning_rate": 0.00019652799902314207, + "loss": 12.6334, + "step": 4099 + }, + { + "epoch": 0.22326138599034803, + "grad_norm": 0.6434074828767778, + "learning_rate": 0.00019652569516519043, + "loss": 12.7553, + "step": 4100 + }, + { + "epoch": 0.22331583998693105, + "grad_norm": 0.7702468225912249, + "learning_rate": 0.00019652339055663883, + "loss": 12.6044, + "step": 4101 + }, + { + "epoch": 0.22337029398351405, + "grad_norm": 0.6827954463588795, + "learning_rate": 0.0001965210851975051, + "loss": 12.6484, + "step": 4102 + }, + { + "epoch": 0.22342474798009707, + "grad_norm": 0.6972485509404794, + "learning_rate": 0.00019651877908780728, + "loss": 12.6238, + "step": 4103 + }, + { + "epoch": 0.2234792019766801, + "grad_norm": 0.6520662489966935, + "learning_rate": 0.0001965164722275632, + "loss": 12.5524, + "step": 4104 + }, + { + "epoch": 0.22353365597326308, + "grad_norm": 0.7948014592544079, + "learning_rate": 0.00019651416461679086, + "loss": 12.7693, + "step": 4105 + }, + { + "epoch": 0.2235881099698461, + "grad_norm": 0.6865331043079922, + "learning_rate": 0.00019651185625550822, + "loss": 12.6462, + "step": 4106 + }, + { + "epoch": 0.22364256396642912, + "grad_norm": 0.678447503680815, + "learning_rate": 0.00019650954714373316, + "loss": 12.4801, + "step": 4107 + }, + { + "epoch": 0.2236970179630121, + "grad_norm": 0.6861766778644109, + "learning_rate": 0.0001965072372814837, + "loss": 12.7116, + "step": 4108 + }, + { + "epoch": 0.22375147195959513, + "grad_norm": 0.6552148787788998, + "learning_rate": 0.00019650492666877778, + "loss": 12.5291, + "step": 4109 + }, + { + "epoch": 0.22380592595617815, + "grad_norm": 0.7820906246883227, + "learning_rate": 0.00019650261530563336, + "loss": 12.773, + "step": 4110 + }, + { + "epoch": 0.22386037995276115, + "grad_norm": 0.7375117802220131, + "learning_rate": 0.00019650030319206844, + "loss": 12.698, + "step": 4111 + }, + { + "epoch": 0.22391483394934417, + "grad_norm": 0.6853124627196624, + "learning_rate": 0.00019649799032810095, + "loss": 12.6565, + "step": 4112 + }, + { + "epoch": 0.2239692879459272, + "grad_norm": 0.8280545884927758, + "learning_rate": 0.0001964956767137489, + "loss": 12.5425, + "step": 4113 + }, + { + "epoch": 0.22402374194251018, + "grad_norm": 0.7010860592903173, + "learning_rate": 0.00019649336234903031, + "loss": 12.6033, + "step": 4114 + }, + { + "epoch": 0.2240781959390932, + "grad_norm": 0.8662239711101437, + "learning_rate": 0.00019649104723396313, + "loss": 12.7076, + "step": 4115 + }, + { + "epoch": 0.22413264993567622, + "grad_norm": 0.7016765695316105, + "learning_rate": 0.0001964887313685654, + "loss": 12.6596, + "step": 4116 + }, + { + "epoch": 0.22418710393225924, + "grad_norm": 0.7540474476328786, + "learning_rate": 0.0001964864147528551, + "loss": 12.6279, + "step": 4117 + }, + { + "epoch": 0.22424155792884223, + "grad_norm": 0.8586664519926759, + "learning_rate": 0.0001964840973868503, + "loss": 12.677, + "step": 4118 + }, + { + "epoch": 0.22429601192542525, + "grad_norm": 0.7518180989190282, + "learning_rate": 0.00019648177927056892, + "loss": 12.652, + "step": 4119 + }, + { + "epoch": 0.22435046592200827, + "grad_norm": 0.8042141580978586, + "learning_rate": 0.00019647946040402908, + "loss": 12.6802, + "step": 4120 + }, + { + "epoch": 0.22440491991859127, + "grad_norm": 0.7799997298026489, + "learning_rate": 0.00019647714078724877, + "loss": 12.6783, + "step": 4121 + }, + { + "epoch": 0.2244593739151743, + "grad_norm": 0.7540211261418177, + "learning_rate": 0.00019647482042024607, + "loss": 12.5943, + "step": 4122 + }, + { + "epoch": 0.2245138279117573, + "grad_norm": 0.8132608128572707, + "learning_rate": 0.00019647249930303894, + "loss": 12.5836, + "step": 4123 + }, + { + "epoch": 0.2245682819083403, + "grad_norm": 0.6984708197327121, + "learning_rate": 0.0001964701774356455, + "loss": 12.5904, + "step": 4124 + }, + { + "epoch": 0.22462273590492332, + "grad_norm": 0.8561128483421145, + "learning_rate": 0.00019646785481808375, + "loss": 12.5673, + "step": 4125 + }, + { + "epoch": 0.22467718990150634, + "grad_norm": 0.7102278937221332, + "learning_rate": 0.0001964655314503718, + "loss": 12.6511, + "step": 4126 + }, + { + "epoch": 0.22473164389808933, + "grad_norm": 0.8040309226278436, + "learning_rate": 0.0001964632073325277, + "loss": 12.5945, + "step": 4127 + }, + { + "epoch": 0.22478609789467235, + "grad_norm": 0.7324994720574526, + "learning_rate": 0.00019646088246456952, + "loss": 12.5847, + "step": 4128 + }, + { + "epoch": 0.22484055189125537, + "grad_norm": 0.6946331463546693, + "learning_rate": 0.00019645855684651535, + "loss": 12.6176, + "step": 4129 + }, + { + "epoch": 0.22489500588783837, + "grad_norm": 0.8127685703864836, + "learning_rate": 0.00019645623047838325, + "loss": 12.6553, + "step": 4130 + }, + { + "epoch": 0.2249494598844214, + "grad_norm": 0.7716413856032072, + "learning_rate": 0.00019645390336019136, + "loss": 12.6135, + "step": 4131 + }, + { + "epoch": 0.2250039138810044, + "grad_norm": 0.7148837586860729, + "learning_rate": 0.0001964515754919577, + "loss": 12.608, + "step": 4132 + }, + { + "epoch": 0.22505836787758743, + "grad_norm": 0.7684066446503315, + "learning_rate": 0.00019644924687370045, + "loss": 12.5608, + "step": 4133 + }, + { + "epoch": 0.22511282187417042, + "grad_norm": 0.7642611411607005, + "learning_rate": 0.00019644691750543767, + "loss": 12.5928, + "step": 4134 + }, + { + "epoch": 0.22516727587075344, + "grad_norm": 0.8170139904918936, + "learning_rate": 0.0001964445873871875, + "loss": 12.588, + "step": 4135 + }, + { + "epoch": 0.22522172986733646, + "grad_norm": 0.754685287492516, + "learning_rate": 0.000196442256518968, + "loss": 12.65, + "step": 4136 + }, + { + "epoch": 0.22527618386391945, + "grad_norm": 0.7479712814785009, + "learning_rate": 0.00019643992490079736, + "loss": 12.509, + "step": 4137 + }, + { + "epoch": 0.22533063786050248, + "grad_norm": 0.7957960856658557, + "learning_rate": 0.00019643759253269372, + "loss": 12.5958, + "step": 4138 + }, + { + "epoch": 0.2253850918570855, + "grad_norm": 0.6433026845291914, + "learning_rate": 0.00019643525941467516, + "loss": 12.5226, + "step": 4139 + }, + { + "epoch": 0.2254395458536685, + "grad_norm": 0.7612427732206075, + "learning_rate": 0.00019643292554675986, + "loss": 12.536, + "step": 4140 + }, + { + "epoch": 0.2254939998502515, + "grad_norm": 0.7186653394613721, + "learning_rate": 0.00019643059092896596, + "loss": 12.5145, + "step": 4141 + }, + { + "epoch": 0.22554845384683453, + "grad_norm": 0.8423148084191446, + "learning_rate": 0.0001964282555613116, + "loss": 12.6958, + "step": 4142 + }, + { + "epoch": 0.22560290784341752, + "grad_norm": 0.7648405356031992, + "learning_rate": 0.00019642591944381497, + "loss": 12.5947, + "step": 4143 + }, + { + "epoch": 0.22565736184000054, + "grad_norm": 0.7652229852198938, + "learning_rate": 0.0001964235825764942, + "loss": 12.6697, + "step": 4144 + }, + { + "epoch": 0.22571181583658356, + "grad_norm": 0.7824181915244368, + "learning_rate": 0.00019642124495936752, + "loss": 12.581, + "step": 4145 + }, + { + "epoch": 0.22576626983316656, + "grad_norm": 0.766258138818385, + "learning_rate": 0.000196418906592453, + "loss": 12.6982, + "step": 4146 + }, + { + "epoch": 0.22582072382974958, + "grad_norm": 0.7751052091342521, + "learning_rate": 0.00019641656747576897, + "loss": 12.6076, + "step": 4147 + }, + { + "epoch": 0.2258751778263326, + "grad_norm": 0.9516446815823854, + "learning_rate": 0.00019641422760933351, + "loss": 12.8024, + "step": 4148 + }, + { + "epoch": 0.22592963182291562, + "grad_norm": 0.6470356345065555, + "learning_rate": 0.00019641188699316483, + "loss": 12.6156, + "step": 4149 + }, + { + "epoch": 0.2259840858194986, + "grad_norm": 0.7168144094402956, + "learning_rate": 0.00019640954562728119, + "loss": 12.5586, + "step": 4150 + }, + { + "epoch": 0.22603853981608163, + "grad_norm": 0.704367688027529, + "learning_rate": 0.0001964072035117007, + "loss": 12.7024, + "step": 4151 + }, + { + "epoch": 0.22609299381266465, + "grad_norm": 0.7572302834035705, + "learning_rate": 0.00019640486064644166, + "loss": 12.6288, + "step": 4152 + }, + { + "epoch": 0.22614744780924764, + "grad_norm": 0.8705790782861066, + "learning_rate": 0.00019640251703152229, + "loss": 12.5788, + "step": 4153 + }, + { + "epoch": 0.22620190180583066, + "grad_norm": 0.719569601862707, + "learning_rate": 0.00019640017266696073, + "loss": 12.6585, + "step": 4154 + }, + { + "epoch": 0.22625635580241368, + "grad_norm": 0.8192872285406801, + "learning_rate": 0.00019639782755277526, + "loss": 12.8433, + "step": 4155 + }, + { + "epoch": 0.22631080979899668, + "grad_norm": 0.7591508591799239, + "learning_rate": 0.00019639548168898414, + "loss": 12.7183, + "step": 4156 + }, + { + "epoch": 0.2263652637955797, + "grad_norm": 0.6542259611617403, + "learning_rate": 0.0001963931350756056, + "loss": 12.542, + "step": 4157 + }, + { + "epoch": 0.22641971779216272, + "grad_norm": 0.7187092777667181, + "learning_rate": 0.00019639078771265783, + "loss": 12.6347, + "step": 4158 + }, + { + "epoch": 0.2264741717887457, + "grad_norm": 0.7491587955339019, + "learning_rate": 0.00019638843960015918, + "loss": 12.6626, + "step": 4159 + }, + { + "epoch": 0.22652862578532873, + "grad_norm": 0.6890269683428409, + "learning_rate": 0.00019638609073812784, + "loss": 12.6092, + "step": 4160 + }, + { + "epoch": 0.22658307978191175, + "grad_norm": 0.7204526761660863, + "learning_rate": 0.0001963837411265821, + "loss": 12.5484, + "step": 4161 + }, + { + "epoch": 0.22663753377849474, + "grad_norm": 0.6620763697964263, + "learning_rate": 0.0001963813907655402, + "loss": 12.5442, + "step": 4162 + }, + { + "epoch": 0.22669198777507776, + "grad_norm": 0.6967383157647706, + "learning_rate": 0.00019637903965502048, + "loss": 12.6577, + "step": 4163 + }, + { + "epoch": 0.22674644177166078, + "grad_norm": 0.7205232920046198, + "learning_rate": 0.00019637668779504114, + "loss": 12.7114, + "step": 4164 + }, + { + "epoch": 0.2268008957682438, + "grad_norm": 0.8163517909426771, + "learning_rate": 0.00019637433518562055, + "loss": 12.673, + "step": 4165 + }, + { + "epoch": 0.2268553497648268, + "grad_norm": 0.7954287675018735, + "learning_rate": 0.00019637198182677695, + "loss": 12.5768, + "step": 4166 + }, + { + "epoch": 0.22690980376140982, + "grad_norm": 0.7460431744584577, + "learning_rate": 0.00019636962771852866, + "loss": 12.4162, + "step": 4167 + }, + { + "epoch": 0.22696425775799284, + "grad_norm": 0.7643922709161617, + "learning_rate": 0.00019636727286089398, + "loss": 12.5648, + "step": 4168 + }, + { + "epoch": 0.22701871175457583, + "grad_norm": 0.6811069165096117, + "learning_rate": 0.00019636491725389123, + "loss": 12.5059, + "step": 4169 + }, + { + "epoch": 0.22707316575115885, + "grad_norm": 0.6880108265779822, + "learning_rate": 0.00019636256089753876, + "loss": 12.5661, + "step": 4170 + }, + { + "epoch": 0.22712761974774187, + "grad_norm": 0.8314396438696734, + "learning_rate": 0.0001963602037918548, + "loss": 12.5678, + "step": 4171 + }, + { + "epoch": 0.22718207374432486, + "grad_norm": 0.8376904101132442, + "learning_rate": 0.00019635784593685776, + "loss": 12.8749, + "step": 4172 + }, + { + "epoch": 0.22723652774090788, + "grad_norm": 0.7170796749752048, + "learning_rate": 0.00019635548733256596, + "loss": 12.5654, + "step": 4173 + }, + { + "epoch": 0.2272909817374909, + "grad_norm": 0.7785199073373608, + "learning_rate": 0.00019635312797899773, + "loss": 12.6354, + "step": 4174 + }, + { + "epoch": 0.2273454357340739, + "grad_norm": 0.683307625656454, + "learning_rate": 0.0001963507678761714, + "loss": 12.516, + "step": 4175 + }, + { + "epoch": 0.22739988973065692, + "grad_norm": 0.7206391649286215, + "learning_rate": 0.00019634840702410534, + "loss": 12.5217, + "step": 4176 + }, + { + "epoch": 0.22745434372723994, + "grad_norm": 0.7691043392271019, + "learning_rate": 0.00019634604542281793, + "loss": 12.6352, + "step": 4177 + }, + { + "epoch": 0.22750879772382293, + "grad_norm": 0.7698192268790165, + "learning_rate": 0.0001963436830723275, + "loss": 12.6516, + "step": 4178 + }, + { + "epoch": 0.22756325172040595, + "grad_norm": 0.8346404720305081, + "learning_rate": 0.00019634131997265243, + "loss": 12.6, + "step": 4179 + }, + { + "epoch": 0.22761770571698897, + "grad_norm": 0.6818073699154533, + "learning_rate": 0.0001963389561238111, + "loss": 12.6393, + "step": 4180 + }, + { + "epoch": 0.22767215971357196, + "grad_norm": 0.8939568346328681, + "learning_rate": 0.00019633659152582192, + "loss": 12.6412, + "step": 4181 + }, + { + "epoch": 0.22772661371015498, + "grad_norm": 0.8633836344610171, + "learning_rate": 0.0001963342261787032, + "loss": 12.5934, + "step": 4182 + }, + { + "epoch": 0.227781067706738, + "grad_norm": 0.7662553649513152, + "learning_rate": 0.00019633186008247342, + "loss": 12.7304, + "step": 4183 + }, + { + "epoch": 0.22783552170332103, + "grad_norm": 0.8173319496752366, + "learning_rate": 0.00019632949323715093, + "loss": 12.7181, + "step": 4184 + }, + { + "epoch": 0.22788997569990402, + "grad_norm": 0.7528487750285134, + "learning_rate": 0.00019632712564275414, + "loss": 12.574, + "step": 4185 + }, + { + "epoch": 0.22794442969648704, + "grad_norm": 0.9025976985810679, + "learning_rate": 0.00019632475729930147, + "loss": 12.3497, + "step": 4186 + }, + { + "epoch": 0.22799888369307006, + "grad_norm": 0.7802567733274954, + "learning_rate": 0.00019632238820681134, + "loss": 12.7456, + "step": 4187 + }, + { + "epoch": 0.22805333768965305, + "grad_norm": 0.8131014048200902, + "learning_rate": 0.0001963200183653022, + "loss": 12.57, + "step": 4188 + }, + { + "epoch": 0.22810779168623607, + "grad_norm": 0.75760509813138, + "learning_rate": 0.00019631764777479238, + "loss": 12.723, + "step": 4189 + }, + { + "epoch": 0.2281622456828191, + "grad_norm": 0.7666646103675416, + "learning_rate": 0.0001963152764353004, + "loss": 12.524, + "step": 4190 + }, + { + "epoch": 0.22821669967940209, + "grad_norm": 0.7536721587180825, + "learning_rate": 0.00019631290434684467, + "loss": 12.4945, + "step": 4191 + }, + { + "epoch": 0.2282711536759851, + "grad_norm": 0.7901263068923782, + "learning_rate": 0.00019631053150944366, + "loss": 12.6857, + "step": 4192 + }, + { + "epoch": 0.22832560767256813, + "grad_norm": 0.7848747736826245, + "learning_rate": 0.00019630815792311582, + "loss": 12.6599, + "step": 4193 + }, + { + "epoch": 0.22838006166915112, + "grad_norm": 0.7355671871543848, + "learning_rate": 0.00019630578358787956, + "loss": 12.585, + "step": 4194 + }, + { + "epoch": 0.22843451566573414, + "grad_norm": 0.8300002825783853, + "learning_rate": 0.0001963034085037534, + "loss": 12.7075, + "step": 4195 + }, + { + "epoch": 0.22848896966231716, + "grad_norm": 0.7966639359103621, + "learning_rate": 0.00019630103267075577, + "loss": 12.7292, + "step": 4196 + }, + { + "epoch": 0.22854342365890015, + "grad_norm": 0.7727276636202759, + "learning_rate": 0.00019629865608890515, + "loss": 12.648, + "step": 4197 + }, + { + "epoch": 0.22859787765548317, + "grad_norm": 0.7582843931050108, + "learning_rate": 0.00019629627875822006, + "loss": 12.6394, + "step": 4198 + }, + { + "epoch": 0.2286523316520662, + "grad_norm": 0.7575204216635002, + "learning_rate": 0.00019629390067871894, + "loss": 12.563, + "step": 4199 + }, + { + "epoch": 0.2287067856486492, + "grad_norm": 0.7514351983682588, + "learning_rate": 0.00019629152185042032, + "loss": 12.5458, + "step": 4200 + }, + { + "epoch": 0.2287612396452322, + "grad_norm": 0.7343913978310641, + "learning_rate": 0.00019628914227334265, + "loss": 12.6761, + "step": 4201 + }, + { + "epoch": 0.22881569364181523, + "grad_norm": 0.7953266280694237, + "learning_rate": 0.0001962867619475045, + "loss": 12.4819, + "step": 4202 + }, + { + "epoch": 0.22887014763839825, + "grad_norm": 0.674170778375874, + "learning_rate": 0.0001962843808729243, + "loss": 12.6999, + "step": 4203 + }, + { + "epoch": 0.22892460163498124, + "grad_norm": 0.8598325420364537, + "learning_rate": 0.00019628199904962065, + "loss": 12.5789, + "step": 4204 + }, + { + "epoch": 0.22897905563156426, + "grad_norm": 0.7080454670134934, + "learning_rate": 0.00019627961647761198, + "loss": 12.7293, + "step": 4205 + }, + { + "epoch": 0.22903350962814728, + "grad_norm": 0.8012826117160416, + "learning_rate": 0.0001962772331569169, + "loss": 12.5654, + "step": 4206 + }, + { + "epoch": 0.22908796362473027, + "grad_norm": 0.8221366642183998, + "learning_rate": 0.0001962748490875539, + "loss": 12.7532, + "step": 4207 + }, + { + "epoch": 0.2291424176213133, + "grad_norm": 0.7487133557436079, + "learning_rate": 0.00019627246426954153, + "loss": 12.7589, + "step": 4208 + }, + { + "epoch": 0.22919687161789631, + "grad_norm": 0.8018833033180254, + "learning_rate": 0.00019627007870289833, + "loss": 12.5792, + "step": 4209 + }, + { + "epoch": 0.2292513256144793, + "grad_norm": 0.6758886976115429, + "learning_rate": 0.00019626769238764285, + "loss": 12.5047, + "step": 4210 + }, + { + "epoch": 0.22930577961106233, + "grad_norm": 0.7721195598513376, + "learning_rate": 0.00019626530532379366, + "loss": 12.6491, + "step": 4211 + }, + { + "epoch": 0.22936023360764535, + "grad_norm": 0.7287958100996574, + "learning_rate": 0.00019626291751136932, + "loss": 12.7247, + "step": 4212 + }, + { + "epoch": 0.22941468760422834, + "grad_norm": 0.70739885144157, + "learning_rate": 0.00019626052895038837, + "loss": 12.6829, + "step": 4213 + }, + { + "epoch": 0.22946914160081136, + "grad_norm": 0.9481536470865546, + "learning_rate": 0.00019625813964086941, + "loss": 12.6122, + "step": 4214 + }, + { + "epoch": 0.22952359559739438, + "grad_norm": 0.7477751782480188, + "learning_rate": 0.000196255749582831, + "loss": 12.5888, + "step": 4215 + }, + { + "epoch": 0.2295780495939774, + "grad_norm": 0.8458601177972982, + "learning_rate": 0.00019625335877629176, + "loss": 12.5862, + "step": 4216 + }, + { + "epoch": 0.2296325035905604, + "grad_norm": 0.7034832614989683, + "learning_rate": 0.00019625096722127025, + "loss": 12.6872, + "step": 4217 + }, + { + "epoch": 0.22968695758714341, + "grad_norm": 0.8110735484700187, + "learning_rate": 0.0001962485749177851, + "loss": 12.425, + "step": 4218 + }, + { + "epoch": 0.22974141158372643, + "grad_norm": 0.8934675458851745, + "learning_rate": 0.00019624618186585483, + "loss": 12.7089, + "step": 4219 + }, + { + "epoch": 0.22979586558030943, + "grad_norm": 0.9536438597146135, + "learning_rate": 0.00019624378806549816, + "loss": 12.8108, + "step": 4220 + }, + { + "epoch": 0.22985031957689245, + "grad_norm": 0.7748427675417733, + "learning_rate": 0.00019624139351673368, + "loss": 12.592, + "step": 4221 + }, + { + "epoch": 0.22990477357347547, + "grad_norm": 0.7547516755231537, + "learning_rate": 0.00019623899821957994, + "loss": 12.6094, + "step": 4222 + }, + { + "epoch": 0.22995922757005846, + "grad_norm": 0.7069906171041361, + "learning_rate": 0.0001962366021740556, + "loss": 12.4284, + "step": 4223 + }, + { + "epoch": 0.23001368156664148, + "grad_norm": 0.7339094492319244, + "learning_rate": 0.00019623420538017933, + "loss": 12.6512, + "step": 4224 + }, + { + "epoch": 0.2300681355632245, + "grad_norm": 0.7168079873618214, + "learning_rate": 0.00019623180783796972, + "loss": 12.7724, + "step": 4225 + }, + { + "epoch": 0.2301225895598075, + "grad_norm": 0.7628963015436998, + "learning_rate": 0.00019622940954744546, + "loss": 12.7698, + "step": 4226 + }, + { + "epoch": 0.23017704355639051, + "grad_norm": 0.800313160513469, + "learning_rate": 0.00019622701050862516, + "loss": 12.668, + "step": 4227 + }, + { + "epoch": 0.23023149755297354, + "grad_norm": 0.7402059656718677, + "learning_rate": 0.0001962246107215275, + "loss": 12.7014, + "step": 4228 + }, + { + "epoch": 0.23028595154955653, + "grad_norm": 0.768773436899772, + "learning_rate": 0.0001962222101861711, + "loss": 12.6177, + "step": 4229 + }, + { + "epoch": 0.23034040554613955, + "grad_norm": 0.7433993649806422, + "learning_rate": 0.00019621980890257467, + "loss": 12.6767, + "step": 4230 + }, + { + "epoch": 0.23039485954272257, + "grad_norm": 0.7848663476635999, + "learning_rate": 0.0001962174068707569, + "loss": 12.6785, + "step": 4231 + }, + { + "epoch": 0.2304493135393056, + "grad_norm": 0.9097231002227915, + "learning_rate": 0.00019621500409073642, + "loss": 12.7234, + "step": 4232 + }, + { + "epoch": 0.23050376753588858, + "grad_norm": 0.737656135746251, + "learning_rate": 0.0001962126005625319, + "loss": 12.5546, + "step": 4233 + }, + { + "epoch": 0.2305582215324716, + "grad_norm": 0.7434625052577787, + "learning_rate": 0.0001962101962861621, + "loss": 12.6206, + "step": 4234 + }, + { + "epoch": 0.23061267552905462, + "grad_norm": 0.7067678014290748, + "learning_rate": 0.00019620779126164567, + "loss": 12.5727, + "step": 4235 + }, + { + "epoch": 0.23066712952563762, + "grad_norm": 0.8034904350345814, + "learning_rate": 0.00019620538548900134, + "loss": 12.5893, + "step": 4236 + }, + { + "epoch": 0.23072158352222064, + "grad_norm": 0.7364403510918875, + "learning_rate": 0.00019620297896824778, + "loss": 12.6274, + "step": 4237 + }, + { + "epoch": 0.23077603751880366, + "grad_norm": 0.7482614937671299, + "learning_rate": 0.00019620057169940372, + "loss": 12.6263, + "step": 4238 + }, + { + "epoch": 0.23083049151538665, + "grad_norm": 0.7421278997437073, + "learning_rate": 0.0001961981636824879, + "loss": 12.5673, + "step": 4239 + }, + { + "epoch": 0.23088494551196967, + "grad_norm": 0.7276833601350934, + "learning_rate": 0.000196195754917519, + "loss": 12.5512, + "step": 4240 + }, + { + "epoch": 0.2309393995085527, + "grad_norm": 0.7876421122370915, + "learning_rate": 0.00019619334540451578, + "loss": 12.6634, + "step": 4241 + }, + { + "epoch": 0.23099385350513568, + "grad_norm": 0.6979158172927132, + "learning_rate": 0.00019619093514349698, + "loss": 12.5341, + "step": 4242 + }, + { + "epoch": 0.2310483075017187, + "grad_norm": 0.655866451015755, + "learning_rate": 0.00019618852413448134, + "loss": 12.5973, + "step": 4243 + }, + { + "epoch": 0.23110276149830172, + "grad_norm": 0.8482436189951839, + "learning_rate": 0.0001961861123774876, + "loss": 12.7595, + "step": 4244 + }, + { + "epoch": 0.23115721549488472, + "grad_norm": 0.6987739037734397, + "learning_rate": 0.00019618369987253452, + "loss": 12.5868, + "step": 4245 + }, + { + "epoch": 0.23121166949146774, + "grad_norm": 0.7143858283375534, + "learning_rate": 0.0001961812866196409, + "loss": 12.563, + "step": 4246 + }, + { + "epoch": 0.23126612348805076, + "grad_norm": 0.7598538747605517, + "learning_rate": 0.00019617887261882543, + "loss": 12.6624, + "step": 4247 + }, + { + "epoch": 0.23132057748463375, + "grad_norm": 0.6621791276826825, + "learning_rate": 0.0001961764578701069, + "loss": 12.4685, + "step": 4248 + }, + { + "epoch": 0.23137503148121677, + "grad_norm": 0.6874364514880499, + "learning_rate": 0.00019617404237350412, + "loss": 12.6243, + "step": 4249 + }, + { + "epoch": 0.2314294854777998, + "grad_norm": 0.6843292576653252, + "learning_rate": 0.00019617162612903588, + "loss": 12.6069, + "step": 4250 + }, + { + "epoch": 0.2314839394743828, + "grad_norm": 0.7386083117850814, + "learning_rate": 0.00019616920913672093, + "loss": 12.6869, + "step": 4251 + }, + { + "epoch": 0.2315383934709658, + "grad_norm": 0.7430864456399691, + "learning_rate": 0.00019616679139657808, + "loss": 12.7086, + "step": 4252 + }, + { + "epoch": 0.23159284746754882, + "grad_norm": 0.7003175792165027, + "learning_rate": 0.00019616437290862613, + "loss": 12.5829, + "step": 4253 + }, + { + "epoch": 0.23164730146413184, + "grad_norm": 0.789833782665007, + "learning_rate": 0.0001961619536728839, + "loss": 12.6162, + "step": 4254 + }, + { + "epoch": 0.23170175546071484, + "grad_norm": 0.7355859612170126, + "learning_rate": 0.00019615953368937018, + "loss": 12.456, + "step": 4255 + }, + { + "epoch": 0.23175620945729786, + "grad_norm": 0.7273326403014023, + "learning_rate": 0.0001961571129581038, + "loss": 12.639, + "step": 4256 + }, + { + "epoch": 0.23181066345388088, + "grad_norm": 0.7677796485027044, + "learning_rate": 0.00019615469147910358, + "loss": 12.7071, + "step": 4257 + }, + { + "epoch": 0.23186511745046387, + "grad_norm": 0.6403050854559836, + "learning_rate": 0.00019615226925238837, + "loss": 12.5584, + "step": 4258 + }, + { + "epoch": 0.2319195714470469, + "grad_norm": 0.7559823101393152, + "learning_rate": 0.00019614984627797699, + "loss": 12.5595, + "step": 4259 + }, + { + "epoch": 0.2319740254436299, + "grad_norm": 0.6971580283174519, + "learning_rate": 0.00019614742255588826, + "loss": 12.6703, + "step": 4260 + }, + { + "epoch": 0.2320284794402129, + "grad_norm": 0.7388504771360798, + "learning_rate": 0.00019614499808614106, + "loss": 12.6664, + "step": 4261 + }, + { + "epoch": 0.23208293343679592, + "grad_norm": 0.8428539947928397, + "learning_rate": 0.00019614257286875423, + "loss": 12.64, + "step": 4262 + }, + { + "epoch": 0.23213738743337894, + "grad_norm": 0.6449467533861518, + "learning_rate": 0.00019614014690374666, + "loss": 12.5579, + "step": 4263 + }, + { + "epoch": 0.23219184142996194, + "grad_norm": 0.7948607639082977, + "learning_rate": 0.00019613772019113715, + "loss": 12.603, + "step": 4264 + }, + { + "epoch": 0.23224629542654496, + "grad_norm": 0.7533943506759686, + "learning_rate": 0.0001961352927309446, + "loss": 12.6152, + "step": 4265 + }, + { + "epoch": 0.23230074942312798, + "grad_norm": 0.7791838528326387, + "learning_rate": 0.0001961328645231879, + "loss": 12.5939, + "step": 4266 + }, + { + "epoch": 0.232355203419711, + "grad_norm": 0.73616378505053, + "learning_rate": 0.00019613043556788594, + "loss": 12.5496, + "step": 4267 + }, + { + "epoch": 0.232409657416294, + "grad_norm": 0.7783342577870753, + "learning_rate": 0.0001961280058650576, + "loss": 12.74, + "step": 4268 + }, + { + "epoch": 0.232464111412877, + "grad_norm": 0.7484188737899983, + "learning_rate": 0.0001961255754147217, + "loss": 12.5975, + "step": 4269 + }, + { + "epoch": 0.23251856540946003, + "grad_norm": 0.6277519572272489, + "learning_rate": 0.00019612314421689727, + "loss": 12.4444, + "step": 4270 + }, + { + "epoch": 0.23257301940604302, + "grad_norm": 0.7895078058046108, + "learning_rate": 0.00019612071227160315, + "loss": 12.5838, + "step": 4271 + }, + { + "epoch": 0.23262747340262604, + "grad_norm": 0.7453697623277638, + "learning_rate": 0.00019611827957885823, + "loss": 12.4951, + "step": 4272 + }, + { + "epoch": 0.23268192739920907, + "grad_norm": 0.7374459052273574, + "learning_rate": 0.00019611584613868146, + "loss": 12.6958, + "step": 4273 + }, + { + "epoch": 0.23273638139579206, + "grad_norm": 0.7871237598849445, + "learning_rate": 0.00019611341195109174, + "loss": 12.6594, + "step": 4274 + }, + { + "epoch": 0.23279083539237508, + "grad_norm": 0.7746752949147973, + "learning_rate": 0.00019611097701610804, + "loss": 12.7189, + "step": 4275 + }, + { + "epoch": 0.2328452893889581, + "grad_norm": 0.8101023763463789, + "learning_rate": 0.00019610854133374922, + "loss": 12.6418, + "step": 4276 + }, + { + "epoch": 0.2328997433855411, + "grad_norm": 0.876845460569168, + "learning_rate": 0.00019610610490403428, + "loss": 12.5988, + "step": 4277 + }, + { + "epoch": 0.2329541973821241, + "grad_norm": 0.6829682379865623, + "learning_rate": 0.00019610366772698218, + "loss": 12.7112, + "step": 4278 + }, + { + "epoch": 0.23300865137870713, + "grad_norm": 0.8823059947422979, + "learning_rate": 0.00019610122980261182, + "loss": 12.7105, + "step": 4279 + }, + { + "epoch": 0.23306310537529012, + "grad_norm": 0.6757896880882112, + "learning_rate": 0.00019609879113094216, + "loss": 12.5127, + "step": 4280 + }, + { + "epoch": 0.23311755937187315, + "grad_norm": 0.7799192024165947, + "learning_rate": 0.0001960963517119922, + "loss": 12.6248, + "step": 4281 + }, + { + "epoch": 0.23317201336845617, + "grad_norm": 0.7738107472625442, + "learning_rate": 0.0001960939115457809, + "loss": 12.6383, + "step": 4282 + }, + { + "epoch": 0.2332264673650392, + "grad_norm": 0.6654857619867839, + "learning_rate": 0.00019609147063232723, + "loss": 12.5648, + "step": 4283 + }, + { + "epoch": 0.23328092136162218, + "grad_norm": 0.7831452104370247, + "learning_rate": 0.0001960890289716502, + "loss": 12.6137, + "step": 4284 + }, + { + "epoch": 0.2333353753582052, + "grad_norm": 0.7727339044966621, + "learning_rate": 0.0001960865865637687, + "loss": 12.7041, + "step": 4285 + }, + { + "epoch": 0.23338982935478822, + "grad_norm": 0.7715279175080322, + "learning_rate": 0.00019608414340870184, + "loss": 12.7531, + "step": 4286 + }, + { + "epoch": 0.2334442833513712, + "grad_norm": 0.8001630128369412, + "learning_rate": 0.00019608169950646859, + "loss": 12.5734, + "step": 4287 + }, + { + "epoch": 0.23349873734795423, + "grad_norm": 0.787395148810035, + "learning_rate": 0.00019607925485708787, + "loss": 12.6792, + "step": 4288 + }, + { + "epoch": 0.23355319134453725, + "grad_norm": 0.842592994942548, + "learning_rate": 0.00019607680946057875, + "loss": 12.666, + "step": 4289 + }, + { + "epoch": 0.23360764534112025, + "grad_norm": 0.7512388506883075, + "learning_rate": 0.0001960743633169603, + "loss": 12.59, + "step": 4290 + }, + { + "epoch": 0.23366209933770327, + "grad_norm": 0.72247188846382, + "learning_rate": 0.00019607191642625145, + "loss": 12.6564, + "step": 4291 + }, + { + "epoch": 0.2337165533342863, + "grad_norm": 0.8745336571336647, + "learning_rate": 0.0001960694687884713, + "loss": 12.5119, + "step": 4292 + }, + { + "epoch": 0.23377100733086928, + "grad_norm": 0.6912730098958464, + "learning_rate": 0.0001960670204036388, + "loss": 12.5903, + "step": 4293 + }, + { + "epoch": 0.2338254613274523, + "grad_norm": 0.8363161507249239, + "learning_rate": 0.00019606457127177308, + "loss": 12.6366, + "step": 4294 + }, + { + "epoch": 0.23387991532403532, + "grad_norm": 0.7493847572979708, + "learning_rate": 0.00019606212139289313, + "loss": 12.6625, + "step": 4295 + }, + { + "epoch": 0.2339343693206183, + "grad_norm": 0.7003006750115612, + "learning_rate": 0.00019605967076701802, + "loss": 12.5667, + "step": 4296 + }, + { + "epoch": 0.23398882331720133, + "grad_norm": 0.7875107147194136, + "learning_rate": 0.00019605721939416678, + "loss": 12.6736, + "step": 4297 + }, + { + "epoch": 0.23404327731378435, + "grad_norm": 0.7435004985456852, + "learning_rate": 0.00019605476727435855, + "loss": 12.6327, + "step": 4298 + }, + { + "epoch": 0.23409773131036737, + "grad_norm": 0.9089254508292351, + "learning_rate": 0.0001960523144076123, + "loss": 12.7987, + "step": 4299 + }, + { + "epoch": 0.23415218530695037, + "grad_norm": 0.7142398932570184, + "learning_rate": 0.00019604986079394711, + "loss": 12.6418, + "step": 4300 + }, + { + "epoch": 0.2342066393035334, + "grad_norm": 0.6312072194048043, + "learning_rate": 0.00019604740643338215, + "loss": 12.5074, + "step": 4301 + }, + { + "epoch": 0.2342610933001164, + "grad_norm": 0.6643375200710461, + "learning_rate": 0.00019604495132593644, + "loss": 12.3109, + "step": 4302 + }, + { + "epoch": 0.2343155472966994, + "grad_norm": 0.7941833749668644, + "learning_rate": 0.00019604249547162906, + "loss": 12.6772, + "step": 4303 + }, + { + "epoch": 0.23437000129328242, + "grad_norm": 0.709594513312813, + "learning_rate": 0.00019604003887047916, + "loss": 12.5156, + "step": 4304 + }, + { + "epoch": 0.23442445528986544, + "grad_norm": 0.7762370769912964, + "learning_rate": 0.00019603758152250577, + "loss": 12.8532, + "step": 4305 + }, + { + "epoch": 0.23447890928644843, + "grad_norm": 0.7910019480267998, + "learning_rate": 0.00019603512342772808, + "loss": 12.5901, + "step": 4306 + }, + { + "epoch": 0.23453336328303145, + "grad_norm": 0.6729287952350281, + "learning_rate": 0.00019603266458616514, + "loss": 12.5129, + "step": 4307 + }, + { + "epoch": 0.23458781727961447, + "grad_norm": 0.7864051008022672, + "learning_rate": 0.00019603020499783612, + "loss": 12.411, + "step": 4308 + }, + { + "epoch": 0.23464227127619747, + "grad_norm": 0.7335956838909282, + "learning_rate": 0.00019602774466276007, + "loss": 12.6596, + "step": 4309 + }, + { + "epoch": 0.2346967252727805, + "grad_norm": 0.7876811390143509, + "learning_rate": 0.00019602528358095625, + "loss": 12.6153, + "step": 4310 + }, + { + "epoch": 0.2347511792693635, + "grad_norm": 0.7767342521011587, + "learning_rate": 0.00019602282175244367, + "loss": 12.5518, + "step": 4311 + }, + { + "epoch": 0.2348056332659465, + "grad_norm": 0.7399594559128674, + "learning_rate": 0.00019602035917724153, + "loss": 12.6122, + "step": 4312 + }, + { + "epoch": 0.23486008726252952, + "grad_norm": 0.8289972152340461, + "learning_rate": 0.000196017895855369, + "loss": 12.5061, + "step": 4313 + }, + { + "epoch": 0.23491454125911254, + "grad_norm": 0.7408918207456543, + "learning_rate": 0.00019601543178684517, + "loss": 12.5571, + "step": 4314 + }, + { + "epoch": 0.23496899525569553, + "grad_norm": 0.8245995095661995, + "learning_rate": 0.00019601296697168926, + "loss": 12.6654, + "step": 4315 + }, + { + "epoch": 0.23502344925227855, + "grad_norm": 0.7435772943999259, + "learning_rate": 0.00019601050140992044, + "loss": 12.6058, + "step": 4316 + }, + { + "epoch": 0.23507790324886157, + "grad_norm": 0.7958728150332822, + "learning_rate": 0.00019600803510155782, + "loss": 12.7924, + "step": 4317 + }, + { + "epoch": 0.2351323572454446, + "grad_norm": 0.7486224387089223, + "learning_rate": 0.00019600556804662064, + "loss": 12.634, + "step": 4318 + }, + { + "epoch": 0.2351868112420276, + "grad_norm": 0.7969138995287975, + "learning_rate": 0.00019600310024512808, + "loss": 12.6919, + "step": 4319 + }, + { + "epoch": 0.2352412652386106, + "grad_norm": 0.70620545564187, + "learning_rate": 0.00019600063169709927, + "loss": 12.718, + "step": 4320 + }, + { + "epoch": 0.23529571923519363, + "grad_norm": 0.7393046120739886, + "learning_rate": 0.0001959981624025535, + "loss": 12.5344, + "step": 4321 + }, + { + "epoch": 0.23535017323177662, + "grad_norm": 0.7254327438539935, + "learning_rate": 0.00019599569236150986, + "loss": 12.5618, + "step": 4322 + }, + { + "epoch": 0.23540462722835964, + "grad_norm": 0.6887552576178985, + "learning_rate": 0.00019599322157398764, + "loss": 12.6536, + "step": 4323 + }, + { + "epoch": 0.23545908122494266, + "grad_norm": 0.7553241878725601, + "learning_rate": 0.00019599075004000607, + "loss": 12.5744, + "step": 4324 + }, + { + "epoch": 0.23551353522152566, + "grad_norm": 0.7499048113007982, + "learning_rate": 0.00019598827775958432, + "loss": 12.6286, + "step": 4325 + }, + { + "epoch": 0.23556798921810868, + "grad_norm": 0.8636137911339891, + "learning_rate": 0.0001959858047327416, + "loss": 12.7978, + "step": 4326 + }, + { + "epoch": 0.2356224432146917, + "grad_norm": 0.7486050817099174, + "learning_rate": 0.00019598333095949716, + "loss": 12.5916, + "step": 4327 + }, + { + "epoch": 0.2356768972112747, + "grad_norm": 0.719927559818087, + "learning_rate": 0.00019598085643987025, + "loss": 12.6053, + "step": 4328 + }, + { + "epoch": 0.2357313512078577, + "grad_norm": 0.7384823362730173, + "learning_rate": 0.0001959783811738801, + "loss": 12.8224, + "step": 4329 + }, + { + "epoch": 0.23578580520444073, + "grad_norm": 0.6720819302333678, + "learning_rate": 0.00019597590516154598, + "loss": 12.5344, + "step": 4330 + }, + { + "epoch": 0.23584025920102372, + "grad_norm": 0.7603169777358452, + "learning_rate": 0.00019597342840288711, + "loss": 12.6039, + "step": 4331 + }, + { + "epoch": 0.23589471319760674, + "grad_norm": 0.7062455194557229, + "learning_rate": 0.00019597095089792278, + "loss": 12.6485, + "step": 4332 + }, + { + "epoch": 0.23594916719418976, + "grad_norm": 0.735680768440551, + "learning_rate": 0.0001959684726466722, + "loss": 12.8305, + "step": 4333 + }, + { + "epoch": 0.23600362119077278, + "grad_norm": 0.6484772548160889, + "learning_rate": 0.00019596599364915472, + "loss": 12.4678, + "step": 4334 + }, + { + "epoch": 0.23605807518735578, + "grad_norm": 0.7327670055095351, + "learning_rate": 0.0001959635139053896, + "loss": 12.5556, + "step": 4335 + }, + { + "epoch": 0.2361125291839388, + "grad_norm": 0.7395264390154621, + "learning_rate": 0.00019596103341539608, + "loss": 12.8079, + "step": 4336 + }, + { + "epoch": 0.23616698318052182, + "grad_norm": 0.6869678528143364, + "learning_rate": 0.00019595855217919347, + "loss": 12.5308, + "step": 4337 + }, + { + "epoch": 0.2362214371771048, + "grad_norm": 0.7119042798010712, + "learning_rate": 0.00019595607019680107, + "loss": 12.8081, + "step": 4338 + }, + { + "epoch": 0.23627589117368783, + "grad_norm": 0.8493247308149601, + "learning_rate": 0.00019595358746823819, + "loss": 12.594, + "step": 4339 + }, + { + "epoch": 0.23633034517027085, + "grad_norm": 0.7133131279686575, + "learning_rate": 0.0001959511039935241, + "loss": 12.5086, + "step": 4340 + }, + { + "epoch": 0.23638479916685384, + "grad_norm": 0.905017738612647, + "learning_rate": 0.00019594861977267813, + "loss": 12.7374, + "step": 4341 + }, + { + "epoch": 0.23643925316343686, + "grad_norm": 0.7011975944578445, + "learning_rate": 0.0001959461348057196, + "loss": 12.6633, + "step": 4342 + }, + { + "epoch": 0.23649370716001988, + "grad_norm": 0.6679431058281032, + "learning_rate": 0.00019594364909266787, + "loss": 12.5931, + "step": 4343 + }, + { + "epoch": 0.23654816115660288, + "grad_norm": 0.7666953457547615, + "learning_rate": 0.0001959411626335422, + "loss": 12.6656, + "step": 4344 + }, + { + "epoch": 0.2366026151531859, + "grad_norm": 0.7048870608679273, + "learning_rate": 0.00019593867542836197, + "loss": 12.6733, + "step": 4345 + }, + { + "epoch": 0.23665706914976892, + "grad_norm": 0.6755394346574072, + "learning_rate": 0.0001959361874771465, + "loss": 12.5798, + "step": 4346 + }, + { + "epoch": 0.2367115231463519, + "grad_norm": 0.8117541978725121, + "learning_rate": 0.0001959336987799152, + "loss": 12.576, + "step": 4347 + }, + { + "epoch": 0.23676597714293493, + "grad_norm": 0.7168325850886029, + "learning_rate": 0.00019593120933668733, + "loss": 12.6734, + "step": 4348 + }, + { + "epoch": 0.23682043113951795, + "grad_norm": 0.7886452783719964, + "learning_rate": 0.00019592871914748229, + "loss": 12.8169, + "step": 4349 + }, + { + "epoch": 0.23687488513610097, + "grad_norm": 0.6665072365679392, + "learning_rate": 0.00019592622821231942, + "loss": 12.574, + "step": 4350 + }, + { + "epoch": 0.23692933913268396, + "grad_norm": 0.7309078222381065, + "learning_rate": 0.00019592373653121815, + "loss": 12.5544, + "step": 4351 + }, + { + "epoch": 0.23698379312926698, + "grad_norm": 0.7226790696502614, + "learning_rate": 0.00019592124410419782, + "loss": 12.6653, + "step": 4352 + }, + { + "epoch": 0.23703824712585, + "grad_norm": 0.7930188571842032, + "learning_rate": 0.00019591875093127778, + "loss": 12.5467, + "step": 4353 + }, + { + "epoch": 0.237092701122433, + "grad_norm": 0.7132902283812247, + "learning_rate": 0.00019591625701247743, + "loss": 12.5414, + "step": 4354 + }, + { + "epoch": 0.23714715511901602, + "grad_norm": 0.71476247205427, + "learning_rate": 0.00019591376234781623, + "loss": 12.664, + "step": 4355 + }, + { + "epoch": 0.23720160911559904, + "grad_norm": 0.7012311447820085, + "learning_rate": 0.0001959112669373135, + "loss": 12.5862, + "step": 4356 + }, + { + "epoch": 0.23725606311218203, + "grad_norm": 0.6501016857711592, + "learning_rate": 0.0001959087707809887, + "loss": 12.6829, + "step": 4357 + }, + { + "epoch": 0.23731051710876505, + "grad_norm": 0.6585230561181619, + "learning_rate": 0.0001959062738788612, + "loss": 12.6696, + "step": 4358 + }, + { + "epoch": 0.23736497110534807, + "grad_norm": 0.7190576518237434, + "learning_rate": 0.00019590377623095043, + "loss": 12.5348, + "step": 4359 + }, + { + "epoch": 0.23741942510193106, + "grad_norm": 0.7866991075164738, + "learning_rate": 0.0001959012778372758, + "loss": 12.6122, + "step": 4360 + }, + { + "epoch": 0.23747387909851408, + "grad_norm": 0.6349344258638765, + "learning_rate": 0.00019589877869785678, + "loss": 12.518, + "step": 4361 + }, + { + "epoch": 0.2375283330950971, + "grad_norm": 0.7519536264575788, + "learning_rate": 0.00019589627881271273, + "loss": 12.7277, + "step": 4362 + }, + { + "epoch": 0.2375827870916801, + "grad_norm": 0.6524831429079407, + "learning_rate": 0.00019589377818186318, + "loss": 12.549, + "step": 4363 + }, + { + "epoch": 0.23763724108826312, + "grad_norm": 0.6799023071625376, + "learning_rate": 0.0001958912768053275, + "loss": 12.5668, + "step": 4364 + }, + { + "epoch": 0.23769169508484614, + "grad_norm": 0.6664392562505124, + "learning_rate": 0.00019588877468312518, + "loss": 12.7095, + "step": 4365 + }, + { + "epoch": 0.23774614908142916, + "grad_norm": 0.6533629567695574, + "learning_rate": 0.00019588627181527568, + "loss": 12.5943, + "step": 4366 + }, + { + "epoch": 0.23780060307801215, + "grad_norm": 0.6506573048539561, + "learning_rate": 0.00019588376820179845, + "loss": 12.4782, + "step": 4367 + }, + { + "epoch": 0.23785505707459517, + "grad_norm": 0.7732206437895202, + "learning_rate": 0.00019588126384271294, + "loss": 12.7024, + "step": 4368 + }, + { + "epoch": 0.2379095110711782, + "grad_norm": 0.7102121516925589, + "learning_rate": 0.00019587875873803865, + "loss": 12.5706, + "step": 4369 + }, + { + "epoch": 0.23796396506776119, + "grad_norm": 0.7655504404935409, + "learning_rate": 0.00019587625288779506, + "loss": 12.6266, + "step": 4370 + }, + { + "epoch": 0.2380184190643442, + "grad_norm": 0.7034358027577429, + "learning_rate": 0.00019587374629200164, + "loss": 12.665, + "step": 4371 + }, + { + "epoch": 0.23807287306092723, + "grad_norm": 0.7218635469509956, + "learning_rate": 0.0001958712389506779, + "loss": 12.4119, + "step": 4372 + }, + { + "epoch": 0.23812732705751022, + "grad_norm": 0.7219544872173718, + "learning_rate": 0.00019586873086384333, + "loss": 12.6191, + "step": 4373 + }, + { + "epoch": 0.23818178105409324, + "grad_norm": 0.6753360708985948, + "learning_rate": 0.0001958662220315174, + "loss": 12.6027, + "step": 4374 + }, + { + "epoch": 0.23823623505067626, + "grad_norm": 0.7042105075748244, + "learning_rate": 0.0001958637124537197, + "loss": 12.678, + "step": 4375 + }, + { + "epoch": 0.23829068904725925, + "grad_norm": 0.6937149586216751, + "learning_rate": 0.00019586120213046964, + "loss": 12.6869, + "step": 4376 + }, + { + "epoch": 0.23834514304384227, + "grad_norm": 0.7638703568099982, + "learning_rate": 0.00019585869106178685, + "loss": 12.8213, + "step": 4377 + }, + { + "epoch": 0.2383995970404253, + "grad_norm": 0.711513978494034, + "learning_rate": 0.00019585617924769078, + "loss": 12.6086, + "step": 4378 + }, + { + "epoch": 0.23845405103700829, + "grad_norm": 0.6760817286826006, + "learning_rate": 0.00019585366668820097, + "loss": 12.7248, + "step": 4379 + }, + { + "epoch": 0.2385085050335913, + "grad_norm": 0.8389641014052955, + "learning_rate": 0.000195851153383337, + "loss": 12.6197, + "step": 4380 + }, + { + "epoch": 0.23856295903017433, + "grad_norm": 0.7277777233565489, + "learning_rate": 0.00019584863933311836, + "loss": 12.514, + "step": 4381 + }, + { + "epoch": 0.23861741302675732, + "grad_norm": 0.7616073170124373, + "learning_rate": 0.00019584612453756465, + "loss": 12.6506, + "step": 4382 + }, + { + "epoch": 0.23867186702334034, + "grad_norm": 0.6709118088136903, + "learning_rate": 0.00019584360899669537, + "loss": 12.7244, + "step": 4383 + }, + { + "epoch": 0.23872632101992336, + "grad_norm": 0.746987612988271, + "learning_rate": 0.00019584109271053018, + "loss": 12.5503, + "step": 4384 + }, + { + "epoch": 0.23878077501650638, + "grad_norm": 0.7572221948189825, + "learning_rate": 0.0001958385756790885, + "loss": 12.5647, + "step": 4385 + }, + { + "epoch": 0.23883522901308937, + "grad_norm": 0.6949784375654698, + "learning_rate": 0.00019583605790239004, + "loss": 12.561, + "step": 4386 + }, + { + "epoch": 0.2388896830096724, + "grad_norm": 0.741463522239718, + "learning_rate": 0.00019583353938045433, + "loss": 12.6955, + "step": 4387 + }, + { + "epoch": 0.2389441370062554, + "grad_norm": 0.6497906124337596, + "learning_rate": 0.0001958310201133009, + "loss": 12.5737, + "step": 4388 + }, + { + "epoch": 0.2389985910028384, + "grad_norm": 0.7040373728229842, + "learning_rate": 0.0001958285001009494, + "loss": 12.6129, + "step": 4389 + }, + { + "epoch": 0.23905304499942143, + "grad_norm": 0.7392310666609192, + "learning_rate": 0.00019582597934341943, + "loss": 12.6106, + "step": 4390 + }, + { + "epoch": 0.23910749899600445, + "grad_norm": 0.7254938521643921, + "learning_rate": 0.00019582345784073058, + "loss": 12.5225, + "step": 4391 + }, + { + "epoch": 0.23916195299258744, + "grad_norm": 0.694043218706121, + "learning_rate": 0.00019582093559290242, + "loss": 12.5082, + "step": 4392 + }, + { + "epoch": 0.23921640698917046, + "grad_norm": 0.6702855650816293, + "learning_rate": 0.0001958184125999546, + "loss": 12.6206, + "step": 4393 + }, + { + "epoch": 0.23927086098575348, + "grad_norm": 0.693921757939034, + "learning_rate": 0.00019581588886190675, + "loss": 12.4757, + "step": 4394 + }, + { + "epoch": 0.23932531498233647, + "grad_norm": 0.6787515322047294, + "learning_rate": 0.00019581336437877848, + "loss": 12.3603, + "step": 4395 + }, + { + "epoch": 0.2393797689789195, + "grad_norm": 0.7769332202563006, + "learning_rate": 0.0001958108391505894, + "loss": 12.4286, + "step": 4396 + }, + { + "epoch": 0.23943422297550251, + "grad_norm": 0.7618523412111193, + "learning_rate": 0.0001958083131773592, + "loss": 12.5702, + "step": 4397 + }, + { + "epoch": 0.2394886769720855, + "grad_norm": 0.6980552748721157, + "learning_rate": 0.0001958057864591075, + "loss": 12.546, + "step": 4398 + }, + { + "epoch": 0.23954313096866853, + "grad_norm": 0.9703323367362652, + "learning_rate": 0.00019580325899585388, + "loss": 12.6259, + "step": 4399 + }, + { + "epoch": 0.23959758496525155, + "grad_norm": 0.6874644663477578, + "learning_rate": 0.0001958007307876181, + "loss": 12.6013, + "step": 4400 + }, + { + "epoch": 0.23965203896183457, + "grad_norm": 0.7883185675467992, + "learning_rate": 0.00019579820183441974, + "loss": 12.6963, + "step": 4401 + }, + { + "epoch": 0.23970649295841756, + "grad_norm": 0.7134853056312774, + "learning_rate": 0.0001957956721362785, + "loss": 12.5664, + "step": 4402 + }, + { + "epoch": 0.23976094695500058, + "grad_norm": 0.6907933935357229, + "learning_rate": 0.0001957931416932141, + "loss": 12.6162, + "step": 4403 + }, + { + "epoch": 0.2398154009515836, + "grad_norm": 0.6759806570517407, + "learning_rate": 0.0001957906105052461, + "loss": 12.4443, + "step": 4404 + }, + { + "epoch": 0.2398698549481666, + "grad_norm": 0.6749819770696659, + "learning_rate": 0.00019578807857239427, + "loss": 12.4787, + "step": 4405 + }, + { + "epoch": 0.23992430894474961, + "grad_norm": 0.6790796285000683, + "learning_rate": 0.0001957855458946783, + "loss": 12.5896, + "step": 4406 + }, + { + "epoch": 0.23997876294133264, + "grad_norm": 0.8447902766005312, + "learning_rate": 0.00019578301247211784, + "loss": 12.7252, + "step": 4407 + }, + { + "epoch": 0.24003321693791563, + "grad_norm": 0.6929824546055435, + "learning_rate": 0.00019578047830473263, + "loss": 12.6199, + "step": 4408 + }, + { + "epoch": 0.24008767093449865, + "grad_norm": 0.684029054330614, + "learning_rate": 0.00019577794339254234, + "loss": 12.578, + "step": 4409 + }, + { + "epoch": 0.24014212493108167, + "grad_norm": 0.7338430360544661, + "learning_rate": 0.00019577540773556672, + "loss": 12.6144, + "step": 4410 + }, + { + "epoch": 0.24019657892766466, + "grad_norm": 0.6985904077182428, + "learning_rate": 0.0001957728713338254, + "loss": 12.6199, + "step": 4411 + }, + { + "epoch": 0.24025103292424768, + "grad_norm": 0.6805492339585597, + "learning_rate": 0.00019577033418733826, + "loss": 12.6823, + "step": 4412 + }, + { + "epoch": 0.2403054869208307, + "grad_norm": 0.7185307030195718, + "learning_rate": 0.0001957677962961249, + "loss": 12.6543, + "step": 4413 + }, + { + "epoch": 0.2403599409174137, + "grad_norm": 0.6971857831409217, + "learning_rate": 0.0001957652576602051, + "loss": 12.5886, + "step": 4414 + }, + { + "epoch": 0.24041439491399672, + "grad_norm": 0.6637590622574503, + "learning_rate": 0.0001957627182795986, + "loss": 12.6217, + "step": 4415 + }, + { + "epoch": 0.24046884891057974, + "grad_norm": 0.7310212965379054, + "learning_rate": 0.00019576017815432515, + "loss": 12.6621, + "step": 4416 + }, + { + "epoch": 0.24052330290716276, + "grad_norm": 0.7393074452512833, + "learning_rate": 0.0001957576372844045, + "loss": 12.5586, + "step": 4417 + }, + { + "epoch": 0.24057775690374575, + "grad_norm": 0.7296197794415451, + "learning_rate": 0.00019575509566985638, + "loss": 12.569, + "step": 4418 + }, + { + "epoch": 0.24063221090032877, + "grad_norm": 0.7011152828829472, + "learning_rate": 0.00019575255331070058, + "loss": 12.5821, + "step": 4419 + }, + { + "epoch": 0.2406866648969118, + "grad_norm": 0.8052641623839756, + "learning_rate": 0.0001957500102069569, + "loss": 12.4746, + "step": 4420 + }, + { + "epoch": 0.24074111889349478, + "grad_norm": 0.6879345374295561, + "learning_rate": 0.00019574746635864506, + "loss": 12.5922, + "step": 4421 + }, + { + "epoch": 0.2407955728900778, + "grad_norm": 1.0817216594551267, + "learning_rate": 0.00019574492176578485, + "loss": 12.6664, + "step": 4422 + }, + { + "epoch": 0.24085002688666082, + "grad_norm": 0.678943951419676, + "learning_rate": 0.00019574237642839607, + "loss": 12.6064, + "step": 4423 + }, + { + "epoch": 0.24090448088324382, + "grad_norm": 0.7892394980152566, + "learning_rate": 0.00019573983034649854, + "loss": 12.6721, + "step": 4424 + }, + { + "epoch": 0.24095893487982684, + "grad_norm": 0.6887403346294946, + "learning_rate": 0.00019573728352011204, + "loss": 12.5072, + "step": 4425 + }, + { + "epoch": 0.24101338887640986, + "grad_norm": 0.7236036893698674, + "learning_rate": 0.00019573473594925632, + "loss": 12.7498, + "step": 4426 + }, + { + "epoch": 0.24106784287299285, + "grad_norm": 0.7725445214988449, + "learning_rate": 0.0001957321876339513, + "loss": 12.4948, + "step": 4427 + }, + { + "epoch": 0.24112229686957587, + "grad_norm": 0.7523623474627931, + "learning_rate": 0.0001957296385742167, + "loss": 12.6427, + "step": 4428 + }, + { + "epoch": 0.2411767508661589, + "grad_norm": 0.7879142777310196, + "learning_rate": 0.0001957270887700724, + "loss": 12.6412, + "step": 4429 + }, + { + "epoch": 0.24123120486274188, + "grad_norm": 0.6593665147814708, + "learning_rate": 0.00019572453822153818, + "loss": 12.6723, + "step": 4430 + }, + { + "epoch": 0.2412856588593249, + "grad_norm": 0.7554934253466723, + "learning_rate": 0.0001957219869286339, + "loss": 12.7087, + "step": 4431 + }, + { + "epoch": 0.24134011285590792, + "grad_norm": 0.6994668680293442, + "learning_rate": 0.0001957194348913794, + "loss": 12.3919, + "step": 4432 + }, + { + "epoch": 0.24139456685249094, + "grad_norm": 0.6874407428394054, + "learning_rate": 0.00019571688210979452, + "loss": 12.5445, + "step": 4433 + }, + { + "epoch": 0.24144902084907394, + "grad_norm": 0.667137984555039, + "learning_rate": 0.00019571432858389912, + "loss": 12.4685, + "step": 4434 + }, + { + "epoch": 0.24150347484565696, + "grad_norm": 0.7996917414516334, + "learning_rate": 0.00019571177431371303, + "loss": 12.6403, + "step": 4435 + }, + { + "epoch": 0.24155792884223998, + "grad_norm": 0.6913213809619198, + "learning_rate": 0.00019570921929925616, + "loss": 12.7323, + "step": 4436 + }, + { + "epoch": 0.24161238283882297, + "grad_norm": 0.7221294539274075, + "learning_rate": 0.00019570666354054835, + "loss": 12.5116, + "step": 4437 + }, + { + "epoch": 0.241666836835406, + "grad_norm": 0.6955443045709032, + "learning_rate": 0.00019570410703760946, + "loss": 12.7694, + "step": 4438 + }, + { + "epoch": 0.241721290831989, + "grad_norm": 0.7939884123941371, + "learning_rate": 0.0001957015497904594, + "loss": 12.6727, + "step": 4439 + }, + { + "epoch": 0.241775744828572, + "grad_norm": 0.6435736362998078, + "learning_rate": 0.00019569899179911802, + "loss": 12.6093, + "step": 4440 + }, + { + "epoch": 0.24183019882515502, + "grad_norm": 0.6785583000701847, + "learning_rate": 0.00019569643306360524, + "loss": 12.6087, + "step": 4441 + }, + { + "epoch": 0.24188465282173804, + "grad_norm": 0.8739610450351043, + "learning_rate": 0.00019569387358394096, + "loss": 12.5868, + "step": 4442 + }, + { + "epoch": 0.24193910681832104, + "grad_norm": 0.6785969720317832, + "learning_rate": 0.00019569131336014502, + "loss": 12.6191, + "step": 4443 + }, + { + "epoch": 0.24199356081490406, + "grad_norm": 0.6706174843742324, + "learning_rate": 0.00019568875239223745, + "loss": 12.6506, + "step": 4444 + }, + { + "epoch": 0.24204801481148708, + "grad_norm": 0.6871753042850144, + "learning_rate": 0.00019568619068023805, + "loss": 12.6058, + "step": 4445 + }, + { + "epoch": 0.24210246880807007, + "grad_norm": 0.7924294900049751, + "learning_rate": 0.0001956836282241668, + "loss": 12.597, + "step": 4446 + }, + { + "epoch": 0.2421569228046531, + "grad_norm": 0.7458698287245629, + "learning_rate": 0.0001956810650240436, + "loss": 12.5591, + "step": 4447 + }, + { + "epoch": 0.2422113768012361, + "grad_norm": 0.7321544092410979, + "learning_rate": 0.0001956785010798884, + "loss": 12.5925, + "step": 4448 + }, + { + "epoch": 0.2422658307978191, + "grad_norm": 0.7487279437867312, + "learning_rate": 0.00019567593639172116, + "loss": 12.8325, + "step": 4449 + }, + { + "epoch": 0.24232028479440212, + "grad_norm": 0.6905942359080257, + "learning_rate": 0.00019567337095956173, + "loss": 12.5908, + "step": 4450 + }, + { + "epoch": 0.24237473879098514, + "grad_norm": 0.7374695447895508, + "learning_rate": 0.00019567080478343017, + "loss": 12.727, + "step": 4451 + }, + { + "epoch": 0.24242919278756817, + "grad_norm": 0.6704989604779045, + "learning_rate": 0.00019566823786334637, + "loss": 12.5303, + "step": 4452 + }, + { + "epoch": 0.24248364678415116, + "grad_norm": 0.8433399654324624, + "learning_rate": 0.0001956656701993303, + "loss": 12.7586, + "step": 4453 + }, + { + "epoch": 0.24253810078073418, + "grad_norm": 0.6902272620976142, + "learning_rate": 0.00019566310179140194, + "loss": 12.5854, + "step": 4454 + }, + { + "epoch": 0.2425925547773172, + "grad_norm": 0.6414382311660554, + "learning_rate": 0.0001956605326395813, + "loss": 12.5247, + "step": 4455 + }, + { + "epoch": 0.2426470087739002, + "grad_norm": 0.6781107751163772, + "learning_rate": 0.0001956579627438883, + "loss": 12.6554, + "step": 4456 + }, + { + "epoch": 0.2427014627704832, + "grad_norm": 0.6529822601179021, + "learning_rate": 0.0001956553921043429, + "loss": 12.4097, + "step": 4457 + }, + { + "epoch": 0.24275591676706623, + "grad_norm": 0.64671283128718, + "learning_rate": 0.00019565282072096514, + "loss": 12.6114, + "step": 4458 + }, + { + "epoch": 0.24281037076364922, + "grad_norm": 0.7302055774093955, + "learning_rate": 0.000195650248593775, + "loss": 12.7056, + "step": 4459 + }, + { + "epoch": 0.24286482476023225, + "grad_norm": 0.6869491716617113, + "learning_rate": 0.00019564767572279252, + "loss": 12.5807, + "step": 4460 + }, + { + "epoch": 0.24291927875681527, + "grad_norm": 0.7093695704767358, + "learning_rate": 0.00019564510210803767, + "loss": 12.6981, + "step": 4461 + }, + { + "epoch": 0.24297373275339826, + "grad_norm": 0.7599144682393941, + "learning_rate": 0.00019564252774953046, + "loss": 12.6494, + "step": 4462 + }, + { + "epoch": 0.24302818674998128, + "grad_norm": 0.6510888654090853, + "learning_rate": 0.00019563995264729092, + "loss": 12.5583, + "step": 4463 + }, + { + "epoch": 0.2430826407465643, + "grad_norm": 0.6517012342100406, + "learning_rate": 0.00019563737680133904, + "loss": 12.5919, + "step": 4464 + }, + { + "epoch": 0.2431370947431473, + "grad_norm": 0.7345404494271482, + "learning_rate": 0.0001956348002116949, + "loss": 12.5523, + "step": 4465 + }, + { + "epoch": 0.2431915487397303, + "grad_norm": 0.6549193855626367, + "learning_rate": 0.0001956322228783785, + "loss": 12.6657, + "step": 4466 + }, + { + "epoch": 0.24324600273631333, + "grad_norm": 0.713822158827217, + "learning_rate": 0.00019562964480140992, + "loss": 12.6496, + "step": 4467 + }, + { + "epoch": 0.24330045673289635, + "grad_norm": 0.6505310466098979, + "learning_rate": 0.00019562706598080917, + "loss": 12.6382, + "step": 4468 + }, + { + "epoch": 0.24335491072947935, + "grad_norm": 0.6734696028472843, + "learning_rate": 0.00019562448641659633, + "loss": 12.5192, + "step": 4469 + }, + { + "epoch": 0.24340936472606237, + "grad_norm": 0.7173359735388568, + "learning_rate": 0.00019562190610879142, + "loss": 12.6257, + "step": 4470 + }, + { + "epoch": 0.2434638187226454, + "grad_norm": 0.6768112396798218, + "learning_rate": 0.0001956193250574146, + "loss": 12.4036, + "step": 4471 + }, + { + "epoch": 0.24351827271922838, + "grad_norm": 0.8482952575831432, + "learning_rate": 0.0001956167432624858, + "loss": 12.6744, + "step": 4472 + }, + { + "epoch": 0.2435727267158114, + "grad_norm": 0.6896232479270313, + "learning_rate": 0.0001956141607240252, + "loss": 12.5744, + "step": 4473 + }, + { + "epoch": 0.24362718071239442, + "grad_norm": 0.7615411170903834, + "learning_rate": 0.00019561157744205283, + "loss": 12.5567, + "step": 4474 + }, + { + "epoch": 0.2436816347089774, + "grad_norm": 0.7651799858552987, + "learning_rate": 0.00019560899341658882, + "loss": 12.6451, + "step": 4475 + }, + { + "epoch": 0.24373608870556043, + "grad_norm": 0.7505220181840234, + "learning_rate": 0.00019560640864765326, + "loss": 12.6047, + "step": 4476 + }, + { + "epoch": 0.24379054270214345, + "grad_norm": 0.7986243578023776, + "learning_rate": 0.0001956038231352662, + "loss": 12.5866, + "step": 4477 + }, + { + "epoch": 0.24384499669872645, + "grad_norm": 0.7308259229200806, + "learning_rate": 0.0001956012368794478, + "loss": 12.5419, + "step": 4478 + }, + { + "epoch": 0.24389945069530947, + "grad_norm": 0.7536534796161848, + "learning_rate": 0.00019559864988021814, + "loss": 12.4244, + "step": 4479 + }, + { + "epoch": 0.2439539046918925, + "grad_norm": 0.7359555051915565, + "learning_rate": 0.00019559606213759737, + "loss": 12.6927, + "step": 4480 + }, + { + "epoch": 0.24400835868847548, + "grad_norm": 0.6842061251039289, + "learning_rate": 0.00019559347365160555, + "loss": 12.6481, + "step": 4481 + }, + { + "epoch": 0.2440628126850585, + "grad_norm": 0.7077400251306337, + "learning_rate": 0.00019559088442226287, + "loss": 12.4308, + "step": 4482 + }, + { + "epoch": 0.24411726668164152, + "grad_norm": 0.7482556729844912, + "learning_rate": 0.00019558829444958942, + "loss": 12.5591, + "step": 4483 + }, + { + "epoch": 0.24417172067822454, + "grad_norm": 0.6449707271355242, + "learning_rate": 0.0001955857037336054, + "loss": 12.7376, + "step": 4484 + }, + { + "epoch": 0.24422617467480753, + "grad_norm": 0.7326655502768871, + "learning_rate": 0.00019558311227433088, + "loss": 12.4296, + "step": 4485 + }, + { + "epoch": 0.24428062867139055, + "grad_norm": 0.7200166033113317, + "learning_rate": 0.0001955805200717861, + "loss": 12.6405, + "step": 4486 + }, + { + "epoch": 0.24433508266797357, + "grad_norm": 0.7553742880217637, + "learning_rate": 0.00019557792712599113, + "loss": 12.5288, + "step": 4487 + }, + { + "epoch": 0.24438953666455657, + "grad_norm": 1.3477492623330136, + "learning_rate": 0.00019557533343696616, + "loss": 12.7448, + "step": 4488 + }, + { + "epoch": 0.2444439906611396, + "grad_norm": 0.747810008635918, + "learning_rate": 0.00019557273900473138, + "loss": 12.6993, + "step": 4489 + }, + { + "epoch": 0.2444984446577226, + "grad_norm": 0.7578599870862847, + "learning_rate": 0.00019557014382930697, + "loss": 12.5171, + "step": 4490 + }, + { + "epoch": 0.2445528986543056, + "grad_norm": 0.7716666873016084, + "learning_rate": 0.0001955675479107131, + "loss": 12.4974, + "step": 4491 + }, + { + "epoch": 0.24460735265088862, + "grad_norm": 0.7491187824359241, + "learning_rate": 0.0001955649512489699, + "loss": 12.6716, + "step": 4492 + }, + { + "epoch": 0.24466180664747164, + "grad_norm": 0.7317652042296284, + "learning_rate": 0.00019556235384409764, + "loss": 12.5856, + "step": 4493 + }, + { + "epoch": 0.24471626064405463, + "grad_norm": 0.7157174847925366, + "learning_rate": 0.00019555975569611654, + "loss": 12.7635, + "step": 4494 + }, + { + "epoch": 0.24477071464063765, + "grad_norm": 0.7177168932371809, + "learning_rate": 0.00019555715680504666, + "loss": 12.5829, + "step": 4495 + }, + { + "epoch": 0.24482516863722067, + "grad_norm": 0.7339058397426691, + "learning_rate": 0.0001955545571709084, + "loss": 12.6403, + "step": 4496 + }, + { + "epoch": 0.24487962263380367, + "grad_norm": 0.7295417996740731, + "learning_rate": 0.0001955519567937218, + "loss": 12.613, + "step": 4497 + }, + { + "epoch": 0.2449340766303867, + "grad_norm": 0.7351610935959223, + "learning_rate": 0.00019554935567350721, + "loss": 12.5433, + "step": 4498 + }, + { + "epoch": 0.2449885306269697, + "grad_norm": 0.6982007901341996, + "learning_rate": 0.00019554675381028478, + "loss": 12.7112, + "step": 4499 + }, + { + "epoch": 0.24504298462355273, + "grad_norm": 0.6846838175361749, + "learning_rate": 0.00019554415120407478, + "loss": 12.5285, + "step": 4500 + }, + { + "epoch": 0.24509743862013572, + "grad_norm": 0.7254197253188015, + "learning_rate": 0.00019554154785489744, + "loss": 12.4814, + "step": 4501 + }, + { + "epoch": 0.24515189261671874, + "grad_norm": 0.709088370584864, + "learning_rate": 0.000195538943762773, + "loss": 12.5766, + "step": 4502 + }, + { + "epoch": 0.24520634661330176, + "grad_norm": 0.9468716232091118, + "learning_rate": 0.00019553633892772172, + "loss": 12.6409, + "step": 4503 + }, + { + "epoch": 0.24526080060988475, + "grad_norm": 0.7480225416160845, + "learning_rate": 0.00019553373334976385, + "loss": 12.6486, + "step": 4504 + }, + { + "epoch": 0.24531525460646778, + "grad_norm": 0.8838370936909044, + "learning_rate": 0.00019553112702891962, + "loss": 12.545, + "step": 4505 + }, + { + "epoch": 0.2453697086030508, + "grad_norm": 0.7603176328637311, + "learning_rate": 0.00019552851996520936, + "loss": 12.6969, + "step": 4506 + }, + { + "epoch": 0.2454241625996338, + "grad_norm": 0.6745237563930223, + "learning_rate": 0.00019552591215865327, + "loss": 12.5745, + "step": 4507 + }, + { + "epoch": 0.2454786165962168, + "grad_norm": 0.7399326419492032, + "learning_rate": 0.0001955233036092717, + "loss": 12.6872, + "step": 4508 + }, + { + "epoch": 0.24553307059279983, + "grad_norm": 0.7077478699077312, + "learning_rate": 0.0001955206943170849, + "loss": 12.5911, + "step": 4509 + }, + { + "epoch": 0.24558752458938282, + "grad_norm": 0.7307766289166041, + "learning_rate": 0.00019551808428211313, + "loss": 12.5886, + "step": 4510 + }, + { + "epoch": 0.24564197858596584, + "grad_norm": 0.6607937087585422, + "learning_rate": 0.00019551547350437677, + "loss": 12.6056, + "step": 4511 + }, + { + "epoch": 0.24569643258254886, + "grad_norm": 0.6781420170848789, + "learning_rate": 0.000195512861983896, + "loss": 12.5309, + "step": 4512 + }, + { + "epoch": 0.24575088657913186, + "grad_norm": 0.7274451215503108, + "learning_rate": 0.00019551024972069126, + "loss": 12.6383, + "step": 4513 + }, + { + "epoch": 0.24580534057571488, + "grad_norm": 0.7328547155168079, + "learning_rate": 0.00019550763671478277, + "loss": 12.6395, + "step": 4514 + }, + { + "epoch": 0.2458597945722979, + "grad_norm": 0.7098023166777683, + "learning_rate": 0.00019550502296619089, + "loss": 12.5694, + "step": 4515 + }, + { + "epoch": 0.2459142485688809, + "grad_norm": 0.8286201301595062, + "learning_rate": 0.00019550240847493594, + "loss": 12.6161, + "step": 4516 + }, + { + "epoch": 0.2459687025654639, + "grad_norm": 0.74030169363629, + "learning_rate": 0.00019549979324103825, + "loss": 12.6025, + "step": 4517 + }, + { + "epoch": 0.24602315656204693, + "grad_norm": 0.6559365019851384, + "learning_rate": 0.00019549717726451813, + "loss": 12.6041, + "step": 4518 + }, + { + "epoch": 0.24607761055862995, + "grad_norm": 0.7240693992512116, + "learning_rate": 0.00019549456054539596, + "loss": 12.6527, + "step": 4519 + }, + { + "epoch": 0.24613206455521294, + "grad_norm": 0.8447464162145961, + "learning_rate": 0.0001954919430836921, + "loss": 12.5594, + "step": 4520 + }, + { + "epoch": 0.24618651855179596, + "grad_norm": 0.7760926547157477, + "learning_rate": 0.00019548932487942683, + "loss": 12.6414, + "step": 4521 + }, + { + "epoch": 0.24624097254837898, + "grad_norm": 0.7862686525620901, + "learning_rate": 0.00019548670593262055, + "loss": 12.5695, + "step": 4522 + }, + { + "epoch": 0.24629542654496198, + "grad_norm": 0.6977819603616029, + "learning_rate": 0.00019548408624329368, + "loss": 12.5997, + "step": 4523 + }, + { + "epoch": 0.246349880541545, + "grad_norm": 0.7581882072818931, + "learning_rate": 0.0001954814658114665, + "loss": 12.5679, + "step": 4524 + }, + { + "epoch": 0.24640433453812802, + "grad_norm": 0.7417220054167826, + "learning_rate": 0.00019547884463715944, + "loss": 12.5288, + "step": 4525 + }, + { + "epoch": 0.246458788534711, + "grad_norm": 0.6571353614508388, + "learning_rate": 0.00019547622272039287, + "loss": 12.5123, + "step": 4526 + }, + { + "epoch": 0.24651324253129403, + "grad_norm": 0.9819943625994235, + "learning_rate": 0.0001954736000611872, + "loss": 12.6171, + "step": 4527 + }, + { + "epoch": 0.24656769652787705, + "grad_norm": 0.8662000801950319, + "learning_rate": 0.0001954709766595628, + "loss": 12.702, + "step": 4528 + }, + { + "epoch": 0.24662215052446004, + "grad_norm": 0.7330313779627509, + "learning_rate": 0.00019546835251554008, + "loss": 12.6235, + "step": 4529 + }, + { + "epoch": 0.24667660452104306, + "grad_norm": 0.6708599988369802, + "learning_rate": 0.00019546572762913942, + "loss": 12.6261, + "step": 4530 + }, + { + "epoch": 0.24673105851762608, + "grad_norm": 0.6875341786134898, + "learning_rate": 0.00019546310200038125, + "loss": 12.648, + "step": 4531 + }, + { + "epoch": 0.24678551251420908, + "grad_norm": 0.6884220381444792, + "learning_rate": 0.000195460475629286, + "loss": 12.6227, + "step": 4532 + }, + { + "epoch": 0.2468399665107921, + "grad_norm": 0.7260017148269406, + "learning_rate": 0.0001954578485158741, + "loss": 12.4933, + "step": 4533 + }, + { + "epoch": 0.24689442050737512, + "grad_norm": 0.7986836441157589, + "learning_rate": 0.00019545522066016595, + "loss": 12.5843, + "step": 4534 + }, + { + "epoch": 0.24694887450395814, + "grad_norm": 0.7316985170974813, + "learning_rate": 0.00019545259206218198, + "loss": 12.6712, + "step": 4535 + }, + { + "epoch": 0.24700332850054113, + "grad_norm": 0.716498663788949, + "learning_rate": 0.00019544996272194266, + "loss": 12.6374, + "step": 4536 + }, + { + "epoch": 0.24705778249712415, + "grad_norm": 0.6871600074292533, + "learning_rate": 0.00019544733263946845, + "loss": 12.5411, + "step": 4537 + }, + { + "epoch": 0.24711223649370717, + "grad_norm": 0.8407670283999826, + "learning_rate": 0.0001954447018147797, + "loss": 12.6246, + "step": 4538 + }, + { + "epoch": 0.24716669049029016, + "grad_norm": 0.722063707325831, + "learning_rate": 0.00019544207024789703, + "loss": 12.4979, + "step": 4539 + }, + { + "epoch": 0.24722114448687318, + "grad_norm": 0.7595660700514336, + "learning_rate": 0.00019543943793884076, + "loss": 12.6016, + "step": 4540 + }, + { + "epoch": 0.2472755984834562, + "grad_norm": 0.7803560129403373, + "learning_rate": 0.00019543680488763143, + "loss": 12.6529, + "step": 4541 + }, + { + "epoch": 0.2473300524800392, + "grad_norm": 0.6852586791744784, + "learning_rate": 0.00019543417109428953, + "loss": 12.6201, + "step": 4542 + }, + { + "epoch": 0.24738450647662222, + "grad_norm": 0.708842264429155, + "learning_rate": 0.00019543153655883545, + "loss": 12.6531, + "step": 4543 + }, + { + "epoch": 0.24743896047320524, + "grad_norm": 0.6785694911685014, + "learning_rate": 0.0001954289012812898, + "loss": 12.6396, + "step": 4544 + }, + { + "epoch": 0.24749341446978823, + "grad_norm": 0.6822476492481611, + "learning_rate": 0.00019542626526167296, + "loss": 12.5853, + "step": 4545 + }, + { + "epoch": 0.24754786846637125, + "grad_norm": 0.6755924097105872, + "learning_rate": 0.0001954236285000055, + "loss": 12.5671, + "step": 4546 + }, + { + "epoch": 0.24760232246295427, + "grad_norm": 0.702809518986082, + "learning_rate": 0.0001954209909963079, + "loss": 12.5755, + "step": 4547 + }, + { + "epoch": 0.24765677645953726, + "grad_norm": 0.6967556394981288, + "learning_rate": 0.0001954183527506007, + "loss": 12.6763, + "step": 4548 + }, + { + "epoch": 0.24771123045612028, + "grad_norm": 0.662645469888712, + "learning_rate": 0.00019541571376290436, + "loss": 12.5686, + "step": 4549 + }, + { + "epoch": 0.2477656844527033, + "grad_norm": 0.74349023652389, + "learning_rate": 0.00019541307403323944, + "loss": 12.6814, + "step": 4550 + }, + { + "epoch": 0.24782013844928633, + "grad_norm": 0.6806950441026586, + "learning_rate": 0.00019541043356162643, + "loss": 12.5481, + "step": 4551 + }, + { + "epoch": 0.24787459244586932, + "grad_norm": 0.6826692501332223, + "learning_rate": 0.0001954077923480859, + "loss": 12.522, + "step": 4552 + }, + { + "epoch": 0.24792904644245234, + "grad_norm": 0.7024226341344949, + "learning_rate": 0.00019540515039263837, + "loss": 12.5405, + "step": 4553 + }, + { + "epoch": 0.24798350043903536, + "grad_norm": 0.7953597336673244, + "learning_rate": 0.00019540250769530443, + "loss": 12.6771, + "step": 4554 + }, + { + "epoch": 0.24803795443561835, + "grad_norm": 0.6986416150802731, + "learning_rate": 0.00019539986425610453, + "loss": 12.5993, + "step": 4555 + }, + { + "epoch": 0.24809240843220137, + "grad_norm": 0.7842682390286129, + "learning_rate": 0.00019539722007505934, + "loss": 12.6273, + "step": 4556 + }, + { + "epoch": 0.2481468624287844, + "grad_norm": 0.7765687604639168, + "learning_rate": 0.00019539457515218932, + "loss": 12.5093, + "step": 4557 + }, + { + "epoch": 0.24820131642536739, + "grad_norm": 0.8003804060961116, + "learning_rate": 0.00019539192948751514, + "loss": 12.5759, + "step": 4558 + }, + { + "epoch": 0.2482557704219504, + "grad_norm": 0.7202326885917015, + "learning_rate": 0.0001953892830810573, + "loss": 12.4316, + "step": 4559 + }, + { + "epoch": 0.24831022441853343, + "grad_norm": 0.7474728919149864, + "learning_rate": 0.00019538663593283637, + "loss": 12.6285, + "step": 4560 + }, + { + "epoch": 0.24836467841511642, + "grad_norm": 0.7812249465629221, + "learning_rate": 0.00019538398804287298, + "loss": 12.6414, + "step": 4561 + }, + { + "epoch": 0.24841913241169944, + "grad_norm": 0.693946563301219, + "learning_rate": 0.00019538133941118772, + "loss": 12.4846, + "step": 4562 + }, + { + "epoch": 0.24847358640828246, + "grad_norm": 0.7697627760440247, + "learning_rate": 0.00019537869003780116, + "loss": 12.6134, + "step": 4563 + }, + { + "epoch": 0.24852804040486545, + "grad_norm": 0.908532305342887, + "learning_rate": 0.0001953760399227339, + "loss": 12.6881, + "step": 4564 + }, + { + "epoch": 0.24858249440144847, + "grad_norm": 0.6788580510320924, + "learning_rate": 0.00019537338906600659, + "loss": 12.5017, + "step": 4565 + }, + { + "epoch": 0.2486369483980315, + "grad_norm": 0.7197728170770006, + "learning_rate": 0.00019537073746763977, + "loss": 12.5242, + "step": 4566 + }, + { + "epoch": 0.2486914023946145, + "grad_norm": 0.8205666944352907, + "learning_rate": 0.00019536808512765413, + "loss": 12.7204, + "step": 4567 + }, + { + "epoch": 0.2487458563911975, + "grad_norm": 0.7434051541276757, + "learning_rate": 0.00019536543204607025, + "loss": 12.6962, + "step": 4568 + }, + { + "epoch": 0.24880031038778053, + "grad_norm": 0.6981666345210649, + "learning_rate": 0.00019536277822290878, + "loss": 12.645, + "step": 4569 + }, + { + "epoch": 0.24885476438436355, + "grad_norm": 0.7823360214580403, + "learning_rate": 0.00019536012365819038, + "loss": 12.6003, + "step": 4570 + }, + { + "epoch": 0.24890921838094654, + "grad_norm": 0.7563770621808997, + "learning_rate": 0.00019535746835193564, + "loss": 12.6248, + "step": 4571 + }, + { + "epoch": 0.24896367237752956, + "grad_norm": 0.7685321115617461, + "learning_rate": 0.00019535481230416524, + "loss": 12.6604, + "step": 4572 + }, + { + "epoch": 0.24901812637411258, + "grad_norm": 0.8625199287410501, + "learning_rate": 0.00019535215551489982, + "loss": 12.7927, + "step": 4573 + }, + { + "epoch": 0.24907258037069557, + "grad_norm": 0.8455092054785058, + "learning_rate": 0.00019534949798416006, + "loss": 12.6436, + "step": 4574 + }, + { + "epoch": 0.2491270343672786, + "grad_norm": 0.8004646051656875, + "learning_rate": 0.00019534683971196662, + "loss": 12.7403, + "step": 4575 + }, + { + "epoch": 0.24918148836386161, + "grad_norm": 0.7167876264537866, + "learning_rate": 0.00019534418069834013, + "loss": 12.6044, + "step": 4576 + }, + { + "epoch": 0.2492359423604446, + "grad_norm": 0.7419510607817421, + "learning_rate": 0.00019534152094330133, + "loss": 12.581, + "step": 4577 + }, + { + "epoch": 0.24929039635702763, + "grad_norm": 0.8028717993107427, + "learning_rate": 0.00019533886044687088, + "loss": 12.5883, + "step": 4578 + }, + { + "epoch": 0.24934485035361065, + "grad_norm": 0.7294878102992232, + "learning_rate": 0.00019533619920906946, + "loss": 12.5647, + "step": 4579 + }, + { + "epoch": 0.24939930435019364, + "grad_norm": 0.7266599602586925, + "learning_rate": 0.00019533353722991776, + "loss": 12.6332, + "step": 4580 + }, + { + "epoch": 0.24945375834677666, + "grad_norm": 0.7077771439865166, + "learning_rate": 0.00019533087450943648, + "loss": 12.6462, + "step": 4581 + }, + { + "epoch": 0.24950821234335968, + "grad_norm": 0.675721997388249, + "learning_rate": 0.00019532821104764633, + "loss": 12.706, + "step": 4582 + }, + { + "epoch": 0.24956266633994267, + "grad_norm": 0.729758379877045, + "learning_rate": 0.00019532554684456805, + "loss": 12.7759, + "step": 4583 + }, + { + "epoch": 0.2496171203365257, + "grad_norm": 0.8246553097877232, + "learning_rate": 0.0001953228819002223, + "loss": 12.5836, + "step": 4584 + }, + { + "epoch": 0.24967157433310871, + "grad_norm": 0.6787022092779801, + "learning_rate": 0.00019532021621462988, + "loss": 12.5742, + "step": 4585 + }, + { + "epoch": 0.24972602832969173, + "grad_norm": 0.7525877839664052, + "learning_rate": 0.00019531754978781141, + "loss": 12.528, + "step": 4586 + }, + { + "epoch": 0.24978048232627473, + "grad_norm": 0.7884072027096795, + "learning_rate": 0.00019531488261978773, + "loss": 12.5593, + "step": 4587 + }, + { + "epoch": 0.24983493632285775, + "grad_norm": 0.7306243201798892, + "learning_rate": 0.00019531221471057956, + "loss": 12.6264, + "step": 4588 + }, + { + "epoch": 0.24988939031944077, + "grad_norm": 0.6650271940846968, + "learning_rate": 0.00019530954606020759, + "loss": 12.5565, + "step": 4589 + }, + { + "epoch": 0.24994384431602376, + "grad_norm": 0.6988800412957692, + "learning_rate": 0.0001953068766686926, + "loss": 12.5774, + "step": 4590 + }, + { + "epoch": 0.24999829831260678, + "grad_norm": 0.8740708821128162, + "learning_rate": 0.0001953042065360554, + "loss": 12.8126, + "step": 4591 + }, + { + "epoch": 0.2500527523091898, + "grad_norm": 0.8266690825077698, + "learning_rate": 0.00019530153566231666, + "loss": 12.5194, + "step": 4592 + }, + { + "epoch": 0.2501072063057728, + "grad_norm": 0.6552219976399752, + "learning_rate": 0.00019529886404749723, + "loss": 12.4998, + "step": 4593 + }, + { + "epoch": 0.2501616603023558, + "grad_norm": 0.7949170552992534, + "learning_rate": 0.00019529619169161781, + "loss": 12.6232, + "step": 4594 + }, + { + "epoch": 0.2502161142989388, + "grad_norm": 0.7088863266575556, + "learning_rate": 0.00019529351859469928, + "loss": 12.6323, + "step": 4595 + }, + { + "epoch": 0.25027056829552186, + "grad_norm": 0.841859299254048, + "learning_rate": 0.0001952908447567623, + "loss": 12.6773, + "step": 4596 + }, + { + "epoch": 0.25032502229210485, + "grad_norm": 0.7756336581104332, + "learning_rate": 0.00019528817017782778, + "loss": 12.4028, + "step": 4597 + }, + { + "epoch": 0.25037947628868784, + "grad_norm": 0.7572196940976758, + "learning_rate": 0.00019528549485791646, + "loss": 12.5988, + "step": 4598 + }, + { + "epoch": 0.2504339302852709, + "grad_norm": 0.8035920475602101, + "learning_rate": 0.00019528281879704912, + "loss": 12.5705, + "step": 4599 + }, + { + "epoch": 0.2504883842818539, + "grad_norm": 0.6541580909787088, + "learning_rate": 0.00019528014199524663, + "loss": 12.53, + "step": 4600 + }, + { + "epoch": 0.2505428382784369, + "grad_norm": 0.7846336945332629, + "learning_rate": 0.0001952774644525298, + "loss": 12.6758, + "step": 4601 + }, + { + "epoch": 0.2505972922750199, + "grad_norm": 0.7364919279335622, + "learning_rate": 0.00019527478616891938, + "loss": 12.5414, + "step": 4602 + }, + { + "epoch": 0.2506517462716029, + "grad_norm": 0.7374226191410725, + "learning_rate": 0.00019527210714443628, + "loss": 12.6017, + "step": 4603 + }, + { + "epoch": 0.2507062002681859, + "grad_norm": 0.6828349388848473, + "learning_rate": 0.00019526942737910127, + "loss": 12.6738, + "step": 4604 + }, + { + "epoch": 0.25076065426476896, + "grad_norm": 0.7463262104769107, + "learning_rate": 0.00019526674687293525, + "loss": 12.655, + "step": 4605 + }, + { + "epoch": 0.25081510826135195, + "grad_norm": 0.6958274601778409, + "learning_rate": 0.000195264065625959, + "loss": 12.4563, + "step": 4606 + }, + { + "epoch": 0.25086956225793494, + "grad_norm": 0.8744339011342619, + "learning_rate": 0.0001952613836381934, + "loss": 12.6371, + "step": 4607 + }, + { + "epoch": 0.250924016254518, + "grad_norm": 0.7447951983208602, + "learning_rate": 0.00019525870090965935, + "loss": 12.5699, + "step": 4608 + }, + { + "epoch": 0.250978470251101, + "grad_norm": 0.8770057055867937, + "learning_rate": 0.00019525601744037764, + "loss": 12.6631, + "step": 4609 + }, + { + "epoch": 0.25103292424768403, + "grad_norm": 0.704687414173674, + "learning_rate": 0.00019525333323036913, + "loss": 12.4892, + "step": 4610 + }, + { + "epoch": 0.251087378244267, + "grad_norm": 0.8420204054546363, + "learning_rate": 0.0001952506482796548, + "loss": 12.5162, + "step": 4611 + }, + { + "epoch": 0.25114183224085, + "grad_norm": 0.7180873889371547, + "learning_rate": 0.00019524796258825537, + "loss": 12.6455, + "step": 4612 + }, + { + "epoch": 0.25119628623743306, + "grad_norm": 0.737165717454661, + "learning_rate": 0.00019524527615619186, + "loss": 12.7034, + "step": 4613 + }, + { + "epoch": 0.25125074023401606, + "grad_norm": 0.7384097064777905, + "learning_rate": 0.0001952425889834851, + "loss": 12.7689, + "step": 4614 + }, + { + "epoch": 0.25130519423059905, + "grad_norm": 0.8840678095617902, + "learning_rate": 0.00019523990107015598, + "loss": 12.7451, + "step": 4615 + }, + { + "epoch": 0.2513596482271821, + "grad_norm": 0.7163607304960726, + "learning_rate": 0.00019523721241622547, + "loss": 12.4507, + "step": 4616 + }, + { + "epoch": 0.2514141022237651, + "grad_norm": 0.7197221870928445, + "learning_rate": 0.0001952345230217144, + "loss": 12.757, + "step": 4617 + }, + { + "epoch": 0.2514685562203481, + "grad_norm": 0.8539264131334994, + "learning_rate": 0.0001952318328866437, + "loss": 12.5577, + "step": 4618 + }, + { + "epoch": 0.25152301021693113, + "grad_norm": 0.7491476006380345, + "learning_rate": 0.00019522914201103428, + "loss": 12.6483, + "step": 4619 + }, + { + "epoch": 0.2515774642135141, + "grad_norm": 0.6817222614207579, + "learning_rate": 0.00019522645039490708, + "loss": 12.4453, + "step": 4620 + }, + { + "epoch": 0.2516319182100971, + "grad_norm": 0.8458457675512254, + "learning_rate": 0.00019522375803828306, + "loss": 12.6245, + "step": 4621 + }, + { + "epoch": 0.25168637220668016, + "grad_norm": 0.8751640549465032, + "learning_rate": 0.0001952210649411831, + "loss": 12.6798, + "step": 4622 + }, + { + "epoch": 0.25174082620326316, + "grad_norm": 0.753038570022357, + "learning_rate": 0.0001952183711036282, + "loss": 12.5469, + "step": 4623 + }, + { + "epoch": 0.25179528019984615, + "grad_norm": 0.7411325111987628, + "learning_rate": 0.00019521567652563927, + "loss": 12.592, + "step": 4624 + }, + { + "epoch": 0.2518497341964292, + "grad_norm": 0.6747257568929965, + "learning_rate": 0.0001952129812072373, + "loss": 12.4863, + "step": 4625 + }, + { + "epoch": 0.2519041881930122, + "grad_norm": 0.7675323270556783, + "learning_rate": 0.00019521028514844316, + "loss": 12.4248, + "step": 4626 + }, + { + "epoch": 0.2519586421895952, + "grad_norm": 0.6906531701123305, + "learning_rate": 0.00019520758834927788, + "loss": 12.6437, + "step": 4627 + }, + { + "epoch": 0.25201309618617823, + "grad_norm": 0.8792626963119561, + "learning_rate": 0.00019520489080976247, + "loss": 12.6249, + "step": 4628 + }, + { + "epoch": 0.2520675501827612, + "grad_norm": 0.736497661373532, + "learning_rate": 0.00019520219252991785, + "loss": 12.5359, + "step": 4629 + }, + { + "epoch": 0.2521220041793442, + "grad_norm": 0.8371944344160387, + "learning_rate": 0.000195199493509765, + "loss": 12.6581, + "step": 4630 + }, + { + "epoch": 0.25217645817592726, + "grad_norm": 0.6801403777773931, + "learning_rate": 0.00019519679374932494, + "loss": 12.6248, + "step": 4631 + }, + { + "epoch": 0.25223091217251026, + "grad_norm": 0.7314974324886186, + "learning_rate": 0.00019519409324861864, + "loss": 12.5213, + "step": 4632 + }, + { + "epoch": 0.25228536616909325, + "grad_norm": 0.7927647348104103, + "learning_rate": 0.0001951913920076671, + "loss": 12.551, + "step": 4633 + }, + { + "epoch": 0.2523398201656763, + "grad_norm": 0.652316124095355, + "learning_rate": 0.00019518869002649135, + "loss": 12.5875, + "step": 4634 + }, + { + "epoch": 0.2523942741622593, + "grad_norm": 0.7051434214547742, + "learning_rate": 0.00019518598730511238, + "loss": 12.6323, + "step": 4635 + }, + { + "epoch": 0.2524487281588423, + "grad_norm": 0.7956144347172418, + "learning_rate": 0.00019518328384355118, + "loss": 12.7241, + "step": 4636 + }, + { + "epoch": 0.25250318215542533, + "grad_norm": 0.7341236561572148, + "learning_rate": 0.00019518057964182882, + "loss": 12.6638, + "step": 4637 + }, + { + "epoch": 0.2525576361520083, + "grad_norm": 0.8583819376050722, + "learning_rate": 0.0001951778746999663, + "loss": 12.5505, + "step": 4638 + }, + { + "epoch": 0.2526120901485913, + "grad_norm": 0.7002431638214149, + "learning_rate": 0.00019517516901798468, + "loss": 12.5828, + "step": 4639 + }, + { + "epoch": 0.25266654414517437, + "grad_norm": 0.6704356853347958, + "learning_rate": 0.00019517246259590502, + "loss": 12.3791, + "step": 4640 + }, + { + "epoch": 0.25272099814175736, + "grad_norm": 0.776267355947518, + "learning_rate": 0.0001951697554337483, + "loss": 12.6886, + "step": 4641 + }, + { + "epoch": 0.25277545213834035, + "grad_norm": 0.6812531539118747, + "learning_rate": 0.0001951670475315356, + "loss": 12.5179, + "step": 4642 + }, + { + "epoch": 0.2528299061349234, + "grad_norm": 0.7270298855868224, + "learning_rate": 0.00019516433888928795, + "loss": 12.5398, + "step": 4643 + }, + { + "epoch": 0.2528843601315064, + "grad_norm": 0.6768849151005556, + "learning_rate": 0.00019516162950702649, + "loss": 12.5489, + "step": 4644 + }, + { + "epoch": 0.25293881412808944, + "grad_norm": 0.6724977998810044, + "learning_rate": 0.00019515891938477222, + "loss": 12.5863, + "step": 4645 + }, + { + "epoch": 0.25299326812467243, + "grad_norm": 0.7309444180220243, + "learning_rate": 0.00019515620852254625, + "loss": 12.4596, + "step": 4646 + }, + { + "epoch": 0.2530477221212554, + "grad_norm": 0.7977134686558187, + "learning_rate": 0.00019515349692036962, + "loss": 12.7878, + "step": 4647 + }, + { + "epoch": 0.2531021761178385, + "grad_norm": 0.6862794292196578, + "learning_rate": 0.00019515078457826344, + "loss": 12.4598, + "step": 4648 + }, + { + "epoch": 0.25315663011442147, + "grad_norm": 0.701878486537764, + "learning_rate": 0.0001951480714962488, + "loss": 12.5413, + "step": 4649 + }, + { + "epoch": 0.25321108411100446, + "grad_norm": 0.8180141005964334, + "learning_rate": 0.0001951453576743468, + "loss": 12.396, + "step": 4650 + }, + { + "epoch": 0.2532655381075875, + "grad_norm": 0.7346431155727067, + "learning_rate": 0.00019514264311257858, + "loss": 12.6374, + "step": 4651 + }, + { + "epoch": 0.2533199921041705, + "grad_norm": 0.6830933567935363, + "learning_rate": 0.00019513992781096517, + "loss": 12.6301, + "step": 4652 + }, + { + "epoch": 0.2533744461007535, + "grad_norm": 0.716153198686455, + "learning_rate": 0.00019513721176952776, + "loss": 12.568, + "step": 4653 + }, + { + "epoch": 0.25342890009733654, + "grad_norm": 0.6666130419809555, + "learning_rate": 0.00019513449498828738, + "loss": 12.5816, + "step": 4654 + }, + { + "epoch": 0.25348335409391953, + "grad_norm": 0.7618240849952472, + "learning_rate": 0.00019513177746726526, + "loss": 12.5489, + "step": 4655 + }, + { + "epoch": 0.2535378080905025, + "grad_norm": 0.7861808479939798, + "learning_rate": 0.0001951290592064825, + "loss": 12.6229, + "step": 4656 + }, + { + "epoch": 0.2535922620870856, + "grad_norm": 0.7792128962577906, + "learning_rate": 0.00019512634020596022, + "loss": 12.6316, + "step": 4657 + }, + { + "epoch": 0.25364671608366857, + "grad_norm": 0.728082447715372, + "learning_rate": 0.00019512362046571953, + "loss": 12.6125, + "step": 4658 + }, + { + "epoch": 0.25370117008025156, + "grad_norm": 0.6593637366358605, + "learning_rate": 0.00019512089998578163, + "loss": 12.4644, + "step": 4659 + }, + { + "epoch": 0.2537556240768346, + "grad_norm": 0.6873917824355797, + "learning_rate": 0.00019511817876616765, + "loss": 12.6692, + "step": 4660 + }, + { + "epoch": 0.2538100780734176, + "grad_norm": 0.6782354154408184, + "learning_rate": 0.00019511545680689878, + "loss": 12.506, + "step": 4661 + }, + { + "epoch": 0.2538645320700006, + "grad_norm": 0.7415950041617407, + "learning_rate": 0.00019511273410799615, + "loss": 12.5421, + "step": 4662 + }, + { + "epoch": 0.25391898606658364, + "grad_norm": 0.767687803641539, + "learning_rate": 0.00019511001066948097, + "loss": 12.6216, + "step": 4663 + }, + { + "epoch": 0.25397344006316663, + "grad_norm": 0.856036373749487, + "learning_rate": 0.00019510728649137438, + "loss": 12.651, + "step": 4664 + }, + { + "epoch": 0.2540278940597496, + "grad_norm": 0.7352629375371106, + "learning_rate": 0.0001951045615736976, + "loss": 12.5666, + "step": 4665 + }, + { + "epoch": 0.2540823480563327, + "grad_norm": 0.7208212681736773, + "learning_rate": 0.00019510183591647174, + "loss": 12.6415, + "step": 4666 + }, + { + "epoch": 0.25413680205291567, + "grad_norm": 0.7564664297026775, + "learning_rate": 0.00019509910951971812, + "loss": 12.7613, + "step": 4667 + }, + { + "epoch": 0.25419125604949866, + "grad_norm": 0.6759363508762495, + "learning_rate": 0.00019509638238345787, + "loss": 12.5443, + "step": 4668 + }, + { + "epoch": 0.2542457100460817, + "grad_norm": 0.7440815182574181, + "learning_rate": 0.00019509365450771219, + "loss": 12.6338, + "step": 4669 + }, + { + "epoch": 0.2543001640426647, + "grad_norm": 0.7406671177032623, + "learning_rate": 0.00019509092589250232, + "loss": 12.5788, + "step": 4670 + }, + { + "epoch": 0.2543546180392477, + "grad_norm": 0.7009872427479097, + "learning_rate": 0.00019508819653784942, + "loss": 12.6131, + "step": 4671 + }, + { + "epoch": 0.25440907203583074, + "grad_norm": 0.7815802022245177, + "learning_rate": 0.00019508546644377478, + "loss": 12.7572, + "step": 4672 + }, + { + "epoch": 0.25446352603241373, + "grad_norm": 0.6203163966102102, + "learning_rate": 0.00019508273561029963, + "loss": 12.6344, + "step": 4673 + }, + { + "epoch": 0.2545179800289967, + "grad_norm": 0.6469569272437944, + "learning_rate": 0.00019508000403744517, + "loss": 12.7187, + "step": 4674 + }, + { + "epoch": 0.2545724340255798, + "grad_norm": 0.709319102432162, + "learning_rate": 0.00019507727172523264, + "loss": 12.6195, + "step": 4675 + }, + { + "epoch": 0.25462688802216277, + "grad_norm": 0.6847848179775061, + "learning_rate": 0.0001950745386736833, + "loss": 12.717, + "step": 4676 + }, + { + "epoch": 0.2546813420187458, + "grad_norm": 0.6724445808052889, + "learning_rate": 0.0001950718048828184, + "loss": 12.4394, + "step": 4677 + }, + { + "epoch": 0.2547357960153288, + "grad_norm": 0.7258117422289937, + "learning_rate": 0.00019506907035265924, + "loss": 12.6126, + "step": 4678 + }, + { + "epoch": 0.2547902500119118, + "grad_norm": 0.7029932545816441, + "learning_rate": 0.000195066335083227, + "loss": 12.6328, + "step": 4679 + }, + { + "epoch": 0.25484470400849485, + "grad_norm": 0.7363875572775394, + "learning_rate": 0.00019506359907454302, + "loss": 12.6315, + "step": 4680 + }, + { + "epoch": 0.25489915800507784, + "grad_norm": 0.7120951164580571, + "learning_rate": 0.00019506086232662858, + "loss": 12.4803, + "step": 4681 + }, + { + "epoch": 0.25495361200166083, + "grad_norm": 0.7888478831502377, + "learning_rate": 0.00019505812483950488, + "loss": 12.6586, + "step": 4682 + }, + { + "epoch": 0.2550080659982439, + "grad_norm": 0.6638333450862576, + "learning_rate": 0.00019505538661319328, + "loss": 12.5237, + "step": 4683 + }, + { + "epoch": 0.2550625199948269, + "grad_norm": 0.6695752920801711, + "learning_rate": 0.00019505264764771505, + "loss": 12.5669, + "step": 4684 + }, + { + "epoch": 0.25511697399140987, + "grad_norm": 0.7552995697491214, + "learning_rate": 0.00019504990794309151, + "loss": 12.5755, + "step": 4685 + }, + { + "epoch": 0.2551714279879929, + "grad_norm": 0.7076072556809054, + "learning_rate": 0.00019504716749934394, + "loss": 12.5666, + "step": 4686 + }, + { + "epoch": 0.2552258819845759, + "grad_norm": 0.7688772490788264, + "learning_rate": 0.00019504442631649362, + "loss": 12.7533, + "step": 4687 + }, + { + "epoch": 0.2552803359811589, + "grad_norm": 0.7287168089619401, + "learning_rate": 0.00019504168439456193, + "loss": 12.4461, + "step": 4688 + }, + { + "epoch": 0.25533478997774195, + "grad_norm": 0.7621658441592705, + "learning_rate": 0.00019503894173357017, + "loss": 12.4661, + "step": 4689 + }, + { + "epoch": 0.25538924397432494, + "grad_norm": 0.6661750509434194, + "learning_rate": 0.00019503619833353966, + "loss": 12.5776, + "step": 4690 + }, + { + "epoch": 0.25544369797090793, + "grad_norm": 0.8341339017909717, + "learning_rate": 0.00019503345419449172, + "loss": 12.7292, + "step": 4691 + }, + { + "epoch": 0.255498151967491, + "grad_norm": 0.7358515877426577, + "learning_rate": 0.0001950307093164477, + "loss": 12.6913, + "step": 4692 + }, + { + "epoch": 0.255552605964074, + "grad_norm": 0.7633606014871572, + "learning_rate": 0.00019502796369942895, + "loss": 12.5919, + "step": 4693 + }, + { + "epoch": 0.25560705996065697, + "grad_norm": 0.7378217364635543, + "learning_rate": 0.00019502521734345685, + "loss": 12.6138, + "step": 4694 + }, + { + "epoch": 0.25566151395724, + "grad_norm": 0.7411934737877932, + "learning_rate": 0.00019502247024855268, + "loss": 12.5775, + "step": 4695 + }, + { + "epoch": 0.255715967953823, + "grad_norm": 0.6504062988581084, + "learning_rate": 0.00019501972241473786, + "loss": 12.7351, + "step": 4696 + }, + { + "epoch": 0.255770421950406, + "grad_norm": 0.7644212366301545, + "learning_rate": 0.00019501697384203376, + "loss": 12.5324, + "step": 4697 + }, + { + "epoch": 0.25582487594698905, + "grad_norm": 0.6450263672882387, + "learning_rate": 0.00019501422453046174, + "loss": 12.4571, + "step": 4698 + }, + { + "epoch": 0.25587932994357204, + "grad_norm": 0.7020161938921881, + "learning_rate": 0.00019501147448004318, + "loss": 12.5161, + "step": 4699 + }, + { + "epoch": 0.25593378394015504, + "grad_norm": 0.6694991425623197, + "learning_rate": 0.00019500872369079944, + "loss": 12.5092, + "step": 4700 + }, + { + "epoch": 0.2559882379367381, + "grad_norm": 0.6472421122968421, + "learning_rate": 0.0001950059721627519, + "loss": 12.4466, + "step": 4701 + }, + { + "epoch": 0.2560426919333211, + "grad_norm": 0.6765932649098179, + "learning_rate": 0.00019500321989592204, + "loss": 12.5659, + "step": 4702 + }, + { + "epoch": 0.25609714592990407, + "grad_norm": 0.7110563531735874, + "learning_rate": 0.0001950004668903312, + "loss": 12.5553, + "step": 4703 + }, + { + "epoch": 0.2561515999264871, + "grad_norm": 0.729546587513424, + "learning_rate": 0.0001949977131460008, + "loss": 12.5421, + "step": 4704 + }, + { + "epoch": 0.2562060539230701, + "grad_norm": 0.6992897637338722, + "learning_rate": 0.00019499495866295225, + "loss": 12.6526, + "step": 4705 + }, + { + "epoch": 0.2562605079196531, + "grad_norm": 0.7222729187165062, + "learning_rate": 0.00019499220344120697, + "loss": 12.6912, + "step": 4706 + }, + { + "epoch": 0.25631496191623615, + "grad_norm": 0.8148839595759461, + "learning_rate": 0.00019498944748078638, + "loss": 12.6897, + "step": 4707 + }, + { + "epoch": 0.25636941591281914, + "grad_norm": 0.6505194283908418, + "learning_rate": 0.0001949866907817119, + "loss": 12.553, + "step": 4708 + }, + { + "epoch": 0.25642386990940214, + "grad_norm": 0.753503763796297, + "learning_rate": 0.000194983933344005, + "loss": 12.5926, + "step": 4709 + }, + { + "epoch": 0.2564783239059852, + "grad_norm": 0.7228660846866438, + "learning_rate": 0.0001949811751676871, + "loss": 12.5322, + "step": 4710 + }, + { + "epoch": 0.2565327779025682, + "grad_norm": 0.749767652793384, + "learning_rate": 0.00019497841625277967, + "loss": 12.5987, + "step": 4711 + }, + { + "epoch": 0.2565872318991512, + "grad_norm": 0.7264752568702961, + "learning_rate": 0.00019497565659930413, + "loss": 12.6687, + "step": 4712 + }, + { + "epoch": 0.2566416858957342, + "grad_norm": 0.7555201243456778, + "learning_rate": 0.00019497289620728196, + "loss": 12.5931, + "step": 4713 + }, + { + "epoch": 0.2566961398923172, + "grad_norm": 0.7307088174451337, + "learning_rate": 0.00019497013507673464, + "loss": 12.6921, + "step": 4714 + }, + { + "epoch": 0.25675059388890026, + "grad_norm": 0.7658765194364088, + "learning_rate": 0.00019496737320768358, + "loss": 12.6566, + "step": 4715 + }, + { + "epoch": 0.25680504788548325, + "grad_norm": 0.8126722060344513, + "learning_rate": 0.00019496461060015036, + "loss": 12.6151, + "step": 4716 + }, + { + "epoch": 0.25685950188206624, + "grad_norm": 0.6542168082813249, + "learning_rate": 0.00019496184725415635, + "loss": 12.4329, + "step": 4717 + }, + { + "epoch": 0.2569139558786493, + "grad_norm": 0.6634464108209316, + "learning_rate": 0.00019495908316972314, + "loss": 12.4756, + "step": 4718 + }, + { + "epoch": 0.2569684098752323, + "grad_norm": 0.7463975669736462, + "learning_rate": 0.00019495631834687212, + "loss": 12.6905, + "step": 4719 + }, + { + "epoch": 0.2570228638718153, + "grad_norm": 0.7573454303384313, + "learning_rate": 0.00019495355278562488, + "loss": 12.7064, + "step": 4720 + }, + { + "epoch": 0.2570773178683983, + "grad_norm": 0.734300825979996, + "learning_rate": 0.00019495078648600287, + "loss": 12.6519, + "step": 4721 + }, + { + "epoch": 0.2571317718649813, + "grad_norm": 0.6784505364892656, + "learning_rate": 0.00019494801944802762, + "loss": 12.4599, + "step": 4722 + }, + { + "epoch": 0.2571862258615643, + "grad_norm": 0.8001786847577275, + "learning_rate": 0.00019494525167172068, + "loss": 12.7742, + "step": 4723 + }, + { + "epoch": 0.25724067985814736, + "grad_norm": 0.6908462600955748, + "learning_rate": 0.00019494248315710352, + "loss": 12.6113, + "step": 4724 + }, + { + "epoch": 0.25729513385473035, + "grad_norm": 0.6502127393656697, + "learning_rate": 0.0001949397139041977, + "loss": 12.2824, + "step": 4725 + }, + { + "epoch": 0.25734958785131334, + "grad_norm": 0.7030919669677971, + "learning_rate": 0.00019493694391302472, + "loss": 12.6083, + "step": 4726 + }, + { + "epoch": 0.2574040418478964, + "grad_norm": 0.6798754788609441, + "learning_rate": 0.00019493417318360617, + "loss": 12.4886, + "step": 4727 + }, + { + "epoch": 0.2574584958444794, + "grad_norm": 0.7230420580117327, + "learning_rate": 0.00019493140171596355, + "loss": 12.6009, + "step": 4728 + }, + { + "epoch": 0.2575129498410624, + "grad_norm": 0.720929907434017, + "learning_rate": 0.00019492862951011843, + "loss": 12.6411, + "step": 4729 + }, + { + "epoch": 0.2575674038376454, + "grad_norm": 0.7651425086653374, + "learning_rate": 0.00019492585656609237, + "loss": 12.6533, + "step": 4730 + }, + { + "epoch": 0.2576218578342284, + "grad_norm": 0.8158447242348982, + "learning_rate": 0.00019492308288390694, + "loss": 12.69, + "step": 4731 + }, + { + "epoch": 0.2576763118308114, + "grad_norm": 0.7244070048646627, + "learning_rate": 0.00019492030846358368, + "loss": 12.6977, + "step": 4732 + }, + { + "epoch": 0.25773076582739446, + "grad_norm": 0.7611098891379391, + "learning_rate": 0.0001949175333051442, + "loss": 12.7027, + "step": 4733 + }, + { + "epoch": 0.25778521982397745, + "grad_norm": 0.7838925677761639, + "learning_rate": 0.00019491475740861006, + "loss": 12.607, + "step": 4734 + }, + { + "epoch": 0.25783967382056044, + "grad_norm": 0.6918956552390313, + "learning_rate": 0.00019491198077400284, + "loss": 12.5875, + "step": 4735 + }, + { + "epoch": 0.2578941278171435, + "grad_norm": 0.8093985749932328, + "learning_rate": 0.00019490920340134416, + "loss": 12.691, + "step": 4736 + }, + { + "epoch": 0.2579485818137265, + "grad_norm": 0.7667091966816839, + "learning_rate": 0.00019490642529065556, + "loss": 12.6538, + "step": 4737 + }, + { + "epoch": 0.2580030358103095, + "grad_norm": 0.8050816124530826, + "learning_rate": 0.00019490364644195873, + "loss": 12.6441, + "step": 4738 + }, + { + "epoch": 0.2580574898068925, + "grad_norm": 0.7796984412669973, + "learning_rate": 0.0001949008668552752, + "loss": 12.4625, + "step": 4739 + }, + { + "epoch": 0.2581119438034755, + "grad_norm": 0.7954269326095742, + "learning_rate": 0.00019489808653062662, + "loss": 12.6378, + "step": 4740 + }, + { + "epoch": 0.2581663978000585, + "grad_norm": 0.8546297428341599, + "learning_rate": 0.0001948953054680346, + "loss": 12.6656, + "step": 4741 + }, + { + "epoch": 0.25822085179664156, + "grad_norm": 0.6836882241215804, + "learning_rate": 0.0001948925236675208, + "loss": 12.6027, + "step": 4742 + }, + { + "epoch": 0.25827530579322455, + "grad_norm": 0.775457407414149, + "learning_rate": 0.00019488974112910677, + "loss": 12.6133, + "step": 4743 + }, + { + "epoch": 0.2583297597898076, + "grad_norm": 0.6811598006775803, + "learning_rate": 0.00019488695785281425, + "loss": 12.6013, + "step": 4744 + }, + { + "epoch": 0.2583842137863906, + "grad_norm": 0.8090432005026313, + "learning_rate": 0.00019488417383866483, + "loss": 12.7681, + "step": 4745 + }, + { + "epoch": 0.2584386677829736, + "grad_norm": 0.9336443651864992, + "learning_rate": 0.00019488138908668013, + "loss": 12.7671, + "step": 4746 + }, + { + "epoch": 0.25849312177955663, + "grad_norm": 0.6696001900123343, + "learning_rate": 0.00019487860359688184, + "loss": 12.6525, + "step": 4747 + }, + { + "epoch": 0.2585475757761396, + "grad_norm": 0.7765604683795253, + "learning_rate": 0.00019487581736929164, + "loss": 12.563, + "step": 4748 + }, + { + "epoch": 0.2586020297727226, + "grad_norm": 0.716271800511286, + "learning_rate": 0.00019487303040393114, + "loss": 12.6551, + "step": 4749 + }, + { + "epoch": 0.25865648376930567, + "grad_norm": 0.6616733783918025, + "learning_rate": 0.00019487024270082207, + "loss": 12.5925, + "step": 4750 + }, + { + "epoch": 0.25871093776588866, + "grad_norm": 0.7618926859107245, + "learning_rate": 0.0001948674542599861, + "loss": 12.6893, + "step": 4751 + }, + { + "epoch": 0.25876539176247165, + "grad_norm": 0.747910560094233, + "learning_rate": 0.00019486466508144488, + "loss": 12.6486, + "step": 4752 + }, + { + "epoch": 0.2588198457590547, + "grad_norm": 0.8001813888712213, + "learning_rate": 0.0001948618751652201, + "loss": 12.5778, + "step": 4753 + }, + { + "epoch": 0.2588742997556377, + "grad_norm": 0.7698063187849554, + "learning_rate": 0.00019485908451133348, + "loss": 12.577, + "step": 4754 + }, + { + "epoch": 0.2589287537522207, + "grad_norm": 0.7181274213017971, + "learning_rate": 0.0001948562931198067, + "loss": 12.5075, + "step": 4755 + }, + { + "epoch": 0.25898320774880373, + "grad_norm": 0.815996907375043, + "learning_rate": 0.00019485350099066154, + "loss": 12.7174, + "step": 4756 + }, + { + "epoch": 0.2590376617453867, + "grad_norm": 0.8277762565865211, + "learning_rate": 0.00019485070812391957, + "loss": 12.6259, + "step": 4757 + }, + { + "epoch": 0.2590921157419697, + "grad_norm": 0.8255336053710628, + "learning_rate": 0.00019484791451960262, + "loss": 12.5534, + "step": 4758 + }, + { + "epoch": 0.25914656973855277, + "grad_norm": 0.8046986799888131, + "learning_rate": 0.00019484512017773237, + "loss": 12.5145, + "step": 4759 + }, + { + "epoch": 0.25920102373513576, + "grad_norm": 0.7738111831193657, + "learning_rate": 0.00019484232509833058, + "loss": 12.663, + "step": 4760 + }, + { + "epoch": 0.25925547773171875, + "grad_norm": 0.7868122530602073, + "learning_rate": 0.00019483952928141894, + "loss": 12.5728, + "step": 4761 + }, + { + "epoch": 0.2593099317283018, + "grad_norm": 0.8252109638843789, + "learning_rate": 0.00019483673272701927, + "loss": 12.6221, + "step": 4762 + }, + { + "epoch": 0.2593643857248848, + "grad_norm": 0.752918560811296, + "learning_rate": 0.00019483393543515322, + "loss": 12.5168, + "step": 4763 + }, + { + "epoch": 0.2594188397214678, + "grad_norm": 0.6652628886031119, + "learning_rate": 0.00019483113740584256, + "loss": 12.5479, + "step": 4764 + }, + { + "epoch": 0.25947329371805083, + "grad_norm": 0.8406383840699488, + "learning_rate": 0.0001948283386391091, + "loss": 12.5782, + "step": 4765 + }, + { + "epoch": 0.2595277477146338, + "grad_norm": 0.6806302328159711, + "learning_rate": 0.00019482553913497457, + "loss": 12.5683, + "step": 4766 + }, + { + "epoch": 0.2595822017112168, + "grad_norm": 0.6847797554348255, + "learning_rate": 0.00019482273889346075, + "loss": 12.4412, + "step": 4767 + }, + { + "epoch": 0.25963665570779987, + "grad_norm": 0.7825591587553425, + "learning_rate": 0.0001948199379145894, + "loss": 12.5548, + "step": 4768 + }, + { + "epoch": 0.25969110970438286, + "grad_norm": 0.6803581758239637, + "learning_rate": 0.00019481713619838234, + "loss": 12.6537, + "step": 4769 + }, + { + "epoch": 0.25974556370096585, + "grad_norm": 0.8865961981147894, + "learning_rate": 0.0001948143337448613, + "loss": 12.716, + "step": 4770 + }, + { + "epoch": 0.2598000176975489, + "grad_norm": 0.8012560988317154, + "learning_rate": 0.0001948115305540481, + "loss": 12.6787, + "step": 4771 + }, + { + "epoch": 0.2598544716941319, + "grad_norm": 0.7402747722806818, + "learning_rate": 0.00019480872662596457, + "loss": 12.5813, + "step": 4772 + }, + { + "epoch": 0.2599089256907149, + "grad_norm": 0.7862461301402442, + "learning_rate": 0.00019480592196063245, + "loss": 12.6447, + "step": 4773 + }, + { + "epoch": 0.25996337968729794, + "grad_norm": 0.6896585549357185, + "learning_rate": 0.0001948031165580736, + "loss": 12.6612, + "step": 4774 + }, + { + "epoch": 0.26001783368388093, + "grad_norm": 0.7165201707189586, + "learning_rate": 0.0001948003104183098, + "loss": 12.6166, + "step": 4775 + }, + { + "epoch": 0.2600722876804639, + "grad_norm": 0.772230066475727, + "learning_rate": 0.0001947975035413629, + "loss": 12.5939, + "step": 4776 + }, + { + "epoch": 0.26012674167704697, + "grad_norm": 0.6933004562202322, + "learning_rate": 0.0001947946959272547, + "loss": 12.6642, + "step": 4777 + }, + { + "epoch": 0.26018119567362996, + "grad_norm": 0.7647195199286283, + "learning_rate": 0.0001947918875760071, + "loss": 12.5439, + "step": 4778 + }, + { + "epoch": 0.260235649670213, + "grad_norm": 0.7889058775371521, + "learning_rate": 0.00019478907848764182, + "loss": 12.7009, + "step": 4779 + }, + { + "epoch": 0.260290103666796, + "grad_norm": 0.7187506723561059, + "learning_rate": 0.0001947862686621808, + "loss": 12.5066, + "step": 4780 + }, + { + "epoch": 0.260344557663379, + "grad_norm": 0.6356252354910998, + "learning_rate": 0.00019478345809964583, + "loss": 12.4976, + "step": 4781 + }, + { + "epoch": 0.26039901165996204, + "grad_norm": 0.8288372345791144, + "learning_rate": 0.00019478064680005885, + "loss": 12.6177, + "step": 4782 + }, + { + "epoch": 0.26045346565654504, + "grad_norm": 0.7627296418926722, + "learning_rate": 0.00019477783476344162, + "loss": 12.6029, + "step": 4783 + }, + { + "epoch": 0.26050791965312803, + "grad_norm": 0.6565336081551778, + "learning_rate": 0.0001947750219898161, + "loss": 12.4271, + "step": 4784 + }, + { + "epoch": 0.2605623736497111, + "grad_norm": 0.6840268317999146, + "learning_rate": 0.00019477220847920405, + "loss": 12.573, + "step": 4785 + }, + { + "epoch": 0.26061682764629407, + "grad_norm": 0.728386555080175, + "learning_rate": 0.00019476939423162745, + "loss": 12.696, + "step": 4786 + }, + { + "epoch": 0.26067128164287706, + "grad_norm": 0.7978933441928162, + "learning_rate": 0.00019476657924710815, + "loss": 12.5638, + "step": 4787 + }, + { + "epoch": 0.2607257356394601, + "grad_norm": 0.8473115840042355, + "learning_rate": 0.00019476376352566804, + "loss": 12.4955, + "step": 4788 + }, + { + "epoch": 0.2607801896360431, + "grad_norm": 0.7401753572573325, + "learning_rate": 0.000194760947067329, + "loss": 12.6471, + "step": 4789 + }, + { + "epoch": 0.2608346436326261, + "grad_norm": 0.7178835390453441, + "learning_rate": 0.00019475812987211294, + "loss": 12.4338, + "step": 4790 + }, + { + "epoch": 0.26088909762920914, + "grad_norm": 0.8262283811762116, + "learning_rate": 0.00019475531194004176, + "loss": 12.7366, + "step": 4791 + }, + { + "epoch": 0.26094355162579214, + "grad_norm": 0.6845621603784252, + "learning_rate": 0.00019475249327113742, + "loss": 12.6441, + "step": 4792 + }, + { + "epoch": 0.26099800562237513, + "grad_norm": 0.7739572980317966, + "learning_rate": 0.0001947496738654218, + "loss": 12.493, + "step": 4793 + }, + { + "epoch": 0.2610524596189582, + "grad_norm": 0.6352423477991079, + "learning_rate": 0.0001947468537229168, + "loss": 12.5679, + "step": 4794 + }, + { + "epoch": 0.26110691361554117, + "grad_norm": 0.6745795145399036, + "learning_rate": 0.0001947440328436444, + "loss": 12.5574, + "step": 4795 + }, + { + "epoch": 0.26116136761212416, + "grad_norm": 0.6633490884387779, + "learning_rate": 0.0001947412112276265, + "loss": 12.5556, + "step": 4796 + }, + { + "epoch": 0.2612158216087072, + "grad_norm": 0.6370335086578439, + "learning_rate": 0.00019473838887488506, + "loss": 12.4562, + "step": 4797 + }, + { + "epoch": 0.2612702756052902, + "grad_norm": 0.6154624229446888, + "learning_rate": 0.00019473556578544201, + "loss": 12.4528, + "step": 4798 + }, + { + "epoch": 0.2613247296018732, + "grad_norm": 0.7574987473958148, + "learning_rate": 0.00019473274195931932, + "loss": 12.5948, + "step": 4799 + }, + { + "epoch": 0.26137918359845624, + "grad_norm": 0.660503906328179, + "learning_rate": 0.00019472991739653893, + "loss": 12.5449, + "step": 4800 + }, + { + "epoch": 0.26143363759503924, + "grad_norm": 0.680312990012292, + "learning_rate": 0.00019472709209712282, + "loss": 12.599, + "step": 4801 + }, + { + "epoch": 0.26148809159162223, + "grad_norm": 0.6525863345468783, + "learning_rate": 0.00019472426606109299, + "loss": 12.5575, + "step": 4802 + }, + { + "epoch": 0.2615425455882053, + "grad_norm": 0.719565202973381, + "learning_rate": 0.00019472143928847134, + "loss": 12.6364, + "step": 4803 + }, + { + "epoch": 0.26159699958478827, + "grad_norm": 0.6854658367976487, + "learning_rate": 0.0001947186117792799, + "loss": 12.6155, + "step": 4804 + }, + { + "epoch": 0.26165145358137126, + "grad_norm": 0.6462389765626788, + "learning_rate": 0.00019471578353354066, + "loss": 12.4988, + "step": 4805 + }, + { + "epoch": 0.2617059075779543, + "grad_norm": 0.7059880398408249, + "learning_rate": 0.0001947129545512756, + "loss": 12.6068, + "step": 4806 + }, + { + "epoch": 0.2617603615745373, + "grad_norm": 0.6738481299117473, + "learning_rate": 0.00019471012483250673, + "loss": 12.6121, + "step": 4807 + }, + { + "epoch": 0.2618148155711203, + "grad_norm": 0.7047780449464357, + "learning_rate": 0.00019470729437725604, + "loss": 12.5718, + "step": 4808 + }, + { + "epoch": 0.26186926956770334, + "grad_norm": 0.6574646377296559, + "learning_rate": 0.00019470446318554553, + "loss": 12.5995, + "step": 4809 + }, + { + "epoch": 0.26192372356428634, + "grad_norm": 0.9005206763366099, + "learning_rate": 0.00019470163125739727, + "loss": 12.6679, + "step": 4810 + }, + { + "epoch": 0.2619781775608694, + "grad_norm": 0.7396838483648821, + "learning_rate": 0.0001946987985928332, + "loss": 12.5763, + "step": 4811 + }, + { + "epoch": 0.2620326315574524, + "grad_norm": 0.7708475776756964, + "learning_rate": 0.00019469596519187542, + "loss": 12.6198, + "step": 4812 + }, + { + "epoch": 0.26208708555403537, + "grad_norm": 0.7720119216448281, + "learning_rate": 0.00019469313105454595, + "loss": 12.6241, + "step": 4813 + }, + { + "epoch": 0.2621415395506184, + "grad_norm": 0.7334360755883025, + "learning_rate": 0.00019469029618086677, + "loss": 12.6099, + "step": 4814 + }, + { + "epoch": 0.2621959935472014, + "grad_norm": 0.7044540805944692, + "learning_rate": 0.00019468746057086002, + "loss": 12.5668, + "step": 4815 + }, + { + "epoch": 0.2622504475437844, + "grad_norm": 0.7207583424255852, + "learning_rate": 0.00019468462422454766, + "loss": 12.6091, + "step": 4816 + }, + { + "epoch": 0.26230490154036745, + "grad_norm": 0.7101695408830705, + "learning_rate": 0.00019468178714195179, + "loss": 12.6525, + "step": 4817 + }, + { + "epoch": 0.26235935553695044, + "grad_norm": 0.6242627094997542, + "learning_rate": 0.00019467894932309444, + "loss": 12.5819, + "step": 4818 + }, + { + "epoch": 0.26241380953353344, + "grad_norm": 0.757938269214848, + "learning_rate": 0.00019467611076799774, + "loss": 12.6262, + "step": 4819 + }, + { + "epoch": 0.2624682635301165, + "grad_norm": 0.6433189196932046, + "learning_rate": 0.00019467327147668371, + "loss": 12.502, + "step": 4820 + }, + { + "epoch": 0.2625227175266995, + "grad_norm": 0.6852794377013726, + "learning_rate": 0.00019467043144917443, + "loss": 12.6259, + "step": 4821 + }, + { + "epoch": 0.26257717152328247, + "grad_norm": 0.7833380402320976, + "learning_rate": 0.000194667590685492, + "loss": 12.6435, + "step": 4822 + }, + { + "epoch": 0.2626316255198655, + "grad_norm": 0.6945606666550747, + "learning_rate": 0.00019466474918565854, + "loss": 12.4367, + "step": 4823 + }, + { + "epoch": 0.2626860795164485, + "grad_norm": 0.7123898769033836, + "learning_rate": 0.00019466190694969612, + "loss": 12.509, + "step": 4824 + }, + { + "epoch": 0.2627405335130315, + "grad_norm": 0.7654131241421731, + "learning_rate": 0.00019465906397762682, + "loss": 12.495, + "step": 4825 + }, + { + "epoch": 0.26279498750961455, + "grad_norm": 0.8301231959722061, + "learning_rate": 0.00019465622026947275, + "loss": 12.757, + "step": 4826 + }, + { + "epoch": 0.26284944150619755, + "grad_norm": 0.6288940118080169, + "learning_rate": 0.00019465337582525604, + "loss": 12.4903, + "step": 4827 + }, + { + "epoch": 0.26290389550278054, + "grad_norm": 0.7417776345417484, + "learning_rate": 0.0001946505306449988, + "loss": 12.5361, + "step": 4828 + }, + { + "epoch": 0.2629583494993636, + "grad_norm": 0.7752400517739513, + "learning_rate": 0.00019464768472872318, + "loss": 12.6089, + "step": 4829 + }, + { + "epoch": 0.2630128034959466, + "grad_norm": 0.5998676948980813, + "learning_rate": 0.00019464483807645128, + "loss": 12.4756, + "step": 4830 + }, + { + "epoch": 0.26306725749252957, + "grad_norm": 0.7658214146870006, + "learning_rate": 0.00019464199068820528, + "loss": 12.6996, + "step": 4831 + }, + { + "epoch": 0.2631217114891126, + "grad_norm": 0.72957856770116, + "learning_rate": 0.00019463914256400723, + "loss": 12.4247, + "step": 4832 + }, + { + "epoch": 0.2631761654856956, + "grad_norm": 0.7943362289892251, + "learning_rate": 0.0001946362937038794, + "loss": 12.6922, + "step": 4833 + }, + { + "epoch": 0.2632306194822786, + "grad_norm": 0.6523756773773719, + "learning_rate": 0.00019463344410784383, + "loss": 12.4808, + "step": 4834 + }, + { + "epoch": 0.26328507347886165, + "grad_norm": 0.6861067893129501, + "learning_rate": 0.00019463059377592274, + "loss": 12.5381, + "step": 4835 + }, + { + "epoch": 0.26333952747544465, + "grad_norm": 0.6409161291059233, + "learning_rate": 0.0001946277427081383, + "loss": 12.5751, + "step": 4836 + }, + { + "epoch": 0.26339398147202764, + "grad_norm": 0.8461938435521417, + "learning_rate": 0.00019462489090451266, + "loss": 12.6144, + "step": 4837 + }, + { + "epoch": 0.2634484354686107, + "grad_norm": 0.6873872509981911, + "learning_rate": 0.000194622038365068, + "loss": 12.6681, + "step": 4838 + }, + { + "epoch": 0.2635028894651937, + "grad_norm": 0.809696924328214, + "learning_rate": 0.00019461918508982646, + "loss": 12.5898, + "step": 4839 + }, + { + "epoch": 0.26355734346177667, + "grad_norm": 0.7459462348725406, + "learning_rate": 0.00019461633107881033, + "loss": 12.7425, + "step": 4840 + }, + { + "epoch": 0.2636117974583597, + "grad_norm": 0.6404604693639886, + "learning_rate": 0.0001946134763320417, + "loss": 12.5621, + "step": 4841 + }, + { + "epoch": 0.2636662514549427, + "grad_norm": 0.6782170515842123, + "learning_rate": 0.00019461062084954285, + "loss": 12.5363, + "step": 4842 + }, + { + "epoch": 0.2637207054515257, + "grad_norm": 0.7028036018440581, + "learning_rate": 0.0001946077646313359, + "loss": 12.5666, + "step": 4843 + }, + { + "epoch": 0.26377515944810875, + "grad_norm": 0.6826374646724563, + "learning_rate": 0.00019460490767744313, + "loss": 12.6038, + "step": 4844 + }, + { + "epoch": 0.26382961344469175, + "grad_norm": 0.6777141901476166, + "learning_rate": 0.00019460204998788673, + "loss": 12.5829, + "step": 4845 + }, + { + "epoch": 0.2638840674412748, + "grad_norm": 0.6738235373260646, + "learning_rate": 0.00019459919156268894, + "loss": 12.5814, + "step": 4846 + }, + { + "epoch": 0.2639385214378578, + "grad_norm": 0.6708886000655676, + "learning_rate": 0.00019459633240187193, + "loss": 12.6222, + "step": 4847 + }, + { + "epoch": 0.2639929754344408, + "grad_norm": 0.7305811832108289, + "learning_rate": 0.00019459347250545803, + "loss": 12.5633, + "step": 4848 + }, + { + "epoch": 0.26404742943102383, + "grad_norm": 0.7260244255550706, + "learning_rate": 0.00019459061187346942, + "loss": 12.5623, + "step": 4849 + }, + { + "epoch": 0.2641018834276068, + "grad_norm": 0.6852163751677094, + "learning_rate": 0.0001945877505059283, + "loss": 12.5025, + "step": 4850 + }, + { + "epoch": 0.2641563374241898, + "grad_norm": 0.6700399729180287, + "learning_rate": 0.000194584888402857, + "loss": 12.5428, + "step": 4851 + }, + { + "epoch": 0.26421079142077286, + "grad_norm": 0.7012076478375378, + "learning_rate": 0.00019458202556427775, + "loss": 12.6202, + "step": 4852 + }, + { + "epoch": 0.26426524541735585, + "grad_norm": 0.767091545667911, + "learning_rate": 0.0001945791619902128, + "loss": 12.4397, + "step": 4853 + }, + { + "epoch": 0.26431969941393885, + "grad_norm": 0.7176345157155276, + "learning_rate": 0.00019457629768068443, + "loss": 12.5838, + "step": 4854 + }, + { + "epoch": 0.2643741534105219, + "grad_norm": 0.7077784155956962, + "learning_rate": 0.0001945734326357149, + "loss": 12.4677, + "step": 4855 + }, + { + "epoch": 0.2644286074071049, + "grad_norm": 0.6837911365687319, + "learning_rate": 0.00019457056685532652, + "loss": 12.492, + "step": 4856 + }, + { + "epoch": 0.2644830614036879, + "grad_norm": 0.7466557995988053, + "learning_rate": 0.0001945677003395415, + "loss": 12.5368, + "step": 4857 + }, + { + "epoch": 0.26453751540027093, + "grad_norm": 0.7541821594245927, + "learning_rate": 0.00019456483308838226, + "loss": 12.7146, + "step": 4858 + }, + { + "epoch": 0.2645919693968539, + "grad_norm": 0.7277307319226684, + "learning_rate": 0.00019456196510187095, + "loss": 12.548, + "step": 4859 + }, + { + "epoch": 0.2646464233934369, + "grad_norm": 0.750180857479281, + "learning_rate": 0.00019455909638002998, + "loss": 12.6692, + "step": 4860 + }, + { + "epoch": 0.26470087739001996, + "grad_norm": 0.6882113934021395, + "learning_rate": 0.0001945562269228816, + "loss": 12.5586, + "step": 4861 + }, + { + "epoch": 0.26475533138660295, + "grad_norm": 0.6876200797232995, + "learning_rate": 0.00019455335673044814, + "loss": 12.6751, + "step": 4862 + }, + { + "epoch": 0.26480978538318595, + "grad_norm": 0.7413062407404183, + "learning_rate": 0.00019455048580275193, + "loss": 12.56, + "step": 4863 + }, + { + "epoch": 0.264864239379769, + "grad_norm": 0.7997680346304358, + "learning_rate": 0.0001945476141398153, + "loss": 12.5148, + "step": 4864 + }, + { + "epoch": 0.264918693376352, + "grad_norm": 0.7037783550914863, + "learning_rate": 0.00019454474174166055, + "loss": 12.4402, + "step": 4865 + }, + { + "epoch": 0.264973147372935, + "grad_norm": 0.719253456465287, + "learning_rate": 0.00019454186860831004, + "loss": 12.526, + "step": 4866 + }, + { + "epoch": 0.26502760136951803, + "grad_norm": 0.7821364793198318, + "learning_rate": 0.0001945389947397861, + "loss": 12.6374, + "step": 4867 + }, + { + "epoch": 0.265082055366101, + "grad_norm": 0.8169352730341746, + "learning_rate": 0.0001945361201361111, + "loss": 12.6756, + "step": 4868 + }, + { + "epoch": 0.265136509362684, + "grad_norm": 0.6669004440259494, + "learning_rate": 0.00019453324479730736, + "loss": 12.512, + "step": 4869 + }, + { + "epoch": 0.26519096335926706, + "grad_norm": 0.6948054201853392, + "learning_rate": 0.00019453036872339727, + "loss": 12.6744, + "step": 4870 + }, + { + "epoch": 0.26524541735585005, + "grad_norm": 0.6595901733830262, + "learning_rate": 0.00019452749191440315, + "loss": 12.6445, + "step": 4871 + }, + { + "epoch": 0.26529987135243305, + "grad_norm": 0.7250972141249324, + "learning_rate": 0.00019452461437034744, + "loss": 12.6358, + "step": 4872 + }, + { + "epoch": 0.2653543253490161, + "grad_norm": 0.8354176409927749, + "learning_rate": 0.00019452173609125245, + "loss": 12.6428, + "step": 4873 + }, + { + "epoch": 0.2654087793455991, + "grad_norm": 0.6850492700866151, + "learning_rate": 0.0001945188570771406, + "loss": 12.565, + "step": 4874 + }, + { + "epoch": 0.2654632333421821, + "grad_norm": 0.7474772489873514, + "learning_rate": 0.00019451597732803426, + "loss": 12.4716, + "step": 4875 + }, + { + "epoch": 0.26551768733876513, + "grad_norm": 0.7239459348388512, + "learning_rate": 0.00019451309684395581, + "loss": 12.5, + "step": 4876 + }, + { + "epoch": 0.2655721413353481, + "grad_norm": 0.7013468148695667, + "learning_rate": 0.0001945102156249277, + "loss": 12.6859, + "step": 4877 + }, + { + "epoch": 0.26562659533193117, + "grad_norm": 0.6306546397409388, + "learning_rate": 0.00019450733367097232, + "loss": 12.258, + "step": 4878 + }, + { + "epoch": 0.26568104932851416, + "grad_norm": 0.7732678904459518, + "learning_rate": 0.00019450445098211203, + "loss": 12.5905, + "step": 4879 + }, + { + "epoch": 0.26573550332509716, + "grad_norm": 0.7102571767926045, + "learning_rate": 0.00019450156755836928, + "loss": 12.5944, + "step": 4880 + }, + { + "epoch": 0.2657899573216802, + "grad_norm": 0.7428771349602378, + "learning_rate": 0.0001944986833997665, + "loss": 12.7083, + "step": 4881 + }, + { + "epoch": 0.2658444113182632, + "grad_norm": 0.6644542846070517, + "learning_rate": 0.0001944957985063261, + "loss": 12.5964, + "step": 4882 + }, + { + "epoch": 0.2658988653148462, + "grad_norm": 0.6076981577547662, + "learning_rate": 0.00019449291287807055, + "loss": 12.5323, + "step": 4883 + }, + { + "epoch": 0.26595331931142924, + "grad_norm": 0.6774321021161596, + "learning_rate": 0.00019449002651502224, + "loss": 12.628, + "step": 4884 + }, + { + "epoch": 0.26600777330801223, + "grad_norm": 0.6469750254177549, + "learning_rate": 0.00019448713941720364, + "loss": 12.5591, + "step": 4885 + }, + { + "epoch": 0.2660622273045952, + "grad_norm": 0.6478552446275883, + "learning_rate": 0.00019448425158463724, + "loss": 12.5149, + "step": 4886 + }, + { + "epoch": 0.26611668130117827, + "grad_norm": 0.6353492916660458, + "learning_rate": 0.0001944813630173454, + "loss": 12.6437, + "step": 4887 + }, + { + "epoch": 0.26617113529776126, + "grad_norm": 0.7929604305168886, + "learning_rate": 0.00019447847371535066, + "loss": 12.6283, + "step": 4888 + }, + { + "epoch": 0.26622558929434426, + "grad_norm": 0.6898416589120709, + "learning_rate": 0.00019447558367867543, + "loss": 12.5089, + "step": 4889 + }, + { + "epoch": 0.2662800432909273, + "grad_norm": 0.7295361652237429, + "learning_rate": 0.0001944726929073423, + "loss": 12.5732, + "step": 4890 + }, + { + "epoch": 0.2663344972875103, + "grad_norm": 0.7027949921759142, + "learning_rate": 0.00019446980140137358, + "loss": 12.6327, + "step": 4891 + }, + { + "epoch": 0.2663889512840933, + "grad_norm": 0.7035390676555576, + "learning_rate": 0.0001944669091607919, + "loss": 12.6155, + "step": 4892 + }, + { + "epoch": 0.26644340528067634, + "grad_norm": 0.6927777072601664, + "learning_rate": 0.00019446401618561967, + "loss": 12.6609, + "step": 4893 + }, + { + "epoch": 0.26649785927725933, + "grad_norm": 0.7113014488080182, + "learning_rate": 0.0001944611224758794, + "loss": 12.6829, + "step": 4894 + }, + { + "epoch": 0.2665523132738423, + "grad_norm": 0.6937576303435324, + "learning_rate": 0.00019445822803159358, + "loss": 12.4869, + "step": 4895 + }, + { + "epoch": 0.26660676727042537, + "grad_norm": 0.7508896257482275, + "learning_rate": 0.00019445533285278478, + "loss": 12.6736, + "step": 4896 + }, + { + "epoch": 0.26666122126700836, + "grad_norm": 0.6664214127305361, + "learning_rate": 0.00019445243693947547, + "loss": 12.7157, + "step": 4897 + }, + { + "epoch": 0.26671567526359136, + "grad_norm": 0.7160016567620926, + "learning_rate": 0.00019444954029168815, + "loss": 12.5127, + "step": 4898 + }, + { + "epoch": 0.2667701292601744, + "grad_norm": 0.7095136875260772, + "learning_rate": 0.00019444664290944538, + "loss": 12.6951, + "step": 4899 + }, + { + "epoch": 0.2668245832567574, + "grad_norm": 0.7889749179258148, + "learning_rate": 0.00019444374479276968, + "loss": 12.5083, + "step": 4900 + }, + { + "epoch": 0.2668790372533404, + "grad_norm": 0.6209670074747341, + "learning_rate": 0.00019444084594168358, + "loss": 12.5154, + "step": 4901 + }, + { + "epoch": 0.26693349124992344, + "grad_norm": 0.7192247657918985, + "learning_rate": 0.0001944379463562096, + "loss": 12.6027, + "step": 4902 + }, + { + "epoch": 0.26698794524650643, + "grad_norm": 0.877614733642246, + "learning_rate": 0.00019443504603637032, + "loss": 12.4859, + "step": 4903 + }, + { + "epoch": 0.2670423992430894, + "grad_norm": 0.622920397255274, + "learning_rate": 0.0001944321449821883, + "loss": 12.5212, + "step": 4904 + }, + { + "epoch": 0.26709685323967247, + "grad_norm": 0.8231902555703173, + "learning_rate": 0.0001944292431936861, + "loss": 12.5708, + "step": 4905 + }, + { + "epoch": 0.26715130723625546, + "grad_norm": 0.7014509423459101, + "learning_rate": 0.00019442634067088623, + "loss": 12.6247, + "step": 4906 + }, + { + "epoch": 0.26720576123283846, + "grad_norm": 0.7338425811141995, + "learning_rate": 0.00019442343741381133, + "loss": 12.567, + "step": 4907 + }, + { + "epoch": 0.2672602152294215, + "grad_norm": 0.7128851004410109, + "learning_rate": 0.00019442053342248392, + "loss": 12.5589, + "step": 4908 + }, + { + "epoch": 0.2673146692260045, + "grad_norm": 0.7640141068258829, + "learning_rate": 0.00019441762869692664, + "loss": 12.5248, + "step": 4909 + }, + { + "epoch": 0.2673691232225875, + "grad_norm": 0.7195831536886108, + "learning_rate": 0.000194414723237162, + "loss": 12.6038, + "step": 4910 + }, + { + "epoch": 0.26742357721917054, + "grad_norm": 0.702981087273901, + "learning_rate": 0.00019441181704321267, + "loss": 12.6929, + "step": 4911 + }, + { + "epoch": 0.26747803121575353, + "grad_norm": 0.6649595768416755, + "learning_rate": 0.00019440891011510123, + "loss": 12.6238, + "step": 4912 + }, + { + "epoch": 0.2675324852123366, + "grad_norm": 0.7465051804702764, + "learning_rate": 0.00019440600245285023, + "loss": 12.5796, + "step": 4913 + }, + { + "epoch": 0.26758693920891957, + "grad_norm": 0.6771595729186441, + "learning_rate": 0.00019440309405648236, + "loss": 12.6006, + "step": 4914 + }, + { + "epoch": 0.26764139320550256, + "grad_norm": 0.7984906107431816, + "learning_rate": 0.0001944001849260202, + "loss": 12.5348, + "step": 4915 + }, + { + "epoch": 0.2676958472020856, + "grad_norm": 0.6987809489400444, + "learning_rate": 0.00019439727506148635, + "loss": 12.6028, + "step": 4916 + }, + { + "epoch": 0.2677503011986686, + "grad_norm": 0.7515636815166795, + "learning_rate": 0.00019439436446290346, + "loss": 12.5774, + "step": 4917 + }, + { + "epoch": 0.2678047551952516, + "grad_norm": 0.7772563152009838, + "learning_rate": 0.00019439145313029417, + "loss": 12.6186, + "step": 4918 + }, + { + "epoch": 0.26785920919183465, + "grad_norm": 0.7773179505401135, + "learning_rate": 0.00019438854106368112, + "loss": 12.6273, + "step": 4919 + }, + { + "epoch": 0.26791366318841764, + "grad_norm": 0.8029707125727654, + "learning_rate": 0.00019438562826308692, + "loss": 12.7813, + "step": 4920 + }, + { + "epoch": 0.26796811718500063, + "grad_norm": 0.6739494023869455, + "learning_rate": 0.00019438271472853427, + "loss": 12.5858, + "step": 4921 + }, + { + "epoch": 0.2680225711815837, + "grad_norm": 0.7172542758143176, + "learning_rate": 0.0001943798004600458, + "loss": 12.7093, + "step": 4922 + }, + { + "epoch": 0.2680770251781667, + "grad_norm": 0.8540214909937995, + "learning_rate": 0.00019437688545764417, + "loss": 12.6435, + "step": 4923 + }, + { + "epoch": 0.26813147917474967, + "grad_norm": 0.6922381771184539, + "learning_rate": 0.00019437396972135206, + "loss": 12.5729, + "step": 4924 + }, + { + "epoch": 0.2681859331713327, + "grad_norm": 0.7388596470211966, + "learning_rate": 0.00019437105325119212, + "loss": 12.6941, + "step": 4925 + }, + { + "epoch": 0.2682403871679157, + "grad_norm": 0.6799801483566477, + "learning_rate": 0.00019436813604718705, + "loss": 12.58, + "step": 4926 + }, + { + "epoch": 0.2682948411644987, + "grad_norm": 0.677525774257368, + "learning_rate": 0.00019436521810935954, + "loss": 12.3613, + "step": 4927 + }, + { + "epoch": 0.26834929516108175, + "grad_norm": 0.7070330522678878, + "learning_rate": 0.00019436229943773224, + "loss": 12.5587, + "step": 4928 + }, + { + "epoch": 0.26840374915766474, + "grad_norm": 0.7431247845470497, + "learning_rate": 0.0001943593800323279, + "loss": 12.7184, + "step": 4929 + }, + { + "epoch": 0.26845820315424773, + "grad_norm": 0.7448892262812907, + "learning_rate": 0.00019435645989316917, + "loss": 12.521, + "step": 4930 + }, + { + "epoch": 0.2685126571508308, + "grad_norm": 0.7563793589303949, + "learning_rate": 0.00019435353902027882, + "loss": 12.8593, + "step": 4931 + }, + { + "epoch": 0.2685671111474138, + "grad_norm": 0.6965002870763543, + "learning_rate": 0.00019435061741367952, + "loss": 12.5444, + "step": 4932 + }, + { + "epoch": 0.26862156514399677, + "grad_norm": 0.7845469164455278, + "learning_rate": 0.00019434769507339396, + "loss": 12.6502, + "step": 4933 + }, + { + "epoch": 0.2686760191405798, + "grad_norm": 0.6084132495946784, + "learning_rate": 0.00019434477199944494, + "loss": 12.6393, + "step": 4934 + }, + { + "epoch": 0.2687304731371628, + "grad_norm": 0.7270494062285637, + "learning_rate": 0.00019434184819185516, + "loss": 12.5558, + "step": 4935 + }, + { + "epoch": 0.2687849271337458, + "grad_norm": 0.7449005630000488, + "learning_rate": 0.0001943389236506473, + "loss": 12.6898, + "step": 4936 + }, + { + "epoch": 0.26883938113032885, + "grad_norm": 0.7542098282712374, + "learning_rate": 0.0001943359983758442, + "loss": 12.5602, + "step": 4937 + }, + { + "epoch": 0.26889383512691184, + "grad_norm": 0.6825357213113328, + "learning_rate": 0.00019433307236746853, + "loss": 12.6414, + "step": 4938 + }, + { + "epoch": 0.26894828912349483, + "grad_norm": 0.7596587455890367, + "learning_rate": 0.00019433014562554306, + "loss": 12.4791, + "step": 4939 + }, + { + "epoch": 0.2690027431200779, + "grad_norm": 0.7335019063446175, + "learning_rate": 0.00019432721815009057, + "loss": 12.5679, + "step": 4940 + }, + { + "epoch": 0.2690571971166609, + "grad_norm": 0.8002044902081036, + "learning_rate": 0.0001943242899411338, + "loss": 12.6291, + "step": 4941 + }, + { + "epoch": 0.26911165111324387, + "grad_norm": 0.7181184194287559, + "learning_rate": 0.00019432136099869555, + "loss": 12.5769, + "step": 4942 + }, + { + "epoch": 0.2691661051098269, + "grad_norm": 0.693469130365641, + "learning_rate": 0.0001943184313227986, + "loss": 12.6736, + "step": 4943 + }, + { + "epoch": 0.2692205591064099, + "grad_norm": 0.7883304347264259, + "learning_rate": 0.00019431550091346565, + "loss": 12.4797, + "step": 4944 + }, + { + "epoch": 0.26927501310299296, + "grad_norm": 0.7675592218688019, + "learning_rate": 0.0001943125697707196, + "loss": 12.599, + "step": 4945 + }, + { + "epoch": 0.26932946709957595, + "grad_norm": 0.721373700751928, + "learning_rate": 0.0001943096378945832, + "loss": 12.436, + "step": 4946 + }, + { + "epoch": 0.26938392109615894, + "grad_norm": 0.7432999919007355, + "learning_rate": 0.0001943067052850792, + "loss": 12.6031, + "step": 4947 + }, + { + "epoch": 0.269438375092742, + "grad_norm": 0.700930234427065, + "learning_rate": 0.00019430377194223043, + "loss": 12.6158, + "step": 4948 + }, + { + "epoch": 0.269492829089325, + "grad_norm": 0.8856859606553761, + "learning_rate": 0.00019430083786605977, + "loss": 12.6485, + "step": 4949 + }, + { + "epoch": 0.269547283085908, + "grad_norm": 0.7651445866542139, + "learning_rate": 0.00019429790305658994, + "loss": 12.5601, + "step": 4950 + }, + { + "epoch": 0.269601737082491, + "grad_norm": 0.8712471278900041, + "learning_rate": 0.00019429496751384383, + "loss": 12.5648, + "step": 4951 + }, + { + "epoch": 0.269656191079074, + "grad_norm": 0.836714647644876, + "learning_rate": 0.00019429203123784422, + "loss": 12.5715, + "step": 4952 + }, + { + "epoch": 0.269710645075657, + "grad_norm": 0.7545099563149713, + "learning_rate": 0.00019428909422861398, + "loss": 12.4512, + "step": 4953 + }, + { + "epoch": 0.26976509907224006, + "grad_norm": 0.7759249414188034, + "learning_rate": 0.0001942861564861759, + "loss": 12.576, + "step": 4954 + }, + { + "epoch": 0.26981955306882305, + "grad_norm": 0.8070255412765706, + "learning_rate": 0.0001942832180105529, + "loss": 12.4277, + "step": 4955 + }, + { + "epoch": 0.26987400706540604, + "grad_norm": 0.7411828713292871, + "learning_rate": 0.00019428027880176777, + "loss": 12.5935, + "step": 4956 + }, + { + "epoch": 0.2699284610619891, + "grad_norm": 0.8762418560538697, + "learning_rate": 0.00019427733885984337, + "loss": 12.7198, + "step": 4957 + }, + { + "epoch": 0.2699829150585721, + "grad_norm": 0.822480922374177, + "learning_rate": 0.00019427439818480257, + "loss": 12.5418, + "step": 4958 + }, + { + "epoch": 0.2700373690551551, + "grad_norm": 0.7559488805950916, + "learning_rate": 0.00019427145677666823, + "loss": 12.5301, + "step": 4959 + }, + { + "epoch": 0.2700918230517381, + "grad_norm": 0.7075578365647424, + "learning_rate": 0.00019426851463546325, + "loss": 12.5837, + "step": 4960 + }, + { + "epoch": 0.2701462770483211, + "grad_norm": 0.7078321812508837, + "learning_rate": 0.0001942655717612105, + "loss": 12.6233, + "step": 4961 + }, + { + "epoch": 0.2702007310449041, + "grad_norm": 0.7725000755982798, + "learning_rate": 0.00019426262815393284, + "loss": 12.4912, + "step": 4962 + }, + { + "epoch": 0.27025518504148716, + "grad_norm": 0.7809703620738555, + "learning_rate": 0.00019425968381365317, + "loss": 12.7238, + "step": 4963 + }, + { + "epoch": 0.27030963903807015, + "grad_norm": 0.7543002957619203, + "learning_rate": 0.0001942567387403944, + "loss": 12.5083, + "step": 4964 + }, + { + "epoch": 0.27036409303465314, + "grad_norm": 0.7969983235647131, + "learning_rate": 0.00019425379293417944, + "loss": 12.6048, + "step": 4965 + }, + { + "epoch": 0.2704185470312362, + "grad_norm": 0.7196112073402323, + "learning_rate": 0.00019425084639503116, + "loss": 12.5924, + "step": 4966 + }, + { + "epoch": 0.2704730010278192, + "grad_norm": 0.7257632197119582, + "learning_rate": 0.00019424789912297249, + "loss": 12.6349, + "step": 4967 + }, + { + "epoch": 0.2705274550244022, + "grad_norm": 0.9378435895655606, + "learning_rate": 0.00019424495111802637, + "loss": 12.6213, + "step": 4968 + }, + { + "epoch": 0.2705819090209852, + "grad_norm": 0.7084946448579053, + "learning_rate": 0.00019424200238021567, + "loss": 12.5467, + "step": 4969 + }, + { + "epoch": 0.2706363630175682, + "grad_norm": 0.7483908594182052, + "learning_rate": 0.0001942390529095634, + "loss": 12.6569, + "step": 4970 + }, + { + "epoch": 0.2706908170141512, + "grad_norm": 0.7118408982414774, + "learning_rate": 0.00019423610270609244, + "loss": 12.5747, + "step": 4971 + }, + { + "epoch": 0.27074527101073426, + "grad_norm": 0.68660993044931, + "learning_rate": 0.0001942331517698257, + "loss": 12.4705, + "step": 4972 + }, + { + "epoch": 0.27079972500731725, + "grad_norm": 0.7802996073573238, + "learning_rate": 0.00019423020010078622, + "loss": 12.5696, + "step": 4973 + }, + { + "epoch": 0.27085417900390024, + "grad_norm": 0.674993499837841, + "learning_rate": 0.00019422724769899686, + "loss": 12.6346, + "step": 4974 + }, + { + "epoch": 0.2709086330004833, + "grad_norm": 0.9356688643207316, + "learning_rate": 0.00019422429456448064, + "loss": 12.8044, + "step": 4975 + }, + { + "epoch": 0.2709630869970663, + "grad_norm": 0.7621397899824885, + "learning_rate": 0.00019422134069726053, + "loss": 12.571, + "step": 4976 + }, + { + "epoch": 0.2710175409936493, + "grad_norm": 0.6867695978399014, + "learning_rate": 0.00019421838609735942, + "loss": 12.5917, + "step": 4977 + }, + { + "epoch": 0.2710719949902323, + "grad_norm": 0.8231682604535635, + "learning_rate": 0.0001942154307648004, + "loss": 12.5752, + "step": 4978 + }, + { + "epoch": 0.2711264489868153, + "grad_norm": 0.8111827291656801, + "learning_rate": 0.00019421247469960634, + "loss": 12.6336, + "step": 4979 + }, + { + "epoch": 0.27118090298339836, + "grad_norm": 1.042410249315156, + "learning_rate": 0.00019420951790180029, + "loss": 12.6439, + "step": 4980 + }, + { + "epoch": 0.27123535697998136, + "grad_norm": 0.7076263354008396, + "learning_rate": 0.00019420656037140525, + "loss": 12.6046, + "step": 4981 + }, + { + "epoch": 0.27128981097656435, + "grad_norm": 0.6515365376827323, + "learning_rate": 0.00019420360210844418, + "loss": 12.5624, + "step": 4982 + }, + { + "epoch": 0.2713442649731474, + "grad_norm": 0.7022423293026825, + "learning_rate": 0.0001942006431129401, + "loss": 12.5117, + "step": 4983 + }, + { + "epoch": 0.2713987189697304, + "grad_norm": 0.7954526938737386, + "learning_rate": 0.00019419768338491605, + "loss": 12.7401, + "step": 4984 + }, + { + "epoch": 0.2714531729663134, + "grad_norm": 0.8688496563455215, + "learning_rate": 0.00019419472292439498, + "loss": 12.7052, + "step": 4985 + }, + { + "epoch": 0.27150762696289643, + "grad_norm": 0.6874496497060338, + "learning_rate": 0.00019419176173139996, + "loss": 12.653, + "step": 4986 + }, + { + "epoch": 0.2715620809594794, + "grad_norm": 0.7350358087124989, + "learning_rate": 0.000194188799805954, + "loss": 12.7017, + "step": 4987 + }, + { + "epoch": 0.2716165349560624, + "grad_norm": 0.7917394937965483, + "learning_rate": 0.00019418583714808017, + "loss": 12.5159, + "step": 4988 + }, + { + "epoch": 0.27167098895264546, + "grad_norm": 0.7488765240242293, + "learning_rate": 0.00019418287375780146, + "loss": 12.6222, + "step": 4989 + }, + { + "epoch": 0.27172544294922846, + "grad_norm": 0.7146616561206384, + "learning_rate": 0.00019417990963514086, + "loss": 12.4543, + "step": 4990 + }, + { + "epoch": 0.27177989694581145, + "grad_norm": 0.7287116095576452, + "learning_rate": 0.00019417694478012157, + "loss": 12.5467, + "step": 4991 + }, + { + "epoch": 0.2718343509423945, + "grad_norm": 0.8550421741239044, + "learning_rate": 0.00019417397919276654, + "loss": 12.6141, + "step": 4992 + }, + { + "epoch": 0.2718888049389775, + "grad_norm": 0.8027759839067112, + "learning_rate": 0.00019417101287309886, + "loss": 12.6347, + "step": 4993 + }, + { + "epoch": 0.2719432589355605, + "grad_norm": 0.7161657765497795, + "learning_rate": 0.00019416804582114157, + "loss": 12.4361, + "step": 4994 + }, + { + "epoch": 0.27199771293214353, + "grad_norm": 0.7008462549080949, + "learning_rate": 0.0001941650780369178, + "loss": 12.6173, + "step": 4995 + }, + { + "epoch": 0.2720521669287265, + "grad_norm": 0.7668462501840719, + "learning_rate": 0.00019416210952045057, + "loss": 12.5077, + "step": 4996 + }, + { + "epoch": 0.2721066209253095, + "grad_norm": 0.7308418987478974, + "learning_rate": 0.000194159140271763, + "loss": 12.5051, + "step": 4997 + }, + { + "epoch": 0.27216107492189257, + "grad_norm": 0.7483771822126182, + "learning_rate": 0.00019415617029087815, + "loss": 12.7825, + "step": 4998 + }, + { + "epoch": 0.27221552891847556, + "grad_norm": 0.6782389168304814, + "learning_rate": 0.00019415319957781914, + "loss": 12.5248, + "step": 4999 + }, + { + "epoch": 0.27226998291505855, + "grad_norm": 0.7489393248917106, + "learning_rate": 0.00019415022813260903, + "loss": 12.7125, + "step": 5000 + }, + { + "epoch": 0.2723244369116416, + "grad_norm": 0.679800953165827, + "learning_rate": 0.000194147255955271, + "loss": 12.6751, + "step": 5001 + }, + { + "epoch": 0.2723788909082246, + "grad_norm": 0.7888451496769374, + "learning_rate": 0.0001941442830458281, + "loss": 12.662, + "step": 5002 + }, + { + "epoch": 0.2724333449048076, + "grad_norm": 0.7325560605775505, + "learning_rate": 0.00019414130940430347, + "loss": 12.7182, + "step": 5003 + }, + { + "epoch": 0.27248779890139063, + "grad_norm": 0.7237907170438983, + "learning_rate": 0.0001941383350307202, + "loss": 12.4911, + "step": 5004 + }, + { + "epoch": 0.2725422528979736, + "grad_norm": 0.6903596227181271, + "learning_rate": 0.0001941353599251015, + "loss": 12.4553, + "step": 5005 + }, + { + "epoch": 0.2725967068945566, + "grad_norm": 0.8111728486824789, + "learning_rate": 0.00019413238408747042, + "loss": 12.6992, + "step": 5006 + }, + { + "epoch": 0.27265116089113967, + "grad_norm": 0.8038359449773015, + "learning_rate": 0.00019412940751785016, + "loss": 12.5698, + "step": 5007 + }, + { + "epoch": 0.27270561488772266, + "grad_norm": 0.6879193280914043, + "learning_rate": 0.00019412643021626385, + "loss": 12.5842, + "step": 5008 + }, + { + "epoch": 0.27276006888430565, + "grad_norm": 0.7823374563346943, + "learning_rate": 0.0001941234521827346, + "loss": 12.5844, + "step": 5009 + }, + { + "epoch": 0.2728145228808887, + "grad_norm": 1.109044580363149, + "learning_rate": 0.00019412047341728562, + "loss": 12.6108, + "step": 5010 + }, + { + "epoch": 0.2728689768774717, + "grad_norm": 0.6525462085960635, + "learning_rate": 0.00019411749391994002, + "loss": 12.5394, + "step": 5011 + }, + { + "epoch": 0.27292343087405474, + "grad_norm": 0.8202131839297926, + "learning_rate": 0.00019411451369072104, + "loss": 12.652, + "step": 5012 + }, + { + "epoch": 0.27297788487063773, + "grad_norm": 0.6646217005732907, + "learning_rate": 0.00019411153272965183, + "loss": 12.3951, + "step": 5013 + }, + { + "epoch": 0.2730323388672207, + "grad_norm": 0.7073205373515769, + "learning_rate": 0.00019410855103675552, + "loss": 12.4288, + "step": 5014 + }, + { + "epoch": 0.2730867928638038, + "grad_norm": 0.7478502526049412, + "learning_rate": 0.0001941055686120554, + "loss": 12.6234, + "step": 5015 + }, + { + "epoch": 0.27314124686038677, + "grad_norm": 0.7901002256999474, + "learning_rate": 0.00019410258545557452, + "loss": 12.6419, + "step": 5016 + }, + { + "epoch": 0.27319570085696976, + "grad_norm": 0.7292447218005194, + "learning_rate": 0.0001940996015673362, + "loss": 12.6508, + "step": 5017 + }, + { + "epoch": 0.2732501548535528, + "grad_norm": 0.7317662343794659, + "learning_rate": 0.00019409661694736355, + "loss": 12.6472, + "step": 5018 + }, + { + "epoch": 0.2733046088501358, + "grad_norm": 0.7269954070004496, + "learning_rate": 0.0001940936315956799, + "loss": 12.4842, + "step": 5019 + }, + { + "epoch": 0.2733590628467188, + "grad_norm": 0.68273618885286, + "learning_rate": 0.00019409064551230833, + "loss": 12.5733, + "step": 5020 + }, + { + "epoch": 0.27341351684330184, + "grad_norm": 0.7839706162049932, + "learning_rate": 0.00019408765869727214, + "loss": 12.5424, + "step": 5021 + }, + { + "epoch": 0.27346797083988483, + "grad_norm": 0.7754399766077261, + "learning_rate": 0.00019408467115059454, + "loss": 12.6342, + "step": 5022 + }, + { + "epoch": 0.2735224248364678, + "grad_norm": 0.6691754583961804, + "learning_rate": 0.00019408168287229875, + "loss": 12.4901, + "step": 5023 + }, + { + "epoch": 0.2735768788330509, + "grad_norm": 0.7275096808427216, + "learning_rate": 0.00019407869386240805, + "loss": 12.6807, + "step": 5024 + }, + { + "epoch": 0.27363133282963387, + "grad_norm": 0.7222259300134751, + "learning_rate": 0.00019407570412094562, + "loss": 12.687, + "step": 5025 + }, + { + "epoch": 0.27368578682621686, + "grad_norm": 0.6536571577211265, + "learning_rate": 0.00019407271364793474, + "loss": 12.5979, + "step": 5026 + }, + { + "epoch": 0.2737402408227999, + "grad_norm": 0.6961205317617686, + "learning_rate": 0.00019406972244339867, + "loss": 12.5989, + "step": 5027 + }, + { + "epoch": 0.2737946948193829, + "grad_norm": 0.7038426126899354, + "learning_rate": 0.00019406673050736067, + "loss": 12.6524, + "step": 5028 + }, + { + "epoch": 0.2738491488159659, + "grad_norm": 0.7279571986835491, + "learning_rate": 0.000194063737839844, + "loss": 12.6209, + "step": 5029 + }, + { + "epoch": 0.27390360281254894, + "grad_norm": 0.7821768847303526, + "learning_rate": 0.0001940607444408719, + "loss": 12.657, + "step": 5030 + }, + { + "epoch": 0.27395805680913193, + "grad_norm": 0.7535716318076771, + "learning_rate": 0.0001940577503104677, + "loss": 12.5637, + "step": 5031 + }, + { + "epoch": 0.2740125108057149, + "grad_norm": 0.740051362894615, + "learning_rate": 0.00019405475544865465, + "loss": 12.616, + "step": 5032 + }, + { + "epoch": 0.274066964802298, + "grad_norm": 0.7922028822553647, + "learning_rate": 0.00019405175985545605, + "loss": 12.5247, + "step": 5033 + }, + { + "epoch": 0.27412141879888097, + "grad_norm": 0.6497358398767344, + "learning_rate": 0.00019404876353089522, + "loss": 12.4952, + "step": 5034 + }, + { + "epoch": 0.27417587279546396, + "grad_norm": 0.7550312787281879, + "learning_rate": 0.0001940457664749954, + "loss": 12.5047, + "step": 5035 + }, + { + "epoch": 0.274230326792047, + "grad_norm": 0.6699670017875384, + "learning_rate": 0.00019404276868777994, + "loss": 12.6481, + "step": 5036 + }, + { + "epoch": 0.27428478078863, + "grad_norm": 0.7225857141499656, + "learning_rate": 0.00019403977016927212, + "loss": 12.3542, + "step": 5037 + }, + { + "epoch": 0.274339234785213, + "grad_norm": 0.6977057679519985, + "learning_rate": 0.0001940367709194953, + "loss": 12.5138, + "step": 5038 + }, + { + "epoch": 0.27439368878179604, + "grad_norm": 0.704198464905272, + "learning_rate": 0.0001940337709384728, + "loss": 12.5917, + "step": 5039 + }, + { + "epoch": 0.27444814277837903, + "grad_norm": 0.6711982258147914, + "learning_rate": 0.0001940307702262279, + "loss": 12.6132, + "step": 5040 + }, + { + "epoch": 0.274502596774962, + "grad_norm": 0.8079769295912083, + "learning_rate": 0.00019402776878278395, + "loss": 12.6885, + "step": 5041 + }, + { + "epoch": 0.2745570507715451, + "grad_norm": 0.6559387708196829, + "learning_rate": 0.00019402476660816432, + "loss": 12.5405, + "step": 5042 + }, + { + "epoch": 0.27461150476812807, + "grad_norm": 0.7379381995247378, + "learning_rate": 0.00019402176370239232, + "loss": 12.6785, + "step": 5043 + }, + { + "epoch": 0.27466595876471106, + "grad_norm": 0.690471720110709, + "learning_rate": 0.00019401876006549132, + "loss": 12.5732, + "step": 5044 + }, + { + "epoch": 0.2747204127612941, + "grad_norm": 0.7080593199826031, + "learning_rate": 0.0001940157556974847, + "loss": 12.6293, + "step": 5045 + }, + { + "epoch": 0.2747748667578771, + "grad_norm": 0.6739896979204867, + "learning_rate": 0.0001940127505983958, + "loss": 12.5172, + "step": 5046 + }, + { + "epoch": 0.27482932075446015, + "grad_norm": 0.651768469028915, + "learning_rate": 0.00019400974476824795, + "loss": 12.5162, + "step": 5047 + }, + { + "epoch": 0.27488377475104314, + "grad_norm": 0.838781952055105, + "learning_rate": 0.00019400673820706458, + "loss": 12.6232, + "step": 5048 + }, + { + "epoch": 0.27493822874762613, + "grad_norm": 0.6672933137610362, + "learning_rate": 0.00019400373091486904, + "loss": 12.6947, + "step": 5049 + }, + { + "epoch": 0.2749926827442092, + "grad_norm": 0.7835512965878694, + "learning_rate": 0.00019400072289168474, + "loss": 12.5005, + "step": 5050 + }, + { + "epoch": 0.2750471367407922, + "grad_norm": 0.74299265506198, + "learning_rate": 0.00019399771413753506, + "loss": 12.5162, + "step": 5051 + }, + { + "epoch": 0.27510159073737517, + "grad_norm": 0.7492972464867815, + "learning_rate": 0.00019399470465244337, + "loss": 12.6574, + "step": 5052 + }, + { + "epoch": 0.2751560447339582, + "grad_norm": 0.7271180956005475, + "learning_rate": 0.0001939916944364331, + "loss": 12.47, + "step": 5053 + }, + { + "epoch": 0.2752104987305412, + "grad_norm": 0.671577089718614, + "learning_rate": 0.00019398868348952764, + "loss": 12.6404, + "step": 5054 + }, + { + "epoch": 0.2752649527271242, + "grad_norm": 0.7593124681164001, + "learning_rate": 0.00019398567181175042, + "loss": 12.6242, + "step": 5055 + }, + { + "epoch": 0.27531940672370725, + "grad_norm": 0.7147968774089865, + "learning_rate": 0.0001939826594031249, + "loss": 12.4892, + "step": 5056 + }, + { + "epoch": 0.27537386072029024, + "grad_norm": 0.6362963231401051, + "learning_rate": 0.0001939796462636744, + "loss": 12.5185, + "step": 5057 + }, + { + "epoch": 0.27542831471687323, + "grad_norm": 0.710749446087054, + "learning_rate": 0.0001939766323934224, + "loss": 12.5753, + "step": 5058 + }, + { + "epoch": 0.2754827687134563, + "grad_norm": 0.6111662941975942, + "learning_rate": 0.0001939736177923924, + "loss": 12.5201, + "step": 5059 + }, + { + "epoch": 0.2755372227100393, + "grad_norm": 0.6880616611302837, + "learning_rate": 0.00019397060246060776, + "loss": 12.6284, + "step": 5060 + }, + { + "epoch": 0.27559167670662227, + "grad_norm": 0.6281572869817057, + "learning_rate": 0.00019396758639809197, + "loss": 12.5722, + "step": 5061 + }, + { + "epoch": 0.2756461307032053, + "grad_norm": 0.790136974222293, + "learning_rate": 0.00019396456960486846, + "loss": 12.6321, + "step": 5062 + }, + { + "epoch": 0.2757005846997883, + "grad_norm": 0.6693878597166492, + "learning_rate": 0.0001939615520809607, + "loss": 12.5091, + "step": 5063 + }, + { + "epoch": 0.2757550386963713, + "grad_norm": 0.6646705864772172, + "learning_rate": 0.00019395853382639215, + "loss": 12.6344, + "step": 5064 + }, + { + "epoch": 0.27580949269295435, + "grad_norm": 0.7191256074928883, + "learning_rate": 0.00019395551484118628, + "loss": 12.6256, + "step": 5065 + }, + { + "epoch": 0.27586394668953734, + "grad_norm": 0.8147113211400999, + "learning_rate": 0.0001939524951253666, + "loss": 12.6511, + "step": 5066 + }, + { + "epoch": 0.27591840068612034, + "grad_norm": 0.6417233825294415, + "learning_rate": 0.00019394947467895652, + "loss": 12.543, + "step": 5067 + }, + { + "epoch": 0.2759728546827034, + "grad_norm": 0.6366238255470038, + "learning_rate": 0.0001939464535019796, + "loss": 12.5125, + "step": 5068 + }, + { + "epoch": 0.2760273086792864, + "grad_norm": 0.6932555039744597, + "learning_rate": 0.0001939434315944593, + "loss": 12.6213, + "step": 5069 + }, + { + "epoch": 0.27608176267586937, + "grad_norm": 0.7148171868609757, + "learning_rate": 0.0001939404089564191, + "loss": 12.5546, + "step": 5070 + }, + { + "epoch": 0.2761362166724524, + "grad_norm": 0.7662802540895398, + "learning_rate": 0.00019393738558788254, + "loss": 12.4323, + "step": 5071 + }, + { + "epoch": 0.2761906706690354, + "grad_norm": 0.7005674022938893, + "learning_rate": 0.00019393436148887314, + "loss": 12.7013, + "step": 5072 + }, + { + "epoch": 0.2762451246656184, + "grad_norm": 0.7041144517869332, + "learning_rate": 0.00019393133665941437, + "loss": 12.6209, + "step": 5073 + }, + { + "epoch": 0.27629957866220145, + "grad_norm": 0.6777116434436815, + "learning_rate": 0.00019392831109952977, + "loss": 12.4813, + "step": 5074 + }, + { + "epoch": 0.27635403265878444, + "grad_norm": 0.6845631530681943, + "learning_rate": 0.00019392528480924285, + "loss": 12.5907, + "step": 5075 + }, + { + "epoch": 0.27640848665536744, + "grad_norm": 0.7114657166282016, + "learning_rate": 0.00019392225778857723, + "loss": 12.5673, + "step": 5076 + }, + { + "epoch": 0.2764629406519505, + "grad_norm": 0.8107231975439327, + "learning_rate": 0.00019391923003755633, + "loss": 12.5245, + "step": 5077 + }, + { + "epoch": 0.2765173946485335, + "grad_norm": 0.8040081885700733, + "learning_rate": 0.00019391620155620375, + "loss": 12.5707, + "step": 5078 + }, + { + "epoch": 0.2765718486451165, + "grad_norm": 0.6572804074345328, + "learning_rate": 0.00019391317234454305, + "loss": 12.6469, + "step": 5079 + }, + { + "epoch": 0.2766263026416995, + "grad_norm": 0.8236022163967431, + "learning_rate": 0.00019391014240259776, + "loss": 12.5572, + "step": 5080 + }, + { + "epoch": 0.2766807566382825, + "grad_norm": 0.658914373309915, + "learning_rate": 0.00019390711173039146, + "loss": 12.5925, + "step": 5081 + }, + { + "epoch": 0.27673521063486556, + "grad_norm": 0.7943194386641196, + "learning_rate": 0.00019390408032794772, + "loss": 12.6722, + "step": 5082 + }, + { + "epoch": 0.27678966463144855, + "grad_norm": 0.6973239731440578, + "learning_rate": 0.00019390104819529008, + "loss": 12.552, + "step": 5083 + }, + { + "epoch": 0.27684411862803154, + "grad_norm": 0.7418809699302191, + "learning_rate": 0.00019389801533244218, + "loss": 12.5722, + "step": 5084 + }, + { + "epoch": 0.2768985726246146, + "grad_norm": 0.7422872676063248, + "learning_rate": 0.00019389498173942756, + "loss": 12.5761, + "step": 5085 + }, + { + "epoch": 0.2769530266211976, + "grad_norm": 0.703580772909161, + "learning_rate": 0.0001938919474162698, + "loss": 12.5502, + "step": 5086 + }, + { + "epoch": 0.2770074806177806, + "grad_norm": 0.7000256855033609, + "learning_rate": 0.00019388891236299253, + "loss": 12.6968, + "step": 5087 + }, + { + "epoch": 0.2770619346143636, + "grad_norm": 0.7135620202619926, + "learning_rate": 0.0001938858765796193, + "loss": 12.6916, + "step": 5088 + }, + { + "epoch": 0.2771163886109466, + "grad_norm": 0.7652487105619586, + "learning_rate": 0.00019388284006617375, + "loss": 12.3941, + "step": 5089 + }, + { + "epoch": 0.2771708426075296, + "grad_norm": 0.690744559913229, + "learning_rate": 0.0001938798028226795, + "loss": 12.5351, + "step": 5090 + }, + { + "epoch": 0.27722529660411266, + "grad_norm": 0.8156095234373029, + "learning_rate": 0.0001938767648491602, + "loss": 12.4954, + "step": 5091 + }, + { + "epoch": 0.27727975060069565, + "grad_norm": 0.7090953026934278, + "learning_rate": 0.00019387372614563936, + "loss": 12.5321, + "step": 5092 + }, + { + "epoch": 0.27733420459727864, + "grad_norm": 0.6409777844639443, + "learning_rate": 0.00019387068671214072, + "loss": 12.4822, + "step": 5093 + }, + { + "epoch": 0.2773886585938617, + "grad_norm": 0.7587500601030793, + "learning_rate": 0.0001938676465486879, + "loss": 12.6798, + "step": 5094 + }, + { + "epoch": 0.2774431125904447, + "grad_norm": 0.649528915312248, + "learning_rate": 0.0001938646056553045, + "loss": 12.502, + "step": 5095 + }, + { + "epoch": 0.2774975665870277, + "grad_norm": 0.747554472997202, + "learning_rate": 0.00019386156403201416, + "loss": 12.5573, + "step": 5096 + }, + { + "epoch": 0.2775520205836107, + "grad_norm": 0.8592539276831329, + "learning_rate": 0.00019385852167884057, + "loss": 12.476, + "step": 5097 + }, + { + "epoch": 0.2776064745801937, + "grad_norm": 0.7200503490316053, + "learning_rate": 0.00019385547859580743, + "loss": 12.5044, + "step": 5098 + }, + { + "epoch": 0.2776609285767767, + "grad_norm": 0.6728688453119079, + "learning_rate": 0.00019385243478293828, + "loss": 12.6178, + "step": 5099 + }, + { + "epoch": 0.27771538257335976, + "grad_norm": 0.6660773015222236, + "learning_rate": 0.0001938493902402569, + "loss": 12.6218, + "step": 5100 + }, + { + "epoch": 0.27776983656994275, + "grad_norm": 0.6954282062789127, + "learning_rate": 0.00019384634496778688, + "loss": 12.4654, + "step": 5101 + }, + { + "epoch": 0.27782429056652574, + "grad_norm": 0.7156888717451666, + "learning_rate": 0.00019384329896555196, + "loss": 12.6465, + "step": 5102 + }, + { + "epoch": 0.2778787445631088, + "grad_norm": 0.7179172725399301, + "learning_rate": 0.00019384025223357582, + "loss": 12.5168, + "step": 5103 + }, + { + "epoch": 0.2779331985596918, + "grad_norm": 0.7759017642906217, + "learning_rate": 0.00019383720477188216, + "loss": 12.602, + "step": 5104 + }, + { + "epoch": 0.2779876525562748, + "grad_norm": 0.6510398797162207, + "learning_rate": 0.00019383415658049465, + "loss": 12.4956, + "step": 5105 + }, + { + "epoch": 0.2780421065528578, + "grad_norm": 0.6642562478621538, + "learning_rate": 0.00019383110765943697, + "loss": 12.4895, + "step": 5106 + }, + { + "epoch": 0.2780965605494408, + "grad_norm": 0.6856961004079563, + "learning_rate": 0.00019382805800873288, + "loss": 12.519, + "step": 5107 + }, + { + "epoch": 0.2781510145460238, + "grad_norm": 0.6423899847602803, + "learning_rate": 0.0001938250076284061, + "loss": 12.5387, + "step": 5108 + }, + { + "epoch": 0.27820546854260686, + "grad_norm": 0.636575246825305, + "learning_rate": 0.00019382195651848028, + "loss": 12.7195, + "step": 5109 + }, + { + "epoch": 0.27825992253918985, + "grad_norm": 0.6970889423434784, + "learning_rate": 0.00019381890467897922, + "loss": 12.6636, + "step": 5110 + }, + { + "epoch": 0.27831437653577284, + "grad_norm": 0.626582862445139, + "learning_rate": 0.00019381585210992663, + "loss": 12.5385, + "step": 5111 + }, + { + "epoch": 0.2783688305323559, + "grad_norm": 0.7246283791178956, + "learning_rate": 0.00019381279881134625, + "loss": 12.573, + "step": 5112 + }, + { + "epoch": 0.2784232845289389, + "grad_norm": 0.6601903163083505, + "learning_rate": 0.00019380974478326178, + "loss": 12.6021, + "step": 5113 + }, + { + "epoch": 0.27847773852552193, + "grad_norm": 0.762912600904577, + "learning_rate": 0.000193806690025697, + "loss": 12.6755, + "step": 5114 + }, + { + "epoch": 0.2785321925221049, + "grad_norm": 0.676212979343806, + "learning_rate": 0.0001938036345386757, + "loss": 12.5951, + "step": 5115 + }, + { + "epoch": 0.2785866465186879, + "grad_norm": 0.7940947673405626, + "learning_rate": 0.0001938005783222216, + "loss": 12.5618, + "step": 5116 + }, + { + "epoch": 0.27864110051527097, + "grad_norm": 0.6346251485123232, + "learning_rate": 0.00019379752137635848, + "loss": 12.4973, + "step": 5117 + }, + { + "epoch": 0.27869555451185396, + "grad_norm": 0.8342476054070347, + "learning_rate": 0.00019379446370111007, + "loss": 12.6665, + "step": 5118 + }, + { + "epoch": 0.27875000850843695, + "grad_norm": 0.715632773565721, + "learning_rate": 0.0001937914052965002, + "loss": 12.641, + "step": 5119 + }, + { + "epoch": 0.27880446250502, + "grad_norm": 0.6382536033591416, + "learning_rate": 0.00019378834616255264, + "loss": 12.6341, + "step": 5120 + }, + { + "epoch": 0.278858916501603, + "grad_norm": 0.6833566071913983, + "learning_rate": 0.0001937852862992912, + "loss": 12.6692, + "step": 5121 + }, + { + "epoch": 0.278913370498186, + "grad_norm": 0.7187158872693135, + "learning_rate": 0.00019378222570673955, + "loss": 12.5714, + "step": 5122 + }, + { + "epoch": 0.27896782449476903, + "grad_norm": 0.7008947274359243, + "learning_rate": 0.00019377916438492168, + "loss": 12.577, + "step": 5123 + }, + { + "epoch": 0.279022278491352, + "grad_norm": 0.803851898243202, + "learning_rate": 0.00019377610233386124, + "loss": 12.7355, + "step": 5124 + }, + { + "epoch": 0.279076732487935, + "grad_norm": 0.7559303634246921, + "learning_rate": 0.00019377303955358217, + "loss": 12.6045, + "step": 5125 + }, + { + "epoch": 0.27913118648451807, + "grad_norm": 0.8192745121555173, + "learning_rate": 0.00019376997604410816, + "loss": 12.7426, + "step": 5126 + }, + { + "epoch": 0.27918564048110106, + "grad_norm": 0.7247911238027126, + "learning_rate": 0.0001937669118054631, + "loss": 12.598, + "step": 5127 + }, + { + "epoch": 0.27924009447768405, + "grad_norm": 0.7163926957202394, + "learning_rate": 0.00019376384683767085, + "loss": 12.5991, + "step": 5128 + }, + { + "epoch": 0.2792945484742671, + "grad_norm": 0.633941715979679, + "learning_rate": 0.00019376078114075515, + "loss": 12.509, + "step": 5129 + }, + { + "epoch": 0.2793490024708501, + "grad_norm": 0.8875319613920101, + "learning_rate": 0.00019375771471473994, + "loss": 12.5074, + "step": 5130 + }, + { + "epoch": 0.2794034564674331, + "grad_norm": 0.6848286176502122, + "learning_rate": 0.00019375464755964897, + "loss": 12.605, + "step": 5131 + }, + { + "epoch": 0.27945791046401613, + "grad_norm": 0.7319492347428458, + "learning_rate": 0.00019375157967550617, + "loss": 12.6155, + "step": 5132 + }, + { + "epoch": 0.2795123644605991, + "grad_norm": 0.9091714525701203, + "learning_rate": 0.00019374851106233534, + "loss": 12.7187, + "step": 5133 + }, + { + "epoch": 0.2795668184571821, + "grad_norm": 0.7162150438667014, + "learning_rate": 0.0001937454417201604, + "loss": 12.5939, + "step": 5134 + }, + { + "epoch": 0.27962127245376517, + "grad_norm": 0.9315398296781449, + "learning_rate": 0.00019374237164900514, + "loss": 12.4741, + "step": 5135 + }, + { + "epoch": 0.27967572645034816, + "grad_norm": 0.7612621418120459, + "learning_rate": 0.00019373930084889352, + "loss": 12.5645, + "step": 5136 + }, + { + "epoch": 0.27973018044693115, + "grad_norm": 0.8713698804427293, + "learning_rate": 0.00019373622931984934, + "loss": 12.4994, + "step": 5137 + }, + { + "epoch": 0.2797846344435142, + "grad_norm": 0.8327126204853291, + "learning_rate": 0.00019373315706189655, + "loss": 12.3497, + "step": 5138 + }, + { + "epoch": 0.2798390884400972, + "grad_norm": 0.7275140927615509, + "learning_rate": 0.000193730084075059, + "loss": 12.6424, + "step": 5139 + }, + { + "epoch": 0.2798935424366802, + "grad_norm": 0.7794188688974611, + "learning_rate": 0.0001937270103593606, + "loss": 12.5054, + "step": 5140 + }, + { + "epoch": 0.27994799643326324, + "grad_norm": 0.6425933749368322, + "learning_rate": 0.00019372393591482524, + "loss": 12.5495, + "step": 5141 + }, + { + "epoch": 0.28000245042984623, + "grad_norm": 0.7633781746174405, + "learning_rate": 0.00019372086074147685, + "loss": 12.634, + "step": 5142 + }, + { + "epoch": 0.2800569044264292, + "grad_norm": 0.7500822577947447, + "learning_rate": 0.00019371778483933934, + "loss": 12.7041, + "step": 5143 + }, + { + "epoch": 0.28011135842301227, + "grad_norm": 0.6833744271936957, + "learning_rate": 0.0001937147082084366, + "loss": 12.5521, + "step": 5144 + }, + { + "epoch": 0.28016581241959526, + "grad_norm": 0.7690187012529756, + "learning_rate": 0.00019371163084879256, + "loss": 12.4258, + "step": 5145 + }, + { + "epoch": 0.2802202664161783, + "grad_norm": 0.7467336494036554, + "learning_rate": 0.00019370855276043121, + "loss": 12.5721, + "step": 5146 + }, + { + "epoch": 0.2802747204127613, + "grad_norm": 0.7625607151593078, + "learning_rate": 0.0001937054739433764, + "loss": 12.5985, + "step": 5147 + }, + { + "epoch": 0.2803291744093443, + "grad_norm": 0.8703165636039294, + "learning_rate": 0.0001937023943976521, + "loss": 12.638, + "step": 5148 + }, + { + "epoch": 0.28038362840592734, + "grad_norm": 0.725423606749526, + "learning_rate": 0.0001936993141232823, + "loss": 12.5103, + "step": 5149 + }, + { + "epoch": 0.28043808240251034, + "grad_norm": 0.763692107957062, + "learning_rate": 0.0001936962331202909, + "loss": 12.5039, + "step": 5150 + }, + { + "epoch": 0.28049253639909333, + "grad_norm": 0.6316565367613988, + "learning_rate": 0.00019369315138870188, + "loss": 12.4585, + "step": 5151 + }, + { + "epoch": 0.2805469903956764, + "grad_norm": 0.7350835482152517, + "learning_rate": 0.0001936900689285392, + "loss": 12.6043, + "step": 5152 + }, + { + "epoch": 0.28060144439225937, + "grad_norm": 0.6690143152131562, + "learning_rate": 0.00019368698573982686, + "loss": 12.5098, + "step": 5153 + }, + { + "epoch": 0.28065589838884236, + "grad_norm": 0.7372146097111318, + "learning_rate": 0.00019368390182258877, + "loss": 12.5539, + "step": 5154 + }, + { + "epoch": 0.2807103523854254, + "grad_norm": 0.6211436551518016, + "learning_rate": 0.00019368081717684898, + "loss": 12.5419, + "step": 5155 + }, + { + "epoch": 0.2807648063820084, + "grad_norm": 0.7063545203040411, + "learning_rate": 0.00019367773180263141, + "loss": 12.4829, + "step": 5156 + }, + { + "epoch": 0.2808192603785914, + "grad_norm": 0.7194962328640749, + "learning_rate": 0.00019367464569996012, + "loss": 12.6408, + "step": 5157 + }, + { + "epoch": 0.28087371437517444, + "grad_norm": 0.6871823163250769, + "learning_rate": 0.00019367155886885906, + "loss": 12.6144, + "step": 5158 + }, + { + "epoch": 0.28092816837175744, + "grad_norm": 0.6661938851213246, + "learning_rate": 0.00019366847130935225, + "loss": 12.5857, + "step": 5159 + }, + { + "epoch": 0.28098262236834043, + "grad_norm": 0.6744656602031699, + "learning_rate": 0.0001936653830214637, + "loss": 12.5844, + "step": 5160 + }, + { + "epoch": 0.2810370763649235, + "grad_norm": 0.6865644296890181, + "learning_rate": 0.00019366229400521743, + "loss": 12.4689, + "step": 5161 + }, + { + "epoch": 0.28109153036150647, + "grad_norm": 0.7895194676119835, + "learning_rate": 0.00019365920426063745, + "loss": 12.7955, + "step": 5162 + }, + { + "epoch": 0.28114598435808946, + "grad_norm": 0.6310374554477493, + "learning_rate": 0.00019365611378774778, + "loss": 12.6577, + "step": 5163 + }, + { + "epoch": 0.2812004383546725, + "grad_norm": 1.0253218473524535, + "learning_rate": 0.0001936530225865725, + "loss": 12.6858, + "step": 5164 + }, + { + "epoch": 0.2812548923512555, + "grad_norm": 0.6754084523502711, + "learning_rate": 0.0001936499306571356, + "loss": 12.6863, + "step": 5165 + }, + { + "epoch": 0.2813093463478385, + "grad_norm": 0.6427002592447747, + "learning_rate": 0.00019364683799946112, + "loss": 12.5656, + "step": 5166 + }, + { + "epoch": 0.28136380034442154, + "grad_norm": 0.674802174929845, + "learning_rate": 0.0001936437446135731, + "loss": 12.4581, + "step": 5167 + }, + { + "epoch": 0.28141825434100454, + "grad_norm": 0.7627046858697148, + "learning_rate": 0.00019364065049949566, + "loss": 12.5511, + "step": 5168 + }, + { + "epoch": 0.28147270833758753, + "grad_norm": 0.6432456260881265, + "learning_rate": 0.00019363755565725277, + "loss": 12.4937, + "step": 5169 + }, + { + "epoch": 0.2815271623341706, + "grad_norm": 0.6403274159229349, + "learning_rate": 0.0001936344600868686, + "loss": 12.6198, + "step": 5170 + }, + { + "epoch": 0.28158161633075357, + "grad_norm": 0.6892405055573441, + "learning_rate": 0.00019363136378836712, + "loss": 12.5993, + "step": 5171 + }, + { + "epoch": 0.28163607032733656, + "grad_norm": 0.6442568030583643, + "learning_rate": 0.00019362826676177247, + "loss": 12.5813, + "step": 5172 + }, + { + "epoch": 0.2816905243239196, + "grad_norm": 0.6189612613344044, + "learning_rate": 0.00019362516900710874, + "loss": 12.56, + "step": 5173 + }, + { + "epoch": 0.2817449783205026, + "grad_norm": 0.6617043807798307, + "learning_rate": 0.00019362207052439997, + "loss": 12.5102, + "step": 5174 + }, + { + "epoch": 0.2817994323170856, + "grad_norm": 0.6492188492603169, + "learning_rate": 0.00019361897131367028, + "loss": 12.5758, + "step": 5175 + }, + { + "epoch": 0.28185388631366864, + "grad_norm": 0.7473061625828011, + "learning_rate": 0.00019361587137494378, + "loss": 12.6179, + "step": 5176 + }, + { + "epoch": 0.28190834031025164, + "grad_norm": 0.6547901694630822, + "learning_rate": 0.00019361277070824455, + "loss": 12.539, + "step": 5177 + }, + { + "epoch": 0.28196279430683463, + "grad_norm": 0.7669279291636536, + "learning_rate": 0.00019360966931359673, + "loss": 12.5469, + "step": 5178 + }, + { + "epoch": 0.2820172483034177, + "grad_norm": 0.7186642421638226, + "learning_rate": 0.0001936065671910244, + "loss": 12.5702, + "step": 5179 + }, + { + "epoch": 0.28207170230000067, + "grad_norm": 0.6731075258080949, + "learning_rate": 0.00019360346434055172, + "loss": 12.51, + "step": 5180 + }, + { + "epoch": 0.2821261562965837, + "grad_norm": 0.6667099900813783, + "learning_rate": 0.00019360036076220282, + "loss": 12.5498, + "step": 5181 + }, + { + "epoch": 0.2821806102931667, + "grad_norm": 0.7785210314930021, + "learning_rate": 0.0001935972564560018, + "loss": 12.6939, + "step": 5182 + }, + { + "epoch": 0.2822350642897497, + "grad_norm": 0.7067372272734197, + "learning_rate": 0.0001935941514219728, + "loss": 12.5571, + "step": 5183 + }, + { + "epoch": 0.28228951828633275, + "grad_norm": 0.678421616002225, + "learning_rate": 0.00019359104566014, + "loss": 12.599, + "step": 5184 + }, + { + "epoch": 0.28234397228291574, + "grad_norm": 0.6551604392091743, + "learning_rate": 0.00019358793917052756, + "loss": 12.5489, + "step": 5185 + }, + { + "epoch": 0.28239842627949874, + "grad_norm": 0.7103555805268724, + "learning_rate": 0.00019358483195315958, + "loss": 12.5438, + "step": 5186 + }, + { + "epoch": 0.2824528802760818, + "grad_norm": 0.7069284288372881, + "learning_rate": 0.00019358172400806028, + "loss": 12.5381, + "step": 5187 + }, + { + "epoch": 0.2825073342726648, + "grad_norm": 0.6722826182924235, + "learning_rate": 0.00019357861533525377, + "loss": 12.586, + "step": 5188 + }, + { + "epoch": 0.28256178826924777, + "grad_norm": 0.699610418795585, + "learning_rate": 0.00019357550593476425, + "loss": 12.5382, + "step": 5189 + }, + { + "epoch": 0.2826162422658308, + "grad_norm": 0.8746550655544723, + "learning_rate": 0.00019357239580661593, + "loss": 12.4991, + "step": 5190 + }, + { + "epoch": 0.2826706962624138, + "grad_norm": 0.668798828037185, + "learning_rate": 0.00019356928495083297, + "loss": 12.6543, + "step": 5191 + }, + { + "epoch": 0.2827251502589968, + "grad_norm": 0.7044621987621039, + "learning_rate": 0.00019356617336743951, + "loss": 12.6718, + "step": 5192 + }, + { + "epoch": 0.28277960425557985, + "grad_norm": 0.6422951635514824, + "learning_rate": 0.00019356306105645983, + "loss": 12.5407, + "step": 5193 + }, + { + "epoch": 0.28283405825216285, + "grad_norm": 0.684927068062873, + "learning_rate": 0.00019355994801791812, + "loss": 12.532, + "step": 5194 + }, + { + "epoch": 0.28288851224874584, + "grad_norm": 0.723283260192857, + "learning_rate": 0.00019355683425183854, + "loss": 12.5527, + "step": 5195 + }, + { + "epoch": 0.2829429662453289, + "grad_norm": 0.7360719862470695, + "learning_rate": 0.0001935537197582453, + "loss": 12.6376, + "step": 5196 + }, + { + "epoch": 0.2829974202419119, + "grad_norm": 0.6883098067111958, + "learning_rate": 0.0001935506045371627, + "loss": 12.5532, + "step": 5197 + }, + { + "epoch": 0.28305187423849487, + "grad_norm": 0.6584212997077407, + "learning_rate": 0.00019354748858861487, + "loss": 12.5902, + "step": 5198 + }, + { + "epoch": 0.2831063282350779, + "grad_norm": 0.6992412627582681, + "learning_rate": 0.0001935443719126261, + "loss": 12.5555, + "step": 5199 + }, + { + "epoch": 0.2831607822316609, + "grad_norm": 0.7044431102255994, + "learning_rate": 0.0001935412545092206, + "loss": 12.5461, + "step": 5200 + }, + { + "epoch": 0.2832152362282439, + "grad_norm": 0.7073752555792435, + "learning_rate": 0.00019353813637842265, + "loss": 12.4574, + "step": 5201 + }, + { + "epoch": 0.28326969022482695, + "grad_norm": 0.7015761112063352, + "learning_rate": 0.00019353501752025643, + "loss": 12.6205, + "step": 5202 + }, + { + "epoch": 0.28332414422140995, + "grad_norm": 0.6979274453509807, + "learning_rate": 0.00019353189793474619, + "loss": 12.7032, + "step": 5203 + }, + { + "epoch": 0.28337859821799294, + "grad_norm": 0.6957153333269988, + "learning_rate": 0.0001935287776219163, + "loss": 12.5931, + "step": 5204 + }, + { + "epoch": 0.283433052214576, + "grad_norm": 0.6897842774068781, + "learning_rate": 0.00019352565658179092, + "loss": 12.5964, + "step": 5205 + }, + { + "epoch": 0.283487506211159, + "grad_norm": 0.6774486360522814, + "learning_rate": 0.00019352253481439436, + "loss": 12.6081, + "step": 5206 + }, + { + "epoch": 0.28354196020774197, + "grad_norm": 0.6775675886557151, + "learning_rate": 0.00019351941231975087, + "loss": 12.5461, + "step": 5207 + }, + { + "epoch": 0.283596414204325, + "grad_norm": 0.6820558537395075, + "learning_rate": 0.00019351628909788473, + "loss": 12.4608, + "step": 5208 + }, + { + "epoch": 0.283650868200908, + "grad_norm": 0.6534199168216551, + "learning_rate": 0.00019351316514882027, + "loss": 12.5736, + "step": 5209 + }, + { + "epoch": 0.283705322197491, + "grad_norm": 0.6448033586536517, + "learning_rate": 0.00019351004047258176, + "loss": 12.6143, + "step": 5210 + }, + { + "epoch": 0.28375977619407405, + "grad_norm": 0.6961255459081882, + "learning_rate": 0.0001935069150691935, + "loss": 12.5625, + "step": 5211 + }, + { + "epoch": 0.28381423019065705, + "grad_norm": 0.6788903485850866, + "learning_rate": 0.00019350378893867975, + "loss": 12.6184, + "step": 5212 + }, + { + "epoch": 0.2838686841872401, + "grad_norm": 0.6912612299861243, + "learning_rate": 0.00019350066208106487, + "loss": 12.608, + "step": 5213 + }, + { + "epoch": 0.2839231381838231, + "grad_norm": 0.6830298112202225, + "learning_rate": 0.00019349753449637318, + "loss": 12.7429, + "step": 5214 + }, + { + "epoch": 0.2839775921804061, + "grad_norm": 0.6651906648917761, + "learning_rate": 0.00019349440618462898, + "loss": 12.5816, + "step": 5215 + }, + { + "epoch": 0.28403204617698913, + "grad_norm": 0.7197247296245165, + "learning_rate": 0.00019349127714585657, + "loss": 12.4892, + "step": 5216 + }, + { + "epoch": 0.2840865001735721, + "grad_norm": 0.6601689123815974, + "learning_rate": 0.00019348814738008035, + "loss": 12.5705, + "step": 5217 + }, + { + "epoch": 0.2841409541701551, + "grad_norm": 0.8551875521767103, + "learning_rate": 0.00019348501688732462, + "loss": 12.6959, + "step": 5218 + }, + { + "epoch": 0.28419540816673816, + "grad_norm": 0.6824147784459194, + "learning_rate": 0.00019348188566761367, + "loss": 12.6101, + "step": 5219 + }, + { + "epoch": 0.28424986216332115, + "grad_norm": 0.6444811599986673, + "learning_rate": 0.00019347875372097194, + "loss": 12.5973, + "step": 5220 + }, + { + "epoch": 0.28430431615990415, + "grad_norm": 0.7913232678020938, + "learning_rate": 0.00019347562104742375, + "loss": 12.5736, + "step": 5221 + }, + { + "epoch": 0.2843587701564872, + "grad_norm": 0.6667682774936895, + "learning_rate": 0.0001934724876469934, + "loss": 12.5546, + "step": 5222 + }, + { + "epoch": 0.2844132241530702, + "grad_norm": 0.6818675106959271, + "learning_rate": 0.00019346935351970536, + "loss": 12.5956, + "step": 5223 + }, + { + "epoch": 0.2844676781496532, + "grad_norm": 0.741997237016835, + "learning_rate": 0.00019346621866558395, + "loss": 12.6094, + "step": 5224 + }, + { + "epoch": 0.28452213214623623, + "grad_norm": 0.7304820963044663, + "learning_rate": 0.00019346308308465355, + "loss": 12.5719, + "step": 5225 + }, + { + "epoch": 0.2845765861428192, + "grad_norm": 0.7698383608402133, + "learning_rate": 0.00019345994677693855, + "loss": 12.6115, + "step": 5226 + }, + { + "epoch": 0.2846310401394022, + "grad_norm": 0.6658509131843733, + "learning_rate": 0.0001934568097424633, + "loss": 12.4676, + "step": 5227 + }, + { + "epoch": 0.28468549413598526, + "grad_norm": 0.7863371286802923, + "learning_rate": 0.00019345367198125225, + "loss": 12.4797, + "step": 5228 + }, + { + "epoch": 0.28473994813256825, + "grad_norm": 0.7296124497290241, + "learning_rate": 0.00019345053349332977, + "loss": 12.6438, + "step": 5229 + }, + { + "epoch": 0.28479440212915125, + "grad_norm": 0.7198469126781113, + "learning_rate": 0.00019344739427872026, + "loss": 12.6073, + "step": 5230 + }, + { + "epoch": 0.2848488561257343, + "grad_norm": 0.7259120593841006, + "learning_rate": 0.00019344425433744813, + "loss": 12.4623, + "step": 5231 + }, + { + "epoch": 0.2849033101223173, + "grad_norm": 0.7251504404787701, + "learning_rate": 0.00019344111366953782, + "loss": 12.6461, + "step": 5232 + }, + { + "epoch": 0.2849577641189003, + "grad_norm": 0.7413458025890929, + "learning_rate": 0.00019343797227501375, + "loss": 12.6042, + "step": 5233 + }, + { + "epoch": 0.28501221811548333, + "grad_norm": 0.7515706622044661, + "learning_rate": 0.00019343483015390033, + "loss": 12.6015, + "step": 5234 + }, + { + "epoch": 0.2850666721120663, + "grad_norm": 0.7603678090629596, + "learning_rate": 0.000193431687306222, + "loss": 12.6083, + "step": 5235 + }, + { + "epoch": 0.2851211261086493, + "grad_norm": 0.6856133174388271, + "learning_rate": 0.0001934285437320032, + "loss": 12.5179, + "step": 5236 + }, + { + "epoch": 0.28517558010523236, + "grad_norm": 0.6688520904320578, + "learning_rate": 0.0001934253994312684, + "loss": 12.5741, + "step": 5237 + }, + { + "epoch": 0.28523003410181536, + "grad_norm": 0.6992197464644883, + "learning_rate": 0.000193422254404042, + "loss": 12.5731, + "step": 5238 + }, + { + "epoch": 0.28528448809839835, + "grad_norm": 0.7092290730706611, + "learning_rate": 0.0001934191086503485, + "loss": 12.649, + "step": 5239 + }, + { + "epoch": 0.2853389420949814, + "grad_norm": 0.7770595485512367, + "learning_rate": 0.00019341596217021235, + "loss": 12.5758, + "step": 5240 + }, + { + "epoch": 0.2853933960915644, + "grad_norm": 0.654164035203944, + "learning_rate": 0.00019341281496365798, + "loss": 12.6354, + "step": 5241 + }, + { + "epoch": 0.2854478500881474, + "grad_norm": 0.6837349818438978, + "learning_rate": 0.0001934096670307099, + "loss": 12.6693, + "step": 5242 + }, + { + "epoch": 0.28550230408473043, + "grad_norm": 0.7154457385795903, + "learning_rate": 0.00019340651837139257, + "loss": 12.5421, + "step": 5243 + }, + { + "epoch": 0.2855567580813134, + "grad_norm": 0.7576311967450255, + "learning_rate": 0.00019340336898573054, + "loss": 12.6667, + "step": 5244 + }, + { + "epoch": 0.2856112120778964, + "grad_norm": 0.709766937442989, + "learning_rate": 0.0001934002188737482, + "loss": 12.6011, + "step": 5245 + }, + { + "epoch": 0.28566566607447946, + "grad_norm": 0.6936480129371836, + "learning_rate": 0.00019339706803547015, + "loss": 12.692, + "step": 5246 + }, + { + "epoch": 0.28572012007106246, + "grad_norm": 0.7300076513644107, + "learning_rate": 0.0001933939164709208, + "loss": 12.5928, + "step": 5247 + }, + { + "epoch": 0.2857745740676455, + "grad_norm": 0.8340857027312786, + "learning_rate": 0.0001933907641801247, + "loss": 12.775, + "step": 5248 + }, + { + "epoch": 0.2858290280642285, + "grad_norm": 0.7522241994359548, + "learning_rate": 0.00019338761116310634, + "loss": 12.626, + "step": 5249 + }, + { + "epoch": 0.2858834820608115, + "grad_norm": 0.7167487172392022, + "learning_rate": 0.0001933844574198903, + "loss": 12.5693, + "step": 5250 + }, + { + "epoch": 0.28593793605739454, + "grad_norm": 0.6799345059630523, + "learning_rate": 0.000193381302950501, + "loss": 12.4642, + "step": 5251 + }, + { + "epoch": 0.28599239005397753, + "grad_norm": 0.8655447933860508, + "learning_rate": 0.00019337814775496307, + "loss": 12.6, + "step": 5252 + }, + { + "epoch": 0.2860468440505605, + "grad_norm": 0.6945184162411392, + "learning_rate": 0.000193374991833301, + "loss": 12.5597, + "step": 5253 + }, + { + "epoch": 0.28610129804714357, + "grad_norm": 0.7442352463692852, + "learning_rate": 0.00019337183518553931, + "loss": 12.5904, + "step": 5254 + }, + { + "epoch": 0.28615575204372656, + "grad_norm": 0.8143236594223469, + "learning_rate": 0.0001933686778117026, + "loss": 12.6048, + "step": 5255 + }, + { + "epoch": 0.28621020604030956, + "grad_norm": 0.6790276390562144, + "learning_rate": 0.0001933655197118154, + "loss": 12.5126, + "step": 5256 + }, + { + "epoch": 0.2862646600368926, + "grad_norm": 0.6440104432738986, + "learning_rate": 0.0001933623608859022, + "loss": 12.532, + "step": 5257 + }, + { + "epoch": 0.2863191140334756, + "grad_norm": 0.6313806996060203, + "learning_rate": 0.0001933592013339877, + "loss": 12.5107, + "step": 5258 + }, + { + "epoch": 0.2863735680300586, + "grad_norm": 0.6764511258985463, + "learning_rate": 0.00019335604105609632, + "loss": 12.4815, + "step": 5259 + }, + { + "epoch": 0.28642802202664164, + "grad_norm": 0.6467709401645445, + "learning_rate": 0.00019335288005225277, + "loss": 12.6475, + "step": 5260 + }, + { + "epoch": 0.28648247602322463, + "grad_norm": 0.6753532669897918, + "learning_rate": 0.00019334971832248154, + "loss": 12.5827, + "step": 5261 + }, + { + "epoch": 0.2865369300198076, + "grad_norm": 0.6827578003307204, + "learning_rate": 0.00019334655586680723, + "loss": 12.6088, + "step": 5262 + }, + { + "epoch": 0.28659138401639067, + "grad_norm": 0.7450610764899795, + "learning_rate": 0.0001933433926852545, + "loss": 12.6325, + "step": 5263 + }, + { + "epoch": 0.28664583801297366, + "grad_norm": 0.7387077864867498, + "learning_rate": 0.00019334022877784786, + "loss": 12.5494, + "step": 5264 + }, + { + "epoch": 0.28670029200955666, + "grad_norm": 0.8186123622954318, + "learning_rate": 0.00019333706414461195, + "loss": 12.7283, + "step": 5265 + }, + { + "epoch": 0.2867547460061397, + "grad_norm": 0.718727468080562, + "learning_rate": 0.00019333389878557137, + "loss": 12.5985, + "step": 5266 + }, + { + "epoch": 0.2868092000027227, + "grad_norm": 0.6681241377843442, + "learning_rate": 0.00019333073270075076, + "loss": 12.5538, + "step": 5267 + }, + { + "epoch": 0.2868636539993057, + "grad_norm": 0.6432945678201138, + "learning_rate": 0.0001933275658901747, + "loss": 12.5511, + "step": 5268 + }, + { + "epoch": 0.28691810799588874, + "grad_norm": 0.7267145364610287, + "learning_rate": 0.00019332439835386786, + "loss": 12.4169, + "step": 5269 + }, + { + "epoch": 0.28697256199247173, + "grad_norm": 0.6547373420430387, + "learning_rate": 0.00019332123009185482, + "loss": 12.54, + "step": 5270 + }, + { + "epoch": 0.2870270159890547, + "grad_norm": 0.6464456886624865, + "learning_rate": 0.00019331806110416027, + "loss": 12.6696, + "step": 5271 + }, + { + "epoch": 0.28708146998563777, + "grad_norm": 0.639416556182046, + "learning_rate": 0.0001933148913908088, + "loss": 12.606, + "step": 5272 + }, + { + "epoch": 0.28713592398222076, + "grad_norm": 0.765463803610551, + "learning_rate": 0.00019331172095182511, + "loss": 12.5596, + "step": 5273 + }, + { + "epoch": 0.28719037797880376, + "grad_norm": 0.6475594591585864, + "learning_rate": 0.00019330854978723383, + "loss": 12.57, + "step": 5274 + }, + { + "epoch": 0.2872448319753868, + "grad_norm": 0.738671421515384, + "learning_rate": 0.00019330537789705963, + "loss": 12.6256, + "step": 5275 + }, + { + "epoch": 0.2872992859719698, + "grad_norm": 0.6778383446733212, + "learning_rate": 0.00019330220528132713, + "loss": 12.4934, + "step": 5276 + }, + { + "epoch": 0.2873537399685528, + "grad_norm": 0.7386529725271362, + "learning_rate": 0.00019329903194006105, + "loss": 12.5469, + "step": 5277 + }, + { + "epoch": 0.28740819396513584, + "grad_norm": 0.714724000030084, + "learning_rate": 0.00019329585787328607, + "loss": 12.6229, + "step": 5278 + }, + { + "epoch": 0.28746264796171883, + "grad_norm": 0.6353740118316481, + "learning_rate": 0.00019329268308102685, + "loss": 12.5172, + "step": 5279 + }, + { + "epoch": 0.2875171019583019, + "grad_norm": 0.7049945078654178, + "learning_rate": 0.00019328950756330803, + "loss": 12.5439, + "step": 5280 + }, + { + "epoch": 0.28757155595488487, + "grad_norm": 0.7747680558953951, + "learning_rate": 0.00019328633132015442, + "loss": 12.6087, + "step": 5281 + }, + { + "epoch": 0.28762600995146786, + "grad_norm": 0.6793194770644609, + "learning_rate": 0.00019328315435159058, + "loss": 12.5076, + "step": 5282 + }, + { + "epoch": 0.2876804639480509, + "grad_norm": 0.7425325939186668, + "learning_rate": 0.00019327997665764137, + "loss": 12.6259, + "step": 5283 + }, + { + "epoch": 0.2877349179446339, + "grad_norm": 0.6843901494952742, + "learning_rate": 0.00019327679823833135, + "loss": 12.6013, + "step": 5284 + }, + { + "epoch": 0.2877893719412169, + "grad_norm": 0.6285854157813631, + "learning_rate": 0.00019327361909368535, + "loss": 12.6006, + "step": 5285 + }, + { + "epoch": 0.28784382593779995, + "grad_norm": 0.6295413056282074, + "learning_rate": 0.00019327043922372802, + "loss": 12.4413, + "step": 5286 + }, + { + "epoch": 0.28789827993438294, + "grad_norm": 0.7771093608805613, + "learning_rate": 0.00019326725862848414, + "loss": 12.5233, + "step": 5287 + }, + { + "epoch": 0.28795273393096593, + "grad_norm": 0.6761632961873533, + "learning_rate": 0.0001932640773079784, + "loss": 12.5032, + "step": 5288 + }, + { + "epoch": 0.288007187927549, + "grad_norm": 0.6379880807498205, + "learning_rate": 0.00019326089526223558, + "loss": 12.434, + "step": 5289 + }, + { + "epoch": 0.288061641924132, + "grad_norm": 0.6797398725626824, + "learning_rate": 0.00019325771249128034, + "loss": 12.6128, + "step": 5290 + }, + { + "epoch": 0.28811609592071497, + "grad_norm": 0.828936749194387, + "learning_rate": 0.00019325452899513753, + "loss": 12.6434, + "step": 5291 + }, + { + "epoch": 0.288170549917298, + "grad_norm": 0.6184918134959372, + "learning_rate": 0.00019325134477383188, + "loss": 12.5812, + "step": 5292 + }, + { + "epoch": 0.288225003913881, + "grad_norm": 0.750997225216585, + "learning_rate": 0.0001932481598273881, + "loss": 12.5099, + "step": 5293 + }, + { + "epoch": 0.288279457910464, + "grad_norm": 0.7042831646975033, + "learning_rate": 0.000193244974155831, + "loss": 12.6465, + "step": 5294 + }, + { + "epoch": 0.28833391190704705, + "grad_norm": 0.8397389045457825, + "learning_rate": 0.00019324178775918536, + "loss": 12.6856, + "step": 5295 + }, + { + "epoch": 0.28838836590363004, + "grad_norm": 0.6364067470380718, + "learning_rate": 0.0001932386006374759, + "loss": 12.604, + "step": 5296 + }, + { + "epoch": 0.28844281990021303, + "grad_norm": 0.697749718400721, + "learning_rate": 0.00019323541279072748, + "loss": 12.6742, + "step": 5297 + }, + { + "epoch": 0.2884972738967961, + "grad_norm": 0.6885876972827212, + "learning_rate": 0.00019323222421896484, + "loss": 12.5667, + "step": 5298 + }, + { + "epoch": 0.2885517278933791, + "grad_norm": 0.6932645326014508, + "learning_rate": 0.00019322903492221283, + "loss": 12.4531, + "step": 5299 + }, + { + "epoch": 0.28860618188996207, + "grad_norm": 0.73021297394276, + "learning_rate": 0.00019322584490049616, + "loss": 12.5136, + "step": 5300 + }, + { + "epoch": 0.2886606358865451, + "grad_norm": 0.7518670769526901, + "learning_rate": 0.00019322265415383969, + "loss": 12.5631, + "step": 5301 + }, + { + "epoch": 0.2887150898831281, + "grad_norm": 0.7825021435770262, + "learning_rate": 0.00019321946268226824, + "loss": 12.4216, + "step": 5302 + }, + { + "epoch": 0.2887695438797111, + "grad_norm": 0.7673677711990196, + "learning_rate": 0.00019321627048580662, + "loss": 12.5662, + "step": 5303 + }, + { + "epoch": 0.28882399787629415, + "grad_norm": 0.7472072722363353, + "learning_rate": 0.00019321307756447963, + "loss": 12.5825, + "step": 5304 + }, + { + "epoch": 0.28887845187287714, + "grad_norm": 0.8010391264329372, + "learning_rate": 0.00019320988391831217, + "loss": 12.4988, + "step": 5305 + }, + { + "epoch": 0.28893290586946013, + "grad_norm": 0.6811847137339496, + "learning_rate": 0.00019320668954732898, + "loss": 12.5728, + "step": 5306 + }, + { + "epoch": 0.2889873598660432, + "grad_norm": 0.7944142383842702, + "learning_rate": 0.00019320349445155492, + "loss": 12.4664, + "step": 5307 + }, + { + "epoch": 0.2890418138626262, + "grad_norm": 0.792613651605558, + "learning_rate": 0.0001932002986310149, + "loss": 12.7375, + "step": 5308 + }, + { + "epoch": 0.28909626785920917, + "grad_norm": 0.7122210755604018, + "learning_rate": 0.0001931971020857337, + "loss": 12.7662, + "step": 5309 + }, + { + "epoch": 0.2891507218557922, + "grad_norm": 0.734802658973589, + "learning_rate": 0.0001931939048157362, + "loss": 12.627, + "step": 5310 + }, + { + "epoch": 0.2892051758523752, + "grad_norm": 0.6693328648027058, + "learning_rate": 0.00019319070682104731, + "loss": 12.601, + "step": 5311 + }, + { + "epoch": 0.2892596298489582, + "grad_norm": 0.7485113286192445, + "learning_rate": 0.00019318750810169184, + "loss": 12.5305, + "step": 5312 + }, + { + "epoch": 0.28931408384554125, + "grad_norm": 0.7768309672774144, + "learning_rate": 0.00019318430865769464, + "loss": 12.5792, + "step": 5313 + }, + { + "epoch": 0.28936853784212424, + "grad_norm": 0.6765733125020098, + "learning_rate": 0.00019318110848908065, + "loss": 12.7013, + "step": 5314 + }, + { + "epoch": 0.2894229918387073, + "grad_norm": 0.8730525986186539, + "learning_rate": 0.00019317790759587475, + "loss": 12.5474, + "step": 5315 + }, + { + "epoch": 0.2894774458352903, + "grad_norm": 0.6296560239385222, + "learning_rate": 0.0001931747059781018, + "loss": 12.448, + "step": 5316 + }, + { + "epoch": 0.2895318998318733, + "grad_norm": 0.8603087911123338, + "learning_rate": 0.0001931715036357867, + "loss": 12.6103, + "step": 5317 + }, + { + "epoch": 0.2895863538284563, + "grad_norm": 1.3755849613015414, + "learning_rate": 0.0001931683005689544, + "loss": 12.4477, + "step": 5318 + }, + { + "epoch": 0.2896408078250393, + "grad_norm": 1.2027269928655746, + "learning_rate": 0.00019316509677762974, + "loss": 12.5979, + "step": 5319 + }, + { + "epoch": 0.2896952618216223, + "grad_norm": 0.8655129824139789, + "learning_rate": 0.00019316189226183767, + "loss": 12.4687, + "step": 5320 + }, + { + "epoch": 0.28974971581820536, + "grad_norm": 3.817913808390665, + "learning_rate": 0.00019315868702160312, + "loss": 12.7316, + "step": 5321 + }, + { + "epoch": 0.28980416981478835, + "grad_norm": 0.8343931440789721, + "learning_rate": 0.00019315548105695098, + "loss": 12.6442, + "step": 5322 + }, + { + "epoch": 0.28985862381137134, + "grad_norm": 6.952518704615793, + "learning_rate": 0.00019315227436790623, + "loss": 12.8677, + "step": 5323 + }, + { + "epoch": 0.2899130778079544, + "grad_norm": 0.9131341069424375, + "learning_rate": 0.0001931490669544937, + "loss": 12.7066, + "step": 5324 + }, + { + "epoch": 0.2899675318045374, + "grad_norm": 1.789670303581222, + "learning_rate": 0.00019314585881673846, + "loss": 12.8639, + "step": 5325 + }, + { + "epoch": 0.2900219858011204, + "grad_norm": 1.0026867171007399, + "learning_rate": 0.0001931426499546654, + "loss": 12.7645, + "step": 5326 + }, + { + "epoch": 0.2900764397977034, + "grad_norm": 1.0203134278952588, + "learning_rate": 0.00019313944036829944, + "loss": 12.7672, + "step": 5327 + }, + { + "epoch": 0.2901308937942864, + "grad_norm": 1.0440168423497518, + "learning_rate": 0.0001931362300576656, + "loss": 12.6295, + "step": 5328 + }, + { + "epoch": 0.2901853477908694, + "grad_norm": 1.0227780364237082, + "learning_rate": 0.0001931330190227888, + "loss": 12.6797, + "step": 5329 + }, + { + "epoch": 0.29023980178745246, + "grad_norm": 1.139990750255168, + "learning_rate": 0.00019312980726369404, + "loss": 12.6139, + "step": 5330 + }, + { + "epoch": 0.29029425578403545, + "grad_norm": 1.1023503359236262, + "learning_rate": 0.00019312659478040628, + "loss": 12.6556, + "step": 5331 + }, + { + "epoch": 0.29034870978061844, + "grad_norm": 0.8425326234254836, + "learning_rate": 0.00019312338157295052, + "loss": 12.7766, + "step": 5332 + }, + { + "epoch": 0.2904031637772015, + "grad_norm": 0.9347222571312567, + "learning_rate": 0.0001931201676413517, + "loss": 12.7023, + "step": 5333 + }, + { + "epoch": 0.2904576177737845, + "grad_norm": 0.7686786588022239, + "learning_rate": 0.00019311695298563484, + "loss": 12.7143, + "step": 5334 + }, + { + "epoch": 0.2905120717703675, + "grad_norm": 0.892660220273949, + "learning_rate": 0.00019311373760582494, + "loss": 12.7365, + "step": 5335 + }, + { + "epoch": 0.2905665257669505, + "grad_norm": 0.7462257993513306, + "learning_rate": 0.00019311052150194699, + "loss": 12.7928, + "step": 5336 + }, + { + "epoch": 0.2906209797635335, + "grad_norm": 0.9456338911665619, + "learning_rate": 0.00019310730467402603, + "loss": 12.6129, + "step": 5337 + }, + { + "epoch": 0.2906754337601165, + "grad_norm": 0.8339349763476298, + "learning_rate": 0.00019310408712208706, + "loss": 12.6664, + "step": 5338 + }, + { + "epoch": 0.29072988775669956, + "grad_norm": 0.7848635052755665, + "learning_rate": 0.00019310086884615507, + "loss": 12.6457, + "step": 5339 + }, + { + "epoch": 0.29078434175328255, + "grad_norm": 0.9228880011106196, + "learning_rate": 0.00019309764984625513, + "loss": 12.5757, + "step": 5340 + }, + { + "epoch": 0.29083879574986554, + "grad_norm": 0.7168949910507982, + "learning_rate": 0.00019309443012241226, + "loss": 12.5764, + "step": 5341 + }, + { + "epoch": 0.2908932497464486, + "grad_norm": 0.8025730823863417, + "learning_rate": 0.00019309120967465147, + "loss": 12.6072, + "step": 5342 + }, + { + "epoch": 0.2909477037430316, + "grad_norm": 0.767974913734144, + "learning_rate": 0.00019308798850299784, + "loss": 12.7317, + "step": 5343 + }, + { + "epoch": 0.2910021577396146, + "grad_norm": 0.7643361303317291, + "learning_rate": 0.0001930847666074764, + "loss": 12.6485, + "step": 5344 + }, + { + "epoch": 0.2910566117361976, + "grad_norm": 0.7860919414875597, + "learning_rate": 0.00019308154398811218, + "loss": 12.597, + "step": 5345 + }, + { + "epoch": 0.2911110657327806, + "grad_norm": 0.7617826115160656, + "learning_rate": 0.00019307832064493027, + "loss": 12.7591, + "step": 5346 + }, + { + "epoch": 0.29116551972936366, + "grad_norm": 0.6984577542248323, + "learning_rate": 0.00019307509657795575, + "loss": 12.519, + "step": 5347 + }, + { + "epoch": 0.29121997372594666, + "grad_norm": 0.7105852071424831, + "learning_rate": 0.00019307187178721366, + "loss": 12.6628, + "step": 5348 + }, + { + "epoch": 0.29127442772252965, + "grad_norm": 0.6818024501380333, + "learning_rate": 0.0001930686462727291, + "loss": 12.5448, + "step": 5349 + }, + { + "epoch": 0.2913288817191127, + "grad_norm": 0.6718881976533685, + "learning_rate": 0.00019306542003452712, + "loss": 12.6965, + "step": 5350 + }, + { + "epoch": 0.2913833357156957, + "grad_norm": 0.7582962470318719, + "learning_rate": 0.00019306219307263284, + "loss": 12.6478, + "step": 5351 + }, + { + "epoch": 0.2914377897122787, + "grad_norm": 0.8820400481683287, + "learning_rate": 0.0001930589653870713, + "loss": 12.6285, + "step": 5352 + }, + { + "epoch": 0.29149224370886173, + "grad_norm": 0.9586742288093614, + "learning_rate": 0.0001930557369778677, + "loss": 12.7713, + "step": 5353 + }, + { + "epoch": 0.2915466977054447, + "grad_norm": 0.7338990360689434, + "learning_rate": 0.00019305250784504706, + "loss": 12.6302, + "step": 5354 + }, + { + "epoch": 0.2916011517020277, + "grad_norm": 0.7582419862630847, + "learning_rate": 0.0001930492779886345, + "loss": 12.6738, + "step": 5355 + }, + { + "epoch": 0.29165560569861076, + "grad_norm": 0.840787783063441, + "learning_rate": 0.00019304604740865515, + "loss": 12.6798, + "step": 5356 + }, + { + "epoch": 0.29171005969519376, + "grad_norm": 0.6931359406914137, + "learning_rate": 0.00019304281610513414, + "loss": 12.505, + "step": 5357 + }, + { + "epoch": 0.29176451369177675, + "grad_norm": 0.8046820695731451, + "learning_rate": 0.00019303958407809656, + "loss": 12.545, + "step": 5358 + }, + { + "epoch": 0.2918189676883598, + "grad_norm": 0.7504096676621198, + "learning_rate": 0.00019303635132756762, + "loss": 12.488, + "step": 5359 + }, + { + "epoch": 0.2918734216849428, + "grad_norm": 0.6977037675551906, + "learning_rate": 0.0001930331178535724, + "loss": 12.4553, + "step": 5360 + }, + { + "epoch": 0.2919278756815258, + "grad_norm": 0.7411988969206796, + "learning_rate": 0.00019302988365613603, + "loss": 12.6309, + "step": 5361 + }, + { + "epoch": 0.29198232967810883, + "grad_norm": 0.7279922787640098, + "learning_rate": 0.0001930266487352837, + "loss": 12.5111, + "step": 5362 + }, + { + "epoch": 0.2920367836746918, + "grad_norm": 0.7811018864235276, + "learning_rate": 0.00019302341309104055, + "loss": 12.6204, + "step": 5363 + }, + { + "epoch": 0.2920912376712748, + "grad_norm": 0.7559002866108901, + "learning_rate": 0.00019302017672343172, + "loss": 12.4904, + "step": 5364 + }, + { + "epoch": 0.29214569166785787, + "grad_norm": 0.7466752855960884, + "learning_rate": 0.00019301693963248243, + "loss": 12.7218, + "step": 5365 + }, + { + "epoch": 0.29220014566444086, + "grad_norm": 0.7993271004037386, + "learning_rate": 0.00019301370181821782, + "loss": 12.6146, + "step": 5366 + }, + { + "epoch": 0.29225459966102385, + "grad_norm": 0.6983084821208461, + "learning_rate": 0.00019301046328066304, + "loss": 12.5839, + "step": 5367 + }, + { + "epoch": 0.2923090536576069, + "grad_norm": 0.8523665600366496, + "learning_rate": 0.00019300722401984332, + "loss": 12.6677, + "step": 5368 + }, + { + "epoch": 0.2923635076541899, + "grad_norm": 0.8207120293181236, + "learning_rate": 0.0001930039840357838, + "loss": 12.5995, + "step": 5369 + }, + { + "epoch": 0.2924179616507729, + "grad_norm": 0.6721964030090202, + "learning_rate": 0.0001930007433285097, + "loss": 12.4554, + "step": 5370 + }, + { + "epoch": 0.29247241564735593, + "grad_norm": 0.6800829699572934, + "learning_rate": 0.00019299750189804624, + "loss": 12.6383, + "step": 5371 + }, + { + "epoch": 0.2925268696439389, + "grad_norm": 0.8289854580672297, + "learning_rate": 0.00019299425974441862, + "loss": 12.6417, + "step": 5372 + }, + { + "epoch": 0.2925813236405219, + "grad_norm": 0.7095324885178306, + "learning_rate": 0.00019299101686765205, + "loss": 12.5994, + "step": 5373 + }, + { + "epoch": 0.29263577763710497, + "grad_norm": 0.6896328151277851, + "learning_rate": 0.00019298777326777171, + "loss": 12.3786, + "step": 5374 + }, + { + "epoch": 0.29269023163368796, + "grad_norm": 0.6994439095041498, + "learning_rate": 0.00019298452894480286, + "loss": 12.6018, + "step": 5375 + }, + { + "epoch": 0.29274468563027095, + "grad_norm": 0.80451622927954, + "learning_rate": 0.00019298128389877073, + "loss": 12.6109, + "step": 5376 + }, + { + "epoch": 0.292799139626854, + "grad_norm": 0.6689060517208213, + "learning_rate": 0.00019297803812970052, + "loss": 12.5704, + "step": 5377 + }, + { + "epoch": 0.292853593623437, + "grad_norm": 0.687941492721622, + "learning_rate": 0.00019297479163761755, + "loss": 12.6013, + "step": 5378 + }, + { + "epoch": 0.29290804762002, + "grad_norm": 0.6930940676293417, + "learning_rate": 0.00019297154442254693, + "loss": 12.5302, + "step": 5379 + }, + { + "epoch": 0.29296250161660303, + "grad_norm": 0.7573733554612111, + "learning_rate": 0.00019296829648451404, + "loss": 12.6814, + "step": 5380 + }, + { + "epoch": 0.293016955613186, + "grad_norm": 0.6483541127785282, + "learning_rate": 0.00019296504782354408, + "loss": 12.5151, + "step": 5381 + }, + { + "epoch": 0.2930714096097691, + "grad_norm": 0.6872342229388375, + "learning_rate": 0.0001929617984396623, + "loss": 12.5216, + "step": 5382 + }, + { + "epoch": 0.29312586360635207, + "grad_norm": 0.7146685548807061, + "learning_rate": 0.000192958548332894, + "loss": 12.8383, + "step": 5383 + }, + { + "epoch": 0.29318031760293506, + "grad_norm": 0.7128076251945845, + "learning_rate": 0.00019295529750326443, + "loss": 12.5799, + "step": 5384 + }, + { + "epoch": 0.2932347715995181, + "grad_norm": 0.6913826142462048, + "learning_rate": 0.0001929520459507989, + "loss": 12.5626, + "step": 5385 + }, + { + "epoch": 0.2932892255961011, + "grad_norm": 0.7745424972710939, + "learning_rate": 0.00019294879367552263, + "loss": 12.6744, + "step": 5386 + }, + { + "epoch": 0.2933436795926841, + "grad_norm": 0.6975593164609588, + "learning_rate": 0.00019294554067746098, + "loss": 12.5228, + "step": 5387 + }, + { + "epoch": 0.29339813358926714, + "grad_norm": 0.7678680088261888, + "learning_rate": 0.0001929422869566392, + "loss": 12.6008, + "step": 5388 + }, + { + "epoch": 0.29345258758585013, + "grad_norm": 0.692367853277397, + "learning_rate": 0.00019293903251308266, + "loss": 12.6218, + "step": 5389 + }, + { + "epoch": 0.2935070415824331, + "grad_norm": 0.8232991412974388, + "learning_rate": 0.00019293577734681656, + "loss": 12.7118, + "step": 5390 + }, + { + "epoch": 0.2935614955790162, + "grad_norm": 0.6167356791640606, + "learning_rate": 0.0001929325214578663, + "loss": 12.4945, + "step": 5391 + }, + { + "epoch": 0.29361594957559917, + "grad_norm": 0.6438436500011632, + "learning_rate": 0.00019292926484625714, + "loss": 12.6473, + "step": 5392 + }, + { + "epoch": 0.29367040357218216, + "grad_norm": 0.7587583883204099, + "learning_rate": 0.00019292600751201448, + "loss": 12.5496, + "step": 5393 + }, + { + "epoch": 0.2937248575687652, + "grad_norm": 0.7076624710372246, + "learning_rate": 0.00019292274945516359, + "loss": 12.3377, + "step": 5394 + }, + { + "epoch": 0.2937793115653482, + "grad_norm": 0.8745028515304532, + "learning_rate": 0.00019291949067572978, + "loss": 12.5201, + "step": 5395 + }, + { + "epoch": 0.2938337655619312, + "grad_norm": 0.7570972500445633, + "learning_rate": 0.00019291623117373847, + "loss": 12.5978, + "step": 5396 + }, + { + "epoch": 0.29388821955851424, + "grad_norm": 0.7985090072616512, + "learning_rate": 0.00019291297094921494, + "loss": 12.5774, + "step": 5397 + }, + { + "epoch": 0.29394267355509723, + "grad_norm": 0.8410990037792181, + "learning_rate": 0.00019290971000218457, + "loss": 12.7778, + "step": 5398 + }, + { + "epoch": 0.2939971275516802, + "grad_norm": 0.6325474282429254, + "learning_rate": 0.0001929064483326727, + "loss": 12.5482, + "step": 5399 + }, + { + "epoch": 0.2940515815482633, + "grad_norm": 0.7215869597593331, + "learning_rate": 0.00019290318594070475, + "loss": 12.6419, + "step": 5400 + }, + { + "epoch": 0.29410603554484627, + "grad_norm": 0.7388693424632827, + "learning_rate": 0.00019289992282630602, + "loss": 12.5937, + "step": 5401 + }, + { + "epoch": 0.29416048954142926, + "grad_norm": 0.9311053731409538, + "learning_rate": 0.0001928966589895019, + "loss": 12.673, + "step": 5402 + }, + { + "epoch": 0.2942149435380123, + "grad_norm": 0.7564163948266747, + "learning_rate": 0.00019289339443031778, + "loss": 12.5103, + "step": 5403 + }, + { + "epoch": 0.2942693975345953, + "grad_norm": 0.7845040973885391, + "learning_rate": 0.00019289012914877905, + "loss": 12.5595, + "step": 5404 + }, + { + "epoch": 0.2943238515311783, + "grad_norm": 0.7268199640406403, + "learning_rate": 0.00019288686314491115, + "loss": 12.5784, + "step": 5405 + }, + { + "epoch": 0.29437830552776134, + "grad_norm": 0.7117435618249173, + "learning_rate": 0.00019288359641873935, + "loss": 12.5584, + "step": 5406 + }, + { + "epoch": 0.29443275952434433, + "grad_norm": 0.7430669941676982, + "learning_rate": 0.00019288032897028917, + "loss": 12.5339, + "step": 5407 + }, + { + "epoch": 0.2944872135209273, + "grad_norm": 0.7309612448126213, + "learning_rate": 0.00019287706079958595, + "loss": 12.6017, + "step": 5408 + }, + { + "epoch": 0.2945416675175104, + "grad_norm": 0.7591821592955744, + "learning_rate": 0.00019287379190665517, + "loss": 12.5533, + "step": 5409 + }, + { + "epoch": 0.29459612151409337, + "grad_norm": 0.7451068498540538, + "learning_rate": 0.0001928705222915222, + "loss": 12.4671, + "step": 5410 + }, + { + "epoch": 0.29465057551067636, + "grad_norm": 0.7720985974444161, + "learning_rate": 0.00019286725195421243, + "loss": 12.6444, + "step": 5411 + }, + { + "epoch": 0.2947050295072594, + "grad_norm": 0.686883058089928, + "learning_rate": 0.00019286398089475134, + "loss": 12.5692, + "step": 5412 + }, + { + "epoch": 0.2947594835038424, + "grad_norm": 0.7205956053915606, + "learning_rate": 0.0001928607091131644, + "loss": 12.7151, + "step": 5413 + }, + { + "epoch": 0.29481393750042545, + "grad_norm": 0.7336566961369465, + "learning_rate": 0.000192857436609477, + "loss": 12.5186, + "step": 5414 + }, + { + "epoch": 0.29486839149700844, + "grad_norm": 0.6770882797888417, + "learning_rate": 0.0001928541633837146, + "loss": 12.5483, + "step": 5415 + }, + { + "epoch": 0.29492284549359143, + "grad_norm": 0.6642706676974497, + "learning_rate": 0.00019285088943590267, + "loss": 12.6158, + "step": 5416 + }, + { + "epoch": 0.2949772994901745, + "grad_norm": 0.6612267667643044, + "learning_rate": 0.00019284761476606662, + "loss": 12.6026, + "step": 5417 + }, + { + "epoch": 0.2950317534867575, + "grad_norm": 0.8225131551740218, + "learning_rate": 0.00019284433937423196, + "loss": 12.7422, + "step": 5418 + }, + { + "epoch": 0.29508620748334047, + "grad_norm": 0.9312293048136258, + "learning_rate": 0.00019284106326042415, + "loss": 12.5606, + "step": 5419 + }, + { + "epoch": 0.2951406614799235, + "grad_norm": 0.686764621159285, + "learning_rate": 0.00019283778642466864, + "loss": 12.6587, + "step": 5420 + }, + { + "epoch": 0.2951951154765065, + "grad_norm": 0.7232553800589134, + "learning_rate": 0.000192834508866991, + "loss": 12.6868, + "step": 5421 + }, + { + "epoch": 0.2952495694730895, + "grad_norm": 0.7771593086870223, + "learning_rate": 0.0001928312305874166, + "loss": 12.6444, + "step": 5422 + }, + { + "epoch": 0.29530402346967255, + "grad_norm": 0.6821004747227135, + "learning_rate": 0.00019282795158597098, + "loss": 12.5451, + "step": 5423 + }, + { + "epoch": 0.29535847746625554, + "grad_norm": 0.7158378326219672, + "learning_rate": 0.00019282467186267966, + "loss": 12.621, + "step": 5424 + }, + { + "epoch": 0.29541293146283853, + "grad_norm": 0.748752470958464, + "learning_rate": 0.0001928213914175681, + "loss": 12.6017, + "step": 5425 + }, + { + "epoch": 0.2954673854594216, + "grad_norm": 0.709105723596565, + "learning_rate": 0.00019281811025066183, + "loss": 12.6099, + "step": 5426 + }, + { + "epoch": 0.2955218394560046, + "grad_norm": 0.658589593936804, + "learning_rate": 0.0001928148283619864, + "loss": 12.6061, + "step": 5427 + }, + { + "epoch": 0.29557629345258757, + "grad_norm": 0.7269034855071365, + "learning_rate": 0.0001928115457515673, + "loss": 12.6086, + "step": 5428 + }, + { + "epoch": 0.2956307474491706, + "grad_norm": 0.8569243295411642, + "learning_rate": 0.00019280826241943003, + "loss": 12.7698, + "step": 5429 + }, + { + "epoch": 0.2956852014457536, + "grad_norm": 0.7756779258888888, + "learning_rate": 0.00019280497836560016, + "loss": 12.569, + "step": 5430 + }, + { + "epoch": 0.2957396554423366, + "grad_norm": 0.7274438498794342, + "learning_rate": 0.00019280169359010322, + "loss": 12.7412, + "step": 5431 + }, + { + "epoch": 0.29579410943891965, + "grad_norm": 0.6509255649787569, + "learning_rate": 0.00019279840809296474, + "loss": 12.4884, + "step": 5432 + }, + { + "epoch": 0.29584856343550264, + "grad_norm": 0.6538988898602316, + "learning_rate": 0.0001927951218742103, + "loss": 12.525, + "step": 5433 + }, + { + "epoch": 0.29590301743208564, + "grad_norm": 0.678004336147839, + "learning_rate": 0.00019279183493386542, + "loss": 12.663, + "step": 5434 + }, + { + "epoch": 0.2959574714286687, + "grad_norm": 0.6604899315721932, + "learning_rate": 0.00019278854727195564, + "loss": 12.5462, + "step": 5435 + }, + { + "epoch": 0.2960119254252517, + "grad_norm": 0.650865153177635, + "learning_rate": 0.00019278525888850658, + "loss": 12.5251, + "step": 5436 + }, + { + "epoch": 0.29606637942183467, + "grad_norm": 0.6905167141476374, + "learning_rate": 0.0001927819697835438, + "loss": 12.4289, + "step": 5437 + }, + { + "epoch": 0.2961208334184177, + "grad_norm": 0.6775595408948848, + "learning_rate": 0.00019277867995709286, + "loss": 12.5849, + "step": 5438 + }, + { + "epoch": 0.2961752874150007, + "grad_norm": 0.6750514376962531, + "learning_rate": 0.0001927753894091793, + "loss": 12.5668, + "step": 5439 + }, + { + "epoch": 0.2962297414115837, + "grad_norm": 0.7530309678722148, + "learning_rate": 0.0001927720981398288, + "loss": 12.4954, + "step": 5440 + }, + { + "epoch": 0.29628419540816675, + "grad_norm": 0.6601666121573037, + "learning_rate": 0.0001927688061490669, + "loss": 12.4699, + "step": 5441 + }, + { + "epoch": 0.29633864940474974, + "grad_norm": 0.6954527578087399, + "learning_rate": 0.0001927655134369192, + "loss": 12.5202, + "step": 5442 + }, + { + "epoch": 0.29639310340133274, + "grad_norm": 0.6441927111198292, + "learning_rate": 0.0001927622200034113, + "loss": 12.5601, + "step": 5443 + }, + { + "epoch": 0.2964475573979158, + "grad_norm": 0.6474561946353591, + "learning_rate": 0.00019275892584856883, + "loss": 12.485, + "step": 5444 + }, + { + "epoch": 0.2965020113944988, + "grad_norm": 0.6933453342076746, + "learning_rate": 0.0001927556309724174, + "loss": 12.4715, + "step": 5445 + }, + { + "epoch": 0.29655646539108177, + "grad_norm": 0.7682158885479328, + "learning_rate": 0.00019275233537498264, + "loss": 12.6883, + "step": 5446 + }, + { + "epoch": 0.2966109193876648, + "grad_norm": 0.6739621499325937, + "learning_rate": 0.00019274903905629014, + "loss": 12.6544, + "step": 5447 + }, + { + "epoch": 0.2966653733842478, + "grad_norm": 0.6354207232839544, + "learning_rate": 0.00019274574201636556, + "loss": 12.4936, + "step": 5448 + }, + { + "epoch": 0.29671982738083086, + "grad_norm": 0.658172915455953, + "learning_rate": 0.00019274244425523455, + "loss": 12.6144, + "step": 5449 + }, + { + "epoch": 0.29677428137741385, + "grad_norm": 0.6766134115227272, + "learning_rate": 0.00019273914577292274, + "loss": 12.5919, + "step": 5450 + }, + { + "epoch": 0.29682873537399684, + "grad_norm": 0.6557699542841391, + "learning_rate": 0.0001927358465694558, + "loss": 12.6577, + "step": 5451 + }, + { + "epoch": 0.2968831893705799, + "grad_norm": 0.6875351309441434, + "learning_rate": 0.00019273254664485933, + "loss": 12.6079, + "step": 5452 + }, + { + "epoch": 0.2969376433671629, + "grad_norm": 0.667682654878768, + "learning_rate": 0.00019272924599915902, + "loss": 12.5817, + "step": 5453 + }, + { + "epoch": 0.2969920973637459, + "grad_norm": 0.6779324528523384, + "learning_rate": 0.00019272594463238057, + "loss": 12.5989, + "step": 5454 + }, + { + "epoch": 0.2970465513603289, + "grad_norm": 0.624243957794944, + "learning_rate": 0.00019272264254454962, + "loss": 12.521, + "step": 5455 + }, + { + "epoch": 0.2971010053569119, + "grad_norm": 1.088983734238408, + "learning_rate": 0.00019271933973569186, + "loss": 12.5177, + "step": 5456 + }, + { + "epoch": 0.2971554593534949, + "grad_norm": 0.7190029177790371, + "learning_rate": 0.00019271603620583293, + "loss": 12.5043, + "step": 5457 + }, + { + "epoch": 0.29720991335007796, + "grad_norm": 0.6624008667037063, + "learning_rate": 0.00019271273195499856, + "loss": 12.5598, + "step": 5458 + }, + { + "epoch": 0.29726436734666095, + "grad_norm": 0.829880367133911, + "learning_rate": 0.0001927094269832145, + "loss": 12.7031, + "step": 5459 + }, + { + "epoch": 0.29731882134324394, + "grad_norm": 0.6624179427876619, + "learning_rate": 0.00019270612129050632, + "loss": 12.496, + "step": 5460 + }, + { + "epoch": 0.297373275339827, + "grad_norm": 0.7302525172954663, + "learning_rate": 0.00019270281487689982, + "loss": 12.6206, + "step": 5461 + }, + { + "epoch": 0.29742772933641, + "grad_norm": 0.7417326761674216, + "learning_rate": 0.0001926995077424207, + "loss": 12.7291, + "step": 5462 + }, + { + "epoch": 0.297482183332993, + "grad_norm": 0.6515096566908593, + "learning_rate": 0.00019269619988709466, + "loss": 12.4573, + "step": 5463 + }, + { + "epoch": 0.297536637329576, + "grad_norm": 0.660489757020803, + "learning_rate": 0.0001926928913109474, + "loss": 12.6159, + "step": 5464 + }, + { + "epoch": 0.297591091326159, + "grad_norm": 0.6976139432730505, + "learning_rate": 0.00019268958201400466, + "loss": 12.4181, + "step": 5465 + }, + { + "epoch": 0.297645545322742, + "grad_norm": 0.7180785514358637, + "learning_rate": 0.0001926862719962922, + "loss": 12.5845, + "step": 5466 + }, + { + "epoch": 0.29769999931932506, + "grad_norm": 0.6611172259802919, + "learning_rate": 0.00019268296125783576, + "loss": 12.6051, + "step": 5467 + }, + { + "epoch": 0.29775445331590805, + "grad_norm": 0.836722423389593, + "learning_rate": 0.00019267964979866108, + "loss": 12.5857, + "step": 5468 + }, + { + "epoch": 0.29780890731249104, + "grad_norm": 0.6652025661518768, + "learning_rate": 0.0001926763376187939, + "loss": 12.6493, + "step": 5469 + }, + { + "epoch": 0.2978633613090741, + "grad_norm": 0.7509908152819234, + "learning_rate": 0.00019267302471825994, + "loss": 12.657, + "step": 5470 + }, + { + "epoch": 0.2979178153056571, + "grad_norm": 0.6440705345244854, + "learning_rate": 0.00019266971109708502, + "loss": 12.6235, + "step": 5471 + }, + { + "epoch": 0.2979722693022401, + "grad_norm": 0.7864417869922518, + "learning_rate": 0.0001926663967552949, + "loss": 12.6673, + "step": 5472 + }, + { + "epoch": 0.2980267232988231, + "grad_norm": 0.675295748172897, + "learning_rate": 0.00019266308169291533, + "loss": 12.5634, + "step": 5473 + }, + { + "epoch": 0.2980811772954061, + "grad_norm": 0.7196578050262258, + "learning_rate": 0.00019265976590997208, + "loss": 12.5545, + "step": 5474 + }, + { + "epoch": 0.2981356312919891, + "grad_norm": 0.771184431209636, + "learning_rate": 0.00019265644940649095, + "loss": 12.6641, + "step": 5475 + }, + { + "epoch": 0.29819008528857216, + "grad_norm": 0.6625549461496372, + "learning_rate": 0.00019265313218249776, + "loss": 12.646, + "step": 5476 + }, + { + "epoch": 0.29824453928515515, + "grad_norm": 0.6714856696134495, + "learning_rate": 0.00019264981423801824, + "loss": 12.4311, + "step": 5477 + }, + { + "epoch": 0.29829899328173815, + "grad_norm": 0.6932694439785722, + "learning_rate": 0.00019264649557307825, + "loss": 12.66, + "step": 5478 + }, + { + "epoch": 0.2983534472783212, + "grad_norm": 0.7019140285607096, + "learning_rate": 0.00019264317618770358, + "loss": 12.5443, + "step": 5479 + }, + { + "epoch": 0.2984079012749042, + "grad_norm": 0.7285887391443051, + "learning_rate": 0.00019263985608192004, + "loss": 12.5274, + "step": 5480 + }, + { + "epoch": 0.29846235527148723, + "grad_norm": 0.6743474455149153, + "learning_rate": 0.0001926365352557534, + "loss": 12.4967, + "step": 5481 + }, + { + "epoch": 0.2985168092680702, + "grad_norm": 0.7497025945227477, + "learning_rate": 0.00019263321370922956, + "loss": 12.6501, + "step": 5482 + }, + { + "epoch": 0.2985712632646532, + "grad_norm": 0.7301840359950457, + "learning_rate": 0.0001926298914423743, + "loss": 12.5288, + "step": 5483 + }, + { + "epoch": 0.29862571726123627, + "grad_norm": 0.6743215481536896, + "learning_rate": 0.0001926265684552135, + "loss": 12.5529, + "step": 5484 + }, + { + "epoch": 0.29868017125781926, + "grad_norm": 0.7238826933992838, + "learning_rate": 0.00019262324474777297, + "loss": 12.5079, + "step": 5485 + }, + { + "epoch": 0.29873462525440225, + "grad_norm": 0.7257838708320444, + "learning_rate": 0.00019261992032007852, + "loss": 12.5381, + "step": 5486 + }, + { + "epoch": 0.2987890792509853, + "grad_norm": 0.7158109479170218, + "learning_rate": 0.00019261659517215608, + "loss": 12.555, + "step": 5487 + }, + { + "epoch": 0.2988435332475683, + "grad_norm": 0.8007473442264141, + "learning_rate": 0.00019261326930403142, + "loss": 12.6532, + "step": 5488 + }, + { + "epoch": 0.2988979872441513, + "grad_norm": 0.6721247192677079, + "learning_rate": 0.00019260994271573048, + "loss": 12.6069, + "step": 5489 + }, + { + "epoch": 0.29895244124073433, + "grad_norm": 0.7861205605607333, + "learning_rate": 0.00019260661540727907, + "loss": 12.5759, + "step": 5490 + }, + { + "epoch": 0.2990068952373173, + "grad_norm": 0.6645713018669855, + "learning_rate": 0.0001926032873787031, + "loss": 12.55, + "step": 5491 + }, + { + "epoch": 0.2990613492339003, + "grad_norm": 0.6227237580241888, + "learning_rate": 0.00019259995863002844, + "loss": 12.5693, + "step": 5492 + }, + { + "epoch": 0.29911580323048337, + "grad_norm": 0.7182566580451716, + "learning_rate": 0.00019259662916128097, + "loss": 12.6455, + "step": 5493 + }, + { + "epoch": 0.29917025722706636, + "grad_norm": 0.753398157729987, + "learning_rate": 0.00019259329897248657, + "loss": 12.6879, + "step": 5494 + }, + { + "epoch": 0.29922471122364935, + "grad_norm": 0.6709225872282627, + "learning_rate": 0.00019258996806367117, + "loss": 12.6203, + "step": 5495 + }, + { + "epoch": 0.2992791652202324, + "grad_norm": 0.7117214422038949, + "learning_rate": 0.00019258663643486067, + "loss": 12.569, + "step": 5496 + }, + { + "epoch": 0.2993336192168154, + "grad_norm": 0.6641688877669308, + "learning_rate": 0.00019258330408608088, + "loss": 12.5621, + "step": 5497 + }, + { + "epoch": 0.2993880732133984, + "grad_norm": 0.753378664160101, + "learning_rate": 0.00019257997101735787, + "loss": 12.6489, + "step": 5498 + }, + { + "epoch": 0.29944252720998144, + "grad_norm": 0.6934161511406282, + "learning_rate": 0.00019257663722871746, + "loss": 12.584, + "step": 5499 + }, + { + "epoch": 0.2994969812065644, + "grad_norm": 0.6696183362421898, + "learning_rate": 0.0001925733027201856, + "loss": 12.6701, + "step": 5500 + }, + { + "epoch": 0.2995514352031474, + "grad_norm": 0.7221308273553433, + "learning_rate": 0.0001925699674917882, + "loss": 12.5649, + "step": 5501 + }, + { + "epoch": 0.29960588919973047, + "grad_norm": 0.6253773296377263, + "learning_rate": 0.00019256663154355118, + "loss": 12.4283, + "step": 5502 + }, + { + "epoch": 0.29966034319631346, + "grad_norm": 0.6572697114319784, + "learning_rate": 0.00019256329487550054, + "loss": 12.6613, + "step": 5503 + }, + { + "epoch": 0.29971479719289645, + "grad_norm": 0.747338700790793, + "learning_rate": 0.0001925599574876622, + "loss": 12.5483, + "step": 5504 + }, + { + "epoch": 0.2997692511894795, + "grad_norm": 0.6906470887856688, + "learning_rate": 0.0001925566193800621, + "loss": 12.5226, + "step": 5505 + }, + { + "epoch": 0.2998237051860625, + "grad_norm": 0.7248451578017936, + "learning_rate": 0.00019255328055272624, + "loss": 12.3915, + "step": 5506 + }, + { + "epoch": 0.2998781591826455, + "grad_norm": 0.6518355865071961, + "learning_rate": 0.0001925499410056805, + "loss": 12.5653, + "step": 5507 + }, + { + "epoch": 0.29993261317922854, + "grad_norm": 0.6928218232220432, + "learning_rate": 0.00019254660073895092, + "loss": 12.5504, + "step": 5508 + }, + { + "epoch": 0.29998706717581153, + "grad_norm": 0.7400814092127778, + "learning_rate": 0.00019254325975256344, + "loss": 12.6619, + "step": 5509 + }, + { + "epoch": 0.3000415211723945, + "grad_norm": 0.6781743427380154, + "learning_rate": 0.00019253991804654407, + "loss": 12.4567, + "step": 5510 + }, + { + "epoch": 0.30009597516897757, + "grad_norm": 0.6651358657090074, + "learning_rate": 0.00019253657562091876, + "loss": 12.6494, + "step": 5511 + }, + { + "epoch": 0.30015042916556056, + "grad_norm": 0.7184827329752417, + "learning_rate": 0.00019253323247571356, + "loss": 12.6796, + "step": 5512 + }, + { + "epoch": 0.30020488316214355, + "grad_norm": 0.9321121206445516, + "learning_rate": 0.0001925298886109544, + "loss": 12.7286, + "step": 5513 + }, + { + "epoch": 0.3002593371587266, + "grad_norm": 0.7634100043428852, + "learning_rate": 0.00019252654402666727, + "loss": 12.7075, + "step": 5514 + }, + { + "epoch": 0.3003137911553096, + "grad_norm": 0.6500345565420961, + "learning_rate": 0.00019252319872287824, + "loss": 12.5077, + "step": 5515 + }, + { + "epoch": 0.30036824515189264, + "grad_norm": 0.7131473979329023, + "learning_rate": 0.00019251985269961336, + "loss": 12.6077, + "step": 5516 + }, + { + "epoch": 0.30042269914847564, + "grad_norm": 0.7203919344303918, + "learning_rate": 0.00019251650595689853, + "loss": 12.56, + "step": 5517 + }, + { + "epoch": 0.30047715314505863, + "grad_norm": 0.6665760403028634, + "learning_rate": 0.00019251315849475983, + "loss": 12.578, + "step": 5518 + }, + { + "epoch": 0.3005316071416417, + "grad_norm": 0.7873972857774381, + "learning_rate": 0.00019250981031322334, + "loss": 12.6244, + "step": 5519 + }, + { + "epoch": 0.30058606113822467, + "grad_norm": 0.6749580655656026, + "learning_rate": 0.00019250646141231502, + "loss": 12.4792, + "step": 5520 + }, + { + "epoch": 0.30064051513480766, + "grad_norm": 0.670393709534476, + "learning_rate": 0.00019250311179206092, + "loss": 12.4619, + "step": 5521 + }, + { + "epoch": 0.3006949691313907, + "grad_norm": 0.6852404075991394, + "learning_rate": 0.00019249976145248714, + "loss": 12.5381, + "step": 5522 + }, + { + "epoch": 0.3007494231279737, + "grad_norm": 0.7438713110773497, + "learning_rate": 0.0001924964103936197, + "loss": 12.5529, + "step": 5523 + }, + { + "epoch": 0.3008038771245567, + "grad_norm": 0.9422519483715919, + "learning_rate": 0.00019249305861548466, + "loss": 12.4879, + "step": 5524 + }, + { + "epoch": 0.30085833112113974, + "grad_norm": 0.7523445937435863, + "learning_rate": 0.00019248970611810808, + "loss": 12.572, + "step": 5525 + }, + { + "epoch": 0.30091278511772274, + "grad_norm": 0.7152895724954156, + "learning_rate": 0.00019248635290151605, + "loss": 12.5248, + "step": 5526 + }, + { + "epoch": 0.30096723911430573, + "grad_norm": 0.6343277394603853, + "learning_rate": 0.0001924829989657346, + "loss": 12.5778, + "step": 5527 + }, + { + "epoch": 0.3010216931108888, + "grad_norm": 0.8243064110016005, + "learning_rate": 0.00019247964431078987, + "loss": 12.5656, + "step": 5528 + }, + { + "epoch": 0.30107614710747177, + "grad_norm": 0.691016405989972, + "learning_rate": 0.0001924762889367079, + "loss": 12.5543, + "step": 5529 + }, + { + "epoch": 0.30113060110405476, + "grad_norm": 0.6264685213458856, + "learning_rate": 0.00019247293284351482, + "loss": 12.443, + "step": 5530 + }, + { + "epoch": 0.3011850551006378, + "grad_norm": 0.7283528943500993, + "learning_rate": 0.00019246957603123667, + "loss": 12.7175, + "step": 5531 + }, + { + "epoch": 0.3012395090972208, + "grad_norm": 0.7925468803778901, + "learning_rate": 0.00019246621849989962, + "loss": 12.6486, + "step": 5532 + }, + { + "epoch": 0.3012939630938038, + "grad_norm": 0.7083148796953568, + "learning_rate": 0.00019246286024952975, + "loss": 12.4917, + "step": 5533 + }, + { + "epoch": 0.30134841709038684, + "grad_norm": 0.6956661001652812, + "learning_rate": 0.00019245950128015315, + "loss": 12.6302, + "step": 5534 + }, + { + "epoch": 0.30140287108696984, + "grad_norm": 0.7563057436080856, + "learning_rate": 0.000192456141591796, + "loss": 12.224, + "step": 5535 + }, + { + "epoch": 0.30145732508355283, + "grad_norm": 0.6947294851805417, + "learning_rate": 0.00019245278118448436, + "loss": 12.5657, + "step": 5536 + }, + { + "epoch": 0.3015117790801359, + "grad_norm": 0.824002618811491, + "learning_rate": 0.00019244942005824437, + "loss": 12.6116, + "step": 5537 + }, + { + "epoch": 0.30156623307671887, + "grad_norm": 0.7311410585858291, + "learning_rate": 0.00019244605821310223, + "loss": 12.5115, + "step": 5538 + }, + { + "epoch": 0.30162068707330186, + "grad_norm": 0.6903199645204022, + "learning_rate": 0.000192442695649084, + "loss": 12.6615, + "step": 5539 + }, + { + "epoch": 0.3016751410698849, + "grad_norm": 0.7494884431333453, + "learning_rate": 0.0001924393323662159, + "loss": 12.6129, + "step": 5540 + }, + { + "epoch": 0.3017295950664679, + "grad_norm": 0.6206778446236714, + "learning_rate": 0.00019243596836452404, + "loss": 12.5177, + "step": 5541 + }, + { + "epoch": 0.3017840490630509, + "grad_norm": 0.6255558313734257, + "learning_rate": 0.00019243260364403458, + "loss": 12.503, + "step": 5542 + }, + { + "epoch": 0.30183850305963394, + "grad_norm": 0.7196447240519136, + "learning_rate": 0.00019242923820477368, + "loss": 12.4861, + "step": 5543 + }, + { + "epoch": 0.30189295705621694, + "grad_norm": 0.6812891385581262, + "learning_rate": 0.00019242587204676754, + "loss": 12.5954, + "step": 5544 + }, + { + "epoch": 0.30194741105279993, + "grad_norm": 0.7749909003181377, + "learning_rate": 0.0001924225051700423, + "loss": 12.7635, + "step": 5545 + }, + { + "epoch": 0.302001865049383, + "grad_norm": 0.6774056944182675, + "learning_rate": 0.00019241913757462418, + "loss": 12.6497, + "step": 5546 + }, + { + "epoch": 0.30205631904596597, + "grad_norm": 0.7271543948033461, + "learning_rate": 0.00019241576926053936, + "loss": 12.5526, + "step": 5547 + }, + { + "epoch": 0.302110773042549, + "grad_norm": 0.7058606456982123, + "learning_rate": 0.00019241240022781398, + "loss": 12.5594, + "step": 5548 + }, + { + "epoch": 0.302165227039132, + "grad_norm": 0.6679446080027783, + "learning_rate": 0.0001924090304764743, + "loss": 12.6517, + "step": 5549 + }, + { + "epoch": 0.302219681035715, + "grad_norm": 0.749234625813053, + "learning_rate": 0.00019240566000654653, + "loss": 12.6246, + "step": 5550 + }, + { + "epoch": 0.30227413503229805, + "grad_norm": 0.7261391403812401, + "learning_rate": 0.0001924022888180568, + "loss": 12.5897, + "step": 5551 + }, + { + "epoch": 0.30232858902888105, + "grad_norm": 0.7054391146170403, + "learning_rate": 0.00019239891691103143, + "loss": 12.5725, + "step": 5552 + }, + { + "epoch": 0.30238304302546404, + "grad_norm": 0.7040813463132223, + "learning_rate": 0.00019239554428549655, + "loss": 12.5343, + "step": 5553 + }, + { + "epoch": 0.3024374970220471, + "grad_norm": 0.7371343689528318, + "learning_rate": 0.00019239217094147844, + "loss": 12.7817, + "step": 5554 + }, + { + "epoch": 0.3024919510186301, + "grad_norm": 0.6871079768052895, + "learning_rate": 0.0001923887968790033, + "loss": 12.5964, + "step": 5555 + }, + { + "epoch": 0.30254640501521307, + "grad_norm": 0.789857271839963, + "learning_rate": 0.0001923854220980974, + "loss": 12.5383, + "step": 5556 + }, + { + "epoch": 0.3026008590117961, + "grad_norm": 0.640539924809829, + "learning_rate": 0.00019238204659878692, + "loss": 12.4824, + "step": 5557 + }, + { + "epoch": 0.3026553130083791, + "grad_norm": 0.727503213368253, + "learning_rate": 0.00019237867038109823, + "loss": 12.6039, + "step": 5558 + }, + { + "epoch": 0.3027097670049621, + "grad_norm": 0.773168800791757, + "learning_rate": 0.00019237529344505745, + "loss": 12.5989, + "step": 5559 + }, + { + "epoch": 0.30276422100154515, + "grad_norm": 0.6301257451951049, + "learning_rate": 0.0001923719157906909, + "loss": 12.4452, + "step": 5560 + }, + { + "epoch": 0.30281867499812815, + "grad_norm": 0.7305946483618813, + "learning_rate": 0.00019236853741802485, + "loss": 12.6938, + "step": 5561 + }, + { + "epoch": 0.30287312899471114, + "grad_norm": 0.8053865211101275, + "learning_rate": 0.00019236515832708558, + "loss": 12.6093, + "step": 5562 + }, + { + "epoch": 0.3029275829912942, + "grad_norm": 0.9633833175581472, + "learning_rate": 0.00019236177851789931, + "loss": 12.6175, + "step": 5563 + }, + { + "epoch": 0.3029820369878772, + "grad_norm": 0.6972993933779662, + "learning_rate": 0.0001923583979904924, + "loss": 12.4328, + "step": 5564 + }, + { + "epoch": 0.30303649098446017, + "grad_norm": 0.778723825604669, + "learning_rate": 0.0001923550167448911, + "loss": 12.592, + "step": 5565 + }, + { + "epoch": 0.3030909449810432, + "grad_norm": 0.6520394554064838, + "learning_rate": 0.00019235163478112166, + "loss": 12.5261, + "step": 5566 + }, + { + "epoch": 0.3031453989776262, + "grad_norm": 0.6738229807373549, + "learning_rate": 0.00019234825209921047, + "loss": 12.4971, + "step": 5567 + }, + { + "epoch": 0.3031998529742092, + "grad_norm": 0.7779394650488719, + "learning_rate": 0.00019234486869918377, + "loss": 12.5064, + "step": 5568 + }, + { + "epoch": 0.30325430697079225, + "grad_norm": 0.678783994289326, + "learning_rate": 0.00019234148458106785, + "loss": 12.5691, + "step": 5569 + }, + { + "epoch": 0.30330876096737525, + "grad_norm": 0.830383539000743, + "learning_rate": 0.0001923380997448891, + "loss": 12.559, + "step": 5570 + }, + { + "epoch": 0.30336321496395824, + "grad_norm": 0.7130216579918996, + "learning_rate": 0.00019233471419067378, + "loss": 12.5613, + "step": 5571 + }, + { + "epoch": 0.3034176689605413, + "grad_norm": 0.6777881662814074, + "learning_rate": 0.00019233132791844827, + "loss": 12.4879, + "step": 5572 + }, + { + "epoch": 0.3034721229571243, + "grad_norm": 0.6787392866295541, + "learning_rate": 0.00019232794092823884, + "loss": 12.5894, + "step": 5573 + }, + { + "epoch": 0.30352657695370727, + "grad_norm": 0.694352921112891, + "learning_rate": 0.00019232455322007184, + "loss": 12.4459, + "step": 5574 + }, + { + "epoch": 0.3035810309502903, + "grad_norm": 0.7222532512087805, + "learning_rate": 0.00019232116479397365, + "loss": 12.6226, + "step": 5575 + }, + { + "epoch": 0.3036354849468733, + "grad_norm": 0.7314508042146828, + "learning_rate": 0.0001923177756499706, + "loss": 12.689, + "step": 5576 + }, + { + "epoch": 0.3036899389434563, + "grad_norm": 0.6587607739908065, + "learning_rate": 0.00019231438578808907, + "loss": 12.4694, + "step": 5577 + }, + { + "epoch": 0.30374439294003935, + "grad_norm": 0.7635540326543027, + "learning_rate": 0.00019231099520835535, + "loss": 12.5989, + "step": 5578 + }, + { + "epoch": 0.30379884693662235, + "grad_norm": 0.7361430250165957, + "learning_rate": 0.0001923076039107959, + "loss": 12.5804, + "step": 5579 + }, + { + "epoch": 0.30385330093320534, + "grad_norm": 0.7585903650131236, + "learning_rate": 0.000192304211895437, + "loss": 12.5181, + "step": 5580 + }, + { + "epoch": 0.3039077549297884, + "grad_norm": 0.7600365444488244, + "learning_rate": 0.0001923008191623051, + "loss": 12.5854, + "step": 5581 + }, + { + "epoch": 0.3039622089263714, + "grad_norm": 0.8182750314826289, + "learning_rate": 0.00019229742571142655, + "loss": 12.4639, + "step": 5582 + }, + { + "epoch": 0.30401666292295443, + "grad_norm": 0.7193099733132268, + "learning_rate": 0.00019229403154282773, + "loss": 12.4747, + "step": 5583 + }, + { + "epoch": 0.3040711169195374, + "grad_norm": 0.7270686488441531, + "learning_rate": 0.00019229063665653504, + "loss": 12.5602, + "step": 5584 + }, + { + "epoch": 0.3041255709161204, + "grad_norm": 0.7135070121903767, + "learning_rate": 0.00019228724105257487, + "loss": 12.5432, + "step": 5585 + }, + { + "epoch": 0.30418002491270346, + "grad_norm": 0.6997931480434294, + "learning_rate": 0.00019228384473097366, + "loss": 12.6373, + "step": 5586 + }, + { + "epoch": 0.30423447890928645, + "grad_norm": 0.7479379443675384, + "learning_rate": 0.0001922804476917578, + "loss": 12.4373, + "step": 5587 + }, + { + "epoch": 0.30428893290586945, + "grad_norm": 0.6981561283860745, + "learning_rate": 0.0001922770499349537, + "loss": 12.6439, + "step": 5588 + }, + { + "epoch": 0.3043433869024525, + "grad_norm": 0.6339087812412747, + "learning_rate": 0.00019227365146058775, + "loss": 12.5173, + "step": 5589 + }, + { + "epoch": 0.3043978408990355, + "grad_norm": 0.8609219927489593, + "learning_rate": 0.00019227025226868644, + "loss": 12.5421, + "step": 5590 + }, + { + "epoch": 0.3044522948956185, + "grad_norm": 0.7118254434995729, + "learning_rate": 0.00019226685235927617, + "loss": 12.5478, + "step": 5591 + }, + { + "epoch": 0.30450674889220153, + "grad_norm": 0.6794508342052569, + "learning_rate": 0.0001922634517323834, + "loss": 12.5299, + "step": 5592 + }, + { + "epoch": 0.3045612028887845, + "grad_norm": 0.6586240853434104, + "learning_rate": 0.00019226005038803452, + "loss": 12.6055, + "step": 5593 + }, + { + "epoch": 0.3046156568853675, + "grad_norm": 0.77236087411886, + "learning_rate": 0.00019225664832625604, + "loss": 12.5975, + "step": 5594 + }, + { + "epoch": 0.30467011088195056, + "grad_norm": 0.6719406111328031, + "learning_rate": 0.0001922532455470744, + "loss": 12.5498, + "step": 5595 + }, + { + "epoch": 0.30472456487853355, + "grad_norm": 0.6505332143229099, + "learning_rate": 0.00019224984205051603, + "loss": 12.5016, + "step": 5596 + }, + { + "epoch": 0.30477901887511655, + "grad_norm": 0.6457023063792332, + "learning_rate": 0.00019224643783660744, + "loss": 12.5565, + "step": 5597 + }, + { + "epoch": 0.3048334728716996, + "grad_norm": 0.7990481600920323, + "learning_rate": 0.00019224303290537508, + "loss": 12.7244, + "step": 5598 + }, + { + "epoch": 0.3048879268682826, + "grad_norm": 0.6815455908401977, + "learning_rate": 0.00019223962725684542, + "loss": 12.5248, + "step": 5599 + }, + { + "epoch": 0.3049423808648656, + "grad_norm": 0.7270449504186137, + "learning_rate": 0.0001922362208910449, + "loss": 12.7111, + "step": 5600 + }, + { + "epoch": 0.30499683486144863, + "grad_norm": 0.7763686333017574, + "learning_rate": 0.0001922328138080001, + "loss": 12.7218, + "step": 5601 + }, + { + "epoch": 0.3050512888580316, + "grad_norm": 0.7284964808752892, + "learning_rate": 0.0001922294060077375, + "loss": 12.5986, + "step": 5602 + }, + { + "epoch": 0.3051057428546146, + "grad_norm": 0.6652999267288491, + "learning_rate": 0.00019222599749028354, + "loss": 12.6224, + "step": 5603 + }, + { + "epoch": 0.30516019685119766, + "grad_norm": 0.6816881250974832, + "learning_rate": 0.00019222258825566478, + "loss": 12.5041, + "step": 5604 + }, + { + "epoch": 0.30521465084778066, + "grad_norm": 1.0545143198680806, + "learning_rate": 0.0001922191783039077, + "loss": 12.4324, + "step": 5605 + }, + { + "epoch": 0.30526910484436365, + "grad_norm": 0.6908851851058719, + "learning_rate": 0.00019221576763503882, + "loss": 12.5667, + "step": 5606 + }, + { + "epoch": 0.3053235588409467, + "grad_norm": 0.7155566720462256, + "learning_rate": 0.00019221235624908466, + "loss": 12.4893, + "step": 5607 + }, + { + "epoch": 0.3053780128375297, + "grad_norm": 0.7677589294724539, + "learning_rate": 0.00019220894414607176, + "loss": 12.5424, + "step": 5608 + }, + { + "epoch": 0.3054324668341127, + "grad_norm": 0.6674925417573994, + "learning_rate": 0.00019220553132602664, + "loss": 12.4021, + "step": 5609 + }, + { + "epoch": 0.30548692083069573, + "grad_norm": 0.7080962937903494, + "learning_rate": 0.00019220211778897585, + "loss": 12.6609, + "step": 5610 + }, + { + "epoch": 0.3055413748272787, + "grad_norm": 0.6452885530540028, + "learning_rate": 0.0001921987035349459, + "loss": 12.4599, + "step": 5611 + }, + { + "epoch": 0.3055958288238617, + "grad_norm": 0.6681628070045235, + "learning_rate": 0.00019219528856396343, + "loss": 12.5753, + "step": 5612 + }, + { + "epoch": 0.30565028282044476, + "grad_norm": 0.6669997947535996, + "learning_rate": 0.00019219187287605491, + "loss": 12.4837, + "step": 5613 + }, + { + "epoch": 0.30570473681702776, + "grad_norm": 0.7083059515105996, + "learning_rate": 0.0001921884564712469, + "loss": 12.6538, + "step": 5614 + }, + { + "epoch": 0.3057591908136108, + "grad_norm": 0.642663359481727, + "learning_rate": 0.00019218503934956602, + "loss": 12.5575, + "step": 5615 + }, + { + "epoch": 0.3058136448101938, + "grad_norm": 0.6625459280194421, + "learning_rate": 0.0001921816215110388, + "loss": 12.6102, + "step": 5616 + }, + { + "epoch": 0.3058680988067768, + "grad_norm": 0.7182820810684242, + "learning_rate": 0.00019217820295569185, + "loss": 12.6836, + "step": 5617 + }, + { + "epoch": 0.30592255280335984, + "grad_norm": 0.753708065496197, + "learning_rate": 0.00019217478368355173, + "loss": 12.557, + "step": 5618 + }, + { + "epoch": 0.30597700679994283, + "grad_norm": 0.7136433532288005, + "learning_rate": 0.00019217136369464503, + "loss": 12.6235, + "step": 5619 + }, + { + "epoch": 0.3060314607965258, + "grad_norm": 0.7244893505472123, + "learning_rate": 0.00019216794298899833, + "loss": 12.5793, + "step": 5620 + }, + { + "epoch": 0.30608591479310887, + "grad_norm": 0.6818378516149921, + "learning_rate": 0.00019216452156663828, + "loss": 12.5673, + "step": 5621 + }, + { + "epoch": 0.30614036878969186, + "grad_norm": 0.6811251930879814, + "learning_rate": 0.00019216109942759145, + "loss": 12.6225, + "step": 5622 + }, + { + "epoch": 0.30619482278627486, + "grad_norm": 0.6603144888339486, + "learning_rate": 0.00019215767657188444, + "loss": 12.4698, + "step": 5623 + }, + { + "epoch": 0.3062492767828579, + "grad_norm": 0.6823421129114194, + "learning_rate": 0.00019215425299954389, + "loss": 12.5283, + "step": 5624 + }, + { + "epoch": 0.3063037307794409, + "grad_norm": 0.6790904544641594, + "learning_rate": 0.0001921508287105964, + "loss": 12.5425, + "step": 5625 + }, + { + "epoch": 0.3063581847760239, + "grad_norm": 0.6295559442425349, + "learning_rate": 0.00019214740370506863, + "loss": 12.554, + "step": 5626 + }, + { + "epoch": 0.30641263877260694, + "grad_norm": 0.6822492643876135, + "learning_rate": 0.0001921439779829872, + "loss": 12.5317, + "step": 5627 + }, + { + "epoch": 0.30646709276918993, + "grad_norm": 0.7730070563114577, + "learning_rate": 0.00019214055154437873, + "loss": 12.6159, + "step": 5628 + }, + { + "epoch": 0.3065215467657729, + "grad_norm": 0.865180713811351, + "learning_rate": 0.00019213712438926987, + "loss": 12.6358, + "step": 5629 + }, + { + "epoch": 0.30657600076235597, + "grad_norm": 0.651411926195066, + "learning_rate": 0.00019213369651768732, + "loss": 12.5468, + "step": 5630 + }, + { + "epoch": 0.30663045475893896, + "grad_norm": 0.6433491563576922, + "learning_rate": 0.00019213026792965767, + "loss": 12.5849, + "step": 5631 + }, + { + "epoch": 0.30668490875552196, + "grad_norm": 0.7643907663694384, + "learning_rate": 0.00019212683862520756, + "loss": 12.7352, + "step": 5632 + }, + { + "epoch": 0.306739362752105, + "grad_norm": 0.6868282637682263, + "learning_rate": 0.00019212340860436377, + "loss": 12.5962, + "step": 5633 + }, + { + "epoch": 0.306793816748688, + "grad_norm": 0.7598560634641103, + "learning_rate": 0.00019211997786715287, + "loss": 12.6059, + "step": 5634 + }, + { + "epoch": 0.306848270745271, + "grad_norm": 0.7288171464459738, + "learning_rate": 0.0001921165464136016, + "loss": 12.5483, + "step": 5635 + }, + { + "epoch": 0.30690272474185404, + "grad_norm": 0.6673891343969206, + "learning_rate": 0.00019211311424373662, + "loss": 12.6215, + "step": 5636 + }, + { + "epoch": 0.30695717873843703, + "grad_norm": 0.6744339225596823, + "learning_rate": 0.00019210968135758457, + "loss": 12.5486, + "step": 5637 + }, + { + "epoch": 0.30701163273502, + "grad_norm": 0.7259913596649055, + "learning_rate": 0.00019210624775517225, + "loss": 12.5398, + "step": 5638 + }, + { + "epoch": 0.30706608673160307, + "grad_norm": 0.6830246301584015, + "learning_rate": 0.00019210281343652626, + "loss": 12.7559, + "step": 5639 + }, + { + "epoch": 0.30712054072818606, + "grad_norm": 0.6785998966172879, + "learning_rate": 0.00019209937840167338, + "loss": 12.4974, + "step": 5640 + }, + { + "epoch": 0.30717499472476906, + "grad_norm": 0.8090827555182567, + "learning_rate": 0.00019209594265064025, + "loss": 12.5941, + "step": 5641 + }, + { + "epoch": 0.3072294487213521, + "grad_norm": 0.7481489237592109, + "learning_rate": 0.00019209250618345368, + "loss": 12.5914, + "step": 5642 + }, + { + "epoch": 0.3072839027179351, + "grad_norm": 0.6838197491142659, + "learning_rate": 0.0001920890690001403, + "loss": 12.6042, + "step": 5643 + }, + { + "epoch": 0.3073383567145181, + "grad_norm": 0.7250501351581554, + "learning_rate": 0.00019208563110072687, + "loss": 12.512, + "step": 5644 + }, + { + "epoch": 0.30739281071110114, + "grad_norm": 0.8233771978321007, + "learning_rate": 0.00019208219248524014, + "loss": 12.5859, + "step": 5645 + }, + { + "epoch": 0.30744726470768413, + "grad_norm": 0.6777946678039877, + "learning_rate": 0.0001920787531537068, + "loss": 12.6482, + "step": 5646 + }, + { + "epoch": 0.3075017187042671, + "grad_norm": 0.7460758660384199, + "learning_rate": 0.0001920753131061537, + "loss": 12.6217, + "step": 5647 + }, + { + "epoch": 0.30755617270085017, + "grad_norm": 0.6870262122278785, + "learning_rate": 0.0001920718723426075, + "loss": 12.6333, + "step": 5648 + }, + { + "epoch": 0.30761062669743316, + "grad_norm": 0.7277903111871953, + "learning_rate": 0.00019206843086309498, + "loss": 12.511, + "step": 5649 + }, + { + "epoch": 0.3076650806940162, + "grad_norm": 0.698143994457559, + "learning_rate": 0.00019206498866764288, + "loss": 12.5323, + "step": 5650 + }, + { + "epoch": 0.3077195346905992, + "grad_norm": 0.6871120504062017, + "learning_rate": 0.00019206154575627802, + "loss": 12.3899, + "step": 5651 + }, + { + "epoch": 0.3077739886871822, + "grad_norm": 0.8309989003900052, + "learning_rate": 0.00019205810212902713, + "loss": 12.3704, + "step": 5652 + }, + { + "epoch": 0.30782844268376525, + "grad_norm": 0.8425343080407433, + "learning_rate": 0.00019205465778591698, + "loss": 12.6307, + "step": 5653 + }, + { + "epoch": 0.30788289668034824, + "grad_norm": 0.6798149257167375, + "learning_rate": 0.0001920512127269744, + "loss": 12.6095, + "step": 5654 + }, + { + "epoch": 0.30793735067693123, + "grad_norm": 1.059836564785302, + "learning_rate": 0.00019204776695222616, + "loss": 12.655, + "step": 5655 + }, + { + "epoch": 0.3079918046735143, + "grad_norm": 0.7725726412551481, + "learning_rate": 0.00019204432046169903, + "loss": 12.6174, + "step": 5656 + }, + { + "epoch": 0.3080462586700973, + "grad_norm": 0.8319369769046256, + "learning_rate": 0.0001920408732554198, + "loss": 12.627, + "step": 5657 + }, + { + "epoch": 0.30810071266668027, + "grad_norm": 0.7314135357215268, + "learning_rate": 0.00019203742533341534, + "loss": 12.6254, + "step": 5658 + }, + { + "epoch": 0.3081551666632633, + "grad_norm": 0.761529435714816, + "learning_rate": 0.00019203397669571243, + "loss": 12.6114, + "step": 5659 + }, + { + "epoch": 0.3082096206598463, + "grad_norm": 0.8174963212126977, + "learning_rate": 0.00019203052734233786, + "loss": 12.6282, + "step": 5660 + }, + { + "epoch": 0.3082640746564293, + "grad_norm": 0.6595276890115349, + "learning_rate": 0.0001920270772733185, + "loss": 12.5181, + "step": 5661 + }, + { + "epoch": 0.30831852865301235, + "grad_norm": 0.7448298217772923, + "learning_rate": 0.00019202362648868112, + "loss": 12.5468, + "step": 5662 + }, + { + "epoch": 0.30837298264959534, + "grad_norm": 0.7681547956272415, + "learning_rate": 0.00019202017498845265, + "loss": 12.5377, + "step": 5663 + }, + { + "epoch": 0.30842743664617833, + "grad_norm": 0.6643886990380933, + "learning_rate": 0.00019201672277265982, + "loss": 12.6046, + "step": 5664 + }, + { + "epoch": 0.3084818906427614, + "grad_norm": 0.792858194541144, + "learning_rate": 0.00019201326984132953, + "loss": 12.5158, + "step": 5665 + }, + { + "epoch": 0.3085363446393444, + "grad_norm": 0.7871328263741095, + "learning_rate": 0.00019200981619448863, + "loss": 12.6348, + "step": 5666 + }, + { + "epoch": 0.30859079863592737, + "grad_norm": 0.6989003752746422, + "learning_rate": 0.00019200636183216397, + "loss": 12.5612, + "step": 5667 + }, + { + "epoch": 0.3086452526325104, + "grad_norm": 0.840356018473379, + "learning_rate": 0.0001920029067543824, + "loss": 12.5499, + "step": 5668 + }, + { + "epoch": 0.3086997066290934, + "grad_norm": 0.6120244188572662, + "learning_rate": 0.0001919994509611708, + "loss": 12.5905, + "step": 5669 + }, + { + "epoch": 0.3087541606256764, + "grad_norm": 0.7276319280590692, + "learning_rate": 0.00019199599445255606, + "loss": 12.5964, + "step": 5670 + }, + { + "epoch": 0.30880861462225945, + "grad_norm": 0.7024582912295813, + "learning_rate": 0.000191992537228565, + "loss": 12.5345, + "step": 5671 + }, + { + "epoch": 0.30886306861884244, + "grad_norm": 0.6446613012579606, + "learning_rate": 0.00019198907928922457, + "loss": 12.5057, + "step": 5672 + }, + { + "epoch": 0.30891752261542543, + "grad_norm": 0.7342596906337274, + "learning_rate": 0.00019198562063456163, + "loss": 12.5339, + "step": 5673 + }, + { + "epoch": 0.3089719766120085, + "grad_norm": 0.6693775956414059, + "learning_rate": 0.00019198216126460306, + "loss": 12.497, + "step": 5674 + }, + { + "epoch": 0.3090264306085915, + "grad_norm": 0.654154004134321, + "learning_rate": 0.0001919787011793758, + "loss": 12.5619, + "step": 5675 + }, + { + "epoch": 0.30908088460517447, + "grad_norm": 0.6961529048076709, + "learning_rate": 0.00019197524037890674, + "loss": 12.5442, + "step": 5676 + }, + { + "epoch": 0.3091353386017575, + "grad_norm": 0.6364479952408855, + "learning_rate": 0.0001919717788632228, + "loss": 12.4765, + "step": 5677 + }, + { + "epoch": 0.3091897925983405, + "grad_norm": 0.7031647693885875, + "learning_rate": 0.00019196831663235083, + "loss": 12.496, + "step": 5678 + }, + { + "epoch": 0.3092442465949235, + "grad_norm": 0.6375027088057562, + "learning_rate": 0.00019196485368631785, + "loss": 12.4551, + "step": 5679 + }, + { + "epoch": 0.30929870059150655, + "grad_norm": 0.6646869200935568, + "learning_rate": 0.00019196139002515073, + "loss": 12.6098, + "step": 5680 + }, + { + "epoch": 0.30935315458808954, + "grad_norm": 0.7643407814102523, + "learning_rate": 0.00019195792564887643, + "loss": 12.7625, + "step": 5681 + }, + { + "epoch": 0.3094076085846726, + "grad_norm": 0.6965512867985716, + "learning_rate": 0.00019195446055752187, + "loss": 12.6885, + "step": 5682 + }, + { + "epoch": 0.3094620625812556, + "grad_norm": 0.7183214873245715, + "learning_rate": 0.000191950994751114, + "loss": 12.671, + "step": 5683 + }, + { + "epoch": 0.3095165165778386, + "grad_norm": 0.7341201497898837, + "learning_rate": 0.0001919475282296798, + "loss": 12.5902, + "step": 5684 + }, + { + "epoch": 0.3095709705744216, + "grad_norm": 0.7231880725512029, + "learning_rate": 0.00019194406099324614, + "loss": 12.7504, + "step": 5685 + }, + { + "epoch": 0.3096254245710046, + "grad_norm": 0.8481118115905432, + "learning_rate": 0.0001919405930418401, + "loss": 12.5931, + "step": 5686 + }, + { + "epoch": 0.3096798785675876, + "grad_norm": 0.60463448001354, + "learning_rate": 0.00019193712437548858, + "loss": 12.5207, + "step": 5687 + }, + { + "epoch": 0.30973433256417066, + "grad_norm": 0.8003196827589169, + "learning_rate": 0.00019193365499421857, + "loss": 12.5607, + "step": 5688 + }, + { + "epoch": 0.30978878656075365, + "grad_norm": 0.6569732989360393, + "learning_rate": 0.000191930184898057, + "loss": 12.5348, + "step": 5689 + }, + { + "epoch": 0.30984324055733664, + "grad_norm": 0.6507547624096993, + "learning_rate": 0.00019192671408703094, + "loss": 12.5218, + "step": 5690 + }, + { + "epoch": 0.3098976945539197, + "grad_norm": 0.8016253940452907, + "learning_rate": 0.00019192324256116732, + "loss": 12.6849, + "step": 5691 + }, + { + "epoch": 0.3099521485505027, + "grad_norm": 0.6636968700459538, + "learning_rate": 0.00019191977032049313, + "loss": 12.4833, + "step": 5692 + }, + { + "epoch": 0.3100066025470857, + "grad_norm": 0.7364071722048001, + "learning_rate": 0.00019191629736503544, + "loss": 12.5185, + "step": 5693 + }, + { + "epoch": 0.3100610565436687, + "grad_norm": 0.7051502639556526, + "learning_rate": 0.00019191282369482115, + "loss": 12.6904, + "step": 5694 + }, + { + "epoch": 0.3101155105402517, + "grad_norm": 0.6651267585232153, + "learning_rate": 0.00019190934930987736, + "loss": 12.6454, + "step": 5695 + }, + { + "epoch": 0.3101699645368347, + "grad_norm": 0.6720576053944531, + "learning_rate": 0.00019190587421023106, + "loss": 12.6355, + "step": 5696 + }, + { + "epoch": 0.31022441853341776, + "grad_norm": 0.7403081220658613, + "learning_rate": 0.00019190239839590926, + "loss": 12.6477, + "step": 5697 + }, + { + "epoch": 0.31027887253000075, + "grad_norm": 0.6856887471881359, + "learning_rate": 0.000191898921866939, + "loss": 12.5374, + "step": 5698 + }, + { + "epoch": 0.31033332652658374, + "grad_norm": 0.7849792672826814, + "learning_rate": 0.00019189544462334731, + "loss": 12.5885, + "step": 5699 + }, + { + "epoch": 0.3103877805231668, + "grad_norm": 0.6940421079361554, + "learning_rate": 0.00019189196666516124, + "loss": 12.5049, + "step": 5700 + }, + { + "epoch": 0.3104422345197498, + "grad_norm": 0.7815347783955346, + "learning_rate": 0.00019188848799240782, + "loss": 12.6357, + "step": 5701 + }, + { + "epoch": 0.3104966885163328, + "grad_norm": 0.7871432775569294, + "learning_rate": 0.00019188500860511408, + "loss": 12.5207, + "step": 5702 + }, + { + "epoch": 0.3105511425129158, + "grad_norm": 0.729806048478542, + "learning_rate": 0.00019188152850330717, + "loss": 12.6383, + "step": 5703 + }, + { + "epoch": 0.3106055965094988, + "grad_norm": 0.6487578342015209, + "learning_rate": 0.00019187804768701404, + "loss": 12.4904, + "step": 5704 + }, + { + "epoch": 0.3106600505060818, + "grad_norm": 0.6137037639031049, + "learning_rate": 0.0001918745661562618, + "loss": 12.3875, + "step": 5705 + }, + { + "epoch": 0.31071450450266486, + "grad_norm": 0.6880702484472031, + "learning_rate": 0.00019187108391107756, + "loss": 12.5841, + "step": 5706 + }, + { + "epoch": 0.31076895849924785, + "grad_norm": 0.7192930692423495, + "learning_rate": 0.00019186760095148833, + "loss": 12.5358, + "step": 5707 + }, + { + "epoch": 0.31082341249583084, + "grad_norm": 0.6777797822609798, + "learning_rate": 0.00019186411727752125, + "loss": 12.6532, + "step": 5708 + }, + { + "epoch": 0.3108778664924139, + "grad_norm": 0.6988579087077, + "learning_rate": 0.00019186063288920336, + "loss": 12.4749, + "step": 5709 + }, + { + "epoch": 0.3109323204889969, + "grad_norm": 0.8106200711091491, + "learning_rate": 0.00019185714778656183, + "loss": 12.7109, + "step": 5710 + }, + { + "epoch": 0.3109867744855799, + "grad_norm": 0.7307713134379897, + "learning_rate": 0.00019185366196962367, + "loss": 12.5756, + "step": 5711 + }, + { + "epoch": 0.3110412284821629, + "grad_norm": 0.7003160909230988, + "learning_rate": 0.00019185017543841605, + "loss": 12.5781, + "step": 5712 + }, + { + "epoch": 0.3110956824787459, + "grad_norm": 0.7035305537630274, + "learning_rate": 0.00019184668819296604, + "loss": 12.592, + "step": 5713 + }, + { + "epoch": 0.3111501364753289, + "grad_norm": 0.6408295726992614, + "learning_rate": 0.00019184320023330083, + "loss": 12.4214, + "step": 5714 + }, + { + "epoch": 0.31120459047191196, + "grad_norm": 0.6757175305885018, + "learning_rate": 0.00019183971155944748, + "loss": 12.5193, + "step": 5715 + }, + { + "epoch": 0.31125904446849495, + "grad_norm": 0.6714370645069598, + "learning_rate": 0.0001918362221714331, + "loss": 12.5045, + "step": 5716 + }, + { + "epoch": 0.311313498465078, + "grad_norm": 0.763641142450625, + "learning_rate": 0.00019183273206928487, + "loss": 12.466, + "step": 5717 + }, + { + "epoch": 0.311367952461661, + "grad_norm": 0.6373620796361511, + "learning_rate": 0.0001918292412530299, + "loss": 12.4351, + "step": 5718 + }, + { + "epoch": 0.311422406458244, + "grad_norm": 0.773134594117127, + "learning_rate": 0.00019182574972269537, + "loss": 12.7381, + "step": 5719 + }, + { + "epoch": 0.31147686045482703, + "grad_norm": 0.6326104987283147, + "learning_rate": 0.0001918222574783084, + "loss": 12.5616, + "step": 5720 + }, + { + "epoch": 0.31153131445141, + "grad_norm": 0.7155966148875553, + "learning_rate": 0.0001918187645198962, + "loss": 12.6321, + "step": 5721 + }, + { + "epoch": 0.311585768447993, + "grad_norm": 0.7879595861603473, + "learning_rate": 0.00019181527084748582, + "loss": 12.5777, + "step": 5722 + }, + { + "epoch": 0.31164022244457606, + "grad_norm": 0.6353999496292784, + "learning_rate": 0.00019181177646110454, + "loss": 12.6064, + "step": 5723 + }, + { + "epoch": 0.31169467644115906, + "grad_norm": 0.7480236445199707, + "learning_rate": 0.00019180828136077947, + "loss": 12.5245, + "step": 5724 + }, + { + "epoch": 0.31174913043774205, + "grad_norm": 0.7011891796684117, + "learning_rate": 0.00019180478554653782, + "loss": 12.6022, + "step": 5725 + }, + { + "epoch": 0.3118035844343251, + "grad_norm": 0.7226576397297431, + "learning_rate": 0.00019180128901840677, + "loss": 12.461, + "step": 5726 + }, + { + "epoch": 0.3118580384309081, + "grad_norm": 0.8105551964611454, + "learning_rate": 0.0001917977917764135, + "loss": 12.5971, + "step": 5727 + }, + { + "epoch": 0.3119124924274911, + "grad_norm": 0.7105623987225386, + "learning_rate": 0.00019179429382058517, + "loss": 12.5568, + "step": 5728 + }, + { + "epoch": 0.31196694642407413, + "grad_norm": 0.7423286623670033, + "learning_rate": 0.00019179079515094908, + "loss": 12.5355, + "step": 5729 + }, + { + "epoch": 0.3120214004206571, + "grad_norm": 0.6921990475242991, + "learning_rate": 0.0001917872957675323, + "loss": 12.5362, + "step": 5730 + }, + { + "epoch": 0.3120758544172401, + "grad_norm": 0.6326208002108714, + "learning_rate": 0.00019178379567036217, + "loss": 12.4887, + "step": 5731 + }, + { + "epoch": 0.31213030841382317, + "grad_norm": 0.8282457364681612, + "learning_rate": 0.00019178029485946585, + "loss": 12.6543, + "step": 5732 + }, + { + "epoch": 0.31218476241040616, + "grad_norm": 0.6377328782854843, + "learning_rate": 0.00019177679333487056, + "loss": 12.5946, + "step": 5733 + }, + { + "epoch": 0.31223921640698915, + "grad_norm": 0.6920475291113465, + "learning_rate": 0.0001917732910966035, + "loss": 12.5284, + "step": 5734 + }, + { + "epoch": 0.3122936704035722, + "grad_norm": 0.7101041584136139, + "learning_rate": 0.00019176978814469198, + "loss": 12.5032, + "step": 5735 + }, + { + "epoch": 0.3123481244001552, + "grad_norm": 0.6111176119060407, + "learning_rate": 0.0001917662844791632, + "loss": 12.5313, + "step": 5736 + }, + { + "epoch": 0.3124025783967382, + "grad_norm": 0.6681293165538871, + "learning_rate": 0.00019176278010004435, + "loss": 12.5214, + "step": 5737 + }, + { + "epoch": 0.31245703239332123, + "grad_norm": 0.7689966914982728, + "learning_rate": 0.00019175927500736278, + "loss": 12.5465, + "step": 5738 + }, + { + "epoch": 0.3125114863899042, + "grad_norm": 0.6566846990967126, + "learning_rate": 0.00019175576920114567, + "loss": 12.4811, + "step": 5739 + }, + { + "epoch": 0.3125659403864872, + "grad_norm": 0.8515807575793801, + "learning_rate": 0.00019175226268142032, + "loss": 12.7253, + "step": 5740 + }, + { + "epoch": 0.31262039438307027, + "grad_norm": 0.8194373785790895, + "learning_rate": 0.00019174875544821402, + "loss": 12.5978, + "step": 5741 + }, + { + "epoch": 0.31267484837965326, + "grad_norm": 0.5959364512025092, + "learning_rate": 0.00019174524750155398, + "loss": 12.4635, + "step": 5742 + }, + { + "epoch": 0.31272930237623625, + "grad_norm": 0.7182012662497026, + "learning_rate": 0.00019174173884146752, + "loss": 12.6004, + "step": 5743 + }, + { + "epoch": 0.3127837563728193, + "grad_norm": 0.6191882074588925, + "learning_rate": 0.00019173822946798191, + "loss": 12.5388, + "step": 5744 + }, + { + "epoch": 0.3128382103694023, + "grad_norm": 0.7284139974989752, + "learning_rate": 0.00019173471938112443, + "loss": 12.5659, + "step": 5745 + }, + { + "epoch": 0.3128926643659853, + "grad_norm": 0.7308799859614893, + "learning_rate": 0.00019173120858092242, + "loss": 12.5344, + "step": 5746 + }, + { + "epoch": 0.31294711836256833, + "grad_norm": 0.7161742868249728, + "learning_rate": 0.0001917276970674031, + "loss": 12.6181, + "step": 5747 + }, + { + "epoch": 0.3130015723591513, + "grad_norm": 0.6887630271894791, + "learning_rate": 0.0001917241848405939, + "loss": 12.5654, + "step": 5748 + }, + { + "epoch": 0.3130560263557344, + "grad_norm": 0.7334236423854301, + "learning_rate": 0.000191720671900522, + "loss": 12.6554, + "step": 5749 + }, + { + "epoch": 0.31311048035231737, + "grad_norm": 0.7280109583218388, + "learning_rate": 0.00019171715824721478, + "loss": 12.6293, + "step": 5750 + }, + { + "epoch": 0.31316493434890036, + "grad_norm": 0.7742369837794514, + "learning_rate": 0.00019171364388069958, + "loss": 12.537, + "step": 5751 + }, + { + "epoch": 0.3132193883454834, + "grad_norm": 0.6332978382946135, + "learning_rate": 0.0001917101288010037, + "loss": 12.5003, + "step": 5752 + }, + { + "epoch": 0.3132738423420664, + "grad_norm": 0.8109885888601659, + "learning_rate": 0.00019170661300815445, + "loss": 12.5463, + "step": 5753 + }, + { + "epoch": 0.3133282963386494, + "grad_norm": 0.6776696760063646, + "learning_rate": 0.0001917030965021792, + "loss": 12.4903, + "step": 5754 + }, + { + "epoch": 0.31338275033523244, + "grad_norm": 0.7367968361944557, + "learning_rate": 0.00019169957928310533, + "loss": 12.5046, + "step": 5755 + }, + { + "epoch": 0.31343720433181543, + "grad_norm": 0.6710299186422611, + "learning_rate": 0.0001916960613509601, + "loss": 12.5471, + "step": 5756 + }, + { + "epoch": 0.3134916583283984, + "grad_norm": 0.8285856812284736, + "learning_rate": 0.00019169254270577098, + "loss": 12.6954, + "step": 5757 + }, + { + "epoch": 0.3135461123249815, + "grad_norm": 0.6826103991424295, + "learning_rate": 0.00019168902334756524, + "loss": 12.3569, + "step": 5758 + }, + { + "epoch": 0.31360056632156447, + "grad_norm": 0.7371646823095679, + "learning_rate": 0.0001916855032763703, + "loss": 12.5832, + "step": 5759 + }, + { + "epoch": 0.31365502031814746, + "grad_norm": 0.6338402069507585, + "learning_rate": 0.00019168198249221348, + "loss": 12.5793, + "step": 5760 + }, + { + "epoch": 0.3137094743147305, + "grad_norm": 0.6892790788000476, + "learning_rate": 0.00019167846099512218, + "loss": 12.5516, + "step": 5761 + }, + { + "epoch": 0.3137639283113135, + "grad_norm": 0.685422960143123, + "learning_rate": 0.00019167493878512382, + "loss": 12.6479, + "step": 5762 + }, + { + "epoch": 0.3138183823078965, + "grad_norm": 0.6800269200015937, + "learning_rate": 0.00019167141586224576, + "loss": 12.5571, + "step": 5763 + }, + { + "epoch": 0.31387283630447954, + "grad_norm": 0.8151275310663422, + "learning_rate": 0.00019166789222651537, + "loss": 12.7188, + "step": 5764 + }, + { + "epoch": 0.31392729030106253, + "grad_norm": 0.69512271148721, + "learning_rate": 0.0001916643678779601, + "loss": 12.6066, + "step": 5765 + }, + { + "epoch": 0.3139817442976455, + "grad_norm": 0.8421123782899708, + "learning_rate": 0.00019166084281660735, + "loss": 12.5477, + "step": 5766 + }, + { + "epoch": 0.3140361982942286, + "grad_norm": 0.7250909770675339, + "learning_rate": 0.0001916573170424845, + "loss": 12.6669, + "step": 5767 + }, + { + "epoch": 0.31409065229081157, + "grad_norm": 0.7765303098532939, + "learning_rate": 0.00019165379055561895, + "loss": 12.5521, + "step": 5768 + }, + { + "epoch": 0.31414510628739456, + "grad_norm": 0.757240010661953, + "learning_rate": 0.0001916502633560382, + "loss": 12.515, + "step": 5769 + }, + { + "epoch": 0.3141995602839776, + "grad_norm": 0.7924596813156907, + "learning_rate": 0.00019164673544376962, + "loss": 12.6334, + "step": 5770 + }, + { + "epoch": 0.3142540142805606, + "grad_norm": 0.6340821796162117, + "learning_rate": 0.00019164320681884064, + "loss": 12.5287, + "step": 5771 + }, + { + "epoch": 0.3143084682771436, + "grad_norm": 0.6938443956294725, + "learning_rate": 0.00019163967748127874, + "loss": 12.591, + "step": 5772 + }, + { + "epoch": 0.31436292227372664, + "grad_norm": 0.8052442207081102, + "learning_rate": 0.00019163614743111134, + "loss": 12.6676, + "step": 5773 + }, + { + "epoch": 0.31441737627030963, + "grad_norm": 0.6897969950775128, + "learning_rate": 0.00019163261666836588, + "loss": 12.5717, + "step": 5774 + }, + { + "epoch": 0.3144718302668926, + "grad_norm": 0.7234430598331005, + "learning_rate": 0.00019162908519306982, + "loss": 12.7961, + "step": 5775 + }, + { + "epoch": 0.3145262842634757, + "grad_norm": 0.7182030630629905, + "learning_rate": 0.00019162555300525062, + "loss": 12.6843, + "step": 5776 + }, + { + "epoch": 0.31458073826005867, + "grad_norm": 0.7212326014240544, + "learning_rate": 0.00019162202010493577, + "loss": 12.5077, + "step": 5777 + }, + { + "epoch": 0.31463519225664166, + "grad_norm": 0.7180839647466242, + "learning_rate": 0.00019161848649215272, + "loss": 12.6407, + "step": 5778 + }, + { + "epoch": 0.3146896462532247, + "grad_norm": 0.6995245318344423, + "learning_rate": 0.00019161495216692896, + "loss": 12.594, + "step": 5779 + }, + { + "epoch": 0.3147441002498077, + "grad_norm": 0.7030108004030603, + "learning_rate": 0.00019161141712929197, + "loss": 12.487, + "step": 5780 + }, + { + "epoch": 0.3147985542463907, + "grad_norm": 0.7292919566984026, + "learning_rate": 0.0001916078813792692, + "loss": 12.585, + "step": 5781 + }, + { + "epoch": 0.31485300824297374, + "grad_norm": 0.759506562847832, + "learning_rate": 0.00019160434491688824, + "loss": 12.4386, + "step": 5782 + }, + { + "epoch": 0.31490746223955673, + "grad_norm": 0.9639081006195463, + "learning_rate": 0.00019160080774217647, + "loss": 12.6828, + "step": 5783 + }, + { + "epoch": 0.3149619162361398, + "grad_norm": 0.7434314130278653, + "learning_rate": 0.00019159726985516152, + "loss": 12.4645, + "step": 5784 + }, + { + "epoch": 0.3150163702327228, + "grad_norm": 0.6400421001900726, + "learning_rate": 0.00019159373125587082, + "loss": 12.3502, + "step": 5785 + }, + { + "epoch": 0.31507082422930577, + "grad_norm": 0.6867545808438131, + "learning_rate": 0.00019159019194433188, + "loss": 12.5432, + "step": 5786 + }, + { + "epoch": 0.3151252782258888, + "grad_norm": 0.7896890316318025, + "learning_rate": 0.00019158665192057229, + "loss": 12.5461, + "step": 5787 + }, + { + "epoch": 0.3151797322224718, + "grad_norm": 0.6951193183704759, + "learning_rate": 0.00019158311118461948, + "loss": 12.6574, + "step": 5788 + }, + { + "epoch": 0.3152341862190548, + "grad_norm": 0.7847001380500844, + "learning_rate": 0.00019157956973650108, + "loss": 12.6444, + "step": 5789 + }, + { + "epoch": 0.31528864021563785, + "grad_norm": 0.7231280042579047, + "learning_rate": 0.0001915760275762446, + "loss": 12.5887, + "step": 5790 + }, + { + "epoch": 0.31534309421222084, + "grad_norm": 0.710818117137646, + "learning_rate": 0.00019157248470387753, + "loss": 12.3551, + "step": 5791 + }, + { + "epoch": 0.31539754820880384, + "grad_norm": 0.8236673096850398, + "learning_rate": 0.00019156894111942746, + "loss": 12.6124, + "step": 5792 + }, + { + "epoch": 0.3154520022053869, + "grad_norm": 0.6464910311423513, + "learning_rate": 0.00019156539682292197, + "loss": 12.5541, + "step": 5793 + }, + { + "epoch": 0.3155064562019699, + "grad_norm": 0.8011589161346442, + "learning_rate": 0.00019156185181438861, + "loss": 12.5346, + "step": 5794 + }, + { + "epoch": 0.31556091019855287, + "grad_norm": 0.7003550705119012, + "learning_rate": 0.00019155830609385492, + "loss": 12.5535, + "step": 5795 + }, + { + "epoch": 0.3156153641951359, + "grad_norm": 0.6682420425845063, + "learning_rate": 0.0001915547596613485, + "loss": 12.551, + "step": 5796 + }, + { + "epoch": 0.3156698181917189, + "grad_norm": 0.6748930099795875, + "learning_rate": 0.00019155121251689689, + "loss": 12.5044, + "step": 5797 + }, + { + "epoch": 0.3157242721883019, + "grad_norm": 0.650553657834858, + "learning_rate": 0.00019154766466052773, + "loss": 12.5407, + "step": 5798 + }, + { + "epoch": 0.31577872618488495, + "grad_norm": 0.702942885517404, + "learning_rate": 0.00019154411609226854, + "loss": 12.4614, + "step": 5799 + }, + { + "epoch": 0.31583318018146794, + "grad_norm": 0.6481075120474772, + "learning_rate": 0.000191540566812147, + "loss": 12.535, + "step": 5800 + }, + { + "epoch": 0.31588763417805094, + "grad_norm": 1.0794965868639268, + "learning_rate": 0.00019153701682019062, + "loss": 12.4537, + "step": 5801 + }, + { + "epoch": 0.315942088174634, + "grad_norm": 0.7350571771371223, + "learning_rate": 0.00019153346611642706, + "loss": 12.5537, + "step": 5802 + }, + { + "epoch": 0.315996542171217, + "grad_norm": 0.6826378435926155, + "learning_rate": 0.0001915299147008839, + "loss": 12.632, + "step": 5803 + }, + { + "epoch": 0.31605099616779997, + "grad_norm": 0.731159866122895, + "learning_rate": 0.0001915263625735888, + "loss": 12.564, + "step": 5804 + }, + { + "epoch": 0.316105450164383, + "grad_norm": 0.7153238353345873, + "learning_rate": 0.00019152280973456934, + "loss": 12.6477, + "step": 5805 + }, + { + "epoch": 0.316159904160966, + "grad_norm": 0.7530356054837524, + "learning_rate": 0.0001915192561838532, + "loss": 12.6045, + "step": 5806 + }, + { + "epoch": 0.316214358157549, + "grad_norm": 0.7768597439292041, + "learning_rate": 0.00019151570192146793, + "loss": 12.6109, + "step": 5807 + }, + { + "epoch": 0.31626881215413205, + "grad_norm": 0.7119692150042135, + "learning_rate": 0.00019151214694744124, + "loss": 12.5061, + "step": 5808 + }, + { + "epoch": 0.31632326615071504, + "grad_norm": 0.6405228772522945, + "learning_rate": 0.00019150859126180073, + "loss": 12.6042, + "step": 5809 + }, + { + "epoch": 0.31637772014729804, + "grad_norm": 0.7223526465464335, + "learning_rate": 0.00019150503486457408, + "loss": 12.5468, + "step": 5810 + }, + { + "epoch": 0.3164321741438811, + "grad_norm": 0.783686761881016, + "learning_rate": 0.00019150147775578893, + "loss": 12.5976, + "step": 5811 + }, + { + "epoch": 0.3164866281404641, + "grad_norm": 0.6661836937905445, + "learning_rate": 0.00019149791993547296, + "loss": 12.4926, + "step": 5812 + }, + { + "epoch": 0.31654108213704707, + "grad_norm": 0.7332835327296134, + "learning_rate": 0.00019149436140365378, + "loss": 12.6416, + "step": 5813 + }, + { + "epoch": 0.3165955361336301, + "grad_norm": 0.689079313091683, + "learning_rate": 0.00019149080216035916, + "loss": 12.521, + "step": 5814 + }, + { + "epoch": 0.3166499901302131, + "grad_norm": 0.7952304139427612, + "learning_rate": 0.00019148724220561665, + "loss": 12.6607, + "step": 5815 + }, + { + "epoch": 0.31670444412679616, + "grad_norm": 0.6523041678617995, + "learning_rate": 0.00019148368153945407, + "loss": 12.491, + "step": 5816 + }, + { + "epoch": 0.31675889812337915, + "grad_norm": 0.6227683054782411, + "learning_rate": 0.000191480120161899, + "loss": 12.5964, + "step": 5817 + }, + { + "epoch": 0.31681335211996214, + "grad_norm": 0.6687279981265427, + "learning_rate": 0.00019147655807297918, + "loss": 12.6218, + "step": 5818 + }, + { + "epoch": 0.3168678061165452, + "grad_norm": 0.6862606107733062, + "learning_rate": 0.0001914729952727223, + "loss": 12.5118, + "step": 5819 + }, + { + "epoch": 0.3169222601131282, + "grad_norm": 0.7142573441962577, + "learning_rate": 0.0001914694317611561, + "loss": 12.5311, + "step": 5820 + }, + { + "epoch": 0.3169767141097112, + "grad_norm": 0.6552484972372591, + "learning_rate": 0.0001914658675383082, + "loss": 12.5544, + "step": 5821 + }, + { + "epoch": 0.3170311681062942, + "grad_norm": 0.9383767153400039, + "learning_rate": 0.00019146230260420644, + "loss": 12.6369, + "step": 5822 + }, + { + "epoch": 0.3170856221028772, + "grad_norm": 0.7370541874647787, + "learning_rate": 0.00019145873695887843, + "loss": 12.639, + "step": 5823 + }, + { + "epoch": 0.3171400760994602, + "grad_norm": 0.7141483693702555, + "learning_rate": 0.00019145517060235195, + "loss": 12.6066, + "step": 5824 + }, + { + "epoch": 0.31719453009604326, + "grad_norm": 0.7333381815447372, + "learning_rate": 0.00019145160353465474, + "loss": 12.5564, + "step": 5825 + }, + { + "epoch": 0.31724898409262625, + "grad_norm": 0.6183938441937711, + "learning_rate": 0.00019144803575581453, + "loss": 12.5607, + "step": 5826 + }, + { + "epoch": 0.31730343808920924, + "grad_norm": 0.6892142292238302, + "learning_rate": 0.00019144446726585904, + "loss": 12.5779, + "step": 5827 + }, + { + "epoch": 0.3173578920857923, + "grad_norm": 0.6630403431973069, + "learning_rate": 0.00019144089806481606, + "loss": 12.5383, + "step": 5828 + }, + { + "epoch": 0.3174123460823753, + "grad_norm": 0.6879898272065529, + "learning_rate": 0.0001914373281527133, + "loss": 12.6813, + "step": 5829 + }, + { + "epoch": 0.3174668000789583, + "grad_norm": 0.9279147458061492, + "learning_rate": 0.00019143375752957856, + "loss": 12.5834, + "step": 5830 + }, + { + "epoch": 0.3175212540755413, + "grad_norm": 0.8261607137141636, + "learning_rate": 0.0001914301861954396, + "loss": 12.6757, + "step": 5831 + }, + { + "epoch": 0.3175757080721243, + "grad_norm": 0.7070172220832147, + "learning_rate": 0.00019142661415032415, + "loss": 12.5782, + "step": 5832 + }, + { + "epoch": 0.3176301620687073, + "grad_norm": 0.7416069134775134, + "learning_rate": 0.00019142304139426, + "loss": 12.4508, + "step": 5833 + }, + { + "epoch": 0.31768461606529036, + "grad_norm": 0.6759809445170305, + "learning_rate": 0.000191419467927275, + "loss": 12.4728, + "step": 5834 + }, + { + "epoch": 0.31773907006187335, + "grad_norm": 0.7479228483812418, + "learning_rate": 0.00019141589374939685, + "loss": 12.5918, + "step": 5835 + }, + { + "epoch": 0.31779352405845634, + "grad_norm": 0.7018073897109439, + "learning_rate": 0.0001914123188606534, + "loss": 12.6481, + "step": 5836 + }, + { + "epoch": 0.3178479780550394, + "grad_norm": 0.6509538967378394, + "learning_rate": 0.0001914087432610724, + "loss": 12.6216, + "step": 5837 + }, + { + "epoch": 0.3179024320516224, + "grad_norm": 0.6956653775989627, + "learning_rate": 0.0001914051669506817, + "loss": 12.4914, + "step": 5838 + }, + { + "epoch": 0.3179568860482054, + "grad_norm": 0.6488704800793494, + "learning_rate": 0.0001914015899295091, + "loss": 12.4812, + "step": 5839 + }, + { + "epoch": 0.3180113400447884, + "grad_norm": 0.6687389598321076, + "learning_rate": 0.00019139801219758242, + "loss": 12.5834, + "step": 5840 + }, + { + "epoch": 0.3180657940413714, + "grad_norm": 0.67578294702017, + "learning_rate": 0.00019139443375492944, + "loss": 12.5554, + "step": 5841 + }, + { + "epoch": 0.3181202480379544, + "grad_norm": 0.6408985153317925, + "learning_rate": 0.00019139085460157803, + "loss": 12.5064, + "step": 5842 + }, + { + "epoch": 0.31817470203453746, + "grad_norm": 0.6129408061020057, + "learning_rate": 0.00019138727473755603, + "loss": 12.5581, + "step": 5843 + }, + { + "epoch": 0.31822915603112045, + "grad_norm": 0.7117051176119991, + "learning_rate": 0.00019138369416289122, + "loss": 12.6514, + "step": 5844 + }, + { + "epoch": 0.31828361002770345, + "grad_norm": 0.6685305734872339, + "learning_rate": 0.0001913801128776115, + "loss": 12.6237, + "step": 5845 + }, + { + "epoch": 0.3183380640242865, + "grad_norm": 0.7935940982999596, + "learning_rate": 0.0001913765308817447, + "loss": 12.6416, + "step": 5846 + }, + { + "epoch": 0.3183925180208695, + "grad_norm": 0.6777446131367032, + "learning_rate": 0.00019137294817531863, + "loss": 12.4684, + "step": 5847 + }, + { + "epoch": 0.3184469720174525, + "grad_norm": 0.6927049733411546, + "learning_rate": 0.00019136936475836126, + "loss": 12.4775, + "step": 5848 + }, + { + "epoch": 0.3185014260140355, + "grad_norm": 0.6803616170636894, + "learning_rate": 0.00019136578063090034, + "loss": 12.4689, + "step": 5849 + }, + { + "epoch": 0.3185558800106185, + "grad_norm": 0.6612387529143992, + "learning_rate": 0.0001913621957929638, + "loss": 12.61, + "step": 5850 + }, + { + "epoch": 0.31861033400720157, + "grad_norm": 0.6785997938224911, + "learning_rate": 0.0001913586102445795, + "loss": 12.6917, + "step": 5851 + }, + { + "epoch": 0.31866478800378456, + "grad_norm": 0.6857052369825005, + "learning_rate": 0.00019135502398577532, + "loss": 12.6363, + "step": 5852 + }, + { + "epoch": 0.31871924200036755, + "grad_norm": 0.6627549169132189, + "learning_rate": 0.00019135143701657915, + "loss": 12.5624, + "step": 5853 + }, + { + "epoch": 0.3187736959969506, + "grad_norm": 0.6077596656319667, + "learning_rate": 0.00019134784933701892, + "loss": 12.4125, + "step": 5854 + }, + { + "epoch": 0.3188281499935336, + "grad_norm": 0.7000816467705235, + "learning_rate": 0.00019134426094712245, + "loss": 12.636, + "step": 5855 + }, + { + "epoch": 0.3188826039901166, + "grad_norm": 0.6772191962377853, + "learning_rate": 0.00019134067184691772, + "loss": 12.6985, + "step": 5856 + }, + { + "epoch": 0.31893705798669963, + "grad_norm": 0.7770632506722829, + "learning_rate": 0.0001913370820364326, + "loss": 12.5842, + "step": 5857 + }, + { + "epoch": 0.3189915119832826, + "grad_norm": 0.7579582324057819, + "learning_rate": 0.00019133349151569503, + "loss": 12.4904, + "step": 5858 + }, + { + "epoch": 0.3190459659798656, + "grad_norm": 0.7548108905123265, + "learning_rate": 0.0001913299002847329, + "loss": 12.5497, + "step": 5859 + }, + { + "epoch": 0.31910041997644867, + "grad_norm": 0.6842505571032034, + "learning_rate": 0.00019132630834357413, + "loss": 12.5264, + "step": 5860 + }, + { + "epoch": 0.31915487397303166, + "grad_norm": 0.7389234910801799, + "learning_rate": 0.0001913227156922467, + "loss": 12.5738, + "step": 5861 + }, + { + "epoch": 0.31920932796961465, + "grad_norm": 0.6907600700325771, + "learning_rate": 0.0001913191223307785, + "loss": 12.5408, + "step": 5862 + }, + { + "epoch": 0.3192637819661977, + "grad_norm": 0.6885413039328351, + "learning_rate": 0.0001913155282591975, + "loss": 12.4737, + "step": 5863 + }, + { + "epoch": 0.3193182359627807, + "grad_norm": 0.7449988965819492, + "learning_rate": 0.00019131193347753163, + "loss": 12.6084, + "step": 5864 + }, + { + "epoch": 0.3193726899593637, + "grad_norm": 0.8251864478061979, + "learning_rate": 0.00019130833798580886, + "loss": 12.5428, + "step": 5865 + }, + { + "epoch": 0.31942714395594674, + "grad_norm": 0.6841118849731743, + "learning_rate": 0.00019130474178405714, + "loss": 12.641, + "step": 5866 + }, + { + "epoch": 0.31948159795252973, + "grad_norm": 0.7122261775745119, + "learning_rate": 0.00019130114487230442, + "loss": 12.5943, + "step": 5867 + }, + { + "epoch": 0.3195360519491127, + "grad_norm": 0.6901954748558911, + "learning_rate": 0.0001912975472505787, + "loss": 12.5068, + "step": 5868 + }, + { + "epoch": 0.31959050594569577, + "grad_norm": 0.7314037157651961, + "learning_rate": 0.00019129394891890793, + "loss": 12.4722, + "step": 5869 + }, + { + "epoch": 0.31964495994227876, + "grad_norm": 0.7117573572189002, + "learning_rate": 0.00019129034987732012, + "loss": 12.578, + "step": 5870 + }, + { + "epoch": 0.31969941393886175, + "grad_norm": 0.7010924063861873, + "learning_rate": 0.00019128675012584326, + "loss": 12.6654, + "step": 5871 + }, + { + "epoch": 0.3197538679354448, + "grad_norm": 0.6064541420225625, + "learning_rate": 0.00019128314966450528, + "loss": 12.55, + "step": 5872 + }, + { + "epoch": 0.3198083219320278, + "grad_norm": 0.6321466814864312, + "learning_rate": 0.00019127954849333423, + "loss": 12.5765, + "step": 5873 + }, + { + "epoch": 0.3198627759286108, + "grad_norm": 0.6947632669120551, + "learning_rate": 0.0001912759466123581, + "loss": 12.514, + "step": 5874 + }, + { + "epoch": 0.31991722992519384, + "grad_norm": 0.6828212967587132, + "learning_rate": 0.0001912723440216049, + "loss": 12.5452, + "step": 5875 + }, + { + "epoch": 0.31997168392177683, + "grad_norm": 0.7090710584294178, + "learning_rate": 0.00019126874072110267, + "loss": 12.5601, + "step": 5876 + }, + { + "epoch": 0.3200261379183598, + "grad_norm": 0.641093483322115, + "learning_rate": 0.0001912651367108794, + "loss": 12.5624, + "step": 5877 + }, + { + "epoch": 0.32008059191494287, + "grad_norm": 0.6538327455468403, + "learning_rate": 0.0001912615319909631, + "loss": 12.5216, + "step": 5878 + }, + { + "epoch": 0.32013504591152586, + "grad_norm": 0.6449607621299951, + "learning_rate": 0.00019125792656138186, + "loss": 12.5711, + "step": 5879 + }, + { + "epoch": 0.32018949990810885, + "grad_norm": 0.7118519253223596, + "learning_rate": 0.00019125432042216365, + "loss": 12.5681, + "step": 5880 + }, + { + "epoch": 0.3202439539046919, + "grad_norm": 0.6947313023045424, + "learning_rate": 0.00019125071357333658, + "loss": 12.5832, + "step": 5881 + }, + { + "epoch": 0.3202984079012749, + "grad_norm": 0.719459911799742, + "learning_rate": 0.0001912471060149286, + "loss": 12.5015, + "step": 5882 + }, + { + "epoch": 0.32035286189785794, + "grad_norm": 0.662669731033275, + "learning_rate": 0.00019124349774696787, + "loss": 12.5893, + "step": 5883 + }, + { + "epoch": 0.32040731589444094, + "grad_norm": 0.7102087719102528, + "learning_rate": 0.00019123988876948236, + "loss": 12.537, + "step": 5884 + }, + { + "epoch": 0.32046176989102393, + "grad_norm": 0.6084472187136899, + "learning_rate": 0.0001912362790825002, + "loss": 12.5018, + "step": 5885 + }, + { + "epoch": 0.320516223887607, + "grad_norm": 0.6703985463238232, + "learning_rate": 0.0001912326686860494, + "loss": 12.5211, + "step": 5886 + }, + { + "epoch": 0.32057067788418997, + "grad_norm": 0.6735533812325952, + "learning_rate": 0.00019122905758015812, + "loss": 12.5738, + "step": 5887 + }, + { + "epoch": 0.32062513188077296, + "grad_norm": 0.8245981215897119, + "learning_rate": 0.00019122544576485434, + "loss": 12.6971, + "step": 5888 + }, + { + "epoch": 0.320679585877356, + "grad_norm": 0.7387246587967972, + "learning_rate": 0.00019122183324016625, + "loss": 12.7761, + "step": 5889 + }, + { + "epoch": 0.320734039873939, + "grad_norm": 0.6311846714662387, + "learning_rate": 0.00019121822000612185, + "loss": 12.6251, + "step": 5890 + }, + { + "epoch": 0.320788493870522, + "grad_norm": 0.6762614014696763, + "learning_rate": 0.0001912146060627493, + "loss": 12.5361, + "step": 5891 + }, + { + "epoch": 0.32084294786710504, + "grad_norm": 0.7312297128165434, + "learning_rate": 0.00019121099141007663, + "loss": 12.6024, + "step": 5892 + }, + { + "epoch": 0.32089740186368804, + "grad_norm": 0.6691564713248157, + "learning_rate": 0.00019120737604813205, + "loss": 12.6048, + "step": 5893 + }, + { + "epoch": 0.32095185586027103, + "grad_norm": 0.6445853731650664, + "learning_rate": 0.00019120375997694358, + "loss": 12.5692, + "step": 5894 + }, + { + "epoch": 0.3210063098568541, + "grad_norm": 0.68421869894897, + "learning_rate": 0.00019120014319653938, + "loss": 12.583, + "step": 5895 + }, + { + "epoch": 0.32106076385343707, + "grad_norm": 0.8049595047614536, + "learning_rate": 0.0001911965257069476, + "loss": 12.7101, + "step": 5896 + }, + { + "epoch": 0.32111521785002006, + "grad_norm": 0.667047744432576, + "learning_rate": 0.00019119290750819633, + "loss": 12.5683, + "step": 5897 + }, + { + "epoch": 0.3211696718466031, + "grad_norm": 0.623587825427952, + "learning_rate": 0.00019118928860031368, + "loss": 12.5035, + "step": 5898 + }, + { + "epoch": 0.3212241258431861, + "grad_norm": 0.7162232836644334, + "learning_rate": 0.00019118566898332787, + "loss": 12.5323, + "step": 5899 + }, + { + "epoch": 0.3212785798397691, + "grad_norm": 0.6843716242945724, + "learning_rate": 0.000191182048657267, + "loss": 12.5902, + "step": 5900 + }, + { + "epoch": 0.32133303383635214, + "grad_norm": 0.6313048020288865, + "learning_rate": 0.00019117842762215922, + "loss": 12.5645, + "step": 5901 + }, + { + "epoch": 0.32138748783293514, + "grad_norm": 0.6700544330362783, + "learning_rate": 0.0001911748058780327, + "loss": 12.5279, + "step": 5902 + }, + { + "epoch": 0.32144194182951813, + "grad_norm": 0.6428991072690676, + "learning_rate": 0.0001911711834249156, + "loss": 12.4831, + "step": 5903 + }, + { + "epoch": 0.3214963958261012, + "grad_norm": 0.7827750281315372, + "learning_rate": 0.00019116756026283608, + "loss": 12.7236, + "step": 5904 + }, + { + "epoch": 0.32155084982268417, + "grad_norm": 0.655512340891206, + "learning_rate": 0.00019116393639182232, + "loss": 12.5124, + "step": 5905 + }, + { + "epoch": 0.32160530381926716, + "grad_norm": 0.6778863662038578, + "learning_rate": 0.00019116031181190253, + "loss": 12.488, + "step": 5906 + }, + { + "epoch": 0.3216597578158502, + "grad_norm": 0.7940130979158228, + "learning_rate": 0.00019115668652310486, + "loss": 12.6014, + "step": 5907 + }, + { + "epoch": 0.3217142118124332, + "grad_norm": 0.7597082595800774, + "learning_rate": 0.0001911530605254575, + "loss": 12.6357, + "step": 5908 + }, + { + "epoch": 0.3217686658090162, + "grad_norm": 0.6902457091288527, + "learning_rate": 0.00019114943381898865, + "loss": 12.6692, + "step": 5909 + }, + { + "epoch": 0.32182311980559924, + "grad_norm": 0.7174150884814654, + "learning_rate": 0.0001911458064037265, + "loss": 12.5264, + "step": 5910 + }, + { + "epoch": 0.32187757380218224, + "grad_norm": 0.6886629116964688, + "learning_rate": 0.00019114217827969932, + "loss": 12.5167, + "step": 5911 + }, + { + "epoch": 0.32193202779876523, + "grad_norm": 0.717546620919752, + "learning_rate": 0.00019113854944693523, + "loss": 12.5693, + "step": 5912 + }, + { + "epoch": 0.3219864817953483, + "grad_norm": 0.6188720374301961, + "learning_rate": 0.00019113491990546252, + "loss": 12.459, + "step": 5913 + }, + { + "epoch": 0.32204093579193127, + "grad_norm": 0.6450251252181151, + "learning_rate": 0.00019113128965530943, + "loss": 12.458, + "step": 5914 + }, + { + "epoch": 0.32209538978851426, + "grad_norm": 0.6846813998165503, + "learning_rate": 0.00019112765869650405, + "loss": 12.4775, + "step": 5915 + }, + { + "epoch": 0.3221498437850973, + "grad_norm": 0.6962122434622048, + "learning_rate": 0.0001911240270290748, + "loss": 12.6427, + "step": 5916 + }, + { + "epoch": 0.3222042977816803, + "grad_norm": 0.6254373248490377, + "learning_rate": 0.0001911203946530498, + "loss": 12.5056, + "step": 5917 + }, + { + "epoch": 0.32225875177826335, + "grad_norm": 0.6217407116597037, + "learning_rate": 0.00019111676156845735, + "loss": 12.4804, + "step": 5918 + }, + { + "epoch": 0.32231320577484635, + "grad_norm": 0.7202595630434092, + "learning_rate": 0.00019111312777532566, + "loss": 12.5083, + "step": 5919 + }, + { + "epoch": 0.32236765977142934, + "grad_norm": 0.6820159788858451, + "learning_rate": 0.000191109493273683, + "loss": 12.4608, + "step": 5920 + }, + { + "epoch": 0.3224221137680124, + "grad_norm": 0.7085165494770425, + "learning_rate": 0.0001911058580635577, + "loss": 12.4741, + "step": 5921 + }, + { + "epoch": 0.3224765677645954, + "grad_norm": 0.6757057788462777, + "learning_rate": 0.0001911022221449779, + "loss": 12.4002, + "step": 5922 + }, + { + "epoch": 0.32253102176117837, + "grad_norm": 0.7049609976493082, + "learning_rate": 0.00019109858551797198, + "loss": 12.5587, + "step": 5923 + }, + { + "epoch": 0.3225854757577614, + "grad_norm": 0.6373738244859181, + "learning_rate": 0.00019109494818256816, + "loss": 12.3764, + "step": 5924 + }, + { + "epoch": 0.3226399297543444, + "grad_norm": 0.6628983708514188, + "learning_rate": 0.00019109131013879475, + "loss": 12.4796, + "step": 5925 + }, + { + "epoch": 0.3226943837509274, + "grad_norm": 0.6350872853299196, + "learning_rate": 0.00019108767138668005, + "loss": 12.4353, + "step": 5926 + }, + { + "epoch": 0.32274883774751045, + "grad_norm": 0.6424512411543644, + "learning_rate": 0.00019108403192625236, + "loss": 12.4423, + "step": 5927 + }, + { + "epoch": 0.32280329174409345, + "grad_norm": 0.7047099081442071, + "learning_rate": 0.00019108039175753992, + "loss": 12.6113, + "step": 5928 + }, + { + "epoch": 0.32285774574067644, + "grad_norm": 0.6543119269839901, + "learning_rate": 0.00019107675088057108, + "loss": 12.6195, + "step": 5929 + }, + { + "epoch": 0.3229121997372595, + "grad_norm": 0.717057102160912, + "learning_rate": 0.00019107310929537417, + "loss": 12.3956, + "step": 5930 + }, + { + "epoch": 0.3229666537338425, + "grad_norm": 0.7362917478799178, + "learning_rate": 0.0001910694670019775, + "loss": 12.6826, + "step": 5931 + }, + { + "epoch": 0.32302110773042547, + "grad_norm": 0.6330396736954499, + "learning_rate": 0.00019106582400040938, + "loss": 12.5617, + "step": 5932 + }, + { + "epoch": 0.3230755617270085, + "grad_norm": 0.6650985491791709, + "learning_rate": 0.00019106218029069812, + "loss": 12.6838, + "step": 5933 + }, + { + "epoch": 0.3231300157235915, + "grad_norm": 0.6495988085164426, + "learning_rate": 0.0001910585358728721, + "loss": 12.4752, + "step": 5934 + }, + { + "epoch": 0.3231844697201745, + "grad_norm": 0.595367641986107, + "learning_rate": 0.0001910548907469596, + "loss": 12.49, + "step": 5935 + }, + { + "epoch": 0.32323892371675755, + "grad_norm": 0.7195058905322125, + "learning_rate": 0.00019105124491298906, + "loss": 12.5126, + "step": 5936 + }, + { + "epoch": 0.32329337771334055, + "grad_norm": 0.6263694786258669, + "learning_rate": 0.00019104759837098868, + "loss": 12.6228, + "step": 5937 + }, + { + "epoch": 0.32334783170992354, + "grad_norm": 0.6735130785459177, + "learning_rate": 0.000191043951120987, + "loss": 12.4233, + "step": 5938 + }, + { + "epoch": 0.3234022857065066, + "grad_norm": 0.6343518628030865, + "learning_rate": 0.00019104030316301223, + "loss": 12.5953, + "step": 5939 + }, + { + "epoch": 0.3234567397030896, + "grad_norm": 0.6339707761405831, + "learning_rate": 0.0001910366544970928, + "loss": 12.4378, + "step": 5940 + }, + { + "epoch": 0.3235111936996726, + "grad_norm": 0.9740575909840364, + "learning_rate": 0.00019103300512325708, + "loss": 12.6041, + "step": 5941 + }, + { + "epoch": 0.3235656476962556, + "grad_norm": 0.6471335562445306, + "learning_rate": 0.00019102935504153348, + "loss": 12.3995, + "step": 5942 + }, + { + "epoch": 0.3236201016928386, + "grad_norm": 0.6965000222741758, + "learning_rate": 0.00019102570425195032, + "loss": 12.4683, + "step": 5943 + }, + { + "epoch": 0.3236745556894216, + "grad_norm": 0.6845613506290676, + "learning_rate": 0.000191022052754536, + "loss": 12.5708, + "step": 5944 + }, + { + "epoch": 0.32372900968600465, + "grad_norm": 1.055426781939791, + "learning_rate": 0.00019101840054931897, + "loss": 12.5449, + "step": 5945 + }, + { + "epoch": 0.32378346368258765, + "grad_norm": 0.6478370530528846, + "learning_rate": 0.0001910147476363276, + "loss": 12.4678, + "step": 5946 + }, + { + "epoch": 0.32383791767917064, + "grad_norm": 0.7236229953266451, + "learning_rate": 0.00019101109401559025, + "loss": 12.45, + "step": 5947 + }, + { + "epoch": 0.3238923716757537, + "grad_norm": 0.7776354757900491, + "learning_rate": 0.0001910074396871354, + "loss": 12.7088, + "step": 5948 + }, + { + "epoch": 0.3239468256723367, + "grad_norm": 0.747095157378711, + "learning_rate": 0.00019100378465099143, + "loss": 12.4241, + "step": 5949 + }, + { + "epoch": 0.32400127966891973, + "grad_norm": 0.7026491659725063, + "learning_rate": 0.00019100012890718674, + "loss": 12.5919, + "step": 5950 + }, + { + "epoch": 0.3240557336655027, + "grad_norm": 0.6995609889373499, + "learning_rate": 0.00019099647245574981, + "loss": 12.6247, + "step": 5951 + }, + { + "epoch": 0.3241101876620857, + "grad_norm": 0.8774373263694469, + "learning_rate": 0.00019099281529670907, + "loss": 12.4489, + "step": 5952 + }, + { + "epoch": 0.32416464165866876, + "grad_norm": 0.6405476430956029, + "learning_rate": 0.0001909891574300929, + "loss": 12.5104, + "step": 5953 + }, + { + "epoch": 0.32421909565525175, + "grad_norm": 0.6562550029524373, + "learning_rate": 0.00019098549885592983, + "loss": 12.663, + "step": 5954 + }, + { + "epoch": 0.32427354965183475, + "grad_norm": 0.66271688148055, + "learning_rate": 0.00019098183957424824, + "loss": 12.5827, + "step": 5955 + }, + { + "epoch": 0.3243280036484178, + "grad_norm": 0.6147209762320107, + "learning_rate": 0.0001909781795850766, + "loss": 12.5188, + "step": 5956 + }, + { + "epoch": 0.3243824576450008, + "grad_norm": 0.7348392864009599, + "learning_rate": 0.00019097451888844337, + "loss": 12.5578, + "step": 5957 + }, + { + "epoch": 0.3244369116415838, + "grad_norm": 0.6520863870566146, + "learning_rate": 0.00019097085748437704, + "loss": 12.2885, + "step": 5958 + }, + { + "epoch": 0.32449136563816683, + "grad_norm": 0.6079312001278564, + "learning_rate": 0.00019096719537290606, + "loss": 12.3898, + "step": 5959 + }, + { + "epoch": 0.3245458196347498, + "grad_norm": 0.6314519112721095, + "learning_rate": 0.00019096353255405892, + "loss": 12.4193, + "step": 5960 + }, + { + "epoch": 0.3246002736313328, + "grad_norm": 0.6570340381863177, + "learning_rate": 0.0001909598690278641, + "loss": 12.4791, + "step": 5961 + }, + { + "epoch": 0.32465472762791586, + "grad_norm": 0.6753086555455631, + "learning_rate": 0.0001909562047943501, + "loss": 12.5465, + "step": 5962 + }, + { + "epoch": 0.32470918162449885, + "grad_norm": 0.730108073855115, + "learning_rate": 0.00019095253985354534, + "loss": 12.5237, + "step": 5963 + }, + { + "epoch": 0.32476363562108185, + "grad_norm": 0.6205806638320429, + "learning_rate": 0.00019094887420547844, + "loss": 12.5215, + "step": 5964 + }, + { + "epoch": 0.3248180896176649, + "grad_norm": 0.6353200889527242, + "learning_rate": 0.0001909452078501778, + "loss": 12.5001, + "step": 5965 + }, + { + "epoch": 0.3248725436142479, + "grad_norm": 0.9432560778936758, + "learning_rate": 0.000190941540787672, + "loss": 12.6029, + "step": 5966 + }, + { + "epoch": 0.3249269976108309, + "grad_norm": 0.6341818292480685, + "learning_rate": 0.00019093787301798952, + "loss": 12.5276, + "step": 5967 + }, + { + "epoch": 0.32498145160741393, + "grad_norm": 0.6416082367670348, + "learning_rate": 0.00019093420454115886, + "loss": 12.593, + "step": 5968 + }, + { + "epoch": 0.3250359056039969, + "grad_norm": 0.8068450582418325, + "learning_rate": 0.00019093053535720861, + "loss": 12.5589, + "step": 5969 + }, + { + "epoch": 0.3250903596005799, + "grad_norm": 0.7187539550802996, + "learning_rate": 0.00019092686546616725, + "loss": 12.4668, + "step": 5970 + }, + { + "epoch": 0.32514481359716296, + "grad_norm": 0.7413479385730268, + "learning_rate": 0.00019092319486806335, + "loss": 12.5876, + "step": 5971 + }, + { + "epoch": 0.32519926759374596, + "grad_norm": 0.7031008087918882, + "learning_rate": 0.00019091952356292544, + "loss": 12.5985, + "step": 5972 + }, + { + "epoch": 0.32525372159032895, + "grad_norm": 0.715757330929554, + "learning_rate": 0.00019091585155078206, + "loss": 12.5552, + "step": 5973 + }, + { + "epoch": 0.325308175586912, + "grad_norm": 0.8596223200264557, + "learning_rate": 0.00019091217883166178, + "loss": 12.5456, + "step": 5974 + }, + { + "epoch": 0.325362629583495, + "grad_norm": 0.607716778638219, + "learning_rate": 0.00019090850540559316, + "loss": 12.3831, + "step": 5975 + }, + { + "epoch": 0.325417083580078, + "grad_norm": 0.69632444557811, + "learning_rate": 0.00019090483127260472, + "loss": 12.5443, + "step": 5976 + }, + { + "epoch": 0.32547153757666103, + "grad_norm": 0.7908830884226349, + "learning_rate": 0.00019090115643272508, + "loss": 12.6423, + "step": 5977 + }, + { + "epoch": 0.325525991573244, + "grad_norm": 0.7317983369210814, + "learning_rate": 0.00019089748088598282, + "loss": 12.6053, + "step": 5978 + }, + { + "epoch": 0.325580445569827, + "grad_norm": 0.6860666876617846, + "learning_rate": 0.0001908938046324065, + "loss": 12.4089, + "step": 5979 + }, + { + "epoch": 0.32563489956641006, + "grad_norm": 0.7133453663417441, + "learning_rate": 0.0001908901276720247, + "loss": 12.65, + "step": 5980 + }, + { + "epoch": 0.32568935356299306, + "grad_norm": 0.6771444424428702, + "learning_rate": 0.00019088645000486603, + "loss": 12.6196, + "step": 5981 + }, + { + "epoch": 0.32574380755957605, + "grad_norm": 0.6618186331476609, + "learning_rate": 0.0001908827716309591, + "loss": 12.4131, + "step": 5982 + }, + { + "epoch": 0.3257982615561591, + "grad_norm": 0.666251776743328, + "learning_rate": 0.0001908790925503325, + "loss": 12.5763, + "step": 5983 + }, + { + "epoch": 0.3258527155527421, + "grad_norm": 0.6423401599328059, + "learning_rate": 0.00019087541276301479, + "loss": 12.6437, + "step": 5984 + }, + { + "epoch": 0.32590716954932514, + "grad_norm": 0.684078308546453, + "learning_rate": 0.00019087173226903467, + "loss": 12.4504, + "step": 5985 + }, + { + "epoch": 0.32596162354590813, + "grad_norm": 0.679832543699596, + "learning_rate": 0.00019086805106842072, + "loss": 12.6542, + "step": 5986 + }, + { + "epoch": 0.3260160775424911, + "grad_norm": 0.6848783699223823, + "learning_rate": 0.00019086436916120153, + "loss": 12.6109, + "step": 5987 + }, + { + "epoch": 0.32607053153907417, + "grad_norm": 0.6741532940091434, + "learning_rate": 0.0001908606865474058, + "loss": 12.492, + "step": 5988 + }, + { + "epoch": 0.32612498553565716, + "grad_norm": 0.6973214200387228, + "learning_rate": 0.00019085700322706215, + "loss": 12.5396, + "step": 5989 + }, + { + "epoch": 0.32617943953224016, + "grad_norm": 0.6819143354267655, + "learning_rate": 0.00019085331920019917, + "loss": 12.5189, + "step": 5990 + }, + { + "epoch": 0.3262338935288232, + "grad_norm": 0.7607303886975703, + "learning_rate": 0.00019084963446684556, + "loss": 12.5383, + "step": 5991 + }, + { + "epoch": 0.3262883475254062, + "grad_norm": 0.7104535783369613, + "learning_rate": 0.00019084594902702996, + "loss": 12.6959, + "step": 5992 + }, + { + "epoch": 0.3263428015219892, + "grad_norm": 0.6869245037086184, + "learning_rate": 0.000190842262880781, + "loss": 12.4045, + "step": 5993 + }, + { + "epoch": 0.32639725551857224, + "grad_norm": 0.6891889185433268, + "learning_rate": 0.0001908385760281274, + "loss": 12.6741, + "step": 5994 + }, + { + "epoch": 0.32645170951515523, + "grad_norm": 0.7146884740935388, + "learning_rate": 0.0001908348884690978, + "loss": 12.5421, + "step": 5995 + }, + { + "epoch": 0.3265061635117382, + "grad_norm": 0.660700226334539, + "learning_rate": 0.00019083120020372087, + "loss": 12.4129, + "step": 5996 + }, + { + "epoch": 0.32656061750832127, + "grad_norm": 0.6804830605995829, + "learning_rate": 0.0001908275112320253, + "loss": 12.3702, + "step": 5997 + }, + { + "epoch": 0.32661507150490426, + "grad_norm": 0.6558070236451653, + "learning_rate": 0.00019082382155403976, + "loss": 12.5424, + "step": 5998 + }, + { + "epoch": 0.32666952550148726, + "grad_norm": 0.6585152289440808, + "learning_rate": 0.00019082013116979293, + "loss": 12.6123, + "step": 5999 + }, + { + "epoch": 0.3267239794980703, + "grad_norm": 0.6741903332937523, + "learning_rate": 0.00019081644007931355, + "loss": 12.4616, + "step": 6000 + }, + { + "epoch": 0.3267784334946533, + "grad_norm": 0.6166158213759575, + "learning_rate": 0.0001908127482826303, + "loss": 12.6367, + "step": 6001 + }, + { + "epoch": 0.3268328874912363, + "grad_norm": 0.6880785638149681, + "learning_rate": 0.0001908090557797719, + "loss": 12.4693, + "step": 6002 + }, + { + "epoch": 0.32688734148781934, + "grad_norm": 0.6656609169611086, + "learning_rate": 0.00019080536257076706, + "loss": 12.5524, + "step": 6003 + }, + { + "epoch": 0.32694179548440233, + "grad_norm": 0.8859792489670578, + "learning_rate": 0.00019080166865564446, + "loss": 12.4053, + "step": 6004 + }, + { + "epoch": 0.3269962494809853, + "grad_norm": 0.6772545056208928, + "learning_rate": 0.0001907979740344329, + "loss": 12.5152, + "step": 6005 + }, + { + "epoch": 0.32705070347756837, + "grad_norm": 0.7193161190009105, + "learning_rate": 0.00019079427870716105, + "loss": 12.575, + "step": 6006 + }, + { + "epoch": 0.32710515747415136, + "grad_norm": 0.6766156295467368, + "learning_rate": 0.00019079058267385763, + "loss": 12.5792, + "step": 6007 + }, + { + "epoch": 0.32715961147073436, + "grad_norm": 0.65637458444529, + "learning_rate": 0.00019078688593455144, + "loss": 12.4607, + "step": 6008 + }, + { + "epoch": 0.3272140654673174, + "grad_norm": 0.7477260194565057, + "learning_rate": 0.0001907831884892712, + "loss": 12.6327, + "step": 6009 + }, + { + "epoch": 0.3272685194639004, + "grad_norm": 0.7543706752417337, + "learning_rate": 0.00019077949033804566, + "loss": 12.5272, + "step": 6010 + }, + { + "epoch": 0.3273229734604834, + "grad_norm": 0.7882627811447465, + "learning_rate": 0.0001907757914809036, + "loss": 12.5733, + "step": 6011 + }, + { + "epoch": 0.32737742745706644, + "grad_norm": 0.6942572401963698, + "learning_rate": 0.00019077209191787375, + "loss": 12.4568, + "step": 6012 + }, + { + "epoch": 0.32743188145364943, + "grad_norm": 0.71472777836806, + "learning_rate": 0.00019076839164898488, + "loss": 12.503, + "step": 6013 + }, + { + "epoch": 0.3274863354502324, + "grad_norm": 0.6458760780360772, + "learning_rate": 0.00019076469067426578, + "loss": 12.5897, + "step": 6014 + }, + { + "epoch": 0.3275407894468155, + "grad_norm": 0.6523758206718518, + "learning_rate": 0.0001907609889937452, + "loss": 12.5079, + "step": 6015 + }, + { + "epoch": 0.32759524344339847, + "grad_norm": 0.6639159221337562, + "learning_rate": 0.00019075728660745197, + "loss": 12.4423, + "step": 6016 + }, + { + "epoch": 0.3276496974399815, + "grad_norm": 0.7536010462227256, + "learning_rate": 0.00019075358351541488, + "loss": 12.5858, + "step": 6017 + }, + { + "epoch": 0.3277041514365645, + "grad_norm": 0.7421258119115142, + "learning_rate": 0.0001907498797176627, + "loss": 12.6127, + "step": 6018 + }, + { + "epoch": 0.3277586054331475, + "grad_norm": 0.6908788776147277, + "learning_rate": 0.00019074617521422423, + "loss": 12.561, + "step": 6019 + }, + { + "epoch": 0.32781305942973055, + "grad_norm": 0.6923049654838966, + "learning_rate": 0.00019074247000512825, + "loss": 12.3786, + "step": 6020 + }, + { + "epoch": 0.32786751342631354, + "grad_norm": 0.6881176792638315, + "learning_rate": 0.00019073876409040366, + "loss": 12.5307, + "step": 6021 + }, + { + "epoch": 0.32792196742289653, + "grad_norm": 0.6819501022563224, + "learning_rate": 0.0001907350574700792, + "loss": 12.5123, + "step": 6022 + }, + { + "epoch": 0.3279764214194796, + "grad_norm": 0.6768439916521216, + "learning_rate": 0.0001907313501441837, + "loss": 12.626, + "step": 6023 + }, + { + "epoch": 0.3280308754160626, + "grad_norm": 0.7041701961335308, + "learning_rate": 0.000190727642112746, + "loss": 12.4595, + "step": 6024 + }, + { + "epoch": 0.32808532941264557, + "grad_norm": 0.704760083849161, + "learning_rate": 0.00019072393337579499, + "loss": 12.5799, + "step": 6025 + }, + { + "epoch": 0.3281397834092286, + "grad_norm": 0.6995993768892956, + "learning_rate": 0.00019072022393335942, + "loss": 12.384, + "step": 6026 + }, + { + "epoch": 0.3281942374058116, + "grad_norm": 0.7527545179313321, + "learning_rate": 0.00019071651378546817, + "loss": 12.5722, + "step": 6027 + }, + { + "epoch": 0.3282486914023946, + "grad_norm": 0.6389152098343975, + "learning_rate": 0.0001907128029321501, + "loss": 12.4403, + "step": 6028 + }, + { + "epoch": 0.32830314539897765, + "grad_norm": 0.6888724064811103, + "learning_rate": 0.00019070909137343408, + "loss": 12.6418, + "step": 6029 + }, + { + "epoch": 0.32835759939556064, + "grad_norm": 0.6751648270814726, + "learning_rate": 0.00019070537910934895, + "loss": 12.6121, + "step": 6030 + }, + { + "epoch": 0.32841205339214363, + "grad_norm": 0.7575202162096621, + "learning_rate": 0.00019070166613992357, + "loss": 12.6031, + "step": 6031 + }, + { + "epoch": 0.3284665073887267, + "grad_norm": 0.6958833801819441, + "learning_rate": 0.00019069795246518683, + "loss": 12.6046, + "step": 6032 + }, + { + "epoch": 0.3285209613853097, + "grad_norm": 0.6457389212528368, + "learning_rate": 0.0001906942380851676, + "loss": 12.4216, + "step": 6033 + }, + { + "epoch": 0.32857541538189267, + "grad_norm": 0.6765810427732396, + "learning_rate": 0.00019069052299989475, + "loss": 12.5173, + "step": 6034 + }, + { + "epoch": 0.3286298693784757, + "grad_norm": 0.6486662081210898, + "learning_rate": 0.00019068680720939722, + "loss": 12.5056, + "step": 6035 + }, + { + "epoch": 0.3286843233750587, + "grad_norm": 0.6393177510255944, + "learning_rate": 0.00019068309071370386, + "loss": 12.5253, + "step": 6036 + }, + { + "epoch": 0.3287387773716417, + "grad_norm": 0.7043933590676535, + "learning_rate": 0.00019067937351284356, + "loss": 12.4581, + "step": 6037 + }, + { + "epoch": 0.32879323136822475, + "grad_norm": 0.7073559719836241, + "learning_rate": 0.00019067565560684525, + "loss": 12.6616, + "step": 6038 + }, + { + "epoch": 0.32884768536480774, + "grad_norm": 0.6695162577213956, + "learning_rate": 0.00019067193699573784, + "loss": 12.4207, + "step": 6039 + }, + { + "epoch": 0.32890213936139073, + "grad_norm": 0.6957632386249147, + "learning_rate": 0.00019066821767955023, + "loss": 12.6009, + "step": 6040 + }, + { + "epoch": 0.3289565933579738, + "grad_norm": 0.6786726297932553, + "learning_rate": 0.00019066449765831135, + "loss": 12.5736, + "step": 6041 + }, + { + "epoch": 0.3290110473545568, + "grad_norm": 0.7542246599893674, + "learning_rate": 0.00019066077693205018, + "loss": 12.4497, + "step": 6042 + }, + { + "epoch": 0.32906550135113977, + "grad_norm": 0.6266163221327751, + "learning_rate": 0.00019065705550079556, + "loss": 12.4521, + "step": 6043 + }, + { + "epoch": 0.3291199553477228, + "grad_norm": 0.6632501992328229, + "learning_rate": 0.0001906533333645765, + "loss": 12.5465, + "step": 6044 + }, + { + "epoch": 0.3291744093443058, + "grad_norm": 0.7173246364462722, + "learning_rate": 0.0001906496105234219, + "loss": 12.4319, + "step": 6045 + }, + { + "epoch": 0.3292288633408888, + "grad_norm": 0.6601271095282973, + "learning_rate": 0.00019064588697736073, + "loss": 12.6277, + "step": 6046 + }, + { + "epoch": 0.32928331733747185, + "grad_norm": 0.7662957908838449, + "learning_rate": 0.00019064216272642192, + "loss": 12.4888, + "step": 6047 + }, + { + "epoch": 0.32933777133405484, + "grad_norm": 0.9193653350114078, + "learning_rate": 0.00019063843777063447, + "loss": 12.7076, + "step": 6048 + }, + { + "epoch": 0.32939222533063783, + "grad_norm": 0.9441997690647264, + "learning_rate": 0.00019063471211002732, + "loss": 12.5137, + "step": 6049 + }, + { + "epoch": 0.3294466793272209, + "grad_norm": 0.7440618082238332, + "learning_rate": 0.0001906309857446295, + "loss": 12.6509, + "step": 6050 + }, + { + "epoch": 0.3295011333238039, + "grad_norm": 0.6706221552479642, + "learning_rate": 0.00019062725867446985, + "loss": 12.4081, + "step": 6051 + }, + { + "epoch": 0.3295555873203869, + "grad_norm": 0.7669635585081318, + "learning_rate": 0.0001906235308995775, + "loss": 12.3935, + "step": 6052 + }, + { + "epoch": 0.3296100413169699, + "grad_norm": 0.8102674228184634, + "learning_rate": 0.00019061980241998137, + "loss": 12.6809, + "step": 6053 + }, + { + "epoch": 0.3296644953135529, + "grad_norm": 0.6051046325273823, + "learning_rate": 0.00019061607323571042, + "loss": 12.3263, + "step": 6054 + }, + { + "epoch": 0.32971894931013596, + "grad_norm": 0.6912898651248617, + "learning_rate": 0.00019061234334679373, + "loss": 12.4846, + "step": 6055 + }, + { + "epoch": 0.32977340330671895, + "grad_norm": 0.7745620986720093, + "learning_rate": 0.00019060861275326026, + "loss": 12.6085, + "step": 6056 + }, + { + "epoch": 0.32982785730330194, + "grad_norm": 0.6763436075375078, + "learning_rate": 0.000190604881455139, + "loss": 12.6575, + "step": 6057 + }, + { + "epoch": 0.329882311299885, + "grad_norm": 0.7307302740737248, + "learning_rate": 0.000190601149452459, + "loss": 12.4515, + "step": 6058 + }, + { + "epoch": 0.329936765296468, + "grad_norm": 0.7506534175175649, + "learning_rate": 0.00019059741674524924, + "loss": 12.7086, + "step": 6059 + }, + { + "epoch": 0.329991219293051, + "grad_norm": 0.6310610641892026, + "learning_rate": 0.0001905936833335388, + "loss": 12.5889, + "step": 6060 + }, + { + "epoch": 0.330045673289634, + "grad_norm": 0.749115126468661, + "learning_rate": 0.00019058994921735672, + "loss": 12.6396, + "step": 6061 + }, + { + "epoch": 0.330100127286217, + "grad_norm": 0.6680972574726077, + "learning_rate": 0.00019058621439673194, + "loss": 12.6789, + "step": 6062 + }, + { + "epoch": 0.3301545812828, + "grad_norm": 0.6498361336323334, + "learning_rate": 0.00019058247887169361, + "loss": 12.5572, + "step": 6063 + }, + { + "epoch": 0.33020903527938306, + "grad_norm": 0.728884306740224, + "learning_rate": 0.0001905787426422707, + "loss": 12.4432, + "step": 6064 + }, + { + "epoch": 0.33026348927596605, + "grad_norm": 0.6381536620973719, + "learning_rate": 0.00019057500570849234, + "loss": 12.4457, + "step": 6065 + }, + { + "epoch": 0.33031794327254904, + "grad_norm": 0.6814152295949502, + "learning_rate": 0.00019057126807038753, + "loss": 12.4274, + "step": 6066 + }, + { + "epoch": 0.3303723972691321, + "grad_norm": 0.7586037938328459, + "learning_rate": 0.00019056752972798532, + "loss": 12.4917, + "step": 6067 + }, + { + "epoch": 0.3304268512657151, + "grad_norm": 0.7013635463082079, + "learning_rate": 0.00019056379068131484, + "loss": 12.5281, + "step": 6068 + }, + { + "epoch": 0.3304813052622981, + "grad_norm": 0.749680286113315, + "learning_rate": 0.00019056005093040512, + "loss": 12.5624, + "step": 6069 + }, + { + "epoch": 0.3305357592588811, + "grad_norm": 0.7209385704762635, + "learning_rate": 0.00019055631047528528, + "loss": 12.5035, + "step": 6070 + }, + { + "epoch": 0.3305902132554641, + "grad_norm": 0.7540812916934254, + "learning_rate": 0.00019055256931598438, + "loss": 12.5679, + "step": 6071 + }, + { + "epoch": 0.3306446672520471, + "grad_norm": 0.7230512092731918, + "learning_rate": 0.0001905488274525315, + "loss": 12.6442, + "step": 6072 + }, + { + "epoch": 0.33069912124863016, + "grad_norm": 1.0561995452306372, + "learning_rate": 0.00019054508488495575, + "loss": 12.5927, + "step": 6073 + }, + { + "epoch": 0.33075357524521315, + "grad_norm": 0.7502687032082127, + "learning_rate": 0.00019054134161328626, + "loss": 12.5629, + "step": 6074 + }, + { + "epoch": 0.33080802924179614, + "grad_norm": 0.7999552476061841, + "learning_rate": 0.00019053759763755209, + "loss": 12.5188, + "step": 6075 + }, + { + "epoch": 0.3308624832383792, + "grad_norm": 0.7713322123206169, + "learning_rate": 0.0001905338529577824, + "loss": 12.5704, + "step": 6076 + }, + { + "epoch": 0.3309169372349622, + "grad_norm": 0.8188983023010487, + "learning_rate": 0.00019053010757400624, + "loss": 12.6, + "step": 6077 + }, + { + "epoch": 0.3309713912315452, + "grad_norm": 0.9511635742495728, + "learning_rate": 0.00019052636148625282, + "loss": 12.5989, + "step": 6078 + }, + { + "epoch": 0.3310258452281282, + "grad_norm": 0.6767946812635767, + "learning_rate": 0.00019052261469455122, + "loss": 12.5205, + "step": 6079 + }, + { + "epoch": 0.3310802992247112, + "grad_norm": 0.9301201404025043, + "learning_rate": 0.0001905188671989306, + "loss": 12.7243, + "step": 6080 + }, + { + "epoch": 0.3311347532212942, + "grad_norm": 0.9015049000301367, + "learning_rate": 0.0001905151189994201, + "loss": 12.4499, + "step": 6081 + }, + { + "epoch": 0.33118920721787726, + "grad_norm": 0.8382596251722875, + "learning_rate": 0.0001905113700960488, + "loss": 12.7573, + "step": 6082 + }, + { + "epoch": 0.33124366121446025, + "grad_norm": 0.7181926507073133, + "learning_rate": 0.00019050762048884596, + "loss": 12.473, + "step": 6083 + }, + { + "epoch": 0.3312981152110433, + "grad_norm": 0.8447246018943421, + "learning_rate": 0.0001905038701778407, + "loss": 12.515, + "step": 6084 + }, + { + "epoch": 0.3313525692076263, + "grad_norm": 0.752685655651504, + "learning_rate": 0.00019050011916306212, + "loss": 12.5246, + "step": 6085 + }, + { + "epoch": 0.3314070232042093, + "grad_norm": 0.8405698619472585, + "learning_rate": 0.00019049636744453944, + "loss": 12.5166, + "step": 6086 + }, + { + "epoch": 0.33146147720079233, + "grad_norm": 0.7746313603997682, + "learning_rate": 0.00019049261502230184, + "loss": 12.4498, + "step": 6087 + }, + { + "epoch": 0.3315159311973753, + "grad_norm": 0.8372687652387514, + "learning_rate": 0.00019048886189637848, + "loss": 12.5878, + "step": 6088 + }, + { + "epoch": 0.3315703851939583, + "grad_norm": 0.814258741770328, + "learning_rate": 0.0001904851080667986, + "loss": 12.4877, + "step": 6089 + }, + { + "epoch": 0.33162483919054137, + "grad_norm": 0.7280978774222061, + "learning_rate": 0.00019048135353359129, + "loss": 12.5206, + "step": 6090 + }, + { + "epoch": 0.33167929318712436, + "grad_norm": 0.7893481669060788, + "learning_rate": 0.00019047759829678585, + "loss": 12.6034, + "step": 6091 + }, + { + "epoch": 0.33173374718370735, + "grad_norm": 0.7136790992150998, + "learning_rate": 0.00019047384235641138, + "loss": 12.5442, + "step": 6092 + }, + { + "epoch": 0.3317882011802904, + "grad_norm": 0.725570200940181, + "learning_rate": 0.00019047008571249717, + "loss": 12.5172, + "step": 6093 + }, + { + "epoch": 0.3318426551768734, + "grad_norm": 0.7172907098779102, + "learning_rate": 0.0001904663283650724, + "loss": 12.4823, + "step": 6094 + }, + { + "epoch": 0.3318971091734564, + "grad_norm": 0.650578796409706, + "learning_rate": 0.0001904625703141663, + "loss": 12.4808, + "step": 6095 + }, + { + "epoch": 0.33195156317003943, + "grad_norm": 0.7862084399830296, + "learning_rate": 0.00019045881155980808, + "loss": 12.5653, + "step": 6096 + }, + { + "epoch": 0.3320060171666224, + "grad_norm": 0.6625604436200386, + "learning_rate": 0.00019045505210202698, + "loss": 12.6665, + "step": 6097 + }, + { + "epoch": 0.3320604711632054, + "grad_norm": 0.6707462864315522, + "learning_rate": 0.00019045129194085217, + "loss": 12.6802, + "step": 6098 + }, + { + "epoch": 0.33211492515978847, + "grad_norm": 0.6954853299279311, + "learning_rate": 0.000190447531076313, + "loss": 12.6003, + "step": 6099 + }, + { + "epoch": 0.33216937915637146, + "grad_norm": 0.7880675830322864, + "learning_rate": 0.00019044376950843862, + "loss": 12.6436, + "step": 6100 + }, + { + "epoch": 0.33222383315295445, + "grad_norm": 0.8561969283621422, + "learning_rate": 0.00019044000723725837, + "loss": 12.6777, + "step": 6101 + }, + { + "epoch": 0.3322782871495375, + "grad_norm": 0.7027593267734076, + "learning_rate": 0.0001904362442628014, + "loss": 12.5563, + "step": 6102 + }, + { + "epoch": 0.3323327411461205, + "grad_norm": 0.9422237632774334, + "learning_rate": 0.0001904324805850971, + "loss": 12.5437, + "step": 6103 + }, + { + "epoch": 0.3323871951427035, + "grad_norm": 0.6967880551838675, + "learning_rate": 0.0001904287162041746, + "loss": 12.485, + "step": 6104 + }, + { + "epoch": 0.33244164913928653, + "grad_norm": 0.7064817408333989, + "learning_rate": 0.00019042495112006326, + "loss": 12.4872, + "step": 6105 + }, + { + "epoch": 0.3324961031358695, + "grad_norm": 0.861640560379946, + "learning_rate": 0.00019042118533279235, + "loss": 12.5858, + "step": 6106 + }, + { + "epoch": 0.3325505571324525, + "grad_norm": 0.7716607183906974, + "learning_rate": 0.00019041741884239113, + "loss": 12.5587, + "step": 6107 + }, + { + "epoch": 0.33260501112903557, + "grad_norm": 0.763541649976914, + "learning_rate": 0.00019041365164888888, + "loss": 12.7393, + "step": 6108 + }, + { + "epoch": 0.33265946512561856, + "grad_norm": 0.637888834883395, + "learning_rate": 0.00019040988375231495, + "loss": 12.5298, + "step": 6109 + }, + { + "epoch": 0.33271391912220155, + "grad_norm": 0.7149384308769094, + "learning_rate": 0.00019040611515269858, + "loss": 12.6393, + "step": 6110 + }, + { + "epoch": 0.3327683731187846, + "grad_norm": 0.6569658440919112, + "learning_rate": 0.0001904023458500691, + "loss": 12.5275, + "step": 6111 + }, + { + "epoch": 0.3328228271153676, + "grad_norm": 0.6572020856486586, + "learning_rate": 0.0001903985758444558, + "loss": 12.6111, + "step": 6112 + }, + { + "epoch": 0.3328772811119506, + "grad_norm": 0.6516354181852332, + "learning_rate": 0.00019039480513588806, + "loss": 12.5082, + "step": 6113 + }, + { + "epoch": 0.33293173510853363, + "grad_norm": 0.680454891451818, + "learning_rate": 0.00019039103372439512, + "loss": 12.5876, + "step": 6114 + }, + { + "epoch": 0.3329861891051166, + "grad_norm": 0.6523523263440932, + "learning_rate": 0.00019038726161000634, + "loss": 12.5589, + "step": 6115 + }, + { + "epoch": 0.3330406431016996, + "grad_norm": 0.6829586009320958, + "learning_rate": 0.00019038348879275106, + "loss": 12.6797, + "step": 6116 + }, + { + "epoch": 0.33309509709828267, + "grad_norm": 0.6661967631864086, + "learning_rate": 0.0001903797152726586, + "loss": 12.5284, + "step": 6117 + }, + { + "epoch": 0.33314955109486566, + "grad_norm": 0.6074346941940245, + "learning_rate": 0.00019037594104975836, + "loss": 12.4863, + "step": 6118 + }, + { + "epoch": 0.3332040050914487, + "grad_norm": 0.7862727576794416, + "learning_rate": 0.00019037216612407962, + "loss": 12.5459, + "step": 6119 + }, + { + "epoch": 0.3332584590880317, + "grad_norm": 0.6660531417792179, + "learning_rate": 0.00019036839049565177, + "loss": 12.4761, + "step": 6120 + }, + { + "epoch": 0.3333129130846147, + "grad_norm": 0.6365992142476833, + "learning_rate": 0.00019036461416450416, + "loss": 12.5827, + "step": 6121 + }, + { + "epoch": 0.33336736708119774, + "grad_norm": 0.6579708556000738, + "learning_rate": 0.00019036083713066612, + "loss": 12.6584, + "step": 6122 + }, + { + "epoch": 0.33342182107778073, + "grad_norm": 0.6397074316751428, + "learning_rate": 0.0001903570593941671, + "loss": 12.5182, + "step": 6123 + }, + { + "epoch": 0.3334762750743637, + "grad_norm": 0.6753784333459871, + "learning_rate": 0.00019035328095503643, + "loss": 12.4446, + "step": 6124 + }, + { + "epoch": 0.3335307290709468, + "grad_norm": 0.6293854289115678, + "learning_rate": 0.00019034950181330348, + "loss": 12.4655, + "step": 6125 + }, + { + "epoch": 0.33358518306752977, + "grad_norm": 0.6768015115197763, + "learning_rate": 0.00019034572196899766, + "loss": 12.6518, + "step": 6126 + }, + { + "epoch": 0.33363963706411276, + "grad_norm": 0.6028883086935058, + "learning_rate": 0.00019034194142214834, + "loss": 12.4527, + "step": 6127 + }, + { + "epoch": 0.3336940910606958, + "grad_norm": 0.7512506851451719, + "learning_rate": 0.00019033816017278497, + "loss": 12.6985, + "step": 6128 + }, + { + "epoch": 0.3337485450572788, + "grad_norm": 0.6914936250403905, + "learning_rate": 0.0001903343782209369, + "loss": 12.4452, + "step": 6129 + }, + { + "epoch": 0.3338029990538618, + "grad_norm": 0.6491276706247793, + "learning_rate": 0.00019033059556663353, + "loss": 12.6077, + "step": 6130 + }, + { + "epoch": 0.33385745305044484, + "grad_norm": 0.6610362402021156, + "learning_rate": 0.0001903268122099043, + "loss": 12.4978, + "step": 6131 + }, + { + "epoch": 0.33391190704702783, + "grad_norm": 0.6583859042445529, + "learning_rate": 0.00019032302815077866, + "loss": 12.5562, + "step": 6132 + }, + { + "epoch": 0.3339663610436108, + "grad_norm": 0.6849355025464567, + "learning_rate": 0.000190319243389286, + "loss": 12.5127, + "step": 6133 + }, + { + "epoch": 0.3340208150401939, + "grad_norm": 0.6238656111143649, + "learning_rate": 0.00019031545792545576, + "loss": 12.478, + "step": 6134 + }, + { + "epoch": 0.33407526903677687, + "grad_norm": 0.6796712942676156, + "learning_rate": 0.00019031167175931736, + "loss": 12.4635, + "step": 6135 + }, + { + "epoch": 0.33412972303335986, + "grad_norm": 0.6388572352631766, + "learning_rate": 0.00019030788489090027, + "loss": 12.6054, + "step": 6136 + }, + { + "epoch": 0.3341841770299429, + "grad_norm": 0.7070234751908636, + "learning_rate": 0.0001903040973202339, + "loss": 12.5226, + "step": 6137 + }, + { + "epoch": 0.3342386310265259, + "grad_norm": 0.7598039478084103, + "learning_rate": 0.00019030030904734774, + "loss": 12.423, + "step": 6138 + }, + { + "epoch": 0.3342930850231089, + "grad_norm": 0.7158751505478054, + "learning_rate": 0.00019029652007227123, + "loss": 12.5786, + "step": 6139 + }, + { + "epoch": 0.33434753901969194, + "grad_norm": 0.6145429949826542, + "learning_rate": 0.00019029273039503387, + "loss": 12.4051, + "step": 6140 + }, + { + "epoch": 0.33440199301627493, + "grad_norm": 1.1949463795614987, + "learning_rate": 0.00019028894001566507, + "loss": 12.7602, + "step": 6141 + }, + { + "epoch": 0.3344564470128579, + "grad_norm": 0.6342220342865346, + "learning_rate": 0.00019028514893419432, + "loss": 12.4909, + "step": 6142 + }, + { + "epoch": 0.334510901009441, + "grad_norm": 0.7442484705837343, + "learning_rate": 0.0001902813571506511, + "loss": 12.7064, + "step": 6143 + }, + { + "epoch": 0.33456535500602397, + "grad_norm": 0.6796848417099655, + "learning_rate": 0.00019027756466506493, + "loss": 12.5268, + "step": 6144 + }, + { + "epoch": 0.33461980900260696, + "grad_norm": 0.6546437357082819, + "learning_rate": 0.00019027377147746524, + "loss": 12.4814, + "step": 6145 + }, + { + "epoch": 0.33467426299919, + "grad_norm": 0.7005982307981831, + "learning_rate": 0.00019026997758788162, + "loss": 12.6283, + "step": 6146 + }, + { + "epoch": 0.334728716995773, + "grad_norm": 0.6438492247460936, + "learning_rate": 0.0001902661829963435, + "loss": 12.5332, + "step": 6147 + }, + { + "epoch": 0.334783170992356, + "grad_norm": 0.7068632930100087, + "learning_rate": 0.0001902623877028804, + "loss": 12.5771, + "step": 6148 + }, + { + "epoch": 0.33483762498893904, + "grad_norm": 0.740471689430596, + "learning_rate": 0.00019025859170752183, + "loss": 12.5317, + "step": 6149 + }, + { + "epoch": 0.33489207898552203, + "grad_norm": 0.6222370742701101, + "learning_rate": 0.0001902547950102973, + "loss": 12.5265, + "step": 6150 + }, + { + "epoch": 0.3349465329821051, + "grad_norm": 0.683206095891851, + "learning_rate": 0.00019025099761123637, + "loss": 12.6167, + "step": 6151 + }, + { + "epoch": 0.3350009869786881, + "grad_norm": 0.7004754573579197, + "learning_rate": 0.00019024719951036856, + "loss": 12.6346, + "step": 6152 + }, + { + "epoch": 0.33505544097527107, + "grad_norm": 0.677115356865327, + "learning_rate": 0.00019024340070772336, + "loss": 12.5155, + "step": 6153 + }, + { + "epoch": 0.3351098949718541, + "grad_norm": 0.6041487063468916, + "learning_rate": 0.00019023960120333037, + "loss": 12.5213, + "step": 6154 + }, + { + "epoch": 0.3351643489684371, + "grad_norm": 0.6440215634195646, + "learning_rate": 0.00019023580099721907, + "loss": 12.4842, + "step": 6155 + }, + { + "epoch": 0.3352188029650201, + "grad_norm": 0.7218149914408177, + "learning_rate": 0.00019023200008941912, + "loss": 12.5914, + "step": 6156 + }, + { + "epoch": 0.33527325696160315, + "grad_norm": 0.6775480085437992, + "learning_rate": 0.00019022819847995992, + "loss": 12.608, + "step": 6157 + }, + { + "epoch": 0.33532771095818614, + "grad_norm": 0.6925979630379857, + "learning_rate": 0.00019022439616887116, + "loss": 12.6059, + "step": 6158 + }, + { + "epoch": 0.33538216495476914, + "grad_norm": 0.6479672547492501, + "learning_rate": 0.00019022059315618238, + "loss": 12.4175, + "step": 6159 + }, + { + "epoch": 0.3354366189513522, + "grad_norm": 0.767868850600018, + "learning_rate": 0.0001902167894419231, + "loss": 12.5171, + "step": 6160 + }, + { + "epoch": 0.3354910729479352, + "grad_norm": 0.6072374536074713, + "learning_rate": 0.00019021298502612294, + "loss": 12.4668, + "step": 6161 + }, + { + "epoch": 0.33554552694451817, + "grad_norm": 0.8744038116543146, + "learning_rate": 0.0001902091799088115, + "loss": 12.5322, + "step": 6162 + }, + { + "epoch": 0.3355999809411012, + "grad_norm": 0.6870120413339742, + "learning_rate": 0.00019020537409001836, + "loss": 12.5823, + "step": 6163 + }, + { + "epoch": 0.3356544349376842, + "grad_norm": 0.6446608768392295, + "learning_rate": 0.00019020156756977309, + "loss": 12.5155, + "step": 6164 + }, + { + "epoch": 0.3357088889342672, + "grad_norm": 0.6695021313655353, + "learning_rate": 0.00019019776034810527, + "loss": 12.2837, + "step": 6165 + }, + { + "epoch": 0.33576334293085025, + "grad_norm": 0.6905675103604029, + "learning_rate": 0.00019019395242504458, + "loss": 12.5365, + "step": 6166 + }, + { + "epoch": 0.33581779692743324, + "grad_norm": 0.6892280744834528, + "learning_rate": 0.0001901901438006206, + "loss": 12.5121, + "step": 6167 + }, + { + "epoch": 0.33587225092401624, + "grad_norm": 0.6263608033387138, + "learning_rate": 0.00019018633447486288, + "loss": 12.5616, + "step": 6168 + }, + { + "epoch": 0.3359267049205993, + "grad_norm": 0.7290340194306969, + "learning_rate": 0.00019018252444780116, + "loss": 12.5323, + "step": 6169 + }, + { + "epoch": 0.3359811589171823, + "grad_norm": 0.6877483949236548, + "learning_rate": 0.00019017871371946498, + "loss": 12.5425, + "step": 6170 + }, + { + "epoch": 0.33603561291376527, + "grad_norm": 0.6121751565433697, + "learning_rate": 0.000190174902289884, + "loss": 12.4145, + "step": 6171 + }, + { + "epoch": 0.3360900669103483, + "grad_norm": 0.6668410447130814, + "learning_rate": 0.00019017109015908784, + "loss": 12.6139, + "step": 6172 + }, + { + "epoch": 0.3361445209069313, + "grad_norm": 0.6548950675385099, + "learning_rate": 0.0001901672773271062, + "loss": 12.7411, + "step": 6173 + }, + { + "epoch": 0.3361989749035143, + "grad_norm": 0.6620132852747327, + "learning_rate": 0.00019016346379396867, + "loss": 12.5068, + "step": 6174 + }, + { + "epoch": 0.33625342890009735, + "grad_norm": 0.6744886194570338, + "learning_rate": 0.00019015964955970493, + "loss": 12.3716, + "step": 6175 + }, + { + "epoch": 0.33630788289668034, + "grad_norm": 0.6460492116216238, + "learning_rate": 0.00019015583462434464, + "loss": 12.6015, + "step": 6176 + }, + { + "epoch": 0.33636233689326334, + "grad_norm": 0.7586681691462406, + "learning_rate": 0.00019015201898791743, + "loss": 12.5306, + "step": 6177 + }, + { + "epoch": 0.3364167908898464, + "grad_norm": 0.7473788029604737, + "learning_rate": 0.00019014820265045304, + "loss": 12.5624, + "step": 6178 + }, + { + "epoch": 0.3364712448864294, + "grad_norm": 0.6473311838707987, + "learning_rate": 0.0001901443856119811, + "loss": 12.446, + "step": 6179 + }, + { + "epoch": 0.33652569888301237, + "grad_norm": 0.7744353742659558, + "learning_rate": 0.0001901405678725313, + "loss": 12.6546, + "step": 6180 + }, + { + "epoch": 0.3365801528795954, + "grad_norm": 0.6991532130330834, + "learning_rate": 0.00019013674943213328, + "loss": 12.5082, + "step": 6181 + }, + { + "epoch": 0.3366346068761784, + "grad_norm": 0.7557822736917306, + "learning_rate": 0.00019013293029081685, + "loss": 12.436, + "step": 6182 + }, + { + "epoch": 0.3366890608727614, + "grad_norm": 0.6549872291957558, + "learning_rate": 0.00019012911044861158, + "loss": 12.5209, + "step": 6183 + }, + { + "epoch": 0.33674351486934445, + "grad_norm": 0.6970846774936678, + "learning_rate": 0.00019012528990554727, + "loss": 12.5207, + "step": 6184 + }, + { + "epoch": 0.33679796886592744, + "grad_norm": 0.8749360840578242, + "learning_rate": 0.00019012146866165358, + "loss": 12.5774, + "step": 6185 + }, + { + "epoch": 0.3368524228625105, + "grad_norm": 0.7170587629888132, + "learning_rate": 0.00019011764671696027, + "loss": 12.4228, + "step": 6186 + }, + { + "epoch": 0.3369068768590935, + "grad_norm": 0.7733647766370366, + "learning_rate": 0.000190113824071497, + "loss": 12.4979, + "step": 6187 + }, + { + "epoch": 0.3369613308556765, + "grad_norm": 0.7267443064106669, + "learning_rate": 0.00019011000072529348, + "loss": 12.6086, + "step": 6188 + }, + { + "epoch": 0.3370157848522595, + "grad_norm": 0.656912096319067, + "learning_rate": 0.00019010617667837953, + "loss": 12.4963, + "step": 6189 + }, + { + "epoch": 0.3370702388488425, + "grad_norm": 0.7539748798624772, + "learning_rate": 0.00019010235193078482, + "loss": 12.5574, + "step": 6190 + }, + { + "epoch": 0.3371246928454255, + "grad_norm": 0.7244926493537558, + "learning_rate": 0.00019009852648253913, + "loss": 12.6162, + "step": 6191 + }, + { + "epoch": 0.33717914684200856, + "grad_norm": 0.6673239984305686, + "learning_rate": 0.00019009470033367218, + "loss": 12.4831, + "step": 6192 + }, + { + "epoch": 0.33723360083859155, + "grad_norm": 0.6975327703524771, + "learning_rate": 0.00019009087348421372, + "loss": 12.5692, + "step": 6193 + }, + { + "epoch": 0.33728805483517454, + "grad_norm": 0.7778432350781286, + "learning_rate": 0.00019008704593419354, + "loss": 12.5819, + "step": 6194 + }, + { + "epoch": 0.3373425088317576, + "grad_norm": 0.6864969326899306, + "learning_rate": 0.0001900832176836414, + "loss": 12.4972, + "step": 6195 + }, + { + "epoch": 0.3373969628283406, + "grad_norm": 0.6202551937185172, + "learning_rate": 0.00019007938873258698, + "loss": 12.5221, + "step": 6196 + }, + { + "epoch": 0.3374514168249236, + "grad_norm": 0.6537274528177562, + "learning_rate": 0.0001900755590810602, + "loss": 12.5386, + "step": 6197 + }, + { + "epoch": 0.3375058708215066, + "grad_norm": 0.7079124921438565, + "learning_rate": 0.00019007172872909073, + "loss": 12.5338, + "step": 6198 + }, + { + "epoch": 0.3375603248180896, + "grad_norm": 0.6657694746805412, + "learning_rate": 0.00019006789767670842, + "loss": 12.4443, + "step": 6199 + }, + { + "epoch": 0.3376147788146726, + "grad_norm": 0.730262940364132, + "learning_rate": 0.000190064065923943, + "loss": 12.5203, + "step": 6200 + }, + { + "epoch": 0.33766923281125566, + "grad_norm": 0.630319901736221, + "learning_rate": 0.0001900602334708243, + "loss": 12.33, + "step": 6201 + }, + { + "epoch": 0.33772368680783865, + "grad_norm": 0.7177306256126688, + "learning_rate": 0.00019005640031738216, + "loss": 12.6088, + "step": 6202 + }, + { + "epoch": 0.33777814080442164, + "grad_norm": 0.7407616600834069, + "learning_rate": 0.00019005256646364632, + "loss": 12.551, + "step": 6203 + }, + { + "epoch": 0.3378325948010047, + "grad_norm": 0.6405237750463625, + "learning_rate": 0.00019004873190964664, + "loss": 12.7041, + "step": 6204 + }, + { + "epoch": 0.3378870487975877, + "grad_norm": 0.5858757409387871, + "learning_rate": 0.0001900448966554129, + "loss": 12.4482, + "step": 6205 + }, + { + "epoch": 0.3379415027941707, + "grad_norm": 0.7294245367979105, + "learning_rate": 0.00019004106070097496, + "loss": 12.5935, + "step": 6206 + }, + { + "epoch": 0.3379959567907537, + "grad_norm": 0.6463929455412016, + "learning_rate": 0.0001900372240463626, + "loss": 12.4734, + "step": 6207 + }, + { + "epoch": 0.3380504107873367, + "grad_norm": 0.6904944812387054, + "learning_rate": 0.0001900333866916057, + "loss": 12.5885, + "step": 6208 + }, + { + "epoch": 0.3381048647839197, + "grad_norm": 0.685170401259956, + "learning_rate": 0.0001900295486367341, + "loss": 12.5826, + "step": 6209 + }, + { + "epoch": 0.33815931878050276, + "grad_norm": 0.6357849225796319, + "learning_rate": 0.00019002570988177763, + "loss": 12.5158, + "step": 6210 + }, + { + "epoch": 0.33821377277708575, + "grad_norm": 0.6683542317848408, + "learning_rate": 0.00019002187042676613, + "loss": 12.5072, + "step": 6211 + }, + { + "epoch": 0.33826822677366875, + "grad_norm": 0.6495159611040929, + "learning_rate": 0.0001900180302717295, + "loss": 12.5352, + "step": 6212 + }, + { + "epoch": 0.3383226807702518, + "grad_norm": 0.6391516161165911, + "learning_rate": 0.00019001418941669754, + "loss": 12.5244, + "step": 6213 + }, + { + "epoch": 0.3383771347668348, + "grad_norm": 0.7089891987518301, + "learning_rate": 0.00019001034786170014, + "loss": 12.8155, + "step": 6214 + }, + { + "epoch": 0.3384315887634178, + "grad_norm": 0.7321755456555036, + "learning_rate": 0.0001900065056067672, + "loss": 12.6221, + "step": 6215 + }, + { + "epoch": 0.3384860427600008, + "grad_norm": 0.7487920427898623, + "learning_rate": 0.0001900026626519286, + "loss": 12.4454, + "step": 6216 + }, + { + "epoch": 0.3385404967565838, + "grad_norm": 0.7044242558236338, + "learning_rate": 0.00018999881899721416, + "loss": 12.5365, + "step": 6217 + }, + { + "epoch": 0.33859495075316687, + "grad_norm": 0.6597639513487883, + "learning_rate": 0.00018999497464265383, + "loss": 12.5274, + "step": 6218 + }, + { + "epoch": 0.33864940474974986, + "grad_norm": 0.76686473507453, + "learning_rate": 0.00018999112958827748, + "loss": 12.5673, + "step": 6219 + }, + { + "epoch": 0.33870385874633285, + "grad_norm": 0.6161336047920143, + "learning_rate": 0.000189987283834115, + "loss": 12.5084, + "step": 6220 + }, + { + "epoch": 0.3387583127429159, + "grad_norm": 0.8074532039395405, + "learning_rate": 0.00018998343738019634, + "loss": 12.4663, + "step": 6221 + }, + { + "epoch": 0.3388127667394989, + "grad_norm": 0.6652786762546103, + "learning_rate": 0.00018997959022655137, + "loss": 12.4228, + "step": 6222 + }, + { + "epoch": 0.3388672207360819, + "grad_norm": 0.7768352951927515, + "learning_rate": 0.00018997574237321002, + "loss": 12.5579, + "step": 6223 + }, + { + "epoch": 0.33892167473266493, + "grad_norm": 0.688925320784393, + "learning_rate": 0.0001899718938202022, + "loss": 12.4842, + "step": 6224 + }, + { + "epoch": 0.3389761287292479, + "grad_norm": 0.6166350003751984, + "learning_rate": 0.00018996804456755784, + "loss": 12.5745, + "step": 6225 + }, + { + "epoch": 0.3390305827258309, + "grad_norm": 0.7968250998062604, + "learning_rate": 0.00018996419461530687, + "loss": 12.4601, + "step": 6226 + }, + { + "epoch": 0.33908503672241397, + "grad_norm": 0.7204375657832244, + "learning_rate": 0.00018996034396347929, + "loss": 12.4444, + "step": 6227 + }, + { + "epoch": 0.33913949071899696, + "grad_norm": 0.7435334241715121, + "learning_rate": 0.00018995649261210497, + "loss": 12.5888, + "step": 6228 + }, + { + "epoch": 0.33919394471557995, + "grad_norm": 0.7132449890620907, + "learning_rate": 0.0001899526405612138, + "loss": 12.4273, + "step": 6229 + }, + { + "epoch": 0.339248398712163, + "grad_norm": 0.647762025838811, + "learning_rate": 0.00018994878781083589, + "loss": 12.4589, + "step": 6230 + }, + { + "epoch": 0.339302852708746, + "grad_norm": 0.8074144202446832, + "learning_rate": 0.00018994493436100108, + "loss": 12.5515, + "step": 6231 + }, + { + "epoch": 0.339357306705329, + "grad_norm": 0.7421418526430787, + "learning_rate": 0.00018994108021173943, + "loss": 12.6725, + "step": 6232 + }, + { + "epoch": 0.33941176070191204, + "grad_norm": 0.6315796994855236, + "learning_rate": 0.0001899372253630808, + "loss": 12.5219, + "step": 6233 + }, + { + "epoch": 0.33946621469849503, + "grad_norm": 0.7867514236741496, + "learning_rate": 0.00018993336981505527, + "loss": 12.5488, + "step": 6234 + }, + { + "epoch": 0.339520668695078, + "grad_norm": 0.6825545070067882, + "learning_rate": 0.00018992951356769274, + "loss": 12.6453, + "step": 6235 + }, + { + "epoch": 0.33957512269166107, + "grad_norm": 0.5581581779692462, + "learning_rate": 0.00018992565662102323, + "loss": 12.4608, + "step": 6236 + }, + { + "epoch": 0.33962957668824406, + "grad_norm": 0.6695781279945339, + "learning_rate": 0.00018992179897507679, + "loss": 12.5149, + "step": 6237 + }, + { + "epoch": 0.33968403068482705, + "grad_norm": 0.7055852494170761, + "learning_rate": 0.00018991794062988331, + "loss": 12.5044, + "step": 6238 + }, + { + "epoch": 0.3397384846814101, + "grad_norm": 0.7226116530536991, + "learning_rate": 0.00018991408158547285, + "loss": 12.4421, + "step": 6239 + }, + { + "epoch": 0.3397929386779931, + "grad_norm": 0.7363087436848126, + "learning_rate": 0.0001899102218418754, + "loss": 12.499, + "step": 6240 + }, + { + "epoch": 0.3398473926745761, + "grad_norm": 0.6300687166747003, + "learning_rate": 0.00018990636139912102, + "loss": 12.5233, + "step": 6241 + }, + { + "epoch": 0.33990184667115914, + "grad_norm": 0.793004713938736, + "learning_rate": 0.00018990250025723967, + "loss": 12.5717, + "step": 6242 + }, + { + "epoch": 0.33995630066774213, + "grad_norm": 0.6699161458232288, + "learning_rate": 0.0001898986384162614, + "loss": 12.5451, + "step": 6243 + }, + { + "epoch": 0.3400107546643251, + "grad_norm": 0.7459639388831515, + "learning_rate": 0.00018989477587621627, + "loss": 12.5257, + "step": 6244 + }, + { + "epoch": 0.34006520866090817, + "grad_norm": 0.7132170474264726, + "learning_rate": 0.00018989091263713428, + "loss": 12.5845, + "step": 6245 + }, + { + "epoch": 0.34011966265749116, + "grad_norm": 0.6045019422774155, + "learning_rate": 0.00018988704869904547, + "loss": 12.3912, + "step": 6246 + }, + { + "epoch": 0.34017411665407415, + "grad_norm": 0.666948712286019, + "learning_rate": 0.0001898831840619799, + "loss": 12.3408, + "step": 6247 + }, + { + "epoch": 0.3402285706506572, + "grad_norm": 0.7105971968384324, + "learning_rate": 0.0001898793187259676, + "loss": 12.5605, + "step": 6248 + }, + { + "epoch": 0.3402830246472402, + "grad_norm": 0.628138207855354, + "learning_rate": 0.00018987545269103865, + "loss": 12.46, + "step": 6249 + }, + { + "epoch": 0.3403374786438232, + "grad_norm": 0.7078715466744697, + "learning_rate": 0.00018987158595722313, + "loss": 12.5245, + "step": 6250 + }, + { + "epoch": 0.34039193264040624, + "grad_norm": 0.659737373649185, + "learning_rate": 0.00018986771852455109, + "loss": 12.604, + "step": 6251 + }, + { + "epoch": 0.34044638663698923, + "grad_norm": 0.7146806557781517, + "learning_rate": 0.00018986385039305255, + "loss": 12.5897, + "step": 6252 + }, + { + "epoch": 0.3405008406335723, + "grad_norm": 0.6435362637324684, + "learning_rate": 0.00018985998156275765, + "loss": 12.5073, + "step": 6253 + }, + { + "epoch": 0.34055529463015527, + "grad_norm": 0.6518305515914652, + "learning_rate": 0.00018985611203369652, + "loss": 12.5177, + "step": 6254 + }, + { + "epoch": 0.34060974862673826, + "grad_norm": 0.7293022805036647, + "learning_rate": 0.00018985224180589913, + "loss": 12.3901, + "step": 6255 + }, + { + "epoch": 0.3406642026233213, + "grad_norm": 0.7226616833868063, + "learning_rate": 0.00018984837087939567, + "loss": 12.5533, + "step": 6256 + }, + { + "epoch": 0.3407186566199043, + "grad_norm": 0.6316932747556198, + "learning_rate": 0.00018984449925421622, + "loss": 12.2969, + "step": 6257 + }, + { + "epoch": 0.3407731106164873, + "grad_norm": 0.650964042997637, + "learning_rate": 0.00018984062693039086, + "loss": 12.545, + "step": 6258 + }, + { + "epoch": 0.34082756461307034, + "grad_norm": 0.6313622298812942, + "learning_rate": 0.00018983675390794972, + "loss": 12.5437, + "step": 6259 + }, + { + "epoch": 0.34088201860965334, + "grad_norm": 0.6856129479064164, + "learning_rate": 0.0001898328801869229, + "loss": 12.5644, + "step": 6260 + }, + { + "epoch": 0.34093647260623633, + "grad_norm": 0.6754733609937804, + "learning_rate": 0.0001898290057673406, + "loss": 12.5439, + "step": 6261 + }, + { + "epoch": 0.3409909266028194, + "grad_norm": 0.6175904268818588, + "learning_rate": 0.00018982513064923283, + "loss": 12.4148, + "step": 6262 + }, + { + "epoch": 0.34104538059940237, + "grad_norm": 0.7029133910496881, + "learning_rate": 0.00018982125483262978, + "loss": 12.6239, + "step": 6263 + }, + { + "epoch": 0.34109983459598536, + "grad_norm": 0.6764524896672984, + "learning_rate": 0.0001898173783175616, + "loss": 12.4716, + "step": 6264 + }, + { + "epoch": 0.3411542885925684, + "grad_norm": 0.6473075046714889, + "learning_rate": 0.00018981350110405844, + "loss": 12.3433, + "step": 6265 + }, + { + "epoch": 0.3412087425891514, + "grad_norm": 0.7340041196610504, + "learning_rate": 0.0001898096231921504, + "loss": 12.6536, + "step": 6266 + }, + { + "epoch": 0.3412631965857344, + "grad_norm": 0.6704673154087119, + "learning_rate": 0.00018980574458186774, + "loss": 12.5468, + "step": 6267 + }, + { + "epoch": 0.34131765058231744, + "grad_norm": 0.6352924231527934, + "learning_rate": 0.00018980186527324048, + "loss": 12.5107, + "step": 6268 + }, + { + "epoch": 0.34137210457890044, + "grad_norm": 0.6933220771845349, + "learning_rate": 0.00018979798526629887, + "loss": 12.4702, + "step": 6269 + }, + { + "epoch": 0.34142655857548343, + "grad_norm": 0.6339519467155723, + "learning_rate": 0.00018979410456107306, + "loss": 12.3719, + "step": 6270 + }, + { + "epoch": 0.3414810125720665, + "grad_norm": 0.7721477942860392, + "learning_rate": 0.00018979022315759325, + "loss": 12.6534, + "step": 6271 + }, + { + "epoch": 0.34153546656864947, + "grad_norm": 0.6153794386548143, + "learning_rate": 0.00018978634105588961, + "loss": 12.574, + "step": 6272 + }, + { + "epoch": 0.34158992056523246, + "grad_norm": 0.6769230590319805, + "learning_rate": 0.00018978245825599234, + "loss": 12.4373, + "step": 6273 + }, + { + "epoch": 0.3416443745618155, + "grad_norm": 0.8119995792072666, + "learning_rate": 0.00018977857475793158, + "loss": 12.5873, + "step": 6274 + }, + { + "epoch": 0.3416988285583985, + "grad_norm": 0.6749571198734036, + "learning_rate": 0.0001897746905617376, + "loss": 12.6444, + "step": 6275 + }, + { + "epoch": 0.3417532825549815, + "grad_norm": 0.7317901747313105, + "learning_rate": 0.00018977080566744055, + "loss": 12.469, + "step": 6276 + }, + { + "epoch": 0.34180773655156454, + "grad_norm": 0.635670753116196, + "learning_rate": 0.00018976692007507067, + "loss": 12.3512, + "step": 6277 + }, + { + "epoch": 0.34186219054814754, + "grad_norm": 0.7243933147841601, + "learning_rate": 0.00018976303378465814, + "loss": 12.5583, + "step": 6278 + }, + { + "epoch": 0.34191664454473053, + "grad_norm": 0.6084842547409487, + "learning_rate": 0.00018975914679623325, + "loss": 12.538, + "step": 6279 + }, + { + "epoch": 0.3419710985413136, + "grad_norm": 0.7593564909409162, + "learning_rate": 0.00018975525910982615, + "loss": 12.6837, + "step": 6280 + }, + { + "epoch": 0.34202555253789657, + "grad_norm": 0.636182795196838, + "learning_rate": 0.0001897513707254671, + "loss": 12.5124, + "step": 6281 + }, + { + "epoch": 0.34208000653447956, + "grad_norm": 0.8155321522237161, + "learning_rate": 0.00018974748164318636, + "loss": 12.5131, + "step": 6282 + }, + { + "epoch": 0.3421344605310626, + "grad_norm": 0.6426416580788399, + "learning_rate": 0.00018974359186301417, + "loss": 12.5234, + "step": 6283 + }, + { + "epoch": 0.3421889145276456, + "grad_norm": 0.8193055100539444, + "learning_rate": 0.00018973970138498071, + "loss": 12.5375, + "step": 6284 + }, + { + "epoch": 0.34224336852422865, + "grad_norm": 0.7185734747917539, + "learning_rate": 0.00018973581020911634, + "loss": 12.4833, + "step": 6285 + }, + { + "epoch": 0.34229782252081165, + "grad_norm": 0.6288019118690862, + "learning_rate": 0.0001897319183354512, + "loss": 12.6266, + "step": 6286 + }, + { + "epoch": 0.34235227651739464, + "grad_norm": 0.7132135055944064, + "learning_rate": 0.00018972802576401566, + "loss": 12.6449, + "step": 6287 + }, + { + "epoch": 0.3424067305139777, + "grad_norm": 0.5785610189135727, + "learning_rate": 0.00018972413249483992, + "loss": 12.4041, + "step": 6288 + }, + { + "epoch": 0.3424611845105607, + "grad_norm": 0.703198984063344, + "learning_rate": 0.00018972023852795427, + "loss": 12.5444, + "step": 6289 + }, + { + "epoch": 0.34251563850714367, + "grad_norm": 0.668041192115363, + "learning_rate": 0.000189716343863389, + "loss": 12.5276, + "step": 6290 + }, + { + "epoch": 0.3425700925037267, + "grad_norm": 0.7179533760536456, + "learning_rate": 0.00018971244850117443, + "loss": 12.4907, + "step": 6291 + }, + { + "epoch": 0.3426245465003097, + "grad_norm": 0.6547689597350955, + "learning_rate": 0.0001897085524413408, + "loss": 12.5585, + "step": 6292 + }, + { + "epoch": 0.3426790004968927, + "grad_norm": 0.6892810246093969, + "learning_rate": 0.0001897046556839184, + "loss": 12.6043, + "step": 6293 + }, + { + "epoch": 0.34273345449347575, + "grad_norm": 0.6293810182504478, + "learning_rate": 0.0001897007582289376, + "loss": 12.5168, + "step": 6294 + }, + { + "epoch": 0.34278790849005875, + "grad_norm": 0.6465223613402978, + "learning_rate": 0.0001896968600764286, + "loss": 12.5322, + "step": 6295 + }, + { + "epoch": 0.34284236248664174, + "grad_norm": 0.6450240234888404, + "learning_rate": 0.00018969296122642185, + "loss": 12.5049, + "step": 6296 + }, + { + "epoch": 0.3428968164832248, + "grad_norm": 0.700539836061821, + "learning_rate": 0.00018968906167894753, + "loss": 12.5086, + "step": 6297 + }, + { + "epoch": 0.3429512704798078, + "grad_norm": 0.6584639102315182, + "learning_rate": 0.00018968516143403604, + "loss": 12.4562, + "step": 6298 + }, + { + "epoch": 0.34300572447639077, + "grad_norm": 0.6823128388229975, + "learning_rate": 0.00018968126049171772, + "loss": 12.5732, + "step": 6299 + }, + { + "epoch": 0.3430601784729738, + "grad_norm": 0.6786200126620332, + "learning_rate": 0.00018967735885202285, + "loss": 12.353, + "step": 6300 + }, + { + "epoch": 0.3431146324695568, + "grad_norm": 0.7928388839289368, + "learning_rate": 0.0001896734565149818, + "loss": 12.5546, + "step": 6301 + }, + { + "epoch": 0.3431690864661398, + "grad_norm": 0.8269561615090272, + "learning_rate": 0.00018966955348062494, + "loss": 12.4672, + "step": 6302 + }, + { + "epoch": 0.34322354046272285, + "grad_norm": 0.6666521103507326, + "learning_rate": 0.00018966564974898256, + "loss": 12.5375, + "step": 6303 + }, + { + "epoch": 0.34327799445930585, + "grad_norm": 0.6701268520745489, + "learning_rate": 0.00018966174532008507, + "loss": 12.6131, + "step": 6304 + }, + { + "epoch": 0.34333244845588884, + "grad_norm": 0.6766200647318046, + "learning_rate": 0.0001896578401939628, + "loss": 12.5149, + "step": 6305 + }, + { + "epoch": 0.3433869024524719, + "grad_norm": 0.6599092895292733, + "learning_rate": 0.00018965393437064614, + "loss": 12.4065, + "step": 6306 + }, + { + "epoch": 0.3434413564490549, + "grad_norm": 0.6659625059287979, + "learning_rate": 0.00018965002785016543, + "loss": 12.5598, + "step": 6307 + }, + { + "epoch": 0.3434958104456379, + "grad_norm": 0.8131428911036774, + "learning_rate": 0.0001896461206325511, + "loss": 12.6066, + "step": 6308 + }, + { + "epoch": 0.3435502644422209, + "grad_norm": 0.6293054241618713, + "learning_rate": 0.00018964221271783349, + "loss": 12.5573, + "step": 6309 + }, + { + "epoch": 0.3436047184388039, + "grad_norm": 0.618067658836677, + "learning_rate": 0.000189638304106043, + "loss": 12.4552, + "step": 6310 + }, + { + "epoch": 0.3436591724353869, + "grad_norm": 0.8179694432192177, + "learning_rate": 0.00018963439479721003, + "loss": 12.5569, + "step": 6311 + }, + { + "epoch": 0.34371362643196995, + "grad_norm": 0.6206028828875811, + "learning_rate": 0.00018963048479136497, + "loss": 12.5951, + "step": 6312 + }, + { + "epoch": 0.34376808042855295, + "grad_norm": 0.662690160695336, + "learning_rate": 0.0001896265740885382, + "loss": 12.555, + "step": 6313 + }, + { + "epoch": 0.34382253442513594, + "grad_norm": 0.692073058932196, + "learning_rate": 0.0001896226626887602, + "loss": 12.4739, + "step": 6314 + }, + { + "epoch": 0.343876988421719, + "grad_norm": 0.6636264210544816, + "learning_rate": 0.00018961875059206136, + "loss": 12.6116, + "step": 6315 + }, + { + "epoch": 0.343931442418302, + "grad_norm": 0.7960881435996826, + "learning_rate": 0.00018961483779847204, + "loss": 12.5392, + "step": 6316 + }, + { + "epoch": 0.343985896414885, + "grad_norm": 0.7024499992984288, + "learning_rate": 0.00018961092430802275, + "loss": 12.5088, + "step": 6317 + }, + { + "epoch": 0.344040350411468, + "grad_norm": 0.5998701393444636, + "learning_rate": 0.00018960701012074387, + "loss": 12.4843, + "step": 6318 + }, + { + "epoch": 0.344094804408051, + "grad_norm": 0.6386075283677938, + "learning_rate": 0.00018960309523666585, + "loss": 12.5588, + "step": 6319 + }, + { + "epoch": 0.34414925840463406, + "grad_norm": 0.6187861240384069, + "learning_rate": 0.00018959917965581912, + "loss": 12.5536, + "step": 6320 + }, + { + "epoch": 0.34420371240121705, + "grad_norm": 0.700612269892657, + "learning_rate": 0.00018959526337823416, + "loss": 12.5407, + "step": 6321 + }, + { + "epoch": 0.34425816639780005, + "grad_norm": 0.713290950613392, + "learning_rate": 0.00018959134640394141, + "loss": 12.5776, + "step": 6322 + }, + { + "epoch": 0.3443126203943831, + "grad_norm": 0.6599590469218795, + "learning_rate": 0.0001895874287329713, + "loss": 12.646, + "step": 6323 + }, + { + "epoch": 0.3443670743909661, + "grad_norm": 0.7129764251208779, + "learning_rate": 0.00018958351036535437, + "loss": 12.5006, + "step": 6324 + }, + { + "epoch": 0.3444215283875491, + "grad_norm": 0.6853621732756515, + "learning_rate": 0.000189579591301121, + "loss": 12.4555, + "step": 6325 + }, + { + "epoch": 0.34447598238413213, + "grad_norm": 0.6806288970030699, + "learning_rate": 0.00018957567154030173, + "loss": 12.5047, + "step": 6326 + }, + { + "epoch": 0.3445304363807151, + "grad_norm": 0.694590261163472, + "learning_rate": 0.000189571751082927, + "loss": 12.6542, + "step": 6327 + }, + { + "epoch": 0.3445848903772981, + "grad_norm": 0.6753126132952569, + "learning_rate": 0.0001895678299290273, + "loss": 12.4212, + "step": 6328 + }, + { + "epoch": 0.34463934437388116, + "grad_norm": 0.7192878280330849, + "learning_rate": 0.00018956390807863316, + "loss": 12.3633, + "step": 6329 + }, + { + "epoch": 0.34469379837046416, + "grad_norm": 0.7326573316491729, + "learning_rate": 0.00018955998553177504, + "loss": 12.5626, + "step": 6330 + }, + { + "epoch": 0.34474825236704715, + "grad_norm": 0.5918551403736377, + "learning_rate": 0.00018955606228848347, + "loss": 12.4205, + "step": 6331 + }, + { + "epoch": 0.3448027063636302, + "grad_norm": 0.7891637583902474, + "learning_rate": 0.00018955213834878892, + "loss": 12.5029, + "step": 6332 + }, + { + "epoch": 0.3448571603602132, + "grad_norm": 0.5980834101002779, + "learning_rate": 0.00018954821371272194, + "loss": 12.4273, + "step": 6333 + }, + { + "epoch": 0.3449116143567962, + "grad_norm": 0.8125656724378687, + "learning_rate": 0.000189544288380313, + "loss": 12.5797, + "step": 6334 + }, + { + "epoch": 0.34496606835337923, + "grad_norm": 0.7509595238380199, + "learning_rate": 0.0001895403623515927, + "loss": 12.5089, + "step": 6335 + }, + { + "epoch": 0.3450205223499622, + "grad_norm": 0.7248800926147912, + "learning_rate": 0.0001895364356265915, + "loss": 12.5702, + "step": 6336 + }, + { + "epoch": 0.3450749763465452, + "grad_norm": 0.8055583215818384, + "learning_rate": 0.00018953250820533994, + "loss": 12.5792, + "step": 6337 + }, + { + "epoch": 0.34512943034312826, + "grad_norm": 0.5895339977865024, + "learning_rate": 0.00018952858008786861, + "loss": 12.5369, + "step": 6338 + }, + { + "epoch": 0.34518388433971126, + "grad_norm": 0.7844342988424144, + "learning_rate": 0.000189524651274208, + "loss": 12.6507, + "step": 6339 + }, + { + "epoch": 0.34523833833629425, + "grad_norm": 0.641138721002702, + "learning_rate": 0.00018952072176438875, + "loss": 12.473, + "step": 6340 + }, + { + "epoch": 0.3452927923328773, + "grad_norm": 0.7272744757161392, + "learning_rate": 0.0001895167915584413, + "loss": 12.5863, + "step": 6341 + }, + { + "epoch": 0.3453472463294603, + "grad_norm": 0.7097480150671507, + "learning_rate": 0.0001895128606563963, + "loss": 12.6004, + "step": 6342 + }, + { + "epoch": 0.3454017003260433, + "grad_norm": 0.6246608664740043, + "learning_rate": 0.0001895089290582843, + "loss": 12.4141, + "step": 6343 + }, + { + "epoch": 0.34545615432262633, + "grad_norm": 0.6930736444561254, + "learning_rate": 0.0001895049967641358, + "loss": 12.4728, + "step": 6344 + }, + { + "epoch": 0.3455106083192093, + "grad_norm": 0.6358442922013844, + "learning_rate": 0.00018950106377398147, + "loss": 12.601, + "step": 6345 + }, + { + "epoch": 0.3455650623157923, + "grad_norm": 0.6550122925149894, + "learning_rate": 0.00018949713008785187, + "loss": 12.6569, + "step": 6346 + }, + { + "epoch": 0.34561951631237536, + "grad_norm": 0.6239208208894378, + "learning_rate": 0.00018949319570577756, + "loss": 12.5735, + "step": 6347 + }, + { + "epoch": 0.34567397030895836, + "grad_norm": 0.6506837444783518, + "learning_rate": 0.0001894892606277891, + "loss": 12.5761, + "step": 6348 + }, + { + "epoch": 0.34572842430554135, + "grad_norm": 0.6475058151362632, + "learning_rate": 0.00018948532485391724, + "loss": 12.516, + "step": 6349 + }, + { + "epoch": 0.3457828783021244, + "grad_norm": 0.6793182543915407, + "learning_rate": 0.00018948138838419243, + "loss": 12.3702, + "step": 6350 + }, + { + "epoch": 0.3458373322987074, + "grad_norm": 0.6628263555202152, + "learning_rate": 0.00018947745121864534, + "loss": 12.4945, + "step": 6351 + }, + { + "epoch": 0.34589178629529044, + "grad_norm": 0.6251765624796839, + "learning_rate": 0.0001894735133573066, + "loss": 12.5348, + "step": 6352 + }, + { + "epoch": 0.34594624029187343, + "grad_norm": 0.6143638650692151, + "learning_rate": 0.0001894695748002068, + "loss": 12.4181, + "step": 6353 + }, + { + "epoch": 0.3460006942884564, + "grad_norm": 0.6529588496376133, + "learning_rate": 0.0001894656355473766, + "loss": 12.5347, + "step": 6354 + }, + { + "epoch": 0.34605514828503947, + "grad_norm": 0.6120964415426231, + "learning_rate": 0.0001894616955988466, + "loss": 12.5119, + "step": 6355 + }, + { + "epoch": 0.34610960228162246, + "grad_norm": 0.7211947140757892, + "learning_rate": 0.00018945775495464746, + "loss": 12.5204, + "step": 6356 + }, + { + "epoch": 0.34616405627820546, + "grad_norm": 0.7094435979733846, + "learning_rate": 0.0001894538136148098, + "loss": 12.6202, + "step": 6357 + }, + { + "epoch": 0.3462185102747885, + "grad_norm": 0.7143215980668297, + "learning_rate": 0.00018944987157936433, + "loss": 12.5306, + "step": 6358 + }, + { + "epoch": 0.3462729642713715, + "grad_norm": 0.594273115652585, + "learning_rate": 0.00018944592884834158, + "loss": 12.5041, + "step": 6359 + }, + { + "epoch": 0.3463274182679545, + "grad_norm": 0.6021587890983173, + "learning_rate": 0.00018944198542177233, + "loss": 12.4764, + "step": 6360 + }, + { + "epoch": 0.34638187226453754, + "grad_norm": 0.6925011127854152, + "learning_rate": 0.00018943804129968722, + "loss": 12.4984, + "step": 6361 + }, + { + "epoch": 0.34643632626112053, + "grad_norm": 0.6341869152621732, + "learning_rate": 0.00018943409648211688, + "loss": 12.5632, + "step": 6362 + }, + { + "epoch": 0.3464907802577035, + "grad_norm": 0.6482645542487078, + "learning_rate": 0.00018943015096909203, + "loss": 12.539, + "step": 6363 + }, + { + "epoch": 0.34654523425428657, + "grad_norm": 1.0140057634003605, + "learning_rate": 0.0001894262047606433, + "loss": 12.5651, + "step": 6364 + }, + { + "epoch": 0.34659968825086956, + "grad_norm": 0.7749586692842871, + "learning_rate": 0.0001894222578568014, + "loss": 12.6487, + "step": 6365 + }, + { + "epoch": 0.34665414224745256, + "grad_norm": 0.7511098701115632, + "learning_rate": 0.00018941831025759705, + "loss": 12.5013, + "step": 6366 + }, + { + "epoch": 0.3467085962440356, + "grad_norm": 0.6900038024918674, + "learning_rate": 0.00018941436196306092, + "loss": 12.4541, + "step": 6367 + }, + { + "epoch": 0.3467630502406186, + "grad_norm": 0.649129436938782, + "learning_rate": 0.0001894104129732237, + "loss": 12.5152, + "step": 6368 + }, + { + "epoch": 0.3468175042372016, + "grad_norm": 0.6369552672624779, + "learning_rate": 0.00018940646328811616, + "loss": 12.4275, + "step": 6369 + }, + { + "epoch": 0.34687195823378464, + "grad_norm": 0.6559936737005914, + "learning_rate": 0.0001894025129077689, + "loss": 12.5292, + "step": 6370 + }, + { + "epoch": 0.34692641223036763, + "grad_norm": 0.6739824166369338, + "learning_rate": 0.00018939856183221277, + "loss": 12.5955, + "step": 6371 + }, + { + "epoch": 0.3469808662269506, + "grad_norm": 0.6310087891162702, + "learning_rate": 0.0001893946100614784, + "loss": 12.5557, + "step": 6372 + }, + { + "epoch": 0.34703532022353367, + "grad_norm": 0.7462373695583338, + "learning_rate": 0.00018939065759559655, + "loss": 12.498, + "step": 6373 + }, + { + "epoch": 0.34708977422011666, + "grad_norm": 0.6208249439174864, + "learning_rate": 0.00018938670443459797, + "loss": 12.535, + "step": 6374 + }, + { + "epoch": 0.34714422821669966, + "grad_norm": 0.6676441700139456, + "learning_rate": 0.0001893827505785134, + "loss": 12.5842, + "step": 6375 + }, + { + "epoch": 0.3471986822132827, + "grad_norm": 0.6282155039035795, + "learning_rate": 0.00018937879602737352, + "loss": 12.5026, + "step": 6376 + }, + { + "epoch": 0.3472531362098657, + "grad_norm": 0.7052680426249938, + "learning_rate": 0.00018937484078120916, + "loss": 12.4728, + "step": 6377 + }, + { + "epoch": 0.3473075902064487, + "grad_norm": 0.6920639408081833, + "learning_rate": 0.00018937088484005107, + "loss": 12.5618, + "step": 6378 + }, + { + "epoch": 0.34736204420303174, + "grad_norm": 0.6593306649432598, + "learning_rate": 0.00018936692820392995, + "loss": 12.5355, + "step": 6379 + }, + { + "epoch": 0.34741649819961473, + "grad_norm": 0.6217853510305813, + "learning_rate": 0.00018936297087287663, + "loss": 12.4688, + "step": 6380 + }, + { + "epoch": 0.3474709521961977, + "grad_norm": 0.7318749515701808, + "learning_rate": 0.00018935901284692188, + "loss": 12.475, + "step": 6381 + }, + { + "epoch": 0.3475254061927808, + "grad_norm": 0.711444766056599, + "learning_rate": 0.00018935505412609645, + "loss": 12.6268, + "step": 6382 + }, + { + "epoch": 0.34757986018936377, + "grad_norm": 0.7406652625637992, + "learning_rate": 0.0001893510947104311, + "loss": 12.5164, + "step": 6383 + }, + { + "epoch": 0.34763431418594676, + "grad_norm": 0.7130865533038795, + "learning_rate": 0.0001893471345999567, + "loss": 12.5484, + "step": 6384 + }, + { + "epoch": 0.3476887681825298, + "grad_norm": 0.6612167121536203, + "learning_rate": 0.000189343173794704, + "loss": 12.4729, + "step": 6385 + }, + { + "epoch": 0.3477432221791128, + "grad_norm": 0.6065860871835124, + "learning_rate": 0.00018933921229470375, + "loss": 12.5035, + "step": 6386 + }, + { + "epoch": 0.34779767617569585, + "grad_norm": 0.691402387102845, + "learning_rate": 0.00018933525009998684, + "loss": 12.5822, + "step": 6387 + }, + { + "epoch": 0.34785213017227884, + "grad_norm": 0.7433675288628595, + "learning_rate": 0.00018933128721058403, + "loss": 12.6504, + "step": 6388 + }, + { + "epoch": 0.34790658416886183, + "grad_norm": 0.6196010968741086, + "learning_rate": 0.00018932732362652617, + "loss": 12.4233, + "step": 6389 + }, + { + "epoch": 0.3479610381654449, + "grad_norm": 0.668475042083779, + "learning_rate": 0.00018932335934784407, + "loss": 12.5057, + "step": 6390 + }, + { + "epoch": 0.3480154921620279, + "grad_norm": 0.9062156252444816, + "learning_rate": 0.0001893193943745685, + "loss": 12.5867, + "step": 6391 + }, + { + "epoch": 0.34806994615861087, + "grad_norm": 0.6297267785296019, + "learning_rate": 0.0001893154287067304, + "loss": 12.5889, + "step": 6392 + }, + { + "epoch": 0.3481244001551939, + "grad_norm": 0.6707450566245595, + "learning_rate": 0.00018931146234436047, + "loss": 12.457, + "step": 6393 + }, + { + "epoch": 0.3481788541517769, + "grad_norm": 0.7407808248953949, + "learning_rate": 0.00018930749528748967, + "loss": 12.5703, + "step": 6394 + }, + { + "epoch": 0.3482333081483599, + "grad_norm": 0.679668366691635, + "learning_rate": 0.0001893035275361488, + "loss": 12.5665, + "step": 6395 + }, + { + "epoch": 0.34828776214494295, + "grad_norm": 0.6529951348154104, + "learning_rate": 0.0001892995590903688, + "loss": 12.5542, + "step": 6396 + }, + { + "epoch": 0.34834221614152594, + "grad_norm": 0.6383316905634563, + "learning_rate": 0.00018929558995018036, + "loss": 12.5076, + "step": 6397 + }, + { + "epoch": 0.34839667013810893, + "grad_norm": 0.7291276863483566, + "learning_rate": 0.00018929162011561447, + "loss": 12.5493, + "step": 6398 + }, + { + "epoch": 0.348451124134692, + "grad_norm": 0.6492707381007455, + "learning_rate": 0.00018928764958670198, + "loss": 12.4822, + "step": 6399 + }, + { + "epoch": 0.348505578131275, + "grad_norm": 0.7079173277913959, + "learning_rate": 0.00018928367836347373, + "loss": 12.5809, + "step": 6400 + }, + { + "epoch": 0.34856003212785797, + "grad_norm": 0.750570468667512, + "learning_rate": 0.00018927970644596064, + "loss": 12.481, + "step": 6401 + }, + { + "epoch": 0.348614486124441, + "grad_norm": 0.6244912076523662, + "learning_rate": 0.00018927573383419356, + "loss": 12.4996, + "step": 6402 + }, + { + "epoch": 0.348668940121024, + "grad_norm": 0.7211818342696633, + "learning_rate": 0.0001892717605282034, + "loss": 12.5767, + "step": 6403 + }, + { + "epoch": 0.348723394117607, + "grad_norm": 0.6794779875901277, + "learning_rate": 0.00018926778652802111, + "loss": 12.458, + "step": 6404 + }, + { + "epoch": 0.34877784811419005, + "grad_norm": 0.654629105489235, + "learning_rate": 0.0001892638118336775, + "loss": 12.5111, + "step": 6405 + }, + { + "epoch": 0.34883230211077304, + "grad_norm": 0.6827810366255843, + "learning_rate": 0.00018925983644520352, + "loss": 12.563, + "step": 6406 + }, + { + "epoch": 0.34888675610735603, + "grad_norm": 0.6691807604313643, + "learning_rate": 0.0001892558603626301, + "loss": 12.608, + "step": 6407 + }, + { + "epoch": 0.3489412101039391, + "grad_norm": 0.6556237166333346, + "learning_rate": 0.00018925188358598813, + "loss": 12.499, + "step": 6408 + }, + { + "epoch": 0.3489956641005221, + "grad_norm": 0.669163097908646, + "learning_rate": 0.00018924790611530857, + "loss": 12.563, + "step": 6409 + }, + { + "epoch": 0.34905011809710507, + "grad_norm": 0.6070951405750545, + "learning_rate": 0.00018924392795062226, + "loss": 12.3274, + "step": 6410 + }, + { + "epoch": 0.3491045720936881, + "grad_norm": 0.7249396675579225, + "learning_rate": 0.00018923994909196025, + "loss": 12.475, + "step": 6411 + }, + { + "epoch": 0.3491590260902711, + "grad_norm": 0.7266796712993444, + "learning_rate": 0.00018923596953935342, + "loss": 12.6071, + "step": 6412 + }, + { + "epoch": 0.3492134800868541, + "grad_norm": 0.6490803692654883, + "learning_rate": 0.00018923198929283276, + "loss": 12.5236, + "step": 6413 + }, + { + "epoch": 0.34926793408343715, + "grad_norm": 0.741220300259277, + "learning_rate": 0.00018922800835242915, + "loss": 12.635, + "step": 6414 + }, + { + "epoch": 0.34932238808002014, + "grad_norm": 0.7445484770609918, + "learning_rate": 0.0001892240267181736, + "loss": 12.6129, + "step": 6415 + }, + { + "epoch": 0.34937684207660313, + "grad_norm": 0.6759528956057483, + "learning_rate": 0.00018922004439009702, + "loss": 12.4762, + "step": 6416 + }, + { + "epoch": 0.3494312960731862, + "grad_norm": 0.6745366120055789, + "learning_rate": 0.00018921606136823046, + "loss": 12.6715, + "step": 6417 + }, + { + "epoch": 0.3494857500697692, + "grad_norm": 0.8237305176788643, + "learning_rate": 0.00018921207765260482, + "loss": 12.5819, + "step": 6418 + }, + { + "epoch": 0.3495402040663522, + "grad_norm": 0.7112539950806442, + "learning_rate": 0.00018920809324325107, + "loss": 12.5541, + "step": 6419 + }, + { + "epoch": 0.3495946580629352, + "grad_norm": 0.6667071155633175, + "learning_rate": 0.0001892041081402003, + "loss": 12.5276, + "step": 6420 + }, + { + "epoch": 0.3496491120595182, + "grad_norm": 0.6273084531789137, + "learning_rate": 0.0001892001223434834, + "loss": 12.5549, + "step": 6421 + }, + { + "epoch": 0.34970356605610126, + "grad_norm": 0.6750038564921762, + "learning_rate": 0.00018919613585313135, + "loss": 12.4548, + "step": 6422 + }, + { + "epoch": 0.34975802005268425, + "grad_norm": 0.6252492245083748, + "learning_rate": 0.00018919214866917522, + "loss": 12.4935, + "step": 6423 + }, + { + "epoch": 0.34981247404926724, + "grad_norm": 0.6984788655048768, + "learning_rate": 0.000189188160791646, + "loss": 12.5693, + "step": 6424 + }, + { + "epoch": 0.3498669280458503, + "grad_norm": 0.6671234434806845, + "learning_rate": 0.00018918417222057467, + "loss": 12.4412, + "step": 6425 + }, + { + "epoch": 0.3499213820424333, + "grad_norm": 0.7258796397397406, + "learning_rate": 0.00018918018295599224, + "loss": 12.489, + "step": 6426 + }, + { + "epoch": 0.3499758360390163, + "grad_norm": 0.6848281360826799, + "learning_rate": 0.00018917619299792978, + "loss": 12.555, + "step": 6427 + }, + { + "epoch": 0.3500302900355993, + "grad_norm": 0.6007537268845069, + "learning_rate": 0.00018917220234641828, + "loss": 12.4034, + "step": 6428 + }, + { + "epoch": 0.3500847440321823, + "grad_norm": 0.6459033432263563, + "learning_rate": 0.00018916821100148877, + "loss": 12.5938, + "step": 6429 + }, + { + "epoch": 0.3501391980287653, + "grad_norm": 0.6104972833604558, + "learning_rate": 0.00018916421896317232, + "loss": 12.5256, + "step": 6430 + }, + { + "epoch": 0.35019365202534836, + "grad_norm": 0.724356568467194, + "learning_rate": 0.00018916022623149994, + "loss": 12.5068, + "step": 6431 + }, + { + "epoch": 0.35024810602193135, + "grad_norm": 0.6684551809083941, + "learning_rate": 0.00018915623280650268, + "loss": 12.4023, + "step": 6432 + }, + { + "epoch": 0.35030256001851434, + "grad_norm": 0.6606085311908325, + "learning_rate": 0.00018915223868821158, + "loss": 12.3153, + "step": 6433 + }, + { + "epoch": 0.3503570140150974, + "grad_norm": 0.6925034443062139, + "learning_rate": 0.00018914824387665776, + "loss": 12.5406, + "step": 6434 + }, + { + "epoch": 0.3504114680116804, + "grad_norm": 0.658729625903655, + "learning_rate": 0.00018914424837187225, + "loss": 12.4132, + "step": 6435 + }, + { + "epoch": 0.3504659220082634, + "grad_norm": 0.6684606670577931, + "learning_rate": 0.0001891402521738861, + "loss": 12.4707, + "step": 6436 + }, + { + "epoch": 0.3505203760048464, + "grad_norm": 0.7201695038485614, + "learning_rate": 0.0001891362552827304, + "loss": 12.6224, + "step": 6437 + }, + { + "epoch": 0.3505748300014294, + "grad_norm": 0.7326773653782573, + "learning_rate": 0.00018913225769843624, + "loss": 12.5076, + "step": 6438 + }, + { + "epoch": 0.3506292839980124, + "grad_norm": 0.6495451475924698, + "learning_rate": 0.00018912825942103467, + "loss": 12.4381, + "step": 6439 + }, + { + "epoch": 0.35068373799459546, + "grad_norm": 0.6452599673897227, + "learning_rate": 0.00018912426045055683, + "loss": 12.5916, + "step": 6440 + }, + { + "epoch": 0.35073819199117845, + "grad_norm": 0.6765236346094069, + "learning_rate": 0.0001891202607870338, + "loss": 12.5231, + "step": 6441 + }, + { + "epoch": 0.35079264598776144, + "grad_norm": 0.7173390388248864, + "learning_rate": 0.00018911626043049666, + "loss": 12.6167, + "step": 6442 + }, + { + "epoch": 0.3508470999843445, + "grad_norm": 0.5881983259786162, + "learning_rate": 0.0001891122593809765, + "loss": 12.4536, + "step": 6443 + }, + { + "epoch": 0.3509015539809275, + "grad_norm": 0.6353387106698005, + "learning_rate": 0.00018910825763850456, + "loss": 12.4374, + "step": 6444 + }, + { + "epoch": 0.3509560079775105, + "grad_norm": 0.6819914447916594, + "learning_rate": 0.0001891042552031118, + "loss": 12.6692, + "step": 6445 + }, + { + "epoch": 0.3510104619740935, + "grad_norm": 0.6744783603916592, + "learning_rate": 0.00018910025207482942, + "loss": 12.5202, + "step": 6446 + }, + { + "epoch": 0.3510649159706765, + "grad_norm": 0.9038488748322342, + "learning_rate": 0.00018909624825368853, + "loss": 12.4483, + "step": 6447 + }, + { + "epoch": 0.3511193699672595, + "grad_norm": 0.6540032197278525, + "learning_rate": 0.00018909224373972027, + "loss": 12.4902, + "step": 6448 + }, + { + "epoch": 0.35117382396384256, + "grad_norm": 0.6431012659983741, + "learning_rate": 0.0001890882385329558, + "loss": 12.5164, + "step": 6449 + }, + { + "epoch": 0.35122827796042555, + "grad_norm": 0.6713430875386474, + "learning_rate": 0.00018908423263342626, + "loss": 12.5303, + "step": 6450 + }, + { + "epoch": 0.35128273195700854, + "grad_norm": 0.7401191797977572, + "learning_rate": 0.00018908022604116276, + "loss": 12.5052, + "step": 6451 + }, + { + "epoch": 0.3513371859535916, + "grad_norm": 0.6505333356783289, + "learning_rate": 0.00018907621875619647, + "loss": 12.5267, + "step": 6452 + }, + { + "epoch": 0.3513916399501746, + "grad_norm": 0.6812347254619805, + "learning_rate": 0.00018907221077855862, + "loss": 12.4048, + "step": 6453 + }, + { + "epoch": 0.35144609394675763, + "grad_norm": 0.6833135113192058, + "learning_rate": 0.00018906820210828028, + "loss": 12.5132, + "step": 6454 + }, + { + "epoch": 0.3515005479433406, + "grad_norm": 0.6871285856235322, + "learning_rate": 0.00018906419274539266, + "loss": 12.5483, + "step": 6455 + }, + { + "epoch": 0.3515550019399236, + "grad_norm": 0.7299322555996667, + "learning_rate": 0.00018906018268992694, + "loss": 12.396, + "step": 6456 + }, + { + "epoch": 0.35160945593650667, + "grad_norm": 0.6799131452832852, + "learning_rate": 0.00018905617194191435, + "loss": 12.4827, + "step": 6457 + }, + { + "epoch": 0.35166390993308966, + "grad_norm": 0.6595334118817054, + "learning_rate": 0.00018905216050138596, + "loss": 12.5873, + "step": 6458 + }, + { + "epoch": 0.35171836392967265, + "grad_norm": 0.7508304565503127, + "learning_rate": 0.00018904814836837307, + "loss": 12.5145, + "step": 6459 + }, + { + "epoch": 0.3517728179262557, + "grad_norm": 0.6701814667017392, + "learning_rate": 0.00018904413554290684, + "loss": 12.5822, + "step": 6460 + }, + { + "epoch": 0.3518272719228387, + "grad_norm": 0.6912066956582165, + "learning_rate": 0.0001890401220250185, + "loss": 12.6066, + "step": 6461 + }, + { + "epoch": 0.3518817259194217, + "grad_norm": 0.6390939063604993, + "learning_rate": 0.00018903610781473927, + "loss": 12.4256, + "step": 6462 + }, + { + "epoch": 0.35193617991600473, + "grad_norm": 0.7219330604492857, + "learning_rate": 0.00018903209291210027, + "loss": 12.6042, + "step": 6463 + }, + { + "epoch": 0.3519906339125877, + "grad_norm": 0.7091152392780032, + "learning_rate": 0.0001890280773171328, + "loss": 12.7038, + "step": 6464 + }, + { + "epoch": 0.3520450879091707, + "grad_norm": 0.8294803361278149, + "learning_rate": 0.0001890240610298681, + "loss": 12.5393, + "step": 6465 + }, + { + "epoch": 0.35209954190575377, + "grad_norm": 0.7327900394265904, + "learning_rate": 0.00018902004405033733, + "loss": 12.6018, + "step": 6466 + }, + { + "epoch": 0.35215399590233676, + "grad_norm": 0.7558412427812986, + "learning_rate": 0.0001890160263785718, + "loss": 12.3143, + "step": 6467 + }, + { + "epoch": 0.35220844989891975, + "grad_norm": 0.6485480492908395, + "learning_rate": 0.0001890120080146027, + "loss": 12.4945, + "step": 6468 + }, + { + "epoch": 0.3522629038955028, + "grad_norm": 0.7549308050018257, + "learning_rate": 0.00018900798895846134, + "loss": 12.4988, + "step": 6469 + }, + { + "epoch": 0.3523173578920858, + "grad_norm": 0.7610598992375268, + "learning_rate": 0.00018900396921017886, + "loss": 12.5119, + "step": 6470 + }, + { + "epoch": 0.3523718118886688, + "grad_norm": 0.6593766636270706, + "learning_rate": 0.00018899994876978664, + "loss": 12.5072, + "step": 6471 + }, + { + "epoch": 0.35242626588525183, + "grad_norm": 0.5841234835806068, + "learning_rate": 0.00018899592763731588, + "loss": 12.5108, + "step": 6472 + }, + { + "epoch": 0.3524807198818348, + "grad_norm": 0.6375815631140179, + "learning_rate": 0.00018899190581279788, + "loss": 12.5106, + "step": 6473 + }, + { + "epoch": 0.3525351738784178, + "grad_norm": 0.7265728910133166, + "learning_rate": 0.00018898788329626388, + "loss": 12.61, + "step": 6474 + }, + { + "epoch": 0.35258962787500087, + "grad_norm": 0.7442169125235223, + "learning_rate": 0.00018898386008774515, + "loss": 12.5473, + "step": 6475 + }, + { + "epoch": 0.35264408187158386, + "grad_norm": 0.7688054198431855, + "learning_rate": 0.00018897983618727305, + "loss": 12.5165, + "step": 6476 + }, + { + "epoch": 0.35269853586816685, + "grad_norm": 0.7814867856748376, + "learning_rate": 0.00018897581159487879, + "loss": 12.6789, + "step": 6477 + }, + { + "epoch": 0.3527529898647499, + "grad_norm": 0.8538938343014641, + "learning_rate": 0.00018897178631059372, + "loss": 12.6351, + "step": 6478 + }, + { + "epoch": 0.3528074438613329, + "grad_norm": 0.6388380559820188, + "learning_rate": 0.00018896776033444908, + "loss": 12.5478, + "step": 6479 + }, + { + "epoch": 0.3528618978579159, + "grad_norm": 0.6054334899810893, + "learning_rate": 0.00018896373366647623, + "loss": 12.4419, + "step": 6480 + }, + { + "epoch": 0.35291635185449893, + "grad_norm": 0.7531948587362995, + "learning_rate": 0.00018895970630670653, + "loss": 12.4213, + "step": 6481 + }, + { + "epoch": 0.3529708058510819, + "grad_norm": 0.7431856849626534, + "learning_rate": 0.00018895567825517117, + "loss": 12.4706, + "step": 6482 + }, + { + "epoch": 0.3530252598476649, + "grad_norm": 0.6282523964067674, + "learning_rate": 0.00018895164951190154, + "loss": 12.446, + "step": 6483 + }, + { + "epoch": 0.35307971384424797, + "grad_norm": 0.812969482425293, + "learning_rate": 0.00018894762007692898, + "loss": 12.5923, + "step": 6484 + }, + { + "epoch": 0.35313416784083096, + "grad_norm": 0.6418835602683038, + "learning_rate": 0.00018894358995028481, + "loss": 12.5747, + "step": 6485 + }, + { + "epoch": 0.353188621837414, + "grad_norm": 0.7819665290679157, + "learning_rate": 0.00018893955913200036, + "loss": 12.5215, + "step": 6486 + }, + { + "epoch": 0.353243075833997, + "grad_norm": 0.6917473065243283, + "learning_rate": 0.000188935527622107, + "loss": 12.4607, + "step": 6487 + }, + { + "epoch": 0.35329752983058, + "grad_norm": 0.7319956830798008, + "learning_rate": 0.00018893149542063603, + "loss": 12.5191, + "step": 6488 + }, + { + "epoch": 0.35335198382716304, + "grad_norm": 0.6774051839724597, + "learning_rate": 0.00018892746252761888, + "loss": 12.6299, + "step": 6489 + }, + { + "epoch": 0.35340643782374603, + "grad_norm": 0.6421731835455767, + "learning_rate": 0.00018892342894308683, + "loss": 12.4119, + "step": 6490 + }, + { + "epoch": 0.353460891820329, + "grad_norm": 0.6386697914025962, + "learning_rate": 0.0001889193946670713, + "loss": 12.4614, + "step": 6491 + }, + { + "epoch": 0.3535153458169121, + "grad_norm": 0.694150605451831, + "learning_rate": 0.00018891535969960368, + "loss": 12.3783, + "step": 6492 + }, + { + "epoch": 0.35356979981349507, + "grad_norm": 0.6789059976029038, + "learning_rate": 0.0001889113240407153, + "loss": 12.5107, + "step": 6493 + }, + { + "epoch": 0.35362425381007806, + "grad_norm": 0.6227305036984996, + "learning_rate": 0.0001889072876904375, + "loss": 12.4732, + "step": 6494 + }, + { + "epoch": 0.3536787078066611, + "grad_norm": 0.6914785944572073, + "learning_rate": 0.00018890325064880177, + "loss": 12.6268, + "step": 6495 + }, + { + "epoch": 0.3537331618032441, + "grad_norm": 0.8133604433595188, + "learning_rate": 0.00018889921291583944, + "loss": 12.4998, + "step": 6496 + }, + { + "epoch": 0.3537876157998271, + "grad_norm": 0.6797182621673394, + "learning_rate": 0.00018889517449158192, + "loss": 12.4768, + "step": 6497 + }, + { + "epoch": 0.35384206979641014, + "grad_norm": 0.7804143495476945, + "learning_rate": 0.0001888911353760606, + "loss": 12.6219, + "step": 6498 + }, + { + "epoch": 0.35389652379299313, + "grad_norm": 0.6579373319187583, + "learning_rate": 0.00018888709556930694, + "loss": 12.4734, + "step": 6499 + }, + { + "epoch": 0.3539509777895761, + "grad_norm": 0.6642270385457227, + "learning_rate": 0.00018888305507135228, + "loss": 12.5715, + "step": 6500 + }, + { + "epoch": 0.3540054317861592, + "grad_norm": 0.6342501988731931, + "learning_rate": 0.0001888790138822281, + "loss": 12.4374, + "step": 6501 + }, + { + "epoch": 0.35405988578274217, + "grad_norm": 0.6461119173969245, + "learning_rate": 0.0001888749720019658, + "loss": 12.4934, + "step": 6502 + }, + { + "epoch": 0.35411433977932516, + "grad_norm": 0.6868152071661218, + "learning_rate": 0.0001888709294305968, + "loss": 12.5943, + "step": 6503 + }, + { + "epoch": 0.3541687937759082, + "grad_norm": 0.6077876589490357, + "learning_rate": 0.00018886688616815258, + "loss": 12.4525, + "step": 6504 + }, + { + "epoch": 0.3542232477724912, + "grad_norm": 0.6724925347128431, + "learning_rate": 0.00018886284221466455, + "loss": 12.3494, + "step": 6505 + }, + { + "epoch": 0.3542777017690742, + "grad_norm": 0.7110664925398358, + "learning_rate": 0.00018885879757016413, + "loss": 12.5749, + "step": 6506 + }, + { + "epoch": 0.35433215576565724, + "grad_norm": 0.7291926327690469, + "learning_rate": 0.00018885475223468282, + "loss": 12.5879, + "step": 6507 + }, + { + "epoch": 0.35438660976224023, + "grad_norm": 0.7734607293232016, + "learning_rate": 0.00018885070620825202, + "loss": 12.5403, + "step": 6508 + }, + { + "epoch": 0.3544410637588232, + "grad_norm": 0.6355688010151955, + "learning_rate": 0.00018884665949090327, + "loss": 12.545, + "step": 6509 + }, + { + "epoch": 0.3544955177554063, + "grad_norm": 0.6731065364021674, + "learning_rate": 0.000188842612082668, + "loss": 12.4158, + "step": 6510 + }, + { + "epoch": 0.35454997175198927, + "grad_norm": 0.6785630727656202, + "learning_rate": 0.00018883856398357765, + "loss": 12.5455, + "step": 6511 + }, + { + "epoch": 0.35460442574857226, + "grad_norm": 0.677040878512404, + "learning_rate": 0.00018883451519366372, + "loss": 12.5873, + "step": 6512 + }, + { + "epoch": 0.3546588797451553, + "grad_norm": 0.65287559154684, + "learning_rate": 0.00018883046571295772, + "loss": 12.2615, + "step": 6513 + }, + { + "epoch": 0.3547133337417383, + "grad_norm": 0.5831793815322747, + "learning_rate": 0.00018882641554149112, + "loss": 12.5245, + "step": 6514 + }, + { + "epoch": 0.3547677877383213, + "grad_norm": 0.7114706506008506, + "learning_rate": 0.00018882236467929542, + "loss": 12.4625, + "step": 6515 + }, + { + "epoch": 0.35482224173490434, + "grad_norm": 0.649963966810882, + "learning_rate": 0.00018881831312640213, + "loss": 12.5165, + "step": 6516 + }, + { + "epoch": 0.35487669573148733, + "grad_norm": 0.6883907364877639, + "learning_rate": 0.0001888142608828427, + "loss": 12.6147, + "step": 6517 + }, + { + "epoch": 0.3549311497280703, + "grad_norm": 0.6099915381134763, + "learning_rate": 0.00018881020794864873, + "loss": 12.4195, + "step": 6518 + }, + { + "epoch": 0.3549856037246534, + "grad_norm": 0.6779392941887493, + "learning_rate": 0.00018880615432385165, + "loss": 12.5528, + "step": 6519 + }, + { + "epoch": 0.35504005772123637, + "grad_norm": 0.6853390762420948, + "learning_rate": 0.00018880210000848306, + "loss": 12.4175, + "step": 6520 + }, + { + "epoch": 0.3550945117178194, + "grad_norm": 0.7099141792622605, + "learning_rate": 0.0001887980450025744, + "loss": 12.576, + "step": 6521 + }, + { + "epoch": 0.3551489657144024, + "grad_norm": 0.6741612375122882, + "learning_rate": 0.0001887939893061573, + "loss": 12.5621, + "step": 6522 + }, + { + "epoch": 0.3552034197109854, + "grad_norm": 0.6367742915817244, + "learning_rate": 0.00018878993291926324, + "loss": 12.4885, + "step": 6523 + }, + { + "epoch": 0.35525787370756845, + "grad_norm": 0.6563563804700083, + "learning_rate": 0.00018878587584192374, + "loss": 12.3315, + "step": 6524 + }, + { + "epoch": 0.35531232770415144, + "grad_norm": 0.6755606542985129, + "learning_rate": 0.00018878181807417042, + "loss": 12.3851, + "step": 6525 + }, + { + "epoch": 0.35536678170073444, + "grad_norm": 0.6810645317599997, + "learning_rate": 0.00018877775961603476, + "loss": 12.4542, + "step": 6526 + }, + { + "epoch": 0.3554212356973175, + "grad_norm": 0.6584896277234938, + "learning_rate": 0.00018877370046754838, + "loss": 12.4904, + "step": 6527 + }, + { + "epoch": 0.3554756896939005, + "grad_norm": 0.782531194635525, + "learning_rate": 0.00018876964062874277, + "loss": 12.6789, + "step": 6528 + }, + { + "epoch": 0.35553014369048347, + "grad_norm": 0.7047130293938447, + "learning_rate": 0.0001887655800996496, + "loss": 12.4975, + "step": 6529 + }, + { + "epoch": 0.3555845976870665, + "grad_norm": 0.6392556211825621, + "learning_rate": 0.00018876151888030037, + "loss": 12.4937, + "step": 6530 + }, + { + "epoch": 0.3556390516836495, + "grad_norm": 0.6357688326994323, + "learning_rate": 0.00018875745697072668, + "loss": 12.498, + "step": 6531 + }, + { + "epoch": 0.3556935056802325, + "grad_norm": 0.7523241658001276, + "learning_rate": 0.00018875339437096012, + "loss": 12.5092, + "step": 6532 + }, + { + "epoch": 0.35574795967681555, + "grad_norm": 0.6611830202180823, + "learning_rate": 0.0001887493310810323, + "loss": 12.5987, + "step": 6533 + }, + { + "epoch": 0.35580241367339854, + "grad_norm": 0.6434045027474856, + "learning_rate": 0.0001887452671009748, + "loss": 12.4229, + "step": 6534 + }, + { + "epoch": 0.35585686766998154, + "grad_norm": 0.6348599415287733, + "learning_rate": 0.0001887412024308192, + "loss": 12.4465, + "step": 6535 + }, + { + "epoch": 0.3559113216665646, + "grad_norm": 0.6012998013242754, + "learning_rate": 0.00018873713707059716, + "loss": 12.4638, + "step": 6536 + }, + { + "epoch": 0.3559657756631476, + "grad_norm": 0.6225976072562621, + "learning_rate": 0.00018873307102034023, + "loss": 12.4929, + "step": 6537 + }, + { + "epoch": 0.35602022965973057, + "grad_norm": 0.6537989187335137, + "learning_rate": 0.00018872900428008004, + "loss": 12.4924, + "step": 6538 + }, + { + "epoch": 0.3560746836563136, + "grad_norm": 0.7310649052768982, + "learning_rate": 0.00018872493684984827, + "loss": 12.6, + "step": 6539 + }, + { + "epoch": 0.3561291376528966, + "grad_norm": 0.619228080128155, + "learning_rate": 0.00018872086872967652, + "loss": 12.4596, + "step": 6540 + }, + { + "epoch": 0.3561835916494796, + "grad_norm": 0.8602848445604844, + "learning_rate": 0.0001887167999195964, + "loss": 12.3719, + "step": 6541 + }, + { + "epoch": 0.35623804564606265, + "grad_norm": 0.6542020442995943, + "learning_rate": 0.00018871273041963954, + "loss": 12.5089, + "step": 6542 + }, + { + "epoch": 0.35629249964264564, + "grad_norm": 0.6450493139868647, + "learning_rate": 0.00018870866022983765, + "loss": 12.4721, + "step": 6543 + }, + { + "epoch": 0.35634695363922864, + "grad_norm": 0.6627589095066768, + "learning_rate": 0.00018870458935022234, + "loss": 12.5715, + "step": 6544 + }, + { + "epoch": 0.3564014076358117, + "grad_norm": 0.6679819323871122, + "learning_rate": 0.00018870051778082525, + "loss": 12.4713, + "step": 6545 + }, + { + "epoch": 0.3564558616323947, + "grad_norm": 0.5891025212950688, + "learning_rate": 0.00018869644552167803, + "loss": 12.4168, + "step": 6546 + }, + { + "epoch": 0.35651031562897767, + "grad_norm": 0.7443645315187565, + "learning_rate": 0.00018869237257281243, + "loss": 12.5215, + "step": 6547 + }, + { + "epoch": 0.3565647696255607, + "grad_norm": 0.675935462558438, + "learning_rate": 0.00018868829893426003, + "loss": 12.3665, + "step": 6548 + }, + { + "epoch": 0.3566192236221437, + "grad_norm": 0.6266163892118104, + "learning_rate": 0.00018868422460605259, + "loss": 12.4143, + "step": 6549 + }, + { + "epoch": 0.3566736776187267, + "grad_norm": 0.7024481342728484, + "learning_rate": 0.00018868014958822168, + "loss": 12.4768, + "step": 6550 + }, + { + "epoch": 0.35672813161530975, + "grad_norm": 0.7236654554637472, + "learning_rate": 0.0001886760738807991, + "loss": 12.5488, + "step": 6551 + }, + { + "epoch": 0.35678258561189274, + "grad_norm": 0.6277720760414693, + "learning_rate": 0.0001886719974838165, + "loss": 12.5349, + "step": 6552 + }, + { + "epoch": 0.3568370396084758, + "grad_norm": 0.5491554105623445, + "learning_rate": 0.00018866792039730557, + "loss": 12.4084, + "step": 6553 + }, + { + "epoch": 0.3568914936050588, + "grad_norm": 0.6574591041317043, + "learning_rate": 0.000188663842621298, + "loss": 12.5417, + "step": 6554 + }, + { + "epoch": 0.3569459476016418, + "grad_norm": 0.7825346624677734, + "learning_rate": 0.00018865976415582557, + "loss": 12.4618, + "step": 6555 + }, + { + "epoch": 0.3570004015982248, + "grad_norm": 0.6397378451307003, + "learning_rate": 0.0001886556850009199, + "loss": 12.4972, + "step": 6556 + }, + { + "epoch": 0.3570548555948078, + "grad_norm": 0.6964582007099602, + "learning_rate": 0.00018865160515661278, + "loss": 12.6336, + "step": 6557 + }, + { + "epoch": 0.3571093095913908, + "grad_norm": 0.6838137992784039, + "learning_rate": 0.0001886475246229359, + "loss": 12.6239, + "step": 6558 + }, + { + "epoch": 0.35716376358797386, + "grad_norm": 0.7702590269454724, + "learning_rate": 0.000188643443399921, + "loss": 12.2703, + "step": 6559 + }, + { + "epoch": 0.35721821758455685, + "grad_norm": 0.6275611356284347, + "learning_rate": 0.00018863936148759983, + "loss": 12.487, + "step": 6560 + }, + { + "epoch": 0.35727267158113984, + "grad_norm": 0.7105981531040471, + "learning_rate": 0.0001886352788860041, + "loss": 12.4876, + "step": 6561 + }, + { + "epoch": 0.3573271255777229, + "grad_norm": 0.6785012603603872, + "learning_rate": 0.0001886311955951656, + "loss": 12.5144, + "step": 6562 + }, + { + "epoch": 0.3573815795743059, + "grad_norm": 0.6480000289225968, + "learning_rate": 0.00018862711161511607, + "loss": 12.5371, + "step": 6563 + }, + { + "epoch": 0.3574360335708889, + "grad_norm": 0.6836036858096097, + "learning_rate": 0.0001886230269458872, + "loss": 12.484, + "step": 6564 + }, + { + "epoch": 0.3574904875674719, + "grad_norm": 0.6809995375722718, + "learning_rate": 0.00018861894158751086, + "loss": 12.5016, + "step": 6565 + }, + { + "epoch": 0.3575449415640549, + "grad_norm": 0.6525177660409279, + "learning_rate": 0.00018861485554001877, + "loss": 12.5633, + "step": 6566 + }, + { + "epoch": 0.3575993955606379, + "grad_norm": 0.6576641847219601, + "learning_rate": 0.00018861076880344267, + "loss": 12.4833, + "step": 6567 + }, + { + "epoch": 0.35765384955722096, + "grad_norm": 0.5990481118703957, + "learning_rate": 0.0001886066813778144, + "loss": 12.6255, + "step": 6568 + }, + { + "epoch": 0.35770830355380395, + "grad_norm": 0.805099690625791, + "learning_rate": 0.0001886025932631657, + "loss": 12.4749, + "step": 6569 + }, + { + "epoch": 0.35776275755038695, + "grad_norm": 0.6835538764125585, + "learning_rate": 0.0001885985044595284, + "loss": 12.5702, + "step": 6570 + }, + { + "epoch": 0.35781721154697, + "grad_norm": 0.6435072091519588, + "learning_rate": 0.00018859441496693426, + "loss": 12.7074, + "step": 6571 + }, + { + "epoch": 0.357871665543553, + "grad_norm": 0.6384713238741512, + "learning_rate": 0.0001885903247854151, + "loss": 12.3787, + "step": 6572 + }, + { + "epoch": 0.357926119540136, + "grad_norm": 0.7192638915760756, + "learning_rate": 0.00018858623391500268, + "loss": 12.5576, + "step": 6573 + }, + { + "epoch": 0.357980573536719, + "grad_norm": 0.6350450338580725, + "learning_rate": 0.0001885821423557289, + "loss": 12.5939, + "step": 6574 + }, + { + "epoch": 0.358035027533302, + "grad_norm": 0.7084619240659714, + "learning_rate": 0.00018857805010762547, + "loss": 12.5479, + "step": 6575 + }, + { + "epoch": 0.358089481529885, + "grad_norm": 0.6182206352160496, + "learning_rate": 0.0001885739571707243, + "loss": 12.4144, + "step": 6576 + }, + { + "epoch": 0.35814393552646806, + "grad_norm": 0.7050067760240177, + "learning_rate": 0.0001885698635450572, + "loss": 12.518, + "step": 6577 + }, + { + "epoch": 0.35819838952305105, + "grad_norm": 0.6444317754366243, + "learning_rate": 0.00018856576923065597, + "loss": 12.5393, + "step": 6578 + }, + { + "epoch": 0.35825284351963405, + "grad_norm": 0.6796109302689314, + "learning_rate": 0.00018856167422755246, + "loss": 12.3975, + "step": 6579 + }, + { + "epoch": 0.3583072975162171, + "grad_norm": 0.6790355456078522, + "learning_rate": 0.00018855757853577853, + "loss": 12.478, + "step": 6580 + }, + { + "epoch": 0.3583617515128001, + "grad_norm": 0.7346590134415578, + "learning_rate": 0.000188553482155366, + "loss": 12.5285, + "step": 6581 + }, + { + "epoch": 0.3584162055093831, + "grad_norm": 0.6529363025771956, + "learning_rate": 0.00018854938508634678, + "loss": 12.5329, + "step": 6582 + }, + { + "epoch": 0.3584706595059661, + "grad_norm": 0.6270728056967176, + "learning_rate": 0.00018854528732875265, + "loss": 12.4526, + "step": 6583 + }, + { + "epoch": 0.3585251135025491, + "grad_norm": 0.6167445436474197, + "learning_rate": 0.00018854118888261554, + "loss": 12.5071, + "step": 6584 + }, + { + "epoch": 0.3585795674991321, + "grad_norm": 0.6658190255876781, + "learning_rate": 0.0001885370897479673, + "loss": 12.611, + "step": 6585 + }, + { + "epoch": 0.35863402149571516, + "grad_norm": 0.7280245914122708, + "learning_rate": 0.0001885329899248398, + "loss": 12.603, + "step": 6586 + }, + { + "epoch": 0.35868847549229815, + "grad_norm": 0.6513125287893339, + "learning_rate": 0.0001885288894132649, + "loss": 12.4227, + "step": 6587 + }, + { + "epoch": 0.3587429294888812, + "grad_norm": 0.6482208097248535, + "learning_rate": 0.00018852478821327452, + "loss": 12.4279, + "step": 6588 + }, + { + "epoch": 0.3587973834854642, + "grad_norm": 0.7403836328311797, + "learning_rate": 0.00018852068632490058, + "loss": 12.5555, + "step": 6589 + }, + { + "epoch": 0.3588518374820472, + "grad_norm": 0.6310367450242624, + "learning_rate": 0.0001885165837481749, + "loss": 12.4712, + "step": 6590 + }, + { + "epoch": 0.35890629147863023, + "grad_norm": 0.7276121536456144, + "learning_rate": 0.0001885124804831294, + "loss": 12.5856, + "step": 6591 + }, + { + "epoch": 0.3589607454752132, + "grad_norm": 0.6186713256945982, + "learning_rate": 0.00018850837652979605, + "loss": 12.2877, + "step": 6592 + }, + { + "epoch": 0.3590151994717962, + "grad_norm": 0.6180211458265332, + "learning_rate": 0.00018850427188820673, + "loss": 12.553, + "step": 6593 + }, + { + "epoch": 0.35906965346837927, + "grad_norm": 0.6810319920116736, + "learning_rate": 0.0001885001665583933, + "loss": 12.5653, + "step": 6594 + }, + { + "epoch": 0.35912410746496226, + "grad_norm": 0.6478256244548984, + "learning_rate": 0.00018849606054038777, + "loss": 12.4771, + "step": 6595 + }, + { + "epoch": 0.35917856146154525, + "grad_norm": 0.6203360210240151, + "learning_rate": 0.00018849195383422202, + "loss": 12.6188, + "step": 6596 + }, + { + "epoch": 0.3592330154581283, + "grad_norm": 0.7394212994938031, + "learning_rate": 0.000188487846439928, + "loss": 12.4102, + "step": 6597 + }, + { + "epoch": 0.3592874694547113, + "grad_norm": 0.6700090375629415, + "learning_rate": 0.00018848373835753766, + "loss": 12.3701, + "step": 6598 + }, + { + "epoch": 0.3593419234512943, + "grad_norm": 0.6763191547744783, + "learning_rate": 0.0001884796295870829, + "loss": 12.5191, + "step": 6599 + }, + { + "epoch": 0.35939637744787734, + "grad_norm": 0.7225861178544174, + "learning_rate": 0.00018847552012859573, + "loss": 12.5006, + "step": 6600 + }, + { + "epoch": 0.35945083144446033, + "grad_norm": 0.6598424950049372, + "learning_rate": 0.00018847140998210806, + "loss": 12.4225, + "step": 6601 + }, + { + "epoch": 0.3595052854410433, + "grad_norm": 0.8022917346730528, + "learning_rate": 0.00018846729914765188, + "loss": 12.4835, + "step": 6602 + }, + { + "epoch": 0.35955973943762637, + "grad_norm": 0.7155075380930347, + "learning_rate": 0.00018846318762525915, + "loss": 12.6107, + "step": 6603 + }, + { + "epoch": 0.35961419343420936, + "grad_norm": 0.5788593511681562, + "learning_rate": 0.00018845907541496182, + "loss": 12.345, + "step": 6604 + }, + { + "epoch": 0.35966864743079235, + "grad_norm": 0.7692664908605247, + "learning_rate": 0.00018845496251679192, + "loss": 12.4208, + "step": 6605 + }, + { + "epoch": 0.3597231014273754, + "grad_norm": 0.6398281023425616, + "learning_rate": 0.00018845084893078136, + "loss": 12.4587, + "step": 6606 + }, + { + "epoch": 0.3597775554239584, + "grad_norm": 0.6285622503429014, + "learning_rate": 0.00018844673465696218, + "loss": 12.5134, + "step": 6607 + }, + { + "epoch": 0.3598320094205414, + "grad_norm": 0.7085105210743461, + "learning_rate": 0.00018844261969536637, + "loss": 12.5605, + "step": 6608 + }, + { + "epoch": 0.35988646341712444, + "grad_norm": 0.6771526198094195, + "learning_rate": 0.00018843850404602587, + "loss": 12.4348, + "step": 6609 + }, + { + "epoch": 0.35994091741370743, + "grad_norm": 0.6476584687119452, + "learning_rate": 0.0001884343877089728, + "loss": 12.4236, + "step": 6610 + }, + { + "epoch": 0.3599953714102904, + "grad_norm": 0.6365661190221357, + "learning_rate": 0.00018843027068423903, + "loss": 12.4869, + "step": 6611 + }, + { + "epoch": 0.36004982540687347, + "grad_norm": 0.6116624437632676, + "learning_rate": 0.0001884261529718567, + "loss": 12.4376, + "step": 6612 + }, + { + "epoch": 0.36010427940345646, + "grad_norm": 0.6161016949457048, + "learning_rate": 0.00018842203457185777, + "loss": 12.3886, + "step": 6613 + }, + { + "epoch": 0.36015873340003945, + "grad_norm": 0.6629262293434118, + "learning_rate": 0.00018841791548427427, + "loss": 12.4196, + "step": 6614 + }, + { + "epoch": 0.3602131873966225, + "grad_norm": 0.6301688987629763, + "learning_rate": 0.00018841379570913821, + "loss": 12.4263, + "step": 6615 + }, + { + "epoch": 0.3602676413932055, + "grad_norm": 0.6423425681866812, + "learning_rate": 0.00018840967524648165, + "loss": 12.5683, + "step": 6616 + }, + { + "epoch": 0.3603220953897885, + "grad_norm": 0.6722468104910277, + "learning_rate": 0.00018840555409633665, + "loss": 12.5797, + "step": 6617 + }, + { + "epoch": 0.36037654938637154, + "grad_norm": 0.683282736727712, + "learning_rate": 0.00018840143225873522, + "loss": 12.4557, + "step": 6618 + }, + { + "epoch": 0.36043100338295453, + "grad_norm": 0.5689102418426638, + "learning_rate": 0.00018839730973370942, + "loss": 12.4297, + "step": 6619 + }, + { + "epoch": 0.3604854573795376, + "grad_norm": 0.6515763103187273, + "learning_rate": 0.00018839318652129136, + "loss": 12.6092, + "step": 6620 + }, + { + "epoch": 0.36053991137612057, + "grad_norm": 0.6638057645135687, + "learning_rate": 0.000188389062621513, + "loss": 12.5255, + "step": 6621 + }, + { + "epoch": 0.36059436537270356, + "grad_norm": 0.7578143155135398, + "learning_rate": 0.0001883849380344065, + "loss": 12.4897, + "step": 6622 + }, + { + "epoch": 0.3606488193692866, + "grad_norm": 0.6929891599006907, + "learning_rate": 0.00018838081276000387, + "loss": 12.4673, + "step": 6623 + }, + { + "epoch": 0.3607032733658696, + "grad_norm": 0.6098098382946556, + "learning_rate": 0.00018837668679833725, + "loss": 12.4919, + "step": 6624 + }, + { + "epoch": 0.3607577273624526, + "grad_norm": 0.6995080045001343, + "learning_rate": 0.0001883725601494387, + "loss": 12.4121, + "step": 6625 + }, + { + "epoch": 0.36081218135903564, + "grad_norm": 0.6393239609282849, + "learning_rate": 0.00018836843281334024, + "loss": 12.4995, + "step": 6626 + }, + { + "epoch": 0.36086663535561864, + "grad_norm": 0.6179401434135583, + "learning_rate": 0.0001883643047900741, + "loss": 12.4329, + "step": 6627 + }, + { + "epoch": 0.36092108935220163, + "grad_norm": 0.7364688329740939, + "learning_rate": 0.00018836017607967227, + "loss": 12.5791, + "step": 6628 + }, + { + "epoch": 0.3609755433487847, + "grad_norm": 0.7445944340871038, + "learning_rate": 0.0001883560466821669, + "loss": 12.4741, + "step": 6629 + }, + { + "epoch": 0.36102999734536767, + "grad_norm": 0.6936132071756218, + "learning_rate": 0.00018835191659759008, + "loss": 12.5278, + "step": 6630 + }, + { + "epoch": 0.36108445134195066, + "grad_norm": 0.6341963179540083, + "learning_rate": 0.00018834778582597396, + "loss": 12.5872, + "step": 6631 + }, + { + "epoch": 0.3611389053385337, + "grad_norm": 0.6296041806316938, + "learning_rate": 0.00018834365436735064, + "loss": 12.4316, + "step": 6632 + }, + { + "epoch": 0.3611933593351167, + "grad_norm": 0.6374466468077414, + "learning_rate": 0.00018833952222175224, + "loss": 12.5258, + "step": 6633 + }, + { + "epoch": 0.3612478133316997, + "grad_norm": 0.6999344709830713, + "learning_rate": 0.0001883353893892109, + "loss": 12.5829, + "step": 6634 + }, + { + "epoch": 0.36130226732828274, + "grad_norm": 0.6904477138465861, + "learning_rate": 0.00018833125586975878, + "loss": 12.4849, + "step": 6635 + }, + { + "epoch": 0.36135672132486574, + "grad_norm": 0.662277379149618, + "learning_rate": 0.00018832712166342796, + "loss": 12.5352, + "step": 6636 + }, + { + "epoch": 0.36141117532144873, + "grad_norm": 0.821381962854756, + "learning_rate": 0.00018832298677025068, + "loss": 12.4751, + "step": 6637 + }, + { + "epoch": 0.3614656293180318, + "grad_norm": 0.6317517204534076, + "learning_rate": 0.000188318851190259, + "loss": 12.461, + "step": 6638 + }, + { + "epoch": 0.36152008331461477, + "grad_norm": 0.6556528541474681, + "learning_rate": 0.00018831471492348513, + "loss": 12.5782, + "step": 6639 + }, + { + "epoch": 0.36157453731119776, + "grad_norm": 0.6910220177221039, + "learning_rate": 0.00018831057796996124, + "loss": 12.5669, + "step": 6640 + }, + { + "epoch": 0.3616289913077808, + "grad_norm": 0.7937937585155153, + "learning_rate": 0.0001883064403297195, + "loss": 12.6712, + "step": 6641 + }, + { + "epoch": 0.3616834453043638, + "grad_norm": 0.7329019212487958, + "learning_rate": 0.00018830230200279205, + "loss": 12.4129, + "step": 6642 + }, + { + "epoch": 0.3617378993009468, + "grad_norm": 0.7721716116033265, + "learning_rate": 0.0001882981629892111, + "loss": 12.4551, + "step": 6643 + }, + { + "epoch": 0.36179235329752985, + "grad_norm": 0.7340558824600818, + "learning_rate": 0.00018829402328900883, + "loss": 12.3573, + "step": 6644 + }, + { + "epoch": 0.36184680729411284, + "grad_norm": 0.7346948738214002, + "learning_rate": 0.0001882898829022174, + "loss": 12.496, + "step": 6645 + }, + { + "epoch": 0.36190126129069583, + "grad_norm": 0.8316671174602631, + "learning_rate": 0.00018828574182886903, + "loss": 12.5202, + "step": 6646 + }, + { + "epoch": 0.3619557152872789, + "grad_norm": 0.7174804265086283, + "learning_rate": 0.00018828160006899598, + "loss": 12.4468, + "step": 6647 + }, + { + "epoch": 0.36201016928386187, + "grad_norm": 0.7011147967368828, + "learning_rate": 0.00018827745762263037, + "loss": 12.5342, + "step": 6648 + }, + { + "epoch": 0.36206462328044486, + "grad_norm": 0.683462054591662, + "learning_rate": 0.00018827331448980443, + "loss": 12.5567, + "step": 6649 + }, + { + "epoch": 0.3621190772770279, + "grad_norm": 0.8090110863198164, + "learning_rate": 0.00018826917067055044, + "loss": 12.5956, + "step": 6650 + }, + { + "epoch": 0.3621735312736109, + "grad_norm": 0.6607693188818475, + "learning_rate": 0.00018826502616490053, + "loss": 12.5962, + "step": 6651 + }, + { + "epoch": 0.3622279852701939, + "grad_norm": 0.6767689806560603, + "learning_rate": 0.000188260880972887, + "loss": 12.5465, + "step": 6652 + }, + { + "epoch": 0.36228243926677695, + "grad_norm": 0.6372209862019359, + "learning_rate": 0.00018825673509454202, + "loss": 12.2851, + "step": 6653 + }, + { + "epoch": 0.36233689326335994, + "grad_norm": 0.635820161629741, + "learning_rate": 0.0001882525885298979, + "loss": 12.4901, + "step": 6654 + }, + { + "epoch": 0.362391347259943, + "grad_norm": 0.7700231819144508, + "learning_rate": 0.00018824844127898687, + "loss": 12.6406, + "step": 6655 + }, + { + "epoch": 0.362445801256526, + "grad_norm": 0.6610835813691188, + "learning_rate": 0.00018824429334184112, + "loss": 12.5198, + "step": 6656 + }, + { + "epoch": 0.36250025525310897, + "grad_norm": 0.6923623425981027, + "learning_rate": 0.000188240144718493, + "loss": 12.5646, + "step": 6657 + }, + { + "epoch": 0.362554709249692, + "grad_norm": 0.7295852877262856, + "learning_rate": 0.00018823599540897465, + "loss": 12.5918, + "step": 6658 + }, + { + "epoch": 0.362609163246275, + "grad_norm": 0.5943802872107438, + "learning_rate": 0.00018823184541331845, + "loss": 12.3685, + "step": 6659 + }, + { + "epoch": 0.362663617242858, + "grad_norm": 0.7152397393362633, + "learning_rate": 0.0001882276947315566, + "loss": 12.4319, + "step": 6660 + }, + { + "epoch": 0.36271807123944105, + "grad_norm": 0.7250489702731249, + "learning_rate": 0.0001882235433637214, + "loss": 12.5264, + "step": 6661 + }, + { + "epoch": 0.36277252523602405, + "grad_norm": 0.5924669756695498, + "learning_rate": 0.00018821939130984517, + "loss": 12.2873, + "step": 6662 + }, + { + "epoch": 0.36282697923260704, + "grad_norm": 0.645723736104555, + "learning_rate": 0.00018821523856996013, + "loss": 12.5939, + "step": 6663 + }, + { + "epoch": 0.3628814332291901, + "grad_norm": 0.5875140836827224, + "learning_rate": 0.00018821108514409856, + "loss": 12.474, + "step": 6664 + }, + { + "epoch": 0.3629358872257731, + "grad_norm": 0.6503600619137254, + "learning_rate": 0.00018820693103229288, + "loss": 12.4141, + "step": 6665 + }, + { + "epoch": 0.36299034122235607, + "grad_norm": 0.6845049787509762, + "learning_rate": 0.00018820277623457526, + "loss": 12.6379, + "step": 6666 + }, + { + "epoch": 0.3630447952189391, + "grad_norm": 0.636216840869751, + "learning_rate": 0.00018819862075097806, + "loss": 12.3587, + "step": 6667 + }, + { + "epoch": 0.3630992492155221, + "grad_norm": 0.6478000866250779, + "learning_rate": 0.00018819446458153362, + "loss": 12.4389, + "step": 6668 + }, + { + "epoch": 0.3631537032121051, + "grad_norm": 0.6636993194800846, + "learning_rate": 0.0001881903077262742, + "loss": 12.4896, + "step": 6669 + }, + { + "epoch": 0.36320815720868815, + "grad_norm": 0.6230684019836505, + "learning_rate": 0.0001881861501852322, + "loss": 12.6979, + "step": 6670 + }, + { + "epoch": 0.36326261120527115, + "grad_norm": 0.6421208304249839, + "learning_rate": 0.00018818199195843986, + "loss": 12.4471, + "step": 6671 + }, + { + "epoch": 0.36331706520185414, + "grad_norm": 0.6028078428535656, + "learning_rate": 0.00018817783304592959, + "loss": 12.4494, + "step": 6672 + }, + { + "epoch": 0.3633715191984372, + "grad_norm": 0.6221622025387541, + "learning_rate": 0.00018817367344773372, + "loss": 12.4518, + "step": 6673 + }, + { + "epoch": 0.3634259731950202, + "grad_norm": 0.6563182110144833, + "learning_rate": 0.00018816951316388453, + "loss": 12.6353, + "step": 6674 + }, + { + "epoch": 0.3634804271916032, + "grad_norm": 0.6485631719952629, + "learning_rate": 0.00018816535219441446, + "loss": 12.4347, + "step": 6675 + }, + { + "epoch": 0.3635348811881862, + "grad_norm": 0.6252855421360843, + "learning_rate": 0.0001881611905393558, + "loss": 12.4695, + "step": 6676 + }, + { + "epoch": 0.3635893351847692, + "grad_norm": 0.6735701127245386, + "learning_rate": 0.00018815702819874097, + "loss": 12.5676, + "step": 6677 + }, + { + "epoch": 0.3636437891813522, + "grad_norm": 0.7415341714654862, + "learning_rate": 0.00018815286517260229, + "loss": 12.764, + "step": 6678 + }, + { + "epoch": 0.36369824317793525, + "grad_norm": 0.6322693357159745, + "learning_rate": 0.00018814870146097214, + "loss": 12.5009, + "step": 6679 + }, + { + "epoch": 0.36375269717451825, + "grad_norm": 0.6458801056972988, + "learning_rate": 0.00018814453706388287, + "loss": 12.5047, + "step": 6680 + }, + { + "epoch": 0.36380715117110124, + "grad_norm": 0.7096837355005496, + "learning_rate": 0.00018814037198136698, + "loss": 12.4631, + "step": 6681 + }, + { + "epoch": 0.3638616051676843, + "grad_norm": 0.6345182119218137, + "learning_rate": 0.0001881362062134567, + "loss": 12.4933, + "step": 6682 + }, + { + "epoch": 0.3639160591642673, + "grad_norm": 0.6692866959051389, + "learning_rate": 0.00018813203976018455, + "loss": 12.4697, + "step": 6683 + }, + { + "epoch": 0.3639705131608503, + "grad_norm": 0.6995810148133702, + "learning_rate": 0.00018812787262158286, + "loss": 12.5705, + "step": 6684 + }, + { + "epoch": 0.3640249671574333, + "grad_norm": 0.7825305327360113, + "learning_rate": 0.00018812370479768406, + "loss": 12.6368, + "step": 6685 + }, + { + "epoch": 0.3640794211540163, + "grad_norm": 0.6714265725827628, + "learning_rate": 0.00018811953628852056, + "loss": 12.5814, + "step": 6686 + }, + { + "epoch": 0.3641338751505993, + "grad_norm": 0.8482129985990133, + "learning_rate": 0.00018811536709412475, + "loss": 12.4584, + "step": 6687 + }, + { + "epoch": 0.36418832914718235, + "grad_norm": 0.6692639872542804, + "learning_rate": 0.00018811119721452908, + "loss": 12.4091, + "step": 6688 + }, + { + "epoch": 0.36424278314376535, + "grad_norm": 0.844682059447251, + "learning_rate": 0.00018810702664976594, + "loss": 12.72, + "step": 6689 + }, + { + "epoch": 0.3642972371403484, + "grad_norm": 0.7272075202619086, + "learning_rate": 0.0001881028553998678, + "loss": 12.4081, + "step": 6690 + }, + { + "epoch": 0.3643516911369314, + "grad_norm": 0.7004977677490962, + "learning_rate": 0.0001880986834648671, + "loss": 12.4538, + "step": 6691 + }, + { + "epoch": 0.3644061451335144, + "grad_norm": 0.8629727368672526, + "learning_rate": 0.00018809451084479624, + "loss": 12.6221, + "step": 6692 + }, + { + "epoch": 0.36446059913009743, + "grad_norm": 0.7851367318362038, + "learning_rate": 0.0001880903375396877, + "loss": 12.5663, + "step": 6693 + }, + { + "epoch": 0.3645150531266804, + "grad_norm": 0.6854981472693442, + "learning_rate": 0.0001880861635495739, + "loss": 12.4976, + "step": 6694 + }, + { + "epoch": 0.3645695071232634, + "grad_norm": 0.7607531587291991, + "learning_rate": 0.00018808198887448736, + "loss": 12.5451, + "step": 6695 + }, + { + "epoch": 0.36462396111984646, + "grad_norm": 0.6346208042477164, + "learning_rate": 0.00018807781351446048, + "loss": 12.526, + "step": 6696 + }, + { + "epoch": 0.36467841511642946, + "grad_norm": 0.669565982871631, + "learning_rate": 0.0001880736374695258, + "loss": 12.4964, + "step": 6697 + }, + { + "epoch": 0.36473286911301245, + "grad_norm": 0.7089403356676109, + "learning_rate": 0.00018806946073971569, + "loss": 12.4569, + "step": 6698 + }, + { + "epoch": 0.3647873231095955, + "grad_norm": 0.8414280784882965, + "learning_rate": 0.0001880652833250627, + "loss": 12.5536, + "step": 6699 + }, + { + "epoch": 0.3648417771061785, + "grad_norm": 0.6479374855180915, + "learning_rate": 0.00018806110522559926, + "loss": 12.4727, + "step": 6700 + }, + { + "epoch": 0.3648962311027615, + "grad_norm": 0.7031712491465153, + "learning_rate": 0.00018805692644135796, + "loss": 12.5094, + "step": 6701 + }, + { + "epoch": 0.36495068509934453, + "grad_norm": 0.6561201324217154, + "learning_rate": 0.00018805274697237119, + "loss": 12.6912, + "step": 6702 + }, + { + "epoch": 0.3650051390959275, + "grad_norm": 0.8083614773954539, + "learning_rate": 0.00018804856681867152, + "loss": 12.5316, + "step": 6703 + }, + { + "epoch": 0.3650595930925105, + "grad_norm": 0.67374064745605, + "learning_rate": 0.0001880443859802914, + "loss": 12.6457, + "step": 6704 + }, + { + "epoch": 0.36511404708909356, + "grad_norm": 0.6532324213824884, + "learning_rate": 0.00018804020445726337, + "loss": 12.5391, + "step": 6705 + }, + { + "epoch": 0.36516850108567656, + "grad_norm": 0.6757495628646862, + "learning_rate": 0.00018803602224962, + "loss": 12.5225, + "step": 6706 + }, + { + "epoch": 0.36522295508225955, + "grad_norm": 0.6432686278770082, + "learning_rate": 0.0001880318393573937, + "loss": 12.453, + "step": 6707 + }, + { + "epoch": 0.3652774090788426, + "grad_norm": 0.7883001660334636, + "learning_rate": 0.00018802765578061705, + "loss": 12.5551, + "step": 6708 + }, + { + "epoch": 0.3653318630754256, + "grad_norm": 0.6461071475432286, + "learning_rate": 0.00018802347151932264, + "loss": 12.55, + "step": 6709 + }, + { + "epoch": 0.3653863170720086, + "grad_norm": 0.6691570814181542, + "learning_rate": 0.0001880192865735429, + "loss": 12.6225, + "step": 6710 + }, + { + "epoch": 0.36544077106859163, + "grad_norm": 0.6922546666923483, + "learning_rate": 0.00018801510094331047, + "loss": 12.4039, + "step": 6711 + }, + { + "epoch": 0.3654952250651746, + "grad_norm": 0.5936430837492487, + "learning_rate": 0.00018801091462865784, + "loss": 12.4361, + "step": 6712 + }, + { + "epoch": 0.3655496790617576, + "grad_norm": 0.6664258393725, + "learning_rate": 0.00018800672762961758, + "loss": 12.4437, + "step": 6713 + }, + { + "epoch": 0.36560413305834066, + "grad_norm": 0.5887534964830626, + "learning_rate": 0.00018800253994622222, + "loss": 12.476, + "step": 6714 + }, + { + "epoch": 0.36565858705492366, + "grad_norm": 0.6531177932509925, + "learning_rate": 0.00018799835157850439, + "loss": 12.6041, + "step": 6715 + }, + { + "epoch": 0.36571304105150665, + "grad_norm": 0.7136114805228129, + "learning_rate": 0.00018799416252649658, + "loss": 12.5732, + "step": 6716 + }, + { + "epoch": 0.3657674950480897, + "grad_norm": 0.6176999560468083, + "learning_rate": 0.00018798997279023143, + "loss": 12.4697, + "step": 6717 + }, + { + "epoch": 0.3658219490446727, + "grad_norm": 0.641058439634723, + "learning_rate": 0.00018798578236974153, + "loss": 12.5075, + "step": 6718 + }, + { + "epoch": 0.3658764030412557, + "grad_norm": 0.693524325390353, + "learning_rate": 0.00018798159126505936, + "loss": 12.614, + "step": 6719 + }, + { + "epoch": 0.36593085703783873, + "grad_norm": 0.6367206031439175, + "learning_rate": 0.00018797739947621763, + "loss": 12.5712, + "step": 6720 + }, + { + "epoch": 0.3659853110344217, + "grad_norm": 0.6231257744211336, + "learning_rate": 0.00018797320700324885, + "loss": 12.4514, + "step": 6721 + }, + { + "epoch": 0.36603976503100477, + "grad_norm": 0.6479835224474679, + "learning_rate": 0.0001879690138461857, + "loss": 12.6704, + "step": 6722 + }, + { + "epoch": 0.36609421902758776, + "grad_norm": 0.6621267367927367, + "learning_rate": 0.00018796482000506072, + "loss": 12.4577, + "step": 6723 + }, + { + "epoch": 0.36614867302417076, + "grad_norm": 0.6044248186047356, + "learning_rate": 0.00018796062547990657, + "loss": 12.3559, + "step": 6724 + }, + { + "epoch": 0.3662031270207538, + "grad_norm": 0.6139721548320676, + "learning_rate": 0.00018795643027075585, + "loss": 12.503, + "step": 6725 + }, + { + "epoch": 0.3662575810173368, + "grad_norm": 0.652693152929206, + "learning_rate": 0.00018795223437764115, + "loss": 12.4726, + "step": 6726 + }, + { + "epoch": 0.3663120350139198, + "grad_norm": 0.5914756538338115, + "learning_rate": 0.0001879480378005951, + "loss": 12.3033, + "step": 6727 + }, + { + "epoch": 0.36636648901050284, + "grad_norm": 0.7254217447618527, + "learning_rate": 0.00018794384053965043, + "loss": 12.5628, + "step": 6728 + }, + { + "epoch": 0.36642094300708583, + "grad_norm": 0.5891298822369286, + "learning_rate": 0.00018793964259483968, + "loss": 12.4264, + "step": 6729 + }, + { + "epoch": 0.3664753970036688, + "grad_norm": 0.6553598414916912, + "learning_rate": 0.0001879354439661955, + "loss": 12.5271, + "step": 6730 + }, + { + "epoch": 0.36652985100025187, + "grad_norm": 0.6145565110030625, + "learning_rate": 0.0001879312446537506, + "loss": 12.3724, + "step": 6731 + }, + { + "epoch": 0.36658430499683486, + "grad_norm": 0.664373293286463, + "learning_rate": 0.00018792704465753755, + "loss": 12.6268, + "step": 6732 + }, + { + "epoch": 0.36663875899341786, + "grad_norm": 0.6838236991001653, + "learning_rate": 0.00018792284397758908, + "loss": 12.6584, + "step": 6733 + }, + { + "epoch": 0.3666932129900009, + "grad_norm": 0.6663236308420681, + "learning_rate": 0.00018791864261393784, + "loss": 12.4839, + "step": 6734 + }, + { + "epoch": 0.3667476669865839, + "grad_norm": 0.6418049564560676, + "learning_rate": 0.00018791444056661646, + "loss": 12.5051, + "step": 6735 + }, + { + "epoch": 0.3668021209831669, + "grad_norm": 0.5569929085876848, + "learning_rate": 0.0001879102378356577, + "loss": 12.5471, + "step": 6736 + }, + { + "epoch": 0.36685657497974994, + "grad_norm": 0.67123615672468, + "learning_rate": 0.00018790603442109412, + "loss": 12.4874, + "step": 6737 + }, + { + "epoch": 0.36691102897633293, + "grad_norm": 0.6303929900678815, + "learning_rate": 0.0001879018303229585, + "loss": 12.415, + "step": 6738 + }, + { + "epoch": 0.3669654829729159, + "grad_norm": 0.7224967526388294, + "learning_rate": 0.0001878976255412835, + "loss": 12.4743, + "step": 6739 + }, + { + "epoch": 0.36701993696949897, + "grad_norm": 0.6074412775014366, + "learning_rate": 0.00018789342007610187, + "loss": 12.4987, + "step": 6740 + }, + { + "epoch": 0.36707439096608196, + "grad_norm": 0.6347238589759355, + "learning_rate": 0.00018788921392744624, + "loss": 12.517, + "step": 6741 + }, + { + "epoch": 0.36712884496266496, + "grad_norm": 0.6664557545586834, + "learning_rate": 0.00018788500709534934, + "loss": 12.55, + "step": 6742 + }, + { + "epoch": 0.367183298959248, + "grad_norm": 0.6477748468933896, + "learning_rate": 0.00018788079957984385, + "loss": 12.4114, + "step": 6743 + }, + { + "epoch": 0.367237752955831, + "grad_norm": 0.6270361143383161, + "learning_rate": 0.00018787659138096258, + "loss": 12.3699, + "step": 6744 + }, + { + "epoch": 0.367292206952414, + "grad_norm": 0.6315587680247866, + "learning_rate": 0.0001878723824987382, + "loss": 12.5573, + "step": 6745 + }, + { + "epoch": 0.36734666094899704, + "grad_norm": 0.6682517723733418, + "learning_rate": 0.00018786817293320337, + "loss": 12.4946, + "step": 6746 + }, + { + "epoch": 0.36740111494558003, + "grad_norm": 0.5731857714905204, + "learning_rate": 0.00018786396268439094, + "loss": 12.5497, + "step": 6747 + }, + { + "epoch": 0.367455568942163, + "grad_norm": 0.7154608334837631, + "learning_rate": 0.00018785975175233363, + "loss": 12.4118, + "step": 6748 + }, + { + "epoch": 0.3675100229387461, + "grad_norm": 0.6314309590611179, + "learning_rate": 0.00018785554013706413, + "loss": 12.5279, + "step": 6749 + }, + { + "epoch": 0.36756447693532907, + "grad_norm": 0.7327550590135451, + "learning_rate": 0.0001878513278386152, + "loss": 12.644, + "step": 6750 + }, + { + "epoch": 0.36761893093191206, + "grad_norm": 0.6563415578448006, + "learning_rate": 0.0001878471148570196, + "loss": 12.4376, + "step": 6751 + }, + { + "epoch": 0.3676733849284951, + "grad_norm": 0.7432858762702095, + "learning_rate": 0.00018784290119231014, + "loss": 12.5806, + "step": 6752 + }, + { + "epoch": 0.3677278389250781, + "grad_norm": 0.6496962666478835, + "learning_rate": 0.00018783868684451953, + "loss": 12.4287, + "step": 6753 + }, + { + "epoch": 0.3677822929216611, + "grad_norm": 0.5991457432589863, + "learning_rate": 0.00018783447181368058, + "loss": 12.4827, + "step": 6754 + }, + { + "epoch": 0.36783674691824414, + "grad_norm": 0.8446082627576676, + "learning_rate": 0.00018783025609982602, + "loss": 12.4579, + "step": 6755 + }, + { + "epoch": 0.36789120091482713, + "grad_norm": 0.6363909276299062, + "learning_rate": 0.00018782603970298869, + "loss": 12.4383, + "step": 6756 + }, + { + "epoch": 0.3679456549114102, + "grad_norm": 0.6666559724643025, + "learning_rate": 0.00018782182262320132, + "loss": 12.6553, + "step": 6757 + }, + { + "epoch": 0.3680001089079932, + "grad_norm": 0.6768404315222415, + "learning_rate": 0.00018781760486049674, + "loss": 12.5123, + "step": 6758 + }, + { + "epoch": 0.36805456290457617, + "grad_norm": 0.6747314018648729, + "learning_rate": 0.00018781338641490772, + "loss": 12.5436, + "step": 6759 + }, + { + "epoch": 0.3681090169011592, + "grad_norm": 0.7112379071050334, + "learning_rate": 0.0001878091672864671, + "loss": 12.5604, + "step": 6760 + }, + { + "epoch": 0.3681634708977422, + "grad_norm": 0.7209862155449003, + "learning_rate": 0.00018780494747520766, + "loss": 12.4318, + "step": 6761 + }, + { + "epoch": 0.3682179248943252, + "grad_norm": 0.6681822817895532, + "learning_rate": 0.00018780072698116224, + "loss": 12.5159, + "step": 6762 + }, + { + "epoch": 0.36827237889090825, + "grad_norm": 0.6863679955143484, + "learning_rate": 0.00018779650580436362, + "loss": 12.5072, + "step": 6763 + }, + { + "epoch": 0.36832683288749124, + "grad_norm": 0.6044769521952945, + "learning_rate": 0.00018779228394484468, + "loss": 12.4695, + "step": 6764 + }, + { + "epoch": 0.36838128688407423, + "grad_norm": 0.6071552219295997, + "learning_rate": 0.0001877880614026382, + "loss": 12.4761, + "step": 6765 + }, + { + "epoch": 0.3684357408806573, + "grad_norm": 0.7369894755682067, + "learning_rate": 0.00018778383817777704, + "loss": 12.614, + "step": 6766 + }, + { + "epoch": 0.3684901948772403, + "grad_norm": 0.6885261413499081, + "learning_rate": 0.000187779614270294, + "loss": 12.6022, + "step": 6767 + }, + { + "epoch": 0.36854464887382327, + "grad_norm": 0.6742656846843884, + "learning_rate": 0.00018777538968022198, + "loss": 12.3581, + "step": 6768 + }, + { + "epoch": 0.3685991028704063, + "grad_norm": 0.7097374898586997, + "learning_rate": 0.0001877711644075938, + "loss": 12.4439, + "step": 6769 + }, + { + "epoch": 0.3686535568669893, + "grad_norm": 0.7041533992553495, + "learning_rate": 0.00018776693845244235, + "loss": 12.5134, + "step": 6770 + }, + { + "epoch": 0.3687080108635723, + "grad_norm": 0.6803788854145529, + "learning_rate": 0.00018776271181480047, + "loss": 12.5001, + "step": 6771 + }, + { + "epoch": 0.36876246486015535, + "grad_norm": 0.6677003831250875, + "learning_rate": 0.000187758484494701, + "loss": 12.4216, + "step": 6772 + }, + { + "epoch": 0.36881691885673834, + "grad_norm": 1.0267058549178372, + "learning_rate": 0.00018775425649217685, + "loss": 12.4736, + "step": 6773 + }, + { + "epoch": 0.36887137285332133, + "grad_norm": 0.7316182162253633, + "learning_rate": 0.0001877500278072609, + "loss": 12.399, + "step": 6774 + }, + { + "epoch": 0.3689258268499044, + "grad_norm": 0.6166976637914323, + "learning_rate": 0.000187745798439986, + "loss": 12.5217, + "step": 6775 + }, + { + "epoch": 0.3689802808464874, + "grad_norm": 0.677681470141228, + "learning_rate": 0.00018774156839038506, + "loss": 12.3609, + "step": 6776 + }, + { + "epoch": 0.36903473484307037, + "grad_norm": 0.6906842272517986, + "learning_rate": 0.00018773733765849095, + "loss": 12.4438, + "step": 6777 + }, + { + "epoch": 0.3690891888396534, + "grad_norm": 0.636827348006488, + "learning_rate": 0.0001877331062443366, + "loss": 12.6218, + "step": 6778 + }, + { + "epoch": 0.3691436428362364, + "grad_norm": 0.6668917508706977, + "learning_rate": 0.00018772887414795494, + "loss": 12.4354, + "step": 6779 + }, + { + "epoch": 0.3691980968328194, + "grad_norm": 0.7166015009254231, + "learning_rate": 0.00018772464136937878, + "loss": 12.475, + "step": 6780 + }, + { + "epoch": 0.36925255082940245, + "grad_norm": 0.5999520292287912, + "learning_rate": 0.00018772040790864116, + "loss": 12.498, + "step": 6781 + }, + { + "epoch": 0.36930700482598544, + "grad_norm": 0.7259940123210213, + "learning_rate": 0.00018771617376577487, + "loss": 12.5116, + "step": 6782 + }, + { + "epoch": 0.36936145882256843, + "grad_norm": 0.7414603041330218, + "learning_rate": 0.00018771193894081295, + "loss": 12.4914, + "step": 6783 + }, + { + "epoch": 0.3694159128191515, + "grad_norm": 0.6098300155081475, + "learning_rate": 0.00018770770343378828, + "loss": 12.3549, + "step": 6784 + }, + { + "epoch": 0.3694703668157345, + "grad_norm": 0.6771752107026059, + "learning_rate": 0.0001877034672447338, + "loss": 12.4367, + "step": 6785 + }, + { + "epoch": 0.36952482081231747, + "grad_norm": 0.6127061024770842, + "learning_rate": 0.00018769923037368244, + "loss": 12.5099, + "step": 6786 + }, + { + "epoch": 0.3695792748089005, + "grad_norm": 0.6713858656002362, + "learning_rate": 0.00018769499282066717, + "loss": 12.5197, + "step": 6787 + }, + { + "epoch": 0.3696337288054835, + "grad_norm": 0.6113076049001404, + "learning_rate": 0.0001876907545857209, + "loss": 12.4514, + "step": 6788 + }, + { + "epoch": 0.36968818280206656, + "grad_norm": 0.69200188488012, + "learning_rate": 0.00018768651566887664, + "loss": 12.5427, + "step": 6789 + }, + { + "epoch": 0.36974263679864955, + "grad_norm": 0.649783035094314, + "learning_rate": 0.00018768227607016735, + "loss": 12.5515, + "step": 6790 + }, + { + "epoch": 0.36979709079523254, + "grad_norm": 0.651731885718608, + "learning_rate": 0.00018767803578962594, + "loss": 12.3617, + "step": 6791 + }, + { + "epoch": 0.3698515447918156, + "grad_norm": 0.6236780373250338, + "learning_rate": 0.00018767379482728544, + "loss": 12.4804, + "step": 6792 + }, + { + "epoch": 0.3699059987883986, + "grad_norm": 0.6290590856325058, + "learning_rate": 0.00018766955318317877, + "loss": 12.5694, + "step": 6793 + }, + { + "epoch": 0.3699604527849816, + "grad_norm": 0.994191559529947, + "learning_rate": 0.000187665310857339, + "loss": 12.3978, + "step": 6794 + }, + { + "epoch": 0.3700149067815646, + "grad_norm": 0.6691599006801765, + "learning_rate": 0.00018766106784979907, + "loss": 12.5427, + "step": 6795 + }, + { + "epoch": 0.3700693607781476, + "grad_norm": 0.6725550264378077, + "learning_rate": 0.00018765682416059195, + "loss": 12.5197, + "step": 6796 + }, + { + "epoch": 0.3701238147747306, + "grad_norm": 0.7775858640825722, + "learning_rate": 0.00018765257978975067, + "loss": 12.565, + "step": 6797 + }, + { + "epoch": 0.37017826877131366, + "grad_norm": 0.7820371214677028, + "learning_rate": 0.00018764833473730824, + "loss": 12.5492, + "step": 6798 + }, + { + "epoch": 0.37023272276789665, + "grad_norm": 0.6070673240639204, + "learning_rate": 0.00018764408900329767, + "loss": 12.3199, + "step": 6799 + }, + { + "epoch": 0.37028717676447964, + "grad_norm": 0.7546031888907322, + "learning_rate": 0.00018763984258775195, + "loss": 12.4831, + "step": 6800 + }, + { + "epoch": 0.3703416307610627, + "grad_norm": 0.6614244889442318, + "learning_rate": 0.00018763559549070413, + "loss": 12.4839, + "step": 6801 + }, + { + "epoch": 0.3703960847576457, + "grad_norm": 0.66700405636965, + "learning_rate": 0.0001876313477121872, + "loss": 12.444, + "step": 6802 + }, + { + "epoch": 0.3704505387542287, + "grad_norm": 0.6430092498863155, + "learning_rate": 0.00018762709925223422, + "loss": 12.5453, + "step": 6803 + }, + { + "epoch": 0.3705049927508117, + "grad_norm": 0.6240645971741429, + "learning_rate": 0.00018762285011087823, + "loss": 12.5121, + "step": 6804 + }, + { + "epoch": 0.3705594467473947, + "grad_norm": 0.7551045479724068, + "learning_rate": 0.00018761860028815227, + "loss": 12.4146, + "step": 6805 + }, + { + "epoch": 0.3706139007439777, + "grad_norm": 0.6744163269445749, + "learning_rate": 0.00018761434978408937, + "loss": 12.4616, + "step": 6806 + }, + { + "epoch": 0.37066835474056076, + "grad_norm": 0.6083501446794017, + "learning_rate": 0.00018761009859872259, + "loss": 12.4124, + "step": 6807 + }, + { + "epoch": 0.37072280873714375, + "grad_norm": 0.6741727963074209, + "learning_rate": 0.000187605846732085, + "loss": 12.4202, + "step": 6808 + }, + { + "epoch": 0.37077726273372674, + "grad_norm": 0.6277622677085708, + "learning_rate": 0.00018760159418420967, + "loss": 12.482, + "step": 6809 + }, + { + "epoch": 0.3708317167303098, + "grad_norm": 0.6334500184621313, + "learning_rate": 0.00018759734095512962, + "loss": 12.4696, + "step": 6810 + }, + { + "epoch": 0.3708861707268928, + "grad_norm": 0.7533923247549392, + "learning_rate": 0.00018759308704487796, + "loss": 12.5491, + "step": 6811 + }, + { + "epoch": 0.3709406247234758, + "grad_norm": 0.6355387749570965, + "learning_rate": 0.0001875888324534878, + "loss": 12.4132, + "step": 6812 + }, + { + "epoch": 0.3709950787200588, + "grad_norm": 0.6199304004384589, + "learning_rate": 0.00018758457718099213, + "loss": 12.4855, + "step": 6813 + }, + { + "epoch": 0.3710495327166418, + "grad_norm": 0.6562595002491339, + "learning_rate": 0.00018758032122742415, + "loss": 12.5171, + "step": 6814 + }, + { + "epoch": 0.3711039867132248, + "grad_norm": 0.7378677389636937, + "learning_rate": 0.0001875760645928169, + "loss": 12.6526, + "step": 6815 + }, + { + "epoch": 0.37115844070980786, + "grad_norm": 0.7079173817993047, + "learning_rate": 0.00018757180727720348, + "loss": 12.5506, + "step": 6816 + }, + { + "epoch": 0.37121289470639085, + "grad_norm": 0.7743100219942269, + "learning_rate": 0.000187567549280617, + "loss": 12.42, + "step": 6817 + }, + { + "epoch": 0.37126734870297384, + "grad_norm": 0.6741818318526057, + "learning_rate": 0.00018756329060309055, + "loss": 12.5836, + "step": 6818 + }, + { + "epoch": 0.3713218026995569, + "grad_norm": 0.7489878835475556, + "learning_rate": 0.0001875590312446573, + "loss": 12.5678, + "step": 6819 + }, + { + "epoch": 0.3713762566961399, + "grad_norm": 0.7337700505792882, + "learning_rate": 0.0001875547712053503, + "loss": 12.3313, + "step": 6820 + }, + { + "epoch": 0.3714307106927229, + "grad_norm": 0.643565639957734, + "learning_rate": 0.00018755051048520275, + "loss": 12.3265, + "step": 6821 + }, + { + "epoch": 0.3714851646893059, + "grad_norm": 0.7085073729411158, + "learning_rate": 0.00018754624908424777, + "loss": 12.5107, + "step": 6822 + }, + { + "epoch": 0.3715396186858889, + "grad_norm": 0.7446332347373452, + "learning_rate": 0.00018754198700251842, + "loss": 12.5754, + "step": 6823 + }, + { + "epoch": 0.37159407268247197, + "grad_norm": 0.7543591298912838, + "learning_rate": 0.00018753772424004791, + "loss": 12.5493, + "step": 6824 + }, + { + "epoch": 0.37164852667905496, + "grad_norm": 0.779747215779012, + "learning_rate": 0.00018753346079686942, + "loss": 12.6352, + "step": 6825 + }, + { + "epoch": 0.37170298067563795, + "grad_norm": 0.6777786449446311, + "learning_rate": 0.00018752919667301603, + "loss": 12.4106, + "step": 6826 + }, + { + "epoch": 0.371757434672221, + "grad_norm": 0.7842274548150086, + "learning_rate": 0.0001875249318685209, + "loss": 12.545, + "step": 6827 + }, + { + "epoch": 0.371811888668804, + "grad_norm": 0.6771490065998343, + "learning_rate": 0.00018752066638341724, + "loss": 12.4642, + "step": 6828 + }, + { + "epoch": 0.371866342665387, + "grad_norm": 0.6722420889789897, + "learning_rate": 0.00018751640021773822, + "loss": 12.4239, + "step": 6829 + }, + { + "epoch": 0.37192079666197003, + "grad_norm": 0.7656829797000084, + "learning_rate": 0.00018751213337151699, + "loss": 12.4541, + "step": 6830 + }, + { + "epoch": 0.371975250658553, + "grad_norm": 0.6173941419610085, + "learning_rate": 0.00018750786584478674, + "loss": 12.5112, + "step": 6831 + }, + { + "epoch": 0.372029704655136, + "grad_norm": 0.6862551318512679, + "learning_rate": 0.00018750359763758064, + "loss": 12.5843, + "step": 6832 + }, + { + "epoch": 0.37208415865171907, + "grad_norm": 0.7438790929735515, + "learning_rate": 0.00018749932874993191, + "loss": 12.3711, + "step": 6833 + }, + { + "epoch": 0.37213861264830206, + "grad_norm": 0.7148570406597187, + "learning_rate": 0.00018749505918187368, + "loss": 12.6131, + "step": 6834 + }, + { + "epoch": 0.37219306664488505, + "grad_norm": 0.6099944499008384, + "learning_rate": 0.00018749078893343923, + "loss": 12.4924, + "step": 6835 + }, + { + "epoch": 0.3722475206414681, + "grad_norm": 0.6416494040504213, + "learning_rate": 0.00018748651800466176, + "loss": 12.4666, + "step": 6836 + }, + { + "epoch": 0.3723019746380511, + "grad_norm": 0.5960517211523131, + "learning_rate": 0.0001874822463955744, + "loss": 12.4964, + "step": 6837 + }, + { + "epoch": 0.3723564286346341, + "grad_norm": 0.7202137666628853, + "learning_rate": 0.00018747797410621043, + "loss": 12.5145, + "step": 6838 + }, + { + "epoch": 0.37241088263121713, + "grad_norm": 0.6686744255372455, + "learning_rate": 0.0001874737011366031, + "loss": 12.509, + "step": 6839 + }, + { + "epoch": 0.3724653366278001, + "grad_norm": 0.5923281634322939, + "learning_rate": 0.00018746942748678556, + "loss": 12.3796, + "step": 6840 + }, + { + "epoch": 0.3725197906243831, + "grad_norm": 0.6462348839480198, + "learning_rate": 0.00018746515315679112, + "loss": 12.4071, + "step": 6841 + }, + { + "epoch": 0.37257424462096617, + "grad_norm": 0.6403103417823736, + "learning_rate": 0.00018746087814665297, + "loss": 12.3721, + "step": 6842 + }, + { + "epoch": 0.37262869861754916, + "grad_norm": 0.6452511968744338, + "learning_rate": 0.00018745660245640433, + "loss": 12.4392, + "step": 6843 + }, + { + "epoch": 0.37268315261413215, + "grad_norm": 0.8042292528683954, + "learning_rate": 0.0001874523260860785, + "loss": 12.56, + "step": 6844 + }, + { + "epoch": 0.3727376066107152, + "grad_norm": 0.6047385552048565, + "learning_rate": 0.00018744804903570873, + "loss": 12.4637, + "step": 6845 + }, + { + "epoch": 0.3727920606072982, + "grad_norm": 0.6150113789305852, + "learning_rate": 0.00018744377130532826, + "loss": 12.4973, + "step": 6846 + }, + { + "epoch": 0.3728465146038812, + "grad_norm": 0.6082391745298686, + "learning_rate": 0.00018743949289497035, + "loss": 12.4028, + "step": 6847 + }, + { + "epoch": 0.37290096860046423, + "grad_norm": 0.7418134326699819, + "learning_rate": 0.00018743521380466832, + "loss": 12.5765, + "step": 6848 + }, + { + "epoch": 0.3729554225970472, + "grad_norm": 0.6251421328537706, + "learning_rate": 0.00018743093403445537, + "loss": 12.3914, + "step": 6849 + }, + { + "epoch": 0.3730098765936302, + "grad_norm": 0.6643823954217485, + "learning_rate": 0.00018742665358436483, + "loss": 12.576, + "step": 6850 + }, + { + "epoch": 0.37306433059021327, + "grad_norm": 0.6821878946854963, + "learning_rate": 0.00018742237245442995, + "loss": 12.4979, + "step": 6851 + }, + { + "epoch": 0.37311878458679626, + "grad_norm": 0.6284956737925181, + "learning_rate": 0.00018741809064468402, + "loss": 12.3956, + "step": 6852 + }, + { + "epoch": 0.37317323858337925, + "grad_norm": 0.5928727082407248, + "learning_rate": 0.0001874138081551604, + "loss": 12.3172, + "step": 6853 + }, + { + "epoch": 0.3732276925799623, + "grad_norm": 0.7075238356745377, + "learning_rate": 0.00018740952498589236, + "loss": 12.5216, + "step": 6854 + }, + { + "epoch": 0.3732821465765453, + "grad_norm": 0.8036941645145784, + "learning_rate": 0.00018740524113691314, + "loss": 12.7216, + "step": 6855 + }, + { + "epoch": 0.37333660057312834, + "grad_norm": 0.6723133376471627, + "learning_rate": 0.00018740095660825615, + "loss": 12.4274, + "step": 6856 + }, + { + "epoch": 0.37339105456971133, + "grad_norm": 0.691765799487332, + "learning_rate": 0.00018739667139995464, + "loss": 12.5515, + "step": 6857 + }, + { + "epoch": 0.3734455085662943, + "grad_norm": 0.6374179067955774, + "learning_rate": 0.00018739238551204198, + "loss": 12.3448, + "step": 6858 + }, + { + "epoch": 0.3734999625628774, + "grad_norm": 0.736793700634577, + "learning_rate": 0.00018738809894455147, + "loss": 12.4875, + "step": 6859 + }, + { + "epoch": 0.37355441655946037, + "grad_norm": 0.714006602323917, + "learning_rate": 0.00018738381169751644, + "loss": 12.5406, + "step": 6860 + }, + { + "epoch": 0.37360887055604336, + "grad_norm": 0.7584859501030283, + "learning_rate": 0.00018737952377097025, + "loss": 12.4817, + "step": 6861 + }, + { + "epoch": 0.3736633245526264, + "grad_norm": 0.6704402496309448, + "learning_rate": 0.0001873752351649462, + "loss": 12.4883, + "step": 6862 + }, + { + "epoch": 0.3737177785492094, + "grad_norm": 0.6354882856330369, + "learning_rate": 0.0001873709458794777, + "loss": 12.3731, + "step": 6863 + }, + { + "epoch": 0.3737722325457924, + "grad_norm": 0.7091904296222025, + "learning_rate": 0.0001873666559145981, + "loss": 12.5834, + "step": 6864 + }, + { + "epoch": 0.37382668654237544, + "grad_norm": 0.7834726155353686, + "learning_rate": 0.00018736236527034067, + "loss": 12.557, + "step": 6865 + }, + { + "epoch": 0.37388114053895843, + "grad_norm": 0.6743082186571374, + "learning_rate": 0.00018735807394673883, + "loss": 12.4709, + "step": 6866 + }, + { + "epoch": 0.3739355945355414, + "grad_norm": 0.6070605579786073, + "learning_rate": 0.000187353781943826, + "loss": 12.37, + "step": 6867 + }, + { + "epoch": 0.3739900485321245, + "grad_norm": 0.6865918490759351, + "learning_rate": 0.0001873494892616355, + "loss": 12.5524, + "step": 6868 + }, + { + "epoch": 0.37404450252870747, + "grad_norm": 0.662629494210567, + "learning_rate": 0.00018734519590020071, + "loss": 12.553, + "step": 6869 + }, + { + "epoch": 0.37409895652529046, + "grad_norm": 0.6170089067050327, + "learning_rate": 0.00018734090185955503, + "loss": 12.4318, + "step": 6870 + }, + { + "epoch": 0.3741534105218735, + "grad_norm": 0.6673199828990071, + "learning_rate": 0.00018733660713973188, + "loss": 12.344, + "step": 6871 + }, + { + "epoch": 0.3742078645184565, + "grad_norm": 0.9030032262663705, + "learning_rate": 0.0001873323117407646, + "loss": 12.4672, + "step": 6872 + }, + { + "epoch": 0.3742623185150395, + "grad_norm": 0.6476698207148913, + "learning_rate": 0.00018732801566268662, + "loss": 12.4269, + "step": 6873 + }, + { + "epoch": 0.37431677251162254, + "grad_norm": 0.6389493481839894, + "learning_rate": 0.00018732371890553136, + "loss": 12.3736, + "step": 6874 + }, + { + "epoch": 0.37437122650820553, + "grad_norm": 0.6581260506637848, + "learning_rate": 0.0001873194214693322, + "loss": 12.5084, + "step": 6875 + }, + { + "epoch": 0.3744256805047885, + "grad_norm": 0.7331480328607267, + "learning_rate": 0.0001873151233541226, + "loss": 12.4586, + "step": 6876 + }, + { + "epoch": 0.3744801345013716, + "grad_norm": 0.6757682925411165, + "learning_rate": 0.00018731082455993595, + "loss": 12.5669, + "step": 6877 + }, + { + "epoch": 0.37453458849795457, + "grad_norm": 0.7647719878421511, + "learning_rate": 0.00018730652508680567, + "loss": 12.6505, + "step": 6878 + }, + { + "epoch": 0.37458904249453756, + "grad_norm": 0.8106155568072291, + "learning_rate": 0.0001873022249347652, + "loss": 12.5532, + "step": 6879 + }, + { + "epoch": 0.3746434964911206, + "grad_norm": 0.6475029617687691, + "learning_rate": 0.000187297924103848, + "loss": 12.3556, + "step": 6880 + }, + { + "epoch": 0.3746979504877036, + "grad_norm": 0.6906212964994336, + "learning_rate": 0.0001872936225940875, + "loss": 12.5516, + "step": 6881 + }, + { + "epoch": 0.3747524044842866, + "grad_norm": 0.8103337070967035, + "learning_rate": 0.00018728932040551718, + "loss": 12.4691, + "step": 6882 + }, + { + "epoch": 0.37480685848086964, + "grad_norm": 0.6576683290497881, + "learning_rate": 0.00018728501753817044, + "loss": 12.5408, + "step": 6883 + }, + { + "epoch": 0.37486131247745264, + "grad_norm": 0.7051994294658306, + "learning_rate": 0.00018728071399208077, + "loss": 12.342, + "step": 6884 + }, + { + "epoch": 0.3749157664740356, + "grad_norm": 0.6795758079567125, + "learning_rate": 0.00018727640976728163, + "loss": 12.5536, + "step": 6885 + }, + { + "epoch": 0.3749702204706187, + "grad_norm": 0.7042609974729568, + "learning_rate": 0.00018727210486380649, + "loss": 12.5584, + "step": 6886 + }, + { + "epoch": 0.37502467446720167, + "grad_norm": 0.6567415913122908, + "learning_rate": 0.00018726779928168882, + "loss": 12.6032, + "step": 6887 + }, + { + "epoch": 0.37507912846378466, + "grad_norm": 0.6822256992051228, + "learning_rate": 0.00018726349302096212, + "loss": 12.522, + "step": 6888 + }, + { + "epoch": 0.3751335824603677, + "grad_norm": 0.7086191085604834, + "learning_rate": 0.00018725918608165988, + "loss": 12.5549, + "step": 6889 + }, + { + "epoch": 0.3751880364569507, + "grad_norm": 0.7026318941518679, + "learning_rate": 0.00018725487846381556, + "loss": 12.498, + "step": 6890 + }, + { + "epoch": 0.37524249045353375, + "grad_norm": 0.6746306298341949, + "learning_rate": 0.0001872505701674627, + "loss": 12.5457, + "step": 6891 + }, + { + "epoch": 0.37529694445011674, + "grad_norm": 0.9491430206384743, + "learning_rate": 0.0001872462611926347, + "loss": 12.5156, + "step": 6892 + }, + { + "epoch": 0.37535139844669974, + "grad_norm": 0.6081121088069118, + "learning_rate": 0.0001872419515393652, + "loss": 12.3423, + "step": 6893 + }, + { + "epoch": 0.3754058524432828, + "grad_norm": 0.6289634087585562, + "learning_rate": 0.00018723764120768762, + "loss": 12.6402, + "step": 6894 + }, + { + "epoch": 0.3754603064398658, + "grad_norm": 0.766105858999642, + "learning_rate": 0.00018723333019763554, + "loss": 12.333, + "step": 6895 + }, + { + "epoch": 0.37551476043644877, + "grad_norm": 0.7784877505068041, + "learning_rate": 0.00018722901850924247, + "loss": 12.5083, + "step": 6896 + }, + { + "epoch": 0.3755692144330318, + "grad_norm": 0.6293072113676585, + "learning_rate": 0.0001872247061425419, + "loss": 12.4921, + "step": 6897 + }, + { + "epoch": 0.3756236684296148, + "grad_norm": 0.6539959921320518, + "learning_rate": 0.00018722039309756737, + "loss": 12.488, + "step": 6898 + }, + { + "epoch": 0.3756781224261978, + "grad_norm": 0.7197826833160897, + "learning_rate": 0.00018721607937435247, + "loss": 12.2857, + "step": 6899 + }, + { + "epoch": 0.37573257642278085, + "grad_norm": 0.7122843009448809, + "learning_rate": 0.00018721176497293068, + "loss": 12.4334, + "step": 6900 + }, + { + "epoch": 0.37578703041936384, + "grad_norm": 0.6417828679475482, + "learning_rate": 0.0001872074498933356, + "loss": 12.4879, + "step": 6901 + }, + { + "epoch": 0.37584148441594684, + "grad_norm": 0.7345247568517772, + "learning_rate": 0.00018720313413560078, + "loss": 12.5805, + "step": 6902 + }, + { + "epoch": 0.3758959384125299, + "grad_norm": 0.6627147722447752, + "learning_rate": 0.00018719881769975973, + "loss": 12.4623, + "step": 6903 + }, + { + "epoch": 0.3759503924091129, + "grad_norm": 0.7144649763094523, + "learning_rate": 0.00018719450058584606, + "loss": 12.5279, + "step": 6904 + }, + { + "epoch": 0.37600484640569587, + "grad_norm": 0.7627784910727949, + "learning_rate": 0.00018719018279389336, + "loss": 12.713, + "step": 6905 + }, + { + "epoch": 0.3760593004022789, + "grad_norm": 0.6574616799381727, + "learning_rate": 0.00018718586432393515, + "loss": 12.4512, + "step": 6906 + }, + { + "epoch": 0.3761137543988619, + "grad_norm": 0.6138750975409948, + "learning_rate": 0.00018718154517600503, + "loss": 12.5478, + "step": 6907 + }, + { + "epoch": 0.3761682083954449, + "grad_norm": 0.6381407623000527, + "learning_rate": 0.00018717722535013662, + "loss": 12.523, + "step": 6908 + }, + { + "epoch": 0.37622266239202795, + "grad_norm": 0.5934615819263432, + "learning_rate": 0.00018717290484636346, + "loss": 12.4251, + "step": 6909 + }, + { + "epoch": 0.37627711638861094, + "grad_norm": 0.6521590898875078, + "learning_rate": 0.00018716858366471918, + "loss": 12.5358, + "step": 6910 + }, + { + "epoch": 0.37633157038519394, + "grad_norm": 0.7510537780545019, + "learning_rate": 0.00018716426180523737, + "loss": 12.4232, + "step": 6911 + }, + { + "epoch": 0.376386024381777, + "grad_norm": 0.6803248714221944, + "learning_rate": 0.00018715993926795167, + "loss": 12.5625, + "step": 6912 + }, + { + "epoch": 0.37644047837836, + "grad_norm": 0.6825366462464981, + "learning_rate": 0.00018715561605289565, + "loss": 12.4024, + "step": 6913 + }, + { + "epoch": 0.37649493237494297, + "grad_norm": 0.6789555150219736, + "learning_rate": 0.00018715129216010295, + "loss": 12.5354, + "step": 6914 + }, + { + "epoch": 0.376549386371526, + "grad_norm": 0.7318822770872114, + "learning_rate": 0.0001871469675896072, + "loss": 12.5837, + "step": 6915 + }, + { + "epoch": 0.376603840368109, + "grad_norm": 0.6626231088289419, + "learning_rate": 0.00018714264234144198, + "loss": 12.5106, + "step": 6916 + }, + { + "epoch": 0.376658294364692, + "grad_norm": 0.7217653612409614, + "learning_rate": 0.00018713831641564097, + "loss": 12.5852, + "step": 6917 + }, + { + "epoch": 0.37671274836127505, + "grad_norm": 0.6263709870042876, + "learning_rate": 0.0001871339898122378, + "loss": 12.4448, + "step": 6918 + }, + { + "epoch": 0.37676720235785804, + "grad_norm": 0.6811089588998432, + "learning_rate": 0.0001871296625312661, + "loss": 12.5556, + "step": 6919 + }, + { + "epoch": 0.37682165635444104, + "grad_norm": 0.6573727791834589, + "learning_rate": 0.0001871253345727596, + "loss": 12.4701, + "step": 6920 + }, + { + "epoch": 0.3768761103510241, + "grad_norm": 0.6684755993242253, + "learning_rate": 0.00018712100593675182, + "loss": 12.5007, + "step": 6921 + }, + { + "epoch": 0.3769305643476071, + "grad_norm": 0.6443217936756908, + "learning_rate": 0.0001871166766232765, + "loss": 12.4511, + "step": 6922 + }, + { + "epoch": 0.3769850183441901, + "grad_norm": 0.6535774774464812, + "learning_rate": 0.0001871123466323673, + "loss": 12.549, + "step": 6923 + }, + { + "epoch": 0.3770394723407731, + "grad_norm": 0.6911624086653004, + "learning_rate": 0.00018710801596405786, + "loss": 12.6717, + "step": 6924 + }, + { + "epoch": 0.3770939263373561, + "grad_norm": 0.6649678708702024, + "learning_rate": 0.0001871036846183819, + "loss": 12.3266, + "step": 6925 + }, + { + "epoch": 0.37714838033393916, + "grad_norm": 0.6838715546443267, + "learning_rate": 0.00018709935259537307, + "loss": 12.5144, + "step": 6926 + }, + { + "epoch": 0.37720283433052215, + "grad_norm": 0.6701906131016372, + "learning_rate": 0.00018709501989506508, + "loss": 12.6125, + "step": 6927 + }, + { + "epoch": 0.37725728832710514, + "grad_norm": 0.6492567726196441, + "learning_rate": 0.00018709068651749162, + "loss": 12.5389, + "step": 6928 + }, + { + "epoch": 0.3773117423236882, + "grad_norm": 0.7128276988468376, + "learning_rate": 0.0001870863524626864, + "loss": 12.5214, + "step": 6929 + }, + { + "epoch": 0.3773661963202712, + "grad_norm": 0.7077479095340496, + "learning_rate": 0.00018708201773068303, + "loss": 12.5846, + "step": 6930 + }, + { + "epoch": 0.3774206503168542, + "grad_norm": 0.6949116410713745, + "learning_rate": 0.00018707768232151533, + "loss": 12.6226, + "step": 6931 + }, + { + "epoch": 0.3774751043134372, + "grad_norm": 0.6231198852264871, + "learning_rate": 0.00018707334623521696, + "loss": 12.3808, + "step": 6932 + }, + { + "epoch": 0.3775295583100202, + "grad_norm": 0.6374499503869536, + "learning_rate": 0.00018706900947182165, + "loss": 12.378, + "step": 6933 + }, + { + "epoch": 0.3775840123066032, + "grad_norm": 0.6319859637637218, + "learning_rate": 0.00018706467203136312, + "loss": 12.5027, + "step": 6934 + }, + { + "epoch": 0.37763846630318626, + "grad_norm": 0.6557296163600854, + "learning_rate": 0.0001870603339138751, + "loss": 12.4791, + "step": 6935 + }, + { + "epoch": 0.37769292029976925, + "grad_norm": 0.7869772154386102, + "learning_rate": 0.0001870559951193913, + "loss": 12.4481, + "step": 6936 + }, + { + "epoch": 0.37774737429635225, + "grad_norm": 0.6653265908844587, + "learning_rate": 0.0001870516556479455, + "loss": 12.4008, + "step": 6937 + }, + { + "epoch": 0.3778018282929353, + "grad_norm": 0.6172815715116846, + "learning_rate": 0.00018704731549957146, + "loss": 12.3995, + "step": 6938 + }, + { + "epoch": 0.3778562822895183, + "grad_norm": 0.6468767865845889, + "learning_rate": 0.00018704297467430286, + "loss": 12.5116, + "step": 6939 + }, + { + "epoch": 0.3779107362861013, + "grad_norm": 0.759668058569879, + "learning_rate": 0.00018703863317217349, + "loss": 12.3574, + "step": 6940 + }, + { + "epoch": 0.3779651902826843, + "grad_norm": 0.7262095640332545, + "learning_rate": 0.0001870342909932171, + "loss": 12.5853, + "step": 6941 + }, + { + "epoch": 0.3780196442792673, + "grad_norm": 0.7033921882821391, + "learning_rate": 0.0001870299481374675, + "loss": 12.4206, + "step": 6942 + }, + { + "epoch": 0.3780740982758503, + "grad_norm": 0.6548268998794594, + "learning_rate": 0.00018702560460495844, + "loss": 12.4, + "step": 6943 + }, + { + "epoch": 0.37812855227243336, + "grad_norm": 0.7040645449818776, + "learning_rate": 0.00018702126039572364, + "loss": 12.5144, + "step": 6944 + }, + { + "epoch": 0.37818300626901635, + "grad_norm": 0.619790631113332, + "learning_rate": 0.00018701691550979696, + "loss": 12.4999, + "step": 6945 + }, + { + "epoch": 0.37823746026559935, + "grad_norm": 0.6725003511870317, + "learning_rate": 0.00018701256994721214, + "loss": 12.4599, + "step": 6946 + }, + { + "epoch": 0.3782919142621824, + "grad_norm": 0.833030015569806, + "learning_rate": 0.000187008223708003, + "loss": 12.405, + "step": 6947 + }, + { + "epoch": 0.3783463682587654, + "grad_norm": 0.6901930567573182, + "learning_rate": 0.00018700387679220328, + "loss": 12.4261, + "step": 6948 + }, + { + "epoch": 0.3784008222553484, + "grad_norm": 0.6502639678681048, + "learning_rate": 0.00018699952919984684, + "loss": 12.6263, + "step": 6949 + }, + { + "epoch": 0.3784552762519314, + "grad_norm": 0.8842314476326208, + "learning_rate": 0.0001869951809309675, + "loss": 12.7245, + "step": 6950 + }, + { + "epoch": 0.3785097302485144, + "grad_norm": 0.6350870700668845, + "learning_rate": 0.00018699083198559904, + "loss": 12.4577, + "step": 6951 + }, + { + "epoch": 0.3785641842450974, + "grad_norm": 0.6456154832906754, + "learning_rate": 0.00018698648236377524, + "loss": 12.4396, + "step": 6952 + }, + { + "epoch": 0.37861863824168046, + "grad_norm": 0.6606268237260083, + "learning_rate": 0.00018698213206553001, + "loss": 12.5647, + "step": 6953 + }, + { + "epoch": 0.37867309223826345, + "grad_norm": 0.6516744393667526, + "learning_rate": 0.00018697778109089713, + "loss": 12.4016, + "step": 6954 + }, + { + "epoch": 0.37872754623484645, + "grad_norm": 0.627243644658615, + "learning_rate": 0.00018697342943991042, + "loss": 12.4443, + "step": 6955 + }, + { + "epoch": 0.3787820002314295, + "grad_norm": 0.5718938120004171, + "learning_rate": 0.00018696907711260373, + "loss": 12.3243, + "step": 6956 + }, + { + "epoch": 0.3788364542280125, + "grad_norm": 0.6304957153919665, + "learning_rate": 0.00018696472410901092, + "loss": 12.5637, + "step": 6957 + }, + { + "epoch": 0.37889090822459554, + "grad_norm": 0.6404029072045193, + "learning_rate": 0.00018696037042916582, + "loss": 12.4211, + "step": 6958 + }, + { + "epoch": 0.3789453622211785, + "grad_norm": 0.5913884159550243, + "learning_rate": 0.00018695601607310233, + "loss": 12.4747, + "step": 6959 + }, + { + "epoch": 0.3789998162177615, + "grad_norm": 0.697847973853796, + "learning_rate": 0.00018695166104085425, + "loss": 12.5801, + "step": 6960 + }, + { + "epoch": 0.37905427021434457, + "grad_norm": 0.6430583472870945, + "learning_rate": 0.00018694730533245547, + "loss": 12.5845, + "step": 6961 + }, + { + "epoch": 0.37910872421092756, + "grad_norm": 0.715986131250449, + "learning_rate": 0.00018694294894793987, + "loss": 12.4404, + "step": 6962 + }, + { + "epoch": 0.37916317820751055, + "grad_norm": 0.6646346545669394, + "learning_rate": 0.00018693859188734132, + "loss": 12.5251, + "step": 6963 + }, + { + "epoch": 0.3792176322040936, + "grad_norm": 0.6636199953215705, + "learning_rate": 0.00018693423415069372, + "loss": 12.5536, + "step": 6964 + }, + { + "epoch": 0.3792720862006766, + "grad_norm": 0.688855654533608, + "learning_rate": 0.00018692987573803088, + "loss": 12.5833, + "step": 6965 + }, + { + "epoch": 0.3793265401972596, + "grad_norm": 0.7349669171500565, + "learning_rate": 0.0001869255166493868, + "loss": 12.3737, + "step": 6966 + }, + { + "epoch": 0.37938099419384264, + "grad_norm": 0.6417685318125261, + "learning_rate": 0.00018692115688479532, + "loss": 12.4549, + "step": 6967 + }, + { + "epoch": 0.37943544819042563, + "grad_norm": 0.6182139840504983, + "learning_rate": 0.00018691679644429034, + "loss": 12.3984, + "step": 6968 + }, + { + "epoch": 0.3794899021870086, + "grad_norm": 0.6748562831650746, + "learning_rate": 0.00018691243532790576, + "loss": 12.5668, + "step": 6969 + }, + { + "epoch": 0.37954435618359167, + "grad_norm": 0.6298528138760133, + "learning_rate": 0.00018690807353567552, + "loss": 12.3831, + "step": 6970 + }, + { + "epoch": 0.37959881018017466, + "grad_norm": 0.6123954408760172, + "learning_rate": 0.00018690371106763355, + "loss": 12.3804, + "step": 6971 + }, + { + "epoch": 0.37965326417675765, + "grad_norm": 0.6024966858966703, + "learning_rate": 0.0001868993479238137, + "loss": 12.4899, + "step": 6972 + }, + { + "epoch": 0.3797077181733407, + "grad_norm": 0.6696906147905399, + "learning_rate": 0.00018689498410424997, + "loss": 12.3153, + "step": 6973 + }, + { + "epoch": 0.3797621721699237, + "grad_norm": 0.6218780017682547, + "learning_rate": 0.00018689061960897626, + "loss": 12.2652, + "step": 6974 + }, + { + "epoch": 0.3798166261665067, + "grad_norm": 0.6388311787850542, + "learning_rate": 0.00018688625443802654, + "loss": 12.4931, + "step": 6975 + }, + { + "epoch": 0.37987108016308974, + "grad_norm": 0.6041740522690894, + "learning_rate": 0.00018688188859143474, + "loss": 12.3498, + "step": 6976 + }, + { + "epoch": 0.37992553415967273, + "grad_norm": 0.6607235444859734, + "learning_rate": 0.0001868775220692348, + "loss": 12.5813, + "step": 6977 + }, + { + "epoch": 0.3799799881562557, + "grad_norm": 0.6593464381004138, + "learning_rate": 0.00018687315487146065, + "loss": 12.4265, + "step": 6978 + }, + { + "epoch": 0.38003444215283877, + "grad_norm": 0.6444505430730013, + "learning_rate": 0.00018686878699814629, + "loss": 12.4545, + "step": 6979 + }, + { + "epoch": 0.38008889614942176, + "grad_norm": 0.6341830350464173, + "learning_rate": 0.0001868644184493257, + "loss": 12.4751, + "step": 6980 + }, + { + "epoch": 0.38014335014600475, + "grad_norm": 0.8000018745026319, + "learning_rate": 0.0001868600492250328, + "loss": 12.5567, + "step": 6981 + }, + { + "epoch": 0.3801978041425878, + "grad_norm": 0.6539842272711024, + "learning_rate": 0.00018685567932530162, + "loss": 12.5653, + "step": 6982 + }, + { + "epoch": 0.3802522581391708, + "grad_norm": 0.6947964022800268, + "learning_rate": 0.0001868513087501661, + "loss": 12.3735, + "step": 6983 + }, + { + "epoch": 0.3803067121357538, + "grad_norm": 0.586767907830935, + "learning_rate": 0.0001868469374996602, + "loss": 12.4559, + "step": 6984 + }, + { + "epoch": 0.38036116613233684, + "grad_norm": 0.650386122773557, + "learning_rate": 0.000186842565573818, + "loss": 12.3953, + "step": 6985 + }, + { + "epoch": 0.38041562012891983, + "grad_norm": 0.6709438303267575, + "learning_rate": 0.00018683819297267342, + "loss": 12.4941, + "step": 6986 + }, + { + "epoch": 0.3804700741255028, + "grad_norm": 0.6318954670614686, + "learning_rate": 0.0001868338196962605, + "loss": 12.4621, + "step": 6987 + }, + { + "epoch": 0.38052452812208587, + "grad_norm": 0.5801655542248927, + "learning_rate": 0.00018682944574461324, + "loss": 12.2644, + "step": 6988 + }, + { + "epoch": 0.38057898211866886, + "grad_norm": 0.6750535916093292, + "learning_rate": 0.00018682507111776565, + "loss": 12.5385, + "step": 6989 + }, + { + "epoch": 0.3806334361152519, + "grad_norm": 0.6481330046989016, + "learning_rate": 0.00018682069581575173, + "loss": 12.5772, + "step": 6990 + }, + { + "epoch": 0.3806878901118349, + "grad_norm": 0.6622877449441541, + "learning_rate": 0.00018681631983860552, + "loss": 12.4641, + "step": 6991 + }, + { + "epoch": 0.3807423441084179, + "grad_norm": 0.6137140645342416, + "learning_rate": 0.00018681194318636104, + "loss": 12.4936, + "step": 6992 + }, + { + "epoch": 0.38079679810500094, + "grad_norm": 0.6082401870926571, + "learning_rate": 0.00018680756585905234, + "loss": 12.4662, + "step": 6993 + }, + { + "epoch": 0.38085125210158394, + "grad_norm": 0.6308495684452727, + "learning_rate": 0.00018680318785671348, + "loss": 12.6563, + "step": 6994 + }, + { + "epoch": 0.38090570609816693, + "grad_norm": 0.7416546041029876, + "learning_rate": 0.00018679880917937843, + "loss": 12.5217, + "step": 6995 + }, + { + "epoch": 0.38096016009475, + "grad_norm": 0.7087242752036145, + "learning_rate": 0.00018679442982708132, + "loss": 12.4414, + "step": 6996 + }, + { + "epoch": 0.38101461409133297, + "grad_norm": 0.6176554026255653, + "learning_rate": 0.00018679004979985615, + "loss": 12.3972, + "step": 6997 + }, + { + "epoch": 0.38106906808791596, + "grad_norm": 0.6720561068355646, + "learning_rate": 0.00018678566909773698, + "loss": 12.517, + "step": 6998 + }, + { + "epoch": 0.381123522084499, + "grad_norm": 0.7147146419691247, + "learning_rate": 0.00018678128772075793, + "loss": 12.4048, + "step": 6999 + }, + { + "epoch": 0.381177976081082, + "grad_norm": 0.8218624835163882, + "learning_rate": 0.00018677690566895302, + "loss": 12.3689, + "step": 7000 + }, + { + "epoch": 0.381232430077665, + "grad_norm": 0.9732985061261495, + "learning_rate": 0.00018677252294235634, + "loss": 12.5755, + "step": 7001 + }, + { + "epoch": 0.38128688407424804, + "grad_norm": 0.7098918965887916, + "learning_rate": 0.00018676813954100196, + "loss": 12.4009, + "step": 7002 + }, + { + "epoch": 0.38134133807083104, + "grad_norm": 0.7461767152255029, + "learning_rate": 0.00018676375546492396, + "loss": 12.57, + "step": 7003 + }, + { + "epoch": 0.38139579206741403, + "grad_norm": 0.8675423218585037, + "learning_rate": 0.00018675937071415647, + "loss": 12.3617, + "step": 7004 + }, + { + "epoch": 0.3814502460639971, + "grad_norm": 0.717370084546687, + "learning_rate": 0.0001867549852887336, + "loss": 12.5304, + "step": 7005 + }, + { + "epoch": 0.38150470006058007, + "grad_norm": 0.6231210159631485, + "learning_rate": 0.00018675059918868935, + "loss": 12.3976, + "step": 7006 + }, + { + "epoch": 0.38155915405716306, + "grad_norm": 0.6817521240331398, + "learning_rate": 0.00018674621241405792, + "loss": 12.5481, + "step": 7007 + }, + { + "epoch": 0.3816136080537461, + "grad_norm": 0.7397513221624799, + "learning_rate": 0.0001867418249648734, + "loss": 12.6074, + "step": 7008 + }, + { + "epoch": 0.3816680620503291, + "grad_norm": 0.7386122621149642, + "learning_rate": 0.0001867374368411699, + "loss": 12.5742, + "step": 7009 + }, + { + "epoch": 0.3817225160469121, + "grad_norm": 0.6308245493328805, + "learning_rate": 0.00018673304804298156, + "loss": 12.4646, + "step": 7010 + }, + { + "epoch": 0.38177697004349515, + "grad_norm": 0.6765571167770338, + "learning_rate": 0.00018672865857034246, + "loss": 12.391, + "step": 7011 + }, + { + "epoch": 0.38183142404007814, + "grad_norm": 0.6044597658565793, + "learning_rate": 0.00018672426842328678, + "loss": 12.4188, + "step": 7012 + }, + { + "epoch": 0.38188587803666113, + "grad_norm": 0.6584114653028906, + "learning_rate": 0.00018671987760184865, + "loss": 12.513, + "step": 7013 + }, + { + "epoch": 0.3819403320332442, + "grad_norm": 0.6697607658639393, + "learning_rate": 0.0001867154861060622, + "loss": 12.4682, + "step": 7014 + }, + { + "epoch": 0.38199478602982717, + "grad_norm": 0.7005381190713698, + "learning_rate": 0.00018671109393596157, + "loss": 12.3883, + "step": 7015 + }, + { + "epoch": 0.38204924002641016, + "grad_norm": 0.6644075644800487, + "learning_rate": 0.00018670670109158097, + "loss": 12.5448, + "step": 7016 + }, + { + "epoch": 0.3821036940229932, + "grad_norm": 0.5827707034044087, + "learning_rate": 0.00018670230757295453, + "loss": 12.452, + "step": 7017 + }, + { + "epoch": 0.3821581480195762, + "grad_norm": 0.6974112601121403, + "learning_rate": 0.00018669791338011638, + "loss": 12.4363, + "step": 7018 + }, + { + "epoch": 0.3822126020161592, + "grad_norm": 0.6065298551477821, + "learning_rate": 0.00018669351851310074, + "loss": 12.4149, + "step": 7019 + }, + { + "epoch": 0.38226705601274225, + "grad_norm": 0.649312763578988, + "learning_rate": 0.0001866891229719417, + "loss": 12.5342, + "step": 7020 + }, + { + "epoch": 0.38232151000932524, + "grad_norm": 0.6438265673298778, + "learning_rate": 0.00018668472675667354, + "loss": 12.4505, + "step": 7021 + }, + { + "epoch": 0.38237596400590823, + "grad_norm": 0.6989175412573948, + "learning_rate": 0.00018668032986733044, + "loss": 12.5572, + "step": 7022 + }, + { + "epoch": 0.3824304180024913, + "grad_norm": 0.6689752525509498, + "learning_rate": 0.0001866759323039465, + "loss": 12.5695, + "step": 7023 + }, + { + "epoch": 0.38248487199907427, + "grad_norm": 0.6255947812736262, + "learning_rate": 0.00018667153406655605, + "loss": 12.4277, + "step": 7024 + }, + { + "epoch": 0.3825393259956573, + "grad_norm": 0.6846448014748062, + "learning_rate": 0.00018666713515519314, + "loss": 12.5029, + "step": 7025 + }, + { + "epoch": 0.3825937799922403, + "grad_norm": 0.6256232122015504, + "learning_rate": 0.0001866627355698921, + "loss": 12.4771, + "step": 7026 + }, + { + "epoch": 0.3826482339888233, + "grad_norm": 0.6257148059143375, + "learning_rate": 0.0001866583353106871, + "loss": 12.4713, + "step": 7027 + }, + { + "epoch": 0.38270268798540635, + "grad_norm": 0.7104753379111266, + "learning_rate": 0.00018665393437761231, + "loss": 12.488, + "step": 7028 + }, + { + "epoch": 0.38275714198198935, + "grad_norm": 0.6939144331346863, + "learning_rate": 0.000186649532770702, + "loss": 12.5109, + "step": 7029 + }, + { + "epoch": 0.38281159597857234, + "grad_norm": 0.600769214359039, + "learning_rate": 0.0001866451304899904, + "loss": 12.4113, + "step": 7030 + }, + { + "epoch": 0.3828660499751554, + "grad_norm": 0.569467143868859, + "learning_rate": 0.00018664072753551175, + "loss": 12.2896, + "step": 7031 + }, + { + "epoch": 0.3829205039717384, + "grad_norm": 0.7355576933800445, + "learning_rate": 0.0001866363239073003, + "loss": 12.4322, + "step": 7032 + }, + { + "epoch": 0.3829749579683214, + "grad_norm": 0.6909015301874856, + "learning_rate": 0.00018663191960539022, + "loss": 12.4023, + "step": 7033 + }, + { + "epoch": 0.3830294119649044, + "grad_norm": 0.7983345034187423, + "learning_rate": 0.0001866275146298158, + "loss": 12.4993, + "step": 7034 + }, + { + "epoch": 0.3830838659614874, + "grad_norm": 0.5951362455422403, + "learning_rate": 0.00018662310898061134, + "loss": 12.4463, + "step": 7035 + }, + { + "epoch": 0.3831383199580704, + "grad_norm": 0.6606117787777263, + "learning_rate": 0.00018661870265781103, + "loss": 12.6477, + "step": 7036 + }, + { + "epoch": 0.38319277395465345, + "grad_norm": 0.6052443542532031, + "learning_rate": 0.00018661429566144917, + "loss": 12.3899, + "step": 7037 + }, + { + "epoch": 0.38324722795123645, + "grad_norm": 0.6912374558936648, + "learning_rate": 0.00018660988799156002, + "loss": 12.3956, + "step": 7038 + }, + { + "epoch": 0.38330168194781944, + "grad_norm": 0.678500062864124, + "learning_rate": 0.00018660547964817784, + "loss": 12.4912, + "step": 7039 + }, + { + "epoch": 0.3833561359444025, + "grad_norm": 0.6451129761422665, + "learning_rate": 0.00018660107063133693, + "loss": 12.4821, + "step": 7040 + }, + { + "epoch": 0.3834105899409855, + "grad_norm": 0.7276540261481783, + "learning_rate": 0.0001865966609410716, + "loss": 12.5236, + "step": 7041 + }, + { + "epoch": 0.3834650439375685, + "grad_norm": 0.7008677196954861, + "learning_rate": 0.0001865922505774161, + "loss": 12.4493, + "step": 7042 + }, + { + "epoch": 0.3835194979341515, + "grad_norm": 0.6704835137426014, + "learning_rate": 0.00018658783954040472, + "loss": 12.6249, + "step": 7043 + }, + { + "epoch": 0.3835739519307345, + "grad_norm": 0.7532813678875407, + "learning_rate": 0.0001865834278300718, + "loss": 12.4514, + "step": 7044 + }, + { + "epoch": 0.3836284059273175, + "grad_norm": 0.6759469688196224, + "learning_rate": 0.0001865790154464516, + "loss": 12.6866, + "step": 7045 + }, + { + "epoch": 0.38368285992390055, + "grad_norm": 0.5925743120960456, + "learning_rate": 0.0001865746023895785, + "loss": 12.3878, + "step": 7046 + }, + { + "epoch": 0.38373731392048355, + "grad_norm": 0.7693887197413217, + "learning_rate": 0.00018657018865948674, + "loss": 12.6576, + "step": 7047 + }, + { + "epoch": 0.38379176791706654, + "grad_norm": 0.6898680905211517, + "learning_rate": 0.00018656577425621066, + "loss": 12.5065, + "step": 7048 + }, + { + "epoch": 0.3838462219136496, + "grad_norm": 0.7156536380503498, + "learning_rate": 0.00018656135917978462, + "loss": 12.4546, + "step": 7049 + }, + { + "epoch": 0.3839006759102326, + "grad_norm": 0.6994971638710071, + "learning_rate": 0.00018655694343024294, + "loss": 12.6109, + "step": 7050 + }, + { + "epoch": 0.3839551299068156, + "grad_norm": 0.7793553009856318, + "learning_rate": 0.00018655252700761996, + "loss": 12.4605, + "step": 7051 + }, + { + "epoch": 0.3840095839033986, + "grad_norm": 0.6759341611546079, + "learning_rate": 0.00018654810991195001, + "loss": 12.464, + "step": 7052 + }, + { + "epoch": 0.3840640378999816, + "grad_norm": 0.6301665691505982, + "learning_rate": 0.00018654369214326746, + "loss": 12.517, + "step": 7053 + }, + { + "epoch": 0.3841184918965646, + "grad_norm": 0.7135243832459273, + "learning_rate": 0.0001865392737016066, + "loss": 12.4516, + "step": 7054 + }, + { + "epoch": 0.38417294589314765, + "grad_norm": 0.6858399057276566, + "learning_rate": 0.00018653485458700186, + "loss": 12.5874, + "step": 7055 + }, + { + "epoch": 0.38422739988973065, + "grad_norm": 0.8149053831416443, + "learning_rate": 0.00018653043479948758, + "loss": 12.5103, + "step": 7056 + }, + { + "epoch": 0.3842818538863137, + "grad_norm": 0.7046987729073869, + "learning_rate": 0.00018652601433909814, + "loss": 12.5548, + "step": 7057 + }, + { + "epoch": 0.3843363078828967, + "grad_norm": 0.6290864552411815, + "learning_rate": 0.00018652159320586788, + "loss": 12.499, + "step": 7058 + }, + { + "epoch": 0.3843907618794797, + "grad_norm": 0.6916875542987843, + "learning_rate": 0.00018651717139983123, + "loss": 12.4951, + "step": 7059 + }, + { + "epoch": 0.38444521587606273, + "grad_norm": 0.6978979497616465, + "learning_rate": 0.0001865127489210225, + "loss": 12.6373, + "step": 7060 + }, + { + "epoch": 0.3844996698726457, + "grad_norm": 0.610515344658147, + "learning_rate": 0.0001865083257694762, + "loss": 12.5357, + "step": 7061 + }, + { + "epoch": 0.3845541238692287, + "grad_norm": 0.6823700404070057, + "learning_rate": 0.00018650390194522657, + "loss": 12.5222, + "step": 7062 + }, + { + "epoch": 0.38460857786581176, + "grad_norm": 0.6160180562682802, + "learning_rate": 0.00018649947744830815, + "loss": 12.4801, + "step": 7063 + }, + { + "epoch": 0.38466303186239476, + "grad_norm": 0.6670446903986819, + "learning_rate": 0.00018649505227875525, + "loss": 12.6218, + "step": 7064 + }, + { + "epoch": 0.38471748585897775, + "grad_norm": 0.5787244558977549, + "learning_rate": 0.00018649062643660234, + "loss": 12.3791, + "step": 7065 + }, + { + "epoch": 0.3847719398555608, + "grad_norm": 0.6615016138111451, + "learning_rate": 0.0001864861999218838, + "loss": 12.4197, + "step": 7066 + }, + { + "epoch": 0.3848263938521438, + "grad_norm": 0.6674308971925494, + "learning_rate": 0.00018648177273463407, + "loss": 12.5483, + "step": 7067 + }, + { + "epoch": 0.3848808478487268, + "grad_norm": 0.6715041283823071, + "learning_rate": 0.0001864773448748876, + "loss": 12.4357, + "step": 7068 + }, + { + "epoch": 0.38493530184530983, + "grad_norm": 0.6385821710498155, + "learning_rate": 0.00018647291634267874, + "loss": 12.45, + "step": 7069 + }, + { + "epoch": 0.3849897558418928, + "grad_norm": 0.667305118023495, + "learning_rate": 0.00018646848713804203, + "loss": 12.5501, + "step": 7070 + }, + { + "epoch": 0.3850442098384758, + "grad_norm": 0.6637185088679254, + "learning_rate": 0.0001864640572610118, + "loss": 12.5592, + "step": 7071 + }, + { + "epoch": 0.38509866383505886, + "grad_norm": 0.6979028179275554, + "learning_rate": 0.00018645962671162263, + "loss": 12.4207, + "step": 7072 + }, + { + "epoch": 0.38515311783164186, + "grad_norm": 0.7793517294375236, + "learning_rate": 0.00018645519548990888, + "loss": 12.4839, + "step": 7073 + }, + { + "epoch": 0.38520757182822485, + "grad_norm": 0.6047285988296436, + "learning_rate": 0.00018645076359590502, + "loss": 12.4372, + "step": 7074 + }, + { + "epoch": 0.3852620258248079, + "grad_norm": 0.6646751397762883, + "learning_rate": 0.00018644633102964556, + "loss": 12.593, + "step": 7075 + }, + { + "epoch": 0.3853164798213909, + "grad_norm": 0.729705495106553, + "learning_rate": 0.00018644189779116487, + "loss": 12.4074, + "step": 7076 + }, + { + "epoch": 0.3853709338179739, + "grad_norm": 0.6084925856486768, + "learning_rate": 0.00018643746388049754, + "loss": 12.5115, + "step": 7077 + }, + { + "epoch": 0.38542538781455693, + "grad_norm": 0.7647209717282059, + "learning_rate": 0.00018643302929767795, + "loss": 12.5896, + "step": 7078 + }, + { + "epoch": 0.3854798418111399, + "grad_norm": 0.5955347064856068, + "learning_rate": 0.00018642859404274068, + "loss": 12.5077, + "step": 7079 + }, + { + "epoch": 0.3855342958077229, + "grad_norm": 0.6422483619212731, + "learning_rate": 0.00018642415811572017, + "loss": 12.5562, + "step": 7080 + }, + { + "epoch": 0.38558874980430596, + "grad_norm": 0.694516033481041, + "learning_rate": 0.00018641972151665085, + "loss": 12.5192, + "step": 7081 + }, + { + "epoch": 0.38564320380088896, + "grad_norm": 0.6587257113621006, + "learning_rate": 0.00018641528424556735, + "loss": 12.5755, + "step": 7082 + }, + { + "epoch": 0.38569765779747195, + "grad_norm": 0.6763562072483827, + "learning_rate": 0.00018641084630250407, + "loss": 12.4134, + "step": 7083 + }, + { + "epoch": 0.385752111794055, + "grad_norm": 0.6203052034426025, + "learning_rate": 0.00018640640768749557, + "loss": 12.5064, + "step": 7084 + }, + { + "epoch": 0.385806565790638, + "grad_norm": 0.659861270783671, + "learning_rate": 0.00018640196840057636, + "loss": 12.4955, + "step": 7085 + }, + { + "epoch": 0.385861019787221, + "grad_norm": 0.6437770506543342, + "learning_rate": 0.00018639752844178093, + "loss": 12.5114, + "step": 7086 + }, + { + "epoch": 0.38591547378380403, + "grad_norm": 0.7570541110260534, + "learning_rate": 0.00018639308781114386, + "loss": 12.6241, + "step": 7087 + }, + { + "epoch": 0.385969927780387, + "grad_norm": 0.7411434827368806, + "learning_rate": 0.00018638864650869966, + "loss": 12.4622, + "step": 7088 + }, + { + "epoch": 0.38602438177697, + "grad_norm": 0.6590951056007902, + "learning_rate": 0.0001863842045344828, + "loss": 12.564, + "step": 7089 + }, + { + "epoch": 0.38607883577355306, + "grad_norm": 0.6097965134684681, + "learning_rate": 0.0001863797618885279, + "loss": 12.5037, + "step": 7090 + }, + { + "epoch": 0.38613328977013606, + "grad_norm": 0.6427854864860034, + "learning_rate": 0.00018637531857086952, + "loss": 12.4269, + "step": 7091 + }, + { + "epoch": 0.3861877437667191, + "grad_norm": 0.7155316157969179, + "learning_rate": 0.00018637087458154214, + "loss": 12.5963, + "step": 7092 + }, + { + "epoch": 0.3862421977633021, + "grad_norm": 0.6965946192322601, + "learning_rate": 0.00018636642992058038, + "loss": 12.5289, + "step": 7093 + }, + { + "epoch": 0.3862966517598851, + "grad_norm": 0.6009755117826611, + "learning_rate": 0.00018636198458801877, + "loss": 12.4643, + "step": 7094 + }, + { + "epoch": 0.38635110575646814, + "grad_norm": 0.7223297002920598, + "learning_rate": 0.0001863575385838919, + "loss": 12.6021, + "step": 7095 + }, + { + "epoch": 0.38640555975305113, + "grad_norm": 0.6603037776913001, + "learning_rate": 0.0001863530919082343, + "loss": 12.5357, + "step": 7096 + }, + { + "epoch": 0.3864600137496341, + "grad_norm": 0.6071533180146176, + "learning_rate": 0.00018634864456108056, + "loss": 12.4168, + "step": 7097 + }, + { + "epoch": 0.38651446774621717, + "grad_norm": 0.7172644840488438, + "learning_rate": 0.00018634419654246532, + "loss": 12.5584, + "step": 7098 + }, + { + "epoch": 0.38656892174280016, + "grad_norm": 0.7060305880193891, + "learning_rate": 0.00018633974785242313, + "loss": 12.4297, + "step": 7099 + }, + { + "epoch": 0.38662337573938316, + "grad_norm": 0.6810705096067343, + "learning_rate": 0.00018633529849098856, + "loss": 12.5561, + "step": 7100 + }, + { + "epoch": 0.3866778297359662, + "grad_norm": 0.6530001434960051, + "learning_rate": 0.0001863308484581962, + "loss": 12.5864, + "step": 7101 + }, + { + "epoch": 0.3867322837325492, + "grad_norm": 0.6948748912939586, + "learning_rate": 0.00018632639775408073, + "loss": 12.4706, + "step": 7102 + }, + { + "epoch": 0.3867867377291322, + "grad_norm": 0.6345644797992628, + "learning_rate": 0.0001863219463786767, + "loss": 12.395, + "step": 7103 + }, + { + "epoch": 0.38684119172571524, + "grad_norm": 0.7069631467744489, + "learning_rate": 0.00018631749433201876, + "loss": 12.6182, + "step": 7104 + }, + { + "epoch": 0.38689564572229823, + "grad_norm": 0.6557637249767786, + "learning_rate": 0.0001863130416141415, + "loss": 12.5655, + "step": 7105 + }, + { + "epoch": 0.3869500997188812, + "grad_norm": 0.5965658094492697, + "learning_rate": 0.00018630858822507956, + "loss": 12.4235, + "step": 7106 + }, + { + "epoch": 0.3870045537154643, + "grad_norm": 0.6341298768236069, + "learning_rate": 0.00018630413416486754, + "loss": 12.4438, + "step": 7107 + }, + { + "epoch": 0.38705900771204726, + "grad_norm": 0.6500506288593676, + "learning_rate": 0.0001862996794335401, + "loss": 12.5798, + "step": 7108 + }, + { + "epoch": 0.38711346170863026, + "grad_norm": 0.6411657990928703, + "learning_rate": 0.0001862952240311319, + "loss": 12.4585, + "step": 7109 + }, + { + "epoch": 0.3871679157052133, + "grad_norm": 0.6437618395015029, + "learning_rate": 0.00018629076795767755, + "loss": 12.4662, + "step": 7110 + }, + { + "epoch": 0.3872223697017963, + "grad_norm": 0.665479862318005, + "learning_rate": 0.00018628631121321172, + "loss": 12.4517, + "step": 7111 + }, + { + "epoch": 0.3872768236983793, + "grad_norm": 0.6880421473175254, + "learning_rate": 0.00018628185379776909, + "loss": 12.4043, + "step": 7112 + }, + { + "epoch": 0.38733127769496234, + "grad_norm": 0.6701975253629583, + "learning_rate": 0.00018627739571138425, + "loss": 12.4521, + "step": 7113 + }, + { + "epoch": 0.38738573169154533, + "grad_norm": 0.6120212944128985, + "learning_rate": 0.00018627293695409194, + "loss": 12.5299, + "step": 7114 + }, + { + "epoch": 0.3874401856881283, + "grad_norm": 0.6194184670856256, + "learning_rate": 0.0001862684775259268, + "loss": 12.4645, + "step": 7115 + }, + { + "epoch": 0.3874946396847114, + "grad_norm": 0.7047969299610378, + "learning_rate": 0.0001862640174269235, + "loss": 12.6798, + "step": 7116 + }, + { + "epoch": 0.38754909368129437, + "grad_norm": 0.7116460721530385, + "learning_rate": 0.00018625955665711673, + "loss": 12.5787, + "step": 7117 + }, + { + "epoch": 0.38760354767787736, + "grad_norm": 0.7729800247148316, + "learning_rate": 0.00018625509521654122, + "loss": 12.462, + "step": 7118 + }, + { + "epoch": 0.3876580016744604, + "grad_norm": 0.6599703454795484, + "learning_rate": 0.0001862506331052316, + "loss": 12.4132, + "step": 7119 + }, + { + "epoch": 0.3877124556710434, + "grad_norm": 0.7320634611929326, + "learning_rate": 0.0001862461703232226, + "loss": 12.6311, + "step": 7120 + }, + { + "epoch": 0.3877669096676264, + "grad_norm": 0.7085179533336992, + "learning_rate": 0.0001862417068705489, + "loss": 12.6353, + "step": 7121 + }, + { + "epoch": 0.38782136366420944, + "grad_norm": 0.7532299118642597, + "learning_rate": 0.00018623724274724522, + "loss": 12.4862, + "step": 7122 + }, + { + "epoch": 0.38787581766079243, + "grad_norm": 0.6448793596255141, + "learning_rate": 0.00018623277795334632, + "loss": 12.4253, + "step": 7123 + }, + { + "epoch": 0.3879302716573755, + "grad_norm": 0.6352504803054885, + "learning_rate": 0.00018622831248888682, + "loss": 12.4379, + "step": 7124 + }, + { + "epoch": 0.3879847256539585, + "grad_norm": 0.6218883827318556, + "learning_rate": 0.00018622384635390152, + "loss": 12.5006, + "step": 7125 + }, + { + "epoch": 0.38803917965054147, + "grad_norm": 0.6190690293378829, + "learning_rate": 0.00018621937954842516, + "loss": 12.4908, + "step": 7126 + }, + { + "epoch": 0.3880936336471245, + "grad_norm": 0.6275365107995106, + "learning_rate": 0.00018621491207249243, + "loss": 12.45, + "step": 7127 + }, + { + "epoch": 0.3881480876437075, + "grad_norm": 0.6047005196824049, + "learning_rate": 0.00018621044392613809, + "loss": 12.3975, + "step": 7128 + }, + { + "epoch": 0.3882025416402905, + "grad_norm": 0.6524608765328237, + "learning_rate": 0.00018620597510939687, + "loss": 12.4844, + "step": 7129 + }, + { + "epoch": 0.38825699563687355, + "grad_norm": 0.7510200705796262, + "learning_rate": 0.00018620150562230354, + "loss": 12.6705, + "step": 7130 + }, + { + "epoch": 0.38831144963345654, + "grad_norm": 0.73051352377799, + "learning_rate": 0.00018619703546489286, + "loss": 12.4825, + "step": 7131 + }, + { + "epoch": 0.38836590363003953, + "grad_norm": 0.7746106714477208, + "learning_rate": 0.00018619256463719953, + "loss": 12.5045, + "step": 7132 + }, + { + "epoch": 0.3884203576266226, + "grad_norm": 0.6517344434189947, + "learning_rate": 0.0001861880931392584, + "loss": 12.472, + "step": 7133 + }, + { + "epoch": 0.3884748116232056, + "grad_norm": 0.7236585827856387, + "learning_rate": 0.00018618362097110418, + "loss": 12.4292, + "step": 7134 + }, + { + "epoch": 0.38852926561978857, + "grad_norm": 0.7377438184733501, + "learning_rate": 0.0001861791481327717, + "loss": 12.52, + "step": 7135 + }, + { + "epoch": 0.3885837196163716, + "grad_norm": 0.701077172486775, + "learning_rate": 0.0001861746746242957, + "loss": 12.3832, + "step": 7136 + }, + { + "epoch": 0.3886381736129546, + "grad_norm": 0.7465602021994938, + "learning_rate": 0.00018617020044571096, + "loss": 12.477, + "step": 7137 + }, + { + "epoch": 0.3886926276095376, + "grad_norm": 0.6913046599592584, + "learning_rate": 0.00018616572559705232, + "loss": 12.494, + "step": 7138 + }, + { + "epoch": 0.38874708160612065, + "grad_norm": 0.798704738006824, + "learning_rate": 0.00018616125007835454, + "loss": 12.4876, + "step": 7139 + }, + { + "epoch": 0.38880153560270364, + "grad_norm": 0.7090710177877242, + "learning_rate": 0.0001861567738896524, + "loss": 12.518, + "step": 7140 + }, + { + "epoch": 0.38885598959928663, + "grad_norm": 0.6283988372735225, + "learning_rate": 0.00018615229703098076, + "loss": 12.612, + "step": 7141 + }, + { + "epoch": 0.3889104435958697, + "grad_norm": 0.7791594674546557, + "learning_rate": 0.00018614781950237444, + "loss": 12.6456, + "step": 7142 + }, + { + "epoch": 0.3889648975924527, + "grad_norm": 0.6618988345899924, + "learning_rate": 0.0001861433413038682, + "loss": 12.4756, + "step": 7143 + }, + { + "epoch": 0.38901935158903567, + "grad_norm": 0.7549097356269053, + "learning_rate": 0.0001861388624354969, + "loss": 12.4658, + "step": 7144 + }, + { + "epoch": 0.3890738055856187, + "grad_norm": 0.6821411593614526, + "learning_rate": 0.00018613438289729535, + "loss": 12.5978, + "step": 7145 + }, + { + "epoch": 0.3891282595822017, + "grad_norm": 0.6596391875897409, + "learning_rate": 0.00018612990268929838, + "loss": 12.433, + "step": 7146 + }, + { + "epoch": 0.3891827135787847, + "grad_norm": 0.6942742601285982, + "learning_rate": 0.00018612542181154087, + "loss": 12.4488, + "step": 7147 + }, + { + "epoch": 0.38923716757536775, + "grad_norm": 0.6441158155215414, + "learning_rate": 0.00018612094026405763, + "loss": 12.3776, + "step": 7148 + }, + { + "epoch": 0.38929162157195074, + "grad_norm": 0.6671097050311692, + "learning_rate": 0.0001861164580468835, + "loss": 12.6619, + "step": 7149 + }, + { + "epoch": 0.38934607556853373, + "grad_norm": 0.6378455673853604, + "learning_rate": 0.00018611197516005335, + "loss": 12.401, + "step": 7150 + }, + { + "epoch": 0.3894005295651168, + "grad_norm": 0.5825367655607089, + "learning_rate": 0.0001861074916036021, + "loss": 12.5309, + "step": 7151 + }, + { + "epoch": 0.3894549835616998, + "grad_norm": 0.7563755785105186, + "learning_rate": 0.00018610300737756448, + "loss": 12.4888, + "step": 7152 + }, + { + "epoch": 0.38950943755828277, + "grad_norm": 0.7009475975387238, + "learning_rate": 0.00018609852248197546, + "loss": 12.5647, + "step": 7153 + }, + { + "epoch": 0.3895638915548658, + "grad_norm": 0.6796488958941794, + "learning_rate": 0.0001860940369168699, + "loss": 12.4961, + "step": 7154 + }, + { + "epoch": 0.3896183455514488, + "grad_norm": 0.6557015786790266, + "learning_rate": 0.00018608955068228267, + "loss": 12.4476, + "step": 7155 + }, + { + "epoch": 0.3896727995480318, + "grad_norm": 0.7001203977916104, + "learning_rate": 0.00018608506377824864, + "loss": 12.5892, + "step": 7156 + }, + { + "epoch": 0.38972725354461485, + "grad_norm": 0.7131040980800413, + "learning_rate": 0.00018608057620480274, + "loss": 12.4833, + "step": 7157 + }, + { + "epoch": 0.38978170754119784, + "grad_norm": 0.7123328847352828, + "learning_rate": 0.00018607608796197982, + "loss": 12.5396, + "step": 7158 + }, + { + "epoch": 0.3898361615377809, + "grad_norm": 0.6952953693606627, + "learning_rate": 0.00018607159904981483, + "loss": 12.3928, + "step": 7159 + }, + { + "epoch": 0.3898906155343639, + "grad_norm": 0.6573785347664906, + "learning_rate": 0.0001860671094683426, + "loss": 12.4525, + "step": 7160 + }, + { + "epoch": 0.3899450695309469, + "grad_norm": 0.5640682053021612, + "learning_rate": 0.00018606261921759814, + "loss": 12.4185, + "step": 7161 + }, + { + "epoch": 0.3899995235275299, + "grad_norm": 0.6663834979866167, + "learning_rate": 0.00018605812829761633, + "loss": 12.4792, + "step": 7162 + }, + { + "epoch": 0.3900539775241129, + "grad_norm": 0.6167571085476997, + "learning_rate": 0.00018605363670843206, + "loss": 12.3879, + "step": 7163 + }, + { + "epoch": 0.3901084315206959, + "grad_norm": 0.6064142565549342, + "learning_rate": 0.0001860491444500803, + "loss": 12.5378, + "step": 7164 + }, + { + "epoch": 0.39016288551727896, + "grad_norm": 0.5654870240016255, + "learning_rate": 0.00018604465152259595, + "loss": 12.4184, + "step": 7165 + }, + { + "epoch": 0.39021733951386195, + "grad_norm": 0.7203300327021557, + "learning_rate": 0.00018604015792601396, + "loss": 12.4794, + "step": 7166 + }, + { + "epoch": 0.39027179351044494, + "grad_norm": 0.6751225100880477, + "learning_rate": 0.00018603566366036923, + "loss": 12.6103, + "step": 7167 + }, + { + "epoch": 0.390326247507028, + "grad_norm": 0.6162717287525183, + "learning_rate": 0.00018603116872569682, + "loss": 12.3781, + "step": 7168 + }, + { + "epoch": 0.390380701503611, + "grad_norm": 0.6081743008463881, + "learning_rate": 0.00018602667312203158, + "loss": 12.4616, + "step": 7169 + }, + { + "epoch": 0.390435155500194, + "grad_norm": 0.7200821163330738, + "learning_rate": 0.0001860221768494085, + "loss": 12.2856, + "step": 7170 + }, + { + "epoch": 0.390489609496777, + "grad_norm": 0.6801818674283662, + "learning_rate": 0.00018601767990786256, + "loss": 12.4423, + "step": 7171 + }, + { + "epoch": 0.39054406349336, + "grad_norm": 0.7458239719186429, + "learning_rate": 0.00018601318229742874, + "loss": 12.4531, + "step": 7172 + }, + { + "epoch": 0.390598517489943, + "grad_norm": 0.636396501553983, + "learning_rate": 0.00018600868401814194, + "loss": 12.418, + "step": 7173 + }, + { + "epoch": 0.39065297148652606, + "grad_norm": 0.6982245766018101, + "learning_rate": 0.00018600418507003723, + "loss": 12.6615, + "step": 7174 + }, + { + "epoch": 0.39070742548310905, + "grad_norm": 0.6725384848733218, + "learning_rate": 0.00018599968545314951, + "loss": 12.5772, + "step": 7175 + }, + { + "epoch": 0.39076187947969204, + "grad_norm": 0.6217640478486886, + "learning_rate": 0.00018599518516751386, + "loss": 12.4921, + "step": 7176 + }, + { + "epoch": 0.3908163334762751, + "grad_norm": 0.735065262741298, + "learning_rate": 0.0001859906842131652, + "loss": 12.606, + "step": 7177 + }, + { + "epoch": 0.3908707874728581, + "grad_norm": 0.6181369701959938, + "learning_rate": 0.00018598618259013856, + "loss": 12.3427, + "step": 7178 + }, + { + "epoch": 0.3909252414694411, + "grad_norm": 0.6777786413899306, + "learning_rate": 0.00018598168029846895, + "loss": 12.4036, + "step": 7179 + }, + { + "epoch": 0.3909796954660241, + "grad_norm": 0.679731794899129, + "learning_rate": 0.00018597717733819137, + "loss": 12.4271, + "step": 7180 + }, + { + "epoch": 0.3910341494626071, + "grad_norm": 0.6111586941549961, + "learning_rate": 0.00018597267370934085, + "loss": 12.4846, + "step": 7181 + }, + { + "epoch": 0.3910886034591901, + "grad_norm": 0.6457344417871698, + "learning_rate": 0.0001859681694119524, + "loss": 12.5326, + "step": 7182 + }, + { + "epoch": 0.39114305745577316, + "grad_norm": 0.6598166421712464, + "learning_rate": 0.00018596366444606106, + "loss": 12.3722, + "step": 7183 + }, + { + "epoch": 0.39119751145235615, + "grad_norm": 0.6505653730095216, + "learning_rate": 0.00018595915881170183, + "loss": 12.4866, + "step": 7184 + }, + { + "epoch": 0.39125196544893914, + "grad_norm": 0.8942655461387717, + "learning_rate": 0.00018595465250890975, + "loss": 12.6271, + "step": 7185 + }, + { + "epoch": 0.3913064194455222, + "grad_norm": 0.683973537855153, + "learning_rate": 0.00018595014553771992, + "loss": 12.4369, + "step": 7186 + }, + { + "epoch": 0.3913608734421052, + "grad_norm": 0.7404856511844542, + "learning_rate": 0.00018594563789816734, + "loss": 12.4747, + "step": 7187 + }, + { + "epoch": 0.3914153274386882, + "grad_norm": 0.7896839713826467, + "learning_rate": 0.00018594112959028706, + "loss": 12.3406, + "step": 7188 + }, + { + "epoch": 0.3914697814352712, + "grad_norm": 0.6195008716658261, + "learning_rate": 0.00018593662061411413, + "loss": 12.4288, + "step": 7189 + }, + { + "epoch": 0.3915242354318542, + "grad_norm": 0.7143577956920403, + "learning_rate": 0.00018593211096968362, + "loss": 12.4196, + "step": 7190 + }, + { + "epoch": 0.39157868942843727, + "grad_norm": 0.7425400955747433, + "learning_rate": 0.0001859276006570306, + "loss": 12.606, + "step": 7191 + }, + { + "epoch": 0.39163314342502026, + "grad_norm": 0.6396423199370846, + "learning_rate": 0.00018592308967619017, + "loss": 12.6347, + "step": 7192 + }, + { + "epoch": 0.39168759742160325, + "grad_norm": 0.6885635209124368, + "learning_rate": 0.00018591857802719737, + "loss": 12.5374, + "step": 7193 + }, + { + "epoch": 0.3917420514181863, + "grad_norm": 0.6645824603123485, + "learning_rate": 0.0001859140657100873, + "loss": 12.5546, + "step": 7194 + }, + { + "epoch": 0.3917965054147693, + "grad_norm": 0.5879980669589716, + "learning_rate": 0.00018590955272489504, + "loss": 12.3882, + "step": 7195 + }, + { + "epoch": 0.3918509594113523, + "grad_norm": 0.7996186092060785, + "learning_rate": 0.00018590503907165573, + "loss": 12.5485, + "step": 7196 + }, + { + "epoch": 0.39190541340793533, + "grad_norm": 0.6731604785694854, + "learning_rate": 0.0001859005247504044, + "loss": 12.5588, + "step": 7197 + }, + { + "epoch": 0.3919598674045183, + "grad_norm": 0.7331086743184092, + "learning_rate": 0.00018589600976117617, + "loss": 12.3912, + "step": 7198 + }, + { + "epoch": 0.3920143214011013, + "grad_norm": 0.6348125847667513, + "learning_rate": 0.0001858914941040062, + "loss": 12.5074, + "step": 7199 + }, + { + "epoch": 0.39206877539768437, + "grad_norm": 0.6554620268542093, + "learning_rate": 0.00018588697777892953, + "loss": 12.4549, + "step": 7200 + }, + { + "epoch": 0.39212322939426736, + "grad_norm": 0.6242822708657442, + "learning_rate": 0.00018588246078598135, + "loss": 12.5546, + "step": 7201 + }, + { + "epoch": 0.39217768339085035, + "grad_norm": 0.762153351802112, + "learning_rate": 0.00018587794312519674, + "loss": 12.5397, + "step": 7202 + }, + { + "epoch": 0.3922321373874334, + "grad_norm": 0.6007328039341426, + "learning_rate": 0.00018587342479661084, + "loss": 12.3427, + "step": 7203 + }, + { + "epoch": 0.3922865913840164, + "grad_norm": 0.6532959864928064, + "learning_rate": 0.00018586890580025878, + "loss": 12.5656, + "step": 7204 + }, + { + "epoch": 0.3923410453805994, + "grad_norm": 0.6905479326225938, + "learning_rate": 0.0001858643861361757, + "loss": 12.4903, + "step": 7205 + }, + { + "epoch": 0.39239549937718243, + "grad_norm": 0.851211578891341, + "learning_rate": 0.00018585986580439682, + "loss": 12.5757, + "step": 7206 + }, + { + "epoch": 0.3924499533737654, + "grad_norm": 0.683472164164967, + "learning_rate": 0.00018585534480495714, + "loss": 12.3313, + "step": 7207 + }, + { + "epoch": 0.3925044073703484, + "grad_norm": 0.607373149852656, + "learning_rate": 0.00018585082313789196, + "loss": 12.3781, + "step": 7208 + }, + { + "epoch": 0.39255886136693147, + "grad_norm": 0.6533168461112862, + "learning_rate": 0.00018584630080323633, + "loss": 12.5263, + "step": 7209 + }, + { + "epoch": 0.39261331536351446, + "grad_norm": 0.6703700361785917, + "learning_rate": 0.00018584177780102553, + "loss": 12.3924, + "step": 7210 + }, + { + "epoch": 0.39266776936009745, + "grad_norm": 0.7017657038020355, + "learning_rate": 0.00018583725413129462, + "loss": 12.6166, + "step": 7211 + }, + { + "epoch": 0.3927222233566805, + "grad_norm": 0.6458042616638657, + "learning_rate": 0.00018583272979407885, + "loss": 12.4746, + "step": 7212 + }, + { + "epoch": 0.3927766773532635, + "grad_norm": 0.6663919039166154, + "learning_rate": 0.00018582820478941337, + "loss": 12.4783, + "step": 7213 + }, + { + "epoch": 0.3928311313498465, + "grad_norm": 0.6678956163369926, + "learning_rate": 0.00018582367911733339, + "loss": 12.6578, + "step": 7214 + }, + { + "epoch": 0.39288558534642953, + "grad_norm": 0.6717646126286817, + "learning_rate": 0.00018581915277787406, + "loss": 12.5549, + "step": 7215 + }, + { + "epoch": 0.3929400393430125, + "grad_norm": 0.6514951420285618, + "learning_rate": 0.00018581462577107062, + "loss": 12.4215, + "step": 7216 + }, + { + "epoch": 0.3929944933395955, + "grad_norm": 0.6131590893663512, + "learning_rate": 0.00018581009809695828, + "loss": 12.413, + "step": 7217 + }, + { + "epoch": 0.39304894733617857, + "grad_norm": 0.7061939089881658, + "learning_rate": 0.0001858055697555722, + "loss": 12.4528, + "step": 7218 + }, + { + "epoch": 0.39310340133276156, + "grad_norm": 0.6396540860826965, + "learning_rate": 0.00018580104074694765, + "loss": 12.4773, + "step": 7219 + }, + { + "epoch": 0.39315785532934455, + "grad_norm": 0.6813268667296958, + "learning_rate": 0.00018579651107111979, + "loss": 12.1887, + "step": 7220 + }, + { + "epoch": 0.3932123093259276, + "grad_norm": 0.652895067238895, + "learning_rate": 0.00018579198072812386, + "loss": 12.5702, + "step": 7221 + }, + { + "epoch": 0.3932667633225106, + "grad_norm": 0.6312854585604019, + "learning_rate": 0.00018578744971799513, + "loss": 12.4433, + "step": 7222 + }, + { + "epoch": 0.3933212173190936, + "grad_norm": 0.7119148682281505, + "learning_rate": 0.0001857829180407688, + "loss": 12.4816, + "step": 7223 + }, + { + "epoch": 0.39337567131567663, + "grad_norm": 0.5874724663556574, + "learning_rate": 0.00018577838569648012, + "loss": 12.3591, + "step": 7224 + }, + { + "epoch": 0.3934301253122596, + "grad_norm": 0.724354991397679, + "learning_rate": 0.0001857738526851643, + "loss": 12.483, + "step": 7225 + }, + { + "epoch": 0.3934845793088427, + "grad_norm": 0.6301934564762179, + "learning_rate": 0.00018576931900685665, + "loss": 12.421, + "step": 7226 + }, + { + "epoch": 0.39353903330542567, + "grad_norm": 0.6071793867462213, + "learning_rate": 0.00018576478466159237, + "loss": 12.4453, + "step": 7227 + }, + { + "epoch": 0.39359348730200866, + "grad_norm": 0.6099905450348952, + "learning_rate": 0.00018576024964940673, + "loss": 12.3518, + "step": 7228 + }, + { + "epoch": 0.3936479412985917, + "grad_norm": 0.6502863285735471, + "learning_rate": 0.00018575571397033504, + "loss": 12.2838, + "step": 7229 + }, + { + "epoch": 0.3937023952951747, + "grad_norm": 0.6212062647781128, + "learning_rate": 0.00018575117762441252, + "loss": 12.5731, + "step": 7230 + }, + { + "epoch": 0.3937568492917577, + "grad_norm": 0.6241215749927159, + "learning_rate": 0.00018574664061167447, + "loss": 12.4702, + "step": 7231 + }, + { + "epoch": 0.39381130328834074, + "grad_norm": 0.6667971540441856, + "learning_rate": 0.00018574210293215615, + "loss": 12.5083, + "step": 7232 + }, + { + "epoch": 0.39386575728492373, + "grad_norm": 0.612237029823004, + "learning_rate": 0.00018573756458589288, + "loss": 12.4686, + "step": 7233 + }, + { + "epoch": 0.3939202112815067, + "grad_norm": 0.6123887965534771, + "learning_rate": 0.00018573302557291989, + "loss": 12.4532, + "step": 7234 + }, + { + "epoch": 0.3939746652780898, + "grad_norm": 0.5806781769345054, + "learning_rate": 0.00018572848589327255, + "loss": 12.4606, + "step": 7235 + }, + { + "epoch": 0.39402911927467277, + "grad_norm": 0.6702610377858681, + "learning_rate": 0.00018572394554698614, + "loss": 12.5989, + "step": 7236 + }, + { + "epoch": 0.39408357327125576, + "grad_norm": 0.6998756911883727, + "learning_rate": 0.0001857194045340959, + "loss": 12.3319, + "step": 7237 + }, + { + "epoch": 0.3941380272678388, + "grad_norm": 0.6148350823215484, + "learning_rate": 0.00018571486285463723, + "loss": 12.5016, + "step": 7238 + }, + { + "epoch": 0.3941924812644218, + "grad_norm": 0.6236150418109734, + "learning_rate": 0.0001857103205086454, + "loss": 12.5312, + "step": 7239 + }, + { + "epoch": 0.3942469352610048, + "grad_norm": 0.6139467227040993, + "learning_rate": 0.00018570577749615577, + "loss": 12.6076, + "step": 7240 + }, + { + "epoch": 0.39430138925758784, + "grad_norm": 0.6040187667043337, + "learning_rate": 0.00018570123381720364, + "loss": 12.4072, + "step": 7241 + }, + { + "epoch": 0.39435584325417083, + "grad_norm": 0.6223642956678106, + "learning_rate": 0.0001856966894718243, + "loss": 12.4661, + "step": 7242 + }, + { + "epoch": 0.3944102972507538, + "grad_norm": 0.659730055040347, + "learning_rate": 0.00018569214446005316, + "loss": 12.3883, + "step": 7243 + }, + { + "epoch": 0.3944647512473369, + "grad_norm": 0.6723695378689196, + "learning_rate": 0.00018568759878192554, + "loss": 12.3468, + "step": 7244 + }, + { + "epoch": 0.39451920524391987, + "grad_norm": 0.6392869809761946, + "learning_rate": 0.00018568305243747677, + "loss": 12.4045, + "step": 7245 + }, + { + "epoch": 0.39457365924050286, + "grad_norm": 0.6810339540317356, + "learning_rate": 0.0001856785054267422, + "loss": 12.5154, + "step": 7246 + }, + { + "epoch": 0.3946281132370859, + "grad_norm": 0.6434168951571108, + "learning_rate": 0.00018567395774975724, + "loss": 12.5282, + "step": 7247 + }, + { + "epoch": 0.3946825672336689, + "grad_norm": 0.6729813495271544, + "learning_rate": 0.0001856694094065572, + "loss": 12.5041, + "step": 7248 + }, + { + "epoch": 0.3947370212302519, + "grad_norm": 0.6876531212401764, + "learning_rate": 0.00018566486039717749, + "loss": 12.6692, + "step": 7249 + }, + { + "epoch": 0.39479147522683494, + "grad_norm": 0.587843823790897, + "learning_rate": 0.0001856603107216534, + "loss": 12.3987, + "step": 7250 + }, + { + "epoch": 0.39484592922341794, + "grad_norm": 0.6931917437986364, + "learning_rate": 0.0001856557603800204, + "loss": 12.3018, + "step": 7251 + }, + { + "epoch": 0.39490038322000093, + "grad_norm": 0.6971045277469835, + "learning_rate": 0.00018565120937231387, + "loss": 12.4893, + "step": 7252 + }, + { + "epoch": 0.394954837216584, + "grad_norm": 0.5889567317818012, + "learning_rate": 0.00018564665769856914, + "loss": 12.4292, + "step": 7253 + }, + { + "epoch": 0.39500929121316697, + "grad_norm": 0.5758889736487871, + "learning_rate": 0.00018564210535882168, + "loss": 12.3916, + "step": 7254 + }, + { + "epoch": 0.39506374520974996, + "grad_norm": 0.6136402012755989, + "learning_rate": 0.00018563755235310677, + "loss": 12.555, + "step": 7255 + }, + { + "epoch": 0.395118199206333, + "grad_norm": 0.6967106844765542, + "learning_rate": 0.00018563299868145996, + "loss": 12.3989, + "step": 7256 + }, + { + "epoch": 0.395172653202916, + "grad_norm": 0.6602235149415349, + "learning_rate": 0.00018562844434391655, + "loss": 12.5221, + "step": 7257 + }, + { + "epoch": 0.39522710719949905, + "grad_norm": 0.5970856829434794, + "learning_rate": 0.00018562388934051204, + "loss": 12.5314, + "step": 7258 + }, + { + "epoch": 0.39528156119608204, + "grad_norm": 0.702231835022106, + "learning_rate": 0.00018561933367128175, + "loss": 12.3386, + "step": 7259 + }, + { + "epoch": 0.39533601519266504, + "grad_norm": 0.6349517268734202, + "learning_rate": 0.0001856147773362612, + "loss": 12.4374, + "step": 7260 + }, + { + "epoch": 0.3953904691892481, + "grad_norm": 0.5998605612793768, + "learning_rate": 0.00018561022033548578, + "loss": 12.5123, + "step": 7261 + }, + { + "epoch": 0.3954449231858311, + "grad_norm": 0.798835307386948, + "learning_rate": 0.0001856056626689909, + "loss": 12.5172, + "step": 7262 + }, + { + "epoch": 0.39549937718241407, + "grad_norm": 0.6122143003084994, + "learning_rate": 0.00018560110433681209, + "loss": 12.4921, + "step": 7263 + }, + { + "epoch": 0.3955538311789971, + "grad_norm": 0.6398409323570206, + "learning_rate": 0.0001855965453389847, + "loss": 12.5301, + "step": 7264 + }, + { + "epoch": 0.3956082851755801, + "grad_norm": 0.6628705007558366, + "learning_rate": 0.00018559198567554423, + "loss": 12.5528, + "step": 7265 + }, + { + "epoch": 0.3956627391721631, + "grad_norm": 0.6082496779379879, + "learning_rate": 0.00018558742534652612, + "loss": 12.4979, + "step": 7266 + }, + { + "epoch": 0.39571719316874615, + "grad_norm": 0.6679068622193955, + "learning_rate": 0.00018558286435196584, + "loss": 12.4034, + "step": 7267 + }, + { + "epoch": 0.39577164716532914, + "grad_norm": 0.6283397647402458, + "learning_rate": 0.00018557830269189885, + "loss": 12.4187, + "step": 7268 + }, + { + "epoch": 0.39582610116191214, + "grad_norm": 0.6170647594859415, + "learning_rate": 0.0001855737403663606, + "loss": 12.4321, + "step": 7269 + }, + { + "epoch": 0.3958805551584952, + "grad_norm": 0.6354956277518662, + "learning_rate": 0.00018556917737538663, + "loss": 12.531, + "step": 7270 + }, + { + "epoch": 0.3959350091550782, + "grad_norm": 0.7097303096725949, + "learning_rate": 0.0001855646137190124, + "loss": 12.6097, + "step": 7271 + }, + { + "epoch": 0.39598946315166117, + "grad_norm": 0.634232155487576, + "learning_rate": 0.00018556004939727333, + "loss": 12.3796, + "step": 7272 + }, + { + "epoch": 0.3960439171482442, + "grad_norm": 0.6053037252161205, + "learning_rate": 0.00018555548441020502, + "loss": 12.4573, + "step": 7273 + }, + { + "epoch": 0.3960983711448272, + "grad_norm": 0.6051827555115559, + "learning_rate": 0.0001855509187578429, + "loss": 12.4087, + "step": 7274 + }, + { + "epoch": 0.3961528251414102, + "grad_norm": 0.5997554655096772, + "learning_rate": 0.00018554635244022246, + "loss": 12.5251, + "step": 7275 + }, + { + "epoch": 0.39620727913799325, + "grad_norm": 0.5727144957820633, + "learning_rate": 0.0001855417854573793, + "loss": 12.3609, + "step": 7276 + }, + { + "epoch": 0.39626173313457624, + "grad_norm": 0.626733753861516, + "learning_rate": 0.00018553721780934884, + "loss": 12.4887, + "step": 7277 + }, + { + "epoch": 0.39631618713115924, + "grad_norm": 0.6508171147743776, + "learning_rate": 0.0001855326494961666, + "loss": 12.4415, + "step": 7278 + }, + { + "epoch": 0.3963706411277423, + "grad_norm": 0.7338951360507143, + "learning_rate": 0.00018552808051786816, + "loss": 12.3495, + "step": 7279 + }, + { + "epoch": 0.3964250951243253, + "grad_norm": 0.6598739504874858, + "learning_rate": 0.00018552351087448903, + "loss": 12.4492, + "step": 7280 + }, + { + "epoch": 0.39647954912090827, + "grad_norm": 0.659297850454176, + "learning_rate": 0.00018551894056606473, + "loss": 12.5406, + "step": 7281 + }, + { + "epoch": 0.3965340031174913, + "grad_norm": 0.7926530178887912, + "learning_rate": 0.0001855143695926308, + "loss": 12.5387, + "step": 7282 + }, + { + "epoch": 0.3965884571140743, + "grad_norm": 0.8547824409637148, + "learning_rate": 0.00018550979795422281, + "loss": 12.4184, + "step": 7283 + }, + { + "epoch": 0.3966429111106573, + "grad_norm": 0.6204336063723833, + "learning_rate": 0.00018550522565087625, + "loss": 12.409, + "step": 7284 + }, + { + "epoch": 0.39669736510724035, + "grad_norm": 0.6544669397684993, + "learning_rate": 0.00018550065268262676, + "loss": 12.4706, + "step": 7285 + }, + { + "epoch": 0.39675181910382334, + "grad_norm": 0.6906323331020744, + "learning_rate": 0.00018549607904950983, + "loss": 12.3418, + "step": 7286 + }, + { + "epoch": 0.39680627310040634, + "grad_norm": 0.6146860200521059, + "learning_rate": 0.00018549150475156108, + "loss": 12.3718, + "step": 7287 + }, + { + "epoch": 0.3968607270969894, + "grad_norm": 0.7514602294337431, + "learning_rate": 0.00018548692978881601, + "loss": 12.6076, + "step": 7288 + }, + { + "epoch": 0.3969151810935724, + "grad_norm": 0.6983633388102153, + "learning_rate": 0.00018548235416131025, + "loss": 12.4947, + "step": 7289 + }, + { + "epoch": 0.39696963509015537, + "grad_norm": 0.7084821502082099, + "learning_rate": 0.0001854777778690794, + "loss": 12.3876, + "step": 7290 + }, + { + "epoch": 0.3970240890867384, + "grad_norm": 0.6327235755021157, + "learning_rate": 0.00018547320091215897, + "loss": 12.4699, + "step": 7291 + }, + { + "epoch": 0.3970785430833214, + "grad_norm": 0.623885800366473, + "learning_rate": 0.00018546862329058464, + "loss": 12.4228, + "step": 7292 + }, + { + "epoch": 0.39713299707990446, + "grad_norm": 0.752140264223175, + "learning_rate": 0.00018546404500439194, + "loss": 12.5489, + "step": 7293 + }, + { + "epoch": 0.39718745107648745, + "grad_norm": 0.6581033958309009, + "learning_rate": 0.0001854594660536165, + "loss": 12.1787, + "step": 7294 + }, + { + "epoch": 0.39724190507307044, + "grad_norm": 0.6692429709274312, + "learning_rate": 0.0001854548864382939, + "loss": 12.5248, + "step": 7295 + }, + { + "epoch": 0.3972963590696535, + "grad_norm": 0.6222330407924297, + "learning_rate": 0.00018545030615845978, + "loss": 12.3521, + "step": 7296 + }, + { + "epoch": 0.3973508130662365, + "grad_norm": 0.7111786469242476, + "learning_rate": 0.00018544572521414976, + "loss": 12.4401, + "step": 7297 + }, + { + "epoch": 0.3974052670628195, + "grad_norm": 0.6415092416534993, + "learning_rate": 0.00018544114360539947, + "loss": 12.3767, + "step": 7298 + }, + { + "epoch": 0.3974597210594025, + "grad_norm": 0.6301328211565949, + "learning_rate": 0.0001854365613322445, + "loss": 12.4496, + "step": 7299 + }, + { + "epoch": 0.3975141750559855, + "grad_norm": 0.5882984345484891, + "learning_rate": 0.00018543197839472047, + "loss": 12.4798, + "step": 7300 + }, + { + "epoch": 0.3975686290525685, + "grad_norm": 0.7185448650080746, + "learning_rate": 0.00018542739479286309, + "loss": 12.6752, + "step": 7301 + }, + { + "epoch": 0.39762308304915156, + "grad_norm": 0.5885969522111261, + "learning_rate": 0.00018542281052670795, + "loss": 12.3972, + "step": 7302 + }, + { + "epoch": 0.39767753704573455, + "grad_norm": 0.6529425986067049, + "learning_rate": 0.00018541822559629072, + "loss": 12.5375, + "step": 7303 + }, + { + "epoch": 0.39773199104231755, + "grad_norm": 0.6470559603557152, + "learning_rate": 0.00018541364000164702, + "loss": 12.4317, + "step": 7304 + }, + { + "epoch": 0.3977864450389006, + "grad_norm": 0.7064940258511272, + "learning_rate": 0.00018540905374281254, + "loss": 12.3257, + "step": 7305 + }, + { + "epoch": 0.3978408990354836, + "grad_norm": 0.7124757808162602, + "learning_rate": 0.00018540446681982294, + "loss": 12.4732, + "step": 7306 + }, + { + "epoch": 0.3978953530320666, + "grad_norm": 0.6063168223396287, + "learning_rate": 0.0001853998792327139, + "loss": 12.4021, + "step": 7307 + }, + { + "epoch": 0.3979498070286496, + "grad_norm": 0.6382347270216853, + "learning_rate": 0.00018539529098152103, + "loss": 12.49, + "step": 7308 + }, + { + "epoch": 0.3980042610252326, + "grad_norm": 0.6361698118939236, + "learning_rate": 0.0001853907020662801, + "loss": 12.4024, + "step": 7309 + }, + { + "epoch": 0.3980587150218156, + "grad_norm": 0.6411437562384148, + "learning_rate": 0.00018538611248702675, + "loss": 12.4099, + "step": 7310 + }, + { + "epoch": 0.39811316901839866, + "grad_norm": 0.7301694611151364, + "learning_rate": 0.00018538152224379666, + "loss": 12.5862, + "step": 7311 + }, + { + "epoch": 0.39816762301498165, + "grad_norm": 0.5686964007656635, + "learning_rate": 0.00018537693133662553, + "loss": 12.3635, + "step": 7312 + }, + { + "epoch": 0.39822207701156465, + "grad_norm": 0.669652844274776, + "learning_rate": 0.00018537233976554906, + "loss": 12.5609, + "step": 7313 + }, + { + "epoch": 0.3982765310081477, + "grad_norm": 0.653851941082085, + "learning_rate": 0.00018536774753060299, + "loss": 12.4567, + "step": 7314 + }, + { + "epoch": 0.3983309850047307, + "grad_norm": 0.6460005068343455, + "learning_rate": 0.00018536315463182294, + "loss": 12.498, + "step": 7315 + }, + { + "epoch": 0.3983854390013137, + "grad_norm": 0.6315188681360169, + "learning_rate": 0.00018535856106924472, + "loss": 12.3967, + "step": 7316 + }, + { + "epoch": 0.3984398929978967, + "grad_norm": 0.7126080248284609, + "learning_rate": 0.00018535396684290402, + "loss": 12.4171, + "step": 7317 + }, + { + "epoch": 0.3984943469944797, + "grad_norm": 0.6610001603463862, + "learning_rate": 0.00018534937195283658, + "loss": 12.3529, + "step": 7318 + }, + { + "epoch": 0.3985488009910627, + "grad_norm": 0.622858169622087, + "learning_rate": 0.00018534477639907805, + "loss": 12.4491, + "step": 7319 + }, + { + "epoch": 0.39860325498764576, + "grad_norm": 0.7276069110047648, + "learning_rate": 0.00018534018018166428, + "loss": 12.4966, + "step": 7320 + }, + { + "epoch": 0.39865770898422875, + "grad_norm": 0.6699895849428359, + "learning_rate": 0.00018533558330063095, + "loss": 12.5369, + "step": 7321 + }, + { + "epoch": 0.39871216298081175, + "grad_norm": 0.7054366604545919, + "learning_rate": 0.0001853309857560138, + "loss": 12.4876, + "step": 7322 + }, + { + "epoch": 0.3987666169773948, + "grad_norm": 0.6194120015384266, + "learning_rate": 0.00018532638754784858, + "loss": 12.4925, + "step": 7323 + }, + { + "epoch": 0.3988210709739778, + "grad_norm": 0.6784120549830108, + "learning_rate": 0.00018532178867617107, + "loss": 12.4642, + "step": 7324 + }, + { + "epoch": 0.39887552497056084, + "grad_norm": 0.619911638459784, + "learning_rate": 0.00018531718914101703, + "loss": 12.5518, + "step": 7325 + }, + { + "epoch": 0.39892997896714383, + "grad_norm": 0.5986202288989504, + "learning_rate": 0.00018531258894242223, + "loss": 12.4409, + "step": 7326 + }, + { + "epoch": 0.3989844329637268, + "grad_norm": 0.6360512448361856, + "learning_rate": 0.0001853079880804224, + "loss": 12.478, + "step": 7327 + }, + { + "epoch": 0.39903888696030987, + "grad_norm": 0.6720223179345861, + "learning_rate": 0.0001853033865550534, + "loss": 12.4442, + "step": 7328 + }, + { + "epoch": 0.39909334095689286, + "grad_norm": 0.6887841681858036, + "learning_rate": 0.0001852987843663509, + "loss": 12.5266, + "step": 7329 + }, + { + "epoch": 0.39914779495347585, + "grad_norm": 0.6453691945791873, + "learning_rate": 0.0001852941815143508, + "loss": 12.4266, + "step": 7330 + }, + { + "epoch": 0.3992022489500589, + "grad_norm": 0.6340510744267103, + "learning_rate": 0.00018528957799908882, + "loss": 12.5852, + "step": 7331 + }, + { + "epoch": 0.3992567029466419, + "grad_norm": 0.6540445359669609, + "learning_rate": 0.00018528497382060076, + "loss": 12.5478, + "step": 7332 + }, + { + "epoch": 0.3993111569432249, + "grad_norm": 0.6299686222054321, + "learning_rate": 0.00018528036897892246, + "loss": 12.5436, + "step": 7333 + }, + { + "epoch": 0.39936561093980794, + "grad_norm": 0.6467089196451699, + "learning_rate": 0.0001852757634740897, + "loss": 12.4378, + "step": 7334 + }, + { + "epoch": 0.39942006493639093, + "grad_norm": 0.6326128820218682, + "learning_rate": 0.0001852711573061383, + "loss": 12.5551, + "step": 7335 + }, + { + "epoch": 0.3994745189329739, + "grad_norm": 0.6174531091360957, + "learning_rate": 0.0001852665504751041, + "loss": 12.4177, + "step": 7336 + }, + { + "epoch": 0.39952897292955697, + "grad_norm": 0.621876904130212, + "learning_rate": 0.0001852619429810229, + "loss": 12.3719, + "step": 7337 + }, + { + "epoch": 0.39958342692613996, + "grad_norm": 0.6295489899432883, + "learning_rate": 0.00018525733482393055, + "loss": 12.4287, + "step": 7338 + }, + { + "epoch": 0.39963788092272295, + "grad_norm": 0.6920851383728188, + "learning_rate": 0.00018525272600386283, + "loss": 12.5537, + "step": 7339 + }, + { + "epoch": 0.399692334919306, + "grad_norm": 0.5565562726035974, + "learning_rate": 0.00018524811652085563, + "loss": 12.2755, + "step": 7340 + }, + { + "epoch": 0.399746788915889, + "grad_norm": 0.7016828793560161, + "learning_rate": 0.0001852435063749448, + "loss": 12.4688, + "step": 7341 + }, + { + "epoch": 0.399801242912472, + "grad_norm": 0.682530812325876, + "learning_rate": 0.00018523889556616612, + "loss": 12.5191, + "step": 7342 + }, + { + "epoch": 0.39985569690905504, + "grad_norm": 0.6143786563448566, + "learning_rate": 0.00018523428409455555, + "loss": 12.4387, + "step": 7343 + }, + { + "epoch": 0.39991015090563803, + "grad_norm": 0.5801256496016015, + "learning_rate": 0.00018522967196014887, + "loss": 12.4552, + "step": 7344 + }, + { + "epoch": 0.399964604902221, + "grad_norm": 0.6826321396517108, + "learning_rate": 0.00018522505916298196, + "loss": 12.5167, + "step": 7345 + }, + { + "epoch": 0.40001905889880407, + "grad_norm": 0.6173642642335142, + "learning_rate": 0.0001852204457030907, + "loss": 12.3756, + "step": 7346 + }, + { + "epoch": 0.40007351289538706, + "grad_norm": 0.7812678314690079, + "learning_rate": 0.00018521583158051093, + "loss": 12.5332, + "step": 7347 + }, + { + "epoch": 0.40012796689197005, + "grad_norm": 0.6466449436057614, + "learning_rate": 0.00018521121679527865, + "loss": 12.4713, + "step": 7348 + }, + { + "epoch": 0.4001824208885531, + "grad_norm": 0.6781616505750094, + "learning_rate": 0.00018520660134742958, + "loss": 12.5313, + "step": 7349 + }, + { + "epoch": 0.4002368748851361, + "grad_norm": 0.6356853248099988, + "learning_rate": 0.00018520198523699972, + "loss": 12.5831, + "step": 7350 + }, + { + "epoch": 0.4002913288817191, + "grad_norm": 0.6273074559411165, + "learning_rate": 0.00018519736846402493, + "loss": 12.5613, + "step": 7351 + }, + { + "epoch": 0.40034578287830214, + "grad_norm": 0.7374084228928206, + "learning_rate": 0.00018519275102854113, + "loss": 12.6294, + "step": 7352 + }, + { + "epoch": 0.40040023687488513, + "grad_norm": 0.6254505452815294, + "learning_rate": 0.00018518813293058419, + "loss": 12.5451, + "step": 7353 + }, + { + "epoch": 0.4004546908714681, + "grad_norm": 0.6133781391891753, + "learning_rate": 0.00018518351417019005, + "loss": 12.3828, + "step": 7354 + }, + { + "epoch": 0.40050914486805117, + "grad_norm": 0.6191631938029492, + "learning_rate": 0.0001851788947473946, + "loss": 12.5555, + "step": 7355 + }, + { + "epoch": 0.40056359886463416, + "grad_norm": 0.6316920146031418, + "learning_rate": 0.0001851742746622338, + "loss": 12.35, + "step": 7356 + }, + { + "epoch": 0.40061805286121716, + "grad_norm": 0.6218804069826839, + "learning_rate": 0.00018516965391474354, + "loss": 12.498, + "step": 7357 + }, + { + "epoch": 0.4006725068578002, + "grad_norm": 0.6234070247722966, + "learning_rate": 0.0001851650325049598, + "loss": 12.5307, + "step": 7358 + }, + { + "epoch": 0.4007269608543832, + "grad_norm": 0.6824202368507671, + "learning_rate": 0.00018516041043291844, + "loss": 12.5668, + "step": 7359 + }, + { + "epoch": 0.40078141485096624, + "grad_norm": 0.6674063532644566, + "learning_rate": 0.0001851557876986555, + "loss": 12.4694, + "step": 7360 + }, + { + "epoch": 0.40083586884754924, + "grad_norm": 0.7075034857406978, + "learning_rate": 0.00018515116430220684, + "loss": 12.5614, + "step": 7361 + }, + { + "epoch": 0.40089032284413223, + "grad_norm": 0.6642130474985378, + "learning_rate": 0.00018514654024360847, + "loss": 12.4663, + "step": 7362 + }, + { + "epoch": 0.4009447768407153, + "grad_norm": 0.7743771292825637, + "learning_rate": 0.0001851419155228963, + "loss": 12.6379, + "step": 7363 + }, + { + "epoch": 0.40099923083729827, + "grad_norm": 0.7207920923280146, + "learning_rate": 0.00018513729014010632, + "loss": 12.4966, + "step": 7364 + }, + { + "epoch": 0.40105368483388126, + "grad_norm": 0.6934995219290194, + "learning_rate": 0.0001851326640952745, + "loss": 12.4124, + "step": 7365 + }, + { + "epoch": 0.4011081388304643, + "grad_norm": 0.6028082259589631, + "learning_rate": 0.0001851280373884368, + "loss": 12.3458, + "step": 7366 + }, + { + "epoch": 0.4011625928270473, + "grad_norm": 0.6905902445618471, + "learning_rate": 0.0001851234100196292, + "loss": 12.3137, + "step": 7367 + }, + { + "epoch": 0.4012170468236303, + "grad_norm": 0.73889018111692, + "learning_rate": 0.0001851187819888877, + "loss": 12.4944, + "step": 7368 + }, + { + "epoch": 0.40127150082021334, + "grad_norm": 0.6805249740030216, + "learning_rate": 0.00018511415329624828, + "loss": 12.51, + "step": 7369 + }, + { + "epoch": 0.40132595481679634, + "grad_norm": 0.6078115817769721, + "learning_rate": 0.00018510952394174695, + "loss": 12.3567, + "step": 7370 + }, + { + "epoch": 0.40138040881337933, + "grad_norm": 0.655832926013966, + "learning_rate": 0.00018510489392541964, + "loss": 12.421, + "step": 7371 + }, + { + "epoch": 0.4014348628099624, + "grad_norm": 0.6681102398311158, + "learning_rate": 0.00018510026324730246, + "loss": 12.5392, + "step": 7372 + }, + { + "epoch": 0.40148931680654537, + "grad_norm": 0.6801679482138852, + "learning_rate": 0.0001850956319074313, + "loss": 12.5106, + "step": 7373 + }, + { + "epoch": 0.40154377080312836, + "grad_norm": 0.6802545273425757, + "learning_rate": 0.00018509099990584227, + "loss": 12.3287, + "step": 7374 + }, + { + "epoch": 0.4015982247997114, + "grad_norm": 0.6294918505790813, + "learning_rate": 0.00018508636724257136, + "loss": 12.4195, + "step": 7375 + }, + { + "epoch": 0.4016526787962944, + "grad_norm": 0.7557878557287484, + "learning_rate": 0.00018508173391765457, + "loss": 12.4527, + "step": 7376 + }, + { + "epoch": 0.4017071327928774, + "grad_norm": 0.5810675161036324, + "learning_rate": 0.00018507709993112795, + "loss": 12.4873, + "step": 7377 + }, + { + "epoch": 0.40176158678946045, + "grad_norm": 0.7014067302253529, + "learning_rate": 0.00018507246528302757, + "loss": 12.5783, + "step": 7378 + }, + { + "epoch": 0.40181604078604344, + "grad_norm": 0.7097681235618133, + "learning_rate": 0.00018506782997338938, + "loss": 12.5405, + "step": 7379 + }, + { + "epoch": 0.40187049478262643, + "grad_norm": 0.7227925006604735, + "learning_rate": 0.00018506319400224953, + "loss": 12.5748, + "step": 7380 + }, + { + "epoch": 0.4019249487792095, + "grad_norm": 0.6721929918043782, + "learning_rate": 0.000185058557369644, + "loss": 12.5959, + "step": 7381 + }, + { + "epoch": 0.40197940277579247, + "grad_norm": 0.6215806863778806, + "learning_rate": 0.00018505392007560882, + "loss": 12.6075, + "step": 7382 + }, + { + "epoch": 0.40203385677237546, + "grad_norm": 0.6407617755681561, + "learning_rate": 0.00018504928212018015, + "loss": 12.5772, + "step": 7383 + }, + { + "epoch": 0.4020883107689585, + "grad_norm": 0.6070598483952332, + "learning_rate": 0.00018504464350339398, + "loss": 12.4512, + "step": 7384 + }, + { + "epoch": 0.4021427647655415, + "grad_norm": 0.715882401990176, + "learning_rate": 0.0001850400042252864, + "loss": 12.6478, + "step": 7385 + }, + { + "epoch": 0.4021972187621245, + "grad_norm": 0.7149775663341427, + "learning_rate": 0.00018503536428589348, + "loss": 12.6278, + "step": 7386 + }, + { + "epoch": 0.40225167275870755, + "grad_norm": 0.618326261294556, + "learning_rate": 0.00018503072368525133, + "loss": 12.4231, + "step": 7387 + }, + { + "epoch": 0.40230612675529054, + "grad_norm": 0.6969232561838311, + "learning_rate": 0.000185026082423396, + "loss": 12.5187, + "step": 7388 + }, + { + "epoch": 0.40236058075187353, + "grad_norm": 0.6183554586340254, + "learning_rate": 0.00018502144050036356, + "loss": 12.6005, + "step": 7389 + }, + { + "epoch": 0.4024150347484566, + "grad_norm": 0.5869917083721967, + "learning_rate": 0.00018501679791619018, + "loss": 12.3899, + "step": 7390 + }, + { + "epoch": 0.40246948874503957, + "grad_norm": 0.6950697425232363, + "learning_rate": 0.0001850121546709119, + "loss": 12.5312, + "step": 7391 + }, + { + "epoch": 0.4025239427416226, + "grad_norm": 0.666716020931147, + "learning_rate": 0.0001850075107645649, + "loss": 12.5483, + "step": 7392 + }, + { + "epoch": 0.4025783967382056, + "grad_norm": 0.6547101411207449, + "learning_rate": 0.00018500286619718516, + "loss": 12.432, + "step": 7393 + }, + { + "epoch": 0.4026328507347886, + "grad_norm": 0.598399101548866, + "learning_rate": 0.00018499822096880894, + "loss": 12.3849, + "step": 7394 + }, + { + "epoch": 0.40268730473137165, + "grad_norm": 0.6814945968918992, + "learning_rate": 0.0001849935750794723, + "loss": 12.3536, + "step": 7395 + }, + { + "epoch": 0.40274175872795465, + "grad_norm": 0.6626293967806715, + "learning_rate": 0.00018498892852921134, + "loss": 12.4122, + "step": 7396 + }, + { + "epoch": 0.40279621272453764, + "grad_norm": 0.6900476293869696, + "learning_rate": 0.0001849842813180622, + "loss": 12.55, + "step": 7397 + }, + { + "epoch": 0.4028506667211207, + "grad_norm": 0.6745622136773487, + "learning_rate": 0.00018497963344606106, + "loss": 12.4947, + "step": 7398 + }, + { + "epoch": 0.4029051207177037, + "grad_norm": 0.7267336187426208, + "learning_rate": 0.00018497498491324406, + "loss": 12.5677, + "step": 7399 + }, + { + "epoch": 0.4029595747142867, + "grad_norm": 0.6591495967434877, + "learning_rate": 0.00018497033571964727, + "loss": 12.5507, + "step": 7400 + }, + { + "epoch": 0.4030140287108697, + "grad_norm": 0.6366391704310337, + "learning_rate": 0.00018496568586530695, + "loss": 12.5178, + "step": 7401 + }, + { + "epoch": 0.4030684827074527, + "grad_norm": 0.6943015623607408, + "learning_rate": 0.00018496103535025918, + "loss": 12.4766, + "step": 7402 + }, + { + "epoch": 0.4031229367040357, + "grad_norm": 0.6496156865013774, + "learning_rate": 0.00018495638417454017, + "loss": 12.3915, + "step": 7403 + }, + { + "epoch": 0.40317739070061875, + "grad_norm": 0.7010064864414909, + "learning_rate": 0.0001849517323381861, + "loss": 12.4634, + "step": 7404 + }, + { + "epoch": 0.40323184469720175, + "grad_norm": 0.6761556369951273, + "learning_rate": 0.00018494707984123307, + "loss": 12.3944, + "step": 7405 + }, + { + "epoch": 0.40328629869378474, + "grad_norm": 0.6087198016655698, + "learning_rate": 0.0001849424266837173, + "loss": 12.4833, + "step": 7406 + }, + { + "epoch": 0.4033407526903678, + "grad_norm": 0.7169273106444296, + "learning_rate": 0.00018493777286567498, + "loss": 12.5492, + "step": 7407 + }, + { + "epoch": 0.4033952066869508, + "grad_norm": 0.6440946600719528, + "learning_rate": 0.00018493311838714232, + "loss": 12.4559, + "step": 7408 + }, + { + "epoch": 0.4034496606835338, + "grad_norm": 0.6450930034300398, + "learning_rate": 0.00018492846324815547, + "loss": 12.6247, + "step": 7409 + }, + { + "epoch": 0.4035041146801168, + "grad_norm": 0.6440379111155046, + "learning_rate": 0.0001849238074487506, + "loss": 12.4345, + "step": 7410 + }, + { + "epoch": 0.4035585686766998, + "grad_norm": 0.7276183434116561, + "learning_rate": 0.00018491915098896403, + "loss": 12.4772, + "step": 7411 + }, + { + "epoch": 0.4036130226732828, + "grad_norm": 0.6384627542574353, + "learning_rate": 0.0001849144938688319, + "loss": 12.3952, + "step": 7412 + }, + { + "epoch": 0.40366747666986585, + "grad_norm": 0.6931831770467287, + "learning_rate": 0.0001849098360883904, + "loss": 12.3868, + "step": 7413 + }, + { + "epoch": 0.40372193066644885, + "grad_norm": 0.6550735530488863, + "learning_rate": 0.00018490517764767578, + "loss": 12.5256, + "step": 7414 + }, + { + "epoch": 0.40377638466303184, + "grad_norm": 0.743756648354596, + "learning_rate": 0.00018490051854672424, + "loss": 12.4004, + "step": 7415 + }, + { + "epoch": 0.4038308386596149, + "grad_norm": 0.6575060670971092, + "learning_rate": 0.00018489585878557206, + "loss": 12.5674, + "step": 7416 + }, + { + "epoch": 0.4038852926561979, + "grad_norm": 0.6628270930000479, + "learning_rate": 0.00018489119836425543, + "loss": 12.3725, + "step": 7417 + }, + { + "epoch": 0.4039397466527809, + "grad_norm": 0.6239260359465777, + "learning_rate": 0.0001848865372828106, + "loss": 12.3349, + "step": 7418 + }, + { + "epoch": 0.4039942006493639, + "grad_norm": 0.6977175489948938, + "learning_rate": 0.00018488187554127383, + "loss": 12.545, + "step": 7419 + }, + { + "epoch": 0.4040486546459469, + "grad_norm": 0.7582230177928753, + "learning_rate": 0.00018487721313968137, + "loss": 12.348, + "step": 7420 + }, + { + "epoch": 0.4041031086425299, + "grad_norm": 0.6080224584126379, + "learning_rate": 0.00018487255007806945, + "loss": 12.5413, + "step": 7421 + }, + { + "epoch": 0.40415756263911295, + "grad_norm": 0.6923559615321203, + "learning_rate": 0.00018486788635647435, + "loss": 12.5759, + "step": 7422 + }, + { + "epoch": 0.40421201663569595, + "grad_norm": 0.6912263414510522, + "learning_rate": 0.00018486322197493234, + "loss": 12.4712, + "step": 7423 + }, + { + "epoch": 0.40426647063227894, + "grad_norm": 0.6863212712827147, + "learning_rate": 0.00018485855693347968, + "loss": 12.4547, + "step": 7424 + }, + { + "epoch": 0.404320924628862, + "grad_norm": 0.6403074472885916, + "learning_rate": 0.00018485389123215265, + "loss": 12.534, + "step": 7425 + }, + { + "epoch": 0.404375378625445, + "grad_norm": 0.6368847513628307, + "learning_rate": 0.00018484922487098753, + "loss": 12.3169, + "step": 7426 + }, + { + "epoch": 0.40442983262202803, + "grad_norm": 0.6936139800096027, + "learning_rate": 0.00018484455785002063, + "loss": 12.4682, + "step": 7427 + }, + { + "epoch": 0.404484286618611, + "grad_norm": 0.6983739029881817, + "learning_rate": 0.00018483989016928817, + "loss": 12.564, + "step": 7428 + }, + { + "epoch": 0.404538740615194, + "grad_norm": 0.6408473930715737, + "learning_rate": 0.00018483522182882655, + "loss": 12.4528, + "step": 7429 + }, + { + "epoch": 0.40459319461177706, + "grad_norm": 0.9015349932579146, + "learning_rate": 0.000184830552828672, + "loss": 12.5665, + "step": 7430 + }, + { + "epoch": 0.40464764860836006, + "grad_norm": 0.6763429151139749, + "learning_rate": 0.00018482588316886083, + "loss": 12.5239, + "step": 7431 + }, + { + "epoch": 0.40470210260494305, + "grad_norm": 0.6513052832654872, + "learning_rate": 0.0001848212128494294, + "loss": 12.3781, + "step": 7432 + }, + { + "epoch": 0.4047565566015261, + "grad_norm": 0.6240966308784069, + "learning_rate": 0.00018481654187041396, + "loss": 12.4837, + "step": 7433 + }, + { + "epoch": 0.4048110105981091, + "grad_norm": 0.7165993138055703, + "learning_rate": 0.00018481187023185086, + "loss": 12.5838, + "step": 7434 + }, + { + "epoch": 0.4048654645946921, + "grad_norm": 0.5818743862161894, + "learning_rate": 0.0001848071979337765, + "loss": 12.3615, + "step": 7435 + }, + { + "epoch": 0.40491991859127513, + "grad_norm": 0.6636559493509343, + "learning_rate": 0.00018480252497622706, + "loss": 12.505, + "step": 7436 + }, + { + "epoch": 0.4049743725878581, + "grad_norm": 0.646612671491056, + "learning_rate": 0.00018479785135923905, + "loss": 12.5242, + "step": 7437 + }, + { + "epoch": 0.4050288265844411, + "grad_norm": 0.6143601759897361, + "learning_rate": 0.00018479317708284865, + "loss": 12.2706, + "step": 7438 + }, + { + "epoch": 0.40508328058102416, + "grad_norm": 0.7183560522499322, + "learning_rate": 0.00018478850214709232, + "loss": 12.3756, + "step": 7439 + }, + { + "epoch": 0.40513773457760716, + "grad_norm": 0.6479854547071566, + "learning_rate": 0.00018478382655200636, + "loss": 12.4938, + "step": 7440 + }, + { + "epoch": 0.40519218857419015, + "grad_norm": 0.8079384121207523, + "learning_rate": 0.00018477915029762717, + "loss": 12.2812, + "step": 7441 + }, + { + "epoch": 0.4052466425707732, + "grad_norm": 0.6780665302072778, + "learning_rate": 0.00018477447338399107, + "loss": 12.5561, + "step": 7442 + }, + { + "epoch": 0.4053010965673562, + "grad_norm": 0.6485134302769748, + "learning_rate": 0.00018476979581113449, + "loss": 12.3967, + "step": 7443 + }, + { + "epoch": 0.4053555505639392, + "grad_norm": 0.646095910126911, + "learning_rate": 0.00018476511757909374, + "loss": 12.5835, + "step": 7444 + }, + { + "epoch": 0.40541000456052223, + "grad_norm": 0.5917501713499067, + "learning_rate": 0.0001847604386879052, + "loss": 12.4826, + "step": 7445 + }, + { + "epoch": 0.4054644585571052, + "grad_norm": 0.6986606862332128, + "learning_rate": 0.00018475575913760528, + "loss": 12.429, + "step": 7446 + }, + { + "epoch": 0.4055189125536882, + "grad_norm": 0.6715650695644763, + "learning_rate": 0.00018475107892823039, + "loss": 12.5835, + "step": 7447 + }, + { + "epoch": 0.40557336655027126, + "grad_norm": 0.6512536707903471, + "learning_rate": 0.00018474639805981686, + "loss": 12.4181, + "step": 7448 + }, + { + "epoch": 0.40562782054685426, + "grad_norm": 0.6601484346024479, + "learning_rate": 0.00018474171653240116, + "loss": 12.4515, + "step": 7449 + }, + { + "epoch": 0.40568227454343725, + "grad_norm": 0.7259142491346082, + "learning_rate": 0.00018473703434601963, + "loss": 12.6206, + "step": 7450 + }, + { + "epoch": 0.4057367285400203, + "grad_norm": 0.7088826468538713, + "learning_rate": 0.00018473235150070873, + "loss": 12.4792, + "step": 7451 + }, + { + "epoch": 0.4057911825366033, + "grad_norm": 0.6843809362146842, + "learning_rate": 0.00018472766799650485, + "loss": 12.542, + "step": 7452 + }, + { + "epoch": 0.4058456365331863, + "grad_norm": 0.6461760765926458, + "learning_rate": 0.0001847229838334444, + "loss": 12.4608, + "step": 7453 + }, + { + "epoch": 0.40590009052976933, + "grad_norm": 0.6445512760636828, + "learning_rate": 0.00018471829901156386, + "loss": 12.592, + "step": 7454 + }, + { + "epoch": 0.4059545445263523, + "grad_norm": 0.6575265356329029, + "learning_rate": 0.0001847136135308996, + "loss": 12.4246, + "step": 7455 + }, + { + "epoch": 0.4060089985229353, + "grad_norm": 0.6039633442973562, + "learning_rate": 0.00018470892739148807, + "loss": 12.4342, + "step": 7456 + }, + { + "epoch": 0.40606345251951836, + "grad_norm": 0.7255842086748665, + "learning_rate": 0.0001847042405933657, + "loss": 12.5407, + "step": 7457 + }, + { + "epoch": 0.40611790651610136, + "grad_norm": 0.7658324790055303, + "learning_rate": 0.000184699553136569, + "loss": 12.4578, + "step": 7458 + }, + { + "epoch": 0.4061723605126844, + "grad_norm": 0.6736769880323731, + "learning_rate": 0.00018469486502113432, + "loss": 12.4223, + "step": 7459 + }, + { + "epoch": 0.4062268145092674, + "grad_norm": 0.646846684475632, + "learning_rate": 0.00018469017624709818, + "loss": 12.4377, + "step": 7460 + }, + { + "epoch": 0.4062812685058504, + "grad_norm": 0.643852904553272, + "learning_rate": 0.00018468548681449702, + "loss": 12.4088, + "step": 7461 + }, + { + "epoch": 0.40633572250243344, + "grad_norm": 0.6213086606944596, + "learning_rate": 0.00018468079672336732, + "loss": 12.474, + "step": 7462 + }, + { + "epoch": 0.40639017649901643, + "grad_norm": 0.7137248569844133, + "learning_rate": 0.00018467610597374553, + "loss": 12.5347, + "step": 7463 + }, + { + "epoch": 0.4064446304955994, + "grad_norm": 0.6181355049536756, + "learning_rate": 0.0001846714145656682, + "loss": 12.4671, + "step": 7464 + }, + { + "epoch": 0.40649908449218247, + "grad_norm": 0.7793233068249615, + "learning_rate": 0.0001846667224991717, + "loss": 12.5739, + "step": 7465 + }, + { + "epoch": 0.40655353848876546, + "grad_norm": 0.6253303433337799, + "learning_rate": 0.00018466202977429256, + "loss": 12.4636, + "step": 7466 + }, + { + "epoch": 0.40660799248534846, + "grad_norm": 0.6846509132803774, + "learning_rate": 0.00018465733639106728, + "loss": 12.4586, + "step": 7467 + }, + { + "epoch": 0.4066624464819315, + "grad_norm": 0.604509255157627, + "learning_rate": 0.00018465264234953236, + "loss": 12.5897, + "step": 7468 + }, + { + "epoch": 0.4067169004785145, + "grad_norm": 0.6414263010065017, + "learning_rate": 0.00018464794764972434, + "loss": 12.5562, + "step": 7469 + }, + { + "epoch": 0.4067713544750975, + "grad_norm": 0.6395195367346062, + "learning_rate": 0.00018464325229167961, + "loss": 12.5577, + "step": 7470 + }, + { + "epoch": 0.40682580847168054, + "grad_norm": 0.8378691952895028, + "learning_rate": 0.00018463855627543483, + "loss": 12.5377, + "step": 7471 + }, + { + "epoch": 0.40688026246826353, + "grad_norm": 0.5838339172660505, + "learning_rate": 0.0001846338596010264, + "loss": 12.4138, + "step": 7472 + }, + { + "epoch": 0.4069347164648465, + "grad_norm": 0.7445508194750182, + "learning_rate": 0.0001846291622684909, + "loss": 12.5826, + "step": 7473 + }, + { + "epoch": 0.4069891704614296, + "grad_norm": 0.7064093062710597, + "learning_rate": 0.0001846244642778648, + "loss": 12.525, + "step": 7474 + }, + { + "epoch": 0.40704362445801257, + "grad_norm": 0.6330305455995636, + "learning_rate": 0.00018461976562918471, + "loss": 12.471, + "step": 7475 + }, + { + "epoch": 0.40709807845459556, + "grad_norm": 0.6030418041469815, + "learning_rate": 0.00018461506632248714, + "loss": 12.4623, + "step": 7476 + }, + { + "epoch": 0.4071525324511786, + "grad_norm": 0.7572696367663821, + "learning_rate": 0.00018461036635780863, + "loss": 12.4131, + "step": 7477 + }, + { + "epoch": 0.4072069864477616, + "grad_norm": 0.7001598074872516, + "learning_rate": 0.0001846056657351857, + "loss": 12.4616, + "step": 7478 + }, + { + "epoch": 0.4072614404443446, + "grad_norm": 0.6940485999902011, + "learning_rate": 0.0001846009644546549, + "loss": 12.4128, + "step": 7479 + }, + { + "epoch": 0.40731589444092764, + "grad_norm": 0.70654138952586, + "learning_rate": 0.00018459626251625286, + "loss": 12.4186, + "step": 7480 + }, + { + "epoch": 0.40737034843751063, + "grad_norm": 0.923957097435168, + "learning_rate": 0.0001845915599200161, + "loss": 12.4704, + "step": 7481 + }, + { + "epoch": 0.4074248024340936, + "grad_norm": 0.787704032185711, + "learning_rate": 0.00018458685666598114, + "loss": 12.5002, + "step": 7482 + }, + { + "epoch": 0.4074792564306767, + "grad_norm": 0.8237434142055088, + "learning_rate": 0.00018458215275418463, + "loss": 12.6203, + "step": 7483 + }, + { + "epoch": 0.40753371042725967, + "grad_norm": 0.7488862790251631, + "learning_rate": 0.0001845774481846631, + "loss": 12.3982, + "step": 7484 + }, + { + "epoch": 0.40758816442384266, + "grad_norm": 0.7451931572702298, + "learning_rate": 0.00018457274295745316, + "loss": 12.5894, + "step": 7485 + }, + { + "epoch": 0.4076426184204257, + "grad_norm": 0.6763644886168447, + "learning_rate": 0.0001845680370725914, + "loss": 12.4027, + "step": 7486 + }, + { + "epoch": 0.4076970724170087, + "grad_norm": 0.8090062704601079, + "learning_rate": 0.00018456333053011437, + "loss": 12.3973, + "step": 7487 + }, + { + "epoch": 0.4077515264135917, + "grad_norm": 0.6766492711891675, + "learning_rate": 0.00018455862333005872, + "loss": 12.5136, + "step": 7488 + }, + { + "epoch": 0.40780598041017474, + "grad_norm": 0.7354799773717808, + "learning_rate": 0.000184553915472461, + "loss": 12.5156, + "step": 7489 + }, + { + "epoch": 0.40786043440675773, + "grad_norm": 0.7358787223055255, + "learning_rate": 0.0001845492069573579, + "loss": 12.5716, + "step": 7490 + }, + { + "epoch": 0.4079148884033407, + "grad_norm": 0.7940397944309273, + "learning_rate": 0.00018454449778478597, + "loss": 12.6003, + "step": 7491 + }, + { + "epoch": 0.4079693423999238, + "grad_norm": 0.7276279277087789, + "learning_rate": 0.00018453978795478183, + "loss": 12.5072, + "step": 7492 + }, + { + "epoch": 0.40802379639650677, + "grad_norm": 0.65622616537559, + "learning_rate": 0.00018453507746738217, + "loss": 12.4548, + "step": 7493 + }, + { + "epoch": 0.4080782503930898, + "grad_norm": 0.6544295804803718, + "learning_rate": 0.00018453036632262352, + "loss": 12.5034, + "step": 7494 + }, + { + "epoch": 0.4081327043896728, + "grad_norm": 0.6667584813889628, + "learning_rate": 0.0001845256545205426, + "loss": 12.3758, + "step": 7495 + }, + { + "epoch": 0.4081871583862558, + "grad_norm": 0.7293652650925002, + "learning_rate": 0.000184520942061176, + "loss": 12.4498, + "step": 7496 + }, + { + "epoch": 0.40824161238283885, + "grad_norm": 0.6768095263738398, + "learning_rate": 0.00018451622894456038, + "loss": 12.4563, + "step": 7497 + }, + { + "epoch": 0.40829606637942184, + "grad_norm": 0.7193329277422399, + "learning_rate": 0.0001845115151707324, + "loss": 12.6072, + "step": 7498 + }, + { + "epoch": 0.40835052037600483, + "grad_norm": 0.6847329382109661, + "learning_rate": 0.00018450680073972867, + "loss": 12.4992, + "step": 7499 + }, + { + "epoch": 0.4084049743725879, + "grad_norm": 0.6414975022868673, + "learning_rate": 0.00018450208565158594, + "loss": 12.4932, + "step": 7500 + }, + { + "epoch": 0.4084594283691709, + "grad_norm": 0.6553741533966808, + "learning_rate": 0.0001844973699063408, + "loss": 12.4239, + "step": 7501 + }, + { + "epoch": 0.40851388236575387, + "grad_norm": 0.6885331621744704, + "learning_rate": 0.00018449265350402994, + "loss": 12.4297, + "step": 7502 + }, + { + "epoch": 0.4085683363623369, + "grad_norm": 0.6928052080715973, + "learning_rate": 0.00018448793644469002, + "loss": 12.485, + "step": 7503 + }, + { + "epoch": 0.4086227903589199, + "grad_norm": 0.6302817729042274, + "learning_rate": 0.00018448321872835773, + "loss": 12.4015, + "step": 7504 + }, + { + "epoch": 0.4086772443555029, + "grad_norm": 0.7513722166710587, + "learning_rate": 0.0001844785003550698, + "loss": 12.3403, + "step": 7505 + }, + { + "epoch": 0.40873169835208595, + "grad_norm": 0.6321223134835887, + "learning_rate": 0.00018447378132486288, + "loss": 12.3748, + "step": 7506 + }, + { + "epoch": 0.40878615234866894, + "grad_norm": 0.7182095663938053, + "learning_rate": 0.00018446906163777365, + "loss": 12.5056, + "step": 7507 + }, + { + "epoch": 0.40884060634525193, + "grad_norm": 0.7045543146464835, + "learning_rate": 0.00018446434129383885, + "loss": 12.435, + "step": 7508 + }, + { + "epoch": 0.408895060341835, + "grad_norm": 0.6874610053789847, + "learning_rate": 0.00018445962029309514, + "loss": 12.548, + "step": 7509 + }, + { + "epoch": 0.408949514338418, + "grad_norm": 0.6395404723303102, + "learning_rate": 0.00018445489863557927, + "loss": 12.5048, + "step": 7510 + }, + { + "epoch": 0.40900396833500097, + "grad_norm": 0.6515443035408928, + "learning_rate": 0.00018445017632132794, + "loss": 12.407, + "step": 7511 + }, + { + "epoch": 0.409058422331584, + "grad_norm": 0.6420344734182722, + "learning_rate": 0.0001844454533503779, + "loss": 12.4882, + "step": 7512 + }, + { + "epoch": 0.409112876328167, + "grad_norm": 0.6720582995151158, + "learning_rate": 0.00018444072972276584, + "loss": 12.7021, + "step": 7513 + }, + { + "epoch": 0.40916733032475, + "grad_norm": 0.6124770996425045, + "learning_rate": 0.00018443600543852851, + "loss": 12.3944, + "step": 7514 + }, + { + "epoch": 0.40922178432133305, + "grad_norm": 0.6943339952126241, + "learning_rate": 0.00018443128049770263, + "loss": 12.4117, + "step": 7515 + }, + { + "epoch": 0.40927623831791604, + "grad_norm": 0.5849011559469621, + "learning_rate": 0.00018442655490032498, + "loss": 12.2855, + "step": 7516 + }, + { + "epoch": 0.40933069231449903, + "grad_norm": 0.7283499647656516, + "learning_rate": 0.00018442182864643228, + "loss": 12.4723, + "step": 7517 + }, + { + "epoch": 0.4093851463110821, + "grad_norm": 0.7162779493886587, + "learning_rate": 0.00018441710173606123, + "loss": 12.593, + "step": 7518 + }, + { + "epoch": 0.4094396003076651, + "grad_norm": 0.6180021705599357, + "learning_rate": 0.00018441237416924868, + "loss": 12.5123, + "step": 7519 + }, + { + "epoch": 0.40949405430424807, + "grad_norm": 0.6582569756408991, + "learning_rate": 0.00018440764594603135, + "loss": 12.4382, + "step": 7520 + }, + { + "epoch": 0.4095485083008311, + "grad_norm": 0.6772835849000756, + "learning_rate": 0.00018440291706644602, + "loss": 12.5194, + "step": 7521 + }, + { + "epoch": 0.4096029622974141, + "grad_norm": 0.6426675103958575, + "learning_rate": 0.00018439818753052944, + "loss": 12.4174, + "step": 7522 + }, + { + "epoch": 0.4096574162939971, + "grad_norm": 0.737761035297852, + "learning_rate": 0.0001843934573383184, + "loss": 12.4666, + "step": 7523 + }, + { + "epoch": 0.40971187029058015, + "grad_norm": 0.7037342785824447, + "learning_rate": 0.00018438872648984965, + "loss": 12.5767, + "step": 7524 + }, + { + "epoch": 0.40976632428716314, + "grad_norm": 0.7693803436884855, + "learning_rate": 0.00018438399498516006, + "loss": 12.6765, + "step": 7525 + }, + { + "epoch": 0.4098207782837462, + "grad_norm": 0.6823533278812086, + "learning_rate": 0.00018437926282428637, + "loss": 12.5121, + "step": 7526 + }, + { + "epoch": 0.4098752322803292, + "grad_norm": 0.7116566941304763, + "learning_rate": 0.00018437453000726538, + "loss": 12.5272, + "step": 7527 + }, + { + "epoch": 0.4099296862769122, + "grad_norm": 0.6472268275714508, + "learning_rate": 0.00018436979653413385, + "loss": 12.619, + "step": 7528 + }, + { + "epoch": 0.4099841402734952, + "grad_norm": 0.625849243204641, + "learning_rate": 0.0001843650624049287, + "loss": 12.3751, + "step": 7529 + }, + { + "epoch": 0.4100385942700782, + "grad_norm": 0.6331093988199687, + "learning_rate": 0.00018436032761968662, + "loss": 12.4024, + "step": 7530 + }, + { + "epoch": 0.4100930482666612, + "grad_norm": 0.6527662119389117, + "learning_rate": 0.00018435559217844452, + "loss": 12.324, + "step": 7531 + }, + { + "epoch": 0.41014750226324426, + "grad_norm": 0.6435280590335135, + "learning_rate": 0.0001843508560812392, + "loss": 12.5237, + "step": 7532 + }, + { + "epoch": 0.41020195625982725, + "grad_norm": 0.602130591757125, + "learning_rate": 0.00018434611932810743, + "loss": 12.5467, + "step": 7533 + }, + { + "epoch": 0.41025641025641024, + "grad_norm": 0.6666486375237347, + "learning_rate": 0.00018434138191908615, + "loss": 12.5347, + "step": 7534 + }, + { + "epoch": 0.4103108642529933, + "grad_norm": 0.63944993563378, + "learning_rate": 0.0001843366438542121, + "loss": 12.6362, + "step": 7535 + }, + { + "epoch": 0.4103653182495763, + "grad_norm": 0.6376619610146421, + "learning_rate": 0.00018433190513352218, + "loss": 12.4419, + "step": 7536 + }, + { + "epoch": 0.4104197722461593, + "grad_norm": 0.6372513569437053, + "learning_rate": 0.0001843271657570532, + "loss": 12.5039, + "step": 7537 + }, + { + "epoch": 0.4104742262427423, + "grad_norm": 0.913361086553666, + "learning_rate": 0.00018432242572484205, + "loss": 12.601, + "step": 7538 + }, + { + "epoch": 0.4105286802393253, + "grad_norm": 0.6746220751420223, + "learning_rate": 0.00018431768503692557, + "loss": 12.5318, + "step": 7539 + }, + { + "epoch": 0.4105831342359083, + "grad_norm": 0.5908373611740666, + "learning_rate": 0.00018431294369334065, + "loss": 12.4197, + "step": 7540 + }, + { + "epoch": 0.41063758823249136, + "grad_norm": 0.5960354209167404, + "learning_rate": 0.00018430820169412413, + "loss": 12.3466, + "step": 7541 + }, + { + "epoch": 0.41069204222907435, + "grad_norm": 0.6649575614542876, + "learning_rate": 0.0001843034590393129, + "loss": 12.4657, + "step": 7542 + }, + { + "epoch": 0.41074649622565734, + "grad_norm": 0.6638348440625013, + "learning_rate": 0.0001842987157289438, + "loss": 12.5338, + "step": 7543 + }, + { + "epoch": 0.4108009502222404, + "grad_norm": 0.6651550518063203, + "learning_rate": 0.00018429397176305382, + "loss": 12.337, + "step": 7544 + }, + { + "epoch": 0.4108554042188234, + "grad_norm": 0.6785908004988509, + "learning_rate": 0.0001842892271416797, + "loss": 12.4929, + "step": 7545 + }, + { + "epoch": 0.4109098582154064, + "grad_norm": 0.6328267972406906, + "learning_rate": 0.00018428448186485848, + "loss": 12.4313, + "step": 7546 + }, + { + "epoch": 0.4109643122119894, + "grad_norm": 0.5950867640188962, + "learning_rate": 0.00018427973593262696, + "loss": 12.3173, + "step": 7547 + }, + { + "epoch": 0.4110187662085724, + "grad_norm": 0.7421067335083728, + "learning_rate": 0.0001842749893450221, + "loss": 12.7487, + "step": 7548 + }, + { + "epoch": 0.4110732202051554, + "grad_norm": 0.6206223920094129, + "learning_rate": 0.00018427024210208078, + "loss": 12.3902, + "step": 7549 + }, + { + "epoch": 0.41112767420173846, + "grad_norm": 0.6604135706333086, + "learning_rate": 0.0001842654942038399, + "loss": 12.3789, + "step": 7550 + }, + { + "epoch": 0.41118212819832145, + "grad_norm": 0.6180331174072714, + "learning_rate": 0.00018426074565033645, + "loss": 12.5446, + "step": 7551 + }, + { + "epoch": 0.41123658219490444, + "grad_norm": 0.6358907770850538, + "learning_rate": 0.00018425599644160726, + "loss": 12.3478, + "step": 7552 + }, + { + "epoch": 0.4112910361914875, + "grad_norm": 0.7649458824755346, + "learning_rate": 0.0001842512465776894, + "loss": 12.4298, + "step": 7553 + }, + { + "epoch": 0.4113454901880705, + "grad_norm": 0.661010889082727, + "learning_rate": 0.0001842464960586196, + "loss": 12.4604, + "step": 7554 + }, + { + "epoch": 0.4113999441846535, + "grad_norm": 0.5670705848084314, + "learning_rate": 0.000184241744884435, + "loss": 12.3934, + "step": 7555 + }, + { + "epoch": 0.4114543981812365, + "grad_norm": 0.6599957599839233, + "learning_rate": 0.00018423699305517244, + "loss": 12.5605, + "step": 7556 + }, + { + "epoch": 0.4115088521778195, + "grad_norm": 0.6912314311352966, + "learning_rate": 0.00018423224057086888, + "loss": 12.56, + "step": 7557 + }, + { + "epoch": 0.4115633061744025, + "grad_norm": 0.6548538166488451, + "learning_rate": 0.00018422748743156134, + "loss": 12.347, + "step": 7558 + }, + { + "epoch": 0.41161776017098556, + "grad_norm": 0.6104316468510993, + "learning_rate": 0.0001842227336372867, + "loss": 12.5359, + "step": 7559 + }, + { + "epoch": 0.41167221416756855, + "grad_norm": 0.7374021548975841, + "learning_rate": 0.00018421797918808194, + "loss": 12.5152, + "step": 7560 + }, + { + "epoch": 0.4117266681641516, + "grad_norm": 0.6458091136832962, + "learning_rate": 0.00018421322408398408, + "loss": 12.5076, + "step": 7561 + }, + { + "epoch": 0.4117811221607346, + "grad_norm": 0.6777619018612451, + "learning_rate": 0.00018420846832503006, + "loss": 12.6523, + "step": 7562 + }, + { + "epoch": 0.4118355761573176, + "grad_norm": 0.5836581592065011, + "learning_rate": 0.00018420371191125686, + "loss": 12.2898, + "step": 7563 + }, + { + "epoch": 0.41189003015390063, + "grad_norm": 0.728462007314925, + "learning_rate": 0.0001841989548427015, + "loss": 12.6287, + "step": 7564 + }, + { + "epoch": 0.4119444841504836, + "grad_norm": 0.7025051926962401, + "learning_rate": 0.00018419419711940093, + "loss": 12.4986, + "step": 7565 + }, + { + "epoch": 0.4119989381470666, + "grad_norm": 0.6196466337363593, + "learning_rate": 0.00018418943874139217, + "loss": 12.4648, + "step": 7566 + }, + { + "epoch": 0.41205339214364967, + "grad_norm": 0.6256416043269872, + "learning_rate": 0.00018418467970871222, + "loss": 12.3932, + "step": 7567 + }, + { + "epoch": 0.41210784614023266, + "grad_norm": 0.636122420202276, + "learning_rate": 0.00018417992002139807, + "loss": 12.3566, + "step": 7568 + }, + { + "epoch": 0.41216230013681565, + "grad_norm": 0.6657010523105482, + "learning_rate": 0.00018417515967948672, + "loss": 12.3594, + "step": 7569 + }, + { + "epoch": 0.4122167541333987, + "grad_norm": 0.6719196758228524, + "learning_rate": 0.00018417039868301528, + "loss": 12.5234, + "step": 7570 + }, + { + "epoch": 0.4122712081299817, + "grad_norm": 0.5998339464207478, + "learning_rate": 0.00018416563703202064, + "loss": 12.5577, + "step": 7571 + }, + { + "epoch": 0.4123256621265647, + "grad_norm": 0.8298355480163812, + "learning_rate": 0.00018416087472653992, + "loss": 12.6147, + "step": 7572 + }, + { + "epoch": 0.41238011612314773, + "grad_norm": 0.6592033463612728, + "learning_rate": 0.0001841561117666101, + "loss": 12.3951, + "step": 7573 + }, + { + "epoch": 0.4124345701197307, + "grad_norm": 0.7207947352174678, + "learning_rate": 0.00018415134815226826, + "loss": 12.4531, + "step": 7574 + }, + { + "epoch": 0.4124890241163137, + "grad_norm": 0.6148861802788756, + "learning_rate": 0.00018414658388355145, + "loss": 12.4126, + "step": 7575 + }, + { + "epoch": 0.41254347811289677, + "grad_norm": 0.6689104279292235, + "learning_rate": 0.00018414181896049664, + "loss": 12.5419, + "step": 7576 + }, + { + "epoch": 0.41259793210947976, + "grad_norm": 0.7014344990842885, + "learning_rate": 0.00018413705338314097, + "loss": 12.466, + "step": 7577 + }, + { + "epoch": 0.41265238610606275, + "grad_norm": 0.6714153913382441, + "learning_rate": 0.00018413228715152147, + "loss": 12.5583, + "step": 7578 + }, + { + "epoch": 0.4127068401026458, + "grad_norm": 0.7751802041986378, + "learning_rate": 0.00018412752026567518, + "loss": 12.6571, + "step": 7579 + }, + { + "epoch": 0.4127612940992288, + "grad_norm": 0.6278998134838513, + "learning_rate": 0.0001841227527256392, + "loss": 12.51, + "step": 7580 + }, + { + "epoch": 0.4128157480958118, + "grad_norm": 0.7082243757340546, + "learning_rate": 0.00018411798453145056, + "loss": 12.5604, + "step": 7581 + }, + { + "epoch": 0.41287020209239483, + "grad_norm": 0.6326366117703875, + "learning_rate": 0.00018411321568314638, + "loss": 12.3858, + "step": 7582 + }, + { + "epoch": 0.4129246560889778, + "grad_norm": 0.6677705390508786, + "learning_rate": 0.00018410844618076372, + "loss": 12.5072, + "step": 7583 + }, + { + "epoch": 0.4129791100855608, + "grad_norm": 0.6679816766096263, + "learning_rate": 0.00018410367602433972, + "loss": 12.5658, + "step": 7584 + }, + { + "epoch": 0.41303356408214387, + "grad_norm": 0.7019511202137079, + "learning_rate": 0.00018409890521391137, + "loss": 12.5263, + "step": 7585 + }, + { + "epoch": 0.41308801807872686, + "grad_norm": 0.6543623414632546, + "learning_rate": 0.00018409413374951587, + "loss": 12.4733, + "step": 7586 + }, + { + "epoch": 0.41314247207530985, + "grad_norm": 0.6004737182626931, + "learning_rate": 0.0001840893616311903, + "loss": 12.4799, + "step": 7587 + }, + { + "epoch": 0.4131969260718929, + "grad_norm": 0.6714918977825168, + "learning_rate": 0.0001840845888589717, + "loss": 12.5035, + "step": 7588 + }, + { + "epoch": 0.4132513800684759, + "grad_norm": 0.6687905292852764, + "learning_rate": 0.00018407981543289726, + "loss": 12.4394, + "step": 7589 + }, + { + "epoch": 0.4133058340650589, + "grad_norm": 0.5822307505939365, + "learning_rate": 0.00018407504135300407, + "loss": 12.4167, + "step": 7590 + }, + { + "epoch": 0.41336028806164193, + "grad_norm": 0.6797044733620369, + "learning_rate": 0.00018407026661932928, + "loss": 12.5009, + "step": 7591 + }, + { + "epoch": 0.4134147420582249, + "grad_norm": 0.6662696811313099, + "learning_rate": 0.00018406549123190996, + "loss": 12.4285, + "step": 7592 + }, + { + "epoch": 0.413469196054808, + "grad_norm": 0.6384663985910454, + "learning_rate": 0.0001840607151907833, + "loss": 12.4338, + "step": 7593 + }, + { + "epoch": 0.41352365005139097, + "grad_norm": 0.6563451161415367, + "learning_rate": 0.00018405593849598644, + "loss": 12.5337, + "step": 7594 + }, + { + "epoch": 0.41357810404797396, + "grad_norm": 0.6007396621690622, + "learning_rate": 0.00018405116114755647, + "loss": 12.4726, + "step": 7595 + }, + { + "epoch": 0.413632558044557, + "grad_norm": 0.6356480095698379, + "learning_rate": 0.00018404638314553062, + "loss": 12.3176, + "step": 7596 + }, + { + "epoch": 0.41368701204114, + "grad_norm": 0.6362794038608103, + "learning_rate": 0.00018404160448994597, + "loss": 12.454, + "step": 7597 + }, + { + "epoch": 0.413741466037723, + "grad_norm": 0.6294171250211859, + "learning_rate": 0.0001840368251808397, + "loss": 12.415, + "step": 7598 + }, + { + "epoch": 0.41379592003430604, + "grad_norm": 0.6220974427648912, + "learning_rate": 0.00018403204521824903, + "loss": 12.5613, + "step": 7599 + }, + { + "epoch": 0.41385037403088903, + "grad_norm": 0.7140220218064345, + "learning_rate": 0.00018402726460221104, + "loss": 12.4242, + "step": 7600 + }, + { + "epoch": 0.413904828027472, + "grad_norm": 0.6264698242072783, + "learning_rate": 0.00018402248333276297, + "loss": 12.4845, + "step": 7601 + }, + { + "epoch": 0.4139592820240551, + "grad_norm": 0.6131916443343639, + "learning_rate": 0.00018401770140994198, + "loss": 12.4454, + "step": 7602 + }, + { + "epoch": 0.41401373602063807, + "grad_norm": 0.7080235427810602, + "learning_rate": 0.00018401291883378523, + "loss": 12.4933, + "step": 7603 + }, + { + "epoch": 0.41406819001722106, + "grad_norm": 0.7726758600328698, + "learning_rate": 0.00018400813560432997, + "loss": 12.4119, + "step": 7604 + }, + { + "epoch": 0.4141226440138041, + "grad_norm": 0.6969997285025378, + "learning_rate": 0.00018400335172161333, + "loss": 12.4852, + "step": 7605 + }, + { + "epoch": 0.4141770980103871, + "grad_norm": 0.6923251834896971, + "learning_rate": 0.00018399856718567256, + "loss": 12.5334, + "step": 7606 + }, + { + "epoch": 0.4142315520069701, + "grad_norm": 0.709215148883989, + "learning_rate": 0.00018399378199654486, + "loss": 12.4814, + "step": 7607 + }, + { + "epoch": 0.41428600600355314, + "grad_norm": 0.7182335665032548, + "learning_rate": 0.00018398899615426737, + "loss": 12.5802, + "step": 7608 + }, + { + "epoch": 0.41434046000013613, + "grad_norm": 0.7747472579587373, + "learning_rate": 0.00018398420965887738, + "loss": 12.3487, + "step": 7609 + }, + { + "epoch": 0.4143949139967191, + "grad_norm": 0.6641584285672352, + "learning_rate": 0.00018397942251041212, + "loss": 12.4149, + "step": 7610 + }, + { + "epoch": 0.4144493679933022, + "grad_norm": 0.6559558849248063, + "learning_rate": 0.00018397463470890877, + "loss": 12.4061, + "step": 7611 + }, + { + "epoch": 0.41450382198988517, + "grad_norm": 0.6929051117161943, + "learning_rate": 0.00018396984625440458, + "loss": 12.4807, + "step": 7612 + }, + { + "epoch": 0.41455827598646816, + "grad_norm": 0.6789925394878573, + "learning_rate": 0.00018396505714693678, + "loss": 12.5116, + "step": 7613 + }, + { + "epoch": 0.4146127299830512, + "grad_norm": 0.7897268899930915, + "learning_rate": 0.00018396026738654264, + "loss": 12.432, + "step": 7614 + }, + { + "epoch": 0.4146671839796342, + "grad_norm": 0.6807339651106654, + "learning_rate": 0.00018395547697325933, + "loss": 12.3411, + "step": 7615 + }, + { + "epoch": 0.4147216379762172, + "grad_norm": 0.6682991423741861, + "learning_rate": 0.00018395068590712417, + "loss": 12.3832, + "step": 7616 + }, + { + "epoch": 0.41477609197280024, + "grad_norm": 0.7746811225957453, + "learning_rate": 0.00018394589418817443, + "loss": 12.4631, + "step": 7617 + }, + { + "epoch": 0.41483054596938324, + "grad_norm": 0.6964451730482353, + "learning_rate": 0.00018394110181644733, + "loss": 12.3519, + "step": 7618 + }, + { + "epoch": 0.41488499996596623, + "grad_norm": 0.6527959058968399, + "learning_rate": 0.00018393630879198013, + "loss": 12.4443, + "step": 7619 + }, + { + "epoch": 0.4149394539625493, + "grad_norm": 0.7411057629441736, + "learning_rate": 0.0001839315151148101, + "loss": 12.3707, + "step": 7620 + }, + { + "epoch": 0.41499390795913227, + "grad_norm": 0.742397366365453, + "learning_rate": 0.00018392672078497454, + "loss": 12.5815, + "step": 7621 + }, + { + "epoch": 0.41504836195571526, + "grad_norm": 0.6845206549741516, + "learning_rate": 0.00018392192580251075, + "loss": 12.5448, + "step": 7622 + }, + { + "epoch": 0.4151028159522983, + "grad_norm": 0.8617869176209406, + "learning_rate": 0.00018391713016745596, + "loss": 12.5105, + "step": 7623 + }, + { + "epoch": 0.4151572699488813, + "grad_norm": 0.6332044702163018, + "learning_rate": 0.00018391233387984754, + "loss": 12.4949, + "step": 7624 + }, + { + "epoch": 0.4152117239454643, + "grad_norm": 0.6921877222300677, + "learning_rate": 0.0001839075369397227, + "loss": 12.44, + "step": 7625 + }, + { + "epoch": 0.41526617794204734, + "grad_norm": 0.7304182216130374, + "learning_rate": 0.0001839027393471188, + "loss": 12.5507, + "step": 7626 + }, + { + "epoch": 0.41532063193863034, + "grad_norm": 0.6844024377851525, + "learning_rate": 0.00018389794110207312, + "loss": 12.5578, + "step": 7627 + }, + { + "epoch": 0.4153750859352134, + "grad_norm": 0.6777229470414581, + "learning_rate": 0.00018389314220462296, + "loss": 12.4082, + "step": 7628 + }, + { + "epoch": 0.4154295399317964, + "grad_norm": 0.6432218551933113, + "learning_rate": 0.0001838883426548057, + "loss": 12.5191, + "step": 7629 + }, + { + "epoch": 0.41548399392837937, + "grad_norm": 0.6859378464003325, + "learning_rate": 0.00018388354245265858, + "loss": 12.5486, + "step": 7630 + }, + { + "epoch": 0.4155384479249624, + "grad_norm": 0.6671747399551233, + "learning_rate": 0.000183878741598219, + "loss": 12.5289, + "step": 7631 + }, + { + "epoch": 0.4155929019215454, + "grad_norm": 0.6850227729337616, + "learning_rate": 0.00018387394009152425, + "loss": 12.5237, + "step": 7632 + }, + { + "epoch": 0.4156473559181284, + "grad_norm": 0.7907722026545728, + "learning_rate": 0.00018386913793261167, + "loss": 12.6377, + "step": 7633 + }, + { + "epoch": 0.41570180991471145, + "grad_norm": 0.7024232324059682, + "learning_rate": 0.0001838643351215186, + "loss": 12.4349, + "step": 7634 + }, + { + "epoch": 0.41575626391129444, + "grad_norm": 0.703812480485606, + "learning_rate": 0.0001838595316582824, + "loss": 12.5366, + "step": 7635 + }, + { + "epoch": 0.41581071790787744, + "grad_norm": 0.7252726705505731, + "learning_rate": 0.00018385472754294042, + "loss": 12.5311, + "step": 7636 + }, + { + "epoch": 0.4158651719044605, + "grad_norm": 0.7160236603812885, + "learning_rate": 0.00018384992277553001, + "loss": 12.5574, + "step": 7637 + }, + { + "epoch": 0.4159196259010435, + "grad_norm": 0.6561769782974751, + "learning_rate": 0.00018384511735608855, + "loss": 12.3621, + "step": 7638 + }, + { + "epoch": 0.41597407989762647, + "grad_norm": 0.7642371517971974, + "learning_rate": 0.0001838403112846534, + "loss": 12.4601, + "step": 7639 + }, + { + "epoch": 0.4160285338942095, + "grad_norm": 0.6832445256349677, + "learning_rate": 0.00018383550456126192, + "loss": 12.3823, + "step": 7640 + }, + { + "epoch": 0.4160829878907925, + "grad_norm": 0.9046044716944612, + "learning_rate": 0.00018383069718595153, + "loss": 12.681, + "step": 7641 + }, + { + "epoch": 0.4161374418873755, + "grad_norm": 0.6990758835934934, + "learning_rate": 0.00018382588915875952, + "loss": 12.5077, + "step": 7642 + }, + { + "epoch": 0.41619189588395855, + "grad_norm": 0.818672399509247, + "learning_rate": 0.00018382108047972336, + "loss": 12.4796, + "step": 7643 + }, + { + "epoch": 0.41624634988054154, + "grad_norm": 0.6347657355980698, + "learning_rate": 0.00018381627114888045, + "loss": 12.5621, + "step": 7644 + }, + { + "epoch": 0.41630080387712454, + "grad_norm": 0.7051436257130543, + "learning_rate": 0.00018381146116626816, + "loss": 12.3868, + "step": 7645 + }, + { + "epoch": 0.4163552578737076, + "grad_norm": 0.6580907573187873, + "learning_rate": 0.00018380665053192386, + "loss": 12.5146, + "step": 7646 + }, + { + "epoch": 0.4164097118702906, + "grad_norm": 0.7377588067672651, + "learning_rate": 0.00018380183924588498, + "loss": 12.5493, + "step": 7647 + }, + { + "epoch": 0.41646416586687357, + "grad_norm": 0.6434057170985927, + "learning_rate": 0.000183797027308189, + "loss": 12.4146, + "step": 7648 + }, + { + "epoch": 0.4165186198634566, + "grad_norm": 0.6201660562859044, + "learning_rate": 0.00018379221471887325, + "loss": 12.4045, + "step": 7649 + }, + { + "epoch": 0.4165730738600396, + "grad_norm": 0.7312758361049857, + "learning_rate": 0.00018378740147797517, + "loss": 12.5651, + "step": 7650 + }, + { + "epoch": 0.4166275278566226, + "grad_norm": 0.6562949596475576, + "learning_rate": 0.00018378258758553222, + "loss": 12.4739, + "step": 7651 + }, + { + "epoch": 0.41668198185320565, + "grad_norm": 0.8109608827473153, + "learning_rate": 0.00018377777304158182, + "loss": 12.3613, + "step": 7652 + }, + { + "epoch": 0.41673643584978864, + "grad_norm": 0.6547422099606501, + "learning_rate": 0.00018377295784616142, + "loss": 12.5337, + "step": 7653 + }, + { + "epoch": 0.41679088984637164, + "grad_norm": 0.7061811226970356, + "learning_rate": 0.00018376814199930842, + "loss": 12.5456, + "step": 7654 + }, + { + "epoch": 0.4168453438429547, + "grad_norm": 0.6012608861041727, + "learning_rate": 0.00018376332550106033, + "loss": 12.533, + "step": 7655 + }, + { + "epoch": 0.4168997978395377, + "grad_norm": 0.7006711823775731, + "learning_rate": 0.00018375850835145456, + "loss": 12.5389, + "step": 7656 + }, + { + "epoch": 0.41695425183612067, + "grad_norm": 0.6580649051091263, + "learning_rate": 0.0001837536905505286, + "loss": 12.4394, + "step": 7657 + }, + { + "epoch": 0.4170087058327037, + "grad_norm": 0.6760449622537299, + "learning_rate": 0.00018374887209831987, + "loss": 12.3788, + "step": 7658 + }, + { + "epoch": 0.4170631598292867, + "grad_norm": 0.6497353580956746, + "learning_rate": 0.00018374405299486588, + "loss": 12.4658, + "step": 7659 + }, + { + "epoch": 0.41711761382586976, + "grad_norm": 0.652611595647118, + "learning_rate": 0.0001837392332402041, + "loss": 12.4612, + "step": 7660 + }, + { + "epoch": 0.41717206782245275, + "grad_norm": 0.6578766692847864, + "learning_rate": 0.00018373441283437198, + "loss": 12.4691, + "step": 7661 + }, + { + "epoch": 0.41722652181903574, + "grad_norm": 0.6382698831671395, + "learning_rate": 0.00018372959177740704, + "loss": 12.4599, + "step": 7662 + }, + { + "epoch": 0.4172809758156188, + "grad_norm": 0.6850091154967107, + "learning_rate": 0.00018372477006934674, + "loss": 12.4671, + "step": 7663 + }, + { + "epoch": 0.4173354298122018, + "grad_norm": 0.6298292742381474, + "learning_rate": 0.0001837199477102286, + "loss": 12.4673, + "step": 7664 + }, + { + "epoch": 0.4173898838087848, + "grad_norm": 0.6113991567651845, + "learning_rate": 0.00018371512470009008, + "loss": 12.3753, + "step": 7665 + }, + { + "epoch": 0.4174443378053678, + "grad_norm": 0.7055999705123408, + "learning_rate": 0.00018371030103896872, + "loss": 12.5082, + "step": 7666 + }, + { + "epoch": 0.4174987918019508, + "grad_norm": 0.6371229065148822, + "learning_rate": 0.00018370547672690206, + "loss": 12.4813, + "step": 7667 + }, + { + "epoch": 0.4175532457985338, + "grad_norm": 0.7015226954955563, + "learning_rate": 0.00018370065176392752, + "loss": 12.4849, + "step": 7668 + }, + { + "epoch": 0.41760769979511686, + "grad_norm": 0.6310519097840122, + "learning_rate": 0.00018369582615008272, + "loss": 12.4796, + "step": 7669 + }, + { + "epoch": 0.41766215379169985, + "grad_norm": 0.5944925755085689, + "learning_rate": 0.00018369099988540513, + "loss": 12.2906, + "step": 7670 + }, + { + "epoch": 0.41771660778828285, + "grad_norm": 0.6578983386471594, + "learning_rate": 0.00018368617296993226, + "loss": 12.5302, + "step": 7671 + }, + { + "epoch": 0.4177710617848659, + "grad_norm": 0.6437522523863247, + "learning_rate": 0.00018368134540370173, + "loss": 12.4882, + "step": 7672 + }, + { + "epoch": 0.4178255157814489, + "grad_norm": 0.6162412646017736, + "learning_rate": 0.000183676517186751, + "loss": 12.4843, + "step": 7673 + }, + { + "epoch": 0.4178799697780319, + "grad_norm": 0.6899640431171089, + "learning_rate": 0.0001836716883191176, + "loss": 12.5048, + "step": 7674 + }, + { + "epoch": 0.4179344237746149, + "grad_norm": 0.6672292118932008, + "learning_rate": 0.00018366685880083914, + "loss": 12.4681, + "step": 7675 + }, + { + "epoch": 0.4179888777711979, + "grad_norm": 0.6079786159449238, + "learning_rate": 0.00018366202863195316, + "loss": 12.4824, + "step": 7676 + }, + { + "epoch": 0.4180433317677809, + "grad_norm": 0.7544946764584993, + "learning_rate": 0.00018365719781249725, + "loss": 12.5876, + "step": 7677 + }, + { + "epoch": 0.41809778576436396, + "grad_norm": 0.5962533245030253, + "learning_rate": 0.0001836523663425089, + "loss": 12.4612, + "step": 7678 + }, + { + "epoch": 0.41815223976094695, + "grad_norm": 0.6123575398733694, + "learning_rate": 0.00018364753422202575, + "loss": 12.5313, + "step": 7679 + }, + { + "epoch": 0.41820669375752995, + "grad_norm": 0.6940503398342436, + "learning_rate": 0.00018364270145108531, + "loss": 12.5112, + "step": 7680 + }, + { + "epoch": 0.418261147754113, + "grad_norm": 0.6680628632067475, + "learning_rate": 0.00018363786802972522, + "loss": 12.4506, + "step": 7681 + }, + { + "epoch": 0.418315601750696, + "grad_norm": 0.6543354862728545, + "learning_rate": 0.00018363303395798304, + "loss": 12.4137, + "step": 7682 + }, + { + "epoch": 0.418370055747279, + "grad_norm": 0.6030576150582967, + "learning_rate": 0.00018362819923589636, + "loss": 12.4018, + "step": 7683 + }, + { + "epoch": 0.418424509743862, + "grad_norm": 0.6680144702062961, + "learning_rate": 0.00018362336386350275, + "loss": 12.4991, + "step": 7684 + }, + { + "epoch": 0.418478963740445, + "grad_norm": 0.6222373278682574, + "learning_rate": 0.00018361852784083991, + "loss": 12.3989, + "step": 7685 + }, + { + "epoch": 0.418533417737028, + "grad_norm": 0.6886589633076576, + "learning_rate": 0.0001836136911679453, + "loss": 12.402, + "step": 7686 + }, + { + "epoch": 0.41858787173361106, + "grad_norm": 0.6532456915278813, + "learning_rate": 0.00018360885384485664, + "loss": 12.4871, + "step": 7687 + }, + { + "epoch": 0.41864232573019405, + "grad_norm": 1.1426480829233576, + "learning_rate": 0.0001836040158716115, + "loss": 12.3851, + "step": 7688 + }, + { + "epoch": 0.41869677972677705, + "grad_norm": 0.7287143760029692, + "learning_rate": 0.00018359917724824752, + "loss": 12.582, + "step": 7689 + }, + { + "epoch": 0.4187512337233601, + "grad_norm": 0.6414605281108499, + "learning_rate": 0.00018359433797480234, + "loss": 12.2483, + "step": 7690 + }, + { + "epoch": 0.4188056877199431, + "grad_norm": 0.5842099261062814, + "learning_rate": 0.00018358949805131352, + "loss": 12.373, + "step": 7691 + }, + { + "epoch": 0.4188601417165261, + "grad_norm": 0.6257821953679695, + "learning_rate": 0.00018358465747781878, + "loss": 12.4152, + "step": 7692 + }, + { + "epoch": 0.41891459571310913, + "grad_norm": 0.7349478349237475, + "learning_rate": 0.00018357981625435573, + "loss": 12.4382, + "step": 7693 + }, + { + "epoch": 0.4189690497096921, + "grad_norm": 0.6866824978335202, + "learning_rate": 0.000183574974380962, + "loss": 12.5315, + "step": 7694 + }, + { + "epoch": 0.41902350370627517, + "grad_norm": 0.7215980246716968, + "learning_rate": 0.00018357013185767526, + "loss": 12.6287, + "step": 7695 + }, + { + "epoch": 0.41907795770285816, + "grad_norm": 0.7164121518155415, + "learning_rate": 0.00018356528868453316, + "loss": 12.5496, + "step": 7696 + }, + { + "epoch": 0.41913241169944115, + "grad_norm": 0.6174833109503086, + "learning_rate": 0.00018356044486157334, + "loss": 12.4704, + "step": 7697 + }, + { + "epoch": 0.4191868656960242, + "grad_norm": 0.7061268318151425, + "learning_rate": 0.00018355560038883353, + "loss": 12.4084, + "step": 7698 + }, + { + "epoch": 0.4192413196926072, + "grad_norm": 0.7113906455959584, + "learning_rate": 0.00018355075526635132, + "loss": 12.5068, + "step": 7699 + }, + { + "epoch": 0.4192957736891902, + "grad_norm": 0.6160925389795541, + "learning_rate": 0.00018354590949416446, + "loss": 12.3842, + "step": 7700 + }, + { + "epoch": 0.41935022768577324, + "grad_norm": 0.764056388356585, + "learning_rate": 0.00018354106307231057, + "loss": 12.5408, + "step": 7701 + }, + { + "epoch": 0.41940468168235623, + "grad_norm": 0.6184383724182737, + "learning_rate": 0.00018353621600082737, + "loss": 12.4184, + "step": 7702 + }, + { + "epoch": 0.4194591356789392, + "grad_norm": 0.7259817883996311, + "learning_rate": 0.00018353136827975255, + "loss": 12.4332, + "step": 7703 + }, + { + "epoch": 0.41951358967552227, + "grad_norm": 0.7038668489917369, + "learning_rate": 0.0001835265199091238, + "loss": 12.4895, + "step": 7704 + }, + { + "epoch": 0.41956804367210526, + "grad_norm": 0.6629703207999516, + "learning_rate": 0.0001835216708889788, + "loss": 12.4573, + "step": 7705 + }, + { + "epoch": 0.41962249766868825, + "grad_norm": 0.6143947927874756, + "learning_rate": 0.0001835168212193553, + "loss": 12.5232, + "step": 7706 + }, + { + "epoch": 0.4196769516652713, + "grad_norm": 0.6349297012237477, + "learning_rate": 0.000183511970900291, + "loss": 12.567, + "step": 7707 + }, + { + "epoch": 0.4197314056618543, + "grad_norm": 0.824480006731654, + "learning_rate": 0.0001835071199318236, + "loss": 12.451, + "step": 7708 + }, + { + "epoch": 0.4197858596584373, + "grad_norm": 0.658652130143729, + "learning_rate": 0.00018350226831399084, + "loss": 12.4251, + "step": 7709 + }, + { + "epoch": 0.41984031365502034, + "grad_norm": 0.6313674763579805, + "learning_rate": 0.00018349741604683045, + "loss": 12.435, + "step": 7710 + }, + { + "epoch": 0.41989476765160333, + "grad_norm": 0.6637861440684033, + "learning_rate": 0.00018349256313038013, + "loss": 12.4185, + "step": 7711 + }, + { + "epoch": 0.4199492216481863, + "grad_norm": 0.7100167755541632, + "learning_rate": 0.00018348770956467766, + "loss": 12.5442, + "step": 7712 + }, + { + "epoch": 0.42000367564476937, + "grad_norm": 0.6598267047884887, + "learning_rate": 0.0001834828553497607, + "loss": 12.3989, + "step": 7713 + }, + { + "epoch": 0.42005812964135236, + "grad_norm": 0.6461052791122609, + "learning_rate": 0.0001834780004856671, + "loss": 12.3846, + "step": 7714 + }, + { + "epoch": 0.42011258363793536, + "grad_norm": 0.6652200852057961, + "learning_rate": 0.00018347314497243458, + "loss": 12.634, + "step": 7715 + }, + { + "epoch": 0.4201670376345184, + "grad_norm": 0.6463769020763129, + "learning_rate": 0.0001834682888101009, + "loss": 12.3472, + "step": 7716 + }, + { + "epoch": 0.4202214916311014, + "grad_norm": 0.7017130866496301, + "learning_rate": 0.00018346343199870374, + "loss": 12.4745, + "step": 7717 + }, + { + "epoch": 0.4202759456276844, + "grad_norm": 0.8069620567128515, + "learning_rate": 0.000183458574538281, + "loss": 12.4645, + "step": 7718 + }, + { + "epoch": 0.42033039962426744, + "grad_norm": 0.6694114333255915, + "learning_rate": 0.00018345371642887034, + "loss": 12.4017, + "step": 7719 + }, + { + "epoch": 0.42038485362085043, + "grad_norm": 0.6409125471734358, + "learning_rate": 0.0001834488576705096, + "loss": 12.4359, + "step": 7720 + }, + { + "epoch": 0.4204393076174334, + "grad_norm": 0.697416875461442, + "learning_rate": 0.0001834439982632366, + "loss": 12.5134, + "step": 7721 + }, + { + "epoch": 0.42049376161401647, + "grad_norm": 0.6530676618799656, + "learning_rate": 0.00018343913820708903, + "loss": 12.5292, + "step": 7722 + }, + { + "epoch": 0.42054821561059946, + "grad_norm": 0.6277856617161583, + "learning_rate": 0.00018343427750210474, + "loss": 12.5292, + "step": 7723 + }, + { + "epoch": 0.42060266960718246, + "grad_norm": 0.6596959812273022, + "learning_rate": 0.00018342941614832149, + "loss": 12.5169, + "step": 7724 + }, + { + "epoch": 0.4206571236037655, + "grad_norm": 0.6553076426372618, + "learning_rate": 0.00018342455414577713, + "loss": 12.5536, + "step": 7725 + }, + { + "epoch": 0.4207115776003485, + "grad_norm": 0.7052686980586507, + "learning_rate": 0.00018341969149450943, + "loss": 12.4405, + "step": 7726 + }, + { + "epoch": 0.42076603159693154, + "grad_norm": 0.6433534034167304, + "learning_rate": 0.00018341482819455625, + "loss": 12.5039, + "step": 7727 + }, + { + "epoch": 0.42082048559351454, + "grad_norm": 0.6185445890377947, + "learning_rate": 0.00018340996424595537, + "loss": 12.4427, + "step": 7728 + }, + { + "epoch": 0.42087493959009753, + "grad_norm": 0.6349314110007439, + "learning_rate": 0.0001834050996487446, + "loss": 12.5068, + "step": 7729 + }, + { + "epoch": 0.4209293935866806, + "grad_norm": 0.6737060455245588, + "learning_rate": 0.00018340023440296182, + "loss": 12.4331, + "step": 7730 + }, + { + "epoch": 0.42098384758326357, + "grad_norm": 0.6676306550303723, + "learning_rate": 0.00018339536850864478, + "loss": 12.59, + "step": 7731 + }, + { + "epoch": 0.42103830157984656, + "grad_norm": 0.673849100112351, + "learning_rate": 0.00018339050196583144, + "loss": 12.4062, + "step": 7732 + }, + { + "epoch": 0.4210927555764296, + "grad_norm": 0.6657925192980901, + "learning_rate": 0.0001833856347745595, + "loss": 12.4642, + "step": 7733 + }, + { + "epoch": 0.4211472095730126, + "grad_norm": 0.7768256422367489, + "learning_rate": 0.00018338076693486693, + "loss": 12.54, + "step": 7734 + }, + { + "epoch": 0.4212016635695956, + "grad_norm": 0.6839333718371754, + "learning_rate": 0.00018337589844679152, + "loss": 12.6317, + "step": 7735 + }, + { + "epoch": 0.42125611756617865, + "grad_norm": 0.7189502447091988, + "learning_rate": 0.00018337102931037112, + "loss": 12.4953, + "step": 7736 + }, + { + "epoch": 0.42131057156276164, + "grad_norm": 0.622438423831334, + "learning_rate": 0.00018336615952564364, + "loss": 12.4795, + "step": 7737 + }, + { + "epoch": 0.42136502555934463, + "grad_norm": 0.6967805388351358, + "learning_rate": 0.00018336128909264692, + "loss": 12.2899, + "step": 7738 + }, + { + "epoch": 0.4214194795559277, + "grad_norm": 0.7283347715661034, + "learning_rate": 0.00018335641801141883, + "loss": 12.5149, + "step": 7739 + }, + { + "epoch": 0.42147393355251067, + "grad_norm": 0.7300681925656116, + "learning_rate": 0.00018335154628199726, + "loss": 12.4365, + "step": 7740 + }, + { + "epoch": 0.42152838754909366, + "grad_norm": 0.886704235630511, + "learning_rate": 0.0001833466739044201, + "loss": 12.4463, + "step": 7741 + }, + { + "epoch": 0.4215828415456767, + "grad_norm": 0.6432836336371802, + "learning_rate": 0.0001833418008787252, + "loss": 12.4491, + "step": 7742 + }, + { + "epoch": 0.4216372955422597, + "grad_norm": 0.7927372042689552, + "learning_rate": 0.0001833369272049505, + "loss": 12.4499, + "step": 7743 + }, + { + "epoch": 0.4216917495388427, + "grad_norm": 0.6784228826890297, + "learning_rate": 0.00018333205288313385, + "loss": 12.4603, + "step": 7744 + }, + { + "epoch": 0.42174620353542575, + "grad_norm": 0.6279210026813303, + "learning_rate": 0.0001833271779133132, + "loss": 12.5222, + "step": 7745 + }, + { + "epoch": 0.42180065753200874, + "grad_norm": 0.6901241842062449, + "learning_rate": 0.00018332230229552645, + "loss": 12.5653, + "step": 7746 + }, + { + "epoch": 0.42185511152859173, + "grad_norm": 0.7646765823088739, + "learning_rate": 0.0001833174260298115, + "loss": 12.4381, + "step": 7747 + }, + { + "epoch": 0.4219095655251748, + "grad_norm": 0.5871207013967358, + "learning_rate": 0.00018331254911620626, + "loss": 12.3703, + "step": 7748 + }, + { + "epoch": 0.42196401952175777, + "grad_norm": 0.6341582639526494, + "learning_rate": 0.00018330767155474867, + "loss": 12.5204, + "step": 7749 + }, + { + "epoch": 0.42201847351834076, + "grad_norm": 0.7186670813039878, + "learning_rate": 0.00018330279334547668, + "loss": 12.5855, + "step": 7750 + }, + { + "epoch": 0.4220729275149238, + "grad_norm": 0.7608887056167734, + "learning_rate": 0.0001832979144884282, + "loss": 12.4069, + "step": 7751 + }, + { + "epoch": 0.4221273815115068, + "grad_norm": 0.7286736354425516, + "learning_rate": 0.00018329303498364113, + "loss": 12.5926, + "step": 7752 + }, + { + "epoch": 0.4221818355080898, + "grad_norm": 0.6667611301995382, + "learning_rate": 0.00018328815483115344, + "loss": 12.364, + "step": 7753 + }, + { + "epoch": 0.42223628950467285, + "grad_norm": 0.8559370596551145, + "learning_rate": 0.00018328327403100314, + "loss": 12.4806, + "step": 7754 + }, + { + "epoch": 0.42229074350125584, + "grad_norm": 0.5907731907724925, + "learning_rate": 0.00018327839258322812, + "loss": 12.3898, + "step": 7755 + }, + { + "epoch": 0.42234519749783883, + "grad_norm": 0.7229290398251434, + "learning_rate": 0.00018327351048786635, + "loss": 12.6165, + "step": 7756 + }, + { + "epoch": 0.4223996514944219, + "grad_norm": 0.7068869446224217, + "learning_rate": 0.00018326862774495578, + "loss": 12.5796, + "step": 7757 + }, + { + "epoch": 0.42245410549100487, + "grad_norm": 0.6291494939304955, + "learning_rate": 0.00018326374435453441, + "loss": 12.4155, + "step": 7758 + }, + { + "epoch": 0.42250855948758786, + "grad_norm": 0.6855015933078603, + "learning_rate": 0.00018325886031664022, + "loss": 12.5138, + "step": 7759 + }, + { + "epoch": 0.4225630134841709, + "grad_norm": 0.6254927977595047, + "learning_rate": 0.00018325397563131115, + "loss": 12.4501, + "step": 7760 + }, + { + "epoch": 0.4226174674807539, + "grad_norm": 0.5962974777849228, + "learning_rate": 0.0001832490902985852, + "loss": 12.5191, + "step": 7761 + }, + { + "epoch": 0.42267192147733695, + "grad_norm": 0.6723955151607713, + "learning_rate": 0.00018324420431850037, + "loss": 12.298, + "step": 7762 + }, + { + "epoch": 0.42272637547391995, + "grad_norm": 0.7398066987283984, + "learning_rate": 0.00018323931769109465, + "loss": 12.5349, + "step": 7763 + }, + { + "epoch": 0.42278082947050294, + "grad_norm": 0.6969458920048713, + "learning_rate": 0.00018323443041640602, + "loss": 12.4679, + "step": 7764 + }, + { + "epoch": 0.422835283467086, + "grad_norm": 0.6126110579796679, + "learning_rate": 0.00018322954249447252, + "loss": 12.4865, + "step": 7765 + }, + { + "epoch": 0.422889737463669, + "grad_norm": 0.7092533999165815, + "learning_rate": 0.00018322465392533216, + "loss": 12.5509, + "step": 7766 + }, + { + "epoch": 0.422944191460252, + "grad_norm": 0.6474367495039208, + "learning_rate": 0.0001832197647090229, + "loss": 12.3412, + "step": 7767 + }, + { + "epoch": 0.422998645456835, + "grad_norm": 0.6735126182790927, + "learning_rate": 0.00018321487484558276, + "loss": 12.508, + "step": 7768 + }, + { + "epoch": 0.423053099453418, + "grad_norm": 0.6488168446659008, + "learning_rate": 0.00018320998433504987, + "loss": 12.5817, + "step": 7769 + }, + { + "epoch": 0.423107553450001, + "grad_norm": 0.706447803660833, + "learning_rate": 0.00018320509317746217, + "loss": 12.4491, + "step": 7770 + }, + { + "epoch": 0.42316200744658405, + "grad_norm": 0.690598879814916, + "learning_rate": 0.00018320020137285766, + "loss": 12.4504, + "step": 7771 + }, + { + "epoch": 0.42321646144316705, + "grad_norm": 0.6087048075104945, + "learning_rate": 0.0001831953089212745, + "loss": 12.554, + "step": 7772 + }, + { + "epoch": 0.42327091543975004, + "grad_norm": 0.6844601908693717, + "learning_rate": 0.00018319041582275062, + "loss": 12.4907, + "step": 7773 + }, + { + "epoch": 0.4233253694363331, + "grad_norm": 0.6368271217345263, + "learning_rate": 0.00018318552207732415, + "loss": 12.5252, + "step": 7774 + }, + { + "epoch": 0.4233798234329161, + "grad_norm": 0.6215840343054171, + "learning_rate": 0.0001831806276850331, + "loss": 12.5888, + "step": 7775 + }, + { + "epoch": 0.4234342774294991, + "grad_norm": 0.6224392953398991, + "learning_rate": 0.00018317573264591553, + "loss": 12.4872, + "step": 7776 + }, + { + "epoch": 0.4234887314260821, + "grad_norm": 0.7030493089317692, + "learning_rate": 0.0001831708369600095, + "loss": 12.5528, + "step": 7777 + }, + { + "epoch": 0.4235431854226651, + "grad_norm": 0.6884725730053268, + "learning_rate": 0.0001831659406273531, + "loss": 12.4928, + "step": 7778 + }, + { + "epoch": 0.4235976394192481, + "grad_norm": 0.6763106331873534, + "learning_rate": 0.00018316104364798444, + "loss": 12.4183, + "step": 7779 + }, + { + "epoch": 0.42365209341583115, + "grad_norm": 0.6405527677746562, + "learning_rate": 0.00018315614602194152, + "loss": 12.4737, + "step": 7780 + }, + { + "epoch": 0.42370654741241415, + "grad_norm": 0.6945702310438716, + "learning_rate": 0.00018315124774926248, + "loss": 12.5326, + "step": 7781 + }, + { + "epoch": 0.42376100140899714, + "grad_norm": 0.6044615472501069, + "learning_rate": 0.00018314634882998538, + "loss": 12.3923, + "step": 7782 + }, + { + "epoch": 0.4238154554055802, + "grad_norm": 0.6661273133787237, + "learning_rate": 0.00018314144926414834, + "loss": 12.399, + "step": 7783 + }, + { + "epoch": 0.4238699094021632, + "grad_norm": 0.7311006197242605, + "learning_rate": 0.00018313654905178944, + "loss": 12.4251, + "step": 7784 + }, + { + "epoch": 0.4239243633987462, + "grad_norm": 0.6717220372865804, + "learning_rate": 0.00018313164819294678, + "loss": 12.4721, + "step": 7785 + }, + { + "epoch": 0.4239788173953292, + "grad_norm": 0.613043221582493, + "learning_rate": 0.0001831267466876585, + "loss": 12.3669, + "step": 7786 + }, + { + "epoch": 0.4240332713919122, + "grad_norm": 1.0101013612937542, + "learning_rate": 0.00018312184453596269, + "loss": 12.5856, + "step": 7787 + }, + { + "epoch": 0.4240877253884952, + "grad_norm": 0.6966337078675802, + "learning_rate": 0.00018311694173789748, + "loss": 12.5597, + "step": 7788 + }, + { + "epoch": 0.42414217938507826, + "grad_norm": 0.6240157771149879, + "learning_rate": 0.000183112038293501, + "loss": 12.4881, + "step": 7789 + }, + { + "epoch": 0.42419663338166125, + "grad_norm": 0.6642561139413364, + "learning_rate": 0.00018310713420281133, + "loss": 12.5333, + "step": 7790 + }, + { + "epoch": 0.42425108737824424, + "grad_norm": 0.6291943374062113, + "learning_rate": 0.00018310222946586667, + "loss": 12.3764, + "step": 7791 + }, + { + "epoch": 0.4243055413748273, + "grad_norm": 0.6930968219291624, + "learning_rate": 0.00018309732408270516, + "loss": 12.4469, + "step": 7792 + }, + { + "epoch": 0.4243599953714103, + "grad_norm": 0.5953476898224157, + "learning_rate": 0.00018309241805336492, + "loss": 12.4047, + "step": 7793 + }, + { + "epoch": 0.42441444936799333, + "grad_norm": 0.5844333523660687, + "learning_rate": 0.0001830875113778841, + "loss": 12.5129, + "step": 7794 + }, + { + "epoch": 0.4244689033645763, + "grad_norm": 0.7102053139363087, + "learning_rate": 0.00018308260405630085, + "loss": 12.4892, + "step": 7795 + }, + { + "epoch": 0.4245233573611593, + "grad_norm": 0.6652312253117758, + "learning_rate": 0.00018307769608865332, + "loss": 12.5766, + "step": 7796 + }, + { + "epoch": 0.42457781135774236, + "grad_norm": 0.7115652187451849, + "learning_rate": 0.00018307278747497973, + "loss": 12.4086, + "step": 7797 + }, + { + "epoch": 0.42463226535432536, + "grad_norm": 0.735827519599051, + "learning_rate": 0.00018306787821531818, + "loss": 12.6014, + "step": 7798 + }, + { + "epoch": 0.42468671935090835, + "grad_norm": 0.6433664841516628, + "learning_rate": 0.00018306296830970692, + "loss": 12.3885, + "step": 7799 + }, + { + "epoch": 0.4247411733474914, + "grad_norm": 0.6139720272683106, + "learning_rate": 0.00018305805775818405, + "loss": 12.4495, + "step": 7800 + }, + { + "epoch": 0.4247956273440744, + "grad_norm": 0.6668703605246699, + "learning_rate": 0.0001830531465607878, + "loss": 12.5359, + "step": 7801 + }, + { + "epoch": 0.4248500813406574, + "grad_norm": 0.630135798486746, + "learning_rate": 0.00018304823471755637, + "loss": 12.3706, + "step": 7802 + }, + { + "epoch": 0.42490453533724043, + "grad_norm": 0.6994353450861902, + "learning_rate": 0.00018304332222852793, + "loss": 12.5729, + "step": 7803 + }, + { + "epoch": 0.4249589893338234, + "grad_norm": 1.0254774883538031, + "learning_rate": 0.0001830384090937407, + "loss": 12.4567, + "step": 7804 + }, + { + "epoch": 0.4250134433304064, + "grad_norm": 0.7018597963711026, + "learning_rate": 0.00018303349531323287, + "loss": 12.4913, + "step": 7805 + }, + { + "epoch": 0.42506789732698946, + "grad_norm": 0.6812768819901065, + "learning_rate": 0.00018302858088704263, + "loss": 12.4232, + "step": 7806 + }, + { + "epoch": 0.42512235132357246, + "grad_norm": 0.6189707932825785, + "learning_rate": 0.00018302366581520824, + "loss": 12.4999, + "step": 7807 + }, + { + "epoch": 0.42517680532015545, + "grad_norm": 0.7008221094901383, + "learning_rate": 0.00018301875009776793, + "loss": 12.3748, + "step": 7808 + }, + { + "epoch": 0.4252312593167385, + "grad_norm": 0.685224508822657, + "learning_rate": 0.00018301383373475988, + "loss": 12.483, + "step": 7809 + }, + { + "epoch": 0.4252857133133215, + "grad_norm": 0.6439226473701067, + "learning_rate": 0.00018300891672622232, + "loss": 12.427, + "step": 7810 + }, + { + "epoch": 0.4253401673099045, + "grad_norm": 0.7611108891203666, + "learning_rate": 0.0001830039990721935, + "loss": 12.5046, + "step": 7811 + }, + { + "epoch": 0.42539462130648753, + "grad_norm": 0.67708873861166, + "learning_rate": 0.0001829990807727117, + "loss": 12.5311, + "step": 7812 + }, + { + "epoch": 0.4254490753030705, + "grad_norm": 0.5961418714371193, + "learning_rate": 0.00018299416182781508, + "loss": 12.3877, + "step": 7813 + }, + { + "epoch": 0.4255035292996535, + "grad_norm": 0.6176383889070957, + "learning_rate": 0.00018298924223754198, + "loss": 12.3607, + "step": 7814 + }, + { + "epoch": 0.42555798329623656, + "grad_norm": 0.629423468089237, + "learning_rate": 0.0001829843220019306, + "loss": 12.42, + "step": 7815 + }, + { + "epoch": 0.42561243729281956, + "grad_norm": 0.5963945048028937, + "learning_rate": 0.0001829794011210192, + "loss": 12.3582, + "step": 7816 + }, + { + "epoch": 0.42566689128940255, + "grad_norm": 0.6719580459725143, + "learning_rate": 0.00018297447959484607, + "loss": 12.4179, + "step": 7817 + }, + { + "epoch": 0.4257213452859856, + "grad_norm": 0.6363496362644461, + "learning_rate": 0.00018296955742344947, + "loss": 12.5432, + "step": 7818 + }, + { + "epoch": 0.4257757992825686, + "grad_norm": 0.7035535391398001, + "learning_rate": 0.0001829646346068677, + "loss": 12.4458, + "step": 7819 + }, + { + "epoch": 0.4258302532791516, + "grad_norm": 0.7125369757670239, + "learning_rate": 0.00018295971114513901, + "loss": 12.4325, + "step": 7820 + }, + { + "epoch": 0.42588470727573463, + "grad_norm": 0.5689821498560794, + "learning_rate": 0.00018295478703830167, + "loss": 12.4462, + "step": 7821 + }, + { + "epoch": 0.4259391612723176, + "grad_norm": 0.651504419915653, + "learning_rate": 0.00018294986228639402, + "loss": 12.5262, + "step": 7822 + }, + { + "epoch": 0.4259936152689006, + "grad_norm": 0.6846627781301227, + "learning_rate": 0.00018294493688945432, + "loss": 12.4642, + "step": 7823 + }, + { + "epoch": 0.42604806926548366, + "grad_norm": 0.5897441762051296, + "learning_rate": 0.0001829400108475209, + "loss": 12.3916, + "step": 7824 + }, + { + "epoch": 0.42610252326206666, + "grad_norm": 0.7217302351067372, + "learning_rate": 0.000182935084160632, + "loss": 12.4843, + "step": 7825 + }, + { + "epoch": 0.42615697725864965, + "grad_norm": 0.6421094690571366, + "learning_rate": 0.000182930156828826, + "loss": 12.416, + "step": 7826 + }, + { + "epoch": 0.4262114312552327, + "grad_norm": 0.5840267657072733, + "learning_rate": 0.00018292522885214122, + "loss": 12.448, + "step": 7827 + }, + { + "epoch": 0.4262658852518157, + "grad_norm": 0.5861991036359835, + "learning_rate": 0.00018292030023061594, + "loss": 12.4205, + "step": 7828 + }, + { + "epoch": 0.42632033924839874, + "grad_norm": 0.7221993662996262, + "learning_rate": 0.00018291537096428847, + "loss": 12.3286, + "step": 7829 + }, + { + "epoch": 0.42637479324498173, + "grad_norm": 0.6868999607560279, + "learning_rate": 0.00018291044105319721, + "loss": 12.5275, + "step": 7830 + }, + { + "epoch": 0.4264292472415647, + "grad_norm": 0.6912120530089493, + "learning_rate": 0.00018290551049738042, + "loss": 12.4169, + "step": 7831 + }, + { + "epoch": 0.42648370123814777, + "grad_norm": 0.6580756550522271, + "learning_rate": 0.00018290057929687653, + "loss": 12.6147, + "step": 7832 + }, + { + "epoch": 0.42653815523473076, + "grad_norm": 0.6170758835490263, + "learning_rate": 0.00018289564745172377, + "loss": 12.2711, + "step": 7833 + }, + { + "epoch": 0.42659260923131376, + "grad_norm": 0.7117110220895022, + "learning_rate": 0.0001828907149619606, + "loss": 12.4925, + "step": 7834 + }, + { + "epoch": 0.4266470632278968, + "grad_norm": 0.6381204388014844, + "learning_rate": 0.00018288578182762533, + "loss": 12.5005, + "step": 7835 + }, + { + "epoch": 0.4267015172244798, + "grad_norm": 0.6543463778852519, + "learning_rate": 0.00018288084804875626, + "loss": 12.6284, + "step": 7836 + }, + { + "epoch": 0.4267559712210628, + "grad_norm": 0.6685516297944075, + "learning_rate": 0.00018287591362539188, + "loss": 12.5296, + "step": 7837 + }, + { + "epoch": 0.42681042521764584, + "grad_norm": 0.8009985268061635, + "learning_rate": 0.0001828709785575705, + "loss": 12.407, + "step": 7838 + }, + { + "epoch": 0.42686487921422883, + "grad_norm": 0.6821136473506321, + "learning_rate": 0.00018286604284533045, + "loss": 12.5061, + "step": 7839 + }, + { + "epoch": 0.4269193332108118, + "grad_norm": 0.7352034903137306, + "learning_rate": 0.0001828611064887102, + "loss": 12.7067, + "step": 7840 + }, + { + "epoch": 0.4269737872073949, + "grad_norm": 0.6683441393984274, + "learning_rate": 0.00018285616948774807, + "loss": 12.5754, + "step": 7841 + }, + { + "epoch": 0.42702824120397787, + "grad_norm": 0.6816842155196888, + "learning_rate": 0.00018285123184248243, + "loss": 12.338, + "step": 7842 + }, + { + "epoch": 0.42708269520056086, + "grad_norm": 0.6624536746483938, + "learning_rate": 0.00018284629355295174, + "loss": 12.5175, + "step": 7843 + }, + { + "epoch": 0.4271371491971439, + "grad_norm": 0.7013469603718336, + "learning_rate": 0.0001828413546191944, + "loss": 12.5453, + "step": 7844 + }, + { + "epoch": 0.4271916031937269, + "grad_norm": 0.6046632729996528, + "learning_rate": 0.0001828364150412488, + "loss": 12.2746, + "step": 7845 + }, + { + "epoch": 0.4272460571903099, + "grad_norm": 0.625481111917555, + "learning_rate": 0.00018283147481915334, + "loss": 12.4797, + "step": 7846 + }, + { + "epoch": 0.42730051118689294, + "grad_norm": 0.6424818253362229, + "learning_rate": 0.00018282653395294642, + "loss": 12.5027, + "step": 7847 + }, + { + "epoch": 0.42735496518347593, + "grad_norm": 0.6601364261491262, + "learning_rate": 0.0001828215924426665, + "loss": 12.5228, + "step": 7848 + }, + { + "epoch": 0.4274094191800589, + "grad_norm": 0.6606556794480325, + "learning_rate": 0.00018281665028835198, + "loss": 12.351, + "step": 7849 + }, + { + "epoch": 0.427463873176642, + "grad_norm": 0.7391279774495332, + "learning_rate": 0.0001828117074900413, + "loss": 12.6615, + "step": 7850 + }, + { + "epoch": 0.42751832717322497, + "grad_norm": 0.666721835432075, + "learning_rate": 0.0001828067640477729, + "loss": 12.4141, + "step": 7851 + }, + { + "epoch": 0.42757278116980796, + "grad_norm": 0.6556499417303358, + "learning_rate": 0.00018280181996158522, + "loss": 12.5608, + "step": 7852 + }, + { + "epoch": 0.427627235166391, + "grad_norm": 0.7136819769413728, + "learning_rate": 0.0001827968752315167, + "loss": 12.6186, + "step": 7853 + }, + { + "epoch": 0.427681689162974, + "grad_norm": 0.6494044088806241, + "learning_rate": 0.00018279192985760578, + "loss": 12.4636, + "step": 7854 + }, + { + "epoch": 0.427736143159557, + "grad_norm": 0.6681216329881955, + "learning_rate": 0.00018278698383989092, + "loss": 12.5388, + "step": 7855 + }, + { + "epoch": 0.42779059715614004, + "grad_norm": 0.6351791202571165, + "learning_rate": 0.00018278203717841062, + "loss": 12.3689, + "step": 7856 + }, + { + "epoch": 0.42784505115272303, + "grad_norm": 0.7121964039171422, + "learning_rate": 0.0001827770898732033, + "loss": 12.475, + "step": 7857 + }, + { + "epoch": 0.427899505149306, + "grad_norm": 0.7322361257100048, + "learning_rate": 0.00018277214192430745, + "loss": 12.3005, + "step": 7858 + }, + { + "epoch": 0.4279539591458891, + "grad_norm": 0.7110469435756508, + "learning_rate": 0.00018276719333176154, + "loss": 12.5725, + "step": 7859 + }, + { + "epoch": 0.42800841314247207, + "grad_norm": 0.7114305715474563, + "learning_rate": 0.00018276224409560405, + "loss": 12.3835, + "step": 7860 + }, + { + "epoch": 0.4280628671390551, + "grad_norm": 0.6554373743972242, + "learning_rate": 0.00018275729421587348, + "loss": 12.4473, + "step": 7861 + }, + { + "epoch": 0.4281173211356381, + "grad_norm": 0.614966738063642, + "learning_rate": 0.0001827523436926083, + "loss": 12.5055, + "step": 7862 + }, + { + "epoch": 0.4281717751322211, + "grad_norm": 0.7763242654060049, + "learning_rate": 0.00018274739252584706, + "loss": 12.589, + "step": 7863 + }, + { + "epoch": 0.42822622912880415, + "grad_norm": 0.595923750032374, + "learning_rate": 0.00018274244071562817, + "loss": 12.4712, + "step": 7864 + }, + { + "epoch": 0.42828068312538714, + "grad_norm": 0.66789073012322, + "learning_rate": 0.0001827374882619902, + "loss": 12.5092, + "step": 7865 + }, + { + "epoch": 0.42833513712197013, + "grad_norm": 0.6993718554608768, + "learning_rate": 0.00018273253516497168, + "loss": 12.2774, + "step": 7866 + }, + { + "epoch": 0.4283895911185532, + "grad_norm": 0.6853933435979174, + "learning_rate": 0.00018272758142461103, + "loss": 12.4401, + "step": 7867 + }, + { + "epoch": 0.4284440451151362, + "grad_norm": 0.6046127331466988, + "learning_rate": 0.00018272262704094688, + "loss": 12.4843, + "step": 7868 + }, + { + "epoch": 0.42849849911171917, + "grad_norm": 0.7031626455104183, + "learning_rate": 0.0001827176720140177, + "loss": 12.4454, + "step": 7869 + }, + { + "epoch": 0.4285529531083022, + "grad_norm": 0.6102750691205855, + "learning_rate": 0.000182712716343862, + "loss": 12.2383, + "step": 7870 + }, + { + "epoch": 0.4286074071048852, + "grad_norm": 0.6430406643586948, + "learning_rate": 0.0001827077600305184, + "loss": 12.3706, + "step": 7871 + }, + { + "epoch": 0.4286618611014682, + "grad_norm": 0.7153973691384076, + "learning_rate": 0.00018270280307402533, + "loss": 12.4636, + "step": 7872 + }, + { + "epoch": 0.42871631509805125, + "grad_norm": 0.6111169539665425, + "learning_rate": 0.00018269784547442143, + "loss": 12.5269, + "step": 7873 + }, + { + "epoch": 0.42877076909463424, + "grad_norm": 0.6605820807457244, + "learning_rate": 0.00018269288723174522, + "loss": 12.5237, + "step": 7874 + }, + { + "epoch": 0.42882522309121723, + "grad_norm": 0.7294481892645149, + "learning_rate": 0.0001826879283460352, + "loss": 12.3791, + "step": 7875 + }, + { + "epoch": 0.4288796770878003, + "grad_norm": 0.7080461100945729, + "learning_rate": 0.00018268296881733005, + "loss": 12.378, + "step": 7876 + }, + { + "epoch": 0.4289341310843833, + "grad_norm": 0.6910783653995367, + "learning_rate": 0.00018267800864566825, + "loss": 12.4422, + "step": 7877 + }, + { + "epoch": 0.42898858508096627, + "grad_norm": 0.6612434872957814, + "learning_rate": 0.00018267304783108838, + "loss": 12.3455, + "step": 7878 + }, + { + "epoch": 0.4290430390775493, + "grad_norm": 0.6711938850299602, + "learning_rate": 0.000182668086373629, + "loss": 12.3548, + "step": 7879 + }, + { + "epoch": 0.4290974930741323, + "grad_norm": 0.626907677513012, + "learning_rate": 0.00018266312427332878, + "loss": 12.5163, + "step": 7880 + }, + { + "epoch": 0.4291519470707153, + "grad_norm": 0.6803065095321489, + "learning_rate": 0.0001826581615302262, + "loss": 12.3781, + "step": 7881 + }, + { + "epoch": 0.42920640106729835, + "grad_norm": 0.7196134342169394, + "learning_rate": 0.00018265319814435988, + "loss": 12.4378, + "step": 7882 + }, + { + "epoch": 0.42926085506388134, + "grad_norm": 0.6925727268427583, + "learning_rate": 0.00018264823411576846, + "loss": 12.599, + "step": 7883 + }, + { + "epoch": 0.42931530906046433, + "grad_norm": 0.6023802851561011, + "learning_rate": 0.00018264326944449048, + "loss": 12.4611, + "step": 7884 + }, + { + "epoch": 0.4293697630570474, + "grad_norm": 0.7294334658982401, + "learning_rate": 0.0001826383041305646, + "loss": 12.4378, + "step": 7885 + }, + { + "epoch": 0.4294242170536304, + "grad_norm": 0.6625800290992043, + "learning_rate": 0.0001826333381740294, + "loss": 12.4643, + "step": 7886 + }, + { + "epoch": 0.42947867105021337, + "grad_norm": 0.7471763593863193, + "learning_rate": 0.00018262837157492353, + "loss": 12.4879, + "step": 7887 + }, + { + "epoch": 0.4295331250467964, + "grad_norm": 0.6568300177128303, + "learning_rate": 0.00018262340433328558, + "loss": 12.3321, + "step": 7888 + }, + { + "epoch": 0.4295875790433794, + "grad_norm": 0.7096946912181274, + "learning_rate": 0.00018261843644915417, + "loss": 12.5937, + "step": 7889 + }, + { + "epoch": 0.4296420330399624, + "grad_norm": 0.6339033309753559, + "learning_rate": 0.00018261346792256794, + "loss": 12.5053, + "step": 7890 + }, + { + "epoch": 0.42969648703654545, + "grad_norm": 0.6136133084916588, + "learning_rate": 0.00018260849875356553, + "loss": 12.4255, + "step": 7891 + }, + { + "epoch": 0.42975094103312844, + "grad_norm": 0.5945688377929977, + "learning_rate": 0.0001826035289421856, + "loss": 12.4445, + "step": 7892 + }, + { + "epoch": 0.42980539502971143, + "grad_norm": 0.7462867216348839, + "learning_rate": 0.00018259855848846675, + "loss": 12.4613, + "step": 7893 + }, + { + "epoch": 0.4298598490262945, + "grad_norm": 0.6569392321802703, + "learning_rate": 0.00018259358739244766, + "loss": 12.5073, + "step": 7894 + }, + { + "epoch": 0.4299143030228775, + "grad_norm": 0.5983744188237977, + "learning_rate": 0.00018258861565416702, + "loss": 12.4993, + "step": 7895 + }, + { + "epoch": 0.4299687570194605, + "grad_norm": 0.7053622283322101, + "learning_rate": 0.0001825836432736634, + "loss": 12.3607, + "step": 7896 + }, + { + "epoch": 0.4300232110160435, + "grad_norm": 0.7054846914578173, + "learning_rate": 0.00018257867025097554, + "loss": 12.5458, + "step": 7897 + }, + { + "epoch": 0.4300776650126265, + "grad_norm": 0.6949317721624084, + "learning_rate": 0.00018257369658614212, + "loss": 12.4957, + "step": 7898 + }, + { + "epoch": 0.43013211900920956, + "grad_norm": 0.6836307012239016, + "learning_rate": 0.00018256872227920173, + "loss": 12.479, + "step": 7899 + }, + { + "epoch": 0.43018657300579255, + "grad_norm": 0.7228802923883042, + "learning_rate": 0.00018256374733019315, + "loss": 12.3583, + "step": 7900 + }, + { + "epoch": 0.43024102700237554, + "grad_norm": 0.9284099241529163, + "learning_rate": 0.00018255877173915504, + "loss": 12.5031, + "step": 7901 + }, + { + "epoch": 0.4302954809989586, + "grad_norm": 0.6694914115117954, + "learning_rate": 0.00018255379550612605, + "loss": 12.4756, + "step": 7902 + }, + { + "epoch": 0.4303499349955416, + "grad_norm": 0.6478362302886246, + "learning_rate": 0.0001825488186311449, + "loss": 12.558, + "step": 7903 + }, + { + "epoch": 0.4304043889921246, + "grad_norm": 0.6546490002593253, + "learning_rate": 0.0001825438411142503, + "loss": 12.4417, + "step": 7904 + }, + { + "epoch": 0.4304588429887076, + "grad_norm": 0.7087779462299327, + "learning_rate": 0.00018253886295548094, + "loss": 12.5224, + "step": 7905 + }, + { + "epoch": 0.4305132969852906, + "grad_norm": 0.6993477111814185, + "learning_rate": 0.00018253388415487556, + "loss": 12.5954, + "step": 7906 + }, + { + "epoch": 0.4305677509818736, + "grad_norm": 0.6535366877281594, + "learning_rate": 0.00018252890471247285, + "loss": 12.3998, + "step": 7907 + }, + { + "epoch": 0.43062220497845666, + "grad_norm": 0.6509191262966944, + "learning_rate": 0.00018252392462831153, + "loss": 12.3903, + "step": 7908 + }, + { + "epoch": 0.43067665897503965, + "grad_norm": 0.7174800548505674, + "learning_rate": 0.00018251894390243031, + "loss": 12.4249, + "step": 7909 + }, + { + "epoch": 0.43073111297162264, + "grad_norm": 0.7117353466917853, + "learning_rate": 0.00018251396253486798, + "loss": 12.5, + "step": 7910 + }, + { + "epoch": 0.4307855669682057, + "grad_norm": 0.5842736337850675, + "learning_rate": 0.00018250898052566322, + "loss": 12.3333, + "step": 7911 + }, + { + "epoch": 0.4308400209647887, + "grad_norm": 0.626341575150017, + "learning_rate": 0.0001825039978748548, + "loss": 12.4355, + "step": 7912 + }, + { + "epoch": 0.4308944749613717, + "grad_norm": 0.6902559185033911, + "learning_rate": 0.00018249901458248146, + "loss": 12.5578, + "step": 7913 + }, + { + "epoch": 0.4309489289579547, + "grad_norm": 0.6996091778217305, + "learning_rate": 0.00018249403064858193, + "loss": 12.4151, + "step": 7914 + }, + { + "epoch": 0.4310033829545377, + "grad_norm": 0.6589561733097002, + "learning_rate": 0.000182489046073195, + "loss": 12.5248, + "step": 7915 + }, + { + "epoch": 0.4310578369511207, + "grad_norm": 0.6274633751809366, + "learning_rate": 0.00018248406085635943, + "loss": 12.3801, + "step": 7916 + }, + { + "epoch": 0.43111229094770376, + "grad_norm": 0.7625790300943449, + "learning_rate": 0.00018247907499811393, + "loss": 12.5618, + "step": 7917 + }, + { + "epoch": 0.43116674494428675, + "grad_norm": 0.6880714481481783, + "learning_rate": 0.00018247408849849734, + "loss": 12.322, + "step": 7918 + }, + { + "epoch": 0.43122119894086974, + "grad_norm": 0.7149825408684415, + "learning_rate": 0.0001824691013575484, + "loss": 12.551, + "step": 7919 + }, + { + "epoch": 0.4312756529374528, + "grad_norm": 0.6958054940177517, + "learning_rate": 0.0001824641135753059, + "loss": 12.5937, + "step": 7920 + }, + { + "epoch": 0.4313301069340358, + "grad_norm": 0.6496322803509679, + "learning_rate": 0.00018245912515180862, + "loss": 12.4297, + "step": 7921 + }, + { + "epoch": 0.4313845609306188, + "grad_norm": 0.7743435913615766, + "learning_rate": 0.00018245413608709534, + "loss": 12.3787, + "step": 7922 + }, + { + "epoch": 0.4314390149272018, + "grad_norm": 0.6499155579167446, + "learning_rate": 0.0001824491463812049, + "loss": 12.4618, + "step": 7923 + }, + { + "epoch": 0.4314934689237848, + "grad_norm": 0.6323364011027064, + "learning_rate": 0.00018244415603417603, + "loss": 12.5198, + "step": 7924 + }, + { + "epoch": 0.4315479229203678, + "grad_norm": 0.6275726046028748, + "learning_rate": 0.0001824391650460476, + "loss": 12.4412, + "step": 7925 + }, + { + "epoch": 0.43160237691695086, + "grad_norm": 0.6563516386645423, + "learning_rate": 0.0001824341734168584, + "loss": 12.3843, + "step": 7926 + }, + { + "epoch": 0.43165683091353385, + "grad_norm": 0.5944999411676104, + "learning_rate": 0.00018242918114664725, + "loss": 12.4227, + "step": 7927 + }, + { + "epoch": 0.4317112849101169, + "grad_norm": 0.6126075950113996, + "learning_rate": 0.00018242418823545298, + "loss": 12.5283, + "step": 7928 + }, + { + "epoch": 0.4317657389066999, + "grad_norm": 0.591985788952654, + "learning_rate": 0.00018241919468331435, + "loss": 12.4677, + "step": 7929 + }, + { + "epoch": 0.4318201929032829, + "grad_norm": 0.6383852269541036, + "learning_rate": 0.00018241420049027027, + "loss": 12.474, + "step": 7930 + }, + { + "epoch": 0.43187464689986593, + "grad_norm": 0.5852504651876139, + "learning_rate": 0.00018240920565635956, + "loss": 12.3616, + "step": 7931 + }, + { + "epoch": 0.4319291008964489, + "grad_norm": 0.6130822749181293, + "learning_rate": 0.00018240421018162102, + "loss": 12.4868, + "step": 7932 + }, + { + "epoch": 0.4319835548930319, + "grad_norm": 0.6200945418246628, + "learning_rate": 0.00018239921406609352, + "loss": 12.3153, + "step": 7933 + }, + { + "epoch": 0.43203800888961497, + "grad_norm": 0.5977586740609638, + "learning_rate": 0.00018239421730981595, + "loss": 12.3454, + "step": 7934 + }, + { + "epoch": 0.43209246288619796, + "grad_norm": 0.7620159440437332, + "learning_rate": 0.00018238921991282708, + "loss": 12.5599, + "step": 7935 + }, + { + "epoch": 0.43214691688278095, + "grad_norm": 0.6388958721890771, + "learning_rate": 0.00018238422187516587, + "loss": 12.4949, + "step": 7936 + }, + { + "epoch": 0.432201370879364, + "grad_norm": 0.6201417753511387, + "learning_rate": 0.0001823792231968711, + "loss": 12.4605, + "step": 7937 + }, + { + "epoch": 0.432255824875947, + "grad_norm": 0.7082720364920682, + "learning_rate": 0.00018237422387798168, + "loss": 12.5924, + "step": 7938 + }, + { + "epoch": 0.43231027887253, + "grad_norm": 0.6764522855907903, + "learning_rate": 0.00018236922391853648, + "loss": 12.5394, + "step": 7939 + }, + { + "epoch": 0.43236473286911303, + "grad_norm": 0.6027504108616288, + "learning_rate": 0.00018236422331857437, + "loss": 12.3855, + "step": 7940 + }, + { + "epoch": 0.432419186865696, + "grad_norm": 0.7383961195275949, + "learning_rate": 0.00018235922207813428, + "loss": 12.6475, + "step": 7941 + }, + { + "epoch": 0.432473640862279, + "grad_norm": 0.6480989207450246, + "learning_rate": 0.00018235422019725507, + "loss": 12.4974, + "step": 7942 + }, + { + "epoch": 0.43252809485886207, + "grad_norm": 0.7626322657347785, + "learning_rate": 0.00018234921767597558, + "loss": 12.458, + "step": 7943 + }, + { + "epoch": 0.43258254885544506, + "grad_norm": 0.7024604934391259, + "learning_rate": 0.00018234421451433482, + "loss": 12.474, + "step": 7944 + }, + { + "epoch": 0.43263700285202805, + "grad_norm": 0.6236580831047059, + "learning_rate": 0.0001823392107123716, + "loss": 12.3832, + "step": 7945 + }, + { + "epoch": 0.4326914568486111, + "grad_norm": 0.7460718232888852, + "learning_rate": 0.0001823342062701249, + "loss": 12.639, + "step": 7946 + }, + { + "epoch": 0.4327459108451941, + "grad_norm": 0.6447139498024131, + "learning_rate": 0.00018232920118763356, + "loss": 12.5126, + "step": 7947 + }, + { + "epoch": 0.4328003648417771, + "grad_norm": 0.6928880261448563, + "learning_rate": 0.00018232419546493657, + "loss": 12.6822, + "step": 7948 + }, + { + "epoch": 0.43285481883836013, + "grad_norm": 0.625071578737955, + "learning_rate": 0.00018231918910207283, + "loss": 12.3265, + "step": 7949 + }, + { + "epoch": 0.4329092728349431, + "grad_norm": 0.5987110493663569, + "learning_rate": 0.0001823141820990813, + "loss": 12.5354, + "step": 7950 + }, + { + "epoch": 0.4329637268315261, + "grad_norm": 0.620178423324587, + "learning_rate": 0.00018230917445600085, + "loss": 12.4393, + "step": 7951 + }, + { + "epoch": 0.43301818082810917, + "grad_norm": 0.6307608473905716, + "learning_rate": 0.00018230416617287046, + "loss": 12.4671, + "step": 7952 + }, + { + "epoch": 0.43307263482469216, + "grad_norm": 0.7122894827254327, + "learning_rate": 0.0001822991572497291, + "loss": 12.4647, + "step": 7953 + }, + { + "epoch": 0.43312708882127515, + "grad_norm": 0.5671356599600159, + "learning_rate": 0.00018229414768661565, + "loss": 12.3425, + "step": 7954 + }, + { + "epoch": 0.4331815428178582, + "grad_norm": 0.6333834878199174, + "learning_rate": 0.00018228913748356913, + "loss": 12.4106, + "step": 7955 + }, + { + "epoch": 0.4332359968144412, + "grad_norm": 0.6650636281838411, + "learning_rate": 0.0001822841266406285, + "loss": 12.4607, + "step": 7956 + }, + { + "epoch": 0.4332904508110242, + "grad_norm": 0.585052172740277, + "learning_rate": 0.00018227911515783266, + "loss": 12.4691, + "step": 7957 + }, + { + "epoch": 0.43334490480760723, + "grad_norm": 0.6751958569542258, + "learning_rate": 0.00018227410303522064, + "loss": 12.3421, + "step": 7958 + }, + { + "epoch": 0.4333993588041902, + "grad_norm": 0.6306906606932072, + "learning_rate": 0.0001822690902728314, + "loss": 12.306, + "step": 7959 + }, + { + "epoch": 0.4334538128007732, + "grad_norm": 0.6095238580428013, + "learning_rate": 0.0001822640768707039, + "loss": 12.4476, + "step": 7960 + }, + { + "epoch": 0.43350826679735627, + "grad_norm": 0.6542022762270828, + "learning_rate": 0.00018225906282887718, + "loss": 12.4173, + "step": 7961 + }, + { + "epoch": 0.43356272079393926, + "grad_norm": 0.6344690057285265, + "learning_rate": 0.00018225404814739012, + "loss": 12.4391, + "step": 7962 + }, + { + "epoch": 0.4336171747905223, + "grad_norm": 0.7428938770750612, + "learning_rate": 0.00018224903282628186, + "loss": 12.4032, + "step": 7963 + }, + { + "epoch": 0.4336716287871053, + "grad_norm": 0.6437215141879193, + "learning_rate": 0.0001822440168655913, + "loss": 12.4873, + "step": 7964 + }, + { + "epoch": 0.4337260827836883, + "grad_norm": 0.6611723911944084, + "learning_rate": 0.00018223900026535748, + "loss": 12.4769, + "step": 7965 + }, + { + "epoch": 0.43378053678027134, + "grad_norm": 0.6673601417317901, + "learning_rate": 0.00018223398302561942, + "loss": 12.4524, + "step": 7966 + }, + { + "epoch": 0.43383499077685433, + "grad_norm": 0.7049890622715992, + "learning_rate": 0.0001822289651464161, + "loss": 12.4558, + "step": 7967 + }, + { + "epoch": 0.4338894447734373, + "grad_norm": 0.6010186771172481, + "learning_rate": 0.0001822239466277865, + "loss": 12.5184, + "step": 7968 + }, + { + "epoch": 0.4339438987700204, + "grad_norm": 0.6782184525687398, + "learning_rate": 0.0001822189274697698, + "loss": 12.5598, + "step": 7969 + }, + { + "epoch": 0.43399835276660337, + "grad_norm": 0.7535593508707078, + "learning_rate": 0.0001822139076724049, + "loss": 12.4382, + "step": 7970 + }, + { + "epoch": 0.43405280676318636, + "grad_norm": 0.7476354959202245, + "learning_rate": 0.00018220888723573082, + "loss": 12.3689, + "step": 7971 + }, + { + "epoch": 0.4341072607597694, + "grad_norm": 0.6468310309468458, + "learning_rate": 0.0001822038661597867, + "loss": 12.4683, + "step": 7972 + }, + { + "epoch": 0.4341617147563524, + "grad_norm": 0.6425677167614992, + "learning_rate": 0.0001821988444446115, + "loss": 12.5546, + "step": 7973 + }, + { + "epoch": 0.4342161687529354, + "grad_norm": 0.6616730114954977, + "learning_rate": 0.0001821938220902443, + "loss": 12.5815, + "step": 7974 + }, + { + "epoch": 0.43427062274951844, + "grad_norm": 0.6706196272119236, + "learning_rate": 0.0001821887990967242, + "loss": 12.4704, + "step": 7975 + }, + { + "epoch": 0.43432507674610143, + "grad_norm": 0.7037258987322563, + "learning_rate": 0.00018218377546409017, + "loss": 12.5475, + "step": 7976 + }, + { + "epoch": 0.4343795307426844, + "grad_norm": 0.6489047278433996, + "learning_rate": 0.00018217875119238132, + "loss": 12.3292, + "step": 7977 + }, + { + "epoch": 0.4344339847392675, + "grad_norm": 0.6616035776812114, + "learning_rate": 0.00018217372628163674, + "loss": 12.2886, + "step": 7978 + }, + { + "epoch": 0.43448843873585047, + "grad_norm": 0.7245041333919243, + "learning_rate": 0.00018216870073189546, + "loss": 12.3057, + "step": 7979 + }, + { + "epoch": 0.43454289273243346, + "grad_norm": 0.6823812931676292, + "learning_rate": 0.00018216367454319663, + "loss": 12.3488, + "step": 7980 + }, + { + "epoch": 0.4345973467290165, + "grad_norm": 0.6928454505678266, + "learning_rate": 0.00018215864771557925, + "loss": 12.4474, + "step": 7981 + }, + { + "epoch": 0.4346518007255995, + "grad_norm": 0.6692762111560462, + "learning_rate": 0.00018215362024908246, + "loss": 12.3722, + "step": 7982 + }, + { + "epoch": 0.4347062547221825, + "grad_norm": 0.7217401785786631, + "learning_rate": 0.0001821485921437453, + "loss": 12.5339, + "step": 7983 + }, + { + "epoch": 0.43476070871876554, + "grad_norm": 0.7266825040044337, + "learning_rate": 0.00018214356339960698, + "loss": 12.3977, + "step": 7984 + }, + { + "epoch": 0.43481516271534854, + "grad_norm": 0.6826875691189774, + "learning_rate": 0.0001821385340167065, + "loss": 12.4072, + "step": 7985 + }, + { + "epoch": 0.43486961671193153, + "grad_norm": 0.7064241066509669, + "learning_rate": 0.000182133503995083, + "loss": 12.4456, + "step": 7986 + }, + { + "epoch": 0.4349240707085146, + "grad_norm": 0.6175230417065565, + "learning_rate": 0.0001821284733347756, + "loss": 12.4795, + "step": 7987 + }, + { + "epoch": 0.43497852470509757, + "grad_norm": 0.6079407316866163, + "learning_rate": 0.00018212344203582344, + "loss": 12.3473, + "step": 7988 + }, + { + "epoch": 0.43503297870168056, + "grad_norm": 0.7163691665704485, + "learning_rate": 0.0001821184100982656, + "loss": 12.3703, + "step": 7989 + }, + { + "epoch": 0.4350874326982636, + "grad_norm": 0.6258311004616853, + "learning_rate": 0.00018211337752214123, + "loss": 12.3942, + "step": 7990 + }, + { + "epoch": 0.4351418866948466, + "grad_norm": 0.671124784124222, + "learning_rate": 0.0001821083443074895, + "loss": 12.4334, + "step": 7991 + }, + { + "epoch": 0.4351963406914296, + "grad_norm": 0.6216233607643935, + "learning_rate": 0.00018210331045434949, + "loss": 12.4195, + "step": 7992 + }, + { + "epoch": 0.43525079468801264, + "grad_norm": 0.742741508242918, + "learning_rate": 0.00018209827596276035, + "loss": 12.4239, + "step": 7993 + }, + { + "epoch": 0.43530524868459564, + "grad_norm": 0.6300698638468436, + "learning_rate": 0.00018209324083276126, + "loss": 12.3753, + "step": 7994 + }, + { + "epoch": 0.4353597026811787, + "grad_norm": 0.751409905949325, + "learning_rate": 0.00018208820506439137, + "loss": 12.529, + "step": 7995 + }, + { + "epoch": 0.4354141566777617, + "grad_norm": 0.6947725654359397, + "learning_rate": 0.00018208316865768985, + "loss": 12.5276, + "step": 7996 + }, + { + "epoch": 0.43546861067434467, + "grad_norm": 0.9447376665430275, + "learning_rate": 0.00018207813161269583, + "loss": 12.4355, + "step": 7997 + }, + { + "epoch": 0.4355230646709277, + "grad_norm": 0.6723041201443456, + "learning_rate": 0.00018207309392944846, + "loss": 12.4632, + "step": 7998 + }, + { + "epoch": 0.4355775186675107, + "grad_norm": 0.7976902368051231, + "learning_rate": 0.000182068055607987, + "loss": 12.6122, + "step": 7999 + }, + { + "epoch": 0.4356319726640937, + "grad_norm": 0.7345131489832951, + "learning_rate": 0.00018206301664835058, + "loss": 12.5908, + "step": 8000 + }, + { + "epoch": 0.43568642666067675, + "grad_norm": 0.6454712970992138, + "learning_rate": 0.00018205797705057834, + "loss": 12.5204, + "step": 8001 + }, + { + "epoch": 0.43574088065725974, + "grad_norm": 0.6711649663357349, + "learning_rate": 0.00018205293681470953, + "loss": 12.5056, + "step": 8002 + }, + { + "epoch": 0.43579533465384274, + "grad_norm": 0.6728369086830224, + "learning_rate": 0.0001820478959407833, + "loss": 12.3627, + "step": 8003 + }, + { + "epoch": 0.4358497886504258, + "grad_norm": 0.6597979370043341, + "learning_rate": 0.0001820428544288389, + "loss": 12.3749, + "step": 8004 + }, + { + "epoch": 0.4359042426470088, + "grad_norm": 0.6209904486774009, + "learning_rate": 0.0001820378122789155, + "loss": 12.4336, + "step": 8005 + }, + { + "epoch": 0.43595869664359177, + "grad_norm": 0.7713948991466982, + "learning_rate": 0.00018203276949105233, + "loss": 12.4182, + "step": 8006 + }, + { + "epoch": 0.4360131506401748, + "grad_norm": 0.6469371586470584, + "learning_rate": 0.00018202772606528856, + "loss": 12.3034, + "step": 8007 + }, + { + "epoch": 0.4360676046367578, + "grad_norm": 0.6995538373363558, + "learning_rate": 0.00018202268200166342, + "loss": 12.4704, + "step": 8008 + }, + { + "epoch": 0.4361220586333408, + "grad_norm": 0.666288730336914, + "learning_rate": 0.00018201763730021618, + "loss": 12.4474, + "step": 8009 + }, + { + "epoch": 0.43617651262992385, + "grad_norm": 0.6211419327657228, + "learning_rate": 0.00018201259196098604, + "loss": 12.4149, + "step": 8010 + }, + { + "epoch": 0.43623096662650684, + "grad_norm": 0.628883970811886, + "learning_rate": 0.0001820075459840122, + "loss": 12.4724, + "step": 8011 + }, + { + "epoch": 0.43628542062308984, + "grad_norm": 0.5950168445018583, + "learning_rate": 0.00018200249936933398, + "loss": 12.4613, + "step": 8012 + }, + { + "epoch": 0.4363398746196729, + "grad_norm": 0.6312701310341498, + "learning_rate": 0.00018199745211699053, + "loss": 12.4223, + "step": 8013 + }, + { + "epoch": 0.4363943286162559, + "grad_norm": 0.6932151999657433, + "learning_rate": 0.00018199240422702117, + "loss": 12.4752, + "step": 8014 + }, + { + "epoch": 0.43644878261283887, + "grad_norm": 0.6294977807573285, + "learning_rate": 0.0001819873556994651, + "loss": 12.4771, + "step": 8015 + }, + { + "epoch": 0.4365032366094219, + "grad_norm": 0.6264943199859186, + "learning_rate": 0.0001819823065343616, + "loss": 12.484, + "step": 8016 + }, + { + "epoch": 0.4365576906060049, + "grad_norm": 0.6616444184130431, + "learning_rate": 0.00018197725673174998, + "loss": 12.5049, + "step": 8017 + }, + { + "epoch": 0.4366121446025879, + "grad_norm": 0.5744172061896488, + "learning_rate": 0.00018197220629166943, + "loss": 12.4327, + "step": 8018 + }, + { + "epoch": 0.43666659859917095, + "grad_norm": 0.6289380832372987, + "learning_rate": 0.00018196715521415926, + "loss": 12.3483, + "step": 8019 + }, + { + "epoch": 0.43672105259575394, + "grad_norm": 0.6403498509864031, + "learning_rate": 0.00018196210349925873, + "loss": 12.438, + "step": 8020 + }, + { + "epoch": 0.43677550659233694, + "grad_norm": 0.579175241208905, + "learning_rate": 0.00018195705114700713, + "loss": 12.3951, + "step": 8021 + }, + { + "epoch": 0.43682996058892, + "grad_norm": 0.6219905875288234, + "learning_rate": 0.00018195199815744382, + "loss": 12.4101, + "step": 8022 + }, + { + "epoch": 0.436884414585503, + "grad_norm": 0.5955832250530326, + "learning_rate": 0.000181946944530608, + "loss": 12.3245, + "step": 8023 + }, + { + "epoch": 0.43693886858208597, + "grad_norm": 0.6072932595615831, + "learning_rate": 0.00018194189026653898, + "loss": 12.4663, + "step": 8024 + }, + { + "epoch": 0.436993322578669, + "grad_norm": 0.7165004470076959, + "learning_rate": 0.00018193683536527609, + "loss": 12.4848, + "step": 8025 + }, + { + "epoch": 0.437047776575252, + "grad_norm": 0.6513696149725915, + "learning_rate": 0.00018193177982685858, + "loss": 12.4224, + "step": 8026 + }, + { + "epoch": 0.437102230571835, + "grad_norm": 0.7428711180542187, + "learning_rate": 0.00018192672365132588, + "loss": 12.5178, + "step": 8027 + }, + { + "epoch": 0.43715668456841805, + "grad_norm": 0.6427803787757064, + "learning_rate": 0.0001819216668387172, + "loss": 12.4145, + "step": 8028 + }, + { + "epoch": 0.43721113856500105, + "grad_norm": 0.6207320706711867, + "learning_rate": 0.00018191660938907193, + "loss": 12.4405, + "step": 8029 + }, + { + "epoch": 0.4372655925615841, + "grad_norm": 0.6852852916241587, + "learning_rate": 0.00018191155130242934, + "loss": 12.4623, + "step": 8030 + }, + { + "epoch": 0.4373200465581671, + "grad_norm": 0.6840507058001489, + "learning_rate": 0.0001819064925788288, + "loss": 12.4987, + "step": 8031 + }, + { + "epoch": 0.4373745005547501, + "grad_norm": 0.6140790978364469, + "learning_rate": 0.00018190143321830963, + "loss": 12.373, + "step": 8032 + }, + { + "epoch": 0.4374289545513331, + "grad_norm": 0.6775222151229636, + "learning_rate": 0.0001818963732209112, + "loss": 12.5089, + "step": 8033 + }, + { + "epoch": 0.4374834085479161, + "grad_norm": 0.590068771249146, + "learning_rate": 0.00018189131258667282, + "loss": 12.4183, + "step": 8034 + }, + { + "epoch": 0.4375378625444991, + "grad_norm": 0.6183153379587679, + "learning_rate": 0.00018188625131563386, + "loss": 12.4238, + "step": 8035 + }, + { + "epoch": 0.43759231654108216, + "grad_norm": 0.7422525928327838, + "learning_rate": 0.00018188118940783368, + "loss": 12.5106, + "step": 8036 + }, + { + "epoch": 0.43764677053766515, + "grad_norm": 0.6068305483295992, + "learning_rate": 0.00018187612686331166, + "loss": 12.4831, + "step": 8037 + }, + { + "epoch": 0.43770122453424815, + "grad_norm": 0.6158291631235349, + "learning_rate": 0.00018187106368210712, + "loss": 12.4306, + "step": 8038 + }, + { + "epoch": 0.4377556785308312, + "grad_norm": 0.6327278698588021, + "learning_rate": 0.00018186599986425947, + "loss": 12.4772, + "step": 8039 + }, + { + "epoch": 0.4378101325274142, + "grad_norm": 0.691860427152302, + "learning_rate": 0.00018186093540980807, + "loss": 12.5121, + "step": 8040 + }, + { + "epoch": 0.4378645865239972, + "grad_norm": 0.6848902755048658, + "learning_rate": 0.00018185587031879232, + "loss": 12.3507, + "step": 8041 + }, + { + "epoch": 0.4379190405205802, + "grad_norm": 0.6024814082483306, + "learning_rate": 0.0001818508045912516, + "loss": 12.4528, + "step": 8042 + }, + { + "epoch": 0.4379734945171632, + "grad_norm": 0.6652691577868063, + "learning_rate": 0.00018184573822722529, + "loss": 12.3157, + "step": 8043 + }, + { + "epoch": 0.4380279485137462, + "grad_norm": 0.6569694406846195, + "learning_rate": 0.00018184067122675276, + "loss": 12.459, + "step": 8044 + }, + { + "epoch": 0.43808240251032926, + "grad_norm": 0.6420980248319899, + "learning_rate": 0.00018183560358987349, + "loss": 12.4096, + "step": 8045 + }, + { + "epoch": 0.43813685650691225, + "grad_norm": 0.6627008318367901, + "learning_rate": 0.00018183053531662684, + "loss": 12.3847, + "step": 8046 + }, + { + "epoch": 0.43819131050349525, + "grad_norm": 0.7059088848618572, + "learning_rate": 0.0001818254664070522, + "loss": 12.5215, + "step": 8047 + }, + { + "epoch": 0.4382457645000783, + "grad_norm": 0.6854425080494145, + "learning_rate": 0.00018182039686118903, + "loss": 12.5269, + "step": 8048 + }, + { + "epoch": 0.4383002184966613, + "grad_norm": 0.6535417405105993, + "learning_rate": 0.00018181532667907671, + "loss": 12.5775, + "step": 8049 + }, + { + "epoch": 0.4383546724932443, + "grad_norm": 0.6389520951864824, + "learning_rate": 0.0001818102558607547, + "loss": 12.2756, + "step": 8050 + }, + { + "epoch": 0.4384091264898273, + "grad_norm": 0.6091592745791563, + "learning_rate": 0.00018180518440626245, + "loss": 12.5519, + "step": 8051 + }, + { + "epoch": 0.4384635804864103, + "grad_norm": 0.592034414821979, + "learning_rate": 0.00018180011231563935, + "loss": 12.3247, + "step": 8052 + }, + { + "epoch": 0.4385180344829933, + "grad_norm": 0.6822262235027153, + "learning_rate": 0.00018179503958892483, + "loss": 12.4094, + "step": 8053 + }, + { + "epoch": 0.43857248847957636, + "grad_norm": 0.6903442897493688, + "learning_rate": 0.0001817899662261584, + "loss": 12.4012, + "step": 8054 + }, + { + "epoch": 0.43862694247615935, + "grad_norm": 0.5746722202516459, + "learning_rate": 0.00018178489222737946, + "loss": 12.3903, + "step": 8055 + }, + { + "epoch": 0.43868139647274235, + "grad_norm": 0.7103112826336199, + "learning_rate": 0.00018177981759262747, + "loss": 12.4608, + "step": 8056 + }, + { + "epoch": 0.4387358504693254, + "grad_norm": 0.7133739481466947, + "learning_rate": 0.00018177474232194195, + "loss": 12.3867, + "step": 8057 + }, + { + "epoch": 0.4387903044659084, + "grad_norm": 0.6869948568749044, + "learning_rate": 0.0001817696664153623, + "loss": 12.4986, + "step": 8058 + }, + { + "epoch": 0.4388447584624914, + "grad_norm": 0.67170120481675, + "learning_rate": 0.00018176458987292799, + "loss": 12.4175, + "step": 8059 + }, + { + "epoch": 0.43889921245907443, + "grad_norm": 0.7517173895741974, + "learning_rate": 0.00018175951269467855, + "loss": 12.4281, + "step": 8060 + }, + { + "epoch": 0.4389536664556574, + "grad_norm": 0.7072112142379776, + "learning_rate": 0.0001817544348806534, + "loss": 12.4413, + "step": 8061 + }, + { + "epoch": 0.43900812045224047, + "grad_norm": 0.6774523087574283, + "learning_rate": 0.00018174935643089203, + "loss": 12.4525, + "step": 8062 + }, + { + "epoch": 0.43906257444882346, + "grad_norm": 0.6340529043477183, + "learning_rate": 0.000181744277345434, + "loss": 12.3797, + "step": 8063 + }, + { + "epoch": 0.43911702844540645, + "grad_norm": 0.718405082762463, + "learning_rate": 0.00018173919762431875, + "loss": 12.5314, + "step": 8064 + }, + { + "epoch": 0.4391714824419895, + "grad_norm": 0.6952690412740806, + "learning_rate": 0.0001817341172675858, + "loss": 12.492, + "step": 8065 + }, + { + "epoch": 0.4392259364385725, + "grad_norm": 0.584979726754951, + "learning_rate": 0.00018172903627527463, + "loss": 12.378, + "step": 8066 + }, + { + "epoch": 0.4392803904351555, + "grad_norm": 0.7046022953559745, + "learning_rate": 0.00018172395464742479, + "loss": 12.5153, + "step": 8067 + }, + { + "epoch": 0.43933484443173854, + "grad_norm": 0.6895392327708775, + "learning_rate": 0.00018171887238407574, + "loss": 12.4722, + "step": 8068 + }, + { + "epoch": 0.43938929842832153, + "grad_norm": 0.593507928029048, + "learning_rate": 0.00018171378948526705, + "loss": 12.5153, + "step": 8069 + }, + { + "epoch": 0.4394437524249045, + "grad_norm": 0.6536079851636544, + "learning_rate": 0.00018170870595103823, + "loss": 12.5043, + "step": 8070 + }, + { + "epoch": 0.43949820642148757, + "grad_norm": 0.7102749722944064, + "learning_rate": 0.0001817036217814288, + "loss": 12.5844, + "step": 8071 + }, + { + "epoch": 0.43955266041807056, + "grad_norm": 0.6250358275118798, + "learning_rate": 0.0001816985369764783, + "loss": 12.4398, + "step": 8072 + }, + { + "epoch": 0.43960711441465355, + "grad_norm": 0.6833189631616499, + "learning_rate": 0.00018169345153622632, + "loss": 12.5777, + "step": 8073 + }, + { + "epoch": 0.4396615684112366, + "grad_norm": 0.6326543135044489, + "learning_rate": 0.00018168836546071234, + "loss": 12.5301, + "step": 8074 + }, + { + "epoch": 0.4397160224078196, + "grad_norm": 0.6599203053997212, + "learning_rate": 0.00018168327874997593, + "loss": 12.4672, + "step": 8075 + }, + { + "epoch": 0.4397704764044026, + "grad_norm": 0.67676831770541, + "learning_rate": 0.00018167819140405662, + "loss": 12.5454, + "step": 8076 + }, + { + "epoch": 0.43982493040098564, + "grad_norm": 0.6171632602120455, + "learning_rate": 0.00018167310342299403, + "loss": 12.3727, + "step": 8077 + }, + { + "epoch": 0.43987938439756863, + "grad_norm": 0.8192129927985445, + "learning_rate": 0.00018166801480682768, + "loss": 12.5696, + "step": 8078 + }, + { + "epoch": 0.4399338383941516, + "grad_norm": 0.6841813706928609, + "learning_rate": 0.00018166292555559714, + "loss": 12.4212, + "step": 8079 + }, + { + "epoch": 0.43998829239073467, + "grad_norm": 0.6756316788448604, + "learning_rate": 0.000181657835669342, + "loss": 12.4523, + "step": 8080 + }, + { + "epoch": 0.44004274638731766, + "grad_norm": 0.7117498961542793, + "learning_rate": 0.00018165274514810186, + "loss": 12.4312, + "step": 8081 + }, + { + "epoch": 0.44009720038390066, + "grad_norm": 0.6798671037484373, + "learning_rate": 0.00018164765399191626, + "loss": 12.5288, + "step": 8082 + }, + { + "epoch": 0.4401516543804837, + "grad_norm": 0.6398340227930036, + "learning_rate": 0.0001816425622008248, + "loss": 12.3392, + "step": 8083 + }, + { + "epoch": 0.4402061083770667, + "grad_norm": 0.7515836639185821, + "learning_rate": 0.0001816374697748671, + "loss": 12.5069, + "step": 8084 + }, + { + "epoch": 0.4402605623736497, + "grad_norm": 0.6114018054945423, + "learning_rate": 0.00018163237671408276, + "loss": 12.3876, + "step": 8085 + }, + { + "epoch": 0.44031501637023274, + "grad_norm": 0.5820204184592364, + "learning_rate": 0.00018162728301851134, + "loss": 12.4475, + "step": 8086 + }, + { + "epoch": 0.44036947036681573, + "grad_norm": 0.7451614160847416, + "learning_rate": 0.0001816221886881925, + "loss": 12.4133, + "step": 8087 + }, + { + "epoch": 0.4404239243633987, + "grad_norm": 0.6376617665113111, + "learning_rate": 0.0001816170937231658, + "loss": 12.4494, + "step": 8088 + }, + { + "epoch": 0.44047837835998177, + "grad_norm": 0.6485801097204811, + "learning_rate": 0.00018161199812347093, + "loss": 12.5021, + "step": 8089 + }, + { + "epoch": 0.44053283235656476, + "grad_norm": 0.6536790997705955, + "learning_rate": 0.00018160690188914744, + "loss": 12.486, + "step": 8090 + }, + { + "epoch": 0.44058728635314776, + "grad_norm": 0.6973380046290791, + "learning_rate": 0.000181601805020235, + "loss": 12.4213, + "step": 8091 + }, + { + "epoch": 0.4406417403497308, + "grad_norm": 0.6153978517164419, + "learning_rate": 0.00018159670751677326, + "loss": 12.4879, + "step": 8092 + }, + { + "epoch": 0.4406961943463138, + "grad_norm": 0.7662004917264686, + "learning_rate": 0.00018159160937880183, + "loss": 12.5327, + "step": 8093 + }, + { + "epoch": 0.4407506483428968, + "grad_norm": 0.6315076862191303, + "learning_rate": 0.00018158651060636038, + "loss": 12.5029, + "step": 8094 + }, + { + "epoch": 0.44080510233947984, + "grad_norm": 0.6570484528322555, + "learning_rate": 0.00018158141119948854, + "loss": 12.5021, + "step": 8095 + }, + { + "epoch": 0.44085955633606283, + "grad_norm": 0.6217452499352483, + "learning_rate": 0.00018157631115822595, + "loss": 12.4358, + "step": 8096 + }, + { + "epoch": 0.4409140103326459, + "grad_norm": 0.6097425662470402, + "learning_rate": 0.0001815712104826123, + "loss": 12.4138, + "step": 8097 + }, + { + "epoch": 0.44096846432922887, + "grad_norm": 0.5895926618645863, + "learning_rate": 0.0001815661091726872, + "loss": 12.3695, + "step": 8098 + }, + { + "epoch": 0.44102291832581186, + "grad_norm": 0.6377967778191228, + "learning_rate": 0.00018156100722849038, + "loss": 12.5448, + "step": 8099 + }, + { + "epoch": 0.4410773723223949, + "grad_norm": 0.6401808345387747, + "learning_rate": 0.0001815559046500615, + "loss": 12.4537, + "step": 8100 + }, + { + "epoch": 0.4411318263189779, + "grad_norm": 0.6287248883361566, + "learning_rate": 0.0001815508014374402, + "loss": 12.5005, + "step": 8101 + }, + { + "epoch": 0.4411862803155609, + "grad_norm": 0.6944250053649565, + "learning_rate": 0.00018154569759066625, + "loss": 12.5913, + "step": 8102 + }, + { + "epoch": 0.44124073431214395, + "grad_norm": 0.627346440387178, + "learning_rate": 0.00018154059310977923, + "loss": 12.4249, + "step": 8103 + }, + { + "epoch": 0.44129518830872694, + "grad_norm": 0.6420492552682078, + "learning_rate": 0.00018153548799481887, + "loss": 12.3946, + "step": 8104 + }, + { + "epoch": 0.44134964230530993, + "grad_norm": 0.681906372443578, + "learning_rate": 0.00018153038224582492, + "loss": 12.3387, + "step": 8105 + }, + { + "epoch": 0.441404096301893, + "grad_norm": 0.6210574333231363, + "learning_rate": 0.00018152527586283703, + "loss": 12.5162, + "step": 8106 + }, + { + "epoch": 0.44145855029847597, + "grad_norm": 0.64322418419475, + "learning_rate": 0.0001815201688458949, + "loss": 12.5622, + "step": 8107 + }, + { + "epoch": 0.44151300429505896, + "grad_norm": 0.8190346317269288, + "learning_rate": 0.0001815150611950383, + "loss": 12.4488, + "step": 8108 + }, + { + "epoch": 0.441567458291642, + "grad_norm": 0.6529614749866838, + "learning_rate": 0.00018150995291030692, + "loss": 12.4613, + "step": 8109 + }, + { + "epoch": 0.441621912288225, + "grad_norm": 0.7258579649878754, + "learning_rate": 0.00018150484399174046, + "loss": 12.4522, + "step": 8110 + }, + { + "epoch": 0.441676366284808, + "grad_norm": 0.6654211401665494, + "learning_rate": 0.00018149973443937868, + "loss": 12.3424, + "step": 8111 + }, + { + "epoch": 0.44173082028139105, + "grad_norm": 0.6318212930391481, + "learning_rate": 0.00018149462425326126, + "loss": 12.4647, + "step": 8112 + }, + { + "epoch": 0.44178527427797404, + "grad_norm": 0.8207170885860678, + "learning_rate": 0.00018148951343342803, + "loss": 12.4289, + "step": 8113 + }, + { + "epoch": 0.44183972827455703, + "grad_norm": 0.6194663346605774, + "learning_rate": 0.00018148440197991863, + "loss": 12.3711, + "step": 8114 + }, + { + "epoch": 0.4418941822711401, + "grad_norm": 0.7014168629119157, + "learning_rate": 0.0001814792898927729, + "loss": 12.3242, + "step": 8115 + }, + { + "epoch": 0.44194863626772307, + "grad_norm": 0.656875765463606, + "learning_rate": 0.00018147417717203048, + "loss": 12.4609, + "step": 8116 + }, + { + "epoch": 0.44200309026430606, + "grad_norm": 0.6422545749068618, + "learning_rate": 0.00018146906381773126, + "loss": 12.4707, + "step": 8117 + }, + { + "epoch": 0.4420575442608891, + "grad_norm": 0.6240047670798762, + "learning_rate": 0.0001814639498299149, + "loss": 12.4928, + "step": 8118 + }, + { + "epoch": 0.4421119982574721, + "grad_norm": 0.6781025678231234, + "learning_rate": 0.00018145883520862121, + "loss": 12.6163, + "step": 8119 + }, + { + "epoch": 0.4421664522540551, + "grad_norm": 0.636615698424067, + "learning_rate": 0.00018145371995388997, + "loss": 12.4772, + "step": 8120 + }, + { + "epoch": 0.44222090625063815, + "grad_norm": 0.6454832736621415, + "learning_rate": 0.00018144860406576094, + "loss": 12.3847, + "step": 8121 + }, + { + "epoch": 0.44227536024722114, + "grad_norm": 0.7496269835050098, + "learning_rate": 0.0001814434875442739, + "loss": 12.2562, + "step": 8122 + }, + { + "epoch": 0.44232981424380413, + "grad_norm": 0.5902049909043856, + "learning_rate": 0.00018143837038946865, + "loss": 12.5374, + "step": 8123 + }, + { + "epoch": 0.4423842682403872, + "grad_norm": 0.6287680351391354, + "learning_rate": 0.00018143325260138498, + "loss": 12.4129, + "step": 8124 + }, + { + "epoch": 0.44243872223697017, + "grad_norm": 0.6395184786336089, + "learning_rate": 0.00018142813418006268, + "loss": 12.5485, + "step": 8125 + }, + { + "epoch": 0.44249317623355316, + "grad_norm": 0.583233484282126, + "learning_rate": 0.00018142301512554153, + "loss": 12.4745, + "step": 8126 + }, + { + "epoch": 0.4425476302301362, + "grad_norm": 0.7393265090602186, + "learning_rate": 0.00018141789543786135, + "loss": 12.5784, + "step": 8127 + }, + { + "epoch": 0.4426020842267192, + "grad_norm": 0.6099498576642015, + "learning_rate": 0.000181412775117062, + "loss": 12.3518, + "step": 8128 + }, + { + "epoch": 0.44265653822330225, + "grad_norm": 0.616859217531763, + "learning_rate": 0.00018140765416318325, + "loss": 12.3706, + "step": 8129 + }, + { + "epoch": 0.44271099221988525, + "grad_norm": 0.5738815684564759, + "learning_rate": 0.00018140253257626493, + "loss": 12.4688, + "step": 8130 + }, + { + "epoch": 0.44276544621646824, + "grad_norm": 0.6287187273103042, + "learning_rate": 0.00018139741035634686, + "loss": 12.4894, + "step": 8131 + }, + { + "epoch": 0.4428199002130513, + "grad_norm": 0.7150333260319802, + "learning_rate": 0.00018139228750346887, + "loss": 12.7204, + "step": 8132 + }, + { + "epoch": 0.4428743542096343, + "grad_norm": 0.6655484781426367, + "learning_rate": 0.0001813871640176708, + "loss": 12.4733, + "step": 8133 + }, + { + "epoch": 0.4429288082062173, + "grad_norm": 0.638583761056179, + "learning_rate": 0.0001813820398989925, + "loss": 12.4027, + "step": 8134 + }, + { + "epoch": 0.4429832622028003, + "grad_norm": 0.6106168427470005, + "learning_rate": 0.00018137691514747383, + "loss": 12.35, + "step": 8135 + }, + { + "epoch": 0.4430377161993833, + "grad_norm": 0.7678202216164216, + "learning_rate": 0.0001813717897631546, + "loss": 12.3463, + "step": 8136 + }, + { + "epoch": 0.4430921701959663, + "grad_norm": 0.6351437344563395, + "learning_rate": 0.0001813666637460747, + "loss": 12.4133, + "step": 8137 + }, + { + "epoch": 0.44314662419254935, + "grad_norm": 0.6594886319959798, + "learning_rate": 0.00018136153709627398, + "loss": 12.2562, + "step": 8138 + }, + { + "epoch": 0.44320107818913235, + "grad_norm": 0.6110581042660779, + "learning_rate": 0.00018135640981379228, + "loss": 12.4918, + "step": 8139 + }, + { + "epoch": 0.44325553218571534, + "grad_norm": 0.6397466953877733, + "learning_rate": 0.0001813512818986695, + "loss": 12.394, + "step": 8140 + }, + { + "epoch": 0.4433099861822984, + "grad_norm": 0.6550859383422465, + "learning_rate": 0.0001813461533509455, + "loss": 12.4382, + "step": 8141 + }, + { + "epoch": 0.4433644401788814, + "grad_norm": 0.6917336007640664, + "learning_rate": 0.00018134102417066022, + "loss": 12.4055, + "step": 8142 + }, + { + "epoch": 0.4434188941754644, + "grad_norm": 0.5850970346286554, + "learning_rate": 0.00018133589435785343, + "loss": 12.4241, + "step": 8143 + }, + { + "epoch": 0.4434733481720474, + "grad_norm": 0.5951134489105335, + "learning_rate": 0.00018133076391256512, + "loss": 12.3374, + "step": 8144 + }, + { + "epoch": 0.4435278021686304, + "grad_norm": 0.6551033967638262, + "learning_rate": 0.00018132563283483516, + "loss": 12.487, + "step": 8145 + }, + { + "epoch": 0.4435822561652134, + "grad_norm": 0.6538463123605098, + "learning_rate": 0.0001813205011247034, + "loss": 12.3122, + "step": 8146 + }, + { + "epoch": 0.44363671016179645, + "grad_norm": 0.5950772399842815, + "learning_rate": 0.00018131536878220983, + "loss": 12.4683, + "step": 8147 + }, + { + "epoch": 0.44369116415837945, + "grad_norm": 0.6694727358020924, + "learning_rate": 0.0001813102358073943, + "loss": 12.4369, + "step": 8148 + }, + { + "epoch": 0.44374561815496244, + "grad_norm": 0.5837086135181214, + "learning_rate": 0.00018130510220029674, + "loss": 12.5195, + "step": 8149 + }, + { + "epoch": 0.4438000721515455, + "grad_norm": 0.6162354476460444, + "learning_rate": 0.00018129996796095705, + "loss": 12.4831, + "step": 8150 + }, + { + "epoch": 0.4438545261481285, + "grad_norm": 0.6581910316864154, + "learning_rate": 0.0001812948330894152, + "loss": 12.4733, + "step": 8151 + }, + { + "epoch": 0.4439089801447115, + "grad_norm": 0.6213906461100707, + "learning_rate": 0.0001812896975857111, + "loss": 12.5239, + "step": 8152 + }, + { + "epoch": 0.4439634341412945, + "grad_norm": 0.606727686748743, + "learning_rate": 0.00018128456144988467, + "loss": 12.5373, + "step": 8153 + }, + { + "epoch": 0.4440178881378775, + "grad_norm": 0.735147788381272, + "learning_rate": 0.00018127942468197582, + "loss": 12.5172, + "step": 8154 + }, + { + "epoch": 0.4440723421344605, + "grad_norm": 0.5921186091395286, + "learning_rate": 0.0001812742872820246, + "loss": 12.3883, + "step": 8155 + }, + { + "epoch": 0.44412679613104356, + "grad_norm": 0.6844273116732905, + "learning_rate": 0.00018126914925007087, + "loss": 12.4229, + "step": 8156 + }, + { + "epoch": 0.44418125012762655, + "grad_norm": 0.6966763682784651, + "learning_rate": 0.00018126401058615458, + "loss": 12.454, + "step": 8157 + }, + { + "epoch": 0.44423570412420954, + "grad_norm": 0.7599245377713695, + "learning_rate": 0.00018125887129031573, + "loss": 12.5486, + "step": 8158 + }, + { + "epoch": 0.4442901581207926, + "grad_norm": 0.5801118497495215, + "learning_rate": 0.0001812537313625943, + "loss": 12.4271, + "step": 8159 + }, + { + "epoch": 0.4443446121173756, + "grad_norm": 0.679983449444894, + "learning_rate": 0.00018124859080303018, + "loss": 12.3809, + "step": 8160 + }, + { + "epoch": 0.4443990661139586, + "grad_norm": 0.567276752664479, + "learning_rate": 0.00018124344961166342, + "loss": 12.3648, + "step": 8161 + }, + { + "epoch": 0.4444535201105416, + "grad_norm": 0.6698341318989772, + "learning_rate": 0.00018123830778853396, + "loss": 12.4784, + "step": 8162 + }, + { + "epoch": 0.4445079741071246, + "grad_norm": 0.6111490002980611, + "learning_rate": 0.0001812331653336818, + "loss": 12.432, + "step": 8163 + }, + { + "epoch": 0.44456242810370766, + "grad_norm": 0.6109325508698443, + "learning_rate": 0.00018122802224714693, + "loss": 12.4912, + "step": 8164 + }, + { + "epoch": 0.44461688210029066, + "grad_norm": 0.5743030968799749, + "learning_rate": 0.00018122287852896933, + "loss": 12.4442, + "step": 8165 + }, + { + "epoch": 0.44467133609687365, + "grad_norm": 0.6070693152333769, + "learning_rate": 0.00018121773417918902, + "loss": 12.3768, + "step": 8166 + }, + { + "epoch": 0.4447257900934567, + "grad_norm": 0.6136466125925997, + "learning_rate": 0.00018121258919784596, + "loss": 12.3442, + "step": 8167 + }, + { + "epoch": 0.4447802440900397, + "grad_norm": 0.6061840102523574, + "learning_rate": 0.0001812074435849802, + "loss": 12.3715, + "step": 8168 + }, + { + "epoch": 0.4448346980866227, + "grad_norm": 0.6503836273900797, + "learning_rate": 0.00018120229734063176, + "loss": 12.6093, + "step": 8169 + }, + { + "epoch": 0.44488915208320573, + "grad_norm": 0.641563324903526, + "learning_rate": 0.0001811971504648406, + "loss": 12.4406, + "step": 8170 + }, + { + "epoch": 0.4449436060797887, + "grad_norm": 0.7577204807990595, + "learning_rate": 0.00018119200295764682, + "loss": 12.5485, + "step": 8171 + }, + { + "epoch": 0.4449980600763717, + "grad_norm": 0.6820962027227742, + "learning_rate": 0.0001811868548190904, + "loss": 12.5358, + "step": 8172 + }, + { + "epoch": 0.44505251407295476, + "grad_norm": 0.6359158735353171, + "learning_rate": 0.00018118170604921135, + "loss": 12.5205, + "step": 8173 + }, + { + "epoch": 0.44510696806953776, + "grad_norm": 0.6111147020344397, + "learning_rate": 0.00018117655664804977, + "loss": 12.4213, + "step": 8174 + }, + { + "epoch": 0.44516142206612075, + "grad_norm": 0.6246736894085984, + "learning_rate": 0.00018117140661564566, + "loss": 12.4356, + "step": 8175 + }, + { + "epoch": 0.4452158760627038, + "grad_norm": 0.5749923246542837, + "learning_rate": 0.0001811662559520391, + "loss": 12.3527, + "step": 8176 + }, + { + "epoch": 0.4452703300592868, + "grad_norm": 0.6530224543649432, + "learning_rate": 0.0001811611046572701, + "loss": 12.5725, + "step": 8177 + }, + { + "epoch": 0.4453247840558698, + "grad_norm": 0.5985662252307675, + "learning_rate": 0.00018115595273137874, + "loss": 12.4586, + "step": 8178 + }, + { + "epoch": 0.44537923805245283, + "grad_norm": 0.7193898797443692, + "learning_rate": 0.0001811508001744051, + "loss": 12.4578, + "step": 8179 + }, + { + "epoch": 0.4454336920490358, + "grad_norm": 0.6444061956976649, + "learning_rate": 0.0001811456469863892, + "loss": 12.5317, + "step": 8180 + }, + { + "epoch": 0.4454881460456188, + "grad_norm": 0.5826868228600273, + "learning_rate": 0.00018114049316737116, + "loss": 12.5559, + "step": 8181 + }, + { + "epoch": 0.44554260004220186, + "grad_norm": 0.6538082635561483, + "learning_rate": 0.00018113533871739104, + "loss": 12.5837, + "step": 8182 + }, + { + "epoch": 0.44559705403878486, + "grad_norm": 0.6200677214450276, + "learning_rate": 0.00018113018363648893, + "loss": 12.4668, + "step": 8183 + }, + { + "epoch": 0.44565150803536785, + "grad_norm": 0.6283652497127699, + "learning_rate": 0.0001811250279247049, + "loss": 12.4758, + "step": 8184 + }, + { + "epoch": 0.4457059620319509, + "grad_norm": 0.7374868480204493, + "learning_rate": 0.00018111987158207904, + "loss": 12.3293, + "step": 8185 + }, + { + "epoch": 0.4457604160285339, + "grad_norm": 0.6923032516638095, + "learning_rate": 0.00018111471460865146, + "loss": 12.2851, + "step": 8186 + }, + { + "epoch": 0.4458148700251169, + "grad_norm": 0.6292759013526202, + "learning_rate": 0.00018110955700446224, + "loss": 12.4064, + "step": 8187 + }, + { + "epoch": 0.44586932402169993, + "grad_norm": 0.7390498575776396, + "learning_rate": 0.00018110439876955152, + "loss": 12.4175, + "step": 8188 + }, + { + "epoch": 0.4459237780182829, + "grad_norm": 0.6180650489407907, + "learning_rate": 0.00018109923990395938, + "loss": 12.3875, + "step": 8189 + }, + { + "epoch": 0.4459782320148659, + "grad_norm": 0.6974498935982044, + "learning_rate": 0.00018109408040772594, + "loss": 12.6687, + "step": 8190 + }, + { + "epoch": 0.44603268601144896, + "grad_norm": 0.7591191693971592, + "learning_rate": 0.00018108892028089138, + "loss": 12.4527, + "step": 8191 + }, + { + "epoch": 0.44608714000803196, + "grad_norm": 0.6208047689810073, + "learning_rate": 0.00018108375952349573, + "loss": 12.4469, + "step": 8192 + }, + { + "epoch": 0.44614159400461495, + "grad_norm": 0.781635480903214, + "learning_rate": 0.0001810785981355792, + "loss": 12.2055, + "step": 8193 + }, + { + "epoch": 0.446196048001198, + "grad_norm": 0.7186489072431105, + "learning_rate": 0.00018107343611718188, + "loss": 12.6631, + "step": 8194 + }, + { + "epoch": 0.446250501997781, + "grad_norm": 0.758734781288091, + "learning_rate": 0.0001810682734683439, + "loss": 12.2953, + "step": 8195 + }, + { + "epoch": 0.44630495599436404, + "grad_norm": 0.6159585950601735, + "learning_rate": 0.00018106311018910548, + "loss": 12.4242, + "step": 8196 + }, + { + "epoch": 0.44635940999094703, + "grad_norm": 0.7073770563576457, + "learning_rate": 0.0001810579462795067, + "loss": 12.5467, + "step": 8197 + }, + { + "epoch": 0.44641386398753, + "grad_norm": 0.6096509478965852, + "learning_rate": 0.00018105278173958774, + "loss": 12.2076, + "step": 8198 + }, + { + "epoch": 0.44646831798411307, + "grad_norm": 0.6392576052815786, + "learning_rate": 0.00018104761656938871, + "loss": 12.4672, + "step": 8199 + }, + { + "epoch": 0.44652277198069606, + "grad_norm": 0.6049885538084858, + "learning_rate": 0.00018104245076894988, + "loss": 12.4106, + "step": 8200 + }, + { + "epoch": 0.44657722597727906, + "grad_norm": 0.6642922311561321, + "learning_rate": 0.00018103728433831133, + "loss": 12.4219, + "step": 8201 + }, + { + "epoch": 0.4466316799738621, + "grad_norm": 0.7173739433890999, + "learning_rate": 0.00018103211727751327, + "loss": 12.36, + "step": 8202 + }, + { + "epoch": 0.4466861339704451, + "grad_norm": 0.6155354362488491, + "learning_rate": 0.0001810269495865959, + "loss": 12.4365, + "step": 8203 + }, + { + "epoch": 0.4467405879670281, + "grad_norm": 0.5944106403284649, + "learning_rate": 0.00018102178126559937, + "loss": 12.4833, + "step": 8204 + }, + { + "epoch": 0.44679504196361114, + "grad_norm": 0.6074043433035483, + "learning_rate": 0.00018101661231456387, + "loss": 12.4723, + "step": 8205 + }, + { + "epoch": 0.44684949596019413, + "grad_norm": 0.6043312912035862, + "learning_rate": 0.00018101144273352963, + "loss": 12.3855, + "step": 8206 + }, + { + "epoch": 0.4469039499567771, + "grad_norm": 0.6270599323257495, + "learning_rate": 0.0001810062725225368, + "loss": 12.4546, + "step": 8207 + }, + { + "epoch": 0.4469584039533602, + "grad_norm": 0.6661394924846142, + "learning_rate": 0.00018100110168162563, + "loss": 12.3606, + "step": 8208 + }, + { + "epoch": 0.44701285794994317, + "grad_norm": 0.7137124054573656, + "learning_rate": 0.0001809959302108363, + "loss": 12.589, + "step": 8209 + }, + { + "epoch": 0.44706731194652616, + "grad_norm": 0.6665658000172924, + "learning_rate": 0.00018099075811020904, + "loss": 12.3141, + "step": 8210 + }, + { + "epoch": 0.4471217659431092, + "grad_norm": 0.7127001415690272, + "learning_rate": 0.00018098558537978405, + "loss": 12.5324, + "step": 8211 + }, + { + "epoch": 0.4471762199396922, + "grad_norm": 0.6345169385181538, + "learning_rate": 0.00018098041201960156, + "loss": 12.44, + "step": 8212 + }, + { + "epoch": 0.4472306739362752, + "grad_norm": 0.6563976622677672, + "learning_rate": 0.00018097523802970185, + "loss": 12.3428, + "step": 8213 + }, + { + "epoch": 0.44728512793285824, + "grad_norm": 0.6687606642611973, + "learning_rate": 0.00018097006341012505, + "loss": 12.4602, + "step": 8214 + }, + { + "epoch": 0.44733958192944123, + "grad_norm": 0.7164323797540626, + "learning_rate": 0.0001809648881609115, + "loss": 12.5104, + "step": 8215 + }, + { + "epoch": 0.4473940359260242, + "grad_norm": 0.6878977953960316, + "learning_rate": 0.0001809597122821014, + "loss": 12.4367, + "step": 8216 + }, + { + "epoch": 0.4474484899226073, + "grad_norm": 0.5843306191896047, + "learning_rate": 0.000180954535773735, + "loss": 12.3788, + "step": 8217 + }, + { + "epoch": 0.44750294391919027, + "grad_norm": 0.627590152184721, + "learning_rate": 0.0001809493586358525, + "loss": 12.335, + "step": 8218 + }, + { + "epoch": 0.44755739791577326, + "grad_norm": 0.6730383060730796, + "learning_rate": 0.00018094418086849428, + "loss": 12.4502, + "step": 8219 + }, + { + "epoch": 0.4476118519123563, + "grad_norm": 0.6622308298602019, + "learning_rate": 0.0001809390024717005, + "loss": 12.442, + "step": 8220 + }, + { + "epoch": 0.4476663059089393, + "grad_norm": 0.5680820510071307, + "learning_rate": 0.0001809338234455115, + "loss": 12.5298, + "step": 8221 + }, + { + "epoch": 0.4477207599055223, + "grad_norm": 0.6048364187339758, + "learning_rate": 0.00018092864378996748, + "loss": 12.4445, + "step": 8222 + }, + { + "epoch": 0.44777521390210534, + "grad_norm": 0.7737050573569966, + "learning_rate": 0.00018092346350510877, + "loss": 12.5471, + "step": 8223 + }, + { + "epoch": 0.44782966789868833, + "grad_norm": 0.6120992948950894, + "learning_rate": 0.0001809182825909756, + "loss": 12.4766, + "step": 8224 + }, + { + "epoch": 0.4478841218952713, + "grad_norm": 0.7214950968754054, + "learning_rate": 0.00018091310104760834, + "loss": 12.4849, + "step": 8225 + }, + { + "epoch": 0.4479385758918544, + "grad_norm": 0.6105295559177767, + "learning_rate": 0.00018090791887504724, + "loss": 12.4912, + "step": 8226 + }, + { + "epoch": 0.44799302988843737, + "grad_norm": 0.5889511018642188, + "learning_rate": 0.00018090273607333256, + "loss": 12.5213, + "step": 8227 + }, + { + "epoch": 0.44804748388502036, + "grad_norm": 0.6737149319485795, + "learning_rate": 0.00018089755264250466, + "loss": 12.3463, + "step": 8228 + }, + { + "epoch": 0.4481019378816034, + "grad_norm": 0.6485216645092555, + "learning_rate": 0.00018089236858260383, + "loss": 12.4522, + "step": 8229 + }, + { + "epoch": 0.4481563918781864, + "grad_norm": 0.6290966852441273, + "learning_rate": 0.00018088718389367036, + "loss": 12.479, + "step": 8230 + }, + { + "epoch": 0.44821084587476945, + "grad_norm": 0.6435504811467402, + "learning_rate": 0.00018088199857574456, + "loss": 12.5209, + "step": 8231 + }, + { + "epoch": 0.44826529987135244, + "grad_norm": 0.6477490585103859, + "learning_rate": 0.00018087681262886682, + "loss": 12.4887, + "step": 8232 + }, + { + "epoch": 0.44831975386793543, + "grad_norm": 0.7152524441450054, + "learning_rate": 0.0001808716260530774, + "loss": 12.5928, + "step": 8233 + }, + { + "epoch": 0.4483742078645185, + "grad_norm": 0.6169314782015952, + "learning_rate": 0.00018086643884841666, + "loss": 12.4819, + "step": 8234 + }, + { + "epoch": 0.4484286618611015, + "grad_norm": 0.61995875690478, + "learning_rate": 0.00018086125101492493, + "loss": 12.5438, + "step": 8235 + }, + { + "epoch": 0.44848311585768447, + "grad_norm": 0.6410424565512555, + "learning_rate": 0.00018085606255264255, + "loss": 12.3693, + "step": 8236 + }, + { + "epoch": 0.4485375698542675, + "grad_norm": 0.675278433061094, + "learning_rate": 0.00018085087346160988, + "loss": 12.4955, + "step": 8237 + }, + { + "epoch": 0.4485920238508505, + "grad_norm": 0.6860164489550601, + "learning_rate": 0.00018084568374186722, + "loss": 12.3339, + "step": 8238 + }, + { + "epoch": 0.4486464778474335, + "grad_norm": 0.70350072710245, + "learning_rate": 0.000180840493393455, + "loss": 12.424, + "step": 8239 + }, + { + "epoch": 0.44870093184401655, + "grad_norm": 0.6038702419154595, + "learning_rate": 0.00018083530241641357, + "loss": 12.5123, + "step": 8240 + }, + { + "epoch": 0.44875538584059954, + "grad_norm": 0.6831009948912166, + "learning_rate": 0.00018083011081078323, + "loss": 12.4619, + "step": 8241 + }, + { + "epoch": 0.44880983983718253, + "grad_norm": 0.6303418377962721, + "learning_rate": 0.0001808249185766044, + "loss": 12.4241, + "step": 8242 + }, + { + "epoch": 0.4488642938337656, + "grad_norm": 0.6777704932279972, + "learning_rate": 0.00018081972571391749, + "loss": 12.538, + "step": 8243 + }, + { + "epoch": 0.4489187478303486, + "grad_norm": 0.6216889573416485, + "learning_rate": 0.00018081453222276278, + "loss": 12.5019, + "step": 8244 + }, + { + "epoch": 0.44897320182693157, + "grad_norm": 0.827652560719684, + "learning_rate": 0.00018080933810318072, + "loss": 12.473, + "step": 8245 + }, + { + "epoch": 0.4490276558235146, + "grad_norm": 0.6836190786922434, + "learning_rate": 0.00018080414335521172, + "loss": 12.4538, + "step": 8246 + }, + { + "epoch": 0.4490821098200976, + "grad_norm": 0.7056054941784216, + "learning_rate": 0.00018079894797889615, + "loss": 12.5438, + "step": 8247 + }, + { + "epoch": 0.4491365638166806, + "grad_norm": 0.7109116869401612, + "learning_rate": 0.00018079375197427438, + "loss": 12.5076, + "step": 8248 + }, + { + "epoch": 0.44919101781326365, + "grad_norm": 0.6384313871607461, + "learning_rate": 0.00018078855534138687, + "loss": 12.3335, + "step": 8249 + }, + { + "epoch": 0.44924547180984664, + "grad_norm": 0.6014901136296119, + "learning_rate": 0.00018078335808027402, + "loss": 12.4224, + "step": 8250 + }, + { + "epoch": 0.44929992580642963, + "grad_norm": 0.6995417512969619, + "learning_rate": 0.00018077816019097622, + "loss": 12.4247, + "step": 8251 + }, + { + "epoch": 0.4493543798030127, + "grad_norm": 0.646866040388919, + "learning_rate": 0.0001807729616735339, + "loss": 12.3516, + "step": 8252 + }, + { + "epoch": 0.4494088337995957, + "grad_norm": 0.6323119202761118, + "learning_rate": 0.00018076776252798747, + "loss": 12.466, + "step": 8253 + }, + { + "epoch": 0.44946328779617867, + "grad_norm": 0.6573015764103591, + "learning_rate": 0.0001807625627543774, + "loss": 12.3854, + "step": 8254 + }, + { + "epoch": 0.4495177417927617, + "grad_norm": 0.7660222464548213, + "learning_rate": 0.0001807573623527441, + "loss": 12.6872, + "step": 8255 + }, + { + "epoch": 0.4495721957893447, + "grad_norm": 0.6917988160741411, + "learning_rate": 0.00018075216132312796, + "loss": 12.4, + "step": 8256 + }, + { + "epoch": 0.4496266497859277, + "grad_norm": 0.6528921899765494, + "learning_rate": 0.0001807469596655695, + "loss": 12.4176, + "step": 8257 + }, + { + "epoch": 0.44968110378251075, + "grad_norm": 0.6387928727990673, + "learning_rate": 0.00018074175738010915, + "loss": 12.553, + "step": 8258 + }, + { + "epoch": 0.44973555777909374, + "grad_norm": 0.7032323497462504, + "learning_rate": 0.00018073655446678734, + "loss": 12.3964, + "step": 8259 + }, + { + "epoch": 0.44979001177567673, + "grad_norm": 0.6201282183769875, + "learning_rate": 0.00018073135092564456, + "loss": 12.4337, + "step": 8260 + }, + { + "epoch": 0.4498444657722598, + "grad_norm": 0.6546034000172873, + "learning_rate": 0.00018072614675672123, + "loss": 12.5565, + "step": 8261 + }, + { + "epoch": 0.4498989197688428, + "grad_norm": 0.6167943409390654, + "learning_rate": 0.00018072094196005788, + "loss": 12.3856, + "step": 8262 + }, + { + "epoch": 0.4499533737654258, + "grad_norm": 0.6411173044207982, + "learning_rate": 0.0001807157365356949, + "loss": 12.4758, + "step": 8263 + }, + { + "epoch": 0.4500078277620088, + "grad_norm": 0.6660852274828428, + "learning_rate": 0.00018071053048367283, + "loss": 12.3944, + "step": 8264 + }, + { + "epoch": 0.4500622817585918, + "grad_norm": 0.7313184201567402, + "learning_rate": 0.00018070532380403217, + "loss": 12.5177, + "step": 8265 + }, + { + "epoch": 0.45011673575517486, + "grad_norm": 0.5888785217430297, + "learning_rate": 0.00018070011649681336, + "loss": 12.4226, + "step": 8266 + }, + { + "epoch": 0.45017118975175785, + "grad_norm": 0.6271369849906314, + "learning_rate": 0.00018069490856205693, + "loss": 12.4255, + "step": 8267 + }, + { + "epoch": 0.45022564374834084, + "grad_norm": 0.6877928172862263, + "learning_rate": 0.00018068969999980334, + "loss": 12.4457, + "step": 8268 + }, + { + "epoch": 0.4502800977449239, + "grad_norm": 0.7126841747836358, + "learning_rate": 0.00018068449081009308, + "loss": 12.4507, + "step": 8269 + }, + { + "epoch": 0.4503345517415069, + "grad_norm": 0.5935590021761236, + "learning_rate": 0.00018067928099296676, + "loss": 12.4129, + "step": 8270 + }, + { + "epoch": 0.4503890057380899, + "grad_norm": 0.7332547699397844, + "learning_rate": 0.00018067407054846477, + "loss": 12.4179, + "step": 8271 + }, + { + "epoch": 0.4504434597346729, + "grad_norm": 0.6439231739492874, + "learning_rate": 0.00018066885947662768, + "loss": 12.4201, + "step": 8272 + }, + { + "epoch": 0.4504979137312559, + "grad_norm": 0.6404282685943609, + "learning_rate": 0.00018066364777749602, + "loss": 12.4724, + "step": 8273 + }, + { + "epoch": 0.4505523677278389, + "grad_norm": 0.6757165721643505, + "learning_rate": 0.00018065843545111033, + "loss": 12.3736, + "step": 8274 + }, + { + "epoch": 0.45060682172442196, + "grad_norm": 0.6436434529216677, + "learning_rate": 0.00018065322249751108, + "loss": 12.5005, + "step": 8275 + }, + { + "epoch": 0.45066127572100495, + "grad_norm": 0.7705414807500481, + "learning_rate": 0.00018064800891673887, + "loss": 12.5556, + "step": 8276 + }, + { + "epoch": 0.45071572971758794, + "grad_norm": 0.6253013831319388, + "learning_rate": 0.0001806427947088342, + "loss": 12.3188, + "step": 8277 + }, + { + "epoch": 0.450770183714171, + "grad_norm": 0.6693402182742427, + "learning_rate": 0.00018063757987383765, + "loss": 12.4618, + "step": 8278 + }, + { + "epoch": 0.450824637710754, + "grad_norm": 0.809093178489467, + "learning_rate": 0.00018063236441178977, + "loss": 12.3269, + "step": 8279 + }, + { + "epoch": 0.450879091707337, + "grad_norm": 0.6960771578027255, + "learning_rate": 0.0001806271483227311, + "loss": 12.446, + "step": 8280 + }, + { + "epoch": 0.45093354570392, + "grad_norm": 0.6836104620072424, + "learning_rate": 0.00018062193160670216, + "loss": 12.4956, + "step": 8281 + }, + { + "epoch": 0.450987999700503, + "grad_norm": 0.6288308608485124, + "learning_rate": 0.0001806167142637436, + "loss": 12.4298, + "step": 8282 + }, + { + "epoch": 0.451042453697086, + "grad_norm": 0.6590638845740401, + "learning_rate": 0.00018061149629389594, + "loss": 12.6313, + "step": 8283 + }, + { + "epoch": 0.45109690769366906, + "grad_norm": 0.713252599730932, + "learning_rate": 0.00018060627769719977, + "loss": 12.5886, + "step": 8284 + }, + { + "epoch": 0.45115136169025205, + "grad_norm": 0.6662160820965829, + "learning_rate": 0.00018060105847369563, + "loss": 12.4701, + "step": 8285 + }, + { + "epoch": 0.45120581568683504, + "grad_norm": 0.8098651522302789, + "learning_rate": 0.0001805958386234242, + "loss": 12.5582, + "step": 8286 + }, + { + "epoch": 0.4512602696834181, + "grad_norm": 0.5886671797308553, + "learning_rate": 0.000180590618146426, + "loss": 12.3877, + "step": 8287 + }, + { + "epoch": 0.4513147236800011, + "grad_norm": 0.6010781282278521, + "learning_rate": 0.00018058539704274163, + "loss": 12.4755, + "step": 8288 + }, + { + "epoch": 0.4513691776765841, + "grad_norm": 0.622368369814635, + "learning_rate": 0.00018058017531241168, + "loss": 12.4668, + "step": 8289 + }, + { + "epoch": 0.4514236316731671, + "grad_norm": 0.5814870801804356, + "learning_rate": 0.00018057495295547676, + "loss": 12.3564, + "step": 8290 + }, + { + "epoch": 0.4514780856697501, + "grad_norm": 0.6701604109652208, + "learning_rate": 0.00018056972997197753, + "loss": 12.5295, + "step": 8291 + }, + { + "epoch": 0.4515325396663331, + "grad_norm": 0.6107798920489389, + "learning_rate": 0.0001805645063619546, + "loss": 12.4094, + "step": 8292 + }, + { + "epoch": 0.45158699366291616, + "grad_norm": 0.6121065620221543, + "learning_rate": 0.00018055928212544847, + "loss": 12.5846, + "step": 8293 + }, + { + "epoch": 0.45164144765949915, + "grad_norm": 0.6625822088074984, + "learning_rate": 0.00018055405726249992, + "loss": 12.3911, + "step": 8294 + }, + { + "epoch": 0.45169590165608214, + "grad_norm": 0.6761019614906887, + "learning_rate": 0.0001805488317731495, + "loss": 12.5609, + "step": 8295 + }, + { + "epoch": 0.4517503556526652, + "grad_norm": 0.6057534358071944, + "learning_rate": 0.00018054360565743785, + "loss": 12.5217, + "step": 8296 + }, + { + "epoch": 0.4518048096492482, + "grad_norm": 0.5904779845769116, + "learning_rate": 0.0001805383789154056, + "loss": 12.506, + "step": 8297 + }, + { + "epoch": 0.45185926364583123, + "grad_norm": 0.7203160399786142, + "learning_rate": 0.00018053315154709342, + "loss": 12.6001, + "step": 8298 + }, + { + "epoch": 0.4519137176424142, + "grad_norm": 0.7114625436615308, + "learning_rate": 0.00018052792355254197, + "loss": 12.4177, + "step": 8299 + }, + { + "epoch": 0.4519681716389972, + "grad_norm": 0.6171990688626064, + "learning_rate": 0.00018052269493179184, + "loss": 12.3536, + "step": 8300 + }, + { + "epoch": 0.45202262563558027, + "grad_norm": 0.6358158470366418, + "learning_rate": 0.00018051746568488377, + "loss": 12.5709, + "step": 8301 + }, + { + "epoch": 0.45207707963216326, + "grad_norm": 0.6387813901796725, + "learning_rate": 0.00018051223581185837, + "loss": 12.3938, + "step": 8302 + }, + { + "epoch": 0.45213153362874625, + "grad_norm": 0.571110397702212, + "learning_rate": 0.0001805070053127563, + "loss": 12.4995, + "step": 8303 + }, + { + "epoch": 0.4521859876253293, + "grad_norm": 0.6288736694560902, + "learning_rate": 0.00018050177418761827, + "loss": 12.2448, + "step": 8304 + }, + { + "epoch": 0.4522404416219123, + "grad_norm": 0.6325902111994797, + "learning_rate": 0.00018049654243648496, + "loss": 12.3974, + "step": 8305 + }, + { + "epoch": 0.4522948956184953, + "grad_norm": 0.6493021827097402, + "learning_rate": 0.000180491310059397, + "loss": 12.4227, + "step": 8306 + }, + { + "epoch": 0.45234934961507833, + "grad_norm": 0.5846396599292224, + "learning_rate": 0.00018048607705639517, + "loss": 12.3472, + "step": 8307 + }, + { + "epoch": 0.4524038036116613, + "grad_norm": 0.649321265011039, + "learning_rate": 0.00018048084342752008, + "loss": 12.5292, + "step": 8308 + }, + { + "epoch": 0.4524582576082443, + "grad_norm": 0.6319409826716816, + "learning_rate": 0.00018047560917281246, + "loss": 12.5412, + "step": 8309 + }, + { + "epoch": 0.45251271160482737, + "grad_norm": 0.7202103340396316, + "learning_rate": 0.00018047037429231298, + "loss": 12.538, + "step": 8310 + }, + { + "epoch": 0.45256716560141036, + "grad_norm": 0.708175500993768, + "learning_rate": 0.00018046513878606238, + "loss": 12.6258, + "step": 8311 + }, + { + "epoch": 0.45262161959799335, + "grad_norm": 0.5807181380628393, + "learning_rate": 0.0001804599026541014, + "loss": 12.3849, + "step": 8312 + }, + { + "epoch": 0.4526760735945764, + "grad_norm": 0.6428716877562957, + "learning_rate": 0.0001804546658964707, + "loss": 12.4105, + "step": 8313 + }, + { + "epoch": 0.4527305275911594, + "grad_norm": 0.6653068583401828, + "learning_rate": 0.00018044942851321103, + "loss": 12.3114, + "step": 8314 + }, + { + "epoch": 0.4527849815877424, + "grad_norm": 0.6454680182879358, + "learning_rate": 0.0001804441905043631, + "loss": 12.532, + "step": 8315 + }, + { + "epoch": 0.45283943558432543, + "grad_norm": 0.5780892948692522, + "learning_rate": 0.00018043895186996766, + "loss": 12.3504, + "step": 8316 + }, + { + "epoch": 0.4528938895809084, + "grad_norm": 0.69638392730691, + "learning_rate": 0.00018043371261006546, + "loss": 12.4106, + "step": 8317 + }, + { + "epoch": 0.4529483435774914, + "grad_norm": 0.635956986948982, + "learning_rate": 0.0001804284727246972, + "loss": 12.5283, + "step": 8318 + }, + { + "epoch": 0.45300279757407447, + "grad_norm": 0.6313431542322339, + "learning_rate": 0.00018042323221390368, + "loss": 12.5512, + "step": 8319 + }, + { + "epoch": 0.45305725157065746, + "grad_norm": 0.6489741452015877, + "learning_rate": 0.0001804179910777256, + "loss": 12.4485, + "step": 8320 + }, + { + "epoch": 0.45311170556724045, + "grad_norm": 0.6315006712394421, + "learning_rate": 0.00018041274931620372, + "loss": 12.5214, + "step": 8321 + }, + { + "epoch": 0.4531661595638235, + "grad_norm": 0.5990546472160445, + "learning_rate": 0.00018040750692937881, + "loss": 12.525, + "step": 8322 + }, + { + "epoch": 0.4532206135604065, + "grad_norm": 0.5871850278370101, + "learning_rate": 0.00018040226391729166, + "loss": 12.3591, + "step": 8323 + }, + { + "epoch": 0.4532750675569895, + "grad_norm": 0.658322856276941, + "learning_rate": 0.00018039702027998304, + "loss": 12.5054, + "step": 8324 + }, + { + "epoch": 0.45332952155357253, + "grad_norm": 0.639803097756465, + "learning_rate": 0.0001803917760174937, + "loss": 12.5238, + "step": 8325 + }, + { + "epoch": 0.4533839755501555, + "grad_norm": 0.8436479688153324, + "learning_rate": 0.00018038653112986442, + "loss": 12.4266, + "step": 8326 + }, + { + "epoch": 0.4534384295467385, + "grad_norm": 0.662813388242909, + "learning_rate": 0.00018038128561713597, + "loss": 12.3991, + "step": 8327 + }, + { + "epoch": 0.45349288354332157, + "grad_norm": 0.6802356628734042, + "learning_rate": 0.00018037603947934917, + "loss": 12.3814, + "step": 8328 + }, + { + "epoch": 0.45354733753990456, + "grad_norm": 0.7191399813569995, + "learning_rate": 0.00018037079271654484, + "loss": 12.5178, + "step": 8329 + }, + { + "epoch": 0.4536017915364876, + "grad_norm": 0.7728736862833322, + "learning_rate": 0.00018036554532876374, + "loss": 12.58, + "step": 8330 + }, + { + "epoch": 0.4536562455330706, + "grad_norm": 0.7082727583696239, + "learning_rate": 0.00018036029731604666, + "loss": 12.3314, + "step": 8331 + }, + { + "epoch": 0.4537106995296536, + "grad_norm": 0.63936538910493, + "learning_rate": 0.00018035504867843446, + "loss": 12.5019, + "step": 8332 + }, + { + "epoch": 0.45376515352623664, + "grad_norm": 0.7309572506200637, + "learning_rate": 0.0001803497994159679, + "loss": 12.3034, + "step": 8333 + }, + { + "epoch": 0.45381960752281963, + "grad_norm": 0.7346023329112903, + "learning_rate": 0.00018034454952868783, + "loss": 12.493, + "step": 8334 + }, + { + "epoch": 0.4538740615194026, + "grad_norm": 0.6438331942903343, + "learning_rate": 0.00018033929901663508, + "loss": 12.4777, + "step": 8335 + }, + { + "epoch": 0.4539285155159857, + "grad_norm": 0.6429282768622242, + "learning_rate": 0.00018033404787985046, + "loss": 12.5446, + "step": 8336 + }, + { + "epoch": 0.45398296951256867, + "grad_norm": 0.6470283192872487, + "learning_rate": 0.00018032879611837484, + "loss": 12.4359, + "step": 8337 + }, + { + "epoch": 0.45403742350915166, + "grad_norm": 0.6652689528162133, + "learning_rate": 0.00018032354373224898, + "loss": 12.4367, + "step": 8338 + }, + { + "epoch": 0.4540918775057347, + "grad_norm": 0.7137004022132828, + "learning_rate": 0.00018031829072151382, + "loss": 12.4301, + "step": 8339 + }, + { + "epoch": 0.4541463315023177, + "grad_norm": 0.6441967324262494, + "learning_rate": 0.0001803130370862101, + "loss": 12.4321, + "step": 8340 + }, + { + "epoch": 0.4542007854989007, + "grad_norm": 0.7165948198137981, + "learning_rate": 0.0001803077828263788, + "loss": 12.3381, + "step": 8341 + }, + { + "epoch": 0.45425523949548374, + "grad_norm": 0.6936232718909088, + "learning_rate": 0.0001803025279420607, + "loss": 12.4531, + "step": 8342 + }, + { + "epoch": 0.45430969349206674, + "grad_norm": 0.6793974980558357, + "learning_rate": 0.00018029727243329665, + "loss": 12.4894, + "step": 8343 + }, + { + "epoch": 0.45436414748864973, + "grad_norm": 0.7578080572478764, + "learning_rate": 0.00018029201630012756, + "loss": 12.4671, + "step": 8344 + }, + { + "epoch": 0.4544186014852328, + "grad_norm": 0.6375758735624671, + "learning_rate": 0.00018028675954259428, + "loss": 12.4301, + "step": 8345 + }, + { + "epoch": 0.45447305548181577, + "grad_norm": 0.6403854044238918, + "learning_rate": 0.00018028150216073768, + "loss": 12.5084, + "step": 8346 + }, + { + "epoch": 0.45452750947839876, + "grad_norm": 0.744259474887641, + "learning_rate": 0.00018027624415459869, + "loss": 12.4074, + "step": 8347 + }, + { + "epoch": 0.4545819634749818, + "grad_norm": 0.6532864185316472, + "learning_rate": 0.00018027098552421812, + "loss": 12.1952, + "step": 8348 + }, + { + "epoch": 0.4546364174715648, + "grad_norm": 0.6709441299649114, + "learning_rate": 0.00018026572626963697, + "loss": 12.4102, + "step": 8349 + }, + { + "epoch": 0.4546908714681478, + "grad_norm": 0.6527345750406884, + "learning_rate": 0.000180260466390896, + "loss": 12.5381, + "step": 8350 + }, + { + "epoch": 0.45474532546473084, + "grad_norm": 0.5993932846023744, + "learning_rate": 0.00018025520588803624, + "loss": 12.3806, + "step": 8351 + }, + { + "epoch": 0.45479977946131384, + "grad_norm": 0.6560079767341911, + "learning_rate": 0.0001802499447610985, + "loss": 12.426, + "step": 8352 + }, + { + "epoch": 0.45485423345789683, + "grad_norm": 0.6208449905155884, + "learning_rate": 0.00018024468301012375, + "loss": 12.4476, + "step": 8353 + }, + { + "epoch": 0.4549086874544799, + "grad_norm": 0.5938706449016852, + "learning_rate": 0.00018023942063515288, + "loss": 12.3896, + "step": 8354 + }, + { + "epoch": 0.45496314145106287, + "grad_norm": 0.6840502037543044, + "learning_rate": 0.00018023415763622684, + "loss": 12.5287, + "step": 8355 + }, + { + "epoch": 0.45501759544764586, + "grad_norm": 0.6484201237103069, + "learning_rate": 0.0001802288940133865, + "loss": 12.5393, + "step": 8356 + }, + { + "epoch": 0.4550720494442289, + "grad_norm": 0.6536567142719277, + "learning_rate": 0.00018022362976667286, + "loss": 12.5158, + "step": 8357 + }, + { + "epoch": 0.4551265034408119, + "grad_norm": 0.6495453175136698, + "learning_rate": 0.00018021836489612682, + "loss": 12.402, + "step": 8358 + }, + { + "epoch": 0.4551809574373949, + "grad_norm": 0.6546075029096625, + "learning_rate": 0.0001802130994017893, + "loss": 12.4056, + "step": 8359 + }, + { + "epoch": 0.45523541143397794, + "grad_norm": 0.6169108403795847, + "learning_rate": 0.00018020783328370128, + "loss": 12.3984, + "step": 8360 + }, + { + "epoch": 0.45528986543056094, + "grad_norm": 0.7334837642317463, + "learning_rate": 0.00018020256654190372, + "loss": 12.5352, + "step": 8361 + }, + { + "epoch": 0.45534431942714393, + "grad_norm": 0.6355316885852189, + "learning_rate": 0.0001801972991764375, + "loss": 12.4515, + "step": 8362 + }, + { + "epoch": 0.455398773423727, + "grad_norm": 0.6762017192687889, + "learning_rate": 0.00018019203118734364, + "loss": 12.5213, + "step": 8363 + }, + { + "epoch": 0.45545322742030997, + "grad_norm": 0.652040432310015, + "learning_rate": 0.00018018676257466315, + "loss": 12.3384, + "step": 8364 + }, + { + "epoch": 0.455507681416893, + "grad_norm": 0.6052011344639239, + "learning_rate": 0.0001801814933384369, + "loss": 12.5054, + "step": 8365 + }, + { + "epoch": 0.455562135413476, + "grad_norm": 0.7345528713970678, + "learning_rate": 0.00018017622347870594, + "loss": 12.2335, + "step": 8366 + }, + { + "epoch": 0.455616589410059, + "grad_norm": 0.6469247135524236, + "learning_rate": 0.0001801709529955112, + "loss": 12.4192, + "step": 8367 + }, + { + "epoch": 0.45567104340664205, + "grad_norm": 0.562462112853363, + "learning_rate": 0.00018016568188889366, + "loss": 12.3651, + "step": 8368 + }, + { + "epoch": 0.45572549740322504, + "grad_norm": 0.7793939972367465, + "learning_rate": 0.0001801604101588944, + "loss": 12.5982, + "step": 8369 + }, + { + "epoch": 0.45577995139980804, + "grad_norm": 0.7634582703853848, + "learning_rate": 0.00018015513780555428, + "loss": 12.4733, + "step": 8370 + }, + { + "epoch": 0.4558344053963911, + "grad_norm": 0.7433426272281132, + "learning_rate": 0.00018014986482891443, + "loss": 12.3873, + "step": 8371 + }, + { + "epoch": 0.4558888593929741, + "grad_norm": 0.7151847951356256, + "learning_rate": 0.00018014459122901575, + "loss": 12.4266, + "step": 8372 + }, + { + "epoch": 0.45594331338955707, + "grad_norm": 0.6536945132481292, + "learning_rate": 0.00018013931700589926, + "loss": 12.4628, + "step": 8373 + }, + { + "epoch": 0.4559977673861401, + "grad_norm": 0.6432829911573463, + "learning_rate": 0.00018013404215960608, + "loss": 12.5578, + "step": 8374 + }, + { + "epoch": 0.4560522213827231, + "grad_norm": 0.7332230865995127, + "learning_rate": 0.0001801287666901771, + "loss": 12.4341, + "step": 8375 + }, + { + "epoch": 0.4561066753793061, + "grad_norm": 0.6581836725868025, + "learning_rate": 0.0001801234905976534, + "loss": 12.4149, + "step": 8376 + }, + { + "epoch": 0.45616112937588915, + "grad_norm": 0.7295789375243006, + "learning_rate": 0.000180118213882076, + "loss": 12.4868, + "step": 8377 + }, + { + "epoch": 0.45621558337247214, + "grad_norm": 0.6554620231047964, + "learning_rate": 0.00018011293654348592, + "loss": 12.4085, + "step": 8378 + }, + { + "epoch": 0.45627003736905514, + "grad_norm": 0.6970016512205633, + "learning_rate": 0.00018010765858192425, + "loss": 12.5611, + "step": 8379 + }, + { + "epoch": 0.4563244913656382, + "grad_norm": 0.7069624152061406, + "learning_rate": 0.00018010237999743195, + "loss": 12.4065, + "step": 8380 + }, + { + "epoch": 0.4563789453622212, + "grad_norm": 0.6534734284592699, + "learning_rate": 0.00018009710079005013, + "loss": 12.4033, + "step": 8381 + }, + { + "epoch": 0.45643339935880417, + "grad_norm": 0.6733227018555901, + "learning_rate": 0.00018009182095981983, + "loss": 12.6301, + "step": 8382 + }, + { + "epoch": 0.4564878533553872, + "grad_norm": 0.6883935790218004, + "learning_rate": 0.00018008654050678208, + "loss": 12.4693, + "step": 8383 + }, + { + "epoch": 0.4565423073519702, + "grad_norm": 0.6744703967734476, + "learning_rate": 0.000180081259430978, + "loss": 12.3845, + "step": 8384 + }, + { + "epoch": 0.4565967613485532, + "grad_norm": 0.7186218426749795, + "learning_rate": 0.00018007597773244855, + "loss": 12.3878, + "step": 8385 + }, + { + "epoch": 0.45665121534513625, + "grad_norm": 0.6349234443623138, + "learning_rate": 0.00018007069541123492, + "loss": 12.4049, + "step": 8386 + }, + { + "epoch": 0.45670566934171924, + "grad_norm": 0.5824037807653083, + "learning_rate": 0.0001800654124673781, + "loss": 12.3113, + "step": 8387 + }, + { + "epoch": 0.45676012333830224, + "grad_norm": 0.6222886438219286, + "learning_rate": 0.00018006012890091924, + "loss": 12.4073, + "step": 8388 + }, + { + "epoch": 0.4568145773348853, + "grad_norm": 0.7319420156442162, + "learning_rate": 0.00018005484471189938, + "loss": 12.5158, + "step": 8389 + }, + { + "epoch": 0.4568690313314683, + "grad_norm": 0.7122592024847274, + "learning_rate": 0.00018004955990035963, + "loss": 12.6155, + "step": 8390 + }, + { + "epoch": 0.45692348532805127, + "grad_norm": 0.8861540540847268, + "learning_rate": 0.00018004427446634104, + "loss": 12.4889, + "step": 8391 + }, + { + "epoch": 0.4569779393246343, + "grad_norm": 0.5764384661397526, + "learning_rate": 0.0001800389884098848, + "loss": 12.3368, + "step": 8392 + }, + { + "epoch": 0.4570323933212173, + "grad_norm": 0.6463869931658384, + "learning_rate": 0.00018003370173103194, + "loss": 12.4053, + "step": 8393 + }, + { + "epoch": 0.4570868473178003, + "grad_norm": 0.7557404332112136, + "learning_rate": 0.0001800284144298236, + "loss": 12.4105, + "step": 8394 + }, + { + "epoch": 0.45714130131438335, + "grad_norm": 0.6785861968861243, + "learning_rate": 0.00018002312650630087, + "loss": 12.5533, + "step": 8395 + }, + { + "epoch": 0.45719575531096635, + "grad_norm": 0.5997132875084884, + "learning_rate": 0.0001800178379605049, + "loss": 12.3683, + "step": 8396 + }, + { + "epoch": 0.4572502093075494, + "grad_norm": 0.6723666128208018, + "learning_rate": 0.0001800125487924768, + "loss": 12.4165, + "step": 8397 + }, + { + "epoch": 0.4573046633041324, + "grad_norm": 0.604105015372839, + "learning_rate": 0.00018000725900225767, + "loss": 12.3035, + "step": 8398 + }, + { + "epoch": 0.4573591173007154, + "grad_norm": 0.6976708969027182, + "learning_rate": 0.00018000196858988874, + "loss": 12.4663, + "step": 8399 + }, + { + "epoch": 0.4574135712972984, + "grad_norm": 0.6360785426257929, + "learning_rate": 0.00017999667755541106, + "loss": 12.3827, + "step": 8400 + }, + { + "epoch": 0.4574680252938814, + "grad_norm": 0.750727146607152, + "learning_rate": 0.00017999138589886576, + "loss": 12.4395, + "step": 8401 + }, + { + "epoch": 0.4575224792904644, + "grad_norm": 0.6165410796762938, + "learning_rate": 0.0001799860936202941, + "loss": 12.4172, + "step": 8402 + }, + { + "epoch": 0.45757693328704746, + "grad_norm": 0.664655318925348, + "learning_rate": 0.00017998080071973712, + "loss": 12.4702, + "step": 8403 + }, + { + "epoch": 0.45763138728363045, + "grad_norm": 0.6866522603182079, + "learning_rate": 0.00017997550719723603, + "loss": 12.3881, + "step": 8404 + }, + { + "epoch": 0.45768584128021345, + "grad_norm": 0.6207786327571557, + "learning_rate": 0.000179970213052832, + "loss": 12.5766, + "step": 8405 + }, + { + "epoch": 0.4577402952767965, + "grad_norm": 0.6271054032970457, + "learning_rate": 0.00017996491828656613, + "loss": 12.4155, + "step": 8406 + }, + { + "epoch": 0.4577947492733795, + "grad_norm": 0.788733076673143, + "learning_rate": 0.00017995962289847972, + "loss": 12.4865, + "step": 8407 + }, + { + "epoch": 0.4578492032699625, + "grad_norm": 0.6451839502353852, + "learning_rate": 0.00017995432688861383, + "loss": 12.3857, + "step": 8408 + }, + { + "epoch": 0.4579036572665455, + "grad_norm": 0.7922111769987087, + "learning_rate": 0.00017994903025700967, + "loss": 12.3343, + "step": 8409 + }, + { + "epoch": 0.4579581112631285, + "grad_norm": 0.6291459222045038, + "learning_rate": 0.00017994373300370847, + "loss": 12.3404, + "step": 8410 + }, + { + "epoch": 0.4580125652597115, + "grad_norm": 0.6257345055875672, + "learning_rate": 0.0001799384351287514, + "loss": 12.4469, + "step": 8411 + }, + { + "epoch": 0.45806701925629456, + "grad_norm": 0.6765850397057749, + "learning_rate": 0.00017993313663217962, + "loss": 12.3578, + "step": 8412 + }, + { + "epoch": 0.45812147325287755, + "grad_norm": 0.5731249213737087, + "learning_rate": 0.0001799278375140344, + "loss": 12.4651, + "step": 8413 + }, + { + "epoch": 0.45817592724946055, + "grad_norm": 0.7520989664714446, + "learning_rate": 0.00017992253777435687, + "loss": 12.4347, + "step": 8414 + }, + { + "epoch": 0.4582303812460436, + "grad_norm": 0.6548601863439822, + "learning_rate": 0.00017991723741318834, + "loss": 12.4428, + "step": 8415 + }, + { + "epoch": 0.4582848352426266, + "grad_norm": 0.6742770381298215, + "learning_rate": 0.0001799119364305699, + "loss": 12.5476, + "step": 8416 + }, + { + "epoch": 0.4583392892392096, + "grad_norm": 0.6649821067512534, + "learning_rate": 0.00017990663482654292, + "loss": 12.4915, + "step": 8417 + }, + { + "epoch": 0.45839374323579263, + "grad_norm": 0.7172279380276158, + "learning_rate": 0.0001799013326011485, + "loss": 12.5574, + "step": 8418 + }, + { + "epoch": 0.4584481972323756, + "grad_norm": 0.7547962084995847, + "learning_rate": 0.00017989602975442793, + "loss": 12.4675, + "step": 8419 + }, + { + "epoch": 0.4585026512289586, + "grad_norm": 0.5859887915230573, + "learning_rate": 0.00017989072628642243, + "loss": 12.4824, + "step": 8420 + }, + { + "epoch": 0.45855710522554166, + "grad_norm": 0.7088704213932157, + "learning_rate": 0.00017988542219717326, + "loss": 12.6085, + "step": 8421 + }, + { + "epoch": 0.45861155922212465, + "grad_norm": 0.6530669807244944, + "learning_rate": 0.00017988011748672162, + "loss": 12.5474, + "step": 8422 + }, + { + "epoch": 0.45866601321870765, + "grad_norm": 0.6012504516424516, + "learning_rate": 0.00017987481215510882, + "loss": 12.4466, + "step": 8423 + }, + { + "epoch": 0.4587204672152907, + "grad_norm": 0.6321909432910274, + "learning_rate": 0.00017986950620237609, + "loss": 12.4394, + "step": 8424 + }, + { + "epoch": 0.4587749212118737, + "grad_norm": 0.5955062332721126, + "learning_rate": 0.00017986419962856464, + "loss": 12.4106, + "step": 8425 + }, + { + "epoch": 0.4588293752084567, + "grad_norm": 0.5834126337825684, + "learning_rate": 0.00017985889243371582, + "loss": 12.3952, + "step": 8426 + }, + { + "epoch": 0.45888382920503973, + "grad_norm": 0.6306202539060862, + "learning_rate": 0.00017985358461787082, + "loss": 12.4875, + "step": 8427 + }, + { + "epoch": 0.4589382832016227, + "grad_norm": 0.6098987214034494, + "learning_rate": 0.00017984827618107102, + "loss": 12.5203, + "step": 8428 + }, + { + "epoch": 0.4589927371982057, + "grad_norm": 0.6298011928808441, + "learning_rate": 0.00017984296712335757, + "loss": 12.4491, + "step": 8429 + }, + { + "epoch": 0.45904719119478876, + "grad_norm": 0.6316231234434623, + "learning_rate": 0.00017983765744477185, + "loss": 12.3603, + "step": 8430 + }, + { + "epoch": 0.45910164519137175, + "grad_norm": 0.621224439891547, + "learning_rate": 0.0001798323471453551, + "loss": 12.5129, + "step": 8431 + }, + { + "epoch": 0.4591560991879548, + "grad_norm": 0.6215508990689357, + "learning_rate": 0.00017982703622514867, + "loss": 12.4447, + "step": 8432 + }, + { + "epoch": 0.4592105531845378, + "grad_norm": 0.6373085894914177, + "learning_rate": 0.00017982172468419377, + "loss": 12.3633, + "step": 8433 + }, + { + "epoch": 0.4592650071811208, + "grad_norm": 0.5999470984578853, + "learning_rate": 0.00017981641252253177, + "loss": 12.3336, + "step": 8434 + }, + { + "epoch": 0.45931946117770384, + "grad_norm": 0.6530057898257308, + "learning_rate": 0.000179811099740204, + "loss": 12.5767, + "step": 8435 + }, + { + "epoch": 0.45937391517428683, + "grad_norm": 0.6462165261279232, + "learning_rate": 0.00017980578633725166, + "loss": 12.4114, + "step": 8436 + }, + { + "epoch": 0.4594283691708698, + "grad_norm": 0.6665737763342021, + "learning_rate": 0.0001798004723137162, + "loss": 12.4862, + "step": 8437 + }, + { + "epoch": 0.45948282316745287, + "grad_norm": 0.5854356255683455, + "learning_rate": 0.00017979515766963888, + "loss": 12.3757, + "step": 8438 + }, + { + "epoch": 0.45953727716403586, + "grad_norm": 0.6429628084102862, + "learning_rate": 0.00017978984240506101, + "loss": 12.4527, + "step": 8439 + }, + { + "epoch": 0.45959173116061885, + "grad_norm": 0.6224284086182738, + "learning_rate": 0.00017978452652002392, + "loss": 12.3866, + "step": 8440 + }, + { + "epoch": 0.4596461851572019, + "grad_norm": 0.7085312267599722, + "learning_rate": 0.00017977921001456902, + "loss": 12.5016, + "step": 8441 + }, + { + "epoch": 0.4597006391537849, + "grad_norm": 0.6625040975468844, + "learning_rate": 0.0001797738928887376, + "loss": 12.5766, + "step": 8442 + }, + { + "epoch": 0.4597550931503679, + "grad_norm": 0.7112702932006277, + "learning_rate": 0.000179768575142571, + "loss": 12.4637, + "step": 8443 + }, + { + "epoch": 0.45980954714695094, + "grad_norm": 0.6543475319952922, + "learning_rate": 0.0001797632567761106, + "loss": 12.4763, + "step": 8444 + }, + { + "epoch": 0.45986400114353393, + "grad_norm": 0.6561221437463571, + "learning_rate": 0.00017975793778939768, + "loss": 12.2698, + "step": 8445 + }, + { + "epoch": 0.4599184551401169, + "grad_norm": 0.6304476519096577, + "learning_rate": 0.00017975261818247373, + "loss": 12.4731, + "step": 8446 + }, + { + "epoch": 0.45997290913669997, + "grad_norm": 0.6737388668545593, + "learning_rate": 0.00017974729795538, + "loss": 12.5866, + "step": 8447 + }, + { + "epoch": 0.46002736313328296, + "grad_norm": 0.64445630023729, + "learning_rate": 0.00017974197710815795, + "loss": 12.3961, + "step": 8448 + }, + { + "epoch": 0.46008181712986596, + "grad_norm": 0.570660478325581, + "learning_rate": 0.0001797366556408489, + "loss": 12.5089, + "step": 8449 + }, + { + "epoch": 0.460136271126449, + "grad_norm": 0.7022380367884817, + "learning_rate": 0.0001797313335534942, + "loss": 12.3741, + "step": 8450 + }, + { + "epoch": 0.460190725123032, + "grad_norm": 0.612890241562397, + "learning_rate": 0.00017972601084613533, + "loss": 12.4199, + "step": 8451 + }, + { + "epoch": 0.460245179119615, + "grad_norm": 0.6156966291816607, + "learning_rate": 0.0001797206875188136, + "loss": 12.5195, + "step": 8452 + }, + { + "epoch": 0.46029963311619804, + "grad_norm": 0.6462544029384523, + "learning_rate": 0.00017971536357157044, + "loss": 12.5192, + "step": 8453 + }, + { + "epoch": 0.46035408711278103, + "grad_norm": 0.6231812162951743, + "learning_rate": 0.00017971003900444727, + "loss": 12.424, + "step": 8454 + }, + { + "epoch": 0.460408541109364, + "grad_norm": 0.6855647949717323, + "learning_rate": 0.00017970471381748544, + "loss": 12.4334, + "step": 8455 + }, + { + "epoch": 0.46046299510594707, + "grad_norm": 0.6311800182849413, + "learning_rate": 0.0001796993880107264, + "loss": 12.3238, + "step": 8456 + }, + { + "epoch": 0.46051744910253006, + "grad_norm": 0.6337541676749251, + "learning_rate": 0.00017969406158421157, + "loss": 12.4585, + "step": 8457 + }, + { + "epoch": 0.46057190309911306, + "grad_norm": 0.6208653234499459, + "learning_rate": 0.0001796887345379823, + "loss": 12.4279, + "step": 8458 + }, + { + "epoch": 0.4606263570956961, + "grad_norm": 0.6330536996832912, + "learning_rate": 0.00017968340687208012, + "loss": 12.4568, + "step": 8459 + }, + { + "epoch": 0.4606808110922791, + "grad_norm": 0.6090976625280496, + "learning_rate": 0.0001796780785865464, + "loss": 12.3771, + "step": 8460 + }, + { + "epoch": 0.4607352650888621, + "grad_norm": 0.65006715249651, + "learning_rate": 0.00017967274968142257, + "loss": 12.5136, + "step": 8461 + }, + { + "epoch": 0.46078971908544514, + "grad_norm": 0.6091210194797642, + "learning_rate": 0.00017966742015675008, + "loss": 12.4713, + "step": 8462 + }, + { + "epoch": 0.46084417308202813, + "grad_norm": 0.5394330605868857, + "learning_rate": 0.00017966209001257032, + "loss": 12.3411, + "step": 8463 + }, + { + "epoch": 0.4608986270786112, + "grad_norm": 0.6299230807970284, + "learning_rate": 0.00017965675924892484, + "loss": 12.3845, + "step": 8464 + }, + { + "epoch": 0.46095308107519417, + "grad_norm": 0.6004734955181203, + "learning_rate": 0.00017965142786585504, + "loss": 12.2142, + "step": 8465 + }, + { + "epoch": 0.46100753507177716, + "grad_norm": 0.6013644648463967, + "learning_rate": 0.0001796460958634024, + "loss": 12.4469, + "step": 8466 + }, + { + "epoch": 0.4610619890683602, + "grad_norm": 0.6540201882122875, + "learning_rate": 0.00017964076324160832, + "loss": 12.4277, + "step": 8467 + }, + { + "epoch": 0.4611164430649432, + "grad_norm": 0.6842417699779186, + "learning_rate": 0.00017963543000051432, + "loss": 12.4336, + "step": 8468 + }, + { + "epoch": 0.4611708970615262, + "grad_norm": 0.6259991019467918, + "learning_rate": 0.00017963009614016187, + "loss": 12.4175, + "step": 8469 + }, + { + "epoch": 0.46122535105810925, + "grad_norm": 0.7977560619398021, + "learning_rate": 0.00017962476166059243, + "loss": 12.6586, + "step": 8470 + }, + { + "epoch": 0.46127980505469224, + "grad_norm": 0.7040812223013768, + "learning_rate": 0.0001796194265618475, + "loss": 12.4542, + "step": 8471 + }, + { + "epoch": 0.46133425905127523, + "grad_norm": 0.6170972449417943, + "learning_rate": 0.00017961409084396856, + "loss": 12.2917, + "step": 8472 + }, + { + "epoch": 0.4613887130478583, + "grad_norm": 0.5628385005146082, + "learning_rate": 0.00017960875450699707, + "loss": 12.4389, + "step": 8473 + }, + { + "epoch": 0.46144316704444127, + "grad_norm": 0.7106466298776539, + "learning_rate": 0.00017960341755097459, + "loss": 12.4437, + "step": 8474 + }, + { + "epoch": 0.46149762104102426, + "grad_norm": 0.6325480250324459, + "learning_rate": 0.00017959807997594256, + "loss": 12.3271, + "step": 8475 + }, + { + "epoch": 0.4615520750376073, + "grad_norm": 0.6688376322815393, + "learning_rate": 0.00017959274178194252, + "loss": 12.5363, + "step": 8476 + }, + { + "epoch": 0.4616065290341903, + "grad_norm": 0.6390089606861243, + "learning_rate": 0.00017958740296901597, + "loss": 12.5253, + "step": 8477 + }, + { + "epoch": 0.4616609830307733, + "grad_norm": 0.6483652623580444, + "learning_rate": 0.00017958206353720443, + "loss": 12.3701, + "step": 8478 + }, + { + "epoch": 0.46171543702735635, + "grad_norm": 0.5747024925879572, + "learning_rate": 0.00017957672348654943, + "loss": 12.3712, + "step": 8479 + }, + { + "epoch": 0.46176989102393934, + "grad_norm": 0.6407691144277222, + "learning_rate": 0.00017957138281709246, + "loss": 12.5068, + "step": 8480 + }, + { + "epoch": 0.46182434502052233, + "grad_norm": 0.5804922627948418, + "learning_rate": 0.00017956604152887507, + "loss": 12.4644, + "step": 8481 + }, + { + "epoch": 0.4618787990171054, + "grad_norm": 0.7734467267607033, + "learning_rate": 0.00017956069962193886, + "loss": 12.5521, + "step": 8482 + }, + { + "epoch": 0.46193325301368837, + "grad_norm": 0.5895896860218451, + "learning_rate": 0.00017955535709632522, + "loss": 12.441, + "step": 8483 + }, + { + "epoch": 0.46198770701027136, + "grad_norm": 0.7151105273525564, + "learning_rate": 0.00017955001395207585, + "loss": 12.3842, + "step": 8484 + }, + { + "epoch": 0.4620421610068544, + "grad_norm": 0.605451804966123, + "learning_rate": 0.0001795446701892322, + "loss": 12.3717, + "step": 8485 + }, + { + "epoch": 0.4620966150034374, + "grad_norm": 0.6796432599907761, + "learning_rate": 0.00017953932580783586, + "loss": 12.5606, + "step": 8486 + }, + { + "epoch": 0.4621510690000204, + "grad_norm": 0.6543516269832156, + "learning_rate": 0.00017953398080792837, + "loss": 12.3228, + "step": 8487 + }, + { + "epoch": 0.46220552299660345, + "grad_norm": 0.7231693338943618, + "learning_rate": 0.00017952863518955133, + "loss": 12.561, + "step": 8488 + }, + { + "epoch": 0.46225997699318644, + "grad_norm": 0.6967078738341969, + "learning_rate": 0.0001795232889527463, + "loss": 12.4842, + "step": 8489 + }, + { + "epoch": 0.46231443098976943, + "grad_norm": 0.6900231716925654, + "learning_rate": 0.0001795179420975548, + "loss": 12.5378, + "step": 8490 + }, + { + "epoch": 0.4623688849863525, + "grad_norm": 0.6565505110739019, + "learning_rate": 0.00017951259462401845, + "loss": 12.3519, + "step": 8491 + }, + { + "epoch": 0.4624233389829355, + "grad_norm": 0.5804690618308558, + "learning_rate": 0.00017950724653217884, + "loss": 12.4108, + "step": 8492 + }, + { + "epoch": 0.46247779297951846, + "grad_norm": 0.695013601610202, + "learning_rate": 0.00017950189782207755, + "loss": 12.4763, + "step": 8493 + }, + { + "epoch": 0.4625322469761015, + "grad_norm": 0.6476402393977978, + "learning_rate": 0.00017949654849375616, + "loss": 12.5224, + "step": 8494 + }, + { + "epoch": 0.4625867009726845, + "grad_norm": 0.650412323007607, + "learning_rate": 0.00017949119854725632, + "loss": 12.5714, + "step": 8495 + }, + { + "epoch": 0.4626411549692675, + "grad_norm": 0.6177719603875415, + "learning_rate": 0.00017948584798261954, + "loss": 12.4933, + "step": 8496 + }, + { + "epoch": 0.46269560896585055, + "grad_norm": 0.6140868763191671, + "learning_rate": 0.0001794804967998875, + "loss": 12.4619, + "step": 8497 + }, + { + "epoch": 0.46275006296243354, + "grad_norm": 0.6042822927413228, + "learning_rate": 0.00017947514499910177, + "loss": 12.3968, + "step": 8498 + }, + { + "epoch": 0.4628045169590166, + "grad_norm": 0.614738493564977, + "learning_rate": 0.00017946979258030399, + "loss": 12.4828, + "step": 8499 + }, + { + "epoch": 0.4628589709555996, + "grad_norm": 0.5737063644687317, + "learning_rate": 0.00017946443954353577, + "loss": 12.3964, + "step": 8500 + }, + { + "epoch": 0.4629134249521826, + "grad_norm": 0.6287436734117607, + "learning_rate": 0.00017945908588883877, + "loss": 12.5085, + "step": 8501 + }, + { + "epoch": 0.4629678789487656, + "grad_norm": 0.6167967595743099, + "learning_rate": 0.00017945373161625455, + "loss": 12.3852, + "step": 8502 + }, + { + "epoch": 0.4630223329453486, + "grad_norm": 0.6064207477081835, + "learning_rate": 0.00017944837672582485, + "loss": 12.3263, + "step": 8503 + }, + { + "epoch": 0.4630767869419316, + "grad_norm": 0.6333852371143583, + "learning_rate": 0.0001794430212175912, + "loss": 12.3366, + "step": 8504 + }, + { + "epoch": 0.46313124093851465, + "grad_norm": 0.6246520185592767, + "learning_rate": 0.0001794376650915953, + "loss": 12.3339, + "step": 8505 + }, + { + "epoch": 0.46318569493509765, + "grad_norm": 0.6553154091068601, + "learning_rate": 0.00017943230834787882, + "loss": 12.3773, + "step": 8506 + }, + { + "epoch": 0.46324014893168064, + "grad_norm": 0.6780757185364424, + "learning_rate": 0.00017942695098648335, + "loss": 12.4337, + "step": 8507 + }, + { + "epoch": 0.4632946029282637, + "grad_norm": 0.5992151156206015, + "learning_rate": 0.00017942159300745063, + "loss": 12.4676, + "step": 8508 + }, + { + "epoch": 0.4633490569248467, + "grad_norm": 0.7139367597885157, + "learning_rate": 0.00017941623441082225, + "loss": 12.4756, + "step": 8509 + }, + { + "epoch": 0.4634035109214297, + "grad_norm": 0.6074562947215172, + "learning_rate": 0.00017941087519663995, + "loss": 12.3758, + "step": 8510 + }, + { + "epoch": 0.4634579649180127, + "grad_norm": 0.5919073671557712, + "learning_rate": 0.00017940551536494535, + "loss": 12.4662, + "step": 8511 + }, + { + "epoch": 0.4635124189145957, + "grad_norm": 0.6288349721285509, + "learning_rate": 0.00017940015491578012, + "loss": 12.4327, + "step": 8512 + }, + { + "epoch": 0.4635668729111787, + "grad_norm": 0.6377686064725293, + "learning_rate": 0.000179394793849186, + "loss": 12.4457, + "step": 8513 + }, + { + "epoch": 0.46362132690776175, + "grad_norm": 0.6331751229664899, + "learning_rate": 0.00017938943216520462, + "loss": 12.4591, + "step": 8514 + }, + { + "epoch": 0.46367578090434475, + "grad_norm": 0.6205814254067495, + "learning_rate": 0.00017938406986387775, + "loss": 12.4356, + "step": 8515 + }, + { + "epoch": 0.46373023490092774, + "grad_norm": 0.6120530025858775, + "learning_rate": 0.000179378706945247, + "loss": 12.465, + "step": 8516 + }, + { + "epoch": 0.4637846888975108, + "grad_norm": 0.6576904060379765, + "learning_rate": 0.00017937334340935413, + "loss": 12.3719, + "step": 8517 + }, + { + "epoch": 0.4638391428940938, + "grad_norm": 0.6302025635351302, + "learning_rate": 0.0001793679792562408, + "loss": 12.3565, + "step": 8518 + }, + { + "epoch": 0.4638935968906768, + "grad_norm": 0.6226271009698168, + "learning_rate": 0.00017936261448594877, + "loss": 12.476, + "step": 8519 + }, + { + "epoch": 0.4639480508872598, + "grad_norm": 0.7097366292174755, + "learning_rate": 0.00017935724909851978, + "loss": 12.4391, + "step": 8520 + }, + { + "epoch": 0.4640025048838428, + "grad_norm": 0.6343089219332183, + "learning_rate": 0.00017935188309399546, + "loss": 12.411, + "step": 8521 + }, + { + "epoch": 0.4640569588804258, + "grad_norm": 0.6863122806284503, + "learning_rate": 0.00017934651647241764, + "loss": 12.5285, + "step": 8522 + }, + { + "epoch": 0.46411141287700886, + "grad_norm": 0.6400060317241383, + "learning_rate": 0.00017934114923382798, + "loss": 12.4462, + "step": 8523 + }, + { + "epoch": 0.46416586687359185, + "grad_norm": 0.614577460142748, + "learning_rate": 0.00017933578137826822, + "loss": 12.4177, + "step": 8524 + }, + { + "epoch": 0.46422032087017484, + "grad_norm": 0.6379323169224547, + "learning_rate": 0.00017933041290578013, + "loss": 12.45, + "step": 8525 + }, + { + "epoch": 0.4642747748667579, + "grad_norm": 0.7842891853867016, + "learning_rate": 0.00017932504381640545, + "loss": 12.3224, + "step": 8526 + }, + { + "epoch": 0.4643292288633409, + "grad_norm": 0.6253337426516982, + "learning_rate": 0.0001793196741101859, + "loss": 12.4774, + "step": 8527 + }, + { + "epoch": 0.4643836828599239, + "grad_norm": 0.7597331992933967, + "learning_rate": 0.00017931430378716328, + "loss": 12.5435, + "step": 8528 + }, + { + "epoch": 0.4644381368565069, + "grad_norm": 0.614380364429936, + "learning_rate": 0.00017930893284737932, + "loss": 12.3992, + "step": 8529 + }, + { + "epoch": 0.4644925908530899, + "grad_norm": 0.6301733598215885, + "learning_rate": 0.00017930356129087585, + "loss": 12.4247, + "step": 8530 + }, + { + "epoch": 0.46454704484967296, + "grad_norm": 0.7021075766021644, + "learning_rate": 0.00017929818911769453, + "loss": 12.5615, + "step": 8531 + }, + { + "epoch": 0.46460149884625596, + "grad_norm": 0.6635152504671655, + "learning_rate": 0.00017929281632787723, + "loss": 12.4488, + "step": 8532 + }, + { + "epoch": 0.46465595284283895, + "grad_norm": 0.6959204379962824, + "learning_rate": 0.00017928744292146568, + "loss": 12.3817, + "step": 8533 + }, + { + "epoch": 0.464710406839422, + "grad_norm": 0.5937204435815098, + "learning_rate": 0.00017928206889850169, + "loss": 12.4555, + "step": 8534 + }, + { + "epoch": 0.464764860836005, + "grad_norm": 0.7572250430526706, + "learning_rate": 0.00017927669425902703, + "loss": 12.4628, + "step": 8535 + }, + { + "epoch": 0.464819314832588, + "grad_norm": 0.6405099518133247, + "learning_rate": 0.00017927131900308347, + "loss": 12.3633, + "step": 8536 + }, + { + "epoch": 0.46487376882917103, + "grad_norm": 0.6410726229961785, + "learning_rate": 0.0001792659431307129, + "loss": 12.4437, + "step": 8537 + }, + { + "epoch": 0.464928222825754, + "grad_norm": 0.6256073298072141, + "learning_rate": 0.000179260566641957, + "loss": 12.3884, + "step": 8538 + }, + { + "epoch": 0.464982676822337, + "grad_norm": 0.620457959466047, + "learning_rate": 0.00017925518953685766, + "loss": 12.3345, + "step": 8539 + }, + { + "epoch": 0.46503713081892006, + "grad_norm": 0.6852978455402273, + "learning_rate": 0.0001792498118154567, + "loss": 12.5612, + "step": 8540 + }, + { + "epoch": 0.46509158481550306, + "grad_norm": 0.6724753812631421, + "learning_rate": 0.0001792444334777959, + "loss": 12.4961, + "step": 8541 + }, + { + "epoch": 0.46514603881208605, + "grad_norm": 0.6879903993263202, + "learning_rate": 0.0001792390545239171, + "loss": 12.5168, + "step": 8542 + }, + { + "epoch": 0.4652004928086691, + "grad_norm": 0.5520030988039244, + "learning_rate": 0.0001792336749538621, + "loss": 12.4215, + "step": 8543 + }, + { + "epoch": 0.4652549468052521, + "grad_norm": 0.5474435428367739, + "learning_rate": 0.00017922829476767278, + "loss": 12.3678, + "step": 8544 + }, + { + "epoch": 0.4653094008018351, + "grad_norm": 0.7329959158129233, + "learning_rate": 0.00017922291396539093, + "loss": 12.3445, + "step": 8545 + }, + { + "epoch": 0.46536385479841813, + "grad_norm": 0.6512212767290448, + "learning_rate": 0.00017921753254705844, + "loss": 12.5902, + "step": 8546 + }, + { + "epoch": 0.4654183087950011, + "grad_norm": 0.5896209592313086, + "learning_rate": 0.0001792121505127171, + "loss": 12.4124, + "step": 8547 + }, + { + "epoch": 0.4654727627915841, + "grad_norm": 0.5841614441736784, + "learning_rate": 0.00017920676786240883, + "loss": 12.3758, + "step": 8548 + }, + { + "epoch": 0.46552721678816716, + "grad_norm": 0.5898442729547501, + "learning_rate": 0.00017920138459617544, + "loss": 12.4799, + "step": 8549 + }, + { + "epoch": 0.46558167078475016, + "grad_norm": 0.5748459449326601, + "learning_rate": 0.00017919600071405881, + "loss": 12.3012, + "step": 8550 + }, + { + "epoch": 0.46563612478133315, + "grad_norm": 0.608117050757133, + "learning_rate": 0.0001791906162161008, + "loss": 12.4454, + "step": 8551 + }, + { + "epoch": 0.4656905787779162, + "grad_norm": 0.6672970333611582, + "learning_rate": 0.00017918523110234324, + "loss": 12.3224, + "step": 8552 + }, + { + "epoch": 0.4657450327744992, + "grad_norm": 0.7520661230221706, + "learning_rate": 0.00017917984537282807, + "loss": 12.354, + "step": 8553 + }, + { + "epoch": 0.4657994867710822, + "grad_norm": 0.6302720683788045, + "learning_rate": 0.00017917445902759714, + "loss": 12.3604, + "step": 8554 + }, + { + "epoch": 0.46585394076766523, + "grad_norm": 0.6579941205866369, + "learning_rate": 0.00017916907206669237, + "loss": 12.5587, + "step": 8555 + }, + { + "epoch": 0.4659083947642482, + "grad_norm": 0.6165873606250319, + "learning_rate": 0.00017916368449015556, + "loss": 12.431, + "step": 8556 + }, + { + "epoch": 0.4659628487608312, + "grad_norm": 0.7295377109985214, + "learning_rate": 0.0001791582962980287, + "loss": 12.3715, + "step": 8557 + }, + { + "epoch": 0.46601730275741426, + "grad_norm": 0.6578045921999068, + "learning_rate": 0.00017915290749035364, + "loss": 12.3832, + "step": 8558 + }, + { + "epoch": 0.46607175675399726, + "grad_norm": 0.6093581051436394, + "learning_rate": 0.0001791475180671723, + "loss": 12.5183, + "step": 8559 + }, + { + "epoch": 0.46612621075058025, + "grad_norm": 0.6087113495662672, + "learning_rate": 0.0001791421280285266, + "loss": 12.5348, + "step": 8560 + }, + { + "epoch": 0.4661806647471633, + "grad_norm": 0.6808914595686044, + "learning_rate": 0.00017913673737445844, + "loss": 12.5032, + "step": 8561 + }, + { + "epoch": 0.4662351187437463, + "grad_norm": 0.6960835995102816, + "learning_rate": 0.0001791313461050097, + "loss": 12.5811, + "step": 8562 + }, + { + "epoch": 0.4662895727403293, + "grad_norm": 0.7637939174979597, + "learning_rate": 0.0001791259542202224, + "loss": 12.4642, + "step": 8563 + }, + { + "epoch": 0.46634402673691233, + "grad_norm": 0.6495379560707983, + "learning_rate": 0.00017912056172013837, + "loss": 12.4818, + "step": 8564 + }, + { + "epoch": 0.4663984807334953, + "grad_norm": 0.644911863272382, + "learning_rate": 0.0001791151686047996, + "loss": 12.497, + "step": 8565 + }, + { + "epoch": 0.4664529347300784, + "grad_norm": 0.7269018585601411, + "learning_rate": 0.00017910977487424801, + "loss": 12.5889, + "step": 8566 + }, + { + "epoch": 0.46650738872666137, + "grad_norm": 0.5786874526945133, + "learning_rate": 0.00017910438052852557, + "loss": 12.2868, + "step": 8567 + }, + { + "epoch": 0.46656184272324436, + "grad_norm": 0.6627592741179691, + "learning_rate": 0.00017909898556767414, + "loss": 12.4585, + "step": 8568 + }, + { + "epoch": 0.4666162967198274, + "grad_norm": 0.693478438138713, + "learning_rate": 0.00017909358999173577, + "loss": 12.3521, + "step": 8569 + }, + { + "epoch": 0.4666707507164104, + "grad_norm": 0.6801219655549972, + "learning_rate": 0.0001790881938007524, + "loss": 12.4614, + "step": 8570 + }, + { + "epoch": 0.4667252047129934, + "grad_norm": 0.7301463752425134, + "learning_rate": 0.00017908279699476592, + "loss": 12.3549, + "step": 8571 + }, + { + "epoch": 0.46677965870957644, + "grad_norm": 0.6346179560399473, + "learning_rate": 0.00017907739957381838, + "loss": 12.3989, + "step": 8572 + }, + { + "epoch": 0.46683411270615943, + "grad_norm": 0.686851017492308, + "learning_rate": 0.00017907200153795171, + "loss": 12.3688, + "step": 8573 + }, + { + "epoch": 0.4668885667027424, + "grad_norm": 0.6605494864320264, + "learning_rate": 0.0001790666028872079, + "loss": 12.4429, + "step": 8574 + }, + { + "epoch": 0.4669430206993255, + "grad_norm": 0.7145770417283764, + "learning_rate": 0.00017906120362162894, + "loss": 12.3289, + "step": 8575 + }, + { + "epoch": 0.46699747469590847, + "grad_norm": 0.6897110873551012, + "learning_rate": 0.00017905580374125678, + "loss": 12.4011, + "step": 8576 + }, + { + "epoch": 0.46705192869249146, + "grad_norm": 0.7216304975696103, + "learning_rate": 0.00017905040324613344, + "loss": 12.472, + "step": 8577 + }, + { + "epoch": 0.4671063826890745, + "grad_norm": 0.6970788524101713, + "learning_rate": 0.00017904500213630092, + "loss": 12.507, + "step": 8578 + }, + { + "epoch": 0.4671608366856575, + "grad_norm": 0.631846296329978, + "learning_rate": 0.0001790396004118012, + "loss": 12.3616, + "step": 8579 + }, + { + "epoch": 0.4672152906822405, + "grad_norm": 0.6871599189413196, + "learning_rate": 0.0001790341980726763, + "loss": 12.3542, + "step": 8580 + }, + { + "epoch": 0.46726974467882354, + "grad_norm": 0.6227620922656812, + "learning_rate": 0.0001790287951189682, + "loss": 12.4218, + "step": 8581 + }, + { + "epoch": 0.46732419867540653, + "grad_norm": 0.6327965646327595, + "learning_rate": 0.00017902339155071896, + "loss": 12.5839, + "step": 8582 + }, + { + "epoch": 0.4673786526719895, + "grad_norm": 0.5943442388995044, + "learning_rate": 0.00017901798736797054, + "loss": 12.2484, + "step": 8583 + }, + { + "epoch": 0.4674331066685726, + "grad_norm": 0.6691449075498682, + "learning_rate": 0.00017901258257076503, + "loss": 12.5749, + "step": 8584 + }, + { + "epoch": 0.46748756066515557, + "grad_norm": 0.6469442060100173, + "learning_rate": 0.00017900717715914444, + "loss": 12.4268, + "step": 8585 + }, + { + "epoch": 0.46754201466173856, + "grad_norm": 0.7164673653306719, + "learning_rate": 0.00017900177113315075, + "loss": 12.3637, + "step": 8586 + }, + { + "epoch": 0.4675964686583216, + "grad_norm": 0.5329022400651572, + "learning_rate": 0.00017899636449282606, + "loss": 12.3726, + "step": 8587 + }, + { + "epoch": 0.4676509226549046, + "grad_norm": 0.7268214005708902, + "learning_rate": 0.0001789909572382124, + "loss": 12.4756, + "step": 8588 + }, + { + "epoch": 0.4677053766514876, + "grad_norm": 0.7438340959520051, + "learning_rate": 0.0001789855493693518, + "loss": 12.4556, + "step": 8589 + }, + { + "epoch": 0.46775983064807064, + "grad_norm": 0.5711552451773573, + "learning_rate": 0.00017898014088628633, + "loss": 12.442, + "step": 8590 + }, + { + "epoch": 0.46781428464465363, + "grad_norm": 0.760380130815662, + "learning_rate": 0.00017897473178905805, + "loss": 12.4894, + "step": 8591 + }, + { + "epoch": 0.4678687386412366, + "grad_norm": 0.6237848382862727, + "learning_rate": 0.00017896932207770895, + "loss": 12.3818, + "step": 8592 + }, + { + "epoch": 0.4679231926378197, + "grad_norm": 0.6191436468502493, + "learning_rate": 0.00017896391175228123, + "loss": 12.4273, + "step": 8593 + }, + { + "epoch": 0.46797764663440267, + "grad_norm": 0.6690538751938576, + "learning_rate": 0.00017895850081281687, + "loss": 12.4312, + "step": 8594 + }, + { + "epoch": 0.46803210063098566, + "grad_norm": 0.6807126771202414, + "learning_rate": 0.00017895308925935794, + "loss": 12.5251, + "step": 8595 + }, + { + "epoch": 0.4680865546275687, + "grad_norm": 0.6639063805134303, + "learning_rate": 0.00017894767709194658, + "loss": 12.4868, + "step": 8596 + }, + { + "epoch": 0.4681410086241517, + "grad_norm": 0.7908769442385336, + "learning_rate": 0.00017894226431062481, + "loss": 12.5073, + "step": 8597 + }, + { + "epoch": 0.46819546262073475, + "grad_norm": 0.665810749723039, + "learning_rate": 0.00017893685091543478, + "loss": 12.502, + "step": 8598 + }, + { + "epoch": 0.46824991661731774, + "grad_norm": 0.7095793243118013, + "learning_rate": 0.00017893143690641855, + "loss": 12.4091, + "step": 8599 + }, + { + "epoch": 0.46830437061390073, + "grad_norm": 0.6582534562533506, + "learning_rate": 0.00017892602228361824, + "loss": 12.3877, + "step": 8600 + }, + { + "epoch": 0.4683588246104838, + "grad_norm": 0.6099299580757082, + "learning_rate": 0.0001789206070470759, + "loss": 12.3216, + "step": 8601 + }, + { + "epoch": 0.4684132786070668, + "grad_norm": 0.6673127073946039, + "learning_rate": 0.00017891519119683376, + "loss": 12.5892, + "step": 8602 + }, + { + "epoch": 0.46846773260364977, + "grad_norm": 0.6467498612037779, + "learning_rate": 0.0001789097747329338, + "loss": 12.4686, + "step": 8603 + }, + { + "epoch": 0.4685221866002328, + "grad_norm": 0.6261087762490277, + "learning_rate": 0.00017890435765541821, + "loss": 12.5421, + "step": 8604 + }, + { + "epoch": 0.4685766405968158, + "grad_norm": 0.6264196217535764, + "learning_rate": 0.00017889893996432914, + "loss": 12.22, + "step": 8605 + }, + { + "epoch": 0.4686310945933988, + "grad_norm": 0.6518593103011723, + "learning_rate": 0.00017889352165970866, + "loss": 12.4012, + "step": 8606 + }, + { + "epoch": 0.46868554858998185, + "grad_norm": 0.5694633740086411, + "learning_rate": 0.0001788881027415989, + "loss": 12.45, + "step": 8607 + }, + { + "epoch": 0.46874000258656484, + "grad_norm": 0.7154487883983486, + "learning_rate": 0.00017888268321004203, + "loss": 12.4673, + "step": 8608 + }, + { + "epoch": 0.46879445658314783, + "grad_norm": 0.6391015557412403, + "learning_rate": 0.00017887726306508022, + "loss": 12.515, + "step": 8609 + }, + { + "epoch": 0.4688489105797309, + "grad_norm": 0.6805762056289465, + "learning_rate": 0.00017887184230675556, + "loss": 12.6188, + "step": 8610 + }, + { + "epoch": 0.4689033645763139, + "grad_norm": 0.6699372502313612, + "learning_rate": 0.00017886642093511025, + "loss": 12.5241, + "step": 8611 + }, + { + "epoch": 0.46895781857289687, + "grad_norm": 0.6015103170519449, + "learning_rate": 0.0001788609989501864, + "loss": 12.4285, + "step": 8612 + }, + { + "epoch": 0.4690122725694799, + "grad_norm": 0.6768216487927327, + "learning_rate": 0.0001788555763520262, + "loss": 12.3188, + "step": 8613 + }, + { + "epoch": 0.4690667265660629, + "grad_norm": 0.6785633981073629, + "learning_rate": 0.0001788501531406718, + "loss": 12.3742, + "step": 8614 + }, + { + "epoch": 0.4691211805626459, + "grad_norm": 0.6077029256627378, + "learning_rate": 0.00017884472931616543, + "loss": 12.3687, + "step": 8615 + }, + { + "epoch": 0.46917563455922895, + "grad_norm": 0.6621690317050741, + "learning_rate": 0.0001788393048785492, + "loss": 12.3839, + "step": 8616 + }, + { + "epoch": 0.46923008855581194, + "grad_norm": 0.6656474743344936, + "learning_rate": 0.0001788338798278653, + "loss": 12.3856, + "step": 8617 + }, + { + "epoch": 0.46928454255239493, + "grad_norm": 0.66881792016721, + "learning_rate": 0.00017882845416415595, + "loss": 12.5676, + "step": 8618 + }, + { + "epoch": 0.469338996548978, + "grad_norm": 0.767602290036045, + "learning_rate": 0.0001788230278874633, + "loss": 12.3283, + "step": 8619 + }, + { + "epoch": 0.469393450545561, + "grad_norm": 0.7045073223901752, + "learning_rate": 0.00017881760099782958, + "loss": 12.5283, + "step": 8620 + }, + { + "epoch": 0.46944790454214397, + "grad_norm": 0.6515931879334268, + "learning_rate": 0.00017881217349529697, + "loss": 12.4897, + "step": 8621 + }, + { + "epoch": 0.469502358538727, + "grad_norm": 0.6473209646368532, + "learning_rate": 0.0001788067453799077, + "loss": 12.4732, + "step": 8622 + }, + { + "epoch": 0.46955681253531, + "grad_norm": 0.7198939445665117, + "learning_rate": 0.00017880131665170393, + "loss": 12.465, + "step": 8623 + }, + { + "epoch": 0.469611266531893, + "grad_norm": 0.6935062918663318, + "learning_rate": 0.00017879588731072794, + "loss": 12.4833, + "step": 8624 + }, + { + "epoch": 0.46966572052847605, + "grad_norm": 0.6226308375788506, + "learning_rate": 0.0001787904573570219, + "loss": 12.5073, + "step": 8625 + }, + { + "epoch": 0.46972017452505904, + "grad_norm": 0.6849934242523069, + "learning_rate": 0.00017878502679062806, + "loss": 12.3697, + "step": 8626 + }, + { + "epoch": 0.46977462852164203, + "grad_norm": 0.6070916164445785, + "learning_rate": 0.00017877959561158862, + "loss": 12.4153, + "step": 8627 + }, + { + "epoch": 0.4698290825182251, + "grad_norm": 0.6603216087072952, + "learning_rate": 0.00017877416381994584, + "loss": 12.4331, + "step": 8628 + }, + { + "epoch": 0.4698835365148081, + "grad_norm": 0.5860755470881435, + "learning_rate": 0.00017876873141574198, + "loss": 12.4932, + "step": 8629 + }, + { + "epoch": 0.46993799051139107, + "grad_norm": 0.607831027666431, + "learning_rate": 0.00017876329839901922, + "loss": 12.4657, + "step": 8630 + }, + { + "epoch": 0.4699924445079741, + "grad_norm": 0.6506427431137368, + "learning_rate": 0.00017875786476981986, + "loss": 12.4483, + "step": 8631 + }, + { + "epoch": 0.4700468985045571, + "grad_norm": 0.6116981851750112, + "learning_rate": 0.0001787524305281861, + "loss": 12.4156, + "step": 8632 + }, + { + "epoch": 0.47010135250114016, + "grad_norm": 0.6386453334944827, + "learning_rate": 0.00017874699567416028, + "loss": 12.4169, + "step": 8633 + }, + { + "epoch": 0.47015580649772315, + "grad_norm": 0.5990352000029187, + "learning_rate": 0.0001787415602077846, + "loss": 12.4613, + "step": 8634 + }, + { + "epoch": 0.47021026049430614, + "grad_norm": 0.629306239232886, + "learning_rate": 0.00017873612412910134, + "loss": 12.3987, + "step": 8635 + }, + { + "epoch": 0.4702647144908892, + "grad_norm": 0.6362931725753519, + "learning_rate": 0.00017873068743815278, + "loss": 12.3794, + "step": 8636 + }, + { + "epoch": 0.4703191684874722, + "grad_norm": 0.6621394092121959, + "learning_rate": 0.00017872525013498122, + "loss": 12.4266, + "step": 8637 + }, + { + "epoch": 0.4703736224840552, + "grad_norm": 0.6522150718394919, + "learning_rate": 0.00017871981221962886, + "loss": 12.4786, + "step": 8638 + }, + { + "epoch": 0.4704280764806382, + "grad_norm": 0.6182797125326692, + "learning_rate": 0.00017871437369213806, + "loss": 12.3953, + "step": 8639 + }, + { + "epoch": 0.4704825304772212, + "grad_norm": 0.741453587509592, + "learning_rate": 0.0001787089345525511, + "loss": 12.4706, + "step": 8640 + }, + { + "epoch": 0.4705369844738042, + "grad_norm": 0.6616730918075981, + "learning_rate": 0.00017870349480091023, + "loss": 12.407, + "step": 8641 + }, + { + "epoch": 0.47059143847038726, + "grad_norm": 0.6721799261869218, + "learning_rate": 0.00017869805443725782, + "loss": 12.4711, + "step": 8642 + }, + { + "epoch": 0.47064589246697025, + "grad_norm": 0.6905179211399968, + "learning_rate": 0.00017869261346163616, + "loss": 12.4337, + "step": 8643 + }, + { + "epoch": 0.47070034646355324, + "grad_norm": 0.5665557718261672, + "learning_rate": 0.0001786871718740875, + "loss": 12.3544, + "step": 8644 + }, + { + "epoch": 0.4707548004601363, + "grad_norm": 0.6720406366448736, + "learning_rate": 0.0001786817296746542, + "loss": 12.3461, + "step": 8645 + }, + { + "epoch": 0.4708092544567193, + "grad_norm": 0.7474814381158854, + "learning_rate": 0.00017867628686337857, + "loss": 12.5816, + "step": 8646 + }, + { + "epoch": 0.4708637084533023, + "grad_norm": 0.6518981831942263, + "learning_rate": 0.00017867084344030295, + "loss": 12.4326, + "step": 8647 + }, + { + "epoch": 0.4709181624498853, + "grad_norm": 0.6892336094441212, + "learning_rate": 0.00017866539940546966, + "loss": 12.4625, + "step": 8648 + }, + { + "epoch": 0.4709726164464683, + "grad_norm": 0.68022474893785, + "learning_rate": 0.00017865995475892105, + "loss": 12.3368, + "step": 8649 + }, + { + "epoch": 0.4710270704430513, + "grad_norm": 0.7638458776577514, + "learning_rate": 0.00017865450950069943, + "loss": 12.5662, + "step": 8650 + }, + { + "epoch": 0.47108152443963436, + "grad_norm": 0.6359881616953548, + "learning_rate": 0.00017864906363084714, + "loss": 12.3063, + "step": 8651 + }, + { + "epoch": 0.47113597843621735, + "grad_norm": 0.6350355813886267, + "learning_rate": 0.00017864361714940653, + "loss": 12.4142, + "step": 8652 + }, + { + "epoch": 0.47119043243280034, + "grad_norm": 0.6276882384257606, + "learning_rate": 0.00017863817005642002, + "loss": 12.3826, + "step": 8653 + }, + { + "epoch": 0.4712448864293834, + "grad_norm": 0.6387207175067731, + "learning_rate": 0.00017863272235192987, + "loss": 12.4255, + "step": 8654 + }, + { + "epoch": 0.4712993404259664, + "grad_norm": 0.6642427666470622, + "learning_rate": 0.00017862727403597848, + "loss": 12.329, + "step": 8655 + }, + { + "epoch": 0.4713537944225494, + "grad_norm": 0.7565854838138347, + "learning_rate": 0.00017862182510860827, + "loss": 12.6547, + "step": 8656 + }, + { + "epoch": 0.4714082484191324, + "grad_norm": 0.655967108899123, + "learning_rate": 0.00017861637556986152, + "loss": 12.4853, + "step": 8657 + }, + { + "epoch": 0.4714627024157154, + "grad_norm": 0.5841788295309343, + "learning_rate": 0.00017861092541978063, + "loss": 12.4258, + "step": 8658 + }, + { + "epoch": 0.4715171564122984, + "grad_norm": 0.6802712400530733, + "learning_rate": 0.000178605474658408, + "loss": 12.164, + "step": 8659 + }, + { + "epoch": 0.47157161040888146, + "grad_norm": 0.5812630193366193, + "learning_rate": 0.00017860002328578606, + "loss": 12.4212, + "step": 8660 + }, + { + "epoch": 0.47162606440546445, + "grad_norm": 0.7439484017953842, + "learning_rate": 0.00017859457130195715, + "loss": 12.5639, + "step": 8661 + }, + { + "epoch": 0.47168051840204744, + "grad_norm": 0.6444918782437785, + "learning_rate": 0.00017858911870696366, + "loss": 12.4413, + "step": 8662 + }, + { + "epoch": 0.4717349723986305, + "grad_norm": 0.6042122509649913, + "learning_rate": 0.00017858366550084801, + "loss": 12.2887, + "step": 8663 + }, + { + "epoch": 0.4717894263952135, + "grad_norm": 0.719636542138421, + "learning_rate": 0.00017857821168365258, + "loss": 12.3589, + "step": 8664 + }, + { + "epoch": 0.47184388039179653, + "grad_norm": 0.6004943446157107, + "learning_rate": 0.00017857275725541983, + "loss": 12.55, + "step": 8665 + }, + { + "epoch": 0.4718983343883795, + "grad_norm": 0.6690559070467175, + "learning_rate": 0.00017856730221619212, + "loss": 12.3805, + "step": 8666 + }, + { + "epoch": 0.4719527883849625, + "grad_norm": 0.6520489856594532, + "learning_rate": 0.00017856184656601189, + "loss": 12.3617, + "step": 8667 + }, + { + "epoch": 0.47200724238154557, + "grad_norm": 0.5842016736999017, + "learning_rate": 0.00017855639030492156, + "loss": 12.3109, + "step": 8668 + }, + { + "epoch": 0.47206169637812856, + "grad_norm": 0.6875535840151342, + "learning_rate": 0.0001785509334329636, + "loss": 12.4851, + "step": 8669 + }, + { + "epoch": 0.47211615037471155, + "grad_norm": 0.6728011912417281, + "learning_rate": 0.0001785454759501804, + "loss": 12.4484, + "step": 8670 + }, + { + "epoch": 0.4721706043712946, + "grad_norm": 0.6394208019399653, + "learning_rate": 0.0001785400178566144, + "loss": 12.5169, + "step": 8671 + }, + { + "epoch": 0.4722250583678776, + "grad_norm": 0.7233003380113386, + "learning_rate": 0.00017853455915230803, + "loss": 12.5152, + "step": 8672 + }, + { + "epoch": 0.4722795123644606, + "grad_norm": 0.6577078659945517, + "learning_rate": 0.00017852909983730376, + "loss": 12.2267, + "step": 8673 + }, + { + "epoch": 0.47233396636104363, + "grad_norm": 0.6681012762094728, + "learning_rate": 0.00017852363991164406, + "loss": 12.4857, + "step": 8674 + }, + { + "epoch": 0.4723884203576266, + "grad_norm": 0.6492669412836646, + "learning_rate": 0.00017851817937537137, + "loss": 12.3342, + "step": 8675 + }, + { + "epoch": 0.4724428743542096, + "grad_norm": 0.6258051004677605, + "learning_rate": 0.00017851271822852817, + "loss": 12.3255, + "step": 8676 + }, + { + "epoch": 0.47249732835079267, + "grad_norm": 0.6177766546154925, + "learning_rate": 0.00017850725647115684, + "loss": 12.453, + "step": 8677 + }, + { + "epoch": 0.47255178234737566, + "grad_norm": 0.5976114556341114, + "learning_rate": 0.00017850179410329998, + "loss": 12.354, + "step": 8678 + }, + { + "epoch": 0.47260623634395865, + "grad_norm": 0.6943464826461088, + "learning_rate": 0.00017849633112499997, + "loss": 12.4133, + "step": 8679 + }, + { + "epoch": 0.4726606903405417, + "grad_norm": 0.6629539463742055, + "learning_rate": 0.00017849086753629934, + "loss": 12.5159, + "step": 8680 + }, + { + "epoch": 0.4727151443371247, + "grad_norm": 0.5857907651074736, + "learning_rate": 0.00017848540333724054, + "loss": 12.3979, + "step": 8681 + }, + { + "epoch": 0.4727695983337077, + "grad_norm": 0.66682024503514, + "learning_rate": 0.0001784799385278661, + "loss": 12.3639, + "step": 8682 + }, + { + "epoch": 0.47282405233029073, + "grad_norm": 0.6785937455203835, + "learning_rate": 0.0001784744731082185, + "loss": 12.7207, + "step": 8683 + }, + { + "epoch": 0.4728785063268737, + "grad_norm": 0.6830103773165097, + "learning_rate": 0.00017846900707834022, + "loss": 12.4353, + "step": 8684 + }, + { + "epoch": 0.4729329603234567, + "grad_norm": 0.6678547876212807, + "learning_rate": 0.0001784635404382738, + "loss": 12.5259, + "step": 8685 + }, + { + "epoch": 0.47298741432003977, + "grad_norm": 0.6663746763710154, + "learning_rate": 0.00017845807318806175, + "loss": 12.3938, + "step": 8686 + }, + { + "epoch": 0.47304186831662276, + "grad_norm": 0.6361131616374217, + "learning_rate": 0.00017845260532774654, + "loss": 12.271, + "step": 8687 + }, + { + "epoch": 0.47309632231320575, + "grad_norm": 0.7273696714957184, + "learning_rate": 0.00017844713685737069, + "loss": 12.2901, + "step": 8688 + }, + { + "epoch": 0.4731507763097888, + "grad_norm": 0.6327991357395276, + "learning_rate": 0.00017844166777697678, + "loss": 12.3317, + "step": 8689 + }, + { + "epoch": 0.4732052303063718, + "grad_norm": 0.6234225324962389, + "learning_rate": 0.0001784361980866073, + "loss": 12.4677, + "step": 8690 + }, + { + "epoch": 0.4732596843029548, + "grad_norm": 0.6428092940547633, + "learning_rate": 0.00017843072778630478, + "loss": 12.3023, + "step": 8691 + }, + { + "epoch": 0.47331413829953783, + "grad_norm": 0.5903591289493301, + "learning_rate": 0.00017842525687611179, + "loss": 12.3877, + "step": 8692 + }, + { + "epoch": 0.4733685922961208, + "grad_norm": 0.6341704384358905, + "learning_rate": 0.0001784197853560708, + "loss": 12.3853, + "step": 8693 + }, + { + "epoch": 0.4734230462927038, + "grad_norm": 0.6473153367853635, + "learning_rate": 0.00017841431322622447, + "loss": 12.4951, + "step": 8694 + }, + { + "epoch": 0.47347750028928687, + "grad_norm": 0.6562941393001934, + "learning_rate": 0.00017840884048661527, + "loss": 12.3385, + "step": 8695 + }, + { + "epoch": 0.47353195428586986, + "grad_norm": 0.5803770674490643, + "learning_rate": 0.0001784033671372858, + "loss": 12.5054, + "step": 8696 + }, + { + "epoch": 0.47358640828245285, + "grad_norm": 0.606714580039266, + "learning_rate": 0.00017839789317827855, + "loss": 12.4175, + "step": 8697 + }, + { + "epoch": 0.4736408622790359, + "grad_norm": 0.6835740224685091, + "learning_rate": 0.00017839241860963617, + "loss": 12.3783, + "step": 8698 + }, + { + "epoch": 0.4736953162756189, + "grad_norm": 0.6513242061593827, + "learning_rate": 0.00017838694343140117, + "loss": 12.4464, + "step": 8699 + }, + { + "epoch": 0.47374977027220194, + "grad_norm": 0.7031548801620282, + "learning_rate": 0.00017838146764361619, + "loss": 12.3203, + "step": 8700 + }, + { + "epoch": 0.47380422426878493, + "grad_norm": 0.6364147771234054, + "learning_rate": 0.00017837599124632375, + "loss": 12.3763, + "step": 8701 + }, + { + "epoch": 0.4738586782653679, + "grad_norm": 0.6186036418075409, + "learning_rate": 0.00017837051423956644, + "loss": 12.3576, + "step": 8702 + }, + { + "epoch": 0.473913132261951, + "grad_norm": 0.6176910401272563, + "learning_rate": 0.00017836503662338688, + "loss": 12.401, + "step": 8703 + }, + { + "epoch": 0.47396758625853397, + "grad_norm": 0.6096101648023464, + "learning_rate": 0.00017835955839782766, + "loss": 12.3551, + "step": 8704 + }, + { + "epoch": 0.47402204025511696, + "grad_norm": 0.5980960499670746, + "learning_rate": 0.00017835407956293136, + "loss": 12.3039, + "step": 8705 + }, + { + "epoch": 0.4740764942517, + "grad_norm": 0.6500700845049932, + "learning_rate": 0.00017834860011874064, + "loss": 12.4876, + "step": 8706 + }, + { + "epoch": 0.474130948248283, + "grad_norm": 0.6091034212568243, + "learning_rate": 0.00017834312006529803, + "loss": 12.4782, + "step": 8707 + }, + { + "epoch": 0.474185402244866, + "grad_norm": 0.6550246652321426, + "learning_rate": 0.00017833763940264618, + "loss": 12.4359, + "step": 8708 + }, + { + "epoch": 0.47423985624144904, + "grad_norm": 0.652718777215914, + "learning_rate": 0.0001783321581308277, + "loss": 12.5009, + "step": 8709 + }, + { + "epoch": 0.47429431023803204, + "grad_norm": 0.6357131550301056, + "learning_rate": 0.00017832667624988525, + "loss": 12.3438, + "step": 8710 + }, + { + "epoch": 0.47434876423461503, + "grad_norm": 0.6056142520938775, + "learning_rate": 0.00017832119375986143, + "loss": 12.3436, + "step": 8711 + }, + { + "epoch": 0.4744032182311981, + "grad_norm": 0.6836617173901649, + "learning_rate": 0.00017831571066079886, + "loss": 12.4552, + "step": 8712 + }, + { + "epoch": 0.47445767222778107, + "grad_norm": 0.6196554753439548, + "learning_rate": 0.00017831022695274018, + "loss": 12.4012, + "step": 8713 + }, + { + "epoch": 0.47451212622436406, + "grad_norm": 0.5652293429605024, + "learning_rate": 0.00017830474263572804, + "loss": 12.3303, + "step": 8714 + }, + { + "epoch": 0.4745665802209471, + "grad_norm": 0.6475256810886932, + "learning_rate": 0.00017829925770980514, + "loss": 12.5303, + "step": 8715 + }, + { + "epoch": 0.4746210342175301, + "grad_norm": 0.6036166661171558, + "learning_rate": 0.00017829377217501403, + "loss": 12.383, + "step": 8716 + }, + { + "epoch": 0.4746754882141131, + "grad_norm": 0.6985522299689985, + "learning_rate": 0.00017828828603139743, + "loss": 12.5098, + "step": 8717 + }, + { + "epoch": 0.47472994221069614, + "grad_norm": 0.6070778484391904, + "learning_rate": 0.00017828279927899798, + "loss": 12.391, + "step": 8718 + }, + { + "epoch": 0.47478439620727914, + "grad_norm": 0.5823182202407687, + "learning_rate": 0.00017827731191785836, + "loss": 12.3258, + "step": 8719 + }, + { + "epoch": 0.47483885020386213, + "grad_norm": 0.5893513288995854, + "learning_rate": 0.00017827182394802128, + "loss": 12.2139, + "step": 8720 + }, + { + "epoch": 0.4748933042004452, + "grad_norm": 0.7013688112671357, + "learning_rate": 0.0001782663353695293, + "loss": 12.5296, + "step": 8721 + }, + { + "epoch": 0.47494775819702817, + "grad_norm": 0.5962121584629747, + "learning_rate": 0.0001782608461824252, + "loss": 12.4412, + "step": 8722 + }, + { + "epoch": 0.47500221219361116, + "grad_norm": 0.658193756347851, + "learning_rate": 0.00017825535638675165, + "loss": 12.3744, + "step": 8723 + }, + { + "epoch": 0.4750566661901942, + "grad_norm": 0.5829182717805368, + "learning_rate": 0.00017824986598255133, + "loss": 12.5226, + "step": 8724 + }, + { + "epoch": 0.4751111201867772, + "grad_norm": 0.5963759135534142, + "learning_rate": 0.0001782443749698669, + "loss": 12.4221, + "step": 8725 + }, + { + "epoch": 0.4751655741833602, + "grad_norm": 0.620977287497132, + "learning_rate": 0.0001782388833487411, + "loss": 12.3833, + "step": 8726 + }, + { + "epoch": 0.47522002817994324, + "grad_norm": 0.6899744567652295, + "learning_rate": 0.00017823339111921663, + "loss": 12.4674, + "step": 8727 + }, + { + "epoch": 0.47527448217652624, + "grad_norm": 0.6151290935196063, + "learning_rate": 0.00017822789828133618, + "loss": 12.5067, + "step": 8728 + }, + { + "epoch": 0.47532893617310923, + "grad_norm": 0.569806095972506, + "learning_rate": 0.0001782224048351425, + "loss": 12.4047, + "step": 8729 + }, + { + "epoch": 0.4753833901696923, + "grad_norm": 0.5995478910348392, + "learning_rate": 0.00017821691078067823, + "loss": 12.4148, + "step": 8730 + }, + { + "epoch": 0.47543784416627527, + "grad_norm": 0.6286888717890325, + "learning_rate": 0.00017821141611798618, + "loss": 12.3511, + "step": 8731 + }, + { + "epoch": 0.4754922981628583, + "grad_norm": 0.5648835797724097, + "learning_rate": 0.00017820592084710906, + "loss": 12.2943, + "step": 8732 + }, + { + "epoch": 0.4755467521594413, + "grad_norm": 0.6112850266141724, + "learning_rate": 0.00017820042496808955, + "loss": 12.4158, + "step": 8733 + }, + { + "epoch": 0.4756012061560243, + "grad_norm": 0.661691812372672, + "learning_rate": 0.00017819492848097045, + "loss": 12.4666, + "step": 8734 + }, + { + "epoch": 0.47565566015260735, + "grad_norm": 0.594886371446287, + "learning_rate": 0.00017818943138579445, + "loss": 12.4039, + "step": 8735 + }, + { + "epoch": 0.47571011414919034, + "grad_norm": 0.6531180638084092, + "learning_rate": 0.00017818393368260432, + "loss": 12.3811, + "step": 8736 + }, + { + "epoch": 0.47576456814577334, + "grad_norm": 0.7078781658459506, + "learning_rate": 0.0001781784353714428, + "loss": 12.5008, + "step": 8737 + }, + { + "epoch": 0.4758190221423564, + "grad_norm": 0.596332682579423, + "learning_rate": 0.0001781729364523527, + "loss": 12.466, + "step": 8738 + }, + { + "epoch": 0.4758734761389394, + "grad_norm": 0.6160511904069175, + "learning_rate": 0.0001781674369253767, + "loss": 12.4099, + "step": 8739 + }, + { + "epoch": 0.47592793013552237, + "grad_norm": 0.591545992505644, + "learning_rate": 0.0001781619367905576, + "loss": 12.3754, + "step": 8740 + }, + { + "epoch": 0.4759823841321054, + "grad_norm": 0.7002224308550122, + "learning_rate": 0.0001781564360479382, + "loss": 12.3937, + "step": 8741 + }, + { + "epoch": 0.4760368381286884, + "grad_norm": 0.6085928202462332, + "learning_rate": 0.0001781509346975612, + "loss": 12.3617, + "step": 8742 + }, + { + "epoch": 0.4760912921252714, + "grad_norm": 0.6398433181849218, + "learning_rate": 0.00017814543273946947, + "loss": 12.4901, + "step": 8743 + }, + { + "epoch": 0.47614574612185445, + "grad_norm": 0.7506931499875767, + "learning_rate": 0.00017813993017370576, + "loss": 12.495, + "step": 8744 + }, + { + "epoch": 0.47620020011843744, + "grad_norm": 0.5799727249777821, + "learning_rate": 0.00017813442700031283, + "loss": 12.3573, + "step": 8745 + }, + { + "epoch": 0.47625465411502044, + "grad_norm": 0.5869373549653786, + "learning_rate": 0.00017812892321933345, + "loss": 12.4455, + "step": 8746 + }, + { + "epoch": 0.4763091081116035, + "grad_norm": 0.5965018552793068, + "learning_rate": 0.00017812341883081053, + "loss": 12.4557, + "step": 8747 + }, + { + "epoch": 0.4763635621081865, + "grad_norm": 0.6541023786405988, + "learning_rate": 0.00017811791383478675, + "loss": 12.5107, + "step": 8748 + }, + { + "epoch": 0.47641801610476947, + "grad_norm": 0.6206970523605138, + "learning_rate": 0.000178112408231305, + "loss": 12.5198, + "step": 8749 + }, + { + "epoch": 0.4764724701013525, + "grad_norm": 0.6499952009932339, + "learning_rate": 0.00017810690202040805, + "loss": 12.4502, + "step": 8750 + }, + { + "epoch": 0.4765269240979355, + "grad_norm": 0.6521865596139488, + "learning_rate": 0.00017810139520213874, + "loss": 12.4045, + "step": 8751 + }, + { + "epoch": 0.4765813780945185, + "grad_norm": 0.5973388862310863, + "learning_rate": 0.00017809588777653986, + "loss": 12.4887, + "step": 8752 + }, + { + "epoch": 0.47663583209110155, + "grad_norm": 0.5928298859565891, + "learning_rate": 0.0001780903797436543, + "loss": 12.376, + "step": 8753 + }, + { + "epoch": 0.47669028608768454, + "grad_norm": 0.6457349298774065, + "learning_rate": 0.00017808487110352483, + "loss": 12.5605, + "step": 8754 + }, + { + "epoch": 0.47674474008426754, + "grad_norm": 0.6119826626738841, + "learning_rate": 0.00017807936185619433, + "loss": 12.4986, + "step": 8755 + }, + { + "epoch": 0.4767991940808506, + "grad_norm": 0.5863087139114767, + "learning_rate": 0.0001780738520017056, + "loss": 12.3536, + "step": 8756 + }, + { + "epoch": 0.4768536480774336, + "grad_norm": 0.6428427397487143, + "learning_rate": 0.0001780683415401015, + "loss": 12.3435, + "step": 8757 + }, + { + "epoch": 0.47690810207401657, + "grad_norm": 0.6967791103950431, + "learning_rate": 0.00017806283047142488, + "loss": 12.5013, + "step": 8758 + }, + { + "epoch": 0.4769625560705996, + "grad_norm": 0.6705016723421287, + "learning_rate": 0.00017805731879571858, + "loss": 12.5209, + "step": 8759 + }, + { + "epoch": 0.4770170100671826, + "grad_norm": 0.6148028932835333, + "learning_rate": 0.00017805180651302553, + "loss": 12.5144, + "step": 8760 + }, + { + "epoch": 0.4770714640637656, + "grad_norm": 0.5768470841705992, + "learning_rate": 0.00017804629362338852, + "loss": 12.4492, + "step": 8761 + }, + { + "epoch": 0.47712591806034865, + "grad_norm": 0.694438897567131, + "learning_rate": 0.00017804078012685043, + "loss": 12.4018, + "step": 8762 + }, + { + "epoch": 0.47718037205693165, + "grad_norm": 0.6351675659575002, + "learning_rate": 0.00017803526602345416, + "loss": 12.5062, + "step": 8763 + }, + { + "epoch": 0.47723482605351464, + "grad_norm": 0.6840557802934399, + "learning_rate": 0.0001780297513132426, + "loss": 12.4258, + "step": 8764 + }, + { + "epoch": 0.4772892800500977, + "grad_norm": 0.6269476064332131, + "learning_rate": 0.00017802423599625855, + "loss": 12.3025, + "step": 8765 + }, + { + "epoch": 0.4773437340466807, + "grad_norm": 0.748368075863479, + "learning_rate": 0.000178018720072545, + "loss": 12.45, + "step": 8766 + }, + { + "epoch": 0.4773981880432637, + "grad_norm": 0.6079501771640944, + "learning_rate": 0.00017801320354214476, + "loss": 12.582, + "step": 8767 + }, + { + "epoch": 0.4774526420398467, + "grad_norm": 0.6021377793201013, + "learning_rate": 0.0001780076864051008, + "loss": 12.3335, + "step": 8768 + }, + { + "epoch": 0.4775070960364297, + "grad_norm": 0.6756093490145296, + "learning_rate": 0.000178002168661456, + "loss": 12.3745, + "step": 8769 + }, + { + "epoch": 0.47756155003301276, + "grad_norm": 0.6197236151026202, + "learning_rate": 0.0001779966503112532, + "loss": 12.3967, + "step": 8770 + }, + { + "epoch": 0.47761600402959575, + "grad_norm": 0.6525965658124591, + "learning_rate": 0.00017799113135453541, + "loss": 12.431, + "step": 8771 + }, + { + "epoch": 0.47767045802617875, + "grad_norm": 0.7092955749743148, + "learning_rate": 0.00017798561179134553, + "loss": 12.3853, + "step": 8772 + }, + { + "epoch": 0.4777249120227618, + "grad_norm": 0.6196543964995244, + "learning_rate": 0.0001779800916217264, + "loss": 12.4704, + "step": 8773 + }, + { + "epoch": 0.4777793660193448, + "grad_norm": 0.7215102850249048, + "learning_rate": 0.00017797457084572102, + "loss": 12.4219, + "step": 8774 + }, + { + "epoch": 0.4778338200159278, + "grad_norm": 0.6234504458117953, + "learning_rate": 0.0001779690494633723, + "loss": 12.4559, + "step": 8775 + }, + { + "epoch": 0.4778882740125108, + "grad_norm": 0.6269716209072117, + "learning_rate": 0.0001779635274747232, + "loss": 12.4085, + "step": 8776 + }, + { + "epoch": 0.4779427280090938, + "grad_norm": 0.7042413882289371, + "learning_rate": 0.0001779580048798166, + "loss": 12.2859, + "step": 8777 + }, + { + "epoch": 0.4779971820056768, + "grad_norm": 0.5900419912112504, + "learning_rate": 0.00017795248167869549, + "loss": 12.3652, + "step": 8778 + }, + { + "epoch": 0.47805163600225986, + "grad_norm": 0.6511384156373964, + "learning_rate": 0.0001779469578714028, + "loss": 12.3114, + "step": 8779 + }, + { + "epoch": 0.47810608999884285, + "grad_norm": 0.6670601280040062, + "learning_rate": 0.0001779414334579815, + "loss": 12.3957, + "step": 8780 + }, + { + "epoch": 0.47816054399542585, + "grad_norm": 0.5890634034622276, + "learning_rate": 0.00017793590843847456, + "loss": 12.3756, + "step": 8781 + }, + { + "epoch": 0.4782149979920089, + "grad_norm": 0.6218526690363533, + "learning_rate": 0.00017793038281292494, + "loss": 12.4279, + "step": 8782 + }, + { + "epoch": 0.4782694519885919, + "grad_norm": 0.6094031786819534, + "learning_rate": 0.00017792485658137553, + "loss": 12.3809, + "step": 8783 + }, + { + "epoch": 0.4783239059851749, + "grad_norm": 0.6674236673610006, + "learning_rate": 0.00017791932974386943, + "loss": 12.3462, + "step": 8784 + }, + { + "epoch": 0.47837835998175793, + "grad_norm": 0.8290033168357881, + "learning_rate": 0.00017791380230044955, + "loss": 12.4798, + "step": 8785 + }, + { + "epoch": 0.4784328139783409, + "grad_norm": 0.5698319474926054, + "learning_rate": 0.00017790827425115887, + "loss": 12.3464, + "step": 8786 + }, + { + "epoch": 0.4784872679749239, + "grad_norm": 0.7063867609709856, + "learning_rate": 0.00017790274559604033, + "loss": 12.4459, + "step": 8787 + }, + { + "epoch": 0.47854172197150696, + "grad_norm": 0.6248814974218623, + "learning_rate": 0.00017789721633513703, + "loss": 12.3679, + "step": 8788 + }, + { + "epoch": 0.47859617596808995, + "grad_norm": 0.6049202524282553, + "learning_rate": 0.0001778916864684919, + "loss": 12.4051, + "step": 8789 + }, + { + "epoch": 0.47865062996467295, + "grad_norm": 0.6707745185056192, + "learning_rate": 0.00017788615599614798, + "loss": 12.3868, + "step": 8790 + }, + { + "epoch": 0.478705083961256, + "grad_norm": 0.5512079569667026, + "learning_rate": 0.0001778806249181482, + "loss": 12.3534, + "step": 8791 + }, + { + "epoch": 0.478759537957839, + "grad_norm": 0.6225053075193171, + "learning_rate": 0.00017787509323453565, + "loss": 12.4305, + "step": 8792 + }, + { + "epoch": 0.478813991954422, + "grad_norm": 0.6709506145422299, + "learning_rate": 0.00017786956094535333, + "loss": 12.4308, + "step": 8793 + }, + { + "epoch": 0.47886844595100503, + "grad_norm": 0.6751250487279195, + "learning_rate": 0.0001778640280506442, + "loss": 12.4982, + "step": 8794 + }, + { + "epoch": 0.478922899947588, + "grad_norm": 0.6378585206417511, + "learning_rate": 0.00017785849455045138, + "loss": 12.4145, + "step": 8795 + }, + { + "epoch": 0.478977353944171, + "grad_norm": 0.6245405864362273, + "learning_rate": 0.0001778529604448178, + "loss": 12.2687, + "step": 8796 + }, + { + "epoch": 0.47903180794075406, + "grad_norm": 0.6113639663876627, + "learning_rate": 0.0001778474257337866, + "loss": 12.4223, + "step": 8797 + }, + { + "epoch": 0.47908626193733705, + "grad_norm": 0.5721990527380328, + "learning_rate": 0.0001778418904174007, + "loss": 12.1567, + "step": 8798 + }, + { + "epoch": 0.4791407159339201, + "grad_norm": 0.6724946226809764, + "learning_rate": 0.00017783635449570326, + "loss": 12.4662, + "step": 8799 + }, + { + "epoch": 0.4791951699305031, + "grad_norm": 0.6178789728367964, + "learning_rate": 0.00017783081796873725, + "loss": 12.431, + "step": 8800 + }, + { + "epoch": 0.4792496239270861, + "grad_norm": 0.6579330034517156, + "learning_rate": 0.00017782528083654575, + "loss": 12.2458, + "step": 8801 + }, + { + "epoch": 0.47930407792366914, + "grad_norm": 0.6058708382556764, + "learning_rate": 0.0001778197430991718, + "loss": 12.442, + "step": 8802 + }, + { + "epoch": 0.47935853192025213, + "grad_norm": 0.6377720679987039, + "learning_rate": 0.0001778142047566585, + "loss": 12.461, + "step": 8803 + }, + { + "epoch": 0.4794129859168351, + "grad_norm": 0.5970891817149896, + "learning_rate": 0.0001778086658090489, + "loss": 12.4052, + "step": 8804 + }, + { + "epoch": 0.47946743991341817, + "grad_norm": 0.6461955050830236, + "learning_rate": 0.00017780312625638605, + "loss": 12.3717, + "step": 8805 + }, + { + "epoch": 0.47952189391000116, + "grad_norm": 0.6468828453737918, + "learning_rate": 0.000177797586098713, + "loss": 12.4399, + "step": 8806 + }, + { + "epoch": 0.47957634790658416, + "grad_norm": 0.6621514434037283, + "learning_rate": 0.00017779204533607297, + "loss": 12.5993, + "step": 8807 + }, + { + "epoch": 0.4796308019031672, + "grad_norm": 0.58841400655816, + "learning_rate": 0.0001777865039685089, + "loss": 12.4557, + "step": 8808 + }, + { + "epoch": 0.4796852558997502, + "grad_norm": 0.6365515386699335, + "learning_rate": 0.00017778096199606394, + "loss": 12.3267, + "step": 8809 + }, + { + "epoch": 0.4797397098963332, + "grad_norm": 0.6823179343753297, + "learning_rate": 0.00017777541941878114, + "loss": 12.4056, + "step": 8810 + }, + { + "epoch": 0.47979416389291624, + "grad_norm": 0.5769821845021594, + "learning_rate": 0.00017776987623670368, + "loss": 12.4249, + "step": 8811 + }, + { + "epoch": 0.47984861788949923, + "grad_norm": 0.6255846491013269, + "learning_rate": 0.00017776433244987458, + "loss": 12.4606, + "step": 8812 + }, + { + "epoch": 0.4799030718860822, + "grad_norm": 0.6799133311882912, + "learning_rate": 0.000177758788058337, + "loss": 12.4782, + "step": 8813 + }, + { + "epoch": 0.47995752588266527, + "grad_norm": 0.6058273235144871, + "learning_rate": 0.00017775324306213406, + "loss": 12.4778, + "step": 8814 + }, + { + "epoch": 0.48001197987924826, + "grad_norm": 0.6519108191538442, + "learning_rate": 0.00017774769746130886, + "loss": 12.4488, + "step": 8815 + }, + { + "epoch": 0.48006643387583126, + "grad_norm": 0.5844059895700789, + "learning_rate": 0.00017774215125590455, + "loss": 12.4286, + "step": 8816 + }, + { + "epoch": 0.4801208878724143, + "grad_norm": 0.6788807574460705, + "learning_rate": 0.00017773660444596418, + "loss": 12.5376, + "step": 8817 + }, + { + "epoch": 0.4801753418689973, + "grad_norm": 0.6717908703669483, + "learning_rate": 0.00017773105703153096, + "loss": 12.4666, + "step": 8818 + }, + { + "epoch": 0.4802297958655803, + "grad_norm": 0.5889511648902014, + "learning_rate": 0.00017772550901264803, + "loss": 12.4461, + "step": 8819 + }, + { + "epoch": 0.48028424986216334, + "grad_norm": 0.7001047237575522, + "learning_rate": 0.00017771996038935846, + "loss": 12.5566, + "step": 8820 + }, + { + "epoch": 0.48033870385874633, + "grad_norm": 0.6073367192293863, + "learning_rate": 0.0001777144111617055, + "loss": 12.4075, + "step": 8821 + }, + { + "epoch": 0.4803931578553293, + "grad_norm": 0.7157381884611609, + "learning_rate": 0.0001777088613297322, + "loss": 12.4349, + "step": 8822 + }, + { + "epoch": 0.48044761185191237, + "grad_norm": 0.6451758187148338, + "learning_rate": 0.0001777033108934818, + "loss": 12.383, + "step": 8823 + }, + { + "epoch": 0.48050206584849536, + "grad_norm": 0.6259662761328934, + "learning_rate": 0.00017769775985299738, + "loss": 12.3062, + "step": 8824 + }, + { + "epoch": 0.48055651984507836, + "grad_norm": 0.5982251100488803, + "learning_rate": 0.00017769220820832218, + "loss": 12.2562, + "step": 8825 + }, + { + "epoch": 0.4806109738416614, + "grad_norm": 0.5716512373091801, + "learning_rate": 0.00017768665595949934, + "loss": 12.3767, + "step": 8826 + }, + { + "epoch": 0.4806654278382444, + "grad_norm": 0.6944768655221844, + "learning_rate": 0.00017768110310657204, + "loss": 12.5173, + "step": 8827 + }, + { + "epoch": 0.4807198818348274, + "grad_norm": 0.7524092316030287, + "learning_rate": 0.00017767554964958344, + "loss": 12.2713, + "step": 8828 + }, + { + "epoch": 0.48077433583141044, + "grad_norm": 0.6292076363016551, + "learning_rate": 0.00017766999558857673, + "loss": 12.3015, + "step": 8829 + }, + { + "epoch": 0.48082878982799343, + "grad_norm": 0.6170640257155307, + "learning_rate": 0.00017766444092359512, + "loss": 12.3525, + "step": 8830 + }, + { + "epoch": 0.4808832438245764, + "grad_norm": 0.63768550877622, + "learning_rate": 0.00017765888565468178, + "loss": 12.4197, + "step": 8831 + }, + { + "epoch": 0.48093769782115947, + "grad_norm": 0.6341427907839081, + "learning_rate": 0.00017765332978187997, + "loss": 12.4103, + "step": 8832 + }, + { + "epoch": 0.48099215181774246, + "grad_norm": 0.6629413848045236, + "learning_rate": 0.00017764777330523283, + "loss": 12.4884, + "step": 8833 + }, + { + "epoch": 0.4810466058143255, + "grad_norm": 0.6164832896056446, + "learning_rate": 0.00017764221622478354, + "loss": 12.4355, + "step": 8834 + }, + { + "epoch": 0.4811010598109085, + "grad_norm": 0.5901202811298281, + "learning_rate": 0.00017763665854057537, + "loss": 12.3688, + "step": 8835 + }, + { + "epoch": 0.4811555138074915, + "grad_norm": 0.6196660565552202, + "learning_rate": 0.00017763110025265154, + "loss": 12.4374, + "step": 8836 + }, + { + "epoch": 0.48120996780407455, + "grad_norm": 0.6734610187372511, + "learning_rate": 0.00017762554136105524, + "loss": 12.3679, + "step": 8837 + }, + { + "epoch": 0.48126442180065754, + "grad_norm": 0.8116360626391834, + "learning_rate": 0.00017761998186582972, + "loss": 12.5266, + "step": 8838 + }, + { + "epoch": 0.48131887579724053, + "grad_norm": 0.6981569929406607, + "learning_rate": 0.00017761442176701824, + "loss": 12.5177, + "step": 8839 + }, + { + "epoch": 0.4813733297938236, + "grad_norm": 0.5768553293400086, + "learning_rate": 0.00017760886106466396, + "loss": 12.3916, + "step": 8840 + }, + { + "epoch": 0.48142778379040657, + "grad_norm": 0.7085652244519366, + "learning_rate": 0.00017760329975881017, + "loss": 12.3696, + "step": 8841 + }, + { + "epoch": 0.48148223778698956, + "grad_norm": 0.6562927526967182, + "learning_rate": 0.0001775977378495001, + "loss": 12.3849, + "step": 8842 + }, + { + "epoch": 0.4815366917835726, + "grad_norm": 0.7220168778666222, + "learning_rate": 0.00017759217533677702, + "loss": 12.3901, + "step": 8843 + }, + { + "epoch": 0.4815911457801556, + "grad_norm": 0.5920359098395669, + "learning_rate": 0.00017758661222068415, + "loss": 12.3168, + "step": 8844 + }, + { + "epoch": 0.4816455997767386, + "grad_norm": 0.6872763879710608, + "learning_rate": 0.0001775810485012648, + "loss": 12.3874, + "step": 8845 + }, + { + "epoch": 0.48170005377332165, + "grad_norm": 0.5996468591319443, + "learning_rate": 0.00017757548417856217, + "loss": 12.4146, + "step": 8846 + }, + { + "epoch": 0.48175450776990464, + "grad_norm": 0.5938416569055235, + "learning_rate": 0.00017756991925261962, + "loss": 12.3266, + "step": 8847 + }, + { + "epoch": 0.48180896176648763, + "grad_norm": 0.7101647878990114, + "learning_rate": 0.00017756435372348034, + "loss": 12.4614, + "step": 8848 + }, + { + "epoch": 0.4818634157630707, + "grad_norm": 0.6568914198752358, + "learning_rate": 0.00017755878759118763, + "loss": 12.4564, + "step": 8849 + }, + { + "epoch": 0.48191786975965367, + "grad_norm": 0.6646572763387945, + "learning_rate": 0.00017755322085578478, + "loss": 12.2684, + "step": 8850 + }, + { + "epoch": 0.48197232375623666, + "grad_norm": 0.6122037385746242, + "learning_rate": 0.0001775476535173151, + "loss": 12.4584, + "step": 8851 + }, + { + "epoch": 0.4820267777528197, + "grad_norm": 0.6091843939075892, + "learning_rate": 0.00017754208557582186, + "loss": 12.4766, + "step": 8852 + }, + { + "epoch": 0.4820812317494027, + "grad_norm": 0.7753738208277444, + "learning_rate": 0.00017753651703134832, + "loss": 12.3508, + "step": 8853 + }, + { + "epoch": 0.4821356857459857, + "grad_norm": 0.9314633806434001, + "learning_rate": 0.00017753094788393786, + "loss": 12.5979, + "step": 8854 + }, + { + "epoch": 0.48219013974256875, + "grad_norm": 0.6731684795342191, + "learning_rate": 0.00017752537813363372, + "loss": 12.4653, + "step": 8855 + }, + { + "epoch": 0.48224459373915174, + "grad_norm": 0.6428856940797861, + "learning_rate": 0.00017751980778047928, + "loss": 12.5004, + "step": 8856 + }, + { + "epoch": 0.48229904773573473, + "grad_norm": 0.59991219571617, + "learning_rate": 0.00017751423682451777, + "loss": 12.3395, + "step": 8857 + }, + { + "epoch": 0.4823535017323178, + "grad_norm": 0.6919336816693169, + "learning_rate": 0.0001775086652657926, + "loss": 12.4303, + "step": 8858 + }, + { + "epoch": 0.4824079557289008, + "grad_norm": 0.7949891419946377, + "learning_rate": 0.000177503093104347, + "loss": 12.3935, + "step": 8859 + }, + { + "epoch": 0.48246240972548377, + "grad_norm": 0.5947020100389356, + "learning_rate": 0.0001774975203402244, + "loss": 12.3886, + "step": 8860 + }, + { + "epoch": 0.4825168637220668, + "grad_norm": 0.5988251917040165, + "learning_rate": 0.00017749194697346804, + "loss": 12.342, + "step": 8861 + }, + { + "epoch": 0.4825713177186498, + "grad_norm": 0.6354760855965604, + "learning_rate": 0.00017748637300412135, + "loss": 12.4184, + "step": 8862 + }, + { + "epoch": 0.4826257717152328, + "grad_norm": 0.6000525681944205, + "learning_rate": 0.00017748079843222758, + "loss": 12.4177, + "step": 8863 + }, + { + "epoch": 0.48268022571181585, + "grad_norm": 0.7000608948503445, + "learning_rate": 0.00017747522325783016, + "loss": 12.4286, + "step": 8864 + }, + { + "epoch": 0.48273467970839884, + "grad_norm": 0.6118907742416242, + "learning_rate": 0.0001774696474809724, + "loss": 12.3678, + "step": 8865 + }, + { + "epoch": 0.4827891337049819, + "grad_norm": 0.5942633424266189, + "learning_rate": 0.00017746407110169767, + "loss": 12.4994, + "step": 8866 + }, + { + "epoch": 0.4828435877015649, + "grad_norm": 0.6539654214633613, + "learning_rate": 0.00017745849412004937, + "loss": 12.3596, + "step": 8867 + }, + { + "epoch": 0.4828980416981479, + "grad_norm": 0.6677852512885757, + "learning_rate": 0.00017745291653607076, + "loss": 12.3255, + "step": 8868 + }, + { + "epoch": 0.4829524956947309, + "grad_norm": 0.651571708656775, + "learning_rate": 0.00017744733834980532, + "loss": 12.327, + "step": 8869 + }, + { + "epoch": 0.4830069496913139, + "grad_norm": 0.6349770322266751, + "learning_rate": 0.0001774417595612964, + "loss": 12.487, + "step": 8870 + }, + { + "epoch": 0.4830614036878969, + "grad_norm": 0.6019710222800663, + "learning_rate": 0.00017743618017058735, + "loss": 12.4126, + "step": 8871 + }, + { + "epoch": 0.48311585768447995, + "grad_norm": 0.7285677786372385, + "learning_rate": 0.0001774306001777216, + "loss": 12.4012, + "step": 8872 + }, + { + "epoch": 0.48317031168106295, + "grad_norm": 0.5809777718047995, + "learning_rate": 0.00017742501958274248, + "loss": 12.5225, + "step": 8873 + }, + { + "epoch": 0.48322476567764594, + "grad_norm": 0.6272483646028337, + "learning_rate": 0.00017741943838569347, + "loss": 12.4226, + "step": 8874 + }, + { + "epoch": 0.483279219674229, + "grad_norm": 0.6755453012097429, + "learning_rate": 0.0001774138565866179, + "loss": 12.3687, + "step": 8875 + }, + { + "epoch": 0.483333673670812, + "grad_norm": 0.6564591893347475, + "learning_rate": 0.0001774082741855592, + "loss": 12.3911, + "step": 8876 + }, + { + "epoch": 0.483388127667395, + "grad_norm": 0.6373866759242696, + "learning_rate": 0.00017740269118256076, + "loss": 12.5277, + "step": 8877 + }, + { + "epoch": 0.483442581663978, + "grad_norm": 0.5944427936719368, + "learning_rate": 0.00017739710757766605, + "loss": 12.5127, + "step": 8878 + }, + { + "epoch": 0.483497035660561, + "grad_norm": 0.6324204783412024, + "learning_rate": 0.00017739152337091843, + "loss": 12.4911, + "step": 8879 + }, + { + "epoch": 0.483551489657144, + "grad_norm": 0.6001119109761424, + "learning_rate": 0.00017738593856236133, + "loss": 12.384, + "step": 8880 + }, + { + "epoch": 0.48360594365372706, + "grad_norm": 0.6966066832215112, + "learning_rate": 0.00017738035315203822, + "loss": 12.3972, + "step": 8881 + }, + { + "epoch": 0.48366039765031005, + "grad_norm": 0.6212816073096596, + "learning_rate": 0.0001773747671399925, + "loss": 12.4002, + "step": 8882 + }, + { + "epoch": 0.48371485164689304, + "grad_norm": 0.5724581880947716, + "learning_rate": 0.0001773691805262676, + "loss": 12.427, + "step": 8883 + }, + { + "epoch": 0.4837693056434761, + "grad_norm": 0.6334340981535187, + "learning_rate": 0.000177363593310907, + "loss": 12.4921, + "step": 8884 + }, + { + "epoch": 0.4838237596400591, + "grad_norm": 0.5685247708491485, + "learning_rate": 0.00017735800549395413, + "loss": 12.4575, + "step": 8885 + }, + { + "epoch": 0.4838782136366421, + "grad_norm": 0.5689435269396318, + "learning_rate": 0.00017735241707545241, + "loss": 12.3163, + "step": 8886 + }, + { + "epoch": 0.4839326676332251, + "grad_norm": 0.6779432269703239, + "learning_rate": 0.00017734682805544533, + "loss": 12.3748, + "step": 8887 + }, + { + "epoch": 0.4839871216298081, + "grad_norm": 0.6796854341083812, + "learning_rate": 0.00017734123843397636, + "loss": 12.3531, + "step": 8888 + }, + { + "epoch": 0.4840415756263911, + "grad_norm": 0.5849010737268502, + "learning_rate": 0.00017733564821108895, + "loss": 12.5429, + "step": 8889 + }, + { + "epoch": 0.48409602962297416, + "grad_norm": 0.6728716617375359, + "learning_rate": 0.00017733005738682656, + "loss": 12.4443, + "step": 8890 + }, + { + "epoch": 0.48415048361955715, + "grad_norm": 0.6222616297984809, + "learning_rate": 0.00017732446596123268, + "loss": 12.3582, + "step": 8891 + }, + { + "epoch": 0.48420493761614014, + "grad_norm": 0.6025245468972172, + "learning_rate": 0.00017731887393435076, + "loss": 12.3947, + "step": 8892 + }, + { + "epoch": 0.4842593916127232, + "grad_norm": 0.6012661543715989, + "learning_rate": 0.00017731328130622434, + "loss": 12.33, + "step": 8893 + }, + { + "epoch": 0.4843138456093062, + "grad_norm": 0.6221467903127814, + "learning_rate": 0.00017730768807689687, + "loss": 12.4557, + "step": 8894 + }, + { + "epoch": 0.4843682996058892, + "grad_norm": 0.7082201340878812, + "learning_rate": 0.00017730209424641187, + "loss": 12.5036, + "step": 8895 + }, + { + "epoch": 0.4844227536024722, + "grad_norm": 0.5989029302995673, + "learning_rate": 0.00017729649981481277, + "loss": 12.4931, + "step": 8896 + }, + { + "epoch": 0.4844772075990552, + "grad_norm": 0.6217807482682669, + "learning_rate": 0.00017729090478214317, + "loss": 12.4975, + "step": 8897 + }, + { + "epoch": 0.4845316615956382, + "grad_norm": 0.7228601677700004, + "learning_rate": 0.00017728530914844654, + "loss": 12.4275, + "step": 8898 + }, + { + "epoch": 0.48458611559222126, + "grad_norm": 0.5895667014499496, + "learning_rate": 0.00017727971291376635, + "loss": 12.3745, + "step": 8899 + }, + { + "epoch": 0.48464056958880425, + "grad_norm": 0.5981741009577994, + "learning_rate": 0.0001772741160781462, + "loss": 12.4311, + "step": 8900 + }, + { + "epoch": 0.4846950235853873, + "grad_norm": 0.632341233499706, + "learning_rate": 0.00017726851864162952, + "loss": 12.5217, + "step": 8901 + }, + { + "epoch": 0.4847494775819703, + "grad_norm": 0.5661173025837181, + "learning_rate": 0.0001772629206042599, + "loss": 12.2706, + "step": 8902 + }, + { + "epoch": 0.4848039315785533, + "grad_norm": 0.5429235944245946, + "learning_rate": 0.00017725732196608086, + "loss": 12.3377, + "step": 8903 + }, + { + "epoch": 0.48485838557513633, + "grad_norm": 0.5779148781970922, + "learning_rate": 0.00017725172272713588, + "loss": 12.4942, + "step": 8904 + }, + { + "epoch": 0.4849128395717193, + "grad_norm": 0.5910247841671594, + "learning_rate": 0.0001772461228874686, + "loss": 12.4021, + "step": 8905 + }, + { + "epoch": 0.4849672935683023, + "grad_norm": 0.5960390491561129, + "learning_rate": 0.00017724052244712251, + "loss": 12.476, + "step": 8906 + }, + { + "epoch": 0.48502174756488536, + "grad_norm": 0.5979535906650704, + "learning_rate": 0.00017723492140614115, + "loss": 12.1532, + "step": 8907 + }, + { + "epoch": 0.48507620156146836, + "grad_norm": 0.5835970639327157, + "learning_rate": 0.0001772293197645681, + "loss": 12.426, + "step": 8908 + }, + { + "epoch": 0.48513065555805135, + "grad_norm": 0.578047193217132, + "learning_rate": 0.00017722371752244687, + "loss": 12.2607, + "step": 8909 + }, + { + "epoch": 0.4851851095546344, + "grad_norm": 0.6360885630082748, + "learning_rate": 0.00017721811467982112, + "loss": 12.4493, + "step": 8910 + }, + { + "epoch": 0.4852395635512174, + "grad_norm": 0.5881949679055104, + "learning_rate": 0.0001772125112367343, + "loss": 12.3724, + "step": 8911 + }, + { + "epoch": 0.4852940175478004, + "grad_norm": 0.6163484189248782, + "learning_rate": 0.0001772069071932301, + "loss": 12.4753, + "step": 8912 + }, + { + "epoch": 0.48534847154438343, + "grad_norm": 0.5804625073859097, + "learning_rate": 0.000177201302549352, + "loss": 12.4493, + "step": 8913 + }, + { + "epoch": 0.4854029255409664, + "grad_norm": 0.5913429595420551, + "learning_rate": 0.00017719569730514367, + "loss": 12.4647, + "step": 8914 + }, + { + "epoch": 0.4854573795375494, + "grad_norm": 0.6458546645180917, + "learning_rate": 0.00017719009146064863, + "loss": 12.402, + "step": 8915 + }, + { + "epoch": 0.48551183353413246, + "grad_norm": 0.6266882741409003, + "learning_rate": 0.00017718448501591048, + "loss": 12.427, + "step": 8916 + }, + { + "epoch": 0.48556628753071546, + "grad_norm": 0.7313824026603742, + "learning_rate": 0.00017717887797097284, + "loss": 12.3917, + "step": 8917 + }, + { + "epoch": 0.48562074152729845, + "grad_norm": 0.6531470914819183, + "learning_rate": 0.0001771732703258793, + "loss": 12.4148, + "step": 8918 + }, + { + "epoch": 0.4856751955238815, + "grad_norm": 0.5753462283529436, + "learning_rate": 0.00017716766208067348, + "loss": 12.2808, + "step": 8919 + }, + { + "epoch": 0.4857296495204645, + "grad_norm": 0.6291815969960752, + "learning_rate": 0.00017716205323539897, + "loss": 12.4196, + "step": 8920 + }, + { + "epoch": 0.4857841035170475, + "grad_norm": 0.6655582182857843, + "learning_rate": 0.00017715644379009938, + "loss": 12.6431, + "step": 8921 + }, + { + "epoch": 0.48583855751363053, + "grad_norm": 0.6351500978892104, + "learning_rate": 0.00017715083374481835, + "loss": 12.1745, + "step": 8922 + }, + { + "epoch": 0.4858930115102135, + "grad_norm": 0.7014680673123271, + "learning_rate": 0.00017714522309959953, + "loss": 12.5735, + "step": 8923 + }, + { + "epoch": 0.4859474655067965, + "grad_norm": 0.5961403293878741, + "learning_rate": 0.0001771396118544865, + "loss": 12.3263, + "step": 8924 + }, + { + "epoch": 0.48600191950337956, + "grad_norm": 0.6381696181668911, + "learning_rate": 0.00017713400000952292, + "loss": 12.5789, + "step": 8925 + }, + { + "epoch": 0.48605637349996256, + "grad_norm": 0.5781356158913226, + "learning_rate": 0.00017712838756475237, + "loss": 12.3219, + "step": 8926 + }, + { + "epoch": 0.48611082749654555, + "grad_norm": 0.6133175971860488, + "learning_rate": 0.0001771227745202186, + "loss": 12.5355, + "step": 8927 + }, + { + "epoch": 0.4861652814931286, + "grad_norm": 0.6944077918723269, + "learning_rate": 0.00017711716087596518, + "loss": 12.4948, + "step": 8928 + }, + { + "epoch": 0.4862197354897116, + "grad_norm": 0.6446646845681596, + "learning_rate": 0.00017711154663203578, + "loss": 12.5193, + "step": 8929 + }, + { + "epoch": 0.4862741894862946, + "grad_norm": 0.6286728175168206, + "learning_rate": 0.0001771059317884741, + "loss": 12.5637, + "step": 8930 + }, + { + "epoch": 0.48632864348287763, + "grad_norm": 0.6662026021201488, + "learning_rate": 0.0001771003163453237, + "loss": 12.4626, + "step": 8931 + }, + { + "epoch": 0.4863830974794606, + "grad_norm": 0.6301674124609281, + "learning_rate": 0.00017709470030262834, + "loss": 12.5071, + "step": 8932 + }, + { + "epoch": 0.4864375514760437, + "grad_norm": 0.5735200957479709, + "learning_rate": 0.00017708908366043168, + "loss": 12.4277, + "step": 8933 + }, + { + "epoch": 0.48649200547262667, + "grad_norm": 0.616838235274588, + "learning_rate": 0.00017708346641877735, + "loss": 12.4254, + "step": 8934 + }, + { + "epoch": 0.48654645946920966, + "grad_norm": 0.6986291067091521, + "learning_rate": 0.00017707784857770906, + "loss": 12.5306, + "step": 8935 + }, + { + "epoch": 0.4866009134657927, + "grad_norm": 0.6414495094248273, + "learning_rate": 0.00017707223013727053, + "loss": 12.438, + "step": 8936 + }, + { + "epoch": 0.4866553674623757, + "grad_norm": 0.7109706319146375, + "learning_rate": 0.00017706661109750536, + "loss": 12.3861, + "step": 8937 + }, + { + "epoch": 0.4867098214589587, + "grad_norm": 0.6688809895812369, + "learning_rate": 0.00017706099145845734, + "loss": 12.3729, + "step": 8938 + }, + { + "epoch": 0.48676427545554174, + "grad_norm": 0.6153038549431864, + "learning_rate": 0.0001770553712201701, + "loss": 12.4929, + "step": 8939 + }, + { + "epoch": 0.48681872945212473, + "grad_norm": 0.7329807332996898, + "learning_rate": 0.0001770497503826874, + "loss": 12.4184, + "step": 8940 + }, + { + "epoch": 0.4868731834487077, + "grad_norm": 0.7084924363855114, + "learning_rate": 0.0001770441289460529, + "loss": 12.3417, + "step": 8941 + }, + { + "epoch": 0.4869276374452908, + "grad_norm": 0.6815236874363345, + "learning_rate": 0.00017703850691031035, + "loss": 12.4823, + "step": 8942 + }, + { + "epoch": 0.48698209144187377, + "grad_norm": 0.577985264202812, + "learning_rate": 0.00017703288427550342, + "loss": 12.4893, + "step": 8943 + }, + { + "epoch": 0.48703654543845676, + "grad_norm": 0.5997747016250154, + "learning_rate": 0.0001770272610416759, + "loss": 12.4236, + "step": 8944 + }, + { + "epoch": 0.4870909994350398, + "grad_norm": 0.6303074713542345, + "learning_rate": 0.00017702163720887144, + "loss": 12.3766, + "step": 8945 + }, + { + "epoch": 0.4871454534316228, + "grad_norm": 0.6148182246803541, + "learning_rate": 0.00017701601277713382, + "loss": 12.4035, + "step": 8946 + }, + { + "epoch": 0.4871999074282058, + "grad_norm": 0.6129823580686586, + "learning_rate": 0.0001770103877465068, + "loss": 12.3825, + "step": 8947 + }, + { + "epoch": 0.48725436142478884, + "grad_norm": 0.623778798045754, + "learning_rate": 0.00017700476211703406, + "loss": 12.4081, + "step": 8948 + }, + { + "epoch": 0.48730881542137183, + "grad_norm": 0.6131756723701572, + "learning_rate": 0.0001769991358887594, + "loss": 12.346, + "step": 8949 + }, + { + "epoch": 0.4873632694179548, + "grad_norm": 0.6497155540267017, + "learning_rate": 0.00017699350906172655, + "loss": 12.4192, + "step": 8950 + }, + { + "epoch": 0.4874177234145379, + "grad_norm": 0.7017317517650884, + "learning_rate": 0.00017698788163597923, + "loss": 12.6197, + "step": 8951 + }, + { + "epoch": 0.48747217741112087, + "grad_norm": 0.5495695820318853, + "learning_rate": 0.00017698225361156126, + "loss": 12.4219, + "step": 8952 + }, + { + "epoch": 0.48752663140770386, + "grad_norm": 0.600004068120246, + "learning_rate": 0.00017697662498851634, + "loss": 12.4167, + "step": 8953 + }, + { + "epoch": 0.4875810854042869, + "grad_norm": 0.5677519445784427, + "learning_rate": 0.0001769709957668883, + "loss": 12.4206, + "step": 8954 + }, + { + "epoch": 0.4876355394008699, + "grad_norm": 0.6933956699220511, + "learning_rate": 0.00017696536594672093, + "loss": 12.4454, + "step": 8955 + }, + { + "epoch": 0.4876899933974529, + "grad_norm": 0.5852886126317223, + "learning_rate": 0.0001769597355280579, + "loss": 12.4294, + "step": 8956 + }, + { + "epoch": 0.48774444739403594, + "grad_norm": 0.6707874767566225, + "learning_rate": 0.00017695410451094309, + "loss": 12.3555, + "step": 8957 + }, + { + "epoch": 0.48779890139061893, + "grad_norm": 0.6275576481656014, + "learning_rate": 0.00017694847289542027, + "loss": 12.3984, + "step": 8958 + }, + { + "epoch": 0.4878533553872019, + "grad_norm": 0.7453159192884955, + "learning_rate": 0.0001769428406815332, + "loss": 12.3675, + "step": 8959 + }, + { + "epoch": 0.487907809383785, + "grad_norm": 0.7149271919206965, + "learning_rate": 0.0001769372078693257, + "loss": 12.4132, + "step": 8960 + }, + { + "epoch": 0.48796226338036797, + "grad_norm": 0.6762737105423414, + "learning_rate": 0.00017693157445884157, + "loss": 12.3941, + "step": 8961 + }, + { + "epoch": 0.48801671737695096, + "grad_norm": 0.6419090179100578, + "learning_rate": 0.00017692594045012463, + "loss": 12.304, + "step": 8962 + }, + { + "epoch": 0.488071171373534, + "grad_norm": 0.6168353578562813, + "learning_rate": 0.00017692030584321862, + "loss": 12.43, + "step": 8963 + }, + { + "epoch": 0.488125625370117, + "grad_norm": 0.7407722298231637, + "learning_rate": 0.0001769146706381675, + "loss": 12.4277, + "step": 8964 + }, + { + "epoch": 0.4881800793667, + "grad_norm": 0.6123927513441908, + "learning_rate": 0.00017690903483501494, + "loss": 12.3529, + "step": 8965 + }, + { + "epoch": 0.48823453336328304, + "grad_norm": 0.6927569113790386, + "learning_rate": 0.00017690339843380487, + "loss": 12.3037, + "step": 8966 + }, + { + "epoch": 0.48828898735986603, + "grad_norm": 0.7282325256842512, + "learning_rate": 0.00017689776143458102, + "loss": 12.4561, + "step": 8967 + }, + { + "epoch": 0.4883434413564491, + "grad_norm": 0.5931461718947436, + "learning_rate": 0.00017689212383738734, + "loss": 12.5021, + "step": 8968 + }, + { + "epoch": 0.4883978953530321, + "grad_norm": 0.6057001158655123, + "learning_rate": 0.00017688648564226757, + "loss": 12.3242, + "step": 8969 + }, + { + "epoch": 0.48845234934961507, + "grad_norm": 0.6998284602710015, + "learning_rate": 0.00017688084684926563, + "loss": 12.3638, + "step": 8970 + }, + { + "epoch": 0.4885068033461981, + "grad_norm": 0.6223628650538694, + "learning_rate": 0.0001768752074584253, + "loss": 12.3924, + "step": 8971 + }, + { + "epoch": 0.4885612573427811, + "grad_norm": 0.6172249769850962, + "learning_rate": 0.0001768695674697905, + "loss": 12.3722, + "step": 8972 + }, + { + "epoch": 0.4886157113393641, + "grad_norm": 0.6434240293370114, + "learning_rate": 0.00017686392688340502, + "loss": 12.4554, + "step": 8973 + }, + { + "epoch": 0.48867016533594715, + "grad_norm": 0.6284914832648406, + "learning_rate": 0.00017685828569931277, + "loss": 12.3951, + "step": 8974 + }, + { + "epoch": 0.48872461933253014, + "grad_norm": 0.6584633649303565, + "learning_rate": 0.00017685264391755756, + "loss": 12.4403, + "step": 8975 + }, + { + "epoch": 0.48877907332911313, + "grad_norm": 0.6239764584426922, + "learning_rate": 0.00017684700153818334, + "loss": 12.445, + "step": 8976 + }, + { + "epoch": 0.4888335273256962, + "grad_norm": 0.604252889534663, + "learning_rate": 0.00017684135856123394, + "loss": 12.3662, + "step": 8977 + }, + { + "epoch": 0.4888879813222792, + "grad_norm": 0.6768630849815542, + "learning_rate": 0.00017683571498675326, + "loss": 12.4315, + "step": 8978 + }, + { + "epoch": 0.48894243531886217, + "grad_norm": 0.561773748949832, + "learning_rate": 0.0001768300708147852, + "loss": 12.4046, + "step": 8979 + }, + { + "epoch": 0.4889968893154452, + "grad_norm": 0.6191608849730387, + "learning_rate": 0.00017682442604537358, + "loss": 12.3913, + "step": 8980 + }, + { + "epoch": 0.4890513433120282, + "grad_norm": 0.668467651031518, + "learning_rate": 0.00017681878067856235, + "loss": 12.427, + "step": 8981 + }, + { + "epoch": 0.4891057973086112, + "grad_norm": 0.5852875868444413, + "learning_rate": 0.0001768131347143954, + "loss": 12.2236, + "step": 8982 + }, + { + "epoch": 0.48916025130519425, + "grad_norm": 0.6495359491080023, + "learning_rate": 0.00017680748815291662, + "loss": 12.2845, + "step": 8983 + }, + { + "epoch": 0.48921470530177724, + "grad_norm": 0.7043442632709124, + "learning_rate": 0.00017680184099416995, + "loss": 12.3416, + "step": 8984 + }, + { + "epoch": 0.48926915929836023, + "grad_norm": 0.6821440893967539, + "learning_rate": 0.0001767961932381993, + "loss": 12.488, + "step": 8985 + }, + { + "epoch": 0.4893236132949433, + "grad_norm": 0.7518808059202897, + "learning_rate": 0.00017679054488504856, + "loss": 12.4599, + "step": 8986 + }, + { + "epoch": 0.4893780672915263, + "grad_norm": 0.6489067588764982, + "learning_rate": 0.00017678489593476164, + "loss": 12.3983, + "step": 8987 + }, + { + "epoch": 0.48943252128810927, + "grad_norm": 0.5932349853729175, + "learning_rate": 0.0001767792463873825, + "loss": 12.3459, + "step": 8988 + }, + { + "epoch": 0.4894869752846923, + "grad_norm": 0.6926889837411475, + "learning_rate": 0.0001767735962429551, + "loss": 12.4644, + "step": 8989 + }, + { + "epoch": 0.4895414292812753, + "grad_norm": 0.6420778151456797, + "learning_rate": 0.0001767679455015233, + "loss": 12.3545, + "step": 8990 + }, + { + "epoch": 0.4895958832778583, + "grad_norm": 0.6720821949354343, + "learning_rate": 0.0001767622941631311, + "loss": 12.4321, + "step": 8991 + }, + { + "epoch": 0.48965033727444135, + "grad_norm": 0.6577540205055701, + "learning_rate": 0.00017675664222782244, + "loss": 12.503, + "step": 8992 + }, + { + "epoch": 0.48970479127102434, + "grad_norm": 0.6245459109383175, + "learning_rate": 0.00017675098969564124, + "loss": 12.4008, + "step": 8993 + }, + { + "epoch": 0.48975924526760733, + "grad_norm": 0.6799189419701791, + "learning_rate": 0.0001767453365666315, + "loss": 12.566, + "step": 8994 + }, + { + "epoch": 0.4898136992641904, + "grad_norm": 0.6190799234397225, + "learning_rate": 0.0001767396828408371, + "loss": 12.4133, + "step": 8995 + }, + { + "epoch": 0.4898681532607734, + "grad_norm": 0.5648051502723331, + "learning_rate": 0.0001767340285183021, + "loss": 12.3986, + "step": 8996 + }, + { + "epoch": 0.48992260725735637, + "grad_norm": 0.7832200571204988, + "learning_rate": 0.0001767283735990704, + "loss": 12.4317, + "step": 8997 + }, + { + "epoch": 0.4899770612539394, + "grad_norm": 0.5896019405323225, + "learning_rate": 0.00017672271808318605, + "loss": 12.3887, + "step": 8998 + }, + { + "epoch": 0.4900315152505224, + "grad_norm": 0.5847485314578779, + "learning_rate": 0.0001767170619706929, + "loss": 12.2743, + "step": 8999 + }, + { + "epoch": 0.49008596924710546, + "grad_norm": 0.6561473704564957, + "learning_rate": 0.00017671140526163506, + "loss": 12.3998, + "step": 9000 + }, + { + "epoch": 0.49014042324368845, + "grad_norm": 0.6031748325909405, + "learning_rate": 0.00017670574795605645, + "loss": 12.3966, + "step": 9001 + }, + { + "epoch": 0.49019487724027144, + "grad_norm": 0.5969040835414722, + "learning_rate": 0.0001767000900540011, + "loss": 12.4078, + "step": 9002 + }, + { + "epoch": 0.4902493312368545, + "grad_norm": 0.7009410013108908, + "learning_rate": 0.00017669443155551298, + "loss": 12.3908, + "step": 9003 + }, + { + "epoch": 0.4903037852334375, + "grad_norm": 0.6204813872425611, + "learning_rate": 0.00017668877246063608, + "loss": 12.3522, + "step": 9004 + }, + { + "epoch": 0.4903582392300205, + "grad_norm": 0.6725773712069133, + "learning_rate": 0.00017668311276941445, + "loss": 12.4103, + "step": 9005 + }, + { + "epoch": 0.4904126932266035, + "grad_norm": 0.5686997026451569, + "learning_rate": 0.00017667745248189207, + "loss": 12.1809, + "step": 9006 + }, + { + "epoch": 0.4904671472231865, + "grad_norm": 0.6337705792607752, + "learning_rate": 0.00017667179159811295, + "loss": 12.3331, + "step": 9007 + }, + { + "epoch": 0.4905216012197695, + "grad_norm": 0.6859319984556346, + "learning_rate": 0.00017666613011812113, + "loss": 12.4774, + "step": 9008 + }, + { + "epoch": 0.49057605521635256, + "grad_norm": 0.68712970750493, + "learning_rate": 0.0001766604680419606, + "loss": 12.4661, + "step": 9009 + }, + { + "epoch": 0.49063050921293555, + "grad_norm": 0.6337798565063828, + "learning_rate": 0.00017665480536967546, + "loss": 12.5433, + "step": 9010 + }, + { + "epoch": 0.49068496320951854, + "grad_norm": 0.9691188287958652, + "learning_rate": 0.00017664914210130966, + "loss": 12.5172, + "step": 9011 + }, + { + "epoch": 0.4907394172061016, + "grad_norm": 0.624849717001331, + "learning_rate": 0.0001766434782369073, + "loss": 12.4614, + "step": 9012 + }, + { + "epoch": 0.4907938712026846, + "grad_norm": 0.6255963762613319, + "learning_rate": 0.0001766378137765124, + "loss": 12.4613, + "step": 9013 + }, + { + "epoch": 0.4908483251992676, + "grad_norm": 0.6966420063072758, + "learning_rate": 0.000176632148720169, + "loss": 12.3903, + "step": 9014 + }, + { + "epoch": 0.4909027791958506, + "grad_norm": 0.6181596567503763, + "learning_rate": 0.00017662648306792118, + "loss": 12.4234, + "step": 9015 + }, + { + "epoch": 0.4909572331924336, + "grad_norm": 0.6472714093995697, + "learning_rate": 0.00017662081681981296, + "loss": 12.4579, + "step": 9016 + }, + { + "epoch": 0.4910116871890166, + "grad_norm": 0.6307280492239105, + "learning_rate": 0.0001766151499758884, + "loss": 12.3809, + "step": 9017 + }, + { + "epoch": 0.49106614118559966, + "grad_norm": 0.7145252730355572, + "learning_rate": 0.0001766094825361916, + "loss": 12.4365, + "step": 9018 + }, + { + "epoch": 0.49112059518218265, + "grad_norm": 0.5779293889447368, + "learning_rate": 0.0001766038145007666, + "loss": 12.3208, + "step": 9019 + }, + { + "epoch": 0.49117504917876564, + "grad_norm": 0.5970030202119183, + "learning_rate": 0.00017659814586965753, + "loss": 12.4748, + "step": 9020 + }, + { + "epoch": 0.4912295031753487, + "grad_norm": 0.6713233594796585, + "learning_rate": 0.00017659247664290843, + "loss": 12.5403, + "step": 9021 + }, + { + "epoch": 0.4912839571719317, + "grad_norm": 0.6389280094245742, + "learning_rate": 0.00017658680682056336, + "loss": 12.3417, + "step": 9022 + }, + { + "epoch": 0.4913384111685147, + "grad_norm": 0.5997464523487829, + "learning_rate": 0.00017658113640266646, + "loss": 12.2971, + "step": 9023 + }, + { + "epoch": 0.4913928651650977, + "grad_norm": 0.5998221515239658, + "learning_rate": 0.00017657546538926176, + "loss": 12.4103, + "step": 9024 + }, + { + "epoch": 0.4914473191616807, + "grad_norm": 0.6621423715721438, + "learning_rate": 0.00017656979378039345, + "loss": 12.2766, + "step": 9025 + }, + { + "epoch": 0.4915017731582637, + "grad_norm": 0.641780152243786, + "learning_rate": 0.00017656412157610553, + "loss": 12.4449, + "step": 9026 + }, + { + "epoch": 0.49155622715484676, + "grad_norm": 0.630345320868909, + "learning_rate": 0.00017655844877644222, + "loss": 12.3306, + "step": 9027 + }, + { + "epoch": 0.49161068115142975, + "grad_norm": 0.6483735385691813, + "learning_rate": 0.00017655277538144754, + "loss": 12.376, + "step": 9028 + }, + { + "epoch": 0.49166513514801274, + "grad_norm": 0.6309651726102987, + "learning_rate": 0.00017654710139116563, + "loss": 12.5355, + "step": 9029 + }, + { + "epoch": 0.4917195891445958, + "grad_norm": 0.600940677237466, + "learning_rate": 0.00017654142680564064, + "loss": 12.334, + "step": 9030 + }, + { + "epoch": 0.4917740431411788, + "grad_norm": 0.5921764876537235, + "learning_rate": 0.0001765357516249167, + "loss": 12.2664, + "step": 9031 + }, + { + "epoch": 0.4918284971377618, + "grad_norm": 0.544108968363809, + "learning_rate": 0.0001765300758490379, + "loss": 12.4143, + "step": 9032 + }, + { + "epoch": 0.4918829511343448, + "grad_norm": 0.7068826009519629, + "learning_rate": 0.00017652439947804838, + "loss": 12.3461, + "step": 9033 + }, + { + "epoch": 0.4919374051309278, + "grad_norm": 0.6469820503926543, + "learning_rate": 0.00017651872251199229, + "loss": 12.3202, + "step": 9034 + }, + { + "epoch": 0.49199185912751087, + "grad_norm": 0.6601942119120693, + "learning_rate": 0.0001765130449509138, + "loss": 12.4268, + "step": 9035 + }, + { + "epoch": 0.49204631312409386, + "grad_norm": 0.5950297306819458, + "learning_rate": 0.00017650736679485706, + "loss": 12.4843, + "step": 9036 + }, + { + "epoch": 0.49210076712067685, + "grad_norm": 0.6617665756952725, + "learning_rate": 0.00017650168804386618, + "loss": 12.3563, + "step": 9037 + }, + { + "epoch": 0.4921552211172599, + "grad_norm": 0.6127139296457449, + "learning_rate": 0.00017649600869798534, + "loss": 12.4384, + "step": 9038 + }, + { + "epoch": 0.4922096751138429, + "grad_norm": 0.6596836507747746, + "learning_rate": 0.00017649032875725873, + "loss": 12.3869, + "step": 9039 + }, + { + "epoch": 0.4922641291104259, + "grad_norm": 0.7322953833439002, + "learning_rate": 0.00017648464822173047, + "loss": 12.4327, + "step": 9040 + }, + { + "epoch": 0.49231858310700893, + "grad_norm": 0.6495668511280798, + "learning_rate": 0.0001764789670914448, + "loss": 12.3549, + "step": 9041 + }, + { + "epoch": 0.4923730371035919, + "grad_norm": 0.6118572627218264, + "learning_rate": 0.00017647328536644582, + "loss": 12.4796, + "step": 9042 + }, + { + "epoch": 0.4924274911001749, + "grad_norm": 0.8118395035290029, + "learning_rate": 0.00017646760304677775, + "loss": 12.3732, + "step": 9043 + }, + { + "epoch": 0.49248194509675797, + "grad_norm": 0.6024457061523186, + "learning_rate": 0.0001764619201324848, + "loss": 12.3939, + "step": 9044 + }, + { + "epoch": 0.49253639909334096, + "grad_norm": 0.622607292398364, + "learning_rate": 0.0001764562366236111, + "loss": 12.4687, + "step": 9045 + }, + { + "epoch": 0.49259085308992395, + "grad_norm": 0.6375419946487572, + "learning_rate": 0.00017645055252020093, + "loss": 12.4258, + "step": 9046 + }, + { + "epoch": 0.492645307086507, + "grad_norm": 0.6396330427611137, + "learning_rate": 0.0001764448678222984, + "loss": 12.4937, + "step": 9047 + }, + { + "epoch": 0.49269976108309, + "grad_norm": 0.6192244845607952, + "learning_rate": 0.00017643918252994778, + "loss": 12.1713, + "step": 9048 + }, + { + "epoch": 0.492754215079673, + "grad_norm": 0.6223106593402868, + "learning_rate": 0.00017643349664319327, + "loss": 12.5063, + "step": 9049 + }, + { + "epoch": 0.49280866907625603, + "grad_norm": 0.6008700229121132, + "learning_rate": 0.00017642781016207902, + "loss": 12.4324, + "step": 9050 + }, + { + "epoch": 0.492863123072839, + "grad_norm": 0.6685584067925143, + "learning_rate": 0.00017642212308664935, + "loss": 12.4043, + "step": 9051 + }, + { + "epoch": 0.492917577069422, + "grad_norm": 0.652765870070671, + "learning_rate": 0.00017641643541694841, + "loss": 12.2954, + "step": 9052 + }, + { + "epoch": 0.49297203106600507, + "grad_norm": 0.5641445539183918, + "learning_rate": 0.0001764107471530205, + "loss": 12.2922, + "step": 9053 + }, + { + "epoch": 0.49302648506258806, + "grad_norm": 0.7735158333144617, + "learning_rate": 0.00017640505829490975, + "loss": 12.4085, + "step": 9054 + }, + { + "epoch": 0.49308093905917105, + "grad_norm": 0.7470430066736975, + "learning_rate": 0.00017639936884266045, + "loss": 12.349, + "step": 9055 + }, + { + "epoch": 0.4931353930557541, + "grad_norm": 0.7206711098167883, + "learning_rate": 0.00017639367879631687, + "loss": 12.299, + "step": 9056 + }, + { + "epoch": 0.4931898470523371, + "grad_norm": 0.7371347771651026, + "learning_rate": 0.00017638798815592326, + "loss": 12.4882, + "step": 9057 + }, + { + "epoch": 0.4932443010489201, + "grad_norm": 0.595033333497349, + "learning_rate": 0.0001763822969215238, + "loss": 12.3871, + "step": 9058 + }, + { + "epoch": 0.49329875504550313, + "grad_norm": 0.8054972406286517, + "learning_rate": 0.0001763766050931628, + "loss": 12.405, + "step": 9059 + }, + { + "epoch": 0.4933532090420861, + "grad_norm": 0.6098101737336612, + "learning_rate": 0.00017637091267088453, + "loss": 12.4194, + "step": 9060 + }, + { + "epoch": 0.4934076630386691, + "grad_norm": 0.838036559345574, + "learning_rate": 0.00017636521965473323, + "loss": 12.4134, + "step": 9061 + }, + { + "epoch": 0.49346211703525217, + "grad_norm": 0.6159087356357568, + "learning_rate": 0.00017635952604475318, + "loss": 12.4425, + "step": 9062 + }, + { + "epoch": 0.49351657103183516, + "grad_norm": 0.5772658512197129, + "learning_rate": 0.0001763538318409886, + "loss": 12.3848, + "step": 9063 + }, + { + "epoch": 0.49357102502841815, + "grad_norm": 0.648307453696023, + "learning_rate": 0.00017634813704348385, + "loss": 12.4, + "step": 9064 + }, + { + "epoch": 0.4936254790250012, + "grad_norm": 0.797999057121607, + "learning_rate": 0.0001763424416522832, + "loss": 12.4749, + "step": 9065 + }, + { + "epoch": 0.4936799330215842, + "grad_norm": 0.6154663448292866, + "learning_rate": 0.00017633674566743093, + "loss": 12.4569, + "step": 9066 + }, + { + "epoch": 0.49373438701816724, + "grad_norm": 0.6053841272700453, + "learning_rate": 0.00017633104908897128, + "loss": 12.4407, + "step": 9067 + }, + { + "epoch": 0.49378884101475023, + "grad_norm": 0.6125434288709458, + "learning_rate": 0.00017632535191694864, + "loss": 12.3684, + "step": 9068 + }, + { + "epoch": 0.4938432950113332, + "grad_norm": 0.5478148758765967, + "learning_rate": 0.0001763196541514072, + "loss": 12.466, + "step": 9069 + }, + { + "epoch": 0.4938977490079163, + "grad_norm": 0.6767775722685464, + "learning_rate": 0.00017631395579239138, + "loss": 12.4683, + "step": 9070 + }, + { + "epoch": 0.49395220300449927, + "grad_norm": 0.6565295047603227, + "learning_rate": 0.00017630825683994546, + "loss": 12.4402, + "step": 9071 + }, + { + "epoch": 0.49400665700108226, + "grad_norm": 0.6186660536186829, + "learning_rate": 0.0001763025572941137, + "loss": 12.4184, + "step": 9072 + }, + { + "epoch": 0.4940611109976653, + "grad_norm": 0.574512487793142, + "learning_rate": 0.00017629685715494047, + "loss": 12.3434, + "step": 9073 + }, + { + "epoch": 0.4941155649942483, + "grad_norm": 0.5786501159465615, + "learning_rate": 0.00017629115642247008, + "loss": 12.2469, + "step": 9074 + }, + { + "epoch": 0.4941700189908313, + "grad_norm": 0.6344422448863354, + "learning_rate": 0.00017628545509674686, + "loss": 12.3322, + "step": 9075 + }, + { + "epoch": 0.49422447298741434, + "grad_norm": 0.5947827175109859, + "learning_rate": 0.00017627975317781514, + "loss": 12.3867, + "step": 9076 + }, + { + "epoch": 0.49427892698399734, + "grad_norm": 0.7234156368573843, + "learning_rate": 0.00017627405066571926, + "loss": 12.4595, + "step": 9077 + }, + { + "epoch": 0.49433338098058033, + "grad_norm": 0.5716608289086761, + "learning_rate": 0.0001762683475605036, + "loss": 12.3055, + "step": 9078 + }, + { + "epoch": 0.4943878349771634, + "grad_norm": 0.6516898534652547, + "learning_rate": 0.0001762626438622125, + "loss": 12.2219, + "step": 9079 + }, + { + "epoch": 0.49444228897374637, + "grad_norm": 0.5531591292155007, + "learning_rate": 0.00017625693957089023, + "loss": 12.2898, + "step": 9080 + }, + { + "epoch": 0.49449674297032936, + "grad_norm": 0.6540053412369687, + "learning_rate": 0.00017625123468658128, + "loss": 12.3422, + "step": 9081 + }, + { + "epoch": 0.4945511969669124, + "grad_norm": 0.5822114864652976, + "learning_rate": 0.00017624552920932987, + "loss": 12.3534, + "step": 9082 + }, + { + "epoch": 0.4946056509634954, + "grad_norm": 0.647468952988309, + "learning_rate": 0.00017623982313918048, + "loss": 12.2724, + "step": 9083 + }, + { + "epoch": 0.4946601049600784, + "grad_norm": 0.6538053463120066, + "learning_rate": 0.00017623411647617744, + "loss": 12.4656, + "step": 9084 + }, + { + "epoch": 0.49471455895666144, + "grad_norm": 0.5912115377511528, + "learning_rate": 0.00017622840922036512, + "loss": 12.3684, + "step": 9085 + }, + { + "epoch": 0.49476901295324444, + "grad_norm": 0.5638563271249786, + "learning_rate": 0.00017622270137178793, + "loss": 12.2539, + "step": 9086 + }, + { + "epoch": 0.49482346694982743, + "grad_norm": 0.6885206221172212, + "learning_rate": 0.00017621699293049017, + "loss": 12.5263, + "step": 9087 + }, + { + "epoch": 0.4948779209464105, + "grad_norm": 0.6695687649039862, + "learning_rate": 0.00017621128389651633, + "loss": 12.4807, + "step": 9088 + }, + { + "epoch": 0.49493237494299347, + "grad_norm": 0.6267441366242752, + "learning_rate": 0.00017620557426991078, + "loss": 12.4484, + "step": 9089 + }, + { + "epoch": 0.49498682893957646, + "grad_norm": 0.7064926132108872, + "learning_rate": 0.00017619986405071787, + "loss": 12.4488, + "step": 9090 + }, + { + "epoch": 0.4950412829361595, + "grad_norm": 0.56968390851137, + "learning_rate": 0.00017619415323898206, + "loss": 12.3079, + "step": 9091 + }, + { + "epoch": 0.4950957369327425, + "grad_norm": 0.6304071941633153, + "learning_rate": 0.00017618844183474774, + "loss": 12.512, + "step": 9092 + }, + { + "epoch": 0.4951501909293255, + "grad_norm": 0.6627967334696621, + "learning_rate": 0.0001761827298380593, + "loss": 12.444, + "step": 9093 + }, + { + "epoch": 0.49520464492590854, + "grad_norm": 0.6285211262951765, + "learning_rate": 0.00017617701724896117, + "loss": 12.3301, + "step": 9094 + }, + { + "epoch": 0.49525909892249154, + "grad_norm": 0.6608859614262765, + "learning_rate": 0.00017617130406749783, + "loss": 12.4028, + "step": 9095 + }, + { + "epoch": 0.49531355291907453, + "grad_norm": 0.6317947756451693, + "learning_rate": 0.0001761655902937136, + "loss": 12.4618, + "step": 9096 + }, + { + "epoch": 0.4953680069156576, + "grad_norm": 0.7670429802659895, + "learning_rate": 0.000176159875927653, + "loss": 12.4435, + "step": 9097 + }, + { + "epoch": 0.49542246091224057, + "grad_norm": 0.7483558764694811, + "learning_rate": 0.00017615416096936043, + "loss": 12.4271, + "step": 9098 + }, + { + "epoch": 0.49547691490882356, + "grad_norm": 0.727556789374051, + "learning_rate": 0.00017614844541888035, + "loss": 12.2108, + "step": 9099 + }, + { + "epoch": 0.4955313689054066, + "grad_norm": 0.9103992034491821, + "learning_rate": 0.00017614272927625715, + "loss": 12.4841, + "step": 9100 + }, + { + "epoch": 0.4955858229019896, + "grad_norm": 0.593670948636598, + "learning_rate": 0.0001761370125415353, + "loss": 12.4133, + "step": 9101 + }, + { + "epoch": 0.49564027689857265, + "grad_norm": 0.7023290626503857, + "learning_rate": 0.00017613129521475932, + "loss": 12.3761, + "step": 9102 + }, + { + "epoch": 0.49569473089515564, + "grad_norm": 0.6374200651285543, + "learning_rate": 0.0001761255772959736, + "loss": 12.4001, + "step": 9103 + }, + { + "epoch": 0.49574918489173864, + "grad_norm": 0.7084811765004031, + "learning_rate": 0.00017611985878522265, + "loss": 12.4626, + "step": 9104 + }, + { + "epoch": 0.4958036388883217, + "grad_norm": 0.7987390974405981, + "learning_rate": 0.00017611413968255087, + "loss": 12.3393, + "step": 9105 + }, + { + "epoch": 0.4958580928849047, + "grad_norm": 0.6722889300823013, + "learning_rate": 0.00017610841998800283, + "loss": 12.3658, + "step": 9106 + }, + { + "epoch": 0.49591254688148767, + "grad_norm": 0.7000109463103993, + "learning_rate": 0.0001761026997016229, + "loss": 12.4182, + "step": 9107 + }, + { + "epoch": 0.4959670008780707, + "grad_norm": 0.6721741169406789, + "learning_rate": 0.00017609697882345565, + "loss": 12.5067, + "step": 9108 + }, + { + "epoch": 0.4960214548746537, + "grad_norm": 0.7401697808763813, + "learning_rate": 0.0001760912573535455, + "loss": 12.5111, + "step": 9109 + }, + { + "epoch": 0.4960759088712367, + "grad_norm": 0.718441618807066, + "learning_rate": 0.000176085535291937, + "loss": 12.3193, + "step": 9110 + }, + { + "epoch": 0.49613036286781975, + "grad_norm": 0.6037022793016471, + "learning_rate": 0.0001760798126386746, + "loss": 12.3863, + "step": 9111 + }, + { + "epoch": 0.49618481686440274, + "grad_norm": 0.6269927356974456, + "learning_rate": 0.00017607408939380282, + "loss": 12.3418, + "step": 9112 + }, + { + "epoch": 0.49623927086098574, + "grad_norm": 0.6458263636719211, + "learning_rate": 0.00017606836555736618, + "loss": 12.4122, + "step": 9113 + }, + { + "epoch": 0.4962937248575688, + "grad_norm": 0.6077674815766129, + "learning_rate": 0.0001760626411294092, + "loss": 12.3997, + "step": 9114 + }, + { + "epoch": 0.4963481788541518, + "grad_norm": 0.6827924582993539, + "learning_rate": 0.00017605691610997633, + "loss": 12.5016, + "step": 9115 + }, + { + "epoch": 0.49640263285073477, + "grad_norm": 1.061163683682852, + "learning_rate": 0.0001760511904991121, + "loss": 12.4663, + "step": 9116 + }, + { + "epoch": 0.4964570868473178, + "grad_norm": 0.6579332992213393, + "learning_rate": 0.0001760454642968611, + "loss": 12.4702, + "step": 9117 + }, + { + "epoch": 0.4965115408439008, + "grad_norm": 0.6817706542101409, + "learning_rate": 0.00017603973750326778, + "loss": 12.4496, + "step": 9118 + }, + { + "epoch": 0.4965659948404838, + "grad_norm": 0.6617212495815381, + "learning_rate": 0.00017603401011837675, + "loss": 12.4429, + "step": 9119 + }, + { + "epoch": 0.49662044883706685, + "grad_norm": 0.7923159528782391, + "learning_rate": 0.00017602828214223249, + "loss": 12.459, + "step": 9120 + }, + { + "epoch": 0.49667490283364985, + "grad_norm": 0.6616900559517679, + "learning_rate": 0.00017602255357487953, + "loss": 12.4547, + "step": 9121 + }, + { + "epoch": 0.49672935683023284, + "grad_norm": 0.6740391179889882, + "learning_rate": 0.00017601682441636246, + "loss": 12.3776, + "step": 9122 + }, + { + "epoch": 0.4967838108268159, + "grad_norm": 0.6543330850905006, + "learning_rate": 0.00017601109466672585, + "loss": 12.3831, + "step": 9123 + }, + { + "epoch": 0.4968382648233989, + "grad_norm": 0.6138310059376589, + "learning_rate": 0.00017600536432601417, + "loss": 12.4033, + "step": 9124 + }, + { + "epoch": 0.49689271881998187, + "grad_norm": 0.6241853205466411, + "learning_rate": 0.00017599963339427207, + "loss": 12.1249, + "step": 9125 + }, + { + "epoch": 0.4969471728165649, + "grad_norm": 0.654784791879004, + "learning_rate": 0.00017599390187154405, + "loss": 12.34, + "step": 9126 + }, + { + "epoch": 0.4970016268131479, + "grad_norm": 0.6626102053470619, + "learning_rate": 0.00017598816975787475, + "loss": 12.524, + "step": 9127 + }, + { + "epoch": 0.4970560808097309, + "grad_norm": 0.7690448781237839, + "learning_rate": 0.00017598243705330865, + "loss": 12.3843, + "step": 9128 + }, + { + "epoch": 0.49711053480631395, + "grad_norm": 0.5921755860108986, + "learning_rate": 0.0001759767037578904, + "loss": 12.3137, + "step": 9129 + }, + { + "epoch": 0.49716498880289695, + "grad_norm": 0.6724512511904382, + "learning_rate": 0.00017597096987166454, + "loss": 12.4212, + "step": 9130 + }, + { + "epoch": 0.49721944279947994, + "grad_norm": 0.6060506869183341, + "learning_rate": 0.0001759652353946757, + "loss": 12.4006, + "step": 9131 + }, + { + "epoch": 0.497273896796063, + "grad_norm": 0.5649231816877845, + "learning_rate": 0.00017595950032696844, + "loss": 12.3956, + "step": 9132 + }, + { + "epoch": 0.497328350792646, + "grad_norm": 0.7233811405518149, + "learning_rate": 0.00017595376466858736, + "loss": 12.4503, + "step": 9133 + }, + { + "epoch": 0.497382804789229, + "grad_norm": 0.6628960967687958, + "learning_rate": 0.0001759480284195771, + "loss": 12.4771, + "step": 9134 + }, + { + "epoch": 0.497437258785812, + "grad_norm": 0.68131829983016, + "learning_rate": 0.00017594229157998222, + "loss": 12.3811, + "step": 9135 + }, + { + "epoch": 0.497491712782395, + "grad_norm": 0.6134912800656412, + "learning_rate": 0.0001759365541498473, + "loss": 12.3098, + "step": 9136 + }, + { + "epoch": 0.49754616677897806, + "grad_norm": 0.6714589554358765, + "learning_rate": 0.00017593081612921705, + "loss": 12.411, + "step": 9137 + }, + { + "epoch": 0.49760062077556105, + "grad_norm": 0.6379318436445025, + "learning_rate": 0.00017592507751813601, + "loss": 12.4893, + "step": 9138 + }, + { + "epoch": 0.49765507477214405, + "grad_norm": 0.5473509143522587, + "learning_rate": 0.00017591933831664886, + "loss": 12.2698, + "step": 9139 + }, + { + "epoch": 0.4977095287687271, + "grad_norm": 0.6009501607651135, + "learning_rate": 0.0001759135985248002, + "loss": 12.2894, + "step": 9140 + }, + { + "epoch": 0.4977639827653101, + "grad_norm": 0.6799834772024416, + "learning_rate": 0.00017590785814263466, + "loss": 12.4481, + "step": 9141 + }, + { + "epoch": 0.4978184367618931, + "grad_norm": 0.6630782242113525, + "learning_rate": 0.00017590211717019688, + "loss": 12.3452, + "step": 9142 + }, + { + "epoch": 0.4978728907584761, + "grad_norm": 0.7008834707717136, + "learning_rate": 0.00017589637560753153, + "loss": 12.4784, + "step": 9143 + }, + { + "epoch": 0.4979273447550591, + "grad_norm": 0.6515565727629259, + "learning_rate": 0.0001758906334546832, + "loss": 12.3688, + "step": 9144 + }, + { + "epoch": 0.4979817987516421, + "grad_norm": 0.626932835931393, + "learning_rate": 0.0001758848907116966, + "loss": 12.3809, + "step": 9145 + }, + { + "epoch": 0.49803625274822516, + "grad_norm": 0.6795840967085799, + "learning_rate": 0.00017587914737861638, + "loss": 12.5972, + "step": 9146 + }, + { + "epoch": 0.49809070674480815, + "grad_norm": 0.6063465269047075, + "learning_rate": 0.00017587340345548717, + "loss": 12.4608, + "step": 9147 + }, + { + "epoch": 0.49814516074139115, + "grad_norm": 0.5828488884039613, + "learning_rate": 0.00017586765894235363, + "loss": 12.3942, + "step": 9148 + }, + { + "epoch": 0.4981996147379742, + "grad_norm": 0.5762418528638724, + "learning_rate": 0.0001758619138392605, + "loss": 12.3874, + "step": 9149 + }, + { + "epoch": 0.4982540687345572, + "grad_norm": 0.7426995154173507, + "learning_rate": 0.00017585616814625235, + "loss": 12.3388, + "step": 9150 + }, + { + "epoch": 0.4983085227311402, + "grad_norm": 0.6992365012527693, + "learning_rate": 0.00017585042186337395, + "loss": 12.4163, + "step": 9151 + }, + { + "epoch": 0.49836297672772323, + "grad_norm": 0.6667318576146535, + "learning_rate": 0.00017584467499066994, + "loss": 12.486, + "step": 9152 + }, + { + "epoch": 0.4984174307243062, + "grad_norm": 0.607362379750229, + "learning_rate": 0.000175838927528185, + "loss": 12.437, + "step": 9153 + }, + { + "epoch": 0.4984718847208892, + "grad_norm": 0.6540776468373742, + "learning_rate": 0.00017583317947596387, + "loss": 12.5109, + "step": 9154 + }, + { + "epoch": 0.49852633871747226, + "grad_norm": 0.6659725463738717, + "learning_rate": 0.00017582743083405122, + "loss": 12.3915, + "step": 9155 + }, + { + "epoch": 0.49858079271405525, + "grad_norm": 0.6255060583128188, + "learning_rate": 0.00017582168160249173, + "loss": 12.2338, + "step": 9156 + }, + { + "epoch": 0.49863524671063825, + "grad_norm": 0.5953756443739128, + "learning_rate": 0.00017581593178133016, + "loss": 12.3541, + "step": 9157 + }, + { + "epoch": 0.4986897007072213, + "grad_norm": 0.6483309366016954, + "learning_rate": 0.00017581018137061114, + "loss": 12.2435, + "step": 9158 + }, + { + "epoch": 0.4987441547038043, + "grad_norm": 0.5991782175764987, + "learning_rate": 0.00017580443037037947, + "loss": 12.4159, + "step": 9159 + }, + { + "epoch": 0.4987986087003873, + "grad_norm": 0.6104988237916549, + "learning_rate": 0.00017579867878067984, + "loss": 12.4037, + "step": 9160 + }, + { + "epoch": 0.49885306269697033, + "grad_norm": 0.6523693446003387, + "learning_rate": 0.00017579292660155696, + "loss": 12.4619, + "step": 9161 + }, + { + "epoch": 0.4989075166935533, + "grad_norm": 0.5850126153423292, + "learning_rate": 0.0001757871738330556, + "loss": 12.3452, + "step": 9162 + }, + { + "epoch": 0.4989619706901363, + "grad_norm": 0.6481344573786663, + "learning_rate": 0.00017578142047522044, + "loss": 12.4467, + "step": 9163 + }, + { + "epoch": 0.49901642468671936, + "grad_norm": 0.6272323440376265, + "learning_rate": 0.00017577566652809626, + "loss": 12.3522, + "step": 9164 + }, + { + "epoch": 0.49907087868330235, + "grad_norm": 0.6205903085328062, + "learning_rate": 0.00017576991199172776, + "loss": 12.5213, + "step": 9165 + }, + { + "epoch": 0.49912533267988535, + "grad_norm": 0.6477408614756291, + "learning_rate": 0.00017576415686615976, + "loss": 12.4615, + "step": 9166 + }, + { + "epoch": 0.4991797866764684, + "grad_norm": 0.7118857219939702, + "learning_rate": 0.00017575840115143692, + "loss": 12.5615, + "step": 9167 + }, + { + "epoch": 0.4992342406730514, + "grad_norm": 0.5831572365350796, + "learning_rate": 0.00017575264484760407, + "loss": 12.4012, + "step": 9168 + }, + { + "epoch": 0.49928869466963444, + "grad_norm": 0.6296968500770712, + "learning_rate": 0.00017574688795470595, + "loss": 12.5436, + "step": 9169 + }, + { + "epoch": 0.49934314866621743, + "grad_norm": 0.6197900937902061, + "learning_rate": 0.00017574113047278734, + "loss": 12.3763, + "step": 9170 + }, + { + "epoch": 0.4993976026628004, + "grad_norm": 0.695940103894005, + "learning_rate": 0.00017573537240189298, + "loss": 12.3472, + "step": 9171 + }, + { + "epoch": 0.49945205665938347, + "grad_norm": 0.6499827974246789, + "learning_rate": 0.00017572961374206768, + "loss": 12.4526, + "step": 9172 + }, + { + "epoch": 0.49950651065596646, + "grad_norm": 0.6880144057586078, + "learning_rate": 0.0001757238544933562, + "loss": 12.4985, + "step": 9173 + }, + { + "epoch": 0.49956096465254946, + "grad_norm": 0.7149621700845834, + "learning_rate": 0.00017571809465580332, + "loss": 12.4272, + "step": 9174 + }, + { + "epoch": 0.4996154186491325, + "grad_norm": 0.6413703173304228, + "learning_rate": 0.00017571233422945386, + "loss": 12.4654, + "step": 9175 + }, + { + "epoch": 0.4996698726457155, + "grad_norm": 0.6444690707512226, + "learning_rate": 0.00017570657321435256, + "loss": 12.3721, + "step": 9176 + }, + { + "epoch": 0.4997243266422985, + "grad_norm": 0.6632077405948433, + "learning_rate": 0.00017570081161054425, + "loss": 12.4597, + "step": 9177 + }, + { + "epoch": 0.49977878063888154, + "grad_norm": 0.6920115258147176, + "learning_rate": 0.00017569504941807376, + "loss": 12.4457, + "step": 9178 + }, + { + "epoch": 0.49983323463546453, + "grad_norm": 0.6958336434558215, + "learning_rate": 0.00017568928663698588, + "loss": 12.4001, + "step": 9179 + }, + { + "epoch": 0.4998876886320475, + "grad_norm": 0.6470874095176246, + "learning_rate": 0.00017568352326732538, + "loss": 12.3732, + "step": 9180 + }, + { + "epoch": 0.49994214262863057, + "grad_norm": 0.6266020668283614, + "learning_rate": 0.00017567775930913714, + "loss": 12.3153, + "step": 9181 + }, + { + "epoch": 0.49999659662521356, + "grad_norm": 0.6116119982186802, + "learning_rate": 0.00017567199476246591, + "loss": 12.4759, + "step": 9182 + }, + { + "epoch": 0.5000510506217966, + "grad_norm": 0.6341883591867439, + "learning_rate": 0.00017566622962735662, + "loss": 12.4495, + "step": 9183 + }, + { + "epoch": 0.5001055046183795, + "grad_norm": 0.5901644660053987, + "learning_rate": 0.00017566046390385398, + "loss": 12.433, + "step": 9184 + }, + { + "epoch": 0.5001599586149627, + "grad_norm": 0.6640999529367435, + "learning_rate": 0.00017565469759200293, + "loss": 12.2458, + "step": 9185 + }, + { + "epoch": 0.5002144126115456, + "grad_norm": 0.6842580247261328, + "learning_rate": 0.00017564893069184825, + "loss": 12.5172, + "step": 9186 + }, + { + "epoch": 0.5002688666081286, + "grad_norm": 0.5644833573755198, + "learning_rate": 0.00017564316320343477, + "loss": 12.3548, + "step": 9187 + }, + { + "epoch": 0.5003233206047116, + "grad_norm": 0.6150274179512815, + "learning_rate": 0.0001756373951268074, + "loss": 12.2932, + "step": 9188 + }, + { + "epoch": 0.5003777746012946, + "grad_norm": 0.5598091040693869, + "learning_rate": 0.00017563162646201094, + "loss": 12.4621, + "step": 9189 + }, + { + "epoch": 0.5004322285978776, + "grad_norm": 0.6553818557013275, + "learning_rate": 0.0001756258572090903, + "loss": 12.5473, + "step": 9190 + }, + { + "epoch": 0.5004866825944607, + "grad_norm": 0.8404702869084849, + "learning_rate": 0.0001756200873680903, + "loss": 12.4445, + "step": 9191 + }, + { + "epoch": 0.5005411365910437, + "grad_norm": 0.5642230661996501, + "learning_rate": 0.0001756143169390558, + "loss": 12.4559, + "step": 9192 + }, + { + "epoch": 0.5005955905876267, + "grad_norm": 0.6603679295883715, + "learning_rate": 0.0001756085459220317, + "loss": 12.4716, + "step": 9193 + }, + { + "epoch": 0.5006500445842097, + "grad_norm": 0.7212196764091794, + "learning_rate": 0.00017560277431706288, + "loss": 12.5586, + "step": 9194 + }, + { + "epoch": 0.5007044985807927, + "grad_norm": 0.6153404274085386, + "learning_rate": 0.0001755970021241942, + "loss": 12.3659, + "step": 9195 + }, + { + "epoch": 0.5007589525773757, + "grad_norm": 0.7446022353173876, + "learning_rate": 0.00017559122934347055, + "loss": 12.5325, + "step": 9196 + }, + { + "epoch": 0.5008134065739588, + "grad_norm": 0.6822957531520627, + "learning_rate": 0.0001755854559749368, + "loss": 12.284, + "step": 9197 + }, + { + "epoch": 0.5008678605705418, + "grad_norm": 0.5839316572008065, + "learning_rate": 0.00017557968201863792, + "loss": 12.4315, + "step": 9198 + }, + { + "epoch": 0.5009223145671248, + "grad_norm": 0.5920598310314543, + "learning_rate": 0.00017557390747461872, + "loss": 12.4369, + "step": 9199 + }, + { + "epoch": 0.5009767685637078, + "grad_norm": 0.5746963240067243, + "learning_rate": 0.0001755681323429242, + "loss": 12.4049, + "step": 9200 + }, + { + "epoch": 0.5010312225602908, + "grad_norm": 0.6387688032969914, + "learning_rate": 0.00017556235662359915, + "loss": 12.3825, + "step": 9201 + }, + { + "epoch": 0.5010856765568737, + "grad_norm": 0.6220273180989577, + "learning_rate": 0.00017555658031668853, + "loss": 12.3686, + "step": 9202 + }, + { + "epoch": 0.5011401305534569, + "grad_norm": 0.5955115588017998, + "learning_rate": 0.0001755508034222373, + "loss": 12.4406, + "step": 9203 + }, + { + "epoch": 0.5011945845500398, + "grad_norm": 0.6381359428691479, + "learning_rate": 0.00017554502594029038, + "loss": 12.3493, + "step": 9204 + }, + { + "epoch": 0.5012490385466228, + "grad_norm": 0.6268485567466849, + "learning_rate": 0.00017553924787089262, + "loss": 12.3149, + "step": 9205 + }, + { + "epoch": 0.5013034925432058, + "grad_norm": 0.5994229081680599, + "learning_rate": 0.000175533469214089, + "loss": 12.3849, + "step": 9206 + }, + { + "epoch": 0.5013579465397888, + "grad_norm": 0.6879692862918647, + "learning_rate": 0.0001755276899699245, + "loss": 12.3399, + "step": 9207 + }, + { + "epoch": 0.5014124005363718, + "grad_norm": 0.6485348858208008, + "learning_rate": 0.00017552191013844398, + "loss": 12.37, + "step": 9208 + }, + { + "epoch": 0.5014668545329549, + "grad_norm": 0.6290445813308777, + "learning_rate": 0.00017551612971969247, + "loss": 12.4224, + "step": 9209 + }, + { + "epoch": 0.5015213085295379, + "grad_norm": 0.7839468838039373, + "learning_rate": 0.0001755103487137148, + "loss": 12.5299, + "step": 9210 + }, + { + "epoch": 0.5015757625261209, + "grad_norm": 0.6271535640482853, + "learning_rate": 0.00017550456712055605, + "loss": 12.3695, + "step": 9211 + }, + { + "epoch": 0.5016302165227039, + "grad_norm": 0.6818818059207238, + "learning_rate": 0.00017549878494026112, + "loss": 12.2653, + "step": 9212 + }, + { + "epoch": 0.5016846705192869, + "grad_norm": 0.7178157128174575, + "learning_rate": 0.00017549300217287495, + "loss": 12.5114, + "step": 9213 + }, + { + "epoch": 0.5017391245158699, + "grad_norm": 0.6376885520060535, + "learning_rate": 0.00017548721881844255, + "loss": 12.3963, + "step": 9214 + }, + { + "epoch": 0.501793578512453, + "grad_norm": 0.7370977772095955, + "learning_rate": 0.0001754814348770089, + "loss": 12.5435, + "step": 9215 + }, + { + "epoch": 0.501848032509036, + "grad_norm": 0.636511787689217, + "learning_rate": 0.00017547565034861892, + "loss": 12.396, + "step": 9216 + }, + { + "epoch": 0.501902486505619, + "grad_norm": 0.8710913375149281, + "learning_rate": 0.00017546986523331762, + "loss": 12.4173, + "step": 9217 + }, + { + "epoch": 0.501956940502202, + "grad_norm": 0.6260169853757912, + "learning_rate": 0.00017546407953115002, + "loss": 12.455, + "step": 9218 + }, + { + "epoch": 0.502011394498785, + "grad_norm": 0.6306583641898258, + "learning_rate": 0.00017545829324216108, + "loss": 12.4029, + "step": 9219 + }, + { + "epoch": 0.5020658484953681, + "grad_norm": 0.8238270270686283, + "learning_rate": 0.0001754525063663958, + "loss": 12.5189, + "step": 9220 + }, + { + "epoch": 0.502120302491951, + "grad_norm": 0.5536911448623411, + "learning_rate": 0.00017544671890389914, + "loss": 12.3506, + "step": 9221 + }, + { + "epoch": 0.502174756488534, + "grad_norm": 0.6744248023901999, + "learning_rate": 0.00017544093085471616, + "loss": 12.4723, + "step": 9222 + }, + { + "epoch": 0.502229210485117, + "grad_norm": 0.6133959900954926, + "learning_rate": 0.00017543514221889187, + "loss": 12.4893, + "step": 9223 + }, + { + "epoch": 0.5022836644817, + "grad_norm": 0.5929697847285679, + "learning_rate": 0.00017542935299647122, + "loss": 12.4983, + "step": 9224 + }, + { + "epoch": 0.502338118478283, + "grad_norm": 0.6482710041361429, + "learning_rate": 0.00017542356318749934, + "loss": 12.287, + "step": 9225 + }, + { + "epoch": 0.5023925724748661, + "grad_norm": 0.6812798436350104, + "learning_rate": 0.0001754177727920211, + "loss": 12.422, + "step": 9226 + }, + { + "epoch": 0.5024470264714491, + "grad_norm": 0.5734535128505137, + "learning_rate": 0.00017541198181008169, + "loss": 12.3762, + "step": 9227 + }, + { + "epoch": 0.5025014804680321, + "grad_norm": 0.7298704517473773, + "learning_rate": 0.000175406190241726, + "loss": 12.3909, + "step": 9228 + }, + { + "epoch": 0.5025559344646151, + "grad_norm": 0.6211270568866112, + "learning_rate": 0.0001754003980869992, + "loss": 12.514, + "step": 9229 + }, + { + "epoch": 0.5026103884611981, + "grad_norm": 0.6538190401861905, + "learning_rate": 0.00017539460534594622, + "loss": 12.4927, + "step": 9230 + }, + { + "epoch": 0.5026648424577811, + "grad_norm": 0.649123256226189, + "learning_rate": 0.00017538881201861214, + "loss": 12.4245, + "step": 9231 + }, + { + "epoch": 0.5027192964543642, + "grad_norm": 0.5442248840080184, + "learning_rate": 0.00017538301810504203, + "loss": 12.3251, + "step": 9232 + }, + { + "epoch": 0.5027737504509472, + "grad_norm": 0.5847750120836829, + "learning_rate": 0.0001753772236052809, + "loss": 12.4215, + "step": 9233 + }, + { + "epoch": 0.5028282044475302, + "grad_norm": 0.6489129014356729, + "learning_rate": 0.00017537142851937386, + "loss": 12.4467, + "step": 9234 + }, + { + "epoch": 0.5028826584441132, + "grad_norm": 0.6464215828254797, + "learning_rate": 0.00017536563284736593, + "loss": 12.357, + "step": 9235 + }, + { + "epoch": 0.5029371124406962, + "grad_norm": 0.5998675678204343, + "learning_rate": 0.00017535983658930224, + "loss": 12.4077, + "step": 9236 + }, + { + "epoch": 0.5029915664372792, + "grad_norm": 0.6711699450059563, + "learning_rate": 0.0001753540397452278, + "loss": 12.4252, + "step": 9237 + }, + { + "epoch": 0.5030460204338623, + "grad_norm": 0.6565266462253379, + "learning_rate": 0.00017534824231518772, + "loss": 12.5112, + "step": 9238 + }, + { + "epoch": 0.5031004744304453, + "grad_norm": 0.7106207839496469, + "learning_rate": 0.00017534244429922704, + "loss": 12.3185, + "step": 9239 + }, + { + "epoch": 0.5031549284270282, + "grad_norm": 0.6746786233771818, + "learning_rate": 0.00017533664569739093, + "loss": 12.4076, + "step": 9240 + }, + { + "epoch": 0.5032093824236112, + "grad_norm": 0.7287877456392946, + "learning_rate": 0.00017533084650972437, + "loss": 12.4355, + "step": 9241 + }, + { + "epoch": 0.5032638364201942, + "grad_norm": 0.7183288979000483, + "learning_rate": 0.00017532504673627256, + "loss": 12.3689, + "step": 9242 + }, + { + "epoch": 0.5033182904167772, + "grad_norm": 0.5913008733418407, + "learning_rate": 0.00017531924637708053, + "loss": 12.3267, + "step": 9243 + }, + { + "epoch": 0.5033727444133603, + "grad_norm": 0.7399657849568171, + "learning_rate": 0.0001753134454321934, + "loss": 12.3304, + "step": 9244 + }, + { + "epoch": 0.5034271984099433, + "grad_norm": 0.6330603467608257, + "learning_rate": 0.00017530764390165634, + "loss": 12.3574, + "step": 9245 + }, + { + "epoch": 0.5034816524065263, + "grad_norm": 0.6249736543216847, + "learning_rate": 0.0001753018417855144, + "loss": 12.4009, + "step": 9246 + }, + { + "epoch": 0.5035361064031093, + "grad_norm": 0.6501824218490548, + "learning_rate": 0.00017529603908381268, + "loss": 12.4118, + "step": 9247 + }, + { + "epoch": 0.5035905603996923, + "grad_norm": 0.5848558170314551, + "learning_rate": 0.00017529023579659634, + "loss": 12.3934, + "step": 9248 + }, + { + "epoch": 0.5036450143962753, + "grad_norm": 0.6547297192921312, + "learning_rate": 0.00017528443192391048, + "loss": 12.4775, + "step": 9249 + }, + { + "epoch": 0.5036994683928584, + "grad_norm": 0.9148181678227723, + "learning_rate": 0.00017527862746580028, + "loss": 12.502, + "step": 9250 + }, + { + "epoch": 0.5037539223894414, + "grad_norm": 0.6406385901042989, + "learning_rate": 0.00017527282242231083, + "loss": 12.4692, + "step": 9251 + }, + { + "epoch": 0.5038083763860244, + "grad_norm": 0.639199761303182, + "learning_rate": 0.0001752670167934873, + "loss": 12.4294, + "step": 9252 + }, + { + "epoch": 0.5038628303826074, + "grad_norm": 0.6108791181730224, + "learning_rate": 0.00017526121057937486, + "loss": 12.566, + "step": 9253 + }, + { + "epoch": 0.5039172843791904, + "grad_norm": 0.6462775981127912, + "learning_rate": 0.00017525540378001856, + "loss": 12.4095, + "step": 9254 + }, + { + "epoch": 0.5039717383757735, + "grad_norm": 0.6117529957039444, + "learning_rate": 0.00017524959639546368, + "loss": 12.3417, + "step": 9255 + }, + { + "epoch": 0.5040261923723565, + "grad_norm": 0.6653108100707444, + "learning_rate": 0.0001752437884257553, + "loss": 12.5033, + "step": 9256 + }, + { + "epoch": 0.5040806463689395, + "grad_norm": 0.5761033208886187, + "learning_rate": 0.00017523797987093858, + "loss": 12.3672, + "step": 9257 + }, + { + "epoch": 0.5041351003655224, + "grad_norm": 0.6018386004957551, + "learning_rate": 0.00017523217073105873, + "loss": 12.2592, + "step": 9258 + }, + { + "epoch": 0.5041895543621054, + "grad_norm": 0.6977867384115372, + "learning_rate": 0.0001752263610061609, + "loss": 12.5126, + "step": 9259 + }, + { + "epoch": 0.5042440083586884, + "grad_norm": 0.6070018997053899, + "learning_rate": 0.00017522055069629027, + "loss": 12.4393, + "step": 9260 + }, + { + "epoch": 0.5042984623552715, + "grad_norm": 0.6458477465628019, + "learning_rate": 0.00017521473980149202, + "loss": 12.4836, + "step": 9261 + }, + { + "epoch": 0.5043529163518545, + "grad_norm": 0.5913666593119621, + "learning_rate": 0.00017520892832181134, + "loss": 12.3034, + "step": 9262 + }, + { + "epoch": 0.5044073703484375, + "grad_norm": 0.6387974436651321, + "learning_rate": 0.00017520311625729345, + "loss": 12.4383, + "step": 9263 + }, + { + "epoch": 0.5044618243450205, + "grad_norm": 0.6663629819407951, + "learning_rate": 0.00017519730360798346, + "loss": 12.3579, + "step": 9264 + }, + { + "epoch": 0.5045162783416035, + "grad_norm": 0.6743783218918941, + "learning_rate": 0.00017519149037392668, + "loss": 12.3556, + "step": 9265 + }, + { + "epoch": 0.5045707323381865, + "grad_norm": 0.5934684892792085, + "learning_rate": 0.0001751856765551682, + "loss": 12.6217, + "step": 9266 + }, + { + "epoch": 0.5046251863347696, + "grad_norm": 0.6542403394106991, + "learning_rate": 0.00017517986215175336, + "loss": 12.4348, + "step": 9267 + }, + { + "epoch": 0.5046796403313526, + "grad_norm": 0.6140821063343879, + "learning_rate": 0.00017517404716372727, + "loss": 12.2976, + "step": 9268 + }, + { + "epoch": 0.5047340943279356, + "grad_norm": 0.6302783978472125, + "learning_rate": 0.0001751682315911352, + "loss": 12.3735, + "step": 9269 + }, + { + "epoch": 0.5047885483245186, + "grad_norm": 0.617713894873678, + "learning_rate": 0.00017516241543402233, + "loss": 12.4546, + "step": 9270 + }, + { + "epoch": 0.5048430023211016, + "grad_norm": 0.6316766316871215, + "learning_rate": 0.00017515659869243391, + "loss": 12.3908, + "step": 9271 + }, + { + "epoch": 0.5048974563176846, + "grad_norm": 0.6274417258446252, + "learning_rate": 0.0001751507813664152, + "loss": 12.4156, + "step": 9272 + }, + { + "epoch": 0.5049519103142677, + "grad_norm": 0.6180856210866346, + "learning_rate": 0.00017514496345601144, + "loss": 12.4906, + "step": 9273 + }, + { + "epoch": 0.5050063643108507, + "grad_norm": 0.6618825336368301, + "learning_rate": 0.00017513914496126778, + "loss": 12.4492, + "step": 9274 + }, + { + "epoch": 0.5050608183074337, + "grad_norm": 0.6346775520196072, + "learning_rate": 0.00017513332588222955, + "loss": 12.4255, + "step": 9275 + }, + { + "epoch": 0.5051152723040166, + "grad_norm": 0.5713866022890768, + "learning_rate": 0.00017512750621894197, + "loss": 12.3917, + "step": 9276 + }, + { + "epoch": 0.5051697263005996, + "grad_norm": 0.6644568918871256, + "learning_rate": 0.0001751216859714503, + "loss": 12.5056, + "step": 9277 + }, + { + "epoch": 0.5052241802971826, + "grad_norm": 0.6559421524036552, + "learning_rate": 0.00017511586513979985, + "loss": 12.4277, + "step": 9278 + }, + { + "epoch": 0.5052786342937657, + "grad_norm": 0.5870845703090499, + "learning_rate": 0.00017511004372403578, + "loss": 12.5096, + "step": 9279 + }, + { + "epoch": 0.5053330882903487, + "grad_norm": 0.706795464340835, + "learning_rate": 0.00017510422172420343, + "loss": 12.3108, + "step": 9280 + }, + { + "epoch": 0.5053875422869317, + "grad_norm": 0.5700640206283255, + "learning_rate": 0.00017509839914034808, + "loss": 12.2902, + "step": 9281 + }, + { + "epoch": 0.5054419962835147, + "grad_norm": 0.6128756126723002, + "learning_rate": 0.00017509257597251497, + "loss": 12.4242, + "step": 9282 + }, + { + "epoch": 0.5054964502800977, + "grad_norm": 0.6653571480654208, + "learning_rate": 0.00017508675222074936, + "loss": 12.4248, + "step": 9283 + }, + { + "epoch": 0.5055509042766807, + "grad_norm": 0.6167569171939026, + "learning_rate": 0.00017508092788509662, + "loss": 12.4268, + "step": 9284 + }, + { + "epoch": 0.5056053582732638, + "grad_norm": 0.6094727507670129, + "learning_rate": 0.00017507510296560196, + "loss": 12.4102, + "step": 9285 + }, + { + "epoch": 0.5056598122698468, + "grad_norm": 0.6261696978568257, + "learning_rate": 0.00017506927746231075, + "loss": 12.3727, + "step": 9286 + }, + { + "epoch": 0.5057142662664298, + "grad_norm": 0.5853889054617658, + "learning_rate": 0.00017506345137526823, + "loss": 12.2828, + "step": 9287 + }, + { + "epoch": 0.5057687202630128, + "grad_norm": 0.6989389592903248, + "learning_rate": 0.00017505762470451972, + "loss": 12.5096, + "step": 9288 + }, + { + "epoch": 0.5058231742595958, + "grad_norm": 0.8410187354062965, + "learning_rate": 0.00017505179745011055, + "loss": 12.4025, + "step": 9289 + }, + { + "epoch": 0.5058776282561789, + "grad_norm": 0.6356104259521266, + "learning_rate": 0.000175045969612086, + "loss": 12.4196, + "step": 9290 + }, + { + "epoch": 0.5059320822527619, + "grad_norm": 0.8491671153494073, + "learning_rate": 0.0001750401411904914, + "loss": 12.3207, + "step": 9291 + }, + { + "epoch": 0.5059865362493449, + "grad_norm": 0.6281540370395706, + "learning_rate": 0.00017503431218537205, + "loss": 12.4043, + "step": 9292 + }, + { + "epoch": 0.5060409902459279, + "grad_norm": 0.7105148750537855, + "learning_rate": 0.00017502848259677337, + "loss": 12.4283, + "step": 9293 + }, + { + "epoch": 0.5060954442425109, + "grad_norm": 0.7066195639781923, + "learning_rate": 0.00017502265242474058, + "loss": 12.4412, + "step": 9294 + }, + { + "epoch": 0.5061498982390938, + "grad_norm": 0.7110921055568227, + "learning_rate": 0.0001750168216693191, + "loss": 12.4691, + "step": 9295 + }, + { + "epoch": 0.506204352235677, + "grad_norm": 0.6143960464742542, + "learning_rate": 0.00017501099033055418, + "loss": 12.4342, + "step": 9296 + }, + { + "epoch": 0.5062588062322599, + "grad_norm": 0.6211083975799827, + "learning_rate": 0.00017500515840849126, + "loss": 12.444, + "step": 9297 + }, + { + "epoch": 0.5063132602288429, + "grad_norm": 0.6150695172377717, + "learning_rate": 0.00017499932590317566, + "loss": 12.4752, + "step": 9298 + }, + { + "epoch": 0.5063677142254259, + "grad_norm": 0.5526792990362289, + "learning_rate": 0.0001749934928146527, + "loss": 12.4155, + "step": 9299 + }, + { + "epoch": 0.5064221682220089, + "grad_norm": 0.6248669052704635, + "learning_rate": 0.00017498765914296778, + "loss": 12.4054, + "step": 9300 + }, + { + "epoch": 0.5064766222185919, + "grad_norm": 0.5910774952254157, + "learning_rate": 0.00017498182488816623, + "loss": 12.3861, + "step": 9301 + }, + { + "epoch": 0.506531076215175, + "grad_norm": 0.5865642470842327, + "learning_rate": 0.00017497599005029344, + "loss": 12.2948, + "step": 9302 + }, + { + "epoch": 0.506585530211758, + "grad_norm": 0.6118141150631882, + "learning_rate": 0.00017497015462939478, + "loss": 12.4216, + "step": 9303 + }, + { + "epoch": 0.506639984208341, + "grad_norm": 0.6419707275290791, + "learning_rate": 0.00017496431862551562, + "loss": 12.1743, + "step": 9304 + }, + { + "epoch": 0.506694438204924, + "grad_norm": 0.6011774662281038, + "learning_rate": 0.00017495848203870134, + "loss": 12.3669, + "step": 9305 + }, + { + "epoch": 0.506748892201507, + "grad_norm": 0.65660842217026, + "learning_rate": 0.00017495264486899737, + "loss": 12.3027, + "step": 9306 + }, + { + "epoch": 0.50680334619809, + "grad_norm": 0.6817634176980636, + "learning_rate": 0.00017494680711644902, + "loss": 12.4326, + "step": 9307 + }, + { + "epoch": 0.5068578001946731, + "grad_norm": 0.561583501971307, + "learning_rate": 0.00017494096878110176, + "loss": 12.4116, + "step": 9308 + }, + { + "epoch": 0.5069122541912561, + "grad_norm": 0.5435703750824127, + "learning_rate": 0.00017493512986300095, + "loss": 12.3742, + "step": 9309 + }, + { + "epoch": 0.5069667081878391, + "grad_norm": 0.6022867408072567, + "learning_rate": 0.00017492929036219202, + "loss": 12.3662, + "step": 9310 + }, + { + "epoch": 0.5070211621844221, + "grad_norm": 0.7724953334573791, + "learning_rate": 0.00017492345027872035, + "loss": 12.5138, + "step": 9311 + }, + { + "epoch": 0.507075616181005, + "grad_norm": 0.6585920328470743, + "learning_rate": 0.00017491760961263136, + "loss": 12.3403, + "step": 9312 + }, + { + "epoch": 0.507130070177588, + "grad_norm": 0.6466273952386169, + "learning_rate": 0.00017491176836397046, + "loss": 12.4771, + "step": 9313 + }, + { + "epoch": 0.5071845241741711, + "grad_norm": 0.6009781768170371, + "learning_rate": 0.0001749059265327831, + "loss": 12.4441, + "step": 9314 + }, + { + "epoch": 0.5072389781707541, + "grad_norm": 0.6840403280864805, + "learning_rate": 0.0001749000841191147, + "loss": 12.5198, + "step": 9315 + }, + { + "epoch": 0.5072934321673371, + "grad_norm": 0.6337536226529283, + "learning_rate": 0.00017489424112301067, + "loss": 12.4631, + "step": 9316 + }, + { + "epoch": 0.5073478861639201, + "grad_norm": 0.6630045328127501, + "learning_rate": 0.00017488839754451648, + "loss": 12.4571, + "step": 9317 + }, + { + "epoch": 0.5074023401605031, + "grad_norm": 0.674610953829062, + "learning_rate": 0.00017488255338367754, + "loss": 12.3755, + "step": 9318 + }, + { + "epoch": 0.5074567941570862, + "grad_norm": 0.7346706962784108, + "learning_rate": 0.0001748767086405393, + "loss": 12.4546, + "step": 9319 + }, + { + "epoch": 0.5075112481536692, + "grad_norm": 0.6197681874525446, + "learning_rate": 0.00017487086331514725, + "loss": 12.3235, + "step": 9320 + }, + { + "epoch": 0.5075657021502522, + "grad_norm": 0.5401263427026369, + "learning_rate": 0.00017486501740754677, + "loss": 12.327, + "step": 9321 + }, + { + "epoch": 0.5076201561468352, + "grad_norm": 0.6569790738085491, + "learning_rate": 0.00017485917091778337, + "loss": 12.2445, + "step": 9322 + }, + { + "epoch": 0.5076746101434182, + "grad_norm": 0.6418331149863142, + "learning_rate": 0.00017485332384590253, + "loss": 12.3183, + "step": 9323 + }, + { + "epoch": 0.5077290641400012, + "grad_norm": 0.6243981903237095, + "learning_rate": 0.00017484747619194964, + "loss": 12.3514, + "step": 9324 + }, + { + "epoch": 0.5077835181365843, + "grad_norm": 0.714394137512393, + "learning_rate": 0.00017484162795597028, + "loss": 12.4041, + "step": 9325 + }, + { + "epoch": 0.5078379721331673, + "grad_norm": 0.6336579308981977, + "learning_rate": 0.00017483577913800984, + "loss": 12.4572, + "step": 9326 + }, + { + "epoch": 0.5078924261297503, + "grad_norm": 0.648306200933378, + "learning_rate": 0.00017482992973811382, + "loss": 12.4322, + "step": 9327 + }, + { + "epoch": 0.5079468801263333, + "grad_norm": 0.6449632020500176, + "learning_rate": 0.00017482407975632775, + "loss": 12.2739, + "step": 9328 + }, + { + "epoch": 0.5080013341229163, + "grad_norm": 0.560273711009082, + "learning_rate": 0.00017481822919269705, + "loss": 12.3498, + "step": 9329 + }, + { + "epoch": 0.5080557881194993, + "grad_norm": 0.7294627898226387, + "learning_rate": 0.00017481237804726728, + "loss": 12.3475, + "step": 9330 + }, + { + "epoch": 0.5081102421160824, + "grad_norm": 0.6666923064781625, + "learning_rate": 0.00017480652632008393, + "loss": 12.3618, + "step": 9331 + }, + { + "epoch": 0.5081646961126653, + "grad_norm": 0.611908423405081, + "learning_rate": 0.00017480067401119245, + "loss": 12.3883, + "step": 9332 + }, + { + "epoch": 0.5082191501092483, + "grad_norm": 0.5789056783313532, + "learning_rate": 0.0001747948211206384, + "loss": 12.3337, + "step": 9333 + }, + { + "epoch": 0.5082736041058313, + "grad_norm": 0.6528298517402605, + "learning_rate": 0.00017478896764846726, + "loss": 12.3418, + "step": 9334 + }, + { + "epoch": 0.5083280581024143, + "grad_norm": 0.586154787199463, + "learning_rate": 0.00017478311359472458, + "loss": 12.346, + "step": 9335 + }, + { + "epoch": 0.5083825120989973, + "grad_norm": 0.7375603848486024, + "learning_rate": 0.00017477725895945589, + "loss": 12.4789, + "step": 9336 + }, + { + "epoch": 0.5084369660955804, + "grad_norm": 0.5622036931182314, + "learning_rate": 0.00017477140374270666, + "loss": 12.3521, + "step": 9337 + }, + { + "epoch": 0.5084914200921634, + "grad_norm": 0.7094470049438123, + "learning_rate": 0.00017476554794452248, + "loss": 12.3785, + "step": 9338 + }, + { + "epoch": 0.5085458740887464, + "grad_norm": 0.5535802003127333, + "learning_rate": 0.00017475969156494883, + "loss": 12.3477, + "step": 9339 + }, + { + "epoch": 0.5086003280853294, + "grad_norm": 0.6060587087426393, + "learning_rate": 0.0001747538346040313, + "loss": 12.4283, + "step": 9340 + }, + { + "epoch": 0.5086547820819124, + "grad_norm": 0.6486830014576412, + "learning_rate": 0.00017474797706181546, + "loss": 12.5064, + "step": 9341 + }, + { + "epoch": 0.5087092360784954, + "grad_norm": 0.650171933766708, + "learning_rate": 0.00017474211893834672, + "loss": 12.3925, + "step": 9342 + }, + { + "epoch": 0.5087636900750785, + "grad_norm": 0.6178982347558384, + "learning_rate": 0.0001747362602336708, + "loss": 12.4251, + "step": 9343 + }, + { + "epoch": 0.5088181440716615, + "grad_norm": 0.6098889569054531, + "learning_rate": 0.00017473040094783318, + "loss": 12.4814, + "step": 9344 + }, + { + "epoch": 0.5088725980682445, + "grad_norm": 0.6397229844431023, + "learning_rate": 0.00017472454108087942, + "loss": 12.5545, + "step": 9345 + }, + { + "epoch": 0.5089270520648275, + "grad_norm": 0.662132294090361, + "learning_rate": 0.0001747186806328551, + "loss": 12.5107, + "step": 9346 + }, + { + "epoch": 0.5089815060614105, + "grad_norm": 0.7496901176361102, + "learning_rate": 0.00017471281960380577, + "loss": 12.4908, + "step": 9347 + }, + { + "epoch": 0.5090359600579935, + "grad_norm": 0.5732062504922385, + "learning_rate": 0.00017470695799377703, + "loss": 12.3264, + "step": 9348 + }, + { + "epoch": 0.5090904140545766, + "grad_norm": 0.6151031159507464, + "learning_rate": 0.00017470109580281447, + "loss": 12.304, + "step": 9349 + }, + { + "epoch": 0.5091448680511595, + "grad_norm": 0.6959356182660499, + "learning_rate": 0.00017469523303096363, + "loss": 12.3683, + "step": 9350 + }, + { + "epoch": 0.5091993220477425, + "grad_norm": 0.6226389570908005, + "learning_rate": 0.00017468936967827018, + "loss": 12.4791, + "step": 9351 + }, + { + "epoch": 0.5092537760443255, + "grad_norm": 0.6946591122987917, + "learning_rate": 0.00017468350574477965, + "loss": 12.3415, + "step": 9352 + }, + { + "epoch": 0.5093082300409085, + "grad_norm": 0.565289908611492, + "learning_rate": 0.00017467764123053764, + "loss": 12.3893, + "step": 9353 + }, + { + "epoch": 0.5093626840374916, + "grad_norm": 0.589008060870395, + "learning_rate": 0.0001746717761355898, + "loss": 12.3961, + "step": 9354 + }, + { + "epoch": 0.5094171380340746, + "grad_norm": 0.6423667999843898, + "learning_rate": 0.00017466591045998167, + "loss": 12.2483, + "step": 9355 + }, + { + "epoch": 0.5094715920306576, + "grad_norm": 0.5878197206271527, + "learning_rate": 0.0001746600442037589, + "loss": 12.2345, + "step": 9356 + }, + { + "epoch": 0.5095260460272406, + "grad_norm": 0.611952765287439, + "learning_rate": 0.0001746541773669671, + "loss": 12.3193, + "step": 9357 + }, + { + "epoch": 0.5095805000238236, + "grad_norm": 0.7025299355851088, + "learning_rate": 0.00017464830994965195, + "loss": 12.6056, + "step": 9358 + }, + { + "epoch": 0.5096349540204066, + "grad_norm": 0.5844492524972718, + "learning_rate": 0.000174642441951859, + "loss": 12.3313, + "step": 9359 + }, + { + "epoch": 0.5096894080169897, + "grad_norm": 0.5870217567121213, + "learning_rate": 0.00017463657337363388, + "loss": 12.2468, + "step": 9360 + }, + { + "epoch": 0.5097438620135727, + "grad_norm": 0.544716100069704, + "learning_rate": 0.00017463070421502226, + "loss": 12.4479, + "step": 9361 + }, + { + "epoch": 0.5097983160101557, + "grad_norm": 0.709392125597355, + "learning_rate": 0.00017462483447606977, + "loss": 12.5405, + "step": 9362 + }, + { + "epoch": 0.5098527700067387, + "grad_norm": 0.6876129323729496, + "learning_rate": 0.00017461896415682206, + "loss": 12.6349, + "step": 9363 + }, + { + "epoch": 0.5099072240033217, + "grad_norm": 0.5646918644140049, + "learning_rate": 0.00017461309325732474, + "loss": 12.4339, + "step": 9364 + }, + { + "epoch": 0.5099616779999047, + "grad_norm": 0.6483491971824025, + "learning_rate": 0.00017460722177762352, + "loss": 12.53, + "step": 9365 + }, + { + "epoch": 0.5100161319964878, + "grad_norm": 0.5661502815396339, + "learning_rate": 0.00017460134971776406, + "loss": 12.3638, + "step": 9366 + }, + { + "epoch": 0.5100705859930708, + "grad_norm": 0.6473770934053301, + "learning_rate": 0.00017459547707779195, + "loss": 12.4482, + "step": 9367 + }, + { + "epoch": 0.5101250399896538, + "grad_norm": 0.5870198823363907, + "learning_rate": 0.00017458960385775292, + "loss": 12.3818, + "step": 9368 + }, + { + "epoch": 0.5101794939862367, + "grad_norm": 0.622478533573811, + "learning_rate": 0.0001745837300576926, + "loss": 12.3057, + "step": 9369 + }, + { + "epoch": 0.5102339479828197, + "grad_norm": 0.6348645706981006, + "learning_rate": 0.00017457785567765672, + "loss": 12.4243, + "step": 9370 + }, + { + "epoch": 0.5102884019794027, + "grad_norm": 0.5427899853169943, + "learning_rate": 0.00017457198071769091, + "loss": 12.4867, + "step": 9371 + }, + { + "epoch": 0.5103428559759858, + "grad_norm": 0.9573474443809805, + "learning_rate": 0.00017456610517784088, + "loss": 12.3764, + "step": 9372 + }, + { + "epoch": 0.5103973099725688, + "grad_norm": 0.6348206524178484, + "learning_rate": 0.00017456022905815228, + "loss": 12.3435, + "step": 9373 + }, + { + "epoch": 0.5104517639691518, + "grad_norm": 0.6032852796079554, + "learning_rate": 0.00017455435235867088, + "loss": 12.5172, + "step": 9374 + }, + { + "epoch": 0.5105062179657348, + "grad_norm": 0.6296409528090846, + "learning_rate": 0.0001745484750794423, + "loss": 12.3565, + "step": 9375 + }, + { + "epoch": 0.5105606719623178, + "grad_norm": 0.5653690358017915, + "learning_rate": 0.0001745425972205123, + "loss": 12.2817, + "step": 9376 + }, + { + "epoch": 0.5106151259589008, + "grad_norm": 0.600763765793065, + "learning_rate": 0.00017453671878192654, + "loss": 12.3462, + "step": 9377 + }, + { + "epoch": 0.5106695799554839, + "grad_norm": 0.5921711828910851, + "learning_rate": 0.00017453083976373077, + "loss": 12.4408, + "step": 9378 + }, + { + "epoch": 0.5107240339520669, + "grad_norm": 0.653706867357765, + "learning_rate": 0.0001745249601659707, + "loss": 12.3856, + "step": 9379 + }, + { + "epoch": 0.5107784879486499, + "grad_norm": 0.6498084542557725, + "learning_rate": 0.00017451907998869204, + "loss": 12.5023, + "step": 9380 + }, + { + "epoch": 0.5108329419452329, + "grad_norm": 0.6621391340956829, + "learning_rate": 0.00017451319923194046, + "loss": 12.4081, + "step": 9381 + }, + { + "epoch": 0.5108873959418159, + "grad_norm": 0.5955740821428355, + "learning_rate": 0.0001745073178957618, + "loss": 12.4427, + "step": 9382 + }, + { + "epoch": 0.5109418499383989, + "grad_norm": 0.629955623693329, + "learning_rate": 0.00017450143598020173, + "loss": 12.4182, + "step": 9383 + }, + { + "epoch": 0.510996303934982, + "grad_norm": 0.620450415710074, + "learning_rate": 0.00017449555348530602, + "loss": 12.4561, + "step": 9384 + }, + { + "epoch": 0.511050757931565, + "grad_norm": 0.6694978927482998, + "learning_rate": 0.0001744896704111204, + "loss": 12.3695, + "step": 9385 + }, + { + "epoch": 0.511105211928148, + "grad_norm": 0.5680924165840874, + "learning_rate": 0.0001744837867576906, + "loss": 12.3348, + "step": 9386 + }, + { + "epoch": 0.5111596659247309, + "grad_norm": 0.6566945684404797, + "learning_rate": 0.00017447790252506237, + "loss": 12.3644, + "step": 9387 + }, + { + "epoch": 0.5112141199213139, + "grad_norm": 0.6437455314024562, + "learning_rate": 0.00017447201771328148, + "loss": 12.4965, + "step": 9388 + }, + { + "epoch": 0.511268573917897, + "grad_norm": 0.5732268946297673, + "learning_rate": 0.00017446613232239368, + "loss": 12.4202, + "step": 9389 + }, + { + "epoch": 0.51132302791448, + "grad_norm": 0.5722288630732446, + "learning_rate": 0.00017446024635244474, + "loss": 12.3932, + "step": 9390 + }, + { + "epoch": 0.511377481911063, + "grad_norm": 0.5989469527624901, + "learning_rate": 0.00017445435980348046, + "loss": 12.4469, + "step": 9391 + }, + { + "epoch": 0.511431935907646, + "grad_norm": 0.5669576255239827, + "learning_rate": 0.0001744484726755466, + "loss": 12.2855, + "step": 9392 + }, + { + "epoch": 0.511486389904229, + "grad_norm": 0.6190733787516781, + "learning_rate": 0.00017444258496868896, + "loss": 12.5061, + "step": 9393 + }, + { + "epoch": 0.511540843900812, + "grad_norm": 0.5913540005436615, + "learning_rate": 0.00017443669668295322, + "loss": 12.4111, + "step": 9394 + }, + { + "epoch": 0.5115952978973951, + "grad_norm": 0.6273924044684714, + "learning_rate": 0.0001744308078183853, + "loss": 12.4899, + "step": 9395 + }, + { + "epoch": 0.5116497518939781, + "grad_norm": 0.7100710497105639, + "learning_rate": 0.00017442491837503093, + "loss": 12.4267, + "step": 9396 + }, + { + "epoch": 0.5117042058905611, + "grad_norm": 0.6071059916931937, + "learning_rate": 0.0001744190283529359, + "loss": 12.342, + "step": 9397 + }, + { + "epoch": 0.5117586598871441, + "grad_norm": 0.6198389253712389, + "learning_rate": 0.000174413137752146, + "loss": 12.345, + "step": 9398 + }, + { + "epoch": 0.5118131138837271, + "grad_norm": 0.6068728098719426, + "learning_rate": 0.0001744072465727071, + "loss": 12.3248, + "step": 9399 + }, + { + "epoch": 0.5118675678803101, + "grad_norm": 0.5786485279810574, + "learning_rate": 0.00017440135481466495, + "loss": 12.3396, + "step": 9400 + }, + { + "epoch": 0.5119220218768932, + "grad_norm": 0.6021258048098241, + "learning_rate": 0.0001743954624780654, + "loss": 12.41, + "step": 9401 + }, + { + "epoch": 0.5119764758734762, + "grad_norm": 0.5835711794474728, + "learning_rate": 0.00017438956956295428, + "loss": 12.3808, + "step": 9402 + }, + { + "epoch": 0.5120309298700592, + "grad_norm": 0.6170318593463341, + "learning_rate": 0.00017438367606937733, + "loss": 12.4245, + "step": 9403 + }, + { + "epoch": 0.5120853838666422, + "grad_norm": 0.6225798029500026, + "learning_rate": 0.0001743777819973805, + "loss": 12.2819, + "step": 9404 + }, + { + "epoch": 0.5121398378632251, + "grad_norm": 0.6924692025557676, + "learning_rate": 0.00017437188734700952, + "loss": 12.3891, + "step": 9405 + }, + { + "epoch": 0.5121942918598081, + "grad_norm": 0.6360638878142899, + "learning_rate": 0.00017436599211831033, + "loss": 12.5471, + "step": 9406 + }, + { + "epoch": 0.5122487458563912, + "grad_norm": 0.6901262928698174, + "learning_rate": 0.00017436009631132865, + "loss": 12.4722, + "step": 9407 + }, + { + "epoch": 0.5123031998529742, + "grad_norm": 0.5930470801184227, + "learning_rate": 0.0001743541999261104, + "loss": 12.3287, + "step": 9408 + }, + { + "epoch": 0.5123576538495572, + "grad_norm": 0.6015812554557252, + "learning_rate": 0.00017434830296270145, + "loss": 12.4187, + "step": 9409 + }, + { + "epoch": 0.5124121078461402, + "grad_norm": 0.6768925144012764, + "learning_rate": 0.00017434240542114763, + "loss": 12.3967, + "step": 9410 + }, + { + "epoch": 0.5124665618427232, + "grad_norm": 0.5927749895779888, + "learning_rate": 0.0001743365073014948, + "loss": 12.329, + "step": 9411 + }, + { + "epoch": 0.5125210158393062, + "grad_norm": 0.6560057717129133, + "learning_rate": 0.0001743306086037888, + "loss": 12.4162, + "step": 9412 + }, + { + "epoch": 0.5125754698358893, + "grad_norm": 0.5679942053866345, + "learning_rate": 0.00017432470932807553, + "loss": 12.3746, + "step": 9413 + }, + { + "epoch": 0.5126299238324723, + "grad_norm": 0.5901890825124537, + "learning_rate": 0.00017431880947440086, + "loss": 12.3553, + "step": 9414 + }, + { + "epoch": 0.5126843778290553, + "grad_norm": 0.611893604121528, + "learning_rate": 0.00017431290904281068, + "loss": 12.4703, + "step": 9415 + }, + { + "epoch": 0.5127388318256383, + "grad_norm": 0.7235144795413267, + "learning_rate": 0.00017430700803335082, + "loss": 12.3188, + "step": 9416 + }, + { + "epoch": 0.5127932858222213, + "grad_norm": 0.5831544036131251, + "learning_rate": 0.00017430110644606726, + "loss": 12.4885, + "step": 9417 + }, + { + "epoch": 0.5128477398188043, + "grad_norm": 0.6350100789777187, + "learning_rate": 0.00017429520428100578, + "loss": 12.289, + "step": 9418 + }, + { + "epoch": 0.5129021938153874, + "grad_norm": 0.6235061151953825, + "learning_rate": 0.00017428930153821237, + "loss": 12.3618, + "step": 9419 + }, + { + "epoch": 0.5129566478119704, + "grad_norm": 0.73164702015834, + "learning_rate": 0.00017428339821773286, + "loss": 12.4974, + "step": 9420 + }, + { + "epoch": 0.5130111018085534, + "grad_norm": 0.5782408752389678, + "learning_rate": 0.0001742774943196132, + "loss": 12.2829, + "step": 9421 + }, + { + "epoch": 0.5130655558051364, + "grad_norm": 0.6331879886470847, + "learning_rate": 0.0001742715898438993, + "loss": 12.3868, + "step": 9422 + }, + { + "epoch": 0.5131200098017193, + "grad_norm": 0.667002434791157, + "learning_rate": 0.00017426568479063706, + "loss": 12.2425, + "step": 9423 + }, + { + "epoch": 0.5131744637983024, + "grad_norm": 0.7599167814718416, + "learning_rate": 0.0001742597791598724, + "loss": 12.2946, + "step": 9424 + }, + { + "epoch": 0.5132289177948854, + "grad_norm": 0.6900616178179273, + "learning_rate": 0.00017425387295165123, + "loss": 12.6106, + "step": 9425 + }, + { + "epoch": 0.5132833717914684, + "grad_norm": 0.629354602395147, + "learning_rate": 0.00017424796616601952, + "loss": 12.4014, + "step": 9426 + }, + { + "epoch": 0.5133378257880514, + "grad_norm": 0.6277706451765541, + "learning_rate": 0.00017424205880302312, + "loss": 12.4716, + "step": 9427 + }, + { + "epoch": 0.5133922797846344, + "grad_norm": 0.7167530697844063, + "learning_rate": 0.00017423615086270807, + "loss": 12.4673, + "step": 9428 + }, + { + "epoch": 0.5134467337812174, + "grad_norm": 0.5933066126575434, + "learning_rate": 0.00017423024234512025, + "loss": 12.3945, + "step": 9429 + }, + { + "epoch": 0.5135011877778005, + "grad_norm": 0.6372901686720958, + "learning_rate": 0.0001742243332503056, + "loss": 12.522, + "step": 9430 + }, + { + "epoch": 0.5135556417743835, + "grad_norm": 0.6803090489789331, + "learning_rate": 0.0001742184235783101, + "loss": 12.2965, + "step": 9431 + }, + { + "epoch": 0.5136100957709665, + "grad_norm": 0.6042308174334214, + "learning_rate": 0.0001742125133291797, + "loss": 12.2807, + "step": 9432 + }, + { + "epoch": 0.5136645497675495, + "grad_norm": 0.6633908205859076, + "learning_rate": 0.00017420660250296033, + "loss": 12.5998, + "step": 9433 + }, + { + "epoch": 0.5137190037641325, + "grad_norm": 0.6149312761371097, + "learning_rate": 0.000174200691099698, + "loss": 12.4411, + "step": 9434 + }, + { + "epoch": 0.5137734577607155, + "grad_norm": 0.5980568927740617, + "learning_rate": 0.00017419477911943864, + "loss": 12.2741, + "step": 9435 + }, + { + "epoch": 0.5138279117572986, + "grad_norm": 0.7464389865428256, + "learning_rate": 0.0001741888665622282, + "loss": 12.4924, + "step": 9436 + }, + { + "epoch": 0.5138823657538816, + "grad_norm": 0.6250246789630683, + "learning_rate": 0.00017418295342811274, + "loss": 12.4717, + "step": 9437 + }, + { + "epoch": 0.5139368197504646, + "grad_norm": 0.7034476056047101, + "learning_rate": 0.00017417703971713818, + "loss": 12.499, + "step": 9438 + }, + { + "epoch": 0.5139912737470476, + "grad_norm": 0.6775469200024059, + "learning_rate": 0.00017417112542935054, + "loss": 12.5894, + "step": 9439 + }, + { + "epoch": 0.5140457277436306, + "grad_norm": 0.6495361024701459, + "learning_rate": 0.00017416521056479577, + "loss": 12.4821, + "step": 9440 + }, + { + "epoch": 0.5141001817402135, + "grad_norm": 0.6612506125438407, + "learning_rate": 0.0001741592951235199, + "loss": 12.4193, + "step": 9441 + }, + { + "epoch": 0.5141546357367967, + "grad_norm": 0.5411831783413918, + "learning_rate": 0.00017415337910556888, + "loss": 12.2917, + "step": 9442 + }, + { + "epoch": 0.5142090897333796, + "grad_norm": 0.6008695783759355, + "learning_rate": 0.00017414746251098878, + "loss": 12.2273, + "step": 9443 + }, + { + "epoch": 0.5142635437299626, + "grad_norm": 0.6741696018952588, + "learning_rate": 0.00017414154533982554, + "loss": 12.3734, + "step": 9444 + }, + { + "epoch": 0.5143179977265456, + "grad_norm": 0.5877583006812053, + "learning_rate": 0.00017413562759212525, + "loss": 12.2825, + "step": 9445 + }, + { + "epoch": 0.5143724517231286, + "grad_norm": 0.6357369551078746, + "learning_rate": 0.00017412970926793388, + "loss": 12.4142, + "step": 9446 + }, + { + "epoch": 0.5144269057197116, + "grad_norm": 0.6243208273579955, + "learning_rate": 0.00017412379036729745, + "loss": 12.2237, + "step": 9447 + }, + { + "epoch": 0.5144813597162947, + "grad_norm": 0.6491307238798837, + "learning_rate": 0.00017411787089026198, + "loss": 12.3881, + "step": 9448 + }, + { + "epoch": 0.5145358137128777, + "grad_norm": 0.582036403334767, + "learning_rate": 0.00017411195083687355, + "loss": 12.3083, + "step": 9449 + }, + { + "epoch": 0.5145902677094607, + "grad_norm": 0.6185441686619526, + "learning_rate": 0.00017410603020717814, + "loss": 12.4296, + "step": 9450 + }, + { + "epoch": 0.5146447217060437, + "grad_norm": 0.557063880310617, + "learning_rate": 0.00017410010900122182, + "loss": 12.3116, + "step": 9451 + }, + { + "epoch": 0.5146991757026267, + "grad_norm": 0.6471912907317706, + "learning_rate": 0.0001740941872190506, + "loss": 12.4337, + "step": 9452 + }, + { + "epoch": 0.5147536296992098, + "grad_norm": 0.7343020253109421, + "learning_rate": 0.0001740882648607106, + "loss": 12.4328, + "step": 9453 + }, + { + "epoch": 0.5148080836957928, + "grad_norm": 0.5955622730731929, + "learning_rate": 0.0001740823419262478, + "loss": 12.384, + "step": 9454 + }, + { + "epoch": 0.5148625376923758, + "grad_norm": 0.6158589032229121, + "learning_rate": 0.0001740764184157083, + "loss": 12.3919, + "step": 9455 + }, + { + "epoch": 0.5149169916889588, + "grad_norm": 0.5839526669803079, + "learning_rate": 0.00017407049432913813, + "loss": 12.3303, + "step": 9456 + }, + { + "epoch": 0.5149714456855418, + "grad_norm": 0.6667413250980353, + "learning_rate": 0.0001740645696665834, + "loss": 12.4874, + "step": 9457 + }, + { + "epoch": 0.5150258996821248, + "grad_norm": 0.5917080616601487, + "learning_rate": 0.00017405864442809012, + "loss": 12.4685, + "step": 9458 + }, + { + "epoch": 0.5150803536787079, + "grad_norm": 0.6153657606574432, + "learning_rate": 0.0001740527186137044, + "loss": 12.4207, + "step": 9459 + }, + { + "epoch": 0.5151348076752909, + "grad_norm": 0.6172502537558318, + "learning_rate": 0.00017404679222347236, + "loss": 12.4641, + "step": 9460 + }, + { + "epoch": 0.5151892616718738, + "grad_norm": 0.5826857856034895, + "learning_rate": 0.00017404086525744003, + "loss": 12.2974, + "step": 9461 + }, + { + "epoch": 0.5152437156684568, + "grad_norm": 0.676492290899336, + "learning_rate": 0.00017403493771565352, + "loss": 12.2323, + "step": 9462 + }, + { + "epoch": 0.5152981696650398, + "grad_norm": 0.7872839686355796, + "learning_rate": 0.0001740290095981589, + "loss": 12.5144, + "step": 9463 + }, + { + "epoch": 0.5153526236616228, + "grad_norm": 0.6425476955150632, + "learning_rate": 0.0001740230809050023, + "loss": 12.489, + "step": 9464 + }, + { + "epoch": 0.5154070776582059, + "grad_norm": 0.5880650070662702, + "learning_rate": 0.0001740171516362298, + "loss": 12.3891, + "step": 9465 + }, + { + "epoch": 0.5154615316547889, + "grad_norm": 0.6126832975083883, + "learning_rate": 0.00017401122179188755, + "loss": 12.3447, + "step": 9466 + }, + { + "epoch": 0.5155159856513719, + "grad_norm": 0.6782429959572243, + "learning_rate": 0.0001740052913720216, + "loss": 12.3695, + "step": 9467 + }, + { + "epoch": 0.5155704396479549, + "grad_norm": 0.5803531624554442, + "learning_rate": 0.00017399936037667808, + "loss": 12.3674, + "step": 9468 + }, + { + "epoch": 0.5156248936445379, + "grad_norm": 0.5637358898329805, + "learning_rate": 0.00017399342880590314, + "loss": 12.4194, + "step": 9469 + }, + { + "epoch": 0.5156793476411209, + "grad_norm": 0.6286820639773689, + "learning_rate": 0.0001739874966597429, + "loss": 12.4978, + "step": 9470 + }, + { + "epoch": 0.515733801637704, + "grad_norm": 0.6761119351876996, + "learning_rate": 0.00017398156393824346, + "loss": 12.4509, + "step": 9471 + }, + { + "epoch": 0.515788255634287, + "grad_norm": 0.6664367437684052, + "learning_rate": 0.00017397563064145097, + "loss": 12.3517, + "step": 9472 + }, + { + "epoch": 0.51584270963087, + "grad_norm": 0.6423591597123725, + "learning_rate": 0.0001739696967694116, + "loss": 12.3267, + "step": 9473 + }, + { + "epoch": 0.515897163627453, + "grad_norm": 0.5995208433571146, + "learning_rate": 0.0001739637623221714, + "loss": 12.5092, + "step": 9474 + }, + { + "epoch": 0.515951617624036, + "grad_norm": 0.660880752926758, + "learning_rate": 0.00017395782729977665, + "loss": 12.4551, + "step": 9475 + }, + { + "epoch": 0.516006071620619, + "grad_norm": 0.5987676654785823, + "learning_rate": 0.0001739518917022734, + "loss": 12.3006, + "step": 9476 + }, + { + "epoch": 0.5160605256172021, + "grad_norm": 0.7086942101539682, + "learning_rate": 0.00017394595552970782, + "loss": 12.5718, + "step": 9477 + }, + { + "epoch": 0.516114979613785, + "grad_norm": 0.7278895586541723, + "learning_rate": 0.00017394001878212614, + "loss": 12.6292, + "step": 9478 + }, + { + "epoch": 0.516169433610368, + "grad_norm": 0.6225783838940412, + "learning_rate": 0.00017393408145957443, + "loss": 12.3287, + "step": 9479 + }, + { + "epoch": 0.516223887606951, + "grad_norm": 0.6250124899804488, + "learning_rate": 0.00017392814356209893, + "loss": 12.3882, + "step": 9480 + }, + { + "epoch": 0.516278341603534, + "grad_norm": 0.5823883657923915, + "learning_rate": 0.00017392220508974577, + "loss": 12.3334, + "step": 9481 + }, + { + "epoch": 0.516332795600117, + "grad_norm": 0.6525559938701772, + "learning_rate": 0.00017391626604256114, + "loss": 12.4486, + "step": 9482 + }, + { + "epoch": 0.5163872495967001, + "grad_norm": 0.6078119554876852, + "learning_rate": 0.00017391032642059126, + "loss": 12.4749, + "step": 9483 + }, + { + "epoch": 0.5164417035932831, + "grad_norm": 0.6965773196307412, + "learning_rate": 0.00017390438622388228, + "loss": 12.3629, + "step": 9484 + }, + { + "epoch": 0.5164961575898661, + "grad_norm": 0.648942053086012, + "learning_rate": 0.0001738984454524804, + "loss": 12.1682, + "step": 9485 + }, + { + "epoch": 0.5165506115864491, + "grad_norm": 0.6131617496363042, + "learning_rate": 0.0001738925041064318, + "loss": 12.3335, + "step": 9486 + }, + { + "epoch": 0.5166050655830321, + "grad_norm": 0.5930281381763447, + "learning_rate": 0.00017388656218578268, + "loss": 12.4831, + "step": 9487 + }, + { + "epoch": 0.5166595195796152, + "grad_norm": 0.6726746559978551, + "learning_rate": 0.0001738806196905793, + "loss": 12.3556, + "step": 9488 + }, + { + "epoch": 0.5167139735761982, + "grad_norm": 0.6729462439247543, + "learning_rate": 0.00017387467662086784, + "loss": 12.3627, + "step": 9489 + }, + { + "epoch": 0.5167684275727812, + "grad_norm": 0.6840423787451996, + "learning_rate": 0.00017386873297669446, + "loss": 12.4291, + "step": 9490 + }, + { + "epoch": 0.5168228815693642, + "grad_norm": 0.727027584770321, + "learning_rate": 0.00017386278875810543, + "loss": 12.3196, + "step": 9491 + }, + { + "epoch": 0.5168773355659472, + "grad_norm": 0.618238532380933, + "learning_rate": 0.000173856843965147, + "loss": 12.3743, + "step": 9492 + }, + { + "epoch": 0.5169317895625302, + "grad_norm": 0.633616462992701, + "learning_rate": 0.00017385089859786533, + "loss": 12.3825, + "step": 9493 + }, + { + "epoch": 0.5169862435591133, + "grad_norm": 0.6852956095611265, + "learning_rate": 0.00017384495265630672, + "loss": 12.3855, + "step": 9494 + }, + { + "epoch": 0.5170406975556963, + "grad_norm": 0.6353901270857567, + "learning_rate": 0.00017383900614051733, + "loss": 12.3742, + "step": 9495 + }, + { + "epoch": 0.5170951515522793, + "grad_norm": 0.6365417873794621, + "learning_rate": 0.00017383305905054348, + "loss": 12.3855, + "step": 9496 + }, + { + "epoch": 0.5171496055488622, + "grad_norm": 0.6392101507202402, + "learning_rate": 0.0001738271113864314, + "loss": 12.3915, + "step": 9497 + }, + { + "epoch": 0.5172040595454452, + "grad_norm": 0.636169211044633, + "learning_rate": 0.0001738211631482273, + "loss": 12.364, + "step": 9498 + }, + { + "epoch": 0.5172585135420282, + "grad_norm": 0.591556032814142, + "learning_rate": 0.00017381521433597742, + "loss": 12.3746, + "step": 9499 + }, + { + "epoch": 0.5173129675386113, + "grad_norm": 0.5989499850668308, + "learning_rate": 0.00017380926494972812, + "loss": 12.4657, + "step": 9500 + }, + { + "epoch": 0.5173674215351943, + "grad_norm": 0.5810893580636307, + "learning_rate": 0.00017380331498952555, + "loss": 12.3945, + "step": 9501 + }, + { + "epoch": 0.5174218755317773, + "grad_norm": 0.6071717173521719, + "learning_rate": 0.00017379736445541604, + "loss": 12.4434, + "step": 9502 + }, + { + "epoch": 0.5174763295283603, + "grad_norm": 0.5678552708006944, + "learning_rate": 0.00017379141334744586, + "loss": 12.3299, + "step": 9503 + }, + { + "epoch": 0.5175307835249433, + "grad_norm": 0.5897790600042302, + "learning_rate": 0.00017378546166566124, + "loss": 12.3707, + "step": 9504 + }, + { + "epoch": 0.5175852375215263, + "grad_norm": 0.722172830820239, + "learning_rate": 0.00017377950941010854, + "loss": 12.5973, + "step": 9505 + }, + { + "epoch": 0.5176396915181094, + "grad_norm": 0.6169290540123696, + "learning_rate": 0.000173773556580834, + "loss": 12.4218, + "step": 9506 + }, + { + "epoch": 0.5176941455146924, + "grad_norm": 0.5773241373097306, + "learning_rate": 0.00017376760317788388, + "loss": 12.5499, + "step": 9507 + }, + { + "epoch": 0.5177485995112754, + "grad_norm": 0.7301449323538458, + "learning_rate": 0.0001737616492013045, + "loss": 12.3069, + "step": 9508 + }, + { + "epoch": 0.5178030535078584, + "grad_norm": 0.5773852222233629, + "learning_rate": 0.0001737556946511422, + "loss": 12.3977, + "step": 9509 + }, + { + "epoch": 0.5178575075044414, + "grad_norm": 0.6155233235574765, + "learning_rate": 0.00017374973952744325, + "loss": 12.4442, + "step": 9510 + }, + { + "epoch": 0.5179119615010244, + "grad_norm": 0.6083803059234462, + "learning_rate": 0.00017374378383025392, + "loss": 12.4416, + "step": 9511 + }, + { + "epoch": 0.5179664154976075, + "grad_norm": 0.6844408691725016, + "learning_rate": 0.0001737378275596206, + "loss": 12.3738, + "step": 9512 + }, + { + "epoch": 0.5180208694941905, + "grad_norm": 0.6869586845702121, + "learning_rate": 0.00017373187071558955, + "loss": 12.5113, + "step": 9513 + }, + { + "epoch": 0.5180753234907735, + "grad_norm": 0.6628685359190278, + "learning_rate": 0.00017372591329820712, + "loss": 12.3677, + "step": 9514 + }, + { + "epoch": 0.5181297774873564, + "grad_norm": 0.6463424280414908, + "learning_rate": 0.0001737199553075196, + "loss": 12.6094, + "step": 9515 + }, + { + "epoch": 0.5181842314839394, + "grad_norm": 0.7198397510770467, + "learning_rate": 0.00017371399674357337, + "loss": 12.4187, + "step": 9516 + }, + { + "epoch": 0.5182386854805224, + "grad_norm": 0.5829815790571139, + "learning_rate": 0.0001737080376064147, + "loss": 12.4891, + "step": 9517 + }, + { + "epoch": 0.5182931394771055, + "grad_norm": 0.6760100979816667, + "learning_rate": 0.00017370207789609002, + "loss": 12.3712, + "step": 9518 + }, + { + "epoch": 0.5183475934736885, + "grad_norm": 0.6053151611373706, + "learning_rate": 0.00017369611761264563, + "loss": 12.3323, + "step": 9519 + }, + { + "epoch": 0.5184020474702715, + "grad_norm": 0.6811088166907076, + "learning_rate": 0.0001736901567561278, + "loss": 12.5549, + "step": 9520 + }, + { + "epoch": 0.5184565014668545, + "grad_norm": 0.6683517540045927, + "learning_rate": 0.000173684195326583, + "loss": 12.5542, + "step": 9521 + }, + { + "epoch": 0.5185109554634375, + "grad_norm": 0.6680094107438264, + "learning_rate": 0.00017367823332405756, + "loss": 12.4991, + "step": 9522 + }, + { + "epoch": 0.5185654094600206, + "grad_norm": 0.6365234439685792, + "learning_rate": 0.00017367227074859776, + "loss": 12.4027, + "step": 9523 + }, + { + "epoch": 0.5186198634566036, + "grad_norm": 0.6213685256896416, + "learning_rate": 0.0001736663076002501, + "loss": 12.2578, + "step": 9524 + }, + { + "epoch": 0.5186743174531866, + "grad_norm": 0.673525027278588, + "learning_rate": 0.00017366034387906085, + "loss": 12.6111, + "step": 9525 + }, + { + "epoch": 0.5187287714497696, + "grad_norm": 0.6045280088778902, + "learning_rate": 0.00017365437958507637, + "loss": 12.2815, + "step": 9526 + }, + { + "epoch": 0.5187832254463526, + "grad_norm": 0.6146377983697311, + "learning_rate": 0.00017364841471834312, + "loss": 12.3847, + "step": 9527 + }, + { + "epoch": 0.5188376794429356, + "grad_norm": 0.6224125406584722, + "learning_rate": 0.00017364244927890746, + "loss": 12.4285, + "step": 9528 + }, + { + "epoch": 0.5188921334395187, + "grad_norm": 0.6804900827370974, + "learning_rate": 0.00017363648326681577, + "loss": 12.3915, + "step": 9529 + }, + { + "epoch": 0.5189465874361017, + "grad_norm": 0.6187006734875188, + "learning_rate": 0.0001736305166821144, + "loss": 12.3694, + "step": 9530 + }, + { + "epoch": 0.5190010414326847, + "grad_norm": 0.6055164489750849, + "learning_rate": 0.0001736245495248498, + "loss": 12.5032, + "step": 9531 + }, + { + "epoch": 0.5190554954292677, + "grad_norm": 0.6791913560434254, + "learning_rate": 0.00017361858179506837, + "loss": 12.4238, + "step": 9532 + }, + { + "epoch": 0.5191099494258506, + "grad_norm": 0.6553159563366687, + "learning_rate": 0.0001736126134928165, + "loss": 12.3905, + "step": 9533 + }, + { + "epoch": 0.5191644034224336, + "grad_norm": 0.659497625157703, + "learning_rate": 0.00017360664461814058, + "loss": 12.3904, + "step": 9534 + }, + { + "epoch": 0.5192188574190167, + "grad_norm": 0.5837269312568923, + "learning_rate": 0.00017360067517108705, + "loss": 12.2439, + "step": 9535 + }, + { + "epoch": 0.5192733114155997, + "grad_norm": 0.606262104777017, + "learning_rate": 0.00017359470515170233, + "loss": 12.3929, + "step": 9536 + }, + { + "epoch": 0.5193277654121827, + "grad_norm": 0.6481663145224712, + "learning_rate": 0.00017358873456003283, + "loss": 12.3585, + "step": 9537 + }, + { + "epoch": 0.5193822194087657, + "grad_norm": 0.5731228553672747, + "learning_rate": 0.00017358276339612502, + "loss": 12.2875, + "step": 9538 + }, + { + "epoch": 0.5194366734053487, + "grad_norm": 0.641688670337266, + "learning_rate": 0.00017357679166002526, + "loss": 12.5662, + "step": 9539 + }, + { + "epoch": 0.5194911274019317, + "grad_norm": 0.6234379087948, + "learning_rate": 0.0001735708193517801, + "loss": 12.3924, + "step": 9540 + }, + { + "epoch": 0.5195455813985148, + "grad_norm": 0.6020449493619848, + "learning_rate": 0.00017356484647143585, + "loss": 12.6541, + "step": 9541 + }, + { + "epoch": 0.5196000353950978, + "grad_norm": 0.5694624446011876, + "learning_rate": 0.00017355887301903904, + "loss": 12.3805, + "step": 9542 + }, + { + "epoch": 0.5196544893916808, + "grad_norm": 0.66644121793506, + "learning_rate": 0.00017355289899463608, + "loss": 12.4674, + "step": 9543 + }, + { + "epoch": 0.5197089433882638, + "grad_norm": 0.6037137741273337, + "learning_rate": 0.00017354692439827346, + "loss": 12.3857, + "step": 9544 + }, + { + "epoch": 0.5197633973848468, + "grad_norm": 0.6096151676019201, + "learning_rate": 0.0001735409492299976, + "loss": 12.3534, + "step": 9545 + }, + { + "epoch": 0.5198178513814298, + "grad_norm": 0.5819488973845875, + "learning_rate": 0.00017353497348985503, + "loss": 12.3819, + "step": 9546 + }, + { + "epoch": 0.5198723053780129, + "grad_norm": 0.7652426584827898, + "learning_rate": 0.00017352899717789212, + "loss": 12.4079, + "step": 9547 + }, + { + "epoch": 0.5199267593745959, + "grad_norm": 0.5772110029012527, + "learning_rate": 0.00017352302029415543, + "loss": 12.3658, + "step": 9548 + }, + { + "epoch": 0.5199812133711789, + "grad_norm": 0.5710471850503702, + "learning_rate": 0.0001735170428386914, + "loss": 12.4582, + "step": 9549 + }, + { + "epoch": 0.5200356673677619, + "grad_norm": 0.5670726519031118, + "learning_rate": 0.00017351106481154654, + "loss": 12.2941, + "step": 9550 + }, + { + "epoch": 0.5200901213643448, + "grad_norm": 0.7440285830960065, + "learning_rate": 0.00017350508621276727, + "loss": 12.4657, + "step": 9551 + }, + { + "epoch": 0.5201445753609278, + "grad_norm": 0.577867669932965, + "learning_rate": 0.00017349910704240014, + "loss": 12.2572, + "step": 9552 + }, + { + "epoch": 0.520199029357511, + "grad_norm": 0.5761404019054847, + "learning_rate": 0.00017349312730049164, + "loss": 12.4117, + "step": 9553 + }, + { + "epoch": 0.5202534833540939, + "grad_norm": 0.6203076031921424, + "learning_rate": 0.00017348714698708825, + "loss": 12.4216, + "step": 9554 + }, + { + "epoch": 0.5203079373506769, + "grad_norm": 0.6088496568906002, + "learning_rate": 0.00017348116610223647, + "loss": 12.4522, + "step": 9555 + }, + { + "epoch": 0.5203623913472599, + "grad_norm": 0.6520271374564597, + "learning_rate": 0.00017347518464598283, + "loss": 12.4652, + "step": 9556 + }, + { + "epoch": 0.5204168453438429, + "grad_norm": 0.6710044566860749, + "learning_rate": 0.00017346920261837384, + "loss": 12.4546, + "step": 9557 + }, + { + "epoch": 0.520471299340426, + "grad_norm": 0.6386795522228851, + "learning_rate": 0.000173463220019456, + "loss": 12.5289, + "step": 9558 + }, + { + "epoch": 0.520525753337009, + "grad_norm": 0.5617419711380065, + "learning_rate": 0.0001734572368492758, + "loss": 12.3144, + "step": 9559 + }, + { + "epoch": 0.520580207333592, + "grad_norm": 0.6464213236433154, + "learning_rate": 0.00017345125310787987, + "loss": 12.5115, + "step": 9560 + }, + { + "epoch": 0.520634661330175, + "grad_norm": 0.5952176049191022, + "learning_rate": 0.00017344526879531466, + "loss": 12.3311, + "step": 9561 + }, + { + "epoch": 0.520689115326758, + "grad_norm": 0.635195905401357, + "learning_rate": 0.00017343928391162672, + "loss": 12.4314, + "step": 9562 + }, + { + "epoch": 0.520743569323341, + "grad_norm": 0.7401122077992383, + "learning_rate": 0.0001734332984568626, + "loss": 12.5615, + "step": 9563 + }, + { + "epoch": 0.5207980233199241, + "grad_norm": 0.6328437476921704, + "learning_rate": 0.0001734273124310688, + "loss": 12.3782, + "step": 9564 + }, + { + "epoch": 0.5208524773165071, + "grad_norm": 0.6492627760999075, + "learning_rate": 0.00017342132583429197, + "loss": 12.2496, + "step": 9565 + }, + { + "epoch": 0.5209069313130901, + "grad_norm": 0.6882328974866949, + "learning_rate": 0.00017341533866657854, + "loss": 12.5301, + "step": 9566 + }, + { + "epoch": 0.5209613853096731, + "grad_norm": 0.5539351239324724, + "learning_rate": 0.00017340935092797515, + "loss": 12.3704, + "step": 9567 + }, + { + "epoch": 0.5210158393062561, + "grad_norm": 0.6120330624036531, + "learning_rate": 0.0001734033626185283, + "loss": 12.4241, + "step": 9568 + }, + { + "epoch": 0.521070293302839, + "grad_norm": 0.6746644635526738, + "learning_rate": 0.00017339737373828464, + "loss": 12.3715, + "step": 9569 + }, + { + "epoch": 0.5211247472994222, + "grad_norm": 0.6251845311684804, + "learning_rate": 0.0001733913842872907, + "loss": 12.3197, + "step": 9570 + }, + { + "epoch": 0.5211792012960051, + "grad_norm": 0.6403476161548947, + "learning_rate": 0.00017338539426559301, + "loss": 12.4137, + "step": 9571 + }, + { + "epoch": 0.5212336552925881, + "grad_norm": 0.6340704373689795, + "learning_rate": 0.00017337940367323818, + "loss": 12.3527, + "step": 9572 + }, + { + "epoch": 0.5212881092891711, + "grad_norm": 0.693150477056537, + "learning_rate": 0.00017337341251027284, + "loss": 12.5483, + "step": 9573 + }, + { + "epoch": 0.5213425632857541, + "grad_norm": 0.6887097515799869, + "learning_rate": 0.00017336742077674354, + "loss": 12.4556, + "step": 9574 + }, + { + "epoch": 0.5213970172823371, + "grad_norm": 0.6742663211247635, + "learning_rate": 0.00017336142847269685, + "loss": 12.1687, + "step": 9575 + }, + { + "epoch": 0.5214514712789202, + "grad_norm": 0.6037349611938911, + "learning_rate": 0.00017335543559817936, + "loss": 12.43, + "step": 9576 + }, + { + "epoch": 0.5215059252755032, + "grad_norm": 0.7071535351856332, + "learning_rate": 0.00017334944215323777, + "loss": 12.3907, + "step": 9577 + }, + { + "epoch": 0.5215603792720862, + "grad_norm": 0.6063097686747864, + "learning_rate": 0.00017334344813791858, + "loss": 12.4358, + "step": 9578 + }, + { + "epoch": 0.5216148332686692, + "grad_norm": 0.6568445631972262, + "learning_rate": 0.0001733374535522684, + "loss": 12.3488, + "step": 9579 + }, + { + "epoch": 0.5216692872652522, + "grad_norm": 0.6086562414750473, + "learning_rate": 0.00017333145839633395, + "loss": 12.3118, + "step": 9580 + }, + { + "epoch": 0.5217237412618352, + "grad_norm": 0.6720034758914826, + "learning_rate": 0.00017332546267016173, + "loss": 12.3787, + "step": 9581 + }, + { + "epoch": 0.5217781952584183, + "grad_norm": 0.6747441173462498, + "learning_rate": 0.00017331946637379848, + "loss": 12.3641, + "step": 9582 + }, + { + "epoch": 0.5218326492550013, + "grad_norm": 0.6362744321253769, + "learning_rate": 0.0001733134695072907, + "loss": 12.3927, + "step": 9583 + }, + { + "epoch": 0.5218871032515843, + "grad_norm": 0.584120914581976, + "learning_rate": 0.00017330747207068513, + "loss": 12.3041, + "step": 9584 + }, + { + "epoch": 0.5219415572481673, + "grad_norm": 0.6898276146568693, + "learning_rate": 0.00017330147406402835, + "loss": 12.5067, + "step": 9585 + }, + { + "epoch": 0.5219960112447503, + "grad_norm": 0.6092771119092754, + "learning_rate": 0.000173295475487367, + "loss": 12.3398, + "step": 9586 + }, + { + "epoch": 0.5220504652413334, + "grad_norm": 0.6183883034009364, + "learning_rate": 0.00017328947634074774, + "loss": 12.514, + "step": 9587 + }, + { + "epoch": 0.5221049192379164, + "grad_norm": 0.5953627715745956, + "learning_rate": 0.00017328347662421724, + "loss": 12.2748, + "step": 9588 + }, + { + "epoch": 0.5221593732344993, + "grad_norm": 0.6705637092696672, + "learning_rate": 0.00017327747633782213, + "loss": 12.388, + "step": 9589 + }, + { + "epoch": 0.5222138272310823, + "grad_norm": 0.5346323503006604, + "learning_rate": 0.00017327147548160907, + "loss": 12.384, + "step": 9590 + }, + { + "epoch": 0.5222682812276653, + "grad_norm": 0.5853878712849345, + "learning_rate": 0.00017326547405562473, + "loss": 12.286, + "step": 9591 + }, + { + "epoch": 0.5223227352242483, + "grad_norm": 0.5975763630320771, + "learning_rate": 0.0001732594720599158, + "loss": 12.4346, + "step": 9592 + }, + { + "epoch": 0.5223771892208314, + "grad_norm": 0.6822777284872105, + "learning_rate": 0.00017325346949452887, + "loss": 12.3276, + "step": 9593 + }, + { + "epoch": 0.5224316432174144, + "grad_norm": 0.5940144725337047, + "learning_rate": 0.00017324746635951073, + "loss": 12.4804, + "step": 9594 + }, + { + "epoch": 0.5224860972139974, + "grad_norm": 0.6963264100080574, + "learning_rate": 0.00017324146265490799, + "loss": 12.4868, + "step": 9595 + }, + { + "epoch": 0.5225405512105804, + "grad_norm": 0.6454399581047955, + "learning_rate": 0.0001732354583807674, + "loss": 12.4155, + "step": 9596 + }, + { + "epoch": 0.5225950052071634, + "grad_norm": 0.5968822395243275, + "learning_rate": 0.00017322945353713553, + "loss": 12.4344, + "step": 9597 + }, + { + "epoch": 0.5226494592037464, + "grad_norm": 0.5772180794144417, + "learning_rate": 0.00017322344812405917, + "loss": 12.4009, + "step": 9598 + }, + { + "epoch": 0.5227039132003295, + "grad_norm": 0.6661396756168132, + "learning_rate": 0.000173217442141585, + "loss": 12.5368, + "step": 9599 + }, + { + "epoch": 0.5227583671969125, + "grad_norm": 0.5916943710540453, + "learning_rate": 0.00017321143558975973, + "loss": 12.3441, + "step": 9600 + }, + { + "epoch": 0.5228128211934955, + "grad_norm": 0.6980471395490951, + "learning_rate": 0.00017320542846863005, + "loss": 12.4568, + "step": 9601 + }, + { + "epoch": 0.5228672751900785, + "grad_norm": 0.6134060077125584, + "learning_rate": 0.00017319942077824273, + "loss": 12.3968, + "step": 9602 + }, + { + "epoch": 0.5229217291866615, + "grad_norm": 0.6653622553223292, + "learning_rate": 0.00017319341251864439, + "loss": 12.3345, + "step": 9603 + }, + { + "epoch": 0.5229761831832445, + "grad_norm": 0.6385386789833732, + "learning_rate": 0.00017318740368988178, + "loss": 12.457, + "step": 9604 + }, + { + "epoch": 0.5230306371798276, + "grad_norm": 0.6778974500953844, + "learning_rate": 0.0001731813942920017, + "loss": 12.3487, + "step": 9605 + }, + { + "epoch": 0.5230850911764106, + "grad_norm": 0.6852089521188045, + "learning_rate": 0.00017317538432505078, + "loss": 12.576, + "step": 9606 + }, + { + "epoch": 0.5231395451729935, + "grad_norm": 0.6892428008205174, + "learning_rate": 0.00017316937378907582, + "loss": 12.4506, + "step": 9607 + }, + { + "epoch": 0.5231939991695765, + "grad_norm": 0.6417345906514075, + "learning_rate": 0.00017316336268412353, + "loss": 12.3974, + "step": 9608 + }, + { + "epoch": 0.5232484531661595, + "grad_norm": 0.6643182481858203, + "learning_rate": 0.00017315735101024066, + "loss": 12.4555, + "step": 9609 + }, + { + "epoch": 0.5233029071627425, + "grad_norm": 0.7006586374584878, + "learning_rate": 0.000173151338767474, + "loss": 12.3776, + "step": 9610 + }, + { + "epoch": 0.5233573611593256, + "grad_norm": 0.6279718640944982, + "learning_rate": 0.00017314532595587024, + "loss": 12.4017, + "step": 9611 + }, + { + "epoch": 0.5234118151559086, + "grad_norm": 0.7037251866782321, + "learning_rate": 0.00017313931257547613, + "loss": 12.4936, + "step": 9612 + }, + { + "epoch": 0.5234662691524916, + "grad_norm": 0.6205506357515014, + "learning_rate": 0.00017313329862633848, + "loss": 12.4313, + "step": 9613 + }, + { + "epoch": 0.5235207231490746, + "grad_norm": 0.6943185242804921, + "learning_rate": 0.00017312728410850405, + "loss": 12.4645, + "step": 9614 + }, + { + "epoch": 0.5235751771456576, + "grad_norm": 0.6045707069949517, + "learning_rate": 0.0001731212690220196, + "loss": 12.4103, + "step": 9615 + }, + { + "epoch": 0.5236296311422406, + "grad_norm": 0.5648009255297014, + "learning_rate": 0.00017311525336693184, + "loss": 12.1542, + "step": 9616 + }, + { + "epoch": 0.5236840851388237, + "grad_norm": 0.6593904156040367, + "learning_rate": 0.00017310923714328766, + "loss": 12.3242, + "step": 9617 + }, + { + "epoch": 0.5237385391354067, + "grad_norm": 0.7689015290127676, + "learning_rate": 0.0001731032203511338, + "loss": 12.3943, + "step": 9618 + }, + { + "epoch": 0.5237929931319897, + "grad_norm": 0.6050682600697579, + "learning_rate": 0.000173097202990517, + "loss": 12.3524, + "step": 9619 + }, + { + "epoch": 0.5238474471285727, + "grad_norm": 0.6355378601271892, + "learning_rate": 0.00017309118506148412, + "loss": 12.4862, + "step": 9620 + }, + { + "epoch": 0.5239019011251557, + "grad_norm": 0.625236672043179, + "learning_rate": 0.00017308516656408193, + "loss": 12.3745, + "step": 9621 + }, + { + "epoch": 0.5239563551217388, + "grad_norm": 0.6220329845669438, + "learning_rate": 0.0001730791474983572, + "loss": 12.4589, + "step": 9622 + }, + { + "epoch": 0.5240108091183218, + "grad_norm": 0.6320089105516793, + "learning_rate": 0.00017307312786435679, + "loss": 12.5162, + "step": 9623 + }, + { + "epoch": 0.5240652631149048, + "grad_norm": 0.617134722308437, + "learning_rate": 0.00017306710766212748, + "loss": 12.4909, + "step": 9624 + }, + { + "epoch": 0.5241197171114877, + "grad_norm": 0.6240965637008747, + "learning_rate": 0.00017306108689171606, + "loss": 12.4501, + "step": 9625 + }, + { + "epoch": 0.5241741711080707, + "grad_norm": 0.5989645747623337, + "learning_rate": 0.00017305506555316942, + "loss": 12.495, + "step": 9626 + }, + { + "epoch": 0.5242286251046537, + "grad_norm": 0.6775237364987579, + "learning_rate": 0.00017304904364653432, + "loss": 12.372, + "step": 9627 + }, + { + "epoch": 0.5242830791012368, + "grad_norm": 0.6764909063661962, + "learning_rate": 0.0001730430211718576, + "loss": 12.4157, + "step": 9628 + }, + { + "epoch": 0.5243375330978198, + "grad_norm": 0.5941504029768336, + "learning_rate": 0.0001730369981291861, + "loss": 12.2141, + "step": 9629 + }, + { + "epoch": 0.5243919870944028, + "grad_norm": 0.6937835193473201, + "learning_rate": 0.00017303097451856666, + "loss": 12.4662, + "step": 9630 + }, + { + "epoch": 0.5244464410909858, + "grad_norm": 0.5794049013015833, + "learning_rate": 0.0001730249503400461, + "loss": 12.391, + "step": 9631 + }, + { + "epoch": 0.5245008950875688, + "grad_norm": 0.5993816672770692, + "learning_rate": 0.0001730189255936713, + "loss": 12.4769, + "step": 9632 + }, + { + "epoch": 0.5245553490841518, + "grad_norm": 0.6214200612444263, + "learning_rate": 0.0001730129002794891, + "loss": 12.3002, + "step": 9633 + }, + { + "epoch": 0.5246098030807349, + "grad_norm": 0.6614838174941197, + "learning_rate": 0.00017300687439754627, + "loss": 12.4977, + "step": 9634 + }, + { + "epoch": 0.5246642570773179, + "grad_norm": 0.6361890413636067, + "learning_rate": 0.00017300084794788983, + "loss": 12.432, + "step": 9635 + }, + { + "epoch": 0.5247187110739009, + "grad_norm": 0.6870685161675998, + "learning_rate": 0.00017299482093056652, + "loss": 12.509, + "step": 9636 + }, + { + "epoch": 0.5247731650704839, + "grad_norm": 0.6120446501238316, + "learning_rate": 0.00017298879334562322, + "loss": 12.3235, + "step": 9637 + }, + { + "epoch": 0.5248276190670669, + "grad_norm": 0.782081122499058, + "learning_rate": 0.00017298276519310687, + "loss": 12.2507, + "step": 9638 + }, + { + "epoch": 0.5248820730636499, + "grad_norm": 0.6666692624757277, + "learning_rate": 0.00017297673647306426, + "loss": 12.5445, + "step": 9639 + }, + { + "epoch": 0.524936527060233, + "grad_norm": 0.5996083202027668, + "learning_rate": 0.00017297070718554232, + "loss": 12.3263, + "step": 9640 + }, + { + "epoch": 0.524990981056816, + "grad_norm": 0.6918954093983186, + "learning_rate": 0.00017296467733058792, + "loss": 12.4189, + "step": 9641 + }, + { + "epoch": 0.525045435053399, + "grad_norm": 0.5641966877474066, + "learning_rate": 0.00017295864690824794, + "loss": 12.4129, + "step": 9642 + }, + { + "epoch": 0.525099889049982, + "grad_norm": 0.6507912778509167, + "learning_rate": 0.0001729526159185693, + "loss": 12.5057, + "step": 9643 + }, + { + "epoch": 0.5251543430465649, + "grad_norm": 0.6194648051953271, + "learning_rate": 0.0001729465843615989, + "loss": 12.4627, + "step": 9644 + }, + { + "epoch": 0.5252087970431479, + "grad_norm": 0.6426678578240965, + "learning_rate": 0.00017294055223738362, + "loss": 12.4603, + "step": 9645 + }, + { + "epoch": 0.525263251039731, + "grad_norm": 0.7172559439221332, + "learning_rate": 0.00017293451954597034, + "loss": 12.3775, + "step": 9646 + }, + { + "epoch": 0.525317705036314, + "grad_norm": 0.6184352607295612, + "learning_rate": 0.00017292848628740605, + "loss": 12.4496, + "step": 9647 + }, + { + "epoch": 0.525372159032897, + "grad_norm": 0.66662565383955, + "learning_rate": 0.00017292245246173761, + "loss": 12.3351, + "step": 9648 + }, + { + "epoch": 0.52542661302948, + "grad_norm": 0.6044453012503522, + "learning_rate": 0.00017291641806901193, + "loss": 12.3472, + "step": 9649 + }, + { + "epoch": 0.525481067026063, + "grad_norm": 0.6094155769366099, + "learning_rate": 0.000172910383109276, + "loss": 12.2924, + "step": 9650 + }, + { + "epoch": 0.525535521022646, + "grad_norm": 0.6619147532997315, + "learning_rate": 0.00017290434758257666, + "loss": 12.3786, + "step": 9651 + }, + { + "epoch": 0.5255899750192291, + "grad_norm": 0.654740006199492, + "learning_rate": 0.0001728983114889609, + "loss": 12.5304, + "step": 9652 + }, + { + "epoch": 0.5256444290158121, + "grad_norm": 0.6360731990373002, + "learning_rate": 0.00017289227482847563, + "loss": 12.4277, + "step": 9653 + }, + { + "epoch": 0.5256988830123951, + "grad_norm": 0.7177684346359464, + "learning_rate": 0.00017288623760116782, + "loss": 12.2692, + "step": 9654 + }, + { + "epoch": 0.5257533370089781, + "grad_norm": 0.594606411338279, + "learning_rate": 0.0001728801998070844, + "loss": 12.5538, + "step": 9655 + }, + { + "epoch": 0.5258077910055611, + "grad_norm": 0.61978373942687, + "learning_rate": 0.00017287416144627237, + "loss": 12.4174, + "step": 9656 + }, + { + "epoch": 0.5258622450021442, + "grad_norm": 0.6187951813746733, + "learning_rate": 0.00017286812251877858, + "loss": 12.34, + "step": 9657 + }, + { + "epoch": 0.5259166989987272, + "grad_norm": 0.8047359145637732, + "learning_rate": 0.0001728620830246501, + "loss": 12.4556, + "step": 9658 + }, + { + "epoch": 0.5259711529953102, + "grad_norm": 0.6399250732558073, + "learning_rate": 0.00017285604296393377, + "loss": 12.4849, + "step": 9659 + }, + { + "epoch": 0.5260256069918932, + "grad_norm": 0.8625060189544411, + "learning_rate": 0.00017285000233667667, + "loss": 12.3901, + "step": 9660 + }, + { + "epoch": 0.5260800609884762, + "grad_norm": 0.589516031343896, + "learning_rate": 0.00017284396114292574, + "loss": 12.2904, + "step": 9661 + }, + { + "epoch": 0.5261345149850591, + "grad_norm": 0.6029145395554105, + "learning_rate": 0.00017283791938272795, + "loss": 12.2947, + "step": 9662 + }, + { + "epoch": 0.5261889689816422, + "grad_norm": 0.6933086437955884, + "learning_rate": 0.0001728318770561303, + "loss": 12.4769, + "step": 9663 + }, + { + "epoch": 0.5262434229782252, + "grad_norm": 0.7179526791292549, + "learning_rate": 0.00017282583416317973, + "loss": 12.5321, + "step": 9664 + }, + { + "epoch": 0.5262978769748082, + "grad_norm": 0.5920105115456481, + "learning_rate": 0.0001728197907039233, + "loss": 12.358, + "step": 9665 + }, + { + "epoch": 0.5263523309713912, + "grad_norm": 0.6865859863106162, + "learning_rate": 0.00017281374667840792, + "loss": 12.4423, + "step": 9666 + }, + { + "epoch": 0.5264067849679742, + "grad_norm": 0.6678078695824575, + "learning_rate": 0.00017280770208668065, + "loss": 12.4424, + "step": 9667 + }, + { + "epoch": 0.5264612389645572, + "grad_norm": 0.607173425284794, + "learning_rate": 0.00017280165692878848, + "loss": 12.378, + "step": 9668 + }, + { + "epoch": 0.5265156929611403, + "grad_norm": 0.6363930188981803, + "learning_rate": 0.00017279561120477844, + "loss": 12.3906, + "step": 9669 + }, + { + "epoch": 0.5265701469577233, + "grad_norm": 0.6234394296399584, + "learning_rate": 0.0001727895649146975, + "loss": 12.4683, + "step": 9670 + }, + { + "epoch": 0.5266246009543063, + "grad_norm": 0.5749083886394103, + "learning_rate": 0.0001727835180585927, + "loss": 12.4641, + "step": 9671 + }, + { + "epoch": 0.5266790549508893, + "grad_norm": 0.6452980338903855, + "learning_rate": 0.00017277747063651106, + "loss": 12.5147, + "step": 9672 + }, + { + "epoch": 0.5267335089474723, + "grad_norm": 0.5563195642011151, + "learning_rate": 0.0001727714226484996, + "loss": 12.3238, + "step": 9673 + }, + { + "epoch": 0.5267879629440553, + "grad_norm": 0.710159581261464, + "learning_rate": 0.00017276537409460533, + "loss": 12.4948, + "step": 9674 + }, + { + "epoch": 0.5268424169406384, + "grad_norm": 0.588714729325823, + "learning_rate": 0.00017275932497487536, + "loss": 12.3773, + "step": 9675 + }, + { + "epoch": 0.5268968709372214, + "grad_norm": 0.6465746286814458, + "learning_rate": 0.00017275327528935662, + "loss": 12.3818, + "step": 9676 + }, + { + "epoch": 0.5269513249338044, + "grad_norm": 0.6322551956059813, + "learning_rate": 0.00017274722503809624, + "loss": 12.4437, + "step": 9677 + }, + { + "epoch": 0.5270057789303874, + "grad_norm": 0.5644116231357436, + "learning_rate": 0.00017274117422114125, + "loss": 12.3734, + "step": 9678 + }, + { + "epoch": 0.5270602329269704, + "grad_norm": 0.6304177061479034, + "learning_rate": 0.0001727351228385387, + "loss": 12.4745, + "step": 9679 + }, + { + "epoch": 0.5271146869235533, + "grad_norm": 0.567406429966043, + "learning_rate": 0.00017272907089033559, + "loss": 12.4316, + "step": 9680 + }, + { + "epoch": 0.5271691409201364, + "grad_norm": 0.5429050725645992, + "learning_rate": 0.00017272301837657905, + "loss": 12.3625, + "step": 9681 + }, + { + "epoch": 0.5272235949167194, + "grad_norm": 0.6655180482040186, + "learning_rate": 0.00017271696529731612, + "loss": 12.5425, + "step": 9682 + }, + { + "epoch": 0.5272780489133024, + "grad_norm": 0.6268527508329086, + "learning_rate": 0.00017271091165259387, + "loss": 12.5065, + "step": 9683 + }, + { + "epoch": 0.5273325029098854, + "grad_norm": 0.5693725248078682, + "learning_rate": 0.0001727048574424594, + "loss": 12.3665, + "step": 9684 + }, + { + "epoch": 0.5273869569064684, + "grad_norm": 0.6043402385345585, + "learning_rate": 0.00017269880266695975, + "loss": 12.4099, + "step": 9685 + }, + { + "epoch": 0.5274414109030514, + "grad_norm": 0.6818348321281018, + "learning_rate": 0.00017269274732614203, + "loss": 12.2967, + "step": 9686 + }, + { + "epoch": 0.5274958648996345, + "grad_norm": 0.7589572456632353, + "learning_rate": 0.00017268669142005328, + "loss": 12.3437, + "step": 9687 + }, + { + "epoch": 0.5275503188962175, + "grad_norm": 0.701238628571967, + "learning_rate": 0.0001726806349487407, + "loss": 12.2364, + "step": 9688 + }, + { + "epoch": 0.5276047728928005, + "grad_norm": 0.6545375754439466, + "learning_rate": 0.00017267457791225125, + "loss": 12.5199, + "step": 9689 + }, + { + "epoch": 0.5276592268893835, + "grad_norm": 0.6524621393043788, + "learning_rate": 0.0001726685203106321, + "loss": 12.3886, + "step": 9690 + }, + { + "epoch": 0.5277136808859665, + "grad_norm": 0.6160583806710476, + "learning_rate": 0.00017266246214393038, + "loss": 12.3516, + "step": 9691 + }, + { + "epoch": 0.5277681348825496, + "grad_norm": 0.5785027843233695, + "learning_rate": 0.00017265640341219314, + "loss": 12.4351, + "step": 9692 + }, + { + "epoch": 0.5278225888791326, + "grad_norm": 0.6324907448675565, + "learning_rate": 0.00017265034411546753, + "loss": 12.4376, + "step": 9693 + }, + { + "epoch": 0.5278770428757156, + "grad_norm": 0.6337729746200753, + "learning_rate": 0.00017264428425380068, + "loss": 12.3728, + "step": 9694 + }, + { + "epoch": 0.5279314968722986, + "grad_norm": 0.5697052599210165, + "learning_rate": 0.0001726382238272397, + "loss": 12.3349, + "step": 9695 + }, + { + "epoch": 0.5279859508688816, + "grad_norm": 0.7966711094861342, + "learning_rate": 0.00017263216283583166, + "loss": 12.3451, + "step": 9696 + }, + { + "epoch": 0.5280404048654646, + "grad_norm": 0.5574509545166038, + "learning_rate": 0.0001726261012796238, + "loss": 12.4332, + "step": 9697 + }, + { + "epoch": 0.5280948588620477, + "grad_norm": 0.7417944850177249, + "learning_rate": 0.00017262003915866317, + "loss": 12.3422, + "step": 9698 + }, + { + "epoch": 0.5281493128586306, + "grad_norm": 0.7367901278701788, + "learning_rate": 0.0001726139764729969, + "loss": 12.3471, + "step": 9699 + }, + { + "epoch": 0.5282037668552136, + "grad_norm": 0.6468213805767332, + "learning_rate": 0.00017260791322267223, + "loss": 12.3907, + "step": 9700 + }, + { + "epoch": 0.5282582208517966, + "grad_norm": 0.5935120222843647, + "learning_rate": 0.00017260184940773621, + "loss": 12.3381, + "step": 9701 + }, + { + "epoch": 0.5283126748483796, + "grad_norm": 0.6189022965552061, + "learning_rate": 0.00017259578502823604, + "loss": 12.4009, + "step": 9702 + }, + { + "epoch": 0.5283671288449626, + "grad_norm": 0.6342628207844949, + "learning_rate": 0.0001725897200842189, + "loss": 12.3978, + "step": 9703 + }, + { + "epoch": 0.5284215828415457, + "grad_norm": 0.7407930942569653, + "learning_rate": 0.0001725836545757319, + "loss": 12.5439, + "step": 9704 + }, + { + "epoch": 0.5284760368381287, + "grad_norm": 0.6286378879070083, + "learning_rate": 0.00017257758850282225, + "loss": 12.3976, + "step": 9705 + }, + { + "epoch": 0.5285304908347117, + "grad_norm": 0.6695499155829157, + "learning_rate": 0.00017257152186553708, + "loss": 12.3466, + "step": 9706 + }, + { + "epoch": 0.5285849448312947, + "grad_norm": 0.6015846987937623, + "learning_rate": 0.00017256545466392357, + "loss": 12.2866, + "step": 9707 + }, + { + "epoch": 0.5286393988278777, + "grad_norm": 0.6078027280739121, + "learning_rate": 0.00017255938689802894, + "loss": 12.4379, + "step": 9708 + }, + { + "epoch": 0.5286938528244607, + "grad_norm": 0.638874690111006, + "learning_rate": 0.00017255331856790034, + "loss": 12.4772, + "step": 9709 + }, + { + "epoch": 0.5287483068210438, + "grad_norm": 0.672426844846369, + "learning_rate": 0.00017254724967358497, + "loss": 12.3671, + "step": 9710 + }, + { + "epoch": 0.5288027608176268, + "grad_norm": 0.6827308364374988, + "learning_rate": 0.00017254118021513005, + "loss": 12.405, + "step": 9711 + }, + { + "epoch": 0.5288572148142098, + "grad_norm": 0.6486871442666722, + "learning_rate": 0.00017253511019258273, + "loss": 12.4867, + "step": 9712 + }, + { + "epoch": 0.5289116688107928, + "grad_norm": 0.7229718641019813, + "learning_rate": 0.00017252903960599021, + "loss": 12.6646, + "step": 9713 + }, + { + "epoch": 0.5289661228073758, + "grad_norm": 0.5710992592647909, + "learning_rate": 0.00017252296845539974, + "loss": 12.3733, + "step": 9714 + }, + { + "epoch": 0.5290205768039588, + "grad_norm": 0.5865725811347708, + "learning_rate": 0.00017251689674085846, + "loss": 12.4057, + "step": 9715 + }, + { + "epoch": 0.5290750308005419, + "grad_norm": 0.574147815297626, + "learning_rate": 0.00017251082446241366, + "loss": 12.3963, + "step": 9716 + }, + { + "epoch": 0.5291294847971248, + "grad_norm": 0.5596637567178822, + "learning_rate": 0.00017250475162011256, + "loss": 12.31, + "step": 9717 + }, + { + "epoch": 0.5291839387937078, + "grad_norm": 0.83964741392583, + "learning_rate": 0.0001724986782140023, + "loss": 12.428, + "step": 9718 + }, + { + "epoch": 0.5292383927902908, + "grad_norm": 0.5941991496411796, + "learning_rate": 0.00017249260424413018, + "loss": 12.2539, + "step": 9719 + }, + { + "epoch": 0.5292928467868738, + "grad_norm": 0.6831774494563473, + "learning_rate": 0.0001724865297105434, + "loss": 12.4378, + "step": 9720 + }, + { + "epoch": 0.5293473007834569, + "grad_norm": 0.6542378395756087, + "learning_rate": 0.00017248045461328926, + "loss": 12.4948, + "step": 9721 + }, + { + "epoch": 0.5294017547800399, + "grad_norm": 0.8125360337002615, + "learning_rate": 0.0001724743789524149, + "loss": 12.5094, + "step": 9722 + }, + { + "epoch": 0.5294562087766229, + "grad_norm": 0.6991273418397774, + "learning_rate": 0.00017246830272796762, + "loss": 12.5143, + "step": 9723 + }, + { + "epoch": 0.5295106627732059, + "grad_norm": 0.5880633336171539, + "learning_rate": 0.00017246222593999468, + "loss": 12.3835, + "step": 9724 + }, + { + "epoch": 0.5295651167697889, + "grad_norm": 0.5804574850995645, + "learning_rate": 0.0001724561485885433, + "loss": 12.2713, + "step": 9725 + }, + { + "epoch": 0.5296195707663719, + "grad_norm": 0.7335910759132532, + "learning_rate": 0.0001724500706736608, + "loss": 12.3935, + "step": 9726 + }, + { + "epoch": 0.529674024762955, + "grad_norm": 0.5965335798603199, + "learning_rate": 0.00017244399219539436, + "loss": 12.3415, + "step": 9727 + }, + { + "epoch": 0.529728478759538, + "grad_norm": 0.5773773691831027, + "learning_rate": 0.0001724379131537913, + "loss": 12.3071, + "step": 9728 + }, + { + "epoch": 0.529782932756121, + "grad_norm": 0.5851148451591887, + "learning_rate": 0.00017243183354889887, + "loss": 12.4117, + "step": 9729 + }, + { + "epoch": 0.529837386752704, + "grad_norm": 0.8309450756095572, + "learning_rate": 0.00017242575338076435, + "loss": 12.4456, + "step": 9730 + }, + { + "epoch": 0.529891840749287, + "grad_norm": 0.6938917740148158, + "learning_rate": 0.00017241967264943506, + "loss": 12.3174, + "step": 9731 + }, + { + "epoch": 0.52994629474587, + "grad_norm": 0.5908224767547462, + "learning_rate": 0.00017241359135495822, + "loss": 12.4635, + "step": 9732 + }, + { + "epoch": 0.5300007487424531, + "grad_norm": 0.7278377385476976, + "learning_rate": 0.00017240750949738115, + "loss": 12.5118, + "step": 9733 + }, + { + "epoch": 0.5300552027390361, + "grad_norm": 0.6470128783211144, + "learning_rate": 0.00017240142707675117, + "loss": 12.4494, + "step": 9734 + }, + { + "epoch": 0.530109656735619, + "grad_norm": 0.6891827007678933, + "learning_rate": 0.00017239534409311554, + "loss": 12.4469, + "step": 9735 + }, + { + "epoch": 0.530164110732202, + "grad_norm": 0.6342013943166225, + "learning_rate": 0.00017238926054652157, + "loss": 12.3601, + "step": 9736 + }, + { + "epoch": 0.530218564728785, + "grad_norm": 0.6399841926597989, + "learning_rate": 0.00017238317643701655, + "loss": 12.425, + "step": 9737 + }, + { + "epoch": 0.530273018725368, + "grad_norm": 0.6107766450079387, + "learning_rate": 0.00017237709176464783, + "loss": 12.4542, + "step": 9738 + }, + { + "epoch": 0.5303274727219511, + "grad_norm": 0.5742461087396051, + "learning_rate": 0.00017237100652946273, + "loss": 12.3553, + "step": 9739 + }, + { + "epoch": 0.5303819267185341, + "grad_norm": 0.645047834726115, + "learning_rate": 0.00017236492073150852, + "loss": 12.306, + "step": 9740 + }, + { + "epoch": 0.5304363807151171, + "grad_norm": 1.1717071985382235, + "learning_rate": 0.00017235883437083252, + "loss": 12.4642, + "step": 9741 + }, + { + "epoch": 0.5304908347117001, + "grad_norm": 0.5822341358166635, + "learning_rate": 0.00017235274744748216, + "loss": 12.3577, + "step": 9742 + }, + { + "epoch": 0.5305452887082831, + "grad_norm": 0.6196614983883354, + "learning_rate": 0.00017234665996150464, + "loss": 12.4747, + "step": 9743 + }, + { + "epoch": 0.5305997427048661, + "grad_norm": 0.7803176794795792, + "learning_rate": 0.00017234057191294739, + "loss": 12.3551, + "step": 9744 + }, + { + "epoch": 0.5306541967014492, + "grad_norm": 0.5862024454149802, + "learning_rate": 0.00017233448330185774, + "loss": 12.2299, + "step": 9745 + }, + { + "epoch": 0.5307086506980322, + "grad_norm": 0.7186574421139408, + "learning_rate": 0.00017232839412828298, + "loss": 12.3433, + "step": 9746 + }, + { + "epoch": 0.5307631046946152, + "grad_norm": 0.7315895849141418, + "learning_rate": 0.0001723223043922705, + "loss": 12.3404, + "step": 9747 + }, + { + "epoch": 0.5308175586911982, + "grad_norm": 0.667927166615274, + "learning_rate": 0.00017231621409386766, + "loss": 12.5153, + "step": 9748 + }, + { + "epoch": 0.5308720126877812, + "grad_norm": 0.7132958588280709, + "learning_rate": 0.00017231012323312182, + "loss": 12.4401, + "step": 9749 + }, + { + "epoch": 0.5309264666843642, + "grad_norm": 0.9607096123762794, + "learning_rate": 0.00017230403181008034, + "loss": 12.4612, + "step": 9750 + }, + { + "epoch": 0.5309809206809473, + "grad_norm": 0.5842123349197016, + "learning_rate": 0.00017229793982479058, + "loss": 12.3212, + "step": 9751 + }, + { + "epoch": 0.5310353746775303, + "grad_norm": 0.6038032722093865, + "learning_rate": 0.0001722918472772999, + "loss": 12.3025, + "step": 9752 + }, + { + "epoch": 0.5310898286741133, + "grad_norm": 0.6729744395491271, + "learning_rate": 0.0001722857541676557, + "loss": 12.4765, + "step": 9753 + }, + { + "epoch": 0.5311442826706962, + "grad_norm": 0.6006912560201526, + "learning_rate": 0.00017227966049590535, + "loss": 12.4064, + "step": 9754 + }, + { + "epoch": 0.5311987366672792, + "grad_norm": 0.6781132871835124, + "learning_rate": 0.00017227356626209626, + "loss": 12.3, + "step": 9755 + }, + { + "epoch": 0.5312531906638623, + "grad_norm": 0.6127591248377919, + "learning_rate": 0.0001722674714662758, + "loss": 12.4081, + "step": 9756 + }, + { + "epoch": 0.5313076446604453, + "grad_norm": 0.5918857358398967, + "learning_rate": 0.00017226137610849133, + "loss": 12.3187, + "step": 9757 + }, + { + "epoch": 0.5313620986570283, + "grad_norm": 0.6277227353304781, + "learning_rate": 0.00017225528018879028, + "loss": 12.3859, + "step": 9758 + }, + { + "epoch": 0.5314165526536113, + "grad_norm": 0.634504921022194, + "learning_rate": 0.0001722491837072201, + "loss": 12.3973, + "step": 9759 + }, + { + "epoch": 0.5314710066501943, + "grad_norm": 0.662465830818154, + "learning_rate": 0.0001722430866638281, + "loss": 12.3916, + "step": 9760 + }, + { + "epoch": 0.5315254606467773, + "grad_norm": 0.5891039439024768, + "learning_rate": 0.00017223698905866178, + "loss": 12.3908, + "step": 9761 + }, + { + "epoch": 0.5315799146433604, + "grad_norm": 0.6549544766014891, + "learning_rate": 0.0001722308908917685, + "loss": 12.4351, + "step": 9762 + }, + { + "epoch": 0.5316343686399434, + "grad_norm": 0.6505523821972475, + "learning_rate": 0.0001722247921631957, + "loss": 12.3605, + "step": 9763 + }, + { + "epoch": 0.5316888226365264, + "grad_norm": 0.6311228049331078, + "learning_rate": 0.0001722186928729908, + "loss": 12.342, + "step": 9764 + }, + { + "epoch": 0.5317432766331094, + "grad_norm": 1.0094269914095666, + "learning_rate": 0.00017221259302120125, + "loss": 12.3917, + "step": 9765 + }, + { + "epoch": 0.5317977306296924, + "grad_norm": 0.6350641093722831, + "learning_rate": 0.00017220649260787444, + "loss": 12.3297, + "step": 9766 + }, + { + "epoch": 0.5318521846262754, + "grad_norm": 0.5972895412440087, + "learning_rate": 0.00017220039163305786, + "loss": 12.3677, + "step": 9767 + }, + { + "epoch": 0.5319066386228585, + "grad_norm": 0.5975628558195795, + "learning_rate": 0.0001721942900967989, + "loss": 12.3943, + "step": 9768 + }, + { + "epoch": 0.5319610926194415, + "grad_norm": 0.6235024665581594, + "learning_rate": 0.00017218818799914507, + "loss": 12.4072, + "step": 9769 + }, + { + "epoch": 0.5320155466160245, + "grad_norm": 0.5994943679323929, + "learning_rate": 0.00017218208534014378, + "loss": 12.4792, + "step": 9770 + }, + { + "epoch": 0.5320700006126075, + "grad_norm": 0.663856021291008, + "learning_rate": 0.00017217598211984248, + "loss": 12.1824, + "step": 9771 + }, + { + "epoch": 0.5321244546091904, + "grad_norm": 0.6260830279897032, + "learning_rate": 0.00017216987833828861, + "loss": 12.2323, + "step": 9772 + }, + { + "epoch": 0.5321789086057734, + "grad_norm": 0.6039866668740309, + "learning_rate": 0.0001721637739955297, + "loss": 12.4352, + "step": 9773 + }, + { + "epoch": 0.5322333626023565, + "grad_norm": 0.6712571763891201, + "learning_rate": 0.00017215766909161318, + "loss": 12.4429, + "step": 9774 + }, + { + "epoch": 0.5322878165989395, + "grad_norm": 0.5770603415893366, + "learning_rate": 0.0001721515636265865, + "loss": 12.2967, + "step": 9775 + }, + { + "epoch": 0.5323422705955225, + "grad_norm": 0.6857828943633366, + "learning_rate": 0.0001721454576004972, + "loss": 12.5149, + "step": 9776 + }, + { + "epoch": 0.5323967245921055, + "grad_norm": 0.6672425660206375, + "learning_rate": 0.00017213935101339267, + "loss": 12.4305, + "step": 9777 + }, + { + "epoch": 0.5324511785886885, + "grad_norm": 0.7012997332857596, + "learning_rate": 0.00017213324386532047, + "loss": 12.259, + "step": 9778 + }, + { + "epoch": 0.5325056325852715, + "grad_norm": 0.6089271523873652, + "learning_rate": 0.0001721271361563281, + "loss": 12.3164, + "step": 9779 + }, + { + "epoch": 0.5325600865818546, + "grad_norm": 0.6629686692249517, + "learning_rate": 0.00017212102788646298, + "loss": 12.4173, + "step": 9780 + }, + { + "epoch": 0.5326145405784376, + "grad_norm": 0.5903774965381847, + "learning_rate": 0.00017211491905577266, + "loss": 12.2993, + "step": 9781 + }, + { + "epoch": 0.5326689945750206, + "grad_norm": 0.6902349289697002, + "learning_rate": 0.00017210880966430465, + "loss": 12.3484, + "step": 9782 + }, + { + "epoch": 0.5327234485716036, + "grad_norm": 0.6697846969405893, + "learning_rate": 0.00017210269971210644, + "loss": 12.4456, + "step": 9783 + }, + { + "epoch": 0.5327779025681866, + "grad_norm": 0.6201544903874386, + "learning_rate": 0.00017209658919922554, + "loss": 12.4618, + "step": 9784 + }, + { + "epoch": 0.5328323565647696, + "grad_norm": 0.6438666747424316, + "learning_rate": 0.00017209047812570948, + "loss": 12.3211, + "step": 9785 + }, + { + "epoch": 0.5328868105613527, + "grad_norm": 0.7061246526826793, + "learning_rate": 0.00017208436649160578, + "loss": 12.5599, + "step": 9786 + }, + { + "epoch": 0.5329412645579357, + "grad_norm": 0.6512120724465694, + "learning_rate": 0.00017207825429696195, + "loss": 12.3956, + "step": 9787 + }, + { + "epoch": 0.5329957185545187, + "grad_norm": 0.6008826656968721, + "learning_rate": 0.00017207214154182552, + "loss": 12.3315, + "step": 9788 + }, + { + "epoch": 0.5330501725511017, + "grad_norm": 0.6151984110148345, + "learning_rate": 0.00017206602822624403, + "loss": 12.3331, + "step": 9789 + }, + { + "epoch": 0.5331046265476846, + "grad_norm": 0.7381432926540967, + "learning_rate": 0.00017205991435026503, + "loss": 12.2305, + "step": 9790 + }, + { + "epoch": 0.5331590805442677, + "grad_norm": 0.5831377313507006, + "learning_rate": 0.00017205379991393603, + "loss": 12.4046, + "step": 9791 + }, + { + "epoch": 0.5332135345408507, + "grad_norm": 0.7278580392286879, + "learning_rate": 0.00017204768491730464, + "loss": 12.3752, + "step": 9792 + }, + { + "epoch": 0.5332679885374337, + "grad_norm": 0.6868404243386019, + "learning_rate": 0.00017204156936041832, + "loss": 12.475, + "step": 9793 + }, + { + "epoch": 0.5333224425340167, + "grad_norm": 0.6386140397717891, + "learning_rate": 0.00017203545324332472, + "loss": 12.3795, + "step": 9794 + }, + { + "epoch": 0.5333768965305997, + "grad_norm": 0.741678294977336, + "learning_rate": 0.00017202933656607132, + "loss": 12.3816, + "step": 9795 + }, + { + "epoch": 0.5334313505271827, + "grad_norm": 0.6622029228786622, + "learning_rate": 0.00017202321932870577, + "loss": 12.5115, + "step": 9796 + }, + { + "epoch": 0.5334858045237658, + "grad_norm": 0.6693469010395893, + "learning_rate": 0.00017201710153127552, + "loss": 12.2848, + "step": 9797 + }, + { + "epoch": 0.5335402585203488, + "grad_norm": 0.6947314114981221, + "learning_rate": 0.00017201098317382824, + "loss": 12.4751, + "step": 9798 + }, + { + "epoch": 0.5335947125169318, + "grad_norm": 0.6436666397792562, + "learning_rate": 0.0001720048642564115, + "loss": 12.4354, + "step": 9799 + }, + { + "epoch": 0.5336491665135148, + "grad_norm": 0.587771081254023, + "learning_rate": 0.00017199874477907283, + "loss": 12.3793, + "step": 9800 + }, + { + "epoch": 0.5337036205100978, + "grad_norm": 0.6547024454832397, + "learning_rate": 0.00017199262474185988, + "loss": 12.4499, + "step": 9801 + }, + { + "epoch": 0.5337580745066808, + "grad_norm": 0.6429260236613371, + "learning_rate": 0.00017198650414482019, + "loss": 12.3718, + "step": 9802 + }, + { + "epoch": 0.5338125285032639, + "grad_norm": 0.6317710786618224, + "learning_rate": 0.00017198038298800136, + "loss": 12.3614, + "step": 9803 + }, + { + "epoch": 0.5338669824998469, + "grad_norm": 0.6850677083619697, + "learning_rate": 0.000171974261271451, + "loss": 12.3093, + "step": 9804 + }, + { + "epoch": 0.5339214364964299, + "grad_norm": 0.6419486047294934, + "learning_rate": 0.00017196813899521672, + "loss": 12.3757, + "step": 9805 + }, + { + "epoch": 0.5339758904930129, + "grad_norm": 0.6135614860091514, + "learning_rate": 0.00017196201615934614, + "loss": 12.3681, + "step": 9806 + }, + { + "epoch": 0.5340303444895959, + "grad_norm": 0.7531851068565291, + "learning_rate": 0.00017195589276388683, + "loss": 12.5129, + "step": 9807 + }, + { + "epoch": 0.5340847984861788, + "grad_norm": 0.6420256498410256, + "learning_rate": 0.00017194976880888642, + "loss": 12.4711, + "step": 9808 + }, + { + "epoch": 0.534139252482762, + "grad_norm": 0.5705655579112456, + "learning_rate": 0.00017194364429439258, + "loss": 12.3657, + "step": 9809 + }, + { + "epoch": 0.5341937064793449, + "grad_norm": 0.6295191539717218, + "learning_rate": 0.00017193751922045286, + "loss": 12.4479, + "step": 9810 + }, + { + "epoch": 0.5342481604759279, + "grad_norm": 0.6177632784310391, + "learning_rate": 0.00017193139358711497, + "loss": 12.477, + "step": 9811 + }, + { + "epoch": 0.5343026144725109, + "grad_norm": 0.6854705931839781, + "learning_rate": 0.00017192526739442647, + "loss": 12.4645, + "step": 9812 + }, + { + "epoch": 0.5343570684690939, + "grad_norm": 0.613644425046222, + "learning_rate": 0.000171919140642435, + "loss": 12.4764, + "step": 9813 + }, + { + "epoch": 0.5344115224656769, + "grad_norm": 0.5929606787046219, + "learning_rate": 0.0001719130133311883, + "loss": 12.4075, + "step": 9814 + }, + { + "epoch": 0.53446597646226, + "grad_norm": 0.6068627512682941, + "learning_rate": 0.0001719068854607339, + "loss": 12.3778, + "step": 9815 + }, + { + "epoch": 0.534520430458843, + "grad_norm": 0.6383825000704191, + "learning_rate": 0.00017190075703111952, + "loss": 12.4257, + "step": 9816 + }, + { + "epoch": 0.534574884455426, + "grad_norm": 0.630913310539644, + "learning_rate": 0.0001718946280423928, + "loss": 12.3683, + "step": 9817 + }, + { + "epoch": 0.534629338452009, + "grad_norm": 0.6614522218218793, + "learning_rate": 0.00017188849849460137, + "loss": 12.3796, + "step": 9818 + }, + { + "epoch": 0.534683792448592, + "grad_norm": 0.6034520952779118, + "learning_rate": 0.00017188236838779295, + "loss": 12.3671, + "step": 9819 + }, + { + "epoch": 0.534738246445175, + "grad_norm": 0.6531549455633217, + "learning_rate": 0.00017187623772201515, + "loss": 12.368, + "step": 9820 + }, + { + "epoch": 0.5347927004417581, + "grad_norm": 0.5538706811573829, + "learning_rate": 0.0001718701064973157, + "loss": 12.3199, + "step": 9821 + }, + { + "epoch": 0.5348471544383411, + "grad_norm": 0.6744276782703781, + "learning_rate": 0.00017186397471374222, + "loss": 12.5405, + "step": 9822 + }, + { + "epoch": 0.5349016084349241, + "grad_norm": 0.6165108152495673, + "learning_rate": 0.00017185784237134244, + "loss": 12.4017, + "step": 9823 + }, + { + "epoch": 0.5349560624315071, + "grad_norm": 0.6047927725391143, + "learning_rate": 0.00017185170947016403, + "loss": 12.4021, + "step": 9824 + }, + { + "epoch": 0.53501051642809, + "grad_norm": 0.677679534013877, + "learning_rate": 0.0001718455760102547, + "loss": 12.3044, + "step": 9825 + }, + { + "epoch": 0.5350649704246732, + "grad_norm": 0.5815455964482686, + "learning_rate": 0.00017183944199166207, + "loss": 12.4053, + "step": 9826 + }, + { + "epoch": 0.5351194244212562, + "grad_norm": 0.6780133563415122, + "learning_rate": 0.00017183330741443392, + "loss": 12.285, + "step": 9827 + }, + { + "epoch": 0.5351738784178391, + "grad_norm": 0.662949576246597, + "learning_rate": 0.0001718271722786179, + "loss": 12.4557, + "step": 9828 + }, + { + "epoch": 0.5352283324144221, + "grad_norm": 0.6269948079958946, + "learning_rate": 0.00017182103658426175, + "loss": 12.3465, + "step": 9829 + }, + { + "epoch": 0.5352827864110051, + "grad_norm": 0.846084103696285, + "learning_rate": 0.00017181490033141322, + "loss": 12.31, + "step": 9830 + }, + { + "epoch": 0.5353372404075881, + "grad_norm": 0.54992669181176, + "learning_rate": 0.00017180876352011995, + "loss": 12.2473, + "step": 9831 + }, + { + "epoch": 0.5353916944041712, + "grad_norm": 0.7478928913354632, + "learning_rate": 0.0001718026261504297, + "loss": 12.3651, + "step": 9832 + }, + { + "epoch": 0.5354461484007542, + "grad_norm": 0.7367492449815537, + "learning_rate": 0.00017179648822239016, + "loss": 12.4024, + "step": 9833 + }, + { + "epoch": 0.5355006023973372, + "grad_norm": 0.5685099530599846, + "learning_rate": 0.00017179034973604913, + "loss": 12.4241, + "step": 9834 + }, + { + "epoch": 0.5355550563939202, + "grad_norm": 0.849747866660649, + "learning_rate": 0.00017178421069145427, + "loss": 12.4344, + "step": 9835 + }, + { + "epoch": 0.5356095103905032, + "grad_norm": 0.6246212097436993, + "learning_rate": 0.00017177807108865336, + "loss": 12.2952, + "step": 9836 + }, + { + "epoch": 0.5356639643870862, + "grad_norm": 0.6160026522839797, + "learning_rate": 0.00017177193092769412, + "loss": 12.3952, + "step": 9837 + }, + { + "epoch": 0.5357184183836693, + "grad_norm": 0.6626744999205778, + "learning_rate": 0.0001717657902086243, + "loss": 12.23, + "step": 9838 + }, + { + "epoch": 0.5357728723802523, + "grad_norm": 0.6631853663477942, + "learning_rate": 0.0001717596489314917, + "loss": 12.4028, + "step": 9839 + }, + { + "epoch": 0.5358273263768353, + "grad_norm": 0.603045212597613, + "learning_rate": 0.00017175350709634402, + "loss": 12.3879, + "step": 9840 + }, + { + "epoch": 0.5358817803734183, + "grad_norm": 0.7324693325573454, + "learning_rate": 0.000171747364703229, + "loss": 12.345, + "step": 9841 + }, + { + "epoch": 0.5359362343700013, + "grad_norm": 0.6550016515664254, + "learning_rate": 0.00017174122175219448, + "loss": 12.4589, + "step": 9842 + }, + { + "epoch": 0.5359906883665843, + "grad_norm": 0.641150171504333, + "learning_rate": 0.00017173507824328819, + "loss": 12.3711, + "step": 9843 + }, + { + "epoch": 0.5360451423631674, + "grad_norm": 0.686298156521777, + "learning_rate": 0.00017172893417655792, + "loss": 12.2025, + "step": 9844 + }, + { + "epoch": 0.5360995963597504, + "grad_norm": 0.7136610616206259, + "learning_rate": 0.00017172278955205136, + "loss": 12.6081, + "step": 9845 + }, + { + "epoch": 0.5361540503563333, + "grad_norm": 0.600625392297172, + "learning_rate": 0.00017171664436981644, + "loss": 12.3953, + "step": 9846 + }, + { + "epoch": 0.5362085043529163, + "grad_norm": 0.6301836378631536, + "learning_rate": 0.00017171049862990082, + "loss": 12.0426, + "step": 9847 + }, + { + "epoch": 0.5362629583494993, + "grad_norm": 0.6714064351021483, + "learning_rate": 0.00017170435233235235, + "loss": 12.2334, + "step": 9848 + }, + { + "epoch": 0.5363174123460823, + "grad_norm": 0.706595530166399, + "learning_rate": 0.0001716982054772188, + "loss": 12.3248, + "step": 9849 + }, + { + "epoch": 0.5363718663426654, + "grad_norm": 0.9192217290029557, + "learning_rate": 0.00017169205806454797, + "loss": 12.5167, + "step": 9850 + }, + { + "epoch": 0.5364263203392484, + "grad_norm": 0.6517521190636777, + "learning_rate": 0.0001716859100943877, + "loss": 12.416, + "step": 9851 + }, + { + "epoch": 0.5364807743358314, + "grad_norm": 0.5578077998753249, + "learning_rate": 0.00017167976156678576, + "loss": 12.3878, + "step": 9852 + }, + { + "epoch": 0.5365352283324144, + "grad_norm": 0.5882604834303083, + "learning_rate": 0.00017167361248178996, + "loss": 12.3207, + "step": 9853 + }, + { + "epoch": 0.5365896823289974, + "grad_norm": 0.609667657721763, + "learning_rate": 0.00017166746283944816, + "loss": 12.4551, + "step": 9854 + }, + { + "epoch": 0.5366441363255805, + "grad_norm": 0.582794617796146, + "learning_rate": 0.00017166131263980812, + "loss": 12.4657, + "step": 9855 + }, + { + "epoch": 0.5366985903221635, + "grad_norm": 0.6743278602101175, + "learning_rate": 0.00017165516188291774, + "loss": 12.4754, + "step": 9856 + }, + { + "epoch": 0.5367530443187465, + "grad_norm": 0.615568670619058, + "learning_rate": 0.00017164901056882474, + "loss": 12.4393, + "step": 9857 + }, + { + "epoch": 0.5368074983153295, + "grad_norm": 0.6096022612379464, + "learning_rate": 0.00017164285869757705, + "loss": 12.3968, + "step": 9858 + }, + { + "epoch": 0.5368619523119125, + "grad_norm": 0.6849749417805655, + "learning_rate": 0.0001716367062692225, + "loss": 12.3788, + "step": 9859 + }, + { + "epoch": 0.5369164063084955, + "grad_norm": 0.6798822650795383, + "learning_rate": 0.0001716305532838089, + "loss": 12.3282, + "step": 9860 + }, + { + "epoch": 0.5369708603050786, + "grad_norm": 0.6290777902183159, + "learning_rate": 0.00017162439974138406, + "loss": 12.2592, + "step": 9861 + }, + { + "epoch": 0.5370253143016616, + "grad_norm": 0.6034768067187262, + "learning_rate": 0.0001716182456419959, + "loss": 12.2475, + "step": 9862 + }, + { + "epoch": 0.5370797682982446, + "grad_norm": 0.5760830659366515, + "learning_rate": 0.00017161209098569228, + "loss": 12.2946, + "step": 9863 + }, + { + "epoch": 0.5371342222948275, + "grad_norm": 0.7272255772367627, + "learning_rate": 0.00017160593577252102, + "loss": 12.2768, + "step": 9864 + }, + { + "epoch": 0.5371886762914105, + "grad_norm": 0.6931591166323151, + "learning_rate": 0.00017159978000252997, + "loss": 12.4305, + "step": 9865 + }, + { + "epoch": 0.5372431302879935, + "grad_norm": 0.5948832222548013, + "learning_rate": 0.00017159362367576706, + "loss": 12.4153, + "step": 9866 + }, + { + "epoch": 0.5372975842845766, + "grad_norm": 0.5757398920114913, + "learning_rate": 0.0001715874667922801, + "loss": 12.4143, + "step": 9867 + }, + { + "epoch": 0.5373520382811596, + "grad_norm": 0.6099368336620453, + "learning_rate": 0.00017158130935211697, + "loss": 12.4317, + "step": 9868 + }, + { + "epoch": 0.5374064922777426, + "grad_norm": 0.577534146116071, + "learning_rate": 0.00017157515135532563, + "loss": 12.4152, + "step": 9869 + }, + { + "epoch": 0.5374609462743256, + "grad_norm": 0.6264027513040179, + "learning_rate": 0.00017156899280195388, + "loss": 12.4539, + "step": 9870 + }, + { + "epoch": 0.5375154002709086, + "grad_norm": 0.5972610104177685, + "learning_rate": 0.00017156283369204964, + "loss": 12.4466, + "step": 9871 + }, + { + "epoch": 0.5375698542674916, + "grad_norm": 0.6230837192559272, + "learning_rate": 0.00017155667402566081, + "loss": 12.3657, + "step": 9872 + }, + { + "epoch": 0.5376243082640747, + "grad_norm": 0.587195318824459, + "learning_rate": 0.00017155051380283526, + "loss": 12.4126, + "step": 9873 + }, + { + "epoch": 0.5376787622606577, + "grad_norm": 0.6135653893384941, + "learning_rate": 0.00017154435302362098, + "loss": 12.5026, + "step": 9874 + }, + { + "epoch": 0.5377332162572407, + "grad_norm": 0.7244856558646927, + "learning_rate": 0.00017153819168806575, + "loss": 12.3259, + "step": 9875 + }, + { + "epoch": 0.5377876702538237, + "grad_norm": 0.6456028771473306, + "learning_rate": 0.00017153202979621756, + "loss": 12.4228, + "step": 9876 + }, + { + "epoch": 0.5378421242504067, + "grad_norm": 0.6730816084709884, + "learning_rate": 0.00017152586734812432, + "loss": 12.3757, + "step": 9877 + }, + { + "epoch": 0.5378965782469897, + "grad_norm": 0.6794971975431958, + "learning_rate": 0.00017151970434383393, + "loss": 12.2723, + "step": 9878 + }, + { + "epoch": 0.5379510322435728, + "grad_norm": 0.6763756419411008, + "learning_rate": 0.0001715135407833943, + "loss": 12.2668, + "step": 9879 + }, + { + "epoch": 0.5380054862401558, + "grad_norm": 0.6493172433037002, + "learning_rate": 0.00017150737666685344, + "loss": 12.2866, + "step": 9880 + }, + { + "epoch": 0.5380599402367388, + "grad_norm": 0.59240170327952, + "learning_rate": 0.0001715012119942592, + "loss": 12.3289, + "step": 9881 + }, + { + "epoch": 0.5381143942333217, + "grad_norm": 0.7180125834990535, + "learning_rate": 0.00017149504676565954, + "loss": 12.4702, + "step": 9882 + }, + { + "epoch": 0.5381688482299047, + "grad_norm": 0.6399415374368775, + "learning_rate": 0.00017148888098110244, + "loss": 12.333, + "step": 9883 + }, + { + "epoch": 0.5382233022264877, + "grad_norm": 0.6806379391970908, + "learning_rate": 0.00017148271464063574, + "loss": 12.4301, + "step": 9884 + }, + { + "epoch": 0.5382777562230708, + "grad_norm": 0.687678562545243, + "learning_rate": 0.00017147654774430753, + "loss": 12.289, + "step": 9885 + }, + { + "epoch": 0.5383322102196538, + "grad_norm": 0.7406076855620836, + "learning_rate": 0.00017147038029216566, + "loss": 12.5046, + "step": 9886 + }, + { + "epoch": 0.5383866642162368, + "grad_norm": 0.7058845572303966, + "learning_rate": 0.00017146421228425815, + "loss": 12.3471, + "step": 9887 + }, + { + "epoch": 0.5384411182128198, + "grad_norm": 0.6568949054306508, + "learning_rate": 0.00017145804372063295, + "loss": 12.4707, + "step": 9888 + }, + { + "epoch": 0.5384955722094028, + "grad_norm": 0.8311696208643002, + "learning_rate": 0.000171451874601338, + "loss": 12.5002, + "step": 9889 + }, + { + "epoch": 0.5385500262059859, + "grad_norm": 0.7688399949416185, + "learning_rate": 0.00017144570492642127, + "loss": 12.1362, + "step": 9890 + }, + { + "epoch": 0.5386044802025689, + "grad_norm": 0.7953307230082599, + "learning_rate": 0.0001714395346959308, + "loss": 12.5111, + "step": 9891 + }, + { + "epoch": 0.5386589341991519, + "grad_norm": 0.6708137368403178, + "learning_rate": 0.00017143336390991451, + "loss": 12.3523, + "step": 9892 + }, + { + "epoch": 0.5387133881957349, + "grad_norm": 0.6617697777443491, + "learning_rate": 0.0001714271925684204, + "loss": 12.3565, + "step": 9893 + }, + { + "epoch": 0.5387678421923179, + "grad_norm": 0.6677088180209257, + "learning_rate": 0.00017142102067149647, + "loss": 12.4431, + "step": 9894 + }, + { + "epoch": 0.5388222961889009, + "grad_norm": 0.583972439542916, + "learning_rate": 0.00017141484821919068, + "loss": 12.4952, + "step": 9895 + }, + { + "epoch": 0.538876750185484, + "grad_norm": 0.717347607231282, + "learning_rate": 0.0001714086752115511, + "loss": 12.3808, + "step": 9896 + }, + { + "epoch": 0.538931204182067, + "grad_norm": 0.6595704544058548, + "learning_rate": 0.00017140250164862563, + "loss": 12.4887, + "step": 9897 + }, + { + "epoch": 0.53898565817865, + "grad_norm": 0.6045318794148249, + "learning_rate": 0.00017139632753046237, + "loss": 12.3262, + "step": 9898 + }, + { + "epoch": 0.539040112175233, + "grad_norm": 0.6248683294042436, + "learning_rate": 0.0001713901528571093, + "loss": 12.3629, + "step": 9899 + }, + { + "epoch": 0.539094566171816, + "grad_norm": 0.6558946098725765, + "learning_rate": 0.0001713839776286144, + "loss": 12.5053, + "step": 9900 + }, + { + "epoch": 0.5391490201683989, + "grad_norm": 0.6491145532761589, + "learning_rate": 0.00017137780184502574, + "loss": 12.4372, + "step": 9901 + }, + { + "epoch": 0.539203474164982, + "grad_norm": 0.6929592259968642, + "learning_rate": 0.0001713716255063913, + "loss": 12.3385, + "step": 9902 + }, + { + "epoch": 0.539257928161565, + "grad_norm": 0.6085672631263342, + "learning_rate": 0.00017136544861275917, + "loss": 12.3886, + "step": 9903 + }, + { + "epoch": 0.539312382158148, + "grad_norm": 0.5951696288345016, + "learning_rate": 0.0001713592711641773, + "loss": 12.2177, + "step": 9904 + }, + { + "epoch": 0.539366836154731, + "grad_norm": 0.6179769387240053, + "learning_rate": 0.00017135309316069382, + "loss": 12.3241, + "step": 9905 + }, + { + "epoch": 0.539421290151314, + "grad_norm": 0.6229249331386542, + "learning_rate": 0.00017134691460235667, + "loss": 12.386, + "step": 9906 + }, + { + "epoch": 0.539475744147897, + "grad_norm": 0.6238149637276051, + "learning_rate": 0.000171340735489214, + "loss": 12.4721, + "step": 9907 + }, + { + "epoch": 0.5395301981444801, + "grad_norm": 0.6983604951623149, + "learning_rate": 0.00017133455582131374, + "loss": 12.4289, + "step": 9908 + }, + { + "epoch": 0.5395846521410631, + "grad_norm": 0.6441007491976519, + "learning_rate": 0.00017132837559870407, + "loss": 12.4204, + "step": 9909 + }, + { + "epoch": 0.5396391061376461, + "grad_norm": 0.5910950723368182, + "learning_rate": 0.00017132219482143298, + "loss": 12.5011, + "step": 9910 + }, + { + "epoch": 0.5396935601342291, + "grad_norm": 0.6216212968450316, + "learning_rate": 0.00017131601348954853, + "loss": 12.4437, + "step": 9911 + }, + { + "epoch": 0.5397480141308121, + "grad_norm": 0.5852352225909889, + "learning_rate": 0.0001713098316030988, + "loss": 12.2003, + "step": 9912 + }, + { + "epoch": 0.5398024681273951, + "grad_norm": 0.6166938464575757, + "learning_rate": 0.00017130364916213186, + "loss": 12.3618, + "step": 9913 + }, + { + "epoch": 0.5398569221239782, + "grad_norm": 0.6585778033684443, + "learning_rate": 0.00017129746616669576, + "loss": 12.4934, + "step": 9914 + }, + { + "epoch": 0.5399113761205612, + "grad_norm": 0.5912480969666162, + "learning_rate": 0.00017129128261683863, + "loss": 12.3631, + "step": 9915 + }, + { + "epoch": 0.5399658301171442, + "grad_norm": 0.580616927395157, + "learning_rate": 0.00017128509851260858, + "loss": 12.4729, + "step": 9916 + }, + { + "epoch": 0.5400202841137272, + "grad_norm": 0.6584369629337092, + "learning_rate": 0.0001712789138540536, + "loss": 12.4848, + "step": 9917 + }, + { + "epoch": 0.5400747381103101, + "grad_norm": 0.6410516271365128, + "learning_rate": 0.0001712727286412218, + "loss": 12.1675, + "step": 9918 + }, + { + "epoch": 0.5401291921068931, + "grad_norm": 0.6732803345789796, + "learning_rate": 0.00017126654287416137, + "loss": 12.3771, + "step": 9919 + }, + { + "epoch": 0.5401836461034762, + "grad_norm": 0.733088340455232, + "learning_rate": 0.0001712603565529203, + "loss": 12.5004, + "step": 9920 + }, + { + "epoch": 0.5402381001000592, + "grad_norm": 0.580679752904814, + "learning_rate": 0.0001712541696775468, + "loss": 12.3632, + "step": 9921 + }, + { + "epoch": 0.5402925540966422, + "grad_norm": 0.6136950209253756, + "learning_rate": 0.00017124798224808888, + "loss": 12.3828, + "step": 9922 + }, + { + "epoch": 0.5403470080932252, + "grad_norm": 0.666839386087224, + "learning_rate": 0.00017124179426459475, + "loss": 12.3126, + "step": 9923 + }, + { + "epoch": 0.5404014620898082, + "grad_norm": 0.5709082525476437, + "learning_rate": 0.00017123560572711245, + "loss": 12.3103, + "step": 9924 + }, + { + "epoch": 0.5404559160863913, + "grad_norm": 0.7116277072509676, + "learning_rate": 0.00017122941663569012, + "loss": 12.4852, + "step": 9925 + }, + { + "epoch": 0.5405103700829743, + "grad_norm": 0.6497855971920484, + "learning_rate": 0.00017122322699037593, + "loss": 12.441, + "step": 9926 + }, + { + "epoch": 0.5405648240795573, + "grad_norm": 0.671279944463193, + "learning_rate": 0.00017121703679121798, + "loss": 12.3126, + "step": 9927 + }, + { + "epoch": 0.5406192780761403, + "grad_norm": 0.5639881985662691, + "learning_rate": 0.00017121084603826438, + "loss": 12.3626, + "step": 9928 + }, + { + "epoch": 0.5406737320727233, + "grad_norm": 0.6615971952124131, + "learning_rate": 0.00017120465473156334, + "loss": 12.501, + "step": 9929 + }, + { + "epoch": 0.5407281860693063, + "grad_norm": 0.617198644141775, + "learning_rate": 0.00017119846287116296, + "loss": 12.466, + "step": 9930 + }, + { + "epoch": 0.5407826400658894, + "grad_norm": 0.6471490533856712, + "learning_rate": 0.00017119227045711135, + "loss": 12.3989, + "step": 9931 + }, + { + "epoch": 0.5408370940624724, + "grad_norm": 0.5967197680845255, + "learning_rate": 0.00017118607748945673, + "loss": 12.3919, + "step": 9932 + }, + { + "epoch": 0.5408915480590554, + "grad_norm": 0.6255433894022774, + "learning_rate": 0.00017117988396824724, + "loss": 12.3807, + "step": 9933 + }, + { + "epoch": 0.5409460020556384, + "grad_norm": 0.6593193505343585, + "learning_rate": 0.00017117368989353105, + "loss": 12.4509, + "step": 9934 + }, + { + "epoch": 0.5410004560522214, + "grad_norm": 0.6377169358835266, + "learning_rate": 0.00017116749526535627, + "loss": 12.3728, + "step": 9935 + }, + { + "epoch": 0.5410549100488043, + "grad_norm": 0.5641378918184573, + "learning_rate": 0.00017116130008377117, + "loss": 12.3931, + "step": 9936 + }, + { + "epoch": 0.5411093640453875, + "grad_norm": 0.5968616645753267, + "learning_rate": 0.0001711551043488238, + "loss": 12.4199, + "step": 9937 + }, + { + "epoch": 0.5411638180419704, + "grad_norm": 0.6743990900177875, + "learning_rate": 0.00017114890806056243, + "loss": 12.4669, + "step": 9938 + }, + { + "epoch": 0.5412182720385534, + "grad_norm": 0.6191577597038501, + "learning_rate": 0.0001711427112190352, + "loss": 12.5016, + "step": 9939 + }, + { + "epoch": 0.5412727260351364, + "grad_norm": 0.5782102735631727, + "learning_rate": 0.0001711365138242904, + "loss": 12.2758, + "step": 9940 + }, + { + "epoch": 0.5413271800317194, + "grad_norm": 0.6275492666648163, + "learning_rate": 0.00017113031587637608, + "loss": 12.3231, + "step": 9941 + }, + { + "epoch": 0.5413816340283024, + "grad_norm": 0.5737239024137321, + "learning_rate": 0.0001711241173753405, + "loss": 12.2454, + "step": 9942 + }, + { + "epoch": 0.5414360880248855, + "grad_norm": 0.5841077817920884, + "learning_rate": 0.00017111791832123184, + "loss": 12.3617, + "step": 9943 + }, + { + "epoch": 0.5414905420214685, + "grad_norm": 0.5125432900399025, + "learning_rate": 0.00017111171871409835, + "loss": 12.2638, + "step": 9944 + }, + { + "epoch": 0.5415449960180515, + "grad_norm": 0.627035765756675, + "learning_rate": 0.00017110551855398817, + "loss": 12.427, + "step": 9945 + }, + { + "epoch": 0.5415994500146345, + "grad_norm": 0.6375482313018308, + "learning_rate": 0.0001710993178409496, + "loss": 12.319, + "step": 9946 + }, + { + "epoch": 0.5416539040112175, + "grad_norm": 0.6194514620473593, + "learning_rate": 0.00017109311657503078, + "loss": 12.3979, + "step": 9947 + }, + { + "epoch": 0.5417083580078005, + "grad_norm": 0.6710169185222208, + "learning_rate": 0.00017108691475627996, + "loss": 12.3065, + "step": 9948 + }, + { + "epoch": 0.5417628120043836, + "grad_norm": 0.6494699821871501, + "learning_rate": 0.00017108071238474537, + "loss": 12.4168, + "step": 9949 + }, + { + "epoch": 0.5418172660009666, + "grad_norm": 0.6433398470683385, + "learning_rate": 0.00017107450946047528, + "loss": 12.3035, + "step": 9950 + }, + { + "epoch": 0.5418717199975496, + "grad_norm": 0.5649414899964764, + "learning_rate": 0.00017106830598351784, + "loss": 12.3674, + "step": 9951 + }, + { + "epoch": 0.5419261739941326, + "grad_norm": 0.5838873388720432, + "learning_rate": 0.00017106210195392136, + "loss": 12.4877, + "step": 9952 + }, + { + "epoch": 0.5419806279907156, + "grad_norm": 0.6215618900230296, + "learning_rate": 0.00017105589737173403, + "loss": 12.3699, + "step": 9953 + }, + { + "epoch": 0.5420350819872986, + "grad_norm": 0.5427453094602321, + "learning_rate": 0.00017104969223700415, + "loss": 12.2128, + "step": 9954 + }, + { + "epoch": 0.5420895359838817, + "grad_norm": 0.5825640076004421, + "learning_rate": 0.00017104348654977994, + "loss": 12.3704, + "step": 9955 + }, + { + "epoch": 0.5421439899804646, + "grad_norm": 0.611865041734815, + "learning_rate": 0.00017103728031010967, + "loss": 12.3571, + "step": 9956 + }, + { + "epoch": 0.5421984439770476, + "grad_norm": 0.6584718062782783, + "learning_rate": 0.0001710310735180416, + "loss": 12.4224, + "step": 9957 + }, + { + "epoch": 0.5422528979736306, + "grad_norm": 0.5394590262995481, + "learning_rate": 0.000171024866173624, + "loss": 12.3904, + "step": 9958 + }, + { + "epoch": 0.5423073519702136, + "grad_norm": 0.6251349184276233, + "learning_rate": 0.00017101865827690512, + "loss": 12.4797, + "step": 9959 + }, + { + "epoch": 0.5423618059667967, + "grad_norm": 0.6043745031335845, + "learning_rate": 0.0001710124498279332, + "loss": 12.4913, + "step": 9960 + }, + { + "epoch": 0.5424162599633797, + "grad_norm": 0.6416028422300966, + "learning_rate": 0.00017100624082675662, + "loss": 12.3101, + "step": 9961 + }, + { + "epoch": 0.5424707139599627, + "grad_norm": 0.586956971175586, + "learning_rate": 0.00017100003127342358, + "loss": 12.4128, + "step": 9962 + }, + { + "epoch": 0.5425251679565457, + "grad_norm": 0.5895692580881068, + "learning_rate": 0.0001709938211679824, + "loss": 12.3444, + "step": 9963 + }, + { + "epoch": 0.5425796219531287, + "grad_norm": 0.6892529704657986, + "learning_rate": 0.00017098761051048133, + "loss": 12.6901, + "step": 9964 + }, + { + "epoch": 0.5426340759497117, + "grad_norm": 0.5922876095576688, + "learning_rate": 0.00017098139930096874, + "loss": 12.1998, + "step": 9965 + }, + { + "epoch": 0.5426885299462948, + "grad_norm": 0.6353639822693112, + "learning_rate": 0.00017097518753949286, + "loss": 12.3397, + "step": 9966 + }, + { + "epoch": 0.5427429839428778, + "grad_norm": 0.6014554204328489, + "learning_rate": 0.00017096897522610202, + "loss": 12.4212, + "step": 9967 + }, + { + "epoch": 0.5427974379394608, + "grad_norm": 0.6129355783996963, + "learning_rate": 0.00017096276236084452, + "loss": 12.3241, + "step": 9968 + }, + { + "epoch": 0.5428518919360438, + "grad_norm": 0.6168784082239562, + "learning_rate": 0.0001709565489437687, + "loss": 12.3898, + "step": 9969 + }, + { + "epoch": 0.5429063459326268, + "grad_norm": 0.5455036782520887, + "learning_rate": 0.00017095033497492286, + "loss": 12.2893, + "step": 9970 + }, + { + "epoch": 0.5429607999292098, + "grad_norm": 0.6710022912122282, + "learning_rate": 0.00017094412045435528, + "loss": 12.3584, + "step": 9971 + }, + { + "epoch": 0.5430152539257929, + "grad_norm": 0.6179083424353565, + "learning_rate": 0.00017093790538211433, + "loss": 12.579, + "step": 9972 + }, + { + "epoch": 0.5430697079223759, + "grad_norm": 0.5654658232435088, + "learning_rate": 0.00017093168975824838, + "loss": 12.3976, + "step": 9973 + }, + { + "epoch": 0.5431241619189588, + "grad_norm": 0.5869301776598929, + "learning_rate": 0.0001709254735828057, + "loss": 12.4018, + "step": 9974 + }, + { + "epoch": 0.5431786159155418, + "grad_norm": 0.5699624764075849, + "learning_rate": 0.0001709192568558346, + "loss": 12.3251, + "step": 9975 + }, + { + "epoch": 0.5432330699121248, + "grad_norm": 0.6164089422163238, + "learning_rate": 0.00017091303957738347, + "loss": 12.4561, + "step": 9976 + }, + { + "epoch": 0.5432875239087078, + "grad_norm": 0.6110377746277542, + "learning_rate": 0.00017090682174750069, + "loss": 12.3636, + "step": 9977 + }, + { + "epoch": 0.5433419779052909, + "grad_norm": 0.6089179595565541, + "learning_rate": 0.00017090060336623456, + "loss": 12.4271, + "step": 9978 + }, + { + "epoch": 0.5433964319018739, + "grad_norm": 0.6118550994671074, + "learning_rate": 0.00017089438443363344, + "loss": 12.4179, + "step": 9979 + }, + { + "epoch": 0.5434508858984569, + "grad_norm": 0.7076959617147793, + "learning_rate": 0.0001708881649497457, + "loss": 12.4697, + "step": 9980 + }, + { + "epoch": 0.5435053398950399, + "grad_norm": 0.6383711008143893, + "learning_rate": 0.00017088194491461967, + "loss": 12.1758, + "step": 9981 + }, + { + "epoch": 0.5435597938916229, + "grad_norm": 0.6280069425682963, + "learning_rate": 0.00017087572432830382, + "loss": 12.3434, + "step": 9982 + }, + { + "epoch": 0.5436142478882059, + "grad_norm": 0.6599043049497079, + "learning_rate": 0.0001708695031908464, + "loss": 12.4343, + "step": 9983 + }, + { + "epoch": 0.543668701884789, + "grad_norm": 0.5897655562410326, + "learning_rate": 0.00017086328150229586, + "loss": 12.3205, + "step": 9984 + }, + { + "epoch": 0.543723155881372, + "grad_norm": 0.7437039719219413, + "learning_rate": 0.00017085705926270058, + "loss": 12.4402, + "step": 9985 + }, + { + "epoch": 0.543777609877955, + "grad_norm": 0.6357530106706325, + "learning_rate": 0.00017085083647210887, + "loss": 12.3601, + "step": 9986 + }, + { + "epoch": 0.543832063874538, + "grad_norm": 0.6374285771150914, + "learning_rate": 0.0001708446131305692, + "loss": 12.5205, + "step": 9987 + }, + { + "epoch": 0.543886517871121, + "grad_norm": 0.5896956056677658, + "learning_rate": 0.00017083838923812993, + "loss": 12.3106, + "step": 9988 + }, + { + "epoch": 0.5439409718677041, + "grad_norm": 0.6115372665262078, + "learning_rate": 0.00017083216479483947, + "loss": 12.3934, + "step": 9989 + }, + { + "epoch": 0.5439954258642871, + "grad_norm": 0.6755730245071239, + "learning_rate": 0.00017082593980074625, + "loss": 12.3121, + "step": 9990 + }, + { + "epoch": 0.5440498798608701, + "grad_norm": 0.6178738428597439, + "learning_rate": 0.00017081971425589857, + "loss": 12.3926, + "step": 9991 + }, + { + "epoch": 0.544104333857453, + "grad_norm": 0.6346592570491943, + "learning_rate": 0.00017081348816034496, + "loss": 12.5108, + "step": 9992 + }, + { + "epoch": 0.544158787854036, + "grad_norm": 0.8524667556741318, + "learning_rate": 0.00017080726151413381, + "loss": 12.514, + "step": 9993 + }, + { + "epoch": 0.544213241850619, + "grad_norm": 0.6071086314951164, + "learning_rate": 0.00017080103431731352, + "loss": 12.494, + "step": 9994 + }, + { + "epoch": 0.5442676958472021, + "grad_norm": 0.5870626705245917, + "learning_rate": 0.00017079480656993247, + "loss": 12.3616, + "step": 9995 + }, + { + "epoch": 0.5443221498437851, + "grad_norm": 0.6119188183797124, + "learning_rate": 0.00017078857827203917, + "loss": 12.5749, + "step": 9996 + }, + { + "epoch": 0.5443766038403681, + "grad_norm": 0.6068760617708026, + "learning_rate": 0.00017078234942368198, + "loss": 12.4392, + "step": 9997 + }, + { + "epoch": 0.5444310578369511, + "grad_norm": 0.6126603120868624, + "learning_rate": 0.00017077612002490942, + "loss": 12.3884, + "step": 9998 + }, + { + "epoch": 0.5444855118335341, + "grad_norm": 0.5868375210438797, + "learning_rate": 0.00017076989007576985, + "loss": 12.3587, + "step": 9999 + }, + { + "epoch": 0.5445399658301171, + "grad_norm": 0.6864597119491456, + "learning_rate": 0.00017076365957631174, + "loss": 12.4161, + "step": 10000 + }, + { + "epoch": 0.5445944198267002, + "grad_norm": 0.622011095040674, + "learning_rate": 0.00017075742852658355, + "loss": 12.3495, + "step": 10001 + }, + { + "epoch": 0.5446488738232832, + "grad_norm": 0.5766789005686187, + "learning_rate": 0.00017075119692663374, + "loss": 12.4083, + "step": 10002 + }, + { + "epoch": 0.5447033278198662, + "grad_norm": 0.5888590620506866, + "learning_rate": 0.0001707449647765108, + "loss": 12.2743, + "step": 10003 + }, + { + "epoch": 0.5447577818164492, + "grad_norm": 0.5877421821137575, + "learning_rate": 0.00017073873207626309, + "loss": 12.3029, + "step": 10004 + }, + { + "epoch": 0.5448122358130322, + "grad_norm": 0.618434215481345, + "learning_rate": 0.00017073249882593912, + "loss": 12.2655, + "step": 10005 + }, + { + "epoch": 0.5448666898096152, + "grad_norm": 0.6291436904487675, + "learning_rate": 0.00017072626502558742, + "loss": 12.3931, + "step": 10006 + }, + { + "epoch": 0.5449211438061983, + "grad_norm": 0.5865487556526245, + "learning_rate": 0.0001707200306752564, + "loss": 12.2635, + "step": 10007 + }, + { + "epoch": 0.5449755978027813, + "grad_norm": 0.6079297100272073, + "learning_rate": 0.00017071379577499458, + "loss": 12.4051, + "step": 10008 + }, + { + "epoch": 0.5450300517993643, + "grad_norm": 0.6016245121971452, + "learning_rate": 0.00017070756032485043, + "loss": 12.3416, + "step": 10009 + }, + { + "epoch": 0.5450845057959472, + "grad_norm": 0.5856170250429272, + "learning_rate": 0.00017070132432487242, + "loss": 12.3265, + "step": 10010 + }, + { + "epoch": 0.5451389597925302, + "grad_norm": 0.6551742421540866, + "learning_rate": 0.00017069508777510904, + "loss": 12.4072, + "step": 10011 + }, + { + "epoch": 0.5451934137891132, + "grad_norm": 0.666724847008867, + "learning_rate": 0.00017068885067560884, + "loss": 12.453, + "step": 10012 + }, + { + "epoch": 0.5452478677856963, + "grad_norm": 0.5980244266505121, + "learning_rate": 0.00017068261302642025, + "loss": 12.4869, + "step": 10013 + }, + { + "epoch": 0.5453023217822793, + "grad_norm": 0.6442562753056685, + "learning_rate": 0.00017067637482759182, + "loss": 12.4074, + "step": 10014 + }, + { + "epoch": 0.5453567757788623, + "grad_norm": 0.5781895247879832, + "learning_rate": 0.00017067013607917204, + "loss": 12.2852, + "step": 10015 + }, + { + "epoch": 0.5454112297754453, + "grad_norm": 0.5659964634909223, + "learning_rate": 0.00017066389678120942, + "loss": 12.3117, + "step": 10016 + }, + { + "epoch": 0.5454656837720283, + "grad_norm": 0.5984647462305562, + "learning_rate": 0.00017065765693375254, + "loss": 12.2896, + "step": 10017 + }, + { + "epoch": 0.5455201377686113, + "grad_norm": 0.5927320661463614, + "learning_rate": 0.0001706514165368498, + "loss": 12.3072, + "step": 10018 + }, + { + "epoch": 0.5455745917651944, + "grad_norm": 0.6037064365778668, + "learning_rate": 0.00017064517559054983, + "loss": 12.3729, + "step": 10019 + }, + { + "epoch": 0.5456290457617774, + "grad_norm": 0.5755367621481371, + "learning_rate": 0.00017063893409490115, + "loss": 12.409, + "step": 10020 + }, + { + "epoch": 0.5456834997583604, + "grad_norm": 0.6320289142883861, + "learning_rate": 0.00017063269204995222, + "loss": 12.3564, + "step": 10021 + }, + { + "epoch": 0.5457379537549434, + "grad_norm": 0.6002473882539896, + "learning_rate": 0.00017062644945575167, + "loss": 12.3511, + "step": 10022 + }, + { + "epoch": 0.5457924077515264, + "grad_norm": 0.5886482899613877, + "learning_rate": 0.000170620206312348, + "loss": 12.3213, + "step": 10023 + }, + { + "epoch": 0.5458468617481095, + "grad_norm": 0.5881266992555014, + "learning_rate": 0.00017061396261978978, + "loss": 12.3273, + "step": 10024 + }, + { + "epoch": 0.5459013157446925, + "grad_norm": 0.6677865513934675, + "learning_rate": 0.00017060771837812552, + "loss": 12.448, + "step": 10025 + }, + { + "epoch": 0.5459557697412755, + "grad_norm": 0.5722455480945151, + "learning_rate": 0.0001706014735874038, + "loss": 12.4596, + "step": 10026 + }, + { + "epoch": 0.5460102237378585, + "grad_norm": 0.657568952206474, + "learning_rate": 0.00017059522824767318, + "loss": 12.4349, + "step": 10027 + }, + { + "epoch": 0.5460646777344415, + "grad_norm": 0.6101702886040364, + "learning_rate": 0.00017058898235898225, + "loss": 12.1885, + "step": 10028 + }, + { + "epoch": 0.5461191317310244, + "grad_norm": 0.6164690188553941, + "learning_rate": 0.00017058273592137954, + "loss": 12.3377, + "step": 10029 + }, + { + "epoch": 0.5461735857276075, + "grad_norm": 0.5743311102393344, + "learning_rate": 0.00017057648893491363, + "loss": 12.3184, + "step": 10030 + }, + { + "epoch": 0.5462280397241905, + "grad_norm": 0.6036523761968633, + "learning_rate": 0.0001705702413996331, + "loss": 12.3641, + "step": 10031 + }, + { + "epoch": 0.5462824937207735, + "grad_norm": 0.6380111248095689, + "learning_rate": 0.00017056399331558656, + "loss": 12.3799, + "step": 10032 + }, + { + "epoch": 0.5463369477173565, + "grad_norm": 0.7449245804750874, + "learning_rate": 0.00017055774468282257, + "loss": 12.4631, + "step": 10033 + }, + { + "epoch": 0.5463914017139395, + "grad_norm": 0.602894598301536, + "learning_rate": 0.00017055149550138974, + "loss": 12.4042, + "step": 10034 + }, + { + "epoch": 0.5464458557105225, + "grad_norm": 0.5657734285602912, + "learning_rate": 0.0001705452457713366, + "loss": 12.451, + "step": 10035 + }, + { + "epoch": 0.5465003097071056, + "grad_norm": 0.576113903088083, + "learning_rate": 0.0001705389954927118, + "loss": 12.3255, + "step": 10036 + }, + { + "epoch": 0.5465547637036886, + "grad_norm": 0.6616302304908845, + "learning_rate": 0.000170532744665564, + "loss": 12.4435, + "step": 10037 + }, + { + "epoch": 0.5466092177002716, + "grad_norm": 0.6821592199096708, + "learning_rate": 0.00017052649328994174, + "loss": 12.5157, + "step": 10038 + }, + { + "epoch": 0.5466636716968546, + "grad_norm": 0.6096340643614813, + "learning_rate": 0.0001705202413658936, + "loss": 12.4992, + "step": 10039 + }, + { + "epoch": 0.5467181256934376, + "grad_norm": 0.6176713315553298, + "learning_rate": 0.0001705139888934683, + "loss": 12.2644, + "step": 10040 + }, + { + "epoch": 0.5467725796900206, + "grad_norm": 0.5861837007873087, + "learning_rate": 0.00017050773587271433, + "loss": 12.2729, + "step": 10041 + }, + { + "epoch": 0.5468270336866037, + "grad_norm": 0.5764941831254127, + "learning_rate": 0.0001705014823036804, + "loss": 12.3468, + "step": 10042 + }, + { + "epoch": 0.5468814876831867, + "grad_norm": 0.6265747671891215, + "learning_rate": 0.00017049522818641513, + "loss": 12.3629, + "step": 10043 + }, + { + "epoch": 0.5469359416797697, + "grad_norm": 0.8012125745132999, + "learning_rate": 0.00017048897352096713, + "loss": 12.3787, + "step": 10044 + }, + { + "epoch": 0.5469903956763527, + "grad_norm": 0.6351309622415717, + "learning_rate": 0.00017048271830738507, + "loss": 12.4865, + "step": 10045 + }, + { + "epoch": 0.5470448496729357, + "grad_norm": 0.5905862443496168, + "learning_rate": 0.00017047646254571755, + "loss": 12.322, + "step": 10046 + }, + { + "epoch": 0.5470993036695186, + "grad_norm": 0.6815468712793522, + "learning_rate": 0.00017047020623601328, + "loss": 12.4611, + "step": 10047 + }, + { + "epoch": 0.5471537576661017, + "grad_norm": 0.6174565560323156, + "learning_rate": 0.00017046394937832084, + "loss": 12.329, + "step": 10048 + }, + { + "epoch": 0.5472082116626847, + "grad_norm": 0.5276648368815748, + "learning_rate": 0.00017045769197268892, + "loss": 12.2771, + "step": 10049 + }, + { + "epoch": 0.5472626656592677, + "grad_norm": 0.6482574075836264, + "learning_rate": 0.00017045143401916613, + "loss": 12.3731, + "step": 10050 + }, + { + "epoch": 0.5473171196558507, + "grad_norm": 0.7609587337450041, + "learning_rate": 0.00017044517551780125, + "loss": 12.3264, + "step": 10051 + }, + { + "epoch": 0.5473715736524337, + "grad_norm": 0.5531086997696827, + "learning_rate": 0.0001704389164686428, + "loss": 12.3699, + "step": 10052 + }, + { + "epoch": 0.5474260276490167, + "grad_norm": 0.6909288910934123, + "learning_rate": 0.00017043265687173955, + "loss": 12.4163, + "step": 10053 + }, + { + "epoch": 0.5474804816455998, + "grad_norm": 0.6600937201127297, + "learning_rate": 0.00017042639672714015, + "loss": 12.3825, + "step": 10054 + }, + { + "epoch": 0.5475349356421828, + "grad_norm": 0.69648958811755, + "learning_rate": 0.0001704201360348933, + "loss": 12.2439, + "step": 10055 + }, + { + "epoch": 0.5475893896387658, + "grad_norm": 0.5877556274036734, + "learning_rate": 0.00017041387479504764, + "loss": 12.2162, + "step": 10056 + }, + { + "epoch": 0.5476438436353488, + "grad_norm": 0.5761911274527789, + "learning_rate": 0.00017040761300765188, + "loss": 12.3922, + "step": 10057 + }, + { + "epoch": 0.5476982976319318, + "grad_norm": 0.7246599447399307, + "learning_rate": 0.00017040135067275473, + "loss": 12.3671, + "step": 10058 + }, + { + "epoch": 0.5477527516285149, + "grad_norm": 0.5770976085961329, + "learning_rate": 0.00017039508779040485, + "loss": 12.3825, + "step": 10059 + }, + { + "epoch": 0.5478072056250979, + "grad_norm": 0.6303142158584125, + "learning_rate": 0.00017038882436065097, + "loss": 12.4369, + "step": 10060 + }, + { + "epoch": 0.5478616596216809, + "grad_norm": 0.6858517319715612, + "learning_rate": 0.0001703825603835418, + "loss": 12.349, + "step": 10061 + }, + { + "epoch": 0.5479161136182639, + "grad_norm": 0.6793237037550006, + "learning_rate": 0.000170376295859126, + "loss": 12.3091, + "step": 10062 + }, + { + "epoch": 0.5479705676148469, + "grad_norm": 0.6272479682486032, + "learning_rate": 0.00017037003078745238, + "loss": 12.3884, + "step": 10063 + }, + { + "epoch": 0.5480250216114299, + "grad_norm": 0.6504731696059936, + "learning_rate": 0.00017036376516856955, + "loss": 12.387, + "step": 10064 + }, + { + "epoch": 0.548079475608013, + "grad_norm": 0.6217625065902543, + "learning_rate": 0.00017035749900252628, + "loss": 12.4143, + "step": 10065 + }, + { + "epoch": 0.548133929604596, + "grad_norm": 0.5958225152012777, + "learning_rate": 0.00017035123228937134, + "loss": 12.2557, + "step": 10066 + }, + { + "epoch": 0.5481883836011789, + "grad_norm": 0.6090956331991192, + "learning_rate": 0.0001703449650291534, + "loss": 12.3631, + "step": 10067 + }, + { + "epoch": 0.5482428375977619, + "grad_norm": 0.6552031333626795, + "learning_rate": 0.00017033869722192122, + "loss": 12.4323, + "step": 10068 + }, + { + "epoch": 0.5482972915943449, + "grad_norm": 0.6393619229917588, + "learning_rate": 0.00017033242886772354, + "loss": 12.3953, + "step": 10069 + }, + { + "epoch": 0.5483517455909279, + "grad_norm": 0.5917431234863446, + "learning_rate": 0.00017032615996660905, + "loss": 12.4018, + "step": 10070 + }, + { + "epoch": 0.548406199587511, + "grad_norm": 0.6115700758668392, + "learning_rate": 0.0001703198905186266, + "loss": 12.4389, + "step": 10071 + }, + { + "epoch": 0.548460653584094, + "grad_norm": 0.5654829808206024, + "learning_rate": 0.0001703136205238249, + "loss": 12.3738, + "step": 10072 + }, + { + "epoch": 0.548515107580677, + "grad_norm": 0.628121296976426, + "learning_rate": 0.00017030734998225265, + "loss": 12.5239, + "step": 10073 + }, + { + "epoch": 0.54856956157726, + "grad_norm": 0.721825207992574, + "learning_rate": 0.00017030107889395865, + "loss": 12.4119, + "step": 10074 + }, + { + "epoch": 0.548624015573843, + "grad_norm": 0.5710087081755535, + "learning_rate": 0.0001702948072589917, + "loss": 12.3945, + "step": 10075 + }, + { + "epoch": 0.548678469570426, + "grad_norm": 0.5846170878386958, + "learning_rate": 0.0001702885350774005, + "loss": 12.3839, + "step": 10076 + }, + { + "epoch": 0.5487329235670091, + "grad_norm": 0.6058575657945217, + "learning_rate": 0.00017028226234923395, + "loss": 12.3056, + "step": 10077 + }, + { + "epoch": 0.5487873775635921, + "grad_norm": 0.5312676469096749, + "learning_rate": 0.00017027598907454067, + "loss": 12.4184, + "step": 10078 + }, + { + "epoch": 0.5488418315601751, + "grad_norm": 0.6177746184293285, + "learning_rate": 0.00017026971525336952, + "loss": 12.4248, + "step": 10079 + }, + { + "epoch": 0.5488962855567581, + "grad_norm": 0.7212320189974394, + "learning_rate": 0.0001702634408857693, + "loss": 12.3873, + "step": 10080 + }, + { + "epoch": 0.5489507395533411, + "grad_norm": 0.6057541329016711, + "learning_rate": 0.00017025716597178877, + "loss": 12.4775, + "step": 10081 + }, + { + "epoch": 0.549005193549924, + "grad_norm": 0.5892684255084142, + "learning_rate": 0.00017025089051147675, + "loss": 12.3358, + "step": 10082 + }, + { + "epoch": 0.5490596475465072, + "grad_norm": 0.5561580135163213, + "learning_rate": 0.00017024461450488202, + "loss": 12.3502, + "step": 10083 + }, + { + "epoch": 0.5491141015430901, + "grad_norm": 0.6613960500018959, + "learning_rate": 0.00017023833795205338, + "loss": 12.3686, + "step": 10084 + }, + { + "epoch": 0.5491685555396731, + "grad_norm": 0.7155502582204928, + "learning_rate": 0.00017023206085303965, + "loss": 12.461, + "step": 10085 + }, + { + "epoch": 0.5492230095362561, + "grad_norm": 0.6151275535683852, + "learning_rate": 0.00017022578320788963, + "loss": 12.3676, + "step": 10086 + }, + { + "epoch": 0.5492774635328391, + "grad_norm": 0.6465807638215435, + "learning_rate": 0.00017021950501665213, + "loss": 12.3349, + "step": 10087 + }, + { + "epoch": 0.5493319175294221, + "grad_norm": 0.6165783500464499, + "learning_rate": 0.00017021322627937602, + "loss": 12.4243, + "step": 10088 + }, + { + "epoch": 0.5493863715260052, + "grad_norm": 0.6234214096780826, + "learning_rate": 0.00017020694699611006, + "loss": 12.4481, + "step": 10089 + }, + { + "epoch": 0.5494408255225882, + "grad_norm": 0.6739255625363676, + "learning_rate": 0.0001702006671669031, + "loss": 12.4595, + "step": 10090 + }, + { + "epoch": 0.5494952795191712, + "grad_norm": 0.6137959527599097, + "learning_rate": 0.000170194386791804, + "loss": 12.4325, + "step": 10091 + }, + { + "epoch": 0.5495497335157542, + "grad_norm": 0.6374066951523512, + "learning_rate": 0.00017018810587086155, + "loss": 12.3797, + "step": 10092 + }, + { + "epoch": 0.5496041875123372, + "grad_norm": 0.6517927348980738, + "learning_rate": 0.00017018182440412468, + "loss": 12.3723, + "step": 10093 + }, + { + "epoch": 0.5496586415089203, + "grad_norm": 0.6960033350251431, + "learning_rate": 0.0001701755423916421, + "loss": 12.429, + "step": 10094 + }, + { + "epoch": 0.5497130955055033, + "grad_norm": 0.6500400376490462, + "learning_rate": 0.00017016925983346276, + "loss": 12.4318, + "step": 10095 + }, + { + "epoch": 0.5497675495020863, + "grad_norm": 0.6493288079349868, + "learning_rate": 0.0001701629767296355, + "loss": 12.3368, + "step": 10096 + }, + { + "epoch": 0.5498220034986693, + "grad_norm": 0.6466650608087391, + "learning_rate": 0.00017015669308020917, + "loss": 12.3259, + "step": 10097 + }, + { + "epoch": 0.5498764574952523, + "grad_norm": 0.620918020415698, + "learning_rate": 0.00017015040888523263, + "loss": 12.3392, + "step": 10098 + }, + { + "epoch": 0.5499309114918353, + "grad_norm": 0.603558345767907, + "learning_rate": 0.00017014412414475473, + "loss": 12.3562, + "step": 10099 + }, + { + "epoch": 0.5499853654884184, + "grad_norm": 0.688925327145977, + "learning_rate": 0.00017013783885882434, + "loss": 12.4577, + "step": 10100 + }, + { + "epoch": 0.5500398194850014, + "grad_norm": 0.6370679160685081, + "learning_rate": 0.00017013155302749038, + "loss": 12.3625, + "step": 10101 + }, + { + "epoch": 0.5500942734815844, + "grad_norm": 0.6302329868740851, + "learning_rate": 0.0001701252666508017, + "loss": 12.3363, + "step": 10102 + }, + { + "epoch": 0.5501487274781673, + "grad_norm": 0.6859771904180949, + "learning_rate": 0.0001701189797288072, + "loss": 12.521, + "step": 10103 + }, + { + "epoch": 0.5502031814747503, + "grad_norm": 0.5725597900784214, + "learning_rate": 0.00017011269226155574, + "loss": 12.2956, + "step": 10104 + }, + { + "epoch": 0.5502576354713333, + "grad_norm": 0.6190597219118882, + "learning_rate": 0.00017010640424909622, + "loss": 12.4949, + "step": 10105 + }, + { + "epoch": 0.5503120894679164, + "grad_norm": 0.6656002874448648, + "learning_rate": 0.00017010011569147754, + "loss": 12.3395, + "step": 10106 + }, + { + "epoch": 0.5503665434644994, + "grad_norm": 0.6106957375460383, + "learning_rate": 0.0001700938265887486, + "loss": 12.2956, + "step": 10107 + }, + { + "epoch": 0.5504209974610824, + "grad_norm": 0.6678328208838216, + "learning_rate": 0.00017008753694095836, + "loss": 12.2999, + "step": 10108 + }, + { + "epoch": 0.5504754514576654, + "grad_norm": 0.6389549113603158, + "learning_rate": 0.0001700812467481556, + "loss": 12.3789, + "step": 10109 + }, + { + "epoch": 0.5505299054542484, + "grad_norm": 0.6767434786700843, + "learning_rate": 0.00017007495601038938, + "loss": 12.4813, + "step": 10110 + }, + { + "epoch": 0.5505843594508314, + "grad_norm": 0.6668464446484444, + "learning_rate": 0.00017006866472770856, + "loss": 12.3595, + "step": 10111 + }, + { + "epoch": 0.5506388134474145, + "grad_norm": 0.5833039778043949, + "learning_rate": 0.00017006237290016201, + "loss": 12.3746, + "step": 10112 + }, + { + "epoch": 0.5506932674439975, + "grad_norm": 0.6248730751987213, + "learning_rate": 0.00017005608052779868, + "loss": 12.3588, + "step": 10113 + }, + { + "epoch": 0.5507477214405805, + "grad_norm": 0.6230806633482726, + "learning_rate": 0.00017004978761066757, + "loss": 12.4175, + "step": 10114 + }, + { + "epoch": 0.5508021754371635, + "grad_norm": 0.6442464098858275, + "learning_rate": 0.00017004349414881753, + "loss": 12.3379, + "step": 10115 + }, + { + "epoch": 0.5508566294337465, + "grad_norm": 0.6266003331634916, + "learning_rate": 0.00017003720014229754, + "loss": 12.2762, + "step": 10116 + }, + { + "epoch": 0.5509110834303295, + "grad_norm": 0.552814280799876, + "learning_rate": 0.00017003090559115656, + "loss": 12.4019, + "step": 10117 + }, + { + "epoch": 0.5509655374269126, + "grad_norm": 0.5373811486501685, + "learning_rate": 0.0001700246104954435, + "loss": 12.3237, + "step": 10118 + }, + { + "epoch": 0.5510199914234956, + "grad_norm": 0.5684336237386249, + "learning_rate": 0.0001700183148552073, + "loss": 12.2904, + "step": 10119 + }, + { + "epoch": 0.5510744454200786, + "grad_norm": 0.5888264690377857, + "learning_rate": 0.00017001201867049696, + "loss": 12.1247, + "step": 10120 + }, + { + "epoch": 0.5511288994166615, + "grad_norm": 0.5779840814679416, + "learning_rate": 0.0001700057219413614, + "loss": 12.3402, + "step": 10121 + }, + { + "epoch": 0.5511833534132445, + "grad_norm": 0.588683708129219, + "learning_rate": 0.00016999942466784966, + "loss": 12.4069, + "step": 10122 + }, + { + "epoch": 0.5512378074098276, + "grad_norm": 0.678480509040718, + "learning_rate": 0.00016999312685001062, + "loss": 12.2873, + "step": 10123 + }, + { + "epoch": 0.5512922614064106, + "grad_norm": 0.5976064724968893, + "learning_rate": 0.0001699868284878933, + "loss": 12.2787, + "step": 10124 + }, + { + "epoch": 0.5513467154029936, + "grad_norm": 0.5916676144071182, + "learning_rate": 0.00016998052958154666, + "loss": 12.3731, + "step": 10125 + }, + { + "epoch": 0.5514011693995766, + "grad_norm": 0.6197923034996309, + "learning_rate": 0.00016997423013101966, + "loss": 12.39, + "step": 10126 + }, + { + "epoch": 0.5514556233961596, + "grad_norm": 0.6549741234780643, + "learning_rate": 0.00016996793013636136, + "loss": 12.4356, + "step": 10127 + }, + { + "epoch": 0.5515100773927426, + "grad_norm": 0.6488966161442786, + "learning_rate": 0.00016996162959762067, + "loss": 12.4121, + "step": 10128 + }, + { + "epoch": 0.5515645313893257, + "grad_norm": 0.5858223558061898, + "learning_rate": 0.00016995532851484663, + "loss": 12.4635, + "step": 10129 + }, + { + "epoch": 0.5516189853859087, + "grad_norm": 0.6451824416075728, + "learning_rate": 0.00016994902688808821, + "loss": 12.5343, + "step": 10130 + }, + { + "epoch": 0.5516734393824917, + "grad_norm": 0.6601972702580284, + "learning_rate": 0.00016994272471739443, + "loss": 12.3874, + "step": 10131 + }, + { + "epoch": 0.5517278933790747, + "grad_norm": 0.6182476910168957, + "learning_rate": 0.00016993642200281432, + "loss": 12.1093, + "step": 10132 + }, + { + "epoch": 0.5517823473756577, + "grad_norm": 0.6032874164440537, + "learning_rate": 0.00016993011874439682, + "loss": 12.5125, + "step": 10133 + }, + { + "epoch": 0.5518368013722407, + "grad_norm": 0.6843307644197107, + "learning_rate": 0.00016992381494219103, + "loss": 12.4988, + "step": 10134 + }, + { + "epoch": 0.5518912553688238, + "grad_norm": 0.6136490984502141, + "learning_rate": 0.0001699175105962459, + "loss": 12.4561, + "step": 10135 + }, + { + "epoch": 0.5519457093654068, + "grad_norm": 0.5347265528309477, + "learning_rate": 0.00016991120570661048, + "loss": 12.365, + "step": 10136 + }, + { + "epoch": 0.5520001633619898, + "grad_norm": 0.8540603107055513, + "learning_rate": 0.00016990490027333385, + "loss": 12.4968, + "step": 10137 + }, + { + "epoch": 0.5520546173585728, + "grad_norm": 0.576818131022261, + "learning_rate": 0.00016989859429646496, + "loss": 12.3099, + "step": 10138 + }, + { + "epoch": 0.5521090713551557, + "grad_norm": 0.6561478050780211, + "learning_rate": 0.00016989228777605284, + "loss": 12.3842, + "step": 10139 + }, + { + "epoch": 0.5521635253517387, + "grad_norm": 0.655042351179101, + "learning_rate": 0.0001698859807121466, + "loss": 12.3027, + "step": 10140 + }, + { + "epoch": 0.5522179793483218, + "grad_norm": 0.6798724729283924, + "learning_rate": 0.00016987967310479527, + "loss": 12.3973, + "step": 10141 + }, + { + "epoch": 0.5522724333449048, + "grad_norm": 0.6240649516322961, + "learning_rate": 0.00016987336495404788, + "loss": 12.2627, + "step": 10142 + }, + { + "epoch": 0.5523268873414878, + "grad_norm": 0.6839594413179446, + "learning_rate": 0.00016986705625995346, + "loss": 12.253, + "step": 10143 + }, + { + "epoch": 0.5523813413380708, + "grad_norm": 0.7110332810078518, + "learning_rate": 0.00016986074702256108, + "loss": 12.3136, + "step": 10144 + }, + { + "epoch": 0.5524357953346538, + "grad_norm": 0.6476071616334153, + "learning_rate": 0.00016985443724191988, + "loss": 12.2806, + "step": 10145 + }, + { + "epoch": 0.5524902493312368, + "grad_norm": 0.6209525263190081, + "learning_rate": 0.0001698481269180788, + "loss": 12.3907, + "step": 10146 + }, + { + "epoch": 0.5525447033278199, + "grad_norm": 0.7060291692093761, + "learning_rate": 0.00016984181605108703, + "loss": 12.3933, + "step": 10147 + }, + { + "epoch": 0.5525991573244029, + "grad_norm": 0.6365440049787343, + "learning_rate": 0.00016983550464099353, + "loss": 12.3701, + "step": 10148 + }, + { + "epoch": 0.5526536113209859, + "grad_norm": 0.6457145627912881, + "learning_rate": 0.00016982919268784748, + "loss": 12.3692, + "step": 10149 + }, + { + "epoch": 0.5527080653175689, + "grad_norm": 0.5874194707290666, + "learning_rate": 0.0001698228801916979, + "loss": 12.339, + "step": 10150 + }, + { + "epoch": 0.5527625193141519, + "grad_norm": 0.6827235132202331, + "learning_rate": 0.0001698165671525939, + "loss": 12.4536, + "step": 10151 + }, + { + "epoch": 0.5528169733107349, + "grad_norm": 0.603995056239589, + "learning_rate": 0.00016981025357058456, + "loss": 12.2051, + "step": 10152 + }, + { + "epoch": 0.552871427307318, + "grad_norm": 0.5555255216750008, + "learning_rate": 0.00016980393944571897, + "loss": 12.4311, + "step": 10153 + }, + { + "epoch": 0.552925881303901, + "grad_norm": 0.6700662946116988, + "learning_rate": 0.00016979762477804623, + "loss": 12.2479, + "step": 10154 + }, + { + "epoch": 0.552980335300484, + "grad_norm": 0.6441931767950304, + "learning_rate": 0.0001697913095676155, + "loss": 12.4213, + "step": 10155 + }, + { + "epoch": 0.553034789297067, + "grad_norm": 0.6226291914939351, + "learning_rate": 0.00016978499381447578, + "loss": 12.5313, + "step": 10156 + }, + { + "epoch": 0.55308924329365, + "grad_norm": 0.6237855423231016, + "learning_rate": 0.0001697786775186763, + "loss": 12.3455, + "step": 10157 + }, + { + "epoch": 0.553143697290233, + "grad_norm": 0.650542570178941, + "learning_rate": 0.0001697723606802661, + "loss": 12.3525, + "step": 10158 + }, + { + "epoch": 0.553198151286816, + "grad_norm": 0.650553829713924, + "learning_rate": 0.00016976604329929434, + "loss": 12.3732, + "step": 10159 + }, + { + "epoch": 0.553252605283399, + "grad_norm": 0.6459488683075125, + "learning_rate": 0.00016975972537581008, + "loss": 12.3562, + "step": 10160 + }, + { + "epoch": 0.553307059279982, + "grad_norm": 0.6128540893647996, + "learning_rate": 0.00016975340690986252, + "loss": 12.4002, + "step": 10161 + }, + { + "epoch": 0.553361513276565, + "grad_norm": 0.5602485489371666, + "learning_rate": 0.0001697470879015008, + "loss": 12.2805, + "step": 10162 + }, + { + "epoch": 0.553415967273148, + "grad_norm": 0.625146438344186, + "learning_rate": 0.000169740768350774, + "loss": 12.349, + "step": 10163 + }, + { + "epoch": 0.5534704212697311, + "grad_norm": 0.6917484957140339, + "learning_rate": 0.0001697344482577313, + "loss": 12.3946, + "step": 10164 + }, + { + "epoch": 0.5535248752663141, + "grad_norm": 0.5665765845757563, + "learning_rate": 0.00016972812762242184, + "loss": 12.3455, + "step": 10165 + }, + { + "epoch": 0.5535793292628971, + "grad_norm": 0.7588018958176995, + "learning_rate": 0.00016972180644489476, + "loss": 12.3631, + "step": 10166 + }, + { + "epoch": 0.5536337832594801, + "grad_norm": 0.6318823657954354, + "learning_rate": 0.0001697154847251992, + "loss": 12.3629, + "step": 10167 + }, + { + "epoch": 0.5536882372560631, + "grad_norm": 0.6292295929223287, + "learning_rate": 0.00016970916246338436, + "loss": 12.3089, + "step": 10168 + }, + { + "epoch": 0.5537426912526461, + "grad_norm": 0.6476934125782788, + "learning_rate": 0.00016970283965949938, + "loss": 12.3641, + "step": 10169 + }, + { + "epoch": 0.5537971452492292, + "grad_norm": 0.5997142351922357, + "learning_rate": 0.00016969651631359344, + "loss": 12.3802, + "step": 10170 + }, + { + "epoch": 0.5538515992458122, + "grad_norm": 0.6729543585756947, + "learning_rate": 0.0001696901924257157, + "loss": 12.3821, + "step": 10171 + }, + { + "epoch": 0.5539060532423952, + "grad_norm": 0.6407345390836635, + "learning_rate": 0.0001696838679959153, + "loss": 12.3207, + "step": 10172 + }, + { + "epoch": 0.5539605072389782, + "grad_norm": 0.5957211650140359, + "learning_rate": 0.00016967754302424153, + "loss": 12.3158, + "step": 10173 + }, + { + "epoch": 0.5540149612355612, + "grad_norm": 0.6822451915992077, + "learning_rate": 0.00016967121751074345, + "loss": 12.4724, + "step": 10174 + }, + { + "epoch": 0.5540694152321441, + "grad_norm": 0.6309367173751734, + "learning_rate": 0.0001696648914554703, + "loss": 12.2831, + "step": 10175 + }, + { + "epoch": 0.5541238692287273, + "grad_norm": 0.6642919979889967, + "learning_rate": 0.00016965856485847127, + "loss": 12.4086, + "step": 10176 + }, + { + "epoch": 0.5541783232253102, + "grad_norm": 0.6122523256520281, + "learning_rate": 0.00016965223771979554, + "loss": 12.3491, + "step": 10177 + }, + { + "epoch": 0.5542327772218932, + "grad_norm": 0.5613470115381749, + "learning_rate": 0.0001696459100394924, + "loss": 12.2773, + "step": 10178 + }, + { + "epoch": 0.5542872312184762, + "grad_norm": 0.667435452999538, + "learning_rate": 0.0001696395818176109, + "loss": 12.4032, + "step": 10179 + }, + { + "epoch": 0.5543416852150592, + "grad_norm": 0.5723821218990318, + "learning_rate": 0.00016963325305420038, + "loss": 12.1993, + "step": 10180 + }, + { + "epoch": 0.5543961392116422, + "grad_norm": 0.6263000496737242, + "learning_rate": 0.00016962692374931, + "loss": 12.4251, + "step": 10181 + }, + { + "epoch": 0.5544505932082253, + "grad_norm": 0.5787263181251557, + "learning_rate": 0.00016962059390298898, + "loss": 12.2304, + "step": 10182 + }, + { + "epoch": 0.5545050472048083, + "grad_norm": 0.6270003142827943, + "learning_rate": 0.00016961426351528656, + "loss": 12.3031, + "step": 10183 + }, + { + "epoch": 0.5545595012013913, + "grad_norm": 0.654321820603327, + "learning_rate": 0.00016960793258625193, + "loss": 12.3821, + "step": 10184 + }, + { + "epoch": 0.5546139551979743, + "grad_norm": 0.5579974955006948, + "learning_rate": 0.00016960160111593434, + "loss": 12.2972, + "step": 10185 + }, + { + "epoch": 0.5546684091945573, + "grad_norm": 0.6189528881881449, + "learning_rate": 0.00016959526910438304, + "loss": 12.3706, + "step": 10186 + }, + { + "epoch": 0.5547228631911403, + "grad_norm": 0.6198609688250306, + "learning_rate": 0.00016958893655164725, + "loss": 12.2323, + "step": 10187 + }, + { + "epoch": 0.5547773171877234, + "grad_norm": 0.6120967966749081, + "learning_rate": 0.00016958260345777623, + "loss": 12.3337, + "step": 10188 + }, + { + "epoch": 0.5548317711843064, + "grad_norm": 0.6552890680808466, + "learning_rate": 0.00016957626982281926, + "loss": 12.3121, + "step": 10189 + }, + { + "epoch": 0.5548862251808894, + "grad_norm": 0.6545837701604202, + "learning_rate": 0.00016956993564682548, + "loss": 12.4834, + "step": 10190 + }, + { + "epoch": 0.5549406791774724, + "grad_norm": 0.5906345025180717, + "learning_rate": 0.0001695636009298442, + "loss": 12.3531, + "step": 10191 + }, + { + "epoch": 0.5549951331740554, + "grad_norm": 0.6952886347113761, + "learning_rate": 0.00016955726567192473, + "loss": 12.3136, + "step": 10192 + }, + { + "epoch": 0.5550495871706385, + "grad_norm": 0.7026250302498258, + "learning_rate": 0.0001695509298731163, + "loss": 12.4446, + "step": 10193 + }, + { + "epoch": 0.5551040411672215, + "grad_norm": 0.5808109078704449, + "learning_rate": 0.00016954459353346818, + "loss": 12.408, + "step": 10194 + }, + { + "epoch": 0.5551584951638044, + "grad_norm": 0.5422649522193064, + "learning_rate": 0.00016953825665302964, + "loss": 12.4269, + "step": 10195 + }, + { + "epoch": 0.5552129491603874, + "grad_norm": 0.5874935703337653, + "learning_rate": 0.00016953191923184995, + "loss": 12.3086, + "step": 10196 + }, + { + "epoch": 0.5552674031569704, + "grad_norm": 0.7222743809111872, + "learning_rate": 0.00016952558126997837, + "loss": 12.4497, + "step": 10197 + }, + { + "epoch": 0.5553218571535534, + "grad_norm": 0.61007669020962, + "learning_rate": 0.00016951924276746425, + "loss": 12.4958, + "step": 10198 + }, + { + "epoch": 0.5553763111501365, + "grad_norm": 0.6786102403273151, + "learning_rate": 0.0001695129037243568, + "loss": 12.3535, + "step": 10199 + }, + { + "epoch": 0.5554307651467195, + "grad_norm": 0.6785427181748215, + "learning_rate": 0.00016950656414070538, + "loss": 12.3854, + "step": 10200 + }, + { + "epoch": 0.5554852191433025, + "grad_norm": 0.6132769542909614, + "learning_rate": 0.00016950022401655926, + "loss": 12.3423, + "step": 10201 + }, + { + "epoch": 0.5555396731398855, + "grad_norm": 0.6373404879138806, + "learning_rate": 0.00016949388335196774, + "loss": 12.3078, + "step": 10202 + }, + { + "epoch": 0.5555941271364685, + "grad_norm": 0.6435052463794952, + "learning_rate": 0.0001694875421469801, + "loss": 12.4221, + "step": 10203 + }, + { + "epoch": 0.5556485811330515, + "grad_norm": 0.5621073972746748, + "learning_rate": 0.00016948120040164572, + "loss": 12.2795, + "step": 10204 + }, + { + "epoch": 0.5557030351296346, + "grad_norm": 0.7960388659184452, + "learning_rate": 0.00016947485811601384, + "loss": 12.5329, + "step": 10205 + }, + { + "epoch": 0.5557574891262176, + "grad_norm": 0.7107737798884084, + "learning_rate": 0.00016946851529013384, + "loss": 12.3655, + "step": 10206 + }, + { + "epoch": 0.5558119431228006, + "grad_norm": 0.8029325573958841, + "learning_rate": 0.00016946217192405501, + "loss": 12.4019, + "step": 10207 + }, + { + "epoch": 0.5558663971193836, + "grad_norm": 0.7149271685846226, + "learning_rate": 0.0001694558280178267, + "loss": 12.5062, + "step": 10208 + }, + { + "epoch": 0.5559208511159666, + "grad_norm": 0.615736615125051, + "learning_rate": 0.0001694494835714982, + "loss": 12.2179, + "step": 10209 + }, + { + "epoch": 0.5559753051125496, + "grad_norm": 0.7058656536912001, + "learning_rate": 0.00016944313858511886, + "loss": 12.3396, + "step": 10210 + }, + { + "epoch": 0.5560297591091327, + "grad_norm": 0.7412585447567388, + "learning_rate": 0.00016943679305873803, + "loss": 12.343, + "step": 10211 + }, + { + "epoch": 0.5560842131057157, + "grad_norm": 0.6208830350613329, + "learning_rate": 0.00016943044699240507, + "loss": 12.3917, + "step": 10212 + }, + { + "epoch": 0.5561386671022986, + "grad_norm": 0.7483707455695057, + "learning_rate": 0.00016942410038616932, + "loss": 12.3829, + "step": 10213 + }, + { + "epoch": 0.5561931210988816, + "grad_norm": 0.6240932002378885, + "learning_rate": 0.00016941775324008009, + "loss": 12.3216, + "step": 10214 + }, + { + "epoch": 0.5562475750954646, + "grad_norm": 0.654778145902865, + "learning_rate": 0.00016941140555418679, + "loss": 12.314, + "step": 10215 + }, + { + "epoch": 0.5563020290920476, + "grad_norm": 0.7072083853390658, + "learning_rate": 0.00016940505732853875, + "loss": 12.3792, + "step": 10216 + }, + { + "epoch": 0.5563564830886307, + "grad_norm": 0.6776859696620402, + "learning_rate": 0.00016939870856318533, + "loss": 12.5402, + "step": 10217 + }, + { + "epoch": 0.5564109370852137, + "grad_norm": 0.7617841105325746, + "learning_rate": 0.00016939235925817595, + "loss": 12.5152, + "step": 10218 + }, + { + "epoch": 0.5564653910817967, + "grad_norm": 0.560249323418678, + "learning_rate": 0.00016938600941355993, + "loss": 12.3686, + "step": 10219 + }, + { + "epoch": 0.5565198450783797, + "grad_norm": 0.6307457001537511, + "learning_rate": 0.00016937965902938666, + "loss": 12.3672, + "step": 10220 + }, + { + "epoch": 0.5565742990749627, + "grad_norm": 0.6787027309352848, + "learning_rate": 0.00016937330810570553, + "loss": 12.4397, + "step": 10221 + }, + { + "epoch": 0.5566287530715457, + "grad_norm": 0.5921244360742721, + "learning_rate": 0.00016936695664256592, + "loss": 12.3569, + "step": 10222 + }, + { + "epoch": 0.5566832070681288, + "grad_norm": 0.6766229603763056, + "learning_rate": 0.00016936060464001724, + "loss": 12.2169, + "step": 10223 + }, + { + "epoch": 0.5567376610647118, + "grad_norm": 0.592326425822779, + "learning_rate": 0.00016935425209810883, + "loss": 12.3279, + "step": 10224 + }, + { + "epoch": 0.5567921150612948, + "grad_norm": 0.6994820038562872, + "learning_rate": 0.00016934789901689018, + "loss": 12.3537, + "step": 10225 + }, + { + "epoch": 0.5568465690578778, + "grad_norm": 0.6581250385024046, + "learning_rate": 0.00016934154539641061, + "loss": 12.1664, + "step": 10226 + }, + { + "epoch": 0.5569010230544608, + "grad_norm": 0.6607535805023356, + "learning_rate": 0.00016933519123671955, + "loss": 12.3211, + "step": 10227 + }, + { + "epoch": 0.5569554770510439, + "grad_norm": 0.6100695309818525, + "learning_rate": 0.00016932883653786643, + "loss": 12.4688, + "step": 10228 + }, + { + "epoch": 0.5570099310476269, + "grad_norm": 0.6388569267528083, + "learning_rate": 0.0001693224812999006, + "loss": 12.3869, + "step": 10229 + }, + { + "epoch": 0.5570643850442099, + "grad_norm": 0.5718089310683809, + "learning_rate": 0.0001693161255228716, + "loss": 12.3244, + "step": 10230 + }, + { + "epoch": 0.5571188390407928, + "grad_norm": 0.644449510908533, + "learning_rate": 0.00016930976920682874, + "loss": 12.4063, + "step": 10231 + }, + { + "epoch": 0.5571732930373758, + "grad_norm": 0.5847832011670807, + "learning_rate": 0.0001693034123518215, + "loss": 12.2008, + "step": 10232 + }, + { + "epoch": 0.5572277470339588, + "grad_norm": 0.6134128450599079, + "learning_rate": 0.0001692970549578993, + "loss": 12.2552, + "step": 10233 + }, + { + "epoch": 0.5572822010305419, + "grad_norm": 0.627849807475507, + "learning_rate": 0.00016929069702511157, + "loss": 12.488, + "step": 10234 + }, + { + "epoch": 0.5573366550271249, + "grad_norm": 0.6765909156462427, + "learning_rate": 0.0001692843385535078, + "loss": 12.4191, + "step": 10235 + }, + { + "epoch": 0.5573911090237079, + "grad_norm": 0.6289812044691674, + "learning_rate": 0.00016927797954313734, + "loss": 12.3557, + "step": 10236 + }, + { + "epoch": 0.5574455630202909, + "grad_norm": 0.6431241847241467, + "learning_rate": 0.00016927161999404975, + "loss": 12.267, + "step": 10237 + }, + { + "epoch": 0.5575000170168739, + "grad_norm": 0.6980001507327088, + "learning_rate": 0.00016926525990629442, + "loss": 12.423, + "step": 10238 + }, + { + "epoch": 0.5575544710134569, + "grad_norm": 0.6329606744797434, + "learning_rate": 0.00016925889927992075, + "loss": 12.228, + "step": 10239 + }, + { + "epoch": 0.55760892501004, + "grad_norm": 0.7344798136238765, + "learning_rate": 0.00016925253811497833, + "loss": 12.4051, + "step": 10240 + }, + { + "epoch": 0.557663379006623, + "grad_norm": 0.6263600033987291, + "learning_rate": 0.0001692461764115165, + "loss": 12.2633, + "step": 10241 + }, + { + "epoch": 0.557717833003206, + "grad_norm": 0.5513225969056479, + "learning_rate": 0.00016923981416958484, + "loss": 12.2928, + "step": 10242 + }, + { + "epoch": 0.557772286999789, + "grad_norm": 0.5850290222487247, + "learning_rate": 0.00016923345138923277, + "loss": 12.2904, + "step": 10243 + }, + { + "epoch": 0.557826740996372, + "grad_norm": 0.6890330863275668, + "learning_rate": 0.00016922708807050975, + "loss": 12.3438, + "step": 10244 + }, + { + "epoch": 0.557881194992955, + "grad_norm": 0.6781093042843492, + "learning_rate": 0.0001692207242134653, + "loss": 12.5031, + "step": 10245 + }, + { + "epoch": 0.5579356489895381, + "grad_norm": 0.668948770477723, + "learning_rate": 0.00016921435981814888, + "loss": 12.405, + "step": 10246 + }, + { + "epoch": 0.5579901029861211, + "grad_norm": 0.677180028277028, + "learning_rate": 0.00016920799488461002, + "loss": 12.4159, + "step": 10247 + }, + { + "epoch": 0.558044556982704, + "grad_norm": 0.6147996023247381, + "learning_rate": 0.00016920162941289814, + "loss": 12.4116, + "step": 10248 + }, + { + "epoch": 0.558099010979287, + "grad_norm": 0.6424965673205608, + "learning_rate": 0.0001691952634030628, + "loss": 12.4225, + "step": 10249 + }, + { + "epoch": 0.55815346497587, + "grad_norm": 0.5620499261722912, + "learning_rate": 0.0001691888968551535, + "loss": 12.2864, + "step": 10250 + }, + { + "epoch": 0.558207918972453, + "grad_norm": 0.6026066590718929, + "learning_rate": 0.00016918252976921974, + "loss": 12.2664, + "step": 10251 + }, + { + "epoch": 0.5582623729690361, + "grad_norm": 0.6285570429220287, + "learning_rate": 0.000169176162145311, + "loss": 12.395, + "step": 10252 + }, + { + "epoch": 0.5583168269656191, + "grad_norm": 0.5369568931221449, + "learning_rate": 0.00016916979398347686, + "loss": 12.2386, + "step": 10253 + }, + { + "epoch": 0.5583712809622021, + "grad_norm": 0.6300205811992972, + "learning_rate": 0.00016916342528376676, + "loss": 12.3813, + "step": 10254 + }, + { + "epoch": 0.5584257349587851, + "grad_norm": 0.5810627694012088, + "learning_rate": 0.00016915705604623029, + "loss": 12.3711, + "step": 10255 + }, + { + "epoch": 0.5584801889553681, + "grad_norm": 0.6655122858572802, + "learning_rate": 0.00016915068627091696, + "loss": 12.2979, + "step": 10256 + }, + { + "epoch": 0.5585346429519512, + "grad_norm": 0.6426723429091911, + "learning_rate": 0.00016914431595787627, + "loss": 12.349, + "step": 10257 + }, + { + "epoch": 0.5585890969485342, + "grad_norm": 0.5969986660433423, + "learning_rate": 0.00016913794510715785, + "loss": 12.1996, + "step": 10258 + }, + { + "epoch": 0.5586435509451172, + "grad_norm": 0.6376400979426741, + "learning_rate": 0.0001691315737188111, + "loss": 12.2946, + "step": 10259 + }, + { + "epoch": 0.5586980049417002, + "grad_norm": 0.6622662726743559, + "learning_rate": 0.00016912520179288566, + "loss": 12.4308, + "step": 10260 + }, + { + "epoch": 0.5587524589382832, + "grad_norm": 0.611903090619256, + "learning_rate": 0.00016911882932943106, + "loss": 12.3932, + "step": 10261 + }, + { + "epoch": 0.5588069129348662, + "grad_norm": 0.6023841459818996, + "learning_rate": 0.00016911245632849684, + "loss": 12.3412, + "step": 10262 + }, + { + "epoch": 0.5588613669314493, + "grad_norm": 0.5844063374490647, + "learning_rate": 0.0001691060827901326, + "loss": 12.2498, + "step": 10263 + }, + { + "epoch": 0.5589158209280323, + "grad_norm": 0.599319382728331, + "learning_rate": 0.00016909970871438788, + "loss": 12.3185, + "step": 10264 + }, + { + "epoch": 0.5589702749246153, + "grad_norm": 0.5960458771528314, + "learning_rate": 0.0001690933341013122, + "loss": 12.4048, + "step": 10265 + }, + { + "epoch": 0.5590247289211983, + "grad_norm": 0.64571759754071, + "learning_rate": 0.00016908695895095517, + "loss": 12.4689, + "step": 10266 + }, + { + "epoch": 0.5590791829177812, + "grad_norm": 0.8047723957217814, + "learning_rate": 0.00016908058326336634, + "loss": 12.4611, + "step": 10267 + }, + { + "epoch": 0.5591336369143642, + "grad_norm": 0.683586842980497, + "learning_rate": 0.00016907420703859538, + "loss": 12.2063, + "step": 10268 + }, + { + "epoch": 0.5591880909109473, + "grad_norm": 0.7147507808213387, + "learning_rate": 0.00016906783027669176, + "loss": 12.4311, + "step": 10269 + }, + { + "epoch": 0.5592425449075303, + "grad_norm": 0.5674940897992446, + "learning_rate": 0.0001690614529777051, + "loss": 12.4471, + "step": 10270 + }, + { + "epoch": 0.5592969989041133, + "grad_norm": 0.557602325062582, + "learning_rate": 0.00016905507514168502, + "loss": 12.4233, + "step": 10271 + }, + { + "epoch": 0.5593514529006963, + "grad_norm": 0.6316895645364624, + "learning_rate": 0.00016904869676868107, + "loss": 12.4648, + "step": 10272 + }, + { + "epoch": 0.5594059068972793, + "grad_norm": 0.6515018637341894, + "learning_rate": 0.0001690423178587429, + "loss": 12.4008, + "step": 10273 + }, + { + "epoch": 0.5594603608938623, + "grad_norm": 0.5898901503061371, + "learning_rate": 0.00016903593841192008, + "loss": 12.3275, + "step": 10274 + }, + { + "epoch": 0.5595148148904454, + "grad_norm": 0.5662931590043602, + "learning_rate": 0.00016902955842826222, + "loss": 12.3504, + "step": 10275 + }, + { + "epoch": 0.5595692688870284, + "grad_norm": 0.6778051602663335, + "learning_rate": 0.00016902317790781895, + "loss": 12.25, + "step": 10276 + }, + { + "epoch": 0.5596237228836114, + "grad_norm": 0.6092788463144142, + "learning_rate": 0.00016901679685063986, + "loss": 12.3418, + "step": 10277 + }, + { + "epoch": 0.5596781768801944, + "grad_norm": 0.618180036489811, + "learning_rate": 0.0001690104152567746, + "loss": 12.3607, + "step": 10278 + }, + { + "epoch": 0.5597326308767774, + "grad_norm": 0.5860970509812605, + "learning_rate": 0.00016900403312627277, + "loss": 12.3366, + "step": 10279 + }, + { + "epoch": 0.5597870848733604, + "grad_norm": 0.601893208709934, + "learning_rate": 0.00016899765045918401, + "loss": 12.458, + "step": 10280 + }, + { + "epoch": 0.5598415388699435, + "grad_norm": 0.5861072547051973, + "learning_rate": 0.00016899126725555794, + "loss": 12.4812, + "step": 10281 + }, + { + "epoch": 0.5598959928665265, + "grad_norm": 0.5875897748674739, + "learning_rate": 0.0001689848835154442, + "loss": 12.1096, + "step": 10282 + }, + { + "epoch": 0.5599504468631095, + "grad_norm": 0.6160324753910715, + "learning_rate": 0.00016897849923889246, + "loss": 12.3096, + "step": 10283 + }, + { + "epoch": 0.5600049008596925, + "grad_norm": 0.6142925610254328, + "learning_rate": 0.0001689721144259523, + "loss": 12.3904, + "step": 10284 + }, + { + "epoch": 0.5600593548562754, + "grad_norm": 0.5637627348753075, + "learning_rate": 0.00016896572907667347, + "loss": 12.2946, + "step": 10285 + }, + { + "epoch": 0.5601138088528584, + "grad_norm": 0.5518503687523599, + "learning_rate": 0.00016895934319110555, + "loss": 12.3553, + "step": 10286 + }, + { + "epoch": 0.5601682628494415, + "grad_norm": 0.6328617298624295, + "learning_rate": 0.00016895295676929817, + "loss": 12.4334, + "step": 10287 + }, + { + "epoch": 0.5602227168460245, + "grad_norm": 0.620085531994173, + "learning_rate": 0.0001689465698113011, + "loss": 12.3402, + "step": 10288 + }, + { + "epoch": 0.5602771708426075, + "grad_norm": 0.5559505540958086, + "learning_rate": 0.00016894018231716385, + "loss": 12.2649, + "step": 10289 + }, + { + "epoch": 0.5603316248391905, + "grad_norm": 0.7008838030079539, + "learning_rate": 0.00016893379428693626, + "loss": 12.4094, + "step": 10290 + }, + { + "epoch": 0.5603860788357735, + "grad_norm": 0.605239972915833, + "learning_rate": 0.0001689274057206679, + "loss": 12.3578, + "step": 10291 + }, + { + "epoch": 0.5604405328323566, + "grad_norm": 0.6596637595277397, + "learning_rate": 0.00016892101661840846, + "loss": 12.2733, + "step": 10292 + }, + { + "epoch": 0.5604949868289396, + "grad_norm": 0.7951158901442011, + "learning_rate": 0.00016891462698020768, + "loss": 12.4856, + "step": 10293 + }, + { + "epoch": 0.5605494408255226, + "grad_norm": 0.6574492755937013, + "learning_rate": 0.00016890823680611517, + "loss": 12.2087, + "step": 10294 + }, + { + "epoch": 0.5606038948221056, + "grad_norm": 0.653813488593268, + "learning_rate": 0.00016890184609618064, + "loss": 12.1862, + "step": 10295 + }, + { + "epoch": 0.5606583488186886, + "grad_norm": 0.7244385612792106, + "learning_rate": 0.0001688954548504538, + "loss": 12.3884, + "step": 10296 + }, + { + "epoch": 0.5607128028152716, + "grad_norm": 0.6141729334360738, + "learning_rate": 0.00016888906306898436, + "loss": 12.3005, + "step": 10297 + }, + { + "epoch": 0.5607672568118547, + "grad_norm": 0.7246938177635958, + "learning_rate": 0.00016888267075182206, + "loss": 12.5213, + "step": 10298 + }, + { + "epoch": 0.5608217108084377, + "grad_norm": 0.7038881657945165, + "learning_rate": 0.00016887627789901653, + "loss": 12.5174, + "step": 10299 + }, + { + "epoch": 0.5608761648050207, + "grad_norm": 0.6321702896975392, + "learning_rate": 0.00016886988451061749, + "loss": 12.2754, + "step": 10300 + }, + { + "epoch": 0.5609306188016037, + "grad_norm": 0.6103387747847123, + "learning_rate": 0.0001688634905866747, + "loss": 12.5386, + "step": 10301 + }, + { + "epoch": 0.5609850727981867, + "grad_norm": 0.732025469317435, + "learning_rate": 0.00016885709612723783, + "loss": 12.4871, + "step": 10302 + }, + { + "epoch": 0.5610395267947696, + "grad_norm": 0.6325210977683443, + "learning_rate": 0.00016885070113235667, + "loss": 12.4183, + "step": 10303 + }, + { + "epoch": 0.5610939807913528, + "grad_norm": 0.6393350500913199, + "learning_rate": 0.00016884430560208088, + "loss": 12.4113, + "step": 10304 + }, + { + "epoch": 0.5611484347879357, + "grad_norm": 0.5909162202743099, + "learning_rate": 0.00016883790953646025, + "loss": 12.3757, + "step": 10305 + }, + { + "epoch": 0.5612028887845187, + "grad_norm": 0.7084515897215775, + "learning_rate": 0.0001688315129355445, + "loss": 12.3812, + "step": 10306 + }, + { + "epoch": 0.5612573427811017, + "grad_norm": 0.6225796371499448, + "learning_rate": 0.00016882511579938334, + "loss": 12.1873, + "step": 10307 + }, + { + "epoch": 0.5613117967776847, + "grad_norm": 0.5897419700844331, + "learning_rate": 0.00016881871812802652, + "loss": 12.2328, + "step": 10308 + }, + { + "epoch": 0.5613662507742677, + "grad_norm": 0.626257819909121, + "learning_rate": 0.00016881231992152385, + "loss": 12.2687, + "step": 10309 + }, + { + "epoch": 0.5614207047708508, + "grad_norm": 0.6783609424573152, + "learning_rate": 0.00016880592117992502, + "loss": 12.4556, + "step": 10310 + }, + { + "epoch": 0.5614751587674338, + "grad_norm": 0.5433012075072357, + "learning_rate": 0.0001687995219032798, + "loss": 12.3478, + "step": 10311 + }, + { + "epoch": 0.5615296127640168, + "grad_norm": 0.6673882896234298, + "learning_rate": 0.00016879312209163797, + "loss": 12.5079, + "step": 10312 + }, + { + "epoch": 0.5615840667605998, + "grad_norm": 0.636024906690236, + "learning_rate": 0.00016878672174504926, + "loss": 12.3692, + "step": 10313 + }, + { + "epoch": 0.5616385207571828, + "grad_norm": 0.5898485469427649, + "learning_rate": 0.00016878032086356352, + "loss": 12.4541, + "step": 10314 + }, + { + "epoch": 0.5616929747537658, + "grad_norm": 0.5545109457042696, + "learning_rate": 0.00016877391944723042, + "loss": 12.3289, + "step": 10315 + }, + { + "epoch": 0.5617474287503489, + "grad_norm": 0.6048015573465777, + "learning_rate": 0.00016876751749609982, + "loss": 12.043, + "step": 10316 + }, + { + "epoch": 0.5618018827469319, + "grad_norm": 0.6191032621169081, + "learning_rate": 0.00016876111501022147, + "loss": 12.4121, + "step": 10317 + }, + { + "epoch": 0.5618563367435149, + "grad_norm": 0.6512499912882861, + "learning_rate": 0.00016875471198964513, + "loss": 12.4435, + "step": 10318 + }, + { + "epoch": 0.5619107907400979, + "grad_norm": 0.5871615324615288, + "learning_rate": 0.00016874830843442067, + "loss": 12.3567, + "step": 10319 + }, + { + "epoch": 0.5619652447366809, + "grad_norm": 0.5919840115805446, + "learning_rate": 0.00016874190434459777, + "loss": 12.4784, + "step": 10320 + }, + { + "epoch": 0.5620196987332639, + "grad_norm": 0.5943262467755525, + "learning_rate": 0.00016873549972022634, + "loss": 12.2569, + "step": 10321 + }, + { + "epoch": 0.562074152729847, + "grad_norm": 0.6056333912357738, + "learning_rate": 0.00016872909456135612, + "loss": 12.2407, + "step": 10322 + }, + { + "epoch": 0.56212860672643, + "grad_norm": 0.6412606462162442, + "learning_rate": 0.00016872268886803692, + "loss": 12.4993, + "step": 10323 + }, + { + "epoch": 0.5621830607230129, + "grad_norm": 0.5951322402046102, + "learning_rate": 0.00016871628264031864, + "loss": 12.2904, + "step": 10324 + }, + { + "epoch": 0.5622375147195959, + "grad_norm": 0.5957303297392885, + "learning_rate": 0.00016870987587825094, + "loss": 12.2501, + "step": 10325 + }, + { + "epoch": 0.5622919687161789, + "grad_norm": 0.6737073150568589, + "learning_rate": 0.00016870346858188374, + "loss": 12.3033, + "step": 10326 + }, + { + "epoch": 0.562346422712762, + "grad_norm": 0.5901110815473725, + "learning_rate": 0.00016869706075126688, + "loss": 12.3207, + "step": 10327 + }, + { + "epoch": 0.562400876709345, + "grad_norm": 0.6913805237600109, + "learning_rate": 0.00016869065238645013, + "loss": 12.2452, + "step": 10328 + }, + { + "epoch": 0.562455330705928, + "grad_norm": 0.597764961757747, + "learning_rate": 0.00016868424348748335, + "loss": 12.3216, + "step": 10329 + }, + { + "epoch": 0.562509784702511, + "grad_norm": 0.5970006749445501, + "learning_rate": 0.00016867783405441638, + "loss": 12.1318, + "step": 10330 + }, + { + "epoch": 0.562564238699094, + "grad_norm": 0.5930412764100742, + "learning_rate": 0.00016867142408729904, + "loss": 12.3634, + "step": 10331 + }, + { + "epoch": 0.562618692695677, + "grad_norm": 0.6212977251363766, + "learning_rate": 0.0001686650135861812, + "loss": 12.3131, + "step": 10332 + }, + { + "epoch": 0.5626731466922601, + "grad_norm": 0.5688540545730704, + "learning_rate": 0.00016865860255111268, + "loss": 12.3945, + "step": 10333 + }, + { + "epoch": 0.5627276006888431, + "grad_norm": 0.5789013058650888, + "learning_rate": 0.0001686521909821434, + "loss": 12.2789, + "step": 10334 + }, + { + "epoch": 0.5627820546854261, + "grad_norm": 0.6335481871931581, + "learning_rate": 0.00016864577887932313, + "loss": 12.4302, + "step": 10335 + }, + { + "epoch": 0.5628365086820091, + "grad_norm": 0.5939013797957428, + "learning_rate": 0.00016863936624270177, + "loss": 12.3346, + "step": 10336 + }, + { + "epoch": 0.5628909626785921, + "grad_norm": 0.6006353953254291, + "learning_rate": 0.0001686329530723292, + "loss": 12.2658, + "step": 10337 + }, + { + "epoch": 0.5629454166751751, + "grad_norm": 0.6047800477910621, + "learning_rate": 0.0001686265393682553, + "loss": 12.3286, + "step": 10338 + }, + { + "epoch": 0.5629998706717582, + "grad_norm": 0.6373248187535643, + "learning_rate": 0.0001686201251305299, + "loss": 12.3815, + "step": 10339 + }, + { + "epoch": 0.5630543246683412, + "grad_norm": 0.6356603420763361, + "learning_rate": 0.00016861371035920288, + "loss": 12.3143, + "step": 10340 + }, + { + "epoch": 0.5631087786649241, + "grad_norm": 0.5582684519094009, + "learning_rate": 0.0001686072950543242, + "loss": 12.3037, + "step": 10341 + }, + { + "epoch": 0.5631632326615071, + "grad_norm": 0.5789571181814924, + "learning_rate": 0.0001686008792159436, + "loss": 12.317, + "step": 10342 + }, + { + "epoch": 0.5632176866580901, + "grad_norm": 0.6739099289032652, + "learning_rate": 0.00016859446284411112, + "loss": 12.3593, + "step": 10343 + }, + { + "epoch": 0.5632721406546731, + "grad_norm": 0.5921743437082129, + "learning_rate": 0.00016858804593887657, + "loss": 12.274, + "step": 10344 + }, + { + "epoch": 0.5633265946512562, + "grad_norm": 0.5708854669236476, + "learning_rate": 0.0001685816285002899, + "loss": 12.2073, + "step": 10345 + }, + { + "epoch": 0.5633810486478392, + "grad_norm": 0.6308677383448402, + "learning_rate": 0.00016857521052840096, + "loss": 12.4676, + "step": 10346 + }, + { + "epoch": 0.5634355026444222, + "grad_norm": 0.6795680317404507, + "learning_rate": 0.0001685687920232597, + "loss": 12.4565, + "step": 10347 + }, + { + "epoch": 0.5634899566410052, + "grad_norm": 0.5569416277182532, + "learning_rate": 0.000168562372984916, + "loss": 12.347, + "step": 10348 + }, + { + "epoch": 0.5635444106375882, + "grad_norm": 0.562407880823792, + "learning_rate": 0.00016855595341341977, + "loss": 12.3733, + "step": 10349 + }, + { + "epoch": 0.5635988646341712, + "grad_norm": 0.5202134266489101, + "learning_rate": 0.000168549533308821, + "loss": 12.2996, + "step": 10350 + }, + { + "epoch": 0.5636533186307543, + "grad_norm": 0.6628255911322798, + "learning_rate": 0.00016854311267116954, + "loss": 12.4203, + "step": 10351 + }, + { + "epoch": 0.5637077726273373, + "grad_norm": 0.596815972075363, + "learning_rate": 0.00016853669150051535, + "loss": 12.3696, + "step": 10352 + }, + { + "epoch": 0.5637622266239203, + "grad_norm": 0.5586849210176513, + "learning_rate": 0.00016853026979690833, + "loss": 12.2975, + "step": 10353 + }, + { + "epoch": 0.5638166806205033, + "grad_norm": 0.6823568075738485, + "learning_rate": 0.00016852384756039848, + "loss": 12.443, + "step": 10354 + }, + { + "epoch": 0.5638711346170863, + "grad_norm": 0.5933142978854868, + "learning_rate": 0.00016851742479103563, + "loss": 12.2016, + "step": 10355 + }, + { + "epoch": 0.5639255886136693, + "grad_norm": 0.597479194997447, + "learning_rate": 0.00016851100148886986, + "loss": 12.3549, + "step": 10356 + }, + { + "epoch": 0.5639800426102524, + "grad_norm": 0.6636350663771243, + "learning_rate": 0.00016850457765395102, + "loss": 12.4796, + "step": 10357 + }, + { + "epoch": 0.5640344966068354, + "grad_norm": 0.6607380152702863, + "learning_rate": 0.0001684981532863291, + "loss": 12.2784, + "step": 10358 + }, + { + "epoch": 0.5640889506034183, + "grad_norm": 0.6877667224482812, + "learning_rate": 0.00016849172838605408, + "loss": 12.403, + "step": 10359 + }, + { + "epoch": 0.5641434046000013, + "grad_norm": 0.5659503162516681, + "learning_rate": 0.00016848530295317585, + "loss": 12.44, + "step": 10360 + }, + { + "epoch": 0.5641978585965843, + "grad_norm": 0.685369008360853, + "learning_rate": 0.00016847887698774445, + "loss": 12.4309, + "step": 10361 + }, + { + "epoch": 0.5642523125931674, + "grad_norm": 0.6320646119966852, + "learning_rate": 0.0001684724504898098, + "loss": 12.339, + "step": 10362 + }, + { + "epoch": 0.5643067665897504, + "grad_norm": 0.6212202368714012, + "learning_rate": 0.00016846602345942191, + "loss": 12.4303, + "step": 10363 + }, + { + "epoch": 0.5643612205863334, + "grad_norm": 0.6367487206819554, + "learning_rate": 0.00016845959589663074, + "loss": 12.421, + "step": 10364 + }, + { + "epoch": 0.5644156745829164, + "grad_norm": 0.681183679084473, + "learning_rate": 0.00016845316780148627, + "loss": 12.3895, + "step": 10365 + }, + { + "epoch": 0.5644701285794994, + "grad_norm": 0.6529961642623131, + "learning_rate": 0.00016844673917403849, + "loss": 12.4642, + "step": 10366 + }, + { + "epoch": 0.5645245825760824, + "grad_norm": 0.621415406056903, + "learning_rate": 0.00016844031001433734, + "loss": 12.2171, + "step": 10367 + }, + { + "epoch": 0.5645790365726655, + "grad_norm": 0.5613170596312487, + "learning_rate": 0.00016843388032243293, + "loss": 12.2744, + "step": 10368 + }, + { + "epoch": 0.5646334905692485, + "grad_norm": 0.7287747104215444, + "learning_rate": 0.00016842745009837515, + "loss": 12.4659, + "step": 10369 + }, + { + "epoch": 0.5646879445658315, + "grad_norm": 0.544141330334673, + "learning_rate": 0.00016842101934221406, + "loss": 12.2132, + "step": 10370 + }, + { + "epoch": 0.5647423985624145, + "grad_norm": 0.6041199910559678, + "learning_rate": 0.00016841458805399965, + "loss": 12.3917, + "step": 10371 + }, + { + "epoch": 0.5647968525589975, + "grad_norm": 0.6225477911605025, + "learning_rate": 0.0001684081562337819, + "loss": 12.4877, + "step": 10372 + }, + { + "epoch": 0.5648513065555805, + "grad_norm": 0.589768904637465, + "learning_rate": 0.0001684017238816109, + "loss": 12.3144, + "step": 10373 + }, + { + "epoch": 0.5649057605521636, + "grad_norm": 0.6221244535619938, + "learning_rate": 0.0001683952909975366, + "loss": 12.4835, + "step": 10374 + }, + { + "epoch": 0.5649602145487466, + "grad_norm": 0.6425525815174361, + "learning_rate": 0.00016838885758160905, + "loss": 12.4115, + "step": 10375 + }, + { + "epoch": 0.5650146685453296, + "grad_norm": 0.6606842162669248, + "learning_rate": 0.0001683824236338783, + "loss": 12.3993, + "step": 10376 + }, + { + "epoch": 0.5650691225419125, + "grad_norm": 0.6084374603777637, + "learning_rate": 0.0001683759891543943, + "loss": 12.3677, + "step": 10377 + }, + { + "epoch": 0.5651235765384955, + "grad_norm": 0.6991656978018909, + "learning_rate": 0.00016836955414320715, + "loss": 12.3793, + "step": 10378 + }, + { + "epoch": 0.5651780305350785, + "grad_norm": 0.6138426062310326, + "learning_rate": 0.00016836311860036693, + "loss": 12.3313, + "step": 10379 + }, + { + "epoch": 0.5652324845316616, + "grad_norm": 0.6275506547891637, + "learning_rate": 0.0001683566825259236, + "loss": 12.2722, + "step": 10380 + }, + { + "epoch": 0.5652869385282446, + "grad_norm": 0.9161209021743207, + "learning_rate": 0.00016835024591992724, + "loss": 12.3421, + "step": 10381 + }, + { + "epoch": 0.5653413925248276, + "grad_norm": 0.6321254761895913, + "learning_rate": 0.00016834380878242794, + "loss": 12.2944, + "step": 10382 + }, + { + "epoch": 0.5653958465214106, + "grad_norm": 0.6345049944779382, + "learning_rate": 0.00016833737111347568, + "loss": 12.484, + "step": 10383 + }, + { + "epoch": 0.5654503005179936, + "grad_norm": 0.623691170627115, + "learning_rate": 0.0001683309329131206, + "loss": 12.3452, + "step": 10384 + }, + { + "epoch": 0.5655047545145766, + "grad_norm": 0.6169359428690868, + "learning_rate": 0.0001683244941814127, + "loss": 12.2734, + "step": 10385 + }, + { + "epoch": 0.5655592085111597, + "grad_norm": 0.5911971693461668, + "learning_rate": 0.00016831805491840208, + "loss": 12.4244, + "step": 10386 + }, + { + "epoch": 0.5656136625077427, + "grad_norm": 0.6138794288462133, + "learning_rate": 0.0001683116151241388, + "loss": 12.3843, + "step": 10387 + }, + { + "epoch": 0.5656681165043257, + "grad_norm": 0.6087658301053654, + "learning_rate": 0.00016830517479867298, + "loss": 12.2409, + "step": 10388 + }, + { + "epoch": 0.5657225705009087, + "grad_norm": 0.5051577573517587, + "learning_rate": 0.00016829873394205464, + "loss": 12.2384, + "step": 10389 + }, + { + "epoch": 0.5657770244974917, + "grad_norm": 0.5880845343996741, + "learning_rate": 0.0001682922925543339, + "loss": 12.3321, + "step": 10390 + }, + { + "epoch": 0.5658314784940748, + "grad_norm": 0.5928845999091579, + "learning_rate": 0.00016828585063556083, + "loss": 12.2001, + "step": 10391 + }, + { + "epoch": 0.5658859324906578, + "grad_norm": 0.6827691259284938, + "learning_rate": 0.0001682794081857855, + "loss": 12.5109, + "step": 10392 + }, + { + "epoch": 0.5659403864872408, + "grad_norm": 0.5642687647023201, + "learning_rate": 0.0001682729652050581, + "loss": 12.3254, + "step": 10393 + }, + { + "epoch": 0.5659948404838238, + "grad_norm": 0.6148651923790508, + "learning_rate": 0.00016826652169342867, + "loss": 12.3495, + "step": 10394 + }, + { + "epoch": 0.5660492944804068, + "grad_norm": 0.6452765348559973, + "learning_rate": 0.00016826007765094732, + "loss": 12.3754, + "step": 10395 + }, + { + "epoch": 0.5661037484769897, + "grad_norm": 0.5965890572179519, + "learning_rate": 0.00016825363307766412, + "loss": 12.418, + "step": 10396 + }, + { + "epoch": 0.5661582024735728, + "grad_norm": 0.5471346501778647, + "learning_rate": 0.00016824718797362923, + "loss": 12.3318, + "step": 10397 + }, + { + "epoch": 0.5662126564701558, + "grad_norm": 0.6751846501375386, + "learning_rate": 0.00016824074233889278, + "loss": 12.5202, + "step": 10398 + }, + { + "epoch": 0.5662671104667388, + "grad_norm": 0.6218888957514784, + "learning_rate": 0.00016823429617350487, + "loss": 12.276, + "step": 10399 + }, + { + "epoch": 0.5663215644633218, + "grad_norm": 0.6492663612660368, + "learning_rate": 0.00016822784947751563, + "loss": 12.391, + "step": 10400 + }, + { + "epoch": 0.5663760184599048, + "grad_norm": 0.6232618935944115, + "learning_rate": 0.0001682214022509752, + "loss": 12.3163, + "step": 10401 + }, + { + "epoch": 0.5664304724564878, + "grad_norm": 0.649271965491751, + "learning_rate": 0.00016821495449393368, + "loss": 12.4013, + "step": 10402 + }, + { + "epoch": 0.5664849264530709, + "grad_norm": 0.66748998807262, + "learning_rate": 0.00016820850620644125, + "loss": 12.2451, + "step": 10403 + }, + { + "epoch": 0.5665393804496539, + "grad_norm": 0.7082320217361083, + "learning_rate": 0.00016820205738854804, + "loss": 12.2669, + "step": 10404 + }, + { + "epoch": 0.5665938344462369, + "grad_norm": 0.6083087122633456, + "learning_rate": 0.0001681956080403042, + "loss": 12.344, + "step": 10405 + }, + { + "epoch": 0.5666482884428199, + "grad_norm": 0.5845446637283482, + "learning_rate": 0.00016818915816175985, + "loss": 12.3627, + "step": 10406 + }, + { + "epoch": 0.5667027424394029, + "grad_norm": 0.6304622759286626, + "learning_rate": 0.00016818270775296519, + "loss": 12.3687, + "step": 10407 + }, + { + "epoch": 0.5667571964359859, + "grad_norm": 0.5473269293115471, + "learning_rate": 0.00016817625681397034, + "loss": 12.213, + "step": 10408 + }, + { + "epoch": 0.566811650432569, + "grad_norm": 0.6984369382628332, + "learning_rate": 0.00016816980534482552, + "loss": 12.2926, + "step": 10409 + }, + { + "epoch": 0.566866104429152, + "grad_norm": 0.6662395723038156, + "learning_rate": 0.00016816335334558083, + "loss": 12.4141, + "step": 10410 + }, + { + "epoch": 0.566920558425735, + "grad_norm": 0.6210259725286218, + "learning_rate": 0.0001681569008162865, + "loss": 12.3193, + "step": 10411 + }, + { + "epoch": 0.566975012422318, + "grad_norm": 0.5429922620912881, + "learning_rate": 0.00016815044775699266, + "loss": 12.2855, + "step": 10412 + }, + { + "epoch": 0.567029466418901, + "grad_norm": 0.5977332368552712, + "learning_rate": 0.0001681439941677495, + "loss": 12.2268, + "step": 10413 + }, + { + "epoch": 0.5670839204154839, + "grad_norm": 0.6159050614116562, + "learning_rate": 0.00016813754004860724, + "loss": 12.1776, + "step": 10414 + }, + { + "epoch": 0.567138374412067, + "grad_norm": 0.582597585497368, + "learning_rate": 0.00016813108539961603, + "loss": 12.3413, + "step": 10415 + }, + { + "epoch": 0.56719282840865, + "grad_norm": 0.6513474497024757, + "learning_rate": 0.00016812463022082607, + "loss": 12.394, + "step": 10416 + }, + { + "epoch": 0.567247282405233, + "grad_norm": 0.5895717171760396, + "learning_rate": 0.00016811817451228757, + "loss": 12.3976, + "step": 10417 + }, + { + "epoch": 0.567301736401816, + "grad_norm": 0.6145716743427717, + "learning_rate": 0.00016811171827405073, + "loss": 12.4249, + "step": 10418 + }, + { + "epoch": 0.567356190398399, + "grad_norm": 0.6429917849423017, + "learning_rate": 0.00016810526150616572, + "loss": 12.3861, + "step": 10419 + }, + { + "epoch": 0.567410644394982, + "grad_norm": 0.5341869197548992, + "learning_rate": 0.0001680988042086828, + "loss": 12.2854, + "step": 10420 + }, + { + "epoch": 0.5674650983915651, + "grad_norm": 0.6370657440114558, + "learning_rate": 0.00016809234638165212, + "loss": 12.3632, + "step": 10421 + }, + { + "epoch": 0.5675195523881481, + "grad_norm": 0.6345636872655567, + "learning_rate": 0.00016808588802512398, + "loss": 12.3265, + "step": 10422 + }, + { + "epoch": 0.5675740063847311, + "grad_norm": 0.5308037534905589, + "learning_rate": 0.00016807942913914855, + "loss": 12.2471, + "step": 10423 + }, + { + "epoch": 0.5676284603813141, + "grad_norm": 0.7217080141177963, + "learning_rate": 0.00016807296972377604, + "loss": 12.3981, + "step": 10424 + }, + { + "epoch": 0.5676829143778971, + "grad_norm": 0.6042083497364779, + "learning_rate": 0.00016806650977905672, + "loss": 12.2527, + "step": 10425 + }, + { + "epoch": 0.5677373683744802, + "grad_norm": 0.5745092248979259, + "learning_rate": 0.00016806004930504078, + "loss": 12.1766, + "step": 10426 + }, + { + "epoch": 0.5677918223710632, + "grad_norm": 0.5705259495585121, + "learning_rate": 0.0001680535883017785, + "loss": 12.2859, + "step": 10427 + }, + { + "epoch": 0.5678462763676462, + "grad_norm": 0.5417193186976931, + "learning_rate": 0.0001680471267693201, + "loss": 12.4, + "step": 10428 + }, + { + "epoch": 0.5679007303642292, + "grad_norm": 0.5834722615703474, + "learning_rate": 0.00016804066470771584, + "loss": 12.3393, + "step": 10429 + }, + { + "epoch": 0.5679551843608122, + "grad_norm": 0.7036143280210932, + "learning_rate": 0.00016803420211701598, + "loss": 12.5283, + "step": 10430 + }, + { + "epoch": 0.5680096383573952, + "grad_norm": 0.5979312547772383, + "learning_rate": 0.00016802773899727072, + "loss": 12.4094, + "step": 10431 + }, + { + "epoch": 0.5680640923539783, + "grad_norm": 0.6090176540758172, + "learning_rate": 0.00016802127534853035, + "loss": 12.285, + "step": 10432 + }, + { + "epoch": 0.5681185463505612, + "grad_norm": 0.6339843877396358, + "learning_rate": 0.00016801481117084514, + "loss": 12.3795, + "step": 10433 + }, + { + "epoch": 0.5681730003471442, + "grad_norm": 0.6647505095307242, + "learning_rate": 0.0001680083464642654, + "loss": 12.4322, + "step": 10434 + }, + { + "epoch": 0.5682274543437272, + "grad_norm": 0.5818159565641814, + "learning_rate": 0.00016800188122884132, + "loss": 12.4275, + "step": 10435 + }, + { + "epoch": 0.5682819083403102, + "grad_norm": 0.6228052874764259, + "learning_rate": 0.00016799541546462317, + "loss": 12.3025, + "step": 10436 + }, + { + "epoch": 0.5683363623368932, + "grad_norm": 0.6335954949628722, + "learning_rate": 0.0001679889491716613, + "loss": 12.4529, + "step": 10437 + }, + { + "epoch": 0.5683908163334763, + "grad_norm": 0.6038600388041353, + "learning_rate": 0.00016798248235000597, + "loss": 12.3209, + "step": 10438 + }, + { + "epoch": 0.5684452703300593, + "grad_norm": 0.6117645541689094, + "learning_rate": 0.00016797601499970744, + "loss": 12.3349, + "step": 10439 + }, + { + "epoch": 0.5684997243266423, + "grad_norm": 0.7230541776819218, + "learning_rate": 0.00016796954712081604, + "loss": 12.4097, + "step": 10440 + }, + { + "epoch": 0.5685541783232253, + "grad_norm": 0.6428774400716817, + "learning_rate": 0.000167963078713382, + "loss": 12.394, + "step": 10441 + }, + { + "epoch": 0.5686086323198083, + "grad_norm": 0.6505580888411074, + "learning_rate": 0.00016795660977745572, + "loss": 12.1937, + "step": 10442 + }, + { + "epoch": 0.5686630863163913, + "grad_norm": 0.6437749858847259, + "learning_rate": 0.0001679501403130874, + "loss": 12.2522, + "step": 10443 + }, + { + "epoch": 0.5687175403129744, + "grad_norm": 0.5941359930214059, + "learning_rate": 0.00016794367032032742, + "loss": 12.3814, + "step": 10444 + }, + { + "epoch": 0.5687719943095574, + "grad_norm": 0.6410446086869066, + "learning_rate": 0.00016793719979922605, + "loss": 12.4891, + "step": 10445 + }, + { + "epoch": 0.5688264483061404, + "grad_norm": 0.5759340350528791, + "learning_rate": 0.00016793072874983362, + "loss": 12.3136, + "step": 10446 + }, + { + "epoch": 0.5688809023027234, + "grad_norm": 0.6219826681075947, + "learning_rate": 0.00016792425717220046, + "loss": 12.4172, + "step": 10447 + }, + { + "epoch": 0.5689353562993064, + "grad_norm": 0.5902531241211377, + "learning_rate": 0.00016791778506637688, + "loss": 12.3866, + "step": 10448 + }, + { + "epoch": 0.5689898102958894, + "grad_norm": 0.5902582942414536, + "learning_rate": 0.0001679113124324132, + "loss": 12.3021, + "step": 10449 + }, + { + "epoch": 0.5690442642924725, + "grad_norm": 0.5751312773984891, + "learning_rate": 0.0001679048392703598, + "loss": 12.3701, + "step": 10450 + }, + { + "epoch": 0.5690987182890554, + "grad_norm": 0.6790559083330199, + "learning_rate": 0.00016789836558026697, + "loss": 12.3923, + "step": 10451 + }, + { + "epoch": 0.5691531722856384, + "grad_norm": 0.6452000054378788, + "learning_rate": 0.00016789189136218505, + "loss": 12.3022, + "step": 10452 + }, + { + "epoch": 0.5692076262822214, + "grad_norm": 0.5691936565874609, + "learning_rate": 0.0001678854166161644, + "loss": 12.3396, + "step": 10453 + }, + { + "epoch": 0.5692620802788044, + "grad_norm": 0.5693653034216821, + "learning_rate": 0.00016787894134225536, + "loss": 12.386, + "step": 10454 + }, + { + "epoch": 0.5693165342753874, + "grad_norm": 0.6508692340480876, + "learning_rate": 0.0001678724655405083, + "loss": 12.2845, + "step": 10455 + }, + { + "epoch": 0.5693709882719705, + "grad_norm": 0.6186699806695385, + "learning_rate": 0.00016786598921097358, + "loss": 12.3824, + "step": 10456 + }, + { + "epoch": 0.5694254422685535, + "grad_norm": 0.7158082493266653, + "learning_rate": 0.00016785951235370153, + "loss": 12.4277, + "step": 10457 + }, + { + "epoch": 0.5694798962651365, + "grad_norm": 0.656346164357275, + "learning_rate": 0.0001678530349687425, + "loss": 12.3382, + "step": 10458 + }, + { + "epoch": 0.5695343502617195, + "grad_norm": 0.7203813696149984, + "learning_rate": 0.00016784655705614693, + "loss": 12.3475, + "step": 10459 + }, + { + "epoch": 0.5695888042583025, + "grad_norm": 0.6609864561370712, + "learning_rate": 0.00016784007861596518, + "loss": 12.4706, + "step": 10460 + }, + { + "epoch": 0.5696432582548856, + "grad_norm": 0.6048227744145742, + "learning_rate": 0.00016783359964824755, + "loss": 12.3333, + "step": 10461 + }, + { + "epoch": 0.5696977122514686, + "grad_norm": 0.7298358242850399, + "learning_rate": 0.0001678271201530445, + "loss": 12.3152, + "step": 10462 + }, + { + "epoch": 0.5697521662480516, + "grad_norm": 0.6721553190851784, + "learning_rate": 0.00016782064013040637, + "loss": 12.3147, + "step": 10463 + }, + { + "epoch": 0.5698066202446346, + "grad_norm": 0.6208164219046698, + "learning_rate": 0.0001678141595803836, + "loss": 12.3959, + "step": 10464 + }, + { + "epoch": 0.5698610742412176, + "grad_norm": 0.7267473495148199, + "learning_rate": 0.00016780767850302654, + "loss": 12.2043, + "step": 10465 + }, + { + "epoch": 0.5699155282378006, + "grad_norm": 0.5815895018152115, + "learning_rate": 0.0001678011968983856, + "loss": 12.3113, + "step": 10466 + }, + { + "epoch": 0.5699699822343837, + "grad_norm": 0.6407164249371863, + "learning_rate": 0.0001677947147665112, + "loss": 12.335, + "step": 10467 + }, + { + "epoch": 0.5700244362309667, + "grad_norm": 0.7096471186991803, + "learning_rate": 0.00016778823210745366, + "loss": 12.314, + "step": 10468 + }, + { + "epoch": 0.5700788902275497, + "grad_norm": 0.6327973720906753, + "learning_rate": 0.0001677817489212635, + "loss": 12.3507, + "step": 10469 + }, + { + "epoch": 0.5701333442241326, + "grad_norm": 0.5987693162255612, + "learning_rate": 0.00016777526520799115, + "loss": 12.2983, + "step": 10470 + }, + { + "epoch": 0.5701877982207156, + "grad_norm": 0.6552558142622182, + "learning_rate": 0.0001677687809676869, + "loss": 12.3778, + "step": 10471 + }, + { + "epoch": 0.5702422522172986, + "grad_norm": 0.5967141861925086, + "learning_rate": 0.00016776229620040124, + "loss": 12.3893, + "step": 10472 + }, + { + "epoch": 0.5702967062138817, + "grad_norm": 0.6176664295765997, + "learning_rate": 0.00016775581090618463, + "loss": 12.4305, + "step": 10473 + }, + { + "epoch": 0.5703511602104647, + "grad_norm": 0.6591993654093905, + "learning_rate": 0.00016774932508508748, + "loss": 12.3443, + "step": 10474 + }, + { + "epoch": 0.5704056142070477, + "grad_norm": 0.584863376348845, + "learning_rate": 0.0001677428387371602, + "loss": 12.3306, + "step": 10475 + }, + { + "epoch": 0.5704600682036307, + "grad_norm": 0.5963330913592126, + "learning_rate": 0.00016773635186245324, + "loss": 12.4846, + "step": 10476 + }, + { + "epoch": 0.5705145222002137, + "grad_norm": 0.6438054090971593, + "learning_rate": 0.00016772986446101707, + "loss": 12.3858, + "step": 10477 + }, + { + "epoch": 0.5705689761967967, + "grad_norm": 0.6064314578781214, + "learning_rate": 0.0001677233765329021, + "loss": 12.3667, + "step": 10478 + }, + { + "epoch": 0.5706234301933798, + "grad_norm": 0.5348696070350466, + "learning_rate": 0.00016771688807815883, + "loss": 12.3922, + "step": 10479 + }, + { + "epoch": 0.5706778841899628, + "grad_norm": 0.6136181266951442, + "learning_rate": 0.00016771039909683767, + "loss": 12.3923, + "step": 10480 + }, + { + "epoch": 0.5707323381865458, + "grad_norm": 0.6356924851395139, + "learning_rate": 0.00016770390958898904, + "loss": 12.4682, + "step": 10481 + }, + { + "epoch": 0.5707867921831288, + "grad_norm": 0.5611718815605147, + "learning_rate": 0.0001676974195546635, + "loss": 12.4042, + "step": 10482 + }, + { + "epoch": 0.5708412461797118, + "grad_norm": 0.5599668471880284, + "learning_rate": 0.00016769092899391146, + "loss": 12.3262, + "step": 10483 + }, + { + "epoch": 0.5708957001762948, + "grad_norm": 0.6186802999347444, + "learning_rate": 0.0001676844379067834, + "loss": 12.3928, + "step": 10484 + }, + { + "epoch": 0.5709501541728779, + "grad_norm": 0.5786533893527239, + "learning_rate": 0.00016767794629332983, + "loss": 12.331, + "step": 10485 + }, + { + "epoch": 0.5710046081694609, + "grad_norm": 0.5934158335680023, + "learning_rate": 0.00016767145415360116, + "loss": 12.1695, + "step": 10486 + }, + { + "epoch": 0.5710590621660439, + "grad_norm": 0.6016145692911932, + "learning_rate": 0.00016766496148764792, + "loss": 12.2989, + "step": 10487 + }, + { + "epoch": 0.5711135161626268, + "grad_norm": 0.6239976307642991, + "learning_rate": 0.0001676584682955206, + "loss": 12.3911, + "step": 10488 + }, + { + "epoch": 0.5711679701592098, + "grad_norm": 0.6082768551019266, + "learning_rate": 0.0001676519745772697, + "loss": 12.0827, + "step": 10489 + }, + { + "epoch": 0.5712224241557928, + "grad_norm": 0.5871346045320904, + "learning_rate": 0.00016764548033294568, + "loss": 12.4464, + "step": 10490 + }, + { + "epoch": 0.5712768781523759, + "grad_norm": 0.613933828774959, + "learning_rate": 0.00016763898556259907, + "loss": 12.3568, + "step": 10491 + }, + { + "epoch": 0.5713313321489589, + "grad_norm": 0.5991384426015036, + "learning_rate": 0.00016763249026628037, + "loss": 12.4625, + "step": 10492 + }, + { + "epoch": 0.5713857861455419, + "grad_norm": 0.5514872373075452, + "learning_rate": 0.00016762599444404005, + "loss": 12.383, + "step": 10493 + }, + { + "epoch": 0.5714402401421249, + "grad_norm": 0.5531371468233106, + "learning_rate": 0.0001676194980959287, + "loss": 12.3299, + "step": 10494 + }, + { + "epoch": 0.5714946941387079, + "grad_norm": 0.5941667125298707, + "learning_rate": 0.00016761300122199678, + "loss": 12.2687, + "step": 10495 + }, + { + "epoch": 0.571549148135291, + "grad_norm": 0.574816716156372, + "learning_rate": 0.00016760650382229483, + "loss": 12.1507, + "step": 10496 + }, + { + "epoch": 0.571603602131874, + "grad_norm": 0.5661023144426395, + "learning_rate": 0.00016760000589687336, + "loss": 12.3588, + "step": 10497 + }, + { + "epoch": 0.571658056128457, + "grad_norm": 0.5950806459266346, + "learning_rate": 0.00016759350744578288, + "loss": 12.3072, + "step": 10498 + }, + { + "epoch": 0.57171251012504, + "grad_norm": 0.5788836627410617, + "learning_rate": 0.00016758700846907401, + "loss": 12.3097, + "step": 10499 + }, + { + "epoch": 0.571766964121623, + "grad_norm": 0.5946412198769391, + "learning_rate": 0.00016758050896679717, + "loss": 12.3366, + "step": 10500 + }, + { + "epoch": 0.571821418118206, + "grad_norm": 0.6167651308416774, + "learning_rate": 0.000167574008939003, + "loss": 12.3193, + "step": 10501 + }, + { + "epoch": 0.5718758721147891, + "grad_norm": 0.6110161549280384, + "learning_rate": 0.00016756750838574198, + "loss": 12.1554, + "step": 10502 + }, + { + "epoch": 0.5719303261113721, + "grad_norm": 0.596606633093422, + "learning_rate": 0.00016756100730706467, + "loss": 12.4674, + "step": 10503 + }, + { + "epoch": 0.5719847801079551, + "grad_norm": 0.626567890368138, + "learning_rate": 0.00016755450570302166, + "loss": 12.3227, + "step": 10504 + }, + { + "epoch": 0.572039234104538, + "grad_norm": 0.57946182580484, + "learning_rate": 0.0001675480035736635, + "loss": 12.2348, + "step": 10505 + }, + { + "epoch": 0.572093688101121, + "grad_norm": 0.5888367222286589, + "learning_rate": 0.00016754150091904073, + "loss": 12.3907, + "step": 10506 + }, + { + "epoch": 0.572148142097704, + "grad_norm": 0.5917082862865675, + "learning_rate": 0.00016753499773920392, + "loss": 12.4763, + "step": 10507 + }, + { + "epoch": 0.5722025960942871, + "grad_norm": 0.5822949981872415, + "learning_rate": 0.00016752849403420364, + "loss": 12.4209, + "step": 10508 + }, + { + "epoch": 0.5722570500908701, + "grad_norm": 0.6384365923747648, + "learning_rate": 0.00016752198980409045, + "loss": 12.578, + "step": 10509 + }, + { + "epoch": 0.5723115040874531, + "grad_norm": 0.563623509323039, + "learning_rate": 0.00016751548504891496, + "loss": 12.2381, + "step": 10510 + }, + { + "epoch": 0.5723659580840361, + "grad_norm": 0.5695499848134334, + "learning_rate": 0.00016750897976872773, + "loss": 12.3816, + "step": 10511 + }, + { + "epoch": 0.5724204120806191, + "grad_norm": 0.5884756822216776, + "learning_rate": 0.00016750247396357936, + "loss": 12.4354, + "step": 10512 + }, + { + "epoch": 0.5724748660772021, + "grad_norm": 0.6719139096999517, + "learning_rate": 0.0001674959676335204, + "loss": 12.3141, + "step": 10513 + }, + { + "epoch": 0.5725293200737852, + "grad_norm": 0.6136542932337656, + "learning_rate": 0.0001674894607786015, + "loss": 12.1397, + "step": 10514 + }, + { + "epoch": 0.5725837740703682, + "grad_norm": 0.6274914061291436, + "learning_rate": 0.00016748295339887327, + "loss": 12.3897, + "step": 10515 + }, + { + "epoch": 0.5726382280669512, + "grad_norm": 0.7877708430325404, + "learning_rate": 0.00016747644549438623, + "loss": 12.4755, + "step": 10516 + }, + { + "epoch": 0.5726926820635342, + "grad_norm": 0.6202086631901177, + "learning_rate": 0.00016746993706519105, + "loss": 12.4147, + "step": 10517 + }, + { + "epoch": 0.5727471360601172, + "grad_norm": 0.5956267922455494, + "learning_rate": 0.0001674634281113383, + "loss": 12.3638, + "step": 10518 + }, + { + "epoch": 0.5728015900567002, + "grad_norm": 0.6043432463810966, + "learning_rate": 0.00016745691863287866, + "loss": 12.3319, + "step": 10519 + }, + { + "epoch": 0.5728560440532833, + "grad_norm": 0.6873759483732338, + "learning_rate": 0.0001674504086298627, + "loss": 12.5279, + "step": 10520 + }, + { + "epoch": 0.5729104980498663, + "grad_norm": 0.6071589858995201, + "learning_rate": 0.00016744389810234103, + "loss": 12.1313, + "step": 10521 + }, + { + "epoch": 0.5729649520464493, + "grad_norm": 0.6503637091496456, + "learning_rate": 0.00016743738705036432, + "loss": 12.3438, + "step": 10522 + }, + { + "epoch": 0.5730194060430323, + "grad_norm": 0.6086095458823582, + "learning_rate": 0.00016743087547398315, + "loss": 12.3626, + "step": 10523 + }, + { + "epoch": 0.5730738600396152, + "grad_norm": 0.5756750286388267, + "learning_rate": 0.00016742436337324821, + "loss": 12.4315, + "step": 10524 + }, + { + "epoch": 0.5731283140361983, + "grad_norm": 0.5641999601089314, + "learning_rate": 0.00016741785074821013, + "loss": 12.1776, + "step": 10525 + }, + { + "epoch": 0.5731827680327813, + "grad_norm": 0.5547702884056563, + "learning_rate": 0.00016741133759891948, + "loss": 12.1544, + "step": 10526 + }, + { + "epoch": 0.5732372220293643, + "grad_norm": 0.542306466498164, + "learning_rate": 0.00016740482392542703, + "loss": 12.4514, + "step": 10527 + }, + { + "epoch": 0.5732916760259473, + "grad_norm": 0.6474642132436456, + "learning_rate": 0.00016739830972778332, + "loss": 12.4235, + "step": 10528 + }, + { + "epoch": 0.5733461300225303, + "grad_norm": 0.6199669347079713, + "learning_rate": 0.00016739179500603902, + "loss": 12.2876, + "step": 10529 + }, + { + "epoch": 0.5734005840191133, + "grad_norm": 0.5828210115421902, + "learning_rate": 0.0001673852797602449, + "loss": 12.2469, + "step": 10530 + }, + { + "epoch": 0.5734550380156964, + "grad_norm": 0.6730918250509988, + "learning_rate": 0.0001673787639904515, + "loss": 12.4733, + "step": 10531 + }, + { + "epoch": 0.5735094920122794, + "grad_norm": 0.6046063478226098, + "learning_rate": 0.00016737224769670955, + "loss": 12.3557, + "step": 10532 + }, + { + "epoch": 0.5735639460088624, + "grad_norm": 0.6125427393316394, + "learning_rate": 0.0001673657308790697, + "loss": 12.3181, + "step": 10533 + }, + { + "epoch": 0.5736184000054454, + "grad_norm": 0.6463772232187843, + "learning_rate": 0.0001673592135375826, + "loss": 12.4027, + "step": 10534 + }, + { + "epoch": 0.5736728540020284, + "grad_norm": 0.6079419740833918, + "learning_rate": 0.00016735269567229902, + "loss": 12.5024, + "step": 10535 + }, + { + "epoch": 0.5737273079986114, + "grad_norm": 0.6303049403018266, + "learning_rate": 0.00016734617728326952, + "loss": 12.3737, + "step": 10536 + }, + { + "epoch": 0.5737817619951945, + "grad_norm": 0.6191188706847653, + "learning_rate": 0.0001673396583705449, + "loss": 12.4352, + "step": 10537 + }, + { + "epoch": 0.5738362159917775, + "grad_norm": 0.64668220420118, + "learning_rate": 0.00016733313893417575, + "loss": 12.3668, + "step": 10538 + }, + { + "epoch": 0.5738906699883605, + "grad_norm": 0.59202145795084, + "learning_rate": 0.00016732661897421284, + "loss": 12.4155, + "step": 10539 + }, + { + "epoch": 0.5739451239849435, + "grad_norm": 0.6330526863126181, + "learning_rate": 0.0001673200984907069, + "loss": 12.3955, + "step": 10540 + }, + { + "epoch": 0.5739995779815265, + "grad_norm": 0.6282798030727328, + "learning_rate": 0.00016731357748370852, + "loss": 12.2411, + "step": 10541 + }, + { + "epoch": 0.5740540319781094, + "grad_norm": 0.5847016640651334, + "learning_rate": 0.00016730705595326847, + "loss": 12.2756, + "step": 10542 + }, + { + "epoch": 0.5741084859746926, + "grad_norm": 0.6828398564940577, + "learning_rate": 0.00016730053389943752, + "loss": 12.3235, + "step": 10543 + }, + { + "epoch": 0.5741629399712755, + "grad_norm": 0.6756906716831694, + "learning_rate": 0.0001672940113222663, + "loss": 12.3097, + "step": 10544 + }, + { + "epoch": 0.5742173939678585, + "grad_norm": 0.5620571029291483, + "learning_rate": 0.00016728748822180558, + "loss": 12.4714, + "step": 10545 + }, + { + "epoch": 0.5742718479644415, + "grad_norm": 0.763273675651076, + "learning_rate": 0.00016728096459810605, + "loss": 12.3871, + "step": 10546 + }, + { + "epoch": 0.5743263019610245, + "grad_norm": 0.6695252802059574, + "learning_rate": 0.00016727444045121844, + "loss": 12.4432, + "step": 10547 + }, + { + "epoch": 0.5743807559576075, + "grad_norm": 0.5481293077792526, + "learning_rate": 0.00016726791578119352, + "loss": 12.352, + "step": 10548 + }, + { + "epoch": 0.5744352099541906, + "grad_norm": 0.7292923862086983, + "learning_rate": 0.00016726139058808204, + "loss": 12.3642, + "step": 10549 + }, + { + "epoch": 0.5744896639507736, + "grad_norm": 0.6413924900299116, + "learning_rate": 0.00016725486487193466, + "loss": 12.4177, + "step": 10550 + }, + { + "epoch": 0.5745441179473566, + "grad_norm": 0.6287623470629832, + "learning_rate": 0.0001672483386328022, + "loss": 12.4007, + "step": 10551 + }, + { + "epoch": 0.5745985719439396, + "grad_norm": 0.6708372301718295, + "learning_rate": 0.00016724181187073532, + "loss": 12.4079, + "step": 10552 + }, + { + "epoch": 0.5746530259405226, + "grad_norm": 0.6457982109614273, + "learning_rate": 0.0001672352845857849, + "loss": 12.4311, + "step": 10553 + }, + { + "epoch": 0.5747074799371056, + "grad_norm": 0.612411900516974, + "learning_rate": 0.00016722875677800163, + "loss": 12.4046, + "step": 10554 + }, + { + "epoch": 0.5747619339336887, + "grad_norm": 0.719502918505751, + "learning_rate": 0.00016722222844743625, + "loss": 12.3984, + "step": 10555 + }, + { + "epoch": 0.5748163879302717, + "grad_norm": 0.5937007029444668, + "learning_rate": 0.00016721569959413955, + "loss": 12.3199, + "step": 10556 + }, + { + "epoch": 0.5748708419268547, + "grad_norm": 0.6120740818602591, + "learning_rate": 0.00016720917021816232, + "loss": 12.175, + "step": 10557 + }, + { + "epoch": 0.5749252959234377, + "grad_norm": 0.5793545511529767, + "learning_rate": 0.00016720264031955528, + "loss": 12.3855, + "step": 10558 + }, + { + "epoch": 0.5749797499200207, + "grad_norm": 0.611501589018356, + "learning_rate": 0.00016719610989836926, + "loss": 12.4145, + "step": 10559 + }, + { + "epoch": 0.5750342039166038, + "grad_norm": 0.5726434094961663, + "learning_rate": 0.000167189578954655, + "loss": 12.3545, + "step": 10560 + }, + { + "epoch": 0.5750886579131868, + "grad_norm": 0.6006267957757423, + "learning_rate": 0.00016718304748846333, + "loss": 12.3245, + "step": 10561 + }, + { + "epoch": 0.5751431119097697, + "grad_norm": 0.6673176404464749, + "learning_rate": 0.000167176515499845, + "loss": 12.3836, + "step": 10562 + }, + { + "epoch": 0.5751975659063527, + "grad_norm": 0.5485759701750896, + "learning_rate": 0.00016716998298885082, + "loss": 12.3111, + "step": 10563 + }, + { + "epoch": 0.5752520199029357, + "grad_norm": 0.6530685793868931, + "learning_rate": 0.00016716344995553159, + "loss": 12.2943, + "step": 10564 + }, + { + "epoch": 0.5753064738995187, + "grad_norm": 0.6586676988577265, + "learning_rate": 0.00016715691639993812, + "loss": 12.2898, + "step": 10565 + }, + { + "epoch": 0.5753609278961018, + "grad_norm": 0.6148634157182052, + "learning_rate": 0.00016715038232212122, + "loss": 12.4235, + "step": 10566 + }, + { + "epoch": 0.5754153818926848, + "grad_norm": 0.597136346674799, + "learning_rate": 0.00016714384772213166, + "loss": 12.3168, + "step": 10567 + }, + { + "epoch": 0.5754698358892678, + "grad_norm": 0.6297749668051609, + "learning_rate": 0.0001671373126000203, + "loss": 12.3847, + "step": 10568 + }, + { + "epoch": 0.5755242898858508, + "grad_norm": 0.5814828821447569, + "learning_rate": 0.0001671307769558379, + "loss": 12.3572, + "step": 10569 + }, + { + "epoch": 0.5755787438824338, + "grad_norm": 0.5795019959416221, + "learning_rate": 0.00016712424078963535, + "loss": 12.3285, + "step": 10570 + }, + { + "epoch": 0.5756331978790168, + "grad_norm": 0.6514041703969438, + "learning_rate": 0.00016711770410146343, + "loss": 12.3869, + "step": 10571 + }, + { + "epoch": 0.5756876518755999, + "grad_norm": 0.5718901998179207, + "learning_rate": 0.00016711116689137302, + "loss": 12.3213, + "step": 10572 + }, + { + "epoch": 0.5757421058721829, + "grad_norm": 0.6235036946951503, + "learning_rate": 0.0001671046291594149, + "loss": 12.3728, + "step": 10573 + }, + { + "epoch": 0.5757965598687659, + "grad_norm": 0.49537043109391665, + "learning_rate": 0.00016709809090563991, + "loss": 12.1873, + "step": 10574 + }, + { + "epoch": 0.5758510138653489, + "grad_norm": 0.5739977173419183, + "learning_rate": 0.00016709155213009895, + "loss": 12.4136, + "step": 10575 + }, + { + "epoch": 0.5759054678619319, + "grad_norm": 0.5843541261690129, + "learning_rate": 0.0001670850128328428, + "loss": 12.3284, + "step": 10576 + }, + { + "epoch": 0.5759599218585149, + "grad_norm": 0.6166488364641032, + "learning_rate": 0.00016707847301392236, + "loss": 12.3386, + "step": 10577 + }, + { + "epoch": 0.576014375855098, + "grad_norm": 0.6460908741074868, + "learning_rate": 0.00016707193267338844, + "loss": 12.4182, + "step": 10578 + }, + { + "epoch": 0.576068829851681, + "grad_norm": 0.6129110230694653, + "learning_rate": 0.00016706539181129195, + "loss": 12.2851, + "step": 10579 + }, + { + "epoch": 0.576123283848264, + "grad_norm": 0.6352271764388896, + "learning_rate": 0.00016705885042768372, + "loss": 12.3625, + "step": 10580 + }, + { + "epoch": 0.5761777378448469, + "grad_norm": 0.7409091782385997, + "learning_rate": 0.00016705230852261465, + "loss": 12.4322, + "step": 10581 + }, + { + "epoch": 0.5762321918414299, + "grad_norm": 0.5790484078560002, + "learning_rate": 0.00016704576609613553, + "loss": 12.3396, + "step": 10582 + }, + { + "epoch": 0.5762866458380129, + "grad_norm": 0.5480996478430834, + "learning_rate": 0.0001670392231482973, + "loss": 12.2522, + "step": 10583 + }, + { + "epoch": 0.576341099834596, + "grad_norm": 0.70971966257255, + "learning_rate": 0.00016703267967915086, + "loss": 12.3536, + "step": 10584 + }, + { + "epoch": 0.576395553831179, + "grad_norm": 0.6783815309016988, + "learning_rate": 0.00016702613568874702, + "loss": 12.3734, + "step": 10585 + }, + { + "epoch": 0.576450007827762, + "grad_norm": 0.6505384157020722, + "learning_rate": 0.00016701959117713675, + "loss": 12.4639, + "step": 10586 + }, + { + "epoch": 0.576504461824345, + "grad_norm": 0.5584833546030031, + "learning_rate": 0.0001670130461443709, + "loss": 12.245, + "step": 10587 + }, + { + "epoch": 0.576558915820928, + "grad_norm": 0.6178251227046433, + "learning_rate": 0.00016700650059050035, + "loss": 12.3666, + "step": 10588 + }, + { + "epoch": 0.576613369817511, + "grad_norm": 0.6046084629237625, + "learning_rate": 0.00016699995451557599, + "loss": 12.3203, + "step": 10589 + }, + { + "epoch": 0.5766678238140941, + "grad_norm": 0.6602116593538877, + "learning_rate": 0.00016699340791964876, + "loss": 12.3469, + "step": 10590 + }, + { + "epoch": 0.5767222778106771, + "grad_norm": 0.5750978231757151, + "learning_rate": 0.0001669868608027696, + "loss": 12.2704, + "step": 10591 + }, + { + "epoch": 0.5767767318072601, + "grad_norm": 0.713198247391486, + "learning_rate": 0.00016698031316498933, + "loss": 12.4092, + "step": 10592 + }, + { + "epoch": 0.5768311858038431, + "grad_norm": 0.5734036929963306, + "learning_rate": 0.00016697376500635894, + "loss": 12.3519, + "step": 10593 + }, + { + "epoch": 0.5768856398004261, + "grad_norm": 0.6086445680995939, + "learning_rate": 0.00016696721632692926, + "loss": 12.3711, + "step": 10594 + }, + { + "epoch": 0.5769400937970092, + "grad_norm": 0.5706324360010867, + "learning_rate": 0.00016696066712675134, + "loss": 12.3824, + "step": 10595 + }, + { + "epoch": 0.5769945477935922, + "grad_norm": 0.5712190011034245, + "learning_rate": 0.000166954117405876, + "loss": 12.1989, + "step": 10596 + }, + { + "epoch": 0.5770490017901752, + "grad_norm": 0.597188553842159, + "learning_rate": 0.00016694756716435427, + "loss": 12.224, + "step": 10597 + }, + { + "epoch": 0.5771034557867581, + "grad_norm": 0.5742012347288625, + "learning_rate": 0.000166941016402237, + "loss": 12.2663, + "step": 10598 + }, + { + "epoch": 0.5771579097833411, + "grad_norm": 0.5841098572750677, + "learning_rate": 0.00016693446511957514, + "loss": 12.3919, + "step": 10599 + }, + { + "epoch": 0.5772123637799241, + "grad_norm": 0.5455079817642827, + "learning_rate": 0.00016692791331641968, + "loss": 12.1867, + "step": 10600 + }, + { + "epoch": 0.5772668177765072, + "grad_norm": 0.6024082007021208, + "learning_rate": 0.00016692136099282154, + "loss": 12.2927, + "step": 10601 + }, + { + "epoch": 0.5773212717730902, + "grad_norm": 0.5426123737996211, + "learning_rate": 0.0001669148081488317, + "loss": 12.3577, + "step": 10602 + }, + { + "epoch": 0.5773757257696732, + "grad_norm": 0.6663180089415239, + "learning_rate": 0.00016690825478450104, + "loss": 12.3949, + "step": 10603 + }, + { + "epoch": 0.5774301797662562, + "grad_norm": 0.6024385898428207, + "learning_rate": 0.0001669017008998806, + "loss": 12.2587, + "step": 10604 + }, + { + "epoch": 0.5774846337628392, + "grad_norm": 0.5818281069112449, + "learning_rate": 0.0001668951464950213, + "loss": 12.261, + "step": 10605 + }, + { + "epoch": 0.5775390877594222, + "grad_norm": 0.5903345499037228, + "learning_rate": 0.00016688859156997415, + "loss": 12.2928, + "step": 10606 + }, + { + "epoch": 0.5775935417560053, + "grad_norm": 0.5794801873580113, + "learning_rate": 0.00016688203612479004, + "loss": 12.4412, + "step": 10607 + }, + { + "epoch": 0.5776479957525883, + "grad_norm": 0.6805587694466598, + "learning_rate": 0.00016687548015952003, + "loss": 12.3809, + "step": 10608 + }, + { + "epoch": 0.5777024497491713, + "grad_norm": 0.6843568923556025, + "learning_rate": 0.0001668689236742151, + "loss": 12.3297, + "step": 10609 + }, + { + "epoch": 0.5777569037457543, + "grad_norm": 0.6010521056935886, + "learning_rate": 0.00016686236666892617, + "loss": 12.3508, + "step": 10610 + }, + { + "epoch": 0.5778113577423373, + "grad_norm": 0.5713679177009463, + "learning_rate": 0.00016685580914370428, + "loss": 12.4091, + "step": 10611 + }, + { + "epoch": 0.5778658117389203, + "grad_norm": 0.6063980630363172, + "learning_rate": 0.00016684925109860038, + "loss": 12.3177, + "step": 10612 + }, + { + "epoch": 0.5779202657355034, + "grad_norm": 0.613071108821596, + "learning_rate": 0.0001668426925336655, + "loss": 12.3598, + "step": 10613 + }, + { + "epoch": 0.5779747197320864, + "grad_norm": 0.6988243533468197, + "learning_rate": 0.0001668361334489507, + "loss": 12.3623, + "step": 10614 + }, + { + "epoch": 0.5780291737286694, + "grad_norm": 0.6237317929685072, + "learning_rate": 0.00016682957384450684, + "loss": 12.3544, + "step": 10615 + }, + { + "epoch": 0.5780836277252523, + "grad_norm": 0.5961809283619698, + "learning_rate": 0.00016682301372038504, + "loss": 12.4062, + "step": 10616 + }, + { + "epoch": 0.5781380817218353, + "grad_norm": 0.6010875834008084, + "learning_rate": 0.00016681645307663626, + "loss": 12.2882, + "step": 10617 + }, + { + "epoch": 0.5781925357184183, + "grad_norm": 0.5755892550319603, + "learning_rate": 0.00016680989191331157, + "loss": 12.2412, + "step": 10618 + }, + { + "epoch": 0.5782469897150014, + "grad_norm": 0.6412664803443131, + "learning_rate": 0.0001668033302304619, + "loss": 12.2868, + "step": 10619 + }, + { + "epoch": 0.5783014437115844, + "grad_norm": 0.5556221992873542, + "learning_rate": 0.00016679676802813838, + "loss": 12.3292, + "step": 10620 + }, + { + "epoch": 0.5783558977081674, + "grad_norm": 0.6912225305689517, + "learning_rate": 0.00016679020530639197, + "loss": 12.4867, + "step": 10621 + }, + { + "epoch": 0.5784103517047504, + "grad_norm": 0.5984882630019411, + "learning_rate": 0.00016678364206527372, + "loss": 12.3977, + "step": 10622 + }, + { + "epoch": 0.5784648057013334, + "grad_norm": 0.561838444976537, + "learning_rate": 0.00016677707830483468, + "loss": 12.3208, + "step": 10623 + }, + { + "epoch": 0.5785192596979164, + "grad_norm": 0.6207495383056233, + "learning_rate": 0.00016677051402512588, + "loss": 12.3599, + "step": 10624 + }, + { + "epoch": 0.5785737136944995, + "grad_norm": 0.5758976293841622, + "learning_rate": 0.00016676394922619835, + "loss": 12.2409, + "step": 10625 + }, + { + "epoch": 0.5786281676910825, + "grad_norm": 0.5909502850088595, + "learning_rate": 0.00016675738390810319, + "loss": 12.3915, + "step": 10626 + }, + { + "epoch": 0.5786826216876655, + "grad_norm": 0.6320800577473589, + "learning_rate": 0.0001667508180708914, + "loss": 12.3689, + "step": 10627 + }, + { + "epoch": 0.5787370756842485, + "grad_norm": 0.5729497199492393, + "learning_rate": 0.00016674425171461402, + "loss": 12.2811, + "step": 10628 + }, + { + "epoch": 0.5787915296808315, + "grad_norm": 0.6089854746204572, + "learning_rate": 0.0001667376848393222, + "loss": 12.3424, + "step": 10629 + }, + { + "epoch": 0.5788459836774146, + "grad_norm": 0.6253594955335714, + "learning_rate": 0.0001667311174450669, + "loss": 12.3144, + "step": 10630 + }, + { + "epoch": 0.5789004376739976, + "grad_norm": 0.5667608081147807, + "learning_rate": 0.0001667245495318993, + "loss": 12.3947, + "step": 10631 + }, + { + "epoch": 0.5789548916705806, + "grad_norm": 0.6135287869538242, + "learning_rate": 0.00016671798109987035, + "loss": 12.3181, + "step": 10632 + }, + { + "epoch": 0.5790093456671636, + "grad_norm": 0.5553515736538324, + "learning_rate": 0.00016671141214903124, + "loss": 12.2642, + "step": 10633 + }, + { + "epoch": 0.5790637996637465, + "grad_norm": 0.5392317028369967, + "learning_rate": 0.00016670484267943296, + "loss": 12.43, + "step": 10634 + }, + { + "epoch": 0.5791182536603295, + "grad_norm": 0.6236632127739922, + "learning_rate": 0.00016669827269112666, + "loss": 12.3473, + "step": 10635 + }, + { + "epoch": 0.5791727076569126, + "grad_norm": 0.5955810877538578, + "learning_rate": 0.00016669170218416342, + "loss": 12.1647, + "step": 10636 + }, + { + "epoch": 0.5792271616534956, + "grad_norm": 0.683404071019631, + "learning_rate": 0.0001666851311585943, + "loss": 12.4758, + "step": 10637 + }, + { + "epoch": 0.5792816156500786, + "grad_norm": 0.5430831866083706, + "learning_rate": 0.0001666785596144704, + "loss": 12.1772, + "step": 10638 + }, + { + "epoch": 0.5793360696466616, + "grad_norm": 0.6560979890502256, + "learning_rate": 0.00016667198755184286, + "loss": 12.3382, + "step": 10639 + }, + { + "epoch": 0.5793905236432446, + "grad_norm": 0.5806297471494171, + "learning_rate": 0.00016666541497076278, + "loss": 12.28, + "step": 10640 + }, + { + "epoch": 0.5794449776398276, + "grad_norm": 0.613292184104423, + "learning_rate": 0.00016665884187128124, + "loss": 12.4239, + "step": 10641 + }, + { + "epoch": 0.5794994316364107, + "grad_norm": 0.7204491078949676, + "learning_rate": 0.00016665226825344936, + "loss": 12.3856, + "step": 10642 + }, + { + "epoch": 0.5795538856329937, + "grad_norm": 0.5956727900025345, + "learning_rate": 0.00016664569411731827, + "loss": 12.248, + "step": 10643 + }, + { + "epoch": 0.5796083396295767, + "grad_norm": 0.6538876553886454, + "learning_rate": 0.00016663911946293908, + "loss": 12.261, + "step": 10644 + }, + { + "epoch": 0.5796627936261597, + "grad_norm": 0.6123826693547904, + "learning_rate": 0.00016663254429036292, + "loss": 12.3832, + "step": 10645 + }, + { + "epoch": 0.5797172476227427, + "grad_norm": 0.6421627005585591, + "learning_rate": 0.00016662596859964092, + "loss": 12.3551, + "step": 10646 + }, + { + "epoch": 0.5797717016193257, + "grad_norm": 0.6798698187958897, + "learning_rate": 0.00016661939239082422, + "loss": 12.4644, + "step": 10647 + }, + { + "epoch": 0.5798261556159088, + "grad_norm": 0.620210117669651, + "learning_rate": 0.00016661281566396395, + "loss": 12.2705, + "step": 10648 + }, + { + "epoch": 0.5798806096124918, + "grad_norm": 0.6167602633822272, + "learning_rate": 0.00016660623841911127, + "loss": 12.3554, + "step": 10649 + }, + { + "epoch": 0.5799350636090748, + "grad_norm": 0.7079619021600241, + "learning_rate": 0.0001665996606563173, + "loss": 12.37, + "step": 10650 + }, + { + "epoch": 0.5799895176056578, + "grad_norm": 0.6471379288128399, + "learning_rate": 0.0001665930823756332, + "loss": 12.2851, + "step": 10651 + }, + { + "epoch": 0.5800439716022407, + "grad_norm": 0.6250926694948886, + "learning_rate": 0.00016658650357711014, + "loss": 12.5016, + "step": 10652 + }, + { + "epoch": 0.5800984255988237, + "grad_norm": 0.7285422010970602, + "learning_rate": 0.00016657992426079922, + "loss": 12.3769, + "step": 10653 + }, + { + "epoch": 0.5801528795954068, + "grad_norm": 0.5863535987053579, + "learning_rate": 0.00016657334442675168, + "loss": 12.2994, + "step": 10654 + }, + { + "epoch": 0.5802073335919898, + "grad_norm": 1.0460052189248172, + "learning_rate": 0.00016656676407501863, + "loss": 12.3731, + "step": 10655 + }, + { + "epoch": 0.5802617875885728, + "grad_norm": 0.6388253747460793, + "learning_rate": 0.00016656018320565128, + "loss": 12.3517, + "step": 10656 + }, + { + "epoch": 0.5803162415851558, + "grad_norm": 0.6482402051013342, + "learning_rate": 0.00016655360181870078, + "loss": 12.1586, + "step": 10657 + }, + { + "epoch": 0.5803706955817388, + "grad_norm": 0.6075242808500345, + "learning_rate": 0.0001665470199142183, + "loss": 12.2407, + "step": 10658 + }, + { + "epoch": 0.5804251495783219, + "grad_norm": 0.5845619181963486, + "learning_rate": 0.000166540437492255, + "loss": 12.2608, + "step": 10659 + }, + { + "epoch": 0.5804796035749049, + "grad_norm": 0.6034154716906985, + "learning_rate": 0.00016653385455286213, + "loss": 12.3674, + "step": 10660 + }, + { + "epoch": 0.5805340575714879, + "grad_norm": 0.5829797356728805, + "learning_rate": 0.0001665272710960909, + "loss": 12.3774, + "step": 10661 + }, + { + "epoch": 0.5805885115680709, + "grad_norm": 0.6855172700573546, + "learning_rate": 0.00016652068712199239, + "loss": 12.4455, + "step": 10662 + }, + { + "epoch": 0.5806429655646539, + "grad_norm": 0.6075926608862877, + "learning_rate": 0.00016651410263061786, + "loss": 12.4299, + "step": 10663 + }, + { + "epoch": 0.5806974195612369, + "grad_norm": 0.5648326355226941, + "learning_rate": 0.00016650751762201855, + "loss": 12.3492, + "step": 10664 + }, + { + "epoch": 0.58075187355782, + "grad_norm": 0.6824578992264828, + "learning_rate": 0.00016650093209624557, + "loss": 12.3532, + "step": 10665 + }, + { + "epoch": 0.580806327554403, + "grad_norm": 0.5804425579415122, + "learning_rate": 0.00016649434605335025, + "loss": 12.2374, + "step": 10666 + }, + { + "epoch": 0.580860781550986, + "grad_norm": 0.6508938636771904, + "learning_rate": 0.00016648775949338373, + "loss": 12.4581, + "step": 10667 + }, + { + "epoch": 0.580915235547569, + "grad_norm": 0.6252040919432668, + "learning_rate": 0.00016648117241639722, + "loss": 12.3138, + "step": 10668 + }, + { + "epoch": 0.580969689544152, + "grad_norm": 0.6052652126208161, + "learning_rate": 0.000166474584822442, + "loss": 12.4502, + "step": 10669 + }, + { + "epoch": 0.581024143540735, + "grad_norm": 0.6035779320123137, + "learning_rate": 0.00016646799671156924, + "loss": 12.2472, + "step": 10670 + }, + { + "epoch": 0.581078597537318, + "grad_norm": 0.6325234440978983, + "learning_rate": 0.00016646140808383023, + "loss": 12.2745, + "step": 10671 + }, + { + "epoch": 0.581133051533901, + "grad_norm": 0.5833686290661749, + "learning_rate": 0.0001664548189392761, + "loss": 12.3776, + "step": 10672 + }, + { + "epoch": 0.581187505530484, + "grad_norm": 0.5924299333881107, + "learning_rate": 0.00016644822927795817, + "loss": 12.3912, + "step": 10673 + }, + { + "epoch": 0.581241959527067, + "grad_norm": 0.5966865121409107, + "learning_rate": 0.00016644163909992768, + "loss": 12.2941, + "step": 10674 + }, + { + "epoch": 0.58129641352365, + "grad_norm": 0.571458474944623, + "learning_rate": 0.00016643504840523586, + "loss": 12.2669, + "step": 10675 + }, + { + "epoch": 0.581350867520233, + "grad_norm": 0.6325490664211092, + "learning_rate": 0.00016642845719393398, + "loss": 12.2097, + "step": 10676 + }, + { + "epoch": 0.5814053215168161, + "grad_norm": 0.5714605500093208, + "learning_rate": 0.00016642186546607322, + "loss": 12.2973, + "step": 10677 + }, + { + "epoch": 0.5814597755133991, + "grad_norm": 0.5435175392981482, + "learning_rate": 0.00016641527322170494, + "loss": 12.315, + "step": 10678 + }, + { + "epoch": 0.5815142295099821, + "grad_norm": 0.5737258722843944, + "learning_rate": 0.00016640868046088037, + "loss": 12.325, + "step": 10679 + }, + { + "epoch": 0.5815686835065651, + "grad_norm": 0.7006617008446661, + "learning_rate": 0.00016640208718365074, + "loss": 12.5486, + "step": 10680 + }, + { + "epoch": 0.5816231375031481, + "grad_norm": 0.6174886267902199, + "learning_rate": 0.00016639549339006736, + "loss": 12.3535, + "step": 10681 + }, + { + "epoch": 0.5816775914997311, + "grad_norm": 0.6928596384395789, + "learning_rate": 0.00016638889908018146, + "loss": 12.2791, + "step": 10682 + }, + { + "epoch": 0.5817320454963142, + "grad_norm": 0.6084022066172773, + "learning_rate": 0.00016638230425404437, + "loss": 12.3526, + "step": 10683 + }, + { + "epoch": 0.5817864994928972, + "grad_norm": 0.5687893948538565, + "learning_rate": 0.00016637570891170732, + "loss": 12.4278, + "step": 10684 + }, + { + "epoch": 0.5818409534894802, + "grad_norm": 0.6759339144459159, + "learning_rate": 0.00016636911305322168, + "loss": 12.366, + "step": 10685 + }, + { + "epoch": 0.5818954074860632, + "grad_norm": 0.6087122523927804, + "learning_rate": 0.00016636251667863868, + "loss": 12.3715, + "step": 10686 + }, + { + "epoch": 0.5819498614826462, + "grad_norm": 0.6395203659277144, + "learning_rate": 0.00016635591978800957, + "loss": 12.2675, + "step": 10687 + }, + { + "epoch": 0.5820043154792292, + "grad_norm": 0.7568721198329359, + "learning_rate": 0.00016634932238138574, + "loss": 12.435, + "step": 10688 + }, + { + "epoch": 0.5820587694758123, + "grad_norm": 0.6359199886095321, + "learning_rate": 0.00016634272445881844, + "loss": 12.3296, + "step": 10689 + }, + { + "epoch": 0.5821132234723952, + "grad_norm": 0.6054270859888692, + "learning_rate": 0.000166336126020359, + "loss": 12.3969, + "step": 10690 + }, + { + "epoch": 0.5821676774689782, + "grad_norm": 0.6476557577065587, + "learning_rate": 0.0001663295270660587, + "loss": 12.3727, + "step": 10691 + }, + { + "epoch": 0.5822221314655612, + "grad_norm": 0.548446163354268, + "learning_rate": 0.0001663229275959689, + "loss": 12.1942, + "step": 10692 + }, + { + "epoch": 0.5822765854621442, + "grad_norm": 0.5874516079699055, + "learning_rate": 0.00016631632761014088, + "loss": 12.294, + "step": 10693 + }, + { + "epoch": 0.5823310394587273, + "grad_norm": 0.7223823835601816, + "learning_rate": 0.00016630972710862595, + "loss": 12.3554, + "step": 10694 + }, + { + "epoch": 0.5823854934553103, + "grad_norm": 0.6923158838651609, + "learning_rate": 0.0001663031260914755, + "loss": 12.4702, + "step": 10695 + }, + { + "epoch": 0.5824399474518933, + "grad_norm": 0.6265117547125986, + "learning_rate": 0.0001662965245587408, + "loss": 12.266, + "step": 10696 + }, + { + "epoch": 0.5824944014484763, + "grad_norm": 0.6029729478880793, + "learning_rate": 0.00016628992251047322, + "loss": 12.3511, + "step": 10697 + }, + { + "epoch": 0.5825488554450593, + "grad_norm": 0.5424235117126971, + "learning_rate": 0.00016628331994672407, + "loss": 12.2765, + "step": 10698 + }, + { + "epoch": 0.5826033094416423, + "grad_norm": 0.6365971163700855, + "learning_rate": 0.00016627671686754471, + "loss": 12.447, + "step": 10699 + }, + { + "epoch": 0.5826577634382254, + "grad_norm": 0.6047554713915011, + "learning_rate": 0.00016627011327298652, + "loss": 12.2762, + "step": 10700 + }, + { + "epoch": 0.5827122174348084, + "grad_norm": 0.635240192156098, + "learning_rate": 0.00016626350916310078, + "loss": 12.3235, + "step": 10701 + }, + { + "epoch": 0.5827666714313914, + "grad_norm": 0.6115231624219086, + "learning_rate": 0.00016625690453793887, + "loss": 12.3012, + "step": 10702 + }, + { + "epoch": 0.5828211254279744, + "grad_norm": 0.6807057914114546, + "learning_rate": 0.0001662502993975522, + "loss": 12.2399, + "step": 10703 + }, + { + "epoch": 0.5828755794245574, + "grad_norm": 0.6231743295966256, + "learning_rate": 0.00016624369374199205, + "loss": 12.2065, + "step": 10704 + }, + { + "epoch": 0.5829300334211404, + "grad_norm": 0.620544586110963, + "learning_rate": 0.00016623708757130986, + "loss": 12.3695, + "step": 10705 + }, + { + "epoch": 0.5829844874177235, + "grad_norm": 0.5689139742984867, + "learning_rate": 0.00016623048088555695, + "loss": 12.3346, + "step": 10706 + }, + { + "epoch": 0.5830389414143065, + "grad_norm": 0.5860319651455662, + "learning_rate": 0.0001662238736847847, + "loss": 12.4899, + "step": 10707 + }, + { + "epoch": 0.5830933954108894, + "grad_norm": 0.6495902268166277, + "learning_rate": 0.00016621726596904456, + "loss": 12.3605, + "step": 10708 + }, + { + "epoch": 0.5831478494074724, + "grad_norm": 0.6258170212278469, + "learning_rate": 0.00016621065773838779, + "loss": 12.1947, + "step": 10709 + }, + { + "epoch": 0.5832023034040554, + "grad_norm": 0.6346930520938242, + "learning_rate": 0.00016620404899286587, + "loss": 12.4736, + "step": 10710 + }, + { + "epoch": 0.5832567574006384, + "grad_norm": 0.6556064557353416, + "learning_rate": 0.00016619743973253018, + "loss": 12.3694, + "step": 10711 + }, + { + "epoch": 0.5833112113972215, + "grad_norm": 0.6348183258851631, + "learning_rate": 0.0001661908299574321, + "loss": 12.3759, + "step": 10712 + }, + { + "epoch": 0.5833656653938045, + "grad_norm": 0.5517331420369923, + "learning_rate": 0.00016618421966762298, + "loss": 12.2347, + "step": 10713 + }, + { + "epoch": 0.5834201193903875, + "grad_norm": 0.6066742407789687, + "learning_rate": 0.0001661776088631543, + "loss": 12.5373, + "step": 10714 + }, + { + "epoch": 0.5834745733869705, + "grad_norm": 0.5705601878893244, + "learning_rate": 0.00016617099754407744, + "loss": 12.3859, + "step": 10715 + }, + { + "epoch": 0.5835290273835535, + "grad_norm": 0.6543925268915343, + "learning_rate": 0.00016616438571044379, + "loss": 12.2492, + "step": 10716 + }, + { + "epoch": 0.5835834813801365, + "grad_norm": 0.559740718934576, + "learning_rate": 0.0001661577733623048, + "loss": 12.3315, + "step": 10717 + }, + { + "epoch": 0.5836379353767196, + "grad_norm": 0.7204271917294793, + "learning_rate": 0.00016615116049971184, + "loss": 12.3104, + "step": 10718 + }, + { + "epoch": 0.5836923893733026, + "grad_norm": 0.5570670501316027, + "learning_rate": 0.0001661445471227164, + "loss": 12.2308, + "step": 10719 + }, + { + "epoch": 0.5837468433698856, + "grad_norm": 0.9876212452291068, + "learning_rate": 0.00016613793323136983, + "loss": 12.3422, + "step": 10720 + }, + { + "epoch": 0.5838012973664686, + "grad_norm": 0.5470216742140983, + "learning_rate": 0.00016613131882572363, + "loss": 12.2743, + "step": 10721 + }, + { + "epoch": 0.5838557513630516, + "grad_norm": 0.6491742554543132, + "learning_rate": 0.00016612470390582918, + "loss": 12.4555, + "step": 10722 + }, + { + "epoch": 0.5839102053596346, + "grad_norm": 0.5823548501056763, + "learning_rate": 0.00016611808847173798, + "loss": 12.31, + "step": 10723 + }, + { + "epoch": 0.5839646593562177, + "grad_norm": 0.6165231576326428, + "learning_rate": 0.00016611147252350137, + "loss": 12.2579, + "step": 10724 + }, + { + "epoch": 0.5840191133528007, + "grad_norm": 0.6820426398894804, + "learning_rate": 0.00016610485606117093, + "loss": 12.3685, + "step": 10725 + }, + { + "epoch": 0.5840735673493836, + "grad_norm": 0.5495775000325852, + "learning_rate": 0.00016609823908479804, + "loss": 12.3707, + "step": 10726 + }, + { + "epoch": 0.5841280213459666, + "grad_norm": 0.7082474221272369, + "learning_rate": 0.00016609162159443412, + "loss": 12.3602, + "step": 10727 + }, + { + "epoch": 0.5841824753425496, + "grad_norm": 0.5726077459516642, + "learning_rate": 0.0001660850035901307, + "loss": 12.3536, + "step": 10728 + }, + { + "epoch": 0.5842369293391327, + "grad_norm": 0.6416664026749477, + "learning_rate": 0.00016607838507193918, + "loss": 12.3802, + "step": 10729 + }, + { + "epoch": 0.5842913833357157, + "grad_norm": 0.5706655013876935, + "learning_rate": 0.00016607176603991106, + "loss": 12.3543, + "step": 10730 + }, + { + "epoch": 0.5843458373322987, + "grad_norm": 0.6420489842793392, + "learning_rate": 0.00016606514649409782, + "loss": 12.3892, + "step": 10731 + }, + { + "epoch": 0.5844002913288817, + "grad_norm": 0.5834620300852785, + "learning_rate": 0.00016605852643455094, + "loss": 12.275, + "step": 10732 + }, + { + "epoch": 0.5844547453254647, + "grad_norm": 0.5570349802673065, + "learning_rate": 0.00016605190586132184, + "loss": 12.4122, + "step": 10733 + }, + { + "epoch": 0.5845091993220477, + "grad_norm": 0.6062036403931096, + "learning_rate": 0.00016604528477446207, + "loss": 12.2577, + "step": 10734 + }, + { + "epoch": 0.5845636533186308, + "grad_norm": 0.6008188923462245, + "learning_rate": 0.00016603866317402307, + "loss": 12.4129, + "step": 10735 + }, + { + "epoch": 0.5846181073152138, + "grad_norm": 0.5870953040476993, + "learning_rate": 0.00016603204106005638, + "loss": 12.3131, + "step": 10736 + }, + { + "epoch": 0.5846725613117968, + "grad_norm": 0.5775654571884085, + "learning_rate": 0.00016602541843261347, + "loss": 12.1748, + "step": 10737 + }, + { + "epoch": 0.5847270153083798, + "grad_norm": 0.6057726613370205, + "learning_rate": 0.0001660187952917458, + "loss": 12.3621, + "step": 10738 + }, + { + "epoch": 0.5847814693049628, + "grad_norm": 0.6902081706570492, + "learning_rate": 0.00016601217163750488, + "loss": 12.5363, + "step": 10739 + }, + { + "epoch": 0.5848359233015458, + "grad_norm": 0.577736725073556, + "learning_rate": 0.0001660055474699423, + "loss": 12.3637, + "step": 10740 + }, + { + "epoch": 0.5848903772981289, + "grad_norm": 0.5566594113188544, + "learning_rate": 0.0001659989227891095, + "loss": 12.3016, + "step": 10741 + }, + { + "epoch": 0.5849448312947119, + "grad_norm": 0.5903814894869801, + "learning_rate": 0.000165992297595058, + "loss": 12.2161, + "step": 10742 + }, + { + "epoch": 0.5849992852912949, + "grad_norm": 0.6212620436447776, + "learning_rate": 0.00016598567188783934, + "loss": 12.2501, + "step": 10743 + }, + { + "epoch": 0.5850537392878778, + "grad_norm": 0.6641309249568489, + "learning_rate": 0.000165979045667505, + "loss": 12.3462, + "step": 10744 + }, + { + "epoch": 0.5851081932844608, + "grad_norm": 0.6597269040271564, + "learning_rate": 0.00016597241893410658, + "loss": 12.4123, + "step": 10745 + }, + { + "epoch": 0.5851626472810438, + "grad_norm": 0.6040301077382447, + "learning_rate": 0.00016596579168769553, + "loss": 12.328, + "step": 10746 + }, + { + "epoch": 0.5852171012776269, + "grad_norm": 0.6346788135306696, + "learning_rate": 0.00016595916392832342, + "loss": 12.4054, + "step": 10747 + }, + { + "epoch": 0.5852715552742099, + "grad_norm": 0.6213996954737067, + "learning_rate": 0.00016595253565604182, + "loss": 12.2539, + "step": 10748 + }, + { + "epoch": 0.5853260092707929, + "grad_norm": 0.7113686564804004, + "learning_rate": 0.00016594590687090224, + "loss": 12.3629, + "step": 10749 + }, + { + "epoch": 0.5853804632673759, + "grad_norm": 0.6576936109864955, + "learning_rate": 0.0001659392775729562, + "loss": 12.3767, + "step": 10750 + }, + { + "epoch": 0.5854349172639589, + "grad_norm": 0.6351268245810511, + "learning_rate": 0.00016593264776225528, + "loss": 12.4524, + "step": 10751 + }, + { + "epoch": 0.5854893712605419, + "grad_norm": 0.7243956423960791, + "learning_rate": 0.00016592601743885106, + "loss": 12.2262, + "step": 10752 + }, + { + "epoch": 0.585543825257125, + "grad_norm": 0.5824637034568017, + "learning_rate": 0.00016591938660279507, + "loss": 12.1616, + "step": 10753 + }, + { + "epoch": 0.585598279253708, + "grad_norm": 0.6756771306162614, + "learning_rate": 0.00016591275525413887, + "loss": 12.3993, + "step": 10754 + }, + { + "epoch": 0.585652733250291, + "grad_norm": 0.6143731101255684, + "learning_rate": 0.00016590612339293403, + "loss": 12.2452, + "step": 10755 + }, + { + "epoch": 0.585707187246874, + "grad_norm": 0.6759993024361596, + "learning_rate": 0.0001658994910192321, + "loss": 12.3465, + "step": 10756 + }, + { + "epoch": 0.585761641243457, + "grad_norm": 0.5906562704215874, + "learning_rate": 0.0001658928581330847, + "loss": 12.3051, + "step": 10757 + }, + { + "epoch": 0.58581609524004, + "grad_norm": 0.7311117839177587, + "learning_rate": 0.0001658862247345434, + "loss": 12.3553, + "step": 10758 + }, + { + "epoch": 0.5858705492366231, + "grad_norm": 0.5858251134557313, + "learning_rate": 0.00016587959082365976, + "loss": 12.4093, + "step": 10759 + }, + { + "epoch": 0.5859250032332061, + "grad_norm": 0.6897485784356513, + "learning_rate": 0.00016587295640048535, + "loss": 12.2952, + "step": 10760 + }, + { + "epoch": 0.5859794572297891, + "grad_norm": 0.6207767171834111, + "learning_rate": 0.0001658663214650718, + "loss": 12.1458, + "step": 10761 + }, + { + "epoch": 0.586033911226372, + "grad_norm": 0.6302702106318713, + "learning_rate": 0.0001658596860174707, + "loss": 12.352, + "step": 10762 + }, + { + "epoch": 0.586088365222955, + "grad_norm": 0.6097753042029566, + "learning_rate": 0.0001658530500577336, + "loss": 12.4807, + "step": 10763 + }, + { + "epoch": 0.5861428192195381, + "grad_norm": 0.691173923513651, + "learning_rate": 0.00016584641358591217, + "loss": 12.3892, + "step": 10764 + }, + { + "epoch": 0.5861972732161211, + "grad_norm": 0.6443847707271058, + "learning_rate": 0.00016583977660205798, + "loss": 12.4726, + "step": 10765 + }, + { + "epoch": 0.5862517272127041, + "grad_norm": 0.5835580112949047, + "learning_rate": 0.00016583313910622263, + "loss": 12.3632, + "step": 10766 + }, + { + "epoch": 0.5863061812092871, + "grad_norm": 0.5751536494120262, + "learning_rate": 0.00016582650109845778, + "loss": 12.5108, + "step": 10767 + }, + { + "epoch": 0.5863606352058701, + "grad_norm": 0.6089091834016841, + "learning_rate": 0.00016581986257881498, + "loss": 12.2119, + "step": 10768 + }, + { + "epoch": 0.5864150892024531, + "grad_norm": 0.5850101068908325, + "learning_rate": 0.00016581322354734592, + "loss": 12.4388, + "step": 10769 + }, + { + "epoch": 0.5864695431990362, + "grad_norm": 0.6138029272911438, + "learning_rate": 0.0001658065840041022, + "loss": 12.3174, + "step": 10770 + }, + { + "epoch": 0.5865239971956192, + "grad_norm": 0.5908410281167666, + "learning_rate": 0.00016579994394913544, + "loss": 12.4531, + "step": 10771 + }, + { + "epoch": 0.5865784511922022, + "grad_norm": 0.5352941946472402, + "learning_rate": 0.00016579330338249728, + "loss": 12.3342, + "step": 10772 + }, + { + "epoch": 0.5866329051887852, + "grad_norm": 0.5648190561349754, + "learning_rate": 0.00016578666230423935, + "loss": 12.2958, + "step": 10773 + }, + { + "epoch": 0.5866873591853682, + "grad_norm": 0.6392221609775258, + "learning_rate": 0.0001657800207144133, + "loss": 12.4991, + "step": 10774 + }, + { + "epoch": 0.5867418131819512, + "grad_norm": 0.6129857611687894, + "learning_rate": 0.00016577337861307077, + "loss": 12.4036, + "step": 10775 + }, + { + "epoch": 0.5867962671785343, + "grad_norm": 0.627119067624158, + "learning_rate": 0.00016576673600026345, + "loss": 12.3447, + "step": 10776 + }, + { + "epoch": 0.5868507211751173, + "grad_norm": 0.5847458260011068, + "learning_rate": 0.00016576009287604294, + "loss": 12.3191, + "step": 10777 + }, + { + "epoch": 0.5869051751717003, + "grad_norm": 0.6644982665189451, + "learning_rate": 0.00016575344924046092, + "loss": 12.3171, + "step": 10778 + }, + { + "epoch": 0.5869596291682833, + "grad_norm": 0.6030409735792923, + "learning_rate": 0.00016574680509356908, + "loss": 12.3177, + "step": 10779 + }, + { + "epoch": 0.5870140831648663, + "grad_norm": 0.639646300876133, + "learning_rate": 0.00016574016043541901, + "loss": 12.3706, + "step": 10780 + }, + { + "epoch": 0.5870685371614492, + "grad_norm": 0.6364669286753462, + "learning_rate": 0.00016573351526606248, + "loss": 12.3207, + "step": 10781 + }, + { + "epoch": 0.5871229911580323, + "grad_norm": 0.6027203188028022, + "learning_rate": 0.00016572686958555107, + "loss": 12.3687, + "step": 10782 + }, + { + "epoch": 0.5871774451546153, + "grad_norm": 0.586665394642215, + "learning_rate": 0.00016572022339393652, + "loss": 12.1757, + "step": 10783 + }, + { + "epoch": 0.5872318991511983, + "grad_norm": 0.5734703142469153, + "learning_rate": 0.00016571357669127048, + "loss": 12.3066, + "step": 10784 + }, + { + "epoch": 0.5872863531477813, + "grad_norm": 0.6073111293302085, + "learning_rate": 0.00016570692947760464, + "loss": 12.3468, + "step": 10785 + }, + { + "epoch": 0.5873408071443643, + "grad_norm": 0.6017266465861099, + "learning_rate": 0.00016570028175299072, + "loss": 12.0879, + "step": 10786 + }, + { + "epoch": 0.5873952611409473, + "grad_norm": 0.6343969140689849, + "learning_rate": 0.0001656936335174804, + "loss": 12.3714, + "step": 10787 + }, + { + "epoch": 0.5874497151375304, + "grad_norm": 0.6209890963122027, + "learning_rate": 0.00016568698477112533, + "loss": 12.4377, + "step": 10788 + }, + { + "epoch": 0.5875041691341134, + "grad_norm": 0.5831684736009015, + "learning_rate": 0.00016568033551397728, + "loss": 12.3306, + "step": 10789 + }, + { + "epoch": 0.5875586231306964, + "grad_norm": 0.6591646439527126, + "learning_rate": 0.00016567368574608792, + "loss": 12.4348, + "step": 10790 + }, + { + "epoch": 0.5876130771272794, + "grad_norm": 0.6084255226376905, + "learning_rate": 0.00016566703546750896, + "loss": 12.2603, + "step": 10791 + }, + { + "epoch": 0.5876675311238624, + "grad_norm": 0.5709180561818189, + "learning_rate": 0.00016566038467829213, + "loss": 12.384, + "step": 10792 + }, + { + "epoch": 0.5877219851204454, + "grad_norm": 0.6491470129190081, + "learning_rate": 0.00016565373337848913, + "loss": 12.4071, + "step": 10793 + }, + { + "epoch": 0.5877764391170285, + "grad_norm": 0.6136482109577118, + "learning_rate": 0.00016564708156815167, + "loss": 12.2885, + "step": 10794 + }, + { + "epoch": 0.5878308931136115, + "grad_norm": 0.5805410890498897, + "learning_rate": 0.00016564042924733152, + "loss": 12.3627, + "step": 10795 + }, + { + "epoch": 0.5878853471101945, + "grad_norm": 0.5737420997967682, + "learning_rate": 0.00016563377641608037, + "loss": 12.336, + "step": 10796 + }, + { + "epoch": 0.5879398011067775, + "grad_norm": 0.6520150115053743, + "learning_rate": 0.00016562712307445, + "loss": 12.3845, + "step": 10797 + }, + { + "epoch": 0.5879942551033605, + "grad_norm": 0.5972841491680021, + "learning_rate": 0.0001656204692224921, + "loss": 12.3759, + "step": 10798 + }, + { + "epoch": 0.5880487090999436, + "grad_norm": 0.5493871853747994, + "learning_rate": 0.0001656138148602584, + "loss": 12.1783, + "step": 10799 + }, + { + "epoch": 0.5881031630965265, + "grad_norm": 0.5822890201774393, + "learning_rate": 0.0001656071599878007, + "loss": 12.2892, + "step": 10800 + }, + { + "epoch": 0.5881576170931095, + "grad_norm": 0.6324818215286, + "learning_rate": 0.0001656005046051707, + "loss": 12.3577, + "step": 10801 + }, + { + "epoch": 0.5882120710896925, + "grad_norm": 0.5642795838039929, + "learning_rate": 0.0001655938487124202, + "loss": 12.3569, + "step": 10802 + }, + { + "epoch": 0.5882665250862755, + "grad_norm": 0.6910574345507834, + "learning_rate": 0.00016558719230960094, + "loss": 12.4795, + "step": 10803 + }, + { + "epoch": 0.5883209790828585, + "grad_norm": 0.542802476435009, + "learning_rate": 0.00016558053539676463, + "loss": 12.2123, + "step": 10804 + }, + { + "epoch": 0.5883754330794416, + "grad_norm": 0.6043617230353788, + "learning_rate": 0.0001655738779739631, + "loss": 12.3511, + "step": 10805 + }, + { + "epoch": 0.5884298870760246, + "grad_norm": 0.5661938081811106, + "learning_rate": 0.0001655672200412481, + "loss": 12.3841, + "step": 10806 + }, + { + "epoch": 0.5884843410726076, + "grad_norm": 0.5513840276449579, + "learning_rate": 0.0001655605615986714, + "loss": 12.3578, + "step": 10807 + }, + { + "epoch": 0.5885387950691906, + "grad_norm": 0.5935567757194304, + "learning_rate": 0.00016555390264628482, + "loss": 12.2641, + "step": 10808 + }, + { + "epoch": 0.5885932490657736, + "grad_norm": 0.6094467186140181, + "learning_rate": 0.00016554724318414005, + "loss": 12.3186, + "step": 10809 + }, + { + "epoch": 0.5886477030623566, + "grad_norm": 0.587326969918712, + "learning_rate": 0.00016554058321228892, + "loss": 12.3582, + "step": 10810 + }, + { + "epoch": 0.5887021570589397, + "grad_norm": 0.605656258922736, + "learning_rate": 0.00016553392273078324, + "loss": 12.3631, + "step": 10811 + }, + { + "epoch": 0.5887566110555227, + "grad_norm": 0.5416599548400733, + "learning_rate": 0.00016552726173967478, + "loss": 12.1228, + "step": 10812 + }, + { + "epoch": 0.5888110650521057, + "grad_norm": 0.6324066438312699, + "learning_rate": 0.00016552060023901537, + "loss": 12.368, + "step": 10813 + }, + { + "epoch": 0.5888655190486887, + "grad_norm": 0.5809986823631331, + "learning_rate": 0.00016551393822885675, + "loss": 12.251, + "step": 10814 + }, + { + "epoch": 0.5889199730452717, + "grad_norm": 0.5715275065882698, + "learning_rate": 0.00016550727570925077, + "loss": 12.3118, + "step": 10815 + }, + { + "epoch": 0.5889744270418547, + "grad_norm": 0.5556233171285592, + "learning_rate": 0.00016550061268024924, + "loss": 12.3867, + "step": 10816 + }, + { + "epoch": 0.5890288810384378, + "grad_norm": 0.551949562675175, + "learning_rate": 0.00016549394914190394, + "loss": 12.2687, + "step": 10817 + }, + { + "epoch": 0.5890833350350207, + "grad_norm": 0.6604225135995097, + "learning_rate": 0.0001654872850942667, + "loss": 12.2866, + "step": 10818 + }, + { + "epoch": 0.5891377890316037, + "grad_norm": 0.6122826288090935, + "learning_rate": 0.00016548062053738943, + "loss": 12.3139, + "step": 10819 + }, + { + "epoch": 0.5891922430281867, + "grad_norm": 0.6336015057735573, + "learning_rate": 0.00016547395547132377, + "loss": 12.3545, + "step": 10820 + }, + { + "epoch": 0.5892466970247697, + "grad_norm": 0.6077660242831809, + "learning_rate": 0.0001654672898961217, + "loss": 12.4284, + "step": 10821 + }, + { + "epoch": 0.5893011510213527, + "grad_norm": 0.5933924734135819, + "learning_rate": 0.00016546062381183504, + "loss": 12.3477, + "step": 10822 + }, + { + "epoch": 0.5893556050179358, + "grad_norm": 0.5704821578368983, + "learning_rate": 0.00016545395721851553, + "loss": 12.3658, + "step": 10823 + }, + { + "epoch": 0.5894100590145188, + "grad_norm": 0.5531890981277237, + "learning_rate": 0.00016544729011621509, + "loss": 12.1956, + "step": 10824 + }, + { + "epoch": 0.5894645130111018, + "grad_norm": 0.6078577362006647, + "learning_rate": 0.00016544062250498556, + "loss": 12.1014, + "step": 10825 + }, + { + "epoch": 0.5895189670076848, + "grad_norm": 0.6041162891948126, + "learning_rate": 0.00016543395438487876, + "loss": 12.3498, + "step": 10826 + }, + { + "epoch": 0.5895734210042678, + "grad_norm": 0.59779517633144, + "learning_rate": 0.00016542728575594657, + "loss": 12.3556, + "step": 10827 + }, + { + "epoch": 0.5896278750008509, + "grad_norm": 0.6802641767919028, + "learning_rate": 0.00016542061661824081, + "loss": 12.3733, + "step": 10828 + }, + { + "epoch": 0.5896823289974339, + "grad_norm": 0.6133795427116091, + "learning_rate": 0.0001654139469718134, + "loss": 12.3098, + "step": 10829 + }, + { + "epoch": 0.5897367829940169, + "grad_norm": 0.625457286881729, + "learning_rate": 0.00016540727681671613, + "loss": 12.326, + "step": 10830 + }, + { + "epoch": 0.5897912369905999, + "grad_norm": 0.5949393486441156, + "learning_rate": 0.00016540060615300096, + "loss": 12.2548, + "step": 10831 + }, + { + "epoch": 0.5898456909871829, + "grad_norm": 0.5972619721684153, + "learning_rate": 0.00016539393498071967, + "loss": 12.2434, + "step": 10832 + }, + { + "epoch": 0.5899001449837659, + "grad_norm": 0.6253120159387654, + "learning_rate": 0.00016538726329992418, + "loss": 12.392, + "step": 10833 + }, + { + "epoch": 0.589954598980349, + "grad_norm": 0.6160790096062935, + "learning_rate": 0.00016538059111066635, + "loss": 12.4191, + "step": 10834 + }, + { + "epoch": 0.590009052976932, + "grad_norm": 0.6596655883939287, + "learning_rate": 0.0001653739184129981, + "loss": 12.4099, + "step": 10835 + }, + { + "epoch": 0.590063506973515, + "grad_norm": 0.5650329151645308, + "learning_rate": 0.0001653672452069713, + "loss": 12.2878, + "step": 10836 + }, + { + "epoch": 0.5901179609700979, + "grad_norm": 0.582767331234623, + "learning_rate": 0.00016536057149263783, + "loss": 12.3784, + "step": 10837 + }, + { + "epoch": 0.5901724149666809, + "grad_norm": 0.6578672810819257, + "learning_rate": 0.0001653538972700496, + "loss": 12.3418, + "step": 10838 + }, + { + "epoch": 0.5902268689632639, + "grad_norm": 0.5438566298833007, + "learning_rate": 0.0001653472225392585, + "loss": 12.3361, + "step": 10839 + }, + { + "epoch": 0.590281322959847, + "grad_norm": 0.5609747831008461, + "learning_rate": 0.00016534054730031643, + "loss": 12.4159, + "step": 10840 + }, + { + "epoch": 0.59033577695643, + "grad_norm": 0.5772256672982281, + "learning_rate": 0.00016533387155327533, + "loss": 12.3209, + "step": 10841 + }, + { + "epoch": 0.590390230953013, + "grad_norm": 0.564930919863431, + "learning_rate": 0.00016532719529818704, + "loss": 12.1613, + "step": 10842 + }, + { + "epoch": 0.590444684949596, + "grad_norm": 0.5629067263774307, + "learning_rate": 0.00016532051853510358, + "loss": 12.4296, + "step": 10843 + }, + { + "epoch": 0.590499138946179, + "grad_norm": 0.5442203642637816, + "learning_rate": 0.00016531384126407681, + "loss": 12.3441, + "step": 10844 + }, + { + "epoch": 0.590553592942762, + "grad_norm": 0.6046789600876962, + "learning_rate": 0.00016530716348515863, + "loss": 12.3517, + "step": 10845 + }, + { + "epoch": 0.5906080469393451, + "grad_norm": 0.580830860122665, + "learning_rate": 0.000165300485198401, + "loss": 12.2095, + "step": 10846 + }, + { + "epoch": 0.5906625009359281, + "grad_norm": 0.6466629974220863, + "learning_rate": 0.00016529380640385589, + "loss": 12.4153, + "step": 10847 + }, + { + "epoch": 0.5907169549325111, + "grad_norm": 0.6339639913605484, + "learning_rate": 0.00016528712710157513, + "loss": 12.4205, + "step": 10848 + }, + { + "epoch": 0.5907714089290941, + "grad_norm": 0.6195441409327394, + "learning_rate": 0.00016528044729161075, + "loss": 12.2604, + "step": 10849 + }, + { + "epoch": 0.5908258629256771, + "grad_norm": 0.6118504980387967, + "learning_rate": 0.00016527376697401464, + "loss": 12.3924, + "step": 10850 + }, + { + "epoch": 0.5908803169222601, + "grad_norm": 0.5882431282833214, + "learning_rate": 0.0001652670861488388, + "loss": 12.2959, + "step": 10851 + }, + { + "epoch": 0.5909347709188432, + "grad_norm": 0.6832713314144646, + "learning_rate": 0.00016526040481613515, + "loss": 12.3463, + "step": 10852 + }, + { + "epoch": 0.5909892249154262, + "grad_norm": 0.6671222847117588, + "learning_rate": 0.00016525372297595563, + "loss": 12.3169, + "step": 10853 + }, + { + "epoch": 0.5910436789120092, + "grad_norm": 0.5630347120264059, + "learning_rate": 0.0001652470406283522, + "loss": 12.3274, + "step": 10854 + }, + { + "epoch": 0.5910981329085921, + "grad_norm": 0.69419863059305, + "learning_rate": 0.0001652403577733769, + "loss": 12.4147, + "step": 10855 + }, + { + "epoch": 0.5911525869051751, + "grad_norm": 0.6071255184021517, + "learning_rate": 0.0001652336744110816, + "loss": 12.2184, + "step": 10856 + }, + { + "epoch": 0.5912070409017581, + "grad_norm": 0.6381170907717836, + "learning_rate": 0.0001652269905415183, + "loss": 12.4493, + "step": 10857 + }, + { + "epoch": 0.5912614948983412, + "grad_norm": 0.6156674336316467, + "learning_rate": 0.000165220306164739, + "loss": 12.4876, + "step": 10858 + }, + { + "epoch": 0.5913159488949242, + "grad_norm": 0.6300556219877623, + "learning_rate": 0.00016521362128079568, + "loss": 12.4234, + "step": 10859 + }, + { + "epoch": 0.5913704028915072, + "grad_norm": 0.5903640259607501, + "learning_rate": 0.00016520693588974026, + "loss": 12.3806, + "step": 10860 + }, + { + "epoch": 0.5914248568880902, + "grad_norm": 0.6030768851262162, + "learning_rate": 0.0001652002499916248, + "loss": 12.2749, + "step": 10861 + }, + { + "epoch": 0.5914793108846732, + "grad_norm": 0.5589972880187812, + "learning_rate": 0.00016519356358650125, + "loss": 12.2358, + "step": 10862 + }, + { + "epoch": 0.5915337648812563, + "grad_norm": 0.6032709667447022, + "learning_rate": 0.00016518687667442164, + "loss": 12.352, + "step": 10863 + }, + { + "epoch": 0.5915882188778393, + "grad_norm": 0.5754656477262209, + "learning_rate": 0.00016518018925543791, + "loss": 12.3834, + "step": 10864 + }, + { + "epoch": 0.5916426728744223, + "grad_norm": 0.7367348500947619, + "learning_rate": 0.0001651735013296021, + "loss": 12.4114, + "step": 10865 + }, + { + "epoch": 0.5916971268710053, + "grad_norm": 0.5771815214137505, + "learning_rate": 0.00016516681289696625, + "loss": 12.2369, + "step": 10866 + }, + { + "epoch": 0.5917515808675883, + "grad_norm": 0.5986111921661764, + "learning_rate": 0.00016516012395758231, + "loss": 12.2583, + "step": 10867 + }, + { + "epoch": 0.5918060348641713, + "grad_norm": 0.7461476212901754, + "learning_rate": 0.00016515343451150232, + "loss": 12.3905, + "step": 10868 + }, + { + "epoch": 0.5918604888607544, + "grad_norm": 0.6240151613156143, + "learning_rate": 0.0001651467445587783, + "loss": 12.3152, + "step": 10869 + }, + { + "epoch": 0.5919149428573374, + "grad_norm": 0.6227323246109967, + "learning_rate": 0.00016514005409946228, + "loss": 12.2702, + "step": 10870 + }, + { + "epoch": 0.5919693968539204, + "grad_norm": 0.6066607791261394, + "learning_rate": 0.00016513336313360623, + "loss": 12.4799, + "step": 10871 + }, + { + "epoch": 0.5920238508505034, + "grad_norm": 0.621186986997314, + "learning_rate": 0.00016512667166126228, + "loss": 12.3654, + "step": 10872 + }, + { + "epoch": 0.5920783048470863, + "grad_norm": 0.6127640166956305, + "learning_rate": 0.0001651199796824824, + "loss": 12.3064, + "step": 10873 + }, + { + "epoch": 0.5921327588436693, + "grad_norm": 0.5862571393568207, + "learning_rate": 0.00016511328719731862, + "loss": 12.268, + "step": 10874 + }, + { + "epoch": 0.5921872128402524, + "grad_norm": 0.5395592050833669, + "learning_rate": 0.00016510659420582302, + "loss": 12.2713, + "step": 10875 + }, + { + "epoch": 0.5922416668368354, + "grad_norm": 0.540435062260064, + "learning_rate": 0.0001650999007080476, + "loss": 12.3872, + "step": 10876 + }, + { + "epoch": 0.5922961208334184, + "grad_norm": 0.5877623442475959, + "learning_rate": 0.00016509320670404445, + "loss": 12.2217, + "step": 10877 + }, + { + "epoch": 0.5923505748300014, + "grad_norm": 0.5951671429030703, + "learning_rate": 0.0001650865121938656, + "loss": 12.4027, + "step": 10878 + }, + { + "epoch": 0.5924050288265844, + "grad_norm": 0.6337323510468316, + "learning_rate": 0.0001650798171775631, + "loss": 12.3653, + "step": 10879 + }, + { + "epoch": 0.5924594828231674, + "grad_norm": 0.6740988540522351, + "learning_rate": 0.00016507312165518908, + "loss": 12.3495, + "step": 10880 + }, + { + "epoch": 0.5925139368197505, + "grad_norm": 0.5834673840846617, + "learning_rate": 0.00016506642562679548, + "loss": 12.0696, + "step": 10881 + }, + { + "epoch": 0.5925683908163335, + "grad_norm": 0.595216213228313, + "learning_rate": 0.0001650597290924345, + "loss": 12.3565, + "step": 10882 + }, + { + "epoch": 0.5926228448129165, + "grad_norm": 0.6432560364263551, + "learning_rate": 0.00016505303205215815, + "loss": 12.3607, + "step": 10883 + }, + { + "epoch": 0.5926772988094995, + "grad_norm": 0.586856815044308, + "learning_rate": 0.0001650463345060185, + "loss": 12.312, + "step": 10884 + }, + { + "epoch": 0.5927317528060825, + "grad_norm": 0.5832360201559386, + "learning_rate": 0.00016503963645406763, + "loss": 12.2108, + "step": 10885 + }, + { + "epoch": 0.5927862068026655, + "grad_norm": 0.6363677337191588, + "learning_rate": 0.00016503293789635768, + "loss": 12.2894, + "step": 10886 + }, + { + "epoch": 0.5928406607992486, + "grad_norm": 0.6366862094278207, + "learning_rate": 0.00016502623883294065, + "loss": 12.2471, + "step": 10887 + }, + { + "epoch": 0.5928951147958316, + "grad_norm": 0.6290404501627677, + "learning_rate": 0.00016501953926386872, + "loss": 12.3175, + "step": 10888 + }, + { + "epoch": 0.5929495687924146, + "grad_norm": 0.6032367742735714, + "learning_rate": 0.00016501283918919394, + "loss": 12.303, + "step": 10889 + }, + { + "epoch": 0.5930040227889976, + "grad_norm": 0.5908359572494107, + "learning_rate": 0.00016500613860896842, + "loss": 12.3907, + "step": 10890 + }, + { + "epoch": 0.5930584767855805, + "grad_norm": 0.6263770904868716, + "learning_rate": 0.00016499943752324426, + "loss": 12.3837, + "step": 10891 + }, + { + "epoch": 0.5931129307821635, + "grad_norm": 0.5627338226718853, + "learning_rate": 0.0001649927359320736, + "loss": 12.3537, + "step": 10892 + }, + { + "epoch": 0.5931673847787466, + "grad_norm": 0.5863381767298946, + "learning_rate": 0.00016498603383550847, + "loss": 12.311, + "step": 10893 + }, + { + "epoch": 0.5932218387753296, + "grad_norm": 0.5987115449104812, + "learning_rate": 0.0001649793312336011, + "loss": 12.4243, + "step": 10894 + }, + { + "epoch": 0.5932762927719126, + "grad_norm": 0.6024755309453307, + "learning_rate": 0.0001649726281264035, + "loss": 12.3362, + "step": 10895 + }, + { + "epoch": 0.5933307467684956, + "grad_norm": 0.5197680717807689, + "learning_rate": 0.00016496592451396787, + "loss": 12.3468, + "step": 10896 + }, + { + "epoch": 0.5933852007650786, + "grad_norm": 0.6111589305287626, + "learning_rate": 0.00016495922039634633, + "loss": 12.2591, + "step": 10897 + }, + { + "epoch": 0.5934396547616617, + "grad_norm": 0.6600374756617288, + "learning_rate": 0.000164952515773591, + "loss": 12.4693, + "step": 10898 + }, + { + "epoch": 0.5934941087582447, + "grad_norm": 0.6008571027135967, + "learning_rate": 0.00016494581064575397, + "loss": 12.2133, + "step": 10899 + }, + { + "epoch": 0.5935485627548277, + "grad_norm": 0.6174842596701852, + "learning_rate": 0.0001649391050128875, + "loss": 12.3995, + "step": 10900 + }, + { + "epoch": 0.5936030167514107, + "grad_norm": 0.5966629019054353, + "learning_rate": 0.0001649323988750436, + "loss": 12.3835, + "step": 10901 + }, + { + "epoch": 0.5936574707479937, + "grad_norm": 0.6241955185191029, + "learning_rate": 0.0001649256922322745, + "loss": 12.3144, + "step": 10902 + }, + { + "epoch": 0.5937119247445767, + "grad_norm": 0.6195522331438866, + "learning_rate": 0.00016491898508463234, + "loss": 12.3529, + "step": 10903 + }, + { + "epoch": 0.5937663787411598, + "grad_norm": 0.6498619538274419, + "learning_rate": 0.00016491227743216925, + "loss": 12.346, + "step": 10904 + }, + { + "epoch": 0.5938208327377428, + "grad_norm": 0.6874857310664496, + "learning_rate": 0.00016490556927493738, + "loss": 12.3812, + "step": 10905 + }, + { + "epoch": 0.5938752867343258, + "grad_norm": 0.5901940883933078, + "learning_rate": 0.00016489886061298896, + "loss": 12.3213, + "step": 10906 + }, + { + "epoch": 0.5939297407309088, + "grad_norm": 0.686173498986228, + "learning_rate": 0.0001648921514463761, + "loss": 12.3013, + "step": 10907 + }, + { + "epoch": 0.5939841947274918, + "grad_norm": 0.6804757445573253, + "learning_rate": 0.000164885441775151, + "loss": 12.4066, + "step": 10908 + }, + { + "epoch": 0.5940386487240747, + "grad_norm": 0.5947407757059022, + "learning_rate": 0.0001648787315993658, + "loss": 12.3664, + "step": 10909 + }, + { + "epoch": 0.5940931027206579, + "grad_norm": 0.5568712242564401, + "learning_rate": 0.0001648720209190727, + "loss": 12.3068, + "step": 10910 + }, + { + "epoch": 0.5941475567172408, + "grad_norm": 0.6673244286932989, + "learning_rate": 0.00016486530973432387, + "loss": 12.4117, + "step": 10911 + }, + { + "epoch": 0.5942020107138238, + "grad_norm": 0.5962148224888308, + "learning_rate": 0.00016485859804517156, + "loss": 12.3541, + "step": 10912 + }, + { + "epoch": 0.5942564647104068, + "grad_norm": 0.7011669751181724, + "learning_rate": 0.00016485188585166794, + "loss": 12.3054, + "step": 10913 + }, + { + "epoch": 0.5943109187069898, + "grad_norm": 0.5983241513440197, + "learning_rate": 0.0001648451731538651, + "loss": 12.3304, + "step": 10914 + }, + { + "epoch": 0.5943653727035728, + "grad_norm": 0.6500449585705957, + "learning_rate": 0.00016483845995181537, + "loss": 12.3092, + "step": 10915 + }, + { + "epoch": 0.5944198267001559, + "grad_norm": 0.7336462821120124, + "learning_rate": 0.0001648317462455709, + "loss": 12.2935, + "step": 10916 + }, + { + "epoch": 0.5944742806967389, + "grad_norm": 0.6561148282831947, + "learning_rate": 0.00016482503203518387, + "loss": 12.3112, + "step": 10917 + }, + { + "epoch": 0.5945287346933219, + "grad_norm": 0.5695500651013421, + "learning_rate": 0.00016481831732070656, + "loss": 12.4201, + "step": 10918 + }, + { + "epoch": 0.5945831886899049, + "grad_norm": 0.5918640574477187, + "learning_rate": 0.0001648116021021911, + "loss": 12.185, + "step": 10919 + }, + { + "epoch": 0.5946376426864879, + "grad_norm": 0.5833610791625058, + "learning_rate": 0.00016480488637968978, + "loss": 12.256, + "step": 10920 + }, + { + "epoch": 0.5946920966830709, + "grad_norm": 0.6358894046455806, + "learning_rate": 0.00016479817015325478, + "loss": 12.2758, + "step": 10921 + }, + { + "epoch": 0.594746550679654, + "grad_norm": 0.6786451896587979, + "learning_rate": 0.00016479145342293837, + "loss": 12.3409, + "step": 10922 + }, + { + "epoch": 0.594801004676237, + "grad_norm": 0.6305781227330443, + "learning_rate": 0.00016478473618879272, + "loss": 12.2793, + "step": 10923 + }, + { + "epoch": 0.59485545867282, + "grad_norm": 0.6153787188546451, + "learning_rate": 0.00016477801845087012, + "loss": 12.2978, + "step": 10924 + }, + { + "epoch": 0.594909912669403, + "grad_norm": 0.6843363709563547, + "learning_rate": 0.00016477130020922277, + "loss": 12.4037, + "step": 10925 + }, + { + "epoch": 0.594964366665986, + "grad_norm": 0.6054378314241865, + "learning_rate": 0.00016476458146390296, + "loss": 12.3749, + "step": 10926 + }, + { + "epoch": 0.595018820662569, + "grad_norm": 0.6777685328611128, + "learning_rate": 0.0001647578622149629, + "loss": 12.3875, + "step": 10927 + }, + { + "epoch": 0.595073274659152, + "grad_norm": 0.6999698137431501, + "learning_rate": 0.00016475114246245482, + "loss": 12.2763, + "step": 10928 + }, + { + "epoch": 0.595127728655735, + "grad_norm": 0.6153828446300298, + "learning_rate": 0.00016474442220643098, + "loss": 12.386, + "step": 10929 + }, + { + "epoch": 0.595182182652318, + "grad_norm": 0.6144697499687396, + "learning_rate": 0.00016473770144694367, + "loss": 12.4144, + "step": 10930 + }, + { + "epoch": 0.595236636648901, + "grad_norm": 0.6330746485334776, + "learning_rate": 0.00016473098018404513, + "loss": 12.3863, + "step": 10931 + }, + { + "epoch": 0.595291090645484, + "grad_norm": 0.5948147818482774, + "learning_rate": 0.00016472425841778767, + "loss": 12.3491, + "step": 10932 + }, + { + "epoch": 0.5953455446420671, + "grad_norm": 0.5672408621231471, + "learning_rate": 0.0001647175361482235, + "loss": 12.1104, + "step": 10933 + }, + { + "epoch": 0.5953999986386501, + "grad_norm": 0.6833874407205437, + "learning_rate": 0.0001647108133754049, + "loss": 12.4, + "step": 10934 + }, + { + "epoch": 0.5954544526352331, + "grad_norm": 0.6494403309332063, + "learning_rate": 0.0001647040900993842, + "loss": 12.31, + "step": 10935 + }, + { + "epoch": 0.5955089066318161, + "grad_norm": 0.6642420865825933, + "learning_rate": 0.0001646973663202136, + "loss": 12.435, + "step": 10936 + }, + { + "epoch": 0.5955633606283991, + "grad_norm": 0.7412025529100646, + "learning_rate": 0.00016469064203794543, + "loss": 12.5043, + "step": 10937 + }, + { + "epoch": 0.5956178146249821, + "grad_norm": 0.6756310812150244, + "learning_rate": 0.00016468391725263203, + "loss": 12.4486, + "step": 10938 + }, + { + "epoch": 0.5956722686215652, + "grad_norm": 0.6006988925370298, + "learning_rate": 0.0001646771919643256, + "loss": 12.2226, + "step": 10939 + }, + { + "epoch": 0.5957267226181482, + "grad_norm": 0.7039158770908783, + "learning_rate": 0.0001646704661730785, + "loss": 12.4274, + "step": 10940 + }, + { + "epoch": 0.5957811766147312, + "grad_norm": 0.6309968178306992, + "learning_rate": 0.000164663739878943, + "loss": 12.4284, + "step": 10941 + }, + { + "epoch": 0.5958356306113142, + "grad_norm": 0.5875586723598385, + "learning_rate": 0.00016465701308197143, + "loss": 12.3632, + "step": 10942 + }, + { + "epoch": 0.5958900846078972, + "grad_norm": 0.6376103253963531, + "learning_rate": 0.00016465028578221605, + "loss": 12.3814, + "step": 10943 + }, + { + "epoch": 0.5959445386044802, + "grad_norm": 0.5810749677416577, + "learning_rate": 0.00016464355797972922, + "loss": 12.3421, + "step": 10944 + }, + { + "epoch": 0.5959989926010633, + "grad_norm": 0.6630727468617987, + "learning_rate": 0.00016463682967456325, + "loss": 12.3919, + "step": 10945 + }, + { + "epoch": 0.5960534465976463, + "grad_norm": 0.633931614301666, + "learning_rate": 0.00016463010086677048, + "loss": 12.36, + "step": 10946 + }, + { + "epoch": 0.5961079005942292, + "grad_norm": 0.58915438361807, + "learning_rate": 0.00016462337155640316, + "loss": 12.3195, + "step": 10947 + }, + { + "epoch": 0.5961623545908122, + "grad_norm": 0.5674771043498464, + "learning_rate": 0.0001646166417435137, + "loss": 12.3314, + "step": 10948 + }, + { + "epoch": 0.5962168085873952, + "grad_norm": 0.6278043046671282, + "learning_rate": 0.00016460991142815435, + "loss": 12.4202, + "step": 10949 + }, + { + "epoch": 0.5962712625839782, + "grad_norm": 0.6091194463995885, + "learning_rate": 0.00016460318061037752, + "loss": 12.2741, + "step": 10950 + }, + { + "epoch": 0.5963257165805613, + "grad_norm": 0.5929091072302531, + "learning_rate": 0.00016459644929023553, + "loss": 12.215, + "step": 10951 + }, + { + "epoch": 0.5963801705771443, + "grad_norm": 0.539969046068889, + "learning_rate": 0.00016458971746778072, + "loss": 12.2799, + "step": 10952 + }, + { + "epoch": 0.5964346245737273, + "grad_norm": 0.5477239819241904, + "learning_rate": 0.00016458298514306546, + "loss": 12.2254, + "step": 10953 + }, + { + "epoch": 0.5964890785703103, + "grad_norm": 0.5945626403678087, + "learning_rate": 0.000164576252316142, + "loss": 12.2452, + "step": 10954 + }, + { + "epoch": 0.5965435325668933, + "grad_norm": 0.6720891255289737, + "learning_rate": 0.00016456951898706284, + "loss": 12.3979, + "step": 10955 + }, + { + "epoch": 0.5965979865634763, + "grad_norm": 0.6707997944507423, + "learning_rate": 0.00016456278515588024, + "loss": 12.4544, + "step": 10956 + }, + { + "epoch": 0.5966524405600594, + "grad_norm": 0.6431423071095793, + "learning_rate": 0.0001645560508226466, + "loss": 12.301, + "step": 10957 + }, + { + "epoch": 0.5967068945566424, + "grad_norm": 0.6959504961207419, + "learning_rate": 0.0001645493159874143, + "loss": 12.3559, + "step": 10958 + }, + { + "epoch": 0.5967613485532254, + "grad_norm": 0.5793628168767085, + "learning_rate": 0.00016454258065023568, + "loss": 12.3758, + "step": 10959 + }, + { + "epoch": 0.5968158025498084, + "grad_norm": 0.6424349375411575, + "learning_rate": 0.00016453584481116313, + "loss": 12.3487, + "step": 10960 + }, + { + "epoch": 0.5968702565463914, + "grad_norm": 0.6275970493399082, + "learning_rate": 0.00016452910847024901, + "loss": 12.4393, + "step": 10961 + }, + { + "epoch": 0.5969247105429745, + "grad_norm": 0.6446562906094953, + "learning_rate": 0.00016452237162754577, + "loss": 12.4605, + "step": 10962 + }, + { + "epoch": 0.5969791645395575, + "grad_norm": 0.5735543134471323, + "learning_rate": 0.00016451563428310575, + "loss": 12.2989, + "step": 10963 + }, + { + "epoch": 0.5970336185361405, + "grad_norm": 0.6222944111973252, + "learning_rate": 0.0001645088964369813, + "loss": 12.2292, + "step": 10964 + }, + { + "epoch": 0.5970880725327234, + "grad_norm": 0.5985903452256347, + "learning_rate": 0.00016450215808922483, + "loss": 12.3215, + "step": 10965 + }, + { + "epoch": 0.5971425265293064, + "grad_norm": 0.7412074618453982, + "learning_rate": 0.0001644954192398888, + "loss": 12.264, + "step": 10966 + }, + { + "epoch": 0.5971969805258894, + "grad_norm": 0.6439435344918585, + "learning_rate": 0.00016448867988902557, + "loss": 12.3645, + "step": 10967 + }, + { + "epoch": 0.5972514345224725, + "grad_norm": 0.6429930022822522, + "learning_rate": 0.00016448194003668756, + "loss": 12.1924, + "step": 10968 + }, + { + "epoch": 0.5973058885190555, + "grad_norm": 0.6067151013037995, + "learning_rate": 0.00016447519968292715, + "loss": 12.2995, + "step": 10969 + }, + { + "epoch": 0.5973603425156385, + "grad_norm": 0.6413785659924183, + "learning_rate": 0.0001644684588277968, + "loss": 12.2482, + "step": 10970 + }, + { + "epoch": 0.5974147965122215, + "grad_norm": 0.6230928886773899, + "learning_rate": 0.00016446171747134888, + "loss": 12.3143, + "step": 10971 + }, + { + "epoch": 0.5974692505088045, + "grad_norm": 0.6399022650982504, + "learning_rate": 0.0001644549756136358, + "loss": 12.2518, + "step": 10972 + }, + { + "epoch": 0.5975237045053875, + "grad_norm": 0.6035692650993121, + "learning_rate": 0.0001644482332547101, + "loss": 12.3228, + "step": 10973 + }, + { + "epoch": 0.5975781585019706, + "grad_norm": 0.5990368630408056, + "learning_rate": 0.00016444149039462409, + "loss": 12.218, + "step": 10974 + }, + { + "epoch": 0.5976326124985536, + "grad_norm": 0.6111786788841953, + "learning_rate": 0.00016443474703343024, + "loss": 12.3642, + "step": 10975 + }, + { + "epoch": 0.5976870664951366, + "grad_norm": 0.6189012848292096, + "learning_rate": 0.000164428003171181, + "loss": 12.3183, + "step": 10976 + }, + { + "epoch": 0.5977415204917196, + "grad_norm": 0.5916709873890514, + "learning_rate": 0.0001644212588079288, + "loss": 12.3817, + "step": 10977 + }, + { + "epoch": 0.5977959744883026, + "grad_norm": 0.7836661075155655, + "learning_rate": 0.00016441451394372608, + "loss": 12.2071, + "step": 10978 + }, + { + "epoch": 0.5978504284848856, + "grad_norm": 0.5987173941667159, + "learning_rate": 0.0001644077685786253, + "loss": 12.5297, + "step": 10979 + }, + { + "epoch": 0.5979048824814687, + "grad_norm": 0.5824740328749117, + "learning_rate": 0.00016440102271267893, + "loss": 12.3024, + "step": 10980 + }, + { + "epoch": 0.5979593364780517, + "grad_norm": 0.5981170336986796, + "learning_rate": 0.00016439427634593938, + "loss": 12.1627, + "step": 10981 + }, + { + "epoch": 0.5980137904746347, + "grad_norm": 0.6307947757447596, + "learning_rate": 0.00016438752947845915, + "loss": 12.3911, + "step": 10982 + }, + { + "epoch": 0.5980682444712176, + "grad_norm": 0.6119378461428637, + "learning_rate": 0.0001643807821102907, + "loss": 12.4517, + "step": 10983 + }, + { + "epoch": 0.5981226984678006, + "grad_norm": 0.5535905143981861, + "learning_rate": 0.00016437403424148647, + "loss": 12.1956, + "step": 10984 + }, + { + "epoch": 0.5981771524643836, + "grad_norm": 0.6197162454189239, + "learning_rate": 0.00016436728587209898, + "loss": 12.4103, + "step": 10985 + }, + { + "epoch": 0.5982316064609667, + "grad_norm": 0.6043613316867456, + "learning_rate": 0.00016436053700218066, + "loss": 12.3657, + "step": 10986 + }, + { + "epoch": 0.5982860604575497, + "grad_norm": 0.6080395119756771, + "learning_rate": 0.00016435378763178404, + "loss": 12.3862, + "step": 10987 + }, + { + "epoch": 0.5983405144541327, + "grad_norm": 0.6295366320303318, + "learning_rate": 0.00016434703776096153, + "loss": 12.3586, + "step": 10988 + }, + { + "epoch": 0.5983949684507157, + "grad_norm": 0.6145068345615352, + "learning_rate": 0.0001643402873897657, + "loss": 12.3016, + "step": 10989 + }, + { + "epoch": 0.5984494224472987, + "grad_norm": 0.5839673827473969, + "learning_rate": 0.000164333536518249, + "loss": 12.4692, + "step": 10990 + }, + { + "epoch": 0.5985038764438817, + "grad_norm": 0.6179508354992466, + "learning_rate": 0.00016432678514646392, + "loss": 12.321, + "step": 10991 + }, + { + "epoch": 0.5985583304404648, + "grad_norm": 0.5806189956726392, + "learning_rate": 0.000164320033274463, + "loss": 12.4132, + "step": 10992 + }, + { + "epoch": 0.5986127844370478, + "grad_norm": 0.6172660230088375, + "learning_rate": 0.0001643132809022987, + "loss": 12.3561, + "step": 10993 + }, + { + "epoch": 0.5986672384336308, + "grad_norm": 0.574460998452746, + "learning_rate": 0.0001643065280300235, + "loss": 12.3413, + "step": 10994 + }, + { + "epoch": 0.5987216924302138, + "grad_norm": 0.5811058668708309, + "learning_rate": 0.00016429977465769, + "loss": 12.3356, + "step": 10995 + }, + { + "epoch": 0.5987761464267968, + "grad_norm": 0.61660924334426, + "learning_rate": 0.00016429302078535066, + "loss": 12.1396, + "step": 10996 + }, + { + "epoch": 0.5988306004233799, + "grad_norm": 0.5950546465610835, + "learning_rate": 0.000164286266413058, + "loss": 12.2852, + "step": 10997 + }, + { + "epoch": 0.5988850544199629, + "grad_norm": 0.576631418146556, + "learning_rate": 0.00016427951154086455, + "loss": 12.3239, + "step": 10998 + }, + { + "epoch": 0.5989395084165459, + "grad_norm": 0.5844309618486992, + "learning_rate": 0.00016427275616882287, + "loss": 12.3428, + "step": 10999 + }, + { + "epoch": 0.5989939624131289, + "grad_norm": 0.6278048459137753, + "learning_rate": 0.00016426600029698543, + "loss": 12.3291, + "step": 11000 + }, + { + "epoch": 0.5990484164097118, + "grad_norm": 0.5475127016067086, + "learning_rate": 0.0001642592439254048, + "loss": 12.2299, + "step": 11001 + }, + { + "epoch": 0.5991028704062948, + "grad_norm": 0.6725176166321495, + "learning_rate": 0.0001642524870541335, + "loss": 12.2709, + "step": 11002 + }, + { + "epoch": 0.5991573244028779, + "grad_norm": 0.6494389238030606, + "learning_rate": 0.00016424572968322412, + "loss": 12.3964, + "step": 11003 + }, + { + "epoch": 0.5992117783994609, + "grad_norm": 0.6780755193103549, + "learning_rate": 0.00016423897181272915, + "loss": 12.3481, + "step": 11004 + }, + { + "epoch": 0.5992662323960439, + "grad_norm": 0.5891529447522418, + "learning_rate": 0.00016423221344270114, + "loss": 12.4465, + "step": 11005 + }, + { + "epoch": 0.5993206863926269, + "grad_norm": 0.6440257797821093, + "learning_rate": 0.0001642254545731927, + "loss": 12.4303, + "step": 11006 + }, + { + "epoch": 0.5993751403892099, + "grad_norm": 0.6291431127412522, + "learning_rate": 0.00016421869520425632, + "loss": 12.3692, + "step": 11007 + }, + { + "epoch": 0.5994295943857929, + "grad_norm": 0.5696472824113517, + "learning_rate": 0.00016421193533594465, + "loss": 12.2237, + "step": 11008 + }, + { + "epoch": 0.599484048382376, + "grad_norm": 0.6722822267146954, + "learning_rate": 0.00016420517496831016, + "loss": 12.3112, + "step": 11009 + }, + { + "epoch": 0.599538502378959, + "grad_norm": 0.6023223915088669, + "learning_rate": 0.00016419841410140546, + "loss": 12.2093, + "step": 11010 + }, + { + "epoch": 0.599592956375542, + "grad_norm": 0.6379524481212994, + "learning_rate": 0.00016419165273528317, + "loss": 12.233, + "step": 11011 + }, + { + "epoch": 0.599647410372125, + "grad_norm": 0.5829972476929912, + "learning_rate": 0.00016418489086999577, + "loss": 12.3742, + "step": 11012 + }, + { + "epoch": 0.599701864368708, + "grad_norm": 0.6268520971080569, + "learning_rate": 0.00016417812850559593, + "loss": 12.3992, + "step": 11013 + }, + { + "epoch": 0.599756318365291, + "grad_norm": 0.6059742211114041, + "learning_rate": 0.0001641713656421362, + "loss": 12.2391, + "step": 11014 + }, + { + "epoch": 0.5998107723618741, + "grad_norm": 0.5977241627542447, + "learning_rate": 0.00016416460227966915, + "loss": 12.3261, + "step": 11015 + }, + { + "epoch": 0.5998652263584571, + "grad_norm": 0.6257018673855346, + "learning_rate": 0.00016415783841824738, + "loss": 12.3806, + "step": 11016 + }, + { + "epoch": 0.5999196803550401, + "grad_norm": 0.7731443352708557, + "learning_rate": 0.00016415107405792352, + "loss": 12.4871, + "step": 11017 + }, + { + "epoch": 0.5999741343516231, + "grad_norm": 0.5933405099016545, + "learning_rate": 0.0001641443091987502, + "loss": 12.2664, + "step": 11018 + }, + { + "epoch": 0.600028588348206, + "grad_norm": 0.6599217013753389, + "learning_rate": 0.0001641375438407799, + "loss": 12.399, + "step": 11019 + }, + { + "epoch": 0.600083042344789, + "grad_norm": 0.732712892593169, + "learning_rate": 0.00016413077798406534, + "loss": 12.4574, + "step": 11020 + }, + { + "epoch": 0.6001374963413721, + "grad_norm": 0.618903863653971, + "learning_rate": 0.00016412401162865906, + "loss": 12.3222, + "step": 11021 + }, + { + "epoch": 0.6001919503379551, + "grad_norm": 0.5803114091083403, + "learning_rate": 0.00016411724477461372, + "loss": 12.2771, + "step": 11022 + }, + { + "epoch": 0.6002464043345381, + "grad_norm": 0.6900362551315334, + "learning_rate": 0.00016411047742198197, + "loss": 12.3792, + "step": 11023 + }, + { + "epoch": 0.6003008583311211, + "grad_norm": 0.6273452602392071, + "learning_rate": 0.00016410370957081638, + "loss": 12.1368, + "step": 11024 + }, + { + "epoch": 0.6003553123277041, + "grad_norm": 0.6024275706084519, + "learning_rate": 0.00016409694122116958, + "loss": 12.4001, + "step": 11025 + }, + { + "epoch": 0.6004097663242871, + "grad_norm": 0.5853978343141101, + "learning_rate": 0.00016409017237309424, + "loss": 12.2973, + "step": 11026 + }, + { + "epoch": 0.6004642203208702, + "grad_norm": 0.6306740379428283, + "learning_rate": 0.00016408340302664294, + "loss": 12.4726, + "step": 11027 + }, + { + "epoch": 0.6005186743174532, + "grad_norm": 0.582678644761307, + "learning_rate": 0.00016407663318186838, + "loss": 12.3409, + "step": 11028 + }, + { + "epoch": 0.6005731283140362, + "grad_norm": 0.6438980879627172, + "learning_rate": 0.00016406986283882315, + "loss": 12.4626, + "step": 11029 + }, + { + "epoch": 0.6006275823106192, + "grad_norm": 0.6516172706708898, + "learning_rate": 0.00016406309199755992, + "loss": 12.3803, + "step": 11030 + }, + { + "epoch": 0.6006820363072022, + "grad_norm": 0.6589363085519371, + "learning_rate": 0.00016405632065813136, + "loss": 12.3313, + "step": 11031 + }, + { + "epoch": 0.6007364903037853, + "grad_norm": 0.6892402550538832, + "learning_rate": 0.00016404954882059012, + "loss": 12.4779, + "step": 11032 + }, + { + "epoch": 0.6007909443003683, + "grad_norm": 0.5614826899245277, + "learning_rate": 0.00016404277648498881, + "loss": 12.2103, + "step": 11033 + }, + { + "epoch": 0.6008453982969513, + "grad_norm": 0.5673320871722737, + "learning_rate": 0.00016403600365138017, + "loss": 12.3491, + "step": 11034 + }, + { + "epoch": 0.6008998522935343, + "grad_norm": 0.6451029222343776, + "learning_rate": 0.00016402923031981682, + "loss": 12.3691, + "step": 11035 + }, + { + "epoch": 0.6009543062901173, + "grad_norm": 0.5993129509634276, + "learning_rate": 0.00016402245649035143, + "loss": 12.3083, + "step": 11036 + }, + { + "epoch": 0.6010087602867002, + "grad_norm": 0.8425288989468626, + "learning_rate": 0.00016401568216303666, + "loss": 12.2698, + "step": 11037 + }, + { + "epoch": 0.6010632142832834, + "grad_norm": 0.6080488327074888, + "learning_rate": 0.00016400890733792522, + "loss": 12.3828, + "step": 11038 + }, + { + "epoch": 0.6011176682798663, + "grad_norm": 0.6149323405989047, + "learning_rate": 0.0001640021320150698, + "loss": 12.29, + "step": 11039 + }, + { + "epoch": 0.6011721222764493, + "grad_norm": 0.6503595167012923, + "learning_rate": 0.00016399535619452307, + "loss": 12.4554, + "step": 11040 + }, + { + "epoch": 0.6012265762730323, + "grad_norm": 0.6144374874038958, + "learning_rate": 0.00016398857987633768, + "loss": 12.3855, + "step": 11041 + }, + { + "epoch": 0.6012810302696153, + "grad_norm": 0.5659336071137602, + "learning_rate": 0.0001639818030605664, + "loss": 12.3596, + "step": 11042 + }, + { + "epoch": 0.6013354842661983, + "grad_norm": 0.5376304352006531, + "learning_rate": 0.00016397502574726187, + "loss": 12.2335, + "step": 11043 + }, + { + "epoch": 0.6013899382627814, + "grad_norm": 0.5924159115387776, + "learning_rate": 0.00016396824793647684, + "loss": 12.2906, + "step": 11044 + }, + { + "epoch": 0.6014443922593644, + "grad_norm": 0.6760554560177521, + "learning_rate": 0.00016396146962826394, + "loss": 12.3811, + "step": 11045 + }, + { + "epoch": 0.6014988462559474, + "grad_norm": 0.6041717706336399, + "learning_rate": 0.00016395469082267598, + "loss": 12.4151, + "step": 11046 + }, + { + "epoch": 0.6015533002525304, + "grad_norm": 0.6416209500189765, + "learning_rate": 0.00016394791151976557, + "loss": 12.458, + "step": 11047 + }, + { + "epoch": 0.6016077542491134, + "grad_norm": 0.622185400302005, + "learning_rate": 0.0001639411317195855, + "loss": 12.3202, + "step": 11048 + }, + { + "epoch": 0.6016622082456964, + "grad_norm": 0.6290605003665506, + "learning_rate": 0.0001639343514221885, + "loss": 12.4226, + "step": 11049 + }, + { + "epoch": 0.6017166622422795, + "grad_norm": 0.6200775111861807, + "learning_rate": 0.00016392757062762723, + "loss": 12.3675, + "step": 11050 + }, + { + "epoch": 0.6017711162388625, + "grad_norm": 0.6044956149656243, + "learning_rate": 0.00016392078933595442, + "loss": 12.4052, + "step": 11051 + }, + { + "epoch": 0.6018255702354455, + "grad_norm": 0.5622836150915892, + "learning_rate": 0.00016391400754722286, + "loss": 12.2558, + "step": 11052 + }, + { + "epoch": 0.6018800242320285, + "grad_norm": 0.7140968733839776, + "learning_rate": 0.00016390722526148525, + "loss": 12.4013, + "step": 11053 + }, + { + "epoch": 0.6019344782286115, + "grad_norm": 0.6898733093538983, + "learning_rate": 0.00016390044247879435, + "loss": 12.3683, + "step": 11054 + }, + { + "epoch": 0.6019889322251945, + "grad_norm": 0.6503239657646119, + "learning_rate": 0.0001638936591992029, + "loss": 12.3846, + "step": 11055 + }, + { + "epoch": 0.6020433862217776, + "grad_norm": 0.5894666821242075, + "learning_rate": 0.00016388687542276363, + "loss": 12.3637, + "step": 11056 + }, + { + "epoch": 0.6020978402183605, + "grad_norm": 0.5934108932985701, + "learning_rate": 0.00016388009114952929, + "loss": 12.3284, + "step": 11057 + }, + { + "epoch": 0.6021522942149435, + "grad_norm": 0.7523011319008442, + "learning_rate": 0.00016387330637955265, + "loss": 12.2661, + "step": 11058 + }, + { + "epoch": 0.6022067482115265, + "grad_norm": 0.647101465243184, + "learning_rate": 0.00016386652111288652, + "loss": 12.4377, + "step": 11059 + }, + { + "epoch": 0.6022612022081095, + "grad_norm": 0.5512134794367408, + "learning_rate": 0.00016385973534958356, + "loss": 12.3801, + "step": 11060 + }, + { + "epoch": 0.6023156562046925, + "grad_norm": 0.5902503412478513, + "learning_rate": 0.0001638529490896966, + "loss": 12.3326, + "step": 11061 + }, + { + "epoch": 0.6023701102012756, + "grad_norm": 0.633910662052685, + "learning_rate": 0.00016384616233327837, + "loss": 12.4507, + "step": 11062 + }, + { + "epoch": 0.6024245641978586, + "grad_norm": 0.5754040976327168, + "learning_rate": 0.00016383937508038173, + "loss": 12.3825, + "step": 11063 + }, + { + "epoch": 0.6024790181944416, + "grad_norm": 0.6043470862631053, + "learning_rate": 0.00016383258733105937, + "loss": 12.3094, + "step": 11064 + }, + { + "epoch": 0.6025334721910246, + "grad_norm": 0.6048248874531785, + "learning_rate": 0.00016382579908536413, + "loss": 12.2771, + "step": 11065 + }, + { + "epoch": 0.6025879261876076, + "grad_norm": 0.5452843984000415, + "learning_rate": 0.00016381901034334873, + "loss": 12.2863, + "step": 11066 + }, + { + "epoch": 0.6026423801841907, + "grad_norm": 0.6076571075716745, + "learning_rate": 0.00016381222110506603, + "loss": 12.3473, + "step": 11067 + }, + { + "epoch": 0.6026968341807737, + "grad_norm": 0.5652508244979979, + "learning_rate": 0.00016380543137056882, + "loss": 12.4141, + "step": 11068 + }, + { + "epoch": 0.6027512881773567, + "grad_norm": 0.585225819813063, + "learning_rate": 0.00016379864113990985, + "loss": 12.305, + "step": 11069 + }, + { + "epoch": 0.6028057421739397, + "grad_norm": 0.5795408250187182, + "learning_rate": 0.00016379185041314194, + "loss": 12.2756, + "step": 11070 + }, + { + "epoch": 0.6028601961705227, + "grad_norm": 0.5995362149946898, + "learning_rate": 0.00016378505919031794, + "loss": 12.3073, + "step": 11071 + }, + { + "epoch": 0.6029146501671057, + "grad_norm": 0.5508610806139238, + "learning_rate": 0.0001637782674714906, + "loss": 12.174, + "step": 11072 + }, + { + "epoch": 0.6029691041636888, + "grad_norm": 0.5783209754285752, + "learning_rate": 0.00016377147525671273, + "loss": 12.3101, + "step": 11073 + }, + { + "epoch": 0.6030235581602718, + "grad_norm": 0.5626050721429561, + "learning_rate": 0.0001637646825460372, + "loss": 12.2853, + "step": 11074 + }, + { + "epoch": 0.6030780121568547, + "grad_norm": 0.6027064270819751, + "learning_rate": 0.00016375788933951682, + "loss": 12.3111, + "step": 11075 + }, + { + "epoch": 0.6031324661534377, + "grad_norm": 0.6690295265995301, + "learning_rate": 0.00016375109563720436, + "loss": 12.3696, + "step": 11076 + }, + { + "epoch": 0.6031869201500207, + "grad_norm": 0.5837871491600592, + "learning_rate": 0.0001637443014391527, + "loss": 12.2745, + "step": 11077 + }, + { + "epoch": 0.6032413741466037, + "grad_norm": 0.5733628207555487, + "learning_rate": 0.00016373750674541468, + "loss": 12.2662, + "step": 11078 + }, + { + "epoch": 0.6032958281431868, + "grad_norm": 0.6237040428548752, + "learning_rate": 0.00016373071155604312, + "loss": 12.3948, + "step": 11079 + }, + { + "epoch": 0.6033502821397698, + "grad_norm": 0.6324278402150281, + "learning_rate": 0.00016372391587109086, + "loss": 12.4646, + "step": 11080 + }, + { + "epoch": 0.6034047361363528, + "grad_norm": 0.5900512368765062, + "learning_rate": 0.0001637171196906107, + "loss": 12.293, + "step": 11081 + }, + { + "epoch": 0.6034591901329358, + "grad_norm": 0.6510319242318467, + "learning_rate": 0.0001637103230146556, + "loss": 12.3156, + "step": 11082 + }, + { + "epoch": 0.6035136441295188, + "grad_norm": 0.603809943286475, + "learning_rate": 0.0001637035258432783, + "loss": 12.425, + "step": 11083 + }, + { + "epoch": 0.6035680981261018, + "grad_norm": 0.5907690636784768, + "learning_rate": 0.00016369672817653173, + "loss": 12.3971, + "step": 11084 + }, + { + "epoch": 0.6036225521226849, + "grad_norm": 0.6478519079380884, + "learning_rate": 0.0001636899300144687, + "loss": 12.3261, + "step": 11085 + }, + { + "epoch": 0.6036770061192679, + "grad_norm": 0.5575314006519871, + "learning_rate": 0.0001636831313571421, + "loss": 12.3727, + "step": 11086 + }, + { + "epoch": 0.6037314601158509, + "grad_norm": 0.6231567452468517, + "learning_rate": 0.00016367633220460478, + "loss": 12.4183, + "step": 11087 + }, + { + "epoch": 0.6037859141124339, + "grad_norm": 0.5987386128539546, + "learning_rate": 0.00016366953255690962, + "loss": 12.3108, + "step": 11088 + }, + { + "epoch": 0.6038403681090169, + "grad_norm": 0.5557211914285582, + "learning_rate": 0.00016366273241410952, + "loss": 12.1499, + "step": 11089 + }, + { + "epoch": 0.6038948221055999, + "grad_norm": 0.676169499379147, + "learning_rate": 0.0001636559317762573, + "loss": 12.4032, + "step": 11090 + }, + { + "epoch": 0.603949276102183, + "grad_norm": 0.5760384995265102, + "learning_rate": 0.0001636491306434059, + "loss": 12.2913, + "step": 11091 + }, + { + "epoch": 0.604003730098766, + "grad_norm": 0.5933103287988173, + "learning_rate": 0.0001636423290156082, + "loss": 12.428, + "step": 11092 + }, + { + "epoch": 0.604058184095349, + "grad_norm": 0.7065631146403468, + "learning_rate": 0.00016363552689291705, + "loss": 12.2464, + "step": 11093 + }, + { + "epoch": 0.6041126380919319, + "grad_norm": 0.610605509954838, + "learning_rate": 0.0001636287242753854, + "loss": 12.333, + "step": 11094 + }, + { + "epoch": 0.6041670920885149, + "grad_norm": 0.5975864259914512, + "learning_rate": 0.00016362192116306612, + "loss": 12.2726, + "step": 11095 + }, + { + "epoch": 0.604221546085098, + "grad_norm": 0.6099596482197528, + "learning_rate": 0.00016361511755601206, + "loss": 12.3904, + "step": 11096 + }, + { + "epoch": 0.604276000081681, + "grad_norm": 0.5486189081414413, + "learning_rate": 0.00016360831345427622, + "loss": 12.3003, + "step": 11097 + }, + { + "epoch": 0.604330454078264, + "grad_norm": 0.6590723996025494, + "learning_rate": 0.0001636015088579115, + "loss": 12.3686, + "step": 11098 + }, + { + "epoch": 0.604384908074847, + "grad_norm": 0.5965197006105573, + "learning_rate": 0.00016359470376697073, + "loss": 12.3768, + "step": 11099 + }, + { + "epoch": 0.60443936207143, + "grad_norm": 0.6000795522191682, + "learning_rate": 0.00016358789818150688, + "loss": 12.3006, + "step": 11100 + }, + { + "epoch": 0.604493816068013, + "grad_norm": 0.5966457566072743, + "learning_rate": 0.0001635810921015729, + "loss": 12.3902, + "step": 11101 + }, + { + "epoch": 0.6045482700645961, + "grad_norm": 0.5723338160767066, + "learning_rate": 0.00016357428552722165, + "loss": 12.2545, + "step": 11102 + }, + { + "epoch": 0.6046027240611791, + "grad_norm": 0.5917006614386284, + "learning_rate": 0.0001635674784585061, + "loss": 12.3173, + "step": 11103 + }, + { + "epoch": 0.6046571780577621, + "grad_norm": 0.6300577844312848, + "learning_rate": 0.00016356067089547919, + "loss": 12.4326, + "step": 11104 + }, + { + "epoch": 0.6047116320543451, + "grad_norm": 0.5941836003594154, + "learning_rate": 0.00016355386283819386, + "loss": 12.3251, + "step": 11105 + }, + { + "epoch": 0.6047660860509281, + "grad_norm": 0.6859987389573252, + "learning_rate": 0.000163547054286703, + "loss": 12.3612, + "step": 11106 + }, + { + "epoch": 0.6048205400475111, + "grad_norm": 0.7555113851041918, + "learning_rate": 0.0001635402452410596, + "loss": 12.3068, + "step": 11107 + }, + { + "epoch": 0.6048749940440942, + "grad_norm": 0.7143705006050696, + "learning_rate": 0.0001635334357013166, + "loss": 12.3739, + "step": 11108 + }, + { + "epoch": 0.6049294480406772, + "grad_norm": 0.6897982973385011, + "learning_rate": 0.00016352662566752698, + "loss": 12.384, + "step": 11109 + }, + { + "epoch": 0.6049839020372602, + "grad_norm": 0.6588045553449117, + "learning_rate": 0.00016351981513974365, + "loss": 12.4431, + "step": 11110 + }, + { + "epoch": 0.6050383560338431, + "grad_norm": 0.5990230565766719, + "learning_rate": 0.00016351300411801954, + "loss": 12.3734, + "step": 11111 + }, + { + "epoch": 0.6050928100304261, + "grad_norm": 0.6978925280375029, + "learning_rate": 0.00016350619260240769, + "loss": 12.3558, + "step": 11112 + }, + { + "epoch": 0.6051472640270091, + "grad_norm": 0.5959688891627724, + "learning_rate": 0.00016349938059296104, + "loss": 12.2814, + "step": 11113 + }, + { + "epoch": 0.6052017180235922, + "grad_norm": 0.6048401762930573, + "learning_rate": 0.00016349256808973256, + "loss": 12.2942, + "step": 11114 + }, + { + "epoch": 0.6052561720201752, + "grad_norm": 0.6237154293696561, + "learning_rate": 0.00016348575509277522, + "loss": 12.3011, + "step": 11115 + }, + { + "epoch": 0.6053106260167582, + "grad_norm": 0.5894504081881295, + "learning_rate": 0.000163478941602142, + "loss": 12.4032, + "step": 11116 + }, + { + "epoch": 0.6053650800133412, + "grad_norm": 0.6673670271952441, + "learning_rate": 0.0001634721276178859, + "loss": 12.3395, + "step": 11117 + }, + { + "epoch": 0.6054195340099242, + "grad_norm": 0.5951869230383666, + "learning_rate": 0.00016346531314005987, + "loss": 12.3577, + "step": 11118 + }, + { + "epoch": 0.6054739880065072, + "grad_norm": 0.6334316531072363, + "learning_rate": 0.00016345849816871692, + "loss": 12.3382, + "step": 11119 + }, + { + "epoch": 0.6055284420030903, + "grad_norm": 0.6162645161211552, + "learning_rate": 0.00016345168270391004, + "loss": 12.2919, + "step": 11120 + }, + { + "epoch": 0.6055828959996733, + "grad_norm": 0.5205672866936765, + "learning_rate": 0.00016344486674569226, + "loss": 12.327, + "step": 11121 + }, + { + "epoch": 0.6056373499962563, + "grad_norm": 0.5930119109328588, + "learning_rate": 0.00016343805029411654, + "loss": 12.3136, + "step": 11122 + }, + { + "epoch": 0.6056918039928393, + "grad_norm": 0.5917678881795895, + "learning_rate": 0.00016343123334923592, + "loss": 12.3358, + "step": 11123 + }, + { + "epoch": 0.6057462579894223, + "grad_norm": 0.6278136764825851, + "learning_rate": 0.0001634244159111034, + "loss": 12.4578, + "step": 11124 + }, + { + "epoch": 0.6058007119860053, + "grad_norm": 0.5941650915734099, + "learning_rate": 0.00016341759797977194, + "loss": 12.4215, + "step": 11125 + }, + { + "epoch": 0.6058551659825884, + "grad_norm": 0.6487586325937854, + "learning_rate": 0.00016341077955529465, + "loss": 12.4542, + "step": 11126 + }, + { + "epoch": 0.6059096199791714, + "grad_norm": 0.6134015359441436, + "learning_rate": 0.00016340396063772449, + "loss": 12.3903, + "step": 11127 + }, + { + "epoch": 0.6059640739757544, + "grad_norm": 0.6836090618022848, + "learning_rate": 0.00016339714122711447, + "loss": 12.36, + "step": 11128 + }, + { + "epoch": 0.6060185279723374, + "grad_norm": 0.6845856838845686, + "learning_rate": 0.00016339032132351767, + "loss": 12.2901, + "step": 11129 + }, + { + "epoch": 0.6060729819689203, + "grad_norm": 0.5640102141813625, + "learning_rate": 0.0001633835009269871, + "loss": 12.3379, + "step": 11130 + }, + { + "epoch": 0.6061274359655034, + "grad_norm": 0.5737426198031063, + "learning_rate": 0.0001633766800375758, + "loss": 12.3716, + "step": 11131 + }, + { + "epoch": 0.6061818899620864, + "grad_norm": 0.5824786880831666, + "learning_rate": 0.00016336985865533682, + "loss": 12.4523, + "step": 11132 + }, + { + "epoch": 0.6062363439586694, + "grad_norm": 0.5329751071521219, + "learning_rate": 0.00016336303678032317, + "loss": 12.1134, + "step": 11133 + }, + { + "epoch": 0.6062907979552524, + "grad_norm": 0.6714939419589115, + "learning_rate": 0.00016335621441258792, + "loss": 12.3629, + "step": 11134 + }, + { + "epoch": 0.6063452519518354, + "grad_norm": 0.7119477262638965, + "learning_rate": 0.00016334939155218415, + "loss": 12.4187, + "step": 11135 + }, + { + "epoch": 0.6063997059484184, + "grad_norm": 0.6086057695680839, + "learning_rate": 0.00016334256819916485, + "loss": 12.368, + "step": 11136 + }, + { + "epoch": 0.6064541599450015, + "grad_norm": 0.6078255268695479, + "learning_rate": 0.00016333574435358313, + "loss": 12.4414, + "step": 11137 + }, + { + "epoch": 0.6065086139415845, + "grad_norm": 0.6020225475285034, + "learning_rate": 0.00016332892001549206, + "loss": 12.2573, + "step": 11138 + }, + { + "epoch": 0.6065630679381675, + "grad_norm": 0.7082157627835164, + "learning_rate": 0.00016332209518494468, + "loss": 12.4737, + "step": 11139 + }, + { + "epoch": 0.6066175219347505, + "grad_norm": 0.5873016676995043, + "learning_rate": 0.00016331526986199406, + "loss": 12.4236, + "step": 11140 + }, + { + "epoch": 0.6066719759313335, + "grad_norm": 0.631074673123381, + "learning_rate": 0.00016330844404669327, + "loss": 12.4155, + "step": 11141 + }, + { + "epoch": 0.6067264299279165, + "grad_norm": 0.5926052711170497, + "learning_rate": 0.00016330161773909542, + "loss": 12.3943, + "step": 11142 + }, + { + "epoch": 0.6067808839244996, + "grad_norm": 0.7595425087844859, + "learning_rate": 0.00016329479093925357, + "loss": 12.4255, + "step": 11143 + }, + { + "epoch": 0.6068353379210826, + "grad_norm": 0.6165882148007943, + "learning_rate": 0.00016328796364722083, + "loss": 12.3621, + "step": 11144 + }, + { + "epoch": 0.6068897919176656, + "grad_norm": 0.5348979092332994, + "learning_rate": 0.00016328113586305026, + "loss": 12.3285, + "step": 11145 + }, + { + "epoch": 0.6069442459142486, + "grad_norm": 0.5812507187291274, + "learning_rate": 0.00016327430758679494, + "loss": 12.3238, + "step": 11146 + }, + { + "epoch": 0.6069986999108316, + "grad_norm": 0.57940112965112, + "learning_rate": 0.00016326747881850802, + "loss": 12.3121, + "step": 11147 + }, + { + "epoch": 0.6070531539074145, + "grad_norm": 0.5617843893592939, + "learning_rate": 0.00016326064955824257, + "loss": 12.3478, + "step": 11148 + }, + { + "epoch": 0.6071076079039976, + "grad_norm": 0.5391895305756338, + "learning_rate": 0.0001632538198060517, + "loss": 12.3156, + "step": 11149 + }, + { + "epoch": 0.6071620619005806, + "grad_norm": 0.5712370867105543, + "learning_rate": 0.0001632469895619885, + "loss": 12.2493, + "step": 11150 + }, + { + "epoch": 0.6072165158971636, + "grad_norm": 0.613671361377228, + "learning_rate": 0.00016324015882610615, + "loss": 12.3102, + "step": 11151 + }, + { + "epoch": 0.6072709698937466, + "grad_norm": 0.6108727049954086, + "learning_rate": 0.00016323332759845765, + "loss": 12.3195, + "step": 11152 + }, + { + "epoch": 0.6073254238903296, + "grad_norm": 0.5904626345424183, + "learning_rate": 0.00016322649587909623, + "loss": 12.3891, + "step": 11153 + }, + { + "epoch": 0.6073798778869126, + "grad_norm": 0.5562410258535025, + "learning_rate": 0.000163219663668075, + "loss": 12.258, + "step": 11154 + }, + { + "epoch": 0.6074343318834957, + "grad_norm": 0.5573520133764401, + "learning_rate": 0.00016321283096544704, + "loss": 12.3414, + "step": 11155 + }, + { + "epoch": 0.6074887858800787, + "grad_norm": 0.6867050860705599, + "learning_rate": 0.00016320599777126552, + "loss": 12.3879, + "step": 11156 + }, + { + "epoch": 0.6075432398766617, + "grad_norm": 0.6148388888394638, + "learning_rate": 0.00016319916408558352, + "loss": 12.2203, + "step": 11157 + }, + { + "epoch": 0.6075976938732447, + "grad_norm": 0.6316122300461814, + "learning_rate": 0.00016319232990845425, + "loss": 12.3192, + "step": 11158 + }, + { + "epoch": 0.6076521478698277, + "grad_norm": 0.5381826224839812, + "learning_rate": 0.00016318549523993084, + "loss": 12.1295, + "step": 11159 + }, + { + "epoch": 0.6077066018664107, + "grad_norm": 0.639597752296276, + "learning_rate": 0.00016317866008006639, + "loss": 12.3341, + "step": 11160 + }, + { + "epoch": 0.6077610558629938, + "grad_norm": 0.5850046410800002, + "learning_rate": 0.0001631718244289141, + "loss": 12.3696, + "step": 11161 + }, + { + "epoch": 0.6078155098595768, + "grad_norm": 0.5409904898717542, + "learning_rate": 0.00016316498828652712, + "loss": 12.2983, + "step": 11162 + }, + { + "epoch": 0.6078699638561598, + "grad_norm": 0.6560920448327437, + "learning_rate": 0.00016315815165295855, + "loss": 12.3445, + "step": 11163 + }, + { + "epoch": 0.6079244178527428, + "grad_norm": 0.5881682714143308, + "learning_rate": 0.00016315131452826167, + "loss": 12.2529, + "step": 11164 + }, + { + "epoch": 0.6079788718493258, + "grad_norm": 0.6556078342042584, + "learning_rate": 0.00016314447691248956, + "loss": 12.4407, + "step": 11165 + }, + { + "epoch": 0.6080333258459089, + "grad_norm": 0.5998152193241213, + "learning_rate": 0.00016313763880569537, + "loss": 12.1111, + "step": 11166 + }, + { + "epoch": 0.6080877798424918, + "grad_norm": 0.6040623193833304, + "learning_rate": 0.00016313080020793235, + "loss": 12.2528, + "step": 11167 + }, + { + "epoch": 0.6081422338390748, + "grad_norm": 0.565811403827649, + "learning_rate": 0.00016312396111925362, + "loss": 12.3451, + "step": 11168 + }, + { + "epoch": 0.6081966878356578, + "grad_norm": 0.6303795224857877, + "learning_rate": 0.00016311712153971238, + "loss": 12.3896, + "step": 11169 + }, + { + "epoch": 0.6082511418322408, + "grad_norm": 0.6103204271982208, + "learning_rate": 0.00016311028146936184, + "loss": 12.3071, + "step": 11170 + }, + { + "epoch": 0.6083055958288238, + "grad_norm": 0.5907392547073242, + "learning_rate": 0.00016310344090825516, + "loss": 12.3801, + "step": 11171 + }, + { + "epoch": 0.6083600498254069, + "grad_norm": 0.5898616925852171, + "learning_rate": 0.00016309659985644555, + "loss": 12.2173, + "step": 11172 + }, + { + "epoch": 0.6084145038219899, + "grad_norm": 0.6721267658389909, + "learning_rate": 0.00016308975831398617, + "loss": 12.4195, + "step": 11173 + }, + { + "epoch": 0.6084689578185729, + "grad_norm": 0.5737286379187838, + "learning_rate": 0.00016308291628093025, + "loss": 12.2979, + "step": 11174 + }, + { + "epoch": 0.6085234118151559, + "grad_norm": 0.5948908660886684, + "learning_rate": 0.00016307607375733103, + "loss": 12.4614, + "step": 11175 + }, + { + "epoch": 0.6085778658117389, + "grad_norm": 0.5847183241982178, + "learning_rate": 0.00016306923074324166, + "loss": 12.1829, + "step": 11176 + }, + { + "epoch": 0.6086323198083219, + "grad_norm": 0.5348038205117897, + "learning_rate": 0.00016306238723871536, + "loss": 12.2261, + "step": 11177 + }, + { + "epoch": 0.608686773804905, + "grad_norm": 0.7186846714099959, + "learning_rate": 0.00016305554324380536, + "loss": 12.3023, + "step": 11178 + }, + { + "epoch": 0.608741227801488, + "grad_norm": 0.656779753977201, + "learning_rate": 0.00016304869875856493, + "loss": 12.4631, + "step": 11179 + }, + { + "epoch": 0.608795681798071, + "grad_norm": 0.6612372166326377, + "learning_rate": 0.0001630418537830472, + "loss": 12.3458, + "step": 11180 + }, + { + "epoch": 0.608850135794654, + "grad_norm": 0.6116698754329289, + "learning_rate": 0.00016303500831730546, + "loss": 12.2474, + "step": 11181 + }, + { + "epoch": 0.608904589791237, + "grad_norm": 0.6175693550944324, + "learning_rate": 0.00016302816236139292, + "loss": 12.3383, + "step": 11182 + }, + { + "epoch": 0.60895904378782, + "grad_norm": 0.6838026604965294, + "learning_rate": 0.0001630213159153628, + "loss": 12.3036, + "step": 11183 + }, + { + "epoch": 0.6090134977844031, + "grad_norm": 0.6209752444596405, + "learning_rate": 0.0001630144689792684, + "loss": 12.2806, + "step": 11184 + }, + { + "epoch": 0.609067951780986, + "grad_norm": 0.6992091942059789, + "learning_rate": 0.0001630076215531629, + "loss": 12.3381, + "step": 11185 + }, + { + "epoch": 0.609122405777569, + "grad_norm": 0.6464572432173553, + "learning_rate": 0.0001630007736370996, + "loss": 12.3463, + "step": 11186 + }, + { + "epoch": 0.609176859774152, + "grad_norm": 0.6038931960005121, + "learning_rate": 0.00016299392523113165, + "loss": 12.2283, + "step": 11187 + }, + { + "epoch": 0.609231313770735, + "grad_norm": 0.6692542463517684, + "learning_rate": 0.00016298707633531244, + "loss": 12.1545, + "step": 11188 + }, + { + "epoch": 0.609285767767318, + "grad_norm": 0.5934411500079831, + "learning_rate": 0.0001629802269496951, + "loss": 12.279, + "step": 11189 + }, + { + "epoch": 0.6093402217639011, + "grad_norm": 0.6520832855912216, + "learning_rate": 0.000162973377074333, + "loss": 12.4605, + "step": 11190 + }, + { + "epoch": 0.6093946757604841, + "grad_norm": 0.6356145983590724, + "learning_rate": 0.00016296652670927934, + "loss": 12.3357, + "step": 11191 + }, + { + "epoch": 0.6094491297570671, + "grad_norm": 0.6684675532985174, + "learning_rate": 0.00016295967585458742, + "loss": 12.3771, + "step": 11192 + }, + { + "epoch": 0.6095035837536501, + "grad_norm": 0.5676216790776536, + "learning_rate": 0.00016295282451031048, + "loss": 12.313, + "step": 11193 + }, + { + "epoch": 0.6095580377502331, + "grad_norm": 0.6318426817865253, + "learning_rate": 0.00016294597267650185, + "loss": 12.2556, + "step": 11194 + }, + { + "epoch": 0.6096124917468161, + "grad_norm": 0.546346474512078, + "learning_rate": 0.00016293912035321477, + "loss": 12.3492, + "step": 11195 + }, + { + "epoch": 0.6096669457433992, + "grad_norm": 0.5597620710316814, + "learning_rate": 0.00016293226754050252, + "loss": 12.2942, + "step": 11196 + }, + { + "epoch": 0.6097213997399822, + "grad_norm": 0.6019576331614028, + "learning_rate": 0.00016292541423841843, + "loss": 12.4731, + "step": 11197 + }, + { + "epoch": 0.6097758537365652, + "grad_norm": 0.6087530645396827, + "learning_rate": 0.00016291856044701574, + "loss": 12.414, + "step": 11198 + }, + { + "epoch": 0.6098303077331482, + "grad_norm": 0.5511451580189743, + "learning_rate": 0.0001629117061663478, + "loss": 12.3977, + "step": 11199 + }, + { + "epoch": 0.6098847617297312, + "grad_norm": 0.5673407108230473, + "learning_rate": 0.00016290485139646788, + "loss": 12.3911, + "step": 11200 + }, + { + "epoch": 0.6099392157263143, + "grad_norm": 0.568847143885813, + "learning_rate": 0.00016289799613742925, + "loss": 12.2846, + "step": 11201 + }, + { + "epoch": 0.6099936697228973, + "grad_norm": 0.7944651261831513, + "learning_rate": 0.0001628911403892853, + "loss": 12.4732, + "step": 11202 + }, + { + "epoch": 0.6100481237194803, + "grad_norm": 0.646189949260018, + "learning_rate": 0.00016288428415208925, + "loss": 12.3322, + "step": 11203 + }, + { + "epoch": 0.6101025777160632, + "grad_norm": 0.5732514537043685, + "learning_rate": 0.0001628774274258945, + "loss": 12.3178, + "step": 11204 + }, + { + "epoch": 0.6101570317126462, + "grad_norm": 0.654862812790952, + "learning_rate": 0.00016287057021075428, + "loss": 12.3292, + "step": 11205 + }, + { + "epoch": 0.6102114857092292, + "grad_norm": 0.6568052904002022, + "learning_rate": 0.00016286371250672201, + "loss": 12.4505, + "step": 11206 + }, + { + "epoch": 0.6102659397058123, + "grad_norm": 0.6089929626588377, + "learning_rate": 0.00016285685431385096, + "loss": 12.2601, + "step": 11207 + }, + { + "epoch": 0.6103203937023953, + "grad_norm": 0.6441744367395216, + "learning_rate": 0.00016284999563219446, + "loss": 12.2596, + "step": 11208 + }, + { + "epoch": 0.6103748476989783, + "grad_norm": 0.5886980647496863, + "learning_rate": 0.00016284313646180586, + "loss": 12.4004, + "step": 11209 + }, + { + "epoch": 0.6104293016955613, + "grad_norm": 0.6567455919150536, + "learning_rate": 0.0001628362768027385, + "loss": 12.4423, + "step": 11210 + }, + { + "epoch": 0.6104837556921443, + "grad_norm": 0.6434126784891697, + "learning_rate": 0.0001628294166550457, + "loss": 12.3281, + "step": 11211 + }, + { + "epoch": 0.6105382096887273, + "grad_norm": 0.5550889914663049, + "learning_rate": 0.00016282255601878082, + "loss": 12.2448, + "step": 11212 + }, + { + "epoch": 0.6105926636853104, + "grad_norm": 0.658643938541344, + "learning_rate": 0.0001628156948939972, + "loss": 12.1808, + "step": 11213 + }, + { + "epoch": 0.6106471176818934, + "grad_norm": 0.5953183504399878, + "learning_rate": 0.00016280883328074824, + "loss": 12.3536, + "step": 11214 + }, + { + "epoch": 0.6107015716784764, + "grad_norm": 0.6197836107074511, + "learning_rate": 0.00016280197117908723, + "loss": 12.3475, + "step": 11215 + }, + { + "epoch": 0.6107560256750594, + "grad_norm": 0.6536735721454394, + "learning_rate": 0.00016279510858906756, + "loss": 12.3812, + "step": 11216 + }, + { + "epoch": 0.6108104796716424, + "grad_norm": 0.5736370991155088, + "learning_rate": 0.00016278824551074262, + "loss": 12.3155, + "step": 11217 + }, + { + "epoch": 0.6108649336682254, + "grad_norm": 0.650879421311335, + "learning_rate": 0.0001627813819441657, + "loss": 12.3652, + "step": 11218 + }, + { + "epoch": 0.6109193876648085, + "grad_norm": 0.640251354561717, + "learning_rate": 0.00016277451788939028, + "loss": 12.1254, + "step": 11219 + }, + { + "epoch": 0.6109738416613915, + "grad_norm": 0.5670359796552973, + "learning_rate": 0.00016276765334646967, + "loss": 12.3489, + "step": 11220 + }, + { + "epoch": 0.6110282956579745, + "grad_norm": 0.6384343722609144, + "learning_rate": 0.00016276078831545726, + "loss": 12.2302, + "step": 11221 + }, + { + "epoch": 0.6110827496545574, + "grad_norm": 0.6596286383025379, + "learning_rate": 0.00016275392279640642, + "loss": 12.3865, + "step": 11222 + }, + { + "epoch": 0.6111372036511404, + "grad_norm": 0.5891279826921046, + "learning_rate": 0.00016274705678937057, + "loss": 12.2326, + "step": 11223 + }, + { + "epoch": 0.6111916576477234, + "grad_norm": 0.6565771235725542, + "learning_rate": 0.00016274019029440307, + "loss": 12.3797, + "step": 11224 + }, + { + "epoch": 0.6112461116443065, + "grad_norm": 0.5606421552391951, + "learning_rate": 0.00016273332331155732, + "loss": 12.2621, + "step": 11225 + }, + { + "epoch": 0.6113005656408895, + "grad_norm": 0.5888887080072942, + "learning_rate": 0.00016272645584088674, + "loss": 12.3646, + "step": 11226 + }, + { + "epoch": 0.6113550196374725, + "grad_norm": 0.5660003854592005, + "learning_rate": 0.00016271958788244474, + "loss": 12.3718, + "step": 11227 + }, + { + "epoch": 0.6114094736340555, + "grad_norm": 0.5565818029336428, + "learning_rate": 0.00016271271943628466, + "loss": 12.3108, + "step": 11228 + }, + { + "epoch": 0.6114639276306385, + "grad_norm": 0.6250573083091369, + "learning_rate": 0.00016270585050245999, + "loss": 12.414, + "step": 11229 + }, + { + "epoch": 0.6115183816272216, + "grad_norm": 0.5887684866794202, + "learning_rate": 0.00016269898108102414, + "loss": 12.4102, + "step": 11230 + }, + { + "epoch": 0.6115728356238046, + "grad_norm": 0.6040457465405051, + "learning_rate": 0.00016269211117203044, + "loss": 12.4095, + "step": 11231 + }, + { + "epoch": 0.6116272896203876, + "grad_norm": 0.6114239519382482, + "learning_rate": 0.00016268524077553238, + "loss": 12.2274, + "step": 11232 + }, + { + "epoch": 0.6116817436169706, + "grad_norm": 0.5965745055703814, + "learning_rate": 0.00016267836989158338, + "loss": 12.2036, + "step": 11233 + }, + { + "epoch": 0.6117361976135536, + "grad_norm": 0.6143799913864648, + "learning_rate": 0.0001626714985202369, + "loss": 12.4022, + "step": 11234 + }, + { + "epoch": 0.6117906516101366, + "grad_norm": 0.6054444200965224, + "learning_rate": 0.0001626646266615463, + "loss": 12.2948, + "step": 11235 + }, + { + "epoch": 0.6118451056067197, + "grad_norm": 0.5873370076509535, + "learning_rate": 0.00016265775431556506, + "loss": 12.3566, + "step": 11236 + }, + { + "epoch": 0.6118995596033027, + "grad_norm": 0.6543434983368583, + "learning_rate": 0.0001626508814823466, + "loss": 12.2912, + "step": 11237 + }, + { + "epoch": 0.6119540135998857, + "grad_norm": 0.622245962456289, + "learning_rate": 0.0001626440081619444, + "loss": 12.3253, + "step": 11238 + }, + { + "epoch": 0.6120084675964687, + "grad_norm": 0.5760824372813074, + "learning_rate": 0.00016263713435441188, + "loss": 12.3145, + "step": 11239 + }, + { + "epoch": 0.6120629215930516, + "grad_norm": 0.6597670651926038, + "learning_rate": 0.00016263026005980253, + "loss": 12.3367, + "step": 11240 + }, + { + "epoch": 0.6121173755896346, + "grad_norm": 0.67169983719772, + "learning_rate": 0.00016262338527816972, + "loss": 12.0397, + "step": 11241 + }, + { + "epoch": 0.6121718295862177, + "grad_norm": 0.6221931850437608, + "learning_rate": 0.00016261651000956703, + "loss": 12.2584, + "step": 11242 + }, + { + "epoch": 0.6122262835828007, + "grad_norm": 0.6342830763189989, + "learning_rate": 0.0001626096342540478, + "loss": 12.3914, + "step": 11243 + }, + { + "epoch": 0.6122807375793837, + "grad_norm": 0.567870951485226, + "learning_rate": 0.0001626027580116656, + "loss": 12.2923, + "step": 11244 + }, + { + "epoch": 0.6123351915759667, + "grad_norm": 0.5671950134759478, + "learning_rate": 0.0001625958812824738, + "loss": 12.3838, + "step": 11245 + }, + { + "epoch": 0.6123896455725497, + "grad_norm": 0.6211012286514693, + "learning_rate": 0.00016258900406652596, + "loss": 12.4002, + "step": 11246 + }, + { + "epoch": 0.6124440995691327, + "grad_norm": 0.619630309091858, + "learning_rate": 0.00016258212636387556, + "loss": 12.3923, + "step": 11247 + }, + { + "epoch": 0.6124985535657158, + "grad_norm": 0.6709943689223554, + "learning_rate": 0.00016257524817457598, + "loss": 12.1034, + "step": 11248 + }, + { + "epoch": 0.6125530075622988, + "grad_norm": 0.5728013482326856, + "learning_rate": 0.00016256836949868082, + "loss": 12.2234, + "step": 11249 + }, + { + "epoch": 0.6126074615588818, + "grad_norm": 0.5378034627203535, + "learning_rate": 0.00016256149033624354, + "loss": 12.3361, + "step": 11250 + }, + { + "epoch": 0.6126619155554648, + "grad_norm": 0.5506895229670732, + "learning_rate": 0.0001625546106873176, + "loss": 12.1849, + "step": 11251 + }, + { + "epoch": 0.6127163695520478, + "grad_norm": 0.6064870841906332, + "learning_rate": 0.00016254773055195652, + "loss": 12.3334, + "step": 11252 + }, + { + "epoch": 0.6127708235486308, + "grad_norm": 0.6667096903245954, + "learning_rate": 0.0001625408499302138, + "loss": 12.4349, + "step": 11253 + }, + { + "epoch": 0.6128252775452139, + "grad_norm": 0.6123349006803258, + "learning_rate": 0.00016253396882214292, + "loss": 12.415, + "step": 11254 + }, + { + "epoch": 0.6128797315417969, + "grad_norm": 0.5828800827037145, + "learning_rate": 0.00016252708722779742, + "loss": 12.2713, + "step": 11255 + }, + { + "epoch": 0.6129341855383799, + "grad_norm": 0.5810626939893756, + "learning_rate": 0.00016252020514723084, + "loss": 12.2411, + "step": 11256 + }, + { + "epoch": 0.6129886395349629, + "grad_norm": 0.610484695656065, + "learning_rate": 0.00016251332258049664, + "loss": 12.3313, + "step": 11257 + }, + { + "epoch": 0.6130430935315458, + "grad_norm": 0.5960070240782448, + "learning_rate": 0.00016250643952764833, + "loss": 12.379, + "step": 11258 + }, + { + "epoch": 0.6130975475281288, + "grad_norm": 0.6448646995063299, + "learning_rate": 0.00016249955598873948, + "loss": 12.3597, + "step": 11259 + }, + { + "epoch": 0.6131520015247119, + "grad_norm": 0.5791912269801887, + "learning_rate": 0.00016249267196382362, + "loss": 12.2723, + "step": 11260 + }, + { + "epoch": 0.6132064555212949, + "grad_norm": 0.5642981121650137, + "learning_rate": 0.00016248578745295427, + "loss": 12.3564, + "step": 11261 + }, + { + "epoch": 0.6132609095178779, + "grad_norm": 0.6165409779113825, + "learning_rate": 0.00016247890245618492, + "loss": 12.2522, + "step": 11262 + }, + { + "epoch": 0.6133153635144609, + "grad_norm": 0.627470761262432, + "learning_rate": 0.00016247201697356918, + "loss": 12.3381, + "step": 11263 + }, + { + "epoch": 0.6133698175110439, + "grad_norm": 0.6276131776598461, + "learning_rate": 0.00016246513100516053, + "loss": 12.2903, + "step": 11264 + }, + { + "epoch": 0.613424271507627, + "grad_norm": 0.6311633542362459, + "learning_rate": 0.00016245824455101256, + "loss": 12.385, + "step": 11265 + }, + { + "epoch": 0.61347872550421, + "grad_norm": 0.6036525845880586, + "learning_rate": 0.00016245135761117882, + "loss": 12.3985, + "step": 11266 + }, + { + "epoch": 0.613533179500793, + "grad_norm": 0.6372653976777525, + "learning_rate": 0.00016244447018571285, + "loss": 12.3874, + "step": 11267 + }, + { + "epoch": 0.613587633497376, + "grad_norm": 0.6120876586946324, + "learning_rate": 0.00016243758227466816, + "loss": 12.3954, + "step": 11268 + }, + { + "epoch": 0.613642087493959, + "grad_norm": 0.5670362688938073, + "learning_rate": 0.00016243069387809843, + "loss": 12.2247, + "step": 11269 + }, + { + "epoch": 0.613696541490542, + "grad_norm": 0.6061437844694163, + "learning_rate": 0.00016242380499605712, + "loss": 12.3352, + "step": 11270 + }, + { + "epoch": 0.6137509954871251, + "grad_norm": 0.6213968616718148, + "learning_rate": 0.00016241691562859782, + "loss": 12.2934, + "step": 11271 + }, + { + "epoch": 0.6138054494837081, + "grad_norm": 0.572983281791859, + "learning_rate": 0.00016241002577577413, + "loss": 12.296, + "step": 11272 + }, + { + "epoch": 0.6138599034802911, + "grad_norm": 0.565001252538547, + "learning_rate": 0.0001624031354376396, + "loss": 12.3484, + "step": 11273 + }, + { + "epoch": 0.6139143574768741, + "grad_norm": 0.6102422361467659, + "learning_rate": 0.00016239624461424784, + "loss": 12.5153, + "step": 11274 + }, + { + "epoch": 0.613968811473457, + "grad_norm": 0.791363759356964, + "learning_rate": 0.00016238935330565243, + "loss": 12.4972, + "step": 11275 + }, + { + "epoch": 0.61402326547004, + "grad_norm": 0.5937446958082347, + "learning_rate": 0.00016238246151190696, + "loss": 12.1792, + "step": 11276 + }, + { + "epoch": 0.6140777194666232, + "grad_norm": 0.6128242185531629, + "learning_rate": 0.00016237556923306496, + "loss": 12.4127, + "step": 11277 + }, + { + "epoch": 0.6141321734632061, + "grad_norm": 0.5133072704141306, + "learning_rate": 0.00016236867646918007, + "loss": 12.2986, + "step": 11278 + }, + { + "epoch": 0.6141866274597891, + "grad_norm": 0.616740205009282, + "learning_rate": 0.00016236178322030594, + "loss": 12.4195, + "step": 11279 + }, + { + "epoch": 0.6142410814563721, + "grad_norm": 0.6902749010715484, + "learning_rate": 0.0001623548894864961, + "loss": 12.2448, + "step": 11280 + }, + { + "epoch": 0.6142955354529551, + "grad_norm": 0.5800100682633578, + "learning_rate": 0.00016234799526780418, + "loss": 12.3209, + "step": 11281 + }, + { + "epoch": 0.6143499894495381, + "grad_norm": 0.5863122766710325, + "learning_rate": 0.0001623411005642838, + "loss": 12.2777, + "step": 11282 + }, + { + "epoch": 0.6144044434461212, + "grad_norm": 0.5681102531243815, + "learning_rate": 0.00016233420537598855, + "loss": 12.4094, + "step": 11283 + }, + { + "epoch": 0.6144588974427042, + "grad_norm": 0.5431159450349899, + "learning_rate": 0.0001623273097029721, + "loss": 12.3014, + "step": 11284 + }, + { + "epoch": 0.6145133514392872, + "grad_norm": 0.5311266917053576, + "learning_rate": 0.00016232041354528802, + "loss": 12.2448, + "step": 11285 + }, + { + "epoch": 0.6145678054358702, + "grad_norm": 0.610874460094281, + "learning_rate": 0.00016231351690298995, + "loss": 12.4282, + "step": 11286 + }, + { + "epoch": 0.6146222594324532, + "grad_norm": 0.6480890556816988, + "learning_rate": 0.0001623066197761315, + "loss": 12.4236, + "step": 11287 + }, + { + "epoch": 0.6146767134290362, + "grad_norm": 0.6069702570834702, + "learning_rate": 0.00016229972216476635, + "loss": 12.4945, + "step": 11288 + }, + { + "epoch": 0.6147311674256193, + "grad_norm": 0.5934019147160708, + "learning_rate": 0.00016229282406894811, + "loss": 12.3161, + "step": 11289 + }, + { + "epoch": 0.6147856214222023, + "grad_norm": 0.5672283273861385, + "learning_rate": 0.00016228592548873043, + "loss": 12.2918, + "step": 11290 + }, + { + "epoch": 0.6148400754187853, + "grad_norm": 0.7098678134884511, + "learning_rate": 0.00016227902642416695, + "loss": 12.3796, + "step": 11291 + }, + { + "epoch": 0.6148945294153683, + "grad_norm": 0.5864296809404334, + "learning_rate": 0.0001622721268753113, + "loss": 12.2628, + "step": 11292 + }, + { + "epoch": 0.6149489834119513, + "grad_norm": 0.5438277220093632, + "learning_rate": 0.00016226522684221716, + "loss": 12.2331, + "step": 11293 + }, + { + "epoch": 0.6150034374085342, + "grad_norm": 0.6313422982743069, + "learning_rate": 0.00016225832632493819, + "loss": 12.194, + "step": 11294 + }, + { + "epoch": 0.6150578914051174, + "grad_norm": 0.5614566345902298, + "learning_rate": 0.00016225142532352803, + "loss": 12.4039, + "step": 11295 + }, + { + "epoch": 0.6151123454017003, + "grad_norm": 0.5955111725120733, + "learning_rate": 0.0001622445238380403, + "loss": 12.3284, + "step": 11296 + }, + { + "epoch": 0.6151667993982833, + "grad_norm": 0.5981568208880793, + "learning_rate": 0.00016223762186852876, + "loss": 12.386, + "step": 11297 + }, + { + "epoch": 0.6152212533948663, + "grad_norm": 0.5779828111185735, + "learning_rate": 0.000162230719415047, + "loss": 12.3549, + "step": 11298 + }, + { + "epoch": 0.6152757073914493, + "grad_norm": 0.5988882027190688, + "learning_rate": 0.00016222381647764875, + "loss": 12.2865, + "step": 11299 + }, + { + "epoch": 0.6153301613880324, + "grad_norm": 0.5598859235243391, + "learning_rate": 0.0001622169130563877, + "loss": 12.3623, + "step": 11300 + }, + { + "epoch": 0.6153846153846154, + "grad_norm": 0.6025884901045108, + "learning_rate": 0.00016221000915131746, + "loss": 12.3875, + "step": 11301 + }, + { + "epoch": 0.6154390693811984, + "grad_norm": 0.5788006219062364, + "learning_rate": 0.00016220310476249176, + "loss": 12.275, + "step": 11302 + }, + { + "epoch": 0.6154935233777814, + "grad_norm": 0.6248198295930657, + "learning_rate": 0.00016219619988996428, + "loss": 12.1747, + "step": 11303 + }, + { + "epoch": 0.6155479773743644, + "grad_norm": 0.6097860819021905, + "learning_rate": 0.00016218929453378874, + "loss": 12.3079, + "step": 11304 + }, + { + "epoch": 0.6156024313709474, + "grad_norm": 0.6942998620702879, + "learning_rate": 0.0001621823886940188, + "loss": 12.4143, + "step": 11305 + }, + { + "epoch": 0.6156568853675305, + "grad_norm": 0.6263141335439183, + "learning_rate": 0.0001621754823707082, + "loss": 12.361, + "step": 11306 + }, + { + "epoch": 0.6157113393641135, + "grad_norm": 0.6863996262057079, + "learning_rate": 0.00016216857556391063, + "loss": 12.3674, + "step": 11307 + }, + { + "epoch": 0.6157657933606965, + "grad_norm": 0.616314884930882, + "learning_rate": 0.00016216166827367974, + "loss": 12.2604, + "step": 11308 + }, + { + "epoch": 0.6158202473572795, + "grad_norm": 0.5983392937109377, + "learning_rate": 0.00016215476050006932, + "loss": 12.2909, + "step": 11309 + }, + { + "epoch": 0.6158747013538625, + "grad_norm": 0.6468327132530235, + "learning_rate": 0.0001621478522431331, + "loss": 12.3449, + "step": 11310 + }, + { + "epoch": 0.6159291553504455, + "grad_norm": 0.6584847095297925, + "learning_rate": 0.00016214094350292468, + "loss": 12.4412, + "step": 11311 + }, + { + "epoch": 0.6159836093470286, + "grad_norm": 0.5925212894375936, + "learning_rate": 0.00016213403427949792, + "loss": 12.3087, + "step": 11312 + }, + { + "epoch": 0.6160380633436116, + "grad_norm": 0.6141912148999293, + "learning_rate": 0.00016212712457290646, + "loss": 12.2135, + "step": 11313 + }, + { + "epoch": 0.6160925173401945, + "grad_norm": 0.5730557449185941, + "learning_rate": 0.0001621202143832041, + "loss": 12.2448, + "step": 11314 + }, + { + "epoch": 0.6161469713367775, + "grad_norm": 0.6319410511618868, + "learning_rate": 0.0001621133037104445, + "loss": 12.2875, + "step": 11315 + }, + { + "epoch": 0.6162014253333605, + "grad_norm": 0.566548677650518, + "learning_rate": 0.00016210639255468144, + "loss": 12.3506, + "step": 11316 + }, + { + "epoch": 0.6162558793299435, + "grad_norm": 0.571999101358985, + "learning_rate": 0.00016209948091596864, + "loss": 12.3217, + "step": 11317 + }, + { + "epoch": 0.6163103333265266, + "grad_norm": 0.5840680525297499, + "learning_rate": 0.00016209256879435988, + "loss": 12.2879, + "step": 11318 + }, + { + "epoch": 0.6163647873231096, + "grad_norm": 0.6271115358651692, + "learning_rate": 0.0001620856561899089, + "loss": 12.3066, + "step": 11319 + }, + { + "epoch": 0.6164192413196926, + "grad_norm": 0.5674867522352245, + "learning_rate": 0.00016207874310266945, + "loss": 12.404, + "step": 11320 + }, + { + "epoch": 0.6164736953162756, + "grad_norm": 0.5963678282919692, + "learning_rate": 0.00016207182953269523, + "loss": 12.3059, + "step": 11321 + }, + { + "epoch": 0.6165281493128586, + "grad_norm": 0.6584712900680062, + "learning_rate": 0.00016206491548004012, + "loss": 12.1358, + "step": 11322 + }, + { + "epoch": 0.6165826033094416, + "grad_norm": 0.6232001423449859, + "learning_rate": 0.00016205800094475778, + "loss": 12.2733, + "step": 11323 + }, + { + "epoch": 0.6166370573060247, + "grad_norm": 0.5961936391155271, + "learning_rate": 0.00016205108592690204, + "loss": 12.267, + "step": 11324 + }, + { + "epoch": 0.6166915113026077, + "grad_norm": 0.6233694372694509, + "learning_rate": 0.00016204417042652665, + "loss": 12.3168, + "step": 11325 + }, + { + "epoch": 0.6167459652991907, + "grad_norm": 0.5967553232670233, + "learning_rate": 0.00016203725444368538, + "loss": 12.2894, + "step": 11326 + }, + { + "epoch": 0.6168004192957737, + "grad_norm": 0.6328880831888363, + "learning_rate": 0.000162030337978432, + "loss": 12.4589, + "step": 11327 + }, + { + "epoch": 0.6168548732923567, + "grad_norm": 0.6599660413066143, + "learning_rate": 0.00016202342103082033, + "loss": 12.133, + "step": 11328 + }, + { + "epoch": 0.6169093272889397, + "grad_norm": 0.6050448734616868, + "learning_rate": 0.00016201650360090413, + "loss": 12.3368, + "step": 11329 + }, + { + "epoch": 0.6169637812855228, + "grad_norm": 0.616854598836734, + "learning_rate": 0.0001620095856887372, + "loss": 12.2232, + "step": 11330 + }, + { + "epoch": 0.6170182352821058, + "grad_norm": 0.6680516335137024, + "learning_rate": 0.00016200266729437333, + "loss": 12.3242, + "step": 11331 + }, + { + "epoch": 0.6170726892786887, + "grad_norm": 0.5497639299706962, + "learning_rate": 0.00016199574841786635, + "loss": 12.3369, + "step": 11332 + }, + { + "epoch": 0.6171271432752717, + "grad_norm": 0.6708234481169313, + "learning_rate": 0.00016198882905926997, + "loss": 12.3544, + "step": 11333 + }, + { + "epoch": 0.6171815972718547, + "grad_norm": 0.6315538192135355, + "learning_rate": 0.0001619819092186381, + "loss": 12.1928, + "step": 11334 + }, + { + "epoch": 0.6172360512684378, + "grad_norm": 0.6490092063174805, + "learning_rate": 0.00016197498889602448, + "loss": 12.1877, + "step": 11335 + }, + { + "epoch": 0.6172905052650208, + "grad_norm": 0.5577690708953591, + "learning_rate": 0.00016196806809148302, + "loss": 12.4101, + "step": 11336 + }, + { + "epoch": 0.6173449592616038, + "grad_norm": 0.6033656557328602, + "learning_rate": 0.00016196114680506741, + "loss": 12.3735, + "step": 11337 + }, + { + "epoch": 0.6173994132581868, + "grad_norm": 0.6226698585246981, + "learning_rate": 0.00016195422503683155, + "loss": 12.3514, + "step": 11338 + }, + { + "epoch": 0.6174538672547698, + "grad_norm": 0.6146688034799889, + "learning_rate": 0.00016194730278682923, + "loss": 12.3374, + "step": 11339 + }, + { + "epoch": 0.6175083212513528, + "grad_norm": 0.6202871794806031, + "learning_rate": 0.00016194038005511432, + "loss": 12.3491, + "step": 11340 + }, + { + "epoch": 0.6175627752479359, + "grad_norm": 0.654543386433988, + "learning_rate": 0.0001619334568417406, + "loss": 12.4703, + "step": 11341 + }, + { + "epoch": 0.6176172292445189, + "grad_norm": 0.6079034969424639, + "learning_rate": 0.00016192653314676196, + "loss": 12.2333, + "step": 11342 + }, + { + "epoch": 0.6176716832411019, + "grad_norm": 0.666143872758608, + "learning_rate": 0.0001619196089702322, + "loss": 12.3581, + "step": 11343 + }, + { + "epoch": 0.6177261372376849, + "grad_norm": 0.5102477274485845, + "learning_rate": 0.00016191268431220519, + "loss": 12.3265, + "step": 11344 + }, + { + "epoch": 0.6177805912342679, + "grad_norm": 0.5784737794499895, + "learning_rate": 0.00016190575917273474, + "loss": 12.3641, + "step": 11345 + }, + { + "epoch": 0.6178350452308509, + "grad_norm": 0.684633075833952, + "learning_rate": 0.00016189883355187477, + "loss": 12.3834, + "step": 11346 + }, + { + "epoch": 0.617889499227434, + "grad_norm": 0.6336297914409275, + "learning_rate": 0.00016189190744967906, + "loss": 12.3893, + "step": 11347 + }, + { + "epoch": 0.617943953224017, + "grad_norm": 0.6382634194907015, + "learning_rate": 0.00016188498086620146, + "loss": 12.4647, + "step": 11348 + }, + { + "epoch": 0.6179984072206, + "grad_norm": 0.6638108620704917, + "learning_rate": 0.00016187805380149596, + "loss": 12.4085, + "step": 11349 + }, + { + "epoch": 0.618052861217183, + "grad_norm": 0.5642107833644697, + "learning_rate": 0.00016187112625561624, + "loss": 12.2775, + "step": 11350 + }, + { + "epoch": 0.6181073152137659, + "grad_norm": 0.618448690303876, + "learning_rate": 0.00016186419822861634, + "loss": 12.323, + "step": 11351 + }, + { + "epoch": 0.6181617692103489, + "grad_norm": 0.6144504495899537, + "learning_rate": 0.00016185726972055002, + "loss": 12.2607, + "step": 11352 + }, + { + "epoch": 0.618216223206932, + "grad_norm": 0.5653022098286621, + "learning_rate": 0.0001618503407314712, + "loss": 12.2345, + "step": 11353 + }, + { + "epoch": 0.618270677203515, + "grad_norm": 0.5923636217015734, + "learning_rate": 0.00016184341126143376, + "loss": 12.3753, + "step": 11354 + }, + { + "epoch": 0.618325131200098, + "grad_norm": 0.6090629767174827, + "learning_rate": 0.0001618364813104916, + "loss": 12.3978, + "step": 11355 + }, + { + "epoch": 0.618379585196681, + "grad_norm": 0.760239146374173, + "learning_rate": 0.00016182955087869859, + "loss": 12.3513, + "step": 11356 + }, + { + "epoch": 0.618434039193264, + "grad_norm": 0.5499640804080805, + "learning_rate": 0.0001618226199661086, + "loss": 12.2744, + "step": 11357 + }, + { + "epoch": 0.618488493189847, + "grad_norm": 0.5413087096896169, + "learning_rate": 0.0001618156885727756, + "loss": 12.2719, + "step": 11358 + }, + { + "epoch": 0.6185429471864301, + "grad_norm": 0.6448601280327934, + "learning_rate": 0.0001618087566987534, + "loss": 12.3079, + "step": 11359 + }, + { + "epoch": 0.6185974011830131, + "grad_norm": 0.6204740814101944, + "learning_rate": 0.00016180182434409593, + "loss": 12.3334, + "step": 11360 + }, + { + "epoch": 0.6186518551795961, + "grad_norm": 0.6521050686806655, + "learning_rate": 0.00016179489150885715, + "loss": 12.1476, + "step": 11361 + }, + { + "epoch": 0.6187063091761791, + "grad_norm": 0.5789250892511003, + "learning_rate": 0.00016178795819309086, + "loss": 12.3211, + "step": 11362 + }, + { + "epoch": 0.6187607631727621, + "grad_norm": 0.6427843582992568, + "learning_rate": 0.00016178102439685113, + "loss": 12.4807, + "step": 11363 + }, + { + "epoch": 0.6188152171693452, + "grad_norm": 0.7076880164255421, + "learning_rate": 0.00016177409012019175, + "loss": 12.3935, + "step": 11364 + }, + { + "epoch": 0.6188696711659282, + "grad_norm": 0.5605514128844377, + "learning_rate": 0.0001617671553631667, + "loss": 12.2769, + "step": 11365 + }, + { + "epoch": 0.6189241251625112, + "grad_norm": 0.6799978308680839, + "learning_rate": 0.0001617602201258299, + "loss": 12.2532, + "step": 11366 + }, + { + "epoch": 0.6189785791590942, + "grad_norm": 0.7108657629179751, + "learning_rate": 0.00016175328440823524, + "loss": 12.2893, + "step": 11367 + }, + { + "epoch": 0.6190330331556771, + "grad_norm": 0.6701568668267008, + "learning_rate": 0.00016174634821043666, + "loss": 12.331, + "step": 11368 + }, + { + "epoch": 0.6190874871522601, + "grad_norm": 0.6167861448581061, + "learning_rate": 0.00016173941153248817, + "loss": 12.1841, + "step": 11369 + }, + { + "epoch": 0.6191419411488432, + "grad_norm": 0.5542796556543904, + "learning_rate": 0.00016173247437444366, + "loss": 12.2144, + "step": 11370 + }, + { + "epoch": 0.6191963951454262, + "grad_norm": 0.5561406263549451, + "learning_rate": 0.00016172553673635706, + "loss": 12.3912, + "step": 11371 + }, + { + "epoch": 0.6192508491420092, + "grad_norm": 0.6768993477226342, + "learning_rate": 0.00016171859861828237, + "loss": 12.3638, + "step": 11372 + }, + { + "epoch": 0.6193053031385922, + "grad_norm": 0.5643467859039376, + "learning_rate": 0.00016171166002027344, + "loss": 12.2933, + "step": 11373 + }, + { + "epoch": 0.6193597571351752, + "grad_norm": 0.6870602034201745, + "learning_rate": 0.00016170472094238436, + "loss": 12.4109, + "step": 11374 + }, + { + "epoch": 0.6194142111317582, + "grad_norm": 0.6432492313425489, + "learning_rate": 0.00016169778138466897, + "loss": 12.3329, + "step": 11375 + }, + { + "epoch": 0.6194686651283413, + "grad_norm": 0.5871531262125809, + "learning_rate": 0.00016169084134718133, + "loss": 12.3689, + "step": 11376 + }, + { + "epoch": 0.6195231191249243, + "grad_norm": 0.581973602589214, + "learning_rate": 0.00016168390082997534, + "loss": 12.3192, + "step": 11377 + }, + { + "epoch": 0.6195775731215073, + "grad_norm": 0.7273779095746546, + "learning_rate": 0.000161676959833105, + "loss": 12.4788, + "step": 11378 + }, + { + "epoch": 0.6196320271180903, + "grad_norm": 0.6365631002352293, + "learning_rate": 0.0001616700183566243, + "loss": 12.4043, + "step": 11379 + }, + { + "epoch": 0.6196864811146733, + "grad_norm": 0.5885970626918695, + "learning_rate": 0.00016166307640058712, + "loss": 12.3859, + "step": 11380 + }, + { + "epoch": 0.6197409351112563, + "grad_norm": 0.6437308372453692, + "learning_rate": 0.0001616561339650476, + "loss": 12.3077, + "step": 11381 + }, + { + "epoch": 0.6197953891078394, + "grad_norm": 0.7012567621229076, + "learning_rate": 0.00016164919105005957, + "loss": 12.2332, + "step": 11382 + }, + { + "epoch": 0.6198498431044224, + "grad_norm": 0.5809889464584606, + "learning_rate": 0.00016164224765567714, + "loss": 12.2526, + "step": 11383 + }, + { + "epoch": 0.6199042971010054, + "grad_norm": 0.6078813911317118, + "learning_rate": 0.00016163530378195424, + "loss": 12.47, + "step": 11384 + }, + { + "epoch": 0.6199587510975884, + "grad_norm": 0.5687762963315507, + "learning_rate": 0.0001616283594289449, + "loss": 12.4204, + "step": 11385 + }, + { + "epoch": 0.6200132050941713, + "grad_norm": 0.6699596697271144, + "learning_rate": 0.00016162141459670308, + "loss": 12.3298, + "step": 11386 + }, + { + "epoch": 0.6200676590907543, + "grad_norm": 0.688363027506524, + "learning_rate": 0.00016161446928528284, + "loss": 12.3479, + "step": 11387 + }, + { + "epoch": 0.6201221130873374, + "grad_norm": 0.5737921994052604, + "learning_rate": 0.00016160752349473812, + "loss": 12.3075, + "step": 11388 + }, + { + "epoch": 0.6201765670839204, + "grad_norm": 0.5286850608693549, + "learning_rate": 0.00016160057722512295, + "loss": 12.1068, + "step": 11389 + }, + { + "epoch": 0.6202310210805034, + "grad_norm": 0.5964030221046523, + "learning_rate": 0.00016159363047649138, + "loss": 12.3793, + "step": 11390 + }, + { + "epoch": 0.6202854750770864, + "grad_norm": 0.6197530079107472, + "learning_rate": 0.00016158668324889742, + "loss": 12.4887, + "step": 11391 + }, + { + "epoch": 0.6203399290736694, + "grad_norm": 0.5881884732597834, + "learning_rate": 0.0001615797355423951, + "loss": 12.336, + "step": 11392 + }, + { + "epoch": 0.6203943830702524, + "grad_norm": 0.6180850844308952, + "learning_rate": 0.0001615727873570384, + "loss": 12.3021, + "step": 11393 + }, + { + "epoch": 0.6204488370668355, + "grad_norm": 0.5862481019080754, + "learning_rate": 0.00016156583869288138, + "loss": 12.4791, + "step": 11394 + }, + { + "epoch": 0.6205032910634185, + "grad_norm": 0.5721204858048597, + "learning_rate": 0.0001615588895499781, + "loss": 12.3929, + "step": 11395 + }, + { + "epoch": 0.6205577450600015, + "grad_norm": 0.6527465480459093, + "learning_rate": 0.00016155193992838253, + "loss": 12.4634, + "step": 11396 + }, + { + "epoch": 0.6206121990565845, + "grad_norm": 0.5626923759738245, + "learning_rate": 0.0001615449898281488, + "loss": 12.2249, + "step": 11397 + }, + { + "epoch": 0.6206666530531675, + "grad_norm": 0.5240694474890504, + "learning_rate": 0.00016153803924933086, + "loss": 12.268, + "step": 11398 + }, + { + "epoch": 0.6207211070497506, + "grad_norm": 0.6662649597332017, + "learning_rate": 0.00016153108819198285, + "loss": 12.3597, + "step": 11399 + }, + { + "epoch": 0.6207755610463336, + "grad_norm": 0.6755992426640863, + "learning_rate": 0.00016152413665615874, + "loss": 12.333, + "step": 11400 + }, + { + "epoch": 0.6208300150429166, + "grad_norm": 0.5522321110889248, + "learning_rate": 0.00016151718464191265, + "loss": 12.2361, + "step": 11401 + }, + { + "epoch": 0.6208844690394996, + "grad_norm": 0.6126956189989169, + "learning_rate": 0.0001615102321492986, + "loss": 12.4911, + "step": 11402 + }, + { + "epoch": 0.6209389230360826, + "grad_norm": 0.6273283474136503, + "learning_rate": 0.0001615032791783707, + "loss": 12.4404, + "step": 11403 + }, + { + "epoch": 0.6209933770326655, + "grad_norm": 0.5857263956973275, + "learning_rate": 0.00016149632572918295, + "loss": 12.3055, + "step": 11404 + }, + { + "epoch": 0.6210478310292487, + "grad_norm": 0.6970385254814665, + "learning_rate": 0.00016148937180178948, + "loss": 12.4064, + "step": 11405 + }, + { + "epoch": 0.6211022850258316, + "grad_norm": 0.6333024467892219, + "learning_rate": 0.00016148241739624431, + "loss": 12.2791, + "step": 11406 + }, + { + "epoch": 0.6211567390224146, + "grad_norm": 0.6105156550817371, + "learning_rate": 0.0001614754625126016, + "loss": 12.2001, + "step": 11407 + }, + { + "epoch": 0.6212111930189976, + "grad_norm": 0.5839111937695641, + "learning_rate": 0.00016146850715091537, + "loss": 12.3525, + "step": 11408 + }, + { + "epoch": 0.6212656470155806, + "grad_norm": 0.5676464285172782, + "learning_rate": 0.00016146155131123972, + "loss": 12.2968, + "step": 11409 + }, + { + "epoch": 0.6213201010121636, + "grad_norm": 0.6103215468950094, + "learning_rate": 0.00016145459499362872, + "loss": 12.3521, + "step": 11410 + }, + { + "epoch": 0.6213745550087467, + "grad_norm": 0.5476910263333569, + "learning_rate": 0.0001614476381981365, + "loss": 12.3984, + "step": 11411 + }, + { + "epoch": 0.6214290090053297, + "grad_norm": 0.6216564371614643, + "learning_rate": 0.00016144068092481715, + "loss": 12.4056, + "step": 11412 + }, + { + "epoch": 0.6214834630019127, + "grad_norm": 0.5650350514742664, + "learning_rate": 0.00016143372317372476, + "loss": 12.3514, + "step": 11413 + }, + { + "epoch": 0.6215379169984957, + "grad_norm": 0.5629443200885589, + "learning_rate": 0.0001614267649449134, + "loss": 12.2287, + "step": 11414 + }, + { + "epoch": 0.6215923709950787, + "grad_norm": 0.6506262615831745, + "learning_rate": 0.00016141980623843725, + "loss": 12.3663, + "step": 11415 + }, + { + "epoch": 0.6216468249916617, + "grad_norm": 0.5682975024890445, + "learning_rate": 0.00016141284705435037, + "loss": 12.3697, + "step": 11416 + }, + { + "epoch": 0.6217012789882448, + "grad_norm": 0.7033118716417303, + "learning_rate": 0.0001614058873927069, + "loss": 12.4198, + "step": 11417 + }, + { + "epoch": 0.6217557329848278, + "grad_norm": 0.5432031850109923, + "learning_rate": 0.00016139892725356095, + "loss": 12.1988, + "step": 11418 + }, + { + "epoch": 0.6218101869814108, + "grad_norm": 0.6199558312329202, + "learning_rate": 0.00016139196663696666, + "loss": 12.3618, + "step": 11419 + }, + { + "epoch": 0.6218646409779938, + "grad_norm": 0.6876465073353317, + "learning_rate": 0.0001613850055429781, + "loss": 12.3008, + "step": 11420 + }, + { + "epoch": 0.6219190949745768, + "grad_norm": 0.5512823343568609, + "learning_rate": 0.0001613780439716495, + "loss": 12.3898, + "step": 11421 + }, + { + "epoch": 0.6219735489711598, + "grad_norm": 0.5895349045147323, + "learning_rate": 0.00016137108192303492, + "loss": 12.3176, + "step": 11422 + }, + { + "epoch": 0.6220280029677429, + "grad_norm": 0.629941616548519, + "learning_rate": 0.00016136411939718847, + "loss": 12.1728, + "step": 11423 + }, + { + "epoch": 0.6220824569643258, + "grad_norm": 0.610110213177622, + "learning_rate": 0.00016135715639416438, + "loss": 12.3168, + "step": 11424 + }, + { + "epoch": 0.6221369109609088, + "grad_norm": 0.6671360522586627, + "learning_rate": 0.00016135019291401673, + "loss": 12.3129, + "step": 11425 + }, + { + "epoch": 0.6221913649574918, + "grad_norm": 0.630821330701, + "learning_rate": 0.00016134322895679972, + "loss": 12.3731, + "step": 11426 + }, + { + "epoch": 0.6222458189540748, + "grad_norm": 0.6827847510004765, + "learning_rate": 0.00016133626452256747, + "loss": 12.3675, + "step": 11427 + }, + { + "epoch": 0.6223002729506578, + "grad_norm": 0.638241364269164, + "learning_rate": 0.00016132929961137414, + "loss": 12.3356, + "step": 11428 + }, + { + "epoch": 0.6223547269472409, + "grad_norm": 0.7731307743750341, + "learning_rate": 0.00016132233422327385, + "loss": 12.44, + "step": 11429 + }, + { + "epoch": 0.6224091809438239, + "grad_norm": 0.6614816238411625, + "learning_rate": 0.00016131536835832085, + "loss": 12.2131, + "step": 11430 + }, + { + "epoch": 0.6224636349404069, + "grad_norm": 0.6298317845000152, + "learning_rate": 0.00016130840201656924, + "loss": 12.3037, + "step": 11431 + }, + { + "epoch": 0.6225180889369899, + "grad_norm": 0.6875551830205758, + "learning_rate": 0.00016130143519807322, + "loss": 12.3649, + "step": 11432 + }, + { + "epoch": 0.6225725429335729, + "grad_norm": 0.6613920426623728, + "learning_rate": 0.00016129446790288699, + "loss": 12.231, + "step": 11433 + }, + { + "epoch": 0.622626996930156, + "grad_norm": 0.5661040079677665, + "learning_rate": 0.00016128750013106463, + "loss": 12.3363, + "step": 11434 + }, + { + "epoch": 0.622681450926739, + "grad_norm": 0.6117659979479873, + "learning_rate": 0.00016128053188266045, + "loss": 12.3404, + "step": 11435 + }, + { + "epoch": 0.622735904923322, + "grad_norm": 0.6470145341604084, + "learning_rate": 0.00016127356315772857, + "loss": 12.4804, + "step": 11436 + }, + { + "epoch": 0.622790358919905, + "grad_norm": 0.5893466214077081, + "learning_rate": 0.00016126659395632317, + "loss": 12.3519, + "step": 11437 + }, + { + "epoch": 0.622844812916488, + "grad_norm": 0.6130572078926236, + "learning_rate": 0.00016125962427849847, + "loss": 12.2067, + "step": 11438 + }, + { + "epoch": 0.622899266913071, + "grad_norm": 0.649299424382659, + "learning_rate": 0.00016125265412430867, + "loss": 12.3978, + "step": 11439 + }, + { + "epoch": 0.6229537209096541, + "grad_norm": 0.6643560544717511, + "learning_rate": 0.0001612456834938079, + "loss": 12.2235, + "step": 11440 + }, + { + "epoch": 0.6230081749062371, + "grad_norm": 0.6577597286908099, + "learning_rate": 0.00016123871238705052, + "loss": 12.2824, + "step": 11441 + }, + { + "epoch": 0.62306262890282, + "grad_norm": 0.6010533868191522, + "learning_rate": 0.00016123174080409056, + "loss": 12.2624, + "step": 11442 + }, + { + "epoch": 0.623117082899403, + "grad_norm": 0.5642128036208448, + "learning_rate": 0.00016122476874498234, + "loss": 12.2395, + "step": 11443 + }, + { + "epoch": 0.623171536895986, + "grad_norm": 0.820910878340875, + "learning_rate": 0.00016121779620978009, + "loss": 12.3242, + "step": 11444 + }, + { + "epoch": 0.623225990892569, + "grad_norm": 0.5726982445133031, + "learning_rate": 0.00016121082319853796, + "loss": 12.1313, + "step": 11445 + }, + { + "epoch": 0.6232804448891521, + "grad_norm": 0.684093458108693, + "learning_rate": 0.0001612038497113102, + "loss": 12.177, + "step": 11446 + }, + { + "epoch": 0.6233348988857351, + "grad_norm": 0.7989837270669965, + "learning_rate": 0.00016119687574815103, + "loss": 12.4761, + "step": 11447 + }, + { + "epoch": 0.6233893528823181, + "grad_norm": 0.6048684246884773, + "learning_rate": 0.00016118990130911472, + "loss": 12.3019, + "step": 11448 + }, + { + "epoch": 0.6234438068789011, + "grad_norm": 0.6627434098596703, + "learning_rate": 0.00016118292639425545, + "loss": 12.3217, + "step": 11449 + }, + { + "epoch": 0.6234982608754841, + "grad_norm": 0.5933997333423316, + "learning_rate": 0.0001611759510036275, + "loss": 12.346, + "step": 11450 + }, + { + "epoch": 0.6235527148720671, + "grad_norm": 0.6311707370727244, + "learning_rate": 0.00016116897513728507, + "loss": 12.2266, + "step": 11451 + }, + { + "epoch": 0.6236071688686502, + "grad_norm": 0.5682200494836679, + "learning_rate": 0.00016116199879528245, + "loss": 12.2795, + "step": 11452 + }, + { + "epoch": 0.6236616228652332, + "grad_norm": 0.5491493618190088, + "learning_rate": 0.0001611550219776739, + "loss": 12.3042, + "step": 11453 + }, + { + "epoch": 0.6237160768618162, + "grad_norm": 0.5664628758150743, + "learning_rate": 0.00016114804468451359, + "loss": 12.3853, + "step": 11454 + }, + { + "epoch": 0.6237705308583992, + "grad_norm": 0.5770704099528122, + "learning_rate": 0.00016114106691585587, + "loss": 12.3271, + "step": 11455 + }, + { + "epoch": 0.6238249848549822, + "grad_norm": 0.5652321308945724, + "learning_rate": 0.00016113408867175495, + "loss": 12.2696, + "step": 11456 + }, + { + "epoch": 0.6238794388515652, + "grad_norm": 0.5560296538187035, + "learning_rate": 0.0001611271099522651, + "loss": 12.2529, + "step": 11457 + }, + { + "epoch": 0.6239338928481483, + "grad_norm": 0.530748673775166, + "learning_rate": 0.0001611201307574406, + "loss": 12.3456, + "step": 11458 + }, + { + "epoch": 0.6239883468447313, + "grad_norm": 0.5964895425684653, + "learning_rate": 0.00016111315108733568, + "loss": 12.2588, + "step": 11459 + }, + { + "epoch": 0.6240428008413142, + "grad_norm": 0.5661140090484537, + "learning_rate": 0.0001611061709420047, + "loss": 12.3302, + "step": 11460 + }, + { + "epoch": 0.6240972548378972, + "grad_norm": 0.5239071385889489, + "learning_rate": 0.00016109919032150186, + "loss": 12.2311, + "step": 11461 + }, + { + "epoch": 0.6241517088344802, + "grad_norm": 0.6031453262958488, + "learning_rate": 0.00016109220922588146, + "loss": 12.3783, + "step": 11462 + }, + { + "epoch": 0.6242061628310632, + "grad_norm": 0.5879324423494298, + "learning_rate": 0.00016108522765519783, + "loss": 12.2452, + "step": 11463 + }, + { + "epoch": 0.6242606168276463, + "grad_norm": 0.5270952355855081, + "learning_rate": 0.00016107824560950516, + "loss": 12.1846, + "step": 11464 + }, + { + "epoch": 0.6243150708242293, + "grad_norm": 0.5950726560960155, + "learning_rate": 0.00016107126308885787, + "loss": 12.2707, + "step": 11465 + }, + { + "epoch": 0.6243695248208123, + "grad_norm": 0.6100586106298427, + "learning_rate": 0.00016106428009331016, + "loss": 12.3043, + "step": 11466 + }, + { + "epoch": 0.6244239788173953, + "grad_norm": 0.6366884073275398, + "learning_rate": 0.00016105729662291643, + "loss": 12.4719, + "step": 11467 + }, + { + "epoch": 0.6244784328139783, + "grad_norm": 0.5748578837822204, + "learning_rate": 0.00016105031267773086, + "loss": 12.313, + "step": 11468 + }, + { + "epoch": 0.6245328868105614, + "grad_norm": 0.6744817511043489, + "learning_rate": 0.00016104332825780783, + "loss": 12.4107, + "step": 11469 + }, + { + "epoch": 0.6245873408071444, + "grad_norm": 0.5967398349761409, + "learning_rate": 0.00016103634336320165, + "loss": 12.3625, + "step": 11470 + }, + { + "epoch": 0.6246417948037274, + "grad_norm": 0.6167695078489145, + "learning_rate": 0.00016102935799396662, + "loss": 12.3634, + "step": 11471 + }, + { + "epoch": 0.6246962488003104, + "grad_norm": 0.5803870896505192, + "learning_rate": 0.0001610223721501571, + "loss": 12.2731, + "step": 11472 + }, + { + "epoch": 0.6247507027968934, + "grad_norm": 0.6282871396517364, + "learning_rate": 0.00016101538583182735, + "loss": 12.4319, + "step": 11473 + }, + { + "epoch": 0.6248051567934764, + "grad_norm": 0.5646362615407534, + "learning_rate": 0.00016100839903903174, + "loss": 12.299, + "step": 11474 + }, + { + "epoch": 0.6248596107900595, + "grad_norm": 0.5939754539784448, + "learning_rate": 0.00016100141177182456, + "loss": 12.3805, + "step": 11475 + }, + { + "epoch": 0.6249140647866425, + "grad_norm": 0.5399769811511669, + "learning_rate": 0.0001609944240302602, + "loss": 12.3223, + "step": 11476 + }, + { + "epoch": 0.6249685187832255, + "grad_norm": 0.5462233913359226, + "learning_rate": 0.00016098743581439298, + "loss": 12.2263, + "step": 11477 + }, + { + "epoch": 0.6250229727798085, + "grad_norm": 0.5445448329430717, + "learning_rate": 0.0001609804471242772, + "loss": 12.3775, + "step": 11478 + }, + { + "epoch": 0.6250774267763914, + "grad_norm": 0.5906469011451747, + "learning_rate": 0.00016097345795996728, + "loss": 12.4571, + "step": 11479 + }, + { + "epoch": 0.6251318807729744, + "grad_norm": 0.569176070603741, + "learning_rate": 0.00016096646832151746, + "loss": 12.3415, + "step": 11480 + }, + { + "epoch": 0.6251863347695575, + "grad_norm": 0.6055177138206724, + "learning_rate": 0.00016095947820898222, + "loss": 12.3101, + "step": 11481 + }, + { + "epoch": 0.6252407887661405, + "grad_norm": 0.5858300768306324, + "learning_rate": 0.00016095248762241585, + "loss": 12.2721, + "step": 11482 + }, + { + "epoch": 0.6252952427627235, + "grad_norm": 0.6811483796041748, + "learning_rate": 0.0001609454965618727, + "loss": 12.3685, + "step": 11483 + }, + { + "epoch": 0.6253496967593065, + "grad_norm": 0.5647578694435895, + "learning_rate": 0.00016093850502740714, + "loss": 12.254, + "step": 11484 + }, + { + "epoch": 0.6254041507558895, + "grad_norm": 0.7362467140997495, + "learning_rate": 0.00016093151301907352, + "loss": 12.3546, + "step": 11485 + }, + { + "epoch": 0.6254586047524725, + "grad_norm": 0.6211658326926465, + "learning_rate": 0.00016092452053692629, + "loss": 12.3764, + "step": 11486 + }, + { + "epoch": 0.6255130587490556, + "grad_norm": 0.5776185249749698, + "learning_rate": 0.00016091752758101976, + "loss": 12.3521, + "step": 11487 + }, + { + "epoch": 0.6255675127456386, + "grad_norm": 0.6806993333104564, + "learning_rate": 0.00016091053415140827, + "loss": 12.2411, + "step": 11488 + }, + { + "epoch": 0.6256219667422216, + "grad_norm": 0.5323216271189978, + "learning_rate": 0.00016090354024814632, + "loss": 12.3461, + "step": 11489 + }, + { + "epoch": 0.6256764207388046, + "grad_norm": 0.5230959798871765, + "learning_rate": 0.0001608965458712882, + "loss": 12.2264, + "step": 11490 + }, + { + "epoch": 0.6257308747353876, + "grad_norm": 0.6505345111176285, + "learning_rate": 0.0001608895510208883, + "loss": 12.3135, + "step": 11491 + }, + { + "epoch": 0.6257853287319706, + "grad_norm": 0.6001563708139264, + "learning_rate": 0.00016088255569700108, + "loss": 12.2745, + "step": 11492 + }, + { + "epoch": 0.6258397827285537, + "grad_norm": 0.5530352308068975, + "learning_rate": 0.0001608755598996809, + "loss": 12.3888, + "step": 11493 + }, + { + "epoch": 0.6258942367251367, + "grad_norm": 0.5921117482992737, + "learning_rate": 0.0001608685636289821, + "loss": 12.3241, + "step": 11494 + }, + { + "epoch": 0.6259486907217197, + "grad_norm": 0.5998253973463721, + "learning_rate": 0.00016086156688495918, + "loss": 12.3406, + "step": 11495 + }, + { + "epoch": 0.6260031447183027, + "grad_norm": 0.6089749393227019, + "learning_rate": 0.00016085456966766652, + "loss": 12.4072, + "step": 11496 + }, + { + "epoch": 0.6260575987148856, + "grad_norm": 0.6446561830580213, + "learning_rate": 0.00016084757197715852, + "loss": 12.3605, + "step": 11497 + }, + { + "epoch": 0.6261120527114687, + "grad_norm": 0.6811082040294554, + "learning_rate": 0.00016084057381348957, + "loss": 12.1462, + "step": 11498 + }, + { + "epoch": 0.6261665067080517, + "grad_norm": 0.6105997784239102, + "learning_rate": 0.00016083357517671413, + "loss": 12.3169, + "step": 11499 + }, + { + "epoch": 0.6262209607046347, + "grad_norm": 0.5720422689859095, + "learning_rate": 0.0001608265760668866, + "loss": 12.3158, + "step": 11500 + }, + { + "epoch": 0.6262754147012177, + "grad_norm": 0.5999332881450222, + "learning_rate": 0.00016081957648406142, + "loss": 12.3959, + "step": 11501 + }, + { + "epoch": 0.6263298686978007, + "grad_norm": 0.6285254125685592, + "learning_rate": 0.00016081257642829304, + "loss": 12.1985, + "step": 11502 + }, + { + "epoch": 0.6263843226943837, + "grad_norm": 0.5574983213776983, + "learning_rate": 0.00016080557589963584, + "loss": 12.4112, + "step": 11503 + }, + { + "epoch": 0.6264387766909668, + "grad_norm": 0.5640310694002958, + "learning_rate": 0.00016079857489814428, + "loss": 12.1416, + "step": 11504 + }, + { + "epoch": 0.6264932306875498, + "grad_norm": 0.5408204607168074, + "learning_rate": 0.00016079157342387284, + "loss": 12.3197, + "step": 11505 + }, + { + "epoch": 0.6265476846841328, + "grad_norm": 0.7008687453180122, + "learning_rate": 0.00016078457147687588, + "loss": 12.3588, + "step": 11506 + }, + { + "epoch": 0.6266021386807158, + "grad_norm": 0.6096810692741483, + "learning_rate": 0.00016077756905720793, + "loss": 12.3563, + "step": 11507 + }, + { + "epoch": 0.6266565926772988, + "grad_norm": 0.6437700499832963, + "learning_rate": 0.0001607705661649234, + "loss": 12.4718, + "step": 11508 + }, + { + "epoch": 0.6267110466738818, + "grad_norm": 0.6347956814809077, + "learning_rate": 0.00016076356280007677, + "loss": 12.4226, + "step": 11509 + }, + { + "epoch": 0.6267655006704649, + "grad_norm": 0.5734954095097526, + "learning_rate": 0.00016075655896272248, + "loss": 12.2312, + "step": 11510 + }, + { + "epoch": 0.6268199546670479, + "grad_norm": 0.6054178017659978, + "learning_rate": 0.00016074955465291498, + "loss": 12.2454, + "step": 11511 + }, + { + "epoch": 0.6268744086636309, + "grad_norm": 0.5455242527056521, + "learning_rate": 0.0001607425498707088, + "loss": 12.2335, + "step": 11512 + }, + { + "epoch": 0.6269288626602139, + "grad_norm": 0.6139827424512838, + "learning_rate": 0.0001607355446161583, + "loss": 12.2324, + "step": 11513 + }, + { + "epoch": 0.6269833166567969, + "grad_norm": 0.604926511533672, + "learning_rate": 0.00016072853888931808, + "loss": 12.3358, + "step": 11514 + }, + { + "epoch": 0.6270377706533798, + "grad_norm": 0.6322046476901865, + "learning_rate": 0.00016072153269024254, + "loss": 12.3039, + "step": 11515 + }, + { + "epoch": 0.627092224649963, + "grad_norm": 0.5496567982649935, + "learning_rate": 0.00016071452601898616, + "loss": 12.3524, + "step": 11516 + }, + { + "epoch": 0.6271466786465459, + "grad_norm": 0.6463799631048145, + "learning_rate": 0.00016070751887560346, + "loss": 12.3817, + "step": 11517 + }, + { + "epoch": 0.6272011326431289, + "grad_norm": 0.554577880732961, + "learning_rate": 0.0001607005112601489, + "loss": 12.2941, + "step": 11518 + }, + { + "epoch": 0.6272555866397119, + "grad_norm": 0.6290277561324147, + "learning_rate": 0.00016069350317267697, + "loss": 12.4027, + "step": 11519 + }, + { + "epoch": 0.6273100406362949, + "grad_norm": 0.6559922146220869, + "learning_rate": 0.0001606864946132422, + "loss": 12.1408, + "step": 11520 + }, + { + "epoch": 0.6273644946328779, + "grad_norm": 0.565898840105582, + "learning_rate": 0.0001606794855818991, + "loss": 12.313, + "step": 11521 + }, + { + "epoch": 0.627418948629461, + "grad_norm": 0.6088149405508227, + "learning_rate": 0.00016067247607870212, + "loss": 12.4226, + "step": 11522 + }, + { + "epoch": 0.627473402626044, + "grad_norm": 0.6121159921496703, + "learning_rate": 0.00016066546610370578, + "loss": 12.4291, + "step": 11523 + }, + { + "epoch": 0.627527856622627, + "grad_norm": 0.6382194354578349, + "learning_rate": 0.00016065845565696463, + "loss": 12.3131, + "step": 11524 + }, + { + "epoch": 0.62758231061921, + "grad_norm": 0.5665618795827646, + "learning_rate": 0.00016065144473853313, + "loss": 12.298, + "step": 11525 + }, + { + "epoch": 0.627636764615793, + "grad_norm": 0.6252068636147795, + "learning_rate": 0.00016064443334846585, + "loss": 12.3406, + "step": 11526 + }, + { + "epoch": 0.627691218612376, + "grad_norm": 0.6200796722836409, + "learning_rate": 0.00016063742148681725, + "loss": 12.2552, + "step": 11527 + }, + { + "epoch": 0.6277456726089591, + "grad_norm": 0.6278916891273529, + "learning_rate": 0.00016063040915364191, + "loss": 12.3177, + "step": 11528 + }, + { + "epoch": 0.6278001266055421, + "grad_norm": 0.7228648431792107, + "learning_rate": 0.00016062339634899435, + "loss": 12.28, + "step": 11529 + }, + { + "epoch": 0.6278545806021251, + "grad_norm": 0.6419605641087148, + "learning_rate": 0.00016061638307292907, + "loss": 12.3749, + "step": 11530 + }, + { + "epoch": 0.6279090345987081, + "grad_norm": 0.654333567110793, + "learning_rate": 0.00016060936932550064, + "loss": 12.4443, + "step": 11531 + }, + { + "epoch": 0.627963488595291, + "grad_norm": 0.6178585671956981, + "learning_rate": 0.0001606023551067636, + "loss": 12.4102, + "step": 11532 + }, + { + "epoch": 0.6280179425918742, + "grad_norm": 0.6668895511152603, + "learning_rate": 0.0001605953404167725, + "loss": 12.3039, + "step": 11533 + }, + { + "epoch": 0.6280723965884571, + "grad_norm": 0.6314002451567844, + "learning_rate": 0.00016058832525558186, + "loss": 12.3504, + "step": 11534 + }, + { + "epoch": 0.6281268505850401, + "grad_norm": 0.6920156557036148, + "learning_rate": 0.0001605813096232462, + "loss": 12.2574, + "step": 11535 + }, + { + "epoch": 0.6281813045816231, + "grad_norm": 0.6072329811421321, + "learning_rate": 0.00016057429351982013, + "loss": 12.3304, + "step": 11536 + }, + { + "epoch": 0.6282357585782061, + "grad_norm": 0.6583826115350645, + "learning_rate": 0.00016056727694535824, + "loss": 12.3934, + "step": 11537 + }, + { + "epoch": 0.6282902125747891, + "grad_norm": 0.6324447219536025, + "learning_rate": 0.000160560259899915, + "loss": 12.3431, + "step": 11538 + }, + { + "epoch": 0.6283446665713722, + "grad_norm": 0.6568949683285629, + "learning_rate": 0.00016055324238354506, + "loss": 12.2794, + "step": 11539 + }, + { + "epoch": 0.6283991205679552, + "grad_norm": 0.6226100446959809, + "learning_rate": 0.00016054622439630293, + "loss": 12.3236, + "step": 11540 + }, + { + "epoch": 0.6284535745645382, + "grad_norm": 0.5624371013426458, + "learning_rate": 0.0001605392059382432, + "loss": 12.3061, + "step": 11541 + }, + { + "epoch": 0.6285080285611212, + "grad_norm": 0.7739453888229896, + "learning_rate": 0.0001605321870094205, + "loss": 12.2432, + "step": 11542 + }, + { + "epoch": 0.6285624825577042, + "grad_norm": 0.6571712760851335, + "learning_rate": 0.0001605251676098893, + "loss": 12.3277, + "step": 11543 + }, + { + "epoch": 0.6286169365542872, + "grad_norm": 0.659169789649659, + "learning_rate": 0.00016051814773970427, + "loss": 12.3759, + "step": 11544 + }, + { + "epoch": 0.6286713905508703, + "grad_norm": 0.567263752452329, + "learning_rate": 0.00016051112739891998, + "loss": 12.1465, + "step": 11545 + }, + { + "epoch": 0.6287258445474533, + "grad_norm": 0.584308563991564, + "learning_rate": 0.00016050410658759103, + "loss": 12.2037, + "step": 11546 + }, + { + "epoch": 0.6287802985440363, + "grad_norm": 0.6845359198235154, + "learning_rate": 0.000160497085305772, + "loss": 12.4059, + "step": 11547 + }, + { + "epoch": 0.6288347525406193, + "grad_norm": 0.5190582578308734, + "learning_rate": 0.00016049006355351746, + "loss": 12.2964, + "step": 11548 + }, + { + "epoch": 0.6288892065372023, + "grad_norm": 0.6111561586009826, + "learning_rate": 0.00016048304133088202, + "loss": 12.2567, + "step": 11549 + }, + { + "epoch": 0.6289436605337853, + "grad_norm": 0.6995756765034283, + "learning_rate": 0.00016047601863792036, + "loss": 12.0985, + "step": 11550 + }, + { + "epoch": 0.6289981145303684, + "grad_norm": 0.6538510687733482, + "learning_rate": 0.000160468995474687, + "loss": 12.2661, + "step": 11551 + }, + { + "epoch": 0.6290525685269514, + "grad_norm": 0.6233392511529413, + "learning_rate": 0.00016046197184123667, + "loss": 12.3554, + "step": 11552 + }, + { + "epoch": 0.6291070225235343, + "grad_norm": 0.6012815233663004, + "learning_rate": 0.00016045494773762382, + "loss": 12.2265, + "step": 11553 + }, + { + "epoch": 0.6291614765201173, + "grad_norm": 0.5931580882714241, + "learning_rate": 0.0001604479231639032, + "loss": 12.396, + "step": 11554 + }, + { + "epoch": 0.6292159305167003, + "grad_norm": 0.6316027311980901, + "learning_rate": 0.00016044089812012935, + "loss": 12.2889, + "step": 11555 + }, + { + "epoch": 0.6292703845132833, + "grad_norm": 0.7595009750544222, + "learning_rate": 0.00016043387260635696, + "loss": 12.5283, + "step": 11556 + }, + { + "epoch": 0.6293248385098664, + "grad_norm": 0.5545253224480045, + "learning_rate": 0.00016042684662264067, + "loss": 12.2653, + "step": 11557 + }, + { + "epoch": 0.6293792925064494, + "grad_norm": 0.598868192716635, + "learning_rate": 0.00016041982016903506, + "loss": 12.2521, + "step": 11558 + }, + { + "epoch": 0.6294337465030324, + "grad_norm": 0.6504218467817201, + "learning_rate": 0.00016041279324559483, + "loss": 12.3431, + "step": 11559 + }, + { + "epoch": 0.6294882004996154, + "grad_norm": 0.5507058063814501, + "learning_rate": 0.00016040576585237454, + "loss": 12.3622, + "step": 11560 + }, + { + "epoch": 0.6295426544961984, + "grad_norm": 0.5483976461810933, + "learning_rate": 0.00016039873798942892, + "loss": 12.2512, + "step": 11561 + }, + { + "epoch": 0.6295971084927814, + "grad_norm": 0.5704138722303417, + "learning_rate": 0.00016039170965681255, + "loss": 12.3039, + "step": 11562 + }, + { + "epoch": 0.6296515624893645, + "grad_norm": 0.5907463783061986, + "learning_rate": 0.00016038468085458014, + "loss": 12.262, + "step": 11563 + }, + { + "epoch": 0.6297060164859475, + "grad_norm": 0.6172033679213188, + "learning_rate": 0.00016037765158278636, + "loss": 12.3686, + "step": 11564 + }, + { + "epoch": 0.6297604704825305, + "grad_norm": 0.6291212721767191, + "learning_rate": 0.00016037062184148576, + "loss": 12.2868, + "step": 11565 + }, + { + "epoch": 0.6298149244791135, + "grad_norm": 0.5517508471945034, + "learning_rate": 0.0001603635916307331, + "loss": 12.1767, + "step": 11566 + }, + { + "epoch": 0.6298693784756965, + "grad_norm": 0.5935989273602583, + "learning_rate": 0.00016035656095058308, + "loss": 12.4345, + "step": 11567 + }, + { + "epoch": 0.6299238324722796, + "grad_norm": 0.6190845781659723, + "learning_rate": 0.0001603495298010903, + "loss": 12.3404, + "step": 11568 + }, + { + "epoch": 0.6299782864688626, + "grad_norm": 0.5894594749346714, + "learning_rate": 0.00016034249818230943, + "loss": 12.2778, + "step": 11569 + }, + { + "epoch": 0.6300327404654456, + "grad_norm": 0.70588711193054, + "learning_rate": 0.0001603354660942952, + "loss": 12.291, + "step": 11570 + }, + { + "epoch": 0.6300871944620285, + "grad_norm": 0.6115013551554997, + "learning_rate": 0.00016032843353710224, + "loss": 12.2606, + "step": 11571 + }, + { + "epoch": 0.6301416484586115, + "grad_norm": 0.681756775481514, + "learning_rate": 0.00016032140051078532, + "loss": 12.402, + "step": 11572 + }, + { + "epoch": 0.6301961024551945, + "grad_norm": 0.7152374060110404, + "learning_rate": 0.000160314367015399, + "loss": 12.3076, + "step": 11573 + }, + { + "epoch": 0.6302505564517776, + "grad_norm": 0.6401385865783593, + "learning_rate": 0.0001603073330509981, + "loss": 12.2946, + "step": 11574 + }, + { + "epoch": 0.6303050104483606, + "grad_norm": 0.7067366584131319, + "learning_rate": 0.00016030029861763723, + "loss": 12.3269, + "step": 11575 + }, + { + "epoch": 0.6303594644449436, + "grad_norm": 0.5837445863525274, + "learning_rate": 0.00016029326371537115, + "loss": 12.2874, + "step": 11576 + }, + { + "epoch": 0.6304139184415266, + "grad_norm": 0.6664881012459636, + "learning_rate": 0.00016028622834425455, + "loss": 12.4066, + "step": 11577 + }, + { + "epoch": 0.6304683724381096, + "grad_norm": 0.5732167094095257, + "learning_rate": 0.0001602791925043421, + "loss": 12.2736, + "step": 11578 + }, + { + "epoch": 0.6305228264346926, + "grad_norm": 0.5701728735843571, + "learning_rate": 0.00016027215619568853, + "loss": 12.3236, + "step": 11579 + }, + { + "epoch": 0.6305772804312757, + "grad_norm": 0.6263392417410668, + "learning_rate": 0.00016026511941834862, + "loss": 12.3113, + "step": 11580 + }, + { + "epoch": 0.6306317344278587, + "grad_norm": 0.5986858779759879, + "learning_rate": 0.00016025808217237696, + "loss": 12.4141, + "step": 11581 + }, + { + "epoch": 0.6306861884244417, + "grad_norm": 0.6120558822405074, + "learning_rate": 0.0001602510444578284, + "loss": 12.3433, + "step": 11582 + }, + { + "epoch": 0.6307406424210247, + "grad_norm": 0.5997141772225506, + "learning_rate": 0.00016024400627475763, + "loss": 12.2342, + "step": 11583 + }, + { + "epoch": 0.6307950964176077, + "grad_norm": 0.6046918603331474, + "learning_rate": 0.00016023696762321933, + "loss": 12.4326, + "step": 11584 + }, + { + "epoch": 0.6308495504141907, + "grad_norm": 0.7145511224984784, + "learning_rate": 0.0001602299285032683, + "loss": 12.4418, + "step": 11585 + }, + { + "epoch": 0.6309040044107738, + "grad_norm": 0.5674460638373898, + "learning_rate": 0.00016022288891495918, + "loss": 12.3899, + "step": 11586 + }, + { + "epoch": 0.6309584584073568, + "grad_norm": 0.6078663393707878, + "learning_rate": 0.00016021584885834682, + "loss": 12.3836, + "step": 11587 + }, + { + "epoch": 0.6310129124039398, + "grad_norm": 0.6731437267177017, + "learning_rate": 0.00016020880833348593, + "loss": 12.4028, + "step": 11588 + }, + { + "epoch": 0.6310673664005227, + "grad_norm": 0.7464619177043753, + "learning_rate": 0.00016020176734043125, + "loss": 12.3038, + "step": 11589 + }, + { + "epoch": 0.6311218203971057, + "grad_norm": 0.6604754928634954, + "learning_rate": 0.0001601947258792375, + "loss": 12.3084, + "step": 11590 + }, + { + "epoch": 0.6311762743936887, + "grad_norm": 0.625782172031097, + "learning_rate": 0.00016018768394995947, + "loss": 12.319, + "step": 11591 + }, + { + "epoch": 0.6312307283902718, + "grad_norm": 0.6305290661457754, + "learning_rate": 0.00016018064155265196, + "loss": 12.3992, + "step": 11592 + }, + { + "epoch": 0.6312851823868548, + "grad_norm": 0.6876046735716144, + "learning_rate": 0.00016017359868736964, + "loss": 12.3389, + "step": 11593 + }, + { + "epoch": 0.6313396363834378, + "grad_norm": 0.6017411391397692, + "learning_rate": 0.00016016655535416735, + "loss": 12.3124, + "step": 11594 + }, + { + "epoch": 0.6313940903800208, + "grad_norm": 0.5489052272328483, + "learning_rate": 0.00016015951155309982, + "loss": 12.4313, + "step": 11595 + }, + { + "epoch": 0.6314485443766038, + "grad_norm": 0.5643980944234432, + "learning_rate": 0.00016015246728422186, + "loss": 12.3021, + "step": 11596 + }, + { + "epoch": 0.6315029983731868, + "grad_norm": 0.6283739932043341, + "learning_rate": 0.00016014542254758825, + "loss": 12.2458, + "step": 11597 + }, + { + "epoch": 0.6315574523697699, + "grad_norm": 0.6559159408597222, + "learning_rate": 0.0001601383773432537, + "loss": 12.3632, + "step": 11598 + }, + { + "epoch": 0.6316119063663529, + "grad_norm": 0.5743836786873061, + "learning_rate": 0.00016013133167127306, + "loss": 12.28, + "step": 11599 + }, + { + "epoch": 0.6316663603629359, + "grad_norm": 0.6037395609902132, + "learning_rate": 0.0001601242855317011, + "loss": 12.2813, + "step": 11600 + }, + { + "epoch": 0.6317208143595189, + "grad_norm": 0.8138665820981366, + "learning_rate": 0.0001601172389245926, + "loss": 12.3117, + "step": 11601 + }, + { + "epoch": 0.6317752683561019, + "grad_norm": 0.6117045153426184, + "learning_rate": 0.00016011019185000237, + "loss": 12.277, + "step": 11602 + }, + { + "epoch": 0.631829722352685, + "grad_norm": 0.6571968896939481, + "learning_rate": 0.00016010314430798522, + "loss": 12.3114, + "step": 11603 + }, + { + "epoch": 0.631884176349268, + "grad_norm": 0.5819354329428992, + "learning_rate": 0.00016009609629859598, + "loss": 12.3988, + "step": 11604 + }, + { + "epoch": 0.631938630345851, + "grad_norm": 0.6130792154075189, + "learning_rate": 0.00016008904782188936, + "loss": 12.3442, + "step": 11605 + }, + { + "epoch": 0.631993084342434, + "grad_norm": 0.6161087981605953, + "learning_rate": 0.00016008199887792022, + "loss": 12.1447, + "step": 11606 + }, + { + "epoch": 0.632047538339017, + "grad_norm": 0.59213108698951, + "learning_rate": 0.0001600749494667434, + "loss": 12.4355, + "step": 11607 + }, + { + "epoch": 0.6321019923355999, + "grad_norm": 0.5643485490047897, + "learning_rate": 0.00016006789958841373, + "loss": 12.3072, + "step": 11608 + }, + { + "epoch": 0.632156446332183, + "grad_norm": 0.6552253773174634, + "learning_rate": 0.00016006084924298597, + "loss": 12.29, + "step": 11609 + }, + { + "epoch": 0.632210900328766, + "grad_norm": 0.5629019501817991, + "learning_rate": 0.000160053798430515, + "loss": 12.3558, + "step": 11610 + }, + { + "epoch": 0.632265354325349, + "grad_norm": 0.6248217978673832, + "learning_rate": 0.00016004674715105558, + "loss": 12.1634, + "step": 11611 + }, + { + "epoch": 0.632319808321932, + "grad_norm": 0.7052249240995686, + "learning_rate": 0.0001600396954046626, + "loss": 12.4626, + "step": 11612 + }, + { + "epoch": 0.632374262318515, + "grad_norm": 0.6229537066548735, + "learning_rate": 0.00016003264319139088, + "loss": 12.2556, + "step": 11613 + }, + { + "epoch": 0.632428716315098, + "grad_norm": 0.6232155182387574, + "learning_rate": 0.0001600255905112953, + "loss": 12.3656, + "step": 11614 + }, + { + "epoch": 0.6324831703116811, + "grad_norm": 0.5985385901002135, + "learning_rate": 0.0001600185373644306, + "loss": 12.2761, + "step": 11615 + }, + { + "epoch": 0.6325376243082641, + "grad_norm": 0.5804199678921629, + "learning_rate": 0.00016001148375085168, + "loss": 12.335, + "step": 11616 + }, + { + "epoch": 0.6325920783048471, + "grad_norm": 0.6273430482799724, + "learning_rate": 0.00016000442967061346, + "loss": 12.2817, + "step": 11617 + }, + { + "epoch": 0.6326465323014301, + "grad_norm": 0.62391369595692, + "learning_rate": 0.00015999737512377072, + "loss": 12.1001, + "step": 11618 + }, + { + "epoch": 0.6327009862980131, + "grad_norm": 0.6135274564031256, + "learning_rate": 0.0001599903201103783, + "loss": 12.3653, + "step": 11619 + }, + { + "epoch": 0.6327554402945961, + "grad_norm": 0.5769543408166947, + "learning_rate": 0.0001599832646304911, + "loss": 12.2763, + "step": 11620 + }, + { + "epoch": 0.6328098942911792, + "grad_norm": 0.6553316798350617, + "learning_rate": 0.00015997620868416396, + "loss": 12.3955, + "step": 11621 + }, + { + "epoch": 0.6328643482877622, + "grad_norm": 0.6165252378482168, + "learning_rate": 0.00015996915227145178, + "loss": 12.3882, + "step": 11622 + }, + { + "epoch": 0.6329188022843452, + "grad_norm": 0.578197074875526, + "learning_rate": 0.00015996209539240942, + "loss": 12.2141, + "step": 11623 + }, + { + "epoch": 0.6329732562809282, + "grad_norm": 0.6546997218876552, + "learning_rate": 0.00015995503804709175, + "loss": 12.2659, + "step": 11624 + }, + { + "epoch": 0.6330277102775111, + "grad_norm": 0.5346845777961047, + "learning_rate": 0.00015994798023555363, + "loss": 12.4347, + "step": 11625 + }, + { + "epoch": 0.6330821642740941, + "grad_norm": 0.7272621472840548, + "learning_rate": 0.00015994092195785, + "loss": 12.3033, + "step": 11626 + }, + { + "epoch": 0.6331366182706772, + "grad_norm": 0.5912965473134938, + "learning_rate": 0.0001599338632140357, + "loss": 12.3621, + "step": 11627 + }, + { + "epoch": 0.6331910722672602, + "grad_norm": 0.6084544466595607, + "learning_rate": 0.0001599268040041656, + "loss": 12.3431, + "step": 11628 + }, + { + "epoch": 0.6332455262638432, + "grad_norm": 0.6195207017613764, + "learning_rate": 0.00015991974432829468, + "loss": 12.2912, + "step": 11629 + }, + { + "epoch": 0.6332999802604262, + "grad_norm": 0.6689588330255158, + "learning_rate": 0.00015991268418647772, + "loss": 12.2959, + "step": 11630 + }, + { + "epoch": 0.6333544342570092, + "grad_norm": 0.566247369536983, + "learning_rate": 0.00015990562357876968, + "loss": 12.2669, + "step": 11631 + }, + { + "epoch": 0.6334088882535923, + "grad_norm": 0.7277889233888325, + "learning_rate": 0.00015989856250522548, + "loss": 12.4804, + "step": 11632 + }, + { + "epoch": 0.6334633422501753, + "grad_norm": 0.6384352082662375, + "learning_rate": 0.00015989150096590003, + "loss": 12.3755, + "step": 11633 + }, + { + "epoch": 0.6335177962467583, + "grad_norm": 0.7476197942467356, + "learning_rate": 0.00015988443896084822, + "loss": 12.3098, + "step": 11634 + }, + { + "epoch": 0.6335722502433413, + "grad_norm": 0.6117565640338275, + "learning_rate": 0.00015987737649012497, + "loss": 12.2888, + "step": 11635 + }, + { + "epoch": 0.6336267042399243, + "grad_norm": 0.5877500335920101, + "learning_rate": 0.00015987031355378518, + "loss": 12.3271, + "step": 11636 + }, + { + "epoch": 0.6336811582365073, + "grad_norm": 0.7537103433883942, + "learning_rate": 0.0001598632501518838, + "loss": 12.3379, + "step": 11637 + }, + { + "epoch": 0.6337356122330904, + "grad_norm": 0.562326931604982, + "learning_rate": 0.00015985618628447577, + "loss": 12.1521, + "step": 11638 + }, + { + "epoch": 0.6337900662296734, + "grad_norm": 0.6720970545937142, + "learning_rate": 0.00015984912195161595, + "loss": 12.2289, + "step": 11639 + }, + { + "epoch": 0.6338445202262564, + "grad_norm": 0.7437459848399414, + "learning_rate": 0.00015984205715335935, + "loss": 12.2812, + "step": 11640 + }, + { + "epoch": 0.6338989742228394, + "grad_norm": 0.5977958341585544, + "learning_rate": 0.00015983499188976087, + "loss": 12.3845, + "step": 11641 + }, + { + "epoch": 0.6339534282194224, + "grad_norm": 0.6642266351088731, + "learning_rate": 0.00015982792616087545, + "loss": 12.3436, + "step": 11642 + }, + { + "epoch": 0.6340078822160053, + "grad_norm": 0.6253891353430321, + "learning_rate": 0.0001598208599667581, + "loss": 12.271, + "step": 11643 + }, + { + "epoch": 0.6340623362125885, + "grad_norm": 0.6099370956862907, + "learning_rate": 0.00015981379330746363, + "loss": 12.3552, + "step": 11644 + }, + { + "epoch": 0.6341167902091714, + "grad_norm": 0.6318726619121243, + "learning_rate": 0.0001598067261830471, + "loss": 12.2443, + "step": 11645 + }, + { + "epoch": 0.6341712442057544, + "grad_norm": 0.5609364407721689, + "learning_rate": 0.00015979965859356347, + "loss": 12.4605, + "step": 11646 + }, + { + "epoch": 0.6342256982023374, + "grad_norm": 0.702645292105402, + "learning_rate": 0.00015979259053906764, + "loss": 12.3523, + "step": 11647 + }, + { + "epoch": 0.6342801521989204, + "grad_norm": 0.6312822853043644, + "learning_rate": 0.00015978552201961464, + "loss": 12.3942, + "step": 11648 + }, + { + "epoch": 0.6343346061955034, + "grad_norm": 0.6827895428823186, + "learning_rate": 0.00015977845303525933, + "loss": 12.3318, + "step": 11649 + }, + { + "epoch": 0.6343890601920865, + "grad_norm": 0.5633296696619046, + "learning_rate": 0.00015977138358605676, + "loss": 12.3435, + "step": 11650 + }, + { + "epoch": 0.6344435141886695, + "grad_norm": 0.6803882086394497, + "learning_rate": 0.00015976431367206191, + "loss": 12.2223, + "step": 11651 + }, + { + "epoch": 0.6344979681852525, + "grad_norm": 0.5938162328387466, + "learning_rate": 0.00015975724329332972, + "loss": 12.2128, + "step": 11652 + }, + { + "epoch": 0.6345524221818355, + "grad_norm": 0.6310709849247171, + "learning_rate": 0.00015975017244991522, + "loss": 12.3975, + "step": 11653 + }, + { + "epoch": 0.6346068761784185, + "grad_norm": 0.6423715628698327, + "learning_rate": 0.0001597431011418733, + "loss": 12.3256, + "step": 11654 + }, + { + "epoch": 0.6346613301750015, + "grad_norm": 0.6112767294965891, + "learning_rate": 0.00015973602936925904, + "loss": 12.2323, + "step": 11655 + }, + { + "epoch": 0.6347157841715846, + "grad_norm": 0.63268138600798, + "learning_rate": 0.0001597289571321274, + "loss": 12.2258, + "step": 11656 + }, + { + "epoch": 0.6347702381681676, + "grad_norm": 0.6132891422810992, + "learning_rate": 0.00015972188443053335, + "loss": 12.2712, + "step": 11657 + }, + { + "epoch": 0.6348246921647506, + "grad_norm": 0.6007827090687096, + "learning_rate": 0.00015971481126453196, + "loss": 12.3272, + "step": 11658 + }, + { + "epoch": 0.6348791461613336, + "grad_norm": 0.5534845093934893, + "learning_rate": 0.00015970773763417814, + "loss": 12.3964, + "step": 11659 + }, + { + "epoch": 0.6349336001579166, + "grad_norm": 0.5471382906969506, + "learning_rate": 0.00015970066353952696, + "loss": 12.2745, + "step": 11660 + }, + { + "epoch": 0.6349880541544995, + "grad_norm": 0.5633804601387051, + "learning_rate": 0.0001596935889806334, + "loss": 12.3712, + "step": 11661 + }, + { + "epoch": 0.6350425081510827, + "grad_norm": 0.5419783900118207, + "learning_rate": 0.00015968651395755246, + "loss": 12.2674, + "step": 11662 + }, + { + "epoch": 0.6350969621476656, + "grad_norm": 0.6110092220859774, + "learning_rate": 0.00015967943847033922, + "loss": 12.2362, + "step": 11663 + }, + { + "epoch": 0.6351514161442486, + "grad_norm": 0.5929826979767274, + "learning_rate": 0.00015967236251904863, + "loss": 12.3111, + "step": 11664 + }, + { + "epoch": 0.6352058701408316, + "grad_norm": 0.6319140215443649, + "learning_rate": 0.00015966528610373572, + "loss": 12.2552, + "step": 11665 + }, + { + "epoch": 0.6352603241374146, + "grad_norm": 0.6250575551924299, + "learning_rate": 0.00015965820922445557, + "loss": 12.1944, + "step": 11666 + }, + { + "epoch": 0.6353147781339977, + "grad_norm": 0.634596741902867, + "learning_rate": 0.0001596511318812632, + "loss": 12.4088, + "step": 11667 + }, + { + "epoch": 0.6353692321305807, + "grad_norm": 0.5708759498126883, + "learning_rate": 0.00015964405407421358, + "loss": 12.4004, + "step": 11668 + }, + { + "epoch": 0.6354236861271637, + "grad_norm": 0.6274291085104535, + "learning_rate": 0.00015963697580336183, + "loss": 12.3656, + "step": 11669 + }, + { + "epoch": 0.6354781401237467, + "grad_norm": 0.6381275977567078, + "learning_rate": 0.00015962989706876294, + "loss": 12.282, + "step": 11670 + }, + { + "epoch": 0.6355325941203297, + "grad_norm": 0.5877263890706275, + "learning_rate": 0.00015962281787047197, + "loss": 12.4272, + "step": 11671 + }, + { + "epoch": 0.6355870481169127, + "grad_norm": 0.5476695346740104, + "learning_rate": 0.00015961573820854396, + "loss": 12.2424, + "step": 11672 + }, + { + "epoch": 0.6356415021134958, + "grad_norm": 0.5647625962262167, + "learning_rate": 0.000159608658083034, + "loss": 12.2057, + "step": 11673 + }, + { + "epoch": 0.6356959561100788, + "grad_norm": 0.6079880681478522, + "learning_rate": 0.0001596015774939971, + "loss": 12.4301, + "step": 11674 + }, + { + "epoch": 0.6357504101066618, + "grad_norm": 0.627898353169781, + "learning_rate": 0.00015959449644148833, + "loss": 12.2848, + "step": 11675 + }, + { + "epoch": 0.6358048641032448, + "grad_norm": 0.6000947735191888, + "learning_rate": 0.00015958741492556278, + "loss": 12.5301, + "step": 11676 + }, + { + "epoch": 0.6358593180998278, + "grad_norm": 0.6383110683869572, + "learning_rate": 0.0001595803329462755, + "loss": 12.3761, + "step": 11677 + }, + { + "epoch": 0.6359137720964108, + "grad_norm": 0.6226714343986417, + "learning_rate": 0.00015957325050368156, + "loss": 12.2989, + "step": 11678 + }, + { + "epoch": 0.6359682260929939, + "grad_norm": 0.5994882158830673, + "learning_rate": 0.00015956616759783604, + "loss": 12.4552, + "step": 11679 + }, + { + "epoch": 0.6360226800895769, + "grad_norm": 0.6428334737207511, + "learning_rate": 0.00015955908422879395, + "loss": 12.2438, + "step": 11680 + }, + { + "epoch": 0.6360771340861598, + "grad_norm": 0.6423154220996011, + "learning_rate": 0.00015955200039661049, + "loss": 12.4005, + "step": 11681 + }, + { + "epoch": 0.6361315880827428, + "grad_norm": 0.5486200532788184, + "learning_rate": 0.00015954491610134066, + "loss": 12.3216, + "step": 11682 + }, + { + "epoch": 0.6361860420793258, + "grad_norm": 0.6309935704318574, + "learning_rate": 0.00015953783134303962, + "loss": 12.2927, + "step": 11683 + }, + { + "epoch": 0.6362404960759088, + "grad_norm": 0.5824870916214382, + "learning_rate": 0.00015953074612176237, + "loss": 12.3833, + "step": 11684 + }, + { + "epoch": 0.6362949500724919, + "grad_norm": 0.6877518652979272, + "learning_rate": 0.0001595236604375641, + "loss": 12.3747, + "step": 11685 + }, + { + "epoch": 0.6363494040690749, + "grad_norm": 0.6248540007950087, + "learning_rate": 0.00015951657429049982, + "loss": 12.4178, + "step": 11686 + }, + { + "epoch": 0.6364038580656579, + "grad_norm": 0.5595550579826128, + "learning_rate": 0.0001595094876806247, + "loss": 12.3002, + "step": 11687 + }, + { + "epoch": 0.6364583120622409, + "grad_norm": 0.5609830813350578, + "learning_rate": 0.00015950240060799383, + "loss": 12.1729, + "step": 11688 + }, + { + "epoch": 0.6365127660588239, + "grad_norm": 0.6840906942224464, + "learning_rate": 0.00015949531307266233, + "loss": 12.2164, + "step": 11689 + }, + { + "epoch": 0.6365672200554069, + "grad_norm": 0.6102936461713946, + "learning_rate": 0.00015948822507468526, + "loss": 12.2205, + "step": 11690 + }, + { + "epoch": 0.63662167405199, + "grad_norm": 0.7109447118428419, + "learning_rate": 0.00015948113661411778, + "loss": 12.2895, + "step": 11691 + }, + { + "epoch": 0.636676128048573, + "grad_norm": 0.5232082210855122, + "learning_rate": 0.000159474047691015, + "loss": 12.3388, + "step": 11692 + }, + { + "epoch": 0.636730582045156, + "grad_norm": 0.6181267829552438, + "learning_rate": 0.0001594669583054321, + "loss": 12.4013, + "step": 11693 + }, + { + "epoch": 0.636785036041739, + "grad_norm": 0.5853241566666262, + "learning_rate": 0.00015945986845742414, + "loss": 12.2416, + "step": 11694 + }, + { + "epoch": 0.636839490038322, + "grad_norm": 0.6440508892552266, + "learning_rate": 0.00015945277814704623, + "loss": 12.3312, + "step": 11695 + }, + { + "epoch": 0.636893944034905, + "grad_norm": 0.5764079526694624, + "learning_rate": 0.0001594456873743536, + "loss": 12.3378, + "step": 11696 + }, + { + "epoch": 0.6369483980314881, + "grad_norm": 0.5912901229404963, + "learning_rate": 0.0001594385961394013, + "loss": 12.2194, + "step": 11697 + }, + { + "epoch": 0.637002852028071, + "grad_norm": 0.6094323269680411, + "learning_rate": 0.00015943150444224454, + "loss": 12.319, + "step": 11698 + }, + { + "epoch": 0.637057306024654, + "grad_norm": 0.5635457642014866, + "learning_rate": 0.0001594244122829384, + "loss": 12.3567, + "step": 11699 + }, + { + "epoch": 0.637111760021237, + "grad_norm": 0.587726048800709, + "learning_rate": 0.00015941731966153807, + "loss": 12.3092, + "step": 11700 + }, + { + "epoch": 0.63716621401782, + "grad_norm": 0.6048198304309254, + "learning_rate": 0.0001594102265780987, + "loss": 12.3443, + "step": 11701 + }, + { + "epoch": 0.6372206680144031, + "grad_norm": 0.5849478044884568, + "learning_rate": 0.00015940313303267547, + "loss": 12.3281, + "step": 11702 + }, + { + "epoch": 0.6372751220109861, + "grad_norm": 0.6299700554928337, + "learning_rate": 0.0001593960390253235, + "loss": 12.3386, + "step": 11703 + }, + { + "epoch": 0.6373295760075691, + "grad_norm": 0.7091301677395995, + "learning_rate": 0.00015938894455609797, + "loss": 12.2443, + "step": 11704 + }, + { + "epoch": 0.6373840300041521, + "grad_norm": 0.6079507577811608, + "learning_rate": 0.00015938184962505404, + "loss": 12.2513, + "step": 11705 + }, + { + "epoch": 0.6374384840007351, + "grad_norm": 0.6279045948807624, + "learning_rate": 0.00015937475423224688, + "loss": 12.3791, + "step": 11706 + }, + { + "epoch": 0.6374929379973181, + "grad_norm": 0.5422081747141756, + "learning_rate": 0.00015936765837773172, + "loss": 12.2061, + "step": 11707 + }, + { + "epoch": 0.6375473919939012, + "grad_norm": 0.5599573109726491, + "learning_rate": 0.00015936056206156365, + "loss": 12.1982, + "step": 11708 + }, + { + "epoch": 0.6376018459904842, + "grad_norm": 0.6439053391485484, + "learning_rate": 0.00015935346528379792, + "loss": 12.3223, + "step": 11709 + }, + { + "epoch": 0.6376562999870672, + "grad_norm": 0.6076777657170431, + "learning_rate": 0.00015934636804448968, + "loss": 12.2359, + "step": 11710 + }, + { + "epoch": 0.6377107539836502, + "grad_norm": 0.6062940962482165, + "learning_rate": 0.0001593392703436941, + "loss": 12.2789, + "step": 11711 + }, + { + "epoch": 0.6377652079802332, + "grad_norm": 0.6090446598951548, + "learning_rate": 0.00015933217218146644, + "loss": 12.3359, + "step": 11712 + }, + { + "epoch": 0.6378196619768162, + "grad_norm": 0.5609615308756994, + "learning_rate": 0.00015932507355786182, + "loss": 12.3445, + "step": 11713 + }, + { + "epoch": 0.6378741159733993, + "grad_norm": 0.6507111609328309, + "learning_rate": 0.00015931797447293552, + "loss": 12.255, + "step": 11714 + }, + { + "epoch": 0.6379285699699823, + "grad_norm": 0.683631714011919, + "learning_rate": 0.0001593108749267427, + "loss": 12.21, + "step": 11715 + }, + { + "epoch": 0.6379830239665653, + "grad_norm": 0.5426813755368318, + "learning_rate": 0.00015930377491933854, + "loss": 12.2444, + "step": 11716 + }, + { + "epoch": 0.6380374779631482, + "grad_norm": 0.6285851116986505, + "learning_rate": 0.0001592966744507783, + "loss": 12.4862, + "step": 11717 + }, + { + "epoch": 0.6380919319597312, + "grad_norm": 0.6077576905377188, + "learning_rate": 0.0001592895735211172, + "loss": 12.3795, + "step": 11718 + }, + { + "epoch": 0.6381463859563142, + "grad_norm": 0.641708820048512, + "learning_rate": 0.0001592824721304104, + "loss": 12.2624, + "step": 11719 + }, + { + "epoch": 0.6382008399528973, + "grad_norm": 0.5832512250990588, + "learning_rate": 0.00015927537027871316, + "loss": 12.4107, + "step": 11720 + }, + { + "epoch": 0.6382552939494803, + "grad_norm": 0.5560870907638484, + "learning_rate": 0.00015926826796608073, + "loss": 12.2068, + "step": 11721 + }, + { + "epoch": 0.6383097479460633, + "grad_norm": 0.621569606432972, + "learning_rate": 0.00015926116519256826, + "loss": 12.1528, + "step": 11722 + }, + { + "epoch": 0.6383642019426463, + "grad_norm": 0.6470496143775933, + "learning_rate": 0.00015925406195823108, + "loss": 12.2376, + "step": 11723 + }, + { + "epoch": 0.6384186559392293, + "grad_norm": 0.5339169789036717, + "learning_rate": 0.00015924695826312435, + "loss": 12.3241, + "step": 11724 + }, + { + "epoch": 0.6384731099358123, + "grad_norm": 0.5973303793387309, + "learning_rate": 0.00015923985410730334, + "loss": 12.3649, + "step": 11725 + }, + { + "epoch": 0.6385275639323954, + "grad_norm": 0.6828561520479584, + "learning_rate": 0.00015923274949082328, + "loss": 12.4293, + "step": 11726 + }, + { + "epoch": 0.6385820179289784, + "grad_norm": 0.6080382653576513, + "learning_rate": 0.00015922564441373945, + "loss": 12.3078, + "step": 11727 + }, + { + "epoch": 0.6386364719255614, + "grad_norm": 0.6970886252419852, + "learning_rate": 0.00015921853887610707, + "loss": 12.3816, + "step": 11728 + }, + { + "epoch": 0.6386909259221444, + "grad_norm": 0.5930463408717045, + "learning_rate": 0.00015921143287798138, + "loss": 12.4402, + "step": 11729 + }, + { + "epoch": 0.6387453799187274, + "grad_norm": 0.5680002960969729, + "learning_rate": 0.00015920432641941768, + "loss": 12.455, + "step": 11730 + }, + { + "epoch": 0.6387998339153104, + "grad_norm": 0.5601685294026225, + "learning_rate": 0.00015919721950047119, + "loss": 12.3696, + "step": 11731 + }, + { + "epoch": 0.6388542879118935, + "grad_norm": 0.5913415111135867, + "learning_rate": 0.0001591901121211972, + "loss": 12.2751, + "step": 11732 + }, + { + "epoch": 0.6389087419084765, + "grad_norm": 0.559035357315235, + "learning_rate": 0.00015918300428165099, + "loss": 12.3311, + "step": 11733 + }, + { + "epoch": 0.6389631959050595, + "grad_norm": 0.7960160680407351, + "learning_rate": 0.0001591758959818878, + "loss": 12.561, + "step": 11734 + }, + { + "epoch": 0.6390176499016424, + "grad_norm": 0.5580073249870375, + "learning_rate": 0.00015916878722196291, + "loss": 12.3125, + "step": 11735 + }, + { + "epoch": 0.6390721038982254, + "grad_norm": 0.5980153973018723, + "learning_rate": 0.00015916167800193162, + "loss": 12.2887, + "step": 11736 + }, + { + "epoch": 0.6391265578948085, + "grad_norm": 0.6145853706411524, + "learning_rate": 0.00015915456832184922, + "loss": 12.3097, + "step": 11737 + }, + { + "epoch": 0.6391810118913915, + "grad_norm": 0.5761368251670567, + "learning_rate": 0.00015914745818177095, + "loss": 12.1488, + "step": 11738 + }, + { + "epoch": 0.6392354658879745, + "grad_norm": 0.5720363711923273, + "learning_rate": 0.00015914034758175211, + "loss": 12.3468, + "step": 11739 + }, + { + "epoch": 0.6392899198845575, + "grad_norm": 0.5730666501611782, + "learning_rate": 0.00015913323652184803, + "loss": 12.2901, + "step": 11740 + }, + { + "epoch": 0.6393443738811405, + "grad_norm": 0.5967192597957882, + "learning_rate": 0.000159126125002114, + "loss": 12.133, + "step": 11741 + }, + { + "epoch": 0.6393988278777235, + "grad_norm": 0.5847137465942532, + "learning_rate": 0.00015911901302260528, + "loss": 12.2619, + "step": 11742 + }, + { + "epoch": 0.6394532818743066, + "grad_norm": 0.6527293239756213, + "learning_rate": 0.00015911190058337722, + "loss": 12.3478, + "step": 11743 + }, + { + "epoch": 0.6395077358708896, + "grad_norm": 0.6833711071046793, + "learning_rate": 0.0001591047876844851, + "loss": 12.3742, + "step": 11744 + }, + { + "epoch": 0.6395621898674726, + "grad_norm": 0.6587852445363322, + "learning_rate": 0.00015909767432598426, + "loss": 12.274, + "step": 11745 + }, + { + "epoch": 0.6396166438640556, + "grad_norm": 0.6486099259602204, + "learning_rate": 0.00015909056050792994, + "loss": 12.3702, + "step": 11746 + }, + { + "epoch": 0.6396710978606386, + "grad_norm": 0.7265266626848722, + "learning_rate": 0.00015908344623037757, + "loss": 12.1387, + "step": 11747 + }, + { + "epoch": 0.6397255518572216, + "grad_norm": 0.6899028392303771, + "learning_rate": 0.00015907633149338238, + "loss": 12.334, + "step": 11748 + }, + { + "epoch": 0.6397800058538047, + "grad_norm": 0.5360899758896915, + "learning_rate": 0.00015906921629699975, + "loss": 12.3807, + "step": 11749 + }, + { + "epoch": 0.6398344598503877, + "grad_norm": 0.5463456124183731, + "learning_rate": 0.00015906210064128498, + "loss": 12.3499, + "step": 11750 + }, + { + "epoch": 0.6398889138469707, + "grad_norm": 0.6538197908044291, + "learning_rate": 0.00015905498452629342, + "loss": 12.2644, + "step": 11751 + }, + { + "epoch": 0.6399433678435537, + "grad_norm": 0.6847980090091363, + "learning_rate": 0.00015904786795208038, + "loss": 12.4953, + "step": 11752 + }, + { + "epoch": 0.6399978218401366, + "grad_norm": 0.5717753214335817, + "learning_rate": 0.0001590407509187012, + "loss": 12.3944, + "step": 11753 + }, + { + "epoch": 0.6400522758367196, + "grad_norm": 0.5897375896614131, + "learning_rate": 0.0001590336334262113, + "loss": 12.3208, + "step": 11754 + }, + { + "epoch": 0.6401067298333027, + "grad_norm": 0.6509711952414429, + "learning_rate": 0.00015902651547466593, + "loss": 12.3059, + "step": 11755 + }, + { + "epoch": 0.6401611838298857, + "grad_norm": 0.5686185105629006, + "learning_rate": 0.00015901939706412045, + "loss": 12.3686, + "step": 11756 + }, + { + "epoch": 0.6402156378264687, + "grad_norm": 0.5761598035037799, + "learning_rate": 0.00015901227819463024, + "loss": 12.292, + "step": 11757 + }, + { + "epoch": 0.6402700918230517, + "grad_norm": 0.6233562948043391, + "learning_rate": 0.0001590051588662507, + "loss": 12.2645, + "step": 11758 + }, + { + "epoch": 0.6403245458196347, + "grad_norm": 0.5431735823508531, + "learning_rate": 0.00015899803907903714, + "loss": 12.3071, + "step": 11759 + }, + { + "epoch": 0.6403789998162177, + "grad_norm": 0.5844067916605888, + "learning_rate": 0.00015899091883304494, + "loss": 12.3562, + "step": 11760 + }, + { + "epoch": 0.6404334538128008, + "grad_norm": 0.5775938645263473, + "learning_rate": 0.00015898379812832945, + "loss": 12.2704, + "step": 11761 + }, + { + "epoch": 0.6404879078093838, + "grad_norm": 0.6114526535574534, + "learning_rate": 0.00015897667696494606, + "loss": 12.2708, + "step": 11762 + }, + { + "epoch": 0.6405423618059668, + "grad_norm": 0.543502985810955, + "learning_rate": 0.0001589695553429501, + "loss": 12.2512, + "step": 11763 + }, + { + "epoch": 0.6405968158025498, + "grad_norm": 0.5491499746554386, + "learning_rate": 0.00015896243326239703, + "loss": 12.1462, + "step": 11764 + }, + { + "epoch": 0.6406512697991328, + "grad_norm": 0.659161048417712, + "learning_rate": 0.00015895531072334217, + "loss": 12.2656, + "step": 11765 + }, + { + "epoch": 0.6407057237957159, + "grad_norm": 0.5578738711061708, + "learning_rate": 0.00015894818772584096, + "loss": 12.2665, + "step": 11766 + }, + { + "epoch": 0.6407601777922989, + "grad_norm": 0.5485848619593658, + "learning_rate": 0.00015894106426994875, + "loss": 12.3885, + "step": 11767 + }, + { + "epoch": 0.6408146317888819, + "grad_norm": 0.6042684554813096, + "learning_rate": 0.0001589339403557209, + "loss": 12.268, + "step": 11768 + }, + { + "epoch": 0.6408690857854649, + "grad_norm": 0.5603475640961582, + "learning_rate": 0.0001589268159832129, + "loss": 12.2199, + "step": 11769 + }, + { + "epoch": 0.6409235397820479, + "grad_norm": 0.6086400504391121, + "learning_rate": 0.00015891969115248007, + "loss": 12.3038, + "step": 11770 + }, + { + "epoch": 0.6409779937786309, + "grad_norm": 0.7496642611061374, + "learning_rate": 0.00015891256586357782, + "loss": 12.3797, + "step": 11771 + }, + { + "epoch": 0.641032447775214, + "grad_norm": 0.6043326573937473, + "learning_rate": 0.00015890544011656161, + "loss": 12.3197, + "step": 11772 + }, + { + "epoch": 0.641086901771797, + "grad_norm": 0.5879429958227689, + "learning_rate": 0.0001588983139114868, + "loss": 12.3328, + "step": 11773 + }, + { + "epoch": 0.6411413557683799, + "grad_norm": 0.6431041182157121, + "learning_rate": 0.00015889118724840887, + "loss": 12.5443, + "step": 11774 + }, + { + "epoch": 0.6411958097649629, + "grad_norm": 0.6387769615888651, + "learning_rate": 0.00015888406012738314, + "loss": 12.2873, + "step": 11775 + }, + { + "epoch": 0.6412502637615459, + "grad_norm": 0.5907844038891188, + "learning_rate": 0.00015887693254846509, + "loss": 12.1766, + "step": 11776 + }, + { + "epoch": 0.6413047177581289, + "grad_norm": 0.8089658170344911, + "learning_rate": 0.0001588698045117101, + "loss": 12.319, + "step": 11777 + }, + { + "epoch": 0.641359171754712, + "grad_norm": 0.5795627418817415, + "learning_rate": 0.00015886267601717373, + "loss": 12.1944, + "step": 11778 + }, + { + "epoch": 0.641413625751295, + "grad_norm": 0.6330996063985966, + "learning_rate": 0.00015885554706491126, + "loss": 12.3079, + "step": 11779 + }, + { + "epoch": 0.641468079747878, + "grad_norm": 0.6312142988227856, + "learning_rate": 0.0001588484176549782, + "loss": 12.2919, + "step": 11780 + }, + { + "epoch": 0.641522533744461, + "grad_norm": 0.7992335536619127, + "learning_rate": 0.00015884128778743, + "loss": 12.3403, + "step": 11781 + }, + { + "epoch": 0.641576987741044, + "grad_norm": 0.6054780153306412, + "learning_rate": 0.00015883415746232205, + "loss": 12.1981, + "step": 11782 + }, + { + "epoch": 0.641631441737627, + "grad_norm": 0.6121608821672163, + "learning_rate": 0.00015882702667970982, + "loss": 12.237, + "step": 11783 + }, + { + "epoch": 0.6416858957342101, + "grad_norm": 0.6458800184805136, + "learning_rate": 0.00015881989543964877, + "loss": 12.3485, + "step": 11784 + }, + { + "epoch": 0.6417403497307931, + "grad_norm": 0.610389351156267, + "learning_rate": 0.00015881276374219436, + "loss": 12.2533, + "step": 11785 + }, + { + "epoch": 0.6417948037273761, + "grad_norm": 0.6011654744945086, + "learning_rate": 0.00015880563158740202, + "loss": 12.2968, + "step": 11786 + }, + { + "epoch": 0.6418492577239591, + "grad_norm": 0.6489037502758117, + "learning_rate": 0.00015879849897532724, + "loss": 12.4367, + "step": 11787 + }, + { + "epoch": 0.6419037117205421, + "grad_norm": 0.6295543987341068, + "learning_rate": 0.00015879136590602545, + "loss": 12.2218, + "step": 11788 + }, + { + "epoch": 0.641958165717125, + "grad_norm": 0.7691534351517384, + "learning_rate": 0.00015878423237955218, + "loss": 12.3219, + "step": 11789 + }, + { + "epoch": 0.6420126197137082, + "grad_norm": 0.7059532026340123, + "learning_rate": 0.00015877709839596285, + "loss": 12.2999, + "step": 11790 + }, + { + "epoch": 0.6420670737102911, + "grad_norm": 0.6641858468829318, + "learning_rate": 0.0001587699639553129, + "loss": 12.3139, + "step": 11791 + }, + { + "epoch": 0.6421215277068741, + "grad_norm": 0.6673144977966753, + "learning_rate": 0.0001587628290576579, + "loss": 12.4529, + "step": 11792 + }, + { + "epoch": 0.6421759817034571, + "grad_norm": 0.7522695067144315, + "learning_rate": 0.0001587556937030533, + "loss": 12.2677, + "step": 11793 + }, + { + "epoch": 0.6422304357000401, + "grad_norm": 0.533954295083735, + "learning_rate": 0.00015874855789155455, + "loss": 12.2353, + "step": 11794 + }, + { + "epoch": 0.6422848896966231, + "grad_norm": 0.6863388740679423, + "learning_rate": 0.0001587414216232171, + "loss": 12.3066, + "step": 11795 + }, + { + "epoch": 0.6423393436932062, + "grad_norm": 0.7028548606498654, + "learning_rate": 0.0001587342848980966, + "loss": 12.3073, + "step": 11796 + }, + { + "epoch": 0.6423937976897892, + "grad_norm": 0.5838235714771344, + "learning_rate": 0.0001587271477162484, + "loss": 12.3099, + "step": 11797 + }, + { + "epoch": 0.6424482516863722, + "grad_norm": 0.5491865981727515, + "learning_rate": 0.0001587200100777281, + "loss": 12.3046, + "step": 11798 + }, + { + "epoch": 0.6425027056829552, + "grad_norm": 0.6396741653719359, + "learning_rate": 0.00015871287198259112, + "loss": 12.4255, + "step": 11799 + }, + { + "epoch": 0.6425571596795382, + "grad_norm": 0.5941077393260389, + "learning_rate": 0.00015870573343089298, + "loss": 12.3175, + "step": 11800 + }, + { + "epoch": 0.6426116136761213, + "grad_norm": 0.5365098578871422, + "learning_rate": 0.00015869859442268924, + "loss": 12.2355, + "step": 11801 + }, + { + "epoch": 0.6426660676727043, + "grad_norm": 0.629599542239981, + "learning_rate": 0.00015869145495803539, + "loss": 12.2843, + "step": 11802 + }, + { + "epoch": 0.6427205216692873, + "grad_norm": 0.6976645290629956, + "learning_rate": 0.0001586843150369869, + "loss": 12.533, + "step": 11803 + }, + { + "epoch": 0.6427749756658703, + "grad_norm": 0.6257102830411845, + "learning_rate": 0.0001586771746595994, + "loss": 12.4957, + "step": 11804 + }, + { + "epoch": 0.6428294296624533, + "grad_norm": 0.5139680368557052, + "learning_rate": 0.0001586700338259283, + "loss": 12.3099, + "step": 11805 + }, + { + "epoch": 0.6428838836590363, + "grad_norm": 0.6795158134138467, + "learning_rate": 0.0001586628925360292, + "loss": 12.3531, + "step": 11806 + }, + { + "epoch": 0.6429383376556194, + "grad_norm": 0.6336616387516697, + "learning_rate": 0.0001586557507899576, + "loss": 12.2245, + "step": 11807 + }, + { + "epoch": 0.6429927916522024, + "grad_norm": 0.5827707567392567, + "learning_rate": 0.00015864860858776908, + "loss": 12.2562, + "step": 11808 + }, + { + "epoch": 0.6430472456487853, + "grad_norm": 0.6793117933085399, + "learning_rate": 0.0001586414659295191, + "loss": 12.4026, + "step": 11809 + }, + { + "epoch": 0.6431016996453683, + "grad_norm": 0.7724808780527022, + "learning_rate": 0.00015863432281526326, + "loss": 12.2154, + "step": 11810 + }, + { + "epoch": 0.6431561536419513, + "grad_norm": 0.669858522966245, + "learning_rate": 0.0001586271792450571, + "loss": 12.2635, + "step": 11811 + }, + { + "epoch": 0.6432106076385343, + "grad_norm": 0.602202572719811, + "learning_rate": 0.00015862003521895614, + "loss": 12.1333, + "step": 11812 + }, + { + "epoch": 0.6432650616351174, + "grad_norm": 0.5983408313173444, + "learning_rate": 0.00015861289073701597, + "loss": 12.3779, + "step": 11813 + }, + { + "epoch": 0.6433195156317004, + "grad_norm": 0.6113563849314245, + "learning_rate": 0.00015860574579929215, + "loss": 12.3404, + "step": 11814 + }, + { + "epoch": 0.6433739696282834, + "grad_norm": 0.6883609075478769, + "learning_rate": 0.0001585986004058402, + "loss": 12.4151, + "step": 11815 + }, + { + "epoch": 0.6434284236248664, + "grad_norm": 0.6060410474299993, + "learning_rate": 0.0001585914545567157, + "loss": 12.3261, + "step": 11816 + }, + { + "epoch": 0.6434828776214494, + "grad_norm": 0.549030679214863, + "learning_rate": 0.00015858430825197426, + "loss": 12.3239, + "step": 11817 + }, + { + "epoch": 0.6435373316180324, + "grad_norm": 0.5995797386872938, + "learning_rate": 0.00015857716149167138, + "loss": 12.2988, + "step": 11818 + }, + { + "epoch": 0.6435917856146155, + "grad_norm": 0.6321642957582062, + "learning_rate": 0.00015857001427586269, + "loss": 12.3957, + "step": 11819 + }, + { + "epoch": 0.6436462396111985, + "grad_norm": 0.551558625463365, + "learning_rate": 0.00015856286660460373, + "loss": 12.3422, + "step": 11820 + }, + { + "epoch": 0.6437006936077815, + "grad_norm": 0.5490497130441678, + "learning_rate": 0.00015855571847795012, + "loss": 12.2613, + "step": 11821 + }, + { + "epoch": 0.6437551476043645, + "grad_norm": 0.5985946306848401, + "learning_rate": 0.0001585485698959574, + "loss": 12.3177, + "step": 11822 + }, + { + "epoch": 0.6438096016009475, + "grad_norm": 0.5911531978371243, + "learning_rate": 0.00015854142085868118, + "loss": 12.2837, + "step": 11823 + }, + { + "epoch": 0.6438640555975305, + "grad_norm": 0.5405399476121963, + "learning_rate": 0.00015853427136617708, + "loss": 12.3125, + "step": 11824 + }, + { + "epoch": 0.6439185095941136, + "grad_norm": 0.6308899194181132, + "learning_rate": 0.00015852712141850063, + "loss": 12.32, + "step": 11825 + }, + { + "epoch": 0.6439729635906966, + "grad_norm": 0.5272000889908651, + "learning_rate": 0.00015851997101570752, + "loss": 12.2391, + "step": 11826 + }, + { + "epoch": 0.6440274175872795, + "grad_norm": 0.6978034099378577, + "learning_rate": 0.00015851282015785328, + "loss": 12.2693, + "step": 11827 + }, + { + "epoch": 0.6440818715838625, + "grad_norm": 0.6859417143713247, + "learning_rate": 0.00015850566884499352, + "loss": 12.1768, + "step": 11828 + }, + { + "epoch": 0.6441363255804455, + "grad_norm": 0.6287347167833705, + "learning_rate": 0.00015849851707718389, + "loss": 12.2717, + "step": 11829 + }, + { + "epoch": 0.6441907795770285, + "grad_norm": 0.6238801089136621, + "learning_rate": 0.00015849136485447996, + "loss": 12.3766, + "step": 11830 + }, + { + "epoch": 0.6442452335736116, + "grad_norm": 0.5472991333501801, + "learning_rate": 0.0001584842121769374, + "loss": 12.2784, + "step": 11831 + }, + { + "epoch": 0.6442996875701946, + "grad_norm": 0.6514650993952252, + "learning_rate": 0.00015847705904461178, + "loss": 12.3264, + "step": 11832 + }, + { + "epoch": 0.6443541415667776, + "grad_norm": 0.5794851645266091, + "learning_rate": 0.0001584699054575587, + "loss": 12.4061, + "step": 11833 + }, + { + "epoch": 0.6444085955633606, + "grad_norm": 0.6213116335233053, + "learning_rate": 0.00015846275141583388, + "loss": 12.3477, + "step": 11834 + }, + { + "epoch": 0.6444630495599436, + "grad_norm": 0.5722335188355683, + "learning_rate": 0.0001584555969194929, + "loss": 12.306, + "step": 11835 + }, + { + "epoch": 0.6445175035565267, + "grad_norm": 0.6736011912847758, + "learning_rate": 0.0001584484419685914, + "loss": 12.1462, + "step": 11836 + }, + { + "epoch": 0.6445719575531097, + "grad_norm": 0.5385332998875926, + "learning_rate": 0.000158441286563185, + "loss": 12.3111, + "step": 11837 + }, + { + "epoch": 0.6446264115496927, + "grad_norm": 0.6159916819535322, + "learning_rate": 0.00015843413070332934, + "loss": 12.4208, + "step": 11838 + }, + { + "epoch": 0.6446808655462757, + "grad_norm": 0.5688143212500484, + "learning_rate": 0.0001584269743890801, + "loss": 12.0938, + "step": 11839 + }, + { + "epoch": 0.6447353195428587, + "grad_norm": 0.5963157827362754, + "learning_rate": 0.0001584198176204929, + "loss": 12.339, + "step": 11840 + }, + { + "epoch": 0.6447897735394417, + "grad_norm": 0.6757611489483027, + "learning_rate": 0.0001584126603976234, + "loss": 12.3657, + "step": 11841 + }, + { + "epoch": 0.6448442275360248, + "grad_norm": 0.5805181095282763, + "learning_rate": 0.00015840550272052726, + "loss": 12.3187, + "step": 11842 + }, + { + "epoch": 0.6448986815326078, + "grad_norm": 0.635858992716586, + "learning_rate": 0.00015839834458926012, + "loss": 12.4277, + "step": 11843 + }, + { + "epoch": 0.6449531355291908, + "grad_norm": 0.68439571005598, + "learning_rate": 0.00015839118600387771, + "loss": 12.373, + "step": 11844 + }, + { + "epoch": 0.6450075895257738, + "grad_norm": 0.6642118909612381, + "learning_rate": 0.0001583840269644356, + "loss": 12.2769, + "step": 11845 + }, + { + "epoch": 0.6450620435223567, + "grad_norm": 0.7855157673961042, + "learning_rate": 0.00015837686747098952, + "loss": 12.4094, + "step": 11846 + }, + { + "epoch": 0.6451164975189397, + "grad_norm": 0.7055329344152202, + "learning_rate": 0.00015836970752359513, + "loss": 12.4626, + "step": 11847 + }, + { + "epoch": 0.6451709515155228, + "grad_norm": 0.6876268336548789, + "learning_rate": 0.00015836254712230807, + "loss": 12.4605, + "step": 11848 + }, + { + "epoch": 0.6452254055121058, + "grad_norm": 0.6088296851825628, + "learning_rate": 0.00015835538626718412, + "loss": 12.3367, + "step": 11849 + }, + { + "epoch": 0.6452798595086888, + "grad_norm": 0.6061418913585683, + "learning_rate": 0.00015834822495827886, + "loss": 12.2388, + "step": 11850 + }, + { + "epoch": 0.6453343135052718, + "grad_norm": 0.5872816859150357, + "learning_rate": 0.00015834106319564804, + "loss": 12.1829, + "step": 11851 + }, + { + "epoch": 0.6453887675018548, + "grad_norm": 0.5548108060461536, + "learning_rate": 0.00015833390097934728, + "loss": 12.1922, + "step": 11852 + }, + { + "epoch": 0.6454432214984378, + "grad_norm": 0.62230989731747, + "learning_rate": 0.00015832673830943236, + "loss": 12.3993, + "step": 11853 + }, + { + "epoch": 0.6454976754950209, + "grad_norm": 0.6812107135336923, + "learning_rate": 0.00015831957518595897, + "loss": 12.3115, + "step": 11854 + }, + { + "epoch": 0.6455521294916039, + "grad_norm": 0.6203214207289377, + "learning_rate": 0.00015831241160898274, + "loss": 12.3399, + "step": 11855 + }, + { + "epoch": 0.6456065834881869, + "grad_norm": 0.6639827570460414, + "learning_rate": 0.00015830524757855943, + "loss": 12.2929, + "step": 11856 + }, + { + "epoch": 0.6456610374847699, + "grad_norm": 0.6651863119017111, + "learning_rate": 0.00015829808309474473, + "loss": 12.3096, + "step": 11857 + }, + { + "epoch": 0.6457154914813529, + "grad_norm": 0.5703054855457829, + "learning_rate": 0.00015829091815759436, + "loss": 12.2785, + "step": 11858 + }, + { + "epoch": 0.6457699454779359, + "grad_norm": 0.6143567571433828, + "learning_rate": 0.00015828375276716406, + "loss": 12.3569, + "step": 11859 + }, + { + "epoch": 0.645824399474519, + "grad_norm": 0.6088561935498821, + "learning_rate": 0.0001582765869235095, + "loss": 12.3654, + "step": 11860 + }, + { + "epoch": 0.645878853471102, + "grad_norm": 0.6353000312500281, + "learning_rate": 0.00015826942062668645, + "loss": 12.2322, + "step": 11861 + }, + { + "epoch": 0.645933307467685, + "grad_norm": 0.6765711476284719, + "learning_rate": 0.0001582622538767506, + "loss": 12.3561, + "step": 11862 + }, + { + "epoch": 0.645987761464268, + "grad_norm": 0.585489756512397, + "learning_rate": 0.00015825508667375768, + "loss": 12.1992, + "step": 11863 + }, + { + "epoch": 0.6460422154608509, + "grad_norm": 0.6253853276677027, + "learning_rate": 0.00015824791901776342, + "loss": 12.3809, + "step": 11864 + }, + { + "epoch": 0.6460966694574339, + "grad_norm": 0.6279991291887276, + "learning_rate": 0.00015824075090882365, + "loss": 12.3028, + "step": 11865 + }, + { + "epoch": 0.646151123454017, + "grad_norm": 0.6155440485579453, + "learning_rate": 0.00015823358234699398, + "loss": 12.3539, + "step": 11866 + }, + { + "epoch": 0.6462055774506, + "grad_norm": 0.6762972682778122, + "learning_rate": 0.0001582264133323302, + "loss": 12.3892, + "step": 11867 + }, + { + "epoch": 0.646260031447183, + "grad_norm": 0.5950789371522833, + "learning_rate": 0.00015821924386488808, + "loss": 12.2143, + "step": 11868 + }, + { + "epoch": 0.646314485443766, + "grad_norm": 0.606593652587477, + "learning_rate": 0.00015821207394472336, + "loss": 12.2222, + "step": 11869 + }, + { + "epoch": 0.646368939440349, + "grad_norm": 0.5798466789294091, + "learning_rate": 0.00015820490357189177, + "loss": 12.2193, + "step": 11870 + }, + { + "epoch": 0.6464233934369321, + "grad_norm": 0.598060235563983, + "learning_rate": 0.0001581977327464491, + "loss": 12.3152, + "step": 11871 + }, + { + "epoch": 0.6464778474335151, + "grad_norm": 0.6380850115231458, + "learning_rate": 0.0001581905614684511, + "loss": 12.3271, + "step": 11872 + }, + { + "epoch": 0.6465323014300981, + "grad_norm": 0.5917032614504739, + "learning_rate": 0.00015818338973795355, + "loss": 12.3361, + "step": 11873 + }, + { + "epoch": 0.6465867554266811, + "grad_norm": 0.6309372172799924, + "learning_rate": 0.0001581762175550122, + "loss": 12.343, + "step": 11874 + }, + { + "epoch": 0.6466412094232641, + "grad_norm": 0.6536200778124828, + "learning_rate": 0.00015816904491968282, + "loss": 12.4174, + "step": 11875 + }, + { + "epoch": 0.6466956634198471, + "grad_norm": 0.6009748805400857, + "learning_rate": 0.00015816187183202121, + "loss": 12.4067, + "step": 11876 + }, + { + "epoch": 0.6467501174164302, + "grad_norm": 0.5543701890670989, + "learning_rate": 0.0001581546982920831, + "loss": 12.3837, + "step": 11877 + }, + { + "epoch": 0.6468045714130132, + "grad_norm": 0.5888343705499574, + "learning_rate": 0.0001581475242999243, + "loss": 12.2352, + "step": 11878 + }, + { + "epoch": 0.6468590254095962, + "grad_norm": 0.6293347929653329, + "learning_rate": 0.00015814034985560063, + "loss": 12.1845, + "step": 11879 + }, + { + "epoch": 0.6469134794061792, + "grad_norm": 0.6254066864745504, + "learning_rate": 0.0001581331749591678, + "loss": 12.1629, + "step": 11880 + }, + { + "epoch": 0.6469679334027622, + "grad_norm": 0.5798924916404218, + "learning_rate": 0.0001581259996106817, + "loss": 12.2957, + "step": 11881 + }, + { + "epoch": 0.6470223873993451, + "grad_norm": 0.6548652982042967, + "learning_rate": 0.0001581188238101981, + "loss": 12.2919, + "step": 11882 + }, + { + "epoch": 0.6470768413959282, + "grad_norm": 0.6470119826591115, + "learning_rate": 0.00015811164755777274, + "loss": 12.4236, + "step": 11883 + }, + { + "epoch": 0.6471312953925112, + "grad_norm": 0.7396324813529676, + "learning_rate": 0.00015810447085346145, + "loss": 12.3413, + "step": 11884 + }, + { + "epoch": 0.6471857493890942, + "grad_norm": 0.659380387570783, + "learning_rate": 0.0001580972936973201, + "loss": 12.2496, + "step": 11885 + }, + { + "epoch": 0.6472402033856772, + "grad_norm": 0.6407359324597263, + "learning_rate": 0.0001580901160894044, + "loss": 12.348, + "step": 11886 + }, + { + "epoch": 0.6472946573822602, + "grad_norm": 0.5541698793238401, + "learning_rate": 0.00015808293802977024, + "loss": 12.1559, + "step": 11887 + }, + { + "epoch": 0.6473491113788432, + "grad_norm": 0.6298551123169205, + "learning_rate": 0.00015807575951847343, + "loss": 12.4117, + "step": 11888 + }, + { + "epoch": 0.6474035653754263, + "grad_norm": 0.696425727106263, + "learning_rate": 0.00015806858055556971, + "loss": 12.3259, + "step": 11889 + }, + { + "epoch": 0.6474580193720093, + "grad_norm": 0.6215328446217157, + "learning_rate": 0.00015806140114111504, + "loss": 12.298, + "step": 11890 + }, + { + "epoch": 0.6475124733685923, + "grad_norm": 0.6811713311025708, + "learning_rate": 0.00015805422127516513, + "loss": 12.416, + "step": 11891 + }, + { + "epoch": 0.6475669273651753, + "grad_norm": 0.5875299931350033, + "learning_rate": 0.00015804704095777588, + "loss": 12.2622, + "step": 11892 + }, + { + "epoch": 0.6476213813617583, + "grad_norm": 0.6182923010880244, + "learning_rate": 0.00015803986018900315, + "loss": 12.4197, + "step": 11893 + }, + { + "epoch": 0.6476758353583413, + "grad_norm": 0.7301611412615433, + "learning_rate": 0.00015803267896890265, + "loss": 12.3345, + "step": 11894 + }, + { + "epoch": 0.6477302893549244, + "grad_norm": 0.59758392143012, + "learning_rate": 0.0001580254972975304, + "loss": 12.3508, + "step": 11895 + }, + { + "epoch": 0.6477847433515074, + "grad_norm": 0.7039951589381491, + "learning_rate": 0.00015801831517494208, + "loss": 12.3217, + "step": 11896 + }, + { + "epoch": 0.6478391973480904, + "grad_norm": 0.6196817665670296, + "learning_rate": 0.00015801113260119362, + "loss": 12.138, + "step": 11897 + }, + { + "epoch": 0.6478936513446734, + "grad_norm": 0.6676907346235882, + "learning_rate": 0.00015800394957634088, + "loss": 12.3381, + "step": 11898 + }, + { + "epoch": 0.6479481053412564, + "grad_norm": 0.7448106654204392, + "learning_rate": 0.0001579967661004397, + "loss": 12.3667, + "step": 11899 + }, + { + "epoch": 0.6480025593378395, + "grad_norm": 0.6139973688894776, + "learning_rate": 0.00015798958217354592, + "loss": 12.2898, + "step": 11900 + }, + { + "epoch": 0.6480570133344224, + "grad_norm": 0.6219380327775433, + "learning_rate": 0.00015798239779571546, + "loss": 12.3213, + "step": 11901 + }, + { + "epoch": 0.6481114673310054, + "grad_norm": 0.624283925317137, + "learning_rate": 0.00015797521296700412, + "loss": 12.3334, + "step": 11902 + }, + { + "epoch": 0.6481659213275884, + "grad_norm": 0.6852416421453453, + "learning_rate": 0.0001579680276874678, + "loss": 12.4149, + "step": 11903 + }, + { + "epoch": 0.6482203753241714, + "grad_norm": 0.588462029793721, + "learning_rate": 0.00015796084195716242, + "loss": 12.3818, + "step": 11904 + }, + { + "epoch": 0.6482748293207544, + "grad_norm": 0.622781245012364, + "learning_rate": 0.00015795365577614377, + "loss": 12.3759, + "step": 11905 + }, + { + "epoch": 0.6483292833173375, + "grad_norm": 0.7072158092814134, + "learning_rate": 0.00015794646914446778, + "loss": 12.4398, + "step": 11906 + }, + { + "epoch": 0.6483837373139205, + "grad_norm": 0.610219828969978, + "learning_rate": 0.00015793928206219034, + "loss": 12.3247, + "step": 11907 + }, + { + "epoch": 0.6484381913105035, + "grad_norm": 0.592606307805623, + "learning_rate": 0.00015793209452936733, + "loss": 12.371, + "step": 11908 + }, + { + "epoch": 0.6484926453070865, + "grad_norm": 0.6109845084666096, + "learning_rate": 0.0001579249065460546, + "loss": 12.3556, + "step": 11909 + }, + { + "epoch": 0.6485470993036695, + "grad_norm": 0.5691135905588112, + "learning_rate": 0.00015791771811230813, + "loss": 12.3765, + "step": 11910 + }, + { + "epoch": 0.6486015533002525, + "grad_norm": 0.627849729098813, + "learning_rate": 0.00015791052922818375, + "loss": 12.4136, + "step": 11911 + }, + { + "epoch": 0.6486560072968356, + "grad_norm": 0.5720576719145197, + "learning_rate": 0.00015790333989373738, + "loss": 12.187, + "step": 11912 + }, + { + "epoch": 0.6487104612934186, + "grad_norm": 0.5913965852750814, + "learning_rate": 0.00015789615010902494, + "loss": 12.22, + "step": 11913 + }, + { + "epoch": 0.6487649152900016, + "grad_norm": 0.6458783596766352, + "learning_rate": 0.00015788895987410234, + "loss": 12.4376, + "step": 11914 + }, + { + "epoch": 0.6488193692865846, + "grad_norm": 0.5357031183400054, + "learning_rate": 0.00015788176918902545, + "loss": 12.3014, + "step": 11915 + }, + { + "epoch": 0.6488738232831676, + "grad_norm": 0.5878185119517014, + "learning_rate": 0.00015787457805385022, + "loss": 12.2597, + "step": 11916 + }, + { + "epoch": 0.6489282772797506, + "grad_norm": 0.5408649338255814, + "learning_rate": 0.00015786738646863258, + "loss": 12.1758, + "step": 11917 + }, + { + "epoch": 0.6489827312763337, + "grad_norm": 0.5962386022181083, + "learning_rate": 0.0001578601944334284, + "loss": 12.3954, + "step": 11918 + }, + { + "epoch": 0.6490371852729167, + "grad_norm": 0.6894379242956311, + "learning_rate": 0.0001578530019482937, + "loss": 12.4568, + "step": 11919 + }, + { + "epoch": 0.6490916392694996, + "grad_norm": 0.6447570054291695, + "learning_rate": 0.00015784580901328433, + "loss": 12.3826, + "step": 11920 + }, + { + "epoch": 0.6491460932660826, + "grad_norm": 0.6538136842300999, + "learning_rate": 0.00015783861562845624, + "loss": 12.1867, + "step": 11921 + }, + { + "epoch": 0.6492005472626656, + "grad_norm": 0.6588957923748714, + "learning_rate": 0.00015783142179386542, + "loss": 12.3532, + "step": 11922 + }, + { + "epoch": 0.6492550012592486, + "grad_norm": 0.609883320307594, + "learning_rate": 0.0001578242275095677, + "loss": 12.2295, + "step": 11923 + }, + { + "epoch": 0.6493094552558317, + "grad_norm": 0.5832732508989199, + "learning_rate": 0.00015781703277561912, + "loss": 12.3784, + "step": 11924 + }, + { + "epoch": 0.6493639092524147, + "grad_norm": 0.581475044581891, + "learning_rate": 0.0001578098375920756, + "loss": 12.1825, + "step": 11925 + }, + { + "epoch": 0.6494183632489977, + "grad_norm": 0.5271469409272188, + "learning_rate": 0.0001578026419589931, + "loss": 12.2623, + "step": 11926 + }, + { + "epoch": 0.6494728172455807, + "grad_norm": 0.5873360893022631, + "learning_rate": 0.00015779544587642754, + "loss": 12.3341, + "step": 11927 + }, + { + "epoch": 0.6495272712421637, + "grad_norm": 0.7344385378568903, + "learning_rate": 0.0001577882493444349, + "loss": 12.0677, + "step": 11928 + }, + { + "epoch": 0.6495817252387467, + "grad_norm": 0.5793826632935817, + "learning_rate": 0.00015778105236307117, + "loss": 12.444, + "step": 11929 + }, + { + "epoch": 0.6496361792353298, + "grad_norm": 0.5508804313879363, + "learning_rate": 0.00015777385493239226, + "loss": 12.343, + "step": 11930 + }, + { + "epoch": 0.6496906332319128, + "grad_norm": 0.5574879850197885, + "learning_rate": 0.00015776665705245416, + "loss": 12.3612, + "step": 11931 + }, + { + "epoch": 0.6497450872284958, + "grad_norm": 0.6029465954215187, + "learning_rate": 0.00015775945872331288, + "loss": 12.3669, + "step": 11932 + }, + { + "epoch": 0.6497995412250788, + "grad_norm": 0.621974590243019, + "learning_rate": 0.00015775225994502434, + "loss": 12.2378, + "step": 11933 + }, + { + "epoch": 0.6498539952216618, + "grad_norm": 0.5669301302625296, + "learning_rate": 0.00015774506071764455, + "loss": 12.3814, + "step": 11934 + }, + { + "epoch": 0.6499084492182449, + "grad_norm": 0.6069993807753911, + "learning_rate": 0.00015773786104122947, + "loss": 12.31, + "step": 11935 + }, + { + "epoch": 0.6499629032148279, + "grad_norm": 0.6790511836398206, + "learning_rate": 0.0001577306609158351, + "loss": 12.4608, + "step": 11936 + }, + { + "epoch": 0.6500173572114109, + "grad_norm": 0.5829094050295218, + "learning_rate": 0.00015772346034151745, + "loss": 12.3586, + "step": 11937 + }, + { + "epoch": 0.6500718112079938, + "grad_norm": 0.6722397218659423, + "learning_rate": 0.00015771625931833248, + "loss": 12.2938, + "step": 11938 + }, + { + "epoch": 0.6501262652045768, + "grad_norm": 0.5748677087191401, + "learning_rate": 0.0001577090578463362, + "loss": 12.3666, + "step": 11939 + }, + { + "epoch": 0.6501807192011598, + "grad_norm": 0.5524163198640157, + "learning_rate": 0.00015770185592558459, + "loss": 12.1791, + "step": 11940 + }, + { + "epoch": 0.6502351731977429, + "grad_norm": 0.64979857567962, + "learning_rate": 0.00015769465355613372, + "loss": 12.2564, + "step": 11941 + }, + { + "epoch": 0.6502896271943259, + "grad_norm": 0.6487747620285252, + "learning_rate": 0.0001576874507380395, + "loss": 12.2984, + "step": 11942 + }, + { + "epoch": 0.6503440811909089, + "grad_norm": 0.5831819737136476, + "learning_rate": 0.00015768024747135802, + "loss": 12.3315, + "step": 11943 + }, + { + "epoch": 0.6503985351874919, + "grad_norm": 0.726194846556106, + "learning_rate": 0.00015767304375614524, + "loss": 12.373, + "step": 11944 + }, + { + "epoch": 0.6504529891840749, + "grad_norm": 0.7572436990405581, + "learning_rate": 0.00015766583959245722, + "loss": 12.3561, + "step": 11945 + }, + { + "epoch": 0.6505074431806579, + "grad_norm": 0.8388816417572426, + "learning_rate": 0.00015765863498034993, + "loss": 12.3658, + "step": 11946 + }, + { + "epoch": 0.650561897177241, + "grad_norm": 0.6220669970386754, + "learning_rate": 0.00015765142991987948, + "loss": 12.3762, + "step": 11947 + }, + { + "epoch": 0.650616351173824, + "grad_norm": 0.5850096096207479, + "learning_rate": 0.00015764422441110179, + "loss": 12.3122, + "step": 11948 + }, + { + "epoch": 0.650670805170407, + "grad_norm": 0.5960887585380966, + "learning_rate": 0.00015763701845407293, + "loss": 12.2445, + "step": 11949 + }, + { + "epoch": 0.65072525916699, + "grad_norm": 0.6286614967255224, + "learning_rate": 0.000157629812048849, + "loss": 12.3693, + "step": 11950 + }, + { + "epoch": 0.650779713163573, + "grad_norm": 0.5755252327414278, + "learning_rate": 0.00015762260519548596, + "loss": 12.3737, + "step": 11951 + }, + { + "epoch": 0.650834167160156, + "grad_norm": 0.6015708782836013, + "learning_rate": 0.0001576153978940399, + "loss": 12.3598, + "step": 11952 + }, + { + "epoch": 0.6508886211567391, + "grad_norm": 0.5503492430025636, + "learning_rate": 0.0001576081901445668, + "loss": 12.147, + "step": 11953 + }, + { + "epoch": 0.6509430751533221, + "grad_norm": 0.6174251701802627, + "learning_rate": 0.0001576009819471228, + "loss": 12.3472, + "step": 11954 + }, + { + "epoch": 0.650997529149905, + "grad_norm": 0.6617496217312884, + "learning_rate": 0.0001575937733017639, + "loss": 12.335, + "step": 11955 + }, + { + "epoch": 0.651051983146488, + "grad_norm": 0.6619382754910553, + "learning_rate": 0.00015758656420854615, + "loss": 12.297, + "step": 11956 + }, + { + "epoch": 0.651106437143071, + "grad_norm": 0.6114447904475865, + "learning_rate": 0.00015757935466752563, + "loss": 12.2679, + "step": 11957 + }, + { + "epoch": 0.651160891139654, + "grad_norm": 0.5867492114594228, + "learning_rate": 0.00015757214467875837, + "loss": 12.3383, + "step": 11958 + }, + { + "epoch": 0.6512153451362371, + "grad_norm": 0.5545629658830868, + "learning_rate": 0.00015756493424230045, + "loss": 12.3276, + "step": 11959 + }, + { + "epoch": 0.6512697991328201, + "grad_norm": 0.6228513492480882, + "learning_rate": 0.00015755772335820798, + "loss": 12.2693, + "step": 11960 + }, + { + "epoch": 0.6513242531294031, + "grad_norm": 0.5538829374813401, + "learning_rate": 0.00015755051202653698, + "loss": 12.1914, + "step": 11961 + }, + { + "epoch": 0.6513787071259861, + "grad_norm": 0.6761415228572771, + "learning_rate": 0.00015754330024734357, + "loss": 12.2367, + "step": 11962 + }, + { + "epoch": 0.6514331611225691, + "grad_norm": 0.5784702625909043, + "learning_rate": 0.00015753608802068376, + "loss": 12.3423, + "step": 11963 + }, + { + "epoch": 0.6514876151191521, + "grad_norm": 0.5735770424793095, + "learning_rate": 0.0001575288753466137, + "loss": 12.2221, + "step": 11964 + }, + { + "epoch": 0.6515420691157352, + "grad_norm": 0.7926214136031734, + "learning_rate": 0.0001575216622251895, + "loss": 12.3241, + "step": 11965 + }, + { + "epoch": 0.6515965231123182, + "grad_norm": 0.48699907812999516, + "learning_rate": 0.00015751444865646716, + "loss": 12.2729, + "step": 11966 + }, + { + "epoch": 0.6516509771089012, + "grad_norm": 0.64560719180413, + "learning_rate": 0.00015750723464050286, + "loss": 12.3966, + "step": 11967 + }, + { + "epoch": 0.6517054311054842, + "grad_norm": 0.6850859013375421, + "learning_rate": 0.0001575000201773526, + "loss": 12.3712, + "step": 11968 + }, + { + "epoch": 0.6517598851020672, + "grad_norm": 0.6505676416890495, + "learning_rate": 0.0001574928052670726, + "loss": 12.352, + "step": 11969 + }, + { + "epoch": 0.6518143390986503, + "grad_norm": 0.619633594619558, + "learning_rate": 0.00015748558990971888, + "loss": 12.4747, + "step": 11970 + }, + { + "epoch": 0.6518687930952333, + "grad_norm": 0.6128752373979734, + "learning_rate": 0.00015747837410534757, + "loss": 12.3141, + "step": 11971 + }, + { + "epoch": 0.6519232470918163, + "grad_norm": 0.5500144202818116, + "learning_rate": 0.0001574711578540148, + "loss": 12.3213, + "step": 11972 + }, + { + "epoch": 0.6519777010883993, + "grad_norm": 0.5148268142700347, + "learning_rate": 0.00015746394115577665, + "loss": 12.3108, + "step": 11973 + }, + { + "epoch": 0.6520321550849822, + "grad_norm": 0.6139723632259834, + "learning_rate": 0.00015745672401068928, + "loss": 12.2607, + "step": 11974 + }, + { + "epoch": 0.6520866090815652, + "grad_norm": 0.5864930516835105, + "learning_rate": 0.0001574495064188088, + "loss": 12.0589, + "step": 11975 + }, + { + "epoch": 0.6521410630781483, + "grad_norm": 0.5365876166550633, + "learning_rate": 0.00015744228838019127, + "loss": 12.3051, + "step": 11976 + }, + { + "epoch": 0.6521955170747313, + "grad_norm": 0.5499501950636869, + "learning_rate": 0.0001574350698948929, + "loss": 12.3038, + "step": 11977 + }, + { + "epoch": 0.6522499710713143, + "grad_norm": 0.6472124705907247, + "learning_rate": 0.0001574278509629698, + "loss": 12.3738, + "step": 11978 + }, + { + "epoch": 0.6523044250678973, + "grad_norm": 0.5453405892615903, + "learning_rate": 0.0001574206315844781, + "loss": 12.3297, + "step": 11979 + }, + { + "epoch": 0.6523588790644803, + "grad_norm": 0.6329484377503383, + "learning_rate": 0.00015741341175947392, + "loss": 12.2167, + "step": 11980 + }, + { + "epoch": 0.6524133330610633, + "grad_norm": 0.5707382286572685, + "learning_rate": 0.00015740619148801342, + "loss": 12.1752, + "step": 11981 + }, + { + "epoch": 0.6524677870576464, + "grad_norm": 0.5732738756007905, + "learning_rate": 0.00015739897077015277, + "loss": 12.3096, + "step": 11982 + }, + { + "epoch": 0.6525222410542294, + "grad_norm": 0.7108703247791771, + "learning_rate": 0.00015739174960594809, + "loss": 12.3917, + "step": 11983 + }, + { + "epoch": 0.6525766950508124, + "grad_norm": 0.5927231949777018, + "learning_rate": 0.00015738452799545557, + "loss": 12.4086, + "step": 11984 + }, + { + "epoch": 0.6526311490473954, + "grad_norm": 0.5632680185066232, + "learning_rate": 0.0001573773059387313, + "loss": 12.3268, + "step": 11985 + }, + { + "epoch": 0.6526856030439784, + "grad_norm": 0.5575083758906307, + "learning_rate": 0.00015737008343583148, + "loss": 12.3571, + "step": 11986 + }, + { + "epoch": 0.6527400570405614, + "grad_norm": 0.5772371140502875, + "learning_rate": 0.00015736286048681229, + "loss": 12.3336, + "step": 11987 + }, + { + "epoch": 0.6527945110371445, + "grad_norm": 0.6479961373639044, + "learning_rate": 0.00015735563709172985, + "loss": 12.2698, + "step": 11988 + }, + { + "epoch": 0.6528489650337275, + "grad_norm": 0.6505984500018023, + "learning_rate": 0.00015734841325064038, + "loss": 12.3785, + "step": 11989 + }, + { + "epoch": 0.6529034190303105, + "grad_norm": 0.5773942885565875, + "learning_rate": 0.00015734118896360003, + "loss": 12.3468, + "step": 11990 + }, + { + "epoch": 0.6529578730268935, + "grad_norm": 0.5825216433858514, + "learning_rate": 0.00015733396423066496, + "loss": 12.3984, + "step": 11991 + }, + { + "epoch": 0.6530123270234764, + "grad_norm": 0.6430205692159725, + "learning_rate": 0.00015732673905189136, + "loss": 12.3095, + "step": 11992 + }, + { + "epoch": 0.6530667810200594, + "grad_norm": 0.6068657586227032, + "learning_rate": 0.00015731951342733545, + "loss": 12.2339, + "step": 11993 + }, + { + "epoch": 0.6531212350166425, + "grad_norm": 0.5679813776155134, + "learning_rate": 0.00015731228735705338, + "loss": 12.3382, + "step": 11994 + }, + { + "epoch": 0.6531756890132255, + "grad_norm": 0.611364306339243, + "learning_rate": 0.00015730506084110136, + "loss": 12.2155, + "step": 11995 + }, + { + "epoch": 0.6532301430098085, + "grad_norm": 0.5722300818041792, + "learning_rate": 0.00015729783387953558, + "loss": 12.3042, + "step": 11996 + }, + { + "epoch": 0.6532845970063915, + "grad_norm": 0.6272152796074103, + "learning_rate": 0.00015729060647241223, + "loss": 12.3985, + "step": 11997 + }, + { + "epoch": 0.6533390510029745, + "grad_norm": 0.5878780174505018, + "learning_rate": 0.0001572833786197875, + "loss": 12.2693, + "step": 11998 + }, + { + "epoch": 0.6533935049995575, + "grad_norm": 0.5765005193129974, + "learning_rate": 0.00015727615032171764, + "loss": 12.2288, + "step": 11999 + }, + { + "epoch": 0.6534479589961406, + "grad_norm": 0.5794060488531453, + "learning_rate": 0.00015726892157825884, + "loss": 12.2515, + "step": 12000 + }, + { + "epoch": 0.6535024129927236, + "grad_norm": 0.5959742103943618, + "learning_rate": 0.00015726169238946725, + "loss": 12.3568, + "step": 12001 + }, + { + "epoch": 0.6535568669893066, + "grad_norm": 0.5656155306898348, + "learning_rate": 0.00015725446275539917, + "loss": 12.3548, + "step": 12002 + }, + { + "epoch": 0.6536113209858896, + "grad_norm": 0.5809509610831306, + "learning_rate": 0.00015724723267611078, + "loss": 12.2434, + "step": 12003 + }, + { + "epoch": 0.6536657749824726, + "grad_norm": 0.5723505321533915, + "learning_rate": 0.0001572400021516583, + "loss": 12.3567, + "step": 12004 + }, + { + "epoch": 0.6537202289790557, + "grad_norm": 0.5640139894343855, + "learning_rate": 0.000157232771182098, + "loss": 12.4136, + "step": 12005 + }, + { + "epoch": 0.6537746829756387, + "grad_norm": 0.6097832305343643, + "learning_rate": 0.00015722553976748604, + "loss": 12.316, + "step": 12006 + }, + { + "epoch": 0.6538291369722217, + "grad_norm": 0.6075626242319075, + "learning_rate": 0.00015721830790787868, + "loss": 12.448, + "step": 12007 + }, + { + "epoch": 0.6538835909688047, + "grad_norm": 0.6135216935236425, + "learning_rate": 0.00015721107560333217, + "loss": 12.2811, + "step": 12008 + }, + { + "epoch": 0.6539380449653877, + "grad_norm": 0.5900328630403623, + "learning_rate": 0.00015720384285390274, + "loss": 12.3143, + "step": 12009 + }, + { + "epoch": 0.6539924989619706, + "grad_norm": 0.5593576082224303, + "learning_rate": 0.00015719660965964668, + "loss": 12.2963, + "step": 12010 + }, + { + "epoch": 0.6540469529585538, + "grad_norm": 0.6323270839974057, + "learning_rate": 0.00015718937602062015, + "loss": 12.3916, + "step": 12011 + }, + { + "epoch": 0.6541014069551367, + "grad_norm": 0.634713507031242, + "learning_rate": 0.00015718214193687945, + "loss": 12.4702, + "step": 12012 + }, + { + "epoch": 0.6541558609517197, + "grad_norm": 0.5918466121827748, + "learning_rate": 0.0001571749074084808, + "loss": 12.3848, + "step": 12013 + }, + { + "epoch": 0.6542103149483027, + "grad_norm": 0.5930684196620623, + "learning_rate": 0.0001571676724354805, + "loss": 12.4133, + "step": 12014 + }, + { + "epoch": 0.6542647689448857, + "grad_norm": 0.5565132695659449, + "learning_rate": 0.0001571604370179348, + "loss": 12.2225, + "step": 12015 + }, + { + "epoch": 0.6543192229414687, + "grad_norm": 0.571361595467882, + "learning_rate": 0.00015715320115589995, + "loss": 12.248, + "step": 12016 + }, + { + "epoch": 0.6543736769380518, + "grad_norm": 0.5981817106947294, + "learning_rate": 0.0001571459648494322, + "loss": 12.407, + "step": 12017 + }, + { + "epoch": 0.6544281309346348, + "grad_norm": 0.5843807488937806, + "learning_rate": 0.00015713872809858788, + "loss": 12.299, + "step": 12018 + }, + { + "epoch": 0.6544825849312178, + "grad_norm": 0.5713110865905602, + "learning_rate": 0.00015713149090342321, + "loss": 12.2149, + "step": 12019 + }, + { + "epoch": 0.6545370389278008, + "grad_norm": 0.6267794263402403, + "learning_rate": 0.0001571242532639945, + "loss": 12.4155, + "step": 12020 + }, + { + "epoch": 0.6545914929243838, + "grad_norm": 0.6539645542446826, + "learning_rate": 0.000157117015180358, + "loss": 12.3244, + "step": 12021 + }, + { + "epoch": 0.6546459469209668, + "grad_norm": 0.6833958466531187, + "learning_rate": 0.00015710977665257003, + "loss": 12.4647, + "step": 12022 + }, + { + "epoch": 0.6547004009175499, + "grad_norm": 0.5835734191459407, + "learning_rate": 0.0001571025376806868, + "loss": 12.4632, + "step": 12023 + }, + { + "epoch": 0.6547548549141329, + "grad_norm": 0.5560923116448497, + "learning_rate": 0.00015709529826476475, + "loss": 12.2651, + "step": 12024 + }, + { + "epoch": 0.6548093089107159, + "grad_norm": 0.6115189866766454, + "learning_rate": 0.00015708805840486005, + "loss": 12.4087, + "step": 12025 + }, + { + "epoch": 0.6548637629072989, + "grad_norm": 0.5607442964773205, + "learning_rate": 0.000157080818101029, + "loss": 12.2975, + "step": 12026 + }, + { + "epoch": 0.6549182169038819, + "grad_norm": 0.5778224307829855, + "learning_rate": 0.00015707357735332797, + "loss": 12.4361, + "step": 12027 + }, + { + "epoch": 0.6549726709004648, + "grad_norm": 0.6330981188570893, + "learning_rate": 0.00015706633616181323, + "loss": 12.3908, + "step": 12028 + }, + { + "epoch": 0.655027124897048, + "grad_norm": 0.6278392162178607, + "learning_rate": 0.00015705909452654108, + "loss": 12.3544, + "step": 12029 + }, + { + "epoch": 0.655081578893631, + "grad_norm": 0.5846662559335308, + "learning_rate": 0.00015705185244756787, + "loss": 12.2941, + "step": 12030 + }, + { + "epoch": 0.6551360328902139, + "grad_norm": 0.6050336247846226, + "learning_rate": 0.00015704460992494986, + "loss": 12.2861, + "step": 12031 + }, + { + "epoch": 0.6551904868867969, + "grad_norm": 0.6321047953668683, + "learning_rate": 0.0001570373669587434, + "loss": 12.3586, + "step": 12032 + }, + { + "epoch": 0.6552449408833799, + "grad_norm": 0.608849734510545, + "learning_rate": 0.00015703012354900483, + "loss": 12.3521, + "step": 12033 + }, + { + "epoch": 0.655299394879963, + "grad_norm": 0.617231457089484, + "learning_rate": 0.0001570228796957904, + "loss": 12.2676, + "step": 12034 + }, + { + "epoch": 0.655353848876546, + "grad_norm": 0.6180049647494912, + "learning_rate": 0.00015701563539915656, + "loss": 12.2132, + "step": 12035 + }, + { + "epoch": 0.655408302873129, + "grad_norm": 0.6431172833908424, + "learning_rate": 0.00015700839065915955, + "loss": 12.2558, + "step": 12036 + }, + { + "epoch": 0.655462756869712, + "grad_norm": 0.6019823191469429, + "learning_rate": 0.0001570011454758557, + "loss": 12.2704, + "step": 12037 + }, + { + "epoch": 0.655517210866295, + "grad_norm": 0.5671224077287408, + "learning_rate": 0.00015699389984930143, + "loss": 12.3156, + "step": 12038 + }, + { + "epoch": 0.655571664862878, + "grad_norm": 0.7012576025712719, + "learning_rate": 0.00015698665377955303, + "loss": 12.43, + "step": 12039 + }, + { + "epoch": 0.6556261188594611, + "grad_norm": 0.6061221280777732, + "learning_rate": 0.00015697940726666683, + "loss": 12.2937, + "step": 12040 + }, + { + "epoch": 0.6556805728560441, + "grad_norm": 0.7156210816199511, + "learning_rate": 0.00015697216031069925, + "loss": 12.3705, + "step": 12041 + }, + { + "epoch": 0.6557350268526271, + "grad_norm": 0.6125638351685208, + "learning_rate": 0.00015696491291170657, + "loss": 12.3893, + "step": 12042 + }, + { + "epoch": 0.6557894808492101, + "grad_norm": 0.6318843500494461, + "learning_rate": 0.00015695766506974517, + "loss": 12.4039, + "step": 12043 + }, + { + "epoch": 0.6558439348457931, + "grad_norm": 0.6600868298812459, + "learning_rate": 0.00015695041678487142, + "loss": 12.2972, + "step": 12044 + }, + { + "epoch": 0.6558983888423761, + "grad_norm": 0.542910202250795, + "learning_rate": 0.00015694316805714165, + "loss": 12.2664, + "step": 12045 + }, + { + "epoch": 0.6559528428389592, + "grad_norm": 0.5845324019516077, + "learning_rate": 0.0001569359188866123, + "loss": 12.3188, + "step": 12046 + }, + { + "epoch": 0.6560072968355422, + "grad_norm": 0.7004534797723055, + "learning_rate": 0.00015692866927333968, + "loss": 12.41, + "step": 12047 + }, + { + "epoch": 0.6560617508321251, + "grad_norm": 0.6112982977934652, + "learning_rate": 0.00015692141921738018, + "loss": 12.3632, + "step": 12048 + }, + { + "epoch": 0.6561162048287081, + "grad_norm": 0.6132310081784185, + "learning_rate": 0.00015691416871879018, + "loss": 12.3798, + "step": 12049 + }, + { + "epoch": 0.6561706588252911, + "grad_norm": 0.6721167651882155, + "learning_rate": 0.00015690691777762604, + "loss": 12.2113, + "step": 12050 + }, + { + "epoch": 0.6562251128218741, + "grad_norm": 0.6058156618250249, + "learning_rate": 0.0001568996663939442, + "loss": 12.4447, + "step": 12051 + }, + { + "epoch": 0.6562795668184572, + "grad_norm": 0.6217678395993428, + "learning_rate": 0.000156892414567801, + "loss": 12.4583, + "step": 12052 + }, + { + "epoch": 0.6563340208150402, + "grad_norm": 0.8360757048261092, + "learning_rate": 0.00015688516229925284, + "loss": 12.3777, + "step": 12053 + }, + { + "epoch": 0.6563884748116232, + "grad_norm": 0.5595752466357918, + "learning_rate": 0.0001568779095883561, + "loss": 12.2718, + "step": 12054 + }, + { + "epoch": 0.6564429288082062, + "grad_norm": 0.5796818253610839, + "learning_rate": 0.00015687065643516722, + "loss": 12.1915, + "step": 12055 + }, + { + "epoch": 0.6564973828047892, + "grad_norm": 0.6068369376036751, + "learning_rate": 0.00015686340283974258, + "loss": 12.1375, + "step": 12056 + }, + { + "epoch": 0.6565518368013722, + "grad_norm": 0.5964555552588784, + "learning_rate": 0.00015685614880213856, + "loss": 12.385, + "step": 12057 + }, + { + "epoch": 0.6566062907979553, + "grad_norm": 0.5893076488767527, + "learning_rate": 0.0001568488943224116, + "loss": 12.3554, + "step": 12058 + }, + { + "epoch": 0.6566607447945383, + "grad_norm": 0.6133767840778955, + "learning_rate": 0.0001568416394006181, + "loss": 12.3736, + "step": 12059 + }, + { + "epoch": 0.6567151987911213, + "grad_norm": 0.6707278629224666, + "learning_rate": 0.00015683438403681448, + "loss": 12.4436, + "step": 12060 + }, + { + "epoch": 0.6567696527877043, + "grad_norm": 0.5560126577532122, + "learning_rate": 0.0001568271282310572, + "loss": 12.1057, + "step": 12061 + }, + { + "epoch": 0.6568241067842873, + "grad_norm": 0.6215607526503611, + "learning_rate": 0.0001568198719834026, + "loss": 12.1294, + "step": 12062 + }, + { + "epoch": 0.6568785607808703, + "grad_norm": 0.6286325445732135, + "learning_rate": 0.00015681261529390715, + "loss": 12.2769, + "step": 12063 + }, + { + "epoch": 0.6569330147774534, + "grad_norm": 0.5312290249893128, + "learning_rate": 0.00015680535816262728, + "loss": 12.2747, + "step": 12064 + }, + { + "epoch": 0.6569874687740364, + "grad_norm": 0.5738259013012558, + "learning_rate": 0.00015679810058961944, + "loss": 12.3717, + "step": 12065 + }, + { + "epoch": 0.6570419227706193, + "grad_norm": 0.5940529983518259, + "learning_rate": 0.00015679084257494002, + "loss": 12.4212, + "step": 12066 + }, + { + "epoch": 0.6570963767672023, + "grad_norm": 0.583664293392366, + "learning_rate": 0.00015678358411864548, + "loss": 12.2232, + "step": 12067 + }, + { + "epoch": 0.6571508307637853, + "grad_norm": 0.5896927482492248, + "learning_rate": 0.0001567763252207923, + "loss": 12.4077, + "step": 12068 + }, + { + "epoch": 0.6572052847603684, + "grad_norm": 0.6049544554721654, + "learning_rate": 0.00015676906588143685, + "loss": 12.4348, + "step": 12069 + }, + { + "epoch": 0.6572597387569514, + "grad_norm": 0.5725497093698602, + "learning_rate": 0.00015676180610063565, + "loss": 12.4809, + "step": 12070 + }, + { + "epoch": 0.6573141927535344, + "grad_norm": 0.7405786368932967, + "learning_rate": 0.00015675454587844515, + "loss": 12.2816, + "step": 12071 + }, + { + "epoch": 0.6573686467501174, + "grad_norm": 0.600552284188916, + "learning_rate": 0.00015674728521492172, + "loss": 12.2328, + "step": 12072 + }, + { + "epoch": 0.6574231007467004, + "grad_norm": 0.5390920058221059, + "learning_rate": 0.0001567400241101219, + "loss": 12.2939, + "step": 12073 + }, + { + "epoch": 0.6574775547432834, + "grad_norm": 0.5996252240800342, + "learning_rate": 0.0001567327625641022, + "loss": 12.2745, + "step": 12074 + }, + { + "epoch": 0.6575320087398665, + "grad_norm": 0.6203639181018388, + "learning_rate": 0.00015672550057691895, + "loss": 12.2983, + "step": 12075 + }, + { + "epoch": 0.6575864627364495, + "grad_norm": 0.5831791648502749, + "learning_rate": 0.00015671823814862875, + "loss": 12.2538, + "step": 12076 + }, + { + "epoch": 0.6576409167330325, + "grad_norm": 0.6456117615006848, + "learning_rate": 0.00015671097527928795, + "loss": 12.2313, + "step": 12077 + }, + { + "epoch": 0.6576953707296155, + "grad_norm": 0.6120533649848845, + "learning_rate": 0.00015670371196895317, + "loss": 12.3152, + "step": 12078 + }, + { + "epoch": 0.6577498247261985, + "grad_norm": 0.5909407598412671, + "learning_rate": 0.00015669644821768078, + "loss": 12.4194, + "step": 12079 + }, + { + "epoch": 0.6578042787227815, + "grad_norm": 0.5897918822232837, + "learning_rate": 0.0001566891840255273, + "loss": 12.3486, + "step": 12080 + }, + { + "epoch": 0.6578587327193646, + "grad_norm": 0.5691125296830504, + "learning_rate": 0.00015668191939254925, + "loss": 12.2627, + "step": 12081 + }, + { + "epoch": 0.6579131867159476, + "grad_norm": 0.6035693166090563, + "learning_rate": 0.00015667465431880304, + "loss": 12.2438, + "step": 12082 + }, + { + "epoch": 0.6579676407125306, + "grad_norm": 0.6274229351147108, + "learning_rate": 0.00015666738880434523, + "loss": 12.2951, + "step": 12083 + }, + { + "epoch": 0.6580220947091135, + "grad_norm": 0.5970422016358655, + "learning_rate": 0.00015666012284923231, + "loss": 12.1996, + "step": 12084 + }, + { + "epoch": 0.6580765487056965, + "grad_norm": 0.613287883262713, + "learning_rate": 0.00015665285645352078, + "loss": 12.2262, + "step": 12085 + }, + { + "epoch": 0.6581310027022795, + "grad_norm": 0.8218009396210703, + "learning_rate": 0.0001566455896172671, + "loss": 12.6371, + "step": 12086 + }, + { + "epoch": 0.6581854566988626, + "grad_norm": 0.6123225370787431, + "learning_rate": 0.00015663832234052787, + "loss": 12.2022, + "step": 12087 + }, + { + "epoch": 0.6582399106954456, + "grad_norm": 0.5879275100204007, + "learning_rate": 0.0001566310546233595, + "loss": 12.4095, + "step": 12088 + }, + { + "epoch": 0.6582943646920286, + "grad_norm": 0.5717092565134987, + "learning_rate": 0.0001566237864658186, + "loss": 12.3095, + "step": 12089 + }, + { + "epoch": 0.6583488186886116, + "grad_norm": 0.5784460652054834, + "learning_rate": 0.0001566165178679616, + "loss": 12.219, + "step": 12090 + }, + { + "epoch": 0.6584032726851946, + "grad_norm": 0.643811496962863, + "learning_rate": 0.0001566092488298451, + "loss": 12.3034, + "step": 12091 + }, + { + "epoch": 0.6584577266817776, + "grad_norm": 0.5680520389238057, + "learning_rate": 0.00015660197935152555, + "loss": 12.2748, + "step": 12092 + }, + { + "epoch": 0.6585121806783607, + "grad_norm": 0.5790373555853809, + "learning_rate": 0.00015659470943305955, + "loss": 12.3464, + "step": 12093 + }, + { + "epoch": 0.6585666346749437, + "grad_norm": 0.6087938649370235, + "learning_rate": 0.00015658743907450356, + "loss": 12.3078, + "step": 12094 + }, + { + "epoch": 0.6586210886715267, + "grad_norm": 0.6270072053969875, + "learning_rate": 0.00015658016827591417, + "loss": 12.1714, + "step": 12095 + }, + { + "epoch": 0.6586755426681097, + "grad_norm": 0.5861641476491468, + "learning_rate": 0.0001565728970373479, + "loss": 12.3368, + "step": 12096 + }, + { + "epoch": 0.6587299966646927, + "grad_norm": 0.5804427884989929, + "learning_rate": 0.0001565656253588613, + "loss": 12.1728, + "step": 12097 + }, + { + "epoch": 0.6587844506612757, + "grad_norm": 0.5704907114632229, + "learning_rate": 0.00015655835324051093, + "loss": 12.053, + "step": 12098 + }, + { + "epoch": 0.6588389046578588, + "grad_norm": 0.591580493014265, + "learning_rate": 0.0001565510806823533, + "loss": 12.3192, + "step": 12099 + }, + { + "epoch": 0.6588933586544418, + "grad_norm": 0.5779853695319296, + "learning_rate": 0.000156543807684445, + "loss": 12.2956, + "step": 12100 + }, + { + "epoch": 0.6589478126510248, + "grad_norm": 0.6231852103374036, + "learning_rate": 0.00015653653424684255, + "loss": 12.2494, + "step": 12101 + }, + { + "epoch": 0.6590022666476077, + "grad_norm": 0.6667754409013602, + "learning_rate": 0.0001565292603696025, + "loss": 12.3361, + "step": 12102 + }, + { + "epoch": 0.6590567206441907, + "grad_norm": 0.5401924422265222, + "learning_rate": 0.0001565219860527815, + "loss": 12.2276, + "step": 12103 + }, + { + "epoch": 0.6591111746407738, + "grad_norm": 0.6918791259186888, + "learning_rate": 0.00015651471129643602, + "loss": 12.4344, + "step": 12104 + }, + { + "epoch": 0.6591656286373568, + "grad_norm": 0.6604993076391991, + "learning_rate": 0.0001565074361006227, + "loss": 12.134, + "step": 12105 + }, + { + "epoch": 0.6592200826339398, + "grad_norm": 0.5515241712824276, + "learning_rate": 0.00015650016046539806, + "loss": 12.3196, + "step": 12106 + }, + { + "epoch": 0.6592745366305228, + "grad_norm": 0.6419301595352817, + "learning_rate": 0.00015649288439081868, + "loss": 12.0543, + "step": 12107 + }, + { + "epoch": 0.6593289906271058, + "grad_norm": 0.6839742489781926, + "learning_rate": 0.00015648560787694118, + "loss": 12.337, + "step": 12108 + }, + { + "epoch": 0.6593834446236888, + "grad_norm": 0.5489400548407735, + "learning_rate": 0.0001564783309238221, + "loss": 12.3295, + "step": 12109 + }, + { + "epoch": 0.6594378986202719, + "grad_norm": 0.5588116094373012, + "learning_rate": 0.0001564710535315181, + "loss": 12.3351, + "step": 12110 + }, + { + "epoch": 0.6594923526168549, + "grad_norm": 0.6258296469166352, + "learning_rate": 0.00015646377570008565, + "loss": 12.3593, + "step": 12111 + }, + { + "epoch": 0.6595468066134379, + "grad_norm": 0.5714303544699514, + "learning_rate": 0.00015645649742958146, + "loss": 12.2158, + "step": 12112 + }, + { + "epoch": 0.6596012606100209, + "grad_norm": 0.5690123486052074, + "learning_rate": 0.00015644921872006205, + "loss": 12.3059, + "step": 12113 + }, + { + "epoch": 0.6596557146066039, + "grad_norm": 0.5998350963031981, + "learning_rate": 0.00015644193957158406, + "loss": 12.2973, + "step": 12114 + }, + { + "epoch": 0.6597101686031869, + "grad_norm": 0.6126527081622569, + "learning_rate": 0.0001564346599842041, + "loss": 12.3787, + "step": 12115 + }, + { + "epoch": 0.65976462259977, + "grad_norm": 0.6175210777858374, + "learning_rate": 0.00015642737995797873, + "loss": 12.3521, + "step": 12116 + }, + { + "epoch": 0.659819076596353, + "grad_norm": 0.6616911257059388, + "learning_rate": 0.0001564200994929646, + "loss": 12.1806, + "step": 12117 + }, + { + "epoch": 0.659873530592936, + "grad_norm": 0.62720735050687, + "learning_rate": 0.00015641281858921833, + "loss": 12.34, + "step": 12118 + }, + { + "epoch": 0.659927984589519, + "grad_norm": 0.5898731792627565, + "learning_rate": 0.00015640553724679648, + "loss": 12.2155, + "step": 12119 + }, + { + "epoch": 0.659982438586102, + "grad_norm": 0.6070751981197668, + "learning_rate": 0.00015639825546575576, + "loss": 12.1565, + "step": 12120 + }, + { + "epoch": 0.6600368925826849, + "grad_norm": 0.6412845745278387, + "learning_rate": 0.00015639097324615273, + "loss": 12.5173, + "step": 12121 + }, + { + "epoch": 0.660091346579268, + "grad_norm": 0.6269784738662666, + "learning_rate": 0.00015638369058804404, + "loss": 12.2585, + "step": 12122 + }, + { + "epoch": 0.660145800575851, + "grad_norm": 0.5692248201202506, + "learning_rate": 0.0001563764074914863, + "loss": 12.1797, + "step": 12123 + }, + { + "epoch": 0.660200254572434, + "grad_norm": 0.5412511925083626, + "learning_rate": 0.00015636912395653618, + "loss": 12.3225, + "step": 12124 + }, + { + "epoch": 0.660254708569017, + "grad_norm": 0.5678289684620178, + "learning_rate": 0.00015636183998325026, + "loss": 12.332, + "step": 12125 + }, + { + "epoch": 0.6603091625656, + "grad_norm": 0.6016993154089291, + "learning_rate": 0.00015635455557168527, + "loss": 12.3073, + "step": 12126 + }, + { + "epoch": 0.660363616562183, + "grad_norm": 0.6483298042378929, + "learning_rate": 0.00015634727072189782, + "loss": 12.2413, + "step": 12127 + }, + { + "epoch": 0.6604180705587661, + "grad_norm": 0.5863492923591763, + "learning_rate": 0.00015633998543394448, + "loss": 12.2328, + "step": 12128 + }, + { + "epoch": 0.6604725245553491, + "grad_norm": 0.5801873293209294, + "learning_rate": 0.00015633269970788201, + "loss": 12.3545, + "step": 12129 + }, + { + "epoch": 0.6605269785519321, + "grad_norm": 0.5478890482606156, + "learning_rate": 0.00015632541354376698, + "loss": 12.3379, + "step": 12130 + }, + { + "epoch": 0.6605814325485151, + "grad_norm": 0.5774737062533749, + "learning_rate": 0.00015631812694165612, + "loss": 12.3044, + "step": 12131 + }, + { + "epoch": 0.6606358865450981, + "grad_norm": 0.5952639698733143, + "learning_rate": 0.00015631083990160605, + "loss": 12.2476, + "step": 12132 + }, + { + "epoch": 0.6606903405416811, + "grad_norm": 0.523753329481436, + "learning_rate": 0.00015630355242367348, + "loss": 12.1817, + "step": 12133 + }, + { + "epoch": 0.6607447945382642, + "grad_norm": 0.5474573088759221, + "learning_rate": 0.000156296264507915, + "loss": 12.2149, + "step": 12134 + }, + { + "epoch": 0.6607992485348472, + "grad_norm": 0.6304369889579475, + "learning_rate": 0.0001562889761543873, + "loss": 12.4091, + "step": 12135 + }, + { + "epoch": 0.6608537025314302, + "grad_norm": 0.6699170088289395, + "learning_rate": 0.00015628168736314717, + "loss": 12.4412, + "step": 12136 + }, + { + "epoch": 0.6609081565280132, + "grad_norm": 0.5538506368520014, + "learning_rate": 0.00015627439813425115, + "loss": 12.282, + "step": 12137 + }, + { + "epoch": 0.6609626105245962, + "grad_norm": 0.5978091609740449, + "learning_rate": 0.00015626710846775596, + "loss": 12.3531, + "step": 12138 + }, + { + "epoch": 0.6610170645211793, + "grad_norm": 0.5623515777462447, + "learning_rate": 0.00015625981836371833, + "loss": 12.3091, + "step": 12139 + }, + { + "epoch": 0.6610715185177622, + "grad_norm": 0.5864998538286688, + "learning_rate": 0.0001562525278221949, + "loss": 12.1195, + "step": 12140 + }, + { + "epoch": 0.6611259725143452, + "grad_norm": 0.5896243382799916, + "learning_rate": 0.00015624523684324237, + "loss": 12.2116, + "step": 12141 + }, + { + "epoch": 0.6611804265109282, + "grad_norm": 0.5491996186435824, + "learning_rate": 0.0001562379454269175, + "loss": 12.2354, + "step": 12142 + }, + { + "epoch": 0.6612348805075112, + "grad_norm": 0.6777309207922793, + "learning_rate": 0.00015623065357327684, + "loss": 12.2775, + "step": 12143 + }, + { + "epoch": 0.6612893345040942, + "grad_norm": 0.6302362156115535, + "learning_rate": 0.00015622336128237726, + "loss": 12.1918, + "step": 12144 + }, + { + "epoch": 0.6613437885006773, + "grad_norm": 0.650374195038451, + "learning_rate": 0.00015621606855427538, + "loss": 12.3677, + "step": 12145 + }, + { + "epoch": 0.6613982424972603, + "grad_norm": 0.5932691651104086, + "learning_rate": 0.0001562087753890279, + "loss": 12.3541, + "step": 12146 + }, + { + "epoch": 0.6614526964938433, + "grad_norm": 0.5680699252633874, + "learning_rate": 0.00015620148178669161, + "loss": 12.3498, + "step": 12147 + }, + { + "epoch": 0.6615071504904263, + "grad_norm": 0.6155619498335252, + "learning_rate": 0.0001561941877473231, + "loss": 12.195, + "step": 12148 + }, + { + "epoch": 0.6615616044870093, + "grad_norm": 0.6051882680021209, + "learning_rate": 0.0001561868932709792, + "loss": 12.1634, + "step": 12149 + }, + { + "epoch": 0.6616160584835923, + "grad_norm": 0.5986268325000902, + "learning_rate": 0.00015617959835771662, + "loss": 12.3676, + "step": 12150 + }, + { + "epoch": 0.6616705124801754, + "grad_norm": 0.5727966647803312, + "learning_rate": 0.00015617230300759202, + "loss": 12.2913, + "step": 12151 + }, + { + "epoch": 0.6617249664767584, + "grad_norm": 0.5545194798519972, + "learning_rate": 0.00015616500722066219, + "loss": 12.377, + "step": 12152 + }, + { + "epoch": 0.6617794204733414, + "grad_norm": 0.5641449445635649, + "learning_rate": 0.00015615771099698384, + "loss": 12.2211, + "step": 12153 + }, + { + "epoch": 0.6618338744699244, + "grad_norm": 0.60254720612745, + "learning_rate": 0.0001561504143366137, + "loss": 12.2442, + "step": 12154 + }, + { + "epoch": 0.6618883284665074, + "grad_norm": 0.5907651025492713, + "learning_rate": 0.00015614311723960854, + "loss": 12.35, + "step": 12155 + }, + { + "epoch": 0.6619427824630904, + "grad_norm": 0.708172101181777, + "learning_rate": 0.0001561358197060251, + "loss": 12.3802, + "step": 12156 + }, + { + "epoch": 0.6619972364596735, + "grad_norm": 0.5675398599531235, + "learning_rate": 0.0001561285217359201, + "loss": 12.3175, + "step": 12157 + }, + { + "epoch": 0.6620516904562564, + "grad_norm": 0.6494733439949993, + "learning_rate": 0.0001561212233293503, + "loss": 12.2998, + "step": 12158 + }, + { + "epoch": 0.6621061444528394, + "grad_norm": 0.5949033966930486, + "learning_rate": 0.00015611392448637243, + "loss": 12.2403, + "step": 12159 + }, + { + "epoch": 0.6621605984494224, + "grad_norm": 0.597294651572877, + "learning_rate": 0.0001561066252070433, + "loss": 12.3803, + "step": 12160 + }, + { + "epoch": 0.6622150524460054, + "grad_norm": 0.6428094862347965, + "learning_rate": 0.00015609932549141966, + "loss": 12.4209, + "step": 12161 + }, + { + "epoch": 0.6622695064425884, + "grad_norm": 0.6673986827425881, + "learning_rate": 0.00015609202533955823, + "loss": 12.3467, + "step": 12162 + }, + { + "epoch": 0.6623239604391715, + "grad_norm": 0.5701563550210614, + "learning_rate": 0.00015608472475151582, + "loss": 12.1723, + "step": 12163 + }, + { + "epoch": 0.6623784144357545, + "grad_norm": 0.627956929952825, + "learning_rate": 0.00015607742372734915, + "loss": 12.3973, + "step": 12164 + }, + { + "epoch": 0.6624328684323375, + "grad_norm": 0.6038635446801585, + "learning_rate": 0.00015607012226711507, + "loss": 12.3268, + "step": 12165 + }, + { + "epoch": 0.6624873224289205, + "grad_norm": 0.5473757396652543, + "learning_rate": 0.0001560628203708703, + "loss": 12.2702, + "step": 12166 + }, + { + "epoch": 0.6625417764255035, + "grad_norm": 0.554151613594465, + "learning_rate": 0.00015605551803867163, + "loss": 12.2511, + "step": 12167 + }, + { + "epoch": 0.6625962304220866, + "grad_norm": 0.6729198964343531, + "learning_rate": 0.00015604821527057588, + "loss": 12.4947, + "step": 12168 + }, + { + "epoch": 0.6626506844186696, + "grad_norm": 0.6180742038416029, + "learning_rate": 0.00015604091206663977, + "loss": 12.301, + "step": 12169 + }, + { + "epoch": 0.6627051384152526, + "grad_norm": 0.5713033910454256, + "learning_rate": 0.00015603360842692015, + "loss": 12.3176, + "step": 12170 + }, + { + "epoch": 0.6627595924118356, + "grad_norm": 0.599816381001002, + "learning_rate": 0.0001560263043514738, + "loss": 12.2241, + "step": 12171 + }, + { + "epoch": 0.6628140464084186, + "grad_norm": 0.5693130389885763, + "learning_rate": 0.0001560189998403575, + "loss": 12.353, + "step": 12172 + }, + { + "epoch": 0.6628685004050016, + "grad_norm": 0.5702316741912086, + "learning_rate": 0.00015601169489362805, + "loss": 12.1861, + "step": 12173 + }, + { + "epoch": 0.6629229544015847, + "grad_norm": 0.5621961876678578, + "learning_rate": 0.0001560043895113423, + "loss": 12.1976, + "step": 12174 + }, + { + "epoch": 0.6629774083981677, + "grad_norm": 0.6012569929692483, + "learning_rate": 0.000155997083693557, + "loss": 12.238, + "step": 12175 + }, + { + "epoch": 0.6630318623947506, + "grad_norm": 0.5302971343374439, + "learning_rate": 0.00015598977744032898, + "loss": 12.2912, + "step": 12176 + }, + { + "epoch": 0.6630863163913336, + "grad_norm": 0.6054268327809826, + "learning_rate": 0.00015598247075171507, + "loss": 12.3391, + "step": 12177 + }, + { + "epoch": 0.6631407703879166, + "grad_norm": 0.6130891768269731, + "learning_rate": 0.00015597516362777212, + "loss": 12.3841, + "step": 12178 + }, + { + "epoch": 0.6631952243844996, + "grad_norm": 0.595933876472692, + "learning_rate": 0.00015596785606855686, + "loss": 12.3136, + "step": 12179 + }, + { + "epoch": 0.6632496783810827, + "grad_norm": 0.5843406076220462, + "learning_rate": 0.00015596054807412617, + "loss": 12.1744, + "step": 12180 + }, + { + "epoch": 0.6633041323776657, + "grad_norm": 0.6031736206179926, + "learning_rate": 0.00015595323964453687, + "loss": 12.287, + "step": 12181 + }, + { + "epoch": 0.6633585863742487, + "grad_norm": 0.599656465734208, + "learning_rate": 0.00015594593077984583, + "loss": 12.2659, + "step": 12182 + }, + { + "epoch": 0.6634130403708317, + "grad_norm": 0.5738100710286648, + "learning_rate": 0.00015593862148010983, + "loss": 12.417, + "step": 12183 + }, + { + "epoch": 0.6634674943674147, + "grad_norm": 0.6222520183921914, + "learning_rate": 0.0001559313117453857, + "loss": 12.3374, + "step": 12184 + }, + { + "epoch": 0.6635219483639977, + "grad_norm": 0.543589456981991, + "learning_rate": 0.00015592400157573034, + "loss": 12.2794, + "step": 12185 + }, + { + "epoch": 0.6635764023605808, + "grad_norm": 0.6115018237920168, + "learning_rate": 0.00015591669097120056, + "loss": 12.2219, + "step": 12186 + }, + { + "epoch": 0.6636308563571638, + "grad_norm": 0.5458482281292727, + "learning_rate": 0.00015590937993185323, + "loss": 12.2878, + "step": 12187 + }, + { + "epoch": 0.6636853103537468, + "grad_norm": 0.5843729540859018, + "learning_rate": 0.0001559020684577452, + "loss": 12.3356, + "step": 12188 + }, + { + "epoch": 0.6637397643503298, + "grad_norm": 0.6558099047970367, + "learning_rate": 0.00015589475654893326, + "loss": 12.1897, + "step": 12189 + }, + { + "epoch": 0.6637942183469128, + "grad_norm": 0.5952426948130463, + "learning_rate": 0.00015588744420547433, + "loss": 12.3942, + "step": 12190 + }, + { + "epoch": 0.6638486723434958, + "grad_norm": 0.5839362546215011, + "learning_rate": 0.0001558801314274253, + "loss": 12.4069, + "step": 12191 + }, + { + "epoch": 0.6639031263400789, + "grad_norm": 0.8111332595455524, + "learning_rate": 0.00015587281821484295, + "loss": 12.4592, + "step": 12192 + }, + { + "epoch": 0.6639575803366619, + "grad_norm": 0.5717171049299086, + "learning_rate": 0.00015586550456778424, + "loss": 12.2548, + "step": 12193 + }, + { + "epoch": 0.6640120343332448, + "grad_norm": 0.6587477339077662, + "learning_rate": 0.00015585819048630597, + "loss": 12.3692, + "step": 12194 + }, + { + "epoch": 0.6640664883298278, + "grad_norm": 0.7716794105304356, + "learning_rate": 0.00015585087597046505, + "loss": 12.1079, + "step": 12195 + }, + { + "epoch": 0.6641209423264108, + "grad_norm": 0.6176557578909904, + "learning_rate": 0.00015584356102031833, + "loss": 12.402, + "step": 12196 + }, + { + "epoch": 0.6641753963229938, + "grad_norm": 0.860535049891481, + "learning_rate": 0.00015583624563592275, + "loss": 12.201, + "step": 12197 + }, + { + "epoch": 0.6642298503195769, + "grad_norm": 0.6206160971273634, + "learning_rate": 0.00015582892981733513, + "loss": 12.32, + "step": 12198 + }, + { + "epoch": 0.6642843043161599, + "grad_norm": 0.5984237246171267, + "learning_rate": 0.00015582161356461242, + "loss": 12.2633, + "step": 12199 + }, + { + "epoch": 0.6643387583127429, + "grad_norm": 0.7078818570229009, + "learning_rate": 0.00015581429687781147, + "loss": 12.3191, + "step": 12200 + }, + { + "epoch": 0.6643932123093259, + "grad_norm": 0.6105722808230988, + "learning_rate": 0.00015580697975698917, + "loss": 12.2233, + "step": 12201 + }, + { + "epoch": 0.6644476663059089, + "grad_norm": 0.6439423699459289, + "learning_rate": 0.00015579966220220247, + "loss": 12.3645, + "step": 12202 + }, + { + "epoch": 0.664502120302492, + "grad_norm": 0.6489363836579027, + "learning_rate": 0.0001557923442135082, + "loss": 12.3112, + "step": 12203 + }, + { + "epoch": 0.664556574299075, + "grad_norm": 0.6137942271046831, + "learning_rate": 0.00015578502579096336, + "loss": 12.32, + "step": 12204 + }, + { + "epoch": 0.664611028295658, + "grad_norm": 0.5813939230459977, + "learning_rate": 0.00015577770693462475, + "loss": 12.3516, + "step": 12205 + }, + { + "epoch": 0.664665482292241, + "grad_norm": 0.7177450492577477, + "learning_rate": 0.00015577038764454936, + "loss": 12.2759, + "step": 12206 + }, + { + "epoch": 0.664719936288824, + "grad_norm": 0.5908558121589046, + "learning_rate": 0.00015576306792079408, + "loss": 12.264, + "step": 12207 + }, + { + "epoch": 0.664774390285407, + "grad_norm": 0.5941370823923019, + "learning_rate": 0.00015575574776341582, + "loss": 12.3181, + "step": 12208 + }, + { + "epoch": 0.6648288442819901, + "grad_norm": 0.6088651213946604, + "learning_rate": 0.00015574842717247154, + "loss": 12.2413, + "step": 12209 + }, + { + "epoch": 0.6648832982785731, + "grad_norm": 0.6926069129225368, + "learning_rate": 0.00015574110614801812, + "loss": 12.2957, + "step": 12210 + }, + { + "epoch": 0.6649377522751561, + "grad_norm": 0.5973284373065706, + "learning_rate": 0.00015573378469011252, + "loss": 12.4042, + "step": 12211 + }, + { + "epoch": 0.664992206271739, + "grad_norm": 0.6332900499532489, + "learning_rate": 0.00015572646279881166, + "loss": 12.3268, + "step": 12212 + }, + { + "epoch": 0.665046660268322, + "grad_norm": 0.5964173395692406, + "learning_rate": 0.0001557191404741725, + "loss": 12.3616, + "step": 12213 + }, + { + "epoch": 0.665101114264905, + "grad_norm": 0.5895920125238943, + "learning_rate": 0.00015571181771625192, + "loss": 12.3699, + "step": 12214 + }, + { + "epoch": 0.6651555682614881, + "grad_norm": 0.6704097046230486, + "learning_rate": 0.00015570449452510693, + "loss": 12.502, + "step": 12215 + }, + { + "epoch": 0.6652100222580711, + "grad_norm": 0.5794962651301736, + "learning_rate": 0.00015569717090079444, + "loss": 12.1452, + "step": 12216 + }, + { + "epoch": 0.6652644762546541, + "grad_norm": 0.5920778321000353, + "learning_rate": 0.0001556898468433714, + "loss": 12.1568, + "step": 12217 + }, + { + "epoch": 0.6653189302512371, + "grad_norm": 0.6403214063903778, + "learning_rate": 0.0001556825223528948, + "loss": 12.2718, + "step": 12218 + }, + { + "epoch": 0.6653733842478201, + "grad_norm": 0.6007841160667045, + "learning_rate": 0.00015567519742942153, + "loss": 12.3545, + "step": 12219 + }, + { + "epoch": 0.6654278382444031, + "grad_norm": 0.6186117578106466, + "learning_rate": 0.00015566787207300863, + "loss": 12.3221, + "step": 12220 + }, + { + "epoch": 0.6654822922409862, + "grad_norm": 0.6571254360260127, + "learning_rate": 0.000155660546283713, + "loss": 12.4017, + "step": 12221 + }, + { + "epoch": 0.6655367462375692, + "grad_norm": 0.7613889115243458, + "learning_rate": 0.00015565322006159163, + "loss": 12.3841, + "step": 12222 + }, + { + "epoch": 0.6655912002341522, + "grad_norm": 0.6264136875700904, + "learning_rate": 0.00015564589340670147, + "loss": 12.3541, + "step": 12223 + }, + { + "epoch": 0.6656456542307352, + "grad_norm": 0.5637844084922261, + "learning_rate": 0.00015563856631909954, + "loss": 12.1914, + "step": 12224 + }, + { + "epoch": 0.6657001082273182, + "grad_norm": 0.5824889512283538, + "learning_rate": 0.00015563123879884278, + "loss": 12.3026, + "step": 12225 + }, + { + "epoch": 0.6657545622239012, + "grad_norm": 0.6220760352001354, + "learning_rate": 0.00015562391084598818, + "loss": 12.3276, + "step": 12226 + }, + { + "epoch": 0.6658090162204843, + "grad_norm": 0.5298688423843939, + "learning_rate": 0.00015561658246059272, + "loss": 12.2705, + "step": 12227 + }, + { + "epoch": 0.6658634702170673, + "grad_norm": 0.5625501538555766, + "learning_rate": 0.00015560925364271338, + "loss": 12.4409, + "step": 12228 + }, + { + "epoch": 0.6659179242136503, + "grad_norm": 0.6663848890519213, + "learning_rate": 0.00015560192439240716, + "loss": 12.2582, + "step": 12229 + }, + { + "epoch": 0.6659723782102333, + "grad_norm": 0.5904031545317446, + "learning_rate": 0.00015559459470973103, + "loss": 12.1851, + "step": 12230 + }, + { + "epoch": 0.6660268322068162, + "grad_norm": 0.5849650349945908, + "learning_rate": 0.00015558726459474203, + "loss": 12.2091, + "step": 12231 + }, + { + "epoch": 0.6660812862033992, + "grad_norm": 0.5324786902705425, + "learning_rate": 0.00015557993404749715, + "loss": 12.2008, + "step": 12232 + }, + { + "epoch": 0.6661357401999823, + "grad_norm": 0.5968945370069183, + "learning_rate": 0.00015557260306805337, + "loss": 12.4381, + "step": 12233 + }, + { + "epoch": 0.6661901941965653, + "grad_norm": 0.5838181459219236, + "learning_rate": 0.0001555652716564677, + "loss": 12.2553, + "step": 12234 + }, + { + "epoch": 0.6662446481931483, + "grad_norm": 0.5742301697411183, + "learning_rate": 0.00015555793981279718, + "loss": 12.2566, + "step": 12235 + }, + { + "epoch": 0.6662991021897313, + "grad_norm": 0.5914965836401413, + "learning_rate": 0.0001555506075370988, + "loss": 12.3349, + "step": 12236 + }, + { + "epoch": 0.6663535561863143, + "grad_norm": 0.6420592487637078, + "learning_rate": 0.00015554327482942957, + "loss": 12.2938, + "step": 12237 + }, + { + "epoch": 0.6664080101828974, + "grad_norm": 0.5607050708073679, + "learning_rate": 0.00015553594168984654, + "loss": 12.2412, + "step": 12238 + }, + { + "epoch": 0.6664624641794804, + "grad_norm": 0.5609018375686214, + "learning_rate": 0.00015552860811840667, + "loss": 12.2956, + "step": 12239 + }, + { + "epoch": 0.6665169181760634, + "grad_norm": 0.5809352702800051, + "learning_rate": 0.00015552127411516702, + "loss": 12.2465, + "step": 12240 + }, + { + "epoch": 0.6665713721726464, + "grad_norm": 0.5842398929846088, + "learning_rate": 0.0001555139396801847, + "loss": 12.2001, + "step": 12241 + }, + { + "epoch": 0.6666258261692294, + "grad_norm": 0.5639530150176731, + "learning_rate": 0.00015550660481351668, + "loss": 12.3483, + "step": 12242 + }, + { + "epoch": 0.6666802801658124, + "grad_norm": 0.5409246664466759, + "learning_rate": 0.00015549926951521993, + "loss": 12.306, + "step": 12243 + }, + { + "epoch": 0.6667347341623955, + "grad_norm": 0.6340991968931381, + "learning_rate": 0.0001554919337853516, + "loss": 12.4283, + "step": 12244 + }, + { + "epoch": 0.6667891881589785, + "grad_norm": 0.6306748542962477, + "learning_rate": 0.00015548459762396863, + "loss": 12.3561, + "step": 12245 + }, + { + "epoch": 0.6668436421555615, + "grad_norm": 0.6224388651294798, + "learning_rate": 0.00015547726103112817, + "loss": 12.407, + "step": 12246 + }, + { + "epoch": 0.6668980961521445, + "grad_norm": 0.524635684961415, + "learning_rate": 0.00015546992400688724, + "loss": 12.3067, + "step": 12247 + }, + { + "epoch": 0.6669525501487275, + "grad_norm": 0.589540221009856, + "learning_rate": 0.00015546258655130285, + "loss": 12.2554, + "step": 12248 + }, + { + "epoch": 0.6670070041453104, + "grad_norm": 0.5622070308010497, + "learning_rate": 0.00015545524866443207, + "loss": 12.1965, + "step": 12249 + }, + { + "epoch": 0.6670614581418935, + "grad_norm": 0.5203199740512348, + "learning_rate": 0.000155447910346332, + "loss": 12.2916, + "step": 12250 + }, + { + "epoch": 0.6671159121384765, + "grad_norm": 0.5798442999382576, + "learning_rate": 0.00015544057159705966, + "loss": 12.3144, + "step": 12251 + }, + { + "epoch": 0.6671703661350595, + "grad_norm": 0.5390908971582956, + "learning_rate": 0.00015543323241667216, + "loss": 12.2795, + "step": 12252 + }, + { + "epoch": 0.6672248201316425, + "grad_norm": 0.5963423017492206, + "learning_rate": 0.00015542589280522658, + "loss": 12.2141, + "step": 12253 + }, + { + "epoch": 0.6672792741282255, + "grad_norm": 0.5845700390262353, + "learning_rate": 0.0001554185527627799, + "loss": 12.3106, + "step": 12254 + }, + { + "epoch": 0.6673337281248085, + "grad_norm": 0.6140335245300805, + "learning_rate": 0.0001554112122893893, + "loss": 12.3444, + "step": 12255 + }, + { + "epoch": 0.6673881821213916, + "grad_norm": 0.6129360063082528, + "learning_rate": 0.00015540387138511177, + "loss": 12.3694, + "step": 12256 + }, + { + "epoch": 0.6674426361179746, + "grad_norm": 0.5833285955444938, + "learning_rate": 0.0001553965300500045, + "loss": 12.2542, + "step": 12257 + }, + { + "epoch": 0.6674970901145576, + "grad_norm": 0.6353115649335048, + "learning_rate": 0.0001553891882841245, + "loss": 12.4273, + "step": 12258 + }, + { + "epoch": 0.6675515441111406, + "grad_norm": 0.5726108332932701, + "learning_rate": 0.00015538184608752888, + "loss": 12.1541, + "step": 12259 + }, + { + "epoch": 0.6676059981077236, + "grad_norm": 0.6688286875161022, + "learning_rate": 0.00015537450346027475, + "loss": 12.3407, + "step": 12260 + }, + { + "epoch": 0.6676604521043066, + "grad_norm": 0.6635128458029128, + "learning_rate": 0.0001553671604024192, + "loss": 12.3662, + "step": 12261 + }, + { + "epoch": 0.6677149061008897, + "grad_norm": 0.616898015401075, + "learning_rate": 0.0001553598169140193, + "loss": 12.2972, + "step": 12262 + }, + { + "epoch": 0.6677693600974727, + "grad_norm": 0.6165516831846334, + "learning_rate": 0.0001553524729951322, + "loss": 12.1998, + "step": 12263 + }, + { + "epoch": 0.6678238140940557, + "grad_norm": 0.5950312797869388, + "learning_rate": 0.00015534512864581499, + "loss": 12.1871, + "step": 12264 + }, + { + "epoch": 0.6678782680906387, + "grad_norm": 0.5886345418949827, + "learning_rate": 0.00015533778386612478, + "loss": 12.3738, + "step": 12265 + }, + { + "epoch": 0.6679327220872217, + "grad_norm": 0.5797917759980304, + "learning_rate": 0.00015533043865611864, + "loss": 12.259, + "step": 12266 + }, + { + "epoch": 0.6679871760838046, + "grad_norm": 0.6897782017418148, + "learning_rate": 0.0001553230930158538, + "loss": 12.3921, + "step": 12267 + }, + { + "epoch": 0.6680416300803877, + "grad_norm": 0.5643376226552791, + "learning_rate": 0.00015531574694538727, + "loss": 12.3513, + "step": 12268 + }, + { + "epoch": 0.6680960840769707, + "grad_norm": 0.5793628655866306, + "learning_rate": 0.00015530840044477619, + "loss": 12.2912, + "step": 12269 + }, + { + "epoch": 0.6681505380735537, + "grad_norm": 0.524965459789386, + "learning_rate": 0.00015530105351407777, + "loss": 12.2108, + "step": 12270 + }, + { + "epoch": 0.6682049920701367, + "grad_norm": 0.588021490527018, + "learning_rate": 0.00015529370615334904, + "loss": 12.2733, + "step": 12271 + }, + { + "epoch": 0.6682594460667197, + "grad_norm": 0.5918043083770076, + "learning_rate": 0.0001552863583626472, + "loss": 12.3017, + "step": 12272 + }, + { + "epoch": 0.6683139000633028, + "grad_norm": 0.5349154769643079, + "learning_rate": 0.00015527901014202936, + "loss": 12.1857, + "step": 12273 + }, + { + "epoch": 0.6683683540598858, + "grad_norm": 0.5826956440854895, + "learning_rate": 0.00015527166149155268, + "loss": 12.1369, + "step": 12274 + }, + { + "epoch": 0.6684228080564688, + "grad_norm": 0.6401619839037013, + "learning_rate": 0.00015526431241127427, + "loss": 12.3254, + "step": 12275 + }, + { + "epoch": 0.6684772620530518, + "grad_norm": 0.6293240232898998, + "learning_rate": 0.0001552569629012513, + "loss": 12.3404, + "step": 12276 + }, + { + "epoch": 0.6685317160496348, + "grad_norm": 0.5878958409192327, + "learning_rate": 0.00015524961296154092, + "loss": 12.2896, + "step": 12277 + }, + { + "epoch": 0.6685861700462178, + "grad_norm": 0.5434146624970057, + "learning_rate": 0.0001552422625922003, + "loss": 12.2185, + "step": 12278 + }, + { + "epoch": 0.6686406240428009, + "grad_norm": 0.6564827281971444, + "learning_rate": 0.00015523491179328657, + "loss": 12.3515, + "step": 12279 + }, + { + "epoch": 0.6686950780393839, + "grad_norm": 0.6563677584404193, + "learning_rate": 0.0001552275605648569, + "loss": 12.3971, + "step": 12280 + }, + { + "epoch": 0.6687495320359669, + "grad_norm": 0.639262551518245, + "learning_rate": 0.00015522020890696846, + "loss": 12.2402, + "step": 12281 + }, + { + "epoch": 0.6688039860325499, + "grad_norm": 0.7951500546832201, + "learning_rate": 0.00015521285681967844, + "loss": 12.2807, + "step": 12282 + }, + { + "epoch": 0.6688584400291329, + "grad_norm": 0.599505447244044, + "learning_rate": 0.00015520550430304393, + "loss": 12.3153, + "step": 12283 + }, + { + "epoch": 0.6689128940257159, + "grad_norm": 0.6209193433879819, + "learning_rate": 0.0001551981513571222, + "loss": 12.2535, + "step": 12284 + }, + { + "epoch": 0.668967348022299, + "grad_norm": 0.7257855828873232, + "learning_rate": 0.00015519079798197042, + "loss": 12.321, + "step": 12285 + }, + { + "epoch": 0.669021802018882, + "grad_norm": 0.5920719331977042, + "learning_rate": 0.00015518344417764568, + "loss": 12.2257, + "step": 12286 + }, + { + "epoch": 0.6690762560154649, + "grad_norm": 0.6424559920639874, + "learning_rate": 0.00015517608994420527, + "loss": 12.2562, + "step": 12287 + }, + { + "epoch": 0.6691307100120479, + "grad_norm": 0.7367637717941627, + "learning_rate": 0.0001551687352817063, + "loss": 12.4215, + "step": 12288 + }, + { + "epoch": 0.6691851640086309, + "grad_norm": 0.6389986361996636, + "learning_rate": 0.00015516138019020602, + "loss": 12.3891, + "step": 12289 + }, + { + "epoch": 0.6692396180052139, + "grad_norm": 0.7193207098366835, + "learning_rate": 0.00015515402466976155, + "loss": 12.4082, + "step": 12290 + }, + { + "epoch": 0.669294072001797, + "grad_norm": 0.7145208288974416, + "learning_rate": 0.00015514666872043015, + "loss": 12.3602, + "step": 12291 + }, + { + "epoch": 0.66934852599838, + "grad_norm": 0.6881150012113569, + "learning_rate": 0.00015513931234226904, + "loss": 12.4423, + "step": 12292 + }, + { + "epoch": 0.669402979994963, + "grad_norm": 0.745635936605013, + "learning_rate": 0.00015513195553533537, + "loss": 12.3203, + "step": 12293 + }, + { + "epoch": 0.669457433991546, + "grad_norm": 0.6162984481069275, + "learning_rate": 0.00015512459829968638, + "loss": 12.2663, + "step": 12294 + }, + { + "epoch": 0.669511887988129, + "grad_norm": 0.6281275905245626, + "learning_rate": 0.00015511724063537922, + "loss": 12.3963, + "step": 12295 + }, + { + "epoch": 0.669566341984712, + "grad_norm": 0.7733149049652096, + "learning_rate": 0.00015510988254247121, + "loss": 12.3302, + "step": 12296 + }, + { + "epoch": 0.6696207959812951, + "grad_norm": 0.5780855891143521, + "learning_rate": 0.0001551025240210195, + "loss": 12.282, + "step": 12297 + }, + { + "epoch": 0.6696752499778781, + "grad_norm": 0.6068608143213838, + "learning_rate": 0.00015509516507108132, + "loss": 12.212, + "step": 12298 + }, + { + "epoch": 0.6697297039744611, + "grad_norm": 0.7063211921350746, + "learning_rate": 0.00015508780569271387, + "loss": 12.3415, + "step": 12299 + }, + { + "epoch": 0.6697841579710441, + "grad_norm": 0.5802194256618435, + "learning_rate": 0.00015508044588597442, + "loss": 12.3357, + "step": 12300 + }, + { + "epoch": 0.6698386119676271, + "grad_norm": 0.6195286163771041, + "learning_rate": 0.0001550730856509202, + "loss": 12.3546, + "step": 12301 + }, + { + "epoch": 0.6698930659642102, + "grad_norm": 0.826825832697903, + "learning_rate": 0.00015506572498760843, + "loss": 12.3014, + "step": 12302 + }, + { + "epoch": 0.6699475199607932, + "grad_norm": 0.6368084289094414, + "learning_rate": 0.00015505836389609633, + "loss": 12.3103, + "step": 12303 + }, + { + "epoch": 0.6700019739573762, + "grad_norm": 0.6280506242277251, + "learning_rate": 0.00015505100237644118, + "loss": 12.1298, + "step": 12304 + }, + { + "epoch": 0.6700564279539591, + "grad_norm": 0.6526406731752941, + "learning_rate": 0.0001550436404287002, + "loss": 12.4316, + "step": 12305 + }, + { + "epoch": 0.6701108819505421, + "grad_norm": 0.6004196297894798, + "learning_rate": 0.0001550362780529306, + "loss": 12.3352, + "step": 12306 + }, + { + "epoch": 0.6701653359471251, + "grad_norm": 0.6262109885265384, + "learning_rate": 0.00015502891524918973, + "loss": 12.215, + "step": 12307 + }, + { + "epoch": 0.6702197899437082, + "grad_norm": 0.6828665022831916, + "learning_rate": 0.00015502155201753477, + "loss": 12.3009, + "step": 12308 + }, + { + "epoch": 0.6702742439402912, + "grad_norm": 0.5860601334328157, + "learning_rate": 0.000155014188358023, + "loss": 12.3318, + "step": 12309 + }, + { + "epoch": 0.6703286979368742, + "grad_norm": 0.6511973526521048, + "learning_rate": 0.00015500682427071162, + "loss": 12.4775, + "step": 12310 + }, + { + "epoch": 0.6703831519334572, + "grad_norm": 0.5563454571783759, + "learning_rate": 0.000154999459755658, + "loss": 12.2955, + "step": 12311 + }, + { + "epoch": 0.6704376059300402, + "grad_norm": 0.5392087483489257, + "learning_rate": 0.00015499209481291935, + "loss": 12.2512, + "step": 12312 + }, + { + "epoch": 0.6704920599266232, + "grad_norm": 0.5933980380777396, + "learning_rate": 0.00015498472944255296, + "loss": 12.2228, + "step": 12313 + }, + { + "epoch": 0.6705465139232063, + "grad_norm": 0.578197621670821, + "learning_rate": 0.0001549773636446161, + "loss": 12.4391, + "step": 12314 + }, + { + "epoch": 0.6706009679197893, + "grad_norm": 0.5833579940210534, + "learning_rate": 0.000154969997419166, + "loss": 12.443, + "step": 12315 + }, + { + "epoch": 0.6706554219163723, + "grad_norm": 0.5603252629323376, + "learning_rate": 0.00015496263076626, + "loss": 12.1525, + "step": 12316 + }, + { + "epoch": 0.6707098759129553, + "grad_norm": 0.5888360644780639, + "learning_rate": 0.0001549552636859554, + "loss": 12.2737, + "step": 12317 + }, + { + "epoch": 0.6707643299095383, + "grad_norm": 0.6644092677788832, + "learning_rate": 0.00015494789617830938, + "loss": 12.1984, + "step": 12318 + }, + { + "epoch": 0.6708187839061213, + "grad_norm": 0.5836970411791585, + "learning_rate": 0.00015494052824337938, + "loss": 12.3877, + "step": 12319 + }, + { + "epoch": 0.6708732379027044, + "grad_norm": 0.6424242091880643, + "learning_rate": 0.00015493315988122258, + "loss": 12.3825, + "step": 12320 + }, + { + "epoch": 0.6709276918992874, + "grad_norm": 0.6002102631739418, + "learning_rate": 0.00015492579109189632, + "loss": 12.1623, + "step": 12321 + }, + { + "epoch": 0.6709821458958704, + "grad_norm": 0.6137470092392461, + "learning_rate": 0.0001549184218754579, + "loss": 12.3687, + "step": 12322 + }, + { + "epoch": 0.6710365998924533, + "grad_norm": 0.5601924573585031, + "learning_rate": 0.00015491105223196464, + "loss": 12.2654, + "step": 12323 + }, + { + "epoch": 0.6710910538890363, + "grad_norm": 0.6533100680065654, + "learning_rate": 0.00015490368216147378, + "loss": 12.3328, + "step": 12324 + }, + { + "epoch": 0.6711455078856193, + "grad_norm": 0.5571867758560064, + "learning_rate": 0.00015489631166404273, + "loss": 12.2228, + "step": 12325 + }, + { + "epoch": 0.6711999618822024, + "grad_norm": 0.6720500084489055, + "learning_rate": 0.0001548889407397287, + "loss": 12.3046, + "step": 12326 + }, + { + "epoch": 0.6712544158787854, + "grad_norm": 0.5338742799875239, + "learning_rate": 0.00015488156938858912, + "loss": 12.2962, + "step": 12327 + }, + { + "epoch": 0.6713088698753684, + "grad_norm": 0.6713263288393614, + "learning_rate": 0.00015487419761068126, + "loss": 12.3337, + "step": 12328 + }, + { + "epoch": 0.6713633238719514, + "grad_norm": 0.6455710139817895, + "learning_rate": 0.00015486682540606238, + "loss": 12.364, + "step": 12329 + }, + { + "epoch": 0.6714177778685344, + "grad_norm": 0.535391671053815, + "learning_rate": 0.00015485945277478989, + "loss": 12.3494, + "step": 12330 + }, + { + "epoch": 0.6714722318651174, + "grad_norm": 0.6048302286197477, + "learning_rate": 0.0001548520797169211, + "loss": 12.2211, + "step": 12331 + }, + { + "epoch": 0.6715266858617005, + "grad_norm": 0.5674125889055247, + "learning_rate": 0.00015484470623251332, + "loss": 12.1535, + "step": 12332 + }, + { + "epoch": 0.6715811398582835, + "grad_norm": 0.5262918147403375, + "learning_rate": 0.00015483733232162395, + "loss": 12.1773, + "step": 12333 + }, + { + "epoch": 0.6716355938548665, + "grad_norm": 0.5512608317557834, + "learning_rate": 0.00015482995798431025, + "loss": 12.2709, + "step": 12334 + }, + { + "epoch": 0.6716900478514495, + "grad_norm": 0.5518915582374262, + "learning_rate": 0.0001548225832206296, + "loss": 12.2212, + "step": 12335 + }, + { + "epoch": 0.6717445018480325, + "grad_norm": 0.563258202916151, + "learning_rate": 0.00015481520803063936, + "loss": 12.3136, + "step": 12336 + }, + { + "epoch": 0.6717989558446156, + "grad_norm": 0.5866718369655832, + "learning_rate": 0.0001548078324143969, + "loss": 12.3299, + "step": 12337 + }, + { + "epoch": 0.6718534098411986, + "grad_norm": 0.6200280583867798, + "learning_rate": 0.0001548004563719595, + "loss": 12.2475, + "step": 12338 + }, + { + "epoch": 0.6719078638377816, + "grad_norm": 0.6078855752892216, + "learning_rate": 0.00015479307990338457, + "loss": 12.2136, + "step": 12339 + }, + { + "epoch": 0.6719623178343646, + "grad_norm": 0.6314598056079747, + "learning_rate": 0.00015478570300872947, + "loss": 12.2619, + "step": 12340 + }, + { + "epoch": 0.6720167718309475, + "grad_norm": 0.5881620181870542, + "learning_rate": 0.00015477832568805156, + "loss": 12.3938, + "step": 12341 + }, + { + "epoch": 0.6720712258275305, + "grad_norm": 0.6373593538818688, + "learning_rate": 0.00015477094794140821, + "loss": 12.2626, + "step": 12342 + }, + { + "epoch": 0.6721256798241136, + "grad_norm": 0.5922390028924963, + "learning_rate": 0.00015476356976885675, + "loss": 12.3762, + "step": 12343 + }, + { + "epoch": 0.6721801338206966, + "grad_norm": 0.6446399456565183, + "learning_rate": 0.0001547561911704546, + "loss": 12.358, + "step": 12344 + }, + { + "epoch": 0.6722345878172796, + "grad_norm": 0.627724909475266, + "learning_rate": 0.00015474881214625917, + "loss": 12.3067, + "step": 12345 + }, + { + "epoch": 0.6722890418138626, + "grad_norm": 0.6242450041274252, + "learning_rate": 0.00015474143269632773, + "loss": 12.3418, + "step": 12346 + }, + { + "epoch": 0.6723434958104456, + "grad_norm": 0.5695089624131952, + "learning_rate": 0.00015473405282071776, + "loss": 12.315, + "step": 12347 + }, + { + "epoch": 0.6723979498070286, + "grad_norm": 0.6649335811814701, + "learning_rate": 0.00015472667251948663, + "loss": 12.4619, + "step": 12348 + }, + { + "epoch": 0.6724524038036117, + "grad_norm": 0.6151808744087764, + "learning_rate": 0.0001547192917926917, + "loss": 12.2973, + "step": 12349 + }, + { + "epoch": 0.6725068578001947, + "grad_norm": 0.5903380108730035, + "learning_rate": 0.00015471191064039038, + "loss": 12.2254, + "step": 12350 + }, + { + "epoch": 0.6725613117967777, + "grad_norm": 0.5857314014547613, + "learning_rate": 0.00015470452906264005, + "loss": 12.3142, + "step": 12351 + }, + { + "epoch": 0.6726157657933607, + "grad_norm": 0.5376113152177633, + "learning_rate": 0.00015469714705949815, + "loss": 12.2964, + "step": 12352 + }, + { + "epoch": 0.6726702197899437, + "grad_norm": 0.5985240941302591, + "learning_rate": 0.00015468976463102208, + "loss": 12.323, + "step": 12353 + }, + { + "epoch": 0.6727246737865267, + "grad_norm": 0.5841038750145867, + "learning_rate": 0.0001546823817772692, + "loss": 12.221, + "step": 12354 + }, + { + "epoch": 0.6727791277831098, + "grad_norm": 0.5958397562184997, + "learning_rate": 0.00015467499849829697, + "loss": 12.3587, + "step": 12355 + }, + { + "epoch": 0.6728335817796928, + "grad_norm": 0.6544013573600018, + "learning_rate": 0.00015466761479416275, + "loss": 12.2337, + "step": 12356 + }, + { + "epoch": 0.6728880357762758, + "grad_norm": 0.7357460786214085, + "learning_rate": 0.00015466023066492402, + "loss": 12.3293, + "step": 12357 + }, + { + "epoch": 0.6729424897728588, + "grad_norm": 0.53074594411693, + "learning_rate": 0.00015465284611063815, + "loss": 12.2764, + "step": 12358 + }, + { + "epoch": 0.6729969437694417, + "grad_norm": 0.6640114712340902, + "learning_rate": 0.0001546454611313626, + "loss": 12.2647, + "step": 12359 + }, + { + "epoch": 0.6730513977660247, + "grad_norm": 0.5867100703731516, + "learning_rate": 0.0001546380757271548, + "loss": 12.3141, + "step": 12360 + }, + { + "epoch": 0.6731058517626078, + "grad_norm": 0.6114819001091513, + "learning_rate": 0.0001546306898980721, + "loss": 12.4673, + "step": 12361 + }, + { + "epoch": 0.6731603057591908, + "grad_norm": 0.5919784984722599, + "learning_rate": 0.00015462330364417203, + "loss": 12.3433, + "step": 12362 + }, + { + "epoch": 0.6732147597557738, + "grad_norm": 0.6407954340101385, + "learning_rate": 0.00015461591696551202, + "loss": 12.1382, + "step": 12363 + }, + { + "epoch": 0.6732692137523568, + "grad_norm": 0.569920810338462, + "learning_rate": 0.00015460852986214942, + "loss": 12.3783, + "step": 12364 + }, + { + "epoch": 0.6733236677489398, + "grad_norm": 0.5870295077894265, + "learning_rate": 0.00015460114233414177, + "loss": 12.3657, + "step": 12365 + }, + { + "epoch": 0.6733781217455228, + "grad_norm": 0.6636458321676164, + "learning_rate": 0.00015459375438154645, + "loss": 12.3468, + "step": 12366 + }, + { + "epoch": 0.6734325757421059, + "grad_norm": 0.6058518127129257, + "learning_rate": 0.00015458636600442098, + "loss": 12.3653, + "step": 12367 + }, + { + "epoch": 0.6734870297386889, + "grad_norm": 0.5852369575195829, + "learning_rate": 0.00015457897720282278, + "loss": 12.31, + "step": 12368 + }, + { + "epoch": 0.6735414837352719, + "grad_norm": 0.6882324799735526, + "learning_rate": 0.00015457158797680927, + "loss": 12.2676, + "step": 12369 + }, + { + "epoch": 0.6735959377318549, + "grad_norm": 0.6390571986339338, + "learning_rate": 0.00015456419832643795, + "loss": 12.289, + "step": 12370 + }, + { + "epoch": 0.6736503917284379, + "grad_norm": 0.5658026855278877, + "learning_rate": 0.00015455680825176624, + "loss": 12.2175, + "step": 12371 + }, + { + "epoch": 0.673704845725021, + "grad_norm": 0.6516117326196103, + "learning_rate": 0.00015454941775285168, + "loss": 12.3273, + "step": 12372 + }, + { + "epoch": 0.673759299721604, + "grad_norm": 0.6616815047039033, + "learning_rate": 0.0001545420268297517, + "loss": 12.3157, + "step": 12373 + }, + { + "epoch": 0.673813753718187, + "grad_norm": 0.5321409149239987, + "learning_rate": 0.00015453463548252376, + "loss": 12.3777, + "step": 12374 + }, + { + "epoch": 0.67386820771477, + "grad_norm": 0.5706921321363255, + "learning_rate": 0.00015452724371122536, + "loss": 12.3715, + "step": 12375 + }, + { + "epoch": 0.673922661711353, + "grad_norm": 0.6419548750079106, + "learning_rate": 0.00015451985151591395, + "loss": 12.4302, + "step": 12376 + }, + { + "epoch": 0.673977115707936, + "grad_norm": 0.6001667031937706, + "learning_rate": 0.00015451245889664701, + "loss": 12.3792, + "step": 12377 + }, + { + "epoch": 0.674031569704519, + "grad_norm": 0.6269987634290967, + "learning_rate": 0.00015450506585348213, + "loss": 12.2882, + "step": 12378 + }, + { + "epoch": 0.674086023701102, + "grad_norm": 0.6149717814168466, + "learning_rate": 0.00015449767238647665, + "loss": 12.458, + "step": 12379 + }, + { + "epoch": 0.674140477697685, + "grad_norm": 0.6305814591670665, + "learning_rate": 0.00015449027849568815, + "loss": 12.1985, + "step": 12380 + }, + { + "epoch": 0.674194931694268, + "grad_norm": 0.6317839199734646, + "learning_rate": 0.0001544828841811741, + "loss": 12.3741, + "step": 12381 + }, + { + "epoch": 0.674249385690851, + "grad_norm": 0.7296820767208071, + "learning_rate": 0.00015447548944299202, + "loss": 12.4499, + "step": 12382 + }, + { + "epoch": 0.674303839687434, + "grad_norm": 0.760502440648826, + "learning_rate": 0.00015446809428119938, + "loss": 12.4208, + "step": 12383 + }, + { + "epoch": 0.6743582936840171, + "grad_norm": 0.6032206376300737, + "learning_rate": 0.0001544606986958537, + "loss": 12.3193, + "step": 12384 + }, + { + "epoch": 0.6744127476806001, + "grad_norm": 0.651116057178426, + "learning_rate": 0.00015445330268701253, + "loss": 12.3101, + "step": 12385 + }, + { + "epoch": 0.6744672016771831, + "grad_norm": 0.6649409317668227, + "learning_rate": 0.00015444590625473333, + "loss": 12.3424, + "step": 12386 + }, + { + "epoch": 0.6745216556737661, + "grad_norm": 0.6269635487637647, + "learning_rate": 0.0001544385093990736, + "loss": 12.3125, + "step": 12387 + }, + { + "epoch": 0.6745761096703491, + "grad_norm": 0.6004238287073058, + "learning_rate": 0.00015443111212009095, + "loss": 12.2203, + "step": 12388 + }, + { + "epoch": 0.6746305636669321, + "grad_norm": 0.636585741622858, + "learning_rate": 0.00015442371441784278, + "loss": 12.3283, + "step": 12389 + }, + { + "epoch": 0.6746850176635152, + "grad_norm": 0.6314743975811515, + "learning_rate": 0.0001544163162923867, + "loss": 12.3442, + "step": 12390 + }, + { + "epoch": 0.6747394716600982, + "grad_norm": 0.5799282392051064, + "learning_rate": 0.00015440891774378025, + "loss": 12.4539, + "step": 12391 + }, + { + "epoch": 0.6747939256566812, + "grad_norm": 0.7255747829870278, + "learning_rate": 0.0001544015187720809, + "loss": 12.2881, + "step": 12392 + }, + { + "epoch": 0.6748483796532642, + "grad_norm": 0.5439656129914802, + "learning_rate": 0.00015439411937734625, + "loss": 12.124, + "step": 12393 + }, + { + "epoch": 0.6749028336498472, + "grad_norm": 0.6537594353864004, + "learning_rate": 0.0001543867195596338, + "loss": 12.1749, + "step": 12394 + }, + { + "epoch": 0.6749572876464301, + "grad_norm": 0.6631492345005819, + "learning_rate": 0.00015437931931900107, + "loss": 12.2417, + "step": 12395 + }, + { + "epoch": 0.6750117416430133, + "grad_norm": 0.561066732269436, + "learning_rate": 0.00015437191865550563, + "loss": 12.2644, + "step": 12396 + }, + { + "epoch": 0.6750661956395962, + "grad_norm": 0.5838921948641176, + "learning_rate": 0.00015436451756920504, + "loss": 12.29, + "step": 12397 + }, + { + "epoch": 0.6751206496361792, + "grad_norm": 0.6094728089312846, + "learning_rate": 0.0001543571160601569, + "loss": 12.3605, + "step": 12398 + }, + { + "epoch": 0.6751751036327622, + "grad_norm": 0.5575270120681592, + "learning_rate": 0.00015434971412841867, + "loss": 12.2974, + "step": 12399 + }, + { + "epoch": 0.6752295576293452, + "grad_norm": 0.6325187505651708, + "learning_rate": 0.0001543423117740479, + "loss": 12.2801, + "step": 12400 + }, + { + "epoch": 0.6752840116259282, + "grad_norm": 0.6472180003870803, + "learning_rate": 0.00015433490899710223, + "loss": 12.3826, + "step": 12401 + }, + { + "epoch": 0.6753384656225113, + "grad_norm": 0.5868907719763805, + "learning_rate": 0.0001543275057976392, + "loss": 12.2332, + "step": 12402 + }, + { + "epoch": 0.6753929196190943, + "grad_norm": 0.6427595911096574, + "learning_rate": 0.0001543201021757164, + "loss": 12.4348, + "step": 12403 + }, + { + "epoch": 0.6754473736156773, + "grad_norm": 0.6668338028229127, + "learning_rate": 0.00015431269813139138, + "loss": 12.3388, + "step": 12404 + }, + { + "epoch": 0.6755018276122603, + "grad_norm": 0.5775714167901217, + "learning_rate": 0.00015430529366472167, + "loss": 12.3564, + "step": 12405 + }, + { + "epoch": 0.6755562816088433, + "grad_norm": 0.5332194175715848, + "learning_rate": 0.0001542978887757649, + "loss": 12.267, + "step": 12406 + }, + { + "epoch": 0.6756107356054264, + "grad_norm": 0.5890769707710252, + "learning_rate": 0.00015429048346457863, + "loss": 12.3213, + "step": 12407 + }, + { + "epoch": 0.6756651896020094, + "grad_norm": 0.6323650116011595, + "learning_rate": 0.0001542830777312205, + "loss": 12.289, + "step": 12408 + }, + { + "epoch": 0.6757196435985924, + "grad_norm": 0.6041259921360344, + "learning_rate": 0.00015427567157574802, + "loss": 12.172, + "step": 12409 + }, + { + "epoch": 0.6757740975951754, + "grad_norm": 0.6035886166991528, + "learning_rate": 0.00015426826499821884, + "loss": 12.1674, + "step": 12410 + }, + { + "epoch": 0.6758285515917584, + "grad_norm": 0.5608004708899101, + "learning_rate": 0.00015426085799869052, + "loss": 12.308, + "step": 12411 + }, + { + "epoch": 0.6758830055883414, + "grad_norm": 0.7775669709518196, + "learning_rate": 0.00015425345057722064, + "loss": 12.446, + "step": 12412 + }, + { + "epoch": 0.6759374595849245, + "grad_norm": 0.598766215338868, + "learning_rate": 0.0001542460427338669, + "loss": 12.3271, + "step": 12413 + }, + { + "epoch": 0.6759919135815075, + "grad_norm": 0.5941778258183403, + "learning_rate": 0.00015423863446868677, + "loss": 12.2538, + "step": 12414 + }, + { + "epoch": 0.6760463675780904, + "grad_norm": 0.5730630987233022, + "learning_rate": 0.00015423122578173793, + "loss": 12.2467, + "step": 12415 + }, + { + "epoch": 0.6761008215746734, + "grad_norm": 0.6393883467564998, + "learning_rate": 0.000154223816673078, + "loss": 12.3639, + "step": 12416 + }, + { + "epoch": 0.6761552755712564, + "grad_norm": 0.5958537515290336, + "learning_rate": 0.00015421640714276459, + "loss": 12.2933, + "step": 12417 + }, + { + "epoch": 0.6762097295678394, + "grad_norm": 0.5473205329362438, + "learning_rate": 0.00015420899719085528, + "loss": 12.1924, + "step": 12418 + }, + { + "epoch": 0.6762641835644225, + "grad_norm": 0.6135898548407824, + "learning_rate": 0.00015420158681740773, + "loss": 12.3284, + "step": 12419 + }, + { + "epoch": 0.6763186375610055, + "grad_norm": 0.6937169434880651, + "learning_rate": 0.00015419417602247955, + "loss": 12.3112, + "step": 12420 + }, + { + "epoch": 0.6763730915575885, + "grad_norm": 0.5813587951963771, + "learning_rate": 0.00015418676480612837, + "loss": 12.2987, + "step": 12421 + }, + { + "epoch": 0.6764275455541715, + "grad_norm": 0.5634384409012385, + "learning_rate": 0.00015417935316841178, + "loss": 12.2539, + "step": 12422 + }, + { + "epoch": 0.6764819995507545, + "grad_norm": 0.6025612760466049, + "learning_rate": 0.0001541719411093875, + "loss": 12.1718, + "step": 12423 + }, + { + "epoch": 0.6765364535473375, + "grad_norm": 0.5833255778249352, + "learning_rate": 0.00015416452862911312, + "loss": 12.2218, + "step": 12424 + }, + { + "epoch": 0.6765909075439206, + "grad_norm": 0.5830101697665969, + "learning_rate": 0.00015415711572764628, + "loss": 12.2757, + "step": 12425 + }, + { + "epoch": 0.6766453615405036, + "grad_norm": 0.6382187491294317, + "learning_rate": 0.00015414970240504463, + "loss": 12.1939, + "step": 12426 + }, + { + "epoch": 0.6766998155370866, + "grad_norm": 0.6245430074794555, + "learning_rate": 0.0001541422886613658, + "loss": 12.3249, + "step": 12427 + }, + { + "epoch": 0.6767542695336696, + "grad_norm": 0.5902115680820347, + "learning_rate": 0.00015413487449666746, + "loss": 12.3674, + "step": 12428 + }, + { + "epoch": 0.6768087235302526, + "grad_norm": 0.5595368427708826, + "learning_rate": 0.00015412745991100722, + "loss": 12.2548, + "step": 12429 + }, + { + "epoch": 0.6768631775268356, + "grad_norm": 0.7746748718973765, + "learning_rate": 0.00015412004490444282, + "loss": 12.4776, + "step": 12430 + }, + { + "epoch": 0.6769176315234187, + "grad_norm": 0.5850132877825888, + "learning_rate": 0.00015411262947703186, + "loss": 12.2565, + "step": 12431 + }, + { + "epoch": 0.6769720855200017, + "grad_norm": 0.5774462724432337, + "learning_rate": 0.00015410521362883205, + "loss": 12.3022, + "step": 12432 + }, + { + "epoch": 0.6770265395165846, + "grad_norm": 0.5587997789708538, + "learning_rate": 0.00015409779735990096, + "loss": 12.3016, + "step": 12433 + }, + { + "epoch": 0.6770809935131676, + "grad_norm": 0.7234506787324182, + "learning_rate": 0.0001540903806702964, + "loss": 12.2563, + "step": 12434 + }, + { + "epoch": 0.6771354475097506, + "grad_norm": 0.6431977615477494, + "learning_rate": 0.00015408296356007593, + "loss": 12.3079, + "step": 12435 + }, + { + "epoch": 0.6771899015063337, + "grad_norm": 0.6397691285259051, + "learning_rate": 0.00015407554602929726, + "loss": 12.3851, + "step": 12436 + }, + { + "epoch": 0.6772443555029167, + "grad_norm": 0.5613075305208957, + "learning_rate": 0.00015406812807801807, + "loss": 12.274, + "step": 12437 + }, + { + "epoch": 0.6772988094994997, + "grad_norm": 0.5978847918724727, + "learning_rate": 0.0001540607097062961, + "loss": 12.2493, + "step": 12438 + }, + { + "epoch": 0.6773532634960827, + "grad_norm": 0.6838034584535284, + "learning_rate": 0.00015405329091418892, + "loss": 12.4762, + "step": 12439 + }, + { + "epoch": 0.6774077174926657, + "grad_norm": 0.6147624053315099, + "learning_rate": 0.00015404587170175432, + "loss": 12.2885, + "step": 12440 + }, + { + "epoch": 0.6774621714892487, + "grad_norm": 0.566160152720444, + "learning_rate": 0.00015403845206904995, + "loss": 12.2939, + "step": 12441 + }, + { + "epoch": 0.6775166254858318, + "grad_norm": 0.6071692350273591, + "learning_rate": 0.0001540310320161335, + "loss": 12.2652, + "step": 12442 + }, + { + "epoch": 0.6775710794824148, + "grad_norm": 0.6200172147901956, + "learning_rate": 0.0001540236115430627, + "loss": 12.3072, + "step": 12443 + }, + { + "epoch": 0.6776255334789978, + "grad_norm": 0.5818622382070244, + "learning_rate": 0.00015401619064989524, + "loss": 12.2654, + "step": 12444 + }, + { + "epoch": 0.6776799874755808, + "grad_norm": 0.669309561790791, + "learning_rate": 0.00015400876933668883, + "loss": 12.3663, + "step": 12445 + }, + { + "epoch": 0.6777344414721638, + "grad_norm": 0.5745993725924108, + "learning_rate": 0.00015400134760350115, + "loss": 12.2131, + "step": 12446 + }, + { + "epoch": 0.6777888954687468, + "grad_norm": 0.6416078009550086, + "learning_rate": 0.00015399392545038993, + "loss": 12.2781, + "step": 12447 + }, + { + "epoch": 0.6778433494653299, + "grad_norm": 0.6656096813777037, + "learning_rate": 0.00015398650287741294, + "loss": 12.3451, + "step": 12448 + }, + { + "epoch": 0.6778978034619129, + "grad_norm": 0.5414612288002892, + "learning_rate": 0.0001539790798846278, + "loss": 12.2011, + "step": 12449 + }, + { + "epoch": 0.6779522574584959, + "grad_norm": 0.6820187506744111, + "learning_rate": 0.00015397165647209232, + "loss": 12.1749, + "step": 12450 + }, + { + "epoch": 0.6780067114550788, + "grad_norm": 0.568131185561711, + "learning_rate": 0.00015396423263986414, + "loss": 12.2077, + "step": 12451 + }, + { + "epoch": 0.6780611654516618, + "grad_norm": 0.6201515312515274, + "learning_rate": 0.00015395680838800106, + "loss": 12.3235, + "step": 12452 + }, + { + "epoch": 0.6781156194482448, + "grad_norm": 0.576863601249074, + "learning_rate": 0.00015394938371656082, + "loss": 12.3372, + "step": 12453 + }, + { + "epoch": 0.6781700734448279, + "grad_norm": 0.6012291927573601, + "learning_rate": 0.0001539419586256011, + "loss": 12.4355, + "step": 12454 + }, + { + "epoch": 0.6782245274414109, + "grad_norm": 0.6164387649292629, + "learning_rate": 0.00015393453311517968, + "loss": 12.346, + "step": 12455 + }, + { + "epoch": 0.6782789814379939, + "grad_norm": 0.9865572568672095, + "learning_rate": 0.00015392710718535425, + "loss": 12.302, + "step": 12456 + }, + { + "epoch": 0.6783334354345769, + "grad_norm": 0.5986802333369561, + "learning_rate": 0.0001539196808361826, + "loss": 12.3104, + "step": 12457 + }, + { + "epoch": 0.6783878894311599, + "grad_norm": 0.5640829111191706, + "learning_rate": 0.0001539122540677225, + "loss": 12.2395, + "step": 12458 + }, + { + "epoch": 0.6784423434277429, + "grad_norm": 0.556879327150156, + "learning_rate": 0.00015390482688003166, + "loss": 12.3582, + "step": 12459 + }, + { + "epoch": 0.678496797424326, + "grad_norm": 0.5968851843906607, + "learning_rate": 0.0001538973992731678, + "loss": 12.2607, + "step": 12460 + }, + { + "epoch": 0.678551251420909, + "grad_norm": 0.5580397132881889, + "learning_rate": 0.00015388997124718878, + "loss": 12.2164, + "step": 12461 + }, + { + "epoch": 0.678605705417492, + "grad_norm": 0.6033343613214533, + "learning_rate": 0.00015388254280215228, + "loss": 12.4007, + "step": 12462 + }, + { + "epoch": 0.678660159414075, + "grad_norm": 0.6286973816578141, + "learning_rate": 0.0001538751139381161, + "loss": 12.2404, + "step": 12463 + }, + { + "epoch": 0.678714613410658, + "grad_norm": 0.659599719190087, + "learning_rate": 0.000153867684655138, + "loss": 12.5315, + "step": 12464 + }, + { + "epoch": 0.678769067407241, + "grad_norm": 0.5985044281923901, + "learning_rate": 0.00015386025495327577, + "loss": 12.1016, + "step": 12465 + }, + { + "epoch": 0.6788235214038241, + "grad_norm": 0.5894161418965819, + "learning_rate": 0.00015385282483258714, + "loss": 12.2652, + "step": 12466 + }, + { + "epoch": 0.6788779754004071, + "grad_norm": 0.5674643925713331, + "learning_rate": 0.00015384539429312989, + "loss": 12.2766, + "step": 12467 + }, + { + "epoch": 0.6789324293969901, + "grad_norm": 0.6379808068456474, + "learning_rate": 0.00015383796333496186, + "loss": 12.2333, + "step": 12468 + }, + { + "epoch": 0.678986883393573, + "grad_norm": 0.5935111162733151, + "learning_rate": 0.0001538305319581408, + "loss": 12.3503, + "step": 12469 + }, + { + "epoch": 0.679041337390156, + "grad_norm": 0.6086340132949898, + "learning_rate": 0.00015382310016272448, + "loss": 12.2272, + "step": 12470 + }, + { + "epoch": 0.6790957913867391, + "grad_norm": 0.6248968129012789, + "learning_rate": 0.0001538156679487707, + "loss": 12.3787, + "step": 12471 + }, + { + "epoch": 0.6791502453833221, + "grad_norm": 0.6803043130004992, + "learning_rate": 0.00015380823531633729, + "loss": 12.2579, + "step": 12472 + }, + { + "epoch": 0.6792046993799051, + "grad_norm": 0.656370341075035, + "learning_rate": 0.000153800802265482, + "loss": 12.3432, + "step": 12473 + }, + { + "epoch": 0.6792591533764881, + "grad_norm": 0.5485034232369759, + "learning_rate": 0.00015379336879626262, + "loss": 12.253, + "step": 12474 + }, + { + "epoch": 0.6793136073730711, + "grad_norm": 0.5865073594312507, + "learning_rate": 0.00015378593490873702, + "loss": 12.3455, + "step": 12475 + }, + { + "epoch": 0.6793680613696541, + "grad_norm": 0.6106625314250173, + "learning_rate": 0.00015377850060296298, + "loss": 12.25, + "step": 12476 + }, + { + "epoch": 0.6794225153662372, + "grad_norm": 0.6703990987672004, + "learning_rate": 0.00015377106587899828, + "loss": 12.3259, + "step": 12477 + }, + { + "epoch": 0.6794769693628202, + "grad_norm": 0.6749449370571579, + "learning_rate": 0.00015376363073690076, + "loss": 12.4298, + "step": 12478 + }, + { + "epoch": 0.6795314233594032, + "grad_norm": 0.5267898950623957, + "learning_rate": 0.00015375619517672822, + "loss": 12.2494, + "step": 12479 + }, + { + "epoch": 0.6795858773559862, + "grad_norm": 0.5813391501366415, + "learning_rate": 0.00015374875919853853, + "loss": 12.2722, + "step": 12480 + }, + { + "epoch": 0.6796403313525692, + "grad_norm": 0.5691942155847314, + "learning_rate": 0.00015374132280238944, + "loss": 12.3343, + "step": 12481 + }, + { + "epoch": 0.6796947853491522, + "grad_norm": 0.5597577928871845, + "learning_rate": 0.00015373388598833882, + "loss": 12.2779, + "step": 12482 + }, + { + "epoch": 0.6797492393457353, + "grad_norm": 0.5600179115697138, + "learning_rate": 0.0001537264487564445, + "loss": 12.3019, + "step": 12483 + }, + { + "epoch": 0.6798036933423183, + "grad_norm": 0.6609106539019437, + "learning_rate": 0.00015371901110676427, + "loss": 12.3928, + "step": 12484 + }, + { + "epoch": 0.6798581473389013, + "grad_norm": 0.6196194038765365, + "learning_rate": 0.00015371157303935604, + "loss": 12.4834, + "step": 12485 + }, + { + "epoch": 0.6799126013354843, + "grad_norm": 0.7024144124020267, + "learning_rate": 0.00015370413455427758, + "loss": 12.3145, + "step": 12486 + }, + { + "epoch": 0.6799670553320672, + "grad_norm": 0.5335364578058331, + "learning_rate": 0.0001536966956515868, + "loss": 12.1881, + "step": 12487 + }, + { + "epoch": 0.6800215093286502, + "grad_norm": 0.6308656762616506, + "learning_rate": 0.00015368925633134145, + "loss": 12.3981, + "step": 12488 + }, + { + "epoch": 0.6800759633252333, + "grad_norm": 0.5609199684595741, + "learning_rate": 0.0001536818165935995, + "loss": 12.1629, + "step": 12489 + }, + { + "epoch": 0.6801304173218163, + "grad_norm": 0.5882023506431455, + "learning_rate": 0.0001536743764384187, + "loss": 12.217, + "step": 12490 + }, + { + "epoch": 0.6801848713183993, + "grad_norm": 0.5830886102468975, + "learning_rate": 0.00015366693586585698, + "loss": 12.3887, + "step": 12491 + }, + { + "epoch": 0.6802393253149823, + "grad_norm": 0.5933736821972734, + "learning_rate": 0.00015365949487597215, + "loss": 12.2738, + "step": 12492 + }, + { + "epoch": 0.6802937793115653, + "grad_norm": 0.6270327180411331, + "learning_rate": 0.00015365205346882211, + "loss": 12.4265, + "step": 12493 + }, + { + "epoch": 0.6803482333081483, + "grad_norm": 0.6123050154368799, + "learning_rate": 0.0001536446116444647, + "loss": 12.2808, + "step": 12494 + }, + { + "epoch": 0.6804026873047314, + "grad_norm": 0.6149944772394959, + "learning_rate": 0.00015363716940295776, + "loss": 12.283, + "step": 12495 + }, + { + "epoch": 0.6804571413013144, + "grad_norm": 0.5767155457877045, + "learning_rate": 0.00015362972674435923, + "loss": 12.3108, + "step": 12496 + }, + { + "epoch": 0.6805115952978974, + "grad_norm": 0.6095755670317139, + "learning_rate": 0.00015362228366872692, + "loss": 12.3838, + "step": 12497 + }, + { + "epoch": 0.6805660492944804, + "grad_norm": 0.5932684303723733, + "learning_rate": 0.00015361484017611878, + "loss": 12.3602, + "step": 12498 + }, + { + "epoch": 0.6806205032910634, + "grad_norm": 0.557865255664125, + "learning_rate": 0.00015360739626659264, + "loss": 12.2614, + "step": 12499 + }, + { + "epoch": 0.6806749572876464, + "grad_norm": 0.5466275613168837, + "learning_rate": 0.00015359995194020635, + "loss": 12.1963, + "step": 12500 + }, + { + "epoch": 0.6807294112842295, + "grad_norm": 0.5722638998858818, + "learning_rate": 0.0001535925071970179, + "loss": 12.3277, + "step": 12501 + }, + { + "epoch": 0.6807838652808125, + "grad_norm": 0.5719742566065302, + "learning_rate": 0.0001535850620370851, + "loss": 12.301, + "step": 12502 + }, + { + "epoch": 0.6808383192773955, + "grad_norm": 0.5978564215852396, + "learning_rate": 0.00015357761646046586, + "loss": 12.4602, + "step": 12503 + }, + { + "epoch": 0.6808927732739785, + "grad_norm": 0.6845621944319514, + "learning_rate": 0.00015357017046721815, + "loss": 12.2861, + "step": 12504 + }, + { + "epoch": 0.6809472272705615, + "grad_norm": 0.5446505254122316, + "learning_rate": 0.00015356272405739975, + "loss": 12.2506, + "step": 12505 + }, + { + "epoch": 0.6810016812671446, + "grad_norm": 0.5764664656405027, + "learning_rate": 0.00015355527723106866, + "loss": 12.1196, + "step": 12506 + }, + { + "epoch": 0.6810561352637275, + "grad_norm": 0.5824158968217736, + "learning_rate": 0.00015354782998828272, + "loss": 12.1073, + "step": 12507 + }, + { + "epoch": 0.6811105892603105, + "grad_norm": 0.5951660676019116, + "learning_rate": 0.0001535403823290999, + "loss": 12.1546, + "step": 12508 + }, + { + "epoch": 0.6811650432568935, + "grad_norm": 0.633650384302766, + "learning_rate": 0.0001535329342535781, + "loss": 12.3471, + "step": 12509 + }, + { + "epoch": 0.6812194972534765, + "grad_norm": 0.7103507550363984, + "learning_rate": 0.0001535254857617752, + "loss": 12.4524, + "step": 12510 + }, + { + "epoch": 0.6812739512500595, + "grad_norm": 0.57847764762371, + "learning_rate": 0.00015351803685374914, + "loss": 12.2622, + "step": 12511 + }, + { + "epoch": 0.6813284052466426, + "grad_norm": 0.6403836667625455, + "learning_rate": 0.00015351058752955784, + "loss": 12.3535, + "step": 12512 + }, + { + "epoch": 0.6813828592432256, + "grad_norm": 0.7088629683482663, + "learning_rate": 0.0001535031377892593, + "loss": 12.3262, + "step": 12513 + }, + { + "epoch": 0.6814373132398086, + "grad_norm": 0.6193844866774646, + "learning_rate": 0.00015349568763291135, + "loss": 12.3267, + "step": 12514 + }, + { + "epoch": 0.6814917672363916, + "grad_norm": 0.5998515129373875, + "learning_rate": 0.00015348823706057196, + "loss": 12.3904, + "step": 12515 + }, + { + "epoch": 0.6815462212329746, + "grad_norm": 0.6448018105822136, + "learning_rate": 0.00015348078607229905, + "loss": 12.4076, + "step": 12516 + }, + { + "epoch": 0.6816006752295576, + "grad_norm": 0.6148133586569113, + "learning_rate": 0.00015347333466815059, + "loss": 12.2451, + "step": 12517 + }, + { + "epoch": 0.6816551292261407, + "grad_norm": 0.5678300825860817, + "learning_rate": 0.00015346588284818454, + "loss": 12.2692, + "step": 12518 + }, + { + "epoch": 0.6817095832227237, + "grad_norm": 0.5768317706320972, + "learning_rate": 0.0001534584306124588, + "loss": 12.3386, + "step": 12519 + }, + { + "epoch": 0.6817640372193067, + "grad_norm": 0.5331376057490201, + "learning_rate": 0.00015345097796103135, + "loss": 12.1922, + "step": 12520 + }, + { + "epoch": 0.6818184912158897, + "grad_norm": 0.5544525547019814, + "learning_rate": 0.0001534435248939601, + "loss": 12.269, + "step": 12521 + }, + { + "epoch": 0.6818729452124727, + "grad_norm": 0.6193524364573895, + "learning_rate": 0.00015343607141130304, + "loss": 12.2663, + "step": 12522 + }, + { + "epoch": 0.6819273992090557, + "grad_norm": 0.5619608059312059, + "learning_rate": 0.00015342861751311814, + "loss": 12.3329, + "step": 12523 + }, + { + "epoch": 0.6819818532056388, + "grad_norm": 0.6552963097235771, + "learning_rate": 0.00015342116319946338, + "loss": 12.3595, + "step": 12524 + }, + { + "epoch": 0.6820363072022217, + "grad_norm": 0.570274795582616, + "learning_rate": 0.00015341370847039666, + "loss": 12.2273, + "step": 12525 + }, + { + "epoch": 0.6820907611988047, + "grad_norm": 0.6239227052665182, + "learning_rate": 0.000153406253325976, + "loss": 12.3095, + "step": 12526 + }, + { + "epoch": 0.6821452151953877, + "grad_norm": 0.5745369017605514, + "learning_rate": 0.00015339879776625937, + "loss": 12.2778, + "step": 12527 + }, + { + "epoch": 0.6821996691919707, + "grad_norm": 0.6025091065614796, + "learning_rate": 0.00015339134179130469, + "loss": 12.248, + "step": 12528 + }, + { + "epoch": 0.6822541231885537, + "grad_norm": 0.6598333268101045, + "learning_rate": 0.00015338388540117004, + "loss": 12.1671, + "step": 12529 + }, + { + "epoch": 0.6823085771851368, + "grad_norm": 0.6316947537516462, + "learning_rate": 0.0001533764285959133, + "loss": 12.3167, + "step": 12530 + }, + { + "epoch": 0.6823630311817198, + "grad_norm": 0.6344343759849481, + "learning_rate": 0.0001533689713755925, + "loss": 12.2439, + "step": 12531 + }, + { + "epoch": 0.6824174851783028, + "grad_norm": 0.6129540352843904, + "learning_rate": 0.00015336151374026566, + "loss": 12.1051, + "step": 12532 + }, + { + "epoch": 0.6824719391748858, + "grad_norm": 0.6456101389818156, + "learning_rate": 0.00015335405568999073, + "loss": 12.3736, + "step": 12533 + }, + { + "epoch": 0.6825263931714688, + "grad_norm": 0.60867641200438, + "learning_rate": 0.00015334659722482572, + "loss": 12.2544, + "step": 12534 + }, + { + "epoch": 0.6825808471680518, + "grad_norm": 0.6173784654404763, + "learning_rate": 0.0001533391383448286, + "loss": 12.2428, + "step": 12535 + }, + { + "epoch": 0.6826353011646349, + "grad_norm": 0.6030950147532718, + "learning_rate": 0.0001533316790500574, + "loss": 12.282, + "step": 12536 + }, + { + "epoch": 0.6826897551612179, + "grad_norm": 0.6110934537663895, + "learning_rate": 0.00015332421934057013, + "loss": 12.304, + "step": 12537 + }, + { + "epoch": 0.6827442091578009, + "grad_norm": 0.6925727134598422, + "learning_rate": 0.00015331675921642478, + "loss": 12.3138, + "step": 12538 + }, + { + "epoch": 0.6827986631543839, + "grad_norm": 0.5930059362458303, + "learning_rate": 0.00015330929867767937, + "loss": 12.3151, + "step": 12539 + }, + { + "epoch": 0.6828531171509669, + "grad_norm": 0.7085795714441075, + "learning_rate": 0.00015330183772439195, + "loss": 12.3222, + "step": 12540 + }, + { + "epoch": 0.68290757114755, + "grad_norm": 0.5369805349876322, + "learning_rate": 0.00015329437635662046, + "loss": 12.2943, + "step": 12541 + }, + { + "epoch": 0.682962025144133, + "grad_norm": 0.544593423692748, + "learning_rate": 0.00015328691457442296, + "loss": 12.257, + "step": 12542 + }, + { + "epoch": 0.683016479140716, + "grad_norm": 0.5884016852779346, + "learning_rate": 0.00015327945237785748, + "loss": 12.261, + "step": 12543 + }, + { + "epoch": 0.6830709331372989, + "grad_norm": 0.5874520382845972, + "learning_rate": 0.00015327198976698204, + "loss": 12.3641, + "step": 12544 + }, + { + "epoch": 0.6831253871338819, + "grad_norm": 0.5936986434307064, + "learning_rate": 0.00015326452674185468, + "loss": 12.3483, + "step": 12545 + }, + { + "epoch": 0.6831798411304649, + "grad_norm": 0.5729493782893559, + "learning_rate": 0.00015325706330253348, + "loss": 12.3482, + "step": 12546 + }, + { + "epoch": 0.683234295127048, + "grad_norm": 0.5379559808268314, + "learning_rate": 0.00015324959944907636, + "loss": 12.38, + "step": 12547 + }, + { + "epoch": 0.683288749123631, + "grad_norm": 0.5508562129553998, + "learning_rate": 0.00015324213518154144, + "loss": 12.2574, + "step": 12548 + }, + { + "epoch": 0.683343203120214, + "grad_norm": 0.5528210383541169, + "learning_rate": 0.00015323467049998675, + "loss": 12.1128, + "step": 12549 + }, + { + "epoch": 0.683397657116797, + "grad_norm": 0.5690346282873607, + "learning_rate": 0.00015322720540447034, + "loss": 12.0657, + "step": 12550 + }, + { + "epoch": 0.68345211111338, + "grad_norm": 0.6207858687490257, + "learning_rate": 0.00015321973989505024, + "loss": 12.1423, + "step": 12551 + }, + { + "epoch": 0.683506565109963, + "grad_norm": 0.5401235764199944, + "learning_rate": 0.00015321227397178456, + "loss": 12.1383, + "step": 12552 + }, + { + "epoch": 0.6835610191065461, + "grad_norm": 0.5639866531793937, + "learning_rate": 0.00015320480763473127, + "loss": 12.3375, + "step": 12553 + }, + { + "epoch": 0.6836154731031291, + "grad_norm": 0.5569695080226263, + "learning_rate": 0.00015319734088394847, + "loss": 12.2254, + "step": 12554 + }, + { + "epoch": 0.6836699270997121, + "grad_norm": 0.5823886376253445, + "learning_rate": 0.00015318987371949424, + "loss": 12.1081, + "step": 12555 + }, + { + "epoch": 0.6837243810962951, + "grad_norm": 0.5831867835423197, + "learning_rate": 0.0001531824061414266, + "loss": 12.4263, + "step": 12556 + }, + { + "epoch": 0.6837788350928781, + "grad_norm": 0.6368773183461842, + "learning_rate": 0.00015317493814980373, + "loss": 12.3931, + "step": 12557 + }, + { + "epoch": 0.6838332890894611, + "grad_norm": 0.5967235892556965, + "learning_rate": 0.00015316746974468356, + "loss": 12.3436, + "step": 12558 + }, + { + "epoch": 0.6838877430860442, + "grad_norm": 0.5611941804067937, + "learning_rate": 0.00015316000092612425, + "loss": 12.2935, + "step": 12559 + }, + { + "epoch": 0.6839421970826272, + "grad_norm": 0.5821625925593092, + "learning_rate": 0.00015315253169418384, + "loss": 12.3281, + "step": 12560 + }, + { + "epoch": 0.6839966510792101, + "grad_norm": 0.58929587624123, + "learning_rate": 0.00015314506204892046, + "loss": 12.2427, + "step": 12561 + }, + { + "epoch": 0.6840511050757931, + "grad_norm": 0.9656728003292488, + "learning_rate": 0.00015313759199039214, + "loss": 12.3526, + "step": 12562 + }, + { + "epoch": 0.6841055590723761, + "grad_norm": 0.6415621689917624, + "learning_rate": 0.000153130121518657, + "loss": 12.2039, + "step": 12563 + }, + { + "epoch": 0.6841600130689591, + "grad_norm": 0.6300005979679029, + "learning_rate": 0.00015312265063377315, + "loss": 12.3009, + "step": 12564 + }, + { + "epoch": 0.6842144670655422, + "grad_norm": 0.5685462441014264, + "learning_rate": 0.00015311517933579865, + "loss": 12.2691, + "step": 12565 + }, + { + "epoch": 0.6842689210621252, + "grad_norm": 0.5643867251692329, + "learning_rate": 0.00015310770762479156, + "loss": 12.2821, + "step": 12566 + }, + { + "epoch": 0.6843233750587082, + "grad_norm": 0.5868894551224721, + "learning_rate": 0.00015310023550081008, + "loss": 12.2581, + "step": 12567 + }, + { + "epoch": 0.6843778290552912, + "grad_norm": 0.5726335847878873, + "learning_rate": 0.00015309276296391223, + "loss": 12.2945, + "step": 12568 + }, + { + "epoch": 0.6844322830518742, + "grad_norm": 0.6777200529938219, + "learning_rate": 0.0001530852900141562, + "loss": 12.3371, + "step": 12569 + }, + { + "epoch": 0.6844867370484573, + "grad_norm": 0.6001064109162652, + "learning_rate": 0.00015307781665160005, + "loss": 12.3353, + "step": 12570 + }, + { + "epoch": 0.6845411910450403, + "grad_norm": 0.5606026837695675, + "learning_rate": 0.00015307034287630184, + "loss": 12.3618, + "step": 12571 + }, + { + "epoch": 0.6845956450416233, + "grad_norm": 0.5898496714352334, + "learning_rate": 0.00015306286868831977, + "loss": 12.2474, + "step": 12572 + }, + { + "epoch": 0.6846500990382063, + "grad_norm": 0.5889931142568797, + "learning_rate": 0.00015305539408771193, + "loss": 12.3571, + "step": 12573 + }, + { + "epoch": 0.6847045530347893, + "grad_norm": 0.6156435547693119, + "learning_rate": 0.00015304791907453646, + "loss": 12.2613, + "step": 12574 + }, + { + "epoch": 0.6847590070313723, + "grad_norm": 0.6230231332692246, + "learning_rate": 0.00015304044364885147, + "loss": 12.4422, + "step": 12575 + }, + { + "epoch": 0.6848134610279554, + "grad_norm": 0.6871335752250709, + "learning_rate": 0.0001530329678107151, + "loss": 12.3385, + "step": 12576 + }, + { + "epoch": 0.6848679150245384, + "grad_norm": 0.623158174408285, + "learning_rate": 0.00015302549156018545, + "loss": 12.2042, + "step": 12577 + }, + { + "epoch": 0.6849223690211214, + "grad_norm": 0.5706635418917007, + "learning_rate": 0.00015301801489732073, + "loss": 12.1545, + "step": 12578 + }, + { + "epoch": 0.6849768230177044, + "grad_norm": 0.55862280847443, + "learning_rate": 0.00015301053782217902, + "loss": 12.2803, + "step": 12579 + }, + { + "epoch": 0.6850312770142873, + "grad_norm": 0.5992289622696629, + "learning_rate": 0.0001530030603348185, + "loss": 12.2505, + "step": 12580 + }, + { + "epoch": 0.6850857310108703, + "grad_norm": 0.587073117141727, + "learning_rate": 0.00015299558243529724, + "loss": 12.2734, + "step": 12581 + }, + { + "epoch": 0.6851401850074534, + "grad_norm": 0.5766027769184066, + "learning_rate": 0.00015298810412367348, + "loss": 12.3116, + "step": 12582 + }, + { + "epoch": 0.6851946390040364, + "grad_norm": 0.6304471982153476, + "learning_rate": 0.00015298062540000532, + "loss": 12.3564, + "step": 12583 + }, + { + "epoch": 0.6852490930006194, + "grad_norm": 0.6097819467526905, + "learning_rate": 0.00015297314626435093, + "loss": 12.3909, + "step": 12584 + }, + { + "epoch": 0.6853035469972024, + "grad_norm": 0.5925291505144655, + "learning_rate": 0.0001529656667167685, + "loss": 12.2947, + "step": 12585 + }, + { + "epoch": 0.6853580009937854, + "grad_norm": 0.5807666138404012, + "learning_rate": 0.00015295818675731612, + "loss": 12.2874, + "step": 12586 + }, + { + "epoch": 0.6854124549903684, + "grad_norm": 0.6025571608827972, + "learning_rate": 0.00015295070638605203, + "loss": 12.2434, + "step": 12587 + }, + { + "epoch": 0.6854669089869515, + "grad_norm": 0.6237484803353083, + "learning_rate": 0.00015294322560303436, + "loss": 12.3145, + "step": 12588 + }, + { + "epoch": 0.6855213629835345, + "grad_norm": 0.5948647440033049, + "learning_rate": 0.0001529357444083213, + "loss": 12.2099, + "step": 12589 + }, + { + "epoch": 0.6855758169801175, + "grad_norm": 0.605663960635657, + "learning_rate": 0.00015292826280197098, + "loss": 12.2644, + "step": 12590 + }, + { + "epoch": 0.6856302709767005, + "grad_norm": 0.5990931213074819, + "learning_rate": 0.00015292078078404166, + "loss": 12.2598, + "step": 12591 + }, + { + "epoch": 0.6856847249732835, + "grad_norm": 0.6073467998974551, + "learning_rate": 0.00015291329835459146, + "loss": 12.3719, + "step": 12592 + }, + { + "epoch": 0.6857391789698665, + "grad_norm": 0.614944758172525, + "learning_rate": 0.00015290581551367856, + "loss": 12.2056, + "step": 12593 + }, + { + "epoch": 0.6857936329664496, + "grad_norm": 0.6222266110368851, + "learning_rate": 0.00015289833226136116, + "loss": 12.2476, + "step": 12594 + }, + { + "epoch": 0.6858480869630326, + "grad_norm": 0.639785126460829, + "learning_rate": 0.00015289084859769746, + "loss": 12.4306, + "step": 12595 + }, + { + "epoch": 0.6859025409596156, + "grad_norm": 0.5938562698228961, + "learning_rate": 0.0001528833645227457, + "loss": 12.2332, + "step": 12596 + }, + { + "epoch": 0.6859569949561986, + "grad_norm": 0.6253395712765962, + "learning_rate": 0.00015287588003656398, + "loss": 12.2702, + "step": 12597 + }, + { + "epoch": 0.6860114489527815, + "grad_norm": 0.7806519859510992, + "learning_rate": 0.00015286839513921056, + "loss": 12.2199, + "step": 12598 + }, + { + "epoch": 0.6860659029493645, + "grad_norm": 0.6383790912822903, + "learning_rate": 0.00015286090983074365, + "loss": 12.3867, + "step": 12599 + }, + { + "epoch": 0.6861203569459476, + "grad_norm": 0.6354619200163265, + "learning_rate": 0.0001528534241112214, + "loss": 12.2405, + "step": 12600 + }, + { + "epoch": 0.6861748109425306, + "grad_norm": 0.7198083446997495, + "learning_rate": 0.0001528459379807021, + "loss": 12.1971, + "step": 12601 + }, + { + "epoch": 0.6862292649391136, + "grad_norm": 0.5609639402030633, + "learning_rate": 0.0001528384514392439, + "loss": 12.2583, + "step": 12602 + }, + { + "epoch": 0.6862837189356966, + "grad_norm": 0.6590477541121527, + "learning_rate": 0.00015283096448690504, + "loss": 12.4317, + "step": 12603 + }, + { + "epoch": 0.6863381729322796, + "grad_norm": 0.6169043747317672, + "learning_rate": 0.00015282347712374376, + "loss": 12.2422, + "step": 12604 + }, + { + "epoch": 0.6863926269288627, + "grad_norm": 0.5924528998422541, + "learning_rate": 0.00015281598934981826, + "loss": 12.3041, + "step": 12605 + }, + { + "epoch": 0.6864470809254457, + "grad_norm": 0.6317948636202078, + "learning_rate": 0.00015280850116518673, + "loss": 12.3189, + "step": 12606 + }, + { + "epoch": 0.6865015349220287, + "grad_norm": 0.6003665146103847, + "learning_rate": 0.00015280101256990748, + "loss": 12.2639, + "step": 12607 + }, + { + "epoch": 0.6865559889186117, + "grad_norm": 0.4923224569887765, + "learning_rate": 0.00015279352356403872, + "loss": 11.9896, + "step": 12608 + }, + { + "epoch": 0.6866104429151947, + "grad_norm": 0.6174994382831572, + "learning_rate": 0.00015278603414763863, + "loss": 12.2954, + "step": 12609 + }, + { + "epoch": 0.6866648969117777, + "grad_norm": 0.5857052679957093, + "learning_rate": 0.0001527785443207655, + "loss": 12.3063, + "step": 12610 + }, + { + "epoch": 0.6867193509083608, + "grad_norm": 0.5743478781291048, + "learning_rate": 0.00015277105408347756, + "loss": 12.1504, + "step": 12611 + }, + { + "epoch": 0.6867738049049438, + "grad_norm": 0.5559483050548155, + "learning_rate": 0.00015276356343583305, + "loss": 12.2322, + "step": 12612 + }, + { + "epoch": 0.6868282589015268, + "grad_norm": 0.6352063878137918, + "learning_rate": 0.00015275607237789023, + "loss": 12.338, + "step": 12613 + }, + { + "epoch": 0.6868827128981098, + "grad_norm": 0.561237471566255, + "learning_rate": 0.00015274858090970735, + "loss": 12.1346, + "step": 12614 + }, + { + "epoch": 0.6869371668946928, + "grad_norm": 0.6444034241684865, + "learning_rate": 0.00015274108903134262, + "loss": 12.4475, + "step": 12615 + }, + { + "epoch": 0.6869916208912757, + "grad_norm": 0.6399563424531881, + "learning_rate": 0.00015273359674285438, + "loss": 12.1246, + "step": 12616 + }, + { + "epoch": 0.6870460748878588, + "grad_norm": 0.557675000921174, + "learning_rate": 0.00015272610404430082, + "loss": 12.1814, + "step": 12617 + }, + { + "epoch": 0.6871005288844418, + "grad_norm": 0.6547315754685608, + "learning_rate": 0.0001527186109357402, + "loss": 12.3489, + "step": 12618 + }, + { + "epoch": 0.6871549828810248, + "grad_norm": 0.6158957717630221, + "learning_rate": 0.0001527111174172309, + "loss": 12.2577, + "step": 12619 + }, + { + "epoch": 0.6872094368776078, + "grad_norm": 0.6405001217155041, + "learning_rate": 0.00015270362348883108, + "loss": 12.3835, + "step": 12620 + }, + { + "epoch": 0.6872638908741908, + "grad_norm": 0.7138640749459229, + "learning_rate": 0.00015269612915059903, + "loss": 12.4221, + "step": 12621 + }, + { + "epoch": 0.6873183448707738, + "grad_norm": 0.5418849717601474, + "learning_rate": 0.00015268863440259307, + "loss": 12.3072, + "step": 12622 + }, + { + "epoch": 0.6873727988673569, + "grad_norm": 0.6338427053423075, + "learning_rate": 0.0001526811392448714, + "loss": 12.316, + "step": 12623 + }, + { + "epoch": 0.6874272528639399, + "grad_norm": 0.6424556749417951, + "learning_rate": 0.00015267364367749242, + "loss": 12.2569, + "step": 12624 + }, + { + "epoch": 0.6874817068605229, + "grad_norm": 0.5318951779941986, + "learning_rate": 0.00015266614770051434, + "loss": 12.301, + "step": 12625 + }, + { + "epoch": 0.6875361608571059, + "grad_norm": 0.5824150194263151, + "learning_rate": 0.00015265865131399543, + "loss": 12.4017, + "step": 12626 + }, + { + "epoch": 0.6875906148536889, + "grad_norm": 0.6239848450959841, + "learning_rate": 0.00015265115451799403, + "loss": 12.3879, + "step": 12627 + }, + { + "epoch": 0.6876450688502719, + "grad_norm": 0.5886551546118839, + "learning_rate": 0.0001526436573125684, + "loss": 12.349, + "step": 12628 + }, + { + "epoch": 0.687699522846855, + "grad_norm": 0.6041463662522617, + "learning_rate": 0.00015263615969777691, + "loss": 12.2759, + "step": 12629 + }, + { + "epoch": 0.687753976843438, + "grad_norm": 0.6335787004896745, + "learning_rate": 0.00015262866167367785, + "loss": 12.2605, + "step": 12630 + }, + { + "epoch": 0.687808430840021, + "grad_norm": 0.5616758429928841, + "learning_rate": 0.00015262116324032943, + "loss": 12.2135, + "step": 12631 + }, + { + "epoch": 0.687862884836604, + "grad_norm": 0.5568859215137671, + "learning_rate": 0.00015261366439779, + "loss": 12.1521, + "step": 12632 + }, + { + "epoch": 0.687917338833187, + "grad_norm": 0.5301411883490512, + "learning_rate": 0.00015260616514611793, + "loss": 12.1824, + "step": 12633 + }, + { + "epoch": 0.68797179282977, + "grad_norm": 0.6112030855999007, + "learning_rate": 0.00015259866548537144, + "loss": 12.3687, + "step": 12634 + }, + { + "epoch": 0.688026246826353, + "grad_norm": 0.638658439137997, + "learning_rate": 0.00015259116541560902, + "loss": 12.3406, + "step": 12635 + }, + { + "epoch": 0.688080700822936, + "grad_norm": 0.5849601548744854, + "learning_rate": 0.0001525836649368888, + "loss": 12.2362, + "step": 12636 + }, + { + "epoch": 0.688135154819519, + "grad_norm": 0.5890796896219497, + "learning_rate": 0.00015257616404926915, + "loss": 12.2527, + "step": 12637 + }, + { + "epoch": 0.688189608816102, + "grad_norm": 0.5799865995323636, + "learning_rate": 0.0001525686627528085, + "loss": 12.3209, + "step": 12638 + }, + { + "epoch": 0.688244062812685, + "grad_norm": 0.6151652030859852, + "learning_rate": 0.00015256116104756507, + "loss": 12.3799, + "step": 12639 + }, + { + "epoch": 0.6882985168092681, + "grad_norm": 0.5264196989976043, + "learning_rate": 0.0001525536589335973, + "loss": 12.2559, + "step": 12640 + }, + { + "epoch": 0.6883529708058511, + "grad_norm": 0.5980748544120343, + "learning_rate": 0.0001525461564109634, + "loss": 12.2105, + "step": 12641 + }, + { + "epoch": 0.6884074248024341, + "grad_norm": 0.5726667071252022, + "learning_rate": 0.0001525386534797218, + "loss": 12.2894, + "step": 12642 + }, + { + "epoch": 0.6884618787990171, + "grad_norm": 0.5679190932449971, + "learning_rate": 0.0001525311501399308, + "loss": 12.3174, + "step": 12643 + }, + { + "epoch": 0.6885163327956001, + "grad_norm": 0.5482384017862806, + "learning_rate": 0.0001525236463916488, + "loss": 12.2429, + "step": 12644 + }, + { + "epoch": 0.6885707867921831, + "grad_norm": 0.547876635201763, + "learning_rate": 0.00015251614223493412, + "loss": 12.3051, + "step": 12645 + }, + { + "epoch": 0.6886252407887662, + "grad_norm": 0.6059517008941783, + "learning_rate": 0.00015250863766984508, + "loss": 12.2157, + "step": 12646 + }, + { + "epoch": 0.6886796947853492, + "grad_norm": 0.5977206310025369, + "learning_rate": 0.00015250113269644005, + "loss": 12.4078, + "step": 12647 + }, + { + "epoch": 0.6887341487819322, + "grad_norm": 0.5568252330295134, + "learning_rate": 0.00015249362731477745, + "loss": 12.3534, + "step": 12648 + }, + { + "epoch": 0.6887886027785152, + "grad_norm": 0.6075483860753537, + "learning_rate": 0.00015248612152491558, + "loss": 12.3157, + "step": 12649 + }, + { + "epoch": 0.6888430567750982, + "grad_norm": 0.5951457195496563, + "learning_rate": 0.00015247861532691285, + "loss": 12.3323, + "step": 12650 + }, + { + "epoch": 0.6888975107716812, + "grad_norm": 0.6261098231237463, + "learning_rate": 0.00015247110872082759, + "loss": 12.37, + "step": 12651 + }, + { + "epoch": 0.6889519647682643, + "grad_norm": 0.651710583732914, + "learning_rate": 0.0001524636017067182, + "loss": 12.3662, + "step": 12652 + }, + { + "epoch": 0.6890064187648473, + "grad_norm": 0.6357382069552527, + "learning_rate": 0.00015245609428464306, + "loss": 12.4347, + "step": 12653 + }, + { + "epoch": 0.6890608727614302, + "grad_norm": 0.6587152131477861, + "learning_rate": 0.00015244858645466046, + "loss": 12.3483, + "step": 12654 + }, + { + "epoch": 0.6891153267580132, + "grad_norm": 0.5647572288398478, + "learning_rate": 0.00015244107821682894, + "loss": 12.2754, + "step": 12655 + }, + { + "epoch": 0.6891697807545962, + "grad_norm": 0.5548134170905816, + "learning_rate": 0.00015243356957120676, + "loss": 12.2864, + "step": 12656 + }, + { + "epoch": 0.6892242347511792, + "grad_norm": 0.6079633115581555, + "learning_rate": 0.00015242606051785236, + "loss": 12.3156, + "step": 12657 + }, + { + "epoch": 0.6892786887477623, + "grad_norm": 0.6751531247949729, + "learning_rate": 0.00015241855105682412, + "loss": 12.2497, + "step": 12658 + }, + { + "epoch": 0.6893331427443453, + "grad_norm": 0.5337554194327473, + "learning_rate": 0.00015241104118818042, + "loss": 12.311, + "step": 12659 + }, + { + "epoch": 0.6893875967409283, + "grad_norm": 0.6036338919681892, + "learning_rate": 0.0001524035309119797, + "loss": 12.2545, + "step": 12660 + }, + { + "epoch": 0.6894420507375113, + "grad_norm": 0.5690388964265504, + "learning_rate": 0.00015239602022828032, + "loss": 12.2702, + "step": 12661 + }, + { + "epoch": 0.6894965047340943, + "grad_norm": 0.6315982376221074, + "learning_rate": 0.0001523885091371407, + "loss": 12.1872, + "step": 12662 + }, + { + "epoch": 0.6895509587306773, + "grad_norm": 0.560581222380461, + "learning_rate": 0.00015238099763861926, + "loss": 12.1801, + "step": 12663 + }, + { + "epoch": 0.6896054127272604, + "grad_norm": 0.5862186620249155, + "learning_rate": 0.0001523734857327744, + "loss": 12.1854, + "step": 12664 + }, + { + "epoch": 0.6896598667238434, + "grad_norm": 0.5748225751008038, + "learning_rate": 0.00015236597341966452, + "loss": 12.3736, + "step": 12665 + }, + { + "epoch": 0.6897143207204264, + "grad_norm": 0.5517267261087246, + "learning_rate": 0.00015235846069934804, + "loss": 12.1587, + "step": 12666 + }, + { + "epoch": 0.6897687747170094, + "grad_norm": 0.7096481690724898, + "learning_rate": 0.0001523509475718834, + "loss": 12.2512, + "step": 12667 + }, + { + "epoch": 0.6898232287135924, + "grad_norm": 0.6735552712041467, + "learning_rate": 0.00015234343403732902, + "loss": 12.1401, + "step": 12668 + }, + { + "epoch": 0.6898776827101754, + "grad_norm": 0.9097540623374685, + "learning_rate": 0.0001523359200957433, + "loss": 12.3809, + "step": 12669 + }, + { + "epoch": 0.6899321367067585, + "grad_norm": 0.6413803080548649, + "learning_rate": 0.00015232840574718473, + "loss": 12.2742, + "step": 12670 + }, + { + "epoch": 0.6899865907033415, + "grad_norm": 0.7383597525167909, + "learning_rate": 0.00015232089099171165, + "loss": 12.2784, + "step": 12671 + }, + { + "epoch": 0.6900410446999244, + "grad_norm": 0.7636786695259324, + "learning_rate": 0.00015231337582938257, + "loss": 12.4001, + "step": 12672 + }, + { + "epoch": 0.6900954986965074, + "grad_norm": 0.6394640382256751, + "learning_rate": 0.0001523058602602559, + "loss": 12.2575, + "step": 12673 + }, + { + "epoch": 0.6901499526930904, + "grad_norm": 0.6564638150277223, + "learning_rate": 0.0001522983442843901, + "loss": 12.2666, + "step": 12674 + }, + { + "epoch": 0.6902044066896735, + "grad_norm": 0.6819497811738562, + "learning_rate": 0.0001522908279018436, + "loss": 12.276, + "step": 12675 + }, + { + "epoch": 0.6902588606862565, + "grad_norm": 0.6309976102996402, + "learning_rate": 0.00015228331111267487, + "loss": 12.3654, + "step": 12676 + }, + { + "epoch": 0.6903133146828395, + "grad_norm": 0.632177406445353, + "learning_rate": 0.0001522757939169423, + "loss": 12.2867, + "step": 12677 + }, + { + "epoch": 0.6903677686794225, + "grad_norm": 0.6023323477054283, + "learning_rate": 0.00015226827631470443, + "loss": 12.3492, + "step": 12678 + }, + { + "epoch": 0.6904222226760055, + "grad_norm": 0.7334268999145962, + "learning_rate": 0.00015226075830601966, + "loss": 12.4541, + "step": 12679 + }, + { + "epoch": 0.6904766766725885, + "grad_norm": 0.6600399118868495, + "learning_rate": 0.0001522532398909465, + "loss": 12.3805, + "step": 12680 + }, + { + "epoch": 0.6905311306691716, + "grad_norm": 0.6266545126408947, + "learning_rate": 0.00015224572106954334, + "loss": 12.1993, + "step": 12681 + }, + { + "epoch": 0.6905855846657546, + "grad_norm": 0.6731096915303995, + "learning_rate": 0.0001522382018418687, + "loss": 12.1659, + "step": 12682 + }, + { + "epoch": 0.6906400386623376, + "grad_norm": 0.5633882992088448, + "learning_rate": 0.00015223068220798105, + "loss": 12.1641, + "step": 12683 + }, + { + "epoch": 0.6906944926589206, + "grad_norm": 0.6724408702456605, + "learning_rate": 0.00015222316216793885, + "loss": 12.3575, + "step": 12684 + }, + { + "epoch": 0.6907489466555036, + "grad_norm": 0.9751431943544033, + "learning_rate": 0.00015221564172180062, + "loss": 12.2214, + "step": 12685 + }, + { + "epoch": 0.6908034006520866, + "grad_norm": 0.5675514721348829, + "learning_rate": 0.00015220812086962475, + "loss": 12.2358, + "step": 12686 + }, + { + "epoch": 0.6908578546486697, + "grad_norm": 0.5828034922884001, + "learning_rate": 0.00015220059961146978, + "loss": 12.2934, + "step": 12687 + }, + { + "epoch": 0.6909123086452527, + "grad_norm": 0.7084947121379525, + "learning_rate": 0.00015219307794739424, + "loss": 12.2746, + "step": 12688 + }, + { + "epoch": 0.6909667626418357, + "grad_norm": 0.6315651900269331, + "learning_rate": 0.00015218555587745653, + "loss": 12.1951, + "step": 12689 + }, + { + "epoch": 0.6910212166384186, + "grad_norm": 0.6377830658396804, + "learning_rate": 0.00015217803340171522, + "loss": 12.2809, + "step": 12690 + }, + { + "epoch": 0.6910756706350016, + "grad_norm": 0.641576023705419, + "learning_rate": 0.00015217051052022877, + "loss": 12.2517, + "step": 12691 + }, + { + "epoch": 0.6911301246315846, + "grad_norm": 0.6469535141067794, + "learning_rate": 0.00015216298723305565, + "loss": 12.3938, + "step": 12692 + }, + { + "epoch": 0.6911845786281677, + "grad_norm": 0.5669266511751133, + "learning_rate": 0.0001521554635402544, + "loss": 12.2574, + "step": 12693 + }, + { + "epoch": 0.6912390326247507, + "grad_norm": 0.6420864976150247, + "learning_rate": 0.00015214793944188352, + "loss": 12.2383, + "step": 12694 + }, + { + "epoch": 0.6912934866213337, + "grad_norm": 0.581599782572865, + "learning_rate": 0.00015214041493800156, + "loss": 12.1808, + "step": 12695 + }, + { + "epoch": 0.6913479406179167, + "grad_norm": 0.5739199733755143, + "learning_rate": 0.00015213289002866698, + "loss": 12.2491, + "step": 12696 + }, + { + "epoch": 0.6914023946144997, + "grad_norm": 0.619162803547792, + "learning_rate": 0.00015212536471393825, + "loss": 12.35, + "step": 12697 + }, + { + "epoch": 0.6914568486110827, + "grad_norm": 0.5759863171225407, + "learning_rate": 0.00015211783899387395, + "loss": 12.2603, + "step": 12698 + }, + { + "epoch": 0.6915113026076658, + "grad_norm": 0.6136205391072797, + "learning_rate": 0.00015211031286853263, + "loss": 12.4047, + "step": 12699 + }, + { + "epoch": 0.6915657566042488, + "grad_norm": 0.5961564416118947, + "learning_rate": 0.00015210278633797278, + "loss": 12.22, + "step": 12700 + }, + { + "epoch": 0.6916202106008318, + "grad_norm": 0.5430192216745389, + "learning_rate": 0.00015209525940225292, + "loss": 12.2475, + "step": 12701 + }, + { + "epoch": 0.6916746645974148, + "grad_norm": 0.5534574797960371, + "learning_rate": 0.00015208773206143157, + "loss": 12.2389, + "step": 12702 + }, + { + "epoch": 0.6917291185939978, + "grad_norm": 0.5240599579137478, + "learning_rate": 0.00015208020431556725, + "loss": 12.2449, + "step": 12703 + }, + { + "epoch": 0.6917835725905809, + "grad_norm": 0.5372184729126631, + "learning_rate": 0.0001520726761647186, + "loss": 12.1546, + "step": 12704 + }, + { + "epoch": 0.6918380265871639, + "grad_norm": 0.580728450841165, + "learning_rate": 0.00015206514760894406, + "loss": 12.3278, + "step": 12705 + }, + { + "epoch": 0.6918924805837469, + "grad_norm": 0.6056883740245131, + "learning_rate": 0.0001520576186483022, + "loss": 12.2519, + "step": 12706 + }, + { + "epoch": 0.6919469345803299, + "grad_norm": 0.58052840996791, + "learning_rate": 0.00015205008928285155, + "loss": 12.1894, + "step": 12707 + }, + { + "epoch": 0.6920013885769128, + "grad_norm": 0.6161732779139788, + "learning_rate": 0.0001520425595126507, + "loss": 12.4466, + "step": 12708 + }, + { + "epoch": 0.6920558425734958, + "grad_norm": 0.590558757040953, + "learning_rate": 0.00015203502933775815, + "loss": 12.2084, + "step": 12709 + }, + { + "epoch": 0.6921102965700789, + "grad_norm": 0.6009086221721797, + "learning_rate": 0.0001520274987582325, + "loss": 12.2238, + "step": 12710 + }, + { + "epoch": 0.6921647505666619, + "grad_norm": 0.5674633245904497, + "learning_rate": 0.0001520199677741323, + "loss": 12.2568, + "step": 12711 + }, + { + "epoch": 0.6922192045632449, + "grad_norm": 0.5529595321026778, + "learning_rate": 0.0001520124363855161, + "loss": 12.2639, + "step": 12712 + }, + { + "epoch": 0.6922736585598279, + "grad_norm": 0.6156867194157509, + "learning_rate": 0.0001520049045924425, + "loss": 12.4344, + "step": 12713 + }, + { + "epoch": 0.6923281125564109, + "grad_norm": 0.6080044554590114, + "learning_rate": 0.00015199737239497, + "loss": 12.335, + "step": 12714 + }, + { + "epoch": 0.6923825665529939, + "grad_norm": 0.5941132220538231, + "learning_rate": 0.00015198983979315724, + "loss": 12.3197, + "step": 12715 + }, + { + "epoch": 0.692437020549577, + "grad_norm": 0.5653598233702917, + "learning_rate": 0.00015198230678706276, + "loss": 12.1885, + "step": 12716 + }, + { + "epoch": 0.69249147454616, + "grad_norm": 0.5466950124603381, + "learning_rate": 0.00015197477337674514, + "loss": 12.3421, + "step": 12717 + }, + { + "epoch": 0.692545928542743, + "grad_norm": 0.6162963914126968, + "learning_rate": 0.000151967239562263, + "loss": 12.3732, + "step": 12718 + }, + { + "epoch": 0.692600382539326, + "grad_norm": 0.5681428110322244, + "learning_rate": 0.00015195970534367484, + "loss": 12.1208, + "step": 12719 + }, + { + "epoch": 0.692654836535909, + "grad_norm": 0.5765561251357073, + "learning_rate": 0.0001519521707210393, + "loss": 12.2899, + "step": 12720 + }, + { + "epoch": 0.692709290532492, + "grad_norm": 0.566818435101275, + "learning_rate": 0.00015194463569441496, + "loss": 12.3202, + "step": 12721 + }, + { + "epoch": 0.6927637445290751, + "grad_norm": 0.6627642008500717, + "learning_rate": 0.00015193710026386047, + "loss": 12.3357, + "step": 12722 + }, + { + "epoch": 0.6928181985256581, + "grad_norm": 0.6395608584600606, + "learning_rate": 0.00015192956442943435, + "loss": 12.2827, + "step": 12723 + }, + { + "epoch": 0.6928726525222411, + "grad_norm": 0.5554920430630873, + "learning_rate": 0.00015192202819119523, + "loss": 12.2404, + "step": 12724 + }, + { + "epoch": 0.692927106518824, + "grad_norm": 0.5982854590907775, + "learning_rate": 0.00015191449154920172, + "loss": 12.2451, + "step": 12725 + }, + { + "epoch": 0.692981560515407, + "grad_norm": 0.6252918888190135, + "learning_rate": 0.00015190695450351244, + "loss": 12.2617, + "step": 12726 + }, + { + "epoch": 0.69303601451199, + "grad_norm": 0.5792879664772468, + "learning_rate": 0.0001518994170541859, + "loss": 12.1567, + "step": 12727 + }, + { + "epoch": 0.6930904685085731, + "grad_norm": 0.5816976069993248, + "learning_rate": 0.00015189187920128084, + "loss": 12.2965, + "step": 12728 + }, + { + "epoch": 0.6931449225051561, + "grad_norm": 0.5743328304663563, + "learning_rate": 0.00015188434094485584, + "loss": 12.2071, + "step": 12729 + }, + { + "epoch": 0.6931993765017391, + "grad_norm": 0.5497556272045456, + "learning_rate": 0.00015187680228496948, + "loss": 12.3136, + "step": 12730 + }, + { + "epoch": 0.6932538304983221, + "grad_norm": 0.5326517766493589, + "learning_rate": 0.00015186926322168043, + "loss": 12.262, + "step": 12731 + }, + { + "epoch": 0.6933082844949051, + "grad_norm": 0.6330746311860425, + "learning_rate": 0.00015186172375504726, + "loss": 12.3068, + "step": 12732 + }, + { + "epoch": 0.6933627384914881, + "grad_norm": 0.6460808031320815, + "learning_rate": 0.00015185418388512865, + "loss": 12.3098, + "step": 12733 + }, + { + "epoch": 0.6934171924880712, + "grad_norm": 0.5993161584869566, + "learning_rate": 0.0001518466436119832, + "loss": 12.2731, + "step": 12734 + }, + { + "epoch": 0.6934716464846542, + "grad_norm": 0.665898943721501, + "learning_rate": 0.00015183910293566956, + "loss": 12.4917, + "step": 12735 + }, + { + "epoch": 0.6935261004812372, + "grad_norm": 0.5514350662347167, + "learning_rate": 0.00015183156185624635, + "loss": 12.281, + "step": 12736 + }, + { + "epoch": 0.6935805544778202, + "grad_norm": 0.5824643224684546, + "learning_rate": 0.00015182402037377222, + "loss": 12.2971, + "step": 12737 + }, + { + "epoch": 0.6936350084744032, + "grad_norm": 0.5316514992963814, + "learning_rate": 0.00015181647848830585, + "loss": 12.2978, + "step": 12738 + }, + { + "epoch": 0.6936894624709863, + "grad_norm": 0.627321558576572, + "learning_rate": 0.00015180893619990581, + "loss": 12.3271, + "step": 12739 + }, + { + "epoch": 0.6937439164675693, + "grad_norm": 0.5816828595854907, + "learning_rate": 0.00015180139350863086, + "loss": 12.3728, + "step": 12740 + }, + { + "epoch": 0.6937983704641523, + "grad_norm": 0.6047340103704789, + "learning_rate": 0.00015179385041453952, + "loss": 12.2363, + "step": 12741 + }, + { + "epoch": 0.6938528244607353, + "grad_norm": 0.5710045945984428, + "learning_rate": 0.00015178630691769054, + "loss": 12.1712, + "step": 12742 + }, + { + "epoch": 0.6939072784573183, + "grad_norm": 0.5864954337963909, + "learning_rate": 0.00015177876301814255, + "loss": 12.3223, + "step": 12743 + }, + { + "epoch": 0.6939617324539012, + "grad_norm": 0.6000967347826478, + "learning_rate": 0.0001517712187159542, + "loss": 12.3434, + "step": 12744 + }, + { + "epoch": 0.6940161864504844, + "grad_norm": 0.5906872874461015, + "learning_rate": 0.0001517636740111842, + "loss": 12.2595, + "step": 12745 + }, + { + "epoch": 0.6940706404470673, + "grad_norm": 0.637024524133645, + "learning_rate": 0.00015175612890389118, + "loss": 12.4404, + "step": 12746 + }, + { + "epoch": 0.6941250944436503, + "grad_norm": 0.6078531507332336, + "learning_rate": 0.0001517485833941338, + "loss": 12.2771, + "step": 12747 + }, + { + "epoch": 0.6941795484402333, + "grad_norm": 0.5362085108950104, + "learning_rate": 0.00015174103748197078, + "loss": 12.3344, + "step": 12748 + }, + { + "epoch": 0.6942340024368163, + "grad_norm": 0.591849383692123, + "learning_rate": 0.0001517334911674608, + "loss": 12.2298, + "step": 12749 + }, + { + "epoch": 0.6942884564333993, + "grad_norm": 0.5655366449593927, + "learning_rate": 0.0001517259444506625, + "loss": 12.3227, + "step": 12750 + }, + { + "epoch": 0.6943429104299824, + "grad_norm": 0.5807513533675587, + "learning_rate": 0.00015171839733163457, + "loss": 12.2478, + "step": 12751 + }, + { + "epoch": 0.6943973644265654, + "grad_norm": 0.5495206828814007, + "learning_rate": 0.0001517108498104357, + "loss": 12.2208, + "step": 12752 + }, + { + "epoch": 0.6944518184231484, + "grad_norm": 0.5486352146566552, + "learning_rate": 0.00015170330188712456, + "loss": 12.2291, + "step": 12753 + }, + { + "epoch": 0.6945062724197314, + "grad_norm": 0.6028736531502734, + "learning_rate": 0.00015169575356175992, + "loss": 12.4123, + "step": 12754 + }, + { + "epoch": 0.6945607264163144, + "grad_norm": 0.5180394854006246, + "learning_rate": 0.00015168820483440042, + "loss": 12.2678, + "step": 12755 + }, + { + "epoch": 0.6946151804128974, + "grad_norm": 0.6338718124617676, + "learning_rate": 0.00015168065570510478, + "loss": 12.2655, + "step": 12756 + }, + { + "epoch": 0.6946696344094805, + "grad_norm": 0.5826209963066219, + "learning_rate": 0.0001516731061739317, + "loss": 12.2701, + "step": 12757 + }, + { + "epoch": 0.6947240884060635, + "grad_norm": 0.6083649233191714, + "learning_rate": 0.00015166555624093986, + "loss": 12.2946, + "step": 12758 + }, + { + "epoch": 0.6947785424026465, + "grad_norm": 0.5929086951518965, + "learning_rate": 0.000151658005906188, + "loss": 12.2975, + "step": 12759 + }, + { + "epoch": 0.6948329963992295, + "grad_norm": 0.6159253227796201, + "learning_rate": 0.00015165045516973483, + "loss": 12.4129, + "step": 12760 + }, + { + "epoch": 0.6948874503958125, + "grad_norm": 0.5961923098818454, + "learning_rate": 0.00015164290403163905, + "loss": 12.2542, + "step": 12761 + }, + { + "epoch": 0.6949419043923954, + "grad_norm": 0.6082280448345292, + "learning_rate": 0.0001516353524919594, + "loss": 12.3274, + "step": 12762 + }, + { + "epoch": 0.6949963583889786, + "grad_norm": 0.5843565354596677, + "learning_rate": 0.00015162780055075456, + "loss": 12.3214, + "step": 12763 + }, + { + "epoch": 0.6950508123855615, + "grad_norm": 0.5922646499860922, + "learning_rate": 0.0001516202482080833, + "loss": 12.2993, + "step": 12764 + }, + { + "epoch": 0.6951052663821445, + "grad_norm": 0.5319499220696793, + "learning_rate": 0.00015161269546400434, + "loss": 12.2324, + "step": 12765 + }, + { + "epoch": 0.6951597203787275, + "grad_norm": 0.7045455639918438, + "learning_rate": 0.00015160514231857643, + "loss": 12.2201, + "step": 12766 + }, + { + "epoch": 0.6952141743753105, + "grad_norm": 0.5619467296914059, + "learning_rate": 0.00015159758877185823, + "loss": 12.2243, + "step": 12767 + }, + { + "epoch": 0.6952686283718935, + "grad_norm": 0.5465287615022251, + "learning_rate": 0.00015159003482390857, + "loss": 12.3618, + "step": 12768 + }, + { + "epoch": 0.6953230823684766, + "grad_norm": 0.5405474447716598, + "learning_rate": 0.0001515824804747861, + "loss": 12.1688, + "step": 12769 + }, + { + "epoch": 0.6953775363650596, + "grad_norm": 0.617095133941277, + "learning_rate": 0.00015157492572454964, + "loss": 12.2569, + "step": 12770 + }, + { + "epoch": 0.6954319903616426, + "grad_norm": 0.5594754357401277, + "learning_rate": 0.00015156737057325792, + "loss": 12.1694, + "step": 12771 + }, + { + "epoch": 0.6954864443582256, + "grad_norm": 0.564470075597679, + "learning_rate": 0.00015155981502096966, + "loss": 12.1877, + "step": 12772 + }, + { + "epoch": 0.6955408983548086, + "grad_norm": 0.5394155263359696, + "learning_rate": 0.00015155225906774363, + "loss": 12.3133, + "step": 12773 + }, + { + "epoch": 0.6955953523513917, + "grad_norm": 0.5964767067731667, + "learning_rate": 0.00015154470271363862, + "loss": 12.2444, + "step": 12774 + }, + { + "epoch": 0.6956498063479747, + "grad_norm": 0.5258330599265519, + "learning_rate": 0.00015153714595871328, + "loss": 12.3298, + "step": 12775 + }, + { + "epoch": 0.6957042603445577, + "grad_norm": 0.515769601343884, + "learning_rate": 0.00015152958880302654, + "loss": 12.1901, + "step": 12776 + }, + { + "epoch": 0.6957587143411407, + "grad_norm": 0.5625539317088408, + "learning_rate": 0.00015152203124663705, + "loss": 12.1423, + "step": 12777 + }, + { + "epoch": 0.6958131683377237, + "grad_norm": 0.5738176088998937, + "learning_rate": 0.00015151447328960358, + "loss": 12.3304, + "step": 12778 + }, + { + "epoch": 0.6958676223343067, + "grad_norm": 0.5337821156892026, + "learning_rate": 0.00015150691493198495, + "loss": 12.158, + "step": 12779 + }, + { + "epoch": 0.6959220763308898, + "grad_norm": 0.5572367138584677, + "learning_rate": 0.00015149935617383986, + "loss": 12.3404, + "step": 12780 + }, + { + "epoch": 0.6959765303274728, + "grad_norm": 0.5785620836035866, + "learning_rate": 0.0001514917970152272, + "loss": 12.2895, + "step": 12781 + }, + { + "epoch": 0.6960309843240557, + "grad_norm": 0.5567436442972655, + "learning_rate": 0.00015148423745620567, + "loss": 12.2598, + "step": 12782 + }, + { + "epoch": 0.6960854383206387, + "grad_norm": 0.5436685548209514, + "learning_rate": 0.0001514766774968341, + "loss": 12.2851, + "step": 12783 + }, + { + "epoch": 0.6961398923172217, + "grad_norm": 0.5620871243842307, + "learning_rate": 0.00015146911713717122, + "loss": 12.2629, + "step": 12784 + }, + { + "epoch": 0.6961943463138047, + "grad_norm": 0.5825963197643459, + "learning_rate": 0.00015146155637727588, + "loss": 12.2778, + "step": 12785 + }, + { + "epoch": 0.6962488003103878, + "grad_norm": 0.580599403779096, + "learning_rate": 0.0001514539952172068, + "loss": 12.293, + "step": 12786 + }, + { + "epoch": 0.6963032543069708, + "grad_norm": 0.5547121852362786, + "learning_rate": 0.0001514464336570229, + "loss": 12.1824, + "step": 12787 + }, + { + "epoch": 0.6963577083035538, + "grad_norm": 0.5892606362080496, + "learning_rate": 0.00015143887169678284, + "loss": 12.1196, + "step": 12788 + }, + { + "epoch": 0.6964121623001368, + "grad_norm": 0.6486531519357543, + "learning_rate": 0.0001514313093365455, + "loss": 12.3475, + "step": 12789 + }, + { + "epoch": 0.6964666162967198, + "grad_norm": 0.5804390280894669, + "learning_rate": 0.00015142374657636969, + "loss": 12.3115, + "step": 12790 + }, + { + "epoch": 0.6965210702933028, + "grad_norm": 0.6050143210622061, + "learning_rate": 0.00015141618341631418, + "loss": 12.3309, + "step": 12791 + }, + { + "epoch": 0.6965755242898859, + "grad_norm": 0.6440660563413831, + "learning_rate": 0.0001514086198564378, + "loss": 12.2225, + "step": 12792 + }, + { + "epoch": 0.6966299782864689, + "grad_norm": 0.6035204658067048, + "learning_rate": 0.00015140105589679943, + "loss": 12.3462, + "step": 12793 + }, + { + "epoch": 0.6966844322830519, + "grad_norm": 0.586423227975999, + "learning_rate": 0.00015139349153745778, + "loss": 12.1881, + "step": 12794 + }, + { + "epoch": 0.6967388862796349, + "grad_norm": 0.6195614052981723, + "learning_rate": 0.00015138592677847173, + "loss": 12.3054, + "step": 12795 + }, + { + "epoch": 0.6967933402762179, + "grad_norm": 0.5695957080876042, + "learning_rate": 0.0001513783616199001, + "loss": 12.2431, + "step": 12796 + }, + { + "epoch": 0.6968477942728009, + "grad_norm": 0.5401987111839878, + "learning_rate": 0.00015137079606180172, + "loss": 12.2292, + "step": 12797 + }, + { + "epoch": 0.696902248269384, + "grad_norm": 0.644588419023635, + "learning_rate": 0.0001513632301042354, + "loss": 12.2898, + "step": 12798 + }, + { + "epoch": 0.696956702265967, + "grad_norm": 0.558414091002706, + "learning_rate": 0.00015135566374726, + "loss": 12.2123, + "step": 12799 + }, + { + "epoch": 0.69701115626255, + "grad_norm": 0.5782843434074975, + "learning_rate": 0.00015134809699093434, + "loss": 12.3783, + "step": 12800 + }, + { + "epoch": 0.6970656102591329, + "grad_norm": 0.6892559074368685, + "learning_rate": 0.00015134052983531725, + "loss": 12.3138, + "step": 12801 + }, + { + "epoch": 0.6971200642557159, + "grad_norm": 0.596134638203521, + "learning_rate": 0.00015133296228046764, + "loss": 12.2218, + "step": 12802 + }, + { + "epoch": 0.6971745182522989, + "grad_norm": 0.6657524565185177, + "learning_rate": 0.00015132539432644425, + "loss": 12.2724, + "step": 12803 + }, + { + "epoch": 0.697228972248882, + "grad_norm": 0.6904308970357311, + "learning_rate": 0.00015131782597330602, + "loss": 12.3158, + "step": 12804 + }, + { + "epoch": 0.697283426245465, + "grad_norm": 0.5646015845227467, + "learning_rate": 0.00015131025722111177, + "loss": 12.2422, + "step": 12805 + }, + { + "epoch": 0.697337880242048, + "grad_norm": 0.6003823456063323, + "learning_rate": 0.00015130268806992037, + "loss": 12.4381, + "step": 12806 + }, + { + "epoch": 0.697392334238631, + "grad_norm": 0.6052761761039406, + "learning_rate": 0.0001512951185197906, + "loss": 12.4855, + "step": 12807 + }, + { + "epoch": 0.697446788235214, + "grad_norm": 0.5215938576523123, + "learning_rate": 0.00015128754857078141, + "loss": 12.1122, + "step": 12808 + }, + { + "epoch": 0.6975012422317971, + "grad_norm": 0.5757052319756734, + "learning_rate": 0.00015127997822295168, + "loss": 12.3324, + "step": 12809 + }, + { + "epoch": 0.6975556962283801, + "grad_norm": 0.5407436135189331, + "learning_rate": 0.00015127240747636023, + "loss": 12.2614, + "step": 12810 + }, + { + "epoch": 0.6976101502249631, + "grad_norm": 0.563649288491365, + "learning_rate": 0.00015126483633106595, + "loss": 12.2358, + "step": 12811 + }, + { + "epoch": 0.6976646042215461, + "grad_norm": 0.576318626856709, + "learning_rate": 0.00015125726478712765, + "loss": 12.3351, + "step": 12812 + }, + { + "epoch": 0.6977190582181291, + "grad_norm": 0.6966281930802263, + "learning_rate": 0.00015124969284460428, + "loss": 12.4885, + "step": 12813 + }, + { + "epoch": 0.6977735122147121, + "grad_norm": 0.7809185908219524, + "learning_rate": 0.00015124212050355475, + "loss": 12.3636, + "step": 12814 + }, + { + "epoch": 0.6978279662112952, + "grad_norm": 0.5746577283165907, + "learning_rate": 0.00015123454776403786, + "loss": 12.2467, + "step": 12815 + }, + { + "epoch": 0.6978824202078782, + "grad_norm": 0.6024413136284706, + "learning_rate": 0.00015122697462611258, + "loss": 12.2904, + "step": 12816 + }, + { + "epoch": 0.6979368742044612, + "grad_norm": 0.5206933345152238, + "learning_rate": 0.0001512194010898377, + "loss": 12.248, + "step": 12817 + }, + { + "epoch": 0.6979913282010441, + "grad_norm": 0.5610679148396018, + "learning_rate": 0.00015121182715527217, + "loss": 12.3231, + "step": 12818 + }, + { + "epoch": 0.6980457821976271, + "grad_norm": 0.5704520611048067, + "learning_rate": 0.0001512042528224749, + "loss": 12.2603, + "step": 12819 + }, + { + "epoch": 0.6981002361942101, + "grad_norm": 0.5341114754262499, + "learning_rate": 0.00015119667809150475, + "loss": 12.2551, + "step": 12820 + }, + { + "epoch": 0.6981546901907932, + "grad_norm": 0.5651258746143369, + "learning_rate": 0.0001511891029624207, + "loss": 12.3242, + "step": 12821 + }, + { + "epoch": 0.6982091441873762, + "grad_norm": 0.6127180680960584, + "learning_rate": 0.00015118152743528158, + "loss": 12.2176, + "step": 12822 + }, + { + "epoch": 0.6982635981839592, + "grad_norm": 0.6178526829964687, + "learning_rate": 0.0001511739515101463, + "loss": 12.3267, + "step": 12823 + }, + { + "epoch": 0.6983180521805422, + "grad_norm": 0.5485086221160265, + "learning_rate": 0.00015116637518707376, + "loss": 12.2266, + "step": 12824 + }, + { + "epoch": 0.6983725061771252, + "grad_norm": 0.5591931811739885, + "learning_rate": 0.00015115879846612295, + "loss": 12.1893, + "step": 12825 + }, + { + "epoch": 0.6984269601737082, + "grad_norm": 0.5504356367908919, + "learning_rate": 0.00015115122134735278, + "loss": 12.1897, + "step": 12826 + }, + { + "epoch": 0.6984814141702913, + "grad_norm": 0.5791906704907291, + "learning_rate": 0.00015114364383082208, + "loss": 12.3134, + "step": 12827 + }, + { + "epoch": 0.6985358681668743, + "grad_norm": 0.5927965701697596, + "learning_rate": 0.00015113606591658985, + "loss": 12.2676, + "step": 12828 + }, + { + "epoch": 0.6985903221634573, + "grad_norm": 0.6514547782037758, + "learning_rate": 0.00015112848760471498, + "loss": 12.2984, + "step": 12829 + }, + { + "epoch": 0.6986447761600403, + "grad_norm": 0.5625665172333867, + "learning_rate": 0.0001511209088952564, + "loss": 12.2379, + "step": 12830 + }, + { + "epoch": 0.6986992301566233, + "grad_norm": 0.6080219521627901, + "learning_rate": 0.0001511133297882731, + "loss": 12.3662, + "step": 12831 + }, + { + "epoch": 0.6987536841532063, + "grad_norm": 0.6208368036638605, + "learning_rate": 0.00015110575028382396, + "loss": 12.2077, + "step": 12832 + }, + { + "epoch": 0.6988081381497894, + "grad_norm": 0.5981735476751217, + "learning_rate": 0.00015109817038196796, + "loss": 12.3023, + "step": 12833 + }, + { + "epoch": 0.6988625921463724, + "grad_norm": 0.6878859063713462, + "learning_rate": 0.00015109059008276397, + "loss": 12.3612, + "step": 12834 + }, + { + "epoch": 0.6989170461429554, + "grad_norm": 0.6536308103657945, + "learning_rate": 0.000151083009386271, + "loss": 12.3678, + "step": 12835 + }, + { + "epoch": 0.6989715001395383, + "grad_norm": 0.5334163204272254, + "learning_rate": 0.00015107542829254802, + "loss": 12.2648, + "step": 12836 + }, + { + "epoch": 0.6990259541361213, + "grad_norm": 0.5811757267663836, + "learning_rate": 0.0001510678468016539, + "loss": 12.2406, + "step": 12837 + }, + { + "epoch": 0.6990804081327044, + "grad_norm": 0.5791601339932474, + "learning_rate": 0.00015106026491364767, + "loss": 12.173, + "step": 12838 + }, + { + "epoch": 0.6991348621292874, + "grad_norm": 0.5577669427986419, + "learning_rate": 0.00015105268262858825, + "loss": 12.3816, + "step": 12839 + }, + { + "epoch": 0.6991893161258704, + "grad_norm": 0.5523126670538845, + "learning_rate": 0.00015104509994653457, + "loss": 12.2392, + "step": 12840 + }, + { + "epoch": 0.6992437701224534, + "grad_norm": 0.5755533090316269, + "learning_rate": 0.00015103751686754567, + "loss": 12.2055, + "step": 12841 + }, + { + "epoch": 0.6992982241190364, + "grad_norm": 0.5715341861488942, + "learning_rate": 0.00015102993339168047, + "loss": 12.2667, + "step": 12842 + }, + { + "epoch": 0.6993526781156194, + "grad_norm": 0.6337431022952587, + "learning_rate": 0.00015102234951899795, + "loss": 12.2897, + "step": 12843 + }, + { + "epoch": 0.6994071321122025, + "grad_norm": 0.5545934302937681, + "learning_rate": 0.0001510147652495571, + "loss": 12.208, + "step": 12844 + }, + { + "epoch": 0.6994615861087855, + "grad_norm": 0.5818395953361071, + "learning_rate": 0.00015100718058341686, + "loss": 12.3168, + "step": 12845 + }, + { + "epoch": 0.6995160401053685, + "grad_norm": 0.54488203576456, + "learning_rate": 0.00015099959552063622, + "loss": 12.2805, + "step": 12846 + }, + { + "epoch": 0.6995704941019515, + "grad_norm": 0.5819073442016384, + "learning_rate": 0.00015099201006127418, + "loss": 12.2357, + "step": 12847 + }, + { + "epoch": 0.6996249480985345, + "grad_norm": 0.5980085895394841, + "learning_rate": 0.00015098442420538973, + "loss": 12.1713, + "step": 12848 + }, + { + "epoch": 0.6996794020951175, + "grad_norm": 0.5649860888099074, + "learning_rate": 0.00015097683795304185, + "loss": 12.2638, + "step": 12849 + }, + { + "epoch": 0.6997338560917006, + "grad_norm": 0.5692873645192698, + "learning_rate": 0.0001509692513042895, + "loss": 12.2009, + "step": 12850 + }, + { + "epoch": 0.6997883100882836, + "grad_norm": 0.5254540513369161, + "learning_rate": 0.00015096166425919175, + "loss": 12.3878, + "step": 12851 + }, + { + "epoch": 0.6998427640848666, + "grad_norm": 0.559223478872943, + "learning_rate": 0.00015095407681780753, + "loss": 12.1803, + "step": 12852 + }, + { + "epoch": 0.6998972180814496, + "grad_norm": 0.5561422820246377, + "learning_rate": 0.00015094648898019588, + "loss": 12.2901, + "step": 12853 + }, + { + "epoch": 0.6999516720780325, + "grad_norm": 0.6398204891444215, + "learning_rate": 0.00015093890074641575, + "loss": 12.2311, + "step": 12854 + }, + { + "epoch": 0.7000061260746155, + "grad_norm": 0.5730771643651186, + "learning_rate": 0.0001509313121165262, + "loss": 12.2579, + "step": 12855 + }, + { + "epoch": 0.7000605800711986, + "grad_norm": 0.5339535864535195, + "learning_rate": 0.00015092372309058623, + "loss": 12.1686, + "step": 12856 + }, + { + "epoch": 0.7001150340677816, + "grad_norm": 0.5994247273085027, + "learning_rate": 0.00015091613366865486, + "loss": 12.2944, + "step": 12857 + }, + { + "epoch": 0.7001694880643646, + "grad_norm": 0.5083786193557129, + "learning_rate": 0.0001509085438507911, + "loss": 12.1098, + "step": 12858 + }, + { + "epoch": 0.7002239420609476, + "grad_norm": 0.5813561783614933, + "learning_rate": 0.00015090095363705394, + "loss": 12.3114, + "step": 12859 + }, + { + "epoch": 0.7002783960575306, + "grad_norm": 0.5256842061518319, + "learning_rate": 0.00015089336302750246, + "loss": 12.2911, + "step": 12860 + }, + { + "epoch": 0.7003328500541136, + "grad_norm": 0.5854745903233185, + "learning_rate": 0.00015088577202219562, + "loss": 12.3942, + "step": 12861 + }, + { + "epoch": 0.7003873040506967, + "grad_norm": 0.6232949445893607, + "learning_rate": 0.0001508781806211925, + "loss": 12.3221, + "step": 12862 + }, + { + "epoch": 0.7004417580472797, + "grad_norm": 0.556903770839926, + "learning_rate": 0.00015087058882455213, + "loss": 12.3262, + "step": 12863 + }, + { + "epoch": 0.7004962120438627, + "grad_norm": 0.5511222088378669, + "learning_rate": 0.0001508629966323335, + "loss": 12.2111, + "step": 12864 + }, + { + "epoch": 0.7005506660404457, + "grad_norm": 0.6933096411771393, + "learning_rate": 0.0001508554040445957, + "loss": 12.3443, + "step": 12865 + }, + { + "epoch": 0.7006051200370287, + "grad_norm": 0.6450763766755955, + "learning_rate": 0.00015084781106139777, + "loss": 12.2932, + "step": 12866 + }, + { + "epoch": 0.7006595740336117, + "grad_norm": 0.6180854258717581, + "learning_rate": 0.00015084021768279867, + "loss": 12.326, + "step": 12867 + }, + { + "epoch": 0.7007140280301948, + "grad_norm": 0.5652729378781475, + "learning_rate": 0.00015083262390885754, + "loss": 12.1985, + "step": 12868 + }, + { + "epoch": 0.7007684820267778, + "grad_norm": 0.5832845128774566, + "learning_rate": 0.0001508250297396334, + "loss": 12.3935, + "step": 12869 + }, + { + "epoch": 0.7008229360233608, + "grad_norm": 0.6256855422987257, + "learning_rate": 0.00015081743517518533, + "loss": 12.3105, + "step": 12870 + }, + { + "epoch": 0.7008773900199438, + "grad_norm": 0.6514861781739107, + "learning_rate": 0.0001508098402155724, + "loss": 12.2294, + "step": 12871 + }, + { + "epoch": 0.7009318440165268, + "grad_norm": 0.5976054048957782, + "learning_rate": 0.00015080224486085355, + "loss": 12.2549, + "step": 12872 + }, + { + "epoch": 0.7009862980131099, + "grad_norm": 0.6126191865281446, + "learning_rate": 0.00015079464911108793, + "loss": 12.2187, + "step": 12873 + }, + { + "epoch": 0.7010407520096928, + "grad_norm": 0.5903689854623854, + "learning_rate": 0.00015078705296633462, + "loss": 12.2371, + "step": 12874 + }, + { + "epoch": 0.7010952060062758, + "grad_norm": 0.5812216652704836, + "learning_rate": 0.00015077945642665269, + "loss": 12.369, + "step": 12875 + }, + { + "epoch": 0.7011496600028588, + "grad_norm": 0.5534852216522496, + "learning_rate": 0.00015077185949210117, + "loss": 12.2853, + "step": 12876 + }, + { + "epoch": 0.7012041139994418, + "grad_norm": 0.6274739036507253, + "learning_rate": 0.00015076426216273914, + "loss": 12.1898, + "step": 12877 + }, + { + "epoch": 0.7012585679960248, + "grad_norm": 0.6063997065462186, + "learning_rate": 0.00015075666443862572, + "loss": 12.3014, + "step": 12878 + }, + { + "epoch": 0.7013130219926079, + "grad_norm": 0.5217788404855932, + "learning_rate": 0.00015074906631981999, + "loss": 12.2941, + "step": 12879 + }, + { + "epoch": 0.7013674759891909, + "grad_norm": 0.5552649160050687, + "learning_rate": 0.00015074146780638095, + "loss": 12.1025, + "step": 12880 + }, + { + "epoch": 0.7014219299857739, + "grad_norm": 0.6392569035584885, + "learning_rate": 0.00015073386889836782, + "loss": 12.1831, + "step": 12881 + }, + { + "epoch": 0.7014763839823569, + "grad_norm": 0.5521041501633279, + "learning_rate": 0.00015072626959583956, + "loss": 12.2143, + "step": 12882 + }, + { + "epoch": 0.7015308379789399, + "grad_norm": 0.6227440240483435, + "learning_rate": 0.00015071866989885532, + "loss": 12.3998, + "step": 12883 + }, + { + "epoch": 0.7015852919755229, + "grad_norm": 0.6334247030673338, + "learning_rate": 0.00015071106980747421, + "loss": 12.3131, + "step": 12884 + }, + { + "epoch": 0.701639745972106, + "grad_norm": 0.5563039830909307, + "learning_rate": 0.0001507034693217553, + "loss": 12.2404, + "step": 12885 + }, + { + "epoch": 0.701694199968689, + "grad_norm": 0.5740373990473525, + "learning_rate": 0.00015069586844175775, + "loss": 12.1678, + "step": 12886 + }, + { + "epoch": 0.701748653965272, + "grad_norm": 0.5615508506413522, + "learning_rate": 0.00015068826716754062, + "loss": 12.2148, + "step": 12887 + }, + { + "epoch": 0.701803107961855, + "grad_norm": 0.5881964519645851, + "learning_rate": 0.000150680665499163, + "loss": 12.3873, + "step": 12888 + }, + { + "epoch": 0.701857561958438, + "grad_norm": 0.6559325462620302, + "learning_rate": 0.000150673063436684, + "loss": 12.2265, + "step": 12889 + }, + { + "epoch": 0.701912015955021, + "grad_norm": 0.5351451244591657, + "learning_rate": 0.0001506654609801628, + "loss": 12.2559, + "step": 12890 + }, + { + "epoch": 0.701966469951604, + "grad_norm": 0.5569533211512768, + "learning_rate": 0.00015065785812965848, + "loss": 12.0566, + "step": 12891 + }, + { + "epoch": 0.702020923948187, + "grad_norm": 0.5937804356031744, + "learning_rate": 0.00015065025488523017, + "loss": 12.2113, + "step": 12892 + }, + { + "epoch": 0.70207537794477, + "grad_norm": 0.5659010125123903, + "learning_rate": 0.00015064265124693698, + "loss": 12.278, + "step": 12893 + }, + { + "epoch": 0.702129831941353, + "grad_norm": 0.535901319384464, + "learning_rate": 0.00015063504721483803, + "loss": 12.3676, + "step": 12894 + }, + { + "epoch": 0.702184285937936, + "grad_norm": 0.5669487525365963, + "learning_rate": 0.00015062744278899242, + "loss": 12.3111, + "step": 12895 + }, + { + "epoch": 0.702238739934519, + "grad_norm": 0.6140266146940161, + "learning_rate": 0.0001506198379694594, + "loss": 12.276, + "step": 12896 + }, + { + "epoch": 0.7022931939311021, + "grad_norm": 0.5226019933935814, + "learning_rate": 0.000150612232756298, + "loss": 12.2468, + "step": 12897 + }, + { + "epoch": 0.7023476479276851, + "grad_norm": 0.5518765132762485, + "learning_rate": 0.00015060462714956737, + "loss": 12.3558, + "step": 12898 + }, + { + "epoch": 0.7024021019242681, + "grad_norm": 0.5930158592363907, + "learning_rate": 0.0001505970211493267, + "loss": 12.2942, + "step": 12899 + }, + { + "epoch": 0.7024565559208511, + "grad_norm": 0.5990262226635165, + "learning_rate": 0.00015058941475563507, + "loss": 12.4069, + "step": 12900 + }, + { + "epoch": 0.7025110099174341, + "grad_norm": 0.6295009447765174, + "learning_rate": 0.0001505818079685517, + "loss": 12.2482, + "step": 12901 + }, + { + "epoch": 0.7025654639140171, + "grad_norm": 0.5713781944258155, + "learning_rate": 0.0001505742007881357, + "loss": 12.272, + "step": 12902 + }, + { + "epoch": 0.7026199179106002, + "grad_norm": 0.5775952506978804, + "learning_rate": 0.00015056659321444627, + "loss": 12.198, + "step": 12903 + }, + { + "epoch": 0.7026743719071832, + "grad_norm": 0.5989314705696425, + "learning_rate": 0.00015055898524754245, + "loss": 12.3093, + "step": 12904 + }, + { + "epoch": 0.7027288259037662, + "grad_norm": 0.5691834182800973, + "learning_rate": 0.00015055137688748356, + "loss": 12.2635, + "step": 12905 + }, + { + "epoch": 0.7027832799003492, + "grad_norm": 0.5813875692402253, + "learning_rate": 0.00015054376813432866, + "loss": 12.2248, + "step": 12906 + }, + { + "epoch": 0.7028377338969322, + "grad_norm": 0.5870360426395933, + "learning_rate": 0.00015053615898813694, + "loss": 12.2217, + "step": 12907 + }, + { + "epoch": 0.7028921878935153, + "grad_norm": 0.5343732930374057, + "learning_rate": 0.00015052854944896756, + "loss": 12.3343, + "step": 12908 + }, + { + "epoch": 0.7029466418900983, + "grad_norm": 0.5699443719681166, + "learning_rate": 0.0001505209395168797, + "loss": 12.223, + "step": 12909 + }, + { + "epoch": 0.7030010958866812, + "grad_norm": 0.5973307979328459, + "learning_rate": 0.00015051332919193257, + "loss": 12.2756, + "step": 12910 + }, + { + "epoch": 0.7030555498832642, + "grad_norm": 0.6120749129557116, + "learning_rate": 0.0001505057184741853, + "loss": 12.3103, + "step": 12911 + }, + { + "epoch": 0.7031100038798472, + "grad_norm": 0.6953633712044719, + "learning_rate": 0.0001504981073636971, + "loss": 12.3662, + "step": 12912 + }, + { + "epoch": 0.7031644578764302, + "grad_norm": 0.5138596022468547, + "learning_rate": 0.00015049049586052718, + "loss": 12.253, + "step": 12913 + }, + { + "epoch": 0.7032189118730133, + "grad_norm": 0.6047293103686023, + "learning_rate": 0.00015048288396473468, + "loss": 12.4362, + "step": 12914 + }, + { + "epoch": 0.7032733658695963, + "grad_norm": 0.5717537391124353, + "learning_rate": 0.00015047527167637878, + "loss": 12.1667, + "step": 12915 + }, + { + "epoch": 0.7033278198661793, + "grad_norm": 0.5407187604839495, + "learning_rate": 0.0001504676589955187, + "loss": 12.1886, + "step": 12916 + }, + { + "epoch": 0.7033822738627623, + "grad_norm": 0.6358353512639061, + "learning_rate": 0.00015046004592221367, + "loss": 12.2309, + "step": 12917 + }, + { + "epoch": 0.7034367278593453, + "grad_norm": 0.5882551571751676, + "learning_rate": 0.00015045243245652285, + "loss": 12.3412, + "step": 12918 + }, + { + "epoch": 0.7034911818559283, + "grad_norm": 0.5664833841054566, + "learning_rate": 0.00015044481859850545, + "loss": 12.3031, + "step": 12919 + }, + { + "epoch": 0.7035456358525114, + "grad_norm": 0.6640648902820085, + "learning_rate": 0.00015043720434822067, + "loss": 12.1166, + "step": 12920 + }, + { + "epoch": 0.7036000898490944, + "grad_norm": 0.5332762140500776, + "learning_rate": 0.0001504295897057278, + "loss": 12.213, + "step": 12921 + }, + { + "epoch": 0.7036545438456774, + "grad_norm": 0.6033862431121884, + "learning_rate": 0.0001504219746710859, + "loss": 12.1348, + "step": 12922 + }, + { + "epoch": 0.7037089978422604, + "grad_norm": 0.6008975227578273, + "learning_rate": 0.0001504143592443543, + "loss": 12.2651, + "step": 12923 + }, + { + "epoch": 0.7037634518388434, + "grad_norm": 0.5713175980307648, + "learning_rate": 0.00015040674342559217, + "loss": 12.1195, + "step": 12924 + }, + { + "epoch": 0.7038179058354264, + "grad_norm": 0.6870336688388138, + "learning_rate": 0.00015039912721485877, + "loss": 12.2627, + "step": 12925 + }, + { + "epoch": 0.7038723598320095, + "grad_norm": 0.550509285896728, + "learning_rate": 0.0001503915106122133, + "loss": 12.2699, + "step": 12926 + }, + { + "epoch": 0.7039268138285925, + "grad_norm": 0.5183542937646016, + "learning_rate": 0.000150383893617715, + "loss": 12.1956, + "step": 12927 + }, + { + "epoch": 0.7039812678251754, + "grad_norm": 0.505524050927356, + "learning_rate": 0.0001503762762314231, + "loss": 12.2436, + "step": 12928 + }, + { + "epoch": 0.7040357218217584, + "grad_norm": 0.5528750624664851, + "learning_rate": 0.0001503686584533968, + "loss": 12.2574, + "step": 12929 + }, + { + "epoch": 0.7040901758183414, + "grad_norm": 0.6847928417111844, + "learning_rate": 0.00015036104028369538, + "loss": 12.2566, + "step": 12930 + }, + { + "epoch": 0.7041446298149244, + "grad_norm": 0.5811133448734164, + "learning_rate": 0.00015035342172237806, + "loss": 12.193, + "step": 12931 + }, + { + "epoch": 0.7041990838115075, + "grad_norm": 0.5146003924536093, + "learning_rate": 0.0001503458027695041, + "loss": 12.2794, + "step": 12932 + }, + { + "epoch": 0.7042535378080905, + "grad_norm": 0.6257737359358769, + "learning_rate": 0.00015033818342513272, + "loss": 12.2411, + "step": 12933 + }, + { + "epoch": 0.7043079918046735, + "grad_norm": 0.5742448206754013, + "learning_rate": 0.00015033056368932322, + "loss": 12.3275, + "step": 12934 + }, + { + "epoch": 0.7043624458012565, + "grad_norm": 0.5797591318656884, + "learning_rate": 0.00015032294356213477, + "loss": 12.1942, + "step": 12935 + }, + { + "epoch": 0.7044168997978395, + "grad_norm": 0.7451421687573521, + "learning_rate": 0.0001503153230436267, + "loss": 12.226, + "step": 12936 + }, + { + "epoch": 0.7044713537944225, + "grad_norm": 0.539841668324749, + "learning_rate": 0.00015030770213385824, + "loss": 12.272, + "step": 12937 + }, + { + "epoch": 0.7045258077910056, + "grad_norm": 0.5935585281767789, + "learning_rate": 0.00015030008083288864, + "loss": 12.3097, + "step": 12938 + }, + { + "epoch": 0.7045802617875886, + "grad_norm": 0.5511736584624088, + "learning_rate": 0.0001502924591407772, + "loss": 12.2108, + "step": 12939 + }, + { + "epoch": 0.7046347157841716, + "grad_norm": 0.5534412404344856, + "learning_rate": 0.00015028483705758312, + "loss": 12.2039, + "step": 12940 + }, + { + "epoch": 0.7046891697807546, + "grad_norm": 0.6563150821077004, + "learning_rate": 0.00015027721458336574, + "loss": 12.2303, + "step": 12941 + }, + { + "epoch": 0.7047436237773376, + "grad_norm": 0.6366833265609215, + "learning_rate": 0.00015026959171818432, + "loss": 12.4365, + "step": 12942 + }, + { + "epoch": 0.7047980777739207, + "grad_norm": 0.6358540136371424, + "learning_rate": 0.0001502619684620981, + "loss": 12.3262, + "step": 12943 + }, + { + "epoch": 0.7048525317705037, + "grad_norm": 0.6537474371778512, + "learning_rate": 0.00015025434481516638, + "loss": 12.3111, + "step": 12944 + }, + { + "epoch": 0.7049069857670867, + "grad_norm": 0.5743493906745762, + "learning_rate": 0.00015024672077744845, + "loss": 12.2707, + "step": 12945 + }, + { + "epoch": 0.7049614397636697, + "grad_norm": 0.5473693388988641, + "learning_rate": 0.00015023909634900363, + "loss": 12.2449, + "step": 12946 + }, + { + "epoch": 0.7050158937602526, + "grad_norm": 0.6103975608154453, + "learning_rate": 0.00015023147152989113, + "loss": 12.3103, + "step": 12947 + }, + { + "epoch": 0.7050703477568356, + "grad_norm": 0.5814312456830363, + "learning_rate": 0.00015022384632017033, + "loss": 12.2969, + "step": 12948 + }, + { + "epoch": 0.7051248017534187, + "grad_norm": 0.5968487505951352, + "learning_rate": 0.00015021622071990045, + "loss": 12.2258, + "step": 12949 + }, + { + "epoch": 0.7051792557500017, + "grad_norm": 0.5681675401207187, + "learning_rate": 0.00015020859472914078, + "loss": 12.083, + "step": 12950 + }, + { + "epoch": 0.7052337097465847, + "grad_norm": 0.5611788321327241, + "learning_rate": 0.0001502009683479507, + "loss": 12.2544, + "step": 12951 + }, + { + "epoch": 0.7052881637431677, + "grad_norm": 0.6028495282539408, + "learning_rate": 0.00015019334157638948, + "loss": 12.0151, + "step": 12952 + }, + { + "epoch": 0.7053426177397507, + "grad_norm": 0.6158821652859549, + "learning_rate": 0.00015018571441451642, + "loss": 12.3645, + "step": 12953 + }, + { + "epoch": 0.7053970717363337, + "grad_norm": 0.5943618216782155, + "learning_rate": 0.00015017808686239079, + "loss": 12.2499, + "step": 12954 + }, + { + "epoch": 0.7054515257329168, + "grad_norm": 0.5629309510246335, + "learning_rate": 0.00015017045892007195, + "loss": 12.3606, + "step": 12955 + }, + { + "epoch": 0.7055059797294998, + "grad_norm": 0.6314402368385341, + "learning_rate": 0.0001501628305876192, + "loss": 12.0944, + "step": 12956 + }, + { + "epoch": 0.7055604337260828, + "grad_norm": 0.6365115644400484, + "learning_rate": 0.00015015520186509193, + "loss": 12.3064, + "step": 12957 + }, + { + "epoch": 0.7056148877226658, + "grad_norm": 0.5965986296655351, + "learning_rate": 0.00015014757275254932, + "loss": 12.1364, + "step": 12958 + }, + { + "epoch": 0.7056693417192488, + "grad_norm": 0.5948973063017181, + "learning_rate": 0.00015013994325005085, + "loss": 12.3672, + "step": 12959 + }, + { + "epoch": 0.7057237957158318, + "grad_norm": 0.5884444949376633, + "learning_rate": 0.00015013231335765572, + "loss": 12.2143, + "step": 12960 + }, + { + "epoch": 0.7057782497124149, + "grad_norm": 0.6133597645062747, + "learning_rate": 0.0001501246830754233, + "loss": 12.3034, + "step": 12961 + }, + { + "epoch": 0.7058327037089979, + "grad_norm": 0.5898220414273352, + "learning_rate": 0.000150117052403413, + "loss": 12.2611, + "step": 12962 + }, + { + "epoch": 0.7058871577055809, + "grad_norm": 0.5961760638203288, + "learning_rate": 0.00015010942134168403, + "loss": 12.1966, + "step": 12963 + }, + { + "epoch": 0.7059416117021639, + "grad_norm": 0.5601188165322548, + "learning_rate": 0.00015010178989029584, + "loss": 12.3401, + "step": 12964 + }, + { + "epoch": 0.7059960656987468, + "grad_norm": 0.6761852903853308, + "learning_rate": 0.00015009415804930772, + "loss": 12.328, + "step": 12965 + }, + { + "epoch": 0.7060505196953298, + "grad_norm": 0.8251909509115992, + "learning_rate": 0.000150086525818779, + "loss": 12.3578, + "step": 12966 + }, + { + "epoch": 0.7061049736919129, + "grad_norm": 0.5408091436796602, + "learning_rate": 0.00015007889319876912, + "loss": 12.2561, + "step": 12967 + }, + { + "epoch": 0.7061594276884959, + "grad_norm": 0.5507950813247795, + "learning_rate": 0.00015007126018933733, + "loss": 12.1793, + "step": 12968 + }, + { + "epoch": 0.7062138816850789, + "grad_norm": 0.5232231954991038, + "learning_rate": 0.000150063626790543, + "loss": 12.2478, + "step": 12969 + }, + { + "epoch": 0.7062683356816619, + "grad_norm": 0.5860145048316517, + "learning_rate": 0.00015005599300244556, + "loss": 12.3502, + "step": 12970 + }, + { + "epoch": 0.7063227896782449, + "grad_norm": 0.5740070028883291, + "learning_rate": 0.0001500483588251043, + "loss": 12.201, + "step": 12971 + }, + { + "epoch": 0.706377243674828, + "grad_norm": 0.5277369607087119, + "learning_rate": 0.0001500407242585786, + "loss": 12.2771, + "step": 12972 + }, + { + "epoch": 0.706431697671411, + "grad_norm": 0.6978047578361452, + "learning_rate": 0.00015003308930292784, + "loss": 12.2717, + "step": 12973 + }, + { + "epoch": 0.706486151667994, + "grad_norm": 0.5385232740976329, + "learning_rate": 0.00015002545395821138, + "loss": 12.1565, + "step": 12974 + }, + { + "epoch": 0.706540605664577, + "grad_norm": 0.5768496412226605, + "learning_rate": 0.0001500178182244886, + "loss": 12.2259, + "step": 12975 + }, + { + "epoch": 0.70659505966116, + "grad_norm": 0.6189081619794492, + "learning_rate": 0.0001500101821018189, + "loss": 12.1882, + "step": 12976 + }, + { + "epoch": 0.706649513657743, + "grad_norm": 0.630691109976242, + "learning_rate": 0.00015000254559026163, + "loss": 12.1659, + "step": 12977 + }, + { + "epoch": 0.7067039676543261, + "grad_norm": 0.5674145907848038, + "learning_rate": 0.00014999490868987617, + "loss": 12.2894, + "step": 12978 + }, + { + "epoch": 0.7067584216509091, + "grad_norm": 0.5967081606199218, + "learning_rate": 0.00014998727140072194, + "loss": 12.3154, + "step": 12979 + }, + { + "epoch": 0.7068128756474921, + "grad_norm": 0.5814432648578499, + "learning_rate": 0.00014997963372285827, + "loss": 12.2798, + "step": 12980 + }, + { + "epoch": 0.7068673296440751, + "grad_norm": 0.7079190267375342, + "learning_rate": 0.0001499719956563446, + "loss": 12.2989, + "step": 12981 + }, + { + "epoch": 0.706921783640658, + "grad_norm": 0.5976573686600499, + "learning_rate": 0.0001499643572012403, + "loss": 12.3619, + "step": 12982 + }, + { + "epoch": 0.706976237637241, + "grad_norm": 0.5945798656008229, + "learning_rate": 0.00014995671835760482, + "loss": 12.2903, + "step": 12983 + }, + { + "epoch": 0.7070306916338241, + "grad_norm": 0.7134044262037385, + "learning_rate": 0.0001499490791254975, + "loss": 12.315, + "step": 12984 + }, + { + "epoch": 0.7070851456304071, + "grad_norm": 0.7298247499418934, + "learning_rate": 0.00014994143950497775, + "loss": 12.2258, + "step": 12985 + }, + { + "epoch": 0.7071395996269901, + "grad_norm": 0.5753519644815357, + "learning_rate": 0.000149933799496105, + "loss": 12.2707, + "step": 12986 + }, + { + "epoch": 0.7071940536235731, + "grad_norm": 0.5618107193737906, + "learning_rate": 0.00014992615909893865, + "loss": 12.2128, + "step": 12987 + }, + { + "epoch": 0.7072485076201561, + "grad_norm": 0.6589114753084558, + "learning_rate": 0.0001499185183135381, + "loss": 12.3593, + "step": 12988 + }, + { + "epoch": 0.7073029616167391, + "grad_norm": 0.6024408905521136, + "learning_rate": 0.0001499108771399628, + "loss": 12.1083, + "step": 12989 + }, + { + "epoch": 0.7073574156133222, + "grad_norm": 0.5998225459431465, + "learning_rate": 0.00014990323557827214, + "loss": 12.2451, + "step": 12990 + }, + { + "epoch": 0.7074118696099052, + "grad_norm": 0.6596737008732433, + "learning_rate": 0.00014989559362852555, + "loss": 12.352, + "step": 12991 + }, + { + "epoch": 0.7074663236064882, + "grad_norm": 0.5640430440547669, + "learning_rate": 0.0001498879512907825, + "loss": 12.2266, + "step": 12992 + }, + { + "epoch": 0.7075207776030712, + "grad_norm": 0.58905805846539, + "learning_rate": 0.0001498803085651023, + "loss": 12.2615, + "step": 12993 + }, + { + "epoch": 0.7075752315996542, + "grad_norm": 0.5459263943702819, + "learning_rate": 0.0001498726654515445, + "loss": 12.2659, + "step": 12994 + }, + { + "epoch": 0.7076296855962372, + "grad_norm": 0.64321390044219, + "learning_rate": 0.0001498650219501685, + "loss": 12.3126, + "step": 12995 + }, + { + "epoch": 0.7076841395928203, + "grad_norm": 0.6119570222522704, + "learning_rate": 0.00014985737806103372, + "loss": 12.3096, + "step": 12996 + }, + { + "epoch": 0.7077385935894033, + "grad_norm": 0.594510800282877, + "learning_rate": 0.0001498497337841996, + "loss": 12.3027, + "step": 12997 + }, + { + "epoch": 0.7077930475859863, + "grad_norm": 0.6117166303249918, + "learning_rate": 0.0001498420891197256, + "loss": 12.0491, + "step": 12998 + }, + { + "epoch": 0.7078475015825693, + "grad_norm": 0.5886463053915209, + "learning_rate": 0.00014983444406767112, + "loss": 12.3004, + "step": 12999 + }, + { + "epoch": 0.7079019555791523, + "grad_norm": 0.5637629847214728, + "learning_rate": 0.00014982679862809566, + "loss": 12.2073, + "step": 13000 + }, + { + "epoch": 0.7079564095757352, + "grad_norm": 0.8458027872746456, + "learning_rate": 0.00014981915280105868, + "loss": 12.3988, + "step": 13001 + }, + { + "epoch": 0.7080108635723183, + "grad_norm": 0.6073435712822226, + "learning_rate": 0.0001498115065866196, + "loss": 12.2968, + "step": 13002 + }, + { + "epoch": 0.7080653175689013, + "grad_norm": 0.574713441835352, + "learning_rate": 0.0001498038599848379, + "loss": 12.3245, + "step": 13003 + }, + { + "epoch": 0.7081197715654843, + "grad_norm": 0.5738326787390379, + "learning_rate": 0.000149796212995773, + "loss": 12.0705, + "step": 13004 + }, + { + "epoch": 0.7081742255620673, + "grad_norm": 0.6681791939973973, + "learning_rate": 0.00014978856561948442, + "loss": 12.3879, + "step": 13005 + }, + { + "epoch": 0.7082286795586503, + "grad_norm": 0.5903997054499869, + "learning_rate": 0.00014978091785603162, + "loss": 12.3157, + "step": 13006 + }, + { + "epoch": 0.7082831335552334, + "grad_norm": 0.6755493009094661, + "learning_rate": 0.00014977326970547406, + "loss": 12.3751, + "step": 13007 + }, + { + "epoch": 0.7083375875518164, + "grad_norm": 0.6520915349052323, + "learning_rate": 0.00014976562116787115, + "loss": 12.2351, + "step": 13008 + }, + { + "epoch": 0.7083920415483994, + "grad_norm": 0.6333760318312311, + "learning_rate": 0.00014975797224328247, + "loss": 12.1828, + "step": 13009 + }, + { + "epoch": 0.7084464955449824, + "grad_norm": 0.6331195754530802, + "learning_rate": 0.00014975032293176744, + "loss": 12.3569, + "step": 13010 + }, + { + "epoch": 0.7085009495415654, + "grad_norm": 0.6345875524219766, + "learning_rate": 0.00014974267323338556, + "loss": 12.3998, + "step": 13011 + }, + { + "epoch": 0.7085554035381484, + "grad_norm": 0.580569784650918, + "learning_rate": 0.00014973502314819635, + "loss": 12.2017, + "step": 13012 + }, + { + "epoch": 0.7086098575347315, + "grad_norm": 0.5794457087183509, + "learning_rate": 0.0001497273726762592, + "loss": 12.2481, + "step": 13013 + }, + { + "epoch": 0.7086643115313145, + "grad_norm": 0.566031642380644, + "learning_rate": 0.0001497197218176337, + "loss": 12.17, + "step": 13014 + }, + { + "epoch": 0.7087187655278975, + "grad_norm": 0.5701972777924958, + "learning_rate": 0.00014971207057237927, + "loss": 12.2852, + "step": 13015 + }, + { + "epoch": 0.7087732195244805, + "grad_norm": 0.6256922142544816, + "learning_rate": 0.00014970441894055544, + "loss": 12.3115, + "step": 13016 + }, + { + "epoch": 0.7088276735210635, + "grad_norm": 0.6354118579123414, + "learning_rate": 0.00014969676692222174, + "loss": 12.3488, + "step": 13017 + }, + { + "epoch": 0.7088821275176465, + "grad_norm": 0.6177833774317564, + "learning_rate": 0.00014968911451743766, + "loss": 12.3751, + "step": 13018 + }, + { + "epoch": 0.7089365815142296, + "grad_norm": 0.6051671506499325, + "learning_rate": 0.00014968146172626264, + "loss": 12.1438, + "step": 13019 + }, + { + "epoch": 0.7089910355108126, + "grad_norm": 0.5834906437141846, + "learning_rate": 0.0001496738085487563, + "loss": 12.2348, + "step": 13020 + }, + { + "epoch": 0.7090454895073955, + "grad_norm": 0.579844554082125, + "learning_rate": 0.00014966615498497804, + "loss": 12.297, + "step": 13021 + }, + { + "epoch": 0.7090999435039785, + "grad_norm": 0.5442590529551725, + "learning_rate": 0.00014965850103498747, + "loss": 12.2521, + "step": 13022 + }, + { + "epoch": 0.7091543975005615, + "grad_norm": 0.5210616076593362, + "learning_rate": 0.00014965084669884407, + "loss": 11.9946, + "step": 13023 + }, + { + "epoch": 0.7092088514971445, + "grad_norm": 0.6417333566925977, + "learning_rate": 0.00014964319197660736, + "loss": 12.4075, + "step": 13024 + }, + { + "epoch": 0.7092633054937276, + "grad_norm": 0.5371747489792881, + "learning_rate": 0.00014963553686833683, + "loss": 12.1904, + "step": 13025 + }, + { + "epoch": 0.7093177594903106, + "grad_norm": 0.5548915066555968, + "learning_rate": 0.00014962788137409206, + "loss": 12.1782, + "step": 13026 + }, + { + "epoch": 0.7093722134868936, + "grad_norm": 0.5334034909808053, + "learning_rate": 0.00014962022549393255, + "loss": 12.1223, + "step": 13027 + }, + { + "epoch": 0.7094266674834766, + "grad_norm": 0.5738456004784469, + "learning_rate": 0.00014961256922791787, + "loss": 12.1724, + "step": 13028 + }, + { + "epoch": 0.7094811214800596, + "grad_norm": 0.6147623973023848, + "learning_rate": 0.00014960491257610752, + "loss": 12.3882, + "step": 13029 + }, + { + "epoch": 0.7095355754766426, + "grad_norm": 0.5406251727724115, + "learning_rate": 0.00014959725553856105, + "loss": 12.1801, + "step": 13030 + }, + { + "epoch": 0.7095900294732257, + "grad_norm": 0.5821260739638942, + "learning_rate": 0.000149589598115338, + "loss": 12.2504, + "step": 13031 + }, + { + "epoch": 0.7096444834698087, + "grad_norm": 0.555126981255046, + "learning_rate": 0.00014958194030649792, + "loss": 12.1343, + "step": 13032 + }, + { + "epoch": 0.7096989374663917, + "grad_norm": 0.5697013295420689, + "learning_rate": 0.00014957428211210036, + "loss": 12.3033, + "step": 13033 + }, + { + "epoch": 0.7097533914629747, + "grad_norm": 0.5492315961312766, + "learning_rate": 0.00014956662353220488, + "loss": 12.3455, + "step": 13034 + }, + { + "epoch": 0.7098078454595577, + "grad_norm": 0.5918307712403531, + "learning_rate": 0.00014955896456687103, + "loss": 12.2083, + "step": 13035 + }, + { + "epoch": 0.7098622994561407, + "grad_norm": 0.6530977518570804, + "learning_rate": 0.00014955130521615833, + "loss": 12.2589, + "step": 13036 + }, + { + "epoch": 0.7099167534527238, + "grad_norm": 0.6120186751995188, + "learning_rate": 0.00014954364548012638, + "loss": 12.2314, + "step": 13037 + }, + { + "epoch": 0.7099712074493068, + "grad_norm": 0.5867757951906781, + "learning_rate": 0.00014953598535883474, + "loss": 12.3824, + "step": 13038 + }, + { + "epoch": 0.7100256614458897, + "grad_norm": 0.6451329228108325, + "learning_rate": 0.00014952832485234295, + "loss": 12.3503, + "step": 13039 + }, + { + "epoch": 0.7100801154424727, + "grad_norm": 0.6861941361354333, + "learning_rate": 0.00014952066396071062, + "loss": 12.3653, + "step": 13040 + }, + { + "epoch": 0.7101345694390557, + "grad_norm": 0.6513061150663569, + "learning_rate": 0.0001495130026839973, + "loss": 12.2643, + "step": 13041 + }, + { + "epoch": 0.7101890234356388, + "grad_norm": 0.6295887315128024, + "learning_rate": 0.0001495053410222626, + "loss": 12.1674, + "step": 13042 + }, + { + "epoch": 0.7102434774322218, + "grad_norm": 0.6510454621330697, + "learning_rate": 0.00014949767897556602, + "loss": 12.2018, + "step": 13043 + }, + { + "epoch": 0.7102979314288048, + "grad_norm": 0.6875565394105327, + "learning_rate": 0.00014949001654396719, + "loss": 12.3256, + "step": 13044 + }, + { + "epoch": 0.7103523854253878, + "grad_norm": 0.5398634943151209, + "learning_rate": 0.0001494823537275257, + "loss": 12.2262, + "step": 13045 + }, + { + "epoch": 0.7104068394219708, + "grad_norm": 0.543330019868829, + "learning_rate": 0.00014947469052630115, + "loss": 12.2737, + "step": 13046 + }, + { + "epoch": 0.7104612934185538, + "grad_norm": 0.6116969575632312, + "learning_rate": 0.0001494670269403531, + "loss": 12.2623, + "step": 13047 + }, + { + "epoch": 0.7105157474151369, + "grad_norm": 0.577409531377027, + "learning_rate": 0.00014945936296974113, + "loss": 12.3066, + "step": 13048 + }, + { + "epoch": 0.7105702014117199, + "grad_norm": 0.6337211061727935, + "learning_rate": 0.0001494516986145249, + "loss": 12.2312, + "step": 13049 + }, + { + "epoch": 0.7106246554083029, + "grad_norm": 0.6188852759508668, + "learning_rate": 0.00014944403387476393, + "loss": 12.1674, + "step": 13050 + }, + { + "epoch": 0.7106791094048859, + "grad_norm": 0.6080463199956461, + "learning_rate": 0.00014943636875051788, + "loss": 12.2713, + "step": 13051 + }, + { + "epoch": 0.7107335634014689, + "grad_norm": 0.5719060777051059, + "learning_rate": 0.00014942870324184633, + "loss": 12.248, + "step": 13052 + }, + { + "epoch": 0.7107880173980519, + "grad_norm": 0.5589157114552834, + "learning_rate": 0.0001494210373488089, + "loss": 12.2863, + "step": 13053 + }, + { + "epoch": 0.710842471394635, + "grad_norm": 0.6891559546184134, + "learning_rate": 0.00014941337107146518, + "loss": 12.2425, + "step": 13054 + }, + { + "epoch": 0.710896925391218, + "grad_norm": 0.5982489360119315, + "learning_rate": 0.0001494057044098748, + "loss": 12.2747, + "step": 13055 + }, + { + "epoch": 0.710951379387801, + "grad_norm": 0.6009505560195769, + "learning_rate": 0.00014939803736409738, + "loss": 12.2842, + "step": 13056 + }, + { + "epoch": 0.711005833384384, + "grad_norm": 0.6199360893918832, + "learning_rate": 0.00014939036993419255, + "loss": 12.2094, + "step": 13057 + }, + { + "epoch": 0.7110602873809669, + "grad_norm": 0.6178935479060033, + "learning_rate": 0.00014938270212021994, + "loss": 12.323, + "step": 13058 + }, + { + "epoch": 0.7111147413775499, + "grad_norm": 0.6180419467020031, + "learning_rate": 0.0001493750339222391, + "loss": 12.4192, + "step": 13059 + }, + { + "epoch": 0.711169195374133, + "grad_norm": 0.61773533913744, + "learning_rate": 0.00014936736534030978, + "loss": 12.3702, + "step": 13060 + }, + { + "epoch": 0.711223649370716, + "grad_norm": 0.6224002643765801, + "learning_rate": 0.0001493596963744915, + "loss": 12.3371, + "step": 13061 + }, + { + "epoch": 0.711278103367299, + "grad_norm": 0.6259537112445535, + "learning_rate": 0.00014935202702484395, + "loss": 12.3679, + "step": 13062 + }, + { + "epoch": 0.711332557363882, + "grad_norm": 0.5328991389879344, + "learning_rate": 0.00014934435729142682, + "loss": 12.269, + "step": 13063 + }, + { + "epoch": 0.711387011360465, + "grad_norm": 0.5898400658269457, + "learning_rate": 0.0001493366871742996, + "loss": 12.4181, + "step": 13064 + }, + { + "epoch": 0.711441465357048, + "grad_norm": 0.5628544206358188, + "learning_rate": 0.0001493290166735221, + "loss": 12.3083, + "step": 13065 + }, + { + "epoch": 0.7114959193536311, + "grad_norm": 0.6712292466408655, + "learning_rate": 0.00014932134578915385, + "loss": 12.2852, + "step": 13066 + }, + { + "epoch": 0.7115503733502141, + "grad_norm": 0.5481225982918251, + "learning_rate": 0.00014931367452125456, + "loss": 12.0959, + "step": 13067 + }, + { + "epoch": 0.7116048273467971, + "grad_norm": 0.6030042765006167, + "learning_rate": 0.00014930600286988387, + "loss": 12.2808, + "step": 13068 + }, + { + "epoch": 0.7116592813433801, + "grad_norm": 0.6391940261618729, + "learning_rate": 0.00014929833083510144, + "loss": 12.4102, + "step": 13069 + }, + { + "epoch": 0.7117137353399631, + "grad_norm": 0.5354168607988496, + "learning_rate": 0.00014929065841696686, + "loss": 12.2758, + "step": 13070 + }, + { + "epoch": 0.7117681893365461, + "grad_norm": 0.596410067524563, + "learning_rate": 0.00014928298561553996, + "loss": 12.3488, + "step": 13071 + }, + { + "epoch": 0.7118226433331292, + "grad_norm": 0.5435545899850704, + "learning_rate": 0.00014927531243088022, + "loss": 12.0993, + "step": 13072 + }, + { + "epoch": 0.7118770973297122, + "grad_norm": 0.5464907779246453, + "learning_rate": 0.00014926763886304744, + "loss": 12.2565, + "step": 13073 + }, + { + "epoch": 0.7119315513262952, + "grad_norm": 0.5507232020598494, + "learning_rate": 0.0001492599649121012, + "loss": 12.2084, + "step": 13074 + }, + { + "epoch": 0.7119860053228781, + "grad_norm": 0.576496083537236, + "learning_rate": 0.0001492522905781012, + "loss": 12.4546, + "step": 13075 + }, + { + "epoch": 0.7120404593194611, + "grad_norm": 0.5174273315209944, + "learning_rate": 0.00014924461586110716, + "loss": 12.2471, + "step": 13076 + }, + { + "epoch": 0.7120949133160442, + "grad_norm": 0.5563814598625784, + "learning_rate": 0.00014923694076117872, + "loss": 12.1744, + "step": 13077 + }, + { + "epoch": 0.7121493673126272, + "grad_norm": 0.5938209262601601, + "learning_rate": 0.00014922926527837556, + "loss": 12.4136, + "step": 13078 + }, + { + "epoch": 0.7122038213092102, + "grad_norm": 0.6440486117840959, + "learning_rate": 0.00014922158941275742, + "loss": 12.2879, + "step": 13079 + }, + { + "epoch": 0.7122582753057932, + "grad_norm": 0.5424085110763885, + "learning_rate": 0.0001492139131643839, + "loss": 12.2847, + "step": 13080 + }, + { + "epoch": 0.7123127293023762, + "grad_norm": 0.6288430281756898, + "learning_rate": 0.00014920623653331473, + "loss": 12.2262, + "step": 13081 + }, + { + "epoch": 0.7123671832989592, + "grad_norm": 0.55112138539526, + "learning_rate": 0.00014919855951960964, + "loss": 12.193, + "step": 13082 + }, + { + "epoch": 0.7124216372955423, + "grad_norm": 0.6103232989314326, + "learning_rate": 0.00014919088212332832, + "loss": 12.3006, + "step": 13083 + }, + { + "epoch": 0.7124760912921253, + "grad_norm": 0.585852252955573, + "learning_rate": 0.00014918320434453044, + "loss": 12.2011, + "step": 13084 + }, + { + "epoch": 0.7125305452887083, + "grad_norm": 0.6023320204235458, + "learning_rate": 0.00014917552618327567, + "loss": 12.183, + "step": 13085 + }, + { + "epoch": 0.7125849992852913, + "grad_norm": 0.637684131987905, + "learning_rate": 0.00014916784763962382, + "loss": 12.1952, + "step": 13086 + }, + { + "epoch": 0.7126394532818743, + "grad_norm": 0.5635832480007723, + "learning_rate": 0.0001491601687136345, + "loss": 12.2105, + "step": 13087 + }, + { + "epoch": 0.7126939072784573, + "grad_norm": 0.5277875554433332, + "learning_rate": 0.00014915248940536747, + "loss": 12.2859, + "step": 13088 + }, + { + "epoch": 0.7127483612750404, + "grad_norm": 0.7686941264622156, + "learning_rate": 0.00014914480971488247, + "loss": 12.3042, + "step": 13089 + }, + { + "epoch": 0.7128028152716234, + "grad_norm": 0.5777273873310947, + "learning_rate": 0.00014913712964223917, + "loss": 12.3386, + "step": 13090 + }, + { + "epoch": 0.7128572692682064, + "grad_norm": 0.5662398880449121, + "learning_rate": 0.0001491294491874973, + "loss": 12.0751, + "step": 13091 + }, + { + "epoch": 0.7129117232647894, + "grad_norm": 0.5694475424596657, + "learning_rate": 0.00014912176835071657, + "loss": 12.2462, + "step": 13092 + }, + { + "epoch": 0.7129661772613723, + "grad_norm": 0.6194296392446067, + "learning_rate": 0.00014911408713195678, + "loss": 12.3051, + "step": 13093 + }, + { + "epoch": 0.7130206312579553, + "grad_norm": 0.5929804883936958, + "learning_rate": 0.0001491064055312776, + "loss": 11.9636, + "step": 13094 + }, + { + "epoch": 0.7130750852545384, + "grad_norm": 0.577121793325591, + "learning_rate": 0.00014909872354873876, + "loss": 12.1692, + "step": 13095 + }, + { + "epoch": 0.7131295392511214, + "grad_norm": 0.5399081423465084, + "learning_rate": 0.00014909104118440003, + "loss": 12.1337, + "step": 13096 + }, + { + "epoch": 0.7131839932477044, + "grad_norm": 0.6129261066771551, + "learning_rate": 0.00014908335843832112, + "loss": 12.2635, + "step": 13097 + }, + { + "epoch": 0.7132384472442874, + "grad_norm": 0.5557186330224296, + "learning_rate": 0.0001490756753105618, + "loss": 12.152, + "step": 13098 + }, + { + "epoch": 0.7132929012408704, + "grad_norm": 0.6522389101871415, + "learning_rate": 0.00014906799180118178, + "loss": 12.222, + "step": 13099 + }, + { + "epoch": 0.7133473552374534, + "grad_norm": 0.6201291721778343, + "learning_rate": 0.00014906030791024083, + "loss": 12.4092, + "step": 13100 + }, + { + "epoch": 0.7134018092340365, + "grad_norm": 0.5769656708175186, + "learning_rate": 0.0001490526236377987, + "loss": 12.1597, + "step": 13101 + }, + { + "epoch": 0.7134562632306195, + "grad_norm": 0.6034623352492604, + "learning_rate": 0.00014904493898391515, + "loss": 12.2657, + "step": 13102 + }, + { + "epoch": 0.7135107172272025, + "grad_norm": 0.7048369446631927, + "learning_rate": 0.00014903725394864993, + "loss": 12.3768, + "step": 13103 + }, + { + "epoch": 0.7135651712237855, + "grad_norm": 0.5917872807045255, + "learning_rate": 0.00014902956853206275, + "loss": 12.1509, + "step": 13104 + }, + { + "epoch": 0.7136196252203685, + "grad_norm": 0.6228402490002504, + "learning_rate": 0.0001490218827342135, + "loss": 12.1816, + "step": 13105 + }, + { + "epoch": 0.7136740792169516, + "grad_norm": 0.6237825639120929, + "learning_rate": 0.00014901419655516182, + "loss": 12.2323, + "step": 13106 + }, + { + "epoch": 0.7137285332135346, + "grad_norm": 0.6437731143746762, + "learning_rate": 0.00014900650999496754, + "loss": 12.1785, + "step": 13107 + }, + { + "epoch": 0.7137829872101176, + "grad_norm": 0.5712009786525403, + "learning_rate": 0.00014899882305369043, + "loss": 12.2221, + "step": 13108 + }, + { + "epoch": 0.7138374412067006, + "grad_norm": 0.5935683808044501, + "learning_rate": 0.00014899113573139022, + "loss": 12.2265, + "step": 13109 + }, + { + "epoch": 0.7138918952032836, + "grad_norm": 0.6016139400333811, + "learning_rate": 0.00014898344802812677, + "loss": 12.2309, + "step": 13110 + }, + { + "epoch": 0.7139463491998665, + "grad_norm": 0.5515894574044553, + "learning_rate": 0.0001489757599439598, + "loss": 12.2961, + "step": 13111 + }, + { + "epoch": 0.7140008031964497, + "grad_norm": 0.6157520966673266, + "learning_rate": 0.0001489680714789491, + "loss": 12.3123, + "step": 13112 + }, + { + "epoch": 0.7140552571930326, + "grad_norm": 0.6139323540701279, + "learning_rate": 0.00014896038263315445, + "loss": 12.1918, + "step": 13113 + }, + { + "epoch": 0.7141097111896156, + "grad_norm": 0.6524316381419333, + "learning_rate": 0.00014895269340663568, + "loss": 12.3856, + "step": 13114 + }, + { + "epoch": 0.7141641651861986, + "grad_norm": 0.564740400972851, + "learning_rate": 0.00014894500379945252, + "loss": 12.401, + "step": 13115 + }, + { + "epoch": 0.7142186191827816, + "grad_norm": 0.621305851648461, + "learning_rate": 0.00014893731381166486, + "loss": 12.2224, + "step": 13116 + }, + { + "epoch": 0.7142730731793646, + "grad_norm": 0.6056844793845901, + "learning_rate": 0.0001489296234433324, + "loss": 12.2252, + "step": 13117 + }, + { + "epoch": 0.7143275271759477, + "grad_norm": 0.565175529177596, + "learning_rate": 0.000148921932694515, + "loss": 12.2674, + "step": 13118 + }, + { + "epoch": 0.7143819811725307, + "grad_norm": 0.6050541903974955, + "learning_rate": 0.00014891424156527241, + "loss": 12.3528, + "step": 13119 + }, + { + "epoch": 0.7144364351691137, + "grad_norm": 0.5742183932203648, + "learning_rate": 0.0001489065500556645, + "loss": 12.2321, + "step": 13120 + }, + { + "epoch": 0.7144908891656967, + "grad_norm": 0.6274447021592812, + "learning_rate": 0.00014889885816575106, + "loss": 12.2845, + "step": 13121 + }, + { + "epoch": 0.7145453431622797, + "grad_norm": 0.6539372288535807, + "learning_rate": 0.00014889116589559192, + "loss": 12.2235, + "step": 13122 + }, + { + "epoch": 0.7145997971588627, + "grad_norm": 0.5398132389690338, + "learning_rate": 0.00014888347324524685, + "loss": 12.1618, + "step": 13123 + }, + { + "epoch": 0.7146542511554458, + "grad_norm": 0.6012845841839606, + "learning_rate": 0.0001488757802147757, + "loss": 12.4002, + "step": 13124 + }, + { + "epoch": 0.7147087051520288, + "grad_norm": 0.6481016820248655, + "learning_rate": 0.00014886808680423825, + "loss": 12.3224, + "step": 13125 + }, + { + "epoch": 0.7147631591486118, + "grad_norm": 0.5473743337050713, + "learning_rate": 0.0001488603930136944, + "loss": 12.2754, + "step": 13126 + }, + { + "epoch": 0.7148176131451948, + "grad_norm": 0.5585932175414414, + "learning_rate": 0.00014885269884320394, + "loss": 12.3176, + "step": 13127 + }, + { + "epoch": 0.7148720671417778, + "grad_norm": 0.6098644156874881, + "learning_rate": 0.00014884500429282672, + "loss": 12.3161, + "step": 13128 + }, + { + "epoch": 0.7149265211383607, + "grad_norm": 0.5900891649967697, + "learning_rate": 0.0001488373093626225, + "loss": 12.3236, + "step": 13129 + }, + { + "epoch": 0.7149809751349439, + "grad_norm": 0.52526226559206, + "learning_rate": 0.0001488296140526512, + "loss": 12.2481, + "step": 13130 + }, + { + "epoch": 0.7150354291315268, + "grad_norm": 0.6211944112719395, + "learning_rate": 0.00014882191836297263, + "loss": 12.355, + "step": 13131 + }, + { + "epoch": 0.7150898831281098, + "grad_norm": 0.6088171060766341, + "learning_rate": 0.00014881422229364667, + "loss": 12.2572, + "step": 13132 + }, + { + "epoch": 0.7151443371246928, + "grad_norm": 0.5867369564056631, + "learning_rate": 0.0001488065258447331, + "loss": 12.366, + "step": 13133 + }, + { + "epoch": 0.7151987911212758, + "grad_norm": 0.6565966460145465, + "learning_rate": 0.00014879882901629182, + "loss": 12.0463, + "step": 13134 + }, + { + "epoch": 0.7152532451178588, + "grad_norm": 0.8062366477423585, + "learning_rate": 0.00014879113180838263, + "loss": 12.2164, + "step": 13135 + }, + { + "epoch": 0.7153076991144419, + "grad_norm": 0.8015097363835164, + "learning_rate": 0.0001487834342210654, + "loss": 12.4014, + "step": 13136 + }, + { + "epoch": 0.7153621531110249, + "grad_norm": 0.5663030303378246, + "learning_rate": 0.00014877573625440006, + "loss": 12.3439, + "step": 13137 + }, + { + "epoch": 0.7154166071076079, + "grad_norm": 0.6414869394241983, + "learning_rate": 0.0001487680379084464, + "loss": 12.3671, + "step": 13138 + }, + { + "epoch": 0.7154710611041909, + "grad_norm": 0.599459862838817, + "learning_rate": 0.0001487603391832643, + "loss": 12.2809, + "step": 13139 + }, + { + "epoch": 0.7155255151007739, + "grad_norm": 0.6084045336182192, + "learning_rate": 0.0001487526400789136, + "loss": 12.366, + "step": 13140 + }, + { + "epoch": 0.715579969097357, + "grad_norm": 0.5717786606274167, + "learning_rate": 0.0001487449405954542, + "loss": 12.1181, + "step": 13141 + }, + { + "epoch": 0.71563442309394, + "grad_norm": 0.5944221913914374, + "learning_rate": 0.00014873724073294597, + "loss": 12.3322, + "step": 13142 + }, + { + "epoch": 0.715688877090523, + "grad_norm": 0.6191905531252168, + "learning_rate": 0.0001487295404914488, + "loss": 12.1768, + "step": 13143 + }, + { + "epoch": 0.715743331087106, + "grad_norm": 0.5948946545639103, + "learning_rate": 0.00014872183987102254, + "loss": 12.138, + "step": 13144 + }, + { + "epoch": 0.715797785083689, + "grad_norm": 0.5729785875130822, + "learning_rate": 0.0001487141388717271, + "loss": 12.2828, + "step": 13145 + }, + { + "epoch": 0.715852239080272, + "grad_norm": 0.5673965410686297, + "learning_rate": 0.00014870643749362233, + "loss": 12.33, + "step": 13146 + }, + { + "epoch": 0.7159066930768551, + "grad_norm": 0.5663436986398205, + "learning_rate": 0.00014869873573676812, + "loss": 12.3317, + "step": 13147 + }, + { + "epoch": 0.715961147073438, + "grad_norm": 0.5832471979785514, + "learning_rate": 0.00014869103360122437, + "loss": 12.2599, + "step": 13148 + }, + { + "epoch": 0.716015601070021, + "grad_norm": 0.6042795355125038, + "learning_rate": 0.00014868333108705102, + "loss": 12.3361, + "step": 13149 + }, + { + "epoch": 0.716070055066604, + "grad_norm": 0.5654725103208902, + "learning_rate": 0.0001486756281943079, + "loss": 12.1197, + "step": 13150 + }, + { + "epoch": 0.716124509063187, + "grad_norm": 0.5867311364311525, + "learning_rate": 0.00014866792492305493, + "loss": 12.2859, + "step": 13151 + }, + { + "epoch": 0.71617896305977, + "grad_norm": 0.6016682905288424, + "learning_rate": 0.00014866022127335202, + "loss": 12.2406, + "step": 13152 + }, + { + "epoch": 0.7162334170563531, + "grad_norm": 0.6115550769668805, + "learning_rate": 0.00014865251724525906, + "loss": 12.2205, + "step": 13153 + }, + { + "epoch": 0.7162878710529361, + "grad_norm": 0.5630660440481695, + "learning_rate": 0.00014864481283883598, + "loss": 12.1391, + "step": 13154 + }, + { + "epoch": 0.7163423250495191, + "grad_norm": 0.5638294256497566, + "learning_rate": 0.00014863710805414267, + "loss": 12.3224, + "step": 13155 + }, + { + "epoch": 0.7163967790461021, + "grad_norm": 0.560701108490784, + "learning_rate": 0.00014862940289123904, + "loss": 12.1735, + "step": 13156 + }, + { + "epoch": 0.7164512330426851, + "grad_norm": 0.541192260738824, + "learning_rate": 0.00014862169735018504, + "loss": 12.1591, + "step": 13157 + }, + { + "epoch": 0.7165056870392681, + "grad_norm": 0.6230640920391687, + "learning_rate": 0.00014861399143104053, + "loss": 12.2217, + "step": 13158 + }, + { + "epoch": 0.7165601410358512, + "grad_norm": 0.5660894693307399, + "learning_rate": 0.0001486062851338655, + "loss": 12.2232, + "step": 13159 + }, + { + "epoch": 0.7166145950324342, + "grad_norm": 0.5447717408784877, + "learning_rate": 0.00014859857845871984, + "loss": 12.3504, + "step": 13160 + }, + { + "epoch": 0.7166690490290172, + "grad_norm": 0.5989972551874758, + "learning_rate": 0.00014859087140566351, + "loss": 12.3824, + "step": 13161 + }, + { + "epoch": 0.7167235030256002, + "grad_norm": 0.6462447002724582, + "learning_rate": 0.0001485831639747564, + "loss": 12.3014, + "step": 13162 + }, + { + "epoch": 0.7167779570221832, + "grad_norm": 0.5806368639453152, + "learning_rate": 0.00014857545616605842, + "loss": 12.2635, + "step": 13163 + }, + { + "epoch": 0.7168324110187662, + "grad_norm": 0.5597040487622332, + "learning_rate": 0.00014856774797962957, + "loss": 12.1284, + "step": 13164 + }, + { + "epoch": 0.7168868650153493, + "grad_norm": 0.6358123005292217, + "learning_rate": 0.0001485600394155298, + "loss": 12.3169, + "step": 13165 + }, + { + "epoch": 0.7169413190119323, + "grad_norm": 0.6256827556331831, + "learning_rate": 0.00014855233047381897, + "loss": 12.2201, + "step": 13166 + }, + { + "epoch": 0.7169957730085152, + "grad_norm": 0.6116325432530314, + "learning_rate": 0.00014854462115455712, + "loss": 12.3764, + "step": 13167 + }, + { + "epoch": 0.7170502270050982, + "grad_norm": 0.5973785607260411, + "learning_rate": 0.00014853691145780412, + "loss": 12.3064, + "step": 13168 + }, + { + "epoch": 0.7171046810016812, + "grad_norm": 0.6229178221685887, + "learning_rate": 0.00014852920138361996, + "loss": 12.2634, + "step": 13169 + }, + { + "epoch": 0.7171591349982642, + "grad_norm": 0.5018641979222833, + "learning_rate": 0.00014852149093206462, + "loss": 12.227, + "step": 13170 + }, + { + "epoch": 0.7172135889948473, + "grad_norm": 0.6245439842759595, + "learning_rate": 0.00014851378010319801, + "loss": 12.1027, + "step": 13171 + }, + { + "epoch": 0.7172680429914303, + "grad_norm": 0.5944257970056329, + "learning_rate": 0.00014850606889708012, + "loss": 12.3463, + "step": 13172 + }, + { + "epoch": 0.7173224969880133, + "grad_norm": 0.5401040438062852, + "learning_rate": 0.00014849835731377088, + "loss": 12.2422, + "step": 13173 + }, + { + "epoch": 0.7173769509845963, + "grad_norm": 0.5836619995670721, + "learning_rate": 0.0001484906453533303, + "loss": 12.2505, + "step": 13174 + }, + { + "epoch": 0.7174314049811793, + "grad_norm": 0.6015006881115633, + "learning_rate": 0.00014848293301581835, + "loss": 12.195, + "step": 13175 + }, + { + "epoch": 0.7174858589777624, + "grad_norm": 0.6136890602982735, + "learning_rate": 0.00014847522030129494, + "loss": 12.3676, + "step": 13176 + }, + { + "epoch": 0.7175403129743454, + "grad_norm": 0.622763211404351, + "learning_rate": 0.0001484675072098201, + "loss": 12.3623, + "step": 13177 + }, + { + "epoch": 0.7175947669709284, + "grad_norm": 0.5902021660932876, + "learning_rate": 0.00014845979374145384, + "loss": 12.3369, + "step": 13178 + }, + { + "epoch": 0.7176492209675114, + "grad_norm": 0.6462075351516074, + "learning_rate": 0.00014845207989625604, + "loss": 12.3864, + "step": 13179 + }, + { + "epoch": 0.7177036749640944, + "grad_norm": 0.5608681041428895, + "learning_rate": 0.00014844436567428674, + "loss": 12.0696, + "step": 13180 + }, + { + "epoch": 0.7177581289606774, + "grad_norm": 0.6024320792955538, + "learning_rate": 0.00014843665107560597, + "loss": 12.419, + "step": 13181 + }, + { + "epoch": 0.7178125829572605, + "grad_norm": 0.5527658114202388, + "learning_rate": 0.00014842893610027367, + "loss": 12.3359, + "step": 13182 + }, + { + "epoch": 0.7178670369538435, + "grad_norm": 0.585011551195655, + "learning_rate": 0.00014842122074834984, + "loss": 12.2481, + "step": 13183 + }, + { + "epoch": 0.7179214909504265, + "grad_norm": 0.6151587175440556, + "learning_rate": 0.00014841350501989446, + "loss": 12.2827, + "step": 13184 + }, + { + "epoch": 0.7179759449470094, + "grad_norm": 0.5769188019599946, + "learning_rate": 0.00014840578891496753, + "loss": 12.2817, + "step": 13185 + }, + { + "epoch": 0.7180303989435924, + "grad_norm": 0.621828495887503, + "learning_rate": 0.00014839807243362908, + "loss": 12.2329, + "step": 13186 + }, + { + "epoch": 0.7180848529401754, + "grad_norm": 0.5963198844843429, + "learning_rate": 0.00014839035557593912, + "loss": 12.3039, + "step": 13187 + }, + { + "epoch": 0.7181393069367585, + "grad_norm": 0.5708228204132709, + "learning_rate": 0.00014838263834195765, + "loss": 12.3559, + "step": 13188 + }, + { + "epoch": 0.7181937609333415, + "grad_norm": 0.5727996839170717, + "learning_rate": 0.00014837492073174467, + "loss": 12.2922, + "step": 13189 + }, + { + "epoch": 0.7182482149299245, + "grad_norm": 0.6060897473951321, + "learning_rate": 0.00014836720274536016, + "loss": 12.4252, + "step": 13190 + }, + { + "epoch": 0.7183026689265075, + "grad_norm": 0.5326123536533812, + "learning_rate": 0.00014835948438286418, + "loss": 12.1688, + "step": 13191 + }, + { + "epoch": 0.7183571229230905, + "grad_norm": 0.5509354815979086, + "learning_rate": 0.00014835176564431674, + "loss": 12.256, + "step": 13192 + }, + { + "epoch": 0.7184115769196735, + "grad_norm": 0.6110671982352849, + "learning_rate": 0.0001483440465297779, + "loss": 12.2245, + "step": 13193 + }, + { + "epoch": 0.7184660309162566, + "grad_norm": 0.5516487381645298, + "learning_rate": 0.0001483363270393076, + "loss": 12.2766, + "step": 13194 + }, + { + "epoch": 0.7185204849128396, + "grad_norm": 0.5437664696658203, + "learning_rate": 0.00014832860717296594, + "loss": 12.2439, + "step": 13195 + }, + { + "epoch": 0.7185749389094226, + "grad_norm": 0.5607613825596112, + "learning_rate": 0.0001483208869308129, + "loss": 12.4613, + "step": 13196 + }, + { + "epoch": 0.7186293929060056, + "grad_norm": 0.5516449388920995, + "learning_rate": 0.00014831316631290856, + "loss": 12.2504, + "step": 13197 + }, + { + "epoch": 0.7186838469025886, + "grad_norm": 0.589519706303882, + "learning_rate": 0.00014830544531931292, + "loss": 12.3524, + "step": 13198 + }, + { + "epoch": 0.7187383008991716, + "grad_norm": 0.5644158485740307, + "learning_rate": 0.00014829772395008606, + "loss": 12.2768, + "step": 13199 + }, + { + "epoch": 0.7187927548957547, + "grad_norm": 0.565257166737133, + "learning_rate": 0.00014829000220528799, + "loss": 12.3203, + "step": 13200 + }, + { + "epoch": 0.7188472088923377, + "grad_norm": 0.5562113769820356, + "learning_rate": 0.00014828228008497877, + "loss": 12.2482, + "step": 13201 + }, + { + "epoch": 0.7189016628889207, + "grad_norm": 0.5630262812387138, + "learning_rate": 0.00014827455758921842, + "loss": 12.2811, + "step": 13202 + }, + { + "epoch": 0.7189561168855036, + "grad_norm": 0.5694366223079674, + "learning_rate": 0.00014826683471806703, + "loss": 12.2332, + "step": 13203 + }, + { + "epoch": 0.7190105708820866, + "grad_norm": 0.6049096069216633, + "learning_rate": 0.00014825911147158465, + "loss": 12.2179, + "step": 13204 + }, + { + "epoch": 0.7190650248786696, + "grad_norm": 0.5414694234678251, + "learning_rate": 0.0001482513878498313, + "loss": 12.2502, + "step": 13205 + }, + { + "epoch": 0.7191194788752527, + "grad_norm": 0.5530293618529364, + "learning_rate": 0.0001482436638528671, + "loss": 12.1771, + "step": 13206 + }, + { + "epoch": 0.7191739328718357, + "grad_norm": 0.605115901589326, + "learning_rate": 0.00014823593948075202, + "loss": 12.3434, + "step": 13207 + }, + { + "epoch": 0.7192283868684187, + "grad_norm": 0.5936913368093815, + "learning_rate": 0.00014822821473354624, + "loss": 12.2302, + "step": 13208 + }, + { + "epoch": 0.7192828408650017, + "grad_norm": 0.5730714795630938, + "learning_rate": 0.0001482204896113098, + "loss": 12.2835, + "step": 13209 + }, + { + "epoch": 0.7193372948615847, + "grad_norm": 0.5703306686028046, + "learning_rate": 0.0001482127641141027, + "loss": 12.2479, + "step": 13210 + }, + { + "epoch": 0.7193917488581678, + "grad_norm": 0.574174984669217, + "learning_rate": 0.00014820503824198507, + "loss": 12.2809, + "step": 13211 + }, + { + "epoch": 0.7194462028547508, + "grad_norm": 0.5336652865060002, + "learning_rate": 0.00014819731199501696, + "loss": 12.2294, + "step": 13212 + }, + { + "epoch": 0.7195006568513338, + "grad_norm": 0.556654346812789, + "learning_rate": 0.00014818958537325848, + "loss": 12.1503, + "step": 13213 + }, + { + "epoch": 0.7195551108479168, + "grad_norm": 0.5763679539817065, + "learning_rate": 0.00014818185837676972, + "loss": 12.2594, + "step": 13214 + }, + { + "epoch": 0.7196095648444998, + "grad_norm": 0.5844597639610618, + "learning_rate": 0.00014817413100561076, + "loss": 12.2629, + "step": 13215 + }, + { + "epoch": 0.7196640188410828, + "grad_norm": 0.5921512871316501, + "learning_rate": 0.00014816640325984165, + "loss": 12.2631, + "step": 13216 + }, + { + "epoch": 0.7197184728376659, + "grad_norm": 0.5898121302300365, + "learning_rate": 0.0001481586751395225, + "loss": 12.35, + "step": 13217 + }, + { + "epoch": 0.7197729268342489, + "grad_norm": 0.5358922462860523, + "learning_rate": 0.00014815094664471343, + "loss": 12.2799, + "step": 13218 + }, + { + "epoch": 0.7198273808308319, + "grad_norm": 0.6353500122755664, + "learning_rate": 0.00014814321777547454, + "loss": 12.3604, + "step": 13219 + }, + { + "epoch": 0.7198818348274149, + "grad_norm": 0.6809386098176096, + "learning_rate": 0.0001481354885318659, + "loss": 12.1429, + "step": 13220 + }, + { + "epoch": 0.7199362888239978, + "grad_norm": 0.5314956806716192, + "learning_rate": 0.00014812775891394762, + "loss": 12.1481, + "step": 13221 + }, + { + "epoch": 0.7199907428205808, + "grad_norm": 0.5688952356004566, + "learning_rate": 0.00014812002892177983, + "loss": 12.2585, + "step": 13222 + }, + { + "epoch": 0.720045196817164, + "grad_norm": 0.5839960551532886, + "learning_rate": 0.00014811229855542265, + "loss": 12.2537, + "step": 13223 + }, + { + "epoch": 0.7200996508137469, + "grad_norm": 0.626351737694393, + "learning_rate": 0.0001481045678149361, + "loss": 12.2205, + "step": 13224 + }, + { + "epoch": 0.7201541048103299, + "grad_norm": 0.6407856048354745, + "learning_rate": 0.0001480968367003804, + "loss": 12.356, + "step": 13225 + }, + { + "epoch": 0.7202085588069129, + "grad_norm": 0.6536082447989517, + "learning_rate": 0.00014808910521181564, + "loss": 12.3344, + "step": 13226 + }, + { + "epoch": 0.7202630128034959, + "grad_norm": 0.6131272889251808, + "learning_rate": 0.00014808137334930193, + "loss": 12.2439, + "step": 13227 + }, + { + "epoch": 0.7203174668000789, + "grad_norm": 0.6580668437691843, + "learning_rate": 0.0001480736411128994, + "loss": 12.2149, + "step": 13228 + }, + { + "epoch": 0.720371920796662, + "grad_norm": 0.5741574367765172, + "learning_rate": 0.00014806590850266818, + "loss": 12.2251, + "step": 13229 + }, + { + "epoch": 0.720426374793245, + "grad_norm": 0.5474613052768861, + "learning_rate": 0.00014805817551866838, + "loss": 12.3027, + "step": 13230 + }, + { + "epoch": 0.720480828789828, + "grad_norm": 0.6422235053459793, + "learning_rate": 0.00014805044216096016, + "loss": 12.3378, + "step": 13231 + }, + { + "epoch": 0.720535282786411, + "grad_norm": 0.597895121175297, + "learning_rate": 0.00014804270842960364, + "loss": 12.3361, + "step": 13232 + }, + { + "epoch": 0.720589736782994, + "grad_norm": 0.6785533887594158, + "learning_rate": 0.00014803497432465897, + "loss": 12.379, + "step": 13233 + }, + { + "epoch": 0.720644190779577, + "grad_norm": 0.6109394549575866, + "learning_rate": 0.00014802723984618624, + "loss": 12.3102, + "step": 13234 + }, + { + "epoch": 0.7206986447761601, + "grad_norm": 0.6062002142767499, + "learning_rate": 0.00014801950499424571, + "loss": 12.3901, + "step": 13235 + }, + { + "epoch": 0.7207530987727431, + "grad_norm": 0.6146312263798974, + "learning_rate": 0.00014801176976889742, + "loss": 12.27, + "step": 13236 + }, + { + "epoch": 0.7208075527693261, + "grad_norm": 0.5439968721697996, + "learning_rate": 0.00014800403417020155, + "loss": 12.2037, + "step": 13237 + }, + { + "epoch": 0.7208620067659091, + "grad_norm": 0.5652952590775139, + "learning_rate": 0.00014799629819821824, + "loss": 12.2625, + "step": 13238 + }, + { + "epoch": 0.720916460762492, + "grad_norm": 0.5736293661004036, + "learning_rate": 0.00014798856185300772, + "loss": 12.3362, + "step": 13239 + }, + { + "epoch": 0.7209709147590752, + "grad_norm": 0.6988017105721162, + "learning_rate": 0.00014798082513463006, + "loss": 12.2804, + "step": 13240 + }, + { + "epoch": 0.7210253687556581, + "grad_norm": 0.5507195086396749, + "learning_rate": 0.00014797308804314545, + "loss": 12.2721, + "step": 13241 + }, + { + "epoch": 0.7210798227522411, + "grad_norm": 0.5989828390022323, + "learning_rate": 0.00014796535057861408, + "loss": 12.2255, + "step": 13242 + }, + { + "epoch": 0.7211342767488241, + "grad_norm": 0.5783072500642104, + "learning_rate": 0.0001479576127410961, + "loss": 12.3047, + "step": 13243 + }, + { + "epoch": 0.7211887307454071, + "grad_norm": 0.5611434128093166, + "learning_rate": 0.0001479498745306517, + "loss": 12.1689, + "step": 13244 + }, + { + "epoch": 0.7212431847419901, + "grad_norm": 0.5748784280245217, + "learning_rate": 0.00014794213594734098, + "loss": 12.2031, + "step": 13245 + }, + { + "epoch": 0.7212976387385732, + "grad_norm": 0.5399655039572351, + "learning_rate": 0.00014793439699122422, + "loss": 12.2459, + "step": 13246 + }, + { + "epoch": 0.7213520927351562, + "grad_norm": 0.6441317220669062, + "learning_rate": 0.00014792665766236155, + "loss": 12.3354, + "step": 13247 + }, + { + "epoch": 0.7214065467317392, + "grad_norm": 0.5339434294189568, + "learning_rate": 0.00014791891796081313, + "loss": 12.2029, + "step": 13248 + }, + { + "epoch": 0.7214610007283222, + "grad_norm": 0.5497203591337948, + "learning_rate": 0.00014791117788663918, + "loss": 12.28, + "step": 13249 + }, + { + "epoch": 0.7215154547249052, + "grad_norm": 0.5884085579228641, + "learning_rate": 0.00014790343743989988, + "loss": 12.4046, + "step": 13250 + }, + { + "epoch": 0.7215699087214882, + "grad_norm": 0.5542525975959941, + "learning_rate": 0.00014789569662065539, + "loss": 12.239, + "step": 13251 + }, + { + "epoch": 0.7216243627180713, + "grad_norm": 0.5225190570253677, + "learning_rate": 0.00014788795542896595, + "loss": 12.2479, + "step": 13252 + }, + { + "epoch": 0.7216788167146543, + "grad_norm": 0.5464471759603579, + "learning_rate": 0.00014788021386489173, + "loss": 12.2732, + "step": 13253 + }, + { + "epoch": 0.7217332707112373, + "grad_norm": 0.6026126866474447, + "learning_rate": 0.00014787247192849296, + "loss": 12.338, + "step": 13254 + }, + { + "epoch": 0.7217877247078203, + "grad_norm": 0.5680906729215941, + "learning_rate": 0.0001478647296198298, + "loss": 12.2654, + "step": 13255 + }, + { + "epoch": 0.7218421787044033, + "grad_norm": 0.5279643367138486, + "learning_rate": 0.00014785698693896247, + "loss": 12.281, + "step": 13256 + }, + { + "epoch": 0.7218966327009863, + "grad_norm": 0.5780639737499039, + "learning_rate": 0.00014784924388595118, + "loss": 12.1744, + "step": 13257 + }, + { + "epoch": 0.7219510866975694, + "grad_norm": 0.6157191538415512, + "learning_rate": 0.00014784150046085617, + "loss": 12.2092, + "step": 13258 + }, + { + "epoch": 0.7220055406941523, + "grad_norm": 0.594849991840615, + "learning_rate": 0.00014783375666373765, + "loss": 12.3156, + "step": 13259 + }, + { + "epoch": 0.7220599946907353, + "grad_norm": 0.5610084800496234, + "learning_rate": 0.00014782601249465577, + "loss": 12.3049, + "step": 13260 + }, + { + "epoch": 0.7221144486873183, + "grad_norm": 0.5138308536565745, + "learning_rate": 0.0001478182679536708, + "loss": 12.1668, + "step": 13261 + }, + { + "epoch": 0.7221689026839013, + "grad_norm": 0.5857148696720267, + "learning_rate": 0.00014781052304084296, + "loss": 12.2645, + "step": 13262 + }, + { + "epoch": 0.7222233566804843, + "grad_norm": 0.5869236689159668, + "learning_rate": 0.00014780277775623248, + "loss": 12.2645, + "step": 13263 + }, + { + "epoch": 0.7222778106770674, + "grad_norm": 0.5861017227555576, + "learning_rate": 0.00014779503209989963, + "loss": 12.1274, + "step": 13264 + }, + { + "epoch": 0.7223322646736504, + "grad_norm": 0.5319448586885308, + "learning_rate": 0.00014778728607190453, + "loss": 12.2314, + "step": 13265 + }, + { + "epoch": 0.7223867186702334, + "grad_norm": 0.5311790313561285, + "learning_rate": 0.0001477795396723075, + "loss": 12.1801, + "step": 13266 + }, + { + "epoch": 0.7224411726668164, + "grad_norm": 0.6732219693817844, + "learning_rate": 0.00014777179290116873, + "loss": 12.3139, + "step": 13267 + }, + { + "epoch": 0.7224956266633994, + "grad_norm": 0.6493652353578988, + "learning_rate": 0.00014776404575854855, + "loss": 12.3059, + "step": 13268 + }, + { + "epoch": 0.7225500806599824, + "grad_norm": 0.5937044833129376, + "learning_rate": 0.0001477562982445071, + "loss": 12.364, + "step": 13269 + }, + { + "epoch": 0.7226045346565655, + "grad_norm": 0.6198658920216181, + "learning_rate": 0.00014774855035910468, + "loss": 12.2827, + "step": 13270 + }, + { + "epoch": 0.7226589886531485, + "grad_norm": 0.5732476439334573, + "learning_rate": 0.00014774080210240153, + "loss": 12.2738, + "step": 13271 + }, + { + "epoch": 0.7227134426497315, + "grad_norm": 0.5360260553007256, + "learning_rate": 0.00014773305347445787, + "loss": 12.111, + "step": 13272 + }, + { + "epoch": 0.7227678966463145, + "grad_norm": 0.5275770907062345, + "learning_rate": 0.000147725304475334, + "loss": 12.2252, + "step": 13273 + }, + { + "epoch": 0.7228223506428975, + "grad_norm": 0.5172824207596717, + "learning_rate": 0.00014771755510509016, + "loss": 12.2106, + "step": 13274 + }, + { + "epoch": 0.7228768046394806, + "grad_norm": 0.5721249521602304, + "learning_rate": 0.0001477098053637866, + "loss": 12.199, + "step": 13275 + }, + { + "epoch": 0.7229312586360636, + "grad_norm": 0.7191742987674118, + "learning_rate": 0.0001477020552514836, + "loss": 12.2984, + "step": 13276 + }, + { + "epoch": 0.7229857126326465, + "grad_norm": 0.6365763140364471, + "learning_rate": 0.00014769430476824143, + "loss": 12.2329, + "step": 13277 + }, + { + "epoch": 0.7230401666292295, + "grad_norm": 0.551456274285001, + "learning_rate": 0.0001476865539141203, + "loss": 12.2782, + "step": 13278 + }, + { + "epoch": 0.7230946206258125, + "grad_norm": 0.5991760905410665, + "learning_rate": 0.0001476788026891806, + "loss": 12.2426, + "step": 13279 + }, + { + "epoch": 0.7231490746223955, + "grad_norm": 0.5544333248898721, + "learning_rate": 0.0001476710510934825, + "loss": 12.299, + "step": 13280 + }, + { + "epoch": 0.7232035286189786, + "grad_norm": 0.5394336831800627, + "learning_rate": 0.0001476632991270863, + "loss": 12.2804, + "step": 13281 + }, + { + "epoch": 0.7232579826155616, + "grad_norm": 0.6062299792911675, + "learning_rate": 0.00014765554679005232, + "loss": 12.2861, + "step": 13282 + }, + { + "epoch": 0.7233124366121446, + "grad_norm": 0.5730035830507569, + "learning_rate": 0.00014764779408244077, + "loss": 12.2278, + "step": 13283 + }, + { + "epoch": 0.7233668906087276, + "grad_norm": 0.6156500355967656, + "learning_rate": 0.00014764004100431202, + "loss": 12.2724, + "step": 13284 + }, + { + "epoch": 0.7234213446053106, + "grad_norm": 0.6188118186143371, + "learning_rate": 0.0001476322875557263, + "loss": 12.3845, + "step": 13285 + }, + { + "epoch": 0.7234757986018936, + "grad_norm": 0.5600796231001508, + "learning_rate": 0.00014762453373674394, + "loss": 12.3946, + "step": 13286 + }, + { + "epoch": 0.7235302525984767, + "grad_norm": 0.5966923620651808, + "learning_rate": 0.00014761677954742525, + "loss": 12.2449, + "step": 13287 + }, + { + "epoch": 0.7235847065950597, + "grad_norm": 0.6581313433360012, + "learning_rate": 0.00014760902498783045, + "loss": 12.2262, + "step": 13288 + }, + { + "epoch": 0.7236391605916427, + "grad_norm": 0.6462196705353955, + "learning_rate": 0.0001476012700580199, + "loss": 12.1701, + "step": 13289 + }, + { + "epoch": 0.7236936145882257, + "grad_norm": 0.5686442508026062, + "learning_rate": 0.0001475935147580539, + "loss": 12.3473, + "step": 13290 + }, + { + "epoch": 0.7237480685848087, + "grad_norm": 0.6936995965357283, + "learning_rate": 0.00014758575908799273, + "loss": 12.161, + "step": 13291 + }, + { + "epoch": 0.7238025225813917, + "grad_norm": 0.5734359477612324, + "learning_rate": 0.00014757800304789672, + "loss": 12.2128, + "step": 13292 + }, + { + "epoch": 0.7238569765779748, + "grad_norm": 0.5543159894192236, + "learning_rate": 0.00014757024663782618, + "loss": 12.2305, + "step": 13293 + }, + { + "epoch": 0.7239114305745578, + "grad_norm": 0.7523444100322242, + "learning_rate": 0.00014756248985784145, + "loss": 12.2388, + "step": 13294 + }, + { + "epoch": 0.7239658845711407, + "grad_norm": 0.6761115842844122, + "learning_rate": 0.0001475547327080028, + "loss": 12.5183, + "step": 13295 + }, + { + "epoch": 0.7240203385677237, + "grad_norm": 0.6478389352027981, + "learning_rate": 0.00014754697518837057, + "loss": 12.2586, + "step": 13296 + }, + { + "epoch": 0.7240747925643067, + "grad_norm": 0.6431879080137026, + "learning_rate": 0.0001475392172990051, + "loss": 12.3238, + "step": 13297 + }, + { + "epoch": 0.7241292465608897, + "grad_norm": 0.5920708792499044, + "learning_rate": 0.0001475314590399667, + "loss": 12.2876, + "step": 13298 + }, + { + "epoch": 0.7241837005574728, + "grad_norm": 0.5144758746792878, + "learning_rate": 0.00014752370041131571, + "loss": 12.118, + "step": 13299 + }, + { + "epoch": 0.7242381545540558, + "grad_norm": 0.5594016258185207, + "learning_rate": 0.00014751594141311245, + "loss": 12.171, + "step": 13300 + }, + { + "epoch": 0.7242926085506388, + "grad_norm": 0.5727034735694334, + "learning_rate": 0.00014750818204541726, + "loss": 12.2966, + "step": 13301 + }, + { + "epoch": 0.7243470625472218, + "grad_norm": 0.5423280565884221, + "learning_rate": 0.00014750042230829048, + "loss": 12.2054, + "step": 13302 + }, + { + "epoch": 0.7244015165438048, + "grad_norm": 0.6132969743187309, + "learning_rate": 0.00014749266220179244, + "loss": 12.2604, + "step": 13303 + }, + { + "epoch": 0.7244559705403878, + "grad_norm": 0.5824401764590079, + "learning_rate": 0.00014748490172598351, + "loss": 12.3298, + "step": 13304 + }, + { + "epoch": 0.7245104245369709, + "grad_norm": 0.6718562011950225, + "learning_rate": 0.000147477140880924, + "loss": 12.2995, + "step": 13305 + }, + { + "epoch": 0.7245648785335539, + "grad_norm": 0.6296253155808531, + "learning_rate": 0.0001474693796666743, + "loss": 12.2741, + "step": 13306 + }, + { + "epoch": 0.7246193325301369, + "grad_norm": 0.5643711473386575, + "learning_rate": 0.00014746161808329474, + "loss": 12.3123, + "step": 13307 + }, + { + "epoch": 0.7246737865267199, + "grad_norm": 0.5789895661930375, + "learning_rate": 0.00014745385613084569, + "loss": 12.3231, + "step": 13308 + }, + { + "epoch": 0.7247282405233029, + "grad_norm": 0.6649865105288295, + "learning_rate": 0.0001474460938093875, + "loss": 12.3596, + "step": 13309 + }, + { + "epoch": 0.724782694519886, + "grad_norm": 0.6178714964656502, + "learning_rate": 0.00014743833111898052, + "loss": 12.252, + "step": 13310 + }, + { + "epoch": 0.724837148516469, + "grad_norm": 0.5727155683070335, + "learning_rate": 0.00014743056805968508, + "loss": 12.2908, + "step": 13311 + }, + { + "epoch": 0.724891602513052, + "grad_norm": 0.6811695965625274, + "learning_rate": 0.00014742280463156163, + "loss": 12.4003, + "step": 13312 + }, + { + "epoch": 0.724946056509635, + "grad_norm": 0.606043872375847, + "learning_rate": 0.0001474150408346705, + "loss": 12.3047, + "step": 13313 + }, + { + "epoch": 0.7250005105062179, + "grad_norm": 0.5663335087891896, + "learning_rate": 0.00014740727666907207, + "loss": 12.2953, + "step": 13314 + }, + { + "epoch": 0.7250549645028009, + "grad_norm": 0.6290111840184379, + "learning_rate": 0.00014739951213482667, + "loss": 12.2735, + "step": 13315 + }, + { + "epoch": 0.725109418499384, + "grad_norm": 0.6244440368051256, + "learning_rate": 0.0001473917472319947, + "loss": 12.2807, + "step": 13316 + }, + { + "epoch": 0.725163872495967, + "grad_norm": 0.5204893675027747, + "learning_rate": 0.00014738398196063662, + "loss": 12.1904, + "step": 13317 + }, + { + "epoch": 0.72521832649255, + "grad_norm": 0.6421462501017908, + "learning_rate": 0.00014737621632081272, + "loss": 12.2922, + "step": 13318 + }, + { + "epoch": 0.725272780489133, + "grad_norm": 0.6330231138389201, + "learning_rate": 0.0001473684503125834, + "loss": 12.3048, + "step": 13319 + }, + { + "epoch": 0.725327234485716, + "grad_norm": 0.5362822429633152, + "learning_rate": 0.0001473606839360091, + "loss": 12.2355, + "step": 13320 + }, + { + "epoch": 0.725381688482299, + "grad_norm": 0.5748616604842707, + "learning_rate": 0.00014735291719115016, + "loss": 12.2656, + "step": 13321 + }, + { + "epoch": 0.7254361424788821, + "grad_norm": 0.6475297889755877, + "learning_rate": 0.00014734515007806698, + "loss": 12.3399, + "step": 13322 + }, + { + "epoch": 0.7254905964754651, + "grad_norm": 0.6503247222470766, + "learning_rate": 0.00014733738259682, + "loss": 12.3816, + "step": 13323 + }, + { + "epoch": 0.7255450504720481, + "grad_norm": 0.5656483255703779, + "learning_rate": 0.0001473296147474696, + "loss": 12.2718, + "step": 13324 + }, + { + "epoch": 0.7255995044686311, + "grad_norm": 0.6695739183253075, + "learning_rate": 0.00014732184653007616, + "loss": 12.1371, + "step": 13325 + }, + { + "epoch": 0.7256539584652141, + "grad_norm": 0.6144707598948669, + "learning_rate": 0.00014731407794470013, + "loss": 12.2903, + "step": 13326 + }, + { + "epoch": 0.7257084124617971, + "grad_norm": 0.6436012618615664, + "learning_rate": 0.00014730630899140184, + "loss": 12.2941, + "step": 13327 + }, + { + "epoch": 0.7257628664583802, + "grad_norm": 0.5990201294905514, + "learning_rate": 0.00014729853967024176, + "loss": 12.3202, + "step": 13328 + }, + { + "epoch": 0.7258173204549632, + "grad_norm": 0.558011434881847, + "learning_rate": 0.00014729076998128038, + "loss": 12.3215, + "step": 13329 + }, + { + "epoch": 0.7258717744515462, + "grad_norm": 0.5789380298918541, + "learning_rate": 0.000147282999924578, + "loss": 12.2409, + "step": 13330 + }, + { + "epoch": 0.7259262284481292, + "grad_norm": 0.6535672422004971, + "learning_rate": 0.00014727522950019507, + "loss": 12.2585, + "step": 13331 + }, + { + "epoch": 0.7259806824447121, + "grad_norm": 0.5473404565670977, + "learning_rate": 0.00014726745870819203, + "loss": 12.063, + "step": 13332 + }, + { + "epoch": 0.7260351364412951, + "grad_norm": 0.5768166524166972, + "learning_rate": 0.0001472596875486293, + "loss": 12.3775, + "step": 13333 + }, + { + "epoch": 0.7260895904378782, + "grad_norm": 0.605122601726206, + "learning_rate": 0.00014725191602156735, + "loss": 12.3055, + "step": 13334 + }, + { + "epoch": 0.7261440444344612, + "grad_norm": 0.5977671889604294, + "learning_rate": 0.00014724414412706655, + "loss": 12.2939, + "step": 13335 + }, + { + "epoch": 0.7261984984310442, + "grad_norm": 0.5656575048123962, + "learning_rate": 0.00014723637186518738, + "loss": 12.2937, + "step": 13336 + }, + { + "epoch": 0.7262529524276272, + "grad_norm": 0.7920585669939834, + "learning_rate": 0.0001472285992359902, + "loss": 12.4134, + "step": 13337 + }, + { + "epoch": 0.7263074064242102, + "grad_norm": 0.6288762888571692, + "learning_rate": 0.0001472208262395356, + "loss": 12.4197, + "step": 13338 + }, + { + "epoch": 0.7263618604207932, + "grad_norm": 0.5799542013846901, + "learning_rate": 0.00014721305287588386, + "loss": 12.2353, + "step": 13339 + }, + { + "epoch": 0.7264163144173763, + "grad_norm": 0.5553553616139497, + "learning_rate": 0.00014720527914509557, + "loss": 12.2557, + "step": 13340 + }, + { + "epoch": 0.7264707684139593, + "grad_norm": 0.5912935785724808, + "learning_rate": 0.00014719750504723107, + "loss": 12.2559, + "step": 13341 + }, + { + "epoch": 0.7265252224105423, + "grad_norm": 0.5412870648624158, + "learning_rate": 0.00014718973058235087, + "loss": 12.0724, + "step": 13342 + }, + { + "epoch": 0.7265796764071253, + "grad_norm": 0.5471026390853213, + "learning_rate": 0.0001471819557505154, + "loss": 12.1997, + "step": 13343 + }, + { + "epoch": 0.7266341304037083, + "grad_norm": 0.6702127640155227, + "learning_rate": 0.0001471741805517851, + "loss": 12.4182, + "step": 13344 + }, + { + "epoch": 0.7266885844002914, + "grad_norm": 0.583732138964941, + "learning_rate": 0.00014716640498622054, + "loss": 12.2193, + "step": 13345 + }, + { + "epoch": 0.7267430383968744, + "grad_norm": 0.571671078235295, + "learning_rate": 0.00014715862905388205, + "loss": 12.2459, + "step": 13346 + }, + { + "epoch": 0.7267974923934574, + "grad_norm": 0.7012614857505217, + "learning_rate": 0.00014715085275483015, + "loss": 12.2849, + "step": 13347 + }, + { + "epoch": 0.7268519463900404, + "grad_norm": 0.6271694136899435, + "learning_rate": 0.00014714307608912534, + "loss": 12.2322, + "step": 13348 + }, + { + "epoch": 0.7269064003866234, + "grad_norm": 0.6199557536061102, + "learning_rate": 0.00014713529905682805, + "loss": 12.1896, + "step": 13349 + }, + { + "epoch": 0.7269608543832063, + "grad_norm": 0.7010623743553475, + "learning_rate": 0.00014712752165799877, + "loss": 12.4026, + "step": 13350 + }, + { + "epoch": 0.7270153083797894, + "grad_norm": 0.6694734843494442, + "learning_rate": 0.00014711974389269798, + "loss": 12.1728, + "step": 13351 + }, + { + "epoch": 0.7270697623763724, + "grad_norm": 0.6172901433277747, + "learning_rate": 0.00014711196576098616, + "loss": 12.2305, + "step": 13352 + }, + { + "epoch": 0.7271242163729554, + "grad_norm": 0.6393667883302725, + "learning_rate": 0.00014710418726292382, + "loss": 12.2514, + "step": 13353 + }, + { + "epoch": 0.7271786703695384, + "grad_norm": 0.8042984169721533, + "learning_rate": 0.00014709640839857138, + "loss": 12.2913, + "step": 13354 + }, + { + "epoch": 0.7272331243661214, + "grad_norm": 0.5605753639003496, + "learning_rate": 0.00014708862916798938, + "loss": 12.3283, + "step": 13355 + }, + { + "epoch": 0.7272875783627044, + "grad_norm": 0.8140781294933005, + "learning_rate": 0.00014708084957123832, + "loss": 12.388, + "step": 13356 + }, + { + "epoch": 0.7273420323592875, + "grad_norm": 0.6234743849624682, + "learning_rate": 0.00014707306960837866, + "loss": 12.2007, + "step": 13357 + }, + { + "epoch": 0.7273964863558705, + "grad_norm": 0.5595606397637554, + "learning_rate": 0.00014706528927947092, + "loss": 12.2607, + "step": 13358 + }, + { + "epoch": 0.7274509403524535, + "grad_norm": 0.5624929559642442, + "learning_rate": 0.00014705750858457557, + "loss": 12.155, + "step": 13359 + }, + { + "epoch": 0.7275053943490365, + "grad_norm": 0.7250644876285313, + "learning_rate": 0.0001470497275237532, + "loss": 12.232, + "step": 13360 + }, + { + "epoch": 0.7275598483456195, + "grad_norm": 0.5341427046039439, + "learning_rate": 0.0001470419460970642, + "loss": 12.1898, + "step": 13361 + }, + { + "epoch": 0.7276143023422025, + "grad_norm": 0.6067037064595763, + "learning_rate": 0.00014703416430456918, + "loss": 12.2517, + "step": 13362 + }, + { + "epoch": 0.7276687563387856, + "grad_norm": 0.6619249652306931, + "learning_rate": 0.0001470263821463286, + "loss": 12.1289, + "step": 13363 + }, + { + "epoch": 0.7277232103353686, + "grad_norm": 0.539140862548324, + "learning_rate": 0.00014701859962240298, + "loss": 12.2192, + "step": 13364 + }, + { + "epoch": 0.7277776643319516, + "grad_norm": 0.6013055091812155, + "learning_rate": 0.00014701081673285286, + "loss": 12.2707, + "step": 13365 + }, + { + "epoch": 0.7278321183285346, + "grad_norm": 0.5978285453649279, + "learning_rate": 0.00014700303347773872, + "loss": 12.2326, + "step": 13366 + }, + { + "epoch": 0.7278865723251176, + "grad_norm": 0.5711855928250466, + "learning_rate": 0.00014699524985712113, + "loss": 12.3082, + "step": 13367 + }, + { + "epoch": 0.7279410263217005, + "grad_norm": 0.5499721625163349, + "learning_rate": 0.0001469874658710606, + "loss": 12.134, + "step": 13368 + }, + { + "epoch": 0.7279954803182836, + "grad_norm": 0.6450866127583862, + "learning_rate": 0.00014697968151961764, + "loss": 12.3181, + "step": 13369 + }, + { + "epoch": 0.7280499343148666, + "grad_norm": 0.6587570915991241, + "learning_rate": 0.00014697189680285282, + "loss": 12.1078, + "step": 13370 + }, + { + "epoch": 0.7281043883114496, + "grad_norm": 0.6449739855760339, + "learning_rate": 0.0001469641117208266, + "loss": 12.3408, + "step": 13371 + }, + { + "epoch": 0.7281588423080326, + "grad_norm": 0.5962989797445586, + "learning_rate": 0.00014695632627359962, + "loss": 12.0901, + "step": 13372 + }, + { + "epoch": 0.7282132963046156, + "grad_norm": 0.6419537874928303, + "learning_rate": 0.00014694854046123237, + "loss": 12.3548, + "step": 13373 + }, + { + "epoch": 0.7282677503011986, + "grad_norm": 0.6217723276634614, + "learning_rate": 0.0001469407542837854, + "loss": 12.3115, + "step": 13374 + }, + { + "epoch": 0.7283222042977817, + "grad_norm": 0.5143051049374879, + "learning_rate": 0.00014693296774131924, + "loss": 12.1744, + "step": 13375 + }, + { + "epoch": 0.7283766582943647, + "grad_norm": 0.7747848484144629, + "learning_rate": 0.00014692518083389442, + "loss": 12.3623, + "step": 13376 + }, + { + "epoch": 0.7284311122909477, + "grad_norm": 0.6839726523055348, + "learning_rate": 0.00014691739356157156, + "loss": 12.1832, + "step": 13377 + }, + { + "epoch": 0.7284855662875307, + "grad_norm": 0.5831370886270943, + "learning_rate": 0.00014690960592441118, + "loss": 12.2775, + "step": 13378 + }, + { + "epoch": 0.7285400202841137, + "grad_norm": 0.6356590097338246, + "learning_rate": 0.00014690181792247382, + "loss": 12.2817, + "step": 13379 + }, + { + "epoch": 0.7285944742806968, + "grad_norm": 0.6111939040509912, + "learning_rate": 0.0001468940295558201, + "loss": 12.32, + "step": 13380 + }, + { + "epoch": 0.7286489282772798, + "grad_norm": 0.6661048890883065, + "learning_rate": 0.0001468862408245105, + "loss": 12.4681, + "step": 13381 + }, + { + "epoch": 0.7287033822738628, + "grad_norm": 0.701286526648347, + "learning_rate": 0.0001468784517286056, + "loss": 12.3801, + "step": 13382 + }, + { + "epoch": 0.7287578362704458, + "grad_norm": 0.6065563264179271, + "learning_rate": 0.00014687066226816607, + "loss": 12.1662, + "step": 13383 + }, + { + "epoch": 0.7288122902670288, + "grad_norm": 0.6226504736642833, + "learning_rate": 0.00014686287244325237, + "loss": 12.2861, + "step": 13384 + }, + { + "epoch": 0.7288667442636118, + "grad_norm": 0.6341344589623714, + "learning_rate": 0.00014685508225392515, + "loss": 12.3233, + "step": 13385 + }, + { + "epoch": 0.7289211982601949, + "grad_norm": 0.6283968099363282, + "learning_rate": 0.00014684729170024493, + "loss": 12.2036, + "step": 13386 + }, + { + "epoch": 0.7289756522567779, + "grad_norm": 0.5482355338897202, + "learning_rate": 0.00014683950078227232, + "loss": 12.1719, + "step": 13387 + }, + { + "epoch": 0.7290301062533608, + "grad_norm": 0.5844528949461678, + "learning_rate": 0.00014683170950006785, + "loss": 12.1159, + "step": 13388 + }, + { + "epoch": 0.7290845602499438, + "grad_norm": 0.6758906069619615, + "learning_rate": 0.0001468239178536922, + "loss": 12.4005, + "step": 13389 + }, + { + "epoch": 0.7291390142465268, + "grad_norm": 0.5703040967089928, + "learning_rate": 0.00014681612584320592, + "loss": 12.2006, + "step": 13390 + }, + { + "epoch": 0.7291934682431098, + "grad_norm": 0.5527985593987613, + "learning_rate": 0.00014680833346866957, + "loss": 12.1996, + "step": 13391 + }, + { + "epoch": 0.7292479222396929, + "grad_norm": 0.5777786022154163, + "learning_rate": 0.00014680054073014378, + "loss": 12.3138, + "step": 13392 + }, + { + "epoch": 0.7293023762362759, + "grad_norm": 0.546652622965535, + "learning_rate": 0.00014679274762768911, + "loss": 12.2746, + "step": 13393 + }, + { + "epoch": 0.7293568302328589, + "grad_norm": 0.5675033988841082, + "learning_rate": 0.00014678495416136622, + "loss": 12.2041, + "step": 13394 + }, + { + "epoch": 0.7294112842294419, + "grad_norm": 0.5321126755557474, + "learning_rate": 0.00014677716033123568, + "loss": 12.1684, + "step": 13395 + }, + { + "epoch": 0.7294657382260249, + "grad_norm": 0.5728224465252162, + "learning_rate": 0.0001467693661373581, + "loss": 12.18, + "step": 13396 + }, + { + "epoch": 0.7295201922226079, + "grad_norm": 0.5844124602179768, + "learning_rate": 0.00014676157157979407, + "loss": 12.3234, + "step": 13397 + }, + { + "epoch": 0.729574646219191, + "grad_norm": 0.5655242088204449, + "learning_rate": 0.00014675377665860422, + "loss": 12.2359, + "step": 13398 + }, + { + "epoch": 0.729629100215774, + "grad_norm": 0.5145490553683293, + "learning_rate": 0.00014674598137384915, + "loss": 12.1504, + "step": 13399 + }, + { + "epoch": 0.729683554212357, + "grad_norm": 0.5206155089772565, + "learning_rate": 0.0001467381857255895, + "loss": 12.3104, + "step": 13400 + }, + { + "epoch": 0.72973800820894, + "grad_norm": 0.5732089652040417, + "learning_rate": 0.0001467303897138859, + "loss": 12.3735, + "step": 13401 + }, + { + "epoch": 0.729792462205523, + "grad_norm": 0.5893013514173729, + "learning_rate": 0.00014672259333879892, + "loss": 12.3674, + "step": 13402 + }, + { + "epoch": 0.729846916202106, + "grad_norm": 0.5453283379493583, + "learning_rate": 0.00014671479660038921, + "loss": 12.3758, + "step": 13403 + }, + { + "epoch": 0.7299013701986891, + "grad_norm": 0.5553399861077735, + "learning_rate": 0.00014670699949871746, + "loss": 12.2429, + "step": 13404 + }, + { + "epoch": 0.729955824195272, + "grad_norm": 0.5781061733118386, + "learning_rate": 0.00014669920203384422, + "loss": 12.325, + "step": 13405 + }, + { + "epoch": 0.730010278191855, + "grad_norm": 0.6196937505905702, + "learning_rate": 0.00014669140420583017, + "loss": 12.3664, + "step": 13406 + }, + { + "epoch": 0.730064732188438, + "grad_norm": 0.5884521889356856, + "learning_rate": 0.0001466836060147359, + "loss": 12.2777, + "step": 13407 + }, + { + "epoch": 0.730119186185021, + "grad_norm": 0.672981152642552, + "learning_rate": 0.00014667580746062208, + "loss": 12.359, + "step": 13408 + }, + { + "epoch": 0.7301736401816041, + "grad_norm": 0.5582101153546333, + "learning_rate": 0.00014666800854354938, + "loss": 12.3577, + "step": 13409 + }, + { + "epoch": 0.7302280941781871, + "grad_norm": 0.573828160756465, + "learning_rate": 0.00014666020926357843, + "loss": 12.2839, + "step": 13410 + }, + { + "epoch": 0.7302825481747701, + "grad_norm": 0.5411774195844762, + "learning_rate": 0.00014665240962076983, + "loss": 12.3829, + "step": 13411 + }, + { + "epoch": 0.7303370021713531, + "grad_norm": 0.5269134935140668, + "learning_rate": 0.0001466446096151843, + "loss": 12.1819, + "step": 13412 + }, + { + "epoch": 0.7303914561679361, + "grad_norm": 0.5500051890399976, + "learning_rate": 0.00014663680924688243, + "loss": 12.2836, + "step": 13413 + }, + { + "epoch": 0.7304459101645191, + "grad_norm": 0.5691310142421843, + "learning_rate": 0.00014662900851592493, + "loss": 12.0934, + "step": 13414 + }, + { + "epoch": 0.7305003641611022, + "grad_norm": 0.5971972214596137, + "learning_rate": 0.00014662120742237245, + "loss": 12.2853, + "step": 13415 + }, + { + "epoch": 0.7305548181576852, + "grad_norm": 0.5193671565095754, + "learning_rate": 0.00014661340596628563, + "loss": 12.2221, + "step": 13416 + }, + { + "epoch": 0.7306092721542682, + "grad_norm": 0.6140800043494785, + "learning_rate": 0.00014660560414772516, + "loss": 12.2681, + "step": 13417 + }, + { + "epoch": 0.7306637261508512, + "grad_norm": 0.5116223750237714, + "learning_rate": 0.00014659780196675168, + "loss": 12.0489, + "step": 13418 + }, + { + "epoch": 0.7307181801474342, + "grad_norm": 0.5778048595206212, + "learning_rate": 0.00014658999942342588, + "loss": 12.2313, + "step": 13419 + }, + { + "epoch": 0.7307726341440172, + "grad_norm": 0.5833576137200138, + "learning_rate": 0.00014658219651780843, + "loss": 12.3763, + "step": 13420 + }, + { + "epoch": 0.7308270881406003, + "grad_norm": 0.5915683602865042, + "learning_rate": 0.00014657439324996, + "loss": 12.233, + "step": 13421 + }, + { + "epoch": 0.7308815421371833, + "grad_norm": 0.5454616527970554, + "learning_rate": 0.0001465665896199413, + "loss": 12.0943, + "step": 13422 + }, + { + "epoch": 0.7309359961337663, + "grad_norm": 0.5649149517234928, + "learning_rate": 0.000146558785627813, + "loss": 12.0765, + "step": 13423 + }, + { + "epoch": 0.7309904501303492, + "grad_norm": 0.5516421296053254, + "learning_rate": 0.00014655098127363574, + "loss": 12.167, + "step": 13424 + }, + { + "epoch": 0.7310449041269322, + "grad_norm": 0.5613095485317154, + "learning_rate": 0.00014654317655747026, + "loss": 12.3, + "step": 13425 + }, + { + "epoch": 0.7310993581235152, + "grad_norm": 0.6169308152750115, + "learning_rate": 0.00014653537147937723, + "loss": 12.2698, + "step": 13426 + }, + { + "epoch": 0.7311538121200983, + "grad_norm": 0.5861048880352672, + "learning_rate": 0.00014652756603941735, + "loss": 12.1751, + "step": 13427 + }, + { + "epoch": 0.7312082661166813, + "grad_norm": 0.6425101239536442, + "learning_rate": 0.00014651976023765133, + "loss": 12.2381, + "step": 13428 + }, + { + "epoch": 0.7312627201132643, + "grad_norm": 0.5962613060442913, + "learning_rate": 0.00014651195407413984, + "loss": 12.1845, + "step": 13429 + }, + { + "epoch": 0.7313171741098473, + "grad_norm": 0.5802762355006588, + "learning_rate": 0.0001465041475489436, + "loss": 12.3437, + "step": 13430 + }, + { + "epoch": 0.7313716281064303, + "grad_norm": 0.6089072591955769, + "learning_rate": 0.00014649634066212327, + "loss": 12.2565, + "step": 13431 + }, + { + "epoch": 0.7314260821030133, + "grad_norm": 0.5502931701197896, + "learning_rate": 0.00014648853341373965, + "loss": 12.2332, + "step": 13432 + }, + { + "epoch": 0.7314805360995964, + "grad_norm": 0.624853396439485, + "learning_rate": 0.00014648072580385337, + "loss": 12.1673, + "step": 13433 + }, + { + "epoch": 0.7315349900961794, + "grad_norm": 0.7874743943545011, + "learning_rate": 0.00014647291783252518, + "loss": 12.4053, + "step": 13434 + }, + { + "epoch": 0.7315894440927624, + "grad_norm": 0.6302379277319068, + "learning_rate": 0.0001464651094998158, + "loss": 12.3989, + "step": 13435 + }, + { + "epoch": 0.7316438980893454, + "grad_norm": 0.5415596819633678, + "learning_rate": 0.00014645730080578592, + "loss": 12.1859, + "step": 13436 + }, + { + "epoch": 0.7316983520859284, + "grad_norm": 0.6349985580522457, + "learning_rate": 0.00014644949175049627, + "loss": 12.2451, + "step": 13437 + }, + { + "epoch": 0.7317528060825114, + "grad_norm": 0.5329773431001347, + "learning_rate": 0.0001464416823340076, + "loss": 12.2649, + "step": 13438 + }, + { + "epoch": 0.7318072600790945, + "grad_norm": 0.6348334862342842, + "learning_rate": 0.00014643387255638062, + "loss": 12.2546, + "step": 13439 + }, + { + "epoch": 0.7318617140756775, + "grad_norm": 0.5907952876566426, + "learning_rate": 0.00014642606241767605, + "loss": 12.263, + "step": 13440 + }, + { + "epoch": 0.7319161680722605, + "grad_norm": 0.674265035975136, + "learning_rate": 0.00014641825191795464, + "loss": 12.3306, + "step": 13441 + }, + { + "epoch": 0.7319706220688434, + "grad_norm": 0.5746427347799835, + "learning_rate": 0.0001464104410572771, + "loss": 12.2673, + "step": 13442 + }, + { + "epoch": 0.7320250760654264, + "grad_norm": 0.9009121743624731, + "learning_rate": 0.0001464026298357042, + "loss": 12.3275, + "step": 13443 + }, + { + "epoch": 0.7320795300620095, + "grad_norm": 0.8717593714396172, + "learning_rate": 0.00014639481825329668, + "loss": 12.4716, + "step": 13444 + }, + { + "epoch": 0.7321339840585925, + "grad_norm": 0.5369226230477494, + "learning_rate": 0.0001463870063101153, + "loss": 12.1858, + "step": 13445 + }, + { + "epoch": 0.7321884380551755, + "grad_norm": 0.7186729605792112, + "learning_rate": 0.0001463791940062207, + "loss": 12.3621, + "step": 13446 + }, + { + "epoch": 0.7322428920517585, + "grad_norm": 0.6834082893354678, + "learning_rate": 0.00014637138134167377, + "loss": 12.2566, + "step": 13447 + }, + { + "epoch": 0.7322973460483415, + "grad_norm": 0.5394081582792316, + "learning_rate": 0.00014636356831653518, + "loss": 12.2781, + "step": 13448 + }, + { + "epoch": 0.7323518000449245, + "grad_norm": 0.7195940712977363, + "learning_rate": 0.0001463557549308657, + "loss": 12.3361, + "step": 13449 + }, + { + "epoch": 0.7324062540415076, + "grad_norm": 0.6233426938470448, + "learning_rate": 0.00014634794118472612, + "loss": 12.3518, + "step": 13450 + }, + { + "epoch": 0.7324607080380906, + "grad_norm": 0.5872558256245961, + "learning_rate": 0.00014634012707817718, + "loss": 12.3114, + "step": 13451 + }, + { + "epoch": 0.7325151620346736, + "grad_norm": 0.5707993151414409, + "learning_rate": 0.00014633231261127963, + "loss": 12.0597, + "step": 13452 + }, + { + "epoch": 0.7325696160312566, + "grad_norm": 0.750199557350371, + "learning_rate": 0.0001463244977840942, + "loss": 12.3008, + "step": 13453 + }, + { + "epoch": 0.7326240700278396, + "grad_norm": 0.5962135364609051, + "learning_rate": 0.00014631668259668177, + "loss": 12.2241, + "step": 13454 + }, + { + "epoch": 0.7326785240244226, + "grad_norm": 0.5914875532602344, + "learning_rate": 0.00014630886704910306, + "loss": 12.2123, + "step": 13455 + }, + { + "epoch": 0.7327329780210057, + "grad_norm": 0.6519128543769156, + "learning_rate": 0.00014630105114141878, + "loss": 12.1622, + "step": 13456 + }, + { + "epoch": 0.7327874320175887, + "grad_norm": 0.5820134384859893, + "learning_rate": 0.0001462932348736898, + "loss": 12.169, + "step": 13457 + }, + { + "epoch": 0.7328418860141717, + "grad_norm": 0.6285044893524522, + "learning_rate": 0.00014628541824597685, + "loss": 12.3364, + "step": 13458 + }, + { + "epoch": 0.7328963400107547, + "grad_norm": 0.5651507324870145, + "learning_rate": 0.0001462776012583407, + "loss": 12.2208, + "step": 13459 + }, + { + "epoch": 0.7329507940073376, + "grad_norm": 0.6813457185451129, + "learning_rate": 0.0001462697839108422, + "loss": 12.3105, + "step": 13460 + }, + { + "epoch": 0.7330052480039206, + "grad_norm": 0.6321546183033216, + "learning_rate": 0.0001462619662035421, + "loss": 12.298, + "step": 13461 + }, + { + "epoch": 0.7330597020005037, + "grad_norm": 0.5713403422122469, + "learning_rate": 0.00014625414813650115, + "loss": 12.1503, + "step": 13462 + }, + { + "epoch": 0.7331141559970867, + "grad_norm": 0.5789797059059206, + "learning_rate": 0.0001462463297097802, + "loss": 12.3081, + "step": 13463 + }, + { + "epoch": 0.7331686099936697, + "grad_norm": 0.6189844941170249, + "learning_rate": 0.00014623851092344006, + "loss": 12.2846, + "step": 13464 + }, + { + "epoch": 0.7332230639902527, + "grad_norm": 0.6421197750744856, + "learning_rate": 0.0001462306917775415, + "loss": 12.302, + "step": 13465 + }, + { + "epoch": 0.7332775179868357, + "grad_norm": 0.6500344081161316, + "learning_rate": 0.00014622287227214533, + "loss": 12.0938, + "step": 13466 + }, + { + "epoch": 0.7333319719834187, + "grad_norm": 0.5543130907603288, + "learning_rate": 0.00014621505240731234, + "loss": 12.1314, + "step": 13467 + }, + { + "epoch": 0.7333864259800018, + "grad_norm": 0.6276524792591983, + "learning_rate": 0.00014620723218310334, + "loss": 12.2113, + "step": 13468 + }, + { + "epoch": 0.7334408799765848, + "grad_norm": 0.5433848208031679, + "learning_rate": 0.0001461994115995792, + "loss": 12.2855, + "step": 13469 + }, + { + "epoch": 0.7334953339731678, + "grad_norm": 0.5441465096653444, + "learning_rate": 0.00014619159065680065, + "loss": 12.209, + "step": 13470 + }, + { + "epoch": 0.7335497879697508, + "grad_norm": 0.5901166753899868, + "learning_rate": 0.00014618376935482855, + "loss": 12.341, + "step": 13471 + }, + { + "epoch": 0.7336042419663338, + "grad_norm": 0.5837915344088275, + "learning_rate": 0.00014617594769372372, + "loss": 12.2193, + "step": 13472 + }, + { + "epoch": 0.7336586959629168, + "grad_norm": 0.5186080736607912, + "learning_rate": 0.000146168125673547, + "loss": 12.2442, + "step": 13473 + }, + { + "epoch": 0.7337131499594999, + "grad_norm": 0.5407521525804058, + "learning_rate": 0.00014616030329435913, + "loss": 12.2598, + "step": 13474 + }, + { + "epoch": 0.7337676039560829, + "grad_norm": 0.4887043957052942, + "learning_rate": 0.00014615248055622107, + "loss": 12.1829, + "step": 13475 + }, + { + "epoch": 0.7338220579526659, + "grad_norm": 0.5614679674040877, + "learning_rate": 0.00014614465745919355, + "loss": 12.3229, + "step": 13476 + }, + { + "epoch": 0.7338765119492489, + "grad_norm": 0.631059913970582, + "learning_rate": 0.00014613683400333742, + "loss": 12.4136, + "step": 13477 + }, + { + "epoch": 0.7339309659458318, + "grad_norm": 0.5837309056326555, + "learning_rate": 0.00014612901018871356, + "loss": 12.0118, + "step": 13478 + }, + { + "epoch": 0.733985419942415, + "grad_norm": 0.6021308541180563, + "learning_rate": 0.00014612118601538276, + "loss": 12.3842, + "step": 13479 + }, + { + "epoch": 0.7340398739389979, + "grad_norm": 0.6012447827579785, + "learning_rate": 0.0001461133614834059, + "loss": 12.3327, + "step": 13480 + }, + { + "epoch": 0.7340943279355809, + "grad_norm": 0.5456836130548256, + "learning_rate": 0.00014610553659284378, + "loss": 12.2408, + "step": 13481 + }, + { + "epoch": 0.7341487819321639, + "grad_norm": 0.5359770041758793, + "learning_rate": 0.0001460977113437573, + "loss": 12.0626, + "step": 13482 + }, + { + "epoch": 0.7342032359287469, + "grad_norm": 0.5577707378911068, + "learning_rate": 0.00014608988573620727, + "loss": 12.3124, + "step": 13483 + }, + { + "epoch": 0.7342576899253299, + "grad_norm": 0.5949322494568023, + "learning_rate": 0.00014608205977025457, + "loss": 12.1902, + "step": 13484 + }, + { + "epoch": 0.734312143921913, + "grad_norm": 0.5502749814129778, + "learning_rate": 0.00014607423344596004, + "loss": 12.2777, + "step": 13485 + }, + { + "epoch": 0.734366597918496, + "grad_norm": 0.632209872094979, + "learning_rate": 0.00014606640676338457, + "loss": 12.4037, + "step": 13486 + }, + { + "epoch": 0.734421051915079, + "grad_norm": 0.557195963153172, + "learning_rate": 0.00014605857972258896, + "loss": 12.391, + "step": 13487 + }, + { + "epoch": 0.734475505911662, + "grad_norm": 0.5943557669758027, + "learning_rate": 0.00014605075232363413, + "loss": 12.217, + "step": 13488 + }, + { + "epoch": 0.734529959908245, + "grad_norm": 0.5943023457895839, + "learning_rate": 0.0001460429245665809, + "loss": 12.3385, + "step": 13489 + }, + { + "epoch": 0.734584413904828, + "grad_norm": 0.7428476430130446, + "learning_rate": 0.0001460350964514902, + "loss": 12.2624, + "step": 13490 + }, + { + "epoch": 0.7346388679014111, + "grad_norm": 0.5815614808212614, + "learning_rate": 0.0001460272679784229, + "loss": 12.2735, + "step": 13491 + }, + { + "epoch": 0.7346933218979941, + "grad_norm": 0.5817982523327593, + "learning_rate": 0.00014601943914743977, + "loss": 12.332, + "step": 13492 + }, + { + "epoch": 0.7347477758945771, + "grad_norm": 0.6430540982028815, + "learning_rate": 0.00014601160995860178, + "loss": 12.2342, + "step": 13493 + }, + { + "epoch": 0.7348022298911601, + "grad_norm": 0.5849969352651594, + "learning_rate": 0.00014600378041196982, + "loss": 12.2903, + "step": 13494 + }, + { + "epoch": 0.7348566838877431, + "grad_norm": 0.5780107159072119, + "learning_rate": 0.00014599595050760475, + "loss": 12.2955, + "step": 13495 + }, + { + "epoch": 0.734911137884326, + "grad_norm": 0.5634969389091683, + "learning_rate": 0.00014598812024556746, + "loss": 12.2938, + "step": 13496 + }, + { + "epoch": 0.7349655918809092, + "grad_norm": 0.5658156766141947, + "learning_rate": 0.0001459802896259188, + "loss": 12.2406, + "step": 13497 + }, + { + "epoch": 0.7350200458774921, + "grad_norm": 0.5327649821036103, + "learning_rate": 0.00014597245864871974, + "loss": 12.3137, + "step": 13498 + }, + { + "epoch": 0.7350744998740751, + "grad_norm": 0.5806255240789654, + "learning_rate": 0.0001459646273140311, + "loss": 12.2654, + "step": 13499 + }, + { + "epoch": 0.7351289538706581, + "grad_norm": 0.6076178268505733, + "learning_rate": 0.00014595679562191382, + "loss": 12.1846, + "step": 13500 + }, + { + "epoch": 0.7351834078672411, + "grad_norm": 0.5687460574489392, + "learning_rate": 0.0001459489635724288, + "loss": 12.1725, + "step": 13501 + }, + { + "epoch": 0.7352378618638241, + "grad_norm": 0.8070955169800478, + "learning_rate": 0.0001459411311656369, + "loss": 12.2549, + "step": 13502 + }, + { + "epoch": 0.7352923158604072, + "grad_norm": 0.6600330966428078, + "learning_rate": 0.00014593329840159908, + "loss": 12.2134, + "step": 13503 + }, + { + "epoch": 0.7353467698569902, + "grad_norm": 0.5385361695723251, + "learning_rate": 0.00014592546528037625, + "loss": 11.9943, + "step": 13504 + }, + { + "epoch": 0.7354012238535732, + "grad_norm": 0.5557788743641016, + "learning_rate": 0.00014591763180202928, + "loss": 12.2433, + "step": 13505 + }, + { + "epoch": 0.7354556778501562, + "grad_norm": 0.5609249760965059, + "learning_rate": 0.00014590979796661913, + "loss": 12.0992, + "step": 13506 + }, + { + "epoch": 0.7355101318467392, + "grad_norm": 0.5638842934128018, + "learning_rate": 0.00014590196377420667, + "loss": 12.2798, + "step": 13507 + }, + { + "epoch": 0.7355645858433222, + "grad_norm": 0.5813055251948805, + "learning_rate": 0.0001458941292248528, + "loss": 12.4461, + "step": 13508 + }, + { + "epoch": 0.7356190398399053, + "grad_norm": 0.6686401426161527, + "learning_rate": 0.00014588629431861857, + "loss": 12.4359, + "step": 13509 + }, + { + "epoch": 0.7356734938364883, + "grad_norm": 0.5796752659915911, + "learning_rate": 0.00014587845905556478, + "loss": 12.2123, + "step": 13510 + }, + { + "epoch": 0.7357279478330713, + "grad_norm": 0.5726235025614373, + "learning_rate": 0.0001458706234357524, + "loss": 12.1894, + "step": 13511 + }, + { + "epoch": 0.7357824018296543, + "grad_norm": 0.5743503966196527, + "learning_rate": 0.00014586278745924234, + "loss": 12.209, + "step": 13512 + }, + { + "epoch": 0.7358368558262373, + "grad_norm": 0.6588986599637868, + "learning_rate": 0.00014585495112609558, + "loss": 12.3586, + "step": 13513 + }, + { + "epoch": 0.7358913098228204, + "grad_norm": 0.5738888272484045, + "learning_rate": 0.00014584711443637298, + "loss": 12.2194, + "step": 13514 + }, + { + "epoch": 0.7359457638194034, + "grad_norm": 0.5493036599841871, + "learning_rate": 0.00014583927739013558, + "loss": 12.1909, + "step": 13515 + }, + { + "epoch": 0.7360002178159863, + "grad_norm": 0.5487234894005516, + "learning_rate": 0.00014583143998744426, + "loss": 12.2235, + "step": 13516 + }, + { + "epoch": 0.7360546718125693, + "grad_norm": 0.5138716167966478, + "learning_rate": 0.00014582360222835998, + "loss": 12.2678, + "step": 13517 + }, + { + "epoch": 0.7361091258091523, + "grad_norm": 0.5461539491689609, + "learning_rate": 0.0001458157641129437, + "loss": 12.2597, + "step": 13518 + }, + { + "epoch": 0.7361635798057353, + "grad_norm": 0.5466894629268688, + "learning_rate": 0.0001458079256412563, + "loss": 12.2962, + "step": 13519 + }, + { + "epoch": 0.7362180338023184, + "grad_norm": 0.5822005527745485, + "learning_rate": 0.00014580008681335885, + "loss": 12.2964, + "step": 13520 + }, + { + "epoch": 0.7362724877989014, + "grad_norm": 0.5800259077540941, + "learning_rate": 0.00014579224762931224, + "loss": 12.2507, + "step": 13521 + }, + { + "epoch": 0.7363269417954844, + "grad_norm": 0.6048416669196667, + "learning_rate": 0.0001457844080891774, + "loss": 12.1804, + "step": 13522 + }, + { + "epoch": 0.7363813957920674, + "grad_norm": 0.5683179294002261, + "learning_rate": 0.00014577656819301534, + "loss": 12.276, + "step": 13523 + }, + { + "epoch": 0.7364358497886504, + "grad_norm": 0.6042767266846831, + "learning_rate": 0.000145768727940887, + "loss": 12.3961, + "step": 13524 + }, + { + "epoch": 0.7364903037852334, + "grad_norm": 0.5971000332059202, + "learning_rate": 0.00014576088733285334, + "loss": 12.1241, + "step": 13525 + }, + { + "epoch": 0.7365447577818165, + "grad_norm": 0.5960415410498545, + "learning_rate": 0.00014575304636897538, + "loss": 12.2755, + "step": 13526 + }, + { + "epoch": 0.7365992117783995, + "grad_norm": 0.5501720767126754, + "learning_rate": 0.00014574520504931403, + "loss": 12.2194, + "step": 13527 + }, + { + "epoch": 0.7366536657749825, + "grad_norm": 0.5525318178591357, + "learning_rate": 0.00014573736337393031, + "loss": 12.2742, + "step": 13528 + }, + { + "epoch": 0.7367081197715655, + "grad_norm": 0.5931735852925304, + "learning_rate": 0.0001457295213428852, + "loss": 12.2892, + "step": 13529 + }, + { + "epoch": 0.7367625737681485, + "grad_norm": 0.574905675319439, + "learning_rate": 0.00014572167895623962, + "loss": 12.2478, + "step": 13530 + }, + { + "epoch": 0.7368170277647315, + "grad_norm": 0.5614724178224977, + "learning_rate": 0.00014571383621405463, + "loss": 12.2513, + "step": 13531 + }, + { + "epoch": 0.7368714817613146, + "grad_norm": 0.5164268210770245, + "learning_rate": 0.0001457059931163912, + "loss": 12.3159, + "step": 13532 + }, + { + "epoch": 0.7369259357578976, + "grad_norm": 0.5551055122116084, + "learning_rate": 0.00014569814966331027, + "loss": 12.2811, + "step": 13533 + }, + { + "epoch": 0.7369803897544805, + "grad_norm": 0.7238589607973043, + "learning_rate": 0.00014569030585487286, + "loss": 12.2553, + "step": 13534 + }, + { + "epoch": 0.7370348437510635, + "grad_norm": 0.6121042465176388, + "learning_rate": 0.00014568246169114, + "loss": 12.2991, + "step": 13535 + }, + { + "epoch": 0.7370892977476465, + "grad_norm": 0.5443821640990063, + "learning_rate": 0.00014567461717217262, + "loss": 12.2379, + "step": 13536 + }, + { + "epoch": 0.7371437517442295, + "grad_norm": 0.5899600381512372, + "learning_rate": 0.00014566677229803178, + "loss": 12.2119, + "step": 13537 + }, + { + "epoch": 0.7371982057408126, + "grad_norm": 0.5833225779741186, + "learning_rate": 0.00014565892706877847, + "loss": 12.3063, + "step": 13538 + }, + { + "epoch": 0.7372526597373956, + "grad_norm": 0.5632297981423725, + "learning_rate": 0.00014565108148447366, + "loss": 12.2568, + "step": 13539 + }, + { + "epoch": 0.7373071137339786, + "grad_norm": 0.5456236956874208, + "learning_rate": 0.0001456432355451784, + "loss": 12.2761, + "step": 13540 + }, + { + "epoch": 0.7373615677305616, + "grad_norm": 0.5495704263049151, + "learning_rate": 0.00014563538925095368, + "loss": 12.2297, + "step": 13541 + }, + { + "epoch": 0.7374160217271446, + "grad_norm": 0.5689853068119046, + "learning_rate": 0.0001456275426018605, + "loss": 12.2283, + "step": 13542 + }, + { + "epoch": 0.7374704757237277, + "grad_norm": 0.5180836306877948, + "learning_rate": 0.00014561969559795995, + "loss": 12.1401, + "step": 13543 + }, + { + "epoch": 0.7375249297203107, + "grad_norm": 0.5718271747766805, + "learning_rate": 0.00014561184823931296, + "loss": 12.3157, + "step": 13544 + }, + { + "epoch": 0.7375793837168937, + "grad_norm": 0.5746504330019291, + "learning_rate": 0.0001456040005259806, + "loss": 12.1731, + "step": 13545 + }, + { + "epoch": 0.7376338377134767, + "grad_norm": 0.6065461245363518, + "learning_rate": 0.00014559615245802386, + "loss": 12.352, + "step": 13546 + }, + { + "epoch": 0.7376882917100597, + "grad_norm": 0.5784360166514776, + "learning_rate": 0.00014558830403550382, + "loss": 12.3759, + "step": 13547 + }, + { + "epoch": 0.7377427457066427, + "grad_norm": 0.5602189288940183, + "learning_rate": 0.00014558045525848147, + "loss": 12.1386, + "step": 13548 + }, + { + "epoch": 0.7377971997032258, + "grad_norm": 0.5670399784838877, + "learning_rate": 0.00014557260612701786, + "loss": 12.2799, + "step": 13549 + }, + { + "epoch": 0.7378516536998088, + "grad_norm": 0.5896749388285503, + "learning_rate": 0.00014556475664117405, + "loss": 12.3304, + "step": 13550 + }, + { + "epoch": 0.7379061076963918, + "grad_norm": 0.5927587892809875, + "learning_rate": 0.00014555690680101102, + "loss": 12.3231, + "step": 13551 + }, + { + "epoch": 0.7379605616929747, + "grad_norm": 0.5778156056704327, + "learning_rate": 0.00014554905660658983, + "loss": 12.2669, + "step": 13552 + }, + { + "epoch": 0.7380150156895577, + "grad_norm": 0.6212925554269098, + "learning_rate": 0.00014554120605797156, + "loss": 12.268, + "step": 13553 + }, + { + "epoch": 0.7380694696861407, + "grad_norm": 0.5703826293906922, + "learning_rate": 0.0001455333551552172, + "loss": 12.1634, + "step": 13554 + }, + { + "epoch": 0.7381239236827238, + "grad_norm": 0.5892926596679476, + "learning_rate": 0.00014552550389838791, + "loss": 12.362, + "step": 13555 + }, + { + "epoch": 0.7381783776793068, + "grad_norm": 0.6057136352760333, + "learning_rate": 0.00014551765228754463, + "loss": 12.2202, + "step": 13556 + }, + { + "epoch": 0.7382328316758898, + "grad_norm": 0.5703198221862579, + "learning_rate": 0.00014550980032274841, + "loss": 12.2498, + "step": 13557 + }, + { + "epoch": 0.7382872856724728, + "grad_norm": 0.627692916894544, + "learning_rate": 0.00014550194800406037, + "loss": 12.3628, + "step": 13558 + }, + { + "epoch": 0.7383417396690558, + "grad_norm": 0.6016477926686616, + "learning_rate": 0.0001454940953315416, + "loss": 12.1268, + "step": 13559 + }, + { + "epoch": 0.7383961936656388, + "grad_norm": 0.6573337693225969, + "learning_rate": 0.00014548624230525307, + "loss": 12.3016, + "step": 13560 + }, + { + "epoch": 0.7384506476622219, + "grad_norm": 0.575658458983648, + "learning_rate": 0.00014547838892525592, + "loss": 12.2689, + "step": 13561 + }, + { + "epoch": 0.7385051016588049, + "grad_norm": 0.583711733930323, + "learning_rate": 0.00014547053519161116, + "loss": 12.2254, + "step": 13562 + }, + { + "epoch": 0.7385595556553879, + "grad_norm": 0.6919837322214447, + "learning_rate": 0.0001454626811043799, + "loss": 12.4156, + "step": 13563 + }, + { + "epoch": 0.7386140096519709, + "grad_norm": 0.5667989851626005, + "learning_rate": 0.0001454548266636232, + "loss": 12.2173, + "step": 13564 + }, + { + "epoch": 0.7386684636485539, + "grad_norm": 0.5967847312016634, + "learning_rate": 0.00014544697186940218, + "loss": 12.3047, + "step": 13565 + }, + { + "epoch": 0.7387229176451369, + "grad_norm": 0.5783482130215254, + "learning_rate": 0.00014543911672177786, + "loss": 12.393, + "step": 13566 + }, + { + "epoch": 0.73877737164172, + "grad_norm": 0.6326278799926038, + "learning_rate": 0.00014543126122081138, + "loss": 12.3051, + "step": 13567 + }, + { + "epoch": 0.738831825638303, + "grad_norm": 0.5975802047422007, + "learning_rate": 0.00014542340536656374, + "loss": 12.1247, + "step": 13568 + }, + { + "epoch": 0.738886279634886, + "grad_norm": 0.6201015768827072, + "learning_rate": 0.0001454155491590961, + "loss": 12.2745, + "step": 13569 + }, + { + "epoch": 0.738940733631469, + "grad_norm": 0.5977105967525396, + "learning_rate": 0.00014540769259846953, + "loss": 12.3551, + "step": 13570 + }, + { + "epoch": 0.7389951876280519, + "grad_norm": 0.5992047393148686, + "learning_rate": 0.00014539983568474517, + "loss": 12.2941, + "step": 13571 + }, + { + "epoch": 0.7390496416246349, + "grad_norm": 0.5986098983608372, + "learning_rate": 0.00014539197841798403, + "loss": 12.2243, + "step": 13572 + }, + { + "epoch": 0.739104095621218, + "grad_norm": 0.562071760476364, + "learning_rate": 0.00014538412079824728, + "loss": 12.3107, + "step": 13573 + }, + { + "epoch": 0.739158549617801, + "grad_norm": 0.5992289735675804, + "learning_rate": 0.00014537626282559596, + "loss": 12.1891, + "step": 13574 + }, + { + "epoch": 0.739213003614384, + "grad_norm": 0.5848912184261011, + "learning_rate": 0.00014536840450009124, + "loss": 12.1678, + "step": 13575 + }, + { + "epoch": 0.739267457610967, + "grad_norm": 0.5714703977133936, + "learning_rate": 0.0001453605458217942, + "loss": 12.2443, + "step": 13576 + }, + { + "epoch": 0.73932191160755, + "grad_norm": 0.5760218558487907, + "learning_rate": 0.00014535268679076595, + "loss": 12.3124, + "step": 13577 + }, + { + "epoch": 0.7393763656041331, + "grad_norm": 0.6493899149913374, + "learning_rate": 0.00014534482740706758, + "loss": 12.4153, + "step": 13578 + }, + { + "epoch": 0.7394308196007161, + "grad_norm": 0.5886954646525789, + "learning_rate": 0.00014533696767076023, + "loss": 12.331, + "step": 13579 + }, + { + "epoch": 0.7394852735972991, + "grad_norm": 0.5521788953301661, + "learning_rate": 0.00014532910758190503, + "loss": 12.2524, + "step": 13580 + }, + { + "epoch": 0.7395397275938821, + "grad_norm": 0.5263278183981711, + "learning_rate": 0.0001453212471405631, + "loss": 12.3494, + "step": 13581 + }, + { + "epoch": 0.7395941815904651, + "grad_norm": 0.5995637429608626, + "learning_rate": 0.00014531338634679553, + "loss": 12.2439, + "step": 13582 + }, + { + "epoch": 0.7396486355870481, + "grad_norm": 0.6126319407071071, + "learning_rate": 0.00014530552520066348, + "loss": 12.3076, + "step": 13583 + }, + { + "epoch": 0.7397030895836312, + "grad_norm": 0.5829210288313339, + "learning_rate": 0.00014529766370222807, + "loss": 12.1716, + "step": 13584 + }, + { + "epoch": 0.7397575435802142, + "grad_norm": 0.5936945728405048, + "learning_rate": 0.0001452898018515504, + "loss": 12.2531, + "step": 13585 + }, + { + "epoch": 0.7398119975767972, + "grad_norm": 0.5985701987761367, + "learning_rate": 0.00014528193964869168, + "loss": 12.1612, + "step": 13586 + }, + { + "epoch": 0.7398664515733802, + "grad_norm": 0.5641246242590785, + "learning_rate": 0.00014527407709371298, + "loss": 12.1089, + "step": 13587 + }, + { + "epoch": 0.7399209055699631, + "grad_norm": 0.6151635949583258, + "learning_rate": 0.00014526621418667546, + "loss": 12.2131, + "step": 13588 + }, + { + "epoch": 0.7399753595665461, + "grad_norm": 0.6159449104099786, + "learning_rate": 0.0001452583509276403, + "loss": 12.302, + "step": 13589 + }, + { + "epoch": 0.7400298135631292, + "grad_norm": 0.6151269426786296, + "learning_rate": 0.00014525048731666858, + "loss": 12.2723, + "step": 13590 + }, + { + "epoch": 0.7400842675597122, + "grad_norm": 0.657999967620929, + "learning_rate": 0.00014524262335382149, + "loss": 12.1819, + "step": 13591 + }, + { + "epoch": 0.7401387215562952, + "grad_norm": 0.5417844327833512, + "learning_rate": 0.00014523475903916016, + "loss": 12.2666, + "step": 13592 + }, + { + "epoch": 0.7401931755528782, + "grad_norm": 0.659413396793381, + "learning_rate": 0.00014522689437274577, + "loss": 12.3194, + "step": 13593 + }, + { + "epoch": 0.7402476295494612, + "grad_norm": 0.6522191671164157, + "learning_rate": 0.00014521902935463947, + "loss": 12.2682, + "step": 13594 + }, + { + "epoch": 0.7403020835460442, + "grad_norm": 0.5712575409989621, + "learning_rate": 0.00014521116398490241, + "loss": 12.2528, + "step": 13595 + }, + { + "epoch": 0.7403565375426273, + "grad_norm": 0.5925659529071102, + "learning_rate": 0.00014520329826359576, + "loss": 12.1879, + "step": 13596 + }, + { + "epoch": 0.7404109915392103, + "grad_norm": 0.6085032636756643, + "learning_rate": 0.00014519543219078068, + "loss": 12.2377, + "step": 13597 + }, + { + "epoch": 0.7404654455357933, + "grad_norm": 0.548249484685204, + "learning_rate": 0.00014518756576651834, + "loss": 12.3519, + "step": 13598 + }, + { + "epoch": 0.7405198995323763, + "grad_norm": 0.5862348515271124, + "learning_rate": 0.0001451796989908699, + "loss": 12.3024, + "step": 13599 + }, + { + "epoch": 0.7405743535289593, + "grad_norm": 0.5617435120340949, + "learning_rate": 0.00014517183186389657, + "loss": 12.3191, + "step": 13600 + }, + { + "epoch": 0.7406288075255423, + "grad_norm": 0.5561225287567393, + "learning_rate": 0.00014516396438565948, + "loss": 12.2676, + "step": 13601 + }, + { + "epoch": 0.7406832615221254, + "grad_norm": 0.6384067084986527, + "learning_rate": 0.00014515609655621985, + "loss": 12.3782, + "step": 13602 + }, + { + "epoch": 0.7407377155187084, + "grad_norm": 0.530157034517588, + "learning_rate": 0.00014514822837563882, + "loss": 12.2542, + "step": 13603 + }, + { + "epoch": 0.7407921695152914, + "grad_norm": 0.5767821558742593, + "learning_rate": 0.00014514035984397757, + "loss": 12.2971, + "step": 13604 + }, + { + "epoch": 0.7408466235118744, + "grad_norm": 0.6036355364934566, + "learning_rate": 0.00014513249096129735, + "loss": 12.2806, + "step": 13605 + }, + { + "epoch": 0.7409010775084574, + "grad_norm": 0.6545513956221317, + "learning_rate": 0.0001451246217276593, + "loss": 12.0547, + "step": 13606 + }, + { + "epoch": 0.7409555315050403, + "grad_norm": 0.670134175781721, + "learning_rate": 0.00014511675214312462, + "loss": 12.1504, + "step": 13607 + }, + { + "epoch": 0.7410099855016234, + "grad_norm": 0.6100622110757251, + "learning_rate": 0.00014510888220775454, + "loss": 12.2609, + "step": 13608 + }, + { + "epoch": 0.7410644394982064, + "grad_norm": 0.8321259426406292, + "learning_rate": 0.00014510101192161018, + "loss": 12.1772, + "step": 13609 + }, + { + "epoch": 0.7411188934947894, + "grad_norm": 0.5704049760476888, + "learning_rate": 0.00014509314128475283, + "loss": 12.2398, + "step": 13610 + }, + { + "epoch": 0.7411733474913724, + "grad_norm": 0.5847939629309116, + "learning_rate": 0.00014508527029724366, + "loss": 12.2684, + "step": 13611 + }, + { + "epoch": 0.7412278014879554, + "grad_norm": 0.5778186625862041, + "learning_rate": 0.00014507739895914382, + "loss": 12.2927, + "step": 13612 + }, + { + "epoch": 0.7412822554845385, + "grad_norm": 0.6271163582322844, + "learning_rate": 0.00014506952727051458, + "loss": 12.2311, + "step": 13613 + }, + { + "epoch": 0.7413367094811215, + "grad_norm": 0.569653045038958, + "learning_rate": 0.00014506165523141712, + "loss": 12.28, + "step": 13614 + }, + { + "epoch": 0.7413911634777045, + "grad_norm": 0.6158185408598926, + "learning_rate": 0.0001450537828419127, + "loss": 12.2618, + "step": 13615 + }, + { + "epoch": 0.7414456174742875, + "grad_norm": 0.6184929667207226, + "learning_rate": 0.0001450459101020625, + "loss": 12.2997, + "step": 13616 + }, + { + "epoch": 0.7415000714708705, + "grad_norm": 0.5651499843666569, + "learning_rate": 0.00014503803701192776, + "loss": 12.228, + "step": 13617 + }, + { + "epoch": 0.7415545254674535, + "grad_norm": 0.5706628227808465, + "learning_rate": 0.00014503016357156969, + "loss": 12.264, + "step": 13618 + }, + { + "epoch": 0.7416089794640366, + "grad_norm": 0.5585073128263942, + "learning_rate": 0.0001450222897810495, + "loss": 12.2749, + "step": 13619 + }, + { + "epoch": 0.7416634334606196, + "grad_norm": 0.6289826168740338, + "learning_rate": 0.00014501441564042847, + "loss": 12.3302, + "step": 13620 + }, + { + "epoch": 0.7417178874572026, + "grad_norm": 0.5512294887972594, + "learning_rate": 0.00014500654114976778, + "loss": 12.2143, + "step": 13621 + }, + { + "epoch": 0.7417723414537856, + "grad_norm": 0.6371616506621154, + "learning_rate": 0.00014499866630912866, + "loss": 12.4681, + "step": 13622 + }, + { + "epoch": 0.7418267954503686, + "grad_norm": 0.5869525221861992, + "learning_rate": 0.00014499079111857235, + "loss": 12.212, + "step": 13623 + }, + { + "epoch": 0.7418812494469516, + "grad_norm": 0.5600136938066297, + "learning_rate": 0.00014498291557816012, + "loss": 12.2693, + "step": 13624 + }, + { + "epoch": 0.7419357034435347, + "grad_norm": 0.6598005867431953, + "learning_rate": 0.00014497503968795324, + "loss": 12.3359, + "step": 13625 + }, + { + "epoch": 0.7419901574401176, + "grad_norm": 0.5939656040106014, + "learning_rate": 0.00014496716344801288, + "loss": 12.2251, + "step": 13626 + }, + { + "epoch": 0.7420446114367006, + "grad_norm": 0.6090690177906303, + "learning_rate": 0.0001449592868584003, + "loss": 12.2218, + "step": 13627 + }, + { + "epoch": 0.7420990654332836, + "grad_norm": 0.5774584014151134, + "learning_rate": 0.00014495140991917674, + "loss": 12.3191, + "step": 13628 + }, + { + "epoch": 0.7421535194298666, + "grad_norm": 0.6263921067003643, + "learning_rate": 0.0001449435326304035, + "loss": 12.3032, + "step": 13629 + }, + { + "epoch": 0.7422079734264496, + "grad_norm": 0.7222604042773875, + "learning_rate": 0.00014493565499214183, + "loss": 12.237, + "step": 13630 + }, + { + "epoch": 0.7422624274230327, + "grad_norm": 0.6076500489786401, + "learning_rate": 0.00014492777700445296, + "loss": 12.3552, + "step": 13631 + }, + { + "epoch": 0.7423168814196157, + "grad_norm": 0.6058780080649863, + "learning_rate": 0.0001449198986673982, + "loss": 12.2474, + "step": 13632 + }, + { + "epoch": 0.7423713354161987, + "grad_norm": 0.5490641811425729, + "learning_rate": 0.00014491201998103874, + "loss": 12.2425, + "step": 13633 + }, + { + "epoch": 0.7424257894127817, + "grad_norm": 0.5456156945385467, + "learning_rate": 0.00014490414094543589, + "loss": 12.1875, + "step": 13634 + }, + { + "epoch": 0.7424802434093647, + "grad_norm": 0.6296439828779367, + "learning_rate": 0.00014489626156065087, + "loss": 12.244, + "step": 13635 + }, + { + "epoch": 0.7425346974059477, + "grad_norm": 0.6227359546068482, + "learning_rate": 0.00014488838182674503, + "loss": 12.2818, + "step": 13636 + }, + { + "epoch": 0.7425891514025308, + "grad_norm": 0.5959937363619823, + "learning_rate": 0.00014488050174377962, + "loss": 12.3168, + "step": 13637 + }, + { + "epoch": 0.7426436053991138, + "grad_norm": 0.6016861050673398, + "learning_rate": 0.00014487262131181587, + "loss": 12.333, + "step": 13638 + }, + { + "epoch": 0.7426980593956968, + "grad_norm": 0.5950905794185968, + "learning_rate": 0.0001448647405309151, + "loss": 12.3405, + "step": 13639 + }, + { + "epoch": 0.7427525133922798, + "grad_norm": 0.5464111091884458, + "learning_rate": 0.0001448568594011386, + "loss": 12.1223, + "step": 13640 + }, + { + "epoch": 0.7428069673888628, + "grad_norm": 0.7861765047156204, + "learning_rate": 0.0001448489779225476, + "loss": 12.2474, + "step": 13641 + }, + { + "epoch": 0.7428614213854458, + "grad_norm": 0.6003190220576142, + "learning_rate": 0.00014484109609520345, + "loss": 12.2989, + "step": 13642 + }, + { + "epoch": 0.7429158753820289, + "grad_norm": 0.5756507794999683, + "learning_rate": 0.00014483321391916746, + "loss": 12.2264, + "step": 13643 + }, + { + "epoch": 0.7429703293786118, + "grad_norm": 0.6686777632634148, + "learning_rate": 0.0001448253313945008, + "loss": 12.2312, + "step": 13644 + }, + { + "epoch": 0.7430247833751948, + "grad_norm": 0.6014856625238543, + "learning_rate": 0.00014481744852126485, + "loss": 12.1709, + "step": 13645 + }, + { + "epoch": 0.7430792373717778, + "grad_norm": 0.6274728754383695, + "learning_rate": 0.00014480956529952095, + "loss": 12.1732, + "step": 13646 + }, + { + "epoch": 0.7431336913683608, + "grad_norm": 0.6578097771737071, + "learning_rate": 0.00014480168172933036, + "loss": 12.1725, + "step": 13647 + }, + { + "epoch": 0.7431881453649439, + "grad_norm": 0.6243897320773598, + "learning_rate": 0.00014479379781075438, + "loss": 12.2669, + "step": 13648 + }, + { + "epoch": 0.7432425993615269, + "grad_norm": 0.5644404409732856, + "learning_rate": 0.00014478591354385428, + "loss": 12.2525, + "step": 13649 + }, + { + "epoch": 0.7432970533581099, + "grad_norm": 0.5899929628289414, + "learning_rate": 0.00014477802892869142, + "loss": 12.3192, + "step": 13650 + }, + { + "epoch": 0.7433515073546929, + "grad_norm": 0.6204189531105295, + "learning_rate": 0.0001447701439653271, + "loss": 12.2539, + "step": 13651 + }, + { + "epoch": 0.7434059613512759, + "grad_norm": 0.5978774508167788, + "learning_rate": 0.00014476225865382264, + "loss": 12.2512, + "step": 13652 + }, + { + "epoch": 0.7434604153478589, + "grad_norm": 0.5607476504150831, + "learning_rate": 0.00014475437299423937, + "loss": 12.4198, + "step": 13653 + }, + { + "epoch": 0.743514869344442, + "grad_norm": 0.7064522212229157, + "learning_rate": 0.00014474648698663856, + "loss": 12.3364, + "step": 13654 + }, + { + "epoch": 0.743569323341025, + "grad_norm": 0.5292842905349883, + "learning_rate": 0.00014473860063108157, + "loss": 12.2842, + "step": 13655 + }, + { + "epoch": 0.743623777337608, + "grad_norm": 0.6604362435987455, + "learning_rate": 0.0001447307139276297, + "loss": 12.2738, + "step": 13656 + }, + { + "epoch": 0.743678231334191, + "grad_norm": 0.6411131805921383, + "learning_rate": 0.00014472282687634432, + "loss": 12.3607, + "step": 13657 + }, + { + "epoch": 0.743732685330774, + "grad_norm": 0.5740431821060552, + "learning_rate": 0.00014471493947728673, + "loss": 12.2158, + "step": 13658 + }, + { + "epoch": 0.743787139327357, + "grad_norm": 0.650154708336648, + "learning_rate": 0.00014470705173051827, + "loss": 12.3459, + "step": 13659 + }, + { + "epoch": 0.7438415933239401, + "grad_norm": 0.6166673090641452, + "learning_rate": 0.0001446991636361003, + "loss": 12.2296, + "step": 13660 + }, + { + "epoch": 0.7438960473205231, + "grad_norm": 0.6630240435802148, + "learning_rate": 0.00014469127519409414, + "loss": 12.2581, + "step": 13661 + }, + { + "epoch": 0.743950501317106, + "grad_norm": 0.6285502485037263, + "learning_rate": 0.0001446833864045611, + "loss": 12.3641, + "step": 13662 + }, + { + "epoch": 0.744004955313689, + "grad_norm": 0.6243840836632044, + "learning_rate": 0.00014467549726756256, + "loss": 12.2185, + "step": 13663 + }, + { + "epoch": 0.744059409310272, + "grad_norm": 0.594732247309961, + "learning_rate": 0.00014466760778315986, + "loss": 12.1288, + "step": 13664 + }, + { + "epoch": 0.744113863306855, + "grad_norm": 0.5717664112534702, + "learning_rate": 0.00014465971795141436, + "loss": 12.2183, + "step": 13665 + }, + { + "epoch": 0.7441683173034381, + "grad_norm": 0.6705905603648593, + "learning_rate": 0.0001446518277723874, + "loss": 12.3219, + "step": 13666 + }, + { + "epoch": 0.7442227713000211, + "grad_norm": 0.6370616353146389, + "learning_rate": 0.00014464393724614033, + "loss": 12.3651, + "step": 13667 + }, + { + "epoch": 0.7442772252966041, + "grad_norm": 0.558520232425937, + "learning_rate": 0.00014463604637273453, + "loss": 12.2007, + "step": 13668 + }, + { + "epoch": 0.7443316792931871, + "grad_norm": 0.6498084660186766, + "learning_rate": 0.00014462815515223137, + "loss": 12.3098, + "step": 13669 + }, + { + "epoch": 0.7443861332897701, + "grad_norm": 0.7689024411652365, + "learning_rate": 0.00014462026358469214, + "loss": 12.232, + "step": 13670 + }, + { + "epoch": 0.7444405872863531, + "grad_norm": 0.5720853515258976, + "learning_rate": 0.0001446123716701783, + "loss": 12.2595, + "step": 13671 + }, + { + "epoch": 0.7444950412829362, + "grad_norm": 0.5988148788680487, + "learning_rate": 0.00014460447940875114, + "loss": 12.3409, + "step": 13672 + }, + { + "epoch": 0.7445494952795192, + "grad_norm": 0.7856904621089689, + "learning_rate": 0.0001445965868004721, + "loss": 12.3772, + "step": 13673 + }, + { + "epoch": 0.7446039492761022, + "grad_norm": 0.7043842824252822, + "learning_rate": 0.0001445886938454025, + "loss": 12.2397, + "step": 13674 + }, + { + "epoch": 0.7446584032726852, + "grad_norm": 0.5913036266727861, + "learning_rate": 0.00014458080054360374, + "loss": 12.2747, + "step": 13675 + }, + { + "epoch": 0.7447128572692682, + "grad_norm": 0.6960974142798146, + "learning_rate": 0.00014457290689513723, + "loss": 12.3003, + "step": 13676 + }, + { + "epoch": 0.7447673112658513, + "grad_norm": 0.6087124615164062, + "learning_rate": 0.00014456501290006427, + "loss": 12.2339, + "step": 13677 + }, + { + "epoch": 0.7448217652624343, + "grad_norm": 0.5571231214937553, + "learning_rate": 0.00014455711855844636, + "loss": 12.1596, + "step": 13678 + }, + { + "epoch": 0.7448762192590173, + "grad_norm": 0.5879599096930934, + "learning_rate": 0.00014454922387034476, + "loss": 12.2023, + "step": 13679 + }, + { + "epoch": 0.7449306732556003, + "grad_norm": 0.6587926585973795, + "learning_rate": 0.00014454132883582097, + "loss": 12.3255, + "step": 13680 + }, + { + "epoch": 0.7449851272521832, + "grad_norm": 0.5565501984200791, + "learning_rate": 0.00014453343345493633, + "loss": 12.2422, + "step": 13681 + }, + { + "epoch": 0.7450395812487662, + "grad_norm": 0.6054602481476327, + "learning_rate": 0.00014452553772775225, + "loss": 12.1699, + "step": 13682 + }, + { + "epoch": 0.7450940352453493, + "grad_norm": 0.6502136729803876, + "learning_rate": 0.00014451764165433008, + "loss": 12.0922, + "step": 13683 + }, + { + "epoch": 0.7451484892419323, + "grad_norm": 0.5510847853537779, + "learning_rate": 0.0001445097452347313, + "loss": 12.1959, + "step": 13684 + }, + { + "epoch": 0.7452029432385153, + "grad_norm": 0.5850713060797119, + "learning_rate": 0.00014450184846901724, + "loss": 12.1878, + "step": 13685 + }, + { + "epoch": 0.7452573972350983, + "grad_norm": 0.6319530785149593, + "learning_rate": 0.00014449395135724937, + "loss": 12.264, + "step": 13686 + }, + { + "epoch": 0.7453118512316813, + "grad_norm": 0.554872046596875, + "learning_rate": 0.0001444860538994891, + "loss": 12.1727, + "step": 13687 + }, + { + "epoch": 0.7453663052282643, + "grad_norm": 0.5718499206730574, + "learning_rate": 0.00014447815609579777, + "loss": 12.1314, + "step": 13688 + }, + { + "epoch": 0.7454207592248474, + "grad_norm": 0.5848613944881548, + "learning_rate": 0.00014447025794623686, + "loss": 12.189, + "step": 13689 + }, + { + "epoch": 0.7454752132214304, + "grad_norm": 0.5843558986303549, + "learning_rate": 0.00014446235945086774, + "loss": 12.3481, + "step": 13690 + }, + { + "epoch": 0.7455296672180134, + "grad_norm": 0.5790485101159226, + "learning_rate": 0.0001444544606097519, + "loss": 12.2546, + "step": 13691 + }, + { + "epoch": 0.7455841212145964, + "grad_norm": 0.5908234248192212, + "learning_rate": 0.0001444465614229507, + "loss": 12.1865, + "step": 13692 + }, + { + "epoch": 0.7456385752111794, + "grad_norm": 0.5979218535232742, + "learning_rate": 0.00014443866189052559, + "loss": 12.2428, + "step": 13693 + }, + { + "epoch": 0.7456930292077624, + "grad_norm": 0.5620026112583111, + "learning_rate": 0.00014443076201253795, + "loss": 12.3382, + "step": 13694 + }, + { + "epoch": 0.7457474832043455, + "grad_norm": 0.6823118955874878, + "learning_rate": 0.00014442286178904928, + "loss": 12.325, + "step": 13695 + }, + { + "epoch": 0.7458019372009285, + "grad_norm": 0.6067039441439942, + "learning_rate": 0.000144414961220121, + "loss": 12.3408, + "step": 13696 + }, + { + "epoch": 0.7458563911975115, + "grad_norm": 0.7007921904958164, + "learning_rate": 0.00014440706030581456, + "loss": 12.3337, + "step": 13697 + }, + { + "epoch": 0.7459108451940945, + "grad_norm": 0.6113298266092536, + "learning_rate": 0.00014439915904619134, + "loss": 12.2801, + "step": 13698 + }, + { + "epoch": 0.7459652991906774, + "grad_norm": 0.5916639524324443, + "learning_rate": 0.00014439125744131282, + "loss": 12.1633, + "step": 13699 + }, + { + "epoch": 0.7460197531872604, + "grad_norm": 0.604444000681318, + "learning_rate": 0.0001443833554912404, + "loss": 12.315, + "step": 13700 + }, + { + "epoch": 0.7460742071838435, + "grad_norm": 0.5307951087338542, + "learning_rate": 0.0001443754531960356, + "loss": 12.1628, + "step": 13701 + }, + { + "epoch": 0.7461286611804265, + "grad_norm": 0.6424748872034338, + "learning_rate": 0.00014436755055575984, + "loss": 12.2518, + "step": 13702 + }, + { + "epoch": 0.7461831151770095, + "grad_norm": 0.5858372985468218, + "learning_rate": 0.00014435964757047458, + "loss": 12.1831, + "step": 13703 + }, + { + "epoch": 0.7462375691735925, + "grad_norm": 0.7016042162035304, + "learning_rate": 0.00014435174424024124, + "loss": 12.3785, + "step": 13704 + }, + { + "epoch": 0.7462920231701755, + "grad_norm": 0.5351112745115311, + "learning_rate": 0.00014434384056512126, + "loss": 12.0123, + "step": 13705 + }, + { + "epoch": 0.7463464771667585, + "grad_norm": 0.6004125768755129, + "learning_rate": 0.00014433593654517618, + "loss": 12.2861, + "step": 13706 + }, + { + "epoch": 0.7464009311633416, + "grad_norm": 0.5993177925145355, + "learning_rate": 0.00014432803218046746, + "loss": 12.265, + "step": 13707 + }, + { + "epoch": 0.7464553851599246, + "grad_norm": 0.50355922016749, + "learning_rate": 0.00014432012747105647, + "loss": 12.0824, + "step": 13708 + }, + { + "epoch": 0.7465098391565076, + "grad_norm": 0.5574649579642977, + "learning_rate": 0.00014431222241700475, + "loss": 12.1988, + "step": 13709 + }, + { + "epoch": 0.7465642931530906, + "grad_norm": 0.6904411913829945, + "learning_rate": 0.00014430431701837376, + "loss": 12.2252, + "step": 13710 + }, + { + "epoch": 0.7466187471496736, + "grad_norm": 0.5940225992786953, + "learning_rate": 0.00014429641127522495, + "loss": 11.9661, + "step": 13711 + }, + { + "epoch": 0.7466732011462567, + "grad_norm": 0.5574148268284755, + "learning_rate": 0.00014428850518761986, + "loss": 12.318, + "step": 13712 + }, + { + "epoch": 0.7467276551428397, + "grad_norm": 0.5236783246576344, + "learning_rate": 0.0001442805987556199, + "loss": 12.0991, + "step": 13713 + }, + { + "epoch": 0.7467821091394227, + "grad_norm": 0.5820653271899563, + "learning_rate": 0.0001442726919792866, + "loss": 12.3288, + "step": 13714 + }, + { + "epoch": 0.7468365631360057, + "grad_norm": 0.5932316400669257, + "learning_rate": 0.0001442647848586814, + "loss": 12.3073, + "step": 13715 + }, + { + "epoch": 0.7468910171325887, + "grad_norm": 0.6467755435657985, + "learning_rate": 0.0001442568773938658, + "loss": 12.1978, + "step": 13716 + }, + { + "epoch": 0.7469454711291716, + "grad_norm": 0.5516042158353747, + "learning_rate": 0.00014424896958490133, + "loss": 12.0761, + "step": 13717 + }, + { + "epoch": 0.7469999251257547, + "grad_norm": 0.5613663038502804, + "learning_rate": 0.00014424106143184944, + "loss": 12.187, + "step": 13718 + }, + { + "epoch": 0.7470543791223377, + "grad_norm": 0.6013135202519371, + "learning_rate": 0.00014423315293477163, + "loss": 12.3413, + "step": 13719 + }, + { + "epoch": 0.7471088331189207, + "grad_norm": 0.5628113838646053, + "learning_rate": 0.0001442252440937294, + "loss": 12.2496, + "step": 13720 + }, + { + "epoch": 0.7471632871155037, + "grad_norm": 0.5904699220530979, + "learning_rate": 0.00014421733490878425, + "loss": 12.2199, + "step": 13721 + }, + { + "epoch": 0.7472177411120867, + "grad_norm": 0.6257160288719434, + "learning_rate": 0.00014420942537999773, + "loss": 12.2555, + "step": 13722 + }, + { + "epoch": 0.7472721951086697, + "grad_norm": 0.5439262444923114, + "learning_rate": 0.00014420151550743125, + "loss": 12.2134, + "step": 13723 + }, + { + "epoch": 0.7473266491052528, + "grad_norm": 0.6459417849868265, + "learning_rate": 0.00014419360529114642, + "loss": 12.2833, + "step": 13724 + }, + { + "epoch": 0.7473811031018358, + "grad_norm": 0.62630629000918, + "learning_rate": 0.00014418569473120468, + "loss": 12.2979, + "step": 13725 + }, + { + "epoch": 0.7474355570984188, + "grad_norm": 0.5777191040932425, + "learning_rate": 0.00014417778382766757, + "loss": 12.211, + "step": 13726 + }, + { + "epoch": 0.7474900110950018, + "grad_norm": 0.6563049547344048, + "learning_rate": 0.00014416987258059663, + "loss": 12.3766, + "step": 13727 + }, + { + "epoch": 0.7475444650915848, + "grad_norm": 0.530946466034582, + "learning_rate": 0.0001441619609900533, + "loss": 12.187, + "step": 13728 + }, + { + "epoch": 0.7475989190881678, + "grad_norm": 0.5851608738144949, + "learning_rate": 0.0001441540490560992, + "loss": 12.1812, + "step": 13729 + }, + { + "epoch": 0.7476533730847509, + "grad_norm": 0.623454123044338, + "learning_rate": 0.0001441461367787958, + "loss": 12.2312, + "step": 13730 + }, + { + "epoch": 0.7477078270813339, + "grad_norm": 0.6151173822258862, + "learning_rate": 0.00014413822415820465, + "loss": 12.272, + "step": 13731 + }, + { + "epoch": 0.7477622810779169, + "grad_norm": 0.6270601805006041, + "learning_rate": 0.00014413031119438723, + "loss": 12.2757, + "step": 13732 + }, + { + "epoch": 0.7478167350744999, + "grad_norm": 0.5982856037425696, + "learning_rate": 0.00014412239788740513, + "loss": 12.2053, + "step": 13733 + }, + { + "epoch": 0.7478711890710829, + "grad_norm": 0.6499148875444776, + "learning_rate": 0.00014411448423731985, + "loss": 12.3002, + "step": 13734 + }, + { + "epoch": 0.7479256430676658, + "grad_norm": 0.566224351321186, + "learning_rate": 0.00014410657024419295, + "loss": 12.1763, + "step": 13735 + }, + { + "epoch": 0.747980097064249, + "grad_norm": 0.6627051566300732, + "learning_rate": 0.00014409865590808598, + "loss": 12.2677, + "step": 13736 + }, + { + "epoch": 0.7480345510608319, + "grad_norm": 0.646344114628114, + "learning_rate": 0.00014409074122906048, + "loss": 12.2699, + "step": 13737 + }, + { + "epoch": 0.7480890050574149, + "grad_norm": 0.5937235560428703, + "learning_rate": 0.00014408282620717794, + "loss": 12.2404, + "step": 13738 + }, + { + "epoch": 0.7481434590539979, + "grad_norm": 0.5806014220082066, + "learning_rate": 0.00014407491084249995, + "loss": 12.2855, + "step": 13739 + }, + { + "epoch": 0.7481979130505809, + "grad_norm": 0.5659780798715816, + "learning_rate": 0.0001440669951350881, + "loss": 12.3017, + "step": 13740 + }, + { + "epoch": 0.7482523670471639, + "grad_norm": 0.6227277218313976, + "learning_rate": 0.00014405907908500388, + "loss": 12.2454, + "step": 13741 + }, + { + "epoch": 0.748306821043747, + "grad_norm": 0.6664710496279762, + "learning_rate": 0.0001440511626923089, + "loss": 12.3328, + "step": 13742 + }, + { + "epoch": 0.74836127504033, + "grad_norm": 0.5970770618016265, + "learning_rate": 0.00014404324595706464, + "loss": 12.2713, + "step": 13743 + }, + { + "epoch": 0.748415729036913, + "grad_norm": 0.578291694869162, + "learning_rate": 0.00014403532887933274, + "loss": 12.3062, + "step": 13744 + }, + { + "epoch": 0.748470183033496, + "grad_norm": 0.6416824753124646, + "learning_rate": 0.00014402741145917475, + "loss": 12.0758, + "step": 13745 + }, + { + "epoch": 0.748524637030079, + "grad_norm": 0.5655552786892055, + "learning_rate": 0.00014401949369665222, + "loss": 12.1638, + "step": 13746 + }, + { + "epoch": 0.7485790910266621, + "grad_norm": 0.6710049036354784, + "learning_rate": 0.00014401157559182674, + "loss": 12.285, + "step": 13747 + }, + { + "epoch": 0.7486335450232451, + "grad_norm": 0.6801326451669022, + "learning_rate": 0.00014400365714475986, + "loss": 12.3959, + "step": 13748 + }, + { + "epoch": 0.7486879990198281, + "grad_norm": 0.5368301698649847, + "learning_rate": 0.00014399573835551313, + "loss": 12.2605, + "step": 13749 + }, + { + "epoch": 0.7487424530164111, + "grad_norm": 0.6540587730560741, + "learning_rate": 0.00014398781922414817, + "loss": 12.3747, + "step": 13750 + }, + { + "epoch": 0.7487969070129941, + "grad_norm": 0.6016638900317841, + "learning_rate": 0.00014397989975072656, + "loss": 12.3126, + "step": 13751 + }, + { + "epoch": 0.748851361009577, + "grad_norm": 0.555668289888147, + "learning_rate": 0.0001439719799353099, + "loss": 12.3146, + "step": 13752 + }, + { + "epoch": 0.7489058150061602, + "grad_norm": 0.6973329845200757, + "learning_rate": 0.00014396405977795972, + "loss": 12.2642, + "step": 13753 + }, + { + "epoch": 0.7489602690027432, + "grad_norm": 0.5280970056108852, + "learning_rate": 0.00014395613927873765, + "loss": 12.2146, + "step": 13754 + }, + { + "epoch": 0.7490147229993261, + "grad_norm": 0.5899667507355091, + "learning_rate": 0.00014394821843770526, + "loss": 12.2097, + "step": 13755 + }, + { + "epoch": 0.7490691769959091, + "grad_norm": 0.5202820450953016, + "learning_rate": 0.00014394029725492416, + "loss": 12.2154, + "step": 13756 + }, + { + "epoch": 0.7491236309924921, + "grad_norm": 0.5798168456983885, + "learning_rate": 0.00014393237573045596, + "loss": 12.2218, + "step": 13757 + }, + { + "epoch": 0.7491780849890751, + "grad_norm": 0.5757177192193262, + "learning_rate": 0.0001439244538643622, + "loss": 12.1298, + "step": 13758 + }, + { + "epoch": 0.7492325389856582, + "grad_norm": 0.5439117109561773, + "learning_rate": 0.00014391653165670454, + "loss": 12.2106, + "step": 13759 + }, + { + "epoch": 0.7492869929822412, + "grad_norm": 0.5665992283629767, + "learning_rate": 0.00014390860910754453, + "loss": 12.2342, + "step": 13760 + }, + { + "epoch": 0.7493414469788242, + "grad_norm": 0.610271080039589, + "learning_rate": 0.00014390068621694387, + "loss": 12.2926, + "step": 13761 + }, + { + "epoch": 0.7493959009754072, + "grad_norm": 0.5701767578829343, + "learning_rate": 0.0001438927629849641, + "loss": 12.23, + "step": 13762 + }, + { + "epoch": 0.7494503549719902, + "grad_norm": 0.7850785013536542, + "learning_rate": 0.00014388483941166682, + "loss": 12.2746, + "step": 13763 + }, + { + "epoch": 0.7495048089685732, + "grad_norm": 0.6326742294350892, + "learning_rate": 0.0001438769154971137, + "loss": 12.2667, + "step": 13764 + }, + { + "epoch": 0.7495592629651563, + "grad_norm": 0.5210535123007397, + "learning_rate": 0.0001438689912413663, + "loss": 12.2196, + "step": 13765 + }, + { + "epoch": 0.7496137169617393, + "grad_norm": 0.5766175886374203, + "learning_rate": 0.00014386106664448625, + "loss": 12.2693, + "step": 13766 + }, + { + "epoch": 0.7496681709583223, + "grad_norm": 0.6178574912384032, + "learning_rate": 0.00014385314170653523, + "loss": 12.252, + "step": 13767 + }, + { + "epoch": 0.7497226249549053, + "grad_norm": 0.5915591393770658, + "learning_rate": 0.0001438452164275748, + "loss": 12.2551, + "step": 13768 + }, + { + "epoch": 0.7497770789514883, + "grad_norm": 0.6269592287242665, + "learning_rate": 0.00014383729080766664, + "loss": 12.2719, + "step": 13769 + }, + { + "epoch": 0.7498315329480713, + "grad_norm": 0.5609303240083141, + "learning_rate": 0.00014382936484687233, + "loss": 12.1725, + "step": 13770 + }, + { + "epoch": 0.7498859869446544, + "grad_norm": 0.6438885235253843, + "learning_rate": 0.00014382143854525353, + "loss": 12.333, + "step": 13771 + }, + { + "epoch": 0.7499404409412374, + "grad_norm": 0.5705368582587276, + "learning_rate": 0.00014381351190287188, + "loss": 12.2322, + "step": 13772 + }, + { + "epoch": 0.7499948949378203, + "grad_norm": 0.6498876610872063, + "learning_rate": 0.000143805584919789, + "loss": 12.3526, + "step": 13773 + }, + { + "epoch": 0.7500493489344033, + "grad_norm": 0.5829378708822657, + "learning_rate": 0.00014379765759606658, + "loss": 12.146, + "step": 13774 + }, + { + "epoch": 0.7501038029309863, + "grad_norm": 0.5781624917520148, + "learning_rate": 0.00014378972993176622, + "loss": 12.2652, + "step": 13775 + }, + { + "epoch": 0.7501582569275693, + "grad_norm": 0.5605017438627762, + "learning_rate": 0.00014378180192694957, + "loss": 12.1909, + "step": 13776 + }, + { + "epoch": 0.7502127109241524, + "grad_norm": 0.5589222728784693, + "learning_rate": 0.00014377387358167828, + "loss": 12.159, + "step": 13777 + }, + { + "epoch": 0.7502671649207354, + "grad_norm": 0.5655748541001888, + "learning_rate": 0.00014376594489601402, + "loss": 12.1719, + "step": 13778 + }, + { + "epoch": 0.7503216189173184, + "grad_norm": 0.9229516569274044, + "learning_rate": 0.00014375801587001842, + "loss": 12.1961, + "step": 13779 + }, + { + "epoch": 0.7503760729139014, + "grad_norm": 0.5190587246726092, + "learning_rate": 0.00014375008650375313, + "loss": 12.1197, + "step": 13780 + }, + { + "epoch": 0.7504305269104844, + "grad_norm": 0.5842286466771999, + "learning_rate": 0.0001437421567972799, + "loss": 12.0913, + "step": 13781 + }, + { + "epoch": 0.7504849809070675, + "grad_norm": 0.5366466263553302, + "learning_rate": 0.00014373422675066023, + "loss": 12.1292, + "step": 13782 + }, + { + "epoch": 0.7505394349036505, + "grad_norm": 0.6292072846094948, + "learning_rate": 0.00014372629636395597, + "loss": 12.3445, + "step": 13783 + }, + { + "epoch": 0.7505938889002335, + "grad_norm": 0.6518660654831823, + "learning_rate": 0.00014371836563722865, + "loss": 12.3205, + "step": 13784 + }, + { + "epoch": 0.7506483428968165, + "grad_norm": 0.5621702149563956, + "learning_rate": 0.00014371043457054, + "loss": 12.3394, + "step": 13785 + }, + { + "epoch": 0.7507027968933995, + "grad_norm": 0.5716000303103266, + "learning_rate": 0.00014370250316395167, + "loss": 12.2124, + "step": 13786 + }, + { + "epoch": 0.7507572508899825, + "grad_norm": 0.527542076552203, + "learning_rate": 0.00014369457141752534, + "loss": 12.2448, + "step": 13787 + }, + { + "epoch": 0.7508117048865656, + "grad_norm": 0.6303658246608846, + "learning_rate": 0.0001436866393313227, + "loss": 12.3021, + "step": 13788 + }, + { + "epoch": 0.7508661588831486, + "grad_norm": 0.5276474677797767, + "learning_rate": 0.00014367870690540544, + "loss": 12.2299, + "step": 13789 + }, + { + "epoch": 0.7509206128797316, + "grad_norm": 0.6712221790438491, + "learning_rate": 0.00014367077413983523, + "loss": 12.2085, + "step": 13790 + }, + { + "epoch": 0.7509750668763145, + "grad_norm": 0.534081814151848, + "learning_rate": 0.00014366284103467373, + "loss": 12.1906, + "step": 13791 + }, + { + "epoch": 0.7510295208728975, + "grad_norm": 0.5951491738825837, + "learning_rate": 0.0001436549075899827, + "loss": 12.2514, + "step": 13792 + }, + { + "epoch": 0.7510839748694805, + "grad_norm": 0.6803997052913654, + "learning_rate": 0.00014364697380582375, + "loss": 12.2647, + "step": 13793 + }, + { + "epoch": 0.7511384288660636, + "grad_norm": 0.5874316064236298, + "learning_rate": 0.00014363903968225863, + "loss": 12.3495, + "step": 13794 + }, + { + "epoch": 0.7511928828626466, + "grad_norm": 0.5356708641919317, + "learning_rate": 0.000143631105219349, + "loss": 12.2118, + "step": 13795 + }, + { + "epoch": 0.7512473368592296, + "grad_norm": 0.6295249693626224, + "learning_rate": 0.00014362317041715657, + "loss": 12.1805, + "step": 13796 + }, + { + "epoch": 0.7513017908558126, + "grad_norm": 1.03868303664006, + "learning_rate": 0.00014361523527574307, + "loss": 12.259, + "step": 13797 + }, + { + "epoch": 0.7513562448523956, + "grad_norm": 0.6257302393342892, + "learning_rate": 0.0001436072997951702, + "loss": 12.257, + "step": 13798 + }, + { + "epoch": 0.7514106988489786, + "grad_norm": 0.623335943617568, + "learning_rate": 0.00014359936397549965, + "loss": 12.3552, + "step": 13799 + }, + { + "epoch": 0.7514651528455617, + "grad_norm": 0.5684967086416157, + "learning_rate": 0.00014359142781679313, + "loss": 12.169, + "step": 13800 + }, + { + "epoch": 0.7515196068421447, + "grad_norm": 0.5688960037551076, + "learning_rate": 0.00014358349131911234, + "loss": 12.3163, + "step": 13801 + }, + { + "epoch": 0.7515740608387277, + "grad_norm": 0.5842685732263988, + "learning_rate": 0.00014357555448251902, + "loss": 12.2271, + "step": 13802 + }, + { + "epoch": 0.7516285148353107, + "grad_norm": 0.6143806368491119, + "learning_rate": 0.00014356761730707489, + "loss": 12.3893, + "step": 13803 + }, + { + "epoch": 0.7516829688318937, + "grad_norm": 0.6554362600326484, + "learning_rate": 0.00014355967979284167, + "loss": 12.2427, + "step": 13804 + }, + { + "epoch": 0.7517374228284767, + "grad_norm": 0.6311453226982938, + "learning_rate": 0.00014355174193988107, + "loss": 12.3582, + "step": 13805 + }, + { + "epoch": 0.7517918768250598, + "grad_norm": 0.6346421035651231, + "learning_rate": 0.0001435438037482548, + "loss": 12.2786, + "step": 13806 + }, + { + "epoch": 0.7518463308216428, + "grad_norm": 0.6117921300048075, + "learning_rate": 0.00014353586521802461, + "loss": 12.2096, + "step": 13807 + }, + { + "epoch": 0.7519007848182258, + "grad_norm": 0.6019476080731878, + "learning_rate": 0.0001435279263492523, + "loss": 12.3236, + "step": 13808 + }, + { + "epoch": 0.7519552388148087, + "grad_norm": 0.7514865124107512, + "learning_rate": 0.00014351998714199943, + "loss": 12.3547, + "step": 13809 + }, + { + "epoch": 0.7520096928113917, + "grad_norm": 0.5646877462273913, + "learning_rate": 0.0001435120475963279, + "loss": 12.2639, + "step": 13810 + }, + { + "epoch": 0.7520641468079748, + "grad_norm": 0.5891752717887218, + "learning_rate": 0.0001435041077122994, + "loss": 12.3077, + "step": 13811 + }, + { + "epoch": 0.7521186008045578, + "grad_norm": 0.5862291042415606, + "learning_rate": 0.00014349616748997562, + "loss": 12.2445, + "step": 13812 + }, + { + "epoch": 0.7521730548011408, + "grad_norm": 0.614589247223749, + "learning_rate": 0.00014348822692941842, + "loss": 12.3795, + "step": 13813 + }, + { + "epoch": 0.7522275087977238, + "grad_norm": 0.5813380916070603, + "learning_rate": 0.00014348028603068942, + "loss": 12.2293, + "step": 13814 + }, + { + "epoch": 0.7522819627943068, + "grad_norm": 0.63080789984502, + "learning_rate": 0.0001434723447938504, + "loss": 12.2895, + "step": 13815 + }, + { + "epoch": 0.7523364167908898, + "grad_norm": 0.5631956520829646, + "learning_rate": 0.00014346440321896318, + "loss": 12.2301, + "step": 13816 + }, + { + "epoch": 0.7523908707874729, + "grad_norm": 0.5276970829388432, + "learning_rate": 0.00014345646130608944, + "loss": 12.0721, + "step": 13817 + }, + { + "epoch": 0.7524453247840559, + "grad_norm": 0.5801900867009577, + "learning_rate": 0.00014344851905529103, + "loss": 12.2662, + "step": 13818 + }, + { + "epoch": 0.7524997787806389, + "grad_norm": 0.620601218168845, + "learning_rate": 0.0001434405764666296, + "loss": 12.3432, + "step": 13819 + }, + { + "epoch": 0.7525542327772219, + "grad_norm": 0.5993138059551066, + "learning_rate": 0.00014343263354016695, + "loss": 12.246, + "step": 13820 + }, + { + "epoch": 0.7526086867738049, + "grad_norm": 0.5793428284321268, + "learning_rate": 0.00014342469027596487, + "loss": 12.2635, + "step": 13821 + }, + { + "epoch": 0.7526631407703879, + "grad_norm": 0.626149109871221, + "learning_rate": 0.00014341674667408513, + "loss": 12.0145, + "step": 13822 + }, + { + "epoch": 0.752717594766971, + "grad_norm": 0.6405112662502047, + "learning_rate": 0.0001434088027345895, + "loss": 12.463, + "step": 13823 + }, + { + "epoch": 0.752772048763554, + "grad_norm": 0.6023957152504208, + "learning_rate": 0.00014340085845753972, + "loss": 12.1497, + "step": 13824 + }, + { + "epoch": 0.752826502760137, + "grad_norm": 0.6355489896387884, + "learning_rate": 0.0001433929138429976, + "loss": 12.459, + "step": 13825 + }, + { + "epoch": 0.75288095675672, + "grad_norm": 0.6425241440639089, + "learning_rate": 0.00014338496889102487, + "loss": 12.311, + "step": 13826 + }, + { + "epoch": 0.752935410753303, + "grad_norm": 0.6167990707777722, + "learning_rate": 0.00014337702360168336, + "loss": 12.1696, + "step": 13827 + }, + { + "epoch": 0.7529898647498859, + "grad_norm": 0.6539399435350298, + "learning_rate": 0.00014336907797503487, + "loss": 12.2048, + "step": 13828 + }, + { + "epoch": 0.753044318746469, + "grad_norm": 0.6056473383463385, + "learning_rate": 0.00014336113201114114, + "loss": 12.1473, + "step": 13829 + }, + { + "epoch": 0.753098772743052, + "grad_norm": 0.5535186851656467, + "learning_rate": 0.00014335318571006398, + "loss": 12.2927, + "step": 13830 + }, + { + "epoch": 0.753153226739635, + "grad_norm": 0.6022086596989811, + "learning_rate": 0.00014334523907186513, + "loss": 12.2001, + "step": 13831 + }, + { + "epoch": 0.753207680736218, + "grad_norm": 0.591273556250499, + "learning_rate": 0.00014333729209660648, + "loss": 12.199, + "step": 13832 + }, + { + "epoch": 0.753262134732801, + "grad_norm": 0.5760100490032448, + "learning_rate": 0.00014332934478434982, + "loss": 12.1534, + "step": 13833 + }, + { + "epoch": 0.753316588729384, + "grad_norm": 0.6027884470696825, + "learning_rate": 0.00014332139713515684, + "loss": 12.3531, + "step": 13834 + }, + { + "epoch": 0.7533710427259671, + "grad_norm": 0.6205928858416357, + "learning_rate": 0.00014331344914908942, + "loss": 12.2737, + "step": 13835 + }, + { + "epoch": 0.7534254967225501, + "grad_norm": 0.5886492463854501, + "learning_rate": 0.00014330550082620937, + "loss": 12.263, + "step": 13836 + }, + { + "epoch": 0.7534799507191331, + "grad_norm": 0.5914234570101967, + "learning_rate": 0.00014329755216657849, + "loss": 12.0383, + "step": 13837 + }, + { + "epoch": 0.7535344047157161, + "grad_norm": 0.6410193708975239, + "learning_rate": 0.00014328960317025856, + "loss": 12.281, + "step": 13838 + }, + { + "epoch": 0.7535888587122991, + "grad_norm": 0.5314330429599432, + "learning_rate": 0.00014328165383731145, + "loss": 12.2602, + "step": 13839 + }, + { + "epoch": 0.7536433127088821, + "grad_norm": 0.540957434758258, + "learning_rate": 0.0001432737041677989, + "loss": 12.2461, + "step": 13840 + }, + { + "epoch": 0.7536977667054652, + "grad_norm": 0.5432546815104532, + "learning_rate": 0.00014326575416178278, + "loss": 12.1897, + "step": 13841 + }, + { + "epoch": 0.7537522207020482, + "grad_norm": 0.5541501713891164, + "learning_rate": 0.00014325780381932492, + "loss": 12.164, + "step": 13842 + }, + { + "epoch": 0.7538066746986312, + "grad_norm": 0.7697095586952005, + "learning_rate": 0.0001432498531404871, + "loss": 12.1623, + "step": 13843 + }, + { + "epoch": 0.7538611286952142, + "grad_norm": 0.49945314514455, + "learning_rate": 0.0001432419021253312, + "loss": 12.2043, + "step": 13844 + }, + { + "epoch": 0.7539155826917971, + "grad_norm": 0.5374440022342415, + "learning_rate": 0.000143233950773919, + "loss": 12.1716, + "step": 13845 + }, + { + "epoch": 0.7539700366883803, + "grad_norm": 0.7339500382749483, + "learning_rate": 0.00014322599908631232, + "loss": 12.3037, + "step": 13846 + }, + { + "epoch": 0.7540244906849632, + "grad_norm": 0.5815419891307385, + "learning_rate": 0.00014321804706257307, + "loss": 12.2847, + "step": 13847 + }, + { + "epoch": 0.7540789446815462, + "grad_norm": 0.5297258689843342, + "learning_rate": 0.00014321009470276303, + "loss": 12.1873, + "step": 13848 + }, + { + "epoch": 0.7541333986781292, + "grad_norm": 0.6692806498141518, + "learning_rate": 0.000143202142006944, + "loss": 12.3004, + "step": 13849 + }, + { + "epoch": 0.7541878526747122, + "grad_norm": 0.517943893944968, + "learning_rate": 0.00014319418897517792, + "loss": 12.1011, + "step": 13850 + }, + { + "epoch": 0.7542423066712952, + "grad_norm": 0.5772763106403177, + "learning_rate": 0.00014318623560752658, + "loss": 12.3086, + "step": 13851 + }, + { + "epoch": 0.7542967606678783, + "grad_norm": 0.6149695725607878, + "learning_rate": 0.0001431782819040518, + "loss": 12.2392, + "step": 13852 + }, + { + "epoch": 0.7543512146644613, + "grad_norm": 0.6400262055660713, + "learning_rate": 0.00014317032786481548, + "loss": 12.4411, + "step": 13853 + }, + { + "epoch": 0.7544056686610443, + "grad_norm": 0.5576986266208344, + "learning_rate": 0.00014316237348987944, + "loss": 12.2759, + "step": 13854 + }, + { + "epoch": 0.7544601226576273, + "grad_norm": 0.6692229239456416, + "learning_rate": 0.00014315441877930555, + "loss": 12.2574, + "step": 13855 + }, + { + "epoch": 0.7545145766542103, + "grad_norm": 0.6281993479852913, + "learning_rate": 0.0001431464637331557, + "loss": 12.3431, + "step": 13856 + }, + { + "epoch": 0.7545690306507933, + "grad_norm": 0.5816207313651164, + "learning_rate": 0.00014313850835149166, + "loss": 12.1529, + "step": 13857 + }, + { + "epoch": 0.7546234846473764, + "grad_norm": 0.6051148573253917, + "learning_rate": 0.00014313055263437535, + "loss": 12.3998, + "step": 13858 + }, + { + "epoch": 0.7546779386439594, + "grad_norm": 0.557618965382382, + "learning_rate": 0.00014312259658186865, + "loss": 12.2016, + "step": 13859 + }, + { + "epoch": 0.7547323926405424, + "grad_norm": 0.5795972098301391, + "learning_rate": 0.0001431146401940334, + "loss": 12.2748, + "step": 13860 + }, + { + "epoch": 0.7547868466371254, + "grad_norm": 0.5661625741258683, + "learning_rate": 0.00014310668347093146, + "loss": 12.1477, + "step": 13861 + }, + { + "epoch": 0.7548413006337084, + "grad_norm": 0.5847540858969881, + "learning_rate": 0.00014309872641262475, + "loss": 12.2328, + "step": 13862 + }, + { + "epoch": 0.7548957546302913, + "grad_norm": 0.5954478677911897, + "learning_rate": 0.00014309076901917512, + "loss": 12.2756, + "step": 13863 + }, + { + "epoch": 0.7549502086268745, + "grad_norm": 0.5042730004768833, + "learning_rate": 0.00014308281129064442, + "loss": 12.159, + "step": 13864 + }, + { + "epoch": 0.7550046626234574, + "grad_norm": 0.6458120398598242, + "learning_rate": 0.00014307485322709456, + "loss": 12.1989, + "step": 13865 + }, + { + "epoch": 0.7550591166200404, + "grad_norm": 0.5819826806870918, + "learning_rate": 0.00014306689482858738, + "loss": 12.1708, + "step": 13866 + }, + { + "epoch": 0.7551135706166234, + "grad_norm": 0.552542592845485, + "learning_rate": 0.00014305893609518487, + "loss": 12.1174, + "step": 13867 + }, + { + "epoch": 0.7551680246132064, + "grad_norm": 0.6422678536801202, + "learning_rate": 0.00014305097702694883, + "loss": 12.2365, + "step": 13868 + }, + { + "epoch": 0.7552224786097894, + "grad_norm": 0.5456608259492738, + "learning_rate": 0.00014304301762394116, + "loss": 12.2812, + "step": 13869 + }, + { + "epoch": 0.7552769326063725, + "grad_norm": 0.5815679736576249, + "learning_rate": 0.00014303505788622374, + "loss": 12.2574, + "step": 13870 + }, + { + "epoch": 0.7553313866029555, + "grad_norm": 0.6311052988627255, + "learning_rate": 0.00014302709781385855, + "loss": 12.2308, + "step": 13871 + }, + { + "epoch": 0.7553858405995385, + "grad_norm": 0.6006336352942205, + "learning_rate": 0.0001430191374069074, + "loss": 12.3778, + "step": 13872 + }, + { + "epoch": 0.7554402945961215, + "grad_norm": 0.6255333097598189, + "learning_rate": 0.00014301117666543225, + "loss": 11.9867, + "step": 13873 + }, + { + "epoch": 0.7554947485927045, + "grad_norm": 0.5808851218717448, + "learning_rate": 0.00014300321558949496, + "loss": 12.172, + "step": 13874 + }, + { + "epoch": 0.7555492025892875, + "grad_norm": 0.661725093009846, + "learning_rate": 0.00014299525417915744, + "loss": 12.3623, + "step": 13875 + }, + { + "epoch": 0.7556036565858706, + "grad_norm": 0.6244849203626905, + "learning_rate": 0.00014298729243448162, + "loss": 12.2022, + "step": 13876 + }, + { + "epoch": 0.7556581105824536, + "grad_norm": 0.5769886983567915, + "learning_rate": 0.00014297933035552942, + "loss": 12.2396, + "step": 13877 + }, + { + "epoch": 0.7557125645790366, + "grad_norm": 0.5965097791921065, + "learning_rate": 0.00014297136794236273, + "loss": 12.2386, + "step": 13878 + }, + { + "epoch": 0.7557670185756196, + "grad_norm": 0.5921318795590744, + "learning_rate": 0.00014296340519504347, + "loss": 12.2327, + "step": 13879 + }, + { + "epoch": 0.7558214725722026, + "grad_norm": 0.6021718374457375, + "learning_rate": 0.00014295544211363357, + "loss": 12.2171, + "step": 13880 + }, + { + "epoch": 0.7558759265687857, + "grad_norm": 0.5942547115161655, + "learning_rate": 0.00014294747869819495, + "loss": 12.2583, + "step": 13881 + }, + { + "epoch": 0.7559303805653687, + "grad_norm": 0.5705233666422782, + "learning_rate": 0.00014293951494878955, + "loss": 12.1982, + "step": 13882 + }, + { + "epoch": 0.7559848345619516, + "grad_norm": 0.6233055567094691, + "learning_rate": 0.00014293155086547927, + "loss": 12.3178, + "step": 13883 + }, + { + "epoch": 0.7560392885585346, + "grad_norm": 0.5574722512875832, + "learning_rate": 0.00014292358644832603, + "loss": 12.4212, + "step": 13884 + }, + { + "epoch": 0.7560937425551176, + "grad_norm": 0.6029716387807774, + "learning_rate": 0.0001429156216973918, + "loss": 12.4054, + "step": 13885 + }, + { + "epoch": 0.7561481965517006, + "grad_norm": 0.588774065224866, + "learning_rate": 0.00014290765661273847, + "loss": 12.2661, + "step": 13886 + }, + { + "epoch": 0.7562026505482837, + "grad_norm": 0.5495595440606598, + "learning_rate": 0.00014289969119442804, + "loss": 12.364, + "step": 13887 + }, + { + "epoch": 0.7562571045448667, + "grad_norm": 0.6343815607337305, + "learning_rate": 0.00014289172544252246, + "loss": 12.2642, + "step": 13888 + }, + { + "epoch": 0.7563115585414497, + "grad_norm": 0.5732501696621901, + "learning_rate": 0.00014288375935708357, + "loss": 12.2799, + "step": 13889 + }, + { + "epoch": 0.7563660125380327, + "grad_norm": 0.5839071557316391, + "learning_rate": 0.0001428757929381734, + "loss": 12.1891, + "step": 13890 + }, + { + "epoch": 0.7564204665346157, + "grad_norm": 0.6679324286207482, + "learning_rate": 0.00014286782618585383, + "loss": 12.253, + "step": 13891 + }, + { + "epoch": 0.7564749205311987, + "grad_norm": 0.5527326954974784, + "learning_rate": 0.00014285985910018689, + "loss": 12.2295, + "step": 13892 + }, + { + "epoch": 0.7565293745277818, + "grad_norm": 0.5980791290120517, + "learning_rate": 0.0001428518916812345, + "loss": 12.3769, + "step": 13893 + }, + { + "epoch": 0.7565838285243648, + "grad_norm": 0.6069665356579306, + "learning_rate": 0.00014284392392905861, + "loss": 12.2317, + "step": 13894 + }, + { + "epoch": 0.7566382825209478, + "grad_norm": 0.5593302708857323, + "learning_rate": 0.0001428359558437212, + "loss": 12.1659, + "step": 13895 + }, + { + "epoch": 0.7566927365175308, + "grad_norm": 0.6431685721579361, + "learning_rate": 0.0001428279874252842, + "loss": 12.3359, + "step": 13896 + }, + { + "epoch": 0.7567471905141138, + "grad_norm": 0.6286285307146453, + "learning_rate": 0.00014282001867380953, + "loss": 12.261, + "step": 13897 + }, + { + "epoch": 0.7568016445106968, + "grad_norm": 0.5815019618216938, + "learning_rate": 0.00014281204958935929, + "loss": 12.2054, + "step": 13898 + }, + { + "epoch": 0.7568560985072799, + "grad_norm": 0.6448256396057778, + "learning_rate": 0.00014280408017199535, + "loss": 12.2712, + "step": 13899 + }, + { + "epoch": 0.7569105525038629, + "grad_norm": 0.5632502677636745, + "learning_rate": 0.0001427961104217797, + "loss": 12.3028, + "step": 13900 + }, + { + "epoch": 0.7569650065004458, + "grad_norm": 0.5939995631430055, + "learning_rate": 0.00014278814033877432, + "loss": 12.1262, + "step": 13901 + }, + { + "epoch": 0.7570194604970288, + "grad_norm": 0.610425379765044, + "learning_rate": 0.0001427801699230412, + "loss": 12.2151, + "step": 13902 + }, + { + "epoch": 0.7570739144936118, + "grad_norm": 0.5515460328017994, + "learning_rate": 0.0001427721991746423, + "loss": 12.2251, + "step": 13903 + }, + { + "epoch": 0.7571283684901948, + "grad_norm": 0.6204862537814627, + "learning_rate": 0.00014276422809363957, + "loss": 12.2634, + "step": 13904 + }, + { + "epoch": 0.7571828224867779, + "grad_norm": 0.582736281918877, + "learning_rate": 0.00014275625668009508, + "loss": 12.2089, + "step": 13905 + }, + { + "epoch": 0.7572372764833609, + "grad_norm": 0.5936936567696945, + "learning_rate": 0.00014274828493407072, + "loss": 12.2002, + "step": 13906 + }, + { + "epoch": 0.7572917304799439, + "grad_norm": 0.5530303776248853, + "learning_rate": 0.00014274031285562856, + "loss": 12.2321, + "step": 13907 + }, + { + "epoch": 0.7573461844765269, + "grad_norm": 0.5826100861165144, + "learning_rate": 0.00014273234044483054, + "loss": 12.3413, + "step": 13908 + }, + { + "epoch": 0.7574006384731099, + "grad_norm": 0.5578995915437032, + "learning_rate": 0.00014272436770173868, + "loss": 12.2443, + "step": 13909 + }, + { + "epoch": 0.7574550924696929, + "grad_norm": 0.5545115358555107, + "learning_rate": 0.000142716394626415, + "loss": 12.1602, + "step": 13910 + }, + { + "epoch": 0.757509546466276, + "grad_norm": 0.6002801986882448, + "learning_rate": 0.00014270842121892144, + "loss": 12.2704, + "step": 13911 + }, + { + "epoch": 0.757564000462859, + "grad_norm": 0.5875392696566134, + "learning_rate": 0.00014270044747932006, + "loss": 12.3167, + "step": 13912 + }, + { + "epoch": 0.757618454459442, + "grad_norm": 0.6594591775214861, + "learning_rate": 0.00014269247340767283, + "loss": 12.3784, + "step": 13913 + }, + { + "epoch": 0.757672908456025, + "grad_norm": 0.5947682770062622, + "learning_rate": 0.00014268449900404175, + "loss": 12.3146, + "step": 13914 + }, + { + "epoch": 0.757727362452608, + "grad_norm": 0.6568834496108618, + "learning_rate": 0.00014267652426848887, + "loss": 12.1571, + "step": 13915 + }, + { + "epoch": 0.7577818164491911, + "grad_norm": 0.6560720446581242, + "learning_rate": 0.00014266854920107617, + "loss": 12.1478, + "step": 13916 + }, + { + "epoch": 0.7578362704457741, + "grad_norm": 0.5738304909572517, + "learning_rate": 0.0001426605738018657, + "loss": 12.2459, + "step": 13917 + }, + { + "epoch": 0.757890724442357, + "grad_norm": 0.560306964319945, + "learning_rate": 0.0001426525980709194, + "loss": 12.3494, + "step": 13918 + }, + { + "epoch": 0.75794517843894, + "grad_norm": 0.6763115699225631, + "learning_rate": 0.00014264462200829937, + "loss": 12.3879, + "step": 13919 + }, + { + "epoch": 0.757999632435523, + "grad_norm": 0.5703290243771721, + "learning_rate": 0.00014263664561406763, + "loss": 12.3699, + "step": 13920 + }, + { + "epoch": 0.758054086432106, + "grad_norm": 0.6625014384246614, + "learning_rate": 0.00014262866888828615, + "loss": 12.2679, + "step": 13921 + }, + { + "epoch": 0.7581085404286891, + "grad_norm": 0.5769931413845019, + "learning_rate": 0.000142620691831017, + "loss": 12.2372, + "step": 13922 + }, + { + "epoch": 0.7581629944252721, + "grad_norm": 0.5949061529616914, + "learning_rate": 0.00014261271444232222, + "loss": 12.2714, + "step": 13923 + }, + { + "epoch": 0.7582174484218551, + "grad_norm": 0.5496679640085099, + "learning_rate": 0.00014260473672226382, + "loss": 12.2477, + "step": 13924 + }, + { + "epoch": 0.7582719024184381, + "grad_norm": 0.6493771055430315, + "learning_rate": 0.00014259675867090384, + "loss": 12.3036, + "step": 13925 + }, + { + "epoch": 0.7583263564150211, + "grad_norm": 0.5665578504631344, + "learning_rate": 0.00014258878028830432, + "loss": 12.3569, + "step": 13926 + }, + { + "epoch": 0.7583808104116041, + "grad_norm": 0.553654878253341, + "learning_rate": 0.0001425808015745273, + "loss": 12.2306, + "step": 13927 + }, + { + "epoch": 0.7584352644081872, + "grad_norm": 0.6336767633083233, + "learning_rate": 0.00014257282252963483, + "loss": 12.2879, + "step": 13928 + }, + { + "epoch": 0.7584897184047702, + "grad_norm": 0.6957955270092684, + "learning_rate": 0.00014256484315368896, + "loss": 12.3103, + "step": 13929 + }, + { + "epoch": 0.7585441724013532, + "grad_norm": 0.6296785833482869, + "learning_rate": 0.00014255686344675171, + "loss": 12.2726, + "step": 13930 + }, + { + "epoch": 0.7585986263979362, + "grad_norm": 0.6146755812721879, + "learning_rate": 0.00014254888340888518, + "loss": 12.3361, + "step": 13931 + }, + { + "epoch": 0.7586530803945192, + "grad_norm": 0.5695074475030054, + "learning_rate": 0.00014254090304015136, + "loss": 12.3646, + "step": 13932 + }, + { + "epoch": 0.7587075343911022, + "grad_norm": 0.5958776096596177, + "learning_rate": 0.00014253292234061237, + "loss": 12.2512, + "step": 13933 + }, + { + "epoch": 0.7587619883876853, + "grad_norm": 0.579919117409958, + "learning_rate": 0.00014252494131033027, + "loss": 12.2253, + "step": 13934 + }, + { + "epoch": 0.7588164423842683, + "grad_norm": 0.5915421701951903, + "learning_rate": 0.00014251695994936704, + "loss": 12.2334, + "step": 13935 + }, + { + "epoch": 0.7588708963808513, + "grad_norm": 0.6499479162687332, + "learning_rate": 0.00014250897825778482, + "loss": 12.4318, + "step": 13936 + }, + { + "epoch": 0.7589253503774342, + "grad_norm": 0.5295834541077248, + "learning_rate": 0.00014250099623564565, + "loss": 12.1041, + "step": 13937 + }, + { + "epoch": 0.7589798043740172, + "grad_norm": 0.5251911988552174, + "learning_rate": 0.00014249301388301165, + "loss": 12.2811, + "step": 13938 + }, + { + "epoch": 0.7590342583706002, + "grad_norm": 0.6043959090041436, + "learning_rate": 0.00014248503119994484, + "loss": 12.246, + "step": 13939 + }, + { + "epoch": 0.7590887123671833, + "grad_norm": 0.577799259201176, + "learning_rate": 0.00014247704818650723, + "loss": 12.1886, + "step": 13940 + }, + { + "epoch": 0.7591431663637663, + "grad_norm": 0.6230601576377133, + "learning_rate": 0.00014246906484276104, + "loss": 12.2068, + "step": 13941 + }, + { + "epoch": 0.7591976203603493, + "grad_norm": 0.6102940610921074, + "learning_rate": 0.00014246108116876824, + "loss": 12.1497, + "step": 13942 + }, + { + "epoch": 0.7592520743569323, + "grad_norm": 0.638796409373731, + "learning_rate": 0.00014245309716459098, + "loss": 12.1821, + "step": 13943 + }, + { + "epoch": 0.7593065283535153, + "grad_norm": 0.60346867893492, + "learning_rate": 0.00014244511283029133, + "loss": 12.2799, + "step": 13944 + }, + { + "epoch": 0.7593609823500984, + "grad_norm": 0.5569590445930767, + "learning_rate": 0.00014243712816593132, + "loss": 12.1409, + "step": 13945 + }, + { + "epoch": 0.7594154363466814, + "grad_norm": 0.6095387293968589, + "learning_rate": 0.0001424291431715731, + "loss": 12.3211, + "step": 13946 + }, + { + "epoch": 0.7594698903432644, + "grad_norm": 0.6054626111032342, + "learning_rate": 0.00014242115784727873, + "loss": 12.281, + "step": 13947 + }, + { + "epoch": 0.7595243443398474, + "grad_norm": 0.5724240846205761, + "learning_rate": 0.00014241317219311036, + "loss": 12.375, + "step": 13948 + }, + { + "epoch": 0.7595787983364304, + "grad_norm": 0.561696636977615, + "learning_rate": 0.00014240518620913005, + "loss": 12.2647, + "step": 13949 + }, + { + "epoch": 0.7596332523330134, + "grad_norm": 0.5821499988373755, + "learning_rate": 0.0001423971998953999, + "loss": 12.1901, + "step": 13950 + }, + { + "epoch": 0.7596877063295965, + "grad_norm": 0.5776864442082547, + "learning_rate": 0.000142389213251982, + "loss": 12.2944, + "step": 13951 + }, + { + "epoch": 0.7597421603261795, + "grad_norm": 0.5354845840539186, + "learning_rate": 0.00014238122627893845, + "loss": 12.1452, + "step": 13952 + }, + { + "epoch": 0.7597966143227625, + "grad_norm": 0.5960205210876889, + "learning_rate": 0.0001423732389763314, + "loss": 12.0842, + "step": 13953 + }, + { + "epoch": 0.7598510683193455, + "grad_norm": 0.6156684709857669, + "learning_rate": 0.00014236525134422295, + "loss": 12.2972, + "step": 13954 + }, + { + "epoch": 0.7599055223159285, + "grad_norm": 0.5610810124186224, + "learning_rate": 0.00014235726338267517, + "loss": 12.2933, + "step": 13955 + }, + { + "epoch": 0.7599599763125114, + "grad_norm": 0.6195395184747021, + "learning_rate": 0.00014234927509175022, + "loss": 12.2857, + "step": 13956 + }, + { + "epoch": 0.7600144303090945, + "grad_norm": 0.5602682088640545, + "learning_rate": 0.00014234128647151018, + "loss": 12.093, + "step": 13957 + }, + { + "epoch": 0.7600688843056775, + "grad_norm": 0.722577336954991, + "learning_rate": 0.00014233329752201722, + "loss": 12.2452, + "step": 13958 + }, + { + "epoch": 0.7601233383022605, + "grad_norm": 0.5471100875030792, + "learning_rate": 0.00014232530824333348, + "loss": 12.2115, + "step": 13959 + }, + { + "epoch": 0.7601777922988435, + "grad_norm": 0.6606957761725386, + "learning_rate": 0.000142317318635521, + "loss": 12.2199, + "step": 13960 + }, + { + "epoch": 0.7602322462954265, + "grad_norm": 0.5268687412814926, + "learning_rate": 0.00014230932869864195, + "loss": 12.092, + "step": 13961 + }, + { + "epoch": 0.7602867002920095, + "grad_norm": 0.607743751189324, + "learning_rate": 0.00014230133843275847, + "loss": 12.2734, + "step": 13962 + }, + { + "epoch": 0.7603411542885926, + "grad_norm": 0.5600065514983386, + "learning_rate": 0.00014229334783793268, + "loss": 12.0704, + "step": 13963 + }, + { + "epoch": 0.7603956082851756, + "grad_norm": 0.5695028254072064, + "learning_rate": 0.00014228535691422674, + "loss": 12.2109, + "step": 13964 + }, + { + "epoch": 0.7604500622817586, + "grad_norm": 0.5815235519116067, + "learning_rate": 0.0001422773656617028, + "loss": 12.1929, + "step": 13965 + }, + { + "epoch": 0.7605045162783416, + "grad_norm": 0.5251375307774158, + "learning_rate": 0.00014226937408042293, + "loss": 12.2333, + "step": 13966 + }, + { + "epoch": 0.7605589702749246, + "grad_norm": 0.5857315453039587, + "learning_rate": 0.00014226138217044937, + "loss": 12.2082, + "step": 13967 + }, + { + "epoch": 0.7606134242715076, + "grad_norm": 0.577963728586446, + "learning_rate": 0.00014225338993184417, + "loss": 12.2695, + "step": 13968 + }, + { + "epoch": 0.7606678782680907, + "grad_norm": 0.5513473816586468, + "learning_rate": 0.0001422453973646695, + "loss": 12.3888, + "step": 13969 + }, + { + "epoch": 0.7607223322646737, + "grad_norm": 0.6016650383635523, + "learning_rate": 0.0001422374044689876, + "loss": 12.1849, + "step": 13970 + }, + { + "epoch": 0.7607767862612567, + "grad_norm": 0.6370471484189492, + "learning_rate": 0.00014222941124486052, + "loss": 12.3189, + "step": 13971 + }, + { + "epoch": 0.7608312402578397, + "grad_norm": 0.5499723366235015, + "learning_rate": 0.00014222141769235048, + "loss": 12.2014, + "step": 13972 + }, + { + "epoch": 0.7608856942544227, + "grad_norm": 0.6698610911475733, + "learning_rate": 0.0001422134238115196, + "loss": 12.2091, + "step": 13973 + }, + { + "epoch": 0.7609401482510056, + "grad_norm": 0.599557631459671, + "learning_rate": 0.00014220542960243003, + "loss": 12.2612, + "step": 13974 + }, + { + "epoch": 0.7609946022475887, + "grad_norm": 0.6551527802649024, + "learning_rate": 0.000142197435065144, + "loss": 12.2865, + "step": 13975 + }, + { + "epoch": 0.7610490562441717, + "grad_norm": 0.6157311467055842, + "learning_rate": 0.00014218944019972363, + "loss": 12.3061, + "step": 13976 + }, + { + "epoch": 0.7611035102407547, + "grad_norm": 0.5781439219031981, + "learning_rate": 0.0001421814450062311, + "loss": 12.1868, + "step": 13977 + }, + { + "epoch": 0.7611579642373377, + "grad_norm": 0.5501211790005208, + "learning_rate": 0.00014217344948472857, + "loss": 12.2197, + "step": 13978 + }, + { + "epoch": 0.7612124182339207, + "grad_norm": 0.5240842381295177, + "learning_rate": 0.00014216545363527822, + "loss": 12.1797, + "step": 13979 + }, + { + "epoch": 0.7612668722305038, + "grad_norm": 0.6189210529605181, + "learning_rate": 0.00014215745745794226, + "loss": 12.2871, + "step": 13980 + }, + { + "epoch": 0.7613213262270868, + "grad_norm": 0.5951223275828108, + "learning_rate": 0.0001421494609527828, + "loss": 12.3688, + "step": 13981 + }, + { + "epoch": 0.7613757802236698, + "grad_norm": 0.5921341999404179, + "learning_rate": 0.0001421414641198621, + "loss": 12.3034, + "step": 13982 + }, + { + "epoch": 0.7614302342202528, + "grad_norm": 0.655618968539555, + "learning_rate": 0.00014213346695924229, + "loss": 12.3406, + "step": 13983 + }, + { + "epoch": 0.7614846882168358, + "grad_norm": 0.5432306320692075, + "learning_rate": 0.00014212546947098558, + "loss": 12.24, + "step": 13984 + }, + { + "epoch": 0.7615391422134188, + "grad_norm": 0.6036883548019074, + "learning_rate": 0.00014211747165515415, + "loss": 12.0426, + "step": 13985 + }, + { + "epoch": 0.7615935962100019, + "grad_norm": 0.5124128687561235, + "learning_rate": 0.00014210947351181018, + "loss": 12.1231, + "step": 13986 + }, + { + "epoch": 0.7616480502065849, + "grad_norm": 0.5640171129791517, + "learning_rate": 0.0001421014750410159, + "loss": 12.2459, + "step": 13987 + }, + { + "epoch": 0.7617025042031679, + "grad_norm": 0.5533932551479256, + "learning_rate": 0.0001420934762428335, + "loss": 12.2373, + "step": 13988 + }, + { + "epoch": 0.7617569581997509, + "grad_norm": 0.7231313373660987, + "learning_rate": 0.00014208547711732516, + "loss": 12.2069, + "step": 13989 + }, + { + "epoch": 0.7618114121963339, + "grad_norm": 0.5883848264970278, + "learning_rate": 0.0001420774776645531, + "loss": 12.2925, + "step": 13990 + }, + { + "epoch": 0.7618658661929169, + "grad_norm": 0.6112306520637002, + "learning_rate": 0.00014206947788457952, + "loss": 12.2716, + "step": 13991 + }, + { + "epoch": 0.7619203201895, + "grad_norm": 0.5651211551615413, + "learning_rate": 0.00014206147777746662, + "loss": 12.315, + "step": 13992 + }, + { + "epoch": 0.761974774186083, + "grad_norm": 0.5509869436075503, + "learning_rate": 0.0001420534773432766, + "loss": 12.346, + "step": 13993 + }, + { + "epoch": 0.7620292281826659, + "grad_norm": 0.5918797097019901, + "learning_rate": 0.00014204547658207173, + "loss": 12.276, + "step": 13994 + }, + { + "epoch": 0.7620836821792489, + "grad_norm": 0.6708119074943824, + "learning_rate": 0.00014203747549391412, + "loss": 12.3249, + "step": 13995 + }, + { + "epoch": 0.7621381361758319, + "grad_norm": 0.6152818078894708, + "learning_rate": 0.0001420294740788661, + "loss": 12.153, + "step": 13996 + }, + { + "epoch": 0.7621925901724149, + "grad_norm": 0.6318583507656009, + "learning_rate": 0.00014202147233698982, + "loss": 12.3252, + "step": 13997 + }, + { + "epoch": 0.762247044168998, + "grad_norm": 0.5700684553606541, + "learning_rate": 0.0001420134702683475, + "loss": 12.2328, + "step": 13998 + }, + { + "epoch": 0.762301498165581, + "grad_norm": 0.5645040981476561, + "learning_rate": 0.00014200546787300144, + "loss": 12.1694, + "step": 13999 + }, + { + "epoch": 0.762355952162164, + "grad_norm": 0.688139703247829, + "learning_rate": 0.00014199746515101383, + "loss": 12.3021, + "step": 14000 + }, + { + "epoch": 0.762410406158747, + "grad_norm": 0.9161755471060709, + "learning_rate": 0.0001419894621024468, + "loss": 12.309, + "step": 14001 + }, + { + "epoch": 0.76246486015533, + "grad_norm": 0.6852488173033148, + "learning_rate": 0.00014198145872736272, + "loss": 12.3005, + "step": 14002 + }, + { + "epoch": 0.762519314151913, + "grad_norm": 0.6742687447311347, + "learning_rate": 0.0001419734550258238, + "loss": 12.3142, + "step": 14003 + }, + { + "epoch": 0.7625737681484961, + "grad_norm": 0.6925915386799684, + "learning_rate": 0.00014196545099789224, + "loss": 12.2864, + "step": 14004 + }, + { + "epoch": 0.7626282221450791, + "grad_norm": 0.595301777501327, + "learning_rate": 0.00014195744664363032, + "loss": 12.0688, + "step": 14005 + }, + { + "epoch": 0.7626826761416621, + "grad_norm": 0.5281359073325815, + "learning_rate": 0.00014194944196310017, + "loss": 12.2558, + "step": 14006 + }, + { + "epoch": 0.7627371301382451, + "grad_norm": 0.5523714966665472, + "learning_rate": 0.00014194143695636421, + "loss": 12.2697, + "step": 14007 + }, + { + "epoch": 0.7627915841348281, + "grad_norm": 0.6080070137033171, + "learning_rate": 0.00014193343162348455, + "loss": 12.2527, + "step": 14008 + }, + { + "epoch": 0.762846038131411, + "grad_norm": 0.6627933267237143, + "learning_rate": 0.00014192542596452355, + "loss": 12.0337, + "step": 14009 + }, + { + "epoch": 0.7629004921279942, + "grad_norm": 0.6805001156429517, + "learning_rate": 0.0001419174199795434, + "loss": 12.2574, + "step": 14010 + }, + { + "epoch": 0.7629549461245771, + "grad_norm": 0.6161146392108356, + "learning_rate": 0.00014190941366860633, + "loss": 12.1328, + "step": 14011 + }, + { + "epoch": 0.7630094001211601, + "grad_norm": 0.5934805056499477, + "learning_rate": 0.0001419014070317746, + "loss": 12.2775, + "step": 14012 + }, + { + "epoch": 0.7630638541177431, + "grad_norm": 0.6779302289590717, + "learning_rate": 0.00014189340006911055, + "loss": 12.3298, + "step": 14013 + }, + { + "epoch": 0.7631183081143261, + "grad_norm": 0.6070414779581786, + "learning_rate": 0.00014188539278067643, + "loss": 12.219, + "step": 14014 + }, + { + "epoch": 0.7631727621109092, + "grad_norm": 0.6993064018098617, + "learning_rate": 0.0001418773851665344, + "loss": 12.3345, + "step": 14015 + }, + { + "epoch": 0.7632272161074922, + "grad_norm": 0.6305373858898516, + "learning_rate": 0.00014186937722674683, + "loss": 12.1297, + "step": 14016 + }, + { + "epoch": 0.7632816701040752, + "grad_norm": 0.6013170403095079, + "learning_rate": 0.00014186136896137598, + "loss": 12.265, + "step": 14017 + }, + { + "epoch": 0.7633361241006582, + "grad_norm": 0.5879012594309635, + "learning_rate": 0.00014185336037048408, + "loss": 12.2657, + "step": 14018 + }, + { + "epoch": 0.7633905780972412, + "grad_norm": 0.5452413532638681, + "learning_rate": 0.00014184535145413344, + "loss": 12.1823, + "step": 14019 + }, + { + "epoch": 0.7634450320938242, + "grad_norm": 0.6873152094810032, + "learning_rate": 0.00014183734221238635, + "loss": 12.4098, + "step": 14020 + }, + { + "epoch": 0.7634994860904073, + "grad_norm": 0.6416778737618622, + "learning_rate": 0.00014182933264530502, + "loss": 12.283, + "step": 14021 + }, + { + "epoch": 0.7635539400869903, + "grad_norm": 0.6437179415426925, + "learning_rate": 0.00014182132275295182, + "loss": 12.3234, + "step": 14022 + }, + { + "epoch": 0.7636083940835733, + "grad_norm": 0.556334606335069, + "learning_rate": 0.00014181331253538897, + "loss": 12.2012, + "step": 14023 + }, + { + "epoch": 0.7636628480801563, + "grad_norm": 0.5543179396361387, + "learning_rate": 0.00014180530199267883, + "loss": 12.2312, + "step": 14024 + }, + { + "epoch": 0.7637173020767393, + "grad_norm": 0.5500295422833468, + "learning_rate": 0.00014179729112488365, + "loss": 12.1844, + "step": 14025 + }, + { + "epoch": 0.7637717560733223, + "grad_norm": 0.5425630994642155, + "learning_rate": 0.0001417892799320657, + "loss": 12.2491, + "step": 14026 + }, + { + "epoch": 0.7638262100699054, + "grad_norm": 0.6669730471823939, + "learning_rate": 0.00014178126841428733, + "loss": 12.3245, + "step": 14027 + }, + { + "epoch": 0.7638806640664884, + "grad_norm": 0.6372441479102375, + "learning_rate": 0.00014177325657161079, + "loss": 12.121, + "step": 14028 + }, + { + "epoch": 0.7639351180630714, + "grad_norm": 0.5936749439861729, + "learning_rate": 0.00014176524440409838, + "loss": 12.1716, + "step": 14029 + }, + { + "epoch": 0.7639895720596543, + "grad_norm": 0.5548252061186839, + "learning_rate": 0.00014175723191181246, + "loss": 12.2135, + "step": 14030 + }, + { + "epoch": 0.7640440260562373, + "grad_norm": 0.6081908661433257, + "learning_rate": 0.00014174921909481528, + "loss": 12.1114, + "step": 14031 + }, + { + "epoch": 0.7640984800528203, + "grad_norm": 0.5962901142755918, + "learning_rate": 0.00014174120595316918, + "loss": 12.3284, + "step": 14032 + }, + { + "epoch": 0.7641529340494034, + "grad_norm": 0.6069543878257613, + "learning_rate": 0.00014173319248693647, + "loss": 12.2171, + "step": 14033 + }, + { + "epoch": 0.7642073880459864, + "grad_norm": 0.5340676390046819, + "learning_rate": 0.00014172517869617942, + "loss": 12.1292, + "step": 14034 + }, + { + "epoch": 0.7642618420425694, + "grad_norm": 0.5802220742321209, + "learning_rate": 0.00014171716458096043, + "loss": 12.0764, + "step": 14035 + }, + { + "epoch": 0.7643162960391524, + "grad_norm": 0.5425549092592509, + "learning_rate": 0.00014170915014134175, + "loss": 12.1105, + "step": 14036 + }, + { + "epoch": 0.7643707500357354, + "grad_norm": 0.6521458879338885, + "learning_rate": 0.00014170113537738572, + "loss": 12.3549, + "step": 14037 + }, + { + "epoch": 0.7644252040323184, + "grad_norm": 0.5721445904275433, + "learning_rate": 0.00014169312028915467, + "loss": 12.2843, + "step": 14038 + }, + { + "epoch": 0.7644796580289015, + "grad_norm": 0.6204254611658935, + "learning_rate": 0.00014168510487671095, + "loss": 12.1998, + "step": 14039 + }, + { + "epoch": 0.7645341120254845, + "grad_norm": 0.5713946970086126, + "learning_rate": 0.00014167708914011683, + "loss": 12.3153, + "step": 14040 + }, + { + "epoch": 0.7645885660220675, + "grad_norm": 0.5532519282768511, + "learning_rate": 0.00014166907307943468, + "loss": 12.1961, + "step": 14041 + }, + { + "epoch": 0.7646430200186505, + "grad_norm": 0.5569680627789245, + "learning_rate": 0.00014166105669472686, + "loss": 12.2786, + "step": 14042 + }, + { + "epoch": 0.7646974740152335, + "grad_norm": 0.5536494835696807, + "learning_rate": 0.00014165303998605567, + "loss": 12.2799, + "step": 14043 + }, + { + "epoch": 0.7647519280118165, + "grad_norm": 0.681863473872942, + "learning_rate": 0.00014164502295348344, + "loss": 12.2401, + "step": 14044 + }, + { + "epoch": 0.7648063820083996, + "grad_norm": 0.5696680388291402, + "learning_rate": 0.00014163700559707251, + "loss": 12.0994, + "step": 14045 + }, + { + "epoch": 0.7648608360049826, + "grad_norm": 0.5586529453896714, + "learning_rate": 0.00014162898791688527, + "loss": 12.2236, + "step": 14046 + }, + { + "epoch": 0.7649152900015656, + "grad_norm": 0.6305781565597973, + "learning_rate": 0.000141620969912984, + "loss": 12.3263, + "step": 14047 + }, + { + "epoch": 0.7649697439981485, + "grad_norm": 0.5753330948747856, + "learning_rate": 0.00014161295158543117, + "loss": 12.3274, + "step": 14048 + }, + { + "epoch": 0.7650241979947315, + "grad_norm": 0.5359807136362528, + "learning_rate": 0.000141604932934289, + "loss": 12.2058, + "step": 14049 + }, + { + "epoch": 0.7650786519913146, + "grad_norm": 0.5956265092053136, + "learning_rate": 0.00014159691395961986, + "loss": 12.2686, + "step": 14050 + }, + { + "epoch": 0.7651331059878976, + "grad_norm": 0.5829785198998221, + "learning_rate": 0.0001415888946614862, + "loss": 12.2445, + "step": 14051 + }, + { + "epoch": 0.7651875599844806, + "grad_norm": 0.5570639705465623, + "learning_rate": 0.00014158087503995029, + "loss": 12.2025, + "step": 14052 + }, + { + "epoch": 0.7652420139810636, + "grad_norm": 0.6362187285845966, + "learning_rate": 0.00014157285509507452, + "loss": 12.3384, + "step": 14053 + }, + { + "epoch": 0.7652964679776466, + "grad_norm": 0.5309991837853475, + "learning_rate": 0.00014156483482692127, + "loss": 12.2158, + "step": 14054 + }, + { + "epoch": 0.7653509219742296, + "grad_norm": 0.5545637757522944, + "learning_rate": 0.00014155681423555288, + "loss": 12.1136, + "step": 14055 + }, + { + "epoch": 0.7654053759708127, + "grad_norm": 0.6145129134489931, + "learning_rate": 0.00014154879332103174, + "loss": 12.1744, + "step": 14056 + }, + { + "epoch": 0.7654598299673957, + "grad_norm": 0.5123633665586574, + "learning_rate": 0.00014154077208342023, + "loss": 12.1632, + "step": 14057 + }, + { + "epoch": 0.7655142839639787, + "grad_norm": 0.5908039187960368, + "learning_rate": 0.00014153275052278068, + "loss": 12.2842, + "step": 14058 + }, + { + "epoch": 0.7655687379605617, + "grad_norm": 0.5412909082059538, + "learning_rate": 0.00014152472863917555, + "loss": 12.0648, + "step": 14059 + }, + { + "epoch": 0.7656231919571447, + "grad_norm": 0.6130141862215384, + "learning_rate": 0.00014151670643266712, + "loss": 12.1749, + "step": 14060 + }, + { + "epoch": 0.7656776459537277, + "grad_norm": 0.6770562027867211, + "learning_rate": 0.0001415086839033178, + "loss": 12.4342, + "step": 14061 + }, + { + "epoch": 0.7657320999503108, + "grad_norm": 0.6104892574412778, + "learning_rate": 0.00014150066105119002, + "loss": 12.3244, + "step": 14062 + }, + { + "epoch": 0.7657865539468938, + "grad_norm": 0.556694748689259, + "learning_rate": 0.00014149263787634615, + "loss": 12.1292, + "step": 14063 + }, + { + "epoch": 0.7658410079434768, + "grad_norm": 0.532048703486904, + "learning_rate": 0.00014148461437884857, + "loss": 12.1891, + "step": 14064 + }, + { + "epoch": 0.7658954619400598, + "grad_norm": 0.5730215255960645, + "learning_rate": 0.0001414765905587597, + "loss": 12.272, + "step": 14065 + }, + { + "epoch": 0.7659499159366427, + "grad_norm": 0.7000139721440017, + "learning_rate": 0.00014146856641614184, + "loss": 12.1002, + "step": 14066 + }, + { + "epoch": 0.7660043699332257, + "grad_norm": 0.6843534527260816, + "learning_rate": 0.00014146054195105748, + "loss": 12.3384, + "step": 14067 + }, + { + "epoch": 0.7660588239298088, + "grad_norm": 0.5634617117221637, + "learning_rate": 0.00014145251716356897, + "loss": 12.3222, + "step": 14068 + }, + { + "epoch": 0.7661132779263918, + "grad_norm": 0.5687279399361462, + "learning_rate": 0.00014144449205373877, + "loss": 12.2644, + "step": 14069 + }, + { + "epoch": 0.7661677319229748, + "grad_norm": 0.6051485761650688, + "learning_rate": 0.00014143646662162927, + "loss": 12.2573, + "step": 14070 + }, + { + "epoch": 0.7662221859195578, + "grad_norm": 0.5839936942791063, + "learning_rate": 0.00014142844086730282, + "loss": 12.2485, + "step": 14071 + }, + { + "epoch": 0.7662766399161408, + "grad_norm": 0.5517736427541252, + "learning_rate": 0.00014142041479082185, + "loss": 12.116, + "step": 14072 + }, + { + "epoch": 0.7663310939127238, + "grad_norm": 0.5854388894353925, + "learning_rate": 0.00014141238839224883, + "loss": 12.3393, + "step": 14073 + }, + { + "epoch": 0.7663855479093069, + "grad_norm": 0.6314116918440781, + "learning_rate": 0.0001414043616716461, + "loss": 12.3267, + "step": 14074 + }, + { + "epoch": 0.7664400019058899, + "grad_norm": 0.5714963377609871, + "learning_rate": 0.00014139633462907614, + "loss": 12.2609, + "step": 14075 + }, + { + "epoch": 0.7664944559024729, + "grad_norm": 0.6137410946530638, + "learning_rate": 0.00014138830726460132, + "loss": 12.2717, + "step": 14076 + }, + { + "epoch": 0.7665489098990559, + "grad_norm": 0.5546721402047556, + "learning_rate": 0.0001413802795782841, + "loss": 12.3057, + "step": 14077 + }, + { + "epoch": 0.7666033638956389, + "grad_norm": 0.6403782377884639, + "learning_rate": 0.00014137225157018684, + "loss": 12.2847, + "step": 14078 + }, + { + "epoch": 0.766657817892222, + "grad_norm": 0.611941356520709, + "learning_rate": 0.00014136422324037207, + "loss": 12.3098, + "step": 14079 + }, + { + "epoch": 0.766712271888805, + "grad_norm": 0.6050520607106942, + "learning_rate": 0.00014135619458890215, + "loss": 12.3527, + "step": 14080 + }, + { + "epoch": 0.766766725885388, + "grad_norm": 0.5664621409668334, + "learning_rate": 0.0001413481656158395, + "loss": 12.2701, + "step": 14081 + }, + { + "epoch": 0.766821179881971, + "grad_norm": 0.5631607069562086, + "learning_rate": 0.00014134013632124658, + "loss": 12.1803, + "step": 14082 + }, + { + "epoch": 0.766875633878554, + "grad_norm": 0.5437281052141982, + "learning_rate": 0.00014133210670518585, + "loss": 12.1745, + "step": 14083 + }, + { + "epoch": 0.766930087875137, + "grad_norm": 0.5991515782094939, + "learning_rate": 0.0001413240767677197, + "loss": 12.2471, + "step": 14084 + }, + { + "epoch": 0.76698454187172, + "grad_norm": 0.5363419925328808, + "learning_rate": 0.00014131604650891063, + "loss": 12.1673, + "step": 14085 + }, + { + "epoch": 0.767038995868303, + "grad_norm": 0.6946396140367467, + "learning_rate": 0.00014130801592882107, + "loss": 12.2711, + "step": 14086 + }, + { + "epoch": 0.767093449864886, + "grad_norm": 0.6315343447460228, + "learning_rate": 0.00014129998502751342, + "loss": 12.2857, + "step": 14087 + }, + { + "epoch": 0.767147903861469, + "grad_norm": 0.5829576252690439, + "learning_rate": 0.00014129195380505017, + "loss": 12.225, + "step": 14088 + }, + { + "epoch": 0.767202357858052, + "grad_norm": 0.6156299807501912, + "learning_rate": 0.0001412839222614937, + "loss": 12.2652, + "step": 14089 + }, + { + "epoch": 0.767256811854635, + "grad_norm": 0.6254524138851091, + "learning_rate": 0.00014127589039690663, + "loss": 12.2521, + "step": 14090 + }, + { + "epoch": 0.7673112658512181, + "grad_norm": 0.6034966930210365, + "learning_rate": 0.00014126785821135126, + "loss": 12.2941, + "step": 14091 + }, + { + "epoch": 0.7673657198478011, + "grad_norm": 0.6437585849859505, + "learning_rate": 0.00014125982570489012, + "loss": 12.202, + "step": 14092 + }, + { + "epoch": 0.7674201738443841, + "grad_norm": 0.5373181078765519, + "learning_rate": 0.00014125179287758564, + "loss": 12.1046, + "step": 14093 + }, + { + "epoch": 0.7674746278409671, + "grad_norm": 0.6572518223600609, + "learning_rate": 0.0001412437597295003, + "loss": 12.4171, + "step": 14094 + }, + { + "epoch": 0.7675290818375501, + "grad_norm": 0.6232379859928363, + "learning_rate": 0.00014123572626069657, + "loss": 12.2784, + "step": 14095 + }, + { + "epoch": 0.7675835358341331, + "grad_norm": 0.6577703484439251, + "learning_rate": 0.00014122769247123694, + "loss": 12.199, + "step": 14096 + }, + { + "epoch": 0.7676379898307162, + "grad_norm": 0.5299622761718101, + "learning_rate": 0.00014121965836118384, + "loss": 12.2502, + "step": 14097 + }, + { + "epoch": 0.7676924438272992, + "grad_norm": 0.6044188745898461, + "learning_rate": 0.00014121162393059976, + "loss": 12.2248, + "step": 14098 + }, + { + "epoch": 0.7677468978238822, + "grad_norm": 0.6542289909898396, + "learning_rate": 0.0001412035891795472, + "loss": 12.1898, + "step": 14099 + }, + { + "epoch": 0.7678013518204652, + "grad_norm": 0.5732423193009067, + "learning_rate": 0.0001411955541080886, + "loss": 12.3215, + "step": 14100 + }, + { + "epoch": 0.7678558058170482, + "grad_norm": 0.6485999873433785, + "learning_rate": 0.00014118751871628645, + "loss": 12.341, + "step": 14101 + }, + { + "epoch": 0.7679102598136311, + "grad_norm": 0.6092602520664506, + "learning_rate": 0.0001411794830042033, + "loss": 12.2059, + "step": 14102 + }, + { + "epoch": 0.7679647138102143, + "grad_norm": 0.5593846752229331, + "learning_rate": 0.00014117144697190154, + "loss": 12.2195, + "step": 14103 + }, + { + "epoch": 0.7680191678067972, + "grad_norm": 0.5751117900315875, + "learning_rate": 0.00014116341061944372, + "loss": 12.1362, + "step": 14104 + }, + { + "epoch": 0.7680736218033802, + "grad_norm": 0.6063510707625296, + "learning_rate": 0.00014115537394689232, + "loss": 12.2299, + "step": 14105 + }, + { + "epoch": 0.7681280757999632, + "grad_norm": 0.597977108682404, + "learning_rate": 0.0001411473369543098, + "loss": 12.0905, + "step": 14106 + }, + { + "epoch": 0.7681825297965462, + "grad_norm": 0.5748477244036646, + "learning_rate": 0.0001411392996417587, + "loss": 12.2832, + "step": 14107 + }, + { + "epoch": 0.7682369837931292, + "grad_norm": 0.6037677483892898, + "learning_rate": 0.00014113126200930153, + "loss": 12.2684, + "step": 14108 + }, + { + "epoch": 0.7682914377897123, + "grad_norm": 0.6156244411231219, + "learning_rate": 0.00014112322405700076, + "loss": 12.2037, + "step": 14109 + }, + { + "epoch": 0.7683458917862953, + "grad_norm": 0.6140588551870635, + "learning_rate": 0.0001411151857849189, + "loss": 12.3049, + "step": 14110 + }, + { + "epoch": 0.7684003457828783, + "grad_norm": 0.6866071509031823, + "learning_rate": 0.00014110714719311847, + "loss": 12.1586, + "step": 14111 + }, + { + "epoch": 0.7684547997794613, + "grad_norm": 0.6114492089687162, + "learning_rate": 0.00014109910828166196, + "loss": 12.2286, + "step": 14112 + }, + { + "epoch": 0.7685092537760443, + "grad_norm": 0.5802988526973467, + "learning_rate": 0.0001410910690506119, + "loss": 12.2486, + "step": 14113 + }, + { + "epoch": 0.7685637077726274, + "grad_norm": 0.6874522798193882, + "learning_rate": 0.00014108302950003077, + "loss": 12.1485, + "step": 14114 + }, + { + "epoch": 0.7686181617692104, + "grad_norm": 0.6517853430434413, + "learning_rate": 0.00014107498962998114, + "loss": 12.317, + "step": 14115 + }, + { + "epoch": 0.7686726157657934, + "grad_norm": 0.6469399529099216, + "learning_rate": 0.0001410669494405255, + "loss": 12.1699, + "step": 14116 + }, + { + "epoch": 0.7687270697623764, + "grad_norm": 0.6214515150946013, + "learning_rate": 0.00014105890893172636, + "loss": 12.3012, + "step": 14117 + }, + { + "epoch": 0.7687815237589594, + "grad_norm": 0.6367902841549076, + "learning_rate": 0.00014105086810364625, + "loss": 12.3051, + "step": 14118 + }, + { + "epoch": 0.7688359777555424, + "grad_norm": 0.5850425284936396, + "learning_rate": 0.00014104282695634771, + "loss": 12.2222, + "step": 14119 + }, + { + "epoch": 0.7688904317521255, + "grad_norm": 0.6108114424575637, + "learning_rate": 0.0001410347854898933, + "loss": 12.2159, + "step": 14120 + }, + { + "epoch": 0.7689448857487085, + "grad_norm": 0.6282073225540852, + "learning_rate": 0.00014102674370434546, + "loss": 12.2697, + "step": 14121 + }, + { + "epoch": 0.7689993397452914, + "grad_norm": 0.6241786619890566, + "learning_rate": 0.00014101870159976683, + "loss": 12.1934, + "step": 14122 + }, + { + "epoch": 0.7690537937418744, + "grad_norm": 0.5502129213330372, + "learning_rate": 0.00014101065917621988, + "loss": 12.1528, + "step": 14123 + }, + { + "epoch": 0.7691082477384574, + "grad_norm": 0.6558359504268575, + "learning_rate": 0.00014100261643376717, + "loss": 12.1864, + "step": 14124 + }, + { + "epoch": 0.7691627017350404, + "grad_norm": 0.610417619158311, + "learning_rate": 0.00014099457337247127, + "loss": 12.201, + "step": 14125 + }, + { + "epoch": 0.7692171557316235, + "grad_norm": 0.5636968506474196, + "learning_rate": 0.00014098652999239462, + "loss": 12.1071, + "step": 14126 + }, + { + "epoch": 0.7692716097282065, + "grad_norm": 0.6876986715271385, + "learning_rate": 0.0001409784862935999, + "loss": 12.2103, + "step": 14127 + }, + { + "epoch": 0.7693260637247895, + "grad_norm": 0.5493141280887939, + "learning_rate": 0.00014097044227614954, + "loss": 12.2582, + "step": 14128 + }, + { + "epoch": 0.7693805177213725, + "grad_norm": 0.5951341346995529, + "learning_rate": 0.0001409623979401062, + "loss": 12.2833, + "step": 14129 + }, + { + "epoch": 0.7694349717179555, + "grad_norm": 0.6742088084590737, + "learning_rate": 0.00014095435328553239, + "loss": 12.3141, + "step": 14130 + }, + { + "epoch": 0.7694894257145385, + "grad_norm": 0.5384175217702536, + "learning_rate": 0.00014094630831249064, + "loss": 12.2142, + "step": 14131 + }, + { + "epoch": 0.7695438797111216, + "grad_norm": 0.6060570398527729, + "learning_rate": 0.00014093826302104351, + "loss": 12.2129, + "step": 14132 + }, + { + "epoch": 0.7695983337077046, + "grad_norm": 0.7023543612414251, + "learning_rate": 0.0001409302174112536, + "loss": 12.4134, + "step": 14133 + }, + { + "epoch": 0.7696527877042876, + "grad_norm": 0.5842896036172063, + "learning_rate": 0.00014092217148318348, + "loss": 12.3908, + "step": 14134 + }, + { + "epoch": 0.7697072417008706, + "grad_norm": 0.5804160737393887, + "learning_rate": 0.00014091412523689566, + "loss": 12.2307, + "step": 14135 + }, + { + "epoch": 0.7697616956974536, + "grad_norm": 0.5472274719740815, + "learning_rate": 0.00014090607867245274, + "loss": 12.1742, + "step": 14136 + }, + { + "epoch": 0.7698161496940366, + "grad_norm": 0.5146133222305155, + "learning_rate": 0.00014089803178991733, + "loss": 12.2048, + "step": 14137 + }, + { + "epoch": 0.7698706036906197, + "grad_norm": 0.5899090148481555, + "learning_rate": 0.00014088998458935192, + "loss": 12.2604, + "step": 14138 + }, + { + "epoch": 0.7699250576872027, + "grad_norm": 0.5819490009014724, + "learning_rate": 0.00014088193707081914, + "loss": 12.2092, + "step": 14139 + }, + { + "epoch": 0.7699795116837856, + "grad_norm": 0.585003880961486, + "learning_rate": 0.0001408738892343816, + "loss": 12.1618, + "step": 14140 + }, + { + "epoch": 0.7700339656803686, + "grad_norm": 0.6022137293508225, + "learning_rate": 0.0001408658410801018, + "loss": 12.2765, + "step": 14141 + }, + { + "epoch": 0.7700884196769516, + "grad_norm": 0.5975999931093471, + "learning_rate": 0.0001408577926080424, + "loss": 12.208, + "step": 14142 + }, + { + "epoch": 0.7701428736735346, + "grad_norm": 0.5542571069872959, + "learning_rate": 0.00014084974381826592, + "loss": 12.2463, + "step": 14143 + }, + { + "epoch": 0.7701973276701177, + "grad_norm": 0.5281431689300452, + "learning_rate": 0.000140841694710835, + "loss": 12.2376, + "step": 14144 + }, + { + "epoch": 0.7702517816667007, + "grad_norm": 0.5943061625491666, + "learning_rate": 0.0001408336452858122, + "loss": 12.2756, + "step": 14145 + }, + { + "epoch": 0.7703062356632837, + "grad_norm": 0.5421811990530448, + "learning_rate": 0.00014082559554326015, + "loss": 12.0254, + "step": 14146 + }, + { + "epoch": 0.7703606896598667, + "grad_norm": 0.6361321674266639, + "learning_rate": 0.0001408175454832414, + "loss": 12.2234, + "step": 14147 + }, + { + "epoch": 0.7704151436564497, + "grad_norm": 0.5642016530538166, + "learning_rate": 0.00014080949510581858, + "loss": 12.2879, + "step": 14148 + }, + { + "epoch": 0.7704695976530328, + "grad_norm": 0.5787333057483416, + "learning_rate": 0.00014080144441105429, + "loss": 12.2408, + "step": 14149 + }, + { + "epoch": 0.7705240516496158, + "grad_norm": 0.5170654352262323, + "learning_rate": 0.00014079339339901113, + "loss": 12.2432, + "step": 14150 + }, + { + "epoch": 0.7705785056461988, + "grad_norm": 0.6319362718158542, + "learning_rate": 0.00014078534206975166, + "loss": 12.328, + "step": 14151 + }, + { + "epoch": 0.7706329596427818, + "grad_norm": 0.628696957200546, + "learning_rate": 0.0001407772904233386, + "loss": 12.313, + "step": 14152 + }, + { + "epoch": 0.7706874136393648, + "grad_norm": 0.5423506725771897, + "learning_rate": 0.00014076923845983443, + "loss": 12.2563, + "step": 14153 + }, + { + "epoch": 0.7707418676359478, + "grad_norm": 0.5830858369473833, + "learning_rate": 0.00014076118617930186, + "loss": 12.2093, + "step": 14154 + }, + { + "epoch": 0.7707963216325309, + "grad_norm": 0.6498214355393194, + "learning_rate": 0.00014075313358180344, + "loss": 12.2451, + "step": 14155 + }, + { + "epoch": 0.7708507756291139, + "grad_norm": 0.6162319238332199, + "learning_rate": 0.00014074508066740183, + "loss": 12.1171, + "step": 14156 + }, + { + "epoch": 0.7709052296256969, + "grad_norm": 0.6190695713664903, + "learning_rate": 0.00014073702743615964, + "loss": 12.2088, + "step": 14157 + }, + { + "epoch": 0.7709596836222798, + "grad_norm": 0.5412261050077144, + "learning_rate": 0.0001407289738881395, + "loss": 12.0418, + "step": 14158 + }, + { + "epoch": 0.7710141376188628, + "grad_norm": 0.7319502020643702, + "learning_rate": 0.00014072092002340405, + "loss": 12.3812, + "step": 14159 + }, + { + "epoch": 0.7710685916154458, + "grad_norm": 0.6467337001986067, + "learning_rate": 0.00014071286584201587, + "loss": 12.3046, + "step": 14160 + }, + { + "epoch": 0.7711230456120289, + "grad_norm": 0.63723613660386, + "learning_rate": 0.00014070481134403762, + "loss": 12.3337, + "step": 14161 + }, + { + "epoch": 0.7711774996086119, + "grad_norm": 0.541520470381976, + "learning_rate": 0.00014069675652953194, + "loss": 12.1234, + "step": 14162 + }, + { + "epoch": 0.7712319536051949, + "grad_norm": 0.6029623074529031, + "learning_rate": 0.00014068870139856142, + "loss": 12.3006, + "step": 14163 + }, + { + "epoch": 0.7712864076017779, + "grad_norm": 0.5985857701434745, + "learning_rate": 0.00014068064595118877, + "loss": 12.2511, + "step": 14164 + }, + { + "epoch": 0.7713408615983609, + "grad_norm": 0.550235811414424, + "learning_rate": 0.0001406725901874766, + "loss": 12.1389, + "step": 14165 + }, + { + "epoch": 0.7713953155949439, + "grad_norm": 0.5984638770455566, + "learning_rate": 0.00014066453410748752, + "loss": 12.3201, + "step": 14166 + }, + { + "epoch": 0.771449769591527, + "grad_norm": 0.5862567048207609, + "learning_rate": 0.00014065647771128423, + "loss": 12.3524, + "step": 14167 + }, + { + "epoch": 0.77150422358811, + "grad_norm": 0.562059377621077, + "learning_rate": 0.00014064842099892935, + "loss": 12.2515, + "step": 14168 + }, + { + "epoch": 0.771558677584693, + "grad_norm": 0.5315770319244236, + "learning_rate": 0.00014064036397048551, + "loss": 12.0774, + "step": 14169 + }, + { + "epoch": 0.771613131581276, + "grad_norm": 0.6436814701280071, + "learning_rate": 0.0001406323066260154, + "loss": 12.2492, + "step": 14170 + }, + { + "epoch": 0.771667585577859, + "grad_norm": 0.5141619178170451, + "learning_rate": 0.00014062424896558166, + "loss": 12.216, + "step": 14171 + }, + { + "epoch": 0.771722039574442, + "grad_norm": 0.601453934681363, + "learning_rate": 0.00014061619098924694, + "loss": 12.0353, + "step": 14172 + }, + { + "epoch": 0.7717764935710251, + "grad_norm": 0.7781834634976749, + "learning_rate": 0.00014060813269707392, + "loss": 12.272, + "step": 14173 + }, + { + "epoch": 0.7718309475676081, + "grad_norm": 0.6037550044731685, + "learning_rate": 0.00014060007408912525, + "loss": 12.2854, + "step": 14174 + }, + { + "epoch": 0.771885401564191, + "grad_norm": 0.6248294027085685, + "learning_rate": 0.0001405920151654636, + "loss": 12.2883, + "step": 14175 + }, + { + "epoch": 0.771939855560774, + "grad_norm": 0.5940242997523166, + "learning_rate": 0.00014058395592615158, + "loss": 12.2134, + "step": 14176 + }, + { + "epoch": 0.771994309557357, + "grad_norm": 0.6932183762839871, + "learning_rate": 0.00014057589637125198, + "loss": 12.1776, + "step": 14177 + }, + { + "epoch": 0.77204876355394, + "grad_norm": 0.5124592144785853, + "learning_rate": 0.00014056783650082737, + "loss": 12.0275, + "step": 14178 + }, + { + "epoch": 0.7721032175505231, + "grad_norm": 0.5714334986091909, + "learning_rate": 0.00014055977631494045, + "loss": 12.2715, + "step": 14179 + }, + { + "epoch": 0.7721576715471061, + "grad_norm": 0.5712169349242016, + "learning_rate": 0.00014055171581365397, + "loss": 12.2596, + "step": 14180 + }, + { + "epoch": 0.7722121255436891, + "grad_norm": 0.6319480061558428, + "learning_rate": 0.00014054365499703046, + "loss": 12.3137, + "step": 14181 + }, + { + "epoch": 0.7722665795402721, + "grad_norm": 0.5785141653358192, + "learning_rate": 0.00014053559386513275, + "loss": 12.2219, + "step": 14182 + }, + { + "epoch": 0.7723210335368551, + "grad_norm": 0.5591661426535585, + "learning_rate": 0.00014052753241802342, + "loss": 12.2081, + "step": 14183 + }, + { + "epoch": 0.7723754875334382, + "grad_norm": 0.5548912954895845, + "learning_rate": 0.00014051947065576526, + "loss": 12.2108, + "step": 14184 + }, + { + "epoch": 0.7724299415300212, + "grad_norm": 0.569668227503653, + "learning_rate": 0.00014051140857842086, + "loss": 12.2756, + "step": 14185 + }, + { + "epoch": 0.7724843955266042, + "grad_norm": 0.525580456092135, + "learning_rate": 0.000140503346186053, + "loss": 12.3377, + "step": 14186 + }, + { + "epoch": 0.7725388495231872, + "grad_norm": 0.5478331742854908, + "learning_rate": 0.00014049528347872425, + "loss": 12.2428, + "step": 14187 + }, + { + "epoch": 0.7725933035197702, + "grad_norm": 0.572341978220355, + "learning_rate": 0.00014048722045649742, + "loss": 12.2341, + "step": 14188 + }, + { + "epoch": 0.7726477575163532, + "grad_norm": 0.6014348989134071, + "learning_rate": 0.0001404791571194352, + "loss": 12.2087, + "step": 14189 + }, + { + "epoch": 0.7727022115129363, + "grad_norm": 0.5457895419471461, + "learning_rate": 0.00014047109346760023, + "loss": 12.2166, + "step": 14190 + }, + { + "epoch": 0.7727566655095193, + "grad_norm": 0.6426511252714703, + "learning_rate": 0.00014046302950105529, + "loss": 12.2607, + "step": 14191 + }, + { + "epoch": 0.7728111195061023, + "grad_norm": 0.5161979350729351, + "learning_rate": 0.000140454965219863, + "loss": 12.1732, + "step": 14192 + }, + { + "epoch": 0.7728655735026853, + "grad_norm": 0.554545149023617, + "learning_rate": 0.00014044690062408612, + "loss": 12.2118, + "step": 14193 + }, + { + "epoch": 0.7729200274992682, + "grad_norm": 0.5719533718854034, + "learning_rate": 0.00014043883571378737, + "loss": 12.22, + "step": 14194 + }, + { + "epoch": 0.7729744814958512, + "grad_norm": 0.6718674667684825, + "learning_rate": 0.0001404307704890295, + "loss": 12.0822, + "step": 14195 + }, + { + "epoch": 0.7730289354924343, + "grad_norm": 0.5855615166821195, + "learning_rate": 0.00014042270494987513, + "loss": 12.3239, + "step": 14196 + }, + { + "epoch": 0.7730833894890173, + "grad_norm": 0.6017993497722051, + "learning_rate": 0.000140414639096387, + "loss": 12.3342, + "step": 14197 + }, + { + "epoch": 0.7731378434856003, + "grad_norm": 0.5411438881222047, + "learning_rate": 0.00014040657292862792, + "loss": 12.3142, + "step": 14198 + }, + { + "epoch": 0.7731922974821833, + "grad_norm": 0.5559677870703806, + "learning_rate": 0.0001403985064466605, + "loss": 12.2001, + "step": 14199 + }, + { + "epoch": 0.7732467514787663, + "grad_norm": 0.5760873129276473, + "learning_rate": 0.00014039043965054758, + "loss": 12.2604, + "step": 14200 + }, + { + "epoch": 0.7733012054753493, + "grad_norm": 0.5805544309998965, + "learning_rate": 0.00014038237254035177, + "loss": 12.3626, + "step": 14201 + }, + { + "epoch": 0.7733556594719324, + "grad_norm": 0.5615832662673361, + "learning_rate": 0.00014037430511613588, + "loss": 12.0235, + "step": 14202 + }, + { + "epoch": 0.7734101134685154, + "grad_norm": 0.6065320858204621, + "learning_rate": 0.00014036623737796261, + "loss": 12.3079, + "step": 14203 + }, + { + "epoch": 0.7734645674650984, + "grad_norm": 0.5479637189418413, + "learning_rate": 0.0001403581693258947, + "loss": 12.2065, + "step": 14204 + }, + { + "epoch": 0.7735190214616814, + "grad_norm": 0.5847645596608256, + "learning_rate": 0.00014035010095999497, + "loss": 12.1909, + "step": 14205 + }, + { + "epoch": 0.7735734754582644, + "grad_norm": 0.5972784753038021, + "learning_rate": 0.00014034203228032604, + "loss": 12.2817, + "step": 14206 + }, + { + "epoch": 0.7736279294548474, + "grad_norm": 0.6331906216266722, + "learning_rate": 0.0001403339632869507, + "loss": 12.2867, + "step": 14207 + }, + { + "epoch": 0.7736823834514305, + "grad_norm": 0.5132859404573481, + "learning_rate": 0.00014032589397993168, + "loss": 12.1685, + "step": 14208 + }, + { + "epoch": 0.7737368374480135, + "grad_norm": 0.5589622425109604, + "learning_rate": 0.00014031782435933174, + "loss": 12.1366, + "step": 14209 + }, + { + "epoch": 0.7737912914445965, + "grad_norm": 0.5858976298498404, + "learning_rate": 0.00014030975442521365, + "loss": 12.1649, + "step": 14210 + }, + { + "epoch": 0.7738457454411795, + "grad_norm": 0.6674804842623974, + "learning_rate": 0.0001403016841776402, + "loss": 12.4075, + "step": 14211 + }, + { + "epoch": 0.7739001994377624, + "grad_norm": 0.5939039505378468, + "learning_rate": 0.00014029361361667407, + "loss": 12.1996, + "step": 14212 + }, + { + "epoch": 0.7739546534343456, + "grad_norm": 0.5616736497767092, + "learning_rate": 0.000140285542742378, + "loss": 12.3147, + "step": 14213 + }, + { + "epoch": 0.7740091074309285, + "grad_norm": 0.6052812911670153, + "learning_rate": 0.00014027747155481482, + "loss": 12.3252, + "step": 14214 + }, + { + "epoch": 0.7740635614275115, + "grad_norm": 0.612917274492108, + "learning_rate": 0.00014026940005404726, + "loss": 12.2219, + "step": 14215 + }, + { + "epoch": 0.7741180154240945, + "grad_norm": 0.546998223405516, + "learning_rate": 0.00014026132824013812, + "loss": 12.2989, + "step": 14216 + }, + { + "epoch": 0.7741724694206775, + "grad_norm": 0.5894059670789069, + "learning_rate": 0.00014025325611315012, + "loss": 12.2427, + "step": 14217 + }, + { + "epoch": 0.7742269234172605, + "grad_norm": 0.5957066757457238, + "learning_rate": 0.00014024518367314602, + "loss": 12.3165, + "step": 14218 + }, + { + "epoch": 0.7742813774138436, + "grad_norm": 0.5836536982041985, + "learning_rate": 0.00014023711092018868, + "loss": 12.2283, + "step": 14219 + }, + { + "epoch": 0.7743358314104266, + "grad_norm": 0.5368407291439166, + "learning_rate": 0.00014022903785434077, + "loss": 12.351, + "step": 14220 + }, + { + "epoch": 0.7743902854070096, + "grad_norm": 0.5384549553801493, + "learning_rate": 0.00014022096447566514, + "loss": 12.2068, + "step": 14221 + }, + { + "epoch": 0.7744447394035926, + "grad_norm": 0.5645116287805855, + "learning_rate": 0.00014021289078422456, + "loss": 12.1967, + "step": 14222 + }, + { + "epoch": 0.7744991934001756, + "grad_norm": 0.5907865445463186, + "learning_rate": 0.00014020481678008175, + "loss": 12.252, + "step": 14223 + }, + { + "epoch": 0.7745536473967586, + "grad_norm": 0.6491946219544488, + "learning_rate": 0.0001401967424632996, + "loss": 12.2809, + "step": 14224 + }, + { + "epoch": 0.7746081013933417, + "grad_norm": 0.562889073485214, + "learning_rate": 0.0001401886678339408, + "loss": 12.1292, + "step": 14225 + }, + { + "epoch": 0.7746625553899247, + "grad_norm": 0.6158908521325102, + "learning_rate": 0.00014018059289206818, + "loss": 12.2408, + "step": 14226 + }, + { + "epoch": 0.7747170093865077, + "grad_norm": 0.6743988176690096, + "learning_rate": 0.00014017251763774456, + "loss": 12.2302, + "step": 14227 + }, + { + "epoch": 0.7747714633830907, + "grad_norm": 0.5750539418203042, + "learning_rate": 0.0001401644420710327, + "loss": 12.3108, + "step": 14228 + }, + { + "epoch": 0.7748259173796737, + "grad_norm": 0.6184375139837525, + "learning_rate": 0.00014015636619199535, + "loss": 12.3028, + "step": 14229 + }, + { + "epoch": 0.7748803713762566, + "grad_norm": 0.610642355628531, + "learning_rate": 0.0001401482900006954, + "loss": 12.1486, + "step": 14230 + }, + { + "epoch": 0.7749348253728398, + "grad_norm": 0.5821790796481626, + "learning_rate": 0.0001401402134971956, + "loss": 12.2153, + "step": 14231 + }, + { + "epoch": 0.7749892793694227, + "grad_norm": 0.5664819681648965, + "learning_rate": 0.0001401321366815588, + "loss": 12.2911, + "step": 14232 + }, + { + "epoch": 0.7750437333660057, + "grad_norm": 0.5699499311143544, + "learning_rate": 0.00014012405955384776, + "loss": 12.1542, + "step": 14233 + }, + { + "epoch": 0.7750981873625887, + "grad_norm": 0.6004532193029597, + "learning_rate": 0.0001401159821141253, + "loss": 12.2844, + "step": 14234 + }, + { + "epoch": 0.7751526413591717, + "grad_norm": 0.5606771204873505, + "learning_rate": 0.00014010790436245425, + "loss": 12.2204, + "step": 14235 + }, + { + "epoch": 0.7752070953557547, + "grad_norm": 0.5976923703128083, + "learning_rate": 0.00014009982629889736, + "loss": 12.2957, + "step": 14236 + }, + { + "epoch": 0.7752615493523378, + "grad_norm": 0.6291384584779743, + "learning_rate": 0.00014009174792351756, + "loss": 12.1954, + "step": 14237 + }, + { + "epoch": 0.7753160033489208, + "grad_norm": 0.5841736611008006, + "learning_rate": 0.00014008366923637757, + "loss": 12.2286, + "step": 14238 + }, + { + "epoch": 0.7753704573455038, + "grad_norm": 0.6188229355529823, + "learning_rate": 0.00014007559023754027, + "loss": 12.2629, + "step": 14239 + }, + { + "epoch": 0.7754249113420868, + "grad_norm": 0.6582155521706322, + "learning_rate": 0.00014006751092706843, + "loss": 12.2669, + "step": 14240 + }, + { + "epoch": 0.7754793653386698, + "grad_norm": 0.643902293261855, + "learning_rate": 0.00014005943130502492, + "loss": 12.3276, + "step": 14241 + }, + { + "epoch": 0.7755338193352528, + "grad_norm": 0.6682038792668181, + "learning_rate": 0.00014005135137147256, + "loss": 12.2784, + "step": 14242 + }, + { + "epoch": 0.7755882733318359, + "grad_norm": 0.5877777782070308, + "learning_rate": 0.00014004327112647418, + "loss": 12.3538, + "step": 14243 + }, + { + "epoch": 0.7756427273284189, + "grad_norm": 0.5785124014962444, + "learning_rate": 0.0001400351905700926, + "loss": 12.1579, + "step": 14244 + }, + { + "epoch": 0.7756971813250019, + "grad_norm": 0.6505713308803961, + "learning_rate": 0.00014002710970239062, + "loss": 12.1179, + "step": 14245 + }, + { + "epoch": 0.7757516353215849, + "grad_norm": 0.6449881450005565, + "learning_rate": 0.00014001902852343122, + "loss": 12.3544, + "step": 14246 + }, + { + "epoch": 0.7758060893181679, + "grad_norm": 0.5479162926644555, + "learning_rate": 0.00014001094703327706, + "loss": 12.1618, + "step": 14247 + }, + { + "epoch": 0.775860543314751, + "grad_norm": 0.6190528133158492, + "learning_rate": 0.00014000286523199108, + "loss": 12.2099, + "step": 14248 + }, + { + "epoch": 0.775914997311334, + "grad_norm": 0.7418593316007284, + "learning_rate": 0.00013999478311963614, + "loss": 12.3345, + "step": 14249 + }, + { + "epoch": 0.775969451307917, + "grad_norm": 0.5577836545952066, + "learning_rate": 0.00013998670069627505, + "loss": 12.3324, + "step": 14250 + }, + { + "epoch": 0.7760239053044999, + "grad_norm": 0.5608169550998805, + "learning_rate": 0.00013997861796197068, + "loss": 12.124, + "step": 14251 + }, + { + "epoch": 0.7760783593010829, + "grad_norm": 0.632786855936109, + "learning_rate": 0.00013997053491678584, + "loss": 12.4297, + "step": 14252 + }, + { + "epoch": 0.7761328132976659, + "grad_norm": 0.5433203133912633, + "learning_rate": 0.00013996245156078343, + "loss": 12.072, + "step": 14253 + }, + { + "epoch": 0.776187267294249, + "grad_norm": 0.5843187705344397, + "learning_rate": 0.0001399543678940263, + "loss": 12.2276, + "step": 14254 + }, + { + "epoch": 0.776241721290832, + "grad_norm": 0.5815658943635917, + "learning_rate": 0.0001399462839165773, + "loss": 12.2323, + "step": 14255 + }, + { + "epoch": 0.776296175287415, + "grad_norm": 0.5538371414048853, + "learning_rate": 0.00013993819962849932, + "loss": 12.1063, + "step": 14256 + }, + { + "epoch": 0.776350629283998, + "grad_norm": 0.5472741001888909, + "learning_rate": 0.00013993011502985519, + "loss": 12.1826, + "step": 14257 + }, + { + "epoch": 0.776405083280581, + "grad_norm": 0.6017028137201669, + "learning_rate": 0.00013992203012070775, + "loss": 12.1747, + "step": 14258 + }, + { + "epoch": 0.776459537277164, + "grad_norm": 0.5861421427275709, + "learning_rate": 0.00013991394490111994, + "loss": 12.1843, + "step": 14259 + }, + { + "epoch": 0.7765139912737471, + "grad_norm": 0.6023377412435076, + "learning_rate": 0.0001399058593711546, + "loss": 12.1658, + "step": 14260 + }, + { + "epoch": 0.7765684452703301, + "grad_norm": 0.5754821585939178, + "learning_rate": 0.00013989777353087463, + "loss": 12.1961, + "step": 14261 + }, + { + "epoch": 0.7766228992669131, + "grad_norm": 0.6475523071910245, + "learning_rate": 0.00013988968738034286, + "loss": 12.2166, + "step": 14262 + }, + { + "epoch": 0.7766773532634961, + "grad_norm": 0.5220931415276219, + "learning_rate": 0.00013988160091962218, + "loss": 12.0853, + "step": 14263 + }, + { + "epoch": 0.7767318072600791, + "grad_norm": 0.570755149486794, + "learning_rate": 0.00013987351414877547, + "loss": 12.1619, + "step": 14264 + }, + { + "epoch": 0.7767862612566621, + "grad_norm": 0.6100661674420887, + "learning_rate": 0.00013986542706786564, + "loss": 12.2246, + "step": 14265 + }, + { + "epoch": 0.7768407152532452, + "grad_norm": 0.5887754471812554, + "learning_rate": 0.00013985733967695563, + "loss": 12.2059, + "step": 14266 + }, + { + "epoch": 0.7768951692498282, + "grad_norm": 0.6519893463475808, + "learning_rate": 0.0001398492519761082, + "loss": 12.1453, + "step": 14267 + }, + { + "epoch": 0.7769496232464111, + "grad_norm": 0.5964378271102649, + "learning_rate": 0.0001398411639653863, + "loss": 12.2564, + "step": 14268 + }, + { + "epoch": 0.7770040772429941, + "grad_norm": 0.5732917751538955, + "learning_rate": 0.0001398330756448528, + "loss": 12.2191, + "step": 14269 + }, + { + "epoch": 0.7770585312395771, + "grad_norm": 0.5754070704973367, + "learning_rate": 0.0001398249870145707, + "loss": 12.1242, + "step": 14270 + }, + { + "epoch": 0.7771129852361601, + "grad_norm": 0.6793722475374825, + "learning_rate": 0.0001398168980746028, + "loss": 12.3338, + "step": 14271 + }, + { + "epoch": 0.7771674392327432, + "grad_norm": 0.5648411282352749, + "learning_rate": 0.00013980880882501199, + "loss": 12.218, + "step": 14272 + }, + { + "epoch": 0.7772218932293262, + "grad_norm": 0.6331849677299348, + "learning_rate": 0.00013980071926586124, + "loss": 12.4167, + "step": 14273 + }, + { + "epoch": 0.7772763472259092, + "grad_norm": 0.578231429227074, + "learning_rate": 0.0001397926293972134, + "loss": 12.2841, + "step": 14274 + }, + { + "epoch": 0.7773308012224922, + "grad_norm": 0.6491037203032933, + "learning_rate": 0.0001397845392191314, + "loss": 12.3289, + "step": 14275 + }, + { + "epoch": 0.7773852552190752, + "grad_norm": 0.5303799869679332, + "learning_rate": 0.00013977644873167816, + "loss": 12.1669, + "step": 14276 + }, + { + "epoch": 0.7774397092156582, + "grad_norm": 0.5901335407418612, + "learning_rate": 0.0001397683579349166, + "loss": 12.1261, + "step": 14277 + }, + { + "epoch": 0.7774941632122413, + "grad_norm": 0.5676359507328063, + "learning_rate": 0.00013976026682890958, + "loss": 12.1885, + "step": 14278 + }, + { + "epoch": 0.7775486172088243, + "grad_norm": 0.534057470171481, + "learning_rate": 0.0001397521754137201, + "loss": 12.1992, + "step": 14279 + }, + { + "epoch": 0.7776030712054073, + "grad_norm": 0.5814240032332899, + "learning_rate": 0.000139744083689411, + "loss": 12.1737, + "step": 14280 + }, + { + "epoch": 0.7776575252019903, + "grad_norm": 0.5591124285685799, + "learning_rate": 0.00013973599165604527, + "loss": 12.2514, + "step": 14281 + }, + { + "epoch": 0.7777119791985733, + "grad_norm": 0.5714511589156374, + "learning_rate": 0.0001397278993136858, + "loss": 12.2969, + "step": 14282 + }, + { + "epoch": 0.7777664331951564, + "grad_norm": 0.5185389079557104, + "learning_rate": 0.0001397198066623955, + "loss": 12.2458, + "step": 14283 + }, + { + "epoch": 0.7778208871917394, + "grad_norm": 0.5416837860102734, + "learning_rate": 0.00013971171370223736, + "loss": 12.292, + "step": 14284 + }, + { + "epoch": 0.7778753411883224, + "grad_norm": 0.6085444604486575, + "learning_rate": 0.00013970362043327423, + "loss": 12.2255, + "step": 14285 + }, + { + "epoch": 0.7779297951849053, + "grad_norm": 0.6118832655342363, + "learning_rate": 0.00013969552685556914, + "loss": 12.249, + "step": 14286 + }, + { + "epoch": 0.7779842491814883, + "grad_norm": 0.583875910795135, + "learning_rate": 0.0001396874329691849, + "loss": 12.2431, + "step": 14287 + }, + { + "epoch": 0.7780387031780713, + "grad_norm": 0.6199842096758743, + "learning_rate": 0.0001396793387741846, + "loss": 12.237, + "step": 14288 + }, + { + "epoch": 0.7780931571746544, + "grad_norm": 0.5951711257796768, + "learning_rate": 0.00013967124427063108, + "loss": 12.0649, + "step": 14289 + }, + { + "epoch": 0.7781476111712374, + "grad_norm": 0.6533099070438597, + "learning_rate": 0.0001396631494585873, + "loss": 12.3251, + "step": 14290 + }, + { + "epoch": 0.7782020651678204, + "grad_norm": 0.6306942750243015, + "learning_rate": 0.00013965505433811623, + "loss": 12.2949, + "step": 14291 + }, + { + "epoch": 0.7782565191644034, + "grad_norm": 0.5373222830184418, + "learning_rate": 0.0001396469589092808, + "loss": 12.1354, + "step": 14292 + }, + { + "epoch": 0.7783109731609864, + "grad_norm": 0.5551774573109617, + "learning_rate": 0.00013963886317214398, + "loss": 12.2581, + "step": 14293 + }, + { + "epoch": 0.7783654271575694, + "grad_norm": 0.5963154606421266, + "learning_rate": 0.00013963076712676873, + "loss": 12.3286, + "step": 14294 + }, + { + "epoch": 0.7784198811541525, + "grad_norm": 0.5861963744730597, + "learning_rate": 0.00013962267077321795, + "loss": 12.2763, + "step": 14295 + }, + { + "epoch": 0.7784743351507355, + "grad_norm": 0.6176149298969914, + "learning_rate": 0.00013961457411155466, + "loss": 12.3619, + "step": 14296 + }, + { + "epoch": 0.7785287891473185, + "grad_norm": 0.5448354456452081, + "learning_rate": 0.00013960647714184182, + "loss": 12.2151, + "step": 14297 + }, + { + "epoch": 0.7785832431439015, + "grad_norm": 0.520422704062557, + "learning_rate": 0.0001395983798641423, + "loss": 12.2499, + "step": 14298 + }, + { + "epoch": 0.7786376971404845, + "grad_norm": 0.557202372867009, + "learning_rate": 0.0001395902822785192, + "loss": 12.165, + "step": 14299 + }, + { + "epoch": 0.7786921511370675, + "grad_norm": 0.5961029491895187, + "learning_rate": 0.00013958218438503542, + "loss": 12.3071, + "step": 14300 + }, + { + "epoch": 0.7787466051336506, + "grad_norm": 0.5826837314749691, + "learning_rate": 0.00013957408618375393, + "loss": 12.2213, + "step": 14301 + }, + { + "epoch": 0.7788010591302336, + "grad_norm": 0.5420504530064048, + "learning_rate": 0.0001395659876747377, + "loss": 12.2102, + "step": 14302 + }, + { + "epoch": 0.7788555131268166, + "grad_norm": 0.5346947413027613, + "learning_rate": 0.00013955788885804972, + "loss": 12.1678, + "step": 14303 + }, + { + "epoch": 0.7789099671233995, + "grad_norm": 0.5882148826393877, + "learning_rate": 0.00013954978973375294, + "loss": 12.1439, + "step": 14304 + }, + { + "epoch": 0.7789644211199825, + "grad_norm": 0.5857740211823618, + "learning_rate": 0.0001395416903019104, + "loss": 12.2248, + "step": 14305 + }, + { + "epoch": 0.7790188751165655, + "grad_norm": 0.5330991902072838, + "learning_rate": 0.00013953359056258503, + "loss": 12.1562, + "step": 14306 + }, + { + "epoch": 0.7790733291131486, + "grad_norm": 0.6525952363657743, + "learning_rate": 0.00013952549051583982, + "loss": 12.281, + "step": 14307 + }, + { + "epoch": 0.7791277831097316, + "grad_norm": 0.6381722528438454, + "learning_rate": 0.0001395173901617378, + "loss": 12.2519, + "step": 14308 + }, + { + "epoch": 0.7791822371063146, + "grad_norm": 0.5788636699692825, + "learning_rate": 0.00013950928950034187, + "loss": 12.1706, + "step": 14309 + }, + { + "epoch": 0.7792366911028976, + "grad_norm": 0.5714525079864948, + "learning_rate": 0.00013950118853171513, + "loss": 12.1567, + "step": 14310 + }, + { + "epoch": 0.7792911450994806, + "grad_norm": 0.5847814821575158, + "learning_rate": 0.0001394930872559205, + "loss": 12.274, + "step": 14311 + }, + { + "epoch": 0.7793455990960636, + "grad_norm": 0.6262641473386918, + "learning_rate": 0.000139484985673021, + "loss": 12.3492, + "step": 14312 + }, + { + "epoch": 0.7794000530926467, + "grad_norm": 0.6105688553660109, + "learning_rate": 0.00013947688378307963, + "loss": 12.1608, + "step": 14313 + }, + { + "epoch": 0.7794545070892297, + "grad_norm": 0.5249630319727343, + "learning_rate": 0.0001394687815861594, + "loss": 12.0978, + "step": 14314 + }, + { + "epoch": 0.7795089610858127, + "grad_norm": 0.6093801425256825, + "learning_rate": 0.00013946067908232333, + "loss": 12.1557, + "step": 14315 + }, + { + "epoch": 0.7795634150823957, + "grad_norm": 0.6559736581795284, + "learning_rate": 0.00013945257627163437, + "loss": 12.2336, + "step": 14316 + }, + { + "epoch": 0.7796178690789787, + "grad_norm": 0.5574043885221901, + "learning_rate": 0.00013944447315415557, + "loss": 12.1308, + "step": 14317 + }, + { + "epoch": 0.7796723230755618, + "grad_norm": 0.5868701210809807, + "learning_rate": 0.00013943636972994991, + "loss": 12.2638, + "step": 14318 + }, + { + "epoch": 0.7797267770721448, + "grad_norm": 0.6012516255490127, + "learning_rate": 0.00013942826599908044, + "loss": 12.2809, + "step": 14319 + }, + { + "epoch": 0.7797812310687278, + "grad_norm": 0.5372097078881665, + "learning_rate": 0.00013942016196161016, + "loss": 12.2253, + "step": 14320 + }, + { + "epoch": 0.7798356850653108, + "grad_norm": 0.545134903719249, + "learning_rate": 0.00013941205761760212, + "loss": 12.2653, + "step": 14321 + }, + { + "epoch": 0.7798901390618938, + "grad_norm": 0.5730787720955979, + "learning_rate": 0.0001394039529671193, + "loss": 12.2101, + "step": 14322 + }, + { + "epoch": 0.7799445930584767, + "grad_norm": 0.5340799709141614, + "learning_rate": 0.0001393958480102247, + "loss": 12.1503, + "step": 14323 + }, + { + "epoch": 0.7799990470550598, + "grad_norm": 0.562407601107663, + "learning_rate": 0.0001393877427469814, + "loss": 12.2644, + "step": 14324 + }, + { + "epoch": 0.7800535010516428, + "grad_norm": 0.6181723575845991, + "learning_rate": 0.0001393796371774524, + "loss": 12.124, + "step": 14325 + }, + { + "epoch": 0.7801079550482258, + "grad_norm": 0.509798324698095, + "learning_rate": 0.00013937153130170075, + "loss": 12.2401, + "step": 14326 + }, + { + "epoch": 0.7801624090448088, + "grad_norm": 0.587458779265688, + "learning_rate": 0.00013936342511978946, + "loss": 12.126, + "step": 14327 + }, + { + "epoch": 0.7802168630413918, + "grad_norm": 0.6251305508340274, + "learning_rate": 0.00013935531863178157, + "loss": 12.1942, + "step": 14328 + }, + { + "epoch": 0.7802713170379748, + "grad_norm": 0.5939469560259177, + "learning_rate": 0.00013934721183774015, + "loss": 12.2734, + "step": 14329 + }, + { + "epoch": 0.7803257710345579, + "grad_norm": 0.584601867857084, + "learning_rate": 0.00013933910473772816, + "loss": 12.2859, + "step": 14330 + }, + { + "epoch": 0.7803802250311409, + "grad_norm": 0.5854926109302606, + "learning_rate": 0.00013933099733180876, + "loss": 12.2367, + "step": 14331 + }, + { + "epoch": 0.7804346790277239, + "grad_norm": 0.6588983389598322, + "learning_rate": 0.00013932288962004486, + "loss": 12.209, + "step": 14332 + }, + { + "epoch": 0.7804891330243069, + "grad_norm": 0.6337923522917029, + "learning_rate": 0.00013931478160249966, + "loss": 12.237, + "step": 14333 + }, + { + "epoch": 0.7805435870208899, + "grad_norm": 0.587236476231902, + "learning_rate": 0.00013930667327923606, + "loss": 12.2099, + "step": 14334 + }, + { + "epoch": 0.7805980410174729, + "grad_norm": 0.5244300732635503, + "learning_rate": 0.00013929856465031716, + "loss": 12.1203, + "step": 14335 + }, + { + "epoch": 0.780652495014056, + "grad_norm": 0.5159189318018038, + "learning_rate": 0.00013929045571580608, + "loss": 12.2196, + "step": 14336 + }, + { + "epoch": 0.780706949010639, + "grad_norm": 0.6009509071692117, + "learning_rate": 0.00013928234647576581, + "loss": 12.2065, + "step": 14337 + }, + { + "epoch": 0.780761403007222, + "grad_norm": 0.62828740155288, + "learning_rate": 0.00013927423693025942, + "loss": 12.2639, + "step": 14338 + }, + { + "epoch": 0.780815857003805, + "grad_norm": 0.5782629456373587, + "learning_rate": 0.00013926612707935, + "loss": 12.3063, + "step": 14339 + }, + { + "epoch": 0.780870311000388, + "grad_norm": 0.6183466777482532, + "learning_rate": 0.00013925801692310058, + "loss": 12.1923, + "step": 14340 + }, + { + "epoch": 0.7809247649969709, + "grad_norm": 0.5903083877025544, + "learning_rate": 0.00013924990646157424, + "loss": 12.2672, + "step": 14341 + }, + { + "epoch": 0.780979218993554, + "grad_norm": 0.6053161053985386, + "learning_rate": 0.00013924179569483401, + "loss": 12.2701, + "step": 14342 + }, + { + "epoch": 0.781033672990137, + "grad_norm": 0.5986010917087732, + "learning_rate": 0.00013923368462294303, + "loss": 12.2788, + "step": 14343 + }, + { + "epoch": 0.78108812698672, + "grad_norm": 0.6302334667450733, + "learning_rate": 0.00013922557324596435, + "loss": 12.3288, + "step": 14344 + }, + { + "epoch": 0.781142580983303, + "grad_norm": 0.564817574612964, + "learning_rate": 0.000139217461563961, + "loss": 12.2768, + "step": 14345 + }, + { + "epoch": 0.781197034979886, + "grad_norm": 0.6042381395814973, + "learning_rate": 0.00013920934957699612, + "loss": 12.0756, + "step": 14346 + }, + { + "epoch": 0.7812514889764691, + "grad_norm": 0.7369856283803117, + "learning_rate": 0.00013920123728513274, + "loss": 12.2836, + "step": 14347 + }, + { + "epoch": 0.7813059429730521, + "grad_norm": 0.6592566362375767, + "learning_rate": 0.00013919312468843397, + "loss": 12.2566, + "step": 14348 + }, + { + "epoch": 0.7813603969696351, + "grad_norm": 0.6098850365522455, + "learning_rate": 0.0001391850117869629, + "loss": 12.2025, + "step": 14349 + }, + { + "epoch": 0.7814148509662181, + "grad_norm": 0.6014968474919596, + "learning_rate": 0.0001391768985807826, + "loss": 12.2522, + "step": 14350 + }, + { + "epoch": 0.7814693049628011, + "grad_norm": 0.5381136132046134, + "learning_rate": 0.00013916878506995618, + "loss": 12.2215, + "step": 14351 + }, + { + "epoch": 0.7815237589593841, + "grad_norm": 0.5348296555678123, + "learning_rate": 0.0001391606712545467, + "loss": 12.2183, + "step": 14352 + }, + { + "epoch": 0.7815782129559672, + "grad_norm": 0.6507633060534262, + "learning_rate": 0.00013915255713461727, + "loss": 12.2612, + "step": 14353 + }, + { + "epoch": 0.7816326669525502, + "grad_norm": 0.5396809649087732, + "learning_rate": 0.000139144442710231, + "loss": 12.1339, + "step": 14354 + }, + { + "epoch": 0.7816871209491332, + "grad_norm": 0.5438368503662775, + "learning_rate": 0.000139136327981451, + "loss": 12.3099, + "step": 14355 + }, + { + "epoch": 0.7817415749457162, + "grad_norm": 0.659822970082171, + "learning_rate": 0.00013912821294834033, + "loss": 12.2638, + "step": 14356 + }, + { + "epoch": 0.7817960289422992, + "grad_norm": 0.4852121890389048, + "learning_rate": 0.00013912009761096213, + "loss": 12.1385, + "step": 14357 + }, + { + "epoch": 0.7818504829388822, + "grad_norm": 0.515123968159365, + "learning_rate": 0.00013911198196937946, + "loss": 12.2034, + "step": 14358 + }, + { + "epoch": 0.7819049369354653, + "grad_norm": 0.5601852691409116, + "learning_rate": 0.00013910386602365547, + "loss": 12.2976, + "step": 14359 + }, + { + "epoch": 0.7819593909320482, + "grad_norm": 0.5218995324590391, + "learning_rate": 0.00013909574977385327, + "loss": 12.2155, + "step": 14360 + }, + { + "epoch": 0.7820138449286312, + "grad_norm": 0.7009425777978717, + "learning_rate": 0.00013908763322003595, + "loss": 12.1889, + "step": 14361 + }, + { + "epoch": 0.7820682989252142, + "grad_norm": 0.5250741002436927, + "learning_rate": 0.00013907951636226665, + "loss": 12.2074, + "step": 14362 + }, + { + "epoch": 0.7821227529217972, + "grad_norm": 0.5484504796190243, + "learning_rate": 0.00013907139920060847, + "loss": 12.2898, + "step": 14363 + }, + { + "epoch": 0.7821772069183802, + "grad_norm": 0.639001255662936, + "learning_rate": 0.00013906328173512455, + "loss": 12.215, + "step": 14364 + }, + { + "epoch": 0.7822316609149633, + "grad_norm": 0.5844299666446592, + "learning_rate": 0.000139055163965878, + "loss": 12.1908, + "step": 14365 + }, + { + "epoch": 0.7822861149115463, + "grad_norm": 0.6061975542541682, + "learning_rate": 0.00013904704589293192, + "loss": 12.2301, + "step": 14366 + }, + { + "epoch": 0.7823405689081293, + "grad_norm": 0.6185399821651211, + "learning_rate": 0.00013903892751634947, + "loss": 12.2471, + "step": 14367 + }, + { + "epoch": 0.7823950229047123, + "grad_norm": 0.6760511265559507, + "learning_rate": 0.0001390308088361938, + "loss": 12.233, + "step": 14368 + }, + { + "epoch": 0.7824494769012953, + "grad_norm": 0.5388520380861961, + "learning_rate": 0.000139022689852528, + "loss": 12.2188, + "step": 14369 + }, + { + "epoch": 0.7825039308978783, + "grad_norm": 0.5564895987143336, + "learning_rate": 0.0001390145705654152, + "loss": 12.106, + "step": 14370 + }, + { + "epoch": 0.7825583848944614, + "grad_norm": 0.5215538856075096, + "learning_rate": 0.00013900645097491857, + "loss": 12.1252, + "step": 14371 + }, + { + "epoch": 0.7826128388910444, + "grad_norm": 0.6067388147613065, + "learning_rate": 0.00013899833108110127, + "loss": 12.1462, + "step": 14372 + }, + { + "epoch": 0.7826672928876274, + "grad_norm": 0.5499048916495173, + "learning_rate": 0.00013899021088402638, + "loss": 12.1765, + "step": 14373 + }, + { + "epoch": 0.7827217468842104, + "grad_norm": 0.6123862168122999, + "learning_rate": 0.00013898209038375704, + "loss": 12.1465, + "step": 14374 + }, + { + "epoch": 0.7827762008807934, + "grad_norm": 0.5353567283212902, + "learning_rate": 0.00013897396958035646, + "loss": 12.3048, + "step": 14375 + }, + { + "epoch": 0.7828306548773764, + "grad_norm": 0.7451386834832064, + "learning_rate": 0.00013896584847388776, + "loss": 12.3199, + "step": 14376 + }, + { + "epoch": 0.7828851088739595, + "grad_norm": 0.5468178378836177, + "learning_rate": 0.0001389577270644141, + "loss": 12.2477, + "step": 14377 + }, + { + "epoch": 0.7829395628705424, + "grad_norm": 0.561119411847658, + "learning_rate": 0.00013894960535199862, + "loss": 12.0196, + "step": 14378 + }, + { + "epoch": 0.7829940168671254, + "grad_norm": 0.5808741207692449, + "learning_rate": 0.00013894148333670444, + "loss": 12.1076, + "step": 14379 + }, + { + "epoch": 0.7830484708637084, + "grad_norm": 0.6466017916541937, + "learning_rate": 0.00013893336101859479, + "loss": 12.3068, + "step": 14380 + }, + { + "epoch": 0.7831029248602914, + "grad_norm": 0.6033825408993769, + "learning_rate": 0.00013892523839773274, + "loss": 12.1595, + "step": 14381 + }, + { + "epoch": 0.7831573788568745, + "grad_norm": 0.6134304033155082, + "learning_rate": 0.00013891711547418158, + "loss": 12.3442, + "step": 14382 + }, + { + "epoch": 0.7832118328534575, + "grad_norm": 0.516047645087405, + "learning_rate": 0.0001389089922480044, + "loss": 12.241, + "step": 14383 + }, + { + "epoch": 0.7832662868500405, + "grad_norm": 0.5306019795143434, + "learning_rate": 0.0001389008687192643, + "loss": 12.2247, + "step": 14384 + }, + { + "epoch": 0.7833207408466235, + "grad_norm": 0.6731617967193122, + "learning_rate": 0.00013889274488802458, + "loss": 12.157, + "step": 14385 + }, + { + "epoch": 0.7833751948432065, + "grad_norm": 0.5768927983386264, + "learning_rate": 0.00013888462075434832, + "loss": 12.1987, + "step": 14386 + }, + { + "epoch": 0.7834296488397895, + "grad_norm": 0.5348814636384673, + "learning_rate": 0.00013887649631829879, + "loss": 12.2369, + "step": 14387 + }, + { + "epoch": 0.7834841028363726, + "grad_norm": 0.5322337299843027, + "learning_rate": 0.00013886837157993904, + "loss": 12.1423, + "step": 14388 + }, + { + "epoch": 0.7835385568329556, + "grad_norm": 0.5413549190692319, + "learning_rate": 0.00013886024653933233, + "loss": 12.2067, + "step": 14389 + }, + { + "epoch": 0.7835930108295386, + "grad_norm": 0.5780044416754666, + "learning_rate": 0.0001388521211965418, + "loss": 12.1955, + "step": 14390 + }, + { + "epoch": 0.7836474648261216, + "grad_norm": 0.5817049120825359, + "learning_rate": 0.0001388439955516307, + "loss": 12.2339, + "step": 14391 + }, + { + "epoch": 0.7837019188227046, + "grad_norm": 0.5970867677148517, + "learning_rate": 0.0001388358696046622, + "loss": 12.3324, + "step": 14392 + }, + { + "epoch": 0.7837563728192876, + "grad_norm": 0.6115114126909592, + "learning_rate": 0.00013882774335569943, + "loss": 12.1704, + "step": 14393 + }, + { + "epoch": 0.7838108268158707, + "grad_norm": 0.6358443448694101, + "learning_rate": 0.00013881961680480562, + "loss": 12.2058, + "step": 14394 + }, + { + "epoch": 0.7838652808124537, + "grad_norm": 0.5083773725240279, + "learning_rate": 0.00013881148995204394, + "loss": 12.0843, + "step": 14395 + }, + { + "epoch": 0.7839197348090367, + "grad_norm": 0.5771923630494237, + "learning_rate": 0.0001388033627974776, + "loss": 12.2925, + "step": 14396 + }, + { + "epoch": 0.7839741888056196, + "grad_norm": 0.7033857158513953, + "learning_rate": 0.00013879523534116988, + "loss": 12.3006, + "step": 14397 + }, + { + "epoch": 0.7840286428022026, + "grad_norm": 0.5933514222197236, + "learning_rate": 0.00013878710758318384, + "loss": 12.2273, + "step": 14398 + }, + { + "epoch": 0.7840830967987856, + "grad_norm": 0.5768914323508817, + "learning_rate": 0.00013877897952358276, + "loss": 12.1288, + "step": 14399 + }, + { + "epoch": 0.7841375507953687, + "grad_norm": 0.5646886542575501, + "learning_rate": 0.00013877085116242982, + "loss": 12.2829, + "step": 14400 + }, + { + "epoch": 0.7841920047919517, + "grad_norm": 0.6477289200019736, + "learning_rate": 0.00013876272249978823, + "loss": 12.3978, + "step": 14401 + }, + { + "epoch": 0.7842464587885347, + "grad_norm": 0.5648206891195865, + "learning_rate": 0.00013875459353572124, + "loss": 12.1633, + "step": 14402 + }, + { + "epoch": 0.7843009127851177, + "grad_norm": 0.6490668828600686, + "learning_rate": 0.00013874646427029203, + "loss": 12.2609, + "step": 14403 + }, + { + "epoch": 0.7843553667817007, + "grad_norm": 0.5890545313527348, + "learning_rate": 0.00013873833470356381, + "loss": 12.3098, + "step": 14404 + }, + { + "epoch": 0.7844098207782837, + "grad_norm": 0.6127329938396254, + "learning_rate": 0.00013873020483559978, + "loss": 12.3745, + "step": 14405 + }, + { + "epoch": 0.7844642747748668, + "grad_norm": 0.5983140066662879, + "learning_rate": 0.00013872207466646323, + "loss": 12.2806, + "step": 14406 + }, + { + "epoch": 0.7845187287714498, + "grad_norm": 0.5606783047599062, + "learning_rate": 0.0001387139441962173, + "loss": 12.0884, + "step": 14407 + }, + { + "epoch": 0.7845731827680328, + "grad_norm": 0.6118712385980577, + "learning_rate": 0.00013870581342492527, + "loss": 12.2015, + "step": 14408 + }, + { + "epoch": 0.7846276367646158, + "grad_norm": 0.5752553693397917, + "learning_rate": 0.00013869768235265034, + "loss": 12.262, + "step": 14409 + }, + { + "epoch": 0.7846820907611988, + "grad_norm": 0.6494019440602582, + "learning_rate": 0.00013868955097945572, + "loss": 12.2359, + "step": 14410 + }, + { + "epoch": 0.7847365447577818, + "grad_norm": 0.5540211353726932, + "learning_rate": 0.00013868141930540467, + "loss": 12.2692, + "step": 14411 + }, + { + "epoch": 0.7847909987543649, + "grad_norm": 0.5986349560833598, + "learning_rate": 0.00013867328733056043, + "loss": 12.1355, + "step": 14412 + }, + { + "epoch": 0.7848454527509479, + "grad_norm": 0.5163497163917473, + "learning_rate": 0.00013866515505498619, + "loss": 12.2954, + "step": 14413 + }, + { + "epoch": 0.7848999067475309, + "grad_norm": 0.6433436001488466, + "learning_rate": 0.00013865702247874525, + "loss": 12.2901, + "step": 14414 + }, + { + "epoch": 0.7849543607441138, + "grad_norm": 0.581890366088506, + "learning_rate": 0.0001386488896019008, + "loss": 12.0715, + "step": 14415 + }, + { + "epoch": 0.7850088147406968, + "grad_norm": 0.6102902133868333, + "learning_rate": 0.0001386407564245161, + "loss": 12.3666, + "step": 14416 + }, + { + "epoch": 0.7850632687372799, + "grad_norm": 0.5804923766002399, + "learning_rate": 0.00013863262294665444, + "loss": 12.3214, + "step": 14417 + }, + { + "epoch": 0.7851177227338629, + "grad_norm": 0.6068979406349941, + "learning_rate": 0.00013862448916837901, + "loss": 12.41, + "step": 14418 + }, + { + "epoch": 0.7851721767304459, + "grad_norm": 0.6058305883785644, + "learning_rate": 0.00013861635508975305, + "loss": 12.224, + "step": 14419 + }, + { + "epoch": 0.7852266307270289, + "grad_norm": 0.5933602069707344, + "learning_rate": 0.00013860822071083985, + "loss": 12.2893, + "step": 14420 + }, + { + "epoch": 0.7852810847236119, + "grad_norm": 0.5838506041908982, + "learning_rate": 0.00013860008603170266, + "loss": 12.219, + "step": 14421 + }, + { + "epoch": 0.7853355387201949, + "grad_norm": 0.565446422535431, + "learning_rate": 0.0001385919510524047, + "loss": 12.1994, + "step": 14422 + }, + { + "epoch": 0.785389992716778, + "grad_norm": 0.591349648141618, + "learning_rate": 0.00013858381577300927, + "loss": 12.3008, + "step": 14423 + }, + { + "epoch": 0.785444446713361, + "grad_norm": 0.6212443898915186, + "learning_rate": 0.00013857568019357962, + "loss": 12.3911, + "step": 14424 + }, + { + "epoch": 0.785498900709944, + "grad_norm": 0.5605166019234723, + "learning_rate": 0.00013856754431417905, + "loss": 12.297, + "step": 14425 + }, + { + "epoch": 0.785553354706527, + "grad_norm": 0.5557117201016185, + "learning_rate": 0.00013855940813487075, + "loss": 12.2484, + "step": 14426 + }, + { + "epoch": 0.78560780870311, + "grad_norm": 0.5896367418056946, + "learning_rate": 0.00013855127165571804, + "loss": 12.0708, + "step": 14427 + }, + { + "epoch": 0.785662262699693, + "grad_norm": 0.5566213369949221, + "learning_rate": 0.00013854313487678414, + "loss": 12.2472, + "step": 14428 + }, + { + "epoch": 0.7857167166962761, + "grad_norm": 0.5948199089283874, + "learning_rate": 0.0001385349977981324, + "loss": 12.3153, + "step": 14429 + }, + { + "epoch": 0.7857711706928591, + "grad_norm": 0.6235122010792709, + "learning_rate": 0.00013852686041982603, + "loss": 12.3027, + "step": 14430 + }, + { + "epoch": 0.7858256246894421, + "grad_norm": 0.5084880832530156, + "learning_rate": 0.00013851872274192833, + "loss": 12.1476, + "step": 14431 + }, + { + "epoch": 0.785880078686025, + "grad_norm": 0.5662903553664386, + "learning_rate": 0.00013851058476450264, + "loss": 12.1704, + "step": 14432 + }, + { + "epoch": 0.785934532682608, + "grad_norm": 0.5153724619182718, + "learning_rate": 0.0001385024464876121, + "loss": 12.1991, + "step": 14433 + }, + { + "epoch": 0.785988986679191, + "grad_norm": 0.608915473834492, + "learning_rate": 0.00013849430791132013, + "loss": 12.4918, + "step": 14434 + }, + { + "epoch": 0.7860434406757741, + "grad_norm": 0.5316088716633605, + "learning_rate": 0.00013848616903568996, + "loss": 12.1875, + "step": 14435 + }, + { + "epoch": 0.7860978946723571, + "grad_norm": 0.5752852247644937, + "learning_rate": 0.00013847802986078486, + "loss": 12.2453, + "step": 14436 + }, + { + "epoch": 0.7861523486689401, + "grad_norm": 0.5561773848748036, + "learning_rate": 0.0001384698903866682, + "loss": 12.1688, + "step": 14437 + }, + { + "epoch": 0.7862068026655231, + "grad_norm": 0.5808394601058333, + "learning_rate": 0.00013846175061340318, + "loss": 12.2476, + "step": 14438 + }, + { + "epoch": 0.7862612566621061, + "grad_norm": 0.6028946704665702, + "learning_rate": 0.00013845361054105312, + "loss": 12.2198, + "step": 14439 + }, + { + "epoch": 0.7863157106586891, + "grad_norm": 0.5933014981190774, + "learning_rate": 0.00013844547016968138, + "loss": 12.2097, + "step": 14440 + }, + { + "epoch": 0.7863701646552722, + "grad_norm": 0.5770644222918464, + "learning_rate": 0.0001384373294993512, + "loss": 12.1455, + "step": 14441 + }, + { + "epoch": 0.7864246186518552, + "grad_norm": 0.6156425720653473, + "learning_rate": 0.00013842918853012592, + "loss": 12.2782, + "step": 14442 + }, + { + "epoch": 0.7864790726484382, + "grad_norm": 0.6109416698177631, + "learning_rate": 0.0001384210472620688, + "loss": 12.2674, + "step": 14443 + }, + { + "epoch": 0.7865335266450212, + "grad_norm": 0.6145744345463588, + "learning_rate": 0.00013841290569524314, + "loss": 12.2003, + "step": 14444 + }, + { + "epoch": 0.7865879806416042, + "grad_norm": 0.5651575047053573, + "learning_rate": 0.00013840476382971233, + "loss": 12.2573, + "step": 14445 + }, + { + "epoch": 0.7866424346381872, + "grad_norm": 0.569776001009847, + "learning_rate": 0.00013839662166553965, + "loss": 12.2178, + "step": 14446 + }, + { + "epoch": 0.7866968886347703, + "grad_norm": 0.5834825925351157, + "learning_rate": 0.00013838847920278838, + "loss": 12.203, + "step": 14447 + }, + { + "epoch": 0.7867513426313533, + "grad_norm": 0.5820569556884193, + "learning_rate": 0.00013838033644152185, + "loss": 12.1508, + "step": 14448 + }, + { + "epoch": 0.7868057966279363, + "grad_norm": 0.549526625619789, + "learning_rate": 0.0001383721933818034, + "loss": 12.1994, + "step": 14449 + }, + { + "epoch": 0.7868602506245193, + "grad_norm": 0.6419107847056853, + "learning_rate": 0.0001383640500236963, + "loss": 12.1019, + "step": 14450 + }, + { + "epoch": 0.7869147046211022, + "grad_norm": 0.5877121778475324, + "learning_rate": 0.00013835590636726396, + "loss": 12.2655, + "step": 14451 + }, + { + "epoch": 0.7869691586176853, + "grad_norm": 0.5575112325526567, + "learning_rate": 0.00013834776241256966, + "loss": 12.223, + "step": 14452 + }, + { + "epoch": 0.7870236126142683, + "grad_norm": 0.5705576045447335, + "learning_rate": 0.0001383396181596767, + "loss": 12.2809, + "step": 14453 + }, + { + "epoch": 0.7870780666108513, + "grad_norm": 0.5420703228607463, + "learning_rate": 0.00013833147360864848, + "loss": 12.2297, + "step": 14454 + }, + { + "epoch": 0.7871325206074343, + "grad_norm": 0.5484969588430767, + "learning_rate": 0.00013832332875954824, + "loss": 12.2913, + "step": 14455 + }, + { + "epoch": 0.7871869746040173, + "grad_norm": 0.549863675343978, + "learning_rate": 0.00013831518361243937, + "loss": 12.2342, + "step": 14456 + }, + { + "epoch": 0.7872414286006003, + "grad_norm": 0.5468044821046292, + "learning_rate": 0.00013830703816738527, + "loss": 12.2085, + "step": 14457 + }, + { + "epoch": 0.7872958825971834, + "grad_norm": 0.6595316857062661, + "learning_rate": 0.00013829889242444917, + "loss": 12.1952, + "step": 14458 + }, + { + "epoch": 0.7873503365937664, + "grad_norm": 0.6309748753681473, + "learning_rate": 0.0001382907463836945, + "loss": 12.1863, + "step": 14459 + }, + { + "epoch": 0.7874047905903494, + "grad_norm": 0.528524171845402, + "learning_rate": 0.0001382826000451845, + "loss": 12.1068, + "step": 14460 + }, + { + "epoch": 0.7874592445869324, + "grad_norm": 0.6553348815463083, + "learning_rate": 0.00013827445340898258, + "loss": 12.2282, + "step": 14461 + }, + { + "epoch": 0.7875136985835154, + "grad_norm": 0.5770544173966313, + "learning_rate": 0.00013826630647515218, + "loss": 12.2327, + "step": 14462 + }, + { + "epoch": 0.7875681525800984, + "grad_norm": 0.6103420761265705, + "learning_rate": 0.0001382581592437565, + "loss": 12.3382, + "step": 14463 + }, + { + "epoch": 0.7876226065766815, + "grad_norm": 0.6456961095391563, + "learning_rate": 0.00013825001171485895, + "loss": 12.2109, + "step": 14464 + }, + { + "epoch": 0.7876770605732645, + "grad_norm": 0.6069538803998202, + "learning_rate": 0.0001382418638885229, + "loss": 12.2003, + "step": 14465 + }, + { + "epoch": 0.7877315145698475, + "grad_norm": 0.6036311575066308, + "learning_rate": 0.00013823371576481174, + "loss": 12.2489, + "step": 14466 + }, + { + "epoch": 0.7877859685664305, + "grad_norm": 0.6012697959525011, + "learning_rate": 0.00013822556734378877, + "loss": 12.2291, + "step": 14467 + }, + { + "epoch": 0.7878404225630135, + "grad_norm": 0.5750414494310325, + "learning_rate": 0.00013821741862551738, + "loss": 12.1316, + "step": 14468 + }, + { + "epoch": 0.7878948765595964, + "grad_norm": 0.6562236907687204, + "learning_rate": 0.00013820926961006092, + "loss": 12.1454, + "step": 14469 + }, + { + "epoch": 0.7879493305561796, + "grad_norm": 0.61036382134781, + "learning_rate": 0.0001382011202974828, + "loss": 12.3017, + "step": 14470 + }, + { + "epoch": 0.7880037845527625, + "grad_norm": 0.5865091461942048, + "learning_rate": 0.00013819297068784633, + "loss": 12.2436, + "step": 14471 + }, + { + "epoch": 0.7880582385493455, + "grad_norm": 0.6258204578276099, + "learning_rate": 0.00013818482078121496, + "loss": 12.297, + "step": 14472 + }, + { + "epoch": 0.7881126925459285, + "grad_norm": 0.5526784225643813, + "learning_rate": 0.00013817667057765197, + "loss": 12.263, + "step": 14473 + }, + { + "epoch": 0.7881671465425115, + "grad_norm": 0.6303789008627477, + "learning_rate": 0.0001381685200772208, + "loss": 12.0504, + "step": 14474 + }, + { + "epoch": 0.7882216005390945, + "grad_norm": 0.5270705695551884, + "learning_rate": 0.00013816036927998484, + "loss": 12.0476, + "step": 14475 + }, + { + "epoch": 0.7882760545356776, + "grad_norm": 0.5661338193897029, + "learning_rate": 0.00013815221818600743, + "loss": 12.3261, + "step": 14476 + }, + { + "epoch": 0.7883305085322606, + "grad_norm": 0.4920199077371078, + "learning_rate": 0.000138144066795352, + "loss": 12.1692, + "step": 14477 + }, + { + "epoch": 0.7883849625288436, + "grad_norm": 0.5405871902740343, + "learning_rate": 0.00013813591510808187, + "loss": 12.1211, + "step": 14478 + }, + { + "epoch": 0.7884394165254266, + "grad_norm": 0.5315923941485983, + "learning_rate": 0.00013812776312426047, + "loss": 12.0371, + "step": 14479 + }, + { + "epoch": 0.7884938705220096, + "grad_norm": 0.5127041828760139, + "learning_rate": 0.0001381196108439512, + "loss": 12.135, + "step": 14480 + }, + { + "epoch": 0.7885483245185927, + "grad_norm": 0.5459664774774081, + "learning_rate": 0.00013811145826721747, + "loss": 12.1413, + "step": 14481 + }, + { + "epoch": 0.7886027785151757, + "grad_norm": 0.6777493278458847, + "learning_rate": 0.00013810330539412263, + "loss": 12.2048, + "step": 14482 + }, + { + "epoch": 0.7886572325117587, + "grad_norm": 0.567214388297729, + "learning_rate": 0.0001380951522247301, + "loss": 12.194, + "step": 14483 + }, + { + "epoch": 0.7887116865083417, + "grad_norm": 0.5948900536613386, + "learning_rate": 0.00013808699875910327, + "loss": 12.1971, + "step": 14484 + }, + { + "epoch": 0.7887661405049247, + "grad_norm": 0.552835977401995, + "learning_rate": 0.00013807884499730555, + "loss": 12.2779, + "step": 14485 + }, + { + "epoch": 0.7888205945015077, + "grad_norm": 0.6584396299441723, + "learning_rate": 0.00013807069093940035, + "loss": 12.3761, + "step": 14486 + }, + { + "epoch": 0.7888750484980908, + "grad_norm": 0.5307595145590913, + "learning_rate": 0.0001380625365854511, + "loss": 12.2804, + "step": 14487 + }, + { + "epoch": 0.7889295024946738, + "grad_norm": 0.5491257047139692, + "learning_rate": 0.00013805438193552114, + "loss": 12.2452, + "step": 14488 + }, + { + "epoch": 0.7889839564912567, + "grad_norm": 0.5955458673578021, + "learning_rate": 0.00013804622698967392, + "loss": 12.1676, + "step": 14489 + }, + { + "epoch": 0.7890384104878397, + "grad_norm": 0.5192351249345226, + "learning_rate": 0.0001380380717479729, + "loss": 12.3022, + "step": 14490 + }, + { + "epoch": 0.7890928644844227, + "grad_norm": 0.4887155507006782, + "learning_rate": 0.00013802991621048141, + "loss": 12.192, + "step": 14491 + }, + { + "epoch": 0.7891473184810057, + "grad_norm": 0.5547709732773544, + "learning_rate": 0.00013802176037726297, + "loss": 12.2312, + "step": 14492 + }, + { + "epoch": 0.7892017724775888, + "grad_norm": 0.556762358309864, + "learning_rate": 0.00013801360424838089, + "loss": 12.2581, + "step": 14493 + }, + { + "epoch": 0.7892562264741718, + "grad_norm": 0.5308349164149758, + "learning_rate": 0.00013800544782389867, + "loss": 12.2044, + "step": 14494 + }, + { + "epoch": 0.7893106804707548, + "grad_norm": 0.5452192484856463, + "learning_rate": 0.00013799729110387972, + "loss": 12.2182, + "step": 14495 + }, + { + "epoch": 0.7893651344673378, + "grad_norm": 0.5342171146676084, + "learning_rate": 0.00013798913408838746, + "loss": 12.1737, + "step": 14496 + }, + { + "epoch": 0.7894195884639208, + "grad_norm": 0.5880953024934228, + "learning_rate": 0.00013798097677748534, + "loss": 12.1635, + "step": 14497 + }, + { + "epoch": 0.7894740424605038, + "grad_norm": 0.5501073488750367, + "learning_rate": 0.00013797281917123674, + "loss": 12.1803, + "step": 14498 + }, + { + "epoch": 0.7895284964570869, + "grad_norm": 0.5466462771813189, + "learning_rate": 0.00013796466126970514, + "loss": 12.282, + "step": 14499 + }, + { + "epoch": 0.7895829504536699, + "grad_norm": 0.555337501442166, + "learning_rate": 0.00013795650307295396, + "loss": 12.2572, + "step": 14500 + }, + { + "epoch": 0.7896374044502529, + "grad_norm": 0.5644067977646973, + "learning_rate": 0.00013794834458104665, + "loss": 12.2426, + "step": 14501 + }, + { + "epoch": 0.7896918584468359, + "grad_norm": 0.6290705505458757, + "learning_rate": 0.00013794018579404668, + "loss": 12.3258, + "step": 14502 + }, + { + "epoch": 0.7897463124434189, + "grad_norm": 0.5530317475999645, + "learning_rate": 0.00013793202671201745, + "loss": 12.2766, + "step": 14503 + }, + { + "epoch": 0.7898007664400019, + "grad_norm": 0.5526295277722347, + "learning_rate": 0.0001379238673350224, + "loss": 12.3009, + "step": 14504 + }, + { + "epoch": 0.789855220436585, + "grad_norm": 0.5924239444057052, + "learning_rate": 0.000137915707663125, + "loss": 12.1161, + "step": 14505 + }, + { + "epoch": 0.789909674433168, + "grad_norm": 0.59420567157587, + "learning_rate": 0.0001379075476963887, + "loss": 12.1919, + "step": 14506 + }, + { + "epoch": 0.789964128429751, + "grad_norm": 0.6298810966617565, + "learning_rate": 0.00013789938743487696, + "loss": 12.2766, + "step": 14507 + }, + { + "epoch": 0.7900185824263339, + "grad_norm": 0.5476263842566301, + "learning_rate": 0.00013789122687865325, + "loss": 12.2857, + "step": 14508 + }, + { + "epoch": 0.7900730364229169, + "grad_norm": 0.555176190503391, + "learning_rate": 0.00013788306602778097, + "loss": 12.132, + "step": 14509 + }, + { + "epoch": 0.7901274904194999, + "grad_norm": 0.6204025876595295, + "learning_rate": 0.00013787490488232363, + "loss": 12.2649, + "step": 14510 + }, + { + "epoch": 0.790181944416083, + "grad_norm": 0.5461093843848734, + "learning_rate": 0.00013786674344234466, + "loss": 12.1079, + "step": 14511 + }, + { + "epoch": 0.790236398412666, + "grad_norm": 0.5836535420064173, + "learning_rate": 0.00013785858170790754, + "loss": 11.9077, + "step": 14512 + }, + { + "epoch": 0.790290852409249, + "grad_norm": 0.5692528585134783, + "learning_rate": 0.0001378504196790758, + "loss": 12.3145, + "step": 14513 + }, + { + "epoch": 0.790345306405832, + "grad_norm": 0.526143377711485, + "learning_rate": 0.00013784225735591278, + "loss": 12.1412, + "step": 14514 + }, + { + "epoch": 0.790399760402415, + "grad_norm": 0.6780130437929691, + "learning_rate": 0.00013783409473848207, + "loss": 12.1476, + "step": 14515 + }, + { + "epoch": 0.7904542143989981, + "grad_norm": 0.5763331530144483, + "learning_rate": 0.00013782593182684705, + "loss": 12.2956, + "step": 14516 + }, + { + "epoch": 0.7905086683955811, + "grad_norm": 0.5775096015189632, + "learning_rate": 0.00013781776862107126, + "loss": 12.259, + "step": 14517 + }, + { + "epoch": 0.7905631223921641, + "grad_norm": 0.5617552123384292, + "learning_rate": 0.00013780960512121818, + "loss": 12.0761, + "step": 14518 + }, + { + "epoch": 0.7906175763887471, + "grad_norm": 0.715470699824914, + "learning_rate": 0.00013780144132735124, + "loss": 12.267, + "step": 14519 + }, + { + "epoch": 0.7906720303853301, + "grad_norm": 0.6163374199941223, + "learning_rate": 0.00013779327723953397, + "loss": 12.1038, + "step": 14520 + }, + { + "epoch": 0.7907264843819131, + "grad_norm": 0.5935665540501485, + "learning_rate": 0.0001377851128578298, + "loss": 12.2294, + "step": 14521 + }, + { + "epoch": 0.7907809383784962, + "grad_norm": 0.5838932995567909, + "learning_rate": 0.0001377769481823023, + "loss": 12.312, + "step": 14522 + }, + { + "epoch": 0.7908353923750792, + "grad_norm": 0.6095538760565489, + "learning_rate": 0.00013776878321301492, + "loss": 12.1626, + "step": 14523 + }, + { + "epoch": 0.7908898463716622, + "grad_norm": 0.5922649772685893, + "learning_rate": 0.00013776061795003113, + "loss": 12.3035, + "step": 14524 + }, + { + "epoch": 0.7909443003682451, + "grad_norm": 0.5515954714545865, + "learning_rate": 0.00013775245239341444, + "loss": 12.315, + "step": 14525 + }, + { + "epoch": 0.7909987543648281, + "grad_norm": 0.6539394474403301, + "learning_rate": 0.00013774428654322836, + "loss": 12.242, + "step": 14526 + }, + { + "epoch": 0.7910532083614111, + "grad_norm": 0.5634951214555385, + "learning_rate": 0.00013773612039953635, + "loss": 12.26, + "step": 14527 + }, + { + "epoch": 0.7911076623579942, + "grad_norm": 0.6319106731830428, + "learning_rate": 0.000137727953962402, + "loss": 12.3335, + "step": 14528 + }, + { + "epoch": 0.7911621163545772, + "grad_norm": 0.5989947043730212, + "learning_rate": 0.00013771978723188869, + "loss": 12.0249, + "step": 14529 + }, + { + "epoch": 0.7912165703511602, + "grad_norm": 0.658527852097458, + "learning_rate": 0.00013771162020806004, + "loss": 12.1369, + "step": 14530 + }, + { + "epoch": 0.7912710243477432, + "grad_norm": 0.6143727627790688, + "learning_rate": 0.00013770345289097945, + "loss": 12.2429, + "step": 14531 + }, + { + "epoch": 0.7913254783443262, + "grad_norm": 0.7332441044745701, + "learning_rate": 0.00013769528528071053, + "loss": 12.2045, + "step": 14532 + }, + { + "epoch": 0.7913799323409092, + "grad_norm": 0.7027518226233722, + "learning_rate": 0.00013768711737731674, + "loss": 12.3577, + "step": 14533 + }, + { + "epoch": 0.7914343863374923, + "grad_norm": 0.6661778673562905, + "learning_rate": 0.0001376789491808616, + "loss": 12.2812, + "step": 14534 + }, + { + "epoch": 0.7914888403340753, + "grad_norm": 0.7305109231819773, + "learning_rate": 0.00013767078069140863, + "loss": 12.3905, + "step": 14535 + }, + { + "epoch": 0.7915432943306583, + "grad_norm": 0.678911449401082, + "learning_rate": 0.00013766261190902138, + "loss": 12.2408, + "step": 14536 + }, + { + "epoch": 0.7915977483272413, + "grad_norm": 0.6530486210619075, + "learning_rate": 0.0001376544428337633, + "loss": 12.1734, + "step": 14537 + }, + { + "epoch": 0.7916522023238243, + "grad_norm": 0.5827242495444377, + "learning_rate": 0.00013764627346569798, + "loss": 12.2727, + "step": 14538 + }, + { + "epoch": 0.7917066563204073, + "grad_norm": 0.542965175998758, + "learning_rate": 0.00013763810380488893, + "loss": 12.2165, + "step": 14539 + }, + { + "epoch": 0.7917611103169904, + "grad_norm": 0.6735407450418386, + "learning_rate": 0.00013762993385139967, + "loss": 12.2434, + "step": 14540 + }, + { + "epoch": 0.7918155643135734, + "grad_norm": 0.5408198038923513, + "learning_rate": 0.0001376217636052937, + "loss": 12.2286, + "step": 14541 + }, + { + "epoch": 0.7918700183101564, + "grad_norm": 0.6335727041039692, + "learning_rate": 0.00013761359306663465, + "loss": 12.3902, + "step": 14542 + }, + { + "epoch": 0.7919244723067393, + "grad_norm": 0.7766853164870627, + "learning_rate": 0.00013760542223548595, + "loss": 12.3514, + "step": 14543 + }, + { + "epoch": 0.7919789263033223, + "grad_norm": 0.5963601706181519, + "learning_rate": 0.00013759725111191118, + "loss": 12.1699, + "step": 14544 + }, + { + "epoch": 0.7920333802999053, + "grad_norm": 0.641213799629075, + "learning_rate": 0.0001375890796959739, + "loss": 12.32, + "step": 14545 + }, + { + "epoch": 0.7920878342964884, + "grad_norm": 0.6972634743512368, + "learning_rate": 0.00013758090798773762, + "loss": 12.3032, + "step": 14546 + }, + { + "epoch": 0.7921422882930714, + "grad_norm": 0.6017069234143751, + "learning_rate": 0.00013757273598726587, + "loss": 12.3563, + "step": 14547 + }, + { + "epoch": 0.7921967422896544, + "grad_norm": 0.6360018090555419, + "learning_rate": 0.00013756456369462227, + "loss": 12.2611, + "step": 14548 + }, + { + "epoch": 0.7922511962862374, + "grad_norm": 0.6503478151736607, + "learning_rate": 0.00013755639110987032, + "loss": 12.3143, + "step": 14549 + }, + { + "epoch": 0.7923056502828204, + "grad_norm": 0.5543365015342491, + "learning_rate": 0.00013754821823307354, + "loss": 12.3949, + "step": 14550 + }, + { + "epoch": 0.7923601042794035, + "grad_norm": 0.5973243221748423, + "learning_rate": 0.00013754004506429554, + "loss": 12.3585, + "step": 14551 + }, + { + "epoch": 0.7924145582759865, + "grad_norm": 0.7117129070923307, + "learning_rate": 0.00013753187160359985, + "loss": 12.2683, + "step": 14552 + }, + { + "epoch": 0.7924690122725695, + "grad_norm": 0.5653042724525108, + "learning_rate": 0.00013752369785105005, + "loss": 12.2011, + "step": 14553 + }, + { + "epoch": 0.7925234662691525, + "grad_norm": 0.6353979891028753, + "learning_rate": 0.00013751552380670968, + "loss": 12.2549, + "step": 14554 + }, + { + "epoch": 0.7925779202657355, + "grad_norm": 0.5861169980674861, + "learning_rate": 0.00013750734947064227, + "loss": 12.2593, + "step": 14555 + }, + { + "epoch": 0.7926323742623185, + "grad_norm": 0.6516937965590943, + "learning_rate": 0.00013749917484291144, + "loss": 12.2449, + "step": 14556 + }, + { + "epoch": 0.7926868282589016, + "grad_norm": 0.6707502137605735, + "learning_rate": 0.00013749099992358077, + "loss": 12.2887, + "step": 14557 + }, + { + "epoch": 0.7927412822554846, + "grad_norm": 0.5770933413979192, + "learning_rate": 0.00013748282471271375, + "loss": 12.2399, + "step": 14558 + }, + { + "epoch": 0.7927957362520676, + "grad_norm": 0.6133565514888829, + "learning_rate": 0.000137474649210374, + "loss": 12.1271, + "step": 14559 + }, + { + "epoch": 0.7928501902486506, + "grad_norm": 0.565282292880067, + "learning_rate": 0.00013746647341662512, + "loss": 12.0827, + "step": 14560 + }, + { + "epoch": 0.7929046442452335, + "grad_norm": 0.630819282440226, + "learning_rate": 0.00013745829733153065, + "loss": 12.2273, + "step": 14561 + }, + { + "epoch": 0.7929590982418165, + "grad_norm": 0.5287941385974477, + "learning_rate": 0.00013745012095515418, + "loss": 12.3124, + "step": 14562 + }, + { + "epoch": 0.7930135522383996, + "grad_norm": 0.6013911595860558, + "learning_rate": 0.00013744194428755933, + "loss": 12.1449, + "step": 14563 + }, + { + "epoch": 0.7930680062349826, + "grad_norm": 0.5908970621387647, + "learning_rate": 0.0001374337673288096, + "loss": 11.9763, + "step": 14564 + }, + { + "epoch": 0.7931224602315656, + "grad_norm": 0.5678961642577054, + "learning_rate": 0.0001374255900789686, + "loss": 12.1764, + "step": 14565 + }, + { + "epoch": 0.7931769142281486, + "grad_norm": 0.5223695709824954, + "learning_rate": 0.00013741741253809996, + "loss": 12.0156, + "step": 14566 + }, + { + "epoch": 0.7932313682247316, + "grad_norm": 0.564503233628171, + "learning_rate": 0.0001374092347062672, + "loss": 12.188, + "step": 14567 + }, + { + "epoch": 0.7932858222213146, + "grad_norm": 0.5416228729888778, + "learning_rate": 0.00013740105658353405, + "loss": 12.1661, + "step": 14568 + }, + { + "epoch": 0.7933402762178977, + "grad_norm": 0.6134468990697433, + "learning_rate": 0.00013739287816996395, + "loss": 12.3041, + "step": 14569 + }, + { + "epoch": 0.7933947302144807, + "grad_norm": 0.5865723569145507, + "learning_rate": 0.00013738469946562054, + "loss": 12.1678, + "step": 14570 + }, + { + "epoch": 0.7934491842110637, + "grad_norm": 0.5917678061680537, + "learning_rate": 0.00013737652047056745, + "loss": 12.1629, + "step": 14571 + }, + { + "epoch": 0.7935036382076467, + "grad_norm": 0.6067411146767691, + "learning_rate": 0.0001373683411848683, + "loss": 12.2797, + "step": 14572 + }, + { + "epoch": 0.7935580922042297, + "grad_norm": 0.5688816523572685, + "learning_rate": 0.00013736016160858667, + "loss": 12.1704, + "step": 14573 + }, + { + "epoch": 0.7936125462008127, + "grad_norm": 0.5348887968152709, + "learning_rate": 0.00013735198174178614, + "loss": 12.1637, + "step": 14574 + }, + { + "epoch": 0.7936670001973958, + "grad_norm": 0.5976971969257272, + "learning_rate": 0.0001373438015845303, + "loss": 12.2787, + "step": 14575 + }, + { + "epoch": 0.7937214541939788, + "grad_norm": 0.6372535098151176, + "learning_rate": 0.00013733562113688283, + "loss": 12.2731, + "step": 14576 + }, + { + "epoch": 0.7937759081905618, + "grad_norm": 0.5856623029227053, + "learning_rate": 0.0001373274403989073, + "loss": 12.1469, + "step": 14577 + }, + { + "epoch": 0.7938303621871448, + "grad_norm": 0.5340829343077999, + "learning_rate": 0.00013731925937066736, + "loss": 12.2518, + "step": 14578 + }, + { + "epoch": 0.7938848161837277, + "grad_norm": 0.6135248305708401, + "learning_rate": 0.0001373110780522266, + "loss": 12.2766, + "step": 14579 + }, + { + "epoch": 0.7939392701803107, + "grad_norm": 0.5980173322439528, + "learning_rate": 0.0001373028964436486, + "loss": 12.1919, + "step": 14580 + }, + { + "epoch": 0.7939937241768938, + "grad_norm": 0.6206658138611297, + "learning_rate": 0.00013729471454499703, + "loss": 12.1507, + "step": 14581 + }, + { + "epoch": 0.7940481781734768, + "grad_norm": 0.5367388619002081, + "learning_rate": 0.0001372865323563355, + "loss": 12.2038, + "step": 14582 + }, + { + "epoch": 0.7941026321700598, + "grad_norm": 0.5240897993979492, + "learning_rate": 0.00013727834987772768, + "loss": 12.2194, + "step": 14583 + }, + { + "epoch": 0.7941570861666428, + "grad_norm": 0.647420197970378, + "learning_rate": 0.0001372701671092371, + "loss": 12.2785, + "step": 14584 + }, + { + "epoch": 0.7942115401632258, + "grad_norm": 0.5684613022836379, + "learning_rate": 0.0001372619840509275, + "loss": 12.2735, + "step": 14585 + }, + { + "epoch": 0.7942659941598089, + "grad_norm": 0.5945683442190239, + "learning_rate": 0.00013725380070286246, + "loss": 12.2594, + "step": 14586 + }, + { + "epoch": 0.7943204481563919, + "grad_norm": 0.6259752328008525, + "learning_rate": 0.0001372456170651056, + "loss": 12.1878, + "step": 14587 + }, + { + "epoch": 0.7943749021529749, + "grad_norm": 0.546480691542995, + "learning_rate": 0.00013723743313772058, + "loss": 12.2561, + "step": 14588 + }, + { + "epoch": 0.7944293561495579, + "grad_norm": 0.5150143310049871, + "learning_rate": 0.00013722924892077106, + "loss": 12.1557, + "step": 14589 + }, + { + "epoch": 0.7944838101461409, + "grad_norm": 0.561310388062198, + "learning_rate": 0.00013722106441432064, + "loss": 12.1778, + "step": 14590 + }, + { + "epoch": 0.7945382641427239, + "grad_norm": 0.606486418543721, + "learning_rate": 0.00013721287961843297, + "loss": 12.2936, + "step": 14591 + }, + { + "epoch": 0.794592718139307, + "grad_norm": 0.5837914612089911, + "learning_rate": 0.00013720469453317173, + "loss": 12.3163, + "step": 14592 + }, + { + "epoch": 0.79464717213589, + "grad_norm": 0.6707863662057902, + "learning_rate": 0.00013719650915860053, + "loss": 12.1247, + "step": 14593 + }, + { + "epoch": 0.794701626132473, + "grad_norm": 0.5614619023691085, + "learning_rate": 0.00013718832349478305, + "loss": 12.3049, + "step": 14594 + }, + { + "epoch": 0.794756080129056, + "grad_norm": 0.5097509032435517, + "learning_rate": 0.0001371801375417829, + "loss": 12.164, + "step": 14595 + }, + { + "epoch": 0.794810534125639, + "grad_norm": 0.5976077493113269, + "learning_rate": 0.00013717195129966378, + "loss": 12.2807, + "step": 14596 + }, + { + "epoch": 0.794864988122222, + "grad_norm": 0.5392292790465633, + "learning_rate": 0.00013716376476848933, + "loss": 12.2907, + "step": 14597 + }, + { + "epoch": 0.794919442118805, + "grad_norm": 0.5623035202767939, + "learning_rate": 0.00013715557794832323, + "loss": 12.2697, + "step": 14598 + }, + { + "epoch": 0.794973896115388, + "grad_norm": 0.5536073700263109, + "learning_rate": 0.00013714739083922912, + "loss": 12.1871, + "step": 14599 + }, + { + "epoch": 0.795028350111971, + "grad_norm": 0.5485839640207039, + "learning_rate": 0.00013713920344127068, + "loss": 12.2897, + "step": 14600 + }, + { + "epoch": 0.795082804108554, + "grad_norm": 0.5211499804336917, + "learning_rate": 0.00013713101575451152, + "loss": 12.1034, + "step": 14601 + }, + { + "epoch": 0.795137258105137, + "grad_norm": 0.5511440782227762, + "learning_rate": 0.00013712282777901538, + "loss": 12.2477, + "step": 14602 + }, + { + "epoch": 0.79519171210172, + "grad_norm": 0.5648455952058448, + "learning_rate": 0.00013711463951484592, + "loss": 12.1965, + "step": 14603 + }, + { + "epoch": 0.7952461660983031, + "grad_norm": 0.6047438965263254, + "learning_rate": 0.0001371064509620668, + "loss": 12.2426, + "step": 14604 + }, + { + "epoch": 0.7953006200948861, + "grad_norm": 0.5210930126052459, + "learning_rate": 0.0001370982621207417, + "loss": 12.2967, + "step": 14605 + }, + { + "epoch": 0.7953550740914691, + "grad_norm": 0.6753107385324894, + "learning_rate": 0.00013709007299093426, + "loss": 12.233, + "step": 14606 + }, + { + "epoch": 0.7954095280880521, + "grad_norm": 0.5903172203022329, + "learning_rate": 0.0001370818835727082, + "loss": 12.1532, + "step": 14607 + }, + { + "epoch": 0.7954639820846351, + "grad_norm": 0.5400132903393716, + "learning_rate": 0.00013707369386612721, + "loss": 12.1621, + "step": 14608 + }, + { + "epoch": 0.7955184360812181, + "grad_norm": 0.5691708873764322, + "learning_rate": 0.00013706550387125493, + "loss": 12.2356, + "step": 14609 + }, + { + "epoch": 0.7955728900778012, + "grad_norm": 0.545545193497752, + "learning_rate": 0.0001370573135881551, + "loss": 12.2555, + "step": 14610 + }, + { + "epoch": 0.7956273440743842, + "grad_norm": 0.5469260876252976, + "learning_rate": 0.0001370491230168914, + "loss": 12.1018, + "step": 14611 + }, + { + "epoch": 0.7956817980709672, + "grad_norm": 0.571494057494282, + "learning_rate": 0.0001370409321575275, + "loss": 12.1387, + "step": 14612 + }, + { + "epoch": 0.7957362520675502, + "grad_norm": 0.5837025395276881, + "learning_rate": 0.00013703274101012708, + "loss": 12.2117, + "step": 14613 + }, + { + "epoch": 0.7957907060641332, + "grad_norm": 0.6528327502031022, + "learning_rate": 0.00013702454957475386, + "loss": 12.2818, + "step": 14614 + }, + { + "epoch": 0.7958451600607163, + "grad_norm": 0.6390175326589221, + "learning_rate": 0.00013701635785147152, + "loss": 12.097, + "step": 14615 + }, + { + "epoch": 0.7958996140572993, + "grad_norm": 0.6133716093343833, + "learning_rate": 0.0001370081658403438, + "loss": 12.2751, + "step": 14616 + }, + { + "epoch": 0.7959540680538822, + "grad_norm": 0.5489687858980747, + "learning_rate": 0.00013699997354143438, + "loss": 12.2439, + "step": 14617 + }, + { + "epoch": 0.7960085220504652, + "grad_norm": 0.6197973559483165, + "learning_rate": 0.00013699178095480697, + "loss": 12.2906, + "step": 14618 + }, + { + "epoch": 0.7960629760470482, + "grad_norm": 0.7095353639430027, + "learning_rate": 0.0001369835880805252, + "loss": 12.1462, + "step": 14619 + }, + { + "epoch": 0.7961174300436312, + "grad_norm": 0.5870978258985311, + "learning_rate": 0.0001369753949186529, + "loss": 12.1074, + "step": 14620 + }, + { + "epoch": 0.7961718840402143, + "grad_norm": 0.6594864495359288, + "learning_rate": 0.00013696720146925373, + "loss": 12.368, + "step": 14621 + }, + { + "epoch": 0.7962263380367973, + "grad_norm": 0.6184783874708606, + "learning_rate": 0.0001369590077323914, + "loss": 12.3256, + "step": 14622 + }, + { + "epoch": 0.7962807920333803, + "grad_norm": 0.5951694789565457, + "learning_rate": 0.00013695081370812963, + "loss": 12.1502, + "step": 14623 + }, + { + "epoch": 0.7963352460299633, + "grad_norm": 0.607457031537557, + "learning_rate": 0.00013694261939653214, + "loss": 12.1754, + "step": 14624 + }, + { + "epoch": 0.7963897000265463, + "grad_norm": 0.5191105385985078, + "learning_rate": 0.0001369344247976626, + "loss": 12.1142, + "step": 14625 + }, + { + "epoch": 0.7964441540231293, + "grad_norm": 0.5516297677297756, + "learning_rate": 0.0001369262299115848, + "loss": 12.2765, + "step": 14626 + }, + { + "epoch": 0.7964986080197124, + "grad_norm": 0.5515463986744881, + "learning_rate": 0.00013691803473836247, + "loss": 12.2683, + "step": 14627 + }, + { + "epoch": 0.7965530620162954, + "grad_norm": 0.5851600829755839, + "learning_rate": 0.00013690983927805932, + "loss": 12.2109, + "step": 14628 + }, + { + "epoch": 0.7966075160128784, + "grad_norm": 0.6260586049833458, + "learning_rate": 0.00013690164353073904, + "loss": 12.29, + "step": 14629 + }, + { + "epoch": 0.7966619700094614, + "grad_norm": 0.5304233546419216, + "learning_rate": 0.0001368934474964654, + "loss": 12.2011, + "step": 14630 + }, + { + "epoch": 0.7967164240060444, + "grad_norm": 0.6180773390482837, + "learning_rate": 0.0001368852511753021, + "loss": 12.2119, + "step": 14631 + }, + { + "epoch": 0.7967708780026274, + "grad_norm": 0.5630688489731834, + "learning_rate": 0.00013687705456731295, + "loss": 12.2902, + "step": 14632 + }, + { + "epoch": 0.7968253319992105, + "grad_norm": 0.6090926735929871, + "learning_rate": 0.00013686885767256164, + "loss": 12.1717, + "step": 14633 + }, + { + "epoch": 0.7968797859957935, + "grad_norm": 0.5872552028357287, + "learning_rate": 0.00013686066049111185, + "loss": 12.269, + "step": 14634 + }, + { + "epoch": 0.7969342399923764, + "grad_norm": 0.5742787047807977, + "learning_rate": 0.00013685246302302744, + "loss": 12.2413, + "step": 14635 + }, + { + "epoch": 0.7969886939889594, + "grad_norm": 0.6247212389621019, + "learning_rate": 0.00013684426526837205, + "loss": 12.2067, + "step": 14636 + }, + { + "epoch": 0.7970431479855424, + "grad_norm": 0.5868775930778958, + "learning_rate": 0.0001368360672272095, + "loss": 12.2478, + "step": 14637 + }, + { + "epoch": 0.7970976019821254, + "grad_norm": 0.5846150678878385, + "learning_rate": 0.00013682786889960354, + "loss": 12.2411, + "step": 14638 + }, + { + "epoch": 0.7971520559787085, + "grad_norm": 0.5843841367905177, + "learning_rate": 0.00013681967028561785, + "loss": 12.2898, + "step": 14639 + }, + { + "epoch": 0.7972065099752915, + "grad_norm": 0.5868475342754406, + "learning_rate": 0.00013681147138531625, + "loss": 12.3007, + "step": 14640 + }, + { + "epoch": 0.7972609639718745, + "grad_norm": 0.583273108266093, + "learning_rate": 0.00013680327219876248, + "loss": 12.315, + "step": 14641 + }, + { + "epoch": 0.7973154179684575, + "grad_norm": 0.600712103579576, + "learning_rate": 0.00013679507272602027, + "loss": 12.2416, + "step": 14642 + }, + { + "epoch": 0.7973698719650405, + "grad_norm": 0.5903813153092471, + "learning_rate": 0.0001367868729671534, + "loss": 12.1484, + "step": 14643 + }, + { + "epoch": 0.7974243259616235, + "grad_norm": 0.6057061535598088, + "learning_rate": 0.00013677867292222567, + "loss": 12.2062, + "step": 14644 + }, + { + "epoch": 0.7974787799582066, + "grad_norm": 0.5382834472760614, + "learning_rate": 0.00013677047259130082, + "loss": 12.1268, + "step": 14645 + }, + { + "epoch": 0.7975332339547896, + "grad_norm": 0.6063616100416661, + "learning_rate": 0.00013676227197444258, + "loss": 12.2374, + "step": 14646 + }, + { + "epoch": 0.7975876879513726, + "grad_norm": 0.5621407926219465, + "learning_rate": 0.00013675407107171473, + "loss": 12.2684, + "step": 14647 + }, + { + "epoch": 0.7976421419479556, + "grad_norm": 0.5651809829118901, + "learning_rate": 0.00013674586988318108, + "loss": 12.3008, + "step": 14648 + }, + { + "epoch": 0.7976965959445386, + "grad_norm": 0.7892357277511804, + "learning_rate": 0.0001367376684089054, + "loss": 12.313, + "step": 14649 + }, + { + "epoch": 0.7977510499411217, + "grad_norm": 0.6506794636249186, + "learning_rate": 0.00013672946664895145, + "loss": 12.4271, + "step": 14650 + }, + { + "epoch": 0.7978055039377047, + "grad_norm": 0.5041891292585133, + "learning_rate": 0.00013672126460338298, + "loss": 12.2156, + "step": 14651 + }, + { + "epoch": 0.7978599579342877, + "grad_norm": 0.7401483260152929, + "learning_rate": 0.00013671306227226385, + "loss": 12.2375, + "step": 14652 + }, + { + "epoch": 0.7979144119308706, + "grad_norm": 0.6222369533899695, + "learning_rate": 0.00013670485965565772, + "loss": 12.276, + "step": 14653 + }, + { + "epoch": 0.7979688659274536, + "grad_norm": 0.6692280187649381, + "learning_rate": 0.00013669665675362848, + "loss": 12.3599, + "step": 14654 + }, + { + "epoch": 0.7980233199240366, + "grad_norm": 0.5884085203012612, + "learning_rate": 0.0001366884535662399, + "loss": 12.4401, + "step": 14655 + }, + { + "epoch": 0.7980777739206197, + "grad_norm": 0.5589894629310427, + "learning_rate": 0.00013668025009355573, + "loss": 12.1091, + "step": 14656 + }, + { + "epoch": 0.7981322279172027, + "grad_norm": 0.5326014670977347, + "learning_rate": 0.0001366720463356398, + "loss": 12.1856, + "step": 14657 + }, + { + "epoch": 0.7981866819137857, + "grad_norm": 0.6594920539581905, + "learning_rate": 0.00013666384229255585, + "loss": 12.336, + "step": 14658 + }, + { + "epoch": 0.7982411359103687, + "grad_norm": 0.5766616140343678, + "learning_rate": 0.00013665563796436776, + "loss": 12.111, + "step": 14659 + }, + { + "epoch": 0.7982955899069517, + "grad_norm": 0.6231565003777191, + "learning_rate": 0.00013664743335113926, + "loss": 12.3686, + "step": 14660 + }, + { + "epoch": 0.7983500439035347, + "grad_norm": 0.6206882350344767, + "learning_rate": 0.00013663922845293417, + "loss": 12.1138, + "step": 14661 + }, + { + "epoch": 0.7984044979001178, + "grad_norm": 0.5864199324503923, + "learning_rate": 0.00013663102326981632, + "loss": 12.2635, + "step": 14662 + }, + { + "epoch": 0.7984589518967008, + "grad_norm": 0.5555180848908318, + "learning_rate": 0.00013662281780184947, + "loss": 12.1993, + "step": 14663 + }, + { + "epoch": 0.7985134058932838, + "grad_norm": 0.5393908024567289, + "learning_rate": 0.00013661461204909746, + "loss": 12.2018, + "step": 14664 + }, + { + "epoch": 0.7985678598898668, + "grad_norm": 0.5869442261183992, + "learning_rate": 0.00013660640601162406, + "loss": 12.3016, + "step": 14665 + }, + { + "epoch": 0.7986223138864498, + "grad_norm": 0.5705150655125303, + "learning_rate": 0.0001365981996894931, + "loss": 12.1733, + "step": 14666 + }, + { + "epoch": 0.7986767678830328, + "grad_norm": 0.5564124217432502, + "learning_rate": 0.00013658999308276845, + "loss": 12.2509, + "step": 14667 + }, + { + "epoch": 0.7987312218796159, + "grad_norm": 0.543176948218349, + "learning_rate": 0.00013658178619151384, + "loss": 12.1485, + "step": 14668 + }, + { + "epoch": 0.7987856758761989, + "grad_norm": 0.5580168309808128, + "learning_rate": 0.00013657357901579315, + "loss": 12.1323, + "step": 14669 + }, + { + "epoch": 0.7988401298727819, + "grad_norm": 0.5844524466135493, + "learning_rate": 0.00013656537155567016, + "loss": 12.1079, + "step": 14670 + }, + { + "epoch": 0.7988945838693648, + "grad_norm": 0.5497642901168213, + "learning_rate": 0.0001365571638112087, + "loss": 12.2781, + "step": 14671 + }, + { + "epoch": 0.7989490378659478, + "grad_norm": 0.5574194895445754, + "learning_rate": 0.00013654895578247262, + "loss": 12.2041, + "step": 14672 + }, + { + "epoch": 0.7990034918625308, + "grad_norm": 0.5767024085857305, + "learning_rate": 0.00013654074746952572, + "loss": 12.2425, + "step": 14673 + }, + { + "epoch": 0.7990579458591139, + "grad_norm": 0.6576026837736271, + "learning_rate": 0.00013653253887243184, + "loss": 12.3398, + "step": 14674 + }, + { + "epoch": 0.7991123998556969, + "grad_norm": 0.6085438305694729, + "learning_rate": 0.00013652432999125484, + "loss": 12.229, + "step": 14675 + }, + { + "epoch": 0.7991668538522799, + "grad_norm": 0.5624586386412984, + "learning_rate": 0.0001365161208260585, + "loss": 12.0903, + "step": 14676 + }, + { + "epoch": 0.7992213078488629, + "grad_norm": 0.5549898217069293, + "learning_rate": 0.00013650791137690668, + "loss": 12.1677, + "step": 14677 + }, + { + "epoch": 0.7992757618454459, + "grad_norm": 0.5676483063775526, + "learning_rate": 0.00013649970164386323, + "loss": 12.2304, + "step": 14678 + }, + { + "epoch": 0.7993302158420289, + "grad_norm": 0.5308701782521362, + "learning_rate": 0.00013649149162699197, + "loss": 12.261, + "step": 14679 + }, + { + "epoch": 0.799384669838612, + "grad_norm": 0.6357462521824179, + "learning_rate": 0.00013648328132635676, + "loss": 12.3132, + "step": 14680 + }, + { + "epoch": 0.799439123835195, + "grad_norm": 0.5580459839955783, + "learning_rate": 0.00013647507074202142, + "loss": 12.2095, + "step": 14681 + }, + { + "epoch": 0.799493577831778, + "grad_norm": 0.5323776944845773, + "learning_rate": 0.0001364668598740498, + "loss": 12.259, + "step": 14682 + }, + { + "epoch": 0.799548031828361, + "grad_norm": 0.5452627919182734, + "learning_rate": 0.0001364586487225058, + "loss": 12.1575, + "step": 14683 + }, + { + "epoch": 0.799602485824944, + "grad_norm": 0.6235269652424776, + "learning_rate": 0.00013645043728745325, + "loss": 12.2087, + "step": 14684 + }, + { + "epoch": 0.7996569398215271, + "grad_norm": 0.6513695487707094, + "learning_rate": 0.00013644222556895592, + "loss": 12.3251, + "step": 14685 + }, + { + "epoch": 0.7997113938181101, + "grad_norm": 0.541478799476608, + "learning_rate": 0.00013643401356707777, + "loss": 12.2063, + "step": 14686 + }, + { + "epoch": 0.7997658478146931, + "grad_norm": 0.6221379479706041, + "learning_rate": 0.00013642580128188264, + "loss": 12.2197, + "step": 14687 + }, + { + "epoch": 0.7998203018112761, + "grad_norm": 0.5977131685243543, + "learning_rate": 0.00013641758871343432, + "loss": 12.2846, + "step": 14688 + }, + { + "epoch": 0.799874755807859, + "grad_norm": 0.5986729314260992, + "learning_rate": 0.00013640937586179678, + "loss": 12.2008, + "step": 14689 + }, + { + "epoch": 0.799929209804442, + "grad_norm": 0.6037694902876016, + "learning_rate": 0.0001364011627270338, + "loss": 12.3244, + "step": 14690 + }, + { + "epoch": 0.7999836638010251, + "grad_norm": 0.5716463867704794, + "learning_rate": 0.00013639294930920925, + "loss": 12.2065, + "step": 14691 + }, + { + "epoch": 0.8000381177976081, + "grad_norm": 0.6101374572745445, + "learning_rate": 0.00013638473560838706, + "loss": 12.2074, + "step": 14692 + }, + { + "epoch": 0.8000925717941911, + "grad_norm": 0.5804774150756093, + "learning_rate": 0.00013637652162463103, + "loss": 12.1971, + "step": 14693 + }, + { + "epoch": 0.8001470257907741, + "grad_norm": 0.5712723197657905, + "learning_rate": 0.0001363683073580051, + "loss": 12.1784, + "step": 14694 + }, + { + "epoch": 0.8002014797873571, + "grad_norm": 0.5736267925533555, + "learning_rate": 0.0001363600928085731, + "loss": 12.1645, + "step": 14695 + }, + { + "epoch": 0.8002559337839401, + "grad_norm": 0.5704183696042504, + "learning_rate": 0.0001363518779763989, + "loss": 12.2962, + "step": 14696 + }, + { + "epoch": 0.8003103877805232, + "grad_norm": 0.5422945074682746, + "learning_rate": 0.00013634366286154642, + "loss": 12.2148, + "step": 14697 + }, + { + "epoch": 0.8003648417771062, + "grad_norm": 0.538070683148117, + "learning_rate": 0.00013633544746407953, + "loss": 12.2103, + "step": 14698 + }, + { + "epoch": 0.8004192957736892, + "grad_norm": 0.5463898898573379, + "learning_rate": 0.00013632723178406213, + "loss": 12.1015, + "step": 14699 + }, + { + "epoch": 0.8004737497702722, + "grad_norm": 0.6780580468037041, + "learning_rate": 0.00013631901582155807, + "loss": 12.2762, + "step": 14700 + }, + { + "epoch": 0.8005282037668552, + "grad_norm": 0.5277520574048612, + "learning_rate": 0.00013631079957663122, + "loss": 12.2817, + "step": 14701 + }, + { + "epoch": 0.8005826577634382, + "grad_norm": 0.5393645235090414, + "learning_rate": 0.00013630258304934552, + "loss": 12.146, + "step": 14702 + }, + { + "epoch": 0.8006371117600213, + "grad_norm": 0.5997930595758416, + "learning_rate": 0.00013629436623976483, + "loss": 12.2559, + "step": 14703 + }, + { + "epoch": 0.8006915657566043, + "grad_norm": 0.5373163929300773, + "learning_rate": 0.0001362861491479531, + "loss": 12.1077, + "step": 14704 + }, + { + "epoch": 0.8007460197531873, + "grad_norm": 0.5806005994554888, + "learning_rate": 0.00013627793177397416, + "loss": 12.3336, + "step": 14705 + }, + { + "epoch": 0.8008004737497703, + "grad_norm": 0.6518520429407361, + "learning_rate": 0.00013626971411789197, + "loss": 12.2488, + "step": 14706 + }, + { + "epoch": 0.8008549277463533, + "grad_norm": 0.5552547751323605, + "learning_rate": 0.00013626149617977035, + "loss": 12.2116, + "step": 14707 + }, + { + "epoch": 0.8009093817429362, + "grad_norm": 0.551120271148661, + "learning_rate": 0.00013625327795967326, + "loss": 12.2319, + "step": 14708 + }, + { + "epoch": 0.8009638357395193, + "grad_norm": 0.5855379834522427, + "learning_rate": 0.00013624505945766466, + "loss": 12.1803, + "step": 14709 + }, + { + "epoch": 0.8010182897361023, + "grad_norm": 0.5753752303480557, + "learning_rate": 0.00013623684067380835, + "loss": 12.2338, + "step": 14710 + }, + { + "epoch": 0.8010727437326853, + "grad_norm": 0.5330703433849421, + "learning_rate": 0.0001362286216081683, + "loss": 12.3258, + "step": 14711 + }, + { + "epoch": 0.8011271977292683, + "grad_norm": 0.5619976654850436, + "learning_rate": 0.00013622040226080842, + "loss": 12.2137, + "step": 14712 + }, + { + "epoch": 0.8011816517258513, + "grad_norm": 0.600438451497241, + "learning_rate": 0.00013621218263179259, + "loss": 12.1938, + "step": 14713 + }, + { + "epoch": 0.8012361057224343, + "grad_norm": 0.6901686333060042, + "learning_rate": 0.0001362039627211848, + "loss": 12.058, + "step": 14714 + }, + { + "epoch": 0.8012905597190174, + "grad_norm": 0.5017285782650442, + "learning_rate": 0.0001361957425290489, + "loss": 12.2392, + "step": 14715 + }, + { + "epoch": 0.8013450137156004, + "grad_norm": 0.5516965967674011, + "learning_rate": 0.00013618752205544885, + "loss": 12.138, + "step": 14716 + }, + { + "epoch": 0.8013994677121834, + "grad_norm": 0.6359579853789539, + "learning_rate": 0.00013617930130044854, + "loss": 12.2751, + "step": 14717 + }, + { + "epoch": 0.8014539217087664, + "grad_norm": 0.6236073792954583, + "learning_rate": 0.0001361710802641119, + "loss": 12.131, + "step": 14718 + }, + { + "epoch": 0.8015083757053494, + "grad_norm": 0.5946363293936324, + "learning_rate": 0.0001361628589465029, + "loss": 12.2467, + "step": 14719 + }, + { + "epoch": 0.8015628297019325, + "grad_norm": 0.5702754809065214, + "learning_rate": 0.00013615463734768546, + "loss": 12.2673, + "step": 14720 + }, + { + "epoch": 0.8016172836985155, + "grad_norm": 0.5655799635243692, + "learning_rate": 0.00013614641546772348, + "loss": 12.1912, + "step": 14721 + }, + { + "epoch": 0.8016717376950985, + "grad_norm": 0.5317017544142133, + "learning_rate": 0.0001361381933066809, + "loss": 12.2049, + "step": 14722 + }, + { + "epoch": 0.8017261916916815, + "grad_norm": 0.6350850966842021, + "learning_rate": 0.00013612997086462169, + "loss": 12.302, + "step": 14723 + }, + { + "epoch": 0.8017806456882645, + "grad_norm": 0.6144874801838794, + "learning_rate": 0.00013612174814160976, + "loss": 12.2868, + "step": 14724 + }, + { + "epoch": 0.8018350996848475, + "grad_norm": 0.5321833507643821, + "learning_rate": 0.00013611352513770905, + "loss": 12.0525, + "step": 14725 + }, + { + "epoch": 0.8018895536814306, + "grad_norm": 0.5718843327099883, + "learning_rate": 0.00013610530185298353, + "loss": 12.1852, + "step": 14726 + }, + { + "epoch": 0.8019440076780135, + "grad_norm": 0.6070868198744772, + "learning_rate": 0.0001360970782874971, + "loss": 12.1785, + "step": 14727 + }, + { + "epoch": 0.8019984616745965, + "grad_norm": 0.5306665602802828, + "learning_rate": 0.00013608885444131374, + "loss": 12.3216, + "step": 14728 + }, + { + "epoch": 0.8020529156711795, + "grad_norm": 0.6288825474118335, + "learning_rate": 0.0001360806303144974, + "loss": 12.3004, + "step": 14729 + }, + { + "epoch": 0.8021073696677625, + "grad_norm": 0.5613030264008502, + "learning_rate": 0.00013607240590711206, + "loss": 12.2871, + "step": 14730 + }, + { + "epoch": 0.8021618236643455, + "grad_norm": 0.5322723276921058, + "learning_rate": 0.0001360641812192216, + "loss": 12.1806, + "step": 14731 + }, + { + "epoch": 0.8022162776609286, + "grad_norm": 0.584965786831096, + "learning_rate": 0.00013605595625089005, + "loss": 12.1354, + "step": 14732 + }, + { + "epoch": 0.8022707316575116, + "grad_norm": 0.561919891996372, + "learning_rate": 0.00013604773100218132, + "loss": 12.2726, + "step": 14733 + }, + { + "epoch": 0.8023251856540946, + "grad_norm": 0.697282815185284, + "learning_rate": 0.0001360395054731594, + "loss": 12.3293, + "step": 14734 + }, + { + "epoch": 0.8023796396506776, + "grad_norm": 0.5649339931331062, + "learning_rate": 0.0001360312796638882, + "loss": 12.0921, + "step": 14735 + }, + { + "epoch": 0.8024340936472606, + "grad_norm": 0.5525623322911426, + "learning_rate": 0.0001360230535744318, + "loss": 12.2729, + "step": 14736 + }, + { + "epoch": 0.8024885476438436, + "grad_norm": 0.5735865225673554, + "learning_rate": 0.00013601482720485404, + "loss": 12.06, + "step": 14737 + }, + { + "epoch": 0.8025430016404267, + "grad_norm": 0.6089480738355258, + "learning_rate": 0.00013600660055521896, + "loss": 12.2962, + "step": 14738 + }, + { + "epoch": 0.8025974556370097, + "grad_norm": 0.6060095300813723, + "learning_rate": 0.00013599837362559053, + "loss": 12.1462, + "step": 14739 + }, + { + "epoch": 0.8026519096335927, + "grad_norm": 0.5992632558396295, + "learning_rate": 0.0001359901464160327, + "loss": 12.3345, + "step": 14740 + }, + { + "epoch": 0.8027063636301757, + "grad_norm": 0.646098819998647, + "learning_rate": 0.00013598191892660942, + "loss": 12.2307, + "step": 14741 + }, + { + "epoch": 0.8027608176267587, + "grad_norm": 0.590717453674496, + "learning_rate": 0.00013597369115738475, + "loss": 12.2248, + "step": 14742 + }, + { + "epoch": 0.8028152716233417, + "grad_norm": 0.5790171924040252, + "learning_rate": 0.00013596546310842259, + "loss": 12.3083, + "step": 14743 + }, + { + "epoch": 0.8028697256199248, + "grad_norm": 0.5690108975248733, + "learning_rate": 0.000135957234779787, + "loss": 12.1001, + "step": 14744 + }, + { + "epoch": 0.8029241796165077, + "grad_norm": 0.6598419449912617, + "learning_rate": 0.00013594900617154188, + "loss": 12.2677, + "step": 14745 + }, + { + "epoch": 0.8029786336130907, + "grad_norm": 0.6457502835703006, + "learning_rate": 0.00013594077728375128, + "loss": 12.2013, + "step": 14746 + }, + { + "epoch": 0.8030330876096737, + "grad_norm": 0.5676104527087016, + "learning_rate": 0.00013593254811647916, + "loss": 12.2458, + "step": 14747 + }, + { + "epoch": 0.8030875416062567, + "grad_norm": 0.5815299697852133, + "learning_rate": 0.00013592431866978955, + "loss": 12.0223, + "step": 14748 + }, + { + "epoch": 0.8031419956028398, + "grad_norm": 0.635907774790155, + "learning_rate": 0.00013591608894374642, + "loss": 12.1705, + "step": 14749 + }, + { + "epoch": 0.8031964495994228, + "grad_norm": 0.632519027187312, + "learning_rate": 0.00013590785893841372, + "loss": 12.2368, + "step": 14750 + }, + { + "epoch": 0.8032509035960058, + "grad_norm": 0.5971202957757051, + "learning_rate": 0.00013589962865385546, + "loss": 12.2808, + "step": 14751 + }, + { + "epoch": 0.8033053575925888, + "grad_norm": 0.6267980859454244, + "learning_rate": 0.00013589139809013572, + "loss": 12.3181, + "step": 14752 + }, + { + "epoch": 0.8033598115891718, + "grad_norm": 0.5696313738145158, + "learning_rate": 0.00013588316724731842, + "loss": 12.1311, + "step": 14753 + }, + { + "epoch": 0.8034142655857548, + "grad_norm": 0.5387721506325479, + "learning_rate": 0.00013587493612546764, + "loss": 12.2173, + "step": 14754 + }, + { + "epoch": 0.8034687195823379, + "grad_norm": 0.5487563950306569, + "learning_rate": 0.00013586670472464732, + "loss": 12.2189, + "step": 14755 + }, + { + "epoch": 0.8035231735789209, + "grad_norm": 0.6023344704593654, + "learning_rate": 0.00013585847304492144, + "loss": 12.2929, + "step": 14756 + }, + { + "epoch": 0.8035776275755039, + "grad_norm": 0.68285705582827, + "learning_rate": 0.00013585024108635408, + "loss": 12.2312, + "step": 14757 + }, + { + "epoch": 0.8036320815720869, + "grad_norm": 0.5828683432730125, + "learning_rate": 0.00013584200884900926, + "loss": 12.166, + "step": 14758 + }, + { + "epoch": 0.8036865355686699, + "grad_norm": 0.598886249716344, + "learning_rate": 0.00013583377633295097, + "loss": 12.272, + "step": 14759 + }, + { + "epoch": 0.8037409895652529, + "grad_norm": 0.574350861635556, + "learning_rate": 0.00013582554353824323, + "loss": 12.2154, + "step": 14760 + }, + { + "epoch": 0.803795443561836, + "grad_norm": 0.531639449220211, + "learning_rate": 0.00013581731046495004, + "loss": 12.0654, + "step": 14761 + }, + { + "epoch": 0.803849897558419, + "grad_norm": 0.5760371469392463, + "learning_rate": 0.00013580907711313543, + "loss": 12.1291, + "step": 14762 + }, + { + "epoch": 0.803904351555002, + "grad_norm": 0.6355944627676456, + "learning_rate": 0.00013580084348286344, + "loss": 12.2196, + "step": 14763 + }, + { + "epoch": 0.8039588055515849, + "grad_norm": 0.6236060806198317, + "learning_rate": 0.00013579260957419812, + "loss": 12.1445, + "step": 14764 + }, + { + "epoch": 0.8040132595481679, + "grad_norm": 0.5654738248795809, + "learning_rate": 0.0001357843753872034, + "loss": 12.3111, + "step": 14765 + }, + { + "epoch": 0.8040677135447509, + "grad_norm": 0.6123689053793384, + "learning_rate": 0.0001357761409219434, + "loss": 12.2449, + "step": 14766 + }, + { + "epoch": 0.804122167541334, + "grad_norm": 0.6004670492334654, + "learning_rate": 0.00013576790617848215, + "loss": 12.2578, + "step": 14767 + }, + { + "epoch": 0.804176621537917, + "grad_norm": 0.5645618267114271, + "learning_rate": 0.00013575967115688365, + "loss": 12.2648, + "step": 14768 + }, + { + "epoch": 0.8042310755345, + "grad_norm": 0.5500008223839482, + "learning_rate": 0.00013575143585721196, + "loss": 12.2283, + "step": 14769 + }, + { + "epoch": 0.804285529531083, + "grad_norm": 0.6051696895269583, + "learning_rate": 0.0001357432002795311, + "loss": 12.3082, + "step": 14770 + }, + { + "epoch": 0.804339983527666, + "grad_norm": 0.5952390426204499, + "learning_rate": 0.00013573496442390511, + "loss": 12.1966, + "step": 14771 + }, + { + "epoch": 0.804394437524249, + "grad_norm": 0.6088605323849096, + "learning_rate": 0.00013572672829039806, + "loss": 12.3442, + "step": 14772 + }, + { + "epoch": 0.8044488915208321, + "grad_norm": 0.5993152028924302, + "learning_rate": 0.00013571849187907396, + "loss": 12.1041, + "step": 14773 + }, + { + "epoch": 0.8045033455174151, + "grad_norm": 0.5218338456393441, + "learning_rate": 0.0001357102551899969, + "loss": 12.1031, + "step": 14774 + }, + { + "epoch": 0.8045577995139981, + "grad_norm": 0.609926418028468, + "learning_rate": 0.0001357020182232309, + "loss": 12.3269, + "step": 14775 + }, + { + "epoch": 0.8046122535105811, + "grad_norm": 0.6470069008285002, + "learning_rate": 0.00013569378097884, + "loss": 12.2054, + "step": 14776 + }, + { + "epoch": 0.8046667075071641, + "grad_norm": 0.5562287041283757, + "learning_rate": 0.0001356855434568883, + "loss": 12.171, + "step": 14777 + }, + { + "epoch": 0.8047211615037471, + "grad_norm": 0.5383874909013538, + "learning_rate": 0.00013567730565743982, + "loss": 12.2667, + "step": 14778 + }, + { + "epoch": 0.8047756155003302, + "grad_norm": 0.5781141534823001, + "learning_rate": 0.00013566906758055863, + "loss": 12.0728, + "step": 14779 + }, + { + "epoch": 0.8048300694969132, + "grad_norm": 0.5767436399068844, + "learning_rate": 0.00013566082922630878, + "loss": 12.181, + "step": 14780 + }, + { + "epoch": 0.8048845234934962, + "grad_norm": 0.6601181379244292, + "learning_rate": 0.00013565259059475436, + "loss": 12.1892, + "step": 14781 + }, + { + "epoch": 0.8049389774900791, + "grad_norm": 0.5180753674094193, + "learning_rate": 0.00013564435168595938, + "loss": 12.2281, + "step": 14782 + }, + { + "epoch": 0.8049934314866621, + "grad_norm": 0.5461455256382294, + "learning_rate": 0.000135636112499988, + "loss": 12.1856, + "step": 14783 + }, + { + "epoch": 0.8050478854832452, + "grad_norm": 0.5651300856132475, + "learning_rate": 0.0001356278730369042, + "loss": 12.1808, + "step": 14784 + }, + { + "epoch": 0.8051023394798282, + "grad_norm": 0.5762346524051141, + "learning_rate": 0.00013561963329677208, + "loss": 12.0586, + "step": 14785 + }, + { + "epoch": 0.8051567934764112, + "grad_norm": 0.5566516236897829, + "learning_rate": 0.0001356113932796557, + "loss": 12.2405, + "step": 14786 + }, + { + "epoch": 0.8052112474729942, + "grad_norm": 0.5272763168734077, + "learning_rate": 0.0001356031529856192, + "loss": 12.2018, + "step": 14787 + }, + { + "epoch": 0.8052657014695772, + "grad_norm": 0.5277452420280747, + "learning_rate": 0.00013559491241472657, + "loss": 12.1506, + "step": 14788 + }, + { + "epoch": 0.8053201554661602, + "grad_norm": 0.5549414438819789, + "learning_rate": 0.00013558667156704195, + "loss": 12.2929, + "step": 14789 + }, + { + "epoch": 0.8053746094627433, + "grad_norm": 0.573943349349434, + "learning_rate": 0.00013557843044262942, + "loss": 12.2114, + "step": 14790 + }, + { + "epoch": 0.8054290634593263, + "grad_norm": 0.631727434172093, + "learning_rate": 0.000135570189041553, + "loss": 12.0948, + "step": 14791 + }, + { + "epoch": 0.8054835174559093, + "grad_norm": 0.5886082743780373, + "learning_rate": 0.00013556194736387688, + "loss": 12.2512, + "step": 14792 + }, + { + "epoch": 0.8055379714524923, + "grad_norm": 0.5156967125767348, + "learning_rate": 0.00013555370540966507, + "loss": 12.1234, + "step": 14793 + }, + { + "epoch": 0.8055924254490753, + "grad_norm": 0.5906966524320194, + "learning_rate": 0.00013554546317898168, + "loss": 12.1704, + "step": 14794 + }, + { + "epoch": 0.8056468794456583, + "grad_norm": 0.6407555076229271, + "learning_rate": 0.00013553722067189084, + "loss": 12.1559, + "step": 14795 + }, + { + "epoch": 0.8057013334422414, + "grad_norm": 0.7499130923304811, + "learning_rate": 0.00013552897788845656, + "loss": 12.1958, + "step": 14796 + }, + { + "epoch": 0.8057557874388244, + "grad_norm": 0.5890173368065736, + "learning_rate": 0.00013552073482874302, + "loss": 12.1924, + "step": 14797 + }, + { + "epoch": 0.8058102414354074, + "grad_norm": 0.5424754999109495, + "learning_rate": 0.0001355124914928143, + "loss": 12.1568, + "step": 14798 + }, + { + "epoch": 0.8058646954319904, + "grad_norm": 0.5570372008104348, + "learning_rate": 0.00013550424788073446, + "loss": 12.1763, + "step": 14799 + }, + { + "epoch": 0.8059191494285733, + "grad_norm": 0.6144758166704617, + "learning_rate": 0.00013549600399256762, + "loss": 12.2377, + "step": 14800 + }, + { + "epoch": 0.8059736034251563, + "grad_norm": 0.5475933792469335, + "learning_rate": 0.00013548775982837795, + "loss": 12.1686, + "step": 14801 + }, + { + "epoch": 0.8060280574217394, + "grad_norm": 0.532083181554464, + "learning_rate": 0.0001354795153882295, + "loss": 12.1992, + "step": 14802 + }, + { + "epoch": 0.8060825114183224, + "grad_norm": 0.569394156356611, + "learning_rate": 0.00013547127067218637, + "loss": 12.2908, + "step": 14803 + }, + { + "epoch": 0.8061369654149054, + "grad_norm": 0.5876466479875222, + "learning_rate": 0.0001354630256803127, + "loss": 12.376, + "step": 14804 + }, + { + "epoch": 0.8061914194114884, + "grad_norm": 0.6446041649581469, + "learning_rate": 0.00013545478041267258, + "loss": 12.3708, + "step": 14805 + }, + { + "epoch": 0.8062458734080714, + "grad_norm": 0.5745533624203241, + "learning_rate": 0.00013544653486933017, + "loss": 12.2191, + "step": 14806 + }, + { + "epoch": 0.8063003274046544, + "grad_norm": 0.5876921213626856, + "learning_rate": 0.00013543828905034953, + "loss": 12.0911, + "step": 14807 + }, + { + "epoch": 0.8063547814012375, + "grad_norm": 0.61122104079932, + "learning_rate": 0.00013543004295579481, + "loss": 12.3174, + "step": 14808 + }, + { + "epoch": 0.8064092353978205, + "grad_norm": 0.5693585541111951, + "learning_rate": 0.00013542179658573018, + "loss": 12.3254, + "step": 14809 + }, + { + "epoch": 0.8064636893944035, + "grad_norm": 0.6509018058521152, + "learning_rate": 0.00013541354994021972, + "loss": 12.1927, + "step": 14810 + }, + { + "epoch": 0.8065181433909865, + "grad_norm": 0.524792413116897, + "learning_rate": 0.0001354053030193275, + "loss": 12.1193, + "step": 14811 + }, + { + "epoch": 0.8065725973875695, + "grad_norm": 0.6622307559286874, + "learning_rate": 0.0001353970558231177, + "loss": 12.1634, + "step": 14812 + }, + { + "epoch": 0.8066270513841525, + "grad_norm": 0.6240020101914101, + "learning_rate": 0.00013538880835165453, + "loss": 12.1128, + "step": 14813 + }, + { + "epoch": 0.8066815053807356, + "grad_norm": 0.6213612845211237, + "learning_rate": 0.000135380560605002, + "loss": 12.2167, + "step": 14814 + }, + { + "epoch": 0.8067359593773186, + "grad_norm": 0.5891375759818152, + "learning_rate": 0.00013537231258322434, + "loss": 12.2565, + "step": 14815 + }, + { + "epoch": 0.8067904133739016, + "grad_norm": 0.6213675246525658, + "learning_rate": 0.00013536406428638558, + "loss": 12.1733, + "step": 14816 + }, + { + "epoch": 0.8068448673704846, + "grad_norm": 0.5792737305087725, + "learning_rate": 0.00013535581571454995, + "loss": 12.2326, + "step": 14817 + }, + { + "epoch": 0.8068993213670675, + "grad_norm": 0.5503879796337302, + "learning_rate": 0.00013534756686778157, + "loss": 12.1473, + "step": 14818 + }, + { + "epoch": 0.8069537753636506, + "grad_norm": 0.5875674071397632, + "learning_rate": 0.0001353393177461446, + "loss": 12.3264, + "step": 14819 + }, + { + "epoch": 0.8070082293602336, + "grad_norm": 0.6658764117400607, + "learning_rate": 0.00013533106834970319, + "loss": 12.3356, + "step": 14820 + }, + { + "epoch": 0.8070626833568166, + "grad_norm": 0.5598078870331447, + "learning_rate": 0.00013532281867852144, + "loss": 12.056, + "step": 14821 + }, + { + "epoch": 0.8071171373533996, + "grad_norm": 0.5427067549451772, + "learning_rate": 0.00013531456873266352, + "loss": 12.2513, + "step": 14822 + }, + { + "epoch": 0.8071715913499826, + "grad_norm": 0.6039181316269996, + "learning_rate": 0.00013530631851219358, + "loss": 12.104, + "step": 14823 + }, + { + "epoch": 0.8072260453465656, + "grad_norm": 0.5514512877987486, + "learning_rate": 0.00013529806801717583, + "loss": 12.0721, + "step": 14824 + }, + { + "epoch": 0.8072804993431487, + "grad_norm": 0.5026969335243999, + "learning_rate": 0.00013528981724767434, + "loss": 12.1453, + "step": 14825 + }, + { + "epoch": 0.8073349533397317, + "grad_norm": 0.6399354209506974, + "learning_rate": 0.00013528156620375335, + "loss": 12.3137, + "step": 14826 + }, + { + "epoch": 0.8073894073363147, + "grad_norm": 0.5228089435675359, + "learning_rate": 0.00013527331488547698, + "loss": 12.1478, + "step": 14827 + }, + { + "epoch": 0.8074438613328977, + "grad_norm": 0.5234410976027418, + "learning_rate": 0.00013526506329290933, + "loss": 12.2074, + "step": 14828 + }, + { + "epoch": 0.8074983153294807, + "grad_norm": 0.529908021814124, + "learning_rate": 0.00013525681142611472, + "loss": 12.1225, + "step": 14829 + }, + { + "epoch": 0.8075527693260637, + "grad_norm": 0.544371633644651, + "learning_rate": 0.00013524855928515717, + "loss": 12.0765, + "step": 14830 + }, + { + "epoch": 0.8076072233226468, + "grad_norm": 0.5615786923535416, + "learning_rate": 0.00013524030687010096, + "loss": 12.2037, + "step": 14831 + }, + { + "epoch": 0.8076616773192298, + "grad_norm": 0.5504412742251322, + "learning_rate": 0.0001352320541810102, + "loss": 12.1708, + "step": 14832 + }, + { + "epoch": 0.8077161313158128, + "grad_norm": 0.7066085094534744, + "learning_rate": 0.00013522380121794907, + "loss": 12.1918, + "step": 14833 + }, + { + "epoch": 0.8077705853123958, + "grad_norm": 0.6199773081342997, + "learning_rate": 0.00013521554798098172, + "loss": 12.276, + "step": 14834 + }, + { + "epoch": 0.8078250393089788, + "grad_norm": 0.5580291655636943, + "learning_rate": 0.00013520729447017243, + "loss": 12.3218, + "step": 14835 + }, + { + "epoch": 0.8078794933055617, + "grad_norm": 0.5219495895617315, + "learning_rate": 0.0001351990406855853, + "loss": 12.1418, + "step": 14836 + }, + { + "epoch": 0.8079339473021449, + "grad_norm": 0.6113277869483955, + "learning_rate": 0.00013519078662728448, + "loss": 12.1674, + "step": 14837 + }, + { + "epoch": 0.8079884012987278, + "grad_norm": 0.5109110687457398, + "learning_rate": 0.00013518253229533424, + "loss": 12.3098, + "step": 14838 + }, + { + "epoch": 0.8080428552953108, + "grad_norm": 0.557004942818786, + "learning_rate": 0.0001351742776897987, + "loss": 12.1824, + "step": 14839 + }, + { + "epoch": 0.8080973092918938, + "grad_norm": 0.6010657488392056, + "learning_rate": 0.00013516602281074213, + "loss": 12.2227, + "step": 14840 + }, + { + "epoch": 0.8081517632884768, + "grad_norm": 0.6358189902556056, + "learning_rate": 0.00013515776765822863, + "loss": 12.277, + "step": 14841 + }, + { + "epoch": 0.8082062172850598, + "grad_norm": 0.5751197052084914, + "learning_rate": 0.00013514951223232244, + "loss": 12.2324, + "step": 14842 + }, + { + "epoch": 0.8082606712816429, + "grad_norm": 0.5700495119696227, + "learning_rate": 0.00013514125653308777, + "loss": 12.2499, + "step": 14843 + }, + { + "epoch": 0.8083151252782259, + "grad_norm": 0.5918496963055511, + "learning_rate": 0.00013513300056058877, + "loss": 12.2725, + "step": 14844 + }, + { + "epoch": 0.8083695792748089, + "grad_norm": 0.5626414254597144, + "learning_rate": 0.00013512474431488967, + "loss": 12.2727, + "step": 14845 + }, + { + "epoch": 0.8084240332713919, + "grad_norm": 0.5642979068634242, + "learning_rate": 0.00013511648779605465, + "loss": 12.156, + "step": 14846 + }, + { + "epoch": 0.8084784872679749, + "grad_norm": 0.5493409572701059, + "learning_rate": 0.00013510823100414796, + "loss": 12.2125, + "step": 14847 + }, + { + "epoch": 0.8085329412645579, + "grad_norm": 0.5689975122058666, + "learning_rate": 0.00013509997393923377, + "loss": 12.2015, + "step": 14848 + }, + { + "epoch": 0.808587395261141, + "grad_norm": 0.49182099649915784, + "learning_rate": 0.0001350917166013763, + "loss": 12.1185, + "step": 14849 + }, + { + "epoch": 0.808641849257724, + "grad_norm": 0.5247699829593893, + "learning_rate": 0.00013508345899063975, + "loss": 12.1663, + "step": 14850 + }, + { + "epoch": 0.808696303254307, + "grad_norm": 0.5642604245222994, + "learning_rate": 0.00013507520110708833, + "loss": 12.1613, + "step": 14851 + }, + { + "epoch": 0.80875075725089, + "grad_norm": 0.5587763435778694, + "learning_rate": 0.00013506694295078628, + "loss": 12.2153, + "step": 14852 + }, + { + "epoch": 0.808805211247473, + "grad_norm": 0.5574095281181681, + "learning_rate": 0.0001350586845217978, + "loss": 12.267, + "step": 14853 + }, + { + "epoch": 0.8088596652440561, + "grad_norm": 0.5716751867881421, + "learning_rate": 0.0001350504258201871, + "loss": 12.1996, + "step": 14854 + }, + { + "epoch": 0.808914119240639, + "grad_norm": 0.5550756421929837, + "learning_rate": 0.00013504216684601843, + "loss": 12.1243, + "step": 14855 + }, + { + "epoch": 0.808968573237222, + "grad_norm": 0.5443095253722837, + "learning_rate": 0.00013503390759935597, + "loss": 11.9836, + "step": 14856 + }, + { + "epoch": 0.809023027233805, + "grad_norm": 0.6122039032601948, + "learning_rate": 0.00013502564808026398, + "loss": 12.2784, + "step": 14857 + }, + { + "epoch": 0.809077481230388, + "grad_norm": 0.5787822118596989, + "learning_rate": 0.00013501738828880668, + "loss": 12.086, + "step": 14858 + }, + { + "epoch": 0.809131935226971, + "grad_norm": 0.6197521341959868, + "learning_rate": 0.0001350091282250483, + "loss": 12.3282, + "step": 14859 + }, + { + "epoch": 0.8091863892235541, + "grad_norm": 0.5613037311489635, + "learning_rate": 0.00013500086788905305, + "loss": 12.1889, + "step": 14860 + }, + { + "epoch": 0.8092408432201371, + "grad_norm": 0.5834880990877368, + "learning_rate": 0.00013499260728088518, + "loss": 12.2518, + "step": 14861 + }, + { + "epoch": 0.8092952972167201, + "grad_norm": 0.6046503559268155, + "learning_rate": 0.00013498434640060896, + "loss": 12.1995, + "step": 14862 + }, + { + "epoch": 0.8093497512133031, + "grad_norm": 0.5478187529728412, + "learning_rate": 0.00013497608524828857, + "loss": 12.2138, + "step": 14863 + }, + { + "epoch": 0.8094042052098861, + "grad_norm": 0.5791711463562509, + "learning_rate": 0.0001349678238239883, + "loss": 12.0796, + "step": 14864 + }, + { + "epoch": 0.8094586592064691, + "grad_norm": 0.5683328909188236, + "learning_rate": 0.00013495956212777237, + "loss": 12.1985, + "step": 14865 + }, + { + "epoch": 0.8095131132030522, + "grad_norm": 0.5371246737714352, + "learning_rate": 0.00013495130015970497, + "loss": 12.0929, + "step": 14866 + }, + { + "epoch": 0.8095675671996352, + "grad_norm": 0.5835081530600631, + "learning_rate": 0.00013494303791985045, + "loss": 12.1738, + "step": 14867 + }, + { + "epoch": 0.8096220211962182, + "grad_norm": 0.5857274196314827, + "learning_rate": 0.00013493477540827298, + "loss": 12.2489, + "step": 14868 + }, + { + "epoch": 0.8096764751928012, + "grad_norm": 0.5554600483101491, + "learning_rate": 0.00013492651262503685, + "loss": 12.1106, + "step": 14869 + }, + { + "epoch": 0.8097309291893842, + "grad_norm": 0.6650260925920747, + "learning_rate": 0.00013491824957020628, + "loss": 12.4283, + "step": 14870 + }, + { + "epoch": 0.8097853831859672, + "grad_norm": 0.5837912278619439, + "learning_rate": 0.00013490998624384558, + "loss": 12.1607, + "step": 14871 + }, + { + "epoch": 0.8098398371825503, + "grad_norm": 0.5421872387535078, + "learning_rate": 0.0001349017226460189, + "loss": 12.2336, + "step": 14872 + }, + { + "epoch": 0.8098942911791333, + "grad_norm": 0.7267994171336976, + "learning_rate": 0.00013489345877679067, + "loss": 12.2144, + "step": 14873 + }, + { + "epoch": 0.8099487451757162, + "grad_norm": 0.544225159839763, + "learning_rate": 0.000134885194636225, + "loss": 12.3555, + "step": 14874 + }, + { + "epoch": 0.8100031991722992, + "grad_norm": 0.6004138775591223, + "learning_rate": 0.00013487693022438624, + "loss": 12.2701, + "step": 14875 + }, + { + "epoch": 0.8100576531688822, + "grad_norm": 0.6246154775919707, + "learning_rate": 0.0001348686655413386, + "loss": 12.2617, + "step": 14876 + }, + { + "epoch": 0.8101121071654652, + "grad_norm": 0.5404264875377471, + "learning_rate": 0.00013486040058714632, + "loss": 12.2787, + "step": 14877 + }, + { + "epoch": 0.8101665611620483, + "grad_norm": 0.550133321004403, + "learning_rate": 0.00013485213536187378, + "loss": 12.2487, + "step": 14878 + }, + { + "epoch": 0.8102210151586313, + "grad_norm": 0.5770492698911255, + "learning_rate": 0.00013484386986558516, + "loss": 12.2898, + "step": 14879 + }, + { + "epoch": 0.8102754691552143, + "grad_norm": 0.7417205778001171, + "learning_rate": 0.0001348356040983448, + "loss": 12.324, + "step": 14880 + }, + { + "epoch": 0.8103299231517973, + "grad_norm": 0.5670109763720117, + "learning_rate": 0.00013482733806021693, + "loss": 12.1479, + "step": 14881 + }, + { + "epoch": 0.8103843771483803, + "grad_norm": 0.6161954924055218, + "learning_rate": 0.00013481907175126582, + "loss": 12.3676, + "step": 14882 + }, + { + "epoch": 0.8104388311449634, + "grad_norm": 0.5659342862521876, + "learning_rate": 0.00013481080517155578, + "loss": 12.1375, + "step": 14883 + }, + { + "epoch": 0.8104932851415464, + "grad_norm": 0.5521924434604571, + "learning_rate": 0.00013480253832115108, + "loss": 12.1946, + "step": 14884 + }, + { + "epoch": 0.8105477391381294, + "grad_norm": 0.5282392078953213, + "learning_rate": 0.000134794271200116, + "loss": 12.1081, + "step": 14885 + }, + { + "epoch": 0.8106021931347124, + "grad_norm": 0.5869486267483298, + "learning_rate": 0.00013478600380851486, + "loss": 12.1544, + "step": 14886 + }, + { + "epoch": 0.8106566471312954, + "grad_norm": 0.6393240761158134, + "learning_rate": 0.00013477773614641188, + "loss": 12.2286, + "step": 14887 + }, + { + "epoch": 0.8107111011278784, + "grad_norm": 0.5865865463438562, + "learning_rate": 0.0001347694682138714, + "loss": 12.1453, + "step": 14888 + }, + { + "epoch": 0.8107655551244615, + "grad_norm": 0.6034268114450104, + "learning_rate": 0.0001347612000109577, + "loss": 12.1117, + "step": 14889 + }, + { + "epoch": 0.8108200091210445, + "grad_norm": 0.5755438394885927, + "learning_rate": 0.0001347529315377351, + "loss": 12.2393, + "step": 14890 + }, + { + "epoch": 0.8108744631176275, + "grad_norm": 0.5920600215118633, + "learning_rate": 0.00013474466279426788, + "loss": 12.2165, + "step": 14891 + }, + { + "epoch": 0.8109289171142104, + "grad_norm": 0.5869621005908414, + "learning_rate": 0.00013473639378062035, + "loss": 12.1462, + "step": 14892 + }, + { + "epoch": 0.8109833711107934, + "grad_norm": 0.6218140884335485, + "learning_rate": 0.00013472812449685675, + "loss": 12.2385, + "step": 14893 + }, + { + "epoch": 0.8110378251073764, + "grad_norm": 0.6585378622196876, + "learning_rate": 0.00013471985494304143, + "loss": 12.2975, + "step": 14894 + }, + { + "epoch": 0.8110922791039595, + "grad_norm": 0.603287054489097, + "learning_rate": 0.00013471158511923876, + "loss": 12.1148, + "step": 14895 + }, + { + "epoch": 0.8111467331005425, + "grad_norm": 0.5996342050759436, + "learning_rate": 0.00013470331502551293, + "loss": 12.2267, + "step": 14896 + }, + { + "epoch": 0.8112011870971255, + "grad_norm": 0.5561600021162297, + "learning_rate": 0.00013469504466192831, + "loss": 12.227, + "step": 14897 + }, + { + "epoch": 0.8112556410937085, + "grad_norm": 0.5989965552789527, + "learning_rate": 0.0001346867740285492, + "loss": 12.1776, + "step": 14898 + }, + { + "epoch": 0.8113100950902915, + "grad_norm": 0.5785238524484136, + "learning_rate": 0.00013467850312543994, + "loss": 12.2839, + "step": 14899 + }, + { + "epoch": 0.8113645490868745, + "grad_norm": 0.6157536243138646, + "learning_rate": 0.0001346702319526648, + "loss": 12.216, + "step": 14900 + }, + { + "epoch": 0.8114190030834576, + "grad_norm": 0.6139216252470333, + "learning_rate": 0.00013466196051028814, + "loss": 12.3419, + "step": 14901 + }, + { + "epoch": 0.8114734570800406, + "grad_norm": 0.565349217936788, + "learning_rate": 0.00013465368879837425, + "loss": 12.2088, + "step": 14902 + }, + { + "epoch": 0.8115279110766236, + "grad_norm": 0.610329727565721, + "learning_rate": 0.00013464541681698747, + "loss": 12.246, + "step": 14903 + }, + { + "epoch": 0.8115823650732066, + "grad_norm": 0.6121022967448893, + "learning_rate": 0.0001346371445661921, + "loss": 12.1796, + "step": 14904 + }, + { + "epoch": 0.8116368190697896, + "grad_norm": 0.5470315206379218, + "learning_rate": 0.00013462887204605253, + "loss": 12.0974, + "step": 14905 + }, + { + "epoch": 0.8116912730663726, + "grad_norm": 0.5627392323847954, + "learning_rate": 0.00013462059925663299, + "loss": 12.1349, + "step": 14906 + }, + { + "epoch": 0.8117457270629557, + "grad_norm": 0.6150381921746256, + "learning_rate": 0.0001346123261979979, + "loss": 12.1735, + "step": 14907 + }, + { + "epoch": 0.8118001810595387, + "grad_norm": 0.539941217460702, + "learning_rate": 0.00013460405287021155, + "loss": 12.1675, + "step": 14908 + }, + { + "epoch": 0.8118546350561217, + "grad_norm": 0.6506378340672261, + "learning_rate": 0.0001345957792733383, + "loss": 12.1344, + "step": 14909 + }, + { + "epoch": 0.8119090890527046, + "grad_norm": 0.5772058084352601, + "learning_rate": 0.00013458750540744244, + "loss": 12.2413, + "step": 14910 + }, + { + "epoch": 0.8119635430492876, + "grad_norm": 0.5675206244256016, + "learning_rate": 0.00013457923127258833, + "loss": 12.0752, + "step": 14911 + }, + { + "epoch": 0.8120179970458706, + "grad_norm": 0.5806601939090454, + "learning_rate": 0.00013457095686884033, + "loss": 12.2102, + "step": 14912 + }, + { + "epoch": 0.8120724510424537, + "grad_norm": 0.6070178312564786, + "learning_rate": 0.00013456268219626277, + "loss": 12.2825, + "step": 14913 + }, + { + "epoch": 0.8121269050390367, + "grad_norm": 0.6414391844447898, + "learning_rate": 0.00013455440725492, + "loss": 12.1771, + "step": 14914 + }, + { + "epoch": 0.8121813590356197, + "grad_norm": 0.593889169013522, + "learning_rate": 0.00013454613204487637, + "loss": 12.1321, + "step": 14915 + }, + { + "epoch": 0.8122358130322027, + "grad_norm": 0.6188895270570187, + "learning_rate": 0.00013453785656619623, + "loss": 12.3201, + "step": 14916 + }, + { + "epoch": 0.8122902670287857, + "grad_norm": 0.6319531156353801, + "learning_rate": 0.00013452958081894392, + "loss": 12.2275, + "step": 14917 + }, + { + "epoch": 0.8123447210253688, + "grad_norm": 0.6033406417544306, + "learning_rate": 0.0001345213048031838, + "loss": 12.2757, + "step": 14918 + }, + { + "epoch": 0.8123991750219518, + "grad_norm": 0.5136940686796113, + "learning_rate": 0.00013451302851898023, + "loss": 12.141, + "step": 14919 + }, + { + "epoch": 0.8124536290185348, + "grad_norm": 0.5767843735213666, + "learning_rate": 0.00013450475196639754, + "loss": 12.2928, + "step": 14920 + }, + { + "epoch": 0.8125080830151178, + "grad_norm": 0.5360562112845915, + "learning_rate": 0.00013449647514550013, + "loss": 12.2221, + "step": 14921 + }, + { + "epoch": 0.8125625370117008, + "grad_norm": 0.5779506182159675, + "learning_rate": 0.00013448819805635234, + "loss": 12.184, + "step": 14922 + }, + { + "epoch": 0.8126169910082838, + "grad_norm": 0.5536109511509439, + "learning_rate": 0.0001344799206990185, + "loss": 12.2786, + "step": 14923 + }, + { + "epoch": 0.8126714450048669, + "grad_norm": 0.530682294119379, + "learning_rate": 0.0001344716430735631, + "loss": 12.1823, + "step": 14924 + }, + { + "epoch": 0.8127258990014499, + "grad_norm": 0.5865372297258536, + "learning_rate": 0.00013446336518005037, + "loss": 12.1718, + "step": 14925 + }, + { + "epoch": 0.8127803529980329, + "grad_norm": 0.5676681573906274, + "learning_rate": 0.00013445508701854473, + "loss": 12.1888, + "step": 14926 + }, + { + "epoch": 0.8128348069946159, + "grad_norm": 0.5783128967423121, + "learning_rate": 0.00013444680858911055, + "loss": 12.2015, + "step": 14927 + }, + { + "epoch": 0.8128892609911988, + "grad_norm": 0.6538533256948675, + "learning_rate": 0.00013443852989181222, + "loss": 12.3719, + "step": 14928 + }, + { + "epoch": 0.8129437149877818, + "grad_norm": 0.5702116895106648, + "learning_rate": 0.0001344302509267141, + "loss": 12.2315, + "step": 14929 + }, + { + "epoch": 0.8129981689843649, + "grad_norm": 0.5575423623731566, + "learning_rate": 0.0001344219716938806, + "loss": 12.1814, + "step": 14930 + }, + { + "epoch": 0.8130526229809479, + "grad_norm": 0.6088179847352079, + "learning_rate": 0.00013441369219337605, + "loss": 12.1685, + "step": 14931 + }, + { + "epoch": 0.8131070769775309, + "grad_norm": 0.607212284364403, + "learning_rate": 0.00013440541242526485, + "loss": 12.3098, + "step": 14932 + }, + { + "epoch": 0.8131615309741139, + "grad_norm": 0.567310137687238, + "learning_rate": 0.00013439713238961142, + "loss": 12.0795, + "step": 14933 + }, + { + "epoch": 0.8132159849706969, + "grad_norm": 0.5280748988570373, + "learning_rate": 0.0001343888520864801, + "loss": 12.2618, + "step": 14934 + }, + { + "epoch": 0.8132704389672799, + "grad_norm": 0.6022799459452628, + "learning_rate": 0.00013438057151593532, + "loss": 12.2013, + "step": 14935 + }, + { + "epoch": 0.813324892963863, + "grad_norm": 0.5395486494308462, + "learning_rate": 0.00013437229067804146, + "loss": 12.1533, + "step": 14936 + }, + { + "epoch": 0.813379346960446, + "grad_norm": 0.5345849608595724, + "learning_rate": 0.00013436400957286286, + "loss": 12.2119, + "step": 14937 + }, + { + "epoch": 0.813433800957029, + "grad_norm": 0.5223620841321481, + "learning_rate": 0.00013435572820046397, + "loss": 12.1625, + "step": 14938 + }, + { + "epoch": 0.813488254953612, + "grad_norm": 0.6776491326508257, + "learning_rate": 0.0001343474465609092, + "loss": 12.2583, + "step": 14939 + }, + { + "epoch": 0.813542708950195, + "grad_norm": 0.5517732802269188, + "learning_rate": 0.00013433916465426294, + "loss": 12.069, + "step": 14940 + }, + { + "epoch": 0.813597162946778, + "grad_norm": 0.5566442191844937, + "learning_rate": 0.00013433088248058955, + "loss": 12.2793, + "step": 14941 + }, + { + "epoch": 0.8136516169433611, + "grad_norm": 0.5713057966290501, + "learning_rate": 0.00013432260003995347, + "loss": 12.1679, + "step": 14942 + }, + { + "epoch": 0.8137060709399441, + "grad_norm": 0.5897516315170662, + "learning_rate": 0.00013431431733241907, + "loss": 12.1071, + "step": 14943 + }, + { + "epoch": 0.8137605249365271, + "grad_norm": 0.5827581462897125, + "learning_rate": 0.00013430603435805077, + "loss": 12.3552, + "step": 14944 + }, + { + "epoch": 0.8138149789331101, + "grad_norm": 0.5386720004141335, + "learning_rate": 0.00013429775111691304, + "loss": 12.1711, + "step": 14945 + }, + { + "epoch": 0.813869432929693, + "grad_norm": 0.6672047986879622, + "learning_rate": 0.00013428946760907025, + "loss": 12.0904, + "step": 14946 + }, + { + "epoch": 0.813923886926276, + "grad_norm": 0.570405414930972, + "learning_rate": 0.00013428118383458678, + "loss": 12.1777, + "step": 14947 + }, + { + "epoch": 0.8139783409228591, + "grad_norm": 0.5348817899525419, + "learning_rate": 0.00013427289979352707, + "loss": 12.181, + "step": 14948 + }, + { + "epoch": 0.8140327949194421, + "grad_norm": 0.5564605962690318, + "learning_rate": 0.00013426461548595556, + "loss": 12.2239, + "step": 14949 + }, + { + "epoch": 0.8140872489160251, + "grad_norm": 0.5826757439277491, + "learning_rate": 0.00013425633091193666, + "loss": 12.0763, + "step": 14950 + }, + { + "epoch": 0.8141417029126081, + "grad_norm": 0.6392072154759486, + "learning_rate": 0.00013424804607153478, + "loss": 12.2083, + "step": 14951 + }, + { + "epoch": 0.8141961569091911, + "grad_norm": 0.5278847783302267, + "learning_rate": 0.00013423976096481435, + "loss": 12.0883, + "step": 14952 + }, + { + "epoch": 0.8142506109057742, + "grad_norm": 0.5667153367275108, + "learning_rate": 0.00013423147559183982, + "loss": 12.1424, + "step": 14953 + }, + { + "epoch": 0.8143050649023572, + "grad_norm": 0.6059042657487198, + "learning_rate": 0.00013422318995267554, + "loss": 12.1883, + "step": 14954 + }, + { + "epoch": 0.8143595188989402, + "grad_norm": 0.584830107343991, + "learning_rate": 0.00013421490404738604, + "loss": 12.2183, + "step": 14955 + }, + { + "epoch": 0.8144139728955232, + "grad_norm": 0.6678900363954223, + "learning_rate": 0.0001342066178760357, + "loss": 12.2575, + "step": 14956 + }, + { + "epoch": 0.8144684268921062, + "grad_norm": 0.563722864109056, + "learning_rate": 0.00013419833143868897, + "loss": 12.1862, + "step": 14957 + }, + { + "epoch": 0.8145228808886892, + "grad_norm": 0.6231865053320498, + "learning_rate": 0.00013419004473541027, + "loss": 12.1494, + "step": 14958 + }, + { + "epoch": 0.8145773348852723, + "grad_norm": 0.6414733001046949, + "learning_rate": 0.000134181757766264, + "loss": 12.2958, + "step": 14959 + }, + { + "epoch": 0.8146317888818553, + "grad_norm": 0.5652240401860344, + "learning_rate": 0.0001341734705313147, + "loss": 12.1767, + "step": 14960 + }, + { + "epoch": 0.8146862428784383, + "grad_norm": 0.5747060543173036, + "learning_rate": 0.0001341651830306268, + "loss": 12.1846, + "step": 14961 + }, + { + "epoch": 0.8147406968750213, + "grad_norm": 0.6027332207345627, + "learning_rate": 0.00013415689526426465, + "loss": 12.1885, + "step": 14962 + }, + { + "epoch": 0.8147951508716043, + "grad_norm": 0.5982557446880276, + "learning_rate": 0.00013414860723229277, + "loss": 12.173, + "step": 14963 + }, + { + "epoch": 0.8148496048681872, + "grad_norm": 0.557802973894175, + "learning_rate": 0.00013414031893477558, + "loss": 12.2135, + "step": 14964 + }, + { + "epoch": 0.8149040588647704, + "grad_norm": 0.5448109877067271, + "learning_rate": 0.00013413203037177754, + "loss": 12.146, + "step": 14965 + }, + { + "epoch": 0.8149585128613533, + "grad_norm": 0.5656869069438613, + "learning_rate": 0.00013412374154336316, + "loss": 12.1739, + "step": 14966 + }, + { + "epoch": 0.8150129668579363, + "grad_norm": 0.5997702753121104, + "learning_rate": 0.0001341154524495968, + "loss": 12.2298, + "step": 14967 + }, + { + "epoch": 0.8150674208545193, + "grad_norm": 0.5748158705419266, + "learning_rate": 0.00013410716309054295, + "loss": 12.3095, + "step": 14968 + }, + { + "epoch": 0.8151218748511023, + "grad_norm": 0.6540648899325642, + "learning_rate": 0.00013409887346626612, + "loss": 12.1783, + "step": 14969 + }, + { + "epoch": 0.8151763288476853, + "grad_norm": 0.5447230236480691, + "learning_rate": 0.0001340905835768307, + "loss": 12.2082, + "step": 14970 + }, + { + "epoch": 0.8152307828442684, + "grad_norm": 0.5399966726816193, + "learning_rate": 0.0001340822934223012, + "loss": 12.118, + "step": 14971 + }, + { + "epoch": 0.8152852368408514, + "grad_norm": 0.5574187249050045, + "learning_rate": 0.00013407400300274207, + "loss": 12.1641, + "step": 14972 + }, + { + "epoch": 0.8153396908374344, + "grad_norm": 0.5774856992493318, + "learning_rate": 0.00013406571231821775, + "loss": 12.2274, + "step": 14973 + }, + { + "epoch": 0.8153941448340174, + "grad_norm": 0.5995070765895913, + "learning_rate": 0.00013405742136879278, + "loss": 12.2242, + "step": 14974 + }, + { + "epoch": 0.8154485988306004, + "grad_norm": 0.5695205010593473, + "learning_rate": 0.00013404913015453157, + "loss": 12.1707, + "step": 14975 + }, + { + "epoch": 0.8155030528271834, + "grad_norm": 0.5626070337798693, + "learning_rate": 0.00013404083867549863, + "loss": 12.2188, + "step": 14976 + }, + { + "epoch": 0.8155575068237665, + "grad_norm": 0.6446567968174487, + "learning_rate": 0.0001340325469317584, + "loss": 12.254, + "step": 14977 + }, + { + "epoch": 0.8156119608203495, + "grad_norm": 0.578028180559197, + "learning_rate": 0.00013402425492337538, + "loss": 12.0871, + "step": 14978 + }, + { + "epoch": 0.8156664148169325, + "grad_norm": 0.5707444550148375, + "learning_rate": 0.00013401596265041405, + "loss": 12.2326, + "step": 14979 + }, + { + "epoch": 0.8157208688135155, + "grad_norm": 0.5790218064683561, + "learning_rate": 0.0001340076701129389, + "loss": 12.1651, + "step": 14980 + }, + { + "epoch": 0.8157753228100985, + "grad_norm": 0.5530111516710103, + "learning_rate": 0.0001339993773110144, + "loss": 12.2381, + "step": 14981 + }, + { + "epoch": 0.8158297768066815, + "grad_norm": 0.5525339506735362, + "learning_rate": 0.00013399108424470504, + "loss": 12.1943, + "step": 14982 + }, + { + "epoch": 0.8158842308032646, + "grad_norm": 0.6791189230415343, + "learning_rate": 0.0001339827909140753, + "loss": 12.2272, + "step": 14983 + }, + { + "epoch": 0.8159386847998475, + "grad_norm": 0.8677941890684707, + "learning_rate": 0.00013397449731918968, + "loss": 12.2352, + "step": 14984 + }, + { + "epoch": 0.8159931387964305, + "grad_norm": 0.5423445874305333, + "learning_rate": 0.00013396620346011267, + "loss": 12.2357, + "step": 14985 + }, + { + "epoch": 0.8160475927930135, + "grad_norm": 0.5993168559027767, + "learning_rate": 0.00013395790933690878, + "loss": 12.2122, + "step": 14986 + }, + { + "epoch": 0.8161020467895965, + "grad_norm": 0.5884806571904675, + "learning_rate": 0.0001339496149496425, + "loss": 12.2743, + "step": 14987 + }, + { + "epoch": 0.8161565007861796, + "grad_norm": 0.5554630343945989, + "learning_rate": 0.00013394132029837828, + "loss": 11.9055, + "step": 14988 + }, + { + "epoch": 0.8162109547827626, + "grad_norm": 0.5215621664224854, + "learning_rate": 0.0001339330253831807, + "loss": 12.1985, + "step": 14989 + }, + { + "epoch": 0.8162654087793456, + "grad_norm": 0.6540481921438157, + "learning_rate": 0.0001339247302041142, + "loss": 12.2831, + "step": 14990 + }, + { + "epoch": 0.8163198627759286, + "grad_norm": 0.6157703375357999, + "learning_rate": 0.00013391643476124335, + "loss": 12.262, + "step": 14991 + }, + { + "epoch": 0.8163743167725116, + "grad_norm": 0.6485824430647104, + "learning_rate": 0.00013390813905463255, + "loss": 12.2356, + "step": 14992 + }, + { + "epoch": 0.8164287707690946, + "grad_norm": 0.5571332481181887, + "learning_rate": 0.0001338998430843464, + "loss": 12.2123, + "step": 14993 + }, + { + "epoch": 0.8164832247656777, + "grad_norm": 0.6334643771864544, + "learning_rate": 0.0001338915468504494, + "loss": 12.2523, + "step": 14994 + }, + { + "epoch": 0.8165376787622607, + "grad_norm": 0.5118559097275878, + "learning_rate": 0.00013388325035300605, + "loss": 12.1212, + "step": 14995 + }, + { + "epoch": 0.8165921327588437, + "grad_norm": 0.5741352405113275, + "learning_rate": 0.00013387495359208087, + "loss": 12.1782, + "step": 14996 + }, + { + "epoch": 0.8166465867554267, + "grad_norm": 0.7133859869683401, + "learning_rate": 0.00013386665656773834, + "loss": 12.1758, + "step": 14997 + }, + { + "epoch": 0.8167010407520097, + "grad_norm": 0.6902711214672571, + "learning_rate": 0.00013385835928004302, + "loss": 12.2727, + "step": 14998 + }, + { + "epoch": 0.8167554947485927, + "grad_norm": 0.5711108832715581, + "learning_rate": 0.00013385006172905942, + "loss": 12.0462, + "step": 14999 + }, + { + "epoch": 0.8168099487451758, + "grad_norm": 0.7150747556761515, + "learning_rate": 0.00013384176391485205, + "loss": 12.3261, + "step": 15000 + }, + { + "epoch": 0.8168644027417588, + "grad_norm": 0.5812774549161445, + "learning_rate": 0.0001338334658374855, + "loss": 12.1661, + "step": 15001 + }, + { + "epoch": 0.8169188567383417, + "grad_norm": 0.5824205643528076, + "learning_rate": 0.0001338251674970242, + "loss": 12.1429, + "step": 15002 + }, + { + "epoch": 0.8169733107349247, + "grad_norm": 0.5720606638834883, + "learning_rate": 0.00013381686889353273, + "loss": 12.2057, + "step": 15003 + }, + { + "epoch": 0.8170277647315077, + "grad_norm": 0.5880657890604791, + "learning_rate": 0.00013380857002707563, + "loss": 12.3358, + "step": 15004 + }, + { + "epoch": 0.8170822187280907, + "grad_norm": 0.6258769532904722, + "learning_rate": 0.0001338002708977174, + "loss": 12.2655, + "step": 15005 + }, + { + "epoch": 0.8171366727246738, + "grad_norm": 0.6101374276518876, + "learning_rate": 0.00013379197150552262, + "loss": 12.2291, + "step": 15006 + }, + { + "epoch": 0.8171911267212568, + "grad_norm": 0.512928837530879, + "learning_rate": 0.0001337836718505558, + "loss": 12.1752, + "step": 15007 + }, + { + "epoch": 0.8172455807178398, + "grad_norm": 0.5762590538623538, + "learning_rate": 0.00013377537193288145, + "loss": 12.1972, + "step": 15008 + }, + { + "epoch": 0.8173000347144228, + "grad_norm": 0.5424482566110911, + "learning_rate": 0.00013376707175256417, + "loss": 12.0931, + "step": 15009 + }, + { + "epoch": 0.8173544887110058, + "grad_norm": 0.5571591334733318, + "learning_rate": 0.00013375877130966847, + "loss": 12.2593, + "step": 15010 + }, + { + "epoch": 0.8174089427075888, + "grad_norm": 0.5990429172530969, + "learning_rate": 0.00013375047060425893, + "loss": 12.3188, + "step": 15011 + }, + { + "epoch": 0.8174633967041719, + "grad_norm": 0.5771953018397764, + "learning_rate": 0.00013374216963640004, + "loss": 12.1905, + "step": 15012 + }, + { + "epoch": 0.8175178507007549, + "grad_norm": 0.5856012728256729, + "learning_rate": 0.0001337338684061564, + "loss": 12.2053, + "step": 15013 + }, + { + "epoch": 0.8175723046973379, + "grad_norm": 0.5262269838088067, + "learning_rate": 0.0001337255669135925, + "loss": 12.2862, + "step": 15014 + }, + { + "epoch": 0.8176267586939209, + "grad_norm": 0.553391820209899, + "learning_rate": 0.000133717265158773, + "loss": 12.2014, + "step": 15015 + }, + { + "epoch": 0.8176812126905039, + "grad_norm": 0.568890866266896, + "learning_rate": 0.00013370896314176235, + "loss": 12.2971, + "step": 15016 + }, + { + "epoch": 0.817735666687087, + "grad_norm": 0.616745452688003, + "learning_rate": 0.00013370066086262517, + "loss": 12.232, + "step": 15017 + }, + { + "epoch": 0.81779012068367, + "grad_norm": 0.5888482042030274, + "learning_rate": 0.00013369235832142598, + "loss": 12.2087, + "step": 15018 + }, + { + "epoch": 0.817844574680253, + "grad_norm": 0.606699835744316, + "learning_rate": 0.00013368405551822935, + "loss": 12.2074, + "step": 15019 + }, + { + "epoch": 0.817899028676836, + "grad_norm": 0.6539207177383145, + "learning_rate": 0.00013367575245309987, + "loss": 12.1792, + "step": 15020 + }, + { + "epoch": 0.8179534826734189, + "grad_norm": 0.5783875286795106, + "learning_rate": 0.0001336674491261021, + "loss": 12.2332, + "step": 15021 + }, + { + "epoch": 0.8180079366700019, + "grad_norm": 0.6137768782047479, + "learning_rate": 0.0001336591455373006, + "loss": 12.1272, + "step": 15022 + }, + { + "epoch": 0.818062390666585, + "grad_norm": 0.6339487854126397, + "learning_rate": 0.00013365084168675994, + "loss": 12.1809, + "step": 15023 + }, + { + "epoch": 0.818116844663168, + "grad_norm": 0.57824407345384, + "learning_rate": 0.00013364253757454467, + "loss": 12.3144, + "step": 15024 + }, + { + "epoch": 0.818171298659751, + "grad_norm": 0.5894692927948438, + "learning_rate": 0.00013363423320071938, + "loss": 12.2668, + "step": 15025 + }, + { + "epoch": 0.818225752656334, + "grad_norm": 0.5856085183522631, + "learning_rate": 0.00013362592856534873, + "loss": 12.1982, + "step": 15026 + }, + { + "epoch": 0.818280206652917, + "grad_norm": 0.5610742843623008, + "learning_rate": 0.00013361762366849715, + "loss": 12.2298, + "step": 15027 + }, + { + "epoch": 0.8183346606495, + "grad_norm": 0.540835830705468, + "learning_rate": 0.00013360931851022931, + "loss": 12.19, + "step": 15028 + }, + { + "epoch": 0.8183891146460831, + "grad_norm": 0.5510492670300929, + "learning_rate": 0.00013360101309060974, + "loss": 12.3049, + "step": 15029 + }, + { + "epoch": 0.8184435686426661, + "grad_norm": 0.5300153454770301, + "learning_rate": 0.0001335927074097031, + "loss": 12.014, + "step": 15030 + }, + { + "epoch": 0.8184980226392491, + "grad_norm": 0.5576465141288676, + "learning_rate": 0.0001335844014675739, + "loss": 12.1904, + "step": 15031 + }, + { + "epoch": 0.8185524766358321, + "grad_norm": 0.5183661440156276, + "learning_rate": 0.0001335760952642868, + "loss": 12.2148, + "step": 15032 + }, + { + "epoch": 0.8186069306324151, + "grad_norm": 0.5266003458757883, + "learning_rate": 0.00013356778879990632, + "loss": 12.1942, + "step": 15033 + }, + { + "epoch": 0.8186613846289981, + "grad_norm": 0.5466301509703855, + "learning_rate": 0.0001335594820744971, + "loss": 12.1378, + "step": 15034 + }, + { + "epoch": 0.8187158386255812, + "grad_norm": 0.5023351188898104, + "learning_rate": 0.00013355117508812372, + "loss": 12.106, + "step": 15035 + }, + { + "epoch": 0.8187702926221642, + "grad_norm": 0.660955828481278, + "learning_rate": 0.00013354286784085078, + "loss": 12.1578, + "step": 15036 + }, + { + "epoch": 0.8188247466187472, + "grad_norm": 0.5585541576290538, + "learning_rate": 0.00013353456033274286, + "loss": 12.1009, + "step": 15037 + }, + { + "epoch": 0.8188792006153301, + "grad_norm": 0.5276413850534565, + "learning_rate": 0.0001335262525638646, + "loss": 12.2114, + "step": 15038 + }, + { + "epoch": 0.8189336546119131, + "grad_norm": 0.6796528046414316, + "learning_rate": 0.00013351794453428056, + "loss": 12.5493, + "step": 15039 + }, + { + "epoch": 0.8189881086084961, + "grad_norm": 0.5857017757416788, + "learning_rate": 0.00013350963624405538, + "loss": 12.196, + "step": 15040 + }, + { + "epoch": 0.8190425626050792, + "grad_norm": 0.5514624615441971, + "learning_rate": 0.00013350132769325362, + "loss": 12.152, + "step": 15041 + }, + { + "epoch": 0.8190970166016622, + "grad_norm": 0.5636971843317338, + "learning_rate": 0.00013349301888193992, + "loss": 12.2295, + "step": 15042 + }, + { + "epoch": 0.8191514705982452, + "grad_norm": 0.5006540132671965, + "learning_rate": 0.0001334847098101789, + "loss": 12.1136, + "step": 15043 + }, + { + "epoch": 0.8192059245948282, + "grad_norm": 0.5326584324047584, + "learning_rate": 0.00013347640047803517, + "loss": 12.065, + "step": 15044 + }, + { + "epoch": 0.8192603785914112, + "grad_norm": 0.5372718477066166, + "learning_rate": 0.00013346809088557332, + "loss": 12.2216, + "step": 15045 + }, + { + "epoch": 0.8193148325879942, + "grad_norm": 0.5359071036987115, + "learning_rate": 0.000133459781032858, + "loss": 12.1195, + "step": 15046 + }, + { + "epoch": 0.8193692865845773, + "grad_norm": 0.5939841514566832, + "learning_rate": 0.00013345147091995378, + "loss": 12.1763, + "step": 15047 + }, + { + "epoch": 0.8194237405811603, + "grad_norm": 0.5482117414915757, + "learning_rate": 0.00013344316054692533, + "loss": 12.176, + "step": 15048 + }, + { + "epoch": 0.8194781945777433, + "grad_norm": 0.5910514392352234, + "learning_rate": 0.0001334348499138373, + "loss": 12.2198, + "step": 15049 + }, + { + "epoch": 0.8195326485743263, + "grad_norm": 0.5148094608300227, + "learning_rate": 0.00013342653902075418, + "loss": 12.2085, + "step": 15050 + }, + { + "epoch": 0.8195871025709093, + "grad_norm": 0.6351328799642851, + "learning_rate": 0.00013341822786774076, + "loss": 12.2853, + "step": 15051 + }, + { + "epoch": 0.8196415565674924, + "grad_norm": 0.5943046099493197, + "learning_rate": 0.00013340991645486157, + "loss": 12.1503, + "step": 15052 + }, + { + "epoch": 0.8196960105640754, + "grad_norm": 0.7340126164156044, + "learning_rate": 0.00013340160478218126, + "loss": 12.3678, + "step": 15053 + }, + { + "epoch": 0.8197504645606584, + "grad_norm": 0.5561954548461967, + "learning_rate": 0.00013339329284976447, + "loss": 12.2121, + "step": 15054 + }, + { + "epoch": 0.8198049185572414, + "grad_norm": 0.5936471684288838, + "learning_rate": 0.00013338498065767587, + "loss": 12.2002, + "step": 15055 + }, + { + "epoch": 0.8198593725538244, + "grad_norm": 0.5767221429828392, + "learning_rate": 0.00013337666820598001, + "loss": 12.2062, + "step": 15056 + }, + { + "epoch": 0.8199138265504073, + "grad_norm": 0.5520368662964245, + "learning_rate": 0.0001333683554947416, + "loss": 12.2063, + "step": 15057 + }, + { + "epoch": 0.8199682805469904, + "grad_norm": 0.5753584754006462, + "learning_rate": 0.00013336004252402527, + "loss": 12.1897, + "step": 15058 + }, + { + "epoch": 0.8200227345435734, + "grad_norm": 0.533969251635747, + "learning_rate": 0.00013335172929389565, + "loss": 12.1252, + "step": 15059 + }, + { + "epoch": 0.8200771885401564, + "grad_norm": 0.5427429013322442, + "learning_rate": 0.0001333434158044174, + "loss": 12.1885, + "step": 15060 + }, + { + "epoch": 0.8201316425367394, + "grad_norm": 0.4952467730062113, + "learning_rate": 0.00013333510205565516, + "loss": 12.0779, + "step": 15061 + }, + { + "epoch": 0.8201860965333224, + "grad_norm": 0.5345066578384771, + "learning_rate": 0.00013332678804767358, + "loss": 12.2012, + "step": 15062 + }, + { + "epoch": 0.8202405505299054, + "grad_norm": 0.552949495917493, + "learning_rate": 0.00013331847378053726, + "loss": 12.1347, + "step": 15063 + }, + { + "epoch": 0.8202950045264885, + "grad_norm": 0.5550958159476888, + "learning_rate": 0.00013331015925431095, + "loss": 12.2529, + "step": 15064 + }, + { + "epoch": 0.8203494585230715, + "grad_norm": 0.5544262032236434, + "learning_rate": 0.00013330184446905922, + "loss": 12.0939, + "step": 15065 + }, + { + "epoch": 0.8204039125196545, + "grad_norm": 0.5958917087716364, + "learning_rate": 0.00013329352942484678, + "loss": 12.1504, + "step": 15066 + }, + { + "epoch": 0.8204583665162375, + "grad_norm": 0.550229500277586, + "learning_rate": 0.0001332852141217383, + "loss": 12.101, + "step": 15067 + }, + { + "epoch": 0.8205128205128205, + "grad_norm": 0.5225662194495485, + "learning_rate": 0.00013327689855979836, + "loss": 12.1023, + "step": 15068 + }, + { + "epoch": 0.8205672745094035, + "grad_norm": 0.530911218491028, + "learning_rate": 0.00013326858273909168, + "loss": 12.2245, + "step": 15069 + }, + { + "epoch": 0.8206217285059866, + "grad_norm": 0.5554527417021022, + "learning_rate": 0.00013326026665968295, + "loss": 12.1633, + "step": 15070 + }, + { + "epoch": 0.8206761825025696, + "grad_norm": 0.6875066978735751, + "learning_rate": 0.00013325195032163682, + "loss": 12.2125, + "step": 15071 + }, + { + "epoch": 0.8207306364991526, + "grad_norm": 0.6042554096651395, + "learning_rate": 0.00013324363372501795, + "loss": 12.2184, + "step": 15072 + }, + { + "epoch": 0.8207850904957356, + "grad_norm": 0.5588330743067605, + "learning_rate": 0.00013323531686989097, + "loss": 12.1346, + "step": 15073 + }, + { + "epoch": 0.8208395444923186, + "grad_norm": 0.6216993891491206, + "learning_rate": 0.0001332269997563206, + "loss": 12.2326, + "step": 15074 + }, + { + "epoch": 0.8208939984889015, + "grad_norm": 0.5679478272925854, + "learning_rate": 0.00013321868238437153, + "loss": 12.2848, + "step": 15075 + }, + { + "epoch": 0.8209484524854846, + "grad_norm": 0.6401907090556928, + "learning_rate": 0.0001332103647541084, + "loss": 12.2149, + "step": 15076 + }, + { + "epoch": 0.8210029064820676, + "grad_norm": 0.5509524927768689, + "learning_rate": 0.00013320204686559592, + "loss": 12.1914, + "step": 15077 + }, + { + "epoch": 0.8210573604786506, + "grad_norm": 0.5333250539046567, + "learning_rate": 0.00013319372871889874, + "loss": 12.1979, + "step": 15078 + }, + { + "epoch": 0.8211118144752336, + "grad_norm": 0.6510537470053677, + "learning_rate": 0.00013318541031408156, + "loss": 12.1866, + "step": 15079 + }, + { + "epoch": 0.8211662684718166, + "grad_norm": 0.5456246146264664, + "learning_rate": 0.00013317709165120903, + "loss": 12.1768, + "step": 15080 + }, + { + "epoch": 0.8212207224683996, + "grad_norm": 0.60234366708656, + "learning_rate": 0.0001331687727303459, + "loss": 12.173, + "step": 15081 + }, + { + "epoch": 0.8212751764649827, + "grad_norm": 0.5435572532492777, + "learning_rate": 0.00013316045355155689, + "loss": 12.2481, + "step": 15082 + }, + { + "epoch": 0.8213296304615657, + "grad_norm": 0.5782641007791894, + "learning_rate": 0.00013315213411490656, + "loss": 12.126, + "step": 15083 + }, + { + "epoch": 0.8213840844581487, + "grad_norm": 0.5774753450730651, + "learning_rate": 0.0001331438144204597, + "loss": 12.0706, + "step": 15084 + }, + { + "epoch": 0.8214385384547317, + "grad_norm": 0.6488453649299252, + "learning_rate": 0.00013313549446828096, + "loss": 12.1743, + "step": 15085 + }, + { + "epoch": 0.8214929924513147, + "grad_norm": 0.5730644068548515, + "learning_rate": 0.00013312717425843508, + "loss": 12.218, + "step": 15086 + }, + { + "epoch": 0.8215474464478978, + "grad_norm": 0.5404712651524503, + "learning_rate": 0.00013311885379098674, + "loss": 12.1142, + "step": 15087 + }, + { + "epoch": 0.8216019004444808, + "grad_norm": 0.5496206771070863, + "learning_rate": 0.00013311053306600066, + "loss": 12.2041, + "step": 15088 + }, + { + "epoch": 0.8216563544410638, + "grad_norm": 0.5862083753367804, + "learning_rate": 0.00013310221208354147, + "loss": 12.2, + "step": 15089 + }, + { + "epoch": 0.8217108084376468, + "grad_norm": 0.6033512458007323, + "learning_rate": 0.00013309389084367396, + "loss": 12.2182, + "step": 15090 + }, + { + "epoch": 0.8217652624342298, + "grad_norm": 0.576839526949148, + "learning_rate": 0.00013308556934646276, + "loss": 12.3081, + "step": 15091 + }, + { + "epoch": 0.8218197164308128, + "grad_norm": 0.5115106817823334, + "learning_rate": 0.0001330772475919727, + "loss": 12.1187, + "step": 15092 + }, + { + "epoch": 0.8218741704273959, + "grad_norm": 0.6423138685815555, + "learning_rate": 0.00013306892558026837, + "loss": 12.3313, + "step": 15093 + }, + { + "epoch": 0.8219286244239788, + "grad_norm": 0.5606849061577902, + "learning_rate": 0.00013306060331141456, + "loss": 12.1125, + "step": 15094 + }, + { + "epoch": 0.8219830784205618, + "grad_norm": 0.5475736926996045, + "learning_rate": 0.0001330522807854759, + "loss": 12.2019, + "step": 15095 + }, + { + "epoch": 0.8220375324171448, + "grad_norm": 0.5095605580907949, + "learning_rate": 0.00013304395800251722, + "loss": 12.1341, + "step": 15096 + }, + { + "epoch": 0.8220919864137278, + "grad_norm": 0.5374397129569775, + "learning_rate": 0.00013303563496260313, + "loss": 12.304, + "step": 15097 + }, + { + "epoch": 0.8221464404103108, + "grad_norm": 0.5965306221315761, + "learning_rate": 0.00013302731166579842, + "loss": 12.108, + "step": 15098 + }, + { + "epoch": 0.8222008944068939, + "grad_norm": 0.6075627779084403, + "learning_rate": 0.00013301898811216782, + "loss": 12.2044, + "step": 15099 + }, + { + "epoch": 0.8222553484034769, + "grad_norm": 0.5798476561085326, + "learning_rate": 0.000133010664301776, + "loss": 12.1951, + "step": 15100 + }, + { + "epoch": 0.8223098024000599, + "grad_norm": 0.5656120162879976, + "learning_rate": 0.00013300234023468774, + "loss": 12.2176, + "step": 15101 + }, + { + "epoch": 0.8223642563966429, + "grad_norm": 0.5392765374070314, + "learning_rate": 0.00013299401591096774, + "loss": 12.3709, + "step": 15102 + }, + { + "epoch": 0.8224187103932259, + "grad_norm": 0.5474554976638221, + "learning_rate": 0.00013298569133068073, + "loss": 12.1893, + "step": 15103 + }, + { + "epoch": 0.8224731643898089, + "grad_norm": 0.5922941430634817, + "learning_rate": 0.00013297736649389147, + "loss": 12.2515, + "step": 15104 + }, + { + "epoch": 0.822527618386392, + "grad_norm": 0.6390417808924062, + "learning_rate": 0.00013296904140066467, + "loss": 12.2673, + "step": 15105 + }, + { + "epoch": 0.822582072382975, + "grad_norm": 0.5320100230544484, + "learning_rate": 0.00013296071605106507, + "loss": 12.1982, + "step": 15106 + }, + { + "epoch": 0.822636526379558, + "grad_norm": 0.574697692543034, + "learning_rate": 0.00013295239044515742, + "loss": 12.2736, + "step": 15107 + }, + { + "epoch": 0.822690980376141, + "grad_norm": 0.5725886367874117, + "learning_rate": 0.00013294406458300644, + "loss": 12.1297, + "step": 15108 + }, + { + "epoch": 0.822745434372724, + "grad_norm": 0.6607264694600522, + "learning_rate": 0.0001329357384646769, + "loss": 12.2569, + "step": 15109 + }, + { + "epoch": 0.822799888369307, + "grad_norm": 0.5687599519704036, + "learning_rate": 0.00013292741209023354, + "loss": 12.2327, + "step": 15110 + }, + { + "epoch": 0.8228543423658901, + "grad_norm": 0.5317134091370369, + "learning_rate": 0.0001329190854597411, + "loss": 12.1822, + "step": 15111 + }, + { + "epoch": 0.822908796362473, + "grad_norm": 0.5366673126182483, + "learning_rate": 0.00013291075857326434, + "loss": 12.1526, + "step": 15112 + }, + { + "epoch": 0.822963250359056, + "grad_norm": 0.6491831756234533, + "learning_rate": 0.00013290243143086798, + "loss": 12.2972, + "step": 15113 + }, + { + "epoch": 0.823017704355639, + "grad_norm": 0.5172806571766189, + "learning_rate": 0.00013289410403261682, + "loss": 12.0933, + "step": 15114 + }, + { + "epoch": 0.823072158352222, + "grad_norm": 0.5683596383075902, + "learning_rate": 0.00013288577637857556, + "loss": 12.2948, + "step": 15115 + }, + { + "epoch": 0.823126612348805, + "grad_norm": 0.5875142686688732, + "learning_rate": 0.00013287744846880904, + "loss": 12.2356, + "step": 15116 + }, + { + "epoch": 0.8231810663453881, + "grad_norm": 0.570076875886587, + "learning_rate": 0.00013286912030338193, + "loss": 12.3245, + "step": 15117 + }, + { + "epoch": 0.8232355203419711, + "grad_norm": 0.6519852031719279, + "learning_rate": 0.000132860791882359, + "loss": 12.1499, + "step": 15118 + }, + { + "epoch": 0.8232899743385541, + "grad_norm": 0.5983918322104361, + "learning_rate": 0.0001328524632058051, + "loss": 12.1896, + "step": 15119 + }, + { + "epoch": 0.8233444283351371, + "grad_norm": 0.5932760518870501, + "learning_rate": 0.0001328441342737849, + "loss": 12.198, + "step": 15120 + }, + { + "epoch": 0.8233988823317201, + "grad_norm": 0.59185708375135, + "learning_rate": 0.00013283580508636323, + "loss": 12.1776, + "step": 15121 + }, + { + "epoch": 0.8234533363283032, + "grad_norm": 0.5858903392430636, + "learning_rate": 0.0001328274756436048, + "loss": 12.1823, + "step": 15122 + }, + { + "epoch": 0.8235077903248862, + "grad_norm": 0.5887880925684659, + "learning_rate": 0.00013281914594557442, + "loss": 12.0918, + "step": 15123 + }, + { + "epoch": 0.8235622443214692, + "grad_norm": 0.540027316882084, + "learning_rate": 0.00013281081599233688, + "loss": 12.1714, + "step": 15124 + }, + { + "epoch": 0.8236166983180522, + "grad_norm": 0.5667513931707916, + "learning_rate": 0.0001328024857839569, + "loss": 12.1175, + "step": 15125 + }, + { + "epoch": 0.8236711523146352, + "grad_norm": 0.6075293531496816, + "learning_rate": 0.00013279415532049932, + "loss": 12.1341, + "step": 15126 + }, + { + "epoch": 0.8237256063112182, + "grad_norm": 0.6328301413775694, + "learning_rate": 0.0001327858246020289, + "loss": 12.2516, + "step": 15127 + }, + { + "epoch": 0.8237800603078013, + "grad_norm": 0.5389076487138665, + "learning_rate": 0.0001327774936286104, + "loss": 12.3203, + "step": 15128 + }, + { + "epoch": 0.8238345143043843, + "grad_norm": 0.5732364690525088, + "learning_rate": 0.00013276916240030856, + "loss": 12.2238, + "step": 15129 + }, + { + "epoch": 0.8238889683009673, + "grad_norm": 0.6314953539941783, + "learning_rate": 0.00013276083091718827, + "loss": 12.236, + "step": 15130 + }, + { + "epoch": 0.8239434222975502, + "grad_norm": 0.5614318534426168, + "learning_rate": 0.00013275249917931424, + "loss": 12.0909, + "step": 15131 + }, + { + "epoch": 0.8239978762941332, + "grad_norm": 0.5909689994430131, + "learning_rate": 0.00013274416718675133, + "loss": 12.1413, + "step": 15132 + }, + { + "epoch": 0.8240523302907162, + "grad_norm": 0.634909287782436, + "learning_rate": 0.00013273583493956425, + "loss": 12.3205, + "step": 15133 + }, + { + "epoch": 0.8241067842872993, + "grad_norm": 0.5915683255470073, + "learning_rate": 0.00013272750243781782, + "loss": 12.0764, + "step": 15134 + }, + { + "epoch": 0.8241612382838823, + "grad_norm": 0.6220220305808242, + "learning_rate": 0.00013271916968157684, + "loss": 12.2925, + "step": 15135 + }, + { + "epoch": 0.8242156922804653, + "grad_norm": 0.5997585424873934, + "learning_rate": 0.00013271083667090614, + "loss": 12.1521, + "step": 15136 + }, + { + "epoch": 0.8242701462770483, + "grad_norm": 0.5278769519062395, + "learning_rate": 0.00013270250340587046, + "loss": 12.1131, + "step": 15137 + }, + { + "epoch": 0.8243246002736313, + "grad_norm": 0.524995503437437, + "learning_rate": 0.00013269416988653468, + "loss": 12.2127, + "step": 15138 + }, + { + "epoch": 0.8243790542702143, + "grad_norm": 0.6492659430294754, + "learning_rate": 0.0001326858361129635, + "loss": 12.2993, + "step": 15139 + }, + { + "epoch": 0.8244335082667974, + "grad_norm": 0.5796193885415427, + "learning_rate": 0.00013267750208522175, + "loss": 12.2585, + "step": 15140 + }, + { + "epoch": 0.8244879622633804, + "grad_norm": 0.5702422691628836, + "learning_rate": 0.00013266916780337433, + "loss": 12.1578, + "step": 15141 + }, + { + "epoch": 0.8245424162599634, + "grad_norm": 0.6843938991700117, + "learning_rate": 0.00013266083326748596, + "loss": 12.4301, + "step": 15142 + }, + { + "epoch": 0.8245968702565464, + "grad_norm": 0.6428570758362845, + "learning_rate": 0.00013265249847762146, + "loss": 12.1424, + "step": 15143 + }, + { + "epoch": 0.8246513242531294, + "grad_norm": 0.5287730976454701, + "learning_rate": 0.00013264416343384568, + "loss": 12.2458, + "step": 15144 + }, + { + "epoch": 0.8247057782497124, + "grad_norm": 0.5955981955761246, + "learning_rate": 0.0001326358281362234, + "loss": 12.1539, + "step": 15145 + }, + { + "epoch": 0.8247602322462955, + "grad_norm": 0.5641366752885426, + "learning_rate": 0.0001326274925848194, + "loss": 12.1635, + "step": 15146 + }, + { + "epoch": 0.8248146862428785, + "grad_norm": 0.5511840238603661, + "learning_rate": 0.00013261915677969862, + "loss": 12.2499, + "step": 15147 + }, + { + "epoch": 0.8248691402394615, + "grad_norm": 0.5706898292626512, + "learning_rate": 0.00013261082072092578, + "loss": 12.1368, + "step": 15148 + }, + { + "epoch": 0.8249235942360444, + "grad_norm": 0.6157811933476967, + "learning_rate": 0.00013260248440856572, + "loss": 12.2726, + "step": 15149 + }, + { + "epoch": 0.8249780482326274, + "grad_norm": 0.5371187183576617, + "learning_rate": 0.00013259414784268328, + "loss": 12.159, + "step": 15150 + }, + { + "epoch": 0.8250325022292105, + "grad_norm": 0.5935211585790439, + "learning_rate": 0.00013258581102334327, + "loss": 12.0911, + "step": 15151 + }, + { + "epoch": 0.8250869562257935, + "grad_norm": 0.5473734635512906, + "learning_rate": 0.00013257747395061054, + "loss": 12.2132, + "step": 15152 + }, + { + "epoch": 0.8251414102223765, + "grad_norm": 0.5040749155652381, + "learning_rate": 0.00013256913662454994, + "loss": 12.1254, + "step": 15153 + }, + { + "epoch": 0.8251958642189595, + "grad_norm": 0.5500514667341897, + "learning_rate": 0.00013256079904522622, + "loss": 12.2533, + "step": 15154 + }, + { + "epoch": 0.8252503182155425, + "grad_norm": 0.5382832797192473, + "learning_rate": 0.00013255246121270429, + "loss": 12.1713, + "step": 15155 + }, + { + "epoch": 0.8253047722121255, + "grad_norm": 0.5520042361371003, + "learning_rate": 0.00013254412312704896, + "loss": 12.1337, + "step": 15156 + }, + { + "epoch": 0.8253592262087086, + "grad_norm": 0.569443223317474, + "learning_rate": 0.00013253578478832507, + "loss": 12.2426, + "step": 15157 + }, + { + "epoch": 0.8254136802052916, + "grad_norm": 0.6446110993756711, + "learning_rate": 0.00013252744619659745, + "loss": 12.1518, + "step": 15158 + }, + { + "epoch": 0.8254681342018746, + "grad_norm": 0.6038693948816372, + "learning_rate": 0.00013251910735193097, + "loss": 12.2149, + "step": 15159 + }, + { + "epoch": 0.8255225881984576, + "grad_norm": 0.6525901872902011, + "learning_rate": 0.00013251076825439043, + "loss": 12.3306, + "step": 15160 + }, + { + "epoch": 0.8255770421950406, + "grad_norm": 0.6481041452764463, + "learning_rate": 0.00013250242890404076, + "loss": 12.283, + "step": 15161 + }, + { + "epoch": 0.8256314961916236, + "grad_norm": 0.5856391893384809, + "learning_rate": 0.0001324940893009467, + "loss": 12.0143, + "step": 15162 + }, + { + "epoch": 0.8256859501882067, + "grad_norm": 0.5459209242515835, + "learning_rate": 0.00013248574944517316, + "loss": 12.1153, + "step": 15163 + }, + { + "epoch": 0.8257404041847897, + "grad_norm": 0.6465502579114393, + "learning_rate": 0.00013247740933678502, + "loss": 12.1841, + "step": 15164 + }, + { + "epoch": 0.8257948581813727, + "grad_norm": 0.7028110202838673, + "learning_rate": 0.00013246906897584706, + "loss": 12.1794, + "step": 15165 + }, + { + "epoch": 0.8258493121779557, + "grad_norm": 0.5582118795351031, + "learning_rate": 0.00013246072836242418, + "loss": 12.1924, + "step": 15166 + }, + { + "epoch": 0.8259037661745386, + "grad_norm": 0.6080292099406148, + "learning_rate": 0.00013245238749658123, + "loss": 12.2244, + "step": 15167 + }, + { + "epoch": 0.8259582201711216, + "grad_norm": 0.6377228813808977, + "learning_rate": 0.0001324440463783831, + "loss": 12.0584, + "step": 15168 + }, + { + "epoch": 0.8260126741677047, + "grad_norm": 0.5637368797575435, + "learning_rate": 0.0001324357050078946, + "loss": 12.2439, + "step": 15169 + }, + { + "epoch": 0.8260671281642877, + "grad_norm": 0.5521350397117724, + "learning_rate": 0.00013242736338518063, + "loss": 12.1303, + "step": 15170 + }, + { + "epoch": 0.8261215821608707, + "grad_norm": 0.5868245001879185, + "learning_rate": 0.00013241902151030604, + "loss": 12.1829, + "step": 15171 + }, + { + "epoch": 0.8261760361574537, + "grad_norm": 0.5677736365759718, + "learning_rate": 0.00013241067938333568, + "loss": 12.1276, + "step": 15172 + }, + { + "epoch": 0.8262304901540367, + "grad_norm": 0.5734079441240211, + "learning_rate": 0.00013240233700433448, + "loss": 12.2542, + "step": 15173 + }, + { + "epoch": 0.8262849441506197, + "grad_norm": 0.5641702940489937, + "learning_rate": 0.00013239399437336725, + "loss": 12.3364, + "step": 15174 + }, + { + "epoch": 0.8263393981472028, + "grad_norm": 0.5934181207445682, + "learning_rate": 0.00013238565149049892, + "loss": 12.2717, + "step": 15175 + }, + { + "epoch": 0.8263938521437858, + "grad_norm": 0.5179909032091367, + "learning_rate": 0.0001323773083557943, + "loss": 12.2665, + "step": 15176 + }, + { + "epoch": 0.8264483061403688, + "grad_norm": 0.5937253578214855, + "learning_rate": 0.00013236896496931832, + "loss": 12.3528, + "step": 15177 + }, + { + "epoch": 0.8265027601369518, + "grad_norm": 0.502476889232368, + "learning_rate": 0.0001323606213311358, + "loss": 12.1983, + "step": 15178 + }, + { + "epoch": 0.8265572141335348, + "grad_norm": 0.531901728632106, + "learning_rate": 0.0001323522774413117, + "loss": 11.9762, + "step": 15179 + }, + { + "epoch": 0.8266116681301178, + "grad_norm": 0.5294521390225176, + "learning_rate": 0.00013234393329991084, + "loss": 12.1733, + "step": 15180 + }, + { + "epoch": 0.8266661221267009, + "grad_norm": 0.5465460122389068, + "learning_rate": 0.00013233558890699814, + "loss": 12.0676, + "step": 15181 + }, + { + "epoch": 0.8267205761232839, + "grad_norm": 0.549624763802986, + "learning_rate": 0.00013232724426263853, + "loss": 12.2706, + "step": 15182 + }, + { + "epoch": 0.8267750301198669, + "grad_norm": 0.5855428195237342, + "learning_rate": 0.00013231889936689677, + "loss": 12.2152, + "step": 15183 + }, + { + "epoch": 0.8268294841164499, + "grad_norm": 0.48878624781299707, + "learning_rate": 0.00013231055421983787, + "loss": 12.1706, + "step": 15184 + }, + { + "epoch": 0.8268839381130328, + "grad_norm": 0.6439542590151076, + "learning_rate": 0.00013230220882152666, + "loss": 12.2569, + "step": 15185 + }, + { + "epoch": 0.826938392109616, + "grad_norm": 0.5333101585897986, + "learning_rate": 0.00013229386317202806, + "loss": 12.0302, + "step": 15186 + }, + { + "epoch": 0.8269928461061989, + "grad_norm": 0.6082116074025128, + "learning_rate": 0.000132285517271407, + "loss": 12.2354, + "step": 15187 + }, + { + "epoch": 0.8270473001027819, + "grad_norm": 0.5978135139139774, + "learning_rate": 0.00013227717111972834, + "loss": 12.2528, + "step": 15188 + }, + { + "epoch": 0.8271017540993649, + "grad_norm": 0.5878083936155488, + "learning_rate": 0.0001322688247170569, + "loss": 12.3464, + "step": 15189 + }, + { + "epoch": 0.8271562080959479, + "grad_norm": 0.5858863888880815, + "learning_rate": 0.00013226047806345773, + "loss": 12.3016, + "step": 15190 + }, + { + "epoch": 0.8272106620925309, + "grad_norm": 0.589348190036878, + "learning_rate": 0.00013225213115899569, + "loss": 12.1782, + "step": 15191 + }, + { + "epoch": 0.827265116089114, + "grad_norm": 0.5216357234796167, + "learning_rate": 0.00013224378400373564, + "loss": 12.1173, + "step": 15192 + }, + { + "epoch": 0.827319570085697, + "grad_norm": 0.5453169370096311, + "learning_rate": 0.00013223543659774255, + "loss": 12.1557, + "step": 15193 + }, + { + "epoch": 0.82737402408228, + "grad_norm": 1.8532061690992983, + "learning_rate": 0.00013222708894108126, + "loss": 12.2979, + "step": 15194 + }, + { + "epoch": 0.827428478078863, + "grad_norm": 0.5981247067276574, + "learning_rate": 0.00013221874103381672, + "loss": 12.1334, + "step": 15195 + }, + { + "epoch": 0.827482932075446, + "grad_norm": 0.5542933873060595, + "learning_rate": 0.00013221039287601388, + "loss": 12.1865, + "step": 15196 + }, + { + "epoch": 0.827537386072029, + "grad_norm": 0.5801329723495282, + "learning_rate": 0.00013220204446773762, + "loss": 12.2615, + "step": 15197 + }, + { + "epoch": 0.8275918400686121, + "grad_norm": 0.7613352726613339, + "learning_rate": 0.00013219369580905289, + "loss": 12.1892, + "step": 15198 + }, + { + "epoch": 0.8276462940651951, + "grad_norm": 0.6650532458101823, + "learning_rate": 0.00013218534690002452, + "loss": 12.2418, + "step": 15199 + }, + { + "epoch": 0.8277007480617781, + "grad_norm": 0.6317919173574159, + "learning_rate": 0.0001321769977407175, + "loss": 12.1991, + "step": 15200 + }, + { + "epoch": 0.8277552020583611, + "grad_norm": 0.5618309537187812, + "learning_rate": 0.00013216864833119675, + "loss": 12.1656, + "step": 15201 + }, + { + "epoch": 0.827809656054944, + "grad_norm": 0.6232714602395312, + "learning_rate": 0.00013216029867152724, + "loss": 12.1143, + "step": 15202 + }, + { + "epoch": 0.827864110051527, + "grad_norm": 0.5737033519336524, + "learning_rate": 0.00013215194876177385, + "loss": 12.1755, + "step": 15203 + }, + { + "epoch": 0.8279185640481102, + "grad_norm": 0.70282100669915, + "learning_rate": 0.00013214359860200148, + "loss": 12.1138, + "step": 15204 + }, + { + "epoch": 0.8279730180446931, + "grad_norm": 0.757833350297887, + "learning_rate": 0.00013213524819227513, + "loss": 12.187, + "step": 15205 + }, + { + "epoch": 0.8280274720412761, + "grad_norm": 0.5789596533234671, + "learning_rate": 0.00013212689753265965, + "loss": 12.2284, + "step": 15206 + }, + { + "epoch": 0.8280819260378591, + "grad_norm": 0.641352209545092, + "learning_rate": 0.00013211854662322007, + "loss": 12.3184, + "step": 15207 + }, + { + "epoch": 0.8281363800344421, + "grad_norm": 0.5660627888936081, + "learning_rate": 0.00013211019546402128, + "loss": 12.1388, + "step": 15208 + }, + { + "epoch": 0.8281908340310251, + "grad_norm": 0.5537899501912319, + "learning_rate": 0.0001321018440551282, + "loss": 12.2371, + "step": 15209 + }, + { + "epoch": 0.8282452880276082, + "grad_norm": 0.7017780050980083, + "learning_rate": 0.0001320934923966058, + "loss": 12.2134, + "step": 15210 + }, + { + "epoch": 0.8282997420241912, + "grad_norm": 0.629825583850304, + "learning_rate": 0.00013208514048851903, + "loss": 12.1223, + "step": 15211 + }, + { + "epoch": 0.8283541960207742, + "grad_norm": 0.5852813246540232, + "learning_rate": 0.00013207678833093285, + "loss": 12.193, + "step": 15212 + }, + { + "epoch": 0.8284086500173572, + "grad_norm": 0.654363836888728, + "learning_rate": 0.00013206843592391217, + "loss": 12.2933, + "step": 15213 + }, + { + "epoch": 0.8284631040139402, + "grad_norm": 0.6444913765173723, + "learning_rate": 0.00013206008326752198, + "loss": 12.1784, + "step": 15214 + }, + { + "epoch": 0.8285175580105232, + "grad_norm": 0.7632623438059727, + "learning_rate": 0.00013205173036182717, + "loss": 12.2421, + "step": 15215 + }, + { + "epoch": 0.8285720120071063, + "grad_norm": 0.6093109181145168, + "learning_rate": 0.00013204337720689274, + "loss": 12.2275, + "step": 15216 + }, + { + "epoch": 0.8286264660036893, + "grad_norm": 0.5887353236091951, + "learning_rate": 0.0001320350238027836, + "loss": 12.2388, + "step": 15217 + }, + { + "epoch": 0.8286809200002723, + "grad_norm": 0.5890757678442023, + "learning_rate": 0.00013202667014956481, + "loss": 12.2197, + "step": 15218 + }, + { + "epoch": 0.8287353739968553, + "grad_norm": 0.8375274103573752, + "learning_rate": 0.0001320183162473012, + "loss": 12.2183, + "step": 15219 + }, + { + "epoch": 0.8287898279934383, + "grad_norm": 0.6257245685057095, + "learning_rate": 0.0001320099620960578, + "loss": 12.1336, + "step": 15220 + }, + { + "epoch": 0.8288442819900214, + "grad_norm": 0.5952583992488294, + "learning_rate": 0.00013200160769589962, + "loss": 12.1276, + "step": 15221 + }, + { + "epoch": 0.8288987359866044, + "grad_norm": 0.6480649554934045, + "learning_rate": 0.00013199325304689153, + "loss": 12.1835, + "step": 15222 + }, + { + "epoch": 0.8289531899831873, + "grad_norm": 0.6080343846993349, + "learning_rate": 0.00013198489814909855, + "loss": 12.1816, + "step": 15223 + }, + { + "epoch": 0.8290076439797703, + "grad_norm": 0.5783908524821597, + "learning_rate": 0.00013197654300258564, + "loss": 12.3538, + "step": 15224 + }, + { + "epoch": 0.8290620979763533, + "grad_norm": 0.6053366205027224, + "learning_rate": 0.0001319681876074178, + "loss": 12.1792, + "step": 15225 + }, + { + "epoch": 0.8291165519729363, + "grad_norm": 0.6561232963435246, + "learning_rate": 0.0001319598319636599, + "loss": 12.3573, + "step": 15226 + }, + { + "epoch": 0.8291710059695194, + "grad_norm": 0.5575796826423648, + "learning_rate": 0.00013195147607137703, + "loss": 12.3334, + "step": 15227 + }, + { + "epoch": 0.8292254599661024, + "grad_norm": 0.613532357704036, + "learning_rate": 0.00013194311993063412, + "loss": 12.1937, + "step": 15228 + }, + { + "epoch": 0.8292799139626854, + "grad_norm": 0.6084414020114763, + "learning_rate": 0.00013193476354149617, + "loss": 12.213, + "step": 15229 + }, + { + "epoch": 0.8293343679592684, + "grad_norm": 0.5463433763684223, + "learning_rate": 0.00013192640690402811, + "loss": 12.2368, + "step": 15230 + }, + { + "epoch": 0.8293888219558514, + "grad_norm": 0.5157030945118751, + "learning_rate": 0.00013191805001829495, + "loss": 12.1926, + "step": 15231 + }, + { + "epoch": 0.8294432759524344, + "grad_norm": 0.6152722825800643, + "learning_rate": 0.00013190969288436172, + "loss": 12.4055, + "step": 15232 + }, + { + "epoch": 0.8294977299490175, + "grad_norm": 0.6345939630072156, + "learning_rate": 0.00013190133550229334, + "loss": 12.1898, + "step": 15233 + }, + { + "epoch": 0.8295521839456005, + "grad_norm": 0.5770984625351699, + "learning_rate": 0.00013189297787215485, + "loss": 12.2821, + "step": 15234 + }, + { + "epoch": 0.8296066379421835, + "grad_norm": 0.5530313175997955, + "learning_rate": 0.00013188461999401118, + "loss": 12.2363, + "step": 15235 + }, + { + "epoch": 0.8296610919387665, + "grad_norm": 0.5676731684196313, + "learning_rate": 0.0001318762618679274, + "loss": 12.1333, + "step": 15236 + }, + { + "epoch": 0.8297155459353495, + "grad_norm": 0.5897430392692317, + "learning_rate": 0.00013186790349396842, + "loss": 12.2853, + "step": 15237 + }, + { + "epoch": 0.8297699999319325, + "grad_norm": 0.6143112745421574, + "learning_rate": 0.0001318595448721993, + "loss": 12.1977, + "step": 15238 + }, + { + "epoch": 0.8298244539285156, + "grad_norm": 0.5961912593192616, + "learning_rate": 0.000131851186002685, + "loss": 12.2267, + "step": 15239 + }, + { + "epoch": 0.8298789079250986, + "grad_norm": 0.5864432196548776, + "learning_rate": 0.00013184282688549057, + "loss": 12.0463, + "step": 15240 + }, + { + "epoch": 0.8299333619216815, + "grad_norm": 0.550662140713574, + "learning_rate": 0.00013183446752068094, + "loss": 12.1639, + "step": 15241 + }, + { + "epoch": 0.8299878159182645, + "grad_norm": 0.5855212190237381, + "learning_rate": 0.00013182610790832118, + "loss": 12.1433, + "step": 15242 + }, + { + "epoch": 0.8300422699148475, + "grad_norm": 0.5995853240988076, + "learning_rate": 0.00013181774804847627, + "loss": 12.1656, + "step": 15243 + }, + { + "epoch": 0.8300967239114305, + "grad_norm": 0.5616551083670744, + "learning_rate": 0.00013180938794121118, + "loss": 12.2573, + "step": 15244 + }, + { + "epoch": 0.8301511779080136, + "grad_norm": 0.7404378825252865, + "learning_rate": 0.000131801027586591, + "loss": 12.3811, + "step": 15245 + }, + { + "epoch": 0.8302056319045966, + "grad_norm": 0.6166144410712183, + "learning_rate": 0.00013179266698468064, + "loss": 12.3776, + "step": 15246 + }, + { + "epoch": 0.8302600859011796, + "grad_norm": 0.5533530192204831, + "learning_rate": 0.00013178430613554522, + "loss": 12.1586, + "step": 15247 + }, + { + "epoch": 0.8303145398977626, + "grad_norm": 0.5844910546885216, + "learning_rate": 0.0001317759450392497, + "loss": 12.3598, + "step": 15248 + }, + { + "epoch": 0.8303689938943456, + "grad_norm": 0.6002406804984849, + "learning_rate": 0.0001317675836958591, + "loss": 12.2845, + "step": 15249 + }, + { + "epoch": 0.8304234478909286, + "grad_norm": 0.4980976925076202, + "learning_rate": 0.0001317592221054384, + "loss": 12.1897, + "step": 15250 + }, + { + "epoch": 0.8304779018875117, + "grad_norm": 0.6031308926270025, + "learning_rate": 0.0001317508602680527, + "loss": 12.2017, + "step": 15251 + }, + { + "epoch": 0.8305323558840947, + "grad_norm": 0.6147733627442982, + "learning_rate": 0.00013174249818376699, + "loss": 12.2611, + "step": 15252 + }, + { + "epoch": 0.8305868098806777, + "grad_norm": 0.5800706246651661, + "learning_rate": 0.00013173413585264632, + "loss": 12.3297, + "step": 15253 + }, + { + "epoch": 0.8306412638772607, + "grad_norm": 0.5807351910438452, + "learning_rate": 0.00013172577327475563, + "loss": 12.3804, + "step": 15254 + }, + { + "epoch": 0.8306957178738437, + "grad_norm": 0.617417880066591, + "learning_rate": 0.00013171741045016002, + "loss": 12.1884, + "step": 15255 + }, + { + "epoch": 0.8307501718704268, + "grad_norm": 0.6256639537422343, + "learning_rate": 0.00013170904737892452, + "loss": 12.2294, + "step": 15256 + }, + { + "epoch": 0.8308046258670098, + "grad_norm": 0.5417802792206997, + "learning_rate": 0.00013170068406111413, + "loss": 12.2778, + "step": 15257 + }, + { + "epoch": 0.8308590798635928, + "grad_norm": 0.6743405301489448, + "learning_rate": 0.00013169232049679394, + "loss": 12.1945, + "step": 15258 + }, + { + "epoch": 0.8309135338601757, + "grad_norm": 0.637377462288876, + "learning_rate": 0.00013168395668602893, + "loss": 12.2543, + "step": 15259 + }, + { + "epoch": 0.8309679878567587, + "grad_norm": 0.5322625723022238, + "learning_rate": 0.00013167559262888413, + "loss": 12.1928, + "step": 15260 + }, + { + "epoch": 0.8310224418533417, + "grad_norm": 0.6171535140221914, + "learning_rate": 0.00013166722832542465, + "loss": 12.2262, + "step": 15261 + }, + { + "epoch": 0.8310768958499248, + "grad_norm": 0.8597285154023284, + "learning_rate": 0.00013165886377571547, + "loss": 12.2184, + "step": 15262 + }, + { + "epoch": 0.8311313498465078, + "grad_norm": 0.5779101303293248, + "learning_rate": 0.00013165049897982168, + "loss": 12.2524, + "step": 15263 + }, + { + "epoch": 0.8311858038430908, + "grad_norm": 0.5739459923844983, + "learning_rate": 0.0001316421339378083, + "loss": 12.2599, + "step": 15264 + }, + { + "epoch": 0.8312402578396738, + "grad_norm": 0.6661241609743747, + "learning_rate": 0.00013163376864974038, + "loss": 12.2557, + "step": 15265 + }, + { + "epoch": 0.8312947118362568, + "grad_norm": 0.5379554045635468, + "learning_rate": 0.00013162540311568294, + "loss": 12.1778, + "step": 15266 + }, + { + "epoch": 0.8313491658328398, + "grad_norm": 0.5524540929241767, + "learning_rate": 0.00013161703733570107, + "loss": 12.1688, + "step": 15267 + }, + { + "epoch": 0.8314036198294229, + "grad_norm": 0.6297163416154375, + "learning_rate": 0.00013160867130985985, + "loss": 12.2242, + "step": 15268 + }, + { + "epoch": 0.8314580738260059, + "grad_norm": 0.9573883676541012, + "learning_rate": 0.00013160030503822428, + "loss": 12.1995, + "step": 15269 + }, + { + "epoch": 0.8315125278225889, + "grad_norm": 0.6181746204171851, + "learning_rate": 0.00013159193852085944, + "loss": 12.1728, + "step": 15270 + }, + { + "epoch": 0.8315669818191719, + "grad_norm": 0.5946896226480637, + "learning_rate": 0.00013158357175783038, + "loss": 12.051, + "step": 15271 + }, + { + "epoch": 0.8316214358157549, + "grad_norm": 0.5841329282239158, + "learning_rate": 0.00013157520474920214, + "loss": 12.1945, + "step": 15272 + }, + { + "epoch": 0.8316758898123379, + "grad_norm": 0.6162149903340335, + "learning_rate": 0.00013156683749503988, + "loss": 12.186, + "step": 15273 + }, + { + "epoch": 0.831730343808921, + "grad_norm": 0.5633195206483552, + "learning_rate": 0.00013155846999540858, + "loss": 12.1581, + "step": 15274 + }, + { + "epoch": 0.831784797805504, + "grad_norm": 0.5604639867880522, + "learning_rate": 0.0001315501022503733, + "loss": 12.0926, + "step": 15275 + }, + { + "epoch": 0.831839251802087, + "grad_norm": 0.5308652778688698, + "learning_rate": 0.00013154173425999915, + "loss": 12.0024, + "step": 15276 + }, + { + "epoch": 0.83189370579867, + "grad_norm": 0.5447564444093732, + "learning_rate": 0.00013153336602435113, + "loss": 12.1712, + "step": 15277 + }, + { + "epoch": 0.8319481597952529, + "grad_norm": 0.5386724699747754, + "learning_rate": 0.00013152499754349445, + "loss": 11.9927, + "step": 15278 + }, + { + "epoch": 0.8320026137918359, + "grad_norm": 0.5797490134641445, + "learning_rate": 0.00013151662881749407, + "loss": 12.2172, + "step": 15279 + }, + { + "epoch": 0.832057067788419, + "grad_norm": 0.626816413248062, + "learning_rate": 0.0001315082598464151, + "loss": 12.1729, + "step": 15280 + }, + { + "epoch": 0.832111521785002, + "grad_norm": 0.5420185299027364, + "learning_rate": 0.0001314998906303226, + "loss": 12.1501, + "step": 15281 + }, + { + "epoch": 0.832165975781585, + "grad_norm": 0.6286576341012895, + "learning_rate": 0.00013149152116928169, + "loss": 12.3009, + "step": 15282 + }, + { + "epoch": 0.832220429778168, + "grad_norm": 0.5347714267872193, + "learning_rate": 0.00013148315146335743, + "loss": 12.094, + "step": 15283 + }, + { + "epoch": 0.832274883774751, + "grad_norm": 0.5982064880757932, + "learning_rate": 0.0001314747815126149, + "loss": 12.0955, + "step": 15284 + }, + { + "epoch": 0.8323293377713341, + "grad_norm": 0.5360239055325735, + "learning_rate": 0.00013146641131711918, + "loss": 12.207, + "step": 15285 + }, + { + "epoch": 0.8323837917679171, + "grad_norm": 0.6032727350194691, + "learning_rate": 0.0001314580408769354, + "loss": 12.2156, + "step": 15286 + }, + { + "epoch": 0.8324382457645001, + "grad_norm": 0.5264867464022237, + "learning_rate": 0.00013144967019212858, + "loss": 12.2824, + "step": 15287 + }, + { + "epoch": 0.8324926997610831, + "grad_norm": 0.614179478401167, + "learning_rate": 0.00013144129926276387, + "loss": 12.2034, + "step": 15288 + }, + { + "epoch": 0.8325471537576661, + "grad_norm": 0.585713319770856, + "learning_rate": 0.00013143292808890633, + "loss": 12.1851, + "step": 15289 + }, + { + "epoch": 0.8326016077542491, + "grad_norm": 0.5740739803646323, + "learning_rate": 0.00013142455667062108, + "loss": 12.01, + "step": 15290 + }, + { + "epoch": 0.8326560617508322, + "grad_norm": 0.5796548730360052, + "learning_rate": 0.00013141618500797322, + "loss": 12.244, + "step": 15291 + }, + { + "epoch": 0.8327105157474152, + "grad_norm": 0.5597923489671587, + "learning_rate": 0.0001314078131010278, + "loss": 12.2179, + "step": 15292 + }, + { + "epoch": 0.8327649697439982, + "grad_norm": 0.6321308750754813, + "learning_rate": 0.00013139944094985, + "loss": 12.2756, + "step": 15293 + }, + { + "epoch": 0.8328194237405812, + "grad_norm": 0.5446385936148014, + "learning_rate": 0.00013139106855450486, + "loss": 12.2949, + "step": 15294 + }, + { + "epoch": 0.8328738777371641, + "grad_norm": 0.6349345381171592, + "learning_rate": 0.00013138269591505752, + "loss": 12.0296, + "step": 15295 + }, + { + "epoch": 0.8329283317337471, + "grad_norm": 0.6889425832186824, + "learning_rate": 0.00013137432303157305, + "loss": 12.3, + "step": 15296 + }, + { + "epoch": 0.8329827857303302, + "grad_norm": 0.5147927373090182, + "learning_rate": 0.0001313659499041166, + "loss": 12.211, + "step": 15297 + }, + { + "epoch": 0.8330372397269132, + "grad_norm": 0.6220664367543297, + "learning_rate": 0.00013135757653275326, + "loss": 12.3032, + "step": 15298 + }, + { + "epoch": 0.8330916937234962, + "grad_norm": 0.5922621718306847, + "learning_rate": 0.00013134920291754814, + "loss": 12.1767, + "step": 15299 + }, + { + "epoch": 0.8331461477200792, + "grad_norm": 0.598265551750958, + "learning_rate": 0.00013134082905856637, + "loss": 12.1843, + "step": 15300 + }, + { + "epoch": 0.8332006017166622, + "grad_norm": 0.6566281451981195, + "learning_rate": 0.00013133245495587306, + "loss": 12.1329, + "step": 15301 + }, + { + "epoch": 0.8332550557132452, + "grad_norm": 0.6198392400520069, + "learning_rate": 0.0001313240806095333, + "loss": 12.2163, + "step": 15302 + }, + { + "epoch": 0.8333095097098283, + "grad_norm": 0.5089681050707612, + "learning_rate": 0.0001313157060196123, + "loss": 12.1203, + "step": 15303 + }, + { + "epoch": 0.8333639637064113, + "grad_norm": 0.5946276409729907, + "learning_rate": 0.00013130733118617505, + "loss": 12.3139, + "step": 15304 + }, + { + "epoch": 0.8334184177029943, + "grad_norm": 0.6768073905113495, + "learning_rate": 0.00013129895610928678, + "loss": 12.0899, + "step": 15305 + }, + { + "epoch": 0.8334728716995773, + "grad_norm": 0.5706898431899106, + "learning_rate": 0.00013129058078901256, + "loss": 12.2588, + "step": 15306 + }, + { + "epoch": 0.8335273256961603, + "grad_norm": 0.5295503593770585, + "learning_rate": 0.00013128220522541753, + "loss": 12.2527, + "step": 15307 + }, + { + "epoch": 0.8335817796927433, + "grad_norm": 0.5914096601533734, + "learning_rate": 0.00013127382941856687, + "loss": 12.304, + "step": 15308 + }, + { + "epoch": 0.8336362336893264, + "grad_norm": 0.555249289629514, + "learning_rate": 0.0001312654533685256, + "loss": 12.0958, + "step": 15309 + }, + { + "epoch": 0.8336906876859094, + "grad_norm": 0.553953495231632, + "learning_rate": 0.00013125707707535897, + "loss": 12.0996, + "step": 15310 + }, + { + "epoch": 0.8337451416824924, + "grad_norm": 0.5426767378037972, + "learning_rate": 0.00013124870053913206, + "loss": 12.1959, + "step": 15311 + }, + { + "epoch": 0.8337995956790754, + "grad_norm": 0.5638715609519398, + "learning_rate": 0.00013124032375991, + "loss": 12.1801, + "step": 15312 + }, + { + "epoch": 0.8338540496756583, + "grad_norm": 0.5968174649338724, + "learning_rate": 0.00013123194673775798, + "loss": 12.2152, + "step": 15313 + }, + { + "epoch": 0.8339085036722413, + "grad_norm": 0.5734240715647675, + "learning_rate": 0.00013122356947274107, + "loss": 12.1478, + "step": 15314 + }, + { + "epoch": 0.8339629576688244, + "grad_norm": 0.611642059701702, + "learning_rate": 0.00013121519196492444, + "loss": 12.1398, + "step": 15315 + }, + { + "epoch": 0.8340174116654074, + "grad_norm": 0.6082665454481369, + "learning_rate": 0.00013120681421437325, + "loss": 12.0678, + "step": 15316 + }, + { + "epoch": 0.8340718656619904, + "grad_norm": 0.6145617171266933, + "learning_rate": 0.00013119843622115264, + "loss": 12.2043, + "step": 15317 + }, + { + "epoch": 0.8341263196585734, + "grad_norm": 0.5730138743482941, + "learning_rate": 0.0001311900579853278, + "loss": 12.3287, + "step": 15318 + }, + { + "epoch": 0.8341807736551564, + "grad_norm": 0.5975019952444647, + "learning_rate": 0.00013118167950696382, + "loss": 12.1496, + "step": 15319 + }, + { + "epoch": 0.8342352276517395, + "grad_norm": 0.5874154839513164, + "learning_rate": 0.00013117330078612582, + "loss": 12.1702, + "step": 15320 + }, + { + "epoch": 0.8342896816483225, + "grad_norm": 0.5368851943471138, + "learning_rate": 0.00013116492182287904, + "loss": 12.1665, + "step": 15321 + }, + { + "epoch": 0.8343441356449055, + "grad_norm": 0.6005496640293485, + "learning_rate": 0.0001311565426172886, + "loss": 12.1575, + "step": 15322 + }, + { + "epoch": 0.8343985896414885, + "grad_norm": 0.5216150312343807, + "learning_rate": 0.00013114816316941967, + "loss": 12.1666, + "step": 15323 + }, + { + "epoch": 0.8344530436380715, + "grad_norm": 0.5951216910198904, + "learning_rate": 0.0001311397834793374, + "loss": 12.1675, + "step": 15324 + }, + { + "epoch": 0.8345074976346545, + "grad_norm": 0.6309894694156547, + "learning_rate": 0.00013113140354710693, + "loss": 12.2063, + "step": 15325 + }, + { + "epoch": 0.8345619516312376, + "grad_norm": 0.5681904907277748, + "learning_rate": 0.00013112302337279342, + "loss": 12.1512, + "step": 15326 + }, + { + "epoch": 0.8346164056278206, + "grad_norm": 0.5735928927624965, + "learning_rate": 0.00013111464295646212, + "loss": 12.3276, + "step": 15327 + }, + { + "epoch": 0.8346708596244036, + "grad_norm": 0.6591312024956174, + "learning_rate": 0.00013110626229817813, + "loss": 12.224, + "step": 15328 + }, + { + "epoch": 0.8347253136209866, + "grad_norm": 0.6444292844076731, + "learning_rate": 0.0001310978813980066, + "loss": 12.2489, + "step": 15329 + }, + { + "epoch": 0.8347797676175696, + "grad_norm": 0.6733190185260566, + "learning_rate": 0.00013108950025601275, + "loss": 12.1378, + "step": 15330 + }, + { + "epoch": 0.8348342216141525, + "grad_norm": 0.5369684470576751, + "learning_rate": 0.00013108111887226174, + "loss": 12.0915, + "step": 15331 + }, + { + "epoch": 0.8348886756107357, + "grad_norm": 0.6256900431355148, + "learning_rate": 0.0001310727372468187, + "loss": 12.2504, + "step": 15332 + }, + { + "epoch": 0.8349431296073186, + "grad_norm": 0.6406448205533162, + "learning_rate": 0.0001310643553797489, + "loss": 12.1589, + "step": 15333 + }, + { + "epoch": 0.8349975836039016, + "grad_norm": 0.5376786196408638, + "learning_rate": 0.0001310559732711174, + "loss": 12.2238, + "step": 15334 + }, + { + "epoch": 0.8350520376004846, + "grad_norm": 0.5783970591286314, + "learning_rate": 0.0001310475909209895, + "loss": 12.3498, + "step": 15335 + }, + { + "epoch": 0.8351064915970676, + "grad_norm": 0.5942024240935793, + "learning_rate": 0.0001310392083294303, + "loss": 12.188, + "step": 15336 + }, + { + "epoch": 0.8351609455936506, + "grad_norm": 0.60545358347949, + "learning_rate": 0.00013103082549650497, + "loss": 12.2307, + "step": 15337 + }, + { + "epoch": 0.8352153995902337, + "grad_norm": 0.5636018467561122, + "learning_rate": 0.00013102244242227878, + "loss": 12.2584, + "step": 15338 + }, + { + "epoch": 0.8352698535868167, + "grad_norm": 0.580316800840767, + "learning_rate": 0.0001310140591068169, + "loss": 12.1868, + "step": 15339 + }, + { + "epoch": 0.8353243075833997, + "grad_norm": 0.5418777539446611, + "learning_rate": 0.00013100567555018446, + "loss": 12.1379, + "step": 15340 + }, + { + "epoch": 0.8353787615799827, + "grad_norm": 0.6088070564598652, + "learning_rate": 0.0001309972917524467, + "loss": 12.1415, + "step": 15341 + }, + { + "epoch": 0.8354332155765657, + "grad_norm": 0.6252243895763365, + "learning_rate": 0.00013098890771366878, + "loss": 12.2393, + "step": 15342 + }, + { + "epoch": 0.8354876695731487, + "grad_norm": 0.5364906707347072, + "learning_rate": 0.00013098052343391597, + "loss": 12.147, + "step": 15343 + }, + { + "epoch": 0.8355421235697318, + "grad_norm": 0.5666736471551951, + "learning_rate": 0.00013097213891325336, + "loss": 12.1947, + "step": 15344 + }, + { + "epoch": 0.8355965775663148, + "grad_norm": 0.5833266248887412, + "learning_rate": 0.00013096375415174623, + "loss": 12.1789, + "step": 15345 + }, + { + "epoch": 0.8356510315628978, + "grad_norm": 0.6036746488446625, + "learning_rate": 0.00013095536914945973, + "loss": 12.2361, + "step": 15346 + }, + { + "epoch": 0.8357054855594808, + "grad_norm": 0.5709164918817299, + "learning_rate": 0.00013094698390645913, + "loss": 12.1975, + "step": 15347 + }, + { + "epoch": 0.8357599395560638, + "grad_norm": 0.5415393121643536, + "learning_rate": 0.00013093859842280955, + "loss": 12.1258, + "step": 15348 + }, + { + "epoch": 0.8358143935526468, + "grad_norm": 0.6444672849336692, + "learning_rate": 0.00013093021269857625, + "loss": 12.1467, + "step": 15349 + }, + { + "epoch": 0.8358688475492299, + "grad_norm": 0.6121385670183787, + "learning_rate": 0.00013092182673382445, + "loss": 12.2194, + "step": 15350 + }, + { + "epoch": 0.8359233015458128, + "grad_norm": 0.5742302760383521, + "learning_rate": 0.0001309134405286193, + "loss": 12.1077, + "step": 15351 + }, + { + "epoch": 0.8359777555423958, + "grad_norm": 0.5597706966472342, + "learning_rate": 0.00013090505408302612, + "loss": 12.2605, + "step": 15352 + }, + { + "epoch": 0.8360322095389788, + "grad_norm": 0.6120210383203647, + "learning_rate": 0.00013089666739711, + "loss": 12.3513, + "step": 15353 + }, + { + "epoch": 0.8360866635355618, + "grad_norm": 0.6357714430157161, + "learning_rate": 0.00013088828047093623, + "loss": 12.22, + "step": 15354 + }, + { + "epoch": 0.8361411175321449, + "grad_norm": 0.560241191145675, + "learning_rate": 0.00013087989330457, + "loss": 12.1131, + "step": 15355 + }, + { + "epoch": 0.8361955715287279, + "grad_norm": 0.558665134696943, + "learning_rate": 0.00013087150589807656, + "loss": 12.2699, + "step": 15356 + }, + { + "epoch": 0.8362500255253109, + "grad_norm": 0.6030718684554315, + "learning_rate": 0.00013086311825152112, + "loss": 12.164, + "step": 15357 + }, + { + "epoch": 0.8363044795218939, + "grad_norm": 0.5626851183441052, + "learning_rate": 0.00013085473036496888, + "loss": 12.2383, + "step": 15358 + }, + { + "epoch": 0.8363589335184769, + "grad_norm": 0.60581591719412, + "learning_rate": 0.00013084634223848506, + "loss": 12.1086, + "step": 15359 + }, + { + "epoch": 0.8364133875150599, + "grad_norm": 0.6441289035553069, + "learning_rate": 0.00013083795387213495, + "loss": 12.1012, + "step": 15360 + }, + { + "epoch": 0.836467841511643, + "grad_norm": 0.5432182200872989, + "learning_rate": 0.0001308295652659837, + "loss": 12.2727, + "step": 15361 + }, + { + "epoch": 0.836522295508226, + "grad_norm": 0.5112965914274825, + "learning_rate": 0.00013082117642009662, + "loss": 12.1125, + "step": 15362 + }, + { + "epoch": 0.836576749504809, + "grad_norm": 0.5197450256056448, + "learning_rate": 0.0001308127873345389, + "loss": 12.117, + "step": 15363 + }, + { + "epoch": 0.836631203501392, + "grad_norm": 0.5860435056081319, + "learning_rate": 0.00013080439800937575, + "loss": 12.2292, + "step": 15364 + }, + { + "epoch": 0.836685657497975, + "grad_norm": 0.5629178413356406, + "learning_rate": 0.00013079600844467242, + "loss": 12.196, + "step": 15365 + }, + { + "epoch": 0.836740111494558, + "grad_norm": 0.5758380845386492, + "learning_rate": 0.0001307876186404942, + "loss": 12.2736, + "step": 15366 + }, + { + "epoch": 0.8367945654911411, + "grad_norm": 0.5345267079541275, + "learning_rate": 0.0001307792285969063, + "loss": 12.2351, + "step": 15367 + }, + { + "epoch": 0.836849019487724, + "grad_norm": 0.6010343621198327, + "learning_rate": 0.00013077083831397395, + "loss": 12.1085, + "step": 15368 + }, + { + "epoch": 0.836903473484307, + "grad_norm": 0.5567987453339249, + "learning_rate": 0.00013076244779176244, + "loss": 12.2886, + "step": 15369 + }, + { + "epoch": 0.83695792748089, + "grad_norm": 0.575988620818282, + "learning_rate": 0.0001307540570303369, + "loss": 12.0892, + "step": 15370 + }, + { + "epoch": 0.837012381477473, + "grad_norm": 0.5461307537724167, + "learning_rate": 0.00013074566602976268, + "loss": 12.1839, + "step": 15371 + }, + { + "epoch": 0.837066835474056, + "grad_norm": 0.5741964053298968, + "learning_rate": 0.000130737274790105, + "loss": 12.288, + "step": 15372 + }, + { + "epoch": 0.8371212894706391, + "grad_norm": 0.5969391944247079, + "learning_rate": 0.00013072888331142914, + "loss": 12.0641, + "step": 15373 + }, + { + "epoch": 0.8371757434672221, + "grad_norm": 0.5987646358058737, + "learning_rate": 0.00013072049159380033, + "loss": 12.2112, + "step": 15374 + }, + { + "epoch": 0.8372301974638051, + "grad_norm": 0.5455711004770989, + "learning_rate": 0.0001307120996372838, + "loss": 12.2458, + "step": 15375 + }, + { + "epoch": 0.8372846514603881, + "grad_norm": 0.5414312185891778, + "learning_rate": 0.00013070370744194487, + "loss": 12.127, + "step": 15376 + }, + { + "epoch": 0.8373391054569711, + "grad_norm": 0.6305935747972193, + "learning_rate": 0.00013069531500784873, + "loss": 12.2119, + "step": 15377 + }, + { + "epoch": 0.8373935594535541, + "grad_norm": 0.5685400442083921, + "learning_rate": 0.00013068692233506068, + "loss": 12.2422, + "step": 15378 + }, + { + "epoch": 0.8374480134501372, + "grad_norm": 0.5949894077370246, + "learning_rate": 0.000130678529423646, + "loss": 12.1692, + "step": 15379 + }, + { + "epoch": 0.8375024674467202, + "grad_norm": 0.6653462622638929, + "learning_rate": 0.00013067013627366991, + "loss": 12.2144, + "step": 15380 + }, + { + "epoch": 0.8375569214433032, + "grad_norm": 0.5731677512947161, + "learning_rate": 0.00013066174288519768, + "loss": 12.1916, + "step": 15381 + }, + { + "epoch": 0.8376113754398862, + "grad_norm": 0.6319538726760888, + "learning_rate": 0.0001306533492582946, + "loss": 12.2809, + "step": 15382 + }, + { + "epoch": 0.8376658294364692, + "grad_norm": 0.6846801198465237, + "learning_rate": 0.00013064495539302594, + "loss": 12.0877, + "step": 15383 + }, + { + "epoch": 0.8377202834330522, + "grad_norm": 0.534280489313378, + "learning_rate": 0.000130636561289457, + "loss": 12.076, + "step": 15384 + }, + { + "epoch": 0.8377747374296353, + "grad_norm": 0.5713743213754297, + "learning_rate": 0.000130628166947653, + "loss": 12.1155, + "step": 15385 + }, + { + "epoch": 0.8378291914262183, + "grad_norm": 0.6238325355933657, + "learning_rate": 0.0001306197723676792, + "loss": 12.1667, + "step": 15386 + }, + { + "epoch": 0.8378836454228012, + "grad_norm": 0.6029480253780464, + "learning_rate": 0.00013061137754960094, + "loss": 12.1733, + "step": 15387 + }, + { + "epoch": 0.8379380994193842, + "grad_norm": 0.5375203365801448, + "learning_rate": 0.0001306029824934835, + "loss": 12.2736, + "step": 15388 + }, + { + "epoch": 0.8379925534159672, + "grad_norm": 0.6180323788492011, + "learning_rate": 0.00013059458719939215, + "loss": 12.2002, + "step": 15389 + }, + { + "epoch": 0.8380470074125503, + "grad_norm": 0.5576984568599684, + "learning_rate": 0.0001305861916673921, + "loss": 12.0192, + "step": 15390 + }, + { + "epoch": 0.8381014614091333, + "grad_norm": 0.5571576789776306, + "learning_rate": 0.00013057779589754876, + "loss": 12.1686, + "step": 15391 + }, + { + "epoch": 0.8381559154057163, + "grad_norm": 0.5723703608319274, + "learning_rate": 0.0001305693998899273, + "loss": 12.1266, + "step": 15392 + }, + { + "epoch": 0.8382103694022993, + "grad_norm": 0.5639093049927778, + "learning_rate": 0.00013056100364459305, + "loss": 12.1069, + "step": 15393 + }, + { + "epoch": 0.8382648233988823, + "grad_norm": 0.5571453029303685, + "learning_rate": 0.00013055260716161136, + "loss": 12.1868, + "step": 15394 + }, + { + "epoch": 0.8383192773954653, + "grad_norm": 0.5848548378507161, + "learning_rate": 0.00013054421044104744, + "loss": 12.2119, + "step": 15395 + }, + { + "epoch": 0.8383737313920484, + "grad_norm": 0.5833350880830424, + "learning_rate": 0.00013053581348296663, + "loss": 12.2092, + "step": 15396 + }, + { + "epoch": 0.8384281853886314, + "grad_norm": 0.564726235079284, + "learning_rate": 0.0001305274162874342, + "loss": 12.2672, + "step": 15397 + }, + { + "epoch": 0.8384826393852144, + "grad_norm": 0.5640123189749882, + "learning_rate": 0.00013051901885451544, + "loss": 12.1969, + "step": 15398 + }, + { + "epoch": 0.8385370933817974, + "grad_norm": 0.5567158354042706, + "learning_rate": 0.00013051062118427575, + "loss": 12.1581, + "step": 15399 + }, + { + "epoch": 0.8385915473783804, + "grad_norm": 0.6333351307124527, + "learning_rate": 0.0001305022232767803, + "loss": 12.2515, + "step": 15400 + }, + { + "epoch": 0.8386460013749634, + "grad_norm": 0.6196519026665442, + "learning_rate": 0.00013049382513209446, + "loss": 12.157, + "step": 15401 + }, + { + "epoch": 0.8387004553715465, + "grad_norm": 0.6613005026286023, + "learning_rate": 0.0001304854267502835, + "loss": 12.2158, + "step": 15402 + }, + { + "epoch": 0.8387549093681295, + "grad_norm": 0.5624156990839256, + "learning_rate": 0.00013047702813141274, + "loss": 12.1756, + "step": 15403 + }, + { + "epoch": 0.8388093633647125, + "grad_norm": 0.6391498061034914, + "learning_rate": 0.00013046862927554756, + "loss": 12.1817, + "step": 15404 + }, + { + "epoch": 0.8388638173612954, + "grad_norm": 0.7324785085813693, + "learning_rate": 0.00013046023018275314, + "loss": 12.4085, + "step": 15405 + }, + { + "epoch": 0.8389182713578784, + "grad_norm": 0.5810583470765037, + "learning_rate": 0.00013045183085309492, + "loss": 12.3102, + "step": 15406 + }, + { + "epoch": 0.8389727253544614, + "grad_norm": 0.5512366258667796, + "learning_rate": 0.00013044343128663813, + "loss": 12.1526, + "step": 15407 + }, + { + "epoch": 0.8390271793510445, + "grad_norm": 0.5588903997378186, + "learning_rate": 0.0001304350314834481, + "loss": 12.1694, + "step": 15408 + }, + { + "epoch": 0.8390816333476275, + "grad_norm": 0.5795679654907878, + "learning_rate": 0.00013042663144359015, + "loss": 12.3495, + "step": 15409 + }, + { + "epoch": 0.8391360873442105, + "grad_norm": 0.5369781674763817, + "learning_rate": 0.00013041823116712964, + "loss": 12.2644, + "step": 15410 + }, + { + "epoch": 0.8391905413407935, + "grad_norm": 0.5698494013959302, + "learning_rate": 0.00013040983065413185, + "loss": 12.1736, + "step": 15411 + }, + { + "epoch": 0.8392449953373765, + "grad_norm": 0.5609634761067283, + "learning_rate": 0.00013040142990466212, + "loss": 12.3333, + "step": 15412 + }, + { + "epoch": 0.8392994493339595, + "grad_norm": 0.5922172919484425, + "learning_rate": 0.0001303930289187858, + "loss": 12.1139, + "step": 15413 + }, + { + "epoch": 0.8393539033305426, + "grad_norm": 0.5710970376219491, + "learning_rate": 0.00013038462769656816, + "loss": 12.2493, + "step": 15414 + }, + { + "epoch": 0.8394083573271256, + "grad_norm": 0.6130269140220768, + "learning_rate": 0.00013037622623807458, + "loss": 12.3838, + "step": 15415 + }, + { + "epoch": 0.8394628113237086, + "grad_norm": 0.5685305558318331, + "learning_rate": 0.00013036782454337034, + "loss": 12.264, + "step": 15416 + }, + { + "epoch": 0.8395172653202916, + "grad_norm": 0.6027775225125119, + "learning_rate": 0.00013035942261252083, + "loss": 12.0201, + "step": 15417 + }, + { + "epoch": 0.8395717193168746, + "grad_norm": 0.5522397647824678, + "learning_rate": 0.00013035102044559133, + "loss": 12.0813, + "step": 15418 + }, + { + "epoch": 0.8396261733134577, + "grad_norm": 0.6259116033321801, + "learning_rate": 0.00013034261804264726, + "loss": 12.2236, + "step": 15419 + }, + { + "epoch": 0.8396806273100407, + "grad_norm": 0.5905157049713758, + "learning_rate": 0.00013033421540375385, + "loss": 12.3173, + "step": 15420 + }, + { + "epoch": 0.8397350813066237, + "grad_norm": 0.5151391159511745, + "learning_rate": 0.0001303258125289765, + "loss": 12.1515, + "step": 15421 + }, + { + "epoch": 0.8397895353032067, + "grad_norm": 0.531701959746869, + "learning_rate": 0.00013031740941838057, + "loss": 12.1666, + "step": 15422 + }, + { + "epoch": 0.8398439892997897, + "grad_norm": 0.522400648155579, + "learning_rate": 0.00013030900607203136, + "loss": 12.1277, + "step": 15423 + }, + { + "epoch": 0.8398984432963726, + "grad_norm": 0.6362661145573922, + "learning_rate": 0.00013030060248999425, + "loss": 12.2302, + "step": 15424 + }, + { + "epoch": 0.8399528972929557, + "grad_norm": 0.5138634705111699, + "learning_rate": 0.00013029219867233458, + "loss": 12.1477, + "step": 15425 + }, + { + "epoch": 0.8400073512895387, + "grad_norm": 0.5504408190011557, + "learning_rate": 0.00013028379461911766, + "loss": 12.2609, + "step": 15426 + }, + { + "epoch": 0.8400618052861217, + "grad_norm": 0.5817229386487759, + "learning_rate": 0.0001302753903304089, + "loss": 12.1662, + "step": 15427 + }, + { + "epoch": 0.8401162592827047, + "grad_norm": 0.5779589417910154, + "learning_rate": 0.00013026698580627364, + "loss": 12.1039, + "step": 15428 + }, + { + "epoch": 0.8401707132792877, + "grad_norm": 0.6501390033485781, + "learning_rate": 0.00013025858104677722, + "loss": 12.3038, + "step": 15429 + }, + { + "epoch": 0.8402251672758707, + "grad_norm": 0.547251099236945, + "learning_rate": 0.00013025017605198494, + "loss": 12.1442, + "step": 15430 + }, + { + "epoch": 0.8402796212724538, + "grad_norm": 0.5299319663603849, + "learning_rate": 0.00013024177082196226, + "loss": 12.1717, + "step": 15431 + }, + { + "epoch": 0.8403340752690368, + "grad_norm": 0.5703350863581309, + "learning_rate": 0.00013023336535677454, + "loss": 12.2853, + "step": 15432 + }, + { + "epoch": 0.8403885292656198, + "grad_norm": 0.6139029056887727, + "learning_rate": 0.00013022495965648705, + "loss": 12.2486, + "step": 15433 + }, + { + "epoch": 0.8404429832622028, + "grad_norm": 0.5488466788920311, + "learning_rate": 0.00013021655372116525, + "loss": 12.1232, + "step": 15434 + }, + { + "epoch": 0.8404974372587858, + "grad_norm": 0.5072533971790376, + "learning_rate": 0.0001302081475508744, + "loss": 12.1123, + "step": 15435 + }, + { + "epoch": 0.8405518912553688, + "grad_norm": 0.6553603035867362, + "learning_rate": 0.00013019974114567993, + "loss": 12.4077, + "step": 15436 + }, + { + "epoch": 0.8406063452519519, + "grad_norm": 0.5963926708645846, + "learning_rate": 0.00013019133450564724, + "loss": 12.0707, + "step": 15437 + }, + { + "epoch": 0.8406607992485349, + "grad_norm": 0.5766582613875916, + "learning_rate": 0.00013018292763084167, + "loss": 12.1169, + "step": 15438 + }, + { + "epoch": 0.8407152532451179, + "grad_norm": 0.598765168143679, + "learning_rate": 0.0001301745205213286, + "loss": 12.2108, + "step": 15439 + }, + { + "epoch": 0.8407697072417009, + "grad_norm": 0.6183119765379161, + "learning_rate": 0.0001301661131771734, + "loss": 12.3027, + "step": 15440 + }, + { + "epoch": 0.8408241612382839, + "grad_norm": 0.5438659222201025, + "learning_rate": 0.0001301577055984414, + "loss": 12.2832, + "step": 15441 + }, + { + "epoch": 0.8408786152348668, + "grad_norm": 0.5569020818739407, + "learning_rate": 0.00013014929778519806, + "loss": 12.1564, + "step": 15442 + }, + { + "epoch": 0.84093306923145, + "grad_norm": 0.5396355165755193, + "learning_rate": 0.00013014088973750874, + "loss": 12.2076, + "step": 15443 + }, + { + "epoch": 0.8409875232280329, + "grad_norm": 0.5594209164770022, + "learning_rate": 0.00013013248145543878, + "loss": 12.1998, + "step": 15444 + }, + { + "epoch": 0.8410419772246159, + "grad_norm": 0.5582406828411369, + "learning_rate": 0.00013012407293905363, + "loss": 12.0757, + "step": 15445 + }, + { + "epoch": 0.8410964312211989, + "grad_norm": 0.557935986177484, + "learning_rate": 0.00013011566418841858, + "loss": 12.1023, + "step": 15446 + }, + { + "epoch": 0.8411508852177819, + "grad_norm": 0.5590660230542867, + "learning_rate": 0.00013010725520359908, + "loss": 12.2449, + "step": 15447 + }, + { + "epoch": 0.8412053392143649, + "grad_norm": 0.5686932609582025, + "learning_rate": 0.00013009884598466054, + "loss": 12.1602, + "step": 15448 + }, + { + "epoch": 0.841259793210948, + "grad_norm": 0.5438917008388785, + "learning_rate": 0.00013009043653166834, + "loss": 12.1989, + "step": 15449 + }, + { + "epoch": 0.841314247207531, + "grad_norm": 0.5770990080511804, + "learning_rate": 0.00013008202684468786, + "loss": 12.0269, + "step": 15450 + }, + { + "epoch": 0.841368701204114, + "grad_norm": 0.6950577279601987, + "learning_rate": 0.00013007361692378446, + "loss": 12.2033, + "step": 15451 + }, + { + "epoch": 0.841423155200697, + "grad_norm": 0.5215834135230242, + "learning_rate": 0.00013006520676902357, + "loss": 12.153, + "step": 15452 + }, + { + "epoch": 0.84147760919728, + "grad_norm": 0.5919430674869965, + "learning_rate": 0.00013005679638047058, + "loss": 12.2235, + "step": 15453 + }, + { + "epoch": 0.8415320631938631, + "grad_norm": 0.5878741611778164, + "learning_rate": 0.00013004838575819097, + "loss": 12.1629, + "step": 15454 + }, + { + "epoch": 0.8415865171904461, + "grad_norm": 0.5506223431101179, + "learning_rate": 0.00013003997490225003, + "loss": 12.1377, + "step": 15455 + }, + { + "epoch": 0.8416409711870291, + "grad_norm": 0.5477777108898231, + "learning_rate": 0.0001300315638127132, + "loss": 12.1777, + "step": 15456 + }, + { + "epoch": 0.8416954251836121, + "grad_norm": 0.6021798307343533, + "learning_rate": 0.00013002315248964588, + "loss": 12.058, + "step": 15457 + }, + { + "epoch": 0.8417498791801951, + "grad_norm": 0.6091936693650849, + "learning_rate": 0.00013001474093311352, + "loss": 12.1401, + "step": 15458 + }, + { + "epoch": 0.841804333176778, + "grad_norm": 0.590264553535219, + "learning_rate": 0.0001300063291431815, + "loss": 12.1855, + "step": 15459 + }, + { + "epoch": 0.8418587871733612, + "grad_norm": 0.5543263624857145, + "learning_rate": 0.00012999791711991522, + "loss": 12.1857, + "step": 15460 + }, + { + "epoch": 0.8419132411699441, + "grad_norm": 0.6258587587841732, + "learning_rate": 0.00012998950486338014, + "loss": 12.1809, + "step": 15461 + }, + { + "epoch": 0.8419676951665271, + "grad_norm": 0.576115813718609, + "learning_rate": 0.0001299810923736416, + "loss": 12.1628, + "step": 15462 + }, + { + "epoch": 0.8420221491631101, + "grad_norm": 0.5779529876001324, + "learning_rate": 0.00012997267965076504, + "loss": 12.1651, + "step": 15463 + }, + { + "epoch": 0.8420766031596931, + "grad_norm": 0.6170858443746692, + "learning_rate": 0.00012996426669481593, + "loss": 12.3123, + "step": 15464 + }, + { + "epoch": 0.8421310571562761, + "grad_norm": 0.5993499195332175, + "learning_rate": 0.00012995585350585967, + "loss": 12.2695, + "step": 15465 + }, + { + "epoch": 0.8421855111528592, + "grad_norm": 0.5826163014390762, + "learning_rate": 0.00012994744008396167, + "loss": 12.2115, + "step": 15466 + }, + { + "epoch": 0.8422399651494422, + "grad_norm": 0.5385510345677272, + "learning_rate": 0.00012993902642918732, + "loss": 12.2976, + "step": 15467 + }, + { + "epoch": 0.8422944191460252, + "grad_norm": 0.5659184008479512, + "learning_rate": 0.0001299306125416021, + "loss": 12.1892, + "step": 15468 + }, + { + "epoch": 0.8423488731426082, + "grad_norm": 0.5205833855431005, + "learning_rate": 0.00012992219842127142, + "loss": 12.1612, + "step": 15469 + }, + { + "epoch": 0.8424033271391912, + "grad_norm": 0.5737317949815772, + "learning_rate": 0.0001299137840682607, + "loss": 12.2158, + "step": 15470 + }, + { + "epoch": 0.8424577811357742, + "grad_norm": 0.5923701455972223, + "learning_rate": 0.00012990536948263536, + "loss": 12.1834, + "step": 15471 + }, + { + "epoch": 0.8425122351323573, + "grad_norm": 0.5610356626591934, + "learning_rate": 0.00012989695466446088, + "loss": 12.1868, + "step": 15472 + }, + { + "epoch": 0.8425666891289403, + "grad_norm": 0.5593317641200841, + "learning_rate": 0.00012988853961380268, + "loss": 12.1865, + "step": 15473 + }, + { + "epoch": 0.8426211431255233, + "grad_norm": 0.5609214500978, + "learning_rate": 0.00012988012433072616, + "loss": 12.2068, + "step": 15474 + }, + { + "epoch": 0.8426755971221063, + "grad_norm": 0.6189225809724805, + "learning_rate": 0.00012987170881529678, + "loss": 12.3564, + "step": 15475 + }, + { + "epoch": 0.8427300511186893, + "grad_norm": 0.5810532995223517, + "learning_rate": 0.00012986329306757997, + "loss": 12.225, + "step": 15476 + }, + { + "epoch": 0.8427845051152723, + "grad_norm": 0.6477006315994673, + "learning_rate": 0.00012985487708764122, + "loss": 12.3078, + "step": 15477 + }, + { + "epoch": 0.8428389591118554, + "grad_norm": 0.5325454748651055, + "learning_rate": 0.0001298464608755459, + "loss": 12.2116, + "step": 15478 + }, + { + "epoch": 0.8428934131084383, + "grad_norm": 0.5630427183844071, + "learning_rate": 0.0001298380444313595, + "loss": 12.2595, + "step": 15479 + }, + { + "epoch": 0.8429478671050213, + "grad_norm": 0.5153174542840714, + "learning_rate": 0.0001298296277551475, + "loss": 12.144, + "step": 15480 + }, + { + "epoch": 0.8430023211016043, + "grad_norm": 0.5812089814295625, + "learning_rate": 0.00012982121084697529, + "loss": 12.289, + "step": 15481 + }, + { + "epoch": 0.8430567750981873, + "grad_norm": 0.5651481908263806, + "learning_rate": 0.00012981279370690834, + "loss": 12.1724, + "step": 15482 + }, + { + "epoch": 0.8431112290947703, + "grad_norm": 0.5993511716762084, + "learning_rate": 0.00012980437633501214, + "loss": 12.1811, + "step": 15483 + }, + { + "epoch": 0.8431656830913534, + "grad_norm": 0.5576741439576591, + "learning_rate": 0.00012979595873135205, + "loss": 12.1552, + "step": 15484 + }, + { + "epoch": 0.8432201370879364, + "grad_norm": 0.5825885230304538, + "learning_rate": 0.00012978754089599363, + "loss": 12.2838, + "step": 15485 + }, + { + "epoch": 0.8432745910845194, + "grad_norm": 0.5981931555447255, + "learning_rate": 0.00012977912282900232, + "loss": 12.1833, + "step": 15486 + }, + { + "epoch": 0.8433290450811024, + "grad_norm": 0.5922220565819358, + "learning_rate": 0.00012977070453044348, + "loss": 12.1535, + "step": 15487 + }, + { + "epoch": 0.8433834990776854, + "grad_norm": 0.5485480344051351, + "learning_rate": 0.00012976228600038273, + "loss": 12.2509, + "step": 15488 + }, + { + "epoch": 0.8434379530742685, + "grad_norm": 0.5948772804151026, + "learning_rate": 0.00012975386723888542, + "loss": 12.1709, + "step": 15489 + }, + { + "epoch": 0.8434924070708515, + "grad_norm": 0.6364488969136304, + "learning_rate": 0.00012974544824601703, + "loss": 12.0577, + "step": 15490 + }, + { + "epoch": 0.8435468610674345, + "grad_norm": 0.5738671439296887, + "learning_rate": 0.00012973702902184306, + "loss": 12.2015, + "step": 15491 + }, + { + "epoch": 0.8436013150640175, + "grad_norm": 0.5646197147628447, + "learning_rate": 0.00012972860956642895, + "loss": 12.1925, + "step": 15492 + }, + { + "epoch": 0.8436557690606005, + "grad_norm": 0.527313632438477, + "learning_rate": 0.00012972018987984023, + "loss": 12.1908, + "step": 15493 + }, + { + "epoch": 0.8437102230571835, + "grad_norm": 0.5621058442006267, + "learning_rate": 0.00012971176996214232, + "loss": 12.189, + "step": 15494 + }, + { + "epoch": 0.8437646770537666, + "grad_norm": 0.5441499521332711, + "learning_rate": 0.00012970334981340063, + "loss": 12.1707, + "step": 15495 + }, + { + "epoch": 0.8438191310503496, + "grad_norm": 0.6113276956688619, + "learning_rate": 0.0001296949294336808, + "loss": 12.1549, + "step": 15496 + }, + { + "epoch": 0.8438735850469326, + "grad_norm": 0.6162383652250739, + "learning_rate": 0.00012968650882304818, + "loss": 12.2017, + "step": 15497 + }, + { + "epoch": 0.8439280390435155, + "grad_norm": 0.5558501593387551, + "learning_rate": 0.00012967808798156828, + "loss": 12.1558, + "step": 15498 + }, + { + "epoch": 0.8439824930400985, + "grad_norm": 0.527880376040958, + "learning_rate": 0.00012966966690930665, + "loss": 12.2283, + "step": 15499 + }, + { + "epoch": 0.8440369470366815, + "grad_norm": 0.5775924687961809, + "learning_rate": 0.00012966124560632867, + "loss": 12.3681, + "step": 15500 + }, + { + "epoch": 0.8440914010332646, + "grad_norm": 0.5918371650924008, + "learning_rate": 0.00012965282407269982, + "loss": 12.0314, + "step": 15501 + }, + { + "epoch": 0.8441458550298476, + "grad_norm": 0.5889430522237789, + "learning_rate": 0.0001296444023084857, + "loss": 12.1733, + "step": 15502 + }, + { + "epoch": 0.8442003090264306, + "grad_norm": 0.630949062021415, + "learning_rate": 0.0001296359803137517, + "loss": 12.1696, + "step": 15503 + }, + { + "epoch": 0.8442547630230136, + "grad_norm": 0.6027404325392326, + "learning_rate": 0.00012962755808856342, + "loss": 12.2744, + "step": 15504 + }, + { + "epoch": 0.8443092170195966, + "grad_norm": 0.6058898368980901, + "learning_rate": 0.00012961913563298624, + "loss": 12.3292, + "step": 15505 + }, + { + "epoch": 0.8443636710161796, + "grad_norm": 0.5531306126172333, + "learning_rate": 0.0001296107129470857, + "loss": 12.2541, + "step": 15506 + }, + { + "epoch": 0.8444181250127627, + "grad_norm": 0.6083152632010047, + "learning_rate": 0.00012960229003092724, + "loss": 12.163, + "step": 15507 + }, + { + "epoch": 0.8444725790093457, + "grad_norm": 0.5771152076523544, + "learning_rate": 0.00012959386688457642, + "loss": 12.3182, + "step": 15508 + }, + { + "epoch": 0.8445270330059287, + "grad_norm": 0.5904188640337819, + "learning_rate": 0.00012958544350809878, + "loss": 12.0135, + "step": 15509 + }, + { + "epoch": 0.8445814870025117, + "grad_norm": 0.511318850188779, + "learning_rate": 0.00012957701990155975, + "loss": 12.1324, + "step": 15510 + }, + { + "epoch": 0.8446359409990947, + "grad_norm": 0.549514494740283, + "learning_rate": 0.00012956859606502486, + "loss": 12.1693, + "step": 15511 + }, + { + "epoch": 0.8446903949956777, + "grad_norm": 0.6198624919438058, + "learning_rate": 0.00012956017199855957, + "loss": 12.3404, + "step": 15512 + }, + { + "epoch": 0.8447448489922608, + "grad_norm": 0.5654961144625223, + "learning_rate": 0.00012955174770222944, + "loss": 12.3016, + "step": 15513 + }, + { + "epoch": 0.8447993029888438, + "grad_norm": 0.5328895178588599, + "learning_rate": 0.00012954332317609995, + "loss": 12.1564, + "step": 15514 + }, + { + "epoch": 0.8448537569854268, + "grad_norm": 0.593522704545548, + "learning_rate": 0.0001295348984202367, + "loss": 12.2934, + "step": 15515 + }, + { + "epoch": 0.8449082109820097, + "grad_norm": 0.5430808014371415, + "learning_rate": 0.00012952647343470505, + "loss": 12.191, + "step": 15516 + }, + { + "epoch": 0.8449626649785927, + "grad_norm": 0.5265172841458723, + "learning_rate": 0.00012951804821957063, + "loss": 12.2325, + "step": 15517 + }, + { + "epoch": 0.8450171189751757, + "grad_norm": 0.6839327055872534, + "learning_rate": 0.00012950962277489885, + "loss": 12.3049, + "step": 15518 + }, + { + "epoch": 0.8450715729717588, + "grad_norm": 0.5473030469567395, + "learning_rate": 0.00012950119710075536, + "loss": 12.2636, + "step": 15519 + }, + { + "epoch": 0.8451260269683418, + "grad_norm": 0.5500510581940654, + "learning_rate": 0.00012949277119720564, + "loss": 12.2041, + "step": 15520 + }, + { + "epoch": 0.8451804809649248, + "grad_norm": 0.586482402015144, + "learning_rate": 0.00012948434506431514, + "loss": 12.2955, + "step": 15521 + }, + { + "epoch": 0.8452349349615078, + "grad_norm": 0.5724440277061343, + "learning_rate": 0.00012947591870214945, + "loss": 12.2484, + "step": 15522 + }, + { + "epoch": 0.8452893889580908, + "grad_norm": 0.5696594400728073, + "learning_rate": 0.00012946749211077406, + "loss": 12.193, + "step": 15523 + }, + { + "epoch": 0.8453438429546739, + "grad_norm": 0.5209405674486619, + "learning_rate": 0.00012945906529025447, + "loss": 12.0911, + "step": 15524 + }, + { + "epoch": 0.8453982969512569, + "grad_norm": 0.570044338500723, + "learning_rate": 0.00012945063824065632, + "loss": 12.2399, + "step": 15525 + }, + { + "epoch": 0.8454527509478399, + "grad_norm": 0.5518226214837841, + "learning_rate": 0.00012944221096204502, + "loss": 12.2789, + "step": 15526 + }, + { + "epoch": 0.8455072049444229, + "grad_norm": 0.5511632782042666, + "learning_rate": 0.00012943378345448616, + "loss": 12.0836, + "step": 15527 + }, + { + "epoch": 0.8455616589410059, + "grad_norm": 0.6136098803437449, + "learning_rate": 0.00012942535571804526, + "loss": 12.1695, + "step": 15528 + }, + { + "epoch": 0.8456161129375889, + "grad_norm": 0.6071253304180788, + "learning_rate": 0.00012941692775278785, + "loss": 12.2983, + "step": 15529 + }, + { + "epoch": 0.845670566934172, + "grad_norm": 0.5204147345078632, + "learning_rate": 0.0001294084995587795, + "loss": 12.1814, + "step": 15530 + }, + { + "epoch": 0.845725020930755, + "grad_norm": 0.5472504695583279, + "learning_rate": 0.0001294000711360857, + "loss": 12.2124, + "step": 15531 + }, + { + "epoch": 0.845779474927338, + "grad_norm": 0.6406576106211804, + "learning_rate": 0.00012939164248477206, + "loss": 12.3914, + "step": 15532 + }, + { + "epoch": 0.845833928923921, + "grad_norm": 0.5236012866958867, + "learning_rate": 0.00012938321360490406, + "loss": 12.0712, + "step": 15533 + }, + { + "epoch": 0.845888382920504, + "grad_norm": 0.5797560648484227, + "learning_rate": 0.00012937478449654726, + "loss": 12.1934, + "step": 15534 + }, + { + "epoch": 0.8459428369170869, + "grad_norm": 0.5685664703573508, + "learning_rate": 0.00012936635515976722, + "loss": 12.1307, + "step": 15535 + }, + { + "epoch": 0.84599729091367, + "grad_norm": 0.6221459849482847, + "learning_rate": 0.00012935792559462945, + "loss": 12.1631, + "step": 15536 + }, + { + "epoch": 0.846051744910253, + "grad_norm": 0.5876772033125665, + "learning_rate": 0.00012934949580119953, + "loss": 12.1943, + "step": 15537 + }, + { + "epoch": 0.846106198906836, + "grad_norm": 0.759882435895185, + "learning_rate": 0.00012934106577954305, + "loss": 12.3563, + "step": 15538 + }, + { + "epoch": 0.846160652903419, + "grad_norm": 0.5415582133886058, + "learning_rate": 0.00012933263552972549, + "loss": 12.1682, + "step": 15539 + }, + { + "epoch": 0.846215106900002, + "grad_norm": 0.5903793304696313, + "learning_rate": 0.00012932420505181241, + "loss": 12.2252, + "step": 15540 + }, + { + "epoch": 0.846269560896585, + "grad_norm": 0.5849930079909551, + "learning_rate": 0.00012931577434586943, + "loss": 12.3335, + "step": 15541 + }, + { + "epoch": 0.8463240148931681, + "grad_norm": 0.6003166564253619, + "learning_rate": 0.00012930734341196206, + "loss": 12.1644, + "step": 15542 + }, + { + "epoch": 0.8463784688897511, + "grad_norm": 0.6097396533940193, + "learning_rate": 0.00012929891225015586, + "loss": 12.1985, + "step": 15543 + }, + { + "epoch": 0.8464329228863341, + "grad_norm": 0.5938451907882728, + "learning_rate": 0.00012929048086051645, + "loss": 12.2239, + "step": 15544 + }, + { + "epoch": 0.8464873768829171, + "grad_norm": 0.7848091599738762, + "learning_rate": 0.0001292820492431093, + "loss": 12.3416, + "step": 15545 + }, + { + "epoch": 0.8465418308795001, + "grad_norm": 0.5677497381016352, + "learning_rate": 0.00012927361739800005, + "loss": 12.1728, + "step": 15546 + }, + { + "epoch": 0.8465962848760831, + "grad_norm": 0.6104022337169018, + "learning_rate": 0.00012926518532525424, + "loss": 12.1725, + "step": 15547 + }, + { + "epoch": 0.8466507388726662, + "grad_norm": 0.5022216219713065, + "learning_rate": 0.00012925675302493745, + "loss": 12.1429, + "step": 15548 + }, + { + "epoch": 0.8467051928692492, + "grad_norm": 0.5848567111984192, + "learning_rate": 0.00012924832049711525, + "loss": 12.2029, + "step": 15549 + }, + { + "epoch": 0.8467596468658322, + "grad_norm": 0.5081974836950981, + "learning_rate": 0.00012923988774185316, + "loss": 11.9883, + "step": 15550 + }, + { + "epoch": 0.8468141008624152, + "grad_norm": 0.6221118858893209, + "learning_rate": 0.00012923145475921683, + "loss": 12.2106, + "step": 15551 + }, + { + "epoch": 0.8468685548589981, + "grad_norm": 0.6175250867818555, + "learning_rate": 0.00012922302154927179, + "loss": 12.3017, + "step": 15552 + }, + { + "epoch": 0.8469230088555812, + "grad_norm": 0.5352118342810327, + "learning_rate": 0.00012921458811208366, + "loss": 12.1494, + "step": 15553 + }, + { + "epoch": 0.8469774628521642, + "grad_norm": 0.5872967934663972, + "learning_rate": 0.00012920615444771797, + "loss": 12.0825, + "step": 15554 + }, + { + "epoch": 0.8470319168487472, + "grad_norm": 0.5933345426004941, + "learning_rate": 0.00012919772055624038, + "loss": 12.2394, + "step": 15555 + }, + { + "epoch": 0.8470863708453302, + "grad_norm": 0.6272768455911071, + "learning_rate": 0.00012918928643771633, + "loss": 12.2328, + "step": 15556 + }, + { + "epoch": 0.8471408248419132, + "grad_norm": 0.5198233088362648, + "learning_rate": 0.00012918085209221155, + "loss": 12.0769, + "step": 15557 + }, + { + "epoch": 0.8471952788384962, + "grad_norm": 0.6244403370285655, + "learning_rate": 0.00012917241751979154, + "loss": 12.1873, + "step": 15558 + }, + { + "epoch": 0.8472497328350793, + "grad_norm": 0.522512235489971, + "learning_rate": 0.00012916398272052194, + "loss": 12.1636, + "step": 15559 + }, + { + "epoch": 0.8473041868316623, + "grad_norm": 0.5587611000537016, + "learning_rate": 0.00012915554769446833, + "loss": 12.3209, + "step": 15560 + }, + { + "epoch": 0.8473586408282453, + "grad_norm": 0.5404422738375619, + "learning_rate": 0.00012914711244169626, + "loss": 12.2373, + "step": 15561 + }, + { + "epoch": 0.8474130948248283, + "grad_norm": 0.5233974892533286, + "learning_rate": 0.00012913867696227136, + "loss": 12.1196, + "step": 15562 + }, + { + "epoch": 0.8474675488214113, + "grad_norm": 0.5540664811886048, + "learning_rate": 0.00012913024125625925, + "loss": 12.201, + "step": 15563 + }, + { + "epoch": 0.8475220028179943, + "grad_norm": 0.585623802648054, + "learning_rate": 0.00012912180532372548, + "loss": 12.0897, + "step": 15564 + }, + { + "epoch": 0.8475764568145774, + "grad_norm": 0.5162575407180071, + "learning_rate": 0.0001291133691647357, + "loss": 12.097, + "step": 15565 + }, + { + "epoch": 0.8476309108111604, + "grad_norm": 0.6065161656591413, + "learning_rate": 0.00012910493277935544, + "loss": 12.1424, + "step": 15566 + }, + { + "epoch": 0.8476853648077434, + "grad_norm": 0.6120489582519593, + "learning_rate": 0.00012909649616765033, + "loss": 12.1668, + "step": 15567 + }, + { + "epoch": 0.8477398188043264, + "grad_norm": 0.610427532528256, + "learning_rate": 0.00012908805932968602, + "loss": 12.2514, + "step": 15568 + }, + { + "epoch": 0.8477942728009094, + "grad_norm": 0.5880797631439723, + "learning_rate": 0.00012907962226552807, + "loss": 12.1468, + "step": 15569 + }, + { + "epoch": 0.8478487267974923, + "grad_norm": 0.7011077942022087, + "learning_rate": 0.00012907118497524213, + "loss": 12.3227, + "step": 15570 + }, + { + "epoch": 0.8479031807940755, + "grad_norm": 0.5620293371921344, + "learning_rate": 0.00012906274745889374, + "loss": 12.1715, + "step": 15571 + }, + { + "epoch": 0.8479576347906584, + "grad_norm": 0.6079063552721871, + "learning_rate": 0.00012905430971654858, + "loss": 12.1763, + "step": 15572 + }, + { + "epoch": 0.8480120887872414, + "grad_norm": 0.6148687010089448, + "learning_rate": 0.0001290458717482722, + "loss": 12.2441, + "step": 15573 + }, + { + "epoch": 0.8480665427838244, + "grad_norm": 0.5995826169807735, + "learning_rate": 0.00012903743355413024, + "loss": 12.19, + "step": 15574 + }, + { + "epoch": 0.8481209967804074, + "grad_norm": 0.5907546417641577, + "learning_rate": 0.0001290289951341884, + "loss": 12.1121, + "step": 15575 + }, + { + "epoch": 0.8481754507769904, + "grad_norm": 0.5300240353852979, + "learning_rate": 0.00012902055648851218, + "loss": 12.134, + "step": 15576 + }, + { + "epoch": 0.8482299047735735, + "grad_norm": 0.6731591510323991, + "learning_rate": 0.00012901211761716724, + "loss": 12.3143, + "step": 15577 + }, + { + "epoch": 0.8482843587701565, + "grad_norm": 0.6375915340003899, + "learning_rate": 0.0001290036785202192, + "loss": 12.2515, + "step": 15578 + }, + { + "epoch": 0.8483388127667395, + "grad_norm": 0.5813097313194986, + "learning_rate": 0.00012899523919773372, + "loss": 12.2116, + "step": 15579 + }, + { + "epoch": 0.8483932667633225, + "grad_norm": 0.5742185102082507, + "learning_rate": 0.0001289867996497764, + "loss": 12.1474, + "step": 15580 + }, + { + "epoch": 0.8484477207599055, + "grad_norm": 0.590523813660125, + "learning_rate": 0.00012897835987641285, + "loss": 12.106, + "step": 15581 + }, + { + "epoch": 0.8485021747564885, + "grad_norm": 0.6000171184383458, + "learning_rate": 0.0001289699198777087, + "loss": 12.154, + "step": 15582 + }, + { + "epoch": 0.8485566287530716, + "grad_norm": 0.5719366141175086, + "learning_rate": 0.00012896147965372963, + "loss": 12.2698, + "step": 15583 + }, + { + "epoch": 0.8486110827496546, + "grad_norm": 0.5386123198262065, + "learning_rate": 0.00012895303920454118, + "loss": 12.2407, + "step": 15584 + }, + { + "epoch": 0.8486655367462376, + "grad_norm": 0.5296364039346468, + "learning_rate": 0.0001289445985302091, + "loss": 12.155, + "step": 15585 + }, + { + "epoch": 0.8487199907428206, + "grad_norm": 0.6482716971909245, + "learning_rate": 0.00012893615763079894, + "loss": 12.2155, + "step": 15586 + }, + { + "epoch": 0.8487744447394036, + "grad_norm": 0.7109488935622398, + "learning_rate": 0.00012892771650637637, + "loss": 12.1994, + "step": 15587 + }, + { + "epoch": 0.8488288987359867, + "grad_norm": 0.593128389693315, + "learning_rate": 0.00012891927515700703, + "loss": 12.1713, + "step": 15588 + }, + { + "epoch": 0.8488833527325697, + "grad_norm": 0.6519114749932312, + "learning_rate": 0.00012891083358275654, + "loss": 12.1332, + "step": 15589 + }, + { + "epoch": 0.8489378067291526, + "grad_norm": 0.6107853226476072, + "learning_rate": 0.00012890239178369058, + "loss": 12.2532, + "step": 15590 + }, + { + "epoch": 0.8489922607257356, + "grad_norm": 0.6797629066198152, + "learning_rate": 0.00012889394975987476, + "loss": 12.1826, + "step": 15591 + }, + { + "epoch": 0.8490467147223186, + "grad_norm": 0.6361362221930984, + "learning_rate": 0.00012888550751137475, + "loss": 12.1707, + "step": 15592 + }, + { + "epoch": 0.8491011687189016, + "grad_norm": 0.5971055886039516, + "learning_rate": 0.0001288770650382562, + "loss": 12.2782, + "step": 15593 + }, + { + "epoch": 0.8491556227154847, + "grad_norm": 0.5947171593968466, + "learning_rate": 0.00012886862234058475, + "loss": 12.0976, + "step": 15594 + }, + { + "epoch": 0.8492100767120677, + "grad_norm": 0.6335157038701741, + "learning_rate": 0.00012886017941842604, + "loss": 12.1543, + "step": 15595 + }, + { + "epoch": 0.8492645307086507, + "grad_norm": 0.5592617264152265, + "learning_rate": 0.00012885173627184571, + "loss": 12.1406, + "step": 15596 + }, + { + "epoch": 0.8493189847052337, + "grad_norm": 0.6160812598362347, + "learning_rate": 0.00012884329290090949, + "loss": 12.1859, + "step": 15597 + }, + { + "epoch": 0.8493734387018167, + "grad_norm": 0.5659969854376821, + "learning_rate": 0.00012883484930568294, + "loss": 12.1066, + "step": 15598 + }, + { + "epoch": 0.8494278926983997, + "grad_norm": 0.6664842479810961, + "learning_rate": 0.0001288264054862318, + "loss": 12.096, + "step": 15599 + }, + { + "epoch": 0.8494823466949828, + "grad_norm": 0.5276228134648259, + "learning_rate": 0.00012881796144262168, + "loss": 12.1223, + "step": 15600 + }, + { + "epoch": 0.8495368006915658, + "grad_norm": 0.5805855702523423, + "learning_rate": 0.00012880951717491828, + "loss": 12.2562, + "step": 15601 + }, + { + "epoch": 0.8495912546881488, + "grad_norm": 0.5383428403603512, + "learning_rate": 0.00012880107268318722, + "loss": 12.0346, + "step": 15602 + }, + { + "epoch": 0.8496457086847318, + "grad_norm": 0.5522025055589089, + "learning_rate": 0.00012879262796749422, + "loss": 12.1612, + "step": 15603 + }, + { + "epoch": 0.8497001626813148, + "grad_norm": 0.583298513098549, + "learning_rate": 0.00012878418302790488, + "loss": 12.1072, + "step": 15604 + }, + { + "epoch": 0.8497546166778978, + "grad_norm": 0.6016494841639478, + "learning_rate": 0.0001287757378644849, + "loss": 12.3494, + "step": 15605 + }, + { + "epoch": 0.8498090706744809, + "grad_norm": 0.5713091203784781, + "learning_rate": 0.00012876729247729998, + "loss": 12.0923, + "step": 15606 + }, + { + "epoch": 0.8498635246710639, + "grad_norm": 0.5960425318662682, + "learning_rate": 0.00012875884686641578, + "loss": 12.0263, + "step": 15607 + }, + { + "epoch": 0.8499179786676468, + "grad_norm": 0.5389275337863298, + "learning_rate": 0.00012875040103189791, + "loss": 12.1331, + "step": 15608 + }, + { + "epoch": 0.8499724326642298, + "grad_norm": 0.5523982837647737, + "learning_rate": 0.00012874195497381217, + "loss": 12.1424, + "step": 15609 + }, + { + "epoch": 0.8500268866608128, + "grad_norm": 0.5161903641796735, + "learning_rate": 0.0001287335086922241, + "loss": 12.1487, + "step": 15610 + }, + { + "epoch": 0.8500813406573958, + "grad_norm": 0.544290717987044, + "learning_rate": 0.0001287250621871995, + "loss": 12.1779, + "step": 15611 + }, + { + "epoch": 0.8501357946539789, + "grad_norm": 0.5652310341744862, + "learning_rate": 0.00012871661545880398, + "loss": 12.2929, + "step": 15612 + }, + { + "epoch": 0.8501902486505619, + "grad_norm": 0.5724418405069429, + "learning_rate": 0.00012870816850710326, + "loss": 12.1614, + "step": 15613 + }, + { + "epoch": 0.8502447026471449, + "grad_norm": 0.5898243654550441, + "learning_rate": 0.000128699721332163, + "loss": 12.229, + "step": 15614 + }, + { + "epoch": 0.8502991566437279, + "grad_norm": 0.5731288486052545, + "learning_rate": 0.0001286912739340489, + "loss": 12.183, + "step": 15615 + }, + { + "epoch": 0.8503536106403109, + "grad_norm": 0.5246223185961127, + "learning_rate": 0.00012868282631282662, + "loss": 12.1769, + "step": 15616 + }, + { + "epoch": 0.8504080646368939, + "grad_norm": 0.5049356988743475, + "learning_rate": 0.00012867437846856186, + "loss": 12.1367, + "step": 15617 + }, + { + "epoch": 0.850462518633477, + "grad_norm": 0.5481648482068753, + "learning_rate": 0.00012866593040132036, + "loss": 12.1416, + "step": 15618 + }, + { + "epoch": 0.85051697263006, + "grad_norm": 0.6076905164308919, + "learning_rate": 0.00012865748211116776, + "loss": 12.2317, + "step": 15619 + }, + { + "epoch": 0.850571426626643, + "grad_norm": 0.507112466697084, + "learning_rate": 0.00012864903359816979, + "loss": 12.0967, + "step": 15620 + }, + { + "epoch": 0.850625880623226, + "grad_norm": 0.5362129911701178, + "learning_rate": 0.0001286405848623921, + "loss": 12.0546, + "step": 15621 + }, + { + "epoch": 0.850680334619809, + "grad_norm": 0.5950019792773952, + "learning_rate": 0.00012863213590390044, + "loss": 12.0381, + "step": 15622 + }, + { + "epoch": 0.8507347886163921, + "grad_norm": 0.5447907140074338, + "learning_rate": 0.0001286236867227605, + "loss": 12.1993, + "step": 15623 + }, + { + "epoch": 0.8507892426129751, + "grad_norm": 0.5149878848651488, + "learning_rate": 0.00012861523731903796, + "loss": 12.0942, + "step": 15624 + }, + { + "epoch": 0.850843696609558, + "grad_norm": 0.5531567677264669, + "learning_rate": 0.00012860678769279854, + "loss": 12.2105, + "step": 15625 + }, + { + "epoch": 0.850898150606141, + "grad_norm": 0.5594788185985377, + "learning_rate": 0.00012859833784410792, + "loss": 12.1554, + "step": 15626 + }, + { + "epoch": 0.850952604602724, + "grad_norm": 0.5571718604967746, + "learning_rate": 0.00012858988777303184, + "loss": 12.2235, + "step": 15627 + }, + { + "epoch": 0.851007058599307, + "grad_norm": 0.7899774324453996, + "learning_rate": 0.000128581437479636, + "loss": 12.5309, + "step": 15628 + }, + { + "epoch": 0.8510615125958901, + "grad_norm": 0.5659755971424956, + "learning_rate": 0.00012857298696398613, + "loss": 12.2907, + "step": 15629 + }, + { + "epoch": 0.8511159665924731, + "grad_norm": 0.61763375246358, + "learning_rate": 0.00012856453622614791, + "loss": 12.2098, + "step": 15630 + }, + { + "epoch": 0.8511704205890561, + "grad_norm": 0.5680070609100679, + "learning_rate": 0.00012855608526618706, + "loss": 12.1818, + "step": 15631 + }, + { + "epoch": 0.8512248745856391, + "grad_norm": 0.6291213052887918, + "learning_rate": 0.0001285476340841693, + "loss": 12.1265, + "step": 15632 + }, + { + "epoch": 0.8512793285822221, + "grad_norm": 0.5967570939808631, + "learning_rate": 0.00012853918268016033, + "loss": 12.1694, + "step": 15633 + }, + { + "epoch": 0.8513337825788051, + "grad_norm": 0.5645989960623116, + "learning_rate": 0.0001285307310542259, + "loss": 12.123, + "step": 15634 + }, + { + "epoch": 0.8513882365753882, + "grad_norm": 0.5923944416190499, + "learning_rate": 0.00012852227920643177, + "loss": 12.1184, + "step": 15635 + }, + { + "epoch": 0.8514426905719712, + "grad_norm": 0.5343867541462105, + "learning_rate": 0.00012851382713684358, + "loss": 12.2321, + "step": 15636 + }, + { + "epoch": 0.8514971445685542, + "grad_norm": 0.5317908527102665, + "learning_rate": 0.00012850537484552707, + "loss": 12.0625, + "step": 15637 + }, + { + "epoch": 0.8515515985651372, + "grad_norm": 0.6008950192481002, + "learning_rate": 0.00012849692233254798, + "loss": 12.2511, + "step": 15638 + }, + { + "epoch": 0.8516060525617202, + "grad_norm": 0.5891196291828917, + "learning_rate": 0.00012848846959797206, + "loss": 12.2549, + "step": 15639 + }, + { + "epoch": 0.8516605065583032, + "grad_norm": 0.6080059299665412, + "learning_rate": 0.00012848001664186504, + "loss": 12.2132, + "step": 15640 + }, + { + "epoch": 0.8517149605548863, + "grad_norm": 0.6157269450140649, + "learning_rate": 0.00012847156346429262, + "loss": 12.3017, + "step": 15641 + }, + { + "epoch": 0.8517694145514693, + "grad_norm": 0.5276589367835588, + "learning_rate": 0.00012846311006532054, + "loss": 12.2921, + "step": 15642 + }, + { + "epoch": 0.8518238685480523, + "grad_norm": 0.5142565017966974, + "learning_rate": 0.00012845465644501454, + "loss": 12.1849, + "step": 15643 + }, + { + "epoch": 0.8518783225446352, + "grad_norm": 0.5449135733343222, + "learning_rate": 0.00012844620260344037, + "loss": 12.1698, + "step": 15644 + }, + { + "epoch": 0.8519327765412182, + "grad_norm": 0.6327389332555616, + "learning_rate": 0.00012843774854066377, + "loss": 12.1762, + "step": 15645 + }, + { + "epoch": 0.8519872305378012, + "grad_norm": 0.5907704006134956, + "learning_rate": 0.00012842929425675048, + "loss": 12.2365, + "step": 15646 + }, + { + "epoch": 0.8520416845343843, + "grad_norm": 0.6351242368170997, + "learning_rate": 0.0001284208397517662, + "loss": 12.1271, + "step": 15647 + }, + { + "epoch": 0.8520961385309673, + "grad_norm": 0.5151588337199436, + "learning_rate": 0.0001284123850257767, + "loss": 12.1418, + "step": 15648 + }, + { + "epoch": 0.8521505925275503, + "grad_norm": 0.5683816891671283, + "learning_rate": 0.00012840393007884776, + "loss": 12.1423, + "step": 15649 + }, + { + "epoch": 0.8522050465241333, + "grad_norm": 0.6980382453882409, + "learning_rate": 0.00012839547491104507, + "loss": 12.3939, + "step": 15650 + }, + { + "epoch": 0.8522595005207163, + "grad_norm": 0.5468550971970724, + "learning_rate": 0.00012838701952243439, + "loss": 12.1403, + "step": 15651 + }, + { + "epoch": 0.8523139545172993, + "grad_norm": 0.5720309246905906, + "learning_rate": 0.00012837856391308152, + "loss": 12.1732, + "step": 15652 + }, + { + "epoch": 0.8523684085138824, + "grad_norm": 0.5901994933747409, + "learning_rate": 0.00012837010808305216, + "loss": 12.1811, + "step": 15653 + }, + { + "epoch": 0.8524228625104654, + "grad_norm": 0.6417524330224629, + "learning_rate": 0.00012836165203241207, + "loss": 12.1895, + "step": 15654 + }, + { + "epoch": 0.8524773165070484, + "grad_norm": 0.5508830074943798, + "learning_rate": 0.00012835319576122705, + "loss": 12.2433, + "step": 15655 + }, + { + "epoch": 0.8525317705036314, + "grad_norm": 0.4847934473438708, + "learning_rate": 0.00012834473926956282, + "loss": 12.1649, + "step": 15656 + }, + { + "epoch": 0.8525862245002144, + "grad_norm": 0.5106405252112488, + "learning_rate": 0.0001283362825574851, + "loss": 12.1226, + "step": 15657 + }, + { + "epoch": 0.8526406784967975, + "grad_norm": 0.6224454278859773, + "learning_rate": 0.00012832782562505974, + "loss": 12.4138, + "step": 15658 + }, + { + "epoch": 0.8526951324933805, + "grad_norm": 0.5774410000543585, + "learning_rate": 0.00012831936847235243, + "loss": 12.185, + "step": 15659 + }, + { + "epoch": 0.8527495864899635, + "grad_norm": 0.5379968708739294, + "learning_rate": 0.000128310911099429, + "loss": 12.121, + "step": 15660 + }, + { + "epoch": 0.8528040404865465, + "grad_norm": 0.5884777063588783, + "learning_rate": 0.00012830245350635514, + "loss": 12.2266, + "step": 15661 + }, + { + "epoch": 0.8528584944831294, + "grad_norm": 0.5666623086845319, + "learning_rate": 0.0001282939956931967, + "loss": 12.2207, + "step": 15662 + }, + { + "epoch": 0.8529129484797124, + "grad_norm": 0.5922210812425717, + "learning_rate": 0.00012828553766001935, + "loss": 12.2144, + "step": 15663 + }, + { + "epoch": 0.8529674024762955, + "grad_norm": 0.5464399974466867, + "learning_rate": 0.00012827707940688896, + "loss": 12.3001, + "step": 15664 + }, + { + "epoch": 0.8530218564728785, + "grad_norm": 0.5641139397080874, + "learning_rate": 0.00012826862093387123, + "loss": 12.1005, + "step": 15665 + }, + { + "epoch": 0.8530763104694615, + "grad_norm": 0.5914279130202791, + "learning_rate": 0.00012826016224103196, + "loss": 12.2504, + "step": 15666 + }, + { + "epoch": 0.8531307644660445, + "grad_norm": 0.5375514541952663, + "learning_rate": 0.00012825170332843696, + "loss": 12.18, + "step": 15667 + }, + { + "epoch": 0.8531852184626275, + "grad_norm": 0.5536914518116253, + "learning_rate": 0.00012824324419615196, + "loss": 12.0696, + "step": 15668 + }, + { + "epoch": 0.8532396724592105, + "grad_norm": 0.608791386538355, + "learning_rate": 0.00012823478484424273, + "loss": 12.2929, + "step": 15669 + }, + { + "epoch": 0.8532941264557936, + "grad_norm": 0.5442849581551492, + "learning_rate": 0.0001282263252727751, + "loss": 12.2019, + "step": 15670 + }, + { + "epoch": 0.8533485804523766, + "grad_norm": 0.5579861998109426, + "learning_rate": 0.00012821786548181485, + "loss": 12.193, + "step": 15671 + }, + { + "epoch": 0.8534030344489596, + "grad_norm": 0.5299834636731653, + "learning_rate": 0.00012820940547142773, + "loss": 12.2381, + "step": 15672 + }, + { + "epoch": 0.8534574884455426, + "grad_norm": 0.7834986858693507, + "learning_rate": 0.00012820094524167955, + "loss": 12.1198, + "step": 15673 + }, + { + "epoch": 0.8535119424421256, + "grad_norm": 0.602285605010258, + "learning_rate": 0.00012819248479263606, + "loss": 12.2375, + "step": 15674 + }, + { + "epoch": 0.8535663964387086, + "grad_norm": 0.5712320288841317, + "learning_rate": 0.00012818402412436312, + "loss": 12.1941, + "step": 15675 + }, + { + "epoch": 0.8536208504352917, + "grad_norm": 0.5429152513655475, + "learning_rate": 0.00012817556323692646, + "loss": 12.0621, + "step": 15676 + }, + { + "epoch": 0.8536753044318747, + "grad_norm": 0.6057068946766633, + "learning_rate": 0.0001281671021303919, + "loss": 12.2741, + "step": 15677 + }, + { + "epoch": 0.8537297584284577, + "grad_norm": 0.6539577442174344, + "learning_rate": 0.00012815864080482523, + "loss": 12.2394, + "step": 15678 + }, + { + "epoch": 0.8537842124250407, + "grad_norm": 0.6474510632946274, + "learning_rate": 0.00012815017926029224, + "loss": 12.2298, + "step": 15679 + }, + { + "epoch": 0.8538386664216236, + "grad_norm": 0.6012156573147364, + "learning_rate": 0.00012814171749685874, + "loss": 12.2179, + "step": 15680 + }, + { + "epoch": 0.8538931204182066, + "grad_norm": 0.5748682195613363, + "learning_rate": 0.00012813325551459055, + "loss": 12.0987, + "step": 15681 + }, + { + "epoch": 0.8539475744147897, + "grad_norm": 0.5932538036371817, + "learning_rate": 0.0001281247933135534, + "loss": 12.2372, + "step": 15682 + }, + { + "epoch": 0.8540020284113727, + "grad_norm": 0.5569302402712455, + "learning_rate": 0.00012811633089381317, + "loss": 12.1693, + "step": 15683 + }, + { + "epoch": 0.8540564824079557, + "grad_norm": 0.9870608844038362, + "learning_rate": 0.00012810786825543562, + "loss": 12.1499, + "step": 15684 + }, + { + "epoch": 0.8541109364045387, + "grad_norm": 0.6173207121444124, + "learning_rate": 0.0001280994053984866, + "loss": 12.2683, + "step": 15685 + }, + { + "epoch": 0.8541653904011217, + "grad_norm": 0.6280964771873433, + "learning_rate": 0.0001280909423230319, + "loss": 12.2718, + "step": 15686 + }, + { + "epoch": 0.8542198443977048, + "grad_norm": 0.619197910380965, + "learning_rate": 0.00012808247902913725, + "loss": 12.2047, + "step": 15687 + }, + { + "epoch": 0.8542742983942878, + "grad_norm": 0.6570766982144078, + "learning_rate": 0.0001280740155168686, + "loss": 12.314, + "step": 15688 + }, + { + "epoch": 0.8543287523908708, + "grad_norm": 0.7308763519060135, + "learning_rate": 0.00012806555178629167, + "loss": 12.1709, + "step": 15689 + }, + { + "epoch": 0.8543832063874538, + "grad_norm": 0.6809149470383755, + "learning_rate": 0.00012805708783747232, + "loss": 12.1298, + "step": 15690 + }, + { + "epoch": 0.8544376603840368, + "grad_norm": 0.5754298989534565, + "learning_rate": 0.00012804862367047637, + "loss": 12.1261, + "step": 15691 + }, + { + "epoch": 0.8544921143806198, + "grad_norm": 0.645798903559039, + "learning_rate": 0.00012804015928536956, + "loss": 12.1758, + "step": 15692 + }, + { + "epoch": 0.8545465683772029, + "grad_norm": 0.5825265322505782, + "learning_rate": 0.00012803169468221778, + "loss": 12.1584, + "step": 15693 + }, + { + "epoch": 0.8546010223737859, + "grad_norm": 0.5700802449068311, + "learning_rate": 0.00012802322986108687, + "loss": 12.1773, + "step": 15694 + }, + { + "epoch": 0.8546554763703689, + "grad_norm": 0.5854575405293855, + "learning_rate": 0.0001280147648220426, + "loss": 12.1983, + "step": 15695 + }, + { + "epoch": 0.8547099303669519, + "grad_norm": 0.559123078988021, + "learning_rate": 0.00012800629956515088, + "loss": 12.1285, + "step": 15696 + }, + { + "epoch": 0.8547643843635349, + "grad_norm": 0.5305252493834495, + "learning_rate": 0.0001279978340904774, + "loss": 12.1489, + "step": 15697 + }, + { + "epoch": 0.8548188383601178, + "grad_norm": 0.6947880528017051, + "learning_rate": 0.00012798936839808811, + "loss": 12.2499, + "step": 15698 + }, + { + "epoch": 0.854873292356701, + "grad_norm": 0.5427173645269607, + "learning_rate": 0.00012798090248804876, + "loss": 12.0906, + "step": 15699 + }, + { + "epoch": 0.854927746353284, + "grad_norm": 0.5586751572827647, + "learning_rate": 0.00012797243636042527, + "loss": 12.1936, + "step": 15700 + }, + { + "epoch": 0.8549822003498669, + "grad_norm": 0.5557568132196792, + "learning_rate": 0.0001279639700152834, + "loss": 12.2273, + "step": 15701 + }, + { + "epoch": 0.8550366543464499, + "grad_norm": 0.5682109416724617, + "learning_rate": 0.00012795550345268903, + "loss": 12.2523, + "step": 15702 + }, + { + "epoch": 0.8550911083430329, + "grad_norm": 0.5987692325981113, + "learning_rate": 0.00012794703667270795, + "loss": 12.2351, + "step": 15703 + }, + { + "epoch": 0.8551455623396159, + "grad_norm": 0.5738180654741157, + "learning_rate": 0.00012793856967540602, + "loss": 12.2044, + "step": 15704 + }, + { + "epoch": 0.855200016336199, + "grad_norm": 0.556910543590465, + "learning_rate": 0.00012793010246084908, + "loss": 12.1197, + "step": 15705 + }, + { + "epoch": 0.855254470332782, + "grad_norm": 0.6121439251457784, + "learning_rate": 0.00012792163502910303, + "loss": 12.0928, + "step": 15706 + }, + { + "epoch": 0.855308924329365, + "grad_norm": 0.564178283270504, + "learning_rate": 0.00012791316738023365, + "loss": 12.211, + "step": 15707 + }, + { + "epoch": 0.855363378325948, + "grad_norm": 0.5654129343631402, + "learning_rate": 0.00012790469951430678, + "loss": 12.2054, + "step": 15708 + }, + { + "epoch": 0.855417832322531, + "grad_norm": 0.5129254702848438, + "learning_rate": 0.00012789623143138828, + "loss": 12.2229, + "step": 15709 + }, + { + "epoch": 0.855472286319114, + "grad_norm": 0.5557932268999569, + "learning_rate": 0.000127887763131544, + "loss": 12.2176, + "step": 15710 + }, + { + "epoch": 0.8555267403156971, + "grad_norm": 0.5486285626359496, + "learning_rate": 0.00012787929461483983, + "loss": 12.2681, + "step": 15711 + }, + { + "epoch": 0.8555811943122801, + "grad_norm": 0.5652838634167313, + "learning_rate": 0.0001278708258813416, + "loss": 12.1061, + "step": 15712 + }, + { + "epoch": 0.8556356483088631, + "grad_norm": 0.5872954549608131, + "learning_rate": 0.0001278623569311151, + "loss": 12.0819, + "step": 15713 + }, + { + "epoch": 0.8556901023054461, + "grad_norm": 0.5543051339598092, + "learning_rate": 0.00012785388776422626, + "loss": 12.1606, + "step": 15714 + }, + { + "epoch": 0.8557445563020291, + "grad_norm": 0.6378223338713711, + "learning_rate": 0.0001278454183807409, + "loss": 12.202, + "step": 15715 + }, + { + "epoch": 0.855799010298612, + "grad_norm": 0.5744170509509097, + "learning_rate": 0.00012783694878072495, + "loss": 12.1183, + "step": 15716 + }, + { + "epoch": 0.8558534642951952, + "grad_norm": 0.5258546467076047, + "learning_rate": 0.00012782847896424418, + "loss": 12.1676, + "step": 15717 + }, + { + "epoch": 0.8559079182917781, + "grad_norm": 0.6485700887433437, + "learning_rate": 0.00012782000893136452, + "loss": 12.3775, + "step": 15718 + }, + { + "epoch": 0.8559623722883611, + "grad_norm": 0.5813874713791451, + "learning_rate": 0.00012781153868215178, + "loss": 12.3445, + "step": 15719 + }, + { + "epoch": 0.8560168262849441, + "grad_norm": 0.5446866086229591, + "learning_rate": 0.00012780306821667185, + "loss": 12.1281, + "step": 15720 + }, + { + "epoch": 0.8560712802815271, + "grad_norm": 0.617440955219744, + "learning_rate": 0.00012779459753499062, + "loss": 12.3763, + "step": 15721 + }, + { + "epoch": 0.8561257342781102, + "grad_norm": 0.5485880847095598, + "learning_rate": 0.0001277861266371739, + "loss": 12.1894, + "step": 15722 + }, + { + "epoch": 0.8561801882746932, + "grad_norm": 0.6267972681339404, + "learning_rate": 0.00012777765552328765, + "loss": 12.1803, + "step": 15723 + }, + { + "epoch": 0.8562346422712762, + "grad_norm": 0.6028696142522535, + "learning_rate": 0.00012776918419339764, + "loss": 12.0883, + "step": 15724 + }, + { + "epoch": 0.8562890962678592, + "grad_norm": 0.5685786219339078, + "learning_rate": 0.00012776071264756985, + "loss": 12.1525, + "step": 15725 + }, + { + "epoch": 0.8563435502644422, + "grad_norm": 0.7374933028055527, + "learning_rate": 0.00012775224088587005, + "loss": 12.2591, + "step": 15726 + }, + { + "epoch": 0.8563980042610252, + "grad_norm": 0.5980493259753991, + "learning_rate": 0.0001277437689083642, + "loss": 12.1731, + "step": 15727 + }, + { + "epoch": 0.8564524582576083, + "grad_norm": 0.5843647706304774, + "learning_rate": 0.00012773529671511816, + "loss": 12.1551, + "step": 15728 + }, + { + "epoch": 0.8565069122541913, + "grad_norm": 0.5503302278270538, + "learning_rate": 0.00012772682430619778, + "loss": 12.2297, + "step": 15729 + }, + { + "epoch": 0.8565613662507743, + "grad_norm": 0.6475393445604003, + "learning_rate": 0.000127718351681669, + "loss": 12.2689, + "step": 15730 + }, + { + "epoch": 0.8566158202473573, + "grad_norm": 0.5359188086214569, + "learning_rate": 0.0001277098788415976, + "loss": 12.1432, + "step": 15731 + }, + { + "epoch": 0.8566702742439403, + "grad_norm": 0.563354163807259, + "learning_rate": 0.0001277014057860496, + "loss": 12.2512, + "step": 15732 + }, + { + "epoch": 0.8567247282405233, + "grad_norm": 0.5281041117124711, + "learning_rate": 0.0001276929325150908, + "loss": 12.1194, + "step": 15733 + }, + { + "epoch": 0.8567791822371064, + "grad_norm": 0.556022921641765, + "learning_rate": 0.00012768445902878713, + "loss": 12.1332, + "step": 15734 + }, + { + "epoch": 0.8568336362336894, + "grad_norm": 0.5349451535607578, + "learning_rate": 0.00012767598532720443, + "loss": 12.0863, + "step": 15735 + }, + { + "epoch": 0.8568880902302723, + "grad_norm": 0.608521578203048, + "learning_rate": 0.00012766751141040866, + "loss": 12.2161, + "step": 15736 + }, + { + "epoch": 0.8569425442268553, + "grad_norm": 0.543320017849001, + "learning_rate": 0.00012765903727846565, + "loss": 12.1176, + "step": 15737 + }, + { + "epoch": 0.8569969982234383, + "grad_norm": 0.5457695761142336, + "learning_rate": 0.00012765056293144133, + "loss": 12.2448, + "step": 15738 + }, + { + "epoch": 0.8570514522200213, + "grad_norm": 0.554684247229752, + "learning_rate": 0.0001276420883694016, + "loss": 12.0115, + "step": 15739 + }, + { + "epoch": 0.8571059062166044, + "grad_norm": 0.5336282304607382, + "learning_rate": 0.00012763361359241238, + "loss": 12.0986, + "step": 15740 + }, + { + "epoch": 0.8571603602131874, + "grad_norm": 0.5906900125833782, + "learning_rate": 0.00012762513860053955, + "loss": 12.1907, + "step": 15741 + }, + { + "epoch": 0.8572148142097704, + "grad_norm": 0.5163197838124347, + "learning_rate": 0.00012761666339384896, + "loss": 12.1659, + "step": 15742 + }, + { + "epoch": 0.8572692682063534, + "grad_norm": 0.5845795882841365, + "learning_rate": 0.0001276081879724066, + "loss": 12.3958, + "step": 15743 + }, + { + "epoch": 0.8573237222029364, + "grad_norm": 0.5321494538582507, + "learning_rate": 0.00012759971233627834, + "loss": 12.1918, + "step": 15744 + }, + { + "epoch": 0.8573781761995194, + "grad_norm": 0.5166924507597535, + "learning_rate": 0.00012759123648553006, + "loss": 12.1325, + "step": 15745 + }, + { + "epoch": 0.8574326301961025, + "grad_norm": 0.587566718750353, + "learning_rate": 0.00012758276042022776, + "loss": 12.1939, + "step": 15746 + }, + { + "epoch": 0.8574870841926855, + "grad_norm": 0.6003595372439998, + "learning_rate": 0.0001275742841404372, + "loss": 12.2588, + "step": 15747 + }, + { + "epoch": 0.8575415381892685, + "grad_norm": 0.7960268013196092, + "learning_rate": 0.00012756580764622445, + "loss": 12.4471, + "step": 15748 + }, + { + "epoch": 0.8575959921858515, + "grad_norm": 0.5824598169241241, + "learning_rate": 0.00012755733093765533, + "loss": 12.2527, + "step": 15749 + }, + { + "epoch": 0.8576504461824345, + "grad_norm": 0.551996715410952, + "learning_rate": 0.00012754885401479582, + "loss": 12.107, + "step": 15750 + }, + { + "epoch": 0.8577049001790175, + "grad_norm": 0.5434241010597629, + "learning_rate": 0.00012754037687771178, + "loss": 12.1144, + "step": 15751 + }, + { + "epoch": 0.8577593541756006, + "grad_norm": 0.6147048798357491, + "learning_rate": 0.00012753189952646916, + "loss": 12.1863, + "step": 15752 + }, + { + "epoch": 0.8578138081721836, + "grad_norm": 0.6034774893716371, + "learning_rate": 0.00012752342196113383, + "loss": 12.2386, + "step": 15753 + }, + { + "epoch": 0.8578682621687665, + "grad_norm": 0.5959562157298051, + "learning_rate": 0.0001275149441817718, + "loss": 12.2741, + "step": 15754 + }, + { + "epoch": 0.8579227161653495, + "grad_norm": 0.5231375209394874, + "learning_rate": 0.0001275064661884489, + "loss": 12.1576, + "step": 15755 + }, + { + "epoch": 0.8579771701619325, + "grad_norm": 0.5972949129909878, + "learning_rate": 0.00012749798798123116, + "loss": 12.1434, + "step": 15756 + }, + { + "epoch": 0.8580316241585156, + "grad_norm": 0.6122546872739701, + "learning_rate": 0.00012748950956018444, + "loss": 12.1824, + "step": 15757 + }, + { + "epoch": 0.8580860781550986, + "grad_norm": 0.548827539391232, + "learning_rate": 0.00012748103092537466, + "loss": 12.1858, + "step": 15758 + }, + { + "epoch": 0.8581405321516816, + "grad_norm": 0.5268961504095557, + "learning_rate": 0.00012747255207686778, + "loss": 12.1409, + "step": 15759 + }, + { + "epoch": 0.8581949861482646, + "grad_norm": 0.6139714662781754, + "learning_rate": 0.00012746407301472974, + "loss": 12.2219, + "step": 15760 + }, + { + "epoch": 0.8582494401448476, + "grad_norm": 0.5731399770059618, + "learning_rate": 0.00012745559373902648, + "loss": 12.2019, + "step": 15761 + }, + { + "epoch": 0.8583038941414306, + "grad_norm": 0.7382380608364358, + "learning_rate": 0.0001274471142498239, + "loss": 12.0339, + "step": 15762 + }, + { + "epoch": 0.8583583481380137, + "grad_norm": 0.6025951957591087, + "learning_rate": 0.00012743863454718797, + "loss": 12.1943, + "step": 15763 + }, + { + "epoch": 0.8584128021345967, + "grad_norm": 0.5695672727653155, + "learning_rate": 0.00012743015463118458, + "loss": 12.3162, + "step": 15764 + }, + { + "epoch": 0.8584672561311797, + "grad_norm": 0.5282926940371087, + "learning_rate": 0.0001274216745018797, + "loss": 12.1697, + "step": 15765 + }, + { + "epoch": 0.8585217101277627, + "grad_norm": 0.6295977037542692, + "learning_rate": 0.00012741319415933934, + "loss": 12.4558, + "step": 15766 + }, + { + "epoch": 0.8585761641243457, + "grad_norm": 0.5456157031705988, + "learning_rate": 0.00012740471360362938, + "loss": 12.213, + "step": 15767 + }, + { + "epoch": 0.8586306181209287, + "grad_norm": 0.5722528479712246, + "learning_rate": 0.00012739623283481572, + "loss": 12.0039, + "step": 15768 + }, + { + "epoch": 0.8586850721175118, + "grad_norm": 0.5259898355911858, + "learning_rate": 0.0001273877518529644, + "loss": 12.1635, + "step": 15769 + }, + { + "epoch": 0.8587395261140948, + "grad_norm": 0.5325491294898629, + "learning_rate": 0.00012737927065814127, + "loss": 12.216, + "step": 15770 + }, + { + "epoch": 0.8587939801106778, + "grad_norm": 0.5831718666607144, + "learning_rate": 0.00012737078925041244, + "loss": 12.2157, + "step": 15771 + }, + { + "epoch": 0.8588484341072607, + "grad_norm": 0.5362693585664796, + "learning_rate": 0.0001273623076298437, + "loss": 12.0752, + "step": 15772 + }, + { + "epoch": 0.8589028881038437, + "grad_norm": 0.5317601893405478, + "learning_rate": 0.00012735382579650106, + "loss": 12.1272, + "step": 15773 + }, + { + "epoch": 0.8589573421004267, + "grad_norm": 0.5385207556616182, + "learning_rate": 0.0001273453437504505, + "loss": 12.2153, + "step": 15774 + }, + { + "epoch": 0.8590117960970098, + "grad_norm": 0.5966096170591533, + "learning_rate": 0.00012733686149175795, + "loss": 12.3078, + "step": 15775 + }, + { + "epoch": 0.8590662500935928, + "grad_norm": 0.5946618340368106, + "learning_rate": 0.00012732837902048943, + "loss": 12.1621, + "step": 15776 + }, + { + "epoch": 0.8591207040901758, + "grad_norm": 0.5682263517375123, + "learning_rate": 0.00012731989633671078, + "loss": 12.2022, + "step": 15777 + }, + { + "epoch": 0.8591751580867588, + "grad_norm": 0.6371370179585477, + "learning_rate": 0.00012731141344048808, + "loss": 12.2738, + "step": 15778 + }, + { + "epoch": 0.8592296120833418, + "grad_norm": 0.562737486394942, + "learning_rate": 0.00012730293033188722, + "loss": 12.2651, + "step": 15779 + }, + { + "epoch": 0.8592840660799248, + "grad_norm": 0.5862226570010126, + "learning_rate": 0.0001272944470109742, + "loss": 12.2035, + "step": 15780 + }, + { + "epoch": 0.8593385200765079, + "grad_norm": 0.5991217322704121, + "learning_rate": 0.000127285963477815, + "loss": 12.213, + "step": 15781 + }, + { + "epoch": 0.8593929740730909, + "grad_norm": 0.5343548554741508, + "learning_rate": 0.00012727747973247558, + "loss": 12.1085, + "step": 15782 + }, + { + "epoch": 0.8594474280696739, + "grad_norm": 0.6527294044364248, + "learning_rate": 0.0001272689957750219, + "loss": 12.2033, + "step": 15783 + }, + { + "epoch": 0.8595018820662569, + "grad_norm": 0.5903937465005267, + "learning_rate": 0.0001272605116055199, + "loss": 12.221, + "step": 15784 + }, + { + "epoch": 0.8595563360628399, + "grad_norm": 0.5348246661864136, + "learning_rate": 0.00012725202722403561, + "loss": 12.1709, + "step": 15785 + }, + { + "epoch": 0.8596107900594229, + "grad_norm": 0.5726522511258787, + "learning_rate": 0.000127243542630635, + "loss": 12.1589, + "step": 15786 + }, + { + "epoch": 0.859665244056006, + "grad_norm": 0.5883931333080112, + "learning_rate": 0.00012723505782538403, + "loss": 12.1653, + "step": 15787 + }, + { + "epoch": 0.859719698052589, + "grad_norm": 0.5167501422138584, + "learning_rate": 0.00012722657280834866, + "loss": 12.1629, + "step": 15788 + }, + { + "epoch": 0.859774152049172, + "grad_norm": 0.5965163354165692, + "learning_rate": 0.00012721808757959493, + "loss": 12.1697, + "step": 15789 + }, + { + "epoch": 0.859828606045755, + "grad_norm": 0.6206399193062977, + "learning_rate": 0.00012720960213918875, + "loss": 12.1275, + "step": 15790 + }, + { + "epoch": 0.8598830600423379, + "grad_norm": 0.5041028697890138, + "learning_rate": 0.00012720111648719618, + "loss": 11.9535, + "step": 15791 + }, + { + "epoch": 0.859937514038921, + "grad_norm": 0.5346675298940814, + "learning_rate": 0.0001271926306236831, + "loss": 12.1904, + "step": 15792 + }, + { + "epoch": 0.859991968035504, + "grad_norm": 0.6794951925620087, + "learning_rate": 0.00012718414454871563, + "loss": 12.2198, + "step": 15793 + }, + { + "epoch": 0.860046422032087, + "grad_norm": 0.5376773252891305, + "learning_rate": 0.00012717565826235967, + "loss": 12.196, + "step": 15794 + }, + { + "epoch": 0.86010087602867, + "grad_norm": 0.572556725984343, + "learning_rate": 0.00012716717176468125, + "loss": 12.1767, + "step": 15795 + }, + { + "epoch": 0.860155330025253, + "grad_norm": 0.5859922266043762, + "learning_rate": 0.00012715868505574633, + "loss": 12.2814, + "step": 15796 + }, + { + "epoch": 0.860209784021836, + "grad_norm": 0.5422575776659524, + "learning_rate": 0.00012715019813562092, + "loss": 12.1301, + "step": 15797 + }, + { + "epoch": 0.8602642380184191, + "grad_norm": 0.563991747041213, + "learning_rate": 0.00012714171100437101, + "loss": 12.2544, + "step": 15798 + }, + { + "epoch": 0.8603186920150021, + "grad_norm": 0.5510860391557669, + "learning_rate": 0.00012713322366206262, + "loss": 12.1942, + "step": 15799 + }, + { + "epoch": 0.8603731460115851, + "grad_norm": 0.5276618685659437, + "learning_rate": 0.00012712473610876173, + "loss": 12.1224, + "step": 15800 + }, + { + "epoch": 0.8604276000081681, + "grad_norm": 0.5539203820098891, + "learning_rate": 0.00012711624834453434, + "loss": 12.2318, + "step": 15801 + }, + { + "epoch": 0.8604820540047511, + "grad_norm": 0.5027654839906905, + "learning_rate": 0.00012710776036944644, + "loss": 12.1669, + "step": 15802 + }, + { + "epoch": 0.8605365080013341, + "grad_norm": 0.5413982573232451, + "learning_rate": 0.00012709927218356408, + "loss": 12.1924, + "step": 15803 + }, + { + "epoch": 0.8605909619979172, + "grad_norm": 0.5670345239907993, + "learning_rate": 0.0001270907837869532, + "loss": 12.2394, + "step": 15804 + }, + { + "epoch": 0.8606454159945002, + "grad_norm": 0.6040295601424962, + "learning_rate": 0.00012708229517967987, + "loss": 12.2011, + "step": 15805 + }, + { + "epoch": 0.8606998699910832, + "grad_norm": 0.5927785994065459, + "learning_rate": 0.00012707380636181004, + "loss": 12.2454, + "step": 15806 + }, + { + "epoch": 0.8607543239876662, + "grad_norm": 0.6117346037164473, + "learning_rate": 0.0001270653173334098, + "loss": 12.3725, + "step": 15807 + }, + { + "epoch": 0.8608087779842492, + "grad_norm": 0.5259833447062889, + "learning_rate": 0.0001270568280945451, + "loss": 12.1544, + "step": 15808 + }, + { + "epoch": 0.8608632319808321, + "grad_norm": 0.5304112166595931, + "learning_rate": 0.00012704833864528195, + "loss": 12.1403, + "step": 15809 + }, + { + "epoch": 0.8609176859774152, + "grad_norm": 0.5797000367923341, + "learning_rate": 0.00012703984898568637, + "loss": 12.182, + "step": 15810 + }, + { + "epoch": 0.8609721399739982, + "grad_norm": 0.6167607347967838, + "learning_rate": 0.0001270313591158244, + "loss": 12.2032, + "step": 15811 + }, + { + "epoch": 0.8610265939705812, + "grad_norm": 0.6009461728701405, + "learning_rate": 0.00012702286903576207, + "loss": 12.2631, + "step": 15812 + }, + { + "epoch": 0.8610810479671642, + "grad_norm": 0.5570489244263896, + "learning_rate": 0.00012701437874556537, + "loss": 12.2562, + "step": 15813 + }, + { + "epoch": 0.8611355019637472, + "grad_norm": 0.5456738299269331, + "learning_rate": 0.0001270058882453003, + "loss": 12.1073, + "step": 15814 + }, + { + "epoch": 0.8611899559603302, + "grad_norm": 0.5308395603172099, + "learning_rate": 0.00012699739753503295, + "loss": 12.3165, + "step": 15815 + }, + { + "epoch": 0.8612444099569133, + "grad_norm": 0.5629932726871376, + "learning_rate": 0.0001269889066148293, + "loss": 12.0754, + "step": 15816 + }, + { + "epoch": 0.8612988639534963, + "grad_norm": 0.5583612163321409, + "learning_rate": 0.00012698041548475539, + "loss": 12.055, + "step": 15817 + }, + { + "epoch": 0.8613533179500793, + "grad_norm": 0.5271745262645791, + "learning_rate": 0.00012697192414487724, + "loss": 12.2464, + "step": 15818 + }, + { + "epoch": 0.8614077719466623, + "grad_norm": 0.7819952104852348, + "learning_rate": 0.00012696343259526087, + "loss": 12.2299, + "step": 15819 + }, + { + "epoch": 0.8614622259432453, + "grad_norm": 0.5765961651972086, + "learning_rate": 0.00012695494083597234, + "loss": 12.0144, + "step": 15820 + }, + { + "epoch": 0.8615166799398284, + "grad_norm": 0.5198545087241869, + "learning_rate": 0.00012694644886707766, + "loss": 12.0843, + "step": 15821 + }, + { + "epoch": 0.8615711339364114, + "grad_norm": 0.5480677269766174, + "learning_rate": 0.0001269379566886429, + "loss": 12.1978, + "step": 15822 + }, + { + "epoch": 0.8616255879329944, + "grad_norm": 0.5826026538724887, + "learning_rate": 0.00012692946430073408, + "loss": 12.1008, + "step": 15823 + }, + { + "epoch": 0.8616800419295774, + "grad_norm": 0.6127132481854947, + "learning_rate": 0.00012692097170341718, + "loss": 12.0893, + "step": 15824 + }, + { + "epoch": 0.8617344959261604, + "grad_norm": 0.5628802050208016, + "learning_rate": 0.0001269124788967583, + "loss": 12.2108, + "step": 15825 + }, + { + "epoch": 0.8617889499227434, + "grad_norm": 0.5169071358368816, + "learning_rate": 0.00012690398588082347, + "loss": 12.163, + "step": 15826 + }, + { + "epoch": 0.8618434039193265, + "grad_norm": 0.6288968216432084, + "learning_rate": 0.00012689549265567878, + "loss": 12.2239, + "step": 15827 + }, + { + "epoch": 0.8618978579159094, + "grad_norm": 0.5522019003477887, + "learning_rate": 0.0001268869992213902, + "loss": 12.232, + "step": 15828 + }, + { + "epoch": 0.8619523119124924, + "grad_norm": 0.560181845302788, + "learning_rate": 0.0001268785055780238, + "loss": 12.1887, + "step": 15829 + }, + { + "epoch": 0.8620067659090754, + "grad_norm": 0.6260370544594249, + "learning_rate": 0.0001268700117256456, + "loss": 12.1765, + "step": 15830 + }, + { + "epoch": 0.8620612199056584, + "grad_norm": 0.5681163602908822, + "learning_rate": 0.00012686151766432173, + "loss": 12.164, + "step": 15831 + }, + { + "epoch": 0.8621156739022414, + "grad_norm": 0.5935928659798232, + "learning_rate": 0.0001268530233941182, + "loss": 12.1859, + "step": 15832 + }, + { + "epoch": 0.8621701278988245, + "grad_norm": 0.5675365643912833, + "learning_rate": 0.00012684452891510104, + "loss": 12.1672, + "step": 15833 + }, + { + "epoch": 0.8622245818954075, + "grad_norm": 0.6448365470006007, + "learning_rate": 0.00012683603422733631, + "loss": 12.2005, + "step": 15834 + }, + { + "epoch": 0.8622790358919905, + "grad_norm": 0.5880991860675321, + "learning_rate": 0.0001268275393308901, + "loss": 12.088, + "step": 15835 + }, + { + "epoch": 0.8623334898885735, + "grad_norm": 0.5695140928744413, + "learning_rate": 0.0001268190442258284, + "loss": 12.0658, + "step": 15836 + }, + { + "epoch": 0.8623879438851565, + "grad_norm": 0.5976367558594671, + "learning_rate": 0.00012681054891221737, + "loss": 12.1741, + "step": 15837 + }, + { + "epoch": 0.8624423978817395, + "grad_norm": 0.588921568249969, + "learning_rate": 0.000126802053390123, + "loss": 12.1703, + "step": 15838 + }, + { + "epoch": 0.8624968518783226, + "grad_norm": 0.5867597648512988, + "learning_rate": 0.00012679355765961135, + "loss": 12.2174, + "step": 15839 + }, + { + "epoch": 0.8625513058749056, + "grad_norm": 0.6466506954919735, + "learning_rate": 0.00012678506172074852, + "loss": 12.3293, + "step": 15840 + }, + { + "epoch": 0.8626057598714886, + "grad_norm": 0.5361124196749647, + "learning_rate": 0.00012677656557360053, + "loss": 12.1079, + "step": 15841 + }, + { + "epoch": 0.8626602138680716, + "grad_norm": 0.6141638826558271, + "learning_rate": 0.00012676806921823352, + "loss": 12.2044, + "step": 15842 + }, + { + "epoch": 0.8627146678646546, + "grad_norm": 0.5573530110558617, + "learning_rate": 0.0001267595726547135, + "loss": 12.0857, + "step": 15843 + }, + { + "epoch": 0.8627691218612376, + "grad_norm": 0.5294474626887843, + "learning_rate": 0.00012675107588310653, + "loss": 12.1398, + "step": 15844 + }, + { + "epoch": 0.8628235758578207, + "grad_norm": 0.547586828976658, + "learning_rate": 0.00012674257890347873, + "loss": 12.2056, + "step": 15845 + }, + { + "epoch": 0.8628780298544036, + "grad_norm": 0.5949898096535011, + "learning_rate": 0.00012673408171589616, + "loss": 12.357, + "step": 15846 + }, + { + "epoch": 0.8629324838509866, + "grad_norm": 0.6332146130789029, + "learning_rate": 0.00012672558432042487, + "loss": 12.138, + "step": 15847 + }, + { + "epoch": 0.8629869378475696, + "grad_norm": 0.5961911285767854, + "learning_rate": 0.00012671708671713097, + "loss": 12.1459, + "step": 15848 + }, + { + "epoch": 0.8630413918441526, + "grad_norm": 0.5387073458048571, + "learning_rate": 0.00012670858890608048, + "loss": 12.1757, + "step": 15849 + }, + { + "epoch": 0.8630958458407356, + "grad_norm": 0.5953776124197804, + "learning_rate": 0.00012670009088733955, + "loss": 12.2023, + "step": 15850 + }, + { + "epoch": 0.8631502998373187, + "grad_norm": 0.5234957418215699, + "learning_rate": 0.00012669159266097426, + "loss": 12.1306, + "step": 15851 + }, + { + "epoch": 0.8632047538339017, + "grad_norm": 0.6395342720574844, + "learning_rate": 0.00012668309422705063, + "loss": 12.3002, + "step": 15852 + }, + { + "epoch": 0.8632592078304847, + "grad_norm": 0.5452036442524539, + "learning_rate": 0.0001266745955856348, + "loss": 12.1587, + "step": 15853 + }, + { + "epoch": 0.8633136618270677, + "grad_norm": 0.5731158621347431, + "learning_rate": 0.0001266660967367929, + "loss": 12.2635, + "step": 15854 + }, + { + "epoch": 0.8633681158236507, + "grad_norm": 0.5685607595502273, + "learning_rate": 0.00012665759768059085, + "loss": 12.2997, + "step": 15855 + }, + { + "epoch": 0.8634225698202338, + "grad_norm": 0.578836813347422, + "learning_rate": 0.00012664909841709495, + "loss": 12.0963, + "step": 15856 + }, + { + "epoch": 0.8634770238168168, + "grad_norm": 0.5947050777966181, + "learning_rate": 0.00012664059894637112, + "loss": 12.2036, + "step": 15857 + }, + { + "epoch": 0.8635314778133998, + "grad_norm": 0.5946116539132492, + "learning_rate": 0.00012663209926848555, + "loss": 12.0137, + "step": 15858 + }, + { + "epoch": 0.8635859318099828, + "grad_norm": 0.5032585307333325, + "learning_rate": 0.00012662359938350433, + "loss": 12.1626, + "step": 15859 + }, + { + "epoch": 0.8636403858065658, + "grad_norm": 0.5593607639097642, + "learning_rate": 0.00012661509929149352, + "loss": 12.0341, + "step": 15860 + }, + { + "epoch": 0.8636948398031488, + "grad_norm": 0.6007704384398124, + "learning_rate": 0.00012660659899251924, + "loss": 12.3132, + "step": 15861 + }, + { + "epoch": 0.8637492937997319, + "grad_norm": 0.6161230813975721, + "learning_rate": 0.00012659809848664756, + "loss": 12.0831, + "step": 15862 + }, + { + "epoch": 0.8638037477963149, + "grad_norm": 0.5722203566505186, + "learning_rate": 0.00012658959777394463, + "loss": 12.2513, + "step": 15863 + }, + { + "epoch": 0.8638582017928979, + "grad_norm": 0.5794489421825233, + "learning_rate": 0.00012658109685447652, + "loss": 12.272, + "step": 15864 + }, + { + "epoch": 0.8639126557894808, + "grad_norm": 0.6115055730524654, + "learning_rate": 0.00012657259572830935, + "loss": 12.2261, + "step": 15865 + }, + { + "epoch": 0.8639671097860638, + "grad_norm": 0.5648488003671629, + "learning_rate": 0.0001265640943955092, + "loss": 12.2224, + "step": 15866 + }, + { + "epoch": 0.8640215637826468, + "grad_norm": 0.5465945861616255, + "learning_rate": 0.00012655559285614225, + "loss": 12.189, + "step": 15867 + }, + { + "epoch": 0.8640760177792299, + "grad_norm": 0.6274956086793052, + "learning_rate": 0.00012654709111027448, + "loss": 12.1032, + "step": 15868 + }, + { + "epoch": 0.8641304717758129, + "grad_norm": 0.5903592400481897, + "learning_rate": 0.00012653858915797212, + "loss": 12.1017, + "step": 15869 + }, + { + "epoch": 0.8641849257723959, + "grad_norm": 0.5421409523757088, + "learning_rate": 0.00012653008699930123, + "loss": 12.1491, + "step": 15870 + }, + { + "epoch": 0.8642393797689789, + "grad_norm": 0.569210839807263, + "learning_rate": 0.00012652158463432795, + "loss": 12.2195, + "step": 15871 + }, + { + "epoch": 0.8642938337655619, + "grad_norm": 0.5722602728817883, + "learning_rate": 0.0001265130820631184, + "loss": 12.2284, + "step": 15872 + }, + { + "epoch": 0.8643482877621449, + "grad_norm": 0.6533167097267406, + "learning_rate": 0.00012650457928573862, + "loss": 12.2311, + "step": 15873 + }, + { + "epoch": 0.864402741758728, + "grad_norm": 0.578072245143734, + "learning_rate": 0.00012649607630225483, + "loss": 12.245, + "step": 15874 + }, + { + "epoch": 0.864457195755311, + "grad_norm": 0.5929833512597014, + "learning_rate": 0.00012648757311273308, + "loss": 12.24, + "step": 15875 + }, + { + "epoch": 0.864511649751894, + "grad_norm": 0.6360942032083976, + "learning_rate": 0.00012647906971723953, + "loss": 12.1463, + "step": 15876 + }, + { + "epoch": 0.864566103748477, + "grad_norm": 0.5783819182317385, + "learning_rate": 0.00012647056611584032, + "loss": 12.1567, + "step": 15877 + }, + { + "epoch": 0.86462055774506, + "grad_norm": 0.6337107565036638, + "learning_rate": 0.0001264620623086015, + "loss": 12.234, + "step": 15878 + }, + { + "epoch": 0.864675011741643, + "grad_norm": 0.5829480172215896, + "learning_rate": 0.00012645355829558926, + "loss": 12.1982, + "step": 15879 + }, + { + "epoch": 0.8647294657382261, + "grad_norm": 0.5889035316620181, + "learning_rate": 0.00012644505407686973, + "loss": 12.2011, + "step": 15880 + }, + { + "epoch": 0.8647839197348091, + "grad_norm": 0.5656346434970699, + "learning_rate": 0.000126436549652509, + "loss": 12.2205, + "step": 15881 + }, + { + "epoch": 0.864838373731392, + "grad_norm": 0.6134097132972022, + "learning_rate": 0.00012642804502257328, + "loss": 12.0625, + "step": 15882 + }, + { + "epoch": 0.864892827727975, + "grad_norm": 0.555717845222476, + "learning_rate": 0.00012641954018712863, + "loss": 12.1802, + "step": 15883 + }, + { + "epoch": 0.864947281724558, + "grad_norm": 0.6775557458447009, + "learning_rate": 0.00012641103514624116, + "loss": 12.2947, + "step": 15884 + }, + { + "epoch": 0.865001735721141, + "grad_norm": 0.6434186072760585, + "learning_rate": 0.0001264025298999771, + "loss": 12.3112, + "step": 15885 + }, + { + "epoch": 0.8650561897177241, + "grad_norm": 0.5606701802164176, + "learning_rate": 0.0001263940244484025, + "loss": 12.1202, + "step": 15886 + }, + { + "epoch": 0.8651106437143071, + "grad_norm": 0.5973302884811093, + "learning_rate": 0.00012638551879158358, + "loss": 12.264, + "step": 15887 + }, + { + "epoch": 0.8651650977108901, + "grad_norm": 0.5938117907409824, + "learning_rate": 0.00012637701292958644, + "loss": 12.1346, + "step": 15888 + }, + { + "epoch": 0.8652195517074731, + "grad_norm": 0.5702681890575382, + "learning_rate": 0.0001263685068624772, + "loss": 12.1939, + "step": 15889 + }, + { + "epoch": 0.8652740057040561, + "grad_norm": 0.5664421468399776, + "learning_rate": 0.000126360000590322, + "loss": 12.1546, + "step": 15890 + }, + { + "epoch": 0.8653284597006392, + "grad_norm": 0.5531560334769265, + "learning_rate": 0.00012635149411318705, + "loss": 12.2315, + "step": 15891 + }, + { + "epoch": 0.8653829136972222, + "grad_norm": 0.5814574553270802, + "learning_rate": 0.0001263429874311385, + "loss": 12.0902, + "step": 15892 + }, + { + "epoch": 0.8654373676938052, + "grad_norm": 0.5422698332386978, + "learning_rate": 0.00012633448054424242, + "loss": 12.0986, + "step": 15893 + }, + { + "epoch": 0.8654918216903882, + "grad_norm": 0.5539154433284016, + "learning_rate": 0.000126325973452565, + "loss": 12.1193, + "step": 15894 + }, + { + "epoch": 0.8655462756869712, + "grad_norm": 0.5322445342023846, + "learning_rate": 0.0001263174661561724, + "loss": 12.1351, + "step": 15895 + }, + { + "epoch": 0.8656007296835542, + "grad_norm": 0.6011871815047078, + "learning_rate": 0.00012630895865513075, + "loss": 12.2558, + "step": 15896 + }, + { + "epoch": 0.8656551836801373, + "grad_norm": 0.5421870278380945, + "learning_rate": 0.00012630045094950626, + "loss": 12.0785, + "step": 15897 + }, + { + "epoch": 0.8657096376767203, + "grad_norm": 0.5153219346986837, + "learning_rate": 0.00012629194303936508, + "loss": 12.0322, + "step": 15898 + }, + { + "epoch": 0.8657640916733033, + "grad_norm": 0.5698421826144286, + "learning_rate": 0.0001262834349247733, + "loss": 12.0598, + "step": 15899 + }, + { + "epoch": 0.8658185456698863, + "grad_norm": 0.5698797197970296, + "learning_rate": 0.00012627492660579711, + "loss": 12.1255, + "step": 15900 + }, + { + "epoch": 0.8658729996664692, + "grad_norm": 0.5740622128057058, + "learning_rate": 0.0001262664180825027, + "loss": 12.1623, + "step": 15901 + }, + { + "epoch": 0.8659274536630522, + "grad_norm": 0.6295996044812211, + "learning_rate": 0.0001262579093549562, + "loss": 12.4251, + "step": 15902 + }, + { + "epoch": 0.8659819076596353, + "grad_norm": 0.6189536561994822, + "learning_rate": 0.0001262494004232238, + "loss": 12.2374, + "step": 15903 + }, + { + "epoch": 0.8660363616562183, + "grad_norm": 0.5553251154063896, + "learning_rate": 0.0001262408912873717, + "loss": 12.1977, + "step": 15904 + }, + { + "epoch": 0.8660908156528013, + "grad_norm": 0.5551849578424735, + "learning_rate": 0.000126232381947466, + "loss": 12.112, + "step": 15905 + }, + { + "epoch": 0.8661452696493843, + "grad_norm": 0.6238235130029314, + "learning_rate": 0.0001262238724035729, + "loss": 12.141, + "step": 15906 + }, + { + "epoch": 0.8661997236459673, + "grad_norm": 0.49946598932732883, + "learning_rate": 0.00012621536265575856, + "loss": 12.1029, + "step": 15907 + }, + { + "epoch": 0.8662541776425503, + "grad_norm": 0.645038326550859, + "learning_rate": 0.00012620685270408916, + "loss": 12.1514, + "step": 15908 + }, + { + "epoch": 0.8663086316391334, + "grad_norm": 0.584876027405954, + "learning_rate": 0.0001261983425486309, + "loss": 12.2783, + "step": 15909 + }, + { + "epoch": 0.8663630856357164, + "grad_norm": 0.5779854503830523, + "learning_rate": 0.00012618983218944994, + "loss": 12.2156, + "step": 15910 + }, + { + "epoch": 0.8664175396322994, + "grad_norm": 0.5737317733374087, + "learning_rate": 0.00012618132162661242, + "loss": 12.2477, + "step": 15911 + }, + { + "epoch": 0.8664719936288824, + "grad_norm": 0.5514906059361818, + "learning_rate": 0.00012617281086018458, + "loss": 12.2409, + "step": 15912 + }, + { + "epoch": 0.8665264476254654, + "grad_norm": 0.6499006968948686, + "learning_rate": 0.00012616429989023258, + "loss": 12.3086, + "step": 15913 + }, + { + "epoch": 0.8665809016220484, + "grad_norm": 0.5509850457484902, + "learning_rate": 0.00012615578871682257, + "loss": 12.1739, + "step": 15914 + }, + { + "epoch": 0.8666353556186315, + "grad_norm": 0.6157756457540347, + "learning_rate": 0.00012614727734002075, + "loss": 11.9785, + "step": 15915 + }, + { + "epoch": 0.8666898096152145, + "grad_norm": 0.5260273113757105, + "learning_rate": 0.00012613876575989335, + "loss": 12.1181, + "step": 15916 + }, + { + "epoch": 0.8667442636117975, + "grad_norm": 0.5745111404782917, + "learning_rate": 0.00012613025397650653, + "loss": 12.21, + "step": 15917 + }, + { + "epoch": 0.8667987176083805, + "grad_norm": 0.601576928480999, + "learning_rate": 0.00012612174198992646, + "loss": 12.3353, + "step": 15918 + }, + { + "epoch": 0.8668531716049634, + "grad_norm": 0.7151657198245608, + "learning_rate": 0.00012611322980021932, + "loss": 12.2895, + "step": 15919 + }, + { + "epoch": 0.8669076256015464, + "grad_norm": 0.5150769140549022, + "learning_rate": 0.00012610471740745135, + "loss": 12.1851, + "step": 15920 + }, + { + "epoch": 0.8669620795981295, + "grad_norm": 0.6268954466952763, + "learning_rate": 0.0001260962048116887, + "loss": 12.1937, + "step": 15921 + }, + { + "epoch": 0.8670165335947125, + "grad_norm": 0.5102438388337576, + "learning_rate": 0.00012608769201299762, + "loss": 12.1903, + "step": 15922 + }, + { + "epoch": 0.8670709875912955, + "grad_norm": 0.6492952084312248, + "learning_rate": 0.00012607917901144423, + "loss": 12.3463, + "step": 15923 + }, + { + "epoch": 0.8671254415878785, + "grad_norm": 0.5412306855807728, + "learning_rate": 0.0001260706658070948, + "loss": 12.2514, + "step": 15924 + }, + { + "epoch": 0.8671798955844615, + "grad_norm": 0.5436226777635539, + "learning_rate": 0.00012606215240001549, + "loss": 12.1734, + "step": 15925 + }, + { + "epoch": 0.8672343495810446, + "grad_norm": 0.5435772683271166, + "learning_rate": 0.00012605363879027252, + "loss": 12.1831, + "step": 15926 + }, + { + "epoch": 0.8672888035776276, + "grad_norm": 0.5662856637085216, + "learning_rate": 0.00012604512497793211, + "loss": 12.2577, + "step": 15927 + }, + { + "epoch": 0.8673432575742106, + "grad_norm": 0.557901457511073, + "learning_rate": 0.0001260366109630604, + "loss": 12.1894, + "step": 15928 + }, + { + "epoch": 0.8673977115707936, + "grad_norm": 0.5852888269369317, + "learning_rate": 0.00012602809674572364, + "loss": 12.1945, + "step": 15929 + }, + { + "epoch": 0.8674521655673766, + "grad_norm": 0.532930528537664, + "learning_rate": 0.00012601958232598802, + "loss": 12.1464, + "step": 15930 + }, + { + "epoch": 0.8675066195639596, + "grad_norm": 0.5427560184485759, + "learning_rate": 0.00012601106770391982, + "loss": 12.1841, + "step": 15931 + }, + { + "epoch": 0.8675610735605427, + "grad_norm": 0.593897701522849, + "learning_rate": 0.0001260025528795852, + "loss": 12.1475, + "step": 15932 + }, + { + "epoch": 0.8676155275571257, + "grad_norm": 0.644566088488215, + "learning_rate": 0.0001259940378530503, + "loss": 12.2218, + "step": 15933 + }, + { + "epoch": 0.8676699815537087, + "grad_norm": 0.6236033276293929, + "learning_rate": 0.00012598552262438141, + "loss": 12.2827, + "step": 15934 + }, + { + "epoch": 0.8677244355502917, + "grad_norm": 0.5774315218311655, + "learning_rate": 0.00012597700719364476, + "loss": 12.2084, + "step": 15935 + }, + { + "epoch": 0.8677788895468747, + "grad_norm": 0.5348287784866989, + "learning_rate": 0.00012596849156090655, + "loss": 12.1903, + "step": 15936 + }, + { + "epoch": 0.8678333435434576, + "grad_norm": 0.5716130435917542, + "learning_rate": 0.00012595997572623302, + "loss": 12.1704, + "step": 15937 + }, + { + "epoch": 0.8678877975400408, + "grad_norm": 0.8535427189945483, + "learning_rate": 0.00012595145968969033, + "loss": 12.1884, + "step": 15938 + }, + { + "epoch": 0.8679422515366237, + "grad_norm": 0.5641547745325012, + "learning_rate": 0.00012594294345134473, + "loss": 12.2976, + "step": 15939 + }, + { + "epoch": 0.8679967055332067, + "grad_norm": 0.5748166546920048, + "learning_rate": 0.00012593442701126247, + "loss": 12.0867, + "step": 15940 + }, + { + "epoch": 0.8680511595297897, + "grad_norm": 0.5416784728172392, + "learning_rate": 0.00012592591036950974, + "loss": 12.2147, + "step": 15941 + }, + { + "epoch": 0.8681056135263727, + "grad_norm": 0.5389782673423665, + "learning_rate": 0.00012591739352615282, + "loss": 12.2652, + "step": 15942 + }, + { + "epoch": 0.8681600675229557, + "grad_norm": 0.5646252422136855, + "learning_rate": 0.0001259088764812579, + "loss": 12.2263, + "step": 15943 + }, + { + "epoch": 0.8682145215195388, + "grad_norm": 0.5201360681378036, + "learning_rate": 0.00012590035923489115, + "loss": 12.0908, + "step": 15944 + }, + { + "epoch": 0.8682689755161218, + "grad_norm": 0.5889451038920502, + "learning_rate": 0.00012589184178711887, + "loss": 12.0581, + "step": 15945 + }, + { + "epoch": 0.8683234295127048, + "grad_norm": 0.5434486158643616, + "learning_rate": 0.00012588332413800734, + "loss": 12.3294, + "step": 15946 + }, + { + "epoch": 0.8683778835092878, + "grad_norm": 0.5894799008113736, + "learning_rate": 0.0001258748062876227, + "loss": 12.3475, + "step": 15947 + }, + { + "epoch": 0.8684323375058708, + "grad_norm": 0.5559143802835625, + "learning_rate": 0.0001258662882360313, + "loss": 12.1807, + "step": 15948 + }, + { + "epoch": 0.8684867915024538, + "grad_norm": 0.5751187142046755, + "learning_rate": 0.00012585776998329923, + "loss": 12.2895, + "step": 15949 + }, + { + "epoch": 0.8685412454990369, + "grad_norm": 0.583166156976266, + "learning_rate": 0.0001258492515294928, + "loss": 12.374, + "step": 15950 + }, + { + "epoch": 0.8685956994956199, + "grad_norm": 0.5732070056996998, + "learning_rate": 0.0001258407328746783, + "loss": 12.1528, + "step": 15951 + }, + { + "epoch": 0.8686501534922029, + "grad_norm": 0.5332503377591635, + "learning_rate": 0.0001258322140189219, + "loss": 12.0977, + "step": 15952 + }, + { + "epoch": 0.8687046074887859, + "grad_norm": 0.5375694322894206, + "learning_rate": 0.0001258236949622899, + "loss": 12.1752, + "step": 15953 + }, + { + "epoch": 0.8687590614853689, + "grad_norm": 0.5557590566613545, + "learning_rate": 0.0001258151757048485, + "loss": 12.177, + "step": 15954 + }, + { + "epoch": 0.868813515481952, + "grad_norm": 0.5326035517730949, + "learning_rate": 0.00012580665624666395, + "loss": 12.1465, + "step": 15955 + }, + { + "epoch": 0.868867969478535, + "grad_norm": 0.5769864726318715, + "learning_rate": 0.0001257981365878025, + "loss": 12.2239, + "step": 15956 + }, + { + "epoch": 0.8689224234751179, + "grad_norm": 0.5476567244141276, + "learning_rate": 0.00012578961672833044, + "loss": 12.2517, + "step": 15957 + }, + { + "epoch": 0.8689768774717009, + "grad_norm": 0.591401878428727, + "learning_rate": 0.00012578109666831403, + "loss": 12.1768, + "step": 15958 + }, + { + "epoch": 0.8690313314682839, + "grad_norm": 0.5794924956390661, + "learning_rate": 0.00012577257640781944, + "loss": 12.2739, + "step": 15959 + }, + { + "epoch": 0.8690857854648669, + "grad_norm": 0.5852044421285363, + "learning_rate": 0.00012576405594691298, + "loss": 12.1281, + "step": 15960 + }, + { + "epoch": 0.86914023946145, + "grad_norm": 0.5417282944487868, + "learning_rate": 0.00012575553528566092, + "loss": 12.0221, + "step": 15961 + }, + { + "epoch": 0.869194693458033, + "grad_norm": 0.5253617600990087, + "learning_rate": 0.00012574701442412945, + "loss": 12.2346, + "step": 15962 + }, + { + "epoch": 0.869249147454616, + "grad_norm": 0.587901673765946, + "learning_rate": 0.00012573849336238496, + "loss": 12.1188, + "step": 15963 + }, + { + "epoch": 0.869303601451199, + "grad_norm": 0.5501555510980765, + "learning_rate": 0.00012572997210049354, + "loss": 12.2694, + "step": 15964 + }, + { + "epoch": 0.869358055447782, + "grad_norm": 0.5475981794367448, + "learning_rate": 0.00012572145063852161, + "loss": 12.2831, + "step": 15965 + }, + { + "epoch": 0.869412509444365, + "grad_norm": 0.5385544190151177, + "learning_rate": 0.00012571292897653534, + "loss": 12.2199, + "step": 15966 + }, + { + "epoch": 0.8694669634409481, + "grad_norm": 0.5752778166783943, + "learning_rate": 0.000125704407114601, + "loss": 12.2106, + "step": 15967 + }, + { + "epoch": 0.8695214174375311, + "grad_norm": 0.5854288021900113, + "learning_rate": 0.0001256958850527849, + "loss": 12.2438, + "step": 15968 + }, + { + "epoch": 0.8695758714341141, + "grad_norm": 0.5484390545530441, + "learning_rate": 0.00012568736279115332, + "loss": 12.211, + "step": 15969 + }, + { + "epoch": 0.8696303254306971, + "grad_norm": 0.5633289860451813, + "learning_rate": 0.00012567884032977245, + "loss": 12.2505, + "step": 15970 + }, + { + "epoch": 0.8696847794272801, + "grad_norm": 0.5497065293508597, + "learning_rate": 0.00012567031766870865, + "loss": 12.1282, + "step": 15971 + }, + { + "epoch": 0.8697392334238631, + "grad_norm": 0.5536133855620301, + "learning_rate": 0.00012566179480802812, + "loss": 12.1886, + "step": 15972 + }, + { + "epoch": 0.8697936874204462, + "grad_norm": 0.525991757190078, + "learning_rate": 0.0001256532717477972, + "loss": 12.1135, + "step": 15973 + }, + { + "epoch": 0.8698481414170292, + "grad_norm": 0.5349464762038328, + "learning_rate": 0.00012564474848808212, + "loss": 12.248, + "step": 15974 + }, + { + "epoch": 0.8699025954136121, + "grad_norm": 0.5385765964720809, + "learning_rate": 0.00012563622502894918, + "loss": 12.0118, + "step": 15975 + }, + { + "epoch": 0.8699570494101951, + "grad_norm": 0.559178233160023, + "learning_rate": 0.00012562770137046464, + "loss": 12.1902, + "step": 15976 + }, + { + "epoch": 0.8700115034067781, + "grad_norm": 0.5540507729228215, + "learning_rate": 0.00012561917751269483, + "loss": 12.1979, + "step": 15977 + }, + { + "epoch": 0.8700659574033611, + "grad_norm": 0.5140207796261256, + "learning_rate": 0.00012561065345570598, + "loss": 12.0578, + "step": 15978 + }, + { + "epoch": 0.8701204113999442, + "grad_norm": 0.5288952409117764, + "learning_rate": 0.00012560212919956437, + "loss": 12.2029, + "step": 15979 + }, + { + "epoch": 0.8701748653965272, + "grad_norm": 0.5725414765013682, + "learning_rate": 0.00012559360474433636, + "loss": 12.1941, + "step": 15980 + }, + { + "epoch": 0.8702293193931102, + "grad_norm": 0.5931801761743127, + "learning_rate": 0.00012558508009008812, + "loss": 12.2551, + "step": 15981 + }, + { + "epoch": 0.8702837733896932, + "grad_norm": 0.5256172262315336, + "learning_rate": 0.00012557655523688608, + "loss": 12.123, + "step": 15982 + }, + { + "epoch": 0.8703382273862762, + "grad_norm": 0.5238440893579909, + "learning_rate": 0.00012556803018479642, + "loss": 12.2338, + "step": 15983 + }, + { + "epoch": 0.8703926813828592, + "grad_norm": 0.5536812282601996, + "learning_rate": 0.00012555950493388547, + "loss": 12.2798, + "step": 15984 + }, + { + "epoch": 0.8704471353794423, + "grad_norm": 0.5504843169868991, + "learning_rate": 0.00012555097948421952, + "loss": 12.1371, + "step": 15985 + }, + { + "epoch": 0.8705015893760253, + "grad_norm": 0.5987785474860935, + "learning_rate": 0.00012554245383586488, + "loss": 12.3446, + "step": 15986 + }, + { + "epoch": 0.8705560433726083, + "grad_norm": 0.5325786338129379, + "learning_rate": 0.00012553392798888785, + "loss": 11.9497, + "step": 15987 + }, + { + "epoch": 0.8706104973691913, + "grad_norm": 0.5434399515345769, + "learning_rate": 0.00012552540194335466, + "loss": 12.2137, + "step": 15988 + }, + { + "epoch": 0.8706649513657743, + "grad_norm": 0.5620219933284507, + "learning_rate": 0.00012551687569933173, + "loss": 12.2886, + "step": 15989 + }, + { + "epoch": 0.8707194053623574, + "grad_norm": 0.5546283490990715, + "learning_rate": 0.00012550834925688525, + "loss": 12.1043, + "step": 15990 + }, + { + "epoch": 0.8707738593589404, + "grad_norm": 0.47488405181773846, + "learning_rate": 0.0001254998226160816, + "loss": 12.0254, + "step": 15991 + }, + { + "epoch": 0.8708283133555234, + "grad_norm": 0.5528150439905408, + "learning_rate": 0.000125491295776987, + "loss": 12.0765, + "step": 15992 + }, + { + "epoch": 0.8708827673521063, + "grad_norm": 0.5498831187915221, + "learning_rate": 0.0001254827687396679, + "loss": 12.0998, + "step": 15993 + }, + { + "epoch": 0.8709372213486893, + "grad_norm": 0.543385726227821, + "learning_rate": 0.00012547424150419044, + "loss": 12.2286, + "step": 15994 + }, + { + "epoch": 0.8709916753452723, + "grad_norm": 0.5621492505997553, + "learning_rate": 0.00012546571407062103, + "loss": 12.2143, + "step": 15995 + }, + { + "epoch": 0.8710461293418554, + "grad_norm": 0.5433169361833831, + "learning_rate": 0.00012545718643902594, + "loss": 12.2198, + "step": 15996 + }, + { + "epoch": 0.8711005833384384, + "grad_norm": 0.5223570553806087, + "learning_rate": 0.0001254486586094715, + "loss": 12.0896, + "step": 15997 + }, + { + "epoch": 0.8711550373350214, + "grad_norm": 0.5249600347028466, + "learning_rate": 0.00012544013058202405, + "loss": 12.1902, + "step": 15998 + }, + { + "epoch": 0.8712094913316044, + "grad_norm": 0.5305491138861335, + "learning_rate": 0.00012543160235674986, + "loss": 12.0778, + "step": 15999 + }, + { + "epoch": 0.8712639453281874, + "grad_norm": 0.5129323620281291, + "learning_rate": 0.00012542307393371526, + "loss": 12.1419, + "step": 16000 + }, + { + "epoch": 0.8713183993247704, + "grad_norm": 0.6005913292330536, + "learning_rate": 0.0001254145453129866, + "loss": 12.0793, + "step": 16001 + }, + { + "epoch": 0.8713728533213535, + "grad_norm": 0.5728093693687083, + "learning_rate": 0.00012540601649463015, + "loss": 12.123, + "step": 16002 + }, + { + "epoch": 0.8714273073179365, + "grad_norm": 0.5636253436070369, + "learning_rate": 0.00012539748747871228, + "loss": 12.1248, + "step": 16003 + }, + { + "epoch": 0.8714817613145195, + "grad_norm": 0.5975695198443635, + "learning_rate": 0.0001253889582652993, + "loss": 12.1997, + "step": 16004 + }, + { + "epoch": 0.8715362153111025, + "grad_norm": 0.5694835079399884, + "learning_rate": 0.00012538042885445745, + "loss": 12.2186, + "step": 16005 + }, + { + "epoch": 0.8715906693076855, + "grad_norm": 0.533297799927533, + "learning_rate": 0.00012537189924625316, + "loss": 12.2206, + "step": 16006 + }, + { + "epoch": 0.8716451233042685, + "grad_norm": 0.5422775699384693, + "learning_rate": 0.00012536336944075276, + "loss": 12.1786, + "step": 16007 + }, + { + "epoch": 0.8716995773008516, + "grad_norm": 0.5741067704232433, + "learning_rate": 0.00012535483943802253, + "loss": 12.2374, + "step": 16008 + }, + { + "epoch": 0.8717540312974346, + "grad_norm": 0.539661125867655, + "learning_rate": 0.0001253463092381288, + "loss": 12.1391, + "step": 16009 + }, + { + "epoch": 0.8718084852940176, + "grad_norm": 0.5816111108862853, + "learning_rate": 0.00012533777884113793, + "loss": 12.1623, + "step": 16010 + }, + { + "epoch": 0.8718629392906005, + "grad_norm": 0.5311156149067617, + "learning_rate": 0.00012532924824711623, + "loss": 12.2057, + "step": 16011 + }, + { + "epoch": 0.8719173932871835, + "grad_norm": 0.5295481418916593, + "learning_rate": 0.00012532071745613007, + "loss": 12.1391, + "step": 16012 + }, + { + "epoch": 0.8719718472837665, + "grad_norm": 0.6036077984497444, + "learning_rate": 0.00012531218646824577, + "loss": 12.1833, + "step": 16013 + }, + { + "epoch": 0.8720263012803496, + "grad_norm": 0.5358288325689889, + "learning_rate": 0.00012530365528352964, + "loss": 12.1514, + "step": 16014 + }, + { + "epoch": 0.8720807552769326, + "grad_norm": 0.6269518042813348, + "learning_rate": 0.00012529512390204808, + "loss": 12.2951, + "step": 16015 + }, + { + "epoch": 0.8721352092735156, + "grad_norm": 0.6034229536886009, + "learning_rate": 0.00012528659232386732, + "loss": 12.2408, + "step": 16016 + }, + { + "epoch": 0.8721896632700986, + "grad_norm": 0.5514941628900512, + "learning_rate": 0.00012527806054905382, + "loss": 12.0462, + "step": 16017 + }, + { + "epoch": 0.8722441172666816, + "grad_norm": 0.6823523013440215, + "learning_rate": 0.0001252695285776739, + "loss": 12.3112, + "step": 16018 + }, + { + "epoch": 0.8722985712632646, + "grad_norm": 0.6098942419908867, + "learning_rate": 0.0001252609964097939, + "loss": 12.175, + "step": 16019 + }, + { + "epoch": 0.8723530252598477, + "grad_norm": 0.5719599376070134, + "learning_rate": 0.00012525246404548011, + "loss": 12.1688, + "step": 16020 + }, + { + "epoch": 0.8724074792564307, + "grad_norm": 0.5623532728458146, + "learning_rate": 0.00012524393148479897, + "loss": 12.0794, + "step": 16021 + }, + { + "epoch": 0.8724619332530137, + "grad_norm": 0.5240231925996959, + "learning_rate": 0.0001252353987278167, + "loss": 12.1055, + "step": 16022 + }, + { + "epoch": 0.8725163872495967, + "grad_norm": 0.6506632285409836, + "learning_rate": 0.00012522686577459984, + "loss": 12.2739, + "step": 16023 + }, + { + "epoch": 0.8725708412461797, + "grad_norm": 0.5723476929353505, + "learning_rate": 0.00012521833262521458, + "loss": 12.1607, + "step": 16024 + }, + { + "epoch": 0.8726252952427628, + "grad_norm": 0.6015507472500625, + "learning_rate": 0.00012520979927972738, + "loss": 12.2483, + "step": 16025 + }, + { + "epoch": 0.8726797492393458, + "grad_norm": 0.5956510012054886, + "learning_rate": 0.00012520126573820452, + "loss": 12.2262, + "step": 16026 + }, + { + "epoch": 0.8727342032359288, + "grad_norm": 0.5526715767225209, + "learning_rate": 0.00012519273200071237, + "loss": 12.1711, + "step": 16027 + }, + { + "epoch": 0.8727886572325118, + "grad_norm": 0.5974678518446006, + "learning_rate": 0.00012518419806731735, + "loss": 12.2869, + "step": 16028 + }, + { + "epoch": 0.8728431112290947, + "grad_norm": 0.5520163993498713, + "learning_rate": 0.0001251756639380858, + "loss": 12.0081, + "step": 16029 + }, + { + "epoch": 0.8728975652256777, + "grad_norm": 0.5639711796095425, + "learning_rate": 0.00012516712961308402, + "loss": 12.2842, + "step": 16030 + }, + { + "epoch": 0.8729520192222608, + "grad_norm": 0.5580007563756508, + "learning_rate": 0.00012515859509237845, + "loss": 12.3199, + "step": 16031 + }, + { + "epoch": 0.8730064732188438, + "grad_norm": 0.6335581879602564, + "learning_rate": 0.0001251500603760354, + "loss": 12.2911, + "step": 16032 + }, + { + "epoch": 0.8730609272154268, + "grad_norm": 0.6078406349282465, + "learning_rate": 0.00012514152546412127, + "loss": 12.1284, + "step": 16033 + }, + { + "epoch": 0.8731153812120098, + "grad_norm": 0.5154839706557784, + "learning_rate": 0.00012513299035670246, + "loss": 12.1873, + "step": 16034 + }, + { + "epoch": 0.8731698352085928, + "grad_norm": 0.5187457942704194, + "learning_rate": 0.00012512445505384525, + "loss": 12.1551, + "step": 16035 + }, + { + "epoch": 0.8732242892051758, + "grad_norm": 0.6452981420258556, + "learning_rate": 0.00012511591955561608, + "loss": 12.188, + "step": 16036 + }, + { + "epoch": 0.8732787432017589, + "grad_norm": 0.6740558251184862, + "learning_rate": 0.00012510738386208132, + "loss": 12.3964, + "step": 16037 + }, + { + "epoch": 0.8733331971983419, + "grad_norm": 0.5663356963926833, + "learning_rate": 0.00012509884797330732, + "loss": 12.0535, + "step": 16038 + }, + { + "epoch": 0.8733876511949249, + "grad_norm": 0.5878177856957262, + "learning_rate": 0.00012509031188936046, + "loss": 12.0532, + "step": 16039 + }, + { + "epoch": 0.8734421051915079, + "grad_norm": 0.5952682212176018, + "learning_rate": 0.00012508177561030716, + "loss": 12.1387, + "step": 16040 + }, + { + "epoch": 0.8734965591880909, + "grad_norm": 0.5413794683777703, + "learning_rate": 0.00012507323913621375, + "loss": 12.1888, + "step": 16041 + }, + { + "epoch": 0.8735510131846739, + "grad_norm": 0.5450852611443711, + "learning_rate": 0.0001250647024671466, + "loss": 12.1893, + "step": 16042 + }, + { + "epoch": 0.873605467181257, + "grad_norm": 0.574623152064138, + "learning_rate": 0.00012505616560317217, + "loss": 12.2776, + "step": 16043 + }, + { + "epoch": 0.87365992117784, + "grad_norm": 0.4820315833893964, + "learning_rate": 0.00012504762854435676, + "loss": 12.0415, + "step": 16044 + }, + { + "epoch": 0.873714375174423, + "grad_norm": 0.5191268713251201, + "learning_rate": 0.0001250390912907668, + "loss": 12.2193, + "step": 16045 + }, + { + "epoch": 0.873768829171006, + "grad_norm": 0.5708885380831076, + "learning_rate": 0.00012503055384246867, + "loss": 12.1649, + "step": 16046 + }, + { + "epoch": 0.873823283167589, + "grad_norm": 0.626996981524449, + "learning_rate": 0.00012502201619952875, + "loss": 12.1362, + "step": 16047 + }, + { + "epoch": 0.8738777371641719, + "grad_norm": 0.582192853950474, + "learning_rate": 0.00012501347836201343, + "loss": 12.1748, + "step": 16048 + }, + { + "epoch": 0.873932191160755, + "grad_norm": 0.5500699566784814, + "learning_rate": 0.0001250049403299891, + "loss": 12.2157, + "step": 16049 + }, + { + "epoch": 0.873986645157338, + "grad_norm": 0.5405585547116991, + "learning_rate": 0.00012499640210352219, + "loss": 12.212, + "step": 16050 + }, + { + "epoch": 0.874041099153921, + "grad_norm": 0.5180377431941692, + "learning_rate": 0.00012498786368267905, + "loss": 12.1162, + "step": 16051 + }, + { + "epoch": 0.874095553150504, + "grad_norm": 0.5348153153227052, + "learning_rate": 0.0001249793250675261, + "loss": 12.1576, + "step": 16052 + }, + { + "epoch": 0.874150007147087, + "grad_norm": 0.604938726107424, + "learning_rate": 0.00012497078625812975, + "loss": 12.0198, + "step": 16053 + }, + { + "epoch": 0.87420446114367, + "grad_norm": 0.5422674776228027, + "learning_rate": 0.00012496224725455632, + "loss": 12.1241, + "step": 16054 + }, + { + "epoch": 0.8742589151402531, + "grad_norm": 0.5538596352364217, + "learning_rate": 0.0001249537080568723, + "loss": 12.0372, + "step": 16055 + }, + { + "epoch": 0.8743133691368361, + "grad_norm": 0.5431686420588868, + "learning_rate": 0.00012494516866514406, + "loss": 12.187, + "step": 16056 + }, + { + "epoch": 0.8743678231334191, + "grad_norm": 0.5386989318844304, + "learning_rate": 0.000124936629079438, + "loss": 12.2008, + "step": 16057 + }, + { + "epoch": 0.8744222771300021, + "grad_norm": 0.544736835791952, + "learning_rate": 0.00012492808929982056, + "loss": 12.3372, + "step": 16058 + }, + { + "epoch": 0.8744767311265851, + "grad_norm": 0.6666965240807943, + "learning_rate": 0.0001249195493263581, + "loss": 12.1557, + "step": 16059 + }, + { + "epoch": 0.8745311851231682, + "grad_norm": 0.5583615651695085, + "learning_rate": 0.00012491100915911702, + "loss": 12.3522, + "step": 16060 + }, + { + "epoch": 0.8745856391197512, + "grad_norm": 0.6220010193284105, + "learning_rate": 0.00012490246879816376, + "loss": 12.2049, + "step": 16061 + }, + { + "epoch": 0.8746400931163342, + "grad_norm": 0.5007739341726704, + "learning_rate": 0.00012489392824356475, + "loss": 12.1438, + "step": 16062 + }, + { + "epoch": 0.8746945471129172, + "grad_norm": 0.51788794478555, + "learning_rate": 0.0001248853874953864, + "loss": 12.1486, + "step": 16063 + }, + { + "epoch": 0.8747490011095002, + "grad_norm": 0.5350124014753489, + "learning_rate": 0.00012487684655369507, + "loss": 12.1837, + "step": 16064 + }, + { + "epoch": 0.8748034551060831, + "grad_norm": 0.5369649604490867, + "learning_rate": 0.00012486830541855718, + "loss": 12.2681, + "step": 16065 + }, + { + "epoch": 0.8748579091026663, + "grad_norm": 0.536904150078264, + "learning_rate": 0.0001248597640900392, + "loss": 12.1093, + "step": 16066 + }, + { + "epoch": 0.8749123630992492, + "grad_norm": 0.5389369443292675, + "learning_rate": 0.00012485122256820756, + "loss": 11.9681, + "step": 16067 + }, + { + "epoch": 0.8749668170958322, + "grad_norm": 0.5705666993112773, + "learning_rate": 0.00012484268085312863, + "loss": 12.1541, + "step": 16068 + }, + { + "epoch": 0.8750212710924152, + "grad_norm": 0.5060632652893851, + "learning_rate": 0.00012483413894486884, + "loss": 12.2575, + "step": 16069 + }, + { + "epoch": 0.8750757250889982, + "grad_norm": 0.5912434449471482, + "learning_rate": 0.00012482559684349461, + "loss": 12.1034, + "step": 16070 + }, + { + "epoch": 0.8751301790855812, + "grad_norm": 0.5795383894091256, + "learning_rate": 0.00012481705454907237, + "loss": 12.038, + "step": 16071 + }, + { + "epoch": 0.8751846330821643, + "grad_norm": 0.5638125410718151, + "learning_rate": 0.00012480851206166858, + "loss": 12.1461, + "step": 16072 + }, + { + "epoch": 0.8752390870787473, + "grad_norm": 0.5825779199294095, + "learning_rate": 0.00012479996938134964, + "loss": 12.0403, + "step": 16073 + }, + { + "epoch": 0.8752935410753303, + "grad_norm": 0.5298141174132485, + "learning_rate": 0.00012479142650818195, + "loss": 12.0994, + "step": 16074 + }, + { + "epoch": 0.8753479950719133, + "grad_norm": 0.6573123207917692, + "learning_rate": 0.00012478288344223198, + "loss": 12.3921, + "step": 16075 + }, + { + "epoch": 0.8754024490684963, + "grad_norm": 0.6068568486815071, + "learning_rate": 0.00012477434018356615, + "loss": 12.1627, + "step": 16076 + }, + { + "epoch": 0.8754569030650793, + "grad_norm": 0.609707243377987, + "learning_rate": 0.0001247657967322509, + "loss": 12.0332, + "step": 16077 + }, + { + "epoch": 0.8755113570616624, + "grad_norm": 0.5680797285820538, + "learning_rate": 0.00012475725308835268, + "loss": 12.1601, + "step": 16078 + }, + { + "epoch": 0.8755658110582454, + "grad_norm": 0.5526014221770983, + "learning_rate": 0.0001247487092519379, + "loss": 12.1961, + "step": 16079 + }, + { + "epoch": 0.8756202650548284, + "grad_norm": 0.6115334683613696, + "learning_rate": 0.00012474016522307302, + "loss": 12.2713, + "step": 16080 + }, + { + "epoch": 0.8756747190514114, + "grad_norm": 0.6858296789782814, + "learning_rate": 0.00012473162100182442, + "loss": 12.1809, + "step": 16081 + }, + { + "epoch": 0.8757291730479944, + "grad_norm": 0.5441928771376101, + "learning_rate": 0.0001247230765882586, + "loss": 12.1297, + "step": 16082 + }, + { + "epoch": 0.8757836270445774, + "grad_norm": 0.5270975788304015, + "learning_rate": 0.00012471453198244204, + "loss": 12.0124, + "step": 16083 + }, + { + "epoch": 0.8758380810411605, + "grad_norm": 0.5791745606955657, + "learning_rate": 0.0001247059871844411, + "loss": 12.0933, + "step": 16084 + }, + { + "epoch": 0.8758925350377434, + "grad_norm": 0.7093793546578929, + "learning_rate": 0.00012469744219432226, + "loss": 12.3378, + "step": 16085 + }, + { + "epoch": 0.8759469890343264, + "grad_norm": 0.5754791802733077, + "learning_rate": 0.00012468889701215197, + "loss": 12.1667, + "step": 16086 + }, + { + "epoch": 0.8760014430309094, + "grad_norm": 0.5710204967149822, + "learning_rate": 0.00012468035163799667, + "loss": 12.143, + "step": 16087 + }, + { + "epoch": 0.8760558970274924, + "grad_norm": 0.5749549445115179, + "learning_rate": 0.0001246718060719228, + "loss": 12.1046, + "step": 16088 + }, + { + "epoch": 0.8761103510240754, + "grad_norm": 0.5773812106423682, + "learning_rate": 0.00012466326031399688, + "loss": 12.2384, + "step": 16089 + }, + { + "epoch": 0.8761648050206585, + "grad_norm": 0.5467160672068934, + "learning_rate": 0.00012465471436428526, + "loss": 12.1804, + "step": 16090 + }, + { + "epoch": 0.8762192590172415, + "grad_norm": 0.5905830453636292, + "learning_rate": 0.00012464616822285447, + "loss": 12.0921, + "step": 16091 + }, + { + "epoch": 0.8762737130138245, + "grad_norm": 0.48989163227231247, + "learning_rate": 0.00012463762188977094, + "loss": 11.9986, + "step": 16092 + }, + { + "epoch": 0.8763281670104075, + "grad_norm": 0.6316447646107993, + "learning_rate": 0.0001246290753651011, + "loss": 12.1741, + "step": 16093 + }, + { + "epoch": 0.8763826210069905, + "grad_norm": 0.5579716652402554, + "learning_rate": 0.0001246205286489115, + "loss": 12.1777, + "step": 16094 + }, + { + "epoch": 0.8764370750035736, + "grad_norm": 0.5929747326262457, + "learning_rate": 0.00012461198174126852, + "loss": 12.3101, + "step": 16095 + }, + { + "epoch": 0.8764915290001566, + "grad_norm": 0.5366363838568602, + "learning_rate": 0.00012460343464223864, + "loss": 12.1976, + "step": 16096 + }, + { + "epoch": 0.8765459829967396, + "grad_norm": 0.5221075684624025, + "learning_rate": 0.00012459488735188832, + "loss": 11.9716, + "step": 16097 + }, + { + "epoch": 0.8766004369933226, + "grad_norm": 0.5476321426506422, + "learning_rate": 0.000124586339870284, + "loss": 12.1252, + "step": 16098 + }, + { + "epoch": 0.8766548909899056, + "grad_norm": 0.5792030541270509, + "learning_rate": 0.0001245777921974922, + "loss": 12.2472, + "step": 16099 + }, + { + "epoch": 0.8767093449864886, + "grad_norm": 0.6985294307629022, + "learning_rate": 0.0001245692443335794, + "loss": 12.1955, + "step": 16100 + }, + { + "epoch": 0.8767637989830717, + "grad_norm": 0.5577872331396814, + "learning_rate": 0.000124560696278612, + "loss": 12.0629, + "step": 16101 + }, + { + "epoch": 0.8768182529796547, + "grad_norm": 0.5853442110348637, + "learning_rate": 0.00012455214803265652, + "loss": 12.1465, + "step": 16102 + }, + { + "epoch": 0.8768727069762376, + "grad_norm": 0.5525518738121004, + "learning_rate": 0.0001245435995957794, + "loss": 12.1607, + "step": 16103 + }, + { + "epoch": 0.8769271609728206, + "grad_norm": 0.5284200111715768, + "learning_rate": 0.0001245350509680472, + "loss": 12.204, + "step": 16104 + }, + { + "epoch": 0.8769816149694036, + "grad_norm": 0.6105951133886501, + "learning_rate": 0.00012452650214952624, + "loss": 12.2329, + "step": 16105 + }, + { + "epoch": 0.8770360689659866, + "grad_norm": 0.538846310148232, + "learning_rate": 0.00012451795314028313, + "loss": 11.8713, + "step": 16106 + }, + { + "epoch": 0.8770905229625697, + "grad_norm": 0.5356062524521141, + "learning_rate": 0.0001245094039403843, + "loss": 12.1835, + "step": 16107 + }, + { + "epoch": 0.8771449769591527, + "grad_norm": 0.5572700303346961, + "learning_rate": 0.00012450085454989625, + "loss": 12.0499, + "step": 16108 + }, + { + "epoch": 0.8771994309557357, + "grad_norm": 0.6277722706686351, + "learning_rate": 0.00012449230496888543, + "loss": 12.2874, + "step": 16109 + }, + { + "epoch": 0.8772538849523187, + "grad_norm": 0.5989562610312907, + "learning_rate": 0.00012448375519741835, + "loss": 12.2477, + "step": 16110 + }, + { + "epoch": 0.8773083389489017, + "grad_norm": 0.591640548205753, + "learning_rate": 0.00012447520523556146, + "loss": 12.2729, + "step": 16111 + }, + { + "epoch": 0.8773627929454847, + "grad_norm": 0.5043005990234394, + "learning_rate": 0.00012446665508338128, + "loss": 12.0538, + "step": 16112 + }, + { + "epoch": 0.8774172469420678, + "grad_norm": 0.5474216805855566, + "learning_rate": 0.0001244581047409443, + "loss": 12.2178, + "step": 16113 + }, + { + "epoch": 0.8774717009386508, + "grad_norm": 0.5857427999025753, + "learning_rate": 0.000124449554208317, + "loss": 12.162, + "step": 16114 + }, + { + "epoch": 0.8775261549352338, + "grad_norm": 0.5805090016549473, + "learning_rate": 0.00012444100348556585, + "loss": 12.2238, + "step": 16115 + }, + { + "epoch": 0.8775806089318168, + "grad_norm": 0.6105765736875828, + "learning_rate": 0.00012443245257275735, + "loss": 12.2629, + "step": 16116 + }, + { + "epoch": 0.8776350629283998, + "grad_norm": 0.518552692809577, + "learning_rate": 0.000124423901469958, + "loss": 12.0606, + "step": 16117 + }, + { + "epoch": 0.8776895169249828, + "grad_norm": 0.5763076452780304, + "learning_rate": 0.00012441535017723433, + "loss": 12.1436, + "step": 16118 + }, + { + "epoch": 0.8777439709215659, + "grad_norm": 0.5341231124511039, + "learning_rate": 0.0001244067986946528, + "loss": 12.191, + "step": 16119 + }, + { + "epoch": 0.8777984249181489, + "grad_norm": 0.5500435671005613, + "learning_rate": 0.00012439824702227987, + "loss": 12.2206, + "step": 16120 + }, + { + "epoch": 0.8778528789147318, + "grad_norm": 0.5525902266037133, + "learning_rate": 0.0001243896951601821, + "loss": 12.1845, + "step": 16121 + }, + { + "epoch": 0.8779073329113148, + "grad_norm": 0.6192969566452226, + "learning_rate": 0.00012438114310842598, + "loss": 12.2346, + "step": 16122 + }, + { + "epoch": 0.8779617869078978, + "grad_norm": 0.5553910515782029, + "learning_rate": 0.000124372590867078, + "loss": 12.2288, + "step": 16123 + }, + { + "epoch": 0.8780162409044809, + "grad_norm": 0.5477482552336438, + "learning_rate": 0.00012436403843620468, + "loss": 12.2031, + "step": 16124 + }, + { + "epoch": 0.8780706949010639, + "grad_norm": 0.5097790965963074, + "learning_rate": 0.00012435548581587246, + "loss": 12.1158, + "step": 16125 + }, + { + "epoch": 0.8781251488976469, + "grad_norm": 0.6096381870021689, + "learning_rate": 0.00012434693300614793, + "loss": 12.2922, + "step": 16126 + }, + { + "epoch": 0.8781796028942299, + "grad_norm": 0.5502228426678689, + "learning_rate": 0.00012433838000709755, + "loss": 12.1405, + "step": 16127 + }, + { + "epoch": 0.8782340568908129, + "grad_norm": 0.6006586891159624, + "learning_rate": 0.0001243298268187879, + "loss": 12.3248, + "step": 16128 + }, + { + "epoch": 0.8782885108873959, + "grad_norm": 0.5581646276150829, + "learning_rate": 0.0001243212734412854, + "loss": 12.1715, + "step": 16129 + }, + { + "epoch": 0.878342964883979, + "grad_norm": 0.5495172790231901, + "learning_rate": 0.00012431271987465661, + "loss": 12.2233, + "step": 16130 + }, + { + "epoch": 0.878397418880562, + "grad_norm": 0.5375101576174629, + "learning_rate": 0.00012430416611896797, + "loss": 12.1081, + "step": 16131 + }, + { + "epoch": 0.878451872877145, + "grad_norm": 0.5300128739993558, + "learning_rate": 0.0001242956121742861, + "loss": 12.2207, + "step": 16132 + }, + { + "epoch": 0.878506326873728, + "grad_norm": 0.5687779995223394, + "learning_rate": 0.00012428705804067751, + "loss": 12.189, + "step": 16133 + }, + { + "epoch": 0.878560780870311, + "grad_norm": 0.5836162326318499, + "learning_rate": 0.00012427850371820868, + "loss": 12.2623, + "step": 16134 + }, + { + "epoch": 0.878615234866894, + "grad_norm": 0.5308718848591922, + "learning_rate": 0.0001242699492069461, + "loss": 12.2993, + "step": 16135 + }, + { + "epoch": 0.8786696888634771, + "grad_norm": 0.49513660224252215, + "learning_rate": 0.00012426139450695634, + "loss": 12.0951, + "step": 16136 + }, + { + "epoch": 0.8787241428600601, + "grad_norm": 0.5178325865330834, + "learning_rate": 0.0001242528396183059, + "loss": 11.9599, + "step": 16137 + }, + { + "epoch": 0.8787785968566431, + "grad_norm": 0.5969490784962753, + "learning_rate": 0.00012424428454106128, + "loss": 12.2505, + "step": 16138 + }, + { + "epoch": 0.878833050853226, + "grad_norm": 0.5901651262878582, + "learning_rate": 0.0001242357292752891, + "loss": 12.2431, + "step": 16139 + }, + { + "epoch": 0.878887504849809, + "grad_norm": 0.5301687700694988, + "learning_rate": 0.00012422717382105583, + "loss": 12.1567, + "step": 16140 + }, + { + "epoch": 0.878941958846392, + "grad_norm": 0.5653153214152838, + "learning_rate": 0.00012421861817842796, + "loss": 12.2456, + "step": 16141 + }, + { + "epoch": 0.8789964128429751, + "grad_norm": 0.6632815726539283, + "learning_rate": 0.000124210062347472, + "loss": 12.4149, + "step": 16142 + }, + { + "epoch": 0.8790508668395581, + "grad_norm": 0.5648761481147258, + "learning_rate": 0.0001242015063282546, + "loss": 12.2223, + "step": 16143 + }, + { + "epoch": 0.8791053208361411, + "grad_norm": 0.5085759784843153, + "learning_rate": 0.00012419295012084224, + "loss": 12.2876, + "step": 16144 + }, + { + "epoch": 0.8791597748327241, + "grad_norm": 0.5453487051823546, + "learning_rate": 0.00012418439372530141, + "loss": 12.1462, + "step": 16145 + }, + { + "epoch": 0.8792142288293071, + "grad_norm": 0.5181223589736799, + "learning_rate": 0.0001241758371416987, + "loss": 12.0158, + "step": 16146 + }, + { + "epoch": 0.8792686828258901, + "grad_norm": 0.677804920864788, + "learning_rate": 0.00012416728037010062, + "loss": 12.3514, + "step": 16147 + }, + { + "epoch": 0.8793231368224732, + "grad_norm": 0.5147318760050673, + "learning_rate": 0.00012415872341057369, + "loss": 12.1635, + "step": 16148 + }, + { + "epoch": 0.8793775908190562, + "grad_norm": 0.522595656999475, + "learning_rate": 0.00012415016626318452, + "loss": 12.0867, + "step": 16149 + }, + { + "epoch": 0.8794320448156392, + "grad_norm": 0.6116936916835619, + "learning_rate": 0.00012414160892799958, + "loss": 12.1803, + "step": 16150 + }, + { + "epoch": 0.8794864988122222, + "grad_norm": 0.5489823607717342, + "learning_rate": 0.00012413305140508544, + "loss": 12.1459, + "step": 16151 + }, + { + "epoch": 0.8795409528088052, + "grad_norm": 0.5998587379305671, + "learning_rate": 0.00012412449369450865, + "loss": 12.0937, + "step": 16152 + }, + { + "epoch": 0.8795954068053882, + "grad_norm": 0.6123606539421548, + "learning_rate": 0.00012411593579633574, + "loss": 12.1539, + "step": 16153 + }, + { + "epoch": 0.8796498608019713, + "grad_norm": 0.6051188759788264, + "learning_rate": 0.0001241073777106333, + "loss": 12.173, + "step": 16154 + }, + { + "epoch": 0.8797043147985543, + "grad_norm": 0.541487724905037, + "learning_rate": 0.0001240988194374678, + "loss": 12.0629, + "step": 16155 + }, + { + "epoch": 0.8797587687951373, + "grad_norm": 0.5286488857519637, + "learning_rate": 0.00012409026097690587, + "loss": 12.237, + "step": 16156 + }, + { + "epoch": 0.8798132227917203, + "grad_norm": 0.5222242290522313, + "learning_rate": 0.00012408170232901404, + "loss": 12.1247, + "step": 16157 + }, + { + "epoch": 0.8798676767883032, + "grad_norm": 0.5605723823245377, + "learning_rate": 0.00012407314349385885, + "loss": 12.2575, + "step": 16158 + }, + { + "epoch": 0.8799221307848863, + "grad_norm": 0.5311519997094444, + "learning_rate": 0.00012406458447150685, + "loss": 12.1567, + "step": 16159 + }, + { + "epoch": 0.8799765847814693, + "grad_norm": 0.522106869753087, + "learning_rate": 0.0001240560252620246, + "loss": 12.1882, + "step": 16160 + }, + { + "epoch": 0.8800310387780523, + "grad_norm": 0.5700146146601299, + "learning_rate": 0.00012404746586547867, + "loss": 11.8611, + "step": 16161 + }, + { + "epoch": 0.8800854927746353, + "grad_norm": 0.5800248799485075, + "learning_rate": 0.00012403890628193563, + "loss": 12.0938, + "step": 16162 + }, + { + "epoch": 0.8801399467712183, + "grad_norm": 0.5157443715077356, + "learning_rate": 0.00012403034651146198, + "loss": 12.1552, + "step": 16163 + }, + { + "epoch": 0.8801944007678013, + "grad_norm": 0.5421930476489523, + "learning_rate": 0.00012402178655412436, + "loss": 12.1899, + "step": 16164 + }, + { + "epoch": 0.8802488547643844, + "grad_norm": 0.6029714786139897, + "learning_rate": 0.0001240132264099893, + "loss": 12.1137, + "step": 16165 + }, + { + "epoch": 0.8803033087609674, + "grad_norm": 0.5451626473064809, + "learning_rate": 0.00012400466607912332, + "loss": 12.1901, + "step": 16166 + }, + { + "epoch": 0.8803577627575504, + "grad_norm": 0.5134452705606481, + "learning_rate": 0.0001239961055615931, + "loss": 12.1074, + "step": 16167 + }, + { + "epoch": 0.8804122167541334, + "grad_norm": 0.5925524126284681, + "learning_rate": 0.0001239875448574651, + "loss": 12.3034, + "step": 16168 + }, + { + "epoch": 0.8804666707507164, + "grad_norm": 0.5287324031552987, + "learning_rate": 0.0001239789839668059, + "loss": 12.2371, + "step": 16169 + }, + { + "epoch": 0.8805211247472994, + "grad_norm": 0.5181531375923086, + "learning_rate": 0.00012397042288968214, + "loss": 12.079, + "step": 16170 + }, + { + "epoch": 0.8805755787438825, + "grad_norm": 0.509508990530057, + "learning_rate": 0.00012396186162616038, + "loss": 11.9842, + "step": 16171 + }, + { + "epoch": 0.8806300327404655, + "grad_norm": 0.5914236089719387, + "learning_rate": 0.00012395330017630712, + "loss": 12.2112, + "step": 16172 + }, + { + "epoch": 0.8806844867370485, + "grad_norm": 0.5585576561169054, + "learning_rate": 0.00012394473854018898, + "loss": 12.0564, + "step": 16173 + }, + { + "epoch": 0.8807389407336315, + "grad_norm": 0.5256569160661628, + "learning_rate": 0.00012393617671787254, + "loss": 12.1797, + "step": 16174 + }, + { + "epoch": 0.8807933947302145, + "grad_norm": 0.5503036986398105, + "learning_rate": 0.0001239276147094244, + "loss": 12.1849, + "step": 16175 + }, + { + "epoch": 0.8808478487267974, + "grad_norm": 0.5899224050988658, + "learning_rate": 0.0001239190525149111, + "loss": 12.2352, + "step": 16176 + }, + { + "epoch": 0.8809023027233805, + "grad_norm": 0.5690699450700633, + "learning_rate": 0.0001239104901343992, + "loss": 12.114, + "step": 16177 + }, + { + "epoch": 0.8809567567199635, + "grad_norm": 0.5769616797004105, + "learning_rate": 0.00012390192756795538, + "loss": 12.0492, + "step": 16178 + }, + { + "epoch": 0.8810112107165465, + "grad_norm": 0.570383479686332, + "learning_rate": 0.00012389336481564614, + "loss": 12.1202, + "step": 16179 + }, + { + "epoch": 0.8810656647131295, + "grad_norm": 0.5639287176449898, + "learning_rate": 0.00012388480187753808, + "loss": 12.0932, + "step": 16180 + }, + { + "epoch": 0.8811201187097125, + "grad_norm": 0.5824002134140114, + "learning_rate": 0.0001238762387536978, + "loss": 12.2077, + "step": 16181 + }, + { + "epoch": 0.8811745727062955, + "grad_norm": 0.5580492724272099, + "learning_rate": 0.0001238676754441919, + "loss": 12.1451, + "step": 16182 + }, + { + "epoch": 0.8812290267028786, + "grad_norm": 0.5599035260414097, + "learning_rate": 0.00012385911194908692, + "loss": 12.1846, + "step": 16183 + }, + { + "epoch": 0.8812834806994616, + "grad_norm": 0.5425513211680949, + "learning_rate": 0.00012385054826844952, + "loss": 11.9813, + "step": 16184 + }, + { + "epoch": 0.8813379346960446, + "grad_norm": 0.5685357381558466, + "learning_rate": 0.00012384198440234625, + "loss": 12.0235, + "step": 16185 + }, + { + "epoch": 0.8813923886926276, + "grad_norm": 0.5543533366808976, + "learning_rate": 0.00012383342035084368, + "loss": 12.1006, + "step": 16186 + }, + { + "epoch": 0.8814468426892106, + "grad_norm": 0.5634053378083402, + "learning_rate": 0.00012382485611400846, + "loss": 12.1838, + "step": 16187 + }, + { + "epoch": 0.8815012966857936, + "grad_norm": 0.6518959759414273, + "learning_rate": 0.00012381629169190715, + "loss": 12.2529, + "step": 16188 + }, + { + "epoch": 0.8815557506823767, + "grad_norm": 0.6343989078058219, + "learning_rate": 0.0001238077270846064, + "loss": 12.1136, + "step": 16189 + }, + { + "epoch": 0.8816102046789597, + "grad_norm": 0.5619803061551234, + "learning_rate": 0.00012379916229217274, + "loss": 12.0956, + "step": 16190 + }, + { + "epoch": 0.8816646586755427, + "grad_norm": 0.5500800945417131, + "learning_rate": 0.00012379059731467277, + "loss": 12.1264, + "step": 16191 + }, + { + "epoch": 0.8817191126721257, + "grad_norm": 0.6534795437272554, + "learning_rate": 0.00012378203215217316, + "loss": 12.1569, + "step": 16192 + }, + { + "epoch": 0.8817735666687087, + "grad_norm": 0.5751586600273532, + "learning_rate": 0.0001237734668047405, + "loss": 12.1951, + "step": 16193 + }, + { + "epoch": 0.8818280206652918, + "grad_norm": 0.5799723742902898, + "learning_rate": 0.0001237649012724414, + "loss": 12.2284, + "step": 16194 + }, + { + "epoch": 0.8818824746618747, + "grad_norm": 0.5652758554051909, + "learning_rate": 0.00012375633555534237, + "loss": 12.078, + "step": 16195 + }, + { + "epoch": 0.8819369286584577, + "grad_norm": 0.5470256921634873, + "learning_rate": 0.00012374776965351012, + "loss": 12.1292, + "step": 16196 + }, + { + "epoch": 0.8819913826550407, + "grad_norm": 0.5500982540293085, + "learning_rate": 0.00012373920356701122, + "loss": 12.1825, + "step": 16197 + }, + { + "epoch": 0.8820458366516237, + "grad_norm": 0.553900653584647, + "learning_rate": 0.0001237306372959123, + "loss": 12.1216, + "step": 16198 + }, + { + "epoch": 0.8821002906482067, + "grad_norm": 0.5561992559435806, + "learning_rate": 0.00012372207084027998, + "loss": 12.2922, + "step": 16199 + }, + { + "epoch": 0.8821547446447898, + "grad_norm": 0.5356433230478098, + "learning_rate": 0.00012371350420018083, + "loss": 12.1967, + "step": 16200 + }, + { + "epoch": 0.8822091986413728, + "grad_norm": 0.5244114586518281, + "learning_rate": 0.00012370493737568153, + "loss": 12.2054, + "step": 16201 + }, + { + "epoch": 0.8822636526379558, + "grad_norm": 0.6249332730815699, + "learning_rate": 0.00012369637036684862, + "loss": 12.1978, + "step": 16202 + }, + { + "epoch": 0.8823181066345388, + "grad_norm": 0.5166215210519938, + "learning_rate": 0.00012368780317374876, + "loss": 12.172, + "step": 16203 + }, + { + "epoch": 0.8823725606311218, + "grad_norm": 0.5655592203207452, + "learning_rate": 0.00012367923579644863, + "loss": 12.0116, + "step": 16204 + }, + { + "epoch": 0.8824270146277048, + "grad_norm": 0.5342191190418638, + "learning_rate": 0.00012367066823501475, + "loss": 12.1477, + "step": 16205 + }, + { + "epoch": 0.8824814686242879, + "grad_norm": 0.5122266335572258, + "learning_rate": 0.0001236621004895138, + "loss": 12.2533, + "step": 16206 + }, + { + "epoch": 0.8825359226208709, + "grad_norm": 0.6474094156178377, + "learning_rate": 0.00012365353256001238, + "loss": 12.2779, + "step": 16207 + }, + { + "epoch": 0.8825903766174539, + "grad_norm": 0.534846689296231, + "learning_rate": 0.00012364496444657708, + "loss": 12.1578, + "step": 16208 + }, + { + "epoch": 0.8826448306140369, + "grad_norm": 0.5461817063501252, + "learning_rate": 0.00012363639614927465, + "loss": 12.2337, + "step": 16209 + }, + { + "epoch": 0.8826992846106199, + "grad_norm": 0.5458809676123125, + "learning_rate": 0.00012362782766817162, + "loss": 12.2354, + "step": 16210 + }, + { + "epoch": 0.8827537386072029, + "grad_norm": 0.49927285854458225, + "learning_rate": 0.0001236192590033346, + "loss": 12.107, + "step": 16211 + }, + { + "epoch": 0.882808192603786, + "grad_norm": 0.5523903728560546, + "learning_rate": 0.00012361069015483028, + "loss": 12.1729, + "step": 16212 + }, + { + "epoch": 0.882862646600369, + "grad_norm": 0.5428571481794514, + "learning_rate": 0.00012360212112272527, + "loss": 12.3167, + "step": 16213 + }, + { + "epoch": 0.8829171005969519, + "grad_norm": 0.5616631045634104, + "learning_rate": 0.00012359355190708622, + "loss": 12.1382, + "step": 16214 + }, + { + "epoch": 0.8829715545935349, + "grad_norm": 0.5735167151423458, + "learning_rate": 0.00012358498250797975, + "loss": 12.1902, + "step": 16215 + }, + { + "epoch": 0.8830260085901179, + "grad_norm": 0.505655421612921, + "learning_rate": 0.00012357641292547249, + "loss": 12.1472, + "step": 16216 + }, + { + "epoch": 0.8830804625867009, + "grad_norm": 0.5735619841639722, + "learning_rate": 0.0001235678431596311, + "loss": 12.1078, + "step": 16217 + }, + { + "epoch": 0.883134916583284, + "grad_norm": 0.686628470289581, + "learning_rate": 0.0001235592732105222, + "loss": 12.2523, + "step": 16218 + }, + { + "epoch": 0.883189370579867, + "grad_norm": 0.5126576113589593, + "learning_rate": 0.00012355070307821245, + "loss": 12.2123, + "step": 16219 + }, + { + "epoch": 0.88324382457645, + "grad_norm": 0.6286287193684238, + "learning_rate": 0.0001235421327627685, + "loss": 12.2636, + "step": 16220 + }, + { + "epoch": 0.883298278573033, + "grad_norm": 0.5610115455566963, + "learning_rate": 0.00012353356226425693, + "loss": 12.1751, + "step": 16221 + }, + { + "epoch": 0.883352732569616, + "grad_norm": 0.5548192270939025, + "learning_rate": 0.00012352499158274448, + "loss": 12.2444, + "step": 16222 + }, + { + "epoch": 0.883407186566199, + "grad_norm": 0.5182734831287024, + "learning_rate": 0.0001235164207182977, + "loss": 12.0495, + "step": 16223 + }, + { + "epoch": 0.8834616405627821, + "grad_norm": 0.5282561495176271, + "learning_rate": 0.00012350784967098333, + "loss": 12.2166, + "step": 16224 + }, + { + "epoch": 0.8835160945593651, + "grad_norm": 0.5400405941500195, + "learning_rate": 0.000123499278440868, + "loss": 12.1352, + "step": 16225 + }, + { + "epoch": 0.8835705485559481, + "grad_norm": 0.5242033447130563, + "learning_rate": 0.0001234907070280183, + "loss": 12.1647, + "step": 16226 + }, + { + "epoch": 0.8836250025525311, + "grad_norm": 0.5239109929409396, + "learning_rate": 0.00012348213543250094, + "loss": 12.1428, + "step": 16227 + }, + { + "epoch": 0.8836794565491141, + "grad_norm": 0.5410506015404936, + "learning_rate": 0.00012347356365438253, + "loss": 12.1778, + "step": 16228 + }, + { + "epoch": 0.8837339105456972, + "grad_norm": 0.6075298664685101, + "learning_rate": 0.0001234649916937298, + "loss": 12.1646, + "step": 16229 + }, + { + "epoch": 0.8837883645422802, + "grad_norm": 0.5405876936004674, + "learning_rate": 0.00012345641955060932, + "loss": 12.2044, + "step": 16230 + }, + { + "epoch": 0.8838428185388632, + "grad_norm": 0.5208606016764383, + "learning_rate": 0.0001234478472250878, + "loss": 12.0691, + "step": 16231 + }, + { + "epoch": 0.8838972725354461, + "grad_norm": 0.5368623083267269, + "learning_rate": 0.00012343927471723188, + "loss": 12.2568, + "step": 16232 + }, + { + "epoch": 0.8839517265320291, + "grad_norm": 0.5468750069143732, + "learning_rate": 0.00012343070202710824, + "loss": 12.0706, + "step": 16233 + }, + { + "epoch": 0.8840061805286121, + "grad_norm": 0.5499608163084332, + "learning_rate": 0.00012342212915478353, + "loss": 12.2757, + "step": 16234 + }, + { + "epoch": 0.8840606345251952, + "grad_norm": 0.5851504111453643, + "learning_rate": 0.0001234135561003244, + "loss": 12.2076, + "step": 16235 + }, + { + "epoch": 0.8841150885217782, + "grad_norm": 0.5974130400887863, + "learning_rate": 0.00012340498286379756, + "loss": 12.1854, + "step": 16236 + }, + { + "epoch": 0.8841695425183612, + "grad_norm": 0.5491304519499025, + "learning_rate": 0.00012339640944526964, + "loss": 12.2238, + "step": 16237 + }, + { + "epoch": 0.8842239965149442, + "grad_norm": 0.5346073392197845, + "learning_rate": 0.0001233878358448073, + "loss": 12.202, + "step": 16238 + }, + { + "epoch": 0.8842784505115272, + "grad_norm": 0.6736336593523841, + "learning_rate": 0.00012337926206247723, + "loss": 12.3677, + "step": 16239 + }, + { + "epoch": 0.8843329045081102, + "grad_norm": 0.5083872491556979, + "learning_rate": 0.0001233706880983461, + "loss": 12.2147, + "step": 16240 + }, + { + "epoch": 0.8843873585046933, + "grad_norm": 0.5800111367392793, + "learning_rate": 0.00012336211395248058, + "loss": 12.1883, + "step": 16241 + }, + { + "epoch": 0.8844418125012763, + "grad_norm": 0.5869973957798797, + "learning_rate": 0.00012335353962494736, + "loss": 12.1675, + "step": 16242 + }, + { + "epoch": 0.8844962664978593, + "grad_norm": 0.4909759782375525, + "learning_rate": 0.0001233449651158131, + "loss": 12.1226, + "step": 16243 + }, + { + "epoch": 0.8845507204944423, + "grad_norm": 0.5963019301703091, + "learning_rate": 0.00012333639042514446, + "loss": 12.1958, + "step": 16244 + }, + { + "epoch": 0.8846051744910253, + "grad_norm": 0.5483578119503779, + "learning_rate": 0.00012332781555300816, + "loss": 12.2078, + "step": 16245 + }, + { + "epoch": 0.8846596284876083, + "grad_norm": 0.50012001467015, + "learning_rate": 0.0001233192404994708, + "loss": 12.1573, + "step": 16246 + }, + { + "epoch": 0.8847140824841914, + "grad_norm": 0.5462820967201222, + "learning_rate": 0.00012331066526459917, + "loss": 12.1758, + "step": 16247 + }, + { + "epoch": 0.8847685364807744, + "grad_norm": 0.5483925328240585, + "learning_rate": 0.00012330208984845986, + "loss": 12.1894, + "step": 16248 + }, + { + "epoch": 0.8848229904773574, + "grad_norm": 0.5219144410997871, + "learning_rate": 0.00012329351425111962, + "loss": 12.1431, + "step": 16249 + }, + { + "epoch": 0.8848774444739403, + "grad_norm": 0.5361374767689396, + "learning_rate": 0.00012328493847264512, + "loss": 12.1482, + "step": 16250 + }, + { + "epoch": 0.8849318984705233, + "grad_norm": 0.5427429428242615, + "learning_rate": 0.00012327636251310297, + "loss": 12.258, + "step": 16251 + }, + { + "epoch": 0.8849863524671063, + "grad_norm": 0.5912904179396163, + "learning_rate": 0.00012326778637255996, + "loss": 12.0507, + "step": 16252 + }, + { + "epoch": 0.8850408064636894, + "grad_norm": 0.5378832184296238, + "learning_rate": 0.00012325921005108275, + "loss": 12.1293, + "step": 16253 + }, + { + "epoch": 0.8850952604602724, + "grad_norm": 0.6089376138378523, + "learning_rate": 0.000123250633548738, + "loss": 12.1337, + "step": 16254 + }, + { + "epoch": 0.8851497144568554, + "grad_norm": 0.5601034014331702, + "learning_rate": 0.00012324205686559245, + "loss": 12.2097, + "step": 16255 + }, + { + "epoch": 0.8852041684534384, + "grad_norm": 0.5591657526431818, + "learning_rate": 0.00012323348000171277, + "loss": 12.0945, + "step": 16256 + }, + { + "epoch": 0.8852586224500214, + "grad_norm": 0.5661957903672931, + "learning_rate": 0.0001232249029571656, + "loss": 12.1274, + "step": 16257 + }, + { + "epoch": 0.8853130764466045, + "grad_norm": 0.5427624453408229, + "learning_rate": 0.00012321632573201774, + "loss": 12.1423, + "step": 16258 + }, + { + "epoch": 0.8853675304431875, + "grad_norm": 0.6076214886273502, + "learning_rate": 0.0001232077483263358, + "loss": 12.1057, + "step": 16259 + }, + { + "epoch": 0.8854219844397705, + "grad_norm": 0.5781501899130344, + "learning_rate": 0.00012319917074018658, + "loss": 12.1624, + "step": 16260 + }, + { + "epoch": 0.8854764384363535, + "grad_norm": 0.5986244677879341, + "learning_rate": 0.00012319059297363668, + "loss": 12.2996, + "step": 16261 + }, + { + "epoch": 0.8855308924329365, + "grad_norm": 0.6197660951665949, + "learning_rate": 0.00012318201502675285, + "loss": 12.154, + "step": 16262 + }, + { + "epoch": 0.8855853464295195, + "grad_norm": 0.535054653933544, + "learning_rate": 0.00012317343689960175, + "loss": 12.0921, + "step": 16263 + }, + { + "epoch": 0.8856398004261026, + "grad_norm": 0.565728316731146, + "learning_rate": 0.00012316485859225016, + "loss": 12.0625, + "step": 16264 + }, + { + "epoch": 0.8856942544226856, + "grad_norm": 0.6148431799302818, + "learning_rate": 0.0001231562801047647, + "loss": 12.1517, + "step": 16265 + }, + { + "epoch": 0.8857487084192686, + "grad_norm": 0.5625341975593025, + "learning_rate": 0.00012314770143721218, + "loss": 12.1513, + "step": 16266 + }, + { + "epoch": 0.8858031624158516, + "grad_norm": 0.5596002114804742, + "learning_rate": 0.00012313912258965924, + "loss": 12.1375, + "step": 16267 + }, + { + "epoch": 0.8858576164124345, + "grad_norm": 0.5337452297070369, + "learning_rate": 0.00012313054356217256, + "loss": 12.1935, + "step": 16268 + }, + { + "epoch": 0.8859120704090175, + "grad_norm": 0.561215436744151, + "learning_rate": 0.00012312196435481892, + "loss": 12.2311, + "step": 16269 + }, + { + "epoch": 0.8859665244056006, + "grad_norm": 0.701137105823549, + "learning_rate": 0.000123113384967665, + "loss": 12.1859, + "step": 16270 + }, + { + "epoch": 0.8860209784021836, + "grad_norm": 0.6259806114999253, + "learning_rate": 0.00012310480540077753, + "loss": 12.2817, + "step": 16271 + }, + { + "epoch": 0.8860754323987666, + "grad_norm": 0.575558700552531, + "learning_rate": 0.00012309622565422323, + "loss": 12.0026, + "step": 16272 + }, + { + "epoch": 0.8861298863953496, + "grad_norm": 0.579891197993033, + "learning_rate": 0.0001230876457280688, + "loss": 12.1153, + "step": 16273 + }, + { + "epoch": 0.8861843403919326, + "grad_norm": 0.5505349526446447, + "learning_rate": 0.0001230790656223809, + "loss": 11.9594, + "step": 16274 + }, + { + "epoch": 0.8862387943885156, + "grad_norm": 0.6402945163461424, + "learning_rate": 0.00012307048533722643, + "loss": 12.2174, + "step": 16275 + }, + { + "epoch": 0.8862932483850987, + "grad_norm": 0.5457796048605926, + "learning_rate": 0.00012306190487267193, + "loss": 12.0925, + "step": 16276 + }, + { + "epoch": 0.8863477023816817, + "grad_norm": 0.5305821367555109, + "learning_rate": 0.0001230533242287842, + "loss": 12.1706, + "step": 16277 + }, + { + "epoch": 0.8864021563782647, + "grad_norm": 0.5239564589926617, + "learning_rate": 0.00012304474340562994, + "loss": 12.1302, + "step": 16278 + }, + { + "epoch": 0.8864566103748477, + "grad_norm": 0.5413941416062154, + "learning_rate": 0.00012303616240327592, + "loss": 12.0329, + "step": 16279 + }, + { + "epoch": 0.8865110643714307, + "grad_norm": 0.5185014039488977, + "learning_rate": 0.00012302758122178882, + "loss": 12.0784, + "step": 16280 + }, + { + "epoch": 0.8865655183680137, + "grad_norm": 0.4951657358478465, + "learning_rate": 0.00012301899986123539, + "loss": 12.1503, + "step": 16281 + }, + { + "epoch": 0.8866199723645968, + "grad_norm": 0.5694149924321806, + "learning_rate": 0.00012301041832168236, + "loss": 12.2309, + "step": 16282 + }, + { + "epoch": 0.8866744263611798, + "grad_norm": 0.5313585264494142, + "learning_rate": 0.00012300183660319647, + "loss": 12.1177, + "step": 16283 + }, + { + "epoch": 0.8867288803577628, + "grad_norm": 0.6007296788229879, + "learning_rate": 0.00012299325470584442, + "loss": 12.1911, + "step": 16284 + }, + { + "epoch": 0.8867833343543458, + "grad_norm": 0.5937648289150693, + "learning_rate": 0.00012298467262969297, + "loss": 12.3804, + "step": 16285 + }, + { + "epoch": 0.8868377883509287, + "grad_norm": 0.5532089813326304, + "learning_rate": 0.00012297609037480886, + "loss": 12.0865, + "step": 16286 + }, + { + "epoch": 0.8868922423475117, + "grad_norm": 0.6688215915836628, + "learning_rate": 0.0001229675079412588, + "loss": 12.1452, + "step": 16287 + }, + { + "epoch": 0.8869466963440948, + "grad_norm": 0.5951680028150612, + "learning_rate": 0.00012295892532910956, + "loss": 12.3691, + "step": 16288 + }, + { + "epoch": 0.8870011503406778, + "grad_norm": 0.5231733871097624, + "learning_rate": 0.00012295034253842789, + "loss": 12.0778, + "step": 16289 + }, + { + "epoch": 0.8870556043372608, + "grad_norm": 0.5423677131696757, + "learning_rate": 0.00012294175956928047, + "loss": 12.1868, + "step": 16290 + }, + { + "epoch": 0.8871100583338438, + "grad_norm": 0.4946320742927819, + "learning_rate": 0.00012293317642173408, + "loss": 12.0247, + "step": 16291 + }, + { + "epoch": 0.8871645123304268, + "grad_norm": 0.5529772902355801, + "learning_rate": 0.0001229245930958555, + "loss": 12.1839, + "step": 16292 + }, + { + "epoch": 0.8872189663270099, + "grad_norm": 0.6392097378935568, + "learning_rate": 0.0001229160095917114, + "loss": 12.286, + "step": 16293 + }, + { + "epoch": 0.8872734203235929, + "grad_norm": 0.5291321917582374, + "learning_rate": 0.00012290742590936857, + "loss": 12.1872, + "step": 16294 + }, + { + "epoch": 0.8873278743201759, + "grad_norm": 0.5831656257225017, + "learning_rate": 0.00012289884204889378, + "loss": 12.1548, + "step": 16295 + }, + { + "epoch": 0.8873823283167589, + "grad_norm": 0.5499164854122962, + "learning_rate": 0.00012289025801035373, + "loss": 12.1475, + "step": 16296 + }, + { + "epoch": 0.8874367823133419, + "grad_norm": 0.5528434084145767, + "learning_rate": 0.0001228816737938152, + "loss": 12.0544, + "step": 16297 + }, + { + "epoch": 0.8874912363099249, + "grad_norm": 0.5137783709029952, + "learning_rate": 0.00012287308939934496, + "loss": 12.1277, + "step": 16298 + }, + { + "epoch": 0.887545690306508, + "grad_norm": 0.5625471444950313, + "learning_rate": 0.00012286450482700974, + "loss": 12.2227, + "step": 16299 + }, + { + "epoch": 0.887600144303091, + "grad_norm": 0.5936241104738309, + "learning_rate": 0.00012285592007687626, + "loss": 12.1372, + "step": 16300 + }, + { + "epoch": 0.887654598299674, + "grad_norm": 0.5238975637094012, + "learning_rate": 0.0001228473351490113, + "loss": 12.0677, + "step": 16301 + }, + { + "epoch": 0.887709052296257, + "grad_norm": 0.6606203666543881, + "learning_rate": 0.00012283875004348167, + "loss": 12.2815, + "step": 16302 + }, + { + "epoch": 0.88776350629284, + "grad_norm": 0.5478035977552437, + "learning_rate": 0.0001228301647603541, + "loss": 12.1251, + "step": 16303 + }, + { + "epoch": 0.887817960289423, + "grad_norm": 0.5400120488267081, + "learning_rate": 0.00012282157929969533, + "loss": 12.1598, + "step": 16304 + }, + { + "epoch": 0.887872414286006, + "grad_norm": 0.6254885091083855, + "learning_rate": 0.00012281299366157216, + "loss": 12.2532, + "step": 16305 + }, + { + "epoch": 0.887926868282589, + "grad_norm": 0.5666635884476116, + "learning_rate": 0.00012280440784605124, + "loss": 12.3314, + "step": 16306 + }, + { + "epoch": 0.887981322279172, + "grad_norm": 0.5571324077998513, + "learning_rate": 0.0001227958218531995, + "loss": 12.2664, + "step": 16307 + }, + { + "epoch": 0.888035776275755, + "grad_norm": 0.6383871738610409, + "learning_rate": 0.00012278723568308358, + "loss": 12.268, + "step": 16308 + }, + { + "epoch": 0.888090230272338, + "grad_norm": 0.5489827954744986, + "learning_rate": 0.00012277864933577033, + "loss": 12.0518, + "step": 16309 + }, + { + "epoch": 0.888144684268921, + "grad_norm": 0.5291341260448726, + "learning_rate": 0.00012277006281132647, + "loss": 12.1884, + "step": 16310 + }, + { + "epoch": 0.8881991382655041, + "grad_norm": 0.6066827861748462, + "learning_rate": 0.00012276147610981876, + "loss": 12.1157, + "step": 16311 + }, + { + "epoch": 0.8882535922620871, + "grad_norm": 0.5441557383002226, + "learning_rate": 0.00012275288923131402, + "loss": 12.0808, + "step": 16312 + }, + { + "epoch": 0.8883080462586701, + "grad_norm": 0.4965751123135751, + "learning_rate": 0.000122744302175879, + "loss": 12.1255, + "step": 16313 + }, + { + "epoch": 0.8883625002552531, + "grad_norm": 0.6936057137138693, + "learning_rate": 0.00012273571494358045, + "loss": 12.2344, + "step": 16314 + }, + { + "epoch": 0.8884169542518361, + "grad_norm": 0.6014344061380988, + "learning_rate": 0.00012272712753448522, + "loss": 12.1617, + "step": 16315 + }, + { + "epoch": 0.8884714082484191, + "grad_norm": 0.5859981341056422, + "learning_rate": 0.00012271853994866, + "loss": 12.0722, + "step": 16316 + }, + { + "epoch": 0.8885258622450022, + "grad_norm": 0.573634206675812, + "learning_rate": 0.00012270995218617155, + "loss": 12.0755, + "step": 16317 + }, + { + "epoch": 0.8885803162415852, + "grad_norm": 0.5927714899422986, + "learning_rate": 0.00012270136424708675, + "loss": 12.1773, + "step": 16318 + }, + { + "epoch": 0.8886347702381682, + "grad_norm": 0.5807420605409117, + "learning_rate": 0.00012269277613147233, + "loss": 12.1732, + "step": 16319 + }, + { + "epoch": 0.8886892242347512, + "grad_norm": 0.633388832174088, + "learning_rate": 0.00012268418783939513, + "loss": 12.1529, + "step": 16320 + }, + { + "epoch": 0.8887436782313342, + "grad_norm": 0.6540598015292096, + "learning_rate": 0.0001226755993709218, + "loss": 11.977, + "step": 16321 + }, + { + "epoch": 0.8887981322279171, + "grad_norm": 0.557404386843737, + "learning_rate": 0.00012266701072611926, + "loss": 12.097, + "step": 16322 + }, + { + "epoch": 0.8888525862245003, + "grad_norm": 0.5960222979783917, + "learning_rate": 0.0001226584219050542, + "loss": 12.0737, + "step": 16323 + }, + { + "epoch": 0.8889070402210832, + "grad_norm": 0.6314024877140787, + "learning_rate": 0.00012264983290779347, + "loss": 12.1702, + "step": 16324 + }, + { + "epoch": 0.8889614942176662, + "grad_norm": 0.8419863945950672, + "learning_rate": 0.00012264124373440388, + "loss": 12.085, + "step": 16325 + }, + { + "epoch": 0.8890159482142492, + "grad_norm": 0.5432509180119829, + "learning_rate": 0.00012263265438495214, + "loss": 12.069, + "step": 16326 + }, + { + "epoch": 0.8890704022108322, + "grad_norm": 0.5902105609285478, + "learning_rate": 0.0001226240648595051, + "loss": 12.1508, + "step": 16327 + }, + { + "epoch": 0.8891248562074153, + "grad_norm": 0.6650731727870468, + "learning_rate": 0.00012261547515812952, + "loss": 12.1691, + "step": 16328 + }, + { + "epoch": 0.8891793102039983, + "grad_norm": 0.5477451446445849, + "learning_rate": 0.00012260688528089222, + "loss": 12.1464, + "step": 16329 + }, + { + "epoch": 0.8892337642005813, + "grad_norm": 0.5608551356558081, + "learning_rate": 0.00012259829522786003, + "loss": 12.1764, + "step": 16330 + }, + { + "epoch": 0.8892882181971643, + "grad_norm": 0.516292820888293, + "learning_rate": 0.00012258970499909964, + "loss": 12.1369, + "step": 16331 + }, + { + "epoch": 0.8893426721937473, + "grad_norm": 0.558635622035302, + "learning_rate": 0.00012258111459467796, + "loss": 12.2017, + "step": 16332 + }, + { + "epoch": 0.8893971261903303, + "grad_norm": 0.585917460390887, + "learning_rate": 0.00012257252401466173, + "loss": 12.2535, + "step": 16333 + }, + { + "epoch": 0.8894515801869134, + "grad_norm": 0.5802831891033983, + "learning_rate": 0.00012256393325911776, + "loss": 12.1516, + "step": 16334 + }, + { + "epoch": 0.8895060341834964, + "grad_norm": 0.5930622050620903, + "learning_rate": 0.00012255534232811287, + "loss": 12.1491, + "step": 16335 + }, + { + "epoch": 0.8895604881800794, + "grad_norm": 0.5219656216979696, + "learning_rate": 0.00012254675122171387, + "loss": 12.1651, + "step": 16336 + }, + { + "epoch": 0.8896149421766624, + "grad_norm": 0.5821970410221826, + "learning_rate": 0.00012253815993998752, + "loss": 12.1802, + "step": 16337 + }, + { + "epoch": 0.8896693961732454, + "grad_norm": 0.5259181992579247, + "learning_rate": 0.00012252956848300068, + "loss": 12.2109, + "step": 16338 + }, + { + "epoch": 0.8897238501698284, + "grad_norm": 0.562719113000839, + "learning_rate": 0.0001225209768508201, + "loss": 12.2265, + "step": 16339 + }, + { + "epoch": 0.8897783041664115, + "grad_norm": 0.547578636302829, + "learning_rate": 0.00012251238504351267, + "loss": 12.1624, + "step": 16340 + }, + { + "epoch": 0.8898327581629945, + "grad_norm": 0.5196405505890513, + "learning_rate": 0.00012250379306114517, + "loss": 12.0975, + "step": 16341 + }, + { + "epoch": 0.8898872121595774, + "grad_norm": 0.6273714554787057, + "learning_rate": 0.00012249520090378436, + "loss": 12.3457, + "step": 16342 + }, + { + "epoch": 0.8899416661561604, + "grad_norm": 0.5091159924147887, + "learning_rate": 0.00012248660857149712, + "loss": 12.2966, + "step": 16343 + }, + { + "epoch": 0.8899961201527434, + "grad_norm": 0.5232684819703758, + "learning_rate": 0.00012247801606435024, + "loss": 12.1101, + "step": 16344 + }, + { + "epoch": 0.8900505741493264, + "grad_norm": 0.5616859527095743, + "learning_rate": 0.00012246942338241053, + "loss": 12.1192, + "step": 16345 + }, + { + "epoch": 0.8901050281459095, + "grad_norm": 0.6289801674171244, + "learning_rate": 0.00012246083052574482, + "loss": 12.1321, + "step": 16346 + }, + { + "epoch": 0.8901594821424925, + "grad_norm": 0.5605368192195113, + "learning_rate": 0.0001224522374944199, + "loss": 12.1582, + "step": 16347 + }, + { + "epoch": 0.8902139361390755, + "grad_norm": 0.59515441220397, + "learning_rate": 0.00012244364428850267, + "loss": 12.2295, + "step": 16348 + }, + { + "epoch": 0.8902683901356585, + "grad_norm": 0.5613411178657675, + "learning_rate": 0.00012243505090805986, + "loss": 12.2211, + "step": 16349 + }, + { + "epoch": 0.8903228441322415, + "grad_norm": 0.625794527188511, + "learning_rate": 0.00012242645735315835, + "loss": 12.0044, + "step": 16350 + }, + { + "epoch": 0.8903772981288245, + "grad_norm": 0.6345129634846822, + "learning_rate": 0.0001224178636238649, + "loss": 12.1942, + "step": 16351 + }, + { + "epoch": 0.8904317521254076, + "grad_norm": 0.5797800622091027, + "learning_rate": 0.00012240926972024644, + "loss": 12.0692, + "step": 16352 + }, + { + "epoch": 0.8904862061219906, + "grad_norm": 0.6149532271097778, + "learning_rate": 0.0001224006756423697, + "loss": 12.163, + "step": 16353 + }, + { + "epoch": 0.8905406601185736, + "grad_norm": 0.6390410226858315, + "learning_rate": 0.0001223920813903016, + "loss": 12.2444, + "step": 16354 + }, + { + "epoch": 0.8905951141151566, + "grad_norm": 0.5575843300674095, + "learning_rate": 0.00012238348696410887, + "loss": 12.2189, + "step": 16355 + }, + { + "epoch": 0.8906495681117396, + "grad_norm": 0.5934635885270143, + "learning_rate": 0.00012237489236385842, + "loss": 12.0775, + "step": 16356 + }, + { + "epoch": 0.8907040221083226, + "grad_norm": 0.6940744268501108, + "learning_rate": 0.00012236629758961704, + "loss": 12.2922, + "step": 16357 + }, + { + "epoch": 0.8907584761049057, + "grad_norm": 0.5486866213964949, + "learning_rate": 0.00012235770264145158, + "loss": 12.0936, + "step": 16358 + }, + { + "epoch": 0.8908129301014887, + "grad_norm": 0.6384254601668586, + "learning_rate": 0.00012234910751942888, + "loss": 12.087, + "step": 16359 + }, + { + "epoch": 0.8908673840980716, + "grad_norm": 0.5725247404303682, + "learning_rate": 0.0001223405122236158, + "loss": 12.1062, + "step": 16360 + }, + { + "epoch": 0.8909218380946546, + "grad_norm": 0.6134408624917898, + "learning_rate": 0.0001223319167540791, + "loss": 12.2632, + "step": 16361 + }, + { + "epoch": 0.8909762920912376, + "grad_norm": 0.5677903063661517, + "learning_rate": 0.00012232332111088569, + "loss": 12.1721, + "step": 16362 + }, + { + "epoch": 0.8910307460878207, + "grad_norm": 0.5678554421060821, + "learning_rate": 0.0001223147252941024, + "loss": 12.2632, + "step": 16363 + }, + { + "epoch": 0.8910852000844037, + "grad_norm": 0.5411360054176657, + "learning_rate": 0.00012230612930379605, + "loss": 12.1227, + "step": 16364 + }, + { + "epoch": 0.8911396540809867, + "grad_norm": 0.5789415259530507, + "learning_rate": 0.00012229753314003353, + "loss": 12.0349, + "step": 16365 + }, + { + "epoch": 0.8911941080775697, + "grad_norm": 0.6118033550791085, + "learning_rate": 0.0001222889368028816, + "loss": 12.3661, + "step": 16366 + }, + { + "epoch": 0.8912485620741527, + "grad_norm": 0.5498114164510856, + "learning_rate": 0.0001222803402924072, + "loss": 12.0789, + "step": 16367 + }, + { + "epoch": 0.8913030160707357, + "grad_norm": 0.5455619732312031, + "learning_rate": 0.00012227174360867712, + "loss": 12.1499, + "step": 16368 + }, + { + "epoch": 0.8913574700673188, + "grad_norm": 0.5849315482521636, + "learning_rate": 0.00012226314675175824, + "loss": 12.123, + "step": 16369 + }, + { + "epoch": 0.8914119240639018, + "grad_norm": 0.6706327880267889, + "learning_rate": 0.00012225454972171742, + "loss": 12.3123, + "step": 16370 + }, + { + "epoch": 0.8914663780604848, + "grad_norm": 0.563472642508159, + "learning_rate": 0.00012224595251862145, + "loss": 12.1743, + "step": 16371 + }, + { + "epoch": 0.8915208320570678, + "grad_norm": 0.5805514198217513, + "learning_rate": 0.0001222373551425372, + "loss": 12.2397, + "step": 16372 + }, + { + "epoch": 0.8915752860536508, + "grad_norm": 0.5495186261985949, + "learning_rate": 0.00012222875759353158, + "loss": 12.2148, + "step": 16373 + }, + { + "epoch": 0.8916297400502338, + "grad_norm": 0.5189203705272784, + "learning_rate": 0.0001222201598716714, + "loss": 12.1897, + "step": 16374 + }, + { + "epoch": 0.8916841940468169, + "grad_norm": 0.5316500332118727, + "learning_rate": 0.00012221156197702356, + "loss": 12.1792, + "step": 16375 + }, + { + "epoch": 0.8917386480433999, + "grad_norm": 0.5383019627872128, + "learning_rate": 0.0001222029639096549, + "loss": 12.1051, + "step": 16376 + }, + { + "epoch": 0.8917931020399829, + "grad_norm": 0.5819361656303953, + "learning_rate": 0.00012219436566963222, + "loss": 12.3008, + "step": 16377 + }, + { + "epoch": 0.8918475560365658, + "grad_norm": 0.6054698883628403, + "learning_rate": 0.00012218576725702245, + "loss": 12.2099, + "step": 16378 + }, + { + "epoch": 0.8919020100331488, + "grad_norm": 0.56277415551667, + "learning_rate": 0.00012217716867189243, + "loss": 12.11, + "step": 16379 + }, + { + "epoch": 0.8919564640297318, + "grad_norm": 0.5353622779874301, + "learning_rate": 0.00012216856991430905, + "loss": 12.1667, + "step": 16380 + }, + { + "epoch": 0.8920109180263149, + "grad_norm": 0.5907613177814792, + "learning_rate": 0.00012215997098433912, + "loss": 12.2423, + "step": 16381 + }, + { + "epoch": 0.8920653720228979, + "grad_norm": 0.5314572914567044, + "learning_rate": 0.00012215137188204957, + "loss": 12.2518, + "step": 16382 + }, + { + "epoch": 0.8921198260194809, + "grad_norm": 0.5587918377791286, + "learning_rate": 0.00012214277260750718, + "loss": 12.1841, + "step": 16383 + }, + { + "epoch": 0.8921742800160639, + "grad_norm": 0.5779357858442474, + "learning_rate": 0.00012213417316077894, + "loss": 12.0817, + "step": 16384 + }, + { + "epoch": 0.8922287340126469, + "grad_norm": 0.5427932311031911, + "learning_rate": 0.00012212557354193164, + "loss": 12.1866, + "step": 16385 + }, + { + "epoch": 0.8922831880092299, + "grad_norm": 0.5430726865693253, + "learning_rate": 0.00012211697375103217, + "loss": 12.0536, + "step": 16386 + }, + { + "epoch": 0.892337642005813, + "grad_norm": 0.5788113619506856, + "learning_rate": 0.0001221083737881474, + "loss": 12.0162, + "step": 16387 + }, + { + "epoch": 0.892392096002396, + "grad_norm": 0.5527416162532146, + "learning_rate": 0.00012209977365334419, + "loss": 12.1027, + "step": 16388 + }, + { + "epoch": 0.892446549998979, + "grad_norm": 0.5431246939434727, + "learning_rate": 0.00012209117334668944, + "loss": 12.1254, + "step": 16389 + }, + { + "epoch": 0.892501003995562, + "grad_norm": 0.5809144579873806, + "learning_rate": 0.00012208257286825004, + "loss": 12.0177, + "step": 16390 + }, + { + "epoch": 0.892555457992145, + "grad_norm": 0.5657949703100597, + "learning_rate": 0.00012207397221809286, + "loss": 12.2054, + "step": 16391 + }, + { + "epoch": 0.8926099119887281, + "grad_norm": 0.5635021259578552, + "learning_rate": 0.00012206537139628476, + "loss": 12.1643, + "step": 16392 + }, + { + "epoch": 0.8926643659853111, + "grad_norm": 0.7856142703467077, + "learning_rate": 0.00012205677040289263, + "loss": 12.2693, + "step": 16393 + }, + { + "epoch": 0.8927188199818941, + "grad_norm": 0.5431584101018093, + "learning_rate": 0.00012204816923798332, + "loss": 12.1436, + "step": 16394 + }, + { + "epoch": 0.892773273978477, + "grad_norm": 0.5429192019043648, + "learning_rate": 0.00012203956790162379, + "loss": 12.2422, + "step": 16395 + }, + { + "epoch": 0.89282772797506, + "grad_norm": 0.5613179884500105, + "learning_rate": 0.00012203096639388088, + "loss": 12.143, + "step": 16396 + }, + { + "epoch": 0.892882181971643, + "grad_norm": 0.5946760079053818, + "learning_rate": 0.00012202236471482147, + "loss": 12.1774, + "step": 16397 + }, + { + "epoch": 0.8929366359682261, + "grad_norm": 0.553657927886927, + "learning_rate": 0.00012201376286451247, + "loss": 12.0548, + "step": 16398 + }, + { + "epoch": 0.8929910899648091, + "grad_norm": 0.5354058557561385, + "learning_rate": 0.00012200516084302074, + "loss": 12.2015, + "step": 16399 + }, + { + "epoch": 0.8930455439613921, + "grad_norm": 0.5010026502091414, + "learning_rate": 0.00012199655865041318, + "loss": 12.102, + "step": 16400 + }, + { + "epoch": 0.8930999979579751, + "grad_norm": 0.5683799680633952, + "learning_rate": 0.00012198795628675673, + "loss": 11.9626, + "step": 16401 + }, + { + "epoch": 0.8931544519545581, + "grad_norm": 0.5685946238732208, + "learning_rate": 0.00012197935375211822, + "loss": 12.1334, + "step": 16402 + }, + { + "epoch": 0.8932089059511411, + "grad_norm": 0.5321421061657889, + "learning_rate": 0.00012197075104656457, + "loss": 12.0824, + "step": 16403 + }, + { + "epoch": 0.8932633599477242, + "grad_norm": 0.6369712125898802, + "learning_rate": 0.00012196214817016267, + "loss": 12.0904, + "step": 16404 + }, + { + "epoch": 0.8933178139443072, + "grad_norm": 0.5264122019388323, + "learning_rate": 0.0001219535451229794, + "loss": 11.9427, + "step": 16405 + }, + { + "epoch": 0.8933722679408902, + "grad_norm": 0.5250222597786064, + "learning_rate": 0.00012194494190508175, + "loss": 12.2276, + "step": 16406 + }, + { + "epoch": 0.8934267219374732, + "grad_norm": 0.5850483257175525, + "learning_rate": 0.00012193633851653652, + "loss": 12.3033, + "step": 16407 + }, + { + "epoch": 0.8934811759340562, + "grad_norm": 0.5621707395066748, + "learning_rate": 0.00012192773495741063, + "loss": 12.2006, + "step": 16408 + }, + { + "epoch": 0.8935356299306392, + "grad_norm": 0.6184809514475227, + "learning_rate": 0.00012191913122777098, + "loss": 12.1792, + "step": 16409 + }, + { + "epoch": 0.8935900839272223, + "grad_norm": 0.5997452134431591, + "learning_rate": 0.00012191052732768453, + "loss": 12.1659, + "step": 16410 + }, + { + "epoch": 0.8936445379238053, + "grad_norm": 0.6570410268290692, + "learning_rate": 0.00012190192325721812, + "loss": 12.1779, + "step": 16411 + }, + { + "epoch": 0.8936989919203883, + "grad_norm": 0.579003499709148, + "learning_rate": 0.0001218933190164387, + "loss": 12.2527, + "step": 16412 + }, + { + "epoch": 0.8937534459169713, + "grad_norm": 0.5530522847933657, + "learning_rate": 0.00012188471460541315, + "loss": 12.0883, + "step": 16413 + }, + { + "epoch": 0.8938078999135542, + "grad_norm": 0.6791854967291375, + "learning_rate": 0.00012187611002420838, + "loss": 12.1236, + "step": 16414 + }, + { + "epoch": 0.8938623539101372, + "grad_norm": 0.5153430179909868, + "learning_rate": 0.00012186750527289132, + "loss": 12.1704, + "step": 16415 + }, + { + "epoch": 0.8939168079067203, + "grad_norm": 0.5253876428987704, + "learning_rate": 0.00012185890035152887, + "loss": 12.2482, + "step": 16416 + }, + { + "epoch": 0.8939712619033033, + "grad_norm": 0.5821672884226603, + "learning_rate": 0.00012185029526018794, + "loss": 12.2772, + "step": 16417 + }, + { + "epoch": 0.8940257158998863, + "grad_norm": 0.5224363701557859, + "learning_rate": 0.00012184168999893546, + "loss": 12.1356, + "step": 16418 + }, + { + "epoch": 0.8940801698964693, + "grad_norm": 0.588352730562737, + "learning_rate": 0.00012183308456783832, + "loss": 12.2662, + "step": 16419 + }, + { + "epoch": 0.8941346238930523, + "grad_norm": 0.5155512892761352, + "learning_rate": 0.00012182447896696347, + "loss": 12.0925, + "step": 16420 + }, + { + "epoch": 0.8941890778896353, + "grad_norm": 0.5575012062730131, + "learning_rate": 0.00012181587319637782, + "loss": 12.1494, + "step": 16421 + }, + { + "epoch": 0.8942435318862184, + "grad_norm": 0.5537627331780988, + "learning_rate": 0.00012180726725614826, + "loss": 12.0568, + "step": 16422 + }, + { + "epoch": 0.8942979858828014, + "grad_norm": 0.5203552197086324, + "learning_rate": 0.00012179866114634174, + "loss": 12.12, + "step": 16423 + }, + { + "epoch": 0.8943524398793844, + "grad_norm": 0.5283891939212064, + "learning_rate": 0.00012179005486702517, + "loss": 12.0989, + "step": 16424 + }, + { + "epoch": 0.8944068938759674, + "grad_norm": 0.5485087823080463, + "learning_rate": 0.00012178144841826548, + "loss": 12.159, + "step": 16425 + }, + { + "epoch": 0.8944613478725504, + "grad_norm": 0.5690013527694032, + "learning_rate": 0.0001217728418001296, + "loss": 12.1885, + "step": 16426 + }, + { + "epoch": 0.8945158018691335, + "grad_norm": 0.5463935145707699, + "learning_rate": 0.00012176423501268445, + "loss": 12.2017, + "step": 16427 + }, + { + "epoch": 0.8945702558657165, + "grad_norm": 0.5334736599123772, + "learning_rate": 0.00012175562805599696, + "loss": 12.2179, + "step": 16428 + }, + { + "epoch": 0.8946247098622995, + "grad_norm": 0.5351992571302758, + "learning_rate": 0.00012174702093013403, + "loss": 12.1498, + "step": 16429 + }, + { + "epoch": 0.8946791638588825, + "grad_norm": 0.6364589176976457, + "learning_rate": 0.00012173841363516265, + "loss": 12.1802, + "step": 16430 + }, + { + "epoch": 0.8947336178554655, + "grad_norm": 0.5253528278949113, + "learning_rate": 0.00012172980617114975, + "loss": 12.1021, + "step": 16431 + }, + { + "epoch": 0.8947880718520484, + "grad_norm": 0.5145999047777803, + "learning_rate": 0.00012172119853816217, + "loss": 12.2855, + "step": 16432 + }, + { + "epoch": 0.8948425258486316, + "grad_norm": 0.6613421138830665, + "learning_rate": 0.00012171259073626693, + "loss": 11.9993, + "step": 16433 + }, + { + "epoch": 0.8948969798452145, + "grad_norm": 0.4970791425366797, + "learning_rate": 0.00012170398276553094, + "loss": 11.9895, + "step": 16434 + }, + { + "epoch": 0.8949514338417975, + "grad_norm": 0.5340672710813823, + "learning_rate": 0.00012169537462602117, + "loss": 12.0471, + "step": 16435 + }, + { + "epoch": 0.8950058878383805, + "grad_norm": 0.5608441757057984, + "learning_rate": 0.00012168676631780451, + "loss": 12.0705, + "step": 16436 + }, + { + "epoch": 0.8950603418349635, + "grad_norm": 0.5554717285872464, + "learning_rate": 0.0001216781578409479, + "loss": 12.3042, + "step": 16437 + }, + { + "epoch": 0.8951147958315465, + "grad_norm": 0.5135038200349453, + "learning_rate": 0.00012166954919551832, + "loss": 12.1329, + "step": 16438 + }, + { + "epoch": 0.8951692498281296, + "grad_norm": 0.5576360282006322, + "learning_rate": 0.00012166094038158267, + "loss": 12.2105, + "step": 16439 + }, + { + "epoch": 0.8952237038247126, + "grad_norm": 0.4971445563517799, + "learning_rate": 0.00012165233139920793, + "loss": 12.178, + "step": 16440 + }, + { + "epoch": 0.8952781578212956, + "grad_norm": 0.5787494931942184, + "learning_rate": 0.00012164372224846106, + "loss": 12.145, + "step": 16441 + }, + { + "epoch": 0.8953326118178786, + "grad_norm": 0.5199979770663736, + "learning_rate": 0.00012163511292940894, + "loss": 12.0949, + "step": 16442 + }, + { + "epoch": 0.8953870658144616, + "grad_norm": 0.5910601885480786, + "learning_rate": 0.00012162650344211855, + "loss": 12.2081, + "step": 16443 + }, + { + "epoch": 0.8954415198110446, + "grad_norm": 0.5637481454779644, + "learning_rate": 0.00012161789378665684, + "loss": 12.1374, + "step": 16444 + }, + { + "epoch": 0.8954959738076277, + "grad_norm": 0.5433839584875032, + "learning_rate": 0.00012160928396309077, + "loss": 12.1934, + "step": 16445 + }, + { + "epoch": 0.8955504278042107, + "grad_norm": 0.5625919676665406, + "learning_rate": 0.00012160067397148732, + "loss": 12.0736, + "step": 16446 + }, + { + "epoch": 0.8956048818007937, + "grad_norm": 0.5686657784743229, + "learning_rate": 0.00012159206381191337, + "loss": 12.1472, + "step": 16447 + }, + { + "epoch": 0.8956593357973767, + "grad_norm": 0.5481632261766926, + "learning_rate": 0.00012158345348443592, + "loss": 12.1238, + "step": 16448 + }, + { + "epoch": 0.8957137897939597, + "grad_norm": 0.5579295242376465, + "learning_rate": 0.00012157484298912189, + "loss": 12.1607, + "step": 16449 + }, + { + "epoch": 0.8957682437905427, + "grad_norm": 0.5298093247807992, + "learning_rate": 0.0001215662323260383, + "loss": 12.2074, + "step": 16450 + }, + { + "epoch": 0.8958226977871258, + "grad_norm": 0.5798119261994258, + "learning_rate": 0.00012155762149525207, + "loss": 12.1358, + "step": 16451 + }, + { + "epoch": 0.8958771517837087, + "grad_norm": 0.5433865315844972, + "learning_rate": 0.00012154901049683014, + "loss": 12.0943, + "step": 16452 + }, + { + "epoch": 0.8959316057802917, + "grad_norm": 0.5994413735552185, + "learning_rate": 0.00012154039933083949, + "loss": 12.3357, + "step": 16453 + }, + { + "epoch": 0.8959860597768747, + "grad_norm": 0.5811815042877232, + "learning_rate": 0.00012153178799734707, + "loss": 12.1058, + "step": 16454 + }, + { + "epoch": 0.8960405137734577, + "grad_norm": 0.5344823959899129, + "learning_rate": 0.00012152317649641989, + "loss": 12.0799, + "step": 16455 + }, + { + "epoch": 0.8960949677700407, + "grad_norm": 0.6342418638772556, + "learning_rate": 0.0001215145648281249, + "loss": 12.1704, + "step": 16456 + }, + { + "epoch": 0.8961494217666238, + "grad_norm": 0.5936470227195616, + "learning_rate": 0.00012150595299252898, + "loss": 12.3105, + "step": 16457 + }, + { + "epoch": 0.8962038757632068, + "grad_norm": 0.4930485342447415, + "learning_rate": 0.00012149734098969921, + "loss": 12.0914, + "step": 16458 + }, + { + "epoch": 0.8962583297597898, + "grad_norm": 0.570412094111875, + "learning_rate": 0.00012148872881970248, + "loss": 12.2792, + "step": 16459 + }, + { + "epoch": 0.8963127837563728, + "grad_norm": 0.5553123223184898, + "learning_rate": 0.0001214801164826058, + "loss": 12.1716, + "step": 16460 + }, + { + "epoch": 0.8963672377529558, + "grad_norm": 0.5598916626976647, + "learning_rate": 0.00012147150397847616, + "loss": 12.1481, + "step": 16461 + }, + { + "epoch": 0.8964216917495389, + "grad_norm": 0.573612473014463, + "learning_rate": 0.00012146289130738046, + "loss": 12.22, + "step": 16462 + }, + { + "epoch": 0.8964761457461219, + "grad_norm": 0.5584110522872364, + "learning_rate": 0.00012145427846938575, + "loss": 11.9865, + "step": 16463 + }, + { + "epoch": 0.8965305997427049, + "grad_norm": 0.5648022930877249, + "learning_rate": 0.00012144566546455897, + "loss": 12.1107, + "step": 16464 + }, + { + "epoch": 0.8965850537392879, + "grad_norm": 0.4750639851872308, + "learning_rate": 0.00012143705229296707, + "loss": 11.9892, + "step": 16465 + }, + { + "epoch": 0.8966395077358709, + "grad_norm": 0.5240865844768169, + "learning_rate": 0.00012142843895467711, + "loss": 12.1236, + "step": 16466 + }, + { + "epoch": 0.8966939617324539, + "grad_norm": 0.5603857222852854, + "learning_rate": 0.00012141982544975596, + "loss": 12.2482, + "step": 16467 + }, + { + "epoch": 0.896748415729037, + "grad_norm": 0.5896166433903733, + "learning_rate": 0.00012141121177827068, + "loss": 12.2365, + "step": 16468 + }, + { + "epoch": 0.89680286972562, + "grad_norm": 0.5330245279845494, + "learning_rate": 0.00012140259794028823, + "loss": 12.08, + "step": 16469 + }, + { + "epoch": 0.896857323722203, + "grad_norm": 0.5299275708835319, + "learning_rate": 0.0001213939839358756, + "loss": 12.1218, + "step": 16470 + }, + { + "epoch": 0.8969117777187859, + "grad_norm": 0.5266691139680327, + "learning_rate": 0.00012138536976509973, + "loss": 12.244, + "step": 16471 + }, + { + "epoch": 0.8969662317153689, + "grad_norm": 0.5208286285047266, + "learning_rate": 0.00012137675542802767, + "loss": 12.1085, + "step": 16472 + }, + { + "epoch": 0.8970206857119519, + "grad_norm": 0.5754574343157628, + "learning_rate": 0.00012136814092472635, + "loss": 12.0592, + "step": 16473 + }, + { + "epoch": 0.897075139708535, + "grad_norm": 0.584767279675821, + "learning_rate": 0.00012135952625526278, + "loss": 12.1781, + "step": 16474 + }, + { + "epoch": 0.897129593705118, + "grad_norm": 0.5304841517778917, + "learning_rate": 0.00012135091141970399, + "loss": 12.1799, + "step": 16475 + }, + { + "epoch": 0.897184047701701, + "grad_norm": 0.5576125404035996, + "learning_rate": 0.00012134229641811689, + "loss": 12.3244, + "step": 16476 + }, + { + "epoch": 0.897238501698284, + "grad_norm": 0.5819146834555918, + "learning_rate": 0.00012133368125056854, + "loss": 12.2412, + "step": 16477 + }, + { + "epoch": 0.897292955694867, + "grad_norm": 0.5235517952639437, + "learning_rate": 0.00012132506591712592, + "loss": 12.1088, + "step": 16478 + }, + { + "epoch": 0.89734740969145, + "grad_norm": 0.5092297491141731, + "learning_rate": 0.00012131645041785598, + "loss": 12.0861, + "step": 16479 + }, + { + "epoch": 0.8974018636880331, + "grad_norm": 0.5423376099011958, + "learning_rate": 0.00012130783475282575, + "loss": 12.3334, + "step": 16480 + }, + { + "epoch": 0.8974563176846161, + "grad_norm": 0.6165992620351156, + "learning_rate": 0.00012129921892210223, + "loss": 12.1034, + "step": 16481 + }, + { + "epoch": 0.8975107716811991, + "grad_norm": 0.560353220103204, + "learning_rate": 0.0001212906029257524, + "loss": 12.171, + "step": 16482 + }, + { + "epoch": 0.8975652256777821, + "grad_norm": 0.6155562428640652, + "learning_rate": 0.0001212819867638433, + "loss": 12.198, + "step": 16483 + }, + { + "epoch": 0.8976196796743651, + "grad_norm": 0.5437538950050923, + "learning_rate": 0.00012127337043644189, + "loss": 12.1642, + "step": 16484 + }, + { + "epoch": 0.8976741336709481, + "grad_norm": 0.6687829324129224, + "learning_rate": 0.00012126475394361518, + "loss": 12.1437, + "step": 16485 + }, + { + "epoch": 0.8977285876675312, + "grad_norm": 0.5454772058136936, + "learning_rate": 0.00012125613728543017, + "loss": 12.1478, + "step": 16486 + }, + { + "epoch": 0.8977830416641142, + "grad_norm": 0.5472486153196883, + "learning_rate": 0.00012124752046195386, + "loss": 12.1001, + "step": 16487 + }, + { + "epoch": 0.8978374956606971, + "grad_norm": 0.5995583406684463, + "learning_rate": 0.0001212389034732533, + "loss": 12.1316, + "step": 16488 + }, + { + "epoch": 0.8978919496572801, + "grad_norm": 0.6390986154680263, + "learning_rate": 0.00012123028631939546, + "loss": 12.2673, + "step": 16489 + }, + { + "epoch": 0.8979464036538631, + "grad_norm": 0.5580090457577993, + "learning_rate": 0.00012122166900044734, + "loss": 12.0893, + "step": 16490 + }, + { + "epoch": 0.8980008576504461, + "grad_norm": 0.6298978408215506, + "learning_rate": 0.000121213051516476, + "loss": 12.0943, + "step": 16491 + }, + { + "epoch": 0.8980553116470292, + "grad_norm": 0.5786218651642691, + "learning_rate": 0.00012120443386754833, + "loss": 12.2038, + "step": 16492 + }, + { + "epoch": 0.8981097656436122, + "grad_norm": 0.6447769323897492, + "learning_rate": 0.00012119581605373149, + "loss": 12.324, + "step": 16493 + }, + { + "epoch": 0.8981642196401952, + "grad_norm": 0.5846483339186045, + "learning_rate": 0.00012118719807509242, + "loss": 12.0203, + "step": 16494 + }, + { + "epoch": 0.8982186736367782, + "grad_norm": 0.5444446996695285, + "learning_rate": 0.00012117857993169815, + "loss": 12.1582, + "step": 16495 + }, + { + "epoch": 0.8982731276333612, + "grad_norm": 0.5039319594598647, + "learning_rate": 0.00012116996162361569, + "loss": 12.1209, + "step": 16496 + }, + { + "epoch": 0.8983275816299443, + "grad_norm": 0.5367761049127899, + "learning_rate": 0.00012116134315091205, + "loss": 12.1595, + "step": 16497 + }, + { + "epoch": 0.8983820356265273, + "grad_norm": 0.5986676385179812, + "learning_rate": 0.00012115272451365425, + "loss": 12.1886, + "step": 16498 + }, + { + "epoch": 0.8984364896231103, + "grad_norm": 0.5056536004141298, + "learning_rate": 0.00012114410571190932, + "loss": 12.155, + "step": 16499 + }, + { + "epoch": 0.8984909436196933, + "grad_norm": 0.5538299734606021, + "learning_rate": 0.00012113548674574428, + "loss": 12.1304, + "step": 16500 + }, + { + "epoch": 0.8985453976162763, + "grad_norm": 0.6101289756036824, + "learning_rate": 0.00012112686761522618, + "loss": 12.2362, + "step": 16501 + }, + { + "epoch": 0.8985998516128593, + "grad_norm": 0.5493581205779898, + "learning_rate": 0.00012111824832042198, + "loss": 12.1193, + "step": 16502 + }, + { + "epoch": 0.8986543056094424, + "grad_norm": 0.5364252298008217, + "learning_rate": 0.00012110962886139874, + "loss": 12.0637, + "step": 16503 + }, + { + "epoch": 0.8987087596060254, + "grad_norm": 0.6534881826446343, + "learning_rate": 0.00012110100923822347, + "loss": 12.0993, + "step": 16504 + }, + { + "epoch": 0.8987632136026084, + "grad_norm": 0.5688817236521643, + "learning_rate": 0.00012109238945096324, + "loss": 12.1232, + "step": 16505 + }, + { + "epoch": 0.8988176675991914, + "grad_norm": 0.571168880384876, + "learning_rate": 0.00012108376949968507, + "loss": 12.1747, + "step": 16506 + }, + { + "epoch": 0.8988721215957743, + "grad_norm": 0.581402689275133, + "learning_rate": 0.00012107514938445597, + "loss": 12.0934, + "step": 16507 + }, + { + "epoch": 0.8989265755923573, + "grad_norm": 0.565341996031247, + "learning_rate": 0.00012106652910534295, + "loss": 12.1374, + "step": 16508 + }, + { + "epoch": 0.8989810295889404, + "grad_norm": 0.5806175343777127, + "learning_rate": 0.00012105790866241304, + "loss": 12.1423, + "step": 16509 + }, + { + "epoch": 0.8990354835855234, + "grad_norm": 0.6088934935079416, + "learning_rate": 0.00012104928805573335, + "loss": 12.1107, + "step": 16510 + }, + { + "epoch": 0.8990899375821064, + "grad_norm": 0.592673300247471, + "learning_rate": 0.00012104066728537087, + "loss": 12.221, + "step": 16511 + }, + { + "epoch": 0.8991443915786894, + "grad_norm": 0.5911083965399428, + "learning_rate": 0.0001210320463513926, + "loss": 12.2857, + "step": 16512 + }, + { + "epoch": 0.8991988455752724, + "grad_norm": 0.6696587962542493, + "learning_rate": 0.00012102342525386563, + "loss": 12.2196, + "step": 16513 + }, + { + "epoch": 0.8992532995718554, + "grad_norm": 0.6126665467482124, + "learning_rate": 0.00012101480399285694, + "loss": 12.1568, + "step": 16514 + }, + { + "epoch": 0.8993077535684385, + "grad_norm": 0.5412566276103402, + "learning_rate": 0.00012100618256843365, + "loss": 12.1299, + "step": 16515 + }, + { + "epoch": 0.8993622075650215, + "grad_norm": 0.5894568191245585, + "learning_rate": 0.00012099756098066277, + "loss": 12.0663, + "step": 16516 + }, + { + "epoch": 0.8994166615616045, + "grad_norm": 0.5975120390893832, + "learning_rate": 0.00012098893922961132, + "loss": 12.2112, + "step": 16517 + }, + { + "epoch": 0.8994711155581875, + "grad_norm": 0.501871125377737, + "learning_rate": 0.00012098031731534636, + "loss": 12.1129, + "step": 16518 + }, + { + "epoch": 0.8995255695547705, + "grad_norm": 0.5719878838629991, + "learning_rate": 0.00012097169523793492, + "loss": 12.2414, + "step": 16519 + }, + { + "epoch": 0.8995800235513535, + "grad_norm": 0.5550650218828809, + "learning_rate": 0.00012096307299744407, + "loss": 12.1494, + "step": 16520 + }, + { + "epoch": 0.8996344775479366, + "grad_norm": 0.5500582375834041, + "learning_rate": 0.00012095445059394086, + "loss": 12.2454, + "step": 16521 + }, + { + "epoch": 0.8996889315445196, + "grad_norm": 0.5435338973403163, + "learning_rate": 0.00012094582802749233, + "loss": 12.09, + "step": 16522 + }, + { + "epoch": 0.8997433855411026, + "grad_norm": 0.5817020199326123, + "learning_rate": 0.0001209372052981655, + "loss": 12.0923, + "step": 16523 + }, + { + "epoch": 0.8997978395376856, + "grad_norm": 0.5539529391293029, + "learning_rate": 0.00012092858240602746, + "loss": 12.1604, + "step": 16524 + }, + { + "epoch": 0.8998522935342685, + "grad_norm": 0.5991940563450556, + "learning_rate": 0.00012091995935114529, + "loss": 12.2028, + "step": 16525 + }, + { + "epoch": 0.8999067475308516, + "grad_norm": 0.5861434759316408, + "learning_rate": 0.00012091133613358594, + "loss": 12.2572, + "step": 16526 + }, + { + "epoch": 0.8999612015274346, + "grad_norm": 0.5538040311209945, + "learning_rate": 0.0001209027127534166, + "loss": 12.0785, + "step": 16527 + }, + { + "epoch": 0.9000156555240176, + "grad_norm": 0.5297275052982207, + "learning_rate": 0.00012089408921070424, + "loss": 12.1511, + "step": 16528 + }, + { + "epoch": 0.9000701095206006, + "grad_norm": 0.5395278733873242, + "learning_rate": 0.00012088546550551592, + "loss": 12.1181, + "step": 16529 + }, + { + "epoch": 0.9001245635171836, + "grad_norm": 0.5456466831033283, + "learning_rate": 0.00012087684163791873, + "loss": 12.1539, + "step": 16530 + }, + { + "epoch": 0.9001790175137666, + "grad_norm": 0.5656408884837834, + "learning_rate": 0.0001208682176079797, + "loss": 12.2807, + "step": 16531 + }, + { + "epoch": 0.9002334715103497, + "grad_norm": 0.543053283049204, + "learning_rate": 0.00012085959341576596, + "loss": 12.1513, + "step": 16532 + }, + { + "epoch": 0.9002879255069327, + "grad_norm": 0.5275488355315441, + "learning_rate": 0.00012085096906134447, + "loss": 12.167, + "step": 16533 + }, + { + "epoch": 0.9003423795035157, + "grad_norm": 0.5525419018374172, + "learning_rate": 0.00012084234454478239, + "loss": 12.0559, + "step": 16534 + }, + { + "epoch": 0.9003968335000987, + "grad_norm": 0.5121672343176237, + "learning_rate": 0.00012083371986614671, + "loss": 12.1287, + "step": 16535 + }, + { + "epoch": 0.9004512874966817, + "grad_norm": 0.5679055100171616, + "learning_rate": 0.00012082509502550454, + "loss": 12.1174, + "step": 16536 + }, + { + "epoch": 0.9005057414932647, + "grad_norm": 0.6259631770417831, + "learning_rate": 0.00012081647002292296, + "loss": 12.1103, + "step": 16537 + }, + { + "epoch": 0.9005601954898478, + "grad_norm": 0.6373837573994012, + "learning_rate": 0.00012080784485846899, + "loss": 12.1739, + "step": 16538 + }, + { + "epoch": 0.9006146494864308, + "grad_norm": 0.5627806904812449, + "learning_rate": 0.00012079921953220975, + "loss": 12.1975, + "step": 16539 + }, + { + "epoch": 0.9006691034830138, + "grad_norm": 0.5666742303334366, + "learning_rate": 0.00012079059404421227, + "loss": 12.1863, + "step": 16540 + }, + { + "epoch": 0.9007235574795968, + "grad_norm": 0.5884420986046116, + "learning_rate": 0.00012078196839454365, + "loss": 12.257, + "step": 16541 + }, + { + "epoch": 0.9007780114761798, + "grad_norm": 0.6108872627903197, + "learning_rate": 0.00012077334258327097, + "loss": 12.1131, + "step": 16542 + }, + { + "epoch": 0.9008324654727627, + "grad_norm": 0.6058293067479195, + "learning_rate": 0.00012076471661046129, + "loss": 12.146, + "step": 16543 + }, + { + "epoch": 0.9008869194693458, + "grad_norm": 0.5609878745463391, + "learning_rate": 0.00012075609047618169, + "loss": 12.2037, + "step": 16544 + }, + { + "epoch": 0.9009413734659288, + "grad_norm": 0.5445608032269247, + "learning_rate": 0.00012074746418049924, + "loss": 12.1039, + "step": 16545 + }, + { + "epoch": 0.9009958274625118, + "grad_norm": 0.5625864225762172, + "learning_rate": 0.00012073883772348105, + "loss": 12.2862, + "step": 16546 + }, + { + "epoch": 0.9010502814590948, + "grad_norm": 0.5848363906000628, + "learning_rate": 0.00012073021110519416, + "loss": 12.1611, + "step": 16547 + }, + { + "epoch": 0.9011047354556778, + "grad_norm": 0.5851829608040879, + "learning_rate": 0.00012072158432570569, + "loss": 12.2337, + "step": 16548 + }, + { + "epoch": 0.9011591894522608, + "grad_norm": 0.5248127918593556, + "learning_rate": 0.00012071295738508268, + "loss": 12.2178, + "step": 16549 + }, + { + "epoch": 0.9012136434488439, + "grad_norm": 0.5568758893873234, + "learning_rate": 0.00012070433028339226, + "loss": 12.1396, + "step": 16550 + }, + { + "epoch": 0.9012680974454269, + "grad_norm": 0.5732195807465644, + "learning_rate": 0.00012069570302070148, + "loss": 12.1719, + "step": 16551 + }, + { + "epoch": 0.9013225514420099, + "grad_norm": 0.6810937493211286, + "learning_rate": 0.00012068707559707746, + "loss": 12.2129, + "step": 16552 + }, + { + "epoch": 0.9013770054385929, + "grad_norm": 0.5264730181108157, + "learning_rate": 0.00012067844801258726, + "loss": 12.1977, + "step": 16553 + }, + { + "epoch": 0.9014314594351759, + "grad_norm": 0.5334773683000122, + "learning_rate": 0.00012066982026729798, + "loss": 12.1088, + "step": 16554 + }, + { + "epoch": 0.9014859134317589, + "grad_norm": 0.6875605989114848, + "learning_rate": 0.0001206611923612767, + "loss": 12.0489, + "step": 16555 + }, + { + "epoch": 0.901540367428342, + "grad_norm": 0.6142806055177614, + "learning_rate": 0.00012065256429459056, + "loss": 12.2554, + "step": 16556 + }, + { + "epoch": 0.901594821424925, + "grad_norm": 0.5594151040235987, + "learning_rate": 0.00012064393606730662, + "loss": 12.1569, + "step": 16557 + }, + { + "epoch": 0.901649275421508, + "grad_norm": 0.5922801599066374, + "learning_rate": 0.00012063530767949191, + "loss": 12.1549, + "step": 16558 + }, + { + "epoch": 0.901703729418091, + "grad_norm": 0.6089845067197045, + "learning_rate": 0.00012062667913121363, + "loss": 12.2396, + "step": 16559 + }, + { + "epoch": 0.901758183414674, + "grad_norm": 0.5340371089060799, + "learning_rate": 0.00012061805042253881, + "loss": 11.9331, + "step": 16560 + }, + { + "epoch": 0.9018126374112571, + "grad_norm": 0.55790354309246, + "learning_rate": 0.0001206094215535346, + "loss": 12.0415, + "step": 16561 + }, + { + "epoch": 0.90186709140784, + "grad_norm": 0.6135537059450782, + "learning_rate": 0.00012060079252426809, + "loss": 12.1952, + "step": 16562 + }, + { + "epoch": 0.901921545404423, + "grad_norm": 0.5821693916055684, + "learning_rate": 0.00012059216333480632, + "loss": 12.2118, + "step": 16563 + }, + { + "epoch": 0.901975999401006, + "grad_norm": 0.5744206999331853, + "learning_rate": 0.00012058353398521644, + "loss": 12.2648, + "step": 16564 + }, + { + "epoch": 0.902030453397589, + "grad_norm": 0.6871102465432373, + "learning_rate": 0.00012057490447556556, + "loss": 12.1974, + "step": 16565 + }, + { + "epoch": 0.902084907394172, + "grad_norm": 0.5584545841478958, + "learning_rate": 0.00012056627480592077, + "loss": 12.1063, + "step": 16566 + }, + { + "epoch": 0.9021393613907551, + "grad_norm": 0.5289270437003082, + "learning_rate": 0.0001205576449763492, + "loss": 12.0957, + "step": 16567 + }, + { + "epoch": 0.9021938153873381, + "grad_norm": 0.613888501086125, + "learning_rate": 0.0001205490149869179, + "loss": 12.1563, + "step": 16568 + }, + { + "epoch": 0.9022482693839211, + "grad_norm": 0.6974437227061876, + "learning_rate": 0.00012054038483769401, + "loss": 12.1972, + "step": 16569 + }, + { + "epoch": 0.9023027233805041, + "grad_norm": 0.5782039366655223, + "learning_rate": 0.00012053175452874466, + "loss": 12.1505, + "step": 16570 + }, + { + "epoch": 0.9023571773770871, + "grad_norm": 0.5986953500684845, + "learning_rate": 0.00012052312406013694, + "loss": 12.0572, + "step": 16571 + }, + { + "epoch": 0.9024116313736701, + "grad_norm": 0.5781745381427984, + "learning_rate": 0.00012051449343193799, + "loss": 12.1791, + "step": 16572 + }, + { + "epoch": 0.9024660853702532, + "grad_norm": 0.580489142635654, + "learning_rate": 0.00012050586264421489, + "loss": 12.1939, + "step": 16573 + }, + { + "epoch": 0.9025205393668362, + "grad_norm": 0.5888350093249815, + "learning_rate": 0.00012049723169703474, + "loss": 12.1028, + "step": 16574 + }, + { + "epoch": 0.9025749933634192, + "grad_norm": 0.6599516366929871, + "learning_rate": 0.00012048860059046468, + "loss": 12.33, + "step": 16575 + }, + { + "epoch": 0.9026294473600022, + "grad_norm": 0.6568993104827798, + "learning_rate": 0.00012047996932457182, + "loss": 12.3188, + "step": 16576 + }, + { + "epoch": 0.9026839013565852, + "grad_norm": 0.5526626692920116, + "learning_rate": 0.0001204713378994233, + "loss": 12.1452, + "step": 16577 + }, + { + "epoch": 0.9027383553531682, + "grad_norm": 0.5411002258022113, + "learning_rate": 0.00012046270631508623, + "loss": 12.2017, + "step": 16578 + }, + { + "epoch": 0.9027928093497513, + "grad_norm": 0.5304822759479126, + "learning_rate": 0.00012045407457162772, + "loss": 11.9785, + "step": 16579 + }, + { + "epoch": 0.9028472633463343, + "grad_norm": 0.668688733482345, + "learning_rate": 0.00012044544266911488, + "loss": 12.1479, + "step": 16580 + }, + { + "epoch": 0.9029017173429172, + "grad_norm": 0.5969611170805564, + "learning_rate": 0.00012043681060761484, + "loss": 12.1249, + "step": 16581 + }, + { + "epoch": 0.9029561713395002, + "grad_norm": 0.6044686527675578, + "learning_rate": 0.0001204281783871948, + "loss": 12.2064, + "step": 16582 + }, + { + "epoch": 0.9030106253360832, + "grad_norm": 0.577749237566279, + "learning_rate": 0.00012041954600792175, + "loss": 12.1072, + "step": 16583 + }, + { + "epoch": 0.9030650793326662, + "grad_norm": 0.5766044759471065, + "learning_rate": 0.00012041091346986292, + "loss": 12.2084, + "step": 16584 + }, + { + "epoch": 0.9031195333292493, + "grad_norm": 0.5806415623737022, + "learning_rate": 0.0001204022807730854, + "loss": 12.1572, + "step": 16585 + }, + { + "epoch": 0.9031739873258323, + "grad_norm": 0.5656482861317721, + "learning_rate": 0.0001203936479176563, + "loss": 12.1701, + "step": 16586 + }, + { + "epoch": 0.9032284413224153, + "grad_norm": 0.619887669845093, + "learning_rate": 0.00012038501490364281, + "loss": 12.19, + "step": 16587 + }, + { + "epoch": 0.9032828953189983, + "grad_norm": 0.595453210340934, + "learning_rate": 0.00012037638173111201, + "loss": 12.0868, + "step": 16588 + }, + { + "epoch": 0.9033373493155813, + "grad_norm": 0.5307560222648793, + "learning_rate": 0.00012036774840013103, + "loss": 12.0417, + "step": 16589 + }, + { + "epoch": 0.9033918033121643, + "grad_norm": 0.5689723062921288, + "learning_rate": 0.00012035911491076704, + "loss": 12.0614, + "step": 16590 + }, + { + "epoch": 0.9034462573087474, + "grad_norm": 0.5369195786912859, + "learning_rate": 0.00012035048126308715, + "loss": 12.1579, + "step": 16591 + }, + { + "epoch": 0.9035007113053304, + "grad_norm": 0.528098688405729, + "learning_rate": 0.00012034184745715853, + "loss": 12.1516, + "step": 16592 + }, + { + "epoch": 0.9035551653019134, + "grad_norm": 0.6048731746648017, + "learning_rate": 0.00012033321349304827, + "loss": 12.1478, + "step": 16593 + }, + { + "epoch": 0.9036096192984964, + "grad_norm": 0.5574691218373621, + "learning_rate": 0.00012032457937082353, + "loss": 12.1007, + "step": 16594 + }, + { + "epoch": 0.9036640732950794, + "grad_norm": 0.5729354710039366, + "learning_rate": 0.00012031594509055146, + "loss": 12.2509, + "step": 16595 + }, + { + "epoch": 0.9037185272916625, + "grad_norm": 0.5898254632000199, + "learning_rate": 0.00012030731065229918, + "loss": 12.178, + "step": 16596 + }, + { + "epoch": 0.9037729812882455, + "grad_norm": 0.5430476778389886, + "learning_rate": 0.00012029867605613385, + "loss": 12.1013, + "step": 16597 + }, + { + "epoch": 0.9038274352848285, + "grad_norm": 0.5719532844173403, + "learning_rate": 0.00012029004130212263, + "loss": 12.2637, + "step": 16598 + }, + { + "epoch": 0.9038818892814114, + "grad_norm": 0.5673084511445369, + "learning_rate": 0.00012028140639033262, + "loss": 12.2498, + "step": 16599 + }, + { + "epoch": 0.9039363432779944, + "grad_norm": 0.5575708863107153, + "learning_rate": 0.00012027277132083103, + "loss": 12.0741, + "step": 16600 + }, + { + "epoch": 0.9039907972745774, + "grad_norm": 0.5685739983488483, + "learning_rate": 0.00012026413609368495, + "loss": 12.1816, + "step": 16601 + }, + { + "epoch": 0.9040452512711605, + "grad_norm": 0.581791608106945, + "learning_rate": 0.00012025550070896155, + "loss": 12.1209, + "step": 16602 + }, + { + "epoch": 0.9040997052677435, + "grad_norm": 0.5168490029790749, + "learning_rate": 0.00012024686516672796, + "loss": 12.1303, + "step": 16603 + }, + { + "epoch": 0.9041541592643265, + "grad_norm": 0.5735573094212707, + "learning_rate": 0.0001202382294670514, + "loss": 12.1718, + "step": 16604 + }, + { + "epoch": 0.9042086132609095, + "grad_norm": 0.5851835687748101, + "learning_rate": 0.00012022959360999893, + "loss": 12.1299, + "step": 16605 + }, + { + "epoch": 0.9042630672574925, + "grad_norm": 0.5561947463654558, + "learning_rate": 0.00012022095759563777, + "loss": 12.0567, + "step": 16606 + }, + { + "epoch": 0.9043175212540755, + "grad_norm": 0.57096710325472, + "learning_rate": 0.00012021232142403502, + "loss": 12.1633, + "step": 16607 + }, + { + "epoch": 0.9043719752506586, + "grad_norm": 0.6064023929898223, + "learning_rate": 0.0001202036850952579, + "loss": 12.195, + "step": 16608 + }, + { + "epoch": 0.9044264292472416, + "grad_norm": 0.7204348761597378, + "learning_rate": 0.00012019504860937352, + "loss": 12.2921, + "step": 16609 + }, + { + "epoch": 0.9044808832438246, + "grad_norm": 0.5646044232101393, + "learning_rate": 0.00012018641196644906, + "loss": 12.1701, + "step": 16610 + }, + { + "epoch": 0.9045353372404076, + "grad_norm": 0.5274641380881213, + "learning_rate": 0.00012017777516655169, + "loss": 12.06, + "step": 16611 + }, + { + "epoch": 0.9045897912369906, + "grad_norm": 0.5251427691853415, + "learning_rate": 0.00012016913820974856, + "loss": 12.2064, + "step": 16612 + }, + { + "epoch": 0.9046442452335736, + "grad_norm": 0.6021069458024625, + "learning_rate": 0.00012016050109610679, + "loss": 12.111, + "step": 16613 + }, + { + "epoch": 0.9046986992301567, + "grad_norm": 0.5338626495772549, + "learning_rate": 0.00012015186382569362, + "loss": 12.1724, + "step": 16614 + }, + { + "epoch": 0.9047531532267397, + "grad_norm": 0.5322612991595553, + "learning_rate": 0.00012014322639857616, + "loss": 12.1126, + "step": 16615 + }, + { + "epoch": 0.9048076072233227, + "grad_norm": 0.596317737541182, + "learning_rate": 0.0001201345888148216, + "loss": 12.1244, + "step": 16616 + }, + { + "epoch": 0.9048620612199056, + "grad_norm": 0.5481072720850805, + "learning_rate": 0.00012012595107449712, + "loss": 12.2287, + "step": 16617 + }, + { + "epoch": 0.9049165152164886, + "grad_norm": 0.5679762111084247, + "learning_rate": 0.00012011731317766983, + "loss": 12.1883, + "step": 16618 + }, + { + "epoch": 0.9049709692130716, + "grad_norm": 0.5880208205737794, + "learning_rate": 0.00012010867512440695, + "loss": 12.1742, + "step": 16619 + }, + { + "epoch": 0.9050254232096547, + "grad_norm": 0.6104970007982246, + "learning_rate": 0.00012010003691477564, + "loss": 12.2109, + "step": 16620 + }, + { + "epoch": 0.9050798772062377, + "grad_norm": 0.5823536654745095, + "learning_rate": 0.00012009139854884308, + "loss": 12.1697, + "step": 16621 + }, + { + "epoch": 0.9051343312028207, + "grad_norm": 0.6349717560007302, + "learning_rate": 0.00012008276002667646, + "loss": 12.0318, + "step": 16622 + }, + { + "epoch": 0.9051887851994037, + "grad_norm": 0.5989307276431446, + "learning_rate": 0.0001200741213483429, + "loss": 12.0962, + "step": 16623 + }, + { + "epoch": 0.9052432391959867, + "grad_norm": 0.6016610302270592, + "learning_rate": 0.00012006548251390959, + "loss": 12.1917, + "step": 16624 + }, + { + "epoch": 0.9052976931925697, + "grad_norm": 0.5617629352693115, + "learning_rate": 0.00012005684352344375, + "loss": 12.0168, + "step": 16625 + }, + { + "epoch": 0.9053521471891528, + "grad_norm": 0.7072618121372366, + "learning_rate": 0.00012004820437701252, + "loss": 12.1314, + "step": 16626 + }, + { + "epoch": 0.9054066011857358, + "grad_norm": 0.8022261000813399, + "learning_rate": 0.00012003956507468312, + "loss": 12.1558, + "step": 16627 + }, + { + "epoch": 0.9054610551823188, + "grad_norm": 0.6477531181079941, + "learning_rate": 0.00012003092561652267, + "loss": 12.184, + "step": 16628 + }, + { + "epoch": 0.9055155091789018, + "grad_norm": 0.565053757093744, + "learning_rate": 0.00012002228600259838, + "loss": 12.2107, + "step": 16629 + }, + { + "epoch": 0.9055699631754848, + "grad_norm": 0.6237518603632382, + "learning_rate": 0.00012001364623297744, + "loss": 12.045, + "step": 16630 + }, + { + "epoch": 0.9056244171720679, + "grad_norm": 0.6334450869783368, + "learning_rate": 0.00012000500630772705, + "loss": 12.1567, + "step": 16631 + }, + { + "epoch": 0.9056788711686509, + "grad_norm": 0.5331753187207506, + "learning_rate": 0.00011999636622691438, + "loss": 12.1466, + "step": 16632 + }, + { + "epoch": 0.9057333251652339, + "grad_norm": 0.5556861753442944, + "learning_rate": 0.0001199877259906066, + "loss": 12.2399, + "step": 16633 + }, + { + "epoch": 0.9057877791618169, + "grad_norm": 0.6053269076177362, + "learning_rate": 0.0001199790855988709, + "loss": 12.2552, + "step": 16634 + }, + { + "epoch": 0.9058422331583998, + "grad_norm": 0.5511390999628655, + "learning_rate": 0.00011997044505177446, + "loss": 12.2137, + "step": 16635 + }, + { + "epoch": 0.9058966871549828, + "grad_norm": 0.5596620142884977, + "learning_rate": 0.00011996180434938451, + "loss": 12.0874, + "step": 16636 + }, + { + "epoch": 0.9059511411515659, + "grad_norm": 0.7251786263883778, + "learning_rate": 0.00011995316349176825, + "loss": 12.1136, + "step": 16637 + }, + { + "epoch": 0.9060055951481489, + "grad_norm": 0.5735425418309598, + "learning_rate": 0.00011994452247899284, + "loss": 12.1922, + "step": 16638 + }, + { + "epoch": 0.9060600491447319, + "grad_norm": 0.5538004555453727, + "learning_rate": 0.00011993588131112545, + "loss": 12.2168, + "step": 16639 + }, + { + "epoch": 0.9061145031413149, + "grad_norm": 0.6079380700170678, + "learning_rate": 0.0001199272399882333, + "loss": 12.1329, + "step": 16640 + }, + { + "epoch": 0.9061689571378979, + "grad_norm": 0.617454855656419, + "learning_rate": 0.0001199185985103836, + "loss": 12.144, + "step": 16641 + }, + { + "epoch": 0.9062234111344809, + "grad_norm": 0.5169310498004772, + "learning_rate": 0.00011990995687764357, + "loss": 12.2623, + "step": 16642 + }, + { + "epoch": 0.906277865131064, + "grad_norm": 0.6177406350488743, + "learning_rate": 0.00011990131509008036, + "loss": 12.12, + "step": 16643 + }, + { + "epoch": 0.906332319127647, + "grad_norm": 0.6277145773760757, + "learning_rate": 0.00011989267314776116, + "loss": 12.2011, + "step": 16644 + }, + { + "epoch": 0.90638677312423, + "grad_norm": 0.5425803391625272, + "learning_rate": 0.00011988403105075323, + "loss": 12.0853, + "step": 16645 + }, + { + "epoch": 0.906441227120813, + "grad_norm": 0.5910161248310666, + "learning_rate": 0.0001198753887991237, + "loss": 12.2245, + "step": 16646 + }, + { + "epoch": 0.906495681117396, + "grad_norm": 0.5450148777653727, + "learning_rate": 0.00011986674639293984, + "loss": 12.2561, + "step": 16647 + }, + { + "epoch": 0.906550135113979, + "grad_norm": 0.6509899028924494, + "learning_rate": 0.00011985810383226884, + "loss": 12.3318, + "step": 16648 + }, + { + "epoch": 0.9066045891105621, + "grad_norm": 0.5842785402708647, + "learning_rate": 0.00011984946111717787, + "loss": 12.1969, + "step": 16649 + }, + { + "epoch": 0.9066590431071451, + "grad_norm": 0.5736884415395951, + "learning_rate": 0.00011984081824773418, + "loss": 12.2531, + "step": 16650 + }, + { + "epoch": 0.9067134971037281, + "grad_norm": 0.6329788568815561, + "learning_rate": 0.00011983217522400494, + "loss": 12.2265, + "step": 16651 + }, + { + "epoch": 0.906767951100311, + "grad_norm": 0.5200775966974495, + "learning_rate": 0.0001198235320460574, + "loss": 12.1019, + "step": 16652 + }, + { + "epoch": 0.906822405096894, + "grad_norm": 0.6080449242599029, + "learning_rate": 0.00011981488871395874, + "loss": 12.2413, + "step": 16653 + }, + { + "epoch": 0.906876859093477, + "grad_norm": 0.5179714179732703, + "learning_rate": 0.0001198062452277762, + "loss": 12.2128, + "step": 16654 + }, + { + "epoch": 0.9069313130900601, + "grad_norm": 0.5315355453059781, + "learning_rate": 0.00011979760158757693, + "loss": 12.1813, + "step": 16655 + }, + { + "epoch": 0.9069857670866431, + "grad_norm": 0.540844043152265, + "learning_rate": 0.00011978895779342823, + "loss": 12.0854, + "step": 16656 + }, + { + "epoch": 0.9070402210832261, + "grad_norm": 0.5503422274480156, + "learning_rate": 0.00011978031384539727, + "loss": 12.2659, + "step": 16657 + }, + { + "epoch": 0.9070946750798091, + "grad_norm": 0.5766267629462053, + "learning_rate": 0.00011977166974355127, + "loss": 12.252, + "step": 16658 + }, + { + "epoch": 0.9071491290763921, + "grad_norm": 0.5981171855869646, + "learning_rate": 0.00011976302548795746, + "loss": 12.1501, + "step": 16659 + }, + { + "epoch": 0.9072035830729752, + "grad_norm": 0.6071357970665403, + "learning_rate": 0.00011975438107868302, + "loss": 12.237, + "step": 16660 + }, + { + "epoch": 0.9072580370695582, + "grad_norm": 0.5337519119985165, + "learning_rate": 0.00011974573651579521, + "loss": 12.0513, + "step": 16661 + }, + { + "epoch": 0.9073124910661412, + "grad_norm": 0.6093536507145341, + "learning_rate": 0.00011973709179936125, + "loss": 12.2212, + "step": 16662 + }, + { + "epoch": 0.9073669450627242, + "grad_norm": 0.5361183955849707, + "learning_rate": 0.00011972844692944835, + "loss": 11.9772, + "step": 16663 + }, + { + "epoch": 0.9074213990593072, + "grad_norm": 0.6072014184975755, + "learning_rate": 0.00011971980190612375, + "loss": 12.0981, + "step": 16664 + }, + { + "epoch": 0.9074758530558902, + "grad_norm": 0.5604623769710646, + "learning_rate": 0.00011971115672945466, + "loss": 12.2179, + "step": 16665 + }, + { + "epoch": 0.9075303070524733, + "grad_norm": 0.6645669085703189, + "learning_rate": 0.0001197025113995083, + "loss": 12.3859, + "step": 16666 + }, + { + "epoch": 0.9075847610490563, + "grad_norm": 0.6003976294910272, + "learning_rate": 0.00011969386591635192, + "loss": 12.164, + "step": 16667 + }, + { + "epoch": 0.9076392150456393, + "grad_norm": 0.6585124722512923, + "learning_rate": 0.00011968522028005273, + "loss": 12.2127, + "step": 16668 + }, + { + "epoch": 0.9076936690422223, + "grad_norm": 0.5535103951097059, + "learning_rate": 0.00011967657449067795, + "loss": 12.1997, + "step": 16669 + }, + { + "epoch": 0.9077481230388053, + "grad_norm": 0.541805955353317, + "learning_rate": 0.00011966792854829485, + "loss": 12.0348, + "step": 16670 + }, + { + "epoch": 0.9078025770353882, + "grad_norm": 0.5215776723442405, + "learning_rate": 0.00011965928245297063, + "loss": 12.1675, + "step": 16671 + }, + { + "epoch": 0.9078570310319714, + "grad_norm": 0.5248796095810393, + "learning_rate": 0.00011965063620477252, + "loss": 12.0794, + "step": 16672 + }, + { + "epoch": 0.9079114850285543, + "grad_norm": 0.48543629832874036, + "learning_rate": 0.0001196419898037678, + "loss": 12.0909, + "step": 16673 + }, + { + "epoch": 0.9079659390251373, + "grad_norm": 0.5810116401522981, + "learning_rate": 0.00011963334325002364, + "loss": 12.1623, + "step": 16674 + }, + { + "epoch": 0.9080203930217203, + "grad_norm": 0.5140735331995465, + "learning_rate": 0.00011962469654360733, + "loss": 12.0415, + "step": 16675 + }, + { + "epoch": 0.9080748470183033, + "grad_norm": 0.5242609749151819, + "learning_rate": 0.00011961604968458609, + "loss": 12.1652, + "step": 16676 + }, + { + "epoch": 0.9081293010148863, + "grad_norm": 0.5012690451280981, + "learning_rate": 0.00011960740267302715, + "loss": 12.0782, + "step": 16677 + }, + { + "epoch": 0.9081837550114694, + "grad_norm": 0.5704474808309116, + "learning_rate": 0.00011959875550899775, + "loss": 12.2232, + "step": 16678 + }, + { + "epoch": 0.9082382090080524, + "grad_norm": 0.5242457472681777, + "learning_rate": 0.00011959010819256515, + "loss": 12.1655, + "step": 16679 + }, + { + "epoch": 0.9082926630046354, + "grad_norm": 0.5767583316134496, + "learning_rate": 0.00011958146072379659, + "loss": 12.0799, + "step": 16680 + }, + { + "epoch": 0.9083471170012184, + "grad_norm": 0.612337674023645, + "learning_rate": 0.00011957281310275929, + "loss": 12.1277, + "step": 16681 + }, + { + "epoch": 0.9084015709978014, + "grad_norm": 0.5639431431197535, + "learning_rate": 0.00011956416532952052, + "loss": 12.1048, + "step": 16682 + }, + { + "epoch": 0.9084560249943844, + "grad_norm": 0.5781707248709169, + "learning_rate": 0.00011955551740414754, + "loss": 12.0985, + "step": 16683 + }, + { + "epoch": 0.9085104789909675, + "grad_norm": 0.5415750537886928, + "learning_rate": 0.00011954686932670755, + "loss": 12.0249, + "step": 16684 + }, + { + "epoch": 0.9085649329875505, + "grad_norm": 0.5771376377405151, + "learning_rate": 0.00011953822109726785, + "loss": 12.1427, + "step": 16685 + }, + { + "epoch": 0.9086193869841335, + "grad_norm": 0.5812078814955836, + "learning_rate": 0.00011952957271589565, + "loss": 12.2247, + "step": 16686 + }, + { + "epoch": 0.9086738409807165, + "grad_norm": 0.5473565554611068, + "learning_rate": 0.00011952092418265821, + "loss": 12.0588, + "step": 16687 + }, + { + "epoch": 0.9087282949772995, + "grad_norm": 0.5833788408235893, + "learning_rate": 0.00011951227549762283, + "loss": 12.102, + "step": 16688 + }, + { + "epoch": 0.9087827489738824, + "grad_norm": 0.5727748231134404, + "learning_rate": 0.00011950362666085665, + "loss": 12.1467, + "step": 16689 + }, + { + "epoch": 0.9088372029704656, + "grad_norm": 0.5539800357546315, + "learning_rate": 0.00011949497767242706, + "loss": 12.1409, + "step": 16690 + }, + { + "epoch": 0.9088916569670485, + "grad_norm": 0.5645437034529261, + "learning_rate": 0.00011948632853240122, + "loss": 12.0354, + "step": 16691 + }, + { + "epoch": 0.9089461109636315, + "grad_norm": 0.605985359763697, + "learning_rate": 0.00011947767924084645, + "loss": 12.1695, + "step": 16692 + }, + { + "epoch": 0.9090005649602145, + "grad_norm": 0.5683245753476472, + "learning_rate": 0.00011946902979782999, + "loss": 12.0926, + "step": 16693 + }, + { + "epoch": 0.9090550189567975, + "grad_norm": 0.5298631423959123, + "learning_rate": 0.00011946038020341905, + "loss": 12.2171, + "step": 16694 + }, + { + "epoch": 0.9091094729533806, + "grad_norm": 0.6042090785026276, + "learning_rate": 0.00011945173045768095, + "loss": 12.2209, + "step": 16695 + }, + { + "epoch": 0.9091639269499636, + "grad_norm": 0.5521047723399338, + "learning_rate": 0.00011944308056068292, + "loss": 12.0592, + "step": 16696 + }, + { + "epoch": 0.9092183809465466, + "grad_norm": 0.5184238358553434, + "learning_rate": 0.00011943443051249224, + "loss": 12.1804, + "step": 16697 + }, + { + "epoch": 0.9092728349431296, + "grad_norm": 0.6132916575172702, + "learning_rate": 0.00011942578031317619, + "loss": 12.2734, + "step": 16698 + }, + { + "epoch": 0.9093272889397126, + "grad_norm": 0.5377853963467617, + "learning_rate": 0.00011941712996280201, + "loss": 12.0987, + "step": 16699 + }, + { + "epoch": 0.9093817429362956, + "grad_norm": 0.4954452948200757, + "learning_rate": 0.00011940847946143696, + "loss": 12.1779, + "step": 16700 + }, + { + "epoch": 0.9094361969328787, + "grad_norm": 0.5452706709754639, + "learning_rate": 0.00011939982880914828, + "loss": 12.1383, + "step": 16701 + }, + { + "epoch": 0.9094906509294617, + "grad_norm": 0.5960469894657686, + "learning_rate": 0.00011939117800600333, + "loss": 12.1946, + "step": 16702 + }, + { + "epoch": 0.9095451049260447, + "grad_norm": 0.6250770426978768, + "learning_rate": 0.00011938252705206934, + "loss": 12.12, + "step": 16703 + }, + { + "epoch": 0.9095995589226277, + "grad_norm": 0.5619249572154056, + "learning_rate": 0.00011937387594741353, + "loss": 12.1961, + "step": 16704 + }, + { + "epoch": 0.9096540129192107, + "grad_norm": 0.5960458283295135, + "learning_rate": 0.00011936522469210323, + "loss": 12.1709, + "step": 16705 + }, + { + "epoch": 0.9097084669157937, + "grad_norm": 0.545670326510443, + "learning_rate": 0.00011935657328620566, + "loss": 12.0751, + "step": 16706 + }, + { + "epoch": 0.9097629209123768, + "grad_norm": 0.5305274367137507, + "learning_rate": 0.00011934792172978815, + "loss": 12.1436, + "step": 16707 + }, + { + "epoch": 0.9098173749089598, + "grad_norm": 0.5447327452757997, + "learning_rate": 0.00011933927002291801, + "loss": 12.0792, + "step": 16708 + }, + { + "epoch": 0.9098718289055427, + "grad_norm": 0.6511668231413018, + "learning_rate": 0.0001193306181656624, + "loss": 12.1077, + "step": 16709 + }, + { + "epoch": 0.9099262829021257, + "grad_norm": 0.5451503446692807, + "learning_rate": 0.00011932196615808868, + "loss": 12.1108, + "step": 16710 + }, + { + "epoch": 0.9099807368987087, + "grad_norm": 0.5807730649400389, + "learning_rate": 0.0001193133140002641, + "loss": 12.246, + "step": 16711 + }, + { + "epoch": 0.9100351908952917, + "grad_norm": 0.5497627488136255, + "learning_rate": 0.00011930466169225595, + "loss": 12.1494, + "step": 16712 + }, + { + "epoch": 0.9100896448918748, + "grad_norm": 0.568200531087904, + "learning_rate": 0.00011929600923413156, + "loss": 12.1344, + "step": 16713 + }, + { + "epoch": 0.9101440988884578, + "grad_norm": 0.5289936895464843, + "learning_rate": 0.00011928735662595812, + "loss": 12.1177, + "step": 16714 + }, + { + "epoch": 0.9101985528850408, + "grad_norm": 0.5598408108034993, + "learning_rate": 0.00011927870386780298, + "loss": 12.1175, + "step": 16715 + }, + { + "epoch": 0.9102530068816238, + "grad_norm": 0.6177995714639638, + "learning_rate": 0.00011927005095973341, + "loss": 12.1444, + "step": 16716 + }, + { + "epoch": 0.9103074608782068, + "grad_norm": 0.6274524773670057, + "learning_rate": 0.00011926139790181663, + "loss": 12.2214, + "step": 16717 + }, + { + "epoch": 0.9103619148747898, + "grad_norm": 0.5085526453142428, + "learning_rate": 0.00011925274469412007, + "loss": 12.2215, + "step": 16718 + }, + { + "epoch": 0.9104163688713729, + "grad_norm": 0.5974215900567931, + "learning_rate": 0.00011924409133671091, + "loss": 12.123, + "step": 16719 + }, + { + "epoch": 0.9104708228679559, + "grad_norm": 0.5536194685243843, + "learning_rate": 0.00011923543782965647, + "loss": 12.1997, + "step": 16720 + }, + { + "epoch": 0.9105252768645389, + "grad_norm": 0.548810186607552, + "learning_rate": 0.00011922678417302404, + "loss": 12.1594, + "step": 16721 + }, + { + "epoch": 0.9105797308611219, + "grad_norm": 0.5358261416721035, + "learning_rate": 0.0001192181303668809, + "loss": 12.1352, + "step": 16722 + }, + { + "epoch": 0.9106341848577049, + "grad_norm": 0.5869635801603279, + "learning_rate": 0.00011920947641129437, + "loss": 12.1825, + "step": 16723 + }, + { + "epoch": 0.9106886388542879, + "grad_norm": 0.5178457255217145, + "learning_rate": 0.00011920082230633172, + "loss": 12.1535, + "step": 16724 + }, + { + "epoch": 0.910743092850871, + "grad_norm": 0.5471474478485095, + "learning_rate": 0.00011919216805206026, + "loss": 12.0578, + "step": 16725 + }, + { + "epoch": 0.910797546847454, + "grad_norm": 0.5618150486227647, + "learning_rate": 0.00011918351364854728, + "loss": 12.1135, + "step": 16726 + }, + { + "epoch": 0.910852000844037, + "grad_norm": 0.5054061078826275, + "learning_rate": 0.00011917485909586008, + "loss": 12.1893, + "step": 16727 + }, + { + "epoch": 0.9109064548406199, + "grad_norm": 0.5295583824315849, + "learning_rate": 0.00011916620439406597, + "loss": 12.131, + "step": 16728 + }, + { + "epoch": 0.9109609088372029, + "grad_norm": 0.5810858529402816, + "learning_rate": 0.00011915754954323222, + "loss": 12.0111, + "step": 16729 + }, + { + "epoch": 0.911015362833786, + "grad_norm": 0.581320298295613, + "learning_rate": 0.00011914889454342617, + "loss": 12.2068, + "step": 16730 + }, + { + "epoch": 0.911069816830369, + "grad_norm": 0.5417895266138278, + "learning_rate": 0.0001191402393947151, + "loss": 12.1504, + "step": 16731 + }, + { + "epoch": 0.911124270826952, + "grad_norm": 0.5312038085499952, + "learning_rate": 0.00011913158409716631, + "loss": 12.0605, + "step": 16732 + }, + { + "epoch": 0.911178724823535, + "grad_norm": 0.5206094610344083, + "learning_rate": 0.00011912292865084713, + "loss": 12.1059, + "step": 16733 + }, + { + "epoch": 0.911233178820118, + "grad_norm": 0.5489024764471662, + "learning_rate": 0.00011911427305582486, + "loss": 12.0937, + "step": 16734 + }, + { + "epoch": 0.911287632816701, + "grad_norm": 0.5308736008490176, + "learning_rate": 0.00011910561731216676, + "loss": 12.2083, + "step": 16735 + }, + { + "epoch": 0.9113420868132841, + "grad_norm": 0.5017524052748702, + "learning_rate": 0.0001190969614199402, + "loss": 12.079, + "step": 16736 + }, + { + "epoch": 0.9113965408098671, + "grad_norm": 0.5330214212294354, + "learning_rate": 0.00011908830537921247, + "loss": 12.2847, + "step": 16737 + }, + { + "epoch": 0.9114509948064501, + "grad_norm": 0.542780798730531, + "learning_rate": 0.00011907964919005085, + "loss": 12.253, + "step": 16738 + }, + { + "epoch": 0.9115054488030331, + "grad_norm": 0.526216751731207, + "learning_rate": 0.0001190709928525227, + "loss": 12.1371, + "step": 16739 + }, + { + "epoch": 0.9115599027996161, + "grad_norm": 0.5608380080254172, + "learning_rate": 0.0001190623363666953, + "loss": 12.0116, + "step": 16740 + }, + { + "epoch": 0.9116143567961991, + "grad_norm": 0.5547421600370966, + "learning_rate": 0.000119053679732636, + "loss": 12.1079, + "step": 16741 + }, + { + "epoch": 0.9116688107927822, + "grad_norm": 0.614869012298698, + "learning_rate": 0.00011904502295041206, + "loss": 12.2039, + "step": 16742 + }, + { + "epoch": 0.9117232647893652, + "grad_norm": 0.5727502640962558, + "learning_rate": 0.00011903636602009087, + "loss": 12.1143, + "step": 16743 + }, + { + "epoch": 0.9117777187859482, + "grad_norm": 0.5575695243558199, + "learning_rate": 0.00011902770894173967, + "loss": 12.1738, + "step": 16744 + }, + { + "epoch": 0.9118321727825311, + "grad_norm": 0.5975922358915751, + "learning_rate": 0.0001190190517154258, + "loss": 12.277, + "step": 16745 + }, + { + "epoch": 0.9118866267791141, + "grad_norm": 0.6040288247120295, + "learning_rate": 0.00011901039434121661, + "loss": 12.1355, + "step": 16746 + }, + { + "epoch": 0.9119410807756971, + "grad_norm": 0.6450256711017941, + "learning_rate": 0.00011900173681917944, + "loss": 12.1493, + "step": 16747 + }, + { + "epoch": 0.9119955347722802, + "grad_norm": 0.4947022240185635, + "learning_rate": 0.00011899307914938157, + "loss": 12.1176, + "step": 16748 + }, + { + "epoch": 0.9120499887688632, + "grad_norm": 0.5670942648674901, + "learning_rate": 0.0001189844213318903, + "loss": 12.2005, + "step": 16749 + }, + { + "epoch": 0.9121044427654462, + "grad_norm": 0.5765238273736617, + "learning_rate": 0.00011897576336677297, + "loss": 12.0534, + "step": 16750 + }, + { + "epoch": 0.9121588967620292, + "grad_norm": 0.5754905418803878, + "learning_rate": 0.00011896710525409696, + "loss": 12.1506, + "step": 16751 + }, + { + "epoch": 0.9122133507586122, + "grad_norm": 0.597427615061958, + "learning_rate": 0.00011895844699392952, + "loss": 12.132, + "step": 16752 + }, + { + "epoch": 0.9122678047551952, + "grad_norm": 0.5888072462379587, + "learning_rate": 0.00011894978858633808, + "loss": 12.1797, + "step": 16753 + }, + { + "epoch": 0.9123222587517783, + "grad_norm": 0.5648742161166406, + "learning_rate": 0.00011894113003138987, + "loss": 12.0295, + "step": 16754 + }, + { + "epoch": 0.9123767127483613, + "grad_norm": 0.580604401098401, + "learning_rate": 0.00011893247132915224, + "loss": 12.0438, + "step": 16755 + }, + { + "epoch": 0.9124311667449443, + "grad_norm": 0.6148933844315049, + "learning_rate": 0.00011892381247969256, + "loss": 12.1361, + "step": 16756 + }, + { + "epoch": 0.9124856207415273, + "grad_norm": 0.5691730989122715, + "learning_rate": 0.00011891515348307812, + "loss": 12.0467, + "step": 16757 + }, + { + "epoch": 0.9125400747381103, + "grad_norm": 0.5702840363937903, + "learning_rate": 0.00011890649433937631, + "loss": 12.1824, + "step": 16758 + }, + { + "epoch": 0.9125945287346933, + "grad_norm": 0.7081968855598006, + "learning_rate": 0.00011889783504865442, + "loss": 12.1837, + "step": 16759 + }, + { + "epoch": 0.9126489827312764, + "grad_norm": 0.5773421638162692, + "learning_rate": 0.00011888917561097978, + "loss": 12.2206, + "step": 16760 + }, + { + "epoch": 0.9127034367278594, + "grad_norm": 0.6249849239293285, + "learning_rate": 0.00011888051602641971, + "loss": 12.2438, + "step": 16761 + }, + { + "epoch": 0.9127578907244424, + "grad_norm": 0.5705795563291992, + "learning_rate": 0.00011887185629504162, + "loss": 12.1928, + "step": 16762 + }, + { + "epoch": 0.9128123447210253, + "grad_norm": 0.565809674777921, + "learning_rate": 0.00011886319641691284, + "loss": 12.1484, + "step": 16763 + }, + { + "epoch": 0.9128667987176083, + "grad_norm": 0.6011464088918341, + "learning_rate": 0.00011885453639210064, + "loss": 12.0994, + "step": 16764 + }, + { + "epoch": 0.9129212527141914, + "grad_norm": 0.5115240947395492, + "learning_rate": 0.00011884587622067243, + "loss": 12.1974, + "step": 16765 + }, + { + "epoch": 0.9129757067107744, + "grad_norm": 0.5979106644846061, + "learning_rate": 0.00011883721590269548, + "loss": 12.2424, + "step": 16766 + }, + { + "epoch": 0.9130301607073574, + "grad_norm": 0.6040356674593268, + "learning_rate": 0.00011882855543823721, + "loss": 12.2643, + "step": 16767 + }, + { + "epoch": 0.9130846147039404, + "grad_norm": 0.6333776562522606, + "learning_rate": 0.00011881989482736496, + "loss": 12.2181, + "step": 16768 + }, + { + "epoch": 0.9131390687005234, + "grad_norm": 0.5974897631083352, + "learning_rate": 0.00011881123407014602, + "loss": 12.2313, + "step": 16769 + }, + { + "epoch": 0.9131935226971064, + "grad_norm": 0.5542553523882653, + "learning_rate": 0.00011880257316664778, + "loss": 12.2712, + "step": 16770 + }, + { + "epoch": 0.9132479766936895, + "grad_norm": 0.5402366430344611, + "learning_rate": 0.00011879391211693757, + "loss": 12.1131, + "step": 16771 + }, + { + "epoch": 0.9133024306902725, + "grad_norm": 0.511598019376126, + "learning_rate": 0.00011878525092108272, + "loss": 12.0113, + "step": 16772 + }, + { + "epoch": 0.9133568846868555, + "grad_norm": 0.5790887579476809, + "learning_rate": 0.00011877658957915068, + "loss": 12.3035, + "step": 16773 + }, + { + "epoch": 0.9134113386834385, + "grad_norm": 0.5195244342633508, + "learning_rate": 0.00011876792809120867, + "loss": 12.1332, + "step": 16774 + }, + { + "epoch": 0.9134657926800215, + "grad_norm": 0.569483171720159, + "learning_rate": 0.00011875926645732413, + "loss": 12.0297, + "step": 16775 + }, + { + "epoch": 0.9135202466766045, + "grad_norm": 0.5953612424801111, + "learning_rate": 0.00011875060467756438, + "loss": 12.3428, + "step": 16776 + }, + { + "epoch": 0.9135747006731876, + "grad_norm": 0.5306157216561193, + "learning_rate": 0.0001187419427519968, + "loss": 12.1489, + "step": 16777 + }, + { + "epoch": 0.9136291546697706, + "grad_norm": 0.5446436430199308, + "learning_rate": 0.0001187332806806887, + "loss": 12.192, + "step": 16778 + }, + { + "epoch": 0.9136836086663536, + "grad_norm": 0.5653427027897644, + "learning_rate": 0.00011872461846370748, + "loss": 12.184, + "step": 16779 + }, + { + "epoch": 0.9137380626629366, + "grad_norm": 0.5296990011718393, + "learning_rate": 0.0001187159561011205, + "loss": 12.2265, + "step": 16780 + }, + { + "epoch": 0.9137925166595195, + "grad_norm": 0.5344827860754364, + "learning_rate": 0.0001187072935929951, + "loss": 12.1615, + "step": 16781 + }, + { + "epoch": 0.9138469706561025, + "grad_norm": 0.5383766982801177, + "learning_rate": 0.00011869863093939864, + "loss": 12.1102, + "step": 16782 + }, + { + "epoch": 0.9139014246526856, + "grad_norm": 1.0582908680442866, + "learning_rate": 0.0001186899681403985, + "loss": 12.2392, + "step": 16783 + }, + { + "epoch": 0.9139558786492686, + "grad_norm": 0.6430932939318632, + "learning_rate": 0.00011868130519606202, + "loss": 12.1362, + "step": 16784 + }, + { + "epoch": 0.9140103326458516, + "grad_norm": 0.5464335146088309, + "learning_rate": 0.00011867264210645659, + "loss": 12.1514, + "step": 16785 + }, + { + "epoch": 0.9140647866424346, + "grad_norm": 0.5557665961056553, + "learning_rate": 0.00011866397887164958, + "loss": 12.2397, + "step": 16786 + }, + { + "epoch": 0.9141192406390176, + "grad_norm": 0.5393363904487715, + "learning_rate": 0.0001186553154917083, + "loss": 12.1043, + "step": 16787 + }, + { + "epoch": 0.9141736946356006, + "grad_norm": 0.5548053206768346, + "learning_rate": 0.00011864665196670018, + "loss": 12.2035, + "step": 16788 + }, + { + "epoch": 0.9142281486321837, + "grad_norm": 0.6222726367876813, + "learning_rate": 0.00011863798829669257, + "loss": 12.0901, + "step": 16789 + }, + { + "epoch": 0.9142826026287667, + "grad_norm": 0.5679218291031215, + "learning_rate": 0.00011862932448175283, + "loss": 12.0993, + "step": 16790 + }, + { + "epoch": 0.9143370566253497, + "grad_norm": 0.5486238511901713, + "learning_rate": 0.00011862066052194833, + "loss": 12.2531, + "step": 16791 + }, + { + "epoch": 0.9143915106219327, + "grad_norm": 0.5321184478689914, + "learning_rate": 0.00011861199641734648, + "loss": 12.1077, + "step": 16792 + }, + { + "epoch": 0.9144459646185157, + "grad_norm": 0.5435200884960917, + "learning_rate": 0.0001186033321680146, + "loss": 12.1614, + "step": 16793 + }, + { + "epoch": 0.9145004186150988, + "grad_norm": 0.5414606235146378, + "learning_rate": 0.00011859466777402009, + "loss": 12.2113, + "step": 16794 + }, + { + "epoch": 0.9145548726116818, + "grad_norm": 0.5413555138696808, + "learning_rate": 0.00011858600323543035, + "loss": 12.1002, + "step": 16795 + }, + { + "epoch": 0.9146093266082648, + "grad_norm": 0.5328312176322992, + "learning_rate": 0.0001185773385523127, + "loss": 12.0829, + "step": 16796 + }, + { + "epoch": 0.9146637806048478, + "grad_norm": 0.5645118336086263, + "learning_rate": 0.00011856867372473456, + "loss": 12.279, + "step": 16797 + }, + { + "epoch": 0.9147182346014308, + "grad_norm": 0.594588448885202, + "learning_rate": 0.00011856000875276332, + "loss": 12.1769, + "step": 16798 + }, + { + "epoch": 0.9147726885980138, + "grad_norm": 0.5671975738301612, + "learning_rate": 0.00011855134363646631, + "loss": 12.1391, + "step": 16799 + }, + { + "epoch": 0.9148271425945969, + "grad_norm": 0.5324538805071298, + "learning_rate": 0.00011854267837591095, + "loss": 12.1197, + "step": 16800 + }, + { + "epoch": 0.9148815965911798, + "grad_norm": 0.6474357680124087, + "learning_rate": 0.00011853401297116462, + "loss": 12.1605, + "step": 16801 + }, + { + "epoch": 0.9149360505877628, + "grad_norm": 0.5367501737187403, + "learning_rate": 0.00011852534742229469, + "loss": 12.0937, + "step": 16802 + }, + { + "epoch": 0.9149905045843458, + "grad_norm": 0.5720099048286376, + "learning_rate": 0.00011851668172936858, + "loss": 12.0828, + "step": 16803 + }, + { + "epoch": 0.9150449585809288, + "grad_norm": 0.5583897433769571, + "learning_rate": 0.0001185080158924536, + "loss": 12.1997, + "step": 16804 + }, + { + "epoch": 0.9150994125775118, + "grad_norm": 0.54360969188526, + "learning_rate": 0.0001184993499116172, + "loss": 12.1959, + "step": 16805 + }, + { + "epoch": 0.9151538665740949, + "grad_norm": 0.522848415236117, + "learning_rate": 0.00011849068378692676, + "loss": 12.1708, + "step": 16806 + }, + { + "epoch": 0.9152083205706779, + "grad_norm": 0.6759180594616391, + "learning_rate": 0.00011848201751844967, + "loss": 12.2251, + "step": 16807 + }, + { + "epoch": 0.9152627745672609, + "grad_norm": 0.5675597634248547, + "learning_rate": 0.00011847335110625333, + "loss": 12.0897, + "step": 16808 + }, + { + "epoch": 0.9153172285638439, + "grad_norm": 0.5468070558084621, + "learning_rate": 0.00011846468455040509, + "loss": 12.1422, + "step": 16809 + }, + { + "epoch": 0.9153716825604269, + "grad_norm": 0.5786821751893668, + "learning_rate": 0.00011845601785097233, + "loss": 12.1366, + "step": 16810 + }, + { + "epoch": 0.9154261365570099, + "grad_norm": 0.5456084060971332, + "learning_rate": 0.00011844735100802253, + "loss": 12.2959, + "step": 16811 + }, + { + "epoch": 0.915480590553593, + "grad_norm": 0.5598519840243396, + "learning_rate": 0.00011843868402162301, + "loss": 12.076, + "step": 16812 + }, + { + "epoch": 0.915535044550176, + "grad_norm": 0.5677753649414218, + "learning_rate": 0.00011843001689184123, + "loss": 12.2355, + "step": 16813 + }, + { + "epoch": 0.915589498546759, + "grad_norm": 0.5350599288088638, + "learning_rate": 0.00011842134961874451, + "loss": 12.1107, + "step": 16814 + }, + { + "epoch": 0.915643952543342, + "grad_norm": 0.5516091666079663, + "learning_rate": 0.00011841268220240027, + "loss": 12.145, + "step": 16815 + }, + { + "epoch": 0.915698406539925, + "grad_norm": 0.5692472115070899, + "learning_rate": 0.00011840401464287596, + "loss": 11.952, + "step": 16816 + }, + { + "epoch": 0.915752860536508, + "grad_norm": 0.598519974976252, + "learning_rate": 0.00011839534694023893, + "loss": 12.0693, + "step": 16817 + }, + { + "epoch": 0.915807314533091, + "grad_norm": 0.5313337553609586, + "learning_rate": 0.0001183866790945566, + "loss": 12.1934, + "step": 16818 + }, + { + "epoch": 0.915861768529674, + "grad_norm": 0.5734942775980757, + "learning_rate": 0.00011837801110589639, + "loss": 12.1456, + "step": 16819 + }, + { + "epoch": 0.915916222526257, + "grad_norm": 0.5834459456631099, + "learning_rate": 0.00011836934297432568, + "loss": 12.1803, + "step": 16820 + }, + { + "epoch": 0.91597067652284, + "grad_norm": 0.5465156658572613, + "learning_rate": 0.00011836067469991184, + "loss": 12.0439, + "step": 16821 + }, + { + "epoch": 0.916025130519423, + "grad_norm": 0.5169095512192226, + "learning_rate": 0.00011835200628272234, + "loss": 12.0206, + "step": 16822 + }, + { + "epoch": 0.916079584516006, + "grad_norm": 0.5766012605581224, + "learning_rate": 0.00011834333772282455, + "loss": 12.132, + "step": 16823 + }, + { + "epoch": 0.9161340385125891, + "grad_norm": 0.5590255109156209, + "learning_rate": 0.00011833466902028594, + "loss": 12.1715, + "step": 16824 + }, + { + "epoch": 0.9161884925091721, + "grad_norm": 0.5938053946101209, + "learning_rate": 0.00011832600017517382, + "loss": 12.171, + "step": 16825 + }, + { + "epoch": 0.9162429465057551, + "grad_norm": 0.6132387898997561, + "learning_rate": 0.00011831733118755566, + "loss": 12.1305, + "step": 16826 + }, + { + "epoch": 0.9162974005023381, + "grad_norm": 0.6982163513457541, + "learning_rate": 0.00011830866205749884, + "loss": 12.2049, + "step": 16827 + }, + { + "epoch": 0.9163518544989211, + "grad_norm": 0.5599338143513132, + "learning_rate": 0.00011829999278507083, + "loss": 12.1475, + "step": 16828 + }, + { + "epoch": 0.9164063084955042, + "grad_norm": 0.6149948822024341, + "learning_rate": 0.00011829132337033899, + "loss": 12.1869, + "step": 16829 + }, + { + "epoch": 0.9164607624920872, + "grad_norm": 0.6304619061712248, + "learning_rate": 0.00011828265381337076, + "loss": 12.2142, + "step": 16830 + }, + { + "epoch": 0.9165152164886702, + "grad_norm": 0.4988338920506397, + "learning_rate": 0.00011827398411423354, + "loss": 12.006, + "step": 16831 + }, + { + "epoch": 0.9165696704852532, + "grad_norm": 0.6173633751853064, + "learning_rate": 0.00011826531427299475, + "loss": 12.1481, + "step": 16832 + }, + { + "epoch": 0.9166241244818362, + "grad_norm": 0.5357110912482415, + "learning_rate": 0.00011825664428972181, + "loss": 12.0745, + "step": 16833 + }, + { + "epoch": 0.9166785784784192, + "grad_norm": 0.6967866106962702, + "learning_rate": 0.00011824797416448216, + "loss": 12.2176, + "step": 16834 + }, + { + "epoch": 0.9167330324750023, + "grad_norm": 0.6544247026014385, + "learning_rate": 0.0001182393038973432, + "loss": 12.287, + "step": 16835 + }, + { + "epoch": 0.9167874864715853, + "grad_norm": 0.5644438850069057, + "learning_rate": 0.00011823063348837235, + "loss": 12.1593, + "step": 16836 + }, + { + "epoch": 0.9168419404681682, + "grad_norm": 0.5911753838445318, + "learning_rate": 0.00011822196293763704, + "loss": 12.1714, + "step": 16837 + }, + { + "epoch": 0.9168963944647512, + "grad_norm": 0.5883370221029104, + "learning_rate": 0.00011821329224520465, + "loss": 12.1827, + "step": 16838 + }, + { + "epoch": 0.9169508484613342, + "grad_norm": 0.5699740769714241, + "learning_rate": 0.0001182046214111427, + "loss": 12.0986, + "step": 16839 + }, + { + "epoch": 0.9170053024579172, + "grad_norm": 0.6434367700000917, + "learning_rate": 0.00011819595043551854, + "loss": 12.2088, + "step": 16840 + }, + { + "epoch": 0.9170597564545003, + "grad_norm": 0.5810603582792637, + "learning_rate": 0.00011818727931839964, + "loss": 12.2819, + "step": 16841 + }, + { + "epoch": 0.9171142104510833, + "grad_norm": 0.7095274139215517, + "learning_rate": 0.00011817860805985337, + "loss": 12.2811, + "step": 16842 + }, + { + "epoch": 0.9171686644476663, + "grad_norm": 0.6104125487630834, + "learning_rate": 0.0001181699366599472, + "loss": 12.0946, + "step": 16843 + }, + { + "epoch": 0.9172231184442493, + "grad_norm": 0.5246410991558379, + "learning_rate": 0.00011816126511874859, + "loss": 11.9659, + "step": 16844 + }, + { + "epoch": 0.9172775724408323, + "grad_norm": 0.6089764921307709, + "learning_rate": 0.0001181525934363249, + "loss": 12.1241, + "step": 16845 + }, + { + "epoch": 0.9173320264374153, + "grad_norm": 0.5785060702910032, + "learning_rate": 0.00011814392161274361, + "loss": 12.1418, + "step": 16846 + }, + { + "epoch": 0.9173864804339984, + "grad_norm": 0.6352933490510204, + "learning_rate": 0.00011813524964807215, + "loss": 12.1555, + "step": 16847 + }, + { + "epoch": 0.9174409344305814, + "grad_norm": 0.5131026586102728, + "learning_rate": 0.00011812657754237795, + "loss": 12.0899, + "step": 16848 + }, + { + "epoch": 0.9174953884271644, + "grad_norm": 0.5821382588317344, + "learning_rate": 0.00011811790529572842, + "loss": 12.1434, + "step": 16849 + }, + { + "epoch": 0.9175498424237474, + "grad_norm": 0.5447482139681347, + "learning_rate": 0.00011810923290819104, + "loss": 12.1217, + "step": 16850 + }, + { + "epoch": 0.9176042964203304, + "grad_norm": 0.539484649326593, + "learning_rate": 0.00011810056037983322, + "loss": 12.1442, + "step": 16851 + }, + { + "epoch": 0.9176587504169134, + "grad_norm": 0.6068708590455357, + "learning_rate": 0.00011809188771072241, + "loss": 12.0923, + "step": 16852 + }, + { + "epoch": 0.9177132044134965, + "grad_norm": 0.5918613326342391, + "learning_rate": 0.00011808321490092605, + "loss": 12.2175, + "step": 16853 + }, + { + "epoch": 0.9177676584100795, + "grad_norm": 0.49693951625138766, + "learning_rate": 0.00011807454195051158, + "loss": 12.0775, + "step": 16854 + }, + { + "epoch": 0.9178221124066624, + "grad_norm": 0.585130212317906, + "learning_rate": 0.00011806586885954642, + "loss": 12.114, + "step": 16855 + }, + { + "epoch": 0.9178765664032454, + "grad_norm": 0.5801909218668415, + "learning_rate": 0.00011805719562809807, + "loss": 12.2656, + "step": 16856 + }, + { + "epoch": 0.9179310203998284, + "grad_norm": 0.5522355927824034, + "learning_rate": 0.00011804852225623391, + "loss": 12.2535, + "step": 16857 + }, + { + "epoch": 0.9179854743964114, + "grad_norm": 0.5427828363518948, + "learning_rate": 0.00011803984874402143, + "loss": 12.0822, + "step": 16858 + }, + { + "epoch": 0.9180399283929945, + "grad_norm": 0.5661936373007997, + "learning_rate": 0.00011803117509152805, + "loss": 12.0889, + "step": 16859 + }, + { + "epoch": 0.9180943823895775, + "grad_norm": 0.637976219585058, + "learning_rate": 0.00011802250129882124, + "loss": 12.0885, + "step": 16860 + }, + { + "epoch": 0.9181488363861605, + "grad_norm": 0.6508504526745578, + "learning_rate": 0.00011801382736596842, + "loss": 12.1481, + "step": 16861 + }, + { + "epoch": 0.9182032903827435, + "grad_norm": 0.5508959358035357, + "learning_rate": 0.00011800515329303707, + "loss": 12.239, + "step": 16862 + }, + { + "epoch": 0.9182577443793265, + "grad_norm": 0.6082919722422235, + "learning_rate": 0.00011799647908009463, + "loss": 12.2931, + "step": 16863 + }, + { + "epoch": 0.9183121983759096, + "grad_norm": 0.5692475687106825, + "learning_rate": 0.00011798780472720854, + "loss": 12.0328, + "step": 16864 + }, + { + "epoch": 0.9183666523724926, + "grad_norm": 0.544721000401607, + "learning_rate": 0.00011797913023444626, + "loss": 12.1171, + "step": 16865 + }, + { + "epoch": 0.9184211063690756, + "grad_norm": 0.5488719223519434, + "learning_rate": 0.00011797045560187527, + "loss": 12.2137, + "step": 16866 + }, + { + "epoch": 0.9184755603656586, + "grad_norm": 0.541763616208224, + "learning_rate": 0.000117961780829563, + "loss": 12.1395, + "step": 16867 + }, + { + "epoch": 0.9185300143622416, + "grad_norm": 0.5545722866569928, + "learning_rate": 0.00011795310591757691, + "loss": 12.2065, + "step": 16868 + }, + { + "epoch": 0.9185844683588246, + "grad_norm": 0.8852819561606639, + "learning_rate": 0.00011794443086598446, + "loss": 12.1008, + "step": 16869 + }, + { + "epoch": 0.9186389223554077, + "grad_norm": 0.5838155507667105, + "learning_rate": 0.00011793575567485308, + "loss": 12.1819, + "step": 16870 + }, + { + "epoch": 0.9186933763519907, + "grad_norm": 0.5629298051200554, + "learning_rate": 0.00011792708034425027, + "loss": 12.0794, + "step": 16871 + }, + { + "epoch": 0.9187478303485737, + "grad_norm": 0.6035217143909042, + "learning_rate": 0.00011791840487424348, + "loss": 12.1327, + "step": 16872 + }, + { + "epoch": 0.9188022843451567, + "grad_norm": 0.5309279145877762, + "learning_rate": 0.00011790972926490018, + "loss": 12.2339, + "step": 16873 + }, + { + "epoch": 0.9188567383417396, + "grad_norm": 0.5765969595397223, + "learning_rate": 0.00011790105351628782, + "loss": 12.2462, + "step": 16874 + }, + { + "epoch": 0.9189111923383226, + "grad_norm": 0.6436547861263355, + "learning_rate": 0.00011789237762847385, + "loss": 12.1905, + "step": 16875 + }, + { + "epoch": 0.9189656463349057, + "grad_norm": 0.5637473395664372, + "learning_rate": 0.00011788370160152575, + "loss": 12.1323, + "step": 16876 + }, + { + "epoch": 0.9190201003314887, + "grad_norm": 0.5530859550148169, + "learning_rate": 0.00011787502543551099, + "loss": 12.099, + "step": 16877 + }, + { + "epoch": 0.9190745543280717, + "grad_norm": 0.6553952935885003, + "learning_rate": 0.00011786634913049703, + "loss": 12.2056, + "step": 16878 + }, + { + "epoch": 0.9191290083246547, + "grad_norm": 0.6089985457195687, + "learning_rate": 0.00011785767268655139, + "loss": 12.1522, + "step": 16879 + }, + { + "epoch": 0.9191834623212377, + "grad_norm": 0.5779445588832643, + "learning_rate": 0.00011784899610374144, + "loss": 12.223, + "step": 16880 + }, + { + "epoch": 0.9192379163178207, + "grad_norm": 0.7227344842874233, + "learning_rate": 0.00011784031938213471, + "loss": 12.2078, + "step": 16881 + }, + { + "epoch": 0.9192923703144038, + "grad_norm": 0.6875875596524184, + "learning_rate": 0.00011783164252179866, + "loss": 12.2143, + "step": 16882 + }, + { + "epoch": 0.9193468243109868, + "grad_norm": 0.556742417679642, + "learning_rate": 0.00011782296552280079, + "loss": 12.2031, + "step": 16883 + }, + { + "epoch": 0.9194012783075698, + "grad_norm": 0.6334430636052265, + "learning_rate": 0.00011781428838520856, + "loss": 12.1927, + "step": 16884 + }, + { + "epoch": 0.9194557323041528, + "grad_norm": 0.6206500293231294, + "learning_rate": 0.00011780561110908941, + "loss": 12.326, + "step": 16885 + }, + { + "epoch": 0.9195101863007358, + "grad_norm": 0.5766624897076231, + "learning_rate": 0.00011779693369451086, + "loss": 12.2196, + "step": 16886 + }, + { + "epoch": 0.9195646402973188, + "grad_norm": 0.583325671430832, + "learning_rate": 0.00011778825614154031, + "loss": 12.1991, + "step": 16887 + }, + { + "epoch": 0.9196190942939019, + "grad_norm": 0.5834547821471319, + "learning_rate": 0.00011777957845024537, + "loss": 12.1825, + "step": 16888 + }, + { + "epoch": 0.9196735482904849, + "grad_norm": 0.5174570598618606, + "learning_rate": 0.00011777090062069343, + "loss": 12.1828, + "step": 16889 + }, + { + "epoch": 0.9197280022870679, + "grad_norm": 0.622195757252222, + "learning_rate": 0.00011776222265295199, + "loss": 12.1731, + "step": 16890 + }, + { + "epoch": 0.9197824562836509, + "grad_norm": 0.5755695132597045, + "learning_rate": 0.00011775354454708851, + "loss": 12.2292, + "step": 16891 + }, + { + "epoch": 0.9198369102802338, + "grad_norm": 0.620146372251081, + "learning_rate": 0.00011774486630317048, + "loss": 12.159, + "step": 16892 + }, + { + "epoch": 0.9198913642768168, + "grad_norm": 0.6189664842317881, + "learning_rate": 0.00011773618792126542, + "loss": 12.1616, + "step": 16893 + }, + { + "epoch": 0.9199458182733999, + "grad_norm": 0.5461333527479741, + "learning_rate": 0.0001177275094014408, + "loss": 12.0693, + "step": 16894 + }, + { + "epoch": 0.9200002722699829, + "grad_norm": 0.6395709052423875, + "learning_rate": 0.00011771883074376406, + "loss": 12.1698, + "step": 16895 + }, + { + "epoch": 0.9200547262665659, + "grad_norm": 0.5707100063782455, + "learning_rate": 0.00011771015194830273, + "loss": 12.1584, + "step": 16896 + }, + { + "epoch": 0.9201091802631489, + "grad_norm": 0.6704402001258953, + "learning_rate": 0.00011770147301512429, + "loss": 12.2118, + "step": 16897 + }, + { + "epoch": 0.9201636342597319, + "grad_norm": 0.553766648058549, + "learning_rate": 0.00011769279394429622, + "loss": 12.1416, + "step": 16898 + }, + { + "epoch": 0.920218088256315, + "grad_norm": 0.5445145089497211, + "learning_rate": 0.00011768411473588603, + "loss": 12.0892, + "step": 16899 + }, + { + "epoch": 0.920272542252898, + "grad_norm": 0.5384689612754129, + "learning_rate": 0.0001176754353899612, + "loss": 12.1376, + "step": 16900 + }, + { + "epoch": 0.920326996249481, + "grad_norm": 0.6064363198046522, + "learning_rate": 0.00011766675590658922, + "loss": 12.178, + "step": 16901 + }, + { + "epoch": 0.920381450246064, + "grad_norm": 0.5880024864089798, + "learning_rate": 0.00011765807628583758, + "loss": 12.1358, + "step": 16902 + }, + { + "epoch": 0.920435904242647, + "grad_norm": 0.534753270596174, + "learning_rate": 0.00011764939652777376, + "loss": 12.1993, + "step": 16903 + }, + { + "epoch": 0.92049035823923, + "grad_norm": 0.6295164454306247, + "learning_rate": 0.0001176407166324653, + "loss": 12.1118, + "step": 16904 + }, + { + "epoch": 0.9205448122358131, + "grad_norm": 0.6023643479493942, + "learning_rate": 0.00011763203659997965, + "loss": 12.2011, + "step": 16905 + }, + { + "epoch": 0.9205992662323961, + "grad_norm": 0.5757732415578404, + "learning_rate": 0.00011762335643038433, + "loss": 12.1225, + "step": 16906 + }, + { + "epoch": 0.9206537202289791, + "grad_norm": 0.5355212834199039, + "learning_rate": 0.00011761467612374684, + "loss": 12.1321, + "step": 16907 + }, + { + "epoch": 0.9207081742255621, + "grad_norm": 0.5048507275544266, + "learning_rate": 0.00011760599568013468, + "loss": 12.0923, + "step": 16908 + }, + { + "epoch": 0.920762628222145, + "grad_norm": 0.5551109810794038, + "learning_rate": 0.00011759731509961534, + "loss": 12.1753, + "step": 16909 + }, + { + "epoch": 0.920817082218728, + "grad_norm": 0.5191144251279728, + "learning_rate": 0.00011758863438225631, + "loss": 12.0751, + "step": 16910 + }, + { + "epoch": 0.9208715362153111, + "grad_norm": 0.5625247792112014, + "learning_rate": 0.00011757995352812514, + "loss": 12.1726, + "step": 16911 + }, + { + "epoch": 0.9209259902118941, + "grad_norm": 0.5466597820845551, + "learning_rate": 0.00011757127253728928, + "loss": 12.1765, + "step": 16912 + }, + { + "epoch": 0.9209804442084771, + "grad_norm": 0.5194500983946128, + "learning_rate": 0.00011756259140981627, + "loss": 12.0903, + "step": 16913 + }, + { + "epoch": 0.9210348982050601, + "grad_norm": 0.541872524740386, + "learning_rate": 0.0001175539101457736, + "loss": 12.2169, + "step": 16914 + }, + { + "epoch": 0.9210893522016431, + "grad_norm": 0.6645674759196384, + "learning_rate": 0.00011754522874522877, + "loss": 12.2635, + "step": 16915 + }, + { + "epoch": 0.9211438061982261, + "grad_norm": 0.5777726404571027, + "learning_rate": 0.00011753654720824932, + "loss": 12.0605, + "step": 16916 + }, + { + "epoch": 0.9211982601948092, + "grad_norm": 0.5564081110481262, + "learning_rate": 0.00011752786553490272, + "loss": 12.1363, + "step": 16917 + }, + { + "epoch": 0.9212527141913922, + "grad_norm": 0.6655345929789, + "learning_rate": 0.00011751918372525652, + "loss": 12.1562, + "step": 16918 + }, + { + "epoch": 0.9213071681879752, + "grad_norm": 0.5578526087544446, + "learning_rate": 0.00011751050177937818, + "loss": 12.1479, + "step": 16919 + }, + { + "epoch": 0.9213616221845582, + "grad_norm": 0.5116884761127108, + "learning_rate": 0.00011750181969733528, + "loss": 12.0556, + "step": 16920 + }, + { + "epoch": 0.9214160761811412, + "grad_norm": 0.5883180073812652, + "learning_rate": 0.00011749313747919526, + "loss": 12.152, + "step": 16921 + }, + { + "epoch": 0.9214705301777242, + "grad_norm": 0.71928827180107, + "learning_rate": 0.00011748445512502568, + "loss": 12.1099, + "step": 16922 + }, + { + "epoch": 0.9215249841743073, + "grad_norm": 0.5801052874839137, + "learning_rate": 0.00011747577263489405, + "loss": 12.1079, + "step": 16923 + }, + { + "epoch": 0.9215794381708903, + "grad_norm": 0.5791300993972605, + "learning_rate": 0.00011746709000886786, + "loss": 12.0104, + "step": 16924 + }, + { + "epoch": 0.9216338921674733, + "grad_norm": 0.5914497436093543, + "learning_rate": 0.00011745840724701467, + "loss": 12.1728, + "step": 16925 + }, + { + "epoch": 0.9216883461640563, + "grad_norm": 0.6521740942280411, + "learning_rate": 0.00011744972434940199, + "loss": 12.2992, + "step": 16926 + }, + { + "epoch": 0.9217428001606393, + "grad_norm": 0.6474119664235948, + "learning_rate": 0.0001174410413160973, + "loss": 12.3222, + "step": 16927 + }, + { + "epoch": 0.9217972541572224, + "grad_norm": 0.6115920339132841, + "learning_rate": 0.00011743235814716816, + "loss": 12.1622, + "step": 16928 + }, + { + "epoch": 0.9218517081538053, + "grad_norm": 0.571447806039776, + "learning_rate": 0.0001174236748426821, + "loss": 12.1173, + "step": 16929 + }, + { + "epoch": 0.9219061621503883, + "grad_norm": 0.6527582706546112, + "learning_rate": 0.00011741499140270658, + "loss": 12.07, + "step": 16930 + }, + { + "epoch": 0.9219606161469713, + "grad_norm": 0.5238781389090017, + "learning_rate": 0.0001174063078273092, + "loss": 12.1486, + "step": 16931 + }, + { + "epoch": 0.9220150701435543, + "grad_norm": 0.53294174741349, + "learning_rate": 0.00011739762411655741, + "loss": 12.1161, + "step": 16932 + }, + { + "epoch": 0.9220695241401373, + "grad_norm": 0.54549396412689, + "learning_rate": 0.00011738894027051882, + "loss": 12.0161, + "step": 16933 + }, + { + "epoch": 0.9221239781367204, + "grad_norm": 0.5851738088661577, + "learning_rate": 0.00011738025628926092, + "loss": 12.0931, + "step": 16934 + }, + { + "epoch": 0.9221784321333034, + "grad_norm": 0.5742586998918857, + "learning_rate": 0.00011737157217285122, + "loss": 11.9496, + "step": 16935 + }, + { + "epoch": 0.9222328861298864, + "grad_norm": 0.5667530315524123, + "learning_rate": 0.00011736288792135721, + "loss": 12.1213, + "step": 16936 + }, + { + "epoch": 0.9222873401264694, + "grad_norm": 0.5389019254871072, + "learning_rate": 0.00011735420353484654, + "loss": 12.1589, + "step": 16937 + }, + { + "epoch": 0.9223417941230524, + "grad_norm": 0.5653999762083491, + "learning_rate": 0.00011734551901338664, + "loss": 12.1234, + "step": 16938 + }, + { + "epoch": 0.9223962481196354, + "grad_norm": 0.5291458672313059, + "learning_rate": 0.0001173368343570451, + "loss": 11.9858, + "step": 16939 + }, + { + "epoch": 0.9224507021162185, + "grad_norm": 0.5492650586281563, + "learning_rate": 0.00011732814956588942, + "loss": 12.217, + "step": 16940 + }, + { + "epoch": 0.9225051561128015, + "grad_norm": 0.5499049350097684, + "learning_rate": 0.00011731946463998711, + "loss": 12.1631, + "step": 16941 + }, + { + "epoch": 0.9225596101093845, + "grad_norm": 0.6615920540288064, + "learning_rate": 0.00011731077957940578, + "loss": 12.1106, + "step": 16942 + }, + { + "epoch": 0.9226140641059675, + "grad_norm": 0.5381055966689792, + "learning_rate": 0.00011730209438421288, + "loss": 12.2094, + "step": 16943 + }, + { + "epoch": 0.9226685181025505, + "grad_norm": 0.5609330805323823, + "learning_rate": 0.00011729340905447606, + "loss": 12.1898, + "step": 16944 + }, + { + "epoch": 0.9227229720991335, + "grad_norm": 0.5939709915701091, + "learning_rate": 0.00011728472359026275, + "loss": 12.2755, + "step": 16945 + }, + { + "epoch": 0.9227774260957166, + "grad_norm": 0.627128974055508, + "learning_rate": 0.00011727603799164053, + "loss": 12.1736, + "step": 16946 + }, + { + "epoch": 0.9228318800922996, + "grad_norm": 0.5773341602635771, + "learning_rate": 0.00011726735225867693, + "loss": 12.0639, + "step": 16947 + }, + { + "epoch": 0.9228863340888825, + "grad_norm": 0.6039647843673996, + "learning_rate": 0.00011725866639143952, + "loss": 12.178, + "step": 16948 + }, + { + "epoch": 0.9229407880854655, + "grad_norm": 0.5758653069003834, + "learning_rate": 0.00011724998038999585, + "loss": 12.0593, + "step": 16949 + }, + { + "epoch": 0.9229952420820485, + "grad_norm": 0.5899518105644326, + "learning_rate": 0.0001172412942544134, + "loss": 12.2028, + "step": 16950 + }, + { + "epoch": 0.9230496960786315, + "grad_norm": 0.5652653779097088, + "learning_rate": 0.00011723260798475976, + "loss": 12.2616, + "step": 16951 + }, + { + "epoch": 0.9231041500752146, + "grad_norm": 0.5888009417927444, + "learning_rate": 0.00011722392158110249, + "loss": 12.0104, + "step": 16952 + }, + { + "epoch": 0.9231586040717976, + "grad_norm": 0.5669795658816074, + "learning_rate": 0.00011721523504350909, + "loss": 12.2135, + "step": 16953 + }, + { + "epoch": 0.9232130580683806, + "grad_norm": 0.611730610878642, + "learning_rate": 0.00011720654837204717, + "loss": 12.2023, + "step": 16954 + }, + { + "epoch": 0.9232675120649636, + "grad_norm": 0.51904773839831, + "learning_rate": 0.00011719786156678423, + "loss": 12.1355, + "step": 16955 + }, + { + "epoch": 0.9233219660615466, + "grad_norm": 0.6915896744511061, + "learning_rate": 0.00011718917462778782, + "loss": 12.2847, + "step": 16956 + }, + { + "epoch": 0.9233764200581296, + "grad_norm": 0.6096948831390273, + "learning_rate": 0.00011718048755512552, + "loss": 12.0716, + "step": 16957 + }, + { + "epoch": 0.9234308740547127, + "grad_norm": 0.6273856058339771, + "learning_rate": 0.00011717180034886484, + "loss": 12.2011, + "step": 16958 + }, + { + "epoch": 0.9234853280512957, + "grad_norm": 0.557686025784662, + "learning_rate": 0.00011716311300907339, + "loss": 12.2558, + "step": 16959 + }, + { + "epoch": 0.9235397820478787, + "grad_norm": 0.5179974039851645, + "learning_rate": 0.0001171544255358187, + "loss": 12.1361, + "step": 16960 + }, + { + "epoch": 0.9235942360444617, + "grad_norm": 0.6006390284439724, + "learning_rate": 0.0001171457379291683, + "loss": 12.1918, + "step": 16961 + }, + { + "epoch": 0.9236486900410447, + "grad_norm": 0.5772358993306812, + "learning_rate": 0.00011713705018918979, + "loss": 12.0963, + "step": 16962 + }, + { + "epoch": 0.9237031440376278, + "grad_norm": 0.5863016849465408, + "learning_rate": 0.00011712836231595067, + "loss": 12.1746, + "step": 16963 + }, + { + "epoch": 0.9237575980342108, + "grad_norm": 0.5758399347318343, + "learning_rate": 0.00011711967430951853, + "loss": 12.1444, + "step": 16964 + }, + { + "epoch": 0.9238120520307938, + "grad_norm": 0.5939292813079025, + "learning_rate": 0.000117110986169961, + "loss": 12.276, + "step": 16965 + }, + { + "epoch": 0.9238665060273767, + "grad_norm": 0.541052639544623, + "learning_rate": 0.00011710229789734551, + "loss": 12.2945, + "step": 16966 + }, + { + "epoch": 0.9239209600239597, + "grad_norm": 0.505180080963915, + "learning_rate": 0.00011709360949173968, + "loss": 11.9867, + "step": 16967 + }, + { + "epoch": 0.9239754140205427, + "grad_norm": 0.5817509351335834, + "learning_rate": 0.0001170849209532111, + "loss": 12.055, + "step": 16968 + }, + { + "epoch": 0.9240298680171258, + "grad_norm": 0.540099205163771, + "learning_rate": 0.00011707623228182729, + "loss": 12.1972, + "step": 16969 + }, + { + "epoch": 0.9240843220137088, + "grad_norm": 0.5672608784576997, + "learning_rate": 0.00011706754347765587, + "loss": 12.2125, + "step": 16970 + }, + { + "epoch": 0.9241387760102918, + "grad_norm": 0.5743230894713309, + "learning_rate": 0.00011705885454076435, + "loss": 12.1818, + "step": 16971 + }, + { + "epoch": 0.9241932300068748, + "grad_norm": 0.5462976545022575, + "learning_rate": 0.00011705016547122032, + "loss": 12.209, + "step": 16972 + }, + { + "epoch": 0.9242476840034578, + "grad_norm": 0.5832898955105185, + "learning_rate": 0.00011704147626909134, + "loss": 12.3018, + "step": 16973 + }, + { + "epoch": 0.9243021380000408, + "grad_norm": 0.7573427014823444, + "learning_rate": 0.00011703278693444498, + "loss": 12.1651, + "step": 16974 + }, + { + "epoch": 0.9243565919966239, + "grad_norm": 0.5762810005951213, + "learning_rate": 0.00011702409746734882, + "loss": 12.1974, + "step": 16975 + }, + { + "epoch": 0.9244110459932069, + "grad_norm": 0.5916727994725474, + "learning_rate": 0.0001170154078678704, + "loss": 12.3085, + "step": 16976 + }, + { + "epoch": 0.9244654999897899, + "grad_norm": 0.7111713426251668, + "learning_rate": 0.00011700671813607734, + "loss": 12.2125, + "step": 16977 + }, + { + "epoch": 0.9245199539863729, + "grad_norm": 0.5577895221948833, + "learning_rate": 0.00011699802827203718, + "loss": 12.0943, + "step": 16978 + }, + { + "epoch": 0.9245744079829559, + "grad_norm": 0.5312133239506819, + "learning_rate": 0.0001169893382758175, + "loss": 12.1762, + "step": 16979 + }, + { + "epoch": 0.9246288619795389, + "grad_norm": 0.5249657690938024, + "learning_rate": 0.00011698064814748586, + "loss": 12.1692, + "step": 16980 + }, + { + "epoch": 0.924683315976122, + "grad_norm": 0.5837377384106609, + "learning_rate": 0.00011697195788710988, + "loss": 11.9651, + "step": 16981 + }, + { + "epoch": 0.924737769972705, + "grad_norm": 0.5471769020073902, + "learning_rate": 0.0001169632674947571, + "loss": 12.1227, + "step": 16982 + }, + { + "epoch": 0.924792223969288, + "grad_norm": 0.6455960922545634, + "learning_rate": 0.0001169545769704951, + "loss": 12.1644, + "step": 16983 + }, + { + "epoch": 0.924846677965871, + "grad_norm": 0.6383411558745579, + "learning_rate": 0.00011694588631439147, + "loss": 12.1891, + "step": 16984 + }, + { + "epoch": 0.9249011319624539, + "grad_norm": 0.5272725937755892, + "learning_rate": 0.00011693719552651378, + "loss": 12.0553, + "step": 16985 + }, + { + "epoch": 0.9249555859590369, + "grad_norm": 0.5199498291500081, + "learning_rate": 0.00011692850460692964, + "loss": 12.1509, + "step": 16986 + }, + { + "epoch": 0.92501003995562, + "grad_norm": 0.5810983895908186, + "learning_rate": 0.0001169198135557066, + "loss": 12.1364, + "step": 16987 + }, + { + "epoch": 0.925064493952203, + "grad_norm": 0.5786941192229852, + "learning_rate": 0.00011691112237291224, + "loss": 12.2021, + "step": 16988 + }, + { + "epoch": 0.925118947948786, + "grad_norm": 0.5227920236417976, + "learning_rate": 0.00011690243105861416, + "loss": 12.0734, + "step": 16989 + }, + { + "epoch": 0.925173401945369, + "grad_norm": 0.605307586156183, + "learning_rate": 0.00011689373961287995, + "loss": 12.1793, + "step": 16990 + }, + { + "epoch": 0.925227855941952, + "grad_norm": 0.5932639111977884, + "learning_rate": 0.00011688504803577718, + "loss": 12.1466, + "step": 16991 + }, + { + "epoch": 0.925282309938535, + "grad_norm": 0.6001483622846763, + "learning_rate": 0.00011687635632737346, + "loss": 12.2558, + "step": 16992 + }, + { + "epoch": 0.9253367639351181, + "grad_norm": 0.6580245600331703, + "learning_rate": 0.00011686766448773634, + "loss": 12.2324, + "step": 16993 + }, + { + "epoch": 0.9253912179317011, + "grad_norm": 0.6248616057460186, + "learning_rate": 0.00011685897251693345, + "loss": 12.0222, + "step": 16994 + }, + { + "epoch": 0.9254456719282841, + "grad_norm": 0.6384201382347111, + "learning_rate": 0.00011685028041503236, + "loss": 12.2823, + "step": 16995 + }, + { + "epoch": 0.9255001259248671, + "grad_norm": 0.5851656381722564, + "learning_rate": 0.00011684158818210064, + "loss": 12.1895, + "step": 16996 + }, + { + "epoch": 0.9255545799214501, + "grad_norm": 0.6050196836611006, + "learning_rate": 0.00011683289581820593, + "loss": 12.1716, + "step": 16997 + }, + { + "epoch": 0.9256090339180332, + "grad_norm": 0.5977317895680825, + "learning_rate": 0.0001168242033234158, + "loss": 12.1989, + "step": 16998 + }, + { + "epoch": 0.9256634879146162, + "grad_norm": 0.5553430762164486, + "learning_rate": 0.00011681551069779784, + "loss": 12.099, + "step": 16999 + }, + { + "epoch": 0.9257179419111992, + "grad_norm": 0.6662028747923707, + "learning_rate": 0.00011680681794141965, + "loss": 12.1246, + "step": 17000 + }, + { + "epoch": 0.9257723959077822, + "grad_norm": 0.6443699801689361, + "learning_rate": 0.00011679812505434882, + "loss": 12.0578, + "step": 17001 + }, + { + "epoch": 0.9258268499043651, + "grad_norm": 0.5643059761112635, + "learning_rate": 0.00011678943203665294, + "loss": 12.1783, + "step": 17002 + }, + { + "epoch": 0.9258813039009481, + "grad_norm": 0.6073214020860899, + "learning_rate": 0.00011678073888839965, + "loss": 12.2232, + "step": 17003 + }, + { + "epoch": 0.9259357578975312, + "grad_norm": 0.6071808298586896, + "learning_rate": 0.00011677204560965649, + "loss": 12.2522, + "step": 17004 + }, + { + "epoch": 0.9259902118941142, + "grad_norm": 0.6351405987968302, + "learning_rate": 0.00011676335220049112, + "loss": 12.2171, + "step": 17005 + }, + { + "epoch": 0.9260446658906972, + "grad_norm": 0.5831917980551805, + "learning_rate": 0.0001167546586609711, + "loss": 12.146, + "step": 17006 + }, + { + "epoch": 0.9260991198872802, + "grad_norm": 0.6338100597883539, + "learning_rate": 0.00011674596499116404, + "loss": 12.169, + "step": 17007 + }, + { + "epoch": 0.9261535738838632, + "grad_norm": 0.5556819623122808, + "learning_rate": 0.00011673727119113756, + "loss": 12.0856, + "step": 17008 + }, + { + "epoch": 0.9262080278804462, + "grad_norm": 0.5452693802865192, + "learning_rate": 0.00011672857726095923, + "loss": 12.2743, + "step": 17009 + }, + { + "epoch": 0.9262624818770293, + "grad_norm": 0.5845607427851699, + "learning_rate": 0.0001167198832006967, + "loss": 12.3197, + "step": 17010 + }, + { + "epoch": 0.9263169358736123, + "grad_norm": 0.5174778972822843, + "learning_rate": 0.00011671118901041755, + "loss": 12.1127, + "step": 17011 + }, + { + "epoch": 0.9263713898701953, + "grad_norm": 0.5765368900728236, + "learning_rate": 0.00011670249469018939, + "loss": 12.2682, + "step": 17012 + }, + { + "epoch": 0.9264258438667783, + "grad_norm": 0.5312777638235685, + "learning_rate": 0.00011669380024007981, + "loss": 12.1433, + "step": 17013 + }, + { + "epoch": 0.9264802978633613, + "grad_norm": 0.4942873016918579, + "learning_rate": 0.00011668510566015644, + "loss": 12.0418, + "step": 17014 + }, + { + "epoch": 0.9265347518599443, + "grad_norm": 0.56684886519407, + "learning_rate": 0.00011667641095048693, + "loss": 12.1658, + "step": 17015 + }, + { + "epoch": 0.9265892058565274, + "grad_norm": 0.5438479777725457, + "learning_rate": 0.00011666771611113885, + "loss": 12.3115, + "step": 17016 + }, + { + "epoch": 0.9266436598531104, + "grad_norm": 0.5589051106902513, + "learning_rate": 0.00011665902114217979, + "loss": 11.9983, + "step": 17017 + }, + { + "epoch": 0.9266981138496934, + "grad_norm": 0.55847418262368, + "learning_rate": 0.00011665032604367736, + "loss": 12.059, + "step": 17018 + }, + { + "epoch": 0.9267525678462764, + "grad_norm": 0.5630048352703018, + "learning_rate": 0.00011664163081569923, + "loss": 12.1887, + "step": 17019 + }, + { + "epoch": 0.9268070218428593, + "grad_norm": 0.5732274252668589, + "learning_rate": 0.00011663293545831302, + "loss": 12.3322, + "step": 17020 + }, + { + "epoch": 0.9268614758394423, + "grad_norm": 0.5943019439752977, + "learning_rate": 0.00011662423997158629, + "loss": 12.1177, + "step": 17021 + }, + { + "epoch": 0.9269159298360254, + "grad_norm": 0.5640282485806937, + "learning_rate": 0.00011661554435558668, + "loss": 12.1892, + "step": 17022 + }, + { + "epoch": 0.9269703838326084, + "grad_norm": 0.5874228700734306, + "learning_rate": 0.00011660684861038181, + "loss": 12.1637, + "step": 17023 + }, + { + "epoch": 0.9270248378291914, + "grad_norm": 0.5497658023687364, + "learning_rate": 0.00011659815273603927, + "loss": 12.1317, + "step": 17024 + }, + { + "epoch": 0.9270792918257744, + "grad_norm": 0.7441199081125127, + "learning_rate": 0.00011658945673262675, + "loss": 12.3582, + "step": 17025 + }, + { + "epoch": 0.9271337458223574, + "grad_norm": 0.6354938189067647, + "learning_rate": 0.00011658076060021184, + "loss": 12.2086, + "step": 17026 + }, + { + "epoch": 0.9271881998189404, + "grad_norm": 0.5038329814281183, + "learning_rate": 0.00011657206433886214, + "loss": 12.1215, + "step": 17027 + }, + { + "epoch": 0.9272426538155235, + "grad_norm": 0.5855466980777795, + "learning_rate": 0.00011656336794864528, + "loss": 12.2244, + "step": 17028 + }, + { + "epoch": 0.9272971078121065, + "grad_norm": 0.6551742005373945, + "learning_rate": 0.0001165546714296289, + "loss": 12.1614, + "step": 17029 + }, + { + "epoch": 0.9273515618086895, + "grad_norm": 0.6345041410268087, + "learning_rate": 0.00011654597478188061, + "loss": 12.115, + "step": 17030 + }, + { + "epoch": 0.9274060158052725, + "grad_norm": 0.5470199496279786, + "learning_rate": 0.00011653727800546805, + "loss": 12.1861, + "step": 17031 + }, + { + "epoch": 0.9274604698018555, + "grad_norm": 0.6602093942963679, + "learning_rate": 0.00011652858110045886, + "loss": 12.0375, + "step": 17032 + }, + { + "epoch": 0.9275149237984386, + "grad_norm": 0.6575298540859775, + "learning_rate": 0.00011651988406692062, + "loss": 12.3605, + "step": 17033 + }, + { + "epoch": 0.9275693777950216, + "grad_norm": 0.5075422298046308, + "learning_rate": 0.00011651118690492102, + "loss": 12.0472, + "step": 17034 + }, + { + "epoch": 0.9276238317916046, + "grad_norm": 0.6874168942810434, + "learning_rate": 0.00011650248961452765, + "loss": 12.1397, + "step": 17035 + }, + { + "epoch": 0.9276782857881876, + "grad_norm": 0.573716036356838, + "learning_rate": 0.00011649379219580816, + "loss": 12.1485, + "step": 17036 + }, + { + "epoch": 0.9277327397847706, + "grad_norm": 0.594723876278983, + "learning_rate": 0.00011648509464883018, + "loss": 12.2291, + "step": 17037 + }, + { + "epoch": 0.9277871937813535, + "grad_norm": 0.5567430749098153, + "learning_rate": 0.0001164763969736613, + "loss": 12.1369, + "step": 17038 + }, + { + "epoch": 0.9278416477779367, + "grad_norm": 0.6402356541374973, + "learning_rate": 0.00011646769917036923, + "loss": 12.2883, + "step": 17039 + }, + { + "epoch": 0.9278961017745196, + "grad_norm": 0.5948790511726406, + "learning_rate": 0.00011645900123902156, + "loss": 12.1557, + "step": 17040 + }, + { + "epoch": 0.9279505557711026, + "grad_norm": 0.5748905277106298, + "learning_rate": 0.00011645030317968594, + "loss": 12.1694, + "step": 17041 + }, + { + "epoch": 0.9280050097676856, + "grad_norm": 0.6725080394286576, + "learning_rate": 0.00011644160499243002, + "loss": 12.1196, + "step": 17042 + }, + { + "epoch": 0.9280594637642686, + "grad_norm": 0.6136217550622626, + "learning_rate": 0.00011643290667732141, + "loss": 12.114, + "step": 17043 + }, + { + "epoch": 0.9281139177608516, + "grad_norm": 0.5746529170486847, + "learning_rate": 0.00011642420823442774, + "loss": 12.1748, + "step": 17044 + }, + { + "epoch": 0.9281683717574347, + "grad_norm": 0.7606424559623073, + "learning_rate": 0.00011641550966381669, + "loss": 12.1793, + "step": 17045 + }, + { + "epoch": 0.9282228257540177, + "grad_norm": 0.5736704979817694, + "learning_rate": 0.00011640681096555588, + "loss": 12.2137, + "step": 17046 + }, + { + "epoch": 0.9282772797506007, + "grad_norm": 0.5956818081165554, + "learning_rate": 0.00011639811213971297, + "loss": 12.2121, + "step": 17047 + }, + { + "epoch": 0.9283317337471837, + "grad_norm": 0.5155022066522056, + "learning_rate": 0.00011638941318635557, + "loss": 12.2178, + "step": 17048 + }, + { + "epoch": 0.9283861877437667, + "grad_norm": 0.6042941383500556, + "learning_rate": 0.00011638071410555136, + "loss": 12.1546, + "step": 17049 + }, + { + "epoch": 0.9284406417403497, + "grad_norm": 0.6940539649814073, + "learning_rate": 0.00011637201489736797, + "loss": 12.1122, + "step": 17050 + }, + { + "epoch": 0.9284950957369328, + "grad_norm": 0.5318145162753902, + "learning_rate": 0.00011636331556187303, + "loss": 12.1872, + "step": 17051 + }, + { + "epoch": 0.9285495497335158, + "grad_norm": 0.6024309285582887, + "learning_rate": 0.00011635461609913422, + "loss": 12.2451, + "step": 17052 + }, + { + "epoch": 0.9286040037300988, + "grad_norm": 0.5717953753086978, + "learning_rate": 0.00011634591650921916, + "loss": 12.1309, + "step": 17053 + }, + { + "epoch": 0.9286584577266818, + "grad_norm": 0.6785739383229514, + "learning_rate": 0.00011633721679219553, + "loss": 12.0856, + "step": 17054 + }, + { + "epoch": 0.9287129117232648, + "grad_norm": 0.5574867157515434, + "learning_rate": 0.00011632851694813096, + "loss": 12.2054, + "step": 17055 + }, + { + "epoch": 0.9287673657198477, + "grad_norm": 0.5600461548071675, + "learning_rate": 0.00011631981697709308, + "loss": 12.2204, + "step": 17056 + }, + { + "epoch": 0.9288218197164309, + "grad_norm": 0.5760992539546776, + "learning_rate": 0.00011631111687914959, + "loss": 12.1364, + "step": 17057 + }, + { + "epoch": 0.9288762737130138, + "grad_norm": 0.5484661974854979, + "learning_rate": 0.00011630241665436812, + "loss": 12.2272, + "step": 17058 + }, + { + "epoch": 0.9289307277095968, + "grad_norm": 0.6020621004749886, + "learning_rate": 0.00011629371630281632, + "loss": 12.1409, + "step": 17059 + }, + { + "epoch": 0.9289851817061798, + "grad_norm": 0.554976863830771, + "learning_rate": 0.00011628501582456186, + "loss": 12.1648, + "step": 17060 + }, + { + "epoch": 0.9290396357027628, + "grad_norm": 0.5700672634511531, + "learning_rate": 0.00011627631521967237, + "loss": 12.1624, + "step": 17061 + }, + { + "epoch": 0.9290940896993459, + "grad_norm": 0.5677438278839031, + "learning_rate": 0.00011626761448821551, + "loss": 12.1885, + "step": 17062 + }, + { + "epoch": 0.9291485436959289, + "grad_norm": 0.7112087422260562, + "learning_rate": 0.00011625891363025896, + "loss": 12.3993, + "step": 17063 + }, + { + "epoch": 0.9292029976925119, + "grad_norm": 0.611967896965925, + "learning_rate": 0.00011625021264587037, + "loss": 12.1846, + "step": 17064 + }, + { + "epoch": 0.9292574516890949, + "grad_norm": 0.5658316156390023, + "learning_rate": 0.00011624151153511745, + "loss": 12.1155, + "step": 17065 + }, + { + "epoch": 0.9293119056856779, + "grad_norm": 0.5161574686800847, + "learning_rate": 0.00011623281029806776, + "loss": 12.102, + "step": 17066 + }, + { + "epoch": 0.9293663596822609, + "grad_norm": 0.5298763617916129, + "learning_rate": 0.000116224108934789, + "loss": 12.1406, + "step": 17067 + }, + { + "epoch": 0.929420813678844, + "grad_norm": 0.6023836352490733, + "learning_rate": 0.00011621540744534886, + "loss": 12.1412, + "step": 17068 + }, + { + "epoch": 0.929475267675427, + "grad_norm": 0.6430555819292236, + "learning_rate": 0.00011620670582981504, + "loss": 12.17, + "step": 17069 + }, + { + "epoch": 0.92952972167201, + "grad_norm": 0.5346111639127659, + "learning_rate": 0.00011619800408825511, + "loss": 12.2956, + "step": 17070 + }, + { + "epoch": 0.929584175668593, + "grad_norm": 0.5934744123185334, + "learning_rate": 0.00011618930222073681, + "loss": 12.1488, + "step": 17071 + }, + { + "epoch": 0.929638629665176, + "grad_norm": 0.5902132907553823, + "learning_rate": 0.00011618060022732778, + "loss": 12.137, + "step": 17072 + }, + { + "epoch": 0.929693083661759, + "grad_norm": 0.5783018779464009, + "learning_rate": 0.00011617189810809566, + "loss": 12.1546, + "step": 17073 + }, + { + "epoch": 0.9297475376583421, + "grad_norm": 0.5436054397773015, + "learning_rate": 0.00011616319586310815, + "loss": 12.0984, + "step": 17074 + }, + { + "epoch": 0.929801991654925, + "grad_norm": 0.5234324656657, + "learning_rate": 0.00011615449349243297, + "loss": 12.1047, + "step": 17075 + }, + { + "epoch": 0.929856445651508, + "grad_norm": 0.5774191004854853, + "learning_rate": 0.0001161457909961377, + "loss": 12.1284, + "step": 17076 + }, + { + "epoch": 0.929910899648091, + "grad_norm": 0.5535469054706854, + "learning_rate": 0.00011613708837429005, + "loss": 12.1228, + "step": 17077 + }, + { + "epoch": 0.929965353644674, + "grad_norm": 0.5883833818085753, + "learning_rate": 0.00011612838562695772, + "loss": 12.2282, + "step": 17078 + }, + { + "epoch": 0.930019807641257, + "grad_norm": 0.5181596944895872, + "learning_rate": 0.0001161196827542083, + "loss": 12.1033, + "step": 17079 + }, + { + "epoch": 0.9300742616378401, + "grad_norm": 0.5877125272973959, + "learning_rate": 0.00011611097975610959, + "loss": 12.1928, + "step": 17080 + }, + { + "epoch": 0.9301287156344231, + "grad_norm": 0.5048991643199667, + "learning_rate": 0.00011610227663272917, + "loss": 12.1095, + "step": 17081 + }, + { + "epoch": 0.9301831696310061, + "grad_norm": 0.6103487489415191, + "learning_rate": 0.00011609357338413476, + "loss": 11.9787, + "step": 17082 + }, + { + "epoch": 0.9302376236275891, + "grad_norm": 0.5328646129693538, + "learning_rate": 0.000116084870010394, + "loss": 12.1755, + "step": 17083 + }, + { + "epoch": 0.9302920776241721, + "grad_norm": 0.6728846970846543, + "learning_rate": 0.00011607616651157461, + "loss": 12.0957, + "step": 17084 + }, + { + "epoch": 0.9303465316207551, + "grad_norm": 0.5457521547207682, + "learning_rate": 0.00011606746288774426, + "loss": 12.2521, + "step": 17085 + }, + { + "epoch": 0.9304009856173382, + "grad_norm": 0.5770038977786668, + "learning_rate": 0.00011605875913897062, + "loss": 12.0996, + "step": 17086 + }, + { + "epoch": 0.9304554396139212, + "grad_norm": 0.5414805298595619, + "learning_rate": 0.00011605005526532136, + "loss": 12.2006, + "step": 17087 + }, + { + "epoch": 0.9305098936105042, + "grad_norm": 0.551139919852669, + "learning_rate": 0.0001160413512668642, + "loss": 12.1229, + "step": 17088 + }, + { + "epoch": 0.9305643476070872, + "grad_norm": 0.5455866926700138, + "learning_rate": 0.00011603264714366677, + "loss": 12.082, + "step": 17089 + }, + { + "epoch": 0.9306188016036702, + "grad_norm": 0.5882208100494172, + "learning_rate": 0.00011602394289579681, + "loss": 12.1895, + "step": 17090 + }, + { + "epoch": 0.9306732556002532, + "grad_norm": 0.5537036790854779, + "learning_rate": 0.00011601523852332199, + "loss": 12.1104, + "step": 17091 + }, + { + "epoch": 0.9307277095968363, + "grad_norm": 0.5617690974975039, + "learning_rate": 0.00011600653402630999, + "loss": 12.2329, + "step": 17092 + }, + { + "epoch": 0.9307821635934193, + "grad_norm": 0.5464658860363083, + "learning_rate": 0.00011599782940482849, + "loss": 12.1214, + "step": 17093 + }, + { + "epoch": 0.9308366175900022, + "grad_norm": 0.5451502554132573, + "learning_rate": 0.00011598912465894518, + "loss": 12.129, + "step": 17094 + }, + { + "epoch": 0.9308910715865852, + "grad_norm": 0.5438190252079544, + "learning_rate": 0.00011598041978872776, + "loss": 12.102, + "step": 17095 + }, + { + "epoch": 0.9309455255831682, + "grad_norm": 0.5555309407005641, + "learning_rate": 0.00011597171479424391, + "loss": 12.1562, + "step": 17096 + }, + { + "epoch": 0.9309999795797513, + "grad_norm": 0.5117316313785095, + "learning_rate": 0.00011596300967556132, + "loss": 12.0907, + "step": 17097 + }, + { + "epoch": 0.9310544335763343, + "grad_norm": 0.610060574719516, + "learning_rate": 0.00011595430443274771, + "loss": 12.1539, + "step": 17098 + }, + { + "epoch": 0.9311088875729173, + "grad_norm": 0.674916055042679, + "learning_rate": 0.00011594559906587075, + "loss": 12.3322, + "step": 17099 + }, + { + "epoch": 0.9311633415695003, + "grad_norm": 0.5116038869777758, + "learning_rate": 0.00011593689357499813, + "loss": 12.1525, + "step": 17100 + }, + { + "epoch": 0.9312177955660833, + "grad_norm": 0.5640310944624323, + "learning_rate": 0.00011592818796019756, + "loss": 12.2007, + "step": 17101 + }, + { + "epoch": 0.9312722495626663, + "grad_norm": 0.6681081203799246, + "learning_rate": 0.00011591948222153672, + "loss": 12.1388, + "step": 17102 + }, + { + "epoch": 0.9313267035592494, + "grad_norm": 0.5026041518446537, + "learning_rate": 0.00011591077635908332, + "loss": 12.1392, + "step": 17103 + }, + { + "epoch": 0.9313811575558324, + "grad_norm": 0.5451971935992687, + "learning_rate": 0.00011590207037290506, + "loss": 12.2014, + "step": 17104 + }, + { + "epoch": 0.9314356115524154, + "grad_norm": 0.6192185932773268, + "learning_rate": 0.00011589336426306963, + "loss": 12.2781, + "step": 17105 + }, + { + "epoch": 0.9314900655489984, + "grad_norm": 0.5562586554348979, + "learning_rate": 0.00011588465802964472, + "loss": 12.0791, + "step": 17106 + }, + { + "epoch": 0.9315445195455814, + "grad_norm": 0.5452650641975968, + "learning_rate": 0.00011587595167269807, + "loss": 12.1655, + "step": 17107 + }, + { + "epoch": 0.9315989735421644, + "grad_norm": 0.7624867306533909, + "learning_rate": 0.00011586724519229734, + "loss": 12.1808, + "step": 17108 + }, + { + "epoch": 0.9316534275387475, + "grad_norm": 0.6006007027077219, + "learning_rate": 0.00011585853858851026, + "loss": 12.1602, + "step": 17109 + }, + { + "epoch": 0.9317078815353305, + "grad_norm": 0.5315077069002945, + "learning_rate": 0.00011584983186140453, + "loss": 12.2366, + "step": 17110 + }, + { + "epoch": 0.9317623355319135, + "grad_norm": 0.6186125620931717, + "learning_rate": 0.00011584112501104785, + "loss": 12.1332, + "step": 17111 + }, + { + "epoch": 0.9318167895284964, + "grad_norm": 0.5338508517958055, + "learning_rate": 0.0001158324180375079, + "loss": 12.1224, + "step": 17112 + }, + { + "epoch": 0.9318712435250794, + "grad_norm": 0.5366362161509145, + "learning_rate": 0.00011582371094085243, + "loss": 12.1756, + "step": 17113 + }, + { + "epoch": 0.9319256975216624, + "grad_norm": 0.5225751437955348, + "learning_rate": 0.00011581500372114912, + "loss": 12.1414, + "step": 17114 + }, + { + "epoch": 0.9319801515182455, + "grad_norm": 0.5488376445852008, + "learning_rate": 0.00011580629637846572, + "loss": 12.2341, + "step": 17115 + }, + { + "epoch": 0.9320346055148285, + "grad_norm": 0.6082275491325198, + "learning_rate": 0.00011579758891286988, + "loss": 12.1232, + "step": 17116 + }, + { + "epoch": 0.9320890595114115, + "grad_norm": 0.7465683112808241, + "learning_rate": 0.00011578888132442935, + "loss": 12.1643, + "step": 17117 + }, + { + "epoch": 0.9321435135079945, + "grad_norm": 0.5606054589322913, + "learning_rate": 0.00011578017361321183, + "loss": 12.0169, + "step": 17118 + }, + { + "epoch": 0.9321979675045775, + "grad_norm": 0.48457016303443323, + "learning_rate": 0.00011577146577928501, + "loss": 12.2196, + "step": 17119 + }, + { + "epoch": 0.9322524215011605, + "grad_norm": 0.6522209736071727, + "learning_rate": 0.00011576275782271666, + "loss": 12.1335, + "step": 17120 + }, + { + "epoch": 0.9323068754977436, + "grad_norm": 0.5705310071264384, + "learning_rate": 0.00011575404974357447, + "loss": 12.1468, + "step": 17121 + }, + { + "epoch": 0.9323613294943266, + "grad_norm": 0.6997733089409353, + "learning_rate": 0.00011574534154192611, + "loss": 12.2644, + "step": 17122 + }, + { + "epoch": 0.9324157834909096, + "grad_norm": 0.53727646261493, + "learning_rate": 0.00011573663321783935, + "loss": 12.2136, + "step": 17123 + }, + { + "epoch": 0.9324702374874926, + "grad_norm": 0.5202735440742016, + "learning_rate": 0.00011572792477138188, + "loss": 12.0472, + "step": 17124 + }, + { + "epoch": 0.9325246914840756, + "grad_norm": 0.5814386451768224, + "learning_rate": 0.00011571921620262145, + "loss": 12.1675, + "step": 17125 + }, + { + "epoch": 0.9325791454806586, + "grad_norm": 0.597725688567498, + "learning_rate": 0.00011571050751162577, + "loss": 12.2002, + "step": 17126 + }, + { + "epoch": 0.9326335994772417, + "grad_norm": 0.5559482868976506, + "learning_rate": 0.00011570179869846252, + "loss": 12.1005, + "step": 17127 + }, + { + "epoch": 0.9326880534738247, + "grad_norm": 0.5902442834706637, + "learning_rate": 0.00011569308976319946, + "loss": 12.2999, + "step": 17128 + }, + { + "epoch": 0.9327425074704077, + "grad_norm": 0.6024852485207571, + "learning_rate": 0.00011568438070590429, + "loss": 12.2589, + "step": 17129 + }, + { + "epoch": 0.9327969614669906, + "grad_norm": 0.5433397289395396, + "learning_rate": 0.00011567567152664476, + "loss": 12.2047, + "step": 17130 + }, + { + "epoch": 0.9328514154635736, + "grad_norm": 0.5315996546762495, + "learning_rate": 0.00011566696222548858, + "loss": 12.1595, + "step": 17131 + }, + { + "epoch": 0.9329058694601567, + "grad_norm": 0.6238510583867447, + "learning_rate": 0.00011565825280250348, + "loss": 12.037, + "step": 17132 + }, + { + "epoch": 0.9329603234567397, + "grad_norm": 0.5691266977985826, + "learning_rate": 0.00011564954325775714, + "loss": 12.2516, + "step": 17133 + }, + { + "epoch": 0.9330147774533227, + "grad_norm": 0.5831989456009242, + "learning_rate": 0.00011564083359131736, + "loss": 12.2016, + "step": 17134 + }, + { + "epoch": 0.9330692314499057, + "grad_norm": 0.5385068948884913, + "learning_rate": 0.00011563212380325184, + "loss": 12.0845, + "step": 17135 + }, + { + "epoch": 0.9331236854464887, + "grad_norm": 0.5410471288548246, + "learning_rate": 0.00011562341389362831, + "loss": 12.16, + "step": 17136 + }, + { + "epoch": 0.9331781394430717, + "grad_norm": 0.5636545198374513, + "learning_rate": 0.00011561470386251449, + "loss": 12.2888, + "step": 17137 + }, + { + "epoch": 0.9332325934396548, + "grad_norm": 0.5218132981958773, + "learning_rate": 0.00011560599370997812, + "loss": 12.113, + "step": 17138 + }, + { + "epoch": 0.9332870474362378, + "grad_norm": 0.5833455834743435, + "learning_rate": 0.00011559728343608687, + "loss": 12.0584, + "step": 17139 + }, + { + "epoch": 0.9333415014328208, + "grad_norm": 0.5390141802259124, + "learning_rate": 0.00011558857304090858, + "loss": 12.0942, + "step": 17140 + }, + { + "epoch": 0.9333959554294038, + "grad_norm": 0.5172155285648882, + "learning_rate": 0.00011557986252451095, + "loss": 12.1737, + "step": 17141 + }, + { + "epoch": 0.9334504094259868, + "grad_norm": 0.5277277962681094, + "learning_rate": 0.00011557115188696163, + "loss": 11.991, + "step": 17142 + }, + { + "epoch": 0.9335048634225698, + "grad_norm": 0.5706722403948381, + "learning_rate": 0.00011556244112832848, + "loss": 12.0166, + "step": 17143 + }, + { + "epoch": 0.9335593174191529, + "grad_norm": 0.5892789767614426, + "learning_rate": 0.00011555373024867912, + "loss": 12.2115, + "step": 17144 + }, + { + "epoch": 0.9336137714157359, + "grad_norm": 0.575780453957793, + "learning_rate": 0.00011554501924808139, + "loss": 12.0539, + "step": 17145 + }, + { + "epoch": 0.9336682254123189, + "grad_norm": 0.5678263145783361, + "learning_rate": 0.00011553630812660298, + "loss": 12.1123, + "step": 17146 + }, + { + "epoch": 0.9337226794089019, + "grad_norm": 0.5739674121461581, + "learning_rate": 0.00011552759688431165, + "loss": 12.143, + "step": 17147 + }, + { + "epoch": 0.9337771334054848, + "grad_norm": 0.523763579277213, + "learning_rate": 0.0001155188855212751, + "loss": 12.0914, + "step": 17148 + }, + { + "epoch": 0.9338315874020678, + "grad_norm": 0.5190423460737542, + "learning_rate": 0.00011551017403756108, + "loss": 11.9937, + "step": 17149 + }, + { + "epoch": 0.933886041398651, + "grad_norm": 0.5127265184209078, + "learning_rate": 0.00011550146243323734, + "loss": 12.0303, + "step": 17150 + }, + { + "epoch": 0.9339404953952339, + "grad_norm": 0.6086177874858811, + "learning_rate": 0.00011549275070837167, + "loss": 12.2432, + "step": 17151 + }, + { + "epoch": 0.9339949493918169, + "grad_norm": 0.5150423244136452, + "learning_rate": 0.00011548403886303176, + "loss": 12.1052, + "step": 17152 + }, + { + "epoch": 0.9340494033883999, + "grad_norm": 0.546345240118378, + "learning_rate": 0.00011547532689728535, + "loss": 12.1896, + "step": 17153 + }, + { + "epoch": 0.9341038573849829, + "grad_norm": 0.6130336189200207, + "learning_rate": 0.0001154666148112002, + "loss": 12.0921, + "step": 17154 + }, + { + "epoch": 0.9341583113815659, + "grad_norm": 0.5476637616569271, + "learning_rate": 0.00011545790260484409, + "loss": 12.0947, + "step": 17155 + }, + { + "epoch": 0.934212765378149, + "grad_norm": 0.5547513808519101, + "learning_rate": 0.00011544919027828472, + "loss": 12.0596, + "step": 17156 + }, + { + "epoch": 0.934267219374732, + "grad_norm": 0.5931607387938834, + "learning_rate": 0.00011544047783158984, + "loss": 12.1686, + "step": 17157 + }, + { + "epoch": 0.934321673371315, + "grad_norm": 0.599223671621129, + "learning_rate": 0.00011543176526482722, + "loss": 12.0938, + "step": 17158 + }, + { + "epoch": 0.934376127367898, + "grad_norm": 0.565651878471124, + "learning_rate": 0.00011542305257806463, + "loss": 12.1767, + "step": 17159 + }, + { + "epoch": 0.934430581364481, + "grad_norm": 0.5577459724178409, + "learning_rate": 0.00011541433977136977, + "loss": 12.078, + "step": 17160 + }, + { + "epoch": 0.934485035361064, + "grad_norm": 0.5095102241397735, + "learning_rate": 0.00011540562684481042, + "loss": 12.1707, + "step": 17161 + }, + { + "epoch": 0.9345394893576471, + "grad_norm": 0.568478015192735, + "learning_rate": 0.00011539691379845435, + "loss": 12.0611, + "step": 17162 + }, + { + "epoch": 0.9345939433542301, + "grad_norm": 0.5619916172790094, + "learning_rate": 0.00011538820063236928, + "loss": 12.2408, + "step": 17163 + }, + { + "epoch": 0.9346483973508131, + "grad_norm": 0.5480443047733544, + "learning_rate": 0.00011537948734662299, + "loss": 12.025, + "step": 17164 + }, + { + "epoch": 0.9347028513473961, + "grad_norm": 0.49224784515548436, + "learning_rate": 0.00011537077394128321, + "loss": 12.029, + "step": 17165 + }, + { + "epoch": 0.934757305343979, + "grad_norm": 0.49048885080856797, + "learning_rate": 0.00011536206041641776, + "loss": 12.1366, + "step": 17166 + }, + { + "epoch": 0.9348117593405622, + "grad_norm": 0.519980158503926, + "learning_rate": 0.00011535334677209431, + "loss": 12.0942, + "step": 17167 + }, + { + "epoch": 0.9348662133371451, + "grad_norm": 0.5725900985707221, + "learning_rate": 0.00011534463300838067, + "loss": 12.2885, + "step": 17168 + }, + { + "epoch": 0.9349206673337281, + "grad_norm": 0.5930581777586704, + "learning_rate": 0.0001153359191253446, + "loss": 12.1438, + "step": 17169 + }, + { + "epoch": 0.9349751213303111, + "grad_norm": 0.592854074271066, + "learning_rate": 0.00011532720512305384, + "loss": 12.1531, + "step": 17170 + }, + { + "epoch": 0.9350295753268941, + "grad_norm": 0.5424309855938475, + "learning_rate": 0.00011531849100157617, + "loss": 12.1337, + "step": 17171 + }, + { + "epoch": 0.9350840293234771, + "grad_norm": 0.5206493011432844, + "learning_rate": 0.00011530977676097934, + "loss": 12.1362, + "step": 17172 + }, + { + "epoch": 0.9351384833200602, + "grad_norm": 0.5947327816436266, + "learning_rate": 0.00011530106240133112, + "loss": 12.1783, + "step": 17173 + }, + { + "epoch": 0.9351929373166432, + "grad_norm": 0.5273619577220753, + "learning_rate": 0.00011529234792269927, + "loss": 12.0998, + "step": 17174 + }, + { + "epoch": 0.9352473913132262, + "grad_norm": 0.5809187372992236, + "learning_rate": 0.00011528363332515155, + "loss": 11.9999, + "step": 17175 + }, + { + "epoch": 0.9353018453098092, + "grad_norm": 0.5708666708407913, + "learning_rate": 0.00011527491860875575, + "loss": 12.1564, + "step": 17176 + }, + { + "epoch": 0.9353562993063922, + "grad_norm": 0.5071641418210248, + "learning_rate": 0.00011526620377357961, + "loss": 12.0695, + "step": 17177 + }, + { + "epoch": 0.9354107533029752, + "grad_norm": 0.5717476277001297, + "learning_rate": 0.00011525748881969091, + "loss": 12.1292, + "step": 17178 + }, + { + "epoch": 0.9354652072995583, + "grad_norm": 0.48412158610907385, + "learning_rate": 0.00011524877374715743, + "loss": 12.0474, + "step": 17179 + }, + { + "epoch": 0.9355196612961413, + "grad_norm": 0.6004876501095229, + "learning_rate": 0.00011524005855604692, + "loss": 12.107, + "step": 17180 + }, + { + "epoch": 0.9355741152927243, + "grad_norm": 0.5485097559377725, + "learning_rate": 0.0001152313432464272, + "loss": 11.9742, + "step": 17181 + }, + { + "epoch": 0.9356285692893073, + "grad_norm": 0.5582121852771219, + "learning_rate": 0.00011522262781836593, + "loss": 12.0482, + "step": 17182 + }, + { + "epoch": 0.9356830232858903, + "grad_norm": 0.6497945649112843, + "learning_rate": 0.000115213912271931, + "loss": 12.0386, + "step": 17183 + }, + { + "epoch": 0.9357374772824733, + "grad_norm": 0.5430460869746485, + "learning_rate": 0.00011520519660719009, + "loss": 12.0862, + "step": 17184 + }, + { + "epoch": 0.9357919312790564, + "grad_norm": 0.5625820920059277, + "learning_rate": 0.00011519648082421107, + "loss": 12.1591, + "step": 17185 + }, + { + "epoch": 0.9358463852756393, + "grad_norm": 0.6288034785425524, + "learning_rate": 0.00011518776492306167, + "loss": 12.2041, + "step": 17186 + }, + { + "epoch": 0.9359008392722223, + "grad_norm": 0.5300944084181654, + "learning_rate": 0.00011517904890380963, + "loss": 12.162, + "step": 17187 + }, + { + "epoch": 0.9359552932688053, + "grad_norm": 0.5571033320051584, + "learning_rate": 0.00011517033276652276, + "loss": 11.9701, + "step": 17188 + }, + { + "epoch": 0.9360097472653883, + "grad_norm": 0.6130325143516594, + "learning_rate": 0.00011516161651126884, + "loss": 12.077, + "step": 17189 + }, + { + "epoch": 0.9360642012619713, + "grad_norm": 0.5379809855893798, + "learning_rate": 0.00011515290013811565, + "loss": 12.1274, + "step": 17190 + }, + { + "epoch": 0.9361186552585544, + "grad_norm": 0.5765900320169567, + "learning_rate": 0.00011514418364713102, + "loss": 12.1006, + "step": 17191 + }, + { + "epoch": 0.9361731092551374, + "grad_norm": 0.5541974507656546, + "learning_rate": 0.00011513546703838263, + "loss": 12.2088, + "step": 17192 + }, + { + "epoch": 0.9362275632517204, + "grad_norm": 0.6294734378681487, + "learning_rate": 0.0001151267503119383, + "loss": 12.2842, + "step": 17193 + }, + { + "epoch": 0.9362820172483034, + "grad_norm": 0.4899631038419695, + "learning_rate": 0.00011511803346786583, + "loss": 12.1487, + "step": 17194 + }, + { + "epoch": 0.9363364712448864, + "grad_norm": 0.6317754888221799, + "learning_rate": 0.000115109316506233, + "loss": 12.2118, + "step": 17195 + }, + { + "epoch": 0.9363909252414695, + "grad_norm": 0.577960628241482, + "learning_rate": 0.00011510059942710762, + "loss": 12.0954, + "step": 17196 + }, + { + "epoch": 0.9364453792380525, + "grad_norm": 0.5164315592205132, + "learning_rate": 0.00011509188223055742, + "loss": 12.0404, + "step": 17197 + }, + { + "epoch": 0.9364998332346355, + "grad_norm": 0.5667785249426822, + "learning_rate": 0.00011508316491665021, + "loss": 12.1055, + "step": 17198 + }, + { + "epoch": 0.9365542872312185, + "grad_norm": 0.5659445918451027, + "learning_rate": 0.00011507444748545375, + "loss": 12.0473, + "step": 17199 + }, + { + "epoch": 0.9366087412278015, + "grad_norm": 0.5705775571743129, + "learning_rate": 0.0001150657299370359, + "loss": 12.1798, + "step": 17200 + }, + { + "epoch": 0.9366631952243845, + "grad_norm": 0.5987243004380423, + "learning_rate": 0.00011505701227146441, + "loss": 12.0715, + "step": 17201 + }, + { + "epoch": 0.9367176492209676, + "grad_norm": 0.5876089105774565, + "learning_rate": 0.00011504829448880708, + "loss": 12.0864, + "step": 17202 + }, + { + "epoch": 0.9367721032175506, + "grad_norm": 0.5385016581645973, + "learning_rate": 0.00011503957658913165, + "loss": 12.1301, + "step": 17203 + }, + { + "epoch": 0.9368265572141335, + "grad_norm": 0.6261101427354301, + "learning_rate": 0.00011503085857250595, + "loss": 12.2408, + "step": 17204 + }, + { + "epoch": 0.9368810112107165, + "grad_norm": 0.7358535512856772, + "learning_rate": 0.00011502214043899779, + "loss": 12.2206, + "step": 17205 + }, + { + "epoch": 0.9369354652072995, + "grad_norm": 0.5778289403753424, + "learning_rate": 0.00011501342218867496, + "loss": 12.141, + "step": 17206 + }, + { + "epoch": 0.9369899192038825, + "grad_norm": 0.5286401393506325, + "learning_rate": 0.00011500470382160524, + "loss": 12.1795, + "step": 17207 + }, + { + "epoch": 0.9370443732004656, + "grad_norm": 0.5701706314532361, + "learning_rate": 0.0001149959853378564, + "loss": 12.1545, + "step": 17208 + }, + { + "epoch": 0.9370988271970486, + "grad_norm": 0.5960299180323986, + "learning_rate": 0.0001149872667374963, + "loss": 12.1785, + "step": 17209 + }, + { + "epoch": 0.9371532811936316, + "grad_norm": 0.5829854024390061, + "learning_rate": 0.00011497854802059265, + "loss": 12.1051, + "step": 17210 + }, + { + "epoch": 0.9372077351902146, + "grad_norm": 0.5881724224735968, + "learning_rate": 0.00011496982918721333, + "loss": 12.2929, + "step": 17211 + }, + { + "epoch": 0.9372621891867976, + "grad_norm": 0.5541811460465499, + "learning_rate": 0.00011496111023742611, + "loss": 12.2226, + "step": 17212 + }, + { + "epoch": 0.9373166431833806, + "grad_norm": 0.5486564396126202, + "learning_rate": 0.0001149523911712988, + "loss": 12.1091, + "step": 17213 + }, + { + "epoch": 0.9373710971799637, + "grad_norm": 0.6097862643791315, + "learning_rate": 0.00011494367198889915, + "loss": 12.155, + "step": 17214 + }, + { + "epoch": 0.9374255511765467, + "grad_norm": 0.545545428506642, + "learning_rate": 0.00011493495269029501, + "loss": 12.0725, + "step": 17215 + }, + { + "epoch": 0.9374800051731297, + "grad_norm": 0.609667211079781, + "learning_rate": 0.0001149262332755542, + "loss": 12.0824, + "step": 17216 + }, + { + "epoch": 0.9375344591697127, + "grad_norm": 0.5271418269924881, + "learning_rate": 0.00011491751374474447, + "loss": 12.1602, + "step": 17217 + }, + { + "epoch": 0.9375889131662957, + "grad_norm": 0.5761514603166165, + "learning_rate": 0.00011490879409793367, + "loss": 12.1334, + "step": 17218 + }, + { + "epoch": 0.9376433671628787, + "grad_norm": 0.5680582296778267, + "learning_rate": 0.00011490007433518956, + "loss": 12.1941, + "step": 17219 + }, + { + "epoch": 0.9376978211594618, + "grad_norm": 0.6078517328718738, + "learning_rate": 0.00011489135445658001, + "loss": 12.0645, + "step": 17220 + }, + { + "epoch": 0.9377522751560448, + "grad_norm": 0.509081853414556, + "learning_rate": 0.00011488263446217278, + "loss": 12.0702, + "step": 17221 + }, + { + "epoch": 0.9378067291526277, + "grad_norm": 0.5501102501809502, + "learning_rate": 0.00011487391435203568, + "loss": 12.0714, + "step": 17222 + }, + { + "epoch": 0.9378611831492107, + "grad_norm": 0.5744126645045073, + "learning_rate": 0.00011486519412623654, + "loss": 12.1208, + "step": 17223 + }, + { + "epoch": 0.9379156371457937, + "grad_norm": 0.5587439549384924, + "learning_rate": 0.00011485647378484312, + "loss": 12.1202, + "step": 17224 + }, + { + "epoch": 0.9379700911423767, + "grad_norm": 0.561473849628231, + "learning_rate": 0.0001148477533279233, + "loss": 12.1321, + "step": 17225 + }, + { + "epoch": 0.9380245451389598, + "grad_norm": 0.5659402032513542, + "learning_rate": 0.00011483903275554486, + "loss": 12.0077, + "step": 17226 + }, + { + "epoch": 0.9380789991355428, + "grad_norm": 0.5377675953612329, + "learning_rate": 0.00011483031206777562, + "loss": 12.1217, + "step": 17227 + }, + { + "epoch": 0.9381334531321258, + "grad_norm": 0.5242989901422854, + "learning_rate": 0.00011482159126468338, + "loss": 12.1191, + "step": 17228 + }, + { + "epoch": 0.9381879071287088, + "grad_norm": 0.5297059636906761, + "learning_rate": 0.00011481287034633595, + "loss": 12.09, + "step": 17229 + }, + { + "epoch": 0.9382423611252918, + "grad_norm": 0.5568369256020095, + "learning_rate": 0.00011480414931280114, + "loss": 12.2279, + "step": 17230 + }, + { + "epoch": 0.9382968151218749, + "grad_norm": 0.5244586564878074, + "learning_rate": 0.00011479542816414681, + "loss": 12.1117, + "step": 17231 + }, + { + "epoch": 0.9383512691184579, + "grad_norm": 0.5413030521735377, + "learning_rate": 0.00011478670690044075, + "loss": 12.3001, + "step": 17232 + }, + { + "epoch": 0.9384057231150409, + "grad_norm": 0.5701559086699557, + "learning_rate": 0.00011477798552175076, + "loss": 12.208, + "step": 17233 + }, + { + "epoch": 0.9384601771116239, + "grad_norm": 0.512633275163757, + "learning_rate": 0.0001147692640281447, + "loss": 12.1601, + "step": 17234 + }, + { + "epoch": 0.9385146311082069, + "grad_norm": 0.5953874673114348, + "learning_rate": 0.00011476054241969035, + "loss": 12.2078, + "step": 17235 + }, + { + "epoch": 0.9385690851047899, + "grad_norm": 0.5853808960841238, + "learning_rate": 0.00011475182069645556, + "loss": 12.1906, + "step": 17236 + }, + { + "epoch": 0.938623539101373, + "grad_norm": 0.5601311855180856, + "learning_rate": 0.00011474309885850811, + "loss": 12.1287, + "step": 17237 + }, + { + "epoch": 0.938677993097956, + "grad_norm": 0.5993874750692453, + "learning_rate": 0.00011473437690591589, + "loss": 12.3005, + "step": 17238 + }, + { + "epoch": 0.938732447094539, + "grad_norm": 0.5524879737952889, + "learning_rate": 0.00011472565483874665, + "loss": 12.2526, + "step": 17239 + }, + { + "epoch": 0.938786901091122, + "grad_norm": 0.6498326786926967, + "learning_rate": 0.00011471693265706827, + "loss": 12.1835, + "step": 17240 + }, + { + "epoch": 0.9388413550877049, + "grad_norm": 0.547793442948194, + "learning_rate": 0.00011470821036094856, + "loss": 12.0828, + "step": 17241 + }, + { + "epoch": 0.9388958090842879, + "grad_norm": 0.5926849825805017, + "learning_rate": 0.0001146994879504553, + "loss": 12.305, + "step": 17242 + }, + { + "epoch": 0.938950263080871, + "grad_norm": 0.5139655451009593, + "learning_rate": 0.00011469076542565637, + "loss": 12.0733, + "step": 17243 + }, + { + "epoch": 0.939004717077454, + "grad_norm": 0.5806839240526883, + "learning_rate": 0.00011468204278661961, + "loss": 12.0816, + "step": 17244 + }, + { + "epoch": 0.939059171074037, + "grad_norm": 0.5360932660865105, + "learning_rate": 0.00011467332003341281, + "loss": 12.1387, + "step": 17245 + }, + { + "epoch": 0.93911362507062, + "grad_norm": 0.5959651623575238, + "learning_rate": 0.00011466459716610382, + "loss": 12.1715, + "step": 17246 + }, + { + "epoch": 0.939168079067203, + "grad_norm": 0.49148132757862095, + "learning_rate": 0.00011465587418476047, + "loss": 12.1486, + "step": 17247 + }, + { + "epoch": 0.939222533063786, + "grad_norm": 0.5828766323037756, + "learning_rate": 0.00011464715108945055, + "loss": 12.1267, + "step": 17248 + }, + { + "epoch": 0.9392769870603691, + "grad_norm": 0.556492706857238, + "learning_rate": 0.00011463842788024192, + "loss": 12.1077, + "step": 17249 + }, + { + "epoch": 0.9393314410569521, + "grad_norm": 0.529851964618464, + "learning_rate": 0.00011462970455720246, + "loss": 12.0681, + "step": 17250 + }, + { + "epoch": 0.9393858950535351, + "grad_norm": 0.5119082388277009, + "learning_rate": 0.00011462098112039997, + "loss": 12.1816, + "step": 17251 + }, + { + "epoch": 0.9394403490501181, + "grad_norm": 0.5067629636196034, + "learning_rate": 0.00011461225756990226, + "loss": 12.1203, + "step": 17252 + }, + { + "epoch": 0.9394948030467011, + "grad_norm": 0.5617860450760956, + "learning_rate": 0.00011460353390577716, + "loss": 12.2252, + "step": 17253 + }, + { + "epoch": 0.9395492570432841, + "grad_norm": 0.4995096624139038, + "learning_rate": 0.00011459481012809256, + "loss": 12.1493, + "step": 17254 + }, + { + "epoch": 0.9396037110398672, + "grad_norm": 0.5723783085188626, + "learning_rate": 0.00011458608623691627, + "loss": 12.1434, + "step": 17255 + }, + { + "epoch": 0.9396581650364502, + "grad_norm": 0.6267075493003642, + "learning_rate": 0.00011457736223231612, + "loss": 12.1304, + "step": 17256 + }, + { + "epoch": 0.9397126190330332, + "grad_norm": 0.5694948368007134, + "learning_rate": 0.00011456863811435998, + "loss": 12.0896, + "step": 17257 + }, + { + "epoch": 0.9397670730296162, + "grad_norm": 0.5344020892925807, + "learning_rate": 0.00011455991388311564, + "loss": 12.2124, + "step": 17258 + }, + { + "epoch": 0.9398215270261991, + "grad_norm": 0.542197920612981, + "learning_rate": 0.00011455118953865096, + "loss": 12.1603, + "step": 17259 + }, + { + "epoch": 0.9398759810227821, + "grad_norm": 0.5471556562301783, + "learning_rate": 0.00011454246508103379, + "loss": 12.0228, + "step": 17260 + }, + { + "epoch": 0.9399304350193652, + "grad_norm": 0.5374465446352593, + "learning_rate": 0.00011453374051033199, + "loss": 12.2189, + "step": 17261 + }, + { + "epoch": 0.9399848890159482, + "grad_norm": 0.5364038580856191, + "learning_rate": 0.00011452501582661341, + "loss": 12.2213, + "step": 17262 + }, + { + "epoch": 0.9400393430125312, + "grad_norm": 0.5654931347362897, + "learning_rate": 0.00011451629102994583, + "loss": 12.1549, + "step": 17263 + }, + { + "epoch": 0.9400937970091142, + "grad_norm": 0.6023270565342111, + "learning_rate": 0.00011450756612039715, + "loss": 12.0658, + "step": 17264 + }, + { + "epoch": 0.9401482510056972, + "grad_norm": 0.5695175187289669, + "learning_rate": 0.00011449884109803519, + "loss": 12.2456, + "step": 17265 + }, + { + "epoch": 0.9402027050022803, + "grad_norm": 0.6502306398488362, + "learning_rate": 0.00011449011596292783, + "loss": 12.2209, + "step": 17266 + }, + { + "epoch": 0.9402571589988633, + "grad_norm": 0.5582934648368669, + "learning_rate": 0.0001144813907151429, + "loss": 12.1111, + "step": 17267 + }, + { + "epoch": 0.9403116129954463, + "grad_norm": 0.6445743452834216, + "learning_rate": 0.00011447266535474824, + "loss": 12.1412, + "step": 17268 + }, + { + "epoch": 0.9403660669920293, + "grad_norm": 0.5357891514697972, + "learning_rate": 0.0001144639398818117, + "loss": 12.1874, + "step": 17269 + }, + { + "epoch": 0.9404205209886123, + "grad_norm": 0.5512280444023175, + "learning_rate": 0.00011445521429640114, + "loss": 12.1655, + "step": 17270 + }, + { + "epoch": 0.9404749749851953, + "grad_norm": 0.625777175265489, + "learning_rate": 0.0001144464885985844, + "loss": 12.0704, + "step": 17271 + }, + { + "epoch": 0.9405294289817784, + "grad_norm": 0.5525918408539039, + "learning_rate": 0.00011443776278842937, + "loss": 11.9734, + "step": 17272 + }, + { + "epoch": 0.9405838829783614, + "grad_norm": 0.5754948191262231, + "learning_rate": 0.00011442903686600386, + "loss": 12.1846, + "step": 17273 + }, + { + "epoch": 0.9406383369749444, + "grad_norm": 0.5790135252150729, + "learning_rate": 0.00011442031083137574, + "loss": 12.223, + "step": 17274 + }, + { + "epoch": 0.9406927909715274, + "grad_norm": 0.5600864123416874, + "learning_rate": 0.00011441158468461286, + "loss": 12.0566, + "step": 17275 + }, + { + "epoch": 0.9407472449681104, + "grad_norm": 0.5736911960390423, + "learning_rate": 0.00011440285842578306, + "loss": 12.1032, + "step": 17276 + }, + { + "epoch": 0.9408016989646933, + "grad_norm": 0.5543856612987784, + "learning_rate": 0.00011439413205495428, + "loss": 12.0983, + "step": 17277 + }, + { + "epoch": 0.9408561529612764, + "grad_norm": 0.5984601213406878, + "learning_rate": 0.00011438540557219429, + "loss": 12.2118, + "step": 17278 + }, + { + "epoch": 0.9409106069578594, + "grad_norm": 0.5530215820344018, + "learning_rate": 0.00011437667897757093, + "loss": 12.2162, + "step": 17279 + }, + { + "epoch": 0.9409650609544424, + "grad_norm": 0.5248977660305737, + "learning_rate": 0.00011436795227115216, + "loss": 12.1515, + "step": 17280 + }, + { + "epoch": 0.9410195149510254, + "grad_norm": 0.5153105000297468, + "learning_rate": 0.00011435922545300572, + "loss": 12.0293, + "step": 17281 + }, + { + "epoch": 0.9410739689476084, + "grad_norm": 0.5206047845654533, + "learning_rate": 0.00011435049852319961, + "loss": 12.1381, + "step": 17282 + }, + { + "epoch": 0.9411284229441914, + "grad_norm": 0.5185415528814564, + "learning_rate": 0.00011434177148180159, + "loss": 12.0153, + "step": 17283 + }, + { + "epoch": 0.9411828769407745, + "grad_norm": 0.5520501755180541, + "learning_rate": 0.00011433304432887952, + "loss": 12.1841, + "step": 17284 + }, + { + "epoch": 0.9412373309373575, + "grad_norm": 0.6256029817077796, + "learning_rate": 0.00011432431706450133, + "loss": 12.119, + "step": 17285 + }, + { + "epoch": 0.9412917849339405, + "grad_norm": 0.5374855220323205, + "learning_rate": 0.00011431558968873482, + "loss": 12.198, + "step": 17286 + }, + { + "epoch": 0.9413462389305235, + "grad_norm": 0.5433292626029383, + "learning_rate": 0.0001143068622016479, + "loss": 12.0907, + "step": 17287 + }, + { + "epoch": 0.9414006929271065, + "grad_norm": 0.5521975390496797, + "learning_rate": 0.00011429813460330841, + "loss": 12.1391, + "step": 17288 + }, + { + "epoch": 0.9414551469236895, + "grad_norm": 0.5429237677242953, + "learning_rate": 0.00011428940689378423, + "loss": 12.1754, + "step": 17289 + }, + { + "epoch": 0.9415096009202726, + "grad_norm": 0.6030906254884438, + "learning_rate": 0.00011428067907314324, + "loss": 12.2256, + "step": 17290 + }, + { + "epoch": 0.9415640549168556, + "grad_norm": 0.5743037190203135, + "learning_rate": 0.00011427195114145328, + "loss": 11.9909, + "step": 17291 + }, + { + "epoch": 0.9416185089134386, + "grad_norm": 0.5468077782858237, + "learning_rate": 0.00011426322309878223, + "loss": 12.0723, + "step": 17292 + }, + { + "epoch": 0.9416729629100216, + "grad_norm": 0.5599611855486325, + "learning_rate": 0.00011425449494519798, + "loss": 12.0496, + "step": 17293 + }, + { + "epoch": 0.9417274169066046, + "grad_norm": 0.5863866721261957, + "learning_rate": 0.00011424576668076838, + "loss": 12.0658, + "step": 17294 + }, + { + "epoch": 0.9417818709031875, + "grad_norm": 0.562459945614839, + "learning_rate": 0.00011423703830556132, + "loss": 12.1998, + "step": 17295 + }, + { + "epoch": 0.9418363248997706, + "grad_norm": 0.5531156188848322, + "learning_rate": 0.00011422830981964465, + "loss": 12.1009, + "step": 17296 + }, + { + "epoch": 0.9418907788963536, + "grad_norm": 0.5583643188921626, + "learning_rate": 0.00011421958122308625, + "loss": 12.1192, + "step": 17297 + }, + { + "epoch": 0.9419452328929366, + "grad_norm": 0.568445649499285, + "learning_rate": 0.00011421085251595402, + "loss": 12.1539, + "step": 17298 + }, + { + "epoch": 0.9419996868895196, + "grad_norm": 0.5543496850795123, + "learning_rate": 0.00011420212369831579, + "loss": 12.1915, + "step": 17299 + }, + { + "epoch": 0.9420541408861026, + "grad_norm": 0.6451922077957235, + "learning_rate": 0.0001141933947702395, + "loss": 12.1941, + "step": 17300 + }, + { + "epoch": 0.9421085948826857, + "grad_norm": 0.5803207346318053, + "learning_rate": 0.00011418466573179297, + "loss": 12.1449, + "step": 17301 + }, + { + "epoch": 0.9421630488792687, + "grad_norm": 0.5794351050123866, + "learning_rate": 0.00011417593658304411, + "loss": 12.1409, + "step": 17302 + }, + { + "epoch": 0.9422175028758517, + "grad_norm": 0.6212903231713002, + "learning_rate": 0.00011416720732406078, + "loss": 12.2715, + "step": 17303 + }, + { + "epoch": 0.9422719568724347, + "grad_norm": 0.550008672363222, + "learning_rate": 0.00011415847795491088, + "loss": 11.8332, + "step": 17304 + }, + { + "epoch": 0.9423264108690177, + "grad_norm": 0.5297503330256259, + "learning_rate": 0.00011414974847566226, + "loss": 11.9181, + "step": 17305 + }, + { + "epoch": 0.9423808648656007, + "grad_norm": 0.5472197784649815, + "learning_rate": 0.00011414101888638284, + "loss": 12.0817, + "step": 17306 + }, + { + "epoch": 0.9424353188621838, + "grad_norm": 0.561480153899593, + "learning_rate": 0.00011413228918714051, + "loss": 12.2081, + "step": 17307 + }, + { + "epoch": 0.9424897728587668, + "grad_norm": 0.5529541879321098, + "learning_rate": 0.0001141235593780031, + "loss": 12.109, + "step": 17308 + }, + { + "epoch": 0.9425442268553498, + "grad_norm": 0.6121092589831865, + "learning_rate": 0.00011411482945903853, + "loss": 12.1352, + "step": 17309 + }, + { + "epoch": 0.9425986808519328, + "grad_norm": 0.5248546324420222, + "learning_rate": 0.00011410609943031467, + "loss": 12.1414, + "step": 17310 + }, + { + "epoch": 0.9426531348485158, + "grad_norm": 0.5544470836079637, + "learning_rate": 0.00011409736929189943, + "loss": 12.1372, + "step": 17311 + }, + { + "epoch": 0.9427075888450988, + "grad_norm": 0.6278726221777126, + "learning_rate": 0.00011408863904386068, + "loss": 12.2407, + "step": 17312 + }, + { + "epoch": 0.9427620428416819, + "grad_norm": 0.5198308450201492, + "learning_rate": 0.0001140799086862663, + "loss": 12.0772, + "step": 17313 + }, + { + "epoch": 0.9428164968382649, + "grad_norm": 0.6000937497366587, + "learning_rate": 0.00011407117821918419, + "loss": 12.1906, + "step": 17314 + }, + { + "epoch": 0.9428709508348478, + "grad_norm": 0.504756532291079, + "learning_rate": 0.00011406244764268223, + "loss": 12.0159, + "step": 17315 + }, + { + "epoch": 0.9429254048314308, + "grad_norm": 0.547540736074649, + "learning_rate": 0.00011405371695682834, + "loss": 12.2065, + "step": 17316 + }, + { + "epoch": 0.9429798588280138, + "grad_norm": 0.5870598282344961, + "learning_rate": 0.00011404498616169039, + "loss": 12.247, + "step": 17317 + }, + { + "epoch": 0.9430343128245968, + "grad_norm": 0.6027365008204192, + "learning_rate": 0.00011403625525733628, + "loss": 12.1875, + "step": 17318 + }, + { + "epoch": 0.9430887668211799, + "grad_norm": 0.6294453674112667, + "learning_rate": 0.00011402752424383385, + "loss": 12.1007, + "step": 17319 + }, + { + "epoch": 0.9431432208177629, + "grad_norm": 0.60501481424706, + "learning_rate": 0.00011401879312125108, + "loss": 12.2884, + "step": 17320 + }, + { + "epoch": 0.9431976748143459, + "grad_norm": 0.5559023358331184, + "learning_rate": 0.0001140100618896558, + "loss": 12.1927, + "step": 17321 + }, + { + "epoch": 0.9432521288109289, + "grad_norm": 0.49646878847966375, + "learning_rate": 0.00011400133054911597, + "loss": 12.0381, + "step": 17322 + }, + { + "epoch": 0.9433065828075119, + "grad_norm": 0.534139219927877, + "learning_rate": 0.00011399259909969942, + "loss": 12.1031, + "step": 17323 + }, + { + "epoch": 0.9433610368040949, + "grad_norm": 0.5331067913106019, + "learning_rate": 0.00011398386754147405, + "loss": 12.1819, + "step": 17324 + }, + { + "epoch": 0.943415490800678, + "grad_norm": 0.5361743784298774, + "learning_rate": 0.00011397513587450779, + "loss": 12.0945, + "step": 17325 + }, + { + "epoch": 0.943469944797261, + "grad_norm": 0.5337030188842238, + "learning_rate": 0.00011396640409886854, + "loss": 12.1484, + "step": 17326 + }, + { + "epoch": 0.943524398793844, + "grad_norm": 0.6535158266766455, + "learning_rate": 0.00011395767221462421, + "loss": 12.2561, + "step": 17327 + }, + { + "epoch": 0.943578852790427, + "grad_norm": 0.5496197226487632, + "learning_rate": 0.00011394894022184266, + "loss": 12.0781, + "step": 17328 + }, + { + "epoch": 0.94363330678701, + "grad_norm": 0.5964472425795224, + "learning_rate": 0.0001139402081205918, + "loss": 12.1233, + "step": 17329 + }, + { + "epoch": 0.9436877607835931, + "grad_norm": 0.5233459287429948, + "learning_rate": 0.00011393147591093954, + "loss": 12.2202, + "step": 17330 + }, + { + "epoch": 0.9437422147801761, + "grad_norm": 0.5396707815865741, + "learning_rate": 0.00011392274359295381, + "loss": 12.0967, + "step": 17331 + }, + { + "epoch": 0.943796668776759, + "grad_norm": 0.5324413065684591, + "learning_rate": 0.00011391401116670248, + "loss": 12.097, + "step": 17332 + }, + { + "epoch": 0.943851122773342, + "grad_norm": 0.518466699604795, + "learning_rate": 0.00011390527863225349, + "loss": 12.0776, + "step": 17333 + }, + { + "epoch": 0.943905576769925, + "grad_norm": 0.6130210757512741, + "learning_rate": 0.00011389654598967469, + "loss": 12.174, + "step": 17334 + }, + { + "epoch": 0.943960030766508, + "grad_norm": 0.5183145927597441, + "learning_rate": 0.00011388781323903403, + "loss": 12.1552, + "step": 17335 + }, + { + "epoch": 0.9440144847630911, + "grad_norm": 0.5357415610664379, + "learning_rate": 0.00011387908038039938, + "loss": 12.1089, + "step": 17336 + }, + { + "epoch": 0.9440689387596741, + "grad_norm": 0.5192848791681334, + "learning_rate": 0.00011387034741383872, + "loss": 12.0486, + "step": 17337 + }, + { + "epoch": 0.9441233927562571, + "grad_norm": 0.5211735670403752, + "learning_rate": 0.00011386161433941988, + "loss": 12.2151, + "step": 17338 + }, + { + "epoch": 0.9441778467528401, + "grad_norm": 0.5316633016104322, + "learning_rate": 0.00011385288115721082, + "loss": 12.0402, + "step": 17339 + }, + { + "epoch": 0.9442323007494231, + "grad_norm": 0.5882106176391751, + "learning_rate": 0.00011384414786727942, + "loss": 12.1058, + "step": 17340 + }, + { + "epoch": 0.9442867547460061, + "grad_norm": 0.6081795735550459, + "learning_rate": 0.00011383541446969362, + "loss": 12.1262, + "step": 17341 + }, + { + "epoch": 0.9443412087425892, + "grad_norm": 0.50259690448361, + "learning_rate": 0.0001138266809645213, + "loss": 12.0509, + "step": 17342 + }, + { + "epoch": 0.9443956627391722, + "grad_norm": 0.5571861587041949, + "learning_rate": 0.0001138179473518304, + "loss": 12.1127, + "step": 17343 + }, + { + "epoch": 0.9444501167357552, + "grad_norm": 0.5622518601050132, + "learning_rate": 0.00011380921363168882, + "loss": 12.1726, + "step": 17344 + }, + { + "epoch": 0.9445045707323382, + "grad_norm": 0.5339367631474244, + "learning_rate": 0.00011380047980416449, + "loss": 12.1181, + "step": 17345 + }, + { + "epoch": 0.9445590247289212, + "grad_norm": 0.5763643587724595, + "learning_rate": 0.00011379174586932528, + "loss": 12.0472, + "step": 17346 + }, + { + "epoch": 0.9446134787255042, + "grad_norm": 0.53330973830854, + "learning_rate": 0.00011378301182723918, + "loss": 12.0635, + "step": 17347 + }, + { + "epoch": 0.9446679327220873, + "grad_norm": 0.5257490232366746, + "learning_rate": 0.00011377427767797404, + "loss": 12.1081, + "step": 17348 + }, + { + "epoch": 0.9447223867186703, + "grad_norm": 0.6136431341982396, + "learning_rate": 0.00011376554342159785, + "loss": 12.2335, + "step": 17349 + }, + { + "epoch": 0.9447768407152533, + "grad_norm": 0.546794308497035, + "learning_rate": 0.00011375680905817844, + "loss": 12.1004, + "step": 17350 + }, + { + "epoch": 0.9448312947118362, + "grad_norm": 0.6030840671502765, + "learning_rate": 0.00011374807458778378, + "loss": 12.1877, + "step": 17351 + }, + { + "epoch": 0.9448857487084192, + "grad_norm": 0.5508296104098631, + "learning_rate": 0.00011373934001048181, + "loss": 12.237, + "step": 17352 + }, + { + "epoch": 0.9449402027050022, + "grad_norm": 0.5684494836431603, + "learning_rate": 0.00011373060532634041, + "loss": 12.0696, + "step": 17353 + }, + { + "epoch": 0.9449946567015853, + "grad_norm": 0.5099595884068093, + "learning_rate": 0.00011372187053542753, + "loss": 12.0227, + "step": 17354 + }, + { + "epoch": 0.9450491106981683, + "grad_norm": 0.6102015865848511, + "learning_rate": 0.00011371313563781107, + "loss": 12.2132, + "step": 17355 + }, + { + "epoch": 0.9451035646947513, + "grad_norm": 0.5640477794515227, + "learning_rate": 0.00011370440063355898, + "loss": 12.1198, + "step": 17356 + }, + { + "epoch": 0.9451580186913343, + "grad_norm": 0.6693167233760415, + "learning_rate": 0.00011369566552273919, + "loss": 12.268, + "step": 17357 + }, + { + "epoch": 0.9452124726879173, + "grad_norm": 0.5674765937082062, + "learning_rate": 0.0001136869303054196, + "loss": 12.066, + "step": 17358 + }, + { + "epoch": 0.9452669266845003, + "grad_norm": 0.5607735624035676, + "learning_rate": 0.00011367819498166812, + "loss": 12.1589, + "step": 17359 + }, + { + "epoch": 0.9453213806810834, + "grad_norm": 0.5681497183526665, + "learning_rate": 0.00011366945955155271, + "loss": 12.0458, + "step": 17360 + }, + { + "epoch": 0.9453758346776664, + "grad_norm": 0.5625421382395502, + "learning_rate": 0.00011366072401514129, + "loss": 12.0679, + "step": 17361 + }, + { + "epoch": 0.9454302886742494, + "grad_norm": 0.5649134208134503, + "learning_rate": 0.00011365198837250182, + "loss": 12.1959, + "step": 17362 + }, + { + "epoch": 0.9454847426708324, + "grad_norm": 0.5822452562809511, + "learning_rate": 0.00011364325262370215, + "loss": 12.2539, + "step": 17363 + }, + { + "epoch": 0.9455391966674154, + "grad_norm": 0.6101939104399868, + "learning_rate": 0.0001136345167688103, + "loss": 11.9465, + "step": 17364 + }, + { + "epoch": 0.9455936506639985, + "grad_norm": 0.5451417122742256, + "learning_rate": 0.00011362578080789413, + "loss": 12.1123, + "step": 17365 + }, + { + "epoch": 0.9456481046605815, + "grad_norm": 0.5445237279344438, + "learning_rate": 0.00011361704474102162, + "loss": 12.102, + "step": 17366 + }, + { + "epoch": 0.9457025586571645, + "grad_norm": 0.5558014321065162, + "learning_rate": 0.0001136083085682607, + "loss": 12.1779, + "step": 17367 + }, + { + "epoch": 0.9457570126537475, + "grad_norm": 0.5443084486631766, + "learning_rate": 0.00011359957228967926, + "loss": 12.0391, + "step": 17368 + }, + { + "epoch": 0.9458114666503304, + "grad_norm": 0.5223724320434633, + "learning_rate": 0.00011359083590534527, + "loss": 12.0774, + "step": 17369 + }, + { + "epoch": 0.9458659206469134, + "grad_norm": 0.6259468637705189, + "learning_rate": 0.00011358209941532668, + "loss": 12.2839, + "step": 17370 + }, + { + "epoch": 0.9459203746434965, + "grad_norm": 0.501076660026277, + "learning_rate": 0.0001135733628196914, + "loss": 12.1208, + "step": 17371 + }, + { + "epoch": 0.9459748286400795, + "grad_norm": 0.5180692728575292, + "learning_rate": 0.0001135646261185074, + "loss": 12.0093, + "step": 17372 + }, + { + "epoch": 0.9460292826366625, + "grad_norm": 0.48719332354599426, + "learning_rate": 0.00011355588931184256, + "loss": 12.1396, + "step": 17373 + }, + { + "epoch": 0.9460837366332455, + "grad_norm": 0.6126639995736392, + "learning_rate": 0.00011354715239976483, + "loss": 12.271, + "step": 17374 + }, + { + "epoch": 0.9461381906298285, + "grad_norm": 0.5393699437816664, + "learning_rate": 0.00011353841538234221, + "loss": 12.1046, + "step": 17375 + }, + { + "epoch": 0.9461926446264115, + "grad_norm": 0.5420248069570972, + "learning_rate": 0.00011352967825964259, + "loss": 12.1104, + "step": 17376 + }, + { + "epoch": 0.9462470986229946, + "grad_norm": 0.5331460613546519, + "learning_rate": 0.00011352094103173394, + "loss": 12.0273, + "step": 17377 + }, + { + "epoch": 0.9463015526195776, + "grad_norm": 0.4752978198092573, + "learning_rate": 0.00011351220369868416, + "loss": 12.2514, + "step": 17378 + }, + { + "epoch": 0.9463560066161606, + "grad_norm": 0.6074988970316852, + "learning_rate": 0.00011350346626056121, + "loss": 12.2263, + "step": 17379 + }, + { + "epoch": 0.9464104606127436, + "grad_norm": 0.5837326630588333, + "learning_rate": 0.00011349472871743306, + "loss": 12.1478, + "step": 17380 + }, + { + "epoch": 0.9464649146093266, + "grad_norm": 0.7559202053192198, + "learning_rate": 0.00011348599106936762, + "loss": 12.2578, + "step": 17381 + }, + { + "epoch": 0.9465193686059096, + "grad_norm": 0.6222093173873497, + "learning_rate": 0.00011347725331643289, + "loss": 12.2416, + "step": 17382 + }, + { + "epoch": 0.9465738226024927, + "grad_norm": 0.5169883056107297, + "learning_rate": 0.00011346851545869674, + "loss": 12.0261, + "step": 17383 + }, + { + "epoch": 0.9466282765990757, + "grad_norm": 0.5574210783211934, + "learning_rate": 0.00011345977749622718, + "loss": 12.1017, + "step": 17384 + }, + { + "epoch": 0.9466827305956587, + "grad_norm": 0.6661342012506033, + "learning_rate": 0.0001134510394290921, + "loss": 12.1888, + "step": 17385 + }, + { + "epoch": 0.9467371845922417, + "grad_norm": 0.5531060894825526, + "learning_rate": 0.0001134423012573595, + "loss": 12.1628, + "step": 17386 + }, + { + "epoch": 0.9467916385888246, + "grad_norm": 1.0749222571419381, + "learning_rate": 0.00011343356298109732, + "loss": 12.1868, + "step": 17387 + }, + { + "epoch": 0.9468460925854076, + "grad_norm": 0.5015559730252587, + "learning_rate": 0.00011342482460037352, + "loss": 12.1034, + "step": 17388 + }, + { + "epoch": 0.9469005465819907, + "grad_norm": 0.5686342159403498, + "learning_rate": 0.000113416086115256, + "loss": 12.0834, + "step": 17389 + }, + { + "epoch": 0.9469550005785737, + "grad_norm": 0.5218876135221702, + "learning_rate": 0.00011340734752581274, + "loss": 12.0357, + "step": 17390 + }, + { + "epoch": 0.9470094545751567, + "grad_norm": 0.5554576526720056, + "learning_rate": 0.00011339860883211171, + "loss": 12.1212, + "step": 17391 + }, + { + "epoch": 0.9470639085717397, + "grad_norm": 0.6075109268687903, + "learning_rate": 0.00011338987003422086, + "loss": 12.1669, + "step": 17392 + }, + { + "epoch": 0.9471183625683227, + "grad_norm": 0.5674266591827817, + "learning_rate": 0.00011338113113220814, + "loss": 12.0762, + "step": 17393 + }, + { + "epoch": 0.9471728165649057, + "grad_norm": 0.5507636381804221, + "learning_rate": 0.00011337239212614148, + "loss": 12.0855, + "step": 17394 + }, + { + "epoch": 0.9472272705614888, + "grad_norm": 0.5326074009086318, + "learning_rate": 0.00011336365301608887, + "loss": 12.0757, + "step": 17395 + }, + { + "epoch": 0.9472817245580718, + "grad_norm": 0.5373601830555752, + "learning_rate": 0.00011335491380211823, + "loss": 12.04, + "step": 17396 + }, + { + "epoch": 0.9473361785546548, + "grad_norm": 0.5781711851409429, + "learning_rate": 0.00011334617448429754, + "loss": 12.1499, + "step": 17397 + }, + { + "epoch": 0.9473906325512378, + "grad_norm": 0.62400690303693, + "learning_rate": 0.00011333743506269479, + "loss": 12.3313, + "step": 17398 + }, + { + "epoch": 0.9474450865478208, + "grad_norm": 0.5670385056747921, + "learning_rate": 0.0001133286955373779, + "loss": 11.9934, + "step": 17399 + }, + { + "epoch": 0.9474995405444039, + "grad_norm": 0.5790623640026985, + "learning_rate": 0.00011331995590841482, + "loss": 12.3538, + "step": 17400 + }, + { + "epoch": 0.9475539945409869, + "grad_norm": 0.4869641044386059, + "learning_rate": 0.00011331121617587355, + "loss": 12.131, + "step": 17401 + }, + { + "epoch": 0.9476084485375699, + "grad_norm": 0.605769205367832, + "learning_rate": 0.000113302476339822, + "loss": 12.2189, + "step": 17402 + }, + { + "epoch": 0.9476629025341529, + "grad_norm": 0.5477340165700033, + "learning_rate": 0.00011329373640032821, + "loss": 12.2405, + "step": 17403 + }, + { + "epoch": 0.9477173565307359, + "grad_norm": 0.5295438034670991, + "learning_rate": 0.00011328499635746009, + "loss": 12.1444, + "step": 17404 + }, + { + "epoch": 0.9477718105273188, + "grad_norm": 0.5227246506383778, + "learning_rate": 0.00011327625621128556, + "loss": 12.128, + "step": 17405 + }, + { + "epoch": 0.947826264523902, + "grad_norm": 0.5701971780820198, + "learning_rate": 0.00011326751596187269, + "loss": 12.0952, + "step": 17406 + }, + { + "epoch": 0.9478807185204849, + "grad_norm": 0.5291297001215025, + "learning_rate": 0.00011325877560928932, + "loss": 12.0341, + "step": 17407 + }, + { + "epoch": 0.9479351725170679, + "grad_norm": 0.561216245895998, + "learning_rate": 0.00011325003515360357, + "loss": 12.0441, + "step": 17408 + }, + { + "epoch": 0.9479896265136509, + "grad_norm": 0.7248037630493238, + "learning_rate": 0.00011324129459488329, + "loss": 12.0241, + "step": 17409 + }, + { + "epoch": 0.9480440805102339, + "grad_norm": 0.5478144736346154, + "learning_rate": 0.00011323255393319647, + "loss": 12.1368, + "step": 17410 + }, + { + "epoch": 0.9480985345068169, + "grad_norm": 0.5832870659400218, + "learning_rate": 0.00011322381316861112, + "loss": 12.2738, + "step": 17411 + }, + { + "epoch": 0.9481529885034, + "grad_norm": 0.5652275283387745, + "learning_rate": 0.00011321507230119517, + "loss": 12.1513, + "step": 17412 + }, + { + "epoch": 0.948207442499983, + "grad_norm": 0.538845252796443, + "learning_rate": 0.00011320633133101659, + "loss": 12.0958, + "step": 17413 + }, + { + "epoch": 0.948261896496566, + "grad_norm": 0.5822453906528845, + "learning_rate": 0.00011319759025814335, + "loss": 12.1057, + "step": 17414 + }, + { + "epoch": 0.948316350493149, + "grad_norm": 0.6519578863689457, + "learning_rate": 0.00011318884908264347, + "loss": 12.3193, + "step": 17415 + }, + { + "epoch": 0.948370804489732, + "grad_norm": 0.602744532893201, + "learning_rate": 0.00011318010780458488, + "loss": 12.1319, + "step": 17416 + }, + { + "epoch": 0.948425258486315, + "grad_norm": 0.6262288472678893, + "learning_rate": 0.00011317136642403554, + "loss": 12.0864, + "step": 17417 + }, + { + "epoch": 0.9484797124828981, + "grad_norm": 0.5392669614315889, + "learning_rate": 0.00011316262494106347, + "loss": 12.2444, + "step": 17418 + }, + { + "epoch": 0.9485341664794811, + "grad_norm": 0.6239141160838986, + "learning_rate": 0.0001131538833557366, + "loss": 12.1257, + "step": 17419 + }, + { + "epoch": 0.9485886204760641, + "grad_norm": 0.6145631515186998, + "learning_rate": 0.00011314514166812295, + "loss": 12.1822, + "step": 17420 + }, + { + "epoch": 0.9486430744726471, + "grad_norm": 0.596281164024472, + "learning_rate": 0.00011313639987829046, + "loss": 12.1477, + "step": 17421 + }, + { + "epoch": 0.9486975284692301, + "grad_norm": 0.5929985373760998, + "learning_rate": 0.00011312765798630711, + "loss": 12.0916, + "step": 17422 + }, + { + "epoch": 0.948751982465813, + "grad_norm": 0.8604325299052797, + "learning_rate": 0.00011311891599224092, + "loss": 11.9498, + "step": 17423 + }, + { + "epoch": 0.9488064364623962, + "grad_norm": 0.5983641398134796, + "learning_rate": 0.00011311017389615981, + "loss": 12.1452, + "step": 17424 + }, + { + "epoch": 0.9488608904589791, + "grad_norm": 0.5306545391820514, + "learning_rate": 0.0001131014316981318, + "loss": 12.0887, + "step": 17425 + }, + { + "epoch": 0.9489153444555621, + "grad_norm": 0.6019247104345119, + "learning_rate": 0.00011309268939822486, + "loss": 12.0831, + "step": 17426 + }, + { + "epoch": 0.9489697984521451, + "grad_norm": 0.6030923956429187, + "learning_rate": 0.00011308394699650697, + "loss": 12.1672, + "step": 17427 + }, + { + "epoch": 0.9490242524487281, + "grad_norm": 0.5375711174347375, + "learning_rate": 0.00011307520449304614, + "loss": 12.0472, + "step": 17428 + }, + { + "epoch": 0.9490787064453111, + "grad_norm": 0.5398776364743896, + "learning_rate": 0.0001130664618879103, + "loss": 12.2488, + "step": 17429 + }, + { + "epoch": 0.9491331604418942, + "grad_norm": 0.543823949854254, + "learning_rate": 0.00011305771918116746, + "loss": 12.1812, + "step": 17430 + }, + { + "epoch": 0.9491876144384772, + "grad_norm": 0.5784847529335118, + "learning_rate": 0.00011304897637288561, + "loss": 12.0837, + "step": 17431 + }, + { + "epoch": 0.9492420684350602, + "grad_norm": 0.5714020178424278, + "learning_rate": 0.00011304023346313273, + "loss": 12.1253, + "step": 17432 + }, + { + "epoch": 0.9492965224316432, + "grad_norm": 0.4896225675110771, + "learning_rate": 0.00011303149045197682, + "loss": 12.0577, + "step": 17433 + }, + { + "epoch": 0.9493509764282262, + "grad_norm": 0.5872673682035371, + "learning_rate": 0.00011302274733948583, + "loss": 12.1131, + "step": 17434 + }, + { + "epoch": 0.9494054304248093, + "grad_norm": 0.5501561471508206, + "learning_rate": 0.00011301400412572781, + "loss": 12.1594, + "step": 17435 + }, + { + "epoch": 0.9494598844213923, + "grad_norm": 0.6206510333411461, + "learning_rate": 0.00011300526081077068, + "loss": 12.3073, + "step": 17436 + }, + { + "epoch": 0.9495143384179753, + "grad_norm": 0.5840100543707513, + "learning_rate": 0.00011299651739468246, + "loss": 12.17, + "step": 17437 + }, + { + "epoch": 0.9495687924145583, + "grad_norm": 0.6196232425988139, + "learning_rate": 0.00011298777387753118, + "loss": 12.1427, + "step": 17438 + }, + { + "epoch": 0.9496232464111413, + "grad_norm": 0.70709403486026, + "learning_rate": 0.00011297903025938476, + "loss": 12.1109, + "step": 17439 + }, + { + "epoch": 0.9496777004077243, + "grad_norm": 0.6074900970206965, + "learning_rate": 0.00011297028654031121, + "loss": 12.0835, + "step": 17440 + }, + { + "epoch": 0.9497321544043074, + "grad_norm": 0.6208250634797887, + "learning_rate": 0.00011296154272037856, + "loss": 12.1172, + "step": 17441 + }, + { + "epoch": 0.9497866084008904, + "grad_norm": 0.5726205600853366, + "learning_rate": 0.00011295279879965477, + "loss": 12.1401, + "step": 17442 + }, + { + "epoch": 0.9498410623974733, + "grad_norm": 0.5696277608532468, + "learning_rate": 0.00011294405477820787, + "loss": 12.2064, + "step": 17443 + }, + { + "epoch": 0.9498955163940563, + "grad_norm": 0.5569385381906664, + "learning_rate": 0.00011293531065610581, + "loss": 12.1694, + "step": 17444 + }, + { + "epoch": 0.9499499703906393, + "grad_norm": 0.6089017667986101, + "learning_rate": 0.00011292656643341659, + "loss": 12.1573, + "step": 17445 + }, + { + "epoch": 0.9500044243872223, + "grad_norm": 0.5086641759852797, + "learning_rate": 0.00011291782211020823, + "loss": 12.098, + "step": 17446 + }, + { + "epoch": 0.9500588783838054, + "grad_norm": 0.5903911370652851, + "learning_rate": 0.00011290907768654872, + "loss": 12.1332, + "step": 17447 + }, + { + "epoch": 0.9501133323803884, + "grad_norm": 0.5118851465113656, + "learning_rate": 0.00011290033316250608, + "loss": 11.9755, + "step": 17448 + }, + { + "epoch": 0.9501677863769714, + "grad_norm": 0.5796184135739372, + "learning_rate": 0.00011289158853814827, + "loss": 12.2098, + "step": 17449 + }, + { + "epoch": 0.9502222403735544, + "grad_norm": 0.5154883143599293, + "learning_rate": 0.0001128828438135433, + "loss": 11.988, + "step": 17450 + }, + { + "epoch": 0.9502766943701374, + "grad_norm": 0.5610163462632073, + "learning_rate": 0.00011287409898875916, + "loss": 12.2074, + "step": 17451 + }, + { + "epoch": 0.9503311483667204, + "grad_norm": 0.5747716835570177, + "learning_rate": 0.00011286535406386389, + "loss": 12.079, + "step": 17452 + }, + { + "epoch": 0.9503856023633035, + "grad_norm": 0.5182737197924299, + "learning_rate": 0.0001128566090389255, + "loss": 12.1829, + "step": 17453 + }, + { + "epoch": 0.9504400563598865, + "grad_norm": 0.5359861646375332, + "learning_rate": 0.00011284786391401191, + "loss": 12.1085, + "step": 17454 + }, + { + "epoch": 0.9504945103564695, + "grad_norm": 0.6373633744777849, + "learning_rate": 0.00011283911868919119, + "loss": 12.1897, + "step": 17455 + }, + { + "epoch": 0.9505489643530525, + "grad_norm": 0.5455363959029207, + "learning_rate": 0.00011283037336453132, + "loss": 12.1452, + "step": 17456 + }, + { + "epoch": 0.9506034183496355, + "grad_norm": 0.5436581687885378, + "learning_rate": 0.00011282162794010034, + "loss": 12.1088, + "step": 17457 + }, + { + "epoch": 0.9506578723462185, + "grad_norm": 0.5684905876488265, + "learning_rate": 0.00011281288241596624, + "loss": 12.2331, + "step": 17458 + }, + { + "epoch": 0.9507123263428016, + "grad_norm": 0.5930821930183624, + "learning_rate": 0.000112804136792197, + "loss": 11.9246, + "step": 17459 + }, + { + "epoch": 0.9507667803393846, + "grad_norm": 0.5410520640706985, + "learning_rate": 0.00011279539106886064, + "loss": 12.033, + "step": 17460 + }, + { + "epoch": 0.9508212343359675, + "grad_norm": 0.5541788671385539, + "learning_rate": 0.00011278664524602516, + "loss": 12.1253, + "step": 17461 + }, + { + "epoch": 0.9508756883325505, + "grad_norm": 0.5638897814342899, + "learning_rate": 0.00011277789932375858, + "loss": 12.1153, + "step": 17462 + }, + { + "epoch": 0.9509301423291335, + "grad_norm": 0.5427936828451158, + "learning_rate": 0.00011276915330212894, + "loss": 11.9585, + "step": 17463 + }, + { + "epoch": 0.9509845963257166, + "grad_norm": 0.6050927126754538, + "learning_rate": 0.00011276040718120422, + "loss": 12.1493, + "step": 17464 + }, + { + "epoch": 0.9510390503222996, + "grad_norm": 0.5121729638018014, + "learning_rate": 0.00011275166096105243, + "loss": 11.9161, + "step": 17465 + }, + { + "epoch": 0.9510935043188826, + "grad_norm": 0.5349722848715646, + "learning_rate": 0.00011274291464174158, + "loss": 12.0515, + "step": 17466 + }, + { + "epoch": 0.9511479583154656, + "grad_norm": 0.6259830681803067, + "learning_rate": 0.00011273416822333969, + "loss": 12.1932, + "step": 17467 + }, + { + "epoch": 0.9512024123120486, + "grad_norm": 0.5447584965802076, + "learning_rate": 0.00011272542170591478, + "loss": 12.1151, + "step": 17468 + }, + { + "epoch": 0.9512568663086316, + "grad_norm": 0.5172873978728029, + "learning_rate": 0.00011271667508953485, + "loss": 12.1788, + "step": 17469 + }, + { + "epoch": 0.9513113203052147, + "grad_norm": 0.5173157791537659, + "learning_rate": 0.00011270792837426791, + "loss": 12.042, + "step": 17470 + }, + { + "epoch": 0.9513657743017977, + "grad_norm": 0.6380131731191625, + "learning_rate": 0.000112699181560182, + "loss": 12.0384, + "step": 17471 + }, + { + "epoch": 0.9514202282983807, + "grad_norm": 0.5753540757727378, + "learning_rate": 0.00011269043464734513, + "loss": 12.0551, + "step": 17472 + }, + { + "epoch": 0.9514746822949637, + "grad_norm": 0.7165165108917677, + "learning_rate": 0.00011268168763582529, + "loss": 12.3405, + "step": 17473 + }, + { + "epoch": 0.9515291362915467, + "grad_norm": 0.5157512469301627, + "learning_rate": 0.00011267294052569055, + "loss": 12.0039, + "step": 17474 + }, + { + "epoch": 0.9515835902881297, + "grad_norm": 0.5786763748974254, + "learning_rate": 0.00011266419331700888, + "loss": 12.241, + "step": 17475 + }, + { + "epoch": 0.9516380442847128, + "grad_norm": 0.5817474153048059, + "learning_rate": 0.00011265544600984831, + "loss": 12.1188, + "step": 17476 + }, + { + "epoch": 0.9516924982812958, + "grad_norm": 0.5102441058380783, + "learning_rate": 0.0001126466986042769, + "loss": 12.0575, + "step": 17477 + }, + { + "epoch": 0.9517469522778788, + "grad_norm": 0.6148685340058035, + "learning_rate": 0.00011263795110036261, + "loss": 12.2371, + "step": 17478 + }, + { + "epoch": 0.9518014062744617, + "grad_norm": 0.6335387459693504, + "learning_rate": 0.00011262920349817352, + "loss": 12.0826, + "step": 17479 + }, + { + "epoch": 0.9518558602710447, + "grad_norm": 0.5463841829279674, + "learning_rate": 0.00011262045579777763, + "loss": 12.0817, + "step": 17480 + }, + { + "epoch": 0.9519103142676277, + "grad_norm": 0.5303469750614441, + "learning_rate": 0.00011261170799924291, + "loss": 12.1507, + "step": 17481 + }, + { + "epoch": 0.9519647682642108, + "grad_norm": 0.5621517509390335, + "learning_rate": 0.00011260296010263749, + "loss": 12.1779, + "step": 17482 + }, + { + "epoch": 0.9520192222607938, + "grad_norm": 0.5725064239488651, + "learning_rate": 0.00011259421210802931, + "loss": 12.147, + "step": 17483 + }, + { + "epoch": 0.9520736762573768, + "grad_norm": 0.5288320670227035, + "learning_rate": 0.00011258546401548641, + "loss": 12.0917, + "step": 17484 + }, + { + "epoch": 0.9521281302539598, + "grad_norm": 0.5175473878683832, + "learning_rate": 0.00011257671582507687, + "loss": 12.1715, + "step": 17485 + }, + { + "epoch": 0.9521825842505428, + "grad_norm": 0.5596797458969293, + "learning_rate": 0.00011256796753686867, + "loss": 12.1307, + "step": 17486 + }, + { + "epoch": 0.9522370382471258, + "grad_norm": 0.56719958493079, + "learning_rate": 0.00011255921915092982, + "loss": 12.1031, + "step": 17487 + }, + { + "epoch": 0.9522914922437089, + "grad_norm": 0.5592444241773291, + "learning_rate": 0.00011255047066732842, + "loss": 12.0593, + "step": 17488 + }, + { + "epoch": 0.9523459462402919, + "grad_norm": 0.5679003470701051, + "learning_rate": 0.0001125417220861324, + "loss": 12.2328, + "step": 17489 + }, + { + "epoch": 0.9524004002368749, + "grad_norm": 0.5376761644488985, + "learning_rate": 0.00011253297340740987, + "loss": 12.1243, + "step": 17490 + }, + { + "epoch": 0.9524548542334579, + "grad_norm": 0.5672837763664881, + "learning_rate": 0.00011252422463122884, + "loss": 12.0671, + "step": 17491 + }, + { + "epoch": 0.9525093082300409, + "grad_norm": 0.6140860516736624, + "learning_rate": 0.00011251547575765735, + "loss": 12.2153, + "step": 17492 + }, + { + "epoch": 0.9525637622266239, + "grad_norm": 0.5151714774041493, + "learning_rate": 0.00011250672678676342, + "loss": 12.0233, + "step": 17493 + }, + { + "epoch": 0.952618216223207, + "grad_norm": 0.6192749628451423, + "learning_rate": 0.00011249797771861506, + "loss": 12.1945, + "step": 17494 + }, + { + "epoch": 0.95267267021979, + "grad_norm": 0.5071903732072338, + "learning_rate": 0.00011248922855328035, + "loss": 12.1009, + "step": 17495 + }, + { + "epoch": 0.952727124216373, + "grad_norm": 0.5047572016443033, + "learning_rate": 0.0001124804792908273, + "loss": 12.137, + "step": 17496 + }, + { + "epoch": 0.952781578212956, + "grad_norm": 0.5803959738561252, + "learning_rate": 0.00011247172993132394, + "loss": 12.3401, + "step": 17497 + }, + { + "epoch": 0.9528360322095389, + "grad_norm": 0.6369007115438952, + "learning_rate": 0.00011246298047483834, + "loss": 12.1584, + "step": 17498 + }, + { + "epoch": 0.952890486206122, + "grad_norm": 0.6022015694890215, + "learning_rate": 0.00011245423092143852, + "loss": 12.2493, + "step": 17499 + }, + { + "epoch": 0.952944940202705, + "grad_norm": 0.5495827166066689, + "learning_rate": 0.00011244548127119245, + "loss": 12.0801, + "step": 17500 + }, + { + "epoch": 0.952999394199288, + "grad_norm": 0.5195826351260587, + "learning_rate": 0.00011243673152416827, + "loss": 12.0984, + "step": 17501 + }, + { + "epoch": 0.953053848195871, + "grad_norm": 0.541215822448782, + "learning_rate": 0.000112427981680434, + "loss": 12.0597, + "step": 17502 + }, + { + "epoch": 0.953108302192454, + "grad_norm": 0.546730479220331, + "learning_rate": 0.00011241923174005767, + "loss": 12.1773, + "step": 17503 + }, + { + "epoch": 0.953162756189037, + "grad_norm": 0.5547994486952583, + "learning_rate": 0.00011241048170310726, + "loss": 12.1626, + "step": 17504 + }, + { + "epoch": 0.9532172101856201, + "grad_norm": 0.546797868136595, + "learning_rate": 0.00011240173156965088, + "loss": 12.2097, + "step": 17505 + }, + { + "epoch": 0.9532716641822031, + "grad_norm": 0.6532019259814937, + "learning_rate": 0.00011239298133975656, + "loss": 12.282, + "step": 17506 + }, + { + "epoch": 0.9533261181787861, + "grad_norm": 0.5831149839815883, + "learning_rate": 0.00011238423101349234, + "loss": 12.2849, + "step": 17507 + }, + { + "epoch": 0.9533805721753691, + "grad_norm": 0.5331895682878139, + "learning_rate": 0.00011237548059092629, + "loss": 12.2181, + "step": 17508 + }, + { + "epoch": 0.9534350261719521, + "grad_norm": 0.5594185352289169, + "learning_rate": 0.00011236673007212639, + "loss": 12.0504, + "step": 17509 + }, + { + "epoch": 0.9534894801685351, + "grad_norm": 0.6052484244390849, + "learning_rate": 0.00011235797945716073, + "loss": 12.2553, + "step": 17510 + }, + { + "epoch": 0.9535439341651182, + "grad_norm": 0.5459851995298468, + "learning_rate": 0.00011234922874609735, + "loss": 12.2289, + "step": 17511 + }, + { + "epoch": 0.9535983881617012, + "grad_norm": 0.5216090581938867, + "learning_rate": 0.00011234047793900429, + "loss": 12.0912, + "step": 17512 + }, + { + "epoch": 0.9536528421582842, + "grad_norm": 0.6099670336744216, + "learning_rate": 0.00011233172703594962, + "loss": 12.2609, + "step": 17513 + }, + { + "epoch": 0.9537072961548672, + "grad_norm": 0.5814475487547149, + "learning_rate": 0.0001123229760370014, + "loss": 12.1752, + "step": 17514 + }, + { + "epoch": 0.9537617501514501, + "grad_norm": 0.597815180264054, + "learning_rate": 0.00011231422494222761, + "loss": 12.007, + "step": 17515 + }, + { + "epoch": 0.9538162041480331, + "grad_norm": 0.49954782488132937, + "learning_rate": 0.00011230547375169634, + "loss": 12.0644, + "step": 17516 + }, + { + "epoch": 0.9538706581446162, + "grad_norm": 0.5026829090145971, + "learning_rate": 0.00011229672246547562, + "loss": 12.1274, + "step": 17517 + }, + { + "epoch": 0.9539251121411992, + "grad_norm": 0.5387715703406172, + "learning_rate": 0.00011228797108363358, + "loss": 12.0936, + "step": 17518 + }, + { + "epoch": 0.9539795661377822, + "grad_norm": 0.5469127944330988, + "learning_rate": 0.0001122792196062382, + "loss": 12.1236, + "step": 17519 + }, + { + "epoch": 0.9540340201343652, + "grad_norm": 0.506500734264548, + "learning_rate": 0.00011227046803335755, + "loss": 11.9995, + "step": 17520 + }, + { + "epoch": 0.9540884741309482, + "grad_norm": 0.5343499369968282, + "learning_rate": 0.00011226171636505967, + "loss": 12.1079, + "step": 17521 + }, + { + "epoch": 0.9541429281275312, + "grad_norm": 0.530386561431065, + "learning_rate": 0.00011225296460141262, + "loss": 12.0347, + "step": 17522 + }, + { + "epoch": 0.9541973821241143, + "grad_norm": 0.5424374998579926, + "learning_rate": 0.0001122442127424845, + "loss": 12.1531, + "step": 17523 + }, + { + "epoch": 0.9542518361206973, + "grad_norm": 0.558572783863881, + "learning_rate": 0.00011223546078834328, + "loss": 12.1718, + "step": 17524 + }, + { + "epoch": 0.9543062901172803, + "grad_norm": 0.5112976002495363, + "learning_rate": 0.0001122267087390571, + "loss": 12.1203, + "step": 17525 + }, + { + "epoch": 0.9543607441138633, + "grad_norm": 0.6139608398164536, + "learning_rate": 0.00011221795659469396, + "loss": 12.2115, + "step": 17526 + }, + { + "epoch": 0.9544151981104463, + "grad_norm": 0.5590312583921028, + "learning_rate": 0.00011220920435532197, + "loss": 12.1637, + "step": 17527 + }, + { + "epoch": 0.9544696521070293, + "grad_norm": 0.5220602531012433, + "learning_rate": 0.00011220045202100913, + "loss": 12.0965, + "step": 17528 + }, + { + "epoch": 0.9545241061036124, + "grad_norm": 0.5146669588181602, + "learning_rate": 0.00011219169959182354, + "loss": 12.1581, + "step": 17529 + }, + { + "epoch": 0.9545785601001954, + "grad_norm": 0.5103508856909138, + "learning_rate": 0.00011218294706783323, + "loss": 12.0123, + "step": 17530 + }, + { + "epoch": 0.9546330140967784, + "grad_norm": 0.5394214752467305, + "learning_rate": 0.00011217419444910631, + "loss": 12.1238, + "step": 17531 + }, + { + "epoch": 0.9546874680933614, + "grad_norm": 0.6403842059952954, + "learning_rate": 0.0001121654417357108, + "loss": 12.2112, + "step": 17532 + }, + { + "epoch": 0.9547419220899444, + "grad_norm": 0.5181254678820698, + "learning_rate": 0.00011215668892771478, + "loss": 12.0037, + "step": 17533 + }, + { + "epoch": 0.9547963760865275, + "grad_norm": 0.5086013058427543, + "learning_rate": 0.0001121479360251863, + "loss": 12.1228, + "step": 17534 + }, + { + "epoch": 0.9548508300831104, + "grad_norm": 0.5257888747973913, + "learning_rate": 0.00011213918302819344, + "loss": 12.2236, + "step": 17535 + }, + { + "epoch": 0.9549052840796934, + "grad_norm": 0.5328935148950803, + "learning_rate": 0.00011213042993680424, + "loss": 12.0387, + "step": 17536 + }, + { + "epoch": 0.9549597380762764, + "grad_norm": 0.5784220847540573, + "learning_rate": 0.00011212167675108683, + "loss": 12.1117, + "step": 17537 + }, + { + "epoch": 0.9550141920728594, + "grad_norm": 0.5529509688232136, + "learning_rate": 0.00011211292347110918, + "loss": 12.1224, + "step": 17538 + }, + { + "epoch": 0.9550686460694424, + "grad_norm": 0.6476686014260221, + "learning_rate": 0.0001121041700969394, + "loss": 12.1743, + "step": 17539 + }, + { + "epoch": 0.9551231000660255, + "grad_norm": 0.6194846217104752, + "learning_rate": 0.0001120954166286456, + "loss": 12.1517, + "step": 17540 + }, + { + "epoch": 0.9551775540626085, + "grad_norm": 0.5867983675726799, + "learning_rate": 0.00011208666306629581, + "loss": 12.1768, + "step": 17541 + }, + { + "epoch": 0.9552320080591915, + "grad_norm": 0.5464992669933516, + "learning_rate": 0.00011207790940995808, + "loss": 12.0183, + "step": 17542 + }, + { + "epoch": 0.9552864620557745, + "grad_norm": 0.5798443116280011, + "learning_rate": 0.0001120691556597005, + "loss": 12.0348, + "step": 17543 + }, + { + "epoch": 0.9553409160523575, + "grad_norm": 0.5732012006437913, + "learning_rate": 0.00011206040181559117, + "loss": 12.1493, + "step": 17544 + }, + { + "epoch": 0.9553953700489405, + "grad_norm": 0.5767419723503235, + "learning_rate": 0.0001120516478776981, + "loss": 11.9773, + "step": 17545 + }, + { + "epoch": 0.9554498240455236, + "grad_norm": 0.5359491833504862, + "learning_rate": 0.00011204289384608941, + "loss": 11.8632, + "step": 17546 + }, + { + "epoch": 0.9555042780421066, + "grad_norm": 0.5848547547696398, + "learning_rate": 0.00011203413972083315, + "loss": 12.1047, + "step": 17547 + }, + { + "epoch": 0.9555587320386896, + "grad_norm": 0.5465695726591704, + "learning_rate": 0.00011202538550199742, + "loss": 12.0137, + "step": 17548 + }, + { + "epoch": 0.9556131860352726, + "grad_norm": 0.49996478425654056, + "learning_rate": 0.00011201663118965025, + "loss": 12.0775, + "step": 17549 + }, + { + "epoch": 0.9556676400318556, + "grad_norm": 0.5747110527477752, + "learning_rate": 0.00011200787678385975, + "loss": 12.1007, + "step": 17550 + }, + { + "epoch": 0.9557220940284386, + "grad_norm": 0.6505968349618484, + "learning_rate": 0.000111999122284694, + "loss": 12.2088, + "step": 17551 + }, + { + "epoch": 0.9557765480250217, + "grad_norm": 0.5489434484718236, + "learning_rate": 0.00011199036769222105, + "loss": 12.26, + "step": 17552 + }, + { + "epoch": 0.9558310020216046, + "grad_norm": 0.5103149364972873, + "learning_rate": 0.000111981613006509, + "loss": 12.174, + "step": 17553 + }, + { + "epoch": 0.9558854560181876, + "grad_norm": 0.5294935254432681, + "learning_rate": 0.0001119728582276259, + "loss": 12.1472, + "step": 17554 + }, + { + "epoch": 0.9559399100147706, + "grad_norm": 0.5714016666252779, + "learning_rate": 0.00011196410335563984, + "loss": 12.0593, + "step": 17555 + }, + { + "epoch": 0.9559943640113536, + "grad_norm": 0.5641777737296872, + "learning_rate": 0.00011195534839061895, + "loss": 12.1639, + "step": 17556 + }, + { + "epoch": 0.9560488180079366, + "grad_norm": 0.5434267121340497, + "learning_rate": 0.00011194659333263122, + "loss": 12.0999, + "step": 17557 + }, + { + "epoch": 0.9561032720045197, + "grad_norm": 0.5531724406692369, + "learning_rate": 0.00011193783818174482, + "loss": 12.1736, + "step": 17558 + }, + { + "epoch": 0.9561577260011027, + "grad_norm": 0.5639080263258313, + "learning_rate": 0.00011192908293802778, + "loss": 12.1458, + "step": 17559 + }, + { + "epoch": 0.9562121799976857, + "grad_norm": 0.5885309555968149, + "learning_rate": 0.00011192032760154814, + "loss": 11.9894, + "step": 17560 + }, + { + "epoch": 0.9562666339942687, + "grad_norm": 0.5576789959998956, + "learning_rate": 0.00011191157217237406, + "loss": 12.1119, + "step": 17561 + }, + { + "epoch": 0.9563210879908517, + "grad_norm": 0.5315643033369111, + "learning_rate": 0.00011190281665057362, + "loss": 12.199, + "step": 17562 + }, + { + "epoch": 0.9563755419874347, + "grad_norm": 0.5283050591210666, + "learning_rate": 0.00011189406103621487, + "loss": 12.2325, + "step": 17563 + }, + { + "epoch": 0.9564299959840178, + "grad_norm": 0.5224092214354858, + "learning_rate": 0.00011188530532936592, + "loss": 12.1109, + "step": 17564 + }, + { + "epoch": 0.9564844499806008, + "grad_norm": 0.6066525480426279, + "learning_rate": 0.00011187654953009483, + "loss": 12.1813, + "step": 17565 + }, + { + "epoch": 0.9565389039771838, + "grad_norm": 0.649472746570281, + "learning_rate": 0.00011186779363846966, + "loss": 12.1454, + "step": 17566 + }, + { + "epoch": 0.9565933579737668, + "grad_norm": 0.5786411478000614, + "learning_rate": 0.00011185903765455859, + "loss": 12.1886, + "step": 17567 + }, + { + "epoch": 0.9566478119703498, + "grad_norm": 0.5527424783159517, + "learning_rate": 0.00011185028157842962, + "loss": 12.0374, + "step": 17568 + }, + { + "epoch": 0.9567022659669329, + "grad_norm": 0.5821605386294587, + "learning_rate": 0.00011184152541015092, + "loss": 12.1065, + "step": 17569 + }, + { + "epoch": 0.9567567199635159, + "grad_norm": 0.5925912060183305, + "learning_rate": 0.00011183276914979051, + "loss": 12.3158, + "step": 17570 + }, + { + "epoch": 0.9568111739600988, + "grad_norm": 0.5634063289150791, + "learning_rate": 0.00011182401279741648, + "loss": 12.057, + "step": 17571 + }, + { + "epoch": 0.9568656279566818, + "grad_norm": 0.5824811736735698, + "learning_rate": 0.00011181525635309695, + "loss": 12.2049, + "step": 17572 + }, + { + "epoch": 0.9569200819532648, + "grad_norm": 0.5775520866209082, + "learning_rate": 0.0001118064998169, + "loss": 12.1704, + "step": 17573 + }, + { + "epoch": 0.9569745359498478, + "grad_norm": 0.48993449407470774, + "learning_rate": 0.00011179774318889378, + "loss": 12.1007, + "step": 17574 + }, + { + "epoch": 0.9570289899464309, + "grad_norm": 0.5439971950253318, + "learning_rate": 0.00011178898646914629, + "loss": 12.1628, + "step": 17575 + }, + { + "epoch": 0.9570834439430139, + "grad_norm": 0.5620142288513571, + "learning_rate": 0.00011178022965772566, + "loss": 12.1764, + "step": 17576 + }, + { + "epoch": 0.9571378979395969, + "grad_norm": 0.5586042403613316, + "learning_rate": 0.00011177147275469997, + "loss": 12.0713, + "step": 17577 + }, + { + "epoch": 0.9571923519361799, + "grad_norm": 0.6908235593149696, + "learning_rate": 0.00011176271576013738, + "loss": 12.3033, + "step": 17578 + }, + { + "epoch": 0.9572468059327629, + "grad_norm": 0.5475781376792936, + "learning_rate": 0.00011175395867410592, + "loss": 11.989, + "step": 17579 + }, + { + "epoch": 0.9573012599293459, + "grad_norm": 0.6810380687100889, + "learning_rate": 0.00011174520149667371, + "loss": 12.0593, + "step": 17580 + }, + { + "epoch": 0.957355713925929, + "grad_norm": 0.5340136630295348, + "learning_rate": 0.00011173644422790883, + "loss": 12.0288, + "step": 17581 + }, + { + "epoch": 0.957410167922512, + "grad_norm": 0.5542940066010866, + "learning_rate": 0.00011172768686787938, + "loss": 12.1359, + "step": 17582 + }, + { + "epoch": 0.957464621919095, + "grad_norm": 0.5892735554281785, + "learning_rate": 0.00011171892941665349, + "loss": 12.1703, + "step": 17583 + }, + { + "epoch": 0.957519075915678, + "grad_norm": 0.544821778155144, + "learning_rate": 0.00011171017187429926, + "loss": 12.1065, + "step": 17584 + }, + { + "epoch": 0.957573529912261, + "grad_norm": 0.5092524735271818, + "learning_rate": 0.00011170141424088476, + "loss": 12.0485, + "step": 17585 + }, + { + "epoch": 0.957627983908844, + "grad_norm": 0.5919783181085536, + "learning_rate": 0.00011169265651647809, + "loss": 12.1461, + "step": 17586 + }, + { + "epoch": 0.9576824379054271, + "grad_norm": 0.5603813725359015, + "learning_rate": 0.00011168389870114735, + "loss": 12.1747, + "step": 17587 + }, + { + "epoch": 0.9577368919020101, + "grad_norm": 0.6012158867500161, + "learning_rate": 0.00011167514079496064, + "loss": 12.2645, + "step": 17588 + }, + { + "epoch": 0.957791345898593, + "grad_norm": 0.5660345202872905, + "learning_rate": 0.00011166638279798614, + "loss": 12.1592, + "step": 17589 + }, + { + "epoch": 0.957845799895176, + "grad_norm": 0.5603806205536941, + "learning_rate": 0.00011165762471029184, + "loss": 12.1503, + "step": 17590 + }, + { + "epoch": 0.957900253891759, + "grad_norm": 0.5918656178193153, + "learning_rate": 0.0001116488665319459, + "loss": 12.2107, + "step": 17591 + }, + { + "epoch": 0.957954707888342, + "grad_norm": 0.5341569004437059, + "learning_rate": 0.00011164010826301645, + "loss": 12.191, + "step": 17592 + }, + { + "epoch": 0.9580091618849251, + "grad_norm": 0.6158463315800793, + "learning_rate": 0.00011163134990357153, + "loss": 12.2008, + "step": 17593 + }, + { + "epoch": 0.9580636158815081, + "grad_norm": 0.5816977629065496, + "learning_rate": 0.00011162259145367931, + "loss": 12.1926, + "step": 17594 + }, + { + "epoch": 0.9581180698780911, + "grad_norm": 0.5728226483155598, + "learning_rate": 0.00011161383291340786, + "loss": 12.0512, + "step": 17595 + }, + { + "epoch": 0.9581725238746741, + "grad_norm": 0.5456983562201944, + "learning_rate": 0.00011160507428282529, + "loss": 12.1537, + "step": 17596 + }, + { + "epoch": 0.9582269778712571, + "grad_norm": 0.5588182377531732, + "learning_rate": 0.00011159631556199971, + "loss": 12.2735, + "step": 17597 + }, + { + "epoch": 0.9582814318678402, + "grad_norm": 0.5403151397688128, + "learning_rate": 0.00011158755675099925, + "loss": 12.0898, + "step": 17598 + }, + { + "epoch": 0.9583358858644232, + "grad_norm": 0.6025445149650608, + "learning_rate": 0.00011157879784989202, + "loss": 12.077, + "step": 17599 + }, + { + "epoch": 0.9583903398610062, + "grad_norm": 0.5381067905909196, + "learning_rate": 0.00011157003885874609, + "loss": 12.1695, + "step": 17600 + }, + { + "epoch": 0.9584447938575892, + "grad_norm": 0.509999086899731, + "learning_rate": 0.0001115612797776296, + "loss": 12.0278, + "step": 17601 + }, + { + "epoch": 0.9584992478541722, + "grad_norm": 0.5077297192498021, + "learning_rate": 0.00011155252060661068, + "loss": 12.1266, + "step": 17602 + }, + { + "epoch": 0.9585537018507552, + "grad_norm": 0.5601415730476714, + "learning_rate": 0.00011154376134575742, + "loss": 12.07, + "step": 17603 + }, + { + "epoch": 0.9586081558473383, + "grad_norm": 0.519053095973141, + "learning_rate": 0.00011153500199513791, + "loss": 12.1834, + "step": 17604 + }, + { + "epoch": 0.9586626098439213, + "grad_norm": 0.5574930188491317, + "learning_rate": 0.0001115262425548203, + "loss": 12.1422, + "step": 17605 + }, + { + "epoch": 0.9587170638405043, + "grad_norm": 0.5596730089308446, + "learning_rate": 0.0001115174830248727, + "loss": 12.1437, + "step": 17606 + }, + { + "epoch": 0.9587715178370873, + "grad_norm": 0.5587333800127735, + "learning_rate": 0.00011150872340536323, + "loss": 12.1504, + "step": 17607 + }, + { + "epoch": 0.9588259718336702, + "grad_norm": 0.5281109827424201, + "learning_rate": 0.00011149996369635997, + "loss": 12.083, + "step": 17608 + }, + { + "epoch": 0.9588804258302532, + "grad_norm": 0.556475140686353, + "learning_rate": 0.00011149120389793108, + "loss": 12.0109, + "step": 17609 + }, + { + "epoch": 0.9589348798268363, + "grad_norm": 0.6320131511446037, + "learning_rate": 0.00011148244401014467, + "loss": 11.9375, + "step": 17610 + }, + { + "epoch": 0.9589893338234193, + "grad_norm": 0.5369064905067129, + "learning_rate": 0.00011147368403306884, + "loss": 12.1104, + "step": 17611 + }, + { + "epoch": 0.9590437878200023, + "grad_norm": 0.5518176679425901, + "learning_rate": 0.00011146492396677173, + "loss": 12.0945, + "step": 17612 + }, + { + "epoch": 0.9590982418165853, + "grad_norm": 0.5533819450011978, + "learning_rate": 0.00011145616381132143, + "loss": 12.0637, + "step": 17613 + }, + { + "epoch": 0.9591526958131683, + "grad_norm": 0.5464334412896337, + "learning_rate": 0.00011144740356678611, + "loss": 12.1286, + "step": 17614 + }, + { + "epoch": 0.9592071498097513, + "grad_norm": 0.5515032427606107, + "learning_rate": 0.0001114386432332338, + "loss": 12.0493, + "step": 17615 + }, + { + "epoch": 0.9592616038063344, + "grad_norm": 0.48501813224335094, + "learning_rate": 0.00011142988281073274, + "loss": 12.1414, + "step": 17616 + }, + { + "epoch": 0.9593160578029174, + "grad_norm": 0.508094754675668, + "learning_rate": 0.00011142112229935097, + "loss": 12.113, + "step": 17617 + }, + { + "epoch": 0.9593705117995004, + "grad_norm": 0.5712777383401623, + "learning_rate": 0.00011141236169915665, + "loss": 12.2388, + "step": 17618 + }, + { + "epoch": 0.9594249657960834, + "grad_norm": 0.49801658666359233, + "learning_rate": 0.00011140360101021789, + "loss": 12.1054, + "step": 17619 + }, + { + "epoch": 0.9594794197926664, + "grad_norm": 0.5334850006632109, + "learning_rate": 0.0001113948402326028, + "loss": 12.1435, + "step": 17620 + }, + { + "epoch": 0.9595338737892494, + "grad_norm": 0.5502584110738724, + "learning_rate": 0.00011138607936637952, + "loss": 12.0844, + "step": 17621 + }, + { + "epoch": 0.9595883277858325, + "grad_norm": 0.6631027245293025, + "learning_rate": 0.00011137731841161621, + "loss": 12.1147, + "step": 17622 + }, + { + "epoch": 0.9596427817824155, + "grad_norm": 0.6061199396871476, + "learning_rate": 0.00011136855736838092, + "loss": 12.1675, + "step": 17623 + }, + { + "epoch": 0.9596972357789985, + "grad_norm": 0.5381270689582018, + "learning_rate": 0.0001113597962367419, + "loss": 12.2246, + "step": 17624 + }, + { + "epoch": 0.9597516897755815, + "grad_norm": 0.5291846897679273, + "learning_rate": 0.00011135103501676711, + "loss": 12.0552, + "step": 17625 + }, + { + "epoch": 0.9598061437721644, + "grad_norm": 0.5619720136638755, + "learning_rate": 0.00011134227370852479, + "loss": 12.2252, + "step": 17626 + }, + { + "epoch": 0.9598605977687474, + "grad_norm": 0.5260468703160102, + "learning_rate": 0.00011133351231208307, + "loss": 12.093, + "step": 17627 + }, + { + "epoch": 0.9599150517653305, + "grad_norm": 0.5527473910586487, + "learning_rate": 0.00011132475082751004, + "loss": 12.126, + "step": 17628 + }, + { + "epoch": 0.9599695057619135, + "grad_norm": 0.5415193205337437, + "learning_rate": 0.00011131598925487387, + "loss": 11.9692, + "step": 17629 + }, + { + "epoch": 0.9600239597584965, + "grad_norm": 0.5793518969808102, + "learning_rate": 0.00011130722759424266, + "loss": 12.0674, + "step": 17630 + }, + { + "epoch": 0.9600784137550795, + "grad_norm": 0.6041437604961865, + "learning_rate": 0.00011129846584568453, + "loss": 12.1541, + "step": 17631 + }, + { + "epoch": 0.9601328677516625, + "grad_norm": 0.5358846274478933, + "learning_rate": 0.00011128970400926766, + "loss": 12.1374, + "step": 17632 + }, + { + "epoch": 0.9601873217482456, + "grad_norm": 0.5344201600769312, + "learning_rate": 0.00011128094208506014, + "loss": 12.138, + "step": 17633 + }, + { + "epoch": 0.9602417757448286, + "grad_norm": 0.658089256867609, + "learning_rate": 0.00011127218007313016, + "loss": 12.1471, + "step": 17634 + }, + { + "epoch": 0.9602962297414116, + "grad_norm": 0.5888500353636016, + "learning_rate": 0.00011126341797354578, + "loss": 12.1384, + "step": 17635 + }, + { + "epoch": 0.9603506837379946, + "grad_norm": 0.5721624240589097, + "learning_rate": 0.0001112546557863752, + "loss": 12.1714, + "step": 17636 + }, + { + "epoch": 0.9604051377345776, + "grad_norm": 0.5605762852581129, + "learning_rate": 0.00011124589351168648, + "loss": 12.0539, + "step": 17637 + }, + { + "epoch": 0.9604595917311606, + "grad_norm": 0.6076806266827197, + "learning_rate": 0.00011123713114954784, + "loss": 12.1637, + "step": 17638 + }, + { + "epoch": 0.9605140457277437, + "grad_norm": 0.5530730637761129, + "learning_rate": 0.00011122836870002739, + "loss": 12.0228, + "step": 17639 + }, + { + "epoch": 0.9605684997243267, + "grad_norm": 0.5300881258613511, + "learning_rate": 0.00011121960616319327, + "loss": 12.1155, + "step": 17640 + }, + { + "epoch": 0.9606229537209097, + "grad_norm": 0.5156942259719506, + "learning_rate": 0.0001112108435391136, + "loss": 12.1455, + "step": 17641 + }, + { + "epoch": 0.9606774077174927, + "grad_norm": 0.5948524091580496, + "learning_rate": 0.00011120208082785653, + "loss": 12.1968, + "step": 17642 + }, + { + "epoch": 0.9607318617140757, + "grad_norm": 0.5832431241578581, + "learning_rate": 0.00011119331802949016, + "loss": 12.1199, + "step": 17643 + }, + { + "epoch": 0.9607863157106586, + "grad_norm": 0.5666121368406475, + "learning_rate": 0.00011118455514408272, + "loss": 12.0781, + "step": 17644 + }, + { + "epoch": 0.9608407697072417, + "grad_norm": 0.534523597152492, + "learning_rate": 0.0001111757921717023, + "loss": 12.0978, + "step": 17645 + }, + { + "epoch": 0.9608952237038247, + "grad_norm": 0.5605840777661149, + "learning_rate": 0.00011116702911241703, + "loss": 12.1463, + "step": 17646 + }, + { + "epoch": 0.9609496777004077, + "grad_norm": 0.5344915870698154, + "learning_rate": 0.00011115826596629508, + "loss": 12.169, + "step": 17647 + }, + { + "epoch": 0.9610041316969907, + "grad_norm": 0.593752192813583, + "learning_rate": 0.00011114950273340456, + "loss": 12.1766, + "step": 17648 + }, + { + "epoch": 0.9610585856935737, + "grad_norm": 0.5331963208485385, + "learning_rate": 0.00011114073941381369, + "loss": 12.1211, + "step": 17649 + }, + { + "epoch": 0.9611130396901567, + "grad_norm": 0.6414026744605359, + "learning_rate": 0.00011113197600759053, + "loss": 12.1183, + "step": 17650 + }, + { + "epoch": 0.9611674936867398, + "grad_norm": 0.6785769352508368, + "learning_rate": 0.00011112321251480324, + "loss": 12.2273, + "step": 17651 + }, + { + "epoch": 0.9612219476833228, + "grad_norm": 0.5554691315348811, + "learning_rate": 0.00011111444893552, + "loss": 11.9135, + "step": 17652 + }, + { + "epoch": 0.9612764016799058, + "grad_norm": 0.5031722199806385, + "learning_rate": 0.00011110568526980896, + "loss": 12.1629, + "step": 17653 + }, + { + "epoch": 0.9613308556764888, + "grad_norm": 0.5353251694854835, + "learning_rate": 0.00011109692151773822, + "loss": 12.1008, + "step": 17654 + }, + { + "epoch": 0.9613853096730718, + "grad_norm": 0.5602726696502285, + "learning_rate": 0.00011108815767937598, + "loss": 12.0703, + "step": 17655 + }, + { + "epoch": 0.9614397636696548, + "grad_norm": 0.5902007713398969, + "learning_rate": 0.00011107939375479035, + "loss": 12.1335, + "step": 17656 + }, + { + "epoch": 0.9614942176662379, + "grad_norm": 0.5433913493252555, + "learning_rate": 0.00011107062974404949, + "loss": 12.0951, + "step": 17657 + }, + { + "epoch": 0.9615486716628209, + "grad_norm": 0.5543159795777511, + "learning_rate": 0.00011106186564722156, + "loss": 12.1574, + "step": 17658 + }, + { + "epoch": 0.9616031256594039, + "grad_norm": 0.5747791802624055, + "learning_rate": 0.00011105310146437473, + "loss": 12.0994, + "step": 17659 + }, + { + "epoch": 0.9616575796559869, + "grad_norm": 0.5090408070972073, + "learning_rate": 0.00011104433719557711, + "loss": 12.1205, + "step": 17660 + }, + { + "epoch": 0.9617120336525699, + "grad_norm": 0.508055808689014, + "learning_rate": 0.00011103557284089688, + "loss": 12.1905, + "step": 17661 + }, + { + "epoch": 0.9617664876491528, + "grad_norm": 0.5597201386370542, + "learning_rate": 0.00011102680840040218, + "loss": 12.0787, + "step": 17662 + }, + { + "epoch": 0.961820941645736, + "grad_norm": 0.49863322307253, + "learning_rate": 0.00011101804387416117, + "loss": 12.151, + "step": 17663 + }, + { + "epoch": 0.9618753956423189, + "grad_norm": 0.6389963740600546, + "learning_rate": 0.000111009279262242, + "loss": 12.2213, + "step": 17664 + }, + { + "epoch": 0.9619298496389019, + "grad_norm": 0.6343941955728799, + "learning_rate": 0.00011100051456471283, + "loss": 12.0429, + "step": 17665 + }, + { + "epoch": 0.9619843036354849, + "grad_norm": 0.5353282170232694, + "learning_rate": 0.00011099174978164182, + "loss": 12.1827, + "step": 17666 + }, + { + "epoch": 0.9620387576320679, + "grad_norm": 0.5617569530792962, + "learning_rate": 0.00011098298491309711, + "loss": 12.2126, + "step": 17667 + }, + { + "epoch": 0.962093211628651, + "grad_norm": 0.5868797748808696, + "learning_rate": 0.00011097421995914687, + "loss": 12.2281, + "step": 17668 + }, + { + "epoch": 0.962147665625234, + "grad_norm": 0.534053789297622, + "learning_rate": 0.00011096545491985926, + "loss": 12.1078, + "step": 17669 + }, + { + "epoch": 0.962202119621817, + "grad_norm": 0.722243932219114, + "learning_rate": 0.00011095668979530242, + "loss": 12.3244, + "step": 17670 + }, + { + "epoch": 0.9622565736184, + "grad_norm": 0.5627001055793047, + "learning_rate": 0.00011094792458554455, + "loss": 12.09, + "step": 17671 + }, + { + "epoch": 0.962311027614983, + "grad_norm": 0.5470272643302989, + "learning_rate": 0.00011093915929065378, + "loss": 12.0246, + "step": 17672 + }, + { + "epoch": 0.962365481611566, + "grad_norm": 0.5584494205081291, + "learning_rate": 0.00011093039391069823, + "loss": 12.1595, + "step": 17673 + }, + { + "epoch": 0.9624199356081491, + "grad_norm": 0.5521649001758041, + "learning_rate": 0.00011092162844574616, + "loss": 12.0636, + "step": 17674 + }, + { + "epoch": 0.9624743896047321, + "grad_norm": 0.5869416157238587, + "learning_rate": 0.00011091286289586564, + "loss": 12.2035, + "step": 17675 + }, + { + "epoch": 0.9625288436013151, + "grad_norm": 0.6040009460125166, + "learning_rate": 0.00011090409726112487, + "loss": 12.246, + "step": 17676 + }, + { + "epoch": 0.9625832975978981, + "grad_norm": 0.525820810259676, + "learning_rate": 0.00011089533154159202, + "loss": 12.1115, + "step": 17677 + }, + { + "epoch": 0.9626377515944811, + "grad_norm": 0.550318637173561, + "learning_rate": 0.00011088656573733524, + "loss": 12.0287, + "step": 17678 + }, + { + "epoch": 0.962692205591064, + "grad_norm": 0.5773091445174735, + "learning_rate": 0.00011087779984842273, + "loss": 12.0927, + "step": 17679 + }, + { + "epoch": 0.9627466595876472, + "grad_norm": 0.5159794268401052, + "learning_rate": 0.00011086903387492257, + "loss": 12.1421, + "step": 17680 + }, + { + "epoch": 0.9628011135842302, + "grad_norm": 0.5845435845603085, + "learning_rate": 0.00011086026781690299, + "loss": 12.1661, + "step": 17681 + }, + { + "epoch": 0.9628555675808131, + "grad_norm": 0.570819557846207, + "learning_rate": 0.00011085150167443217, + "loss": 12.1378, + "step": 17682 + }, + { + "epoch": 0.9629100215773961, + "grad_norm": 0.5376903021291405, + "learning_rate": 0.00011084273544757826, + "loss": 12.0382, + "step": 17683 + }, + { + "epoch": 0.9629644755739791, + "grad_norm": 0.6044262125617864, + "learning_rate": 0.00011083396913640942, + "loss": 12.2581, + "step": 17684 + }, + { + "epoch": 0.9630189295705621, + "grad_norm": 0.5417333473630804, + "learning_rate": 0.00011082520274099382, + "loss": 12.1245, + "step": 17685 + }, + { + "epoch": 0.9630733835671452, + "grad_norm": 0.5730865039814494, + "learning_rate": 0.00011081643626139957, + "loss": 12.2003, + "step": 17686 + }, + { + "epoch": 0.9631278375637282, + "grad_norm": 0.5888580206213043, + "learning_rate": 0.00011080766969769493, + "loss": 12.1222, + "step": 17687 + }, + { + "epoch": 0.9631822915603112, + "grad_norm": 0.5706176479690456, + "learning_rate": 0.00011079890304994807, + "loss": 12.0993, + "step": 17688 + }, + { + "epoch": 0.9632367455568942, + "grad_norm": 0.6091264510310652, + "learning_rate": 0.0001107901363182271, + "loss": 12.1513, + "step": 17689 + }, + { + "epoch": 0.9632911995534772, + "grad_norm": 0.5900429395195937, + "learning_rate": 0.00011078136950260025, + "loss": 12.1448, + "step": 17690 + }, + { + "epoch": 0.9633456535500602, + "grad_norm": 0.5261933964563386, + "learning_rate": 0.00011077260260313565, + "loss": 12.1517, + "step": 17691 + }, + { + "epoch": 0.9634001075466433, + "grad_norm": 0.5556032226876118, + "learning_rate": 0.00011076383561990145, + "loss": 12.1654, + "step": 17692 + }, + { + "epoch": 0.9634545615432263, + "grad_norm": 0.6023480256814611, + "learning_rate": 0.0001107550685529659, + "loss": 12.1451, + "step": 17693 + }, + { + "epoch": 0.9635090155398093, + "grad_norm": 0.5415615996758417, + "learning_rate": 0.0001107463014023971, + "loss": 12.1361, + "step": 17694 + }, + { + "epoch": 0.9635634695363923, + "grad_norm": 0.5984563325807083, + "learning_rate": 0.00011073753416826331, + "loss": 12.2119, + "step": 17695 + }, + { + "epoch": 0.9636179235329753, + "grad_norm": 0.522196686225686, + "learning_rate": 0.00011072876685063262, + "loss": 12.1152, + "step": 17696 + }, + { + "epoch": 0.9636723775295583, + "grad_norm": 0.5544319855034716, + "learning_rate": 0.00011071999944957321, + "loss": 12.0433, + "step": 17697 + }, + { + "epoch": 0.9637268315261414, + "grad_norm": 0.6373008070139872, + "learning_rate": 0.00011071123196515332, + "loss": 12.2079, + "step": 17698 + }, + { + "epoch": 0.9637812855227244, + "grad_norm": 0.5717749510728661, + "learning_rate": 0.0001107024643974411, + "loss": 12.0661, + "step": 17699 + }, + { + "epoch": 0.9638357395193073, + "grad_norm": 0.5143182302457973, + "learning_rate": 0.00011069369674650474, + "loss": 12.0557, + "step": 17700 + }, + { + "epoch": 0.9638901935158903, + "grad_norm": 0.6181546577870921, + "learning_rate": 0.00011068492901241237, + "loss": 12.3, + "step": 17701 + }, + { + "epoch": 0.9639446475124733, + "grad_norm": 0.5729675225960451, + "learning_rate": 0.0001106761611952322, + "loss": 12.2037, + "step": 17702 + }, + { + "epoch": 0.9639991015090564, + "grad_norm": 0.6016097035970154, + "learning_rate": 0.0001106673932950324, + "loss": 12.2443, + "step": 17703 + }, + { + "epoch": 0.9640535555056394, + "grad_norm": 0.5869604428655072, + "learning_rate": 0.00011065862531188116, + "loss": 12.1124, + "step": 17704 + }, + { + "epoch": 0.9641080095022224, + "grad_norm": 0.4965795860117334, + "learning_rate": 0.00011064985724584671, + "loss": 12.0727, + "step": 17705 + }, + { + "epoch": 0.9641624634988054, + "grad_norm": 0.6585504075663329, + "learning_rate": 0.00011064108909699715, + "loss": 12.2583, + "step": 17706 + }, + { + "epoch": 0.9642169174953884, + "grad_norm": 0.5649950048506434, + "learning_rate": 0.00011063232086540069, + "loss": 11.9266, + "step": 17707 + }, + { + "epoch": 0.9642713714919714, + "grad_norm": 0.6744596465749182, + "learning_rate": 0.00011062355255112552, + "loss": 12.1738, + "step": 17708 + }, + { + "epoch": 0.9643258254885545, + "grad_norm": 0.5533997239712947, + "learning_rate": 0.00011061478415423983, + "loss": 12.1556, + "step": 17709 + }, + { + "epoch": 0.9643802794851375, + "grad_norm": 0.5526966187371531, + "learning_rate": 0.00011060601567481181, + "loss": 12.1112, + "step": 17710 + }, + { + "epoch": 0.9644347334817205, + "grad_norm": 0.5610632617639335, + "learning_rate": 0.00011059724711290961, + "loss": 12.0933, + "step": 17711 + }, + { + "epoch": 0.9644891874783035, + "grad_norm": 0.6103466403065875, + "learning_rate": 0.00011058847846860147, + "loss": 12.2399, + "step": 17712 + }, + { + "epoch": 0.9645436414748865, + "grad_norm": 0.6296615986685131, + "learning_rate": 0.00011057970974195553, + "loss": 12.0778, + "step": 17713 + }, + { + "epoch": 0.9645980954714695, + "grad_norm": 0.5589011964809298, + "learning_rate": 0.00011057094093303997, + "loss": 12.2017, + "step": 17714 + }, + { + "epoch": 0.9646525494680526, + "grad_norm": 0.5706032467020418, + "learning_rate": 0.00011056217204192306, + "loss": 12.1385, + "step": 17715 + }, + { + "epoch": 0.9647070034646356, + "grad_norm": 0.5380067185680935, + "learning_rate": 0.00011055340306867288, + "loss": 12.1828, + "step": 17716 + }, + { + "epoch": 0.9647614574612186, + "grad_norm": 0.5678778439882624, + "learning_rate": 0.00011054463401335769, + "loss": 12.009, + "step": 17717 + }, + { + "epoch": 0.9648159114578015, + "grad_norm": 0.5380345678007299, + "learning_rate": 0.00011053586487604563, + "loss": 12.0443, + "step": 17718 + }, + { + "epoch": 0.9648703654543845, + "grad_norm": 0.5474811307620698, + "learning_rate": 0.00011052709565680493, + "loss": 12.1248, + "step": 17719 + }, + { + "epoch": 0.9649248194509675, + "grad_norm": 0.5442985197543375, + "learning_rate": 0.00011051832635570379, + "loss": 12.0413, + "step": 17720 + }, + { + "epoch": 0.9649792734475506, + "grad_norm": 0.5285841645711664, + "learning_rate": 0.00011050955697281036, + "loss": 12.0744, + "step": 17721 + }, + { + "epoch": 0.9650337274441336, + "grad_norm": 0.5890287986663996, + "learning_rate": 0.00011050078750819284, + "loss": 12.0981, + "step": 17722 + }, + { + "epoch": 0.9650881814407166, + "grad_norm": 0.5926662553098975, + "learning_rate": 0.00011049201796191945, + "loss": 12.135, + "step": 17723 + }, + { + "epoch": 0.9651426354372996, + "grad_norm": 0.556356239472455, + "learning_rate": 0.00011048324833405839, + "loss": 12.0762, + "step": 17724 + }, + { + "epoch": 0.9651970894338826, + "grad_norm": 0.5723507110626933, + "learning_rate": 0.00011047447862467781, + "loss": 12.095, + "step": 17725 + }, + { + "epoch": 0.9652515434304656, + "grad_norm": 0.6284135079318599, + "learning_rate": 0.00011046570883384593, + "loss": 12.2335, + "step": 17726 + }, + { + "epoch": 0.9653059974270487, + "grad_norm": 0.5506230573164945, + "learning_rate": 0.00011045693896163094, + "loss": 12.1929, + "step": 17727 + }, + { + "epoch": 0.9653604514236317, + "grad_norm": 0.5633949960312391, + "learning_rate": 0.00011044816900810105, + "loss": 12.2633, + "step": 17728 + }, + { + "epoch": 0.9654149054202147, + "grad_norm": 0.5426691026278101, + "learning_rate": 0.00011043939897332442, + "loss": 12.0683, + "step": 17729 + }, + { + "epoch": 0.9654693594167977, + "grad_norm": 0.556582190461888, + "learning_rate": 0.0001104306288573693, + "loss": 12.0986, + "step": 17730 + }, + { + "epoch": 0.9655238134133807, + "grad_norm": 0.5444972117144539, + "learning_rate": 0.00011042185866030386, + "loss": 12.2007, + "step": 17731 + }, + { + "epoch": 0.9655782674099638, + "grad_norm": 0.6540297175499002, + "learning_rate": 0.00011041308838219628, + "loss": 12.009, + "step": 17732 + }, + { + "epoch": 0.9656327214065468, + "grad_norm": 0.5411306170642678, + "learning_rate": 0.0001104043180231148, + "loss": 12.095, + "step": 17733 + }, + { + "epoch": 0.9656871754031298, + "grad_norm": 0.5715791532516885, + "learning_rate": 0.00011039554758312758, + "loss": 12.2714, + "step": 17734 + }, + { + "epoch": 0.9657416293997128, + "grad_norm": 0.48470411648455586, + "learning_rate": 0.00011038677706230285, + "loss": 11.9713, + "step": 17735 + }, + { + "epoch": 0.9657960833962957, + "grad_norm": 0.5726443792608469, + "learning_rate": 0.00011037800646070879, + "loss": 12.2113, + "step": 17736 + }, + { + "epoch": 0.9658505373928787, + "grad_norm": 0.5876872602975912, + "learning_rate": 0.00011036923577841363, + "loss": 12.1221, + "step": 17737 + }, + { + "epoch": 0.9659049913894618, + "grad_norm": 0.6080695032280878, + "learning_rate": 0.00011036046501548554, + "loss": 12.1666, + "step": 17738 + }, + { + "epoch": 0.9659594453860448, + "grad_norm": 0.5340274869133574, + "learning_rate": 0.00011035169417199274, + "loss": 12.2262, + "step": 17739 + }, + { + "epoch": 0.9660138993826278, + "grad_norm": 0.5118138027530789, + "learning_rate": 0.00011034292324800342, + "loss": 12.0464, + "step": 17740 + }, + { + "epoch": 0.9660683533792108, + "grad_norm": 0.557631660530052, + "learning_rate": 0.00011033415224358581, + "loss": 12.0623, + "step": 17741 + }, + { + "epoch": 0.9661228073757938, + "grad_norm": 0.5622665514054274, + "learning_rate": 0.00011032538115880809, + "loss": 12.1165, + "step": 17742 + }, + { + "epoch": 0.9661772613723768, + "grad_norm": 0.6304248670208875, + "learning_rate": 0.00011031660999373847, + "loss": 12.2801, + "step": 17743 + }, + { + "epoch": 0.9662317153689599, + "grad_norm": 0.5487926451839776, + "learning_rate": 0.00011030783874844517, + "loss": 12.2362, + "step": 17744 + }, + { + "epoch": 0.9662861693655429, + "grad_norm": 0.5422608903886796, + "learning_rate": 0.00011029906742299641, + "loss": 12.1801, + "step": 17745 + }, + { + "epoch": 0.9663406233621259, + "grad_norm": 0.5263104983244903, + "learning_rate": 0.00011029029601746033, + "loss": 12.0961, + "step": 17746 + }, + { + "epoch": 0.9663950773587089, + "grad_norm": 0.5613791572161544, + "learning_rate": 0.00011028152453190518, + "loss": 12.1236, + "step": 17747 + }, + { + "epoch": 0.9664495313552919, + "grad_norm": 0.5716835210741285, + "learning_rate": 0.00011027275296639921, + "loss": 12.1599, + "step": 17748 + }, + { + "epoch": 0.9665039853518749, + "grad_norm": 0.4813681714826752, + "learning_rate": 0.00011026398132101057, + "loss": 12.0847, + "step": 17749 + }, + { + "epoch": 0.966558439348458, + "grad_norm": 0.5561675500158483, + "learning_rate": 0.0001102552095958075, + "loss": 12.069, + "step": 17750 + }, + { + "epoch": 0.966612893345041, + "grad_norm": 0.5888608590341523, + "learning_rate": 0.00011024643779085819, + "loss": 12.1539, + "step": 17751 + }, + { + "epoch": 0.966667347341624, + "grad_norm": 0.5710489765909422, + "learning_rate": 0.00011023766590623085, + "loss": 12.1051, + "step": 17752 + }, + { + "epoch": 0.966721801338207, + "grad_norm": 0.5384051393906014, + "learning_rate": 0.00011022889394199371, + "loss": 12.1687, + "step": 17753 + }, + { + "epoch": 0.96677625533479, + "grad_norm": 0.6134717562884549, + "learning_rate": 0.000110220121898215, + "loss": 12.2179, + "step": 17754 + }, + { + "epoch": 0.9668307093313729, + "grad_norm": 0.6099856991129531, + "learning_rate": 0.0001102113497749629, + "loss": 12.1484, + "step": 17755 + }, + { + "epoch": 0.966885163327956, + "grad_norm": 0.5933025472236174, + "learning_rate": 0.00011020257757230563, + "loss": 12.0995, + "step": 17756 + }, + { + "epoch": 0.966939617324539, + "grad_norm": 0.6946683299584384, + "learning_rate": 0.00011019380529031138, + "loss": 12.1113, + "step": 17757 + }, + { + "epoch": 0.966994071321122, + "grad_norm": 0.5606703202864911, + "learning_rate": 0.00011018503292904841, + "loss": 12.0408, + "step": 17758 + }, + { + "epoch": 0.967048525317705, + "grad_norm": 0.5298869291861762, + "learning_rate": 0.00011017626048858491, + "loss": 12.2027, + "step": 17759 + }, + { + "epoch": 0.967102979314288, + "grad_norm": 0.5606826260571348, + "learning_rate": 0.00011016748796898913, + "loss": 12.0425, + "step": 17760 + }, + { + "epoch": 0.967157433310871, + "grad_norm": 0.5834963687793829, + "learning_rate": 0.00011015871537032923, + "loss": 12.1075, + "step": 17761 + }, + { + "epoch": 0.9672118873074541, + "grad_norm": 0.5592877249862209, + "learning_rate": 0.00011014994269267347, + "loss": 12.0068, + "step": 17762 + }, + { + "epoch": 0.9672663413040371, + "grad_norm": 0.5764920098079752, + "learning_rate": 0.00011014116993609001, + "loss": 12.1481, + "step": 17763 + }, + { + "epoch": 0.9673207953006201, + "grad_norm": 0.5605968035673906, + "learning_rate": 0.00011013239710064716, + "loss": 12.1694, + "step": 17764 + }, + { + "epoch": 0.9673752492972031, + "grad_norm": 0.5566859412922541, + "learning_rate": 0.00011012362418641309, + "loss": 12.2004, + "step": 17765 + }, + { + "epoch": 0.9674297032937861, + "grad_norm": 0.5714501816840976, + "learning_rate": 0.00011011485119345602, + "loss": 12.1422, + "step": 17766 + }, + { + "epoch": 0.9674841572903692, + "grad_norm": 0.5339414451837563, + "learning_rate": 0.00011010607812184415, + "loss": 12.0454, + "step": 17767 + }, + { + "epoch": 0.9675386112869522, + "grad_norm": 0.5231871867158163, + "learning_rate": 0.00011009730497164572, + "loss": 12.0991, + "step": 17768 + }, + { + "epoch": 0.9675930652835352, + "grad_norm": 0.5128397437189953, + "learning_rate": 0.00011008853174292895, + "loss": 12.0688, + "step": 17769 + }, + { + "epoch": 0.9676475192801182, + "grad_norm": 0.5893411496827528, + "learning_rate": 0.0001100797584357621, + "loss": 12.1022, + "step": 17770 + }, + { + "epoch": 0.9677019732767012, + "grad_norm": 0.541464055289501, + "learning_rate": 0.00011007098505021334, + "loss": 12.0411, + "step": 17771 + }, + { + "epoch": 0.9677564272732841, + "grad_norm": 0.5888980133927575, + "learning_rate": 0.0001100622115863509, + "loss": 12.1288, + "step": 17772 + }, + { + "epoch": 0.9678108812698673, + "grad_norm": 0.6788846626525558, + "learning_rate": 0.00011005343804424302, + "loss": 12.1538, + "step": 17773 + }, + { + "epoch": 0.9678653352664502, + "grad_norm": 0.5944965815213048, + "learning_rate": 0.00011004466442395792, + "loss": 12.0351, + "step": 17774 + }, + { + "epoch": 0.9679197892630332, + "grad_norm": 0.5203669199642835, + "learning_rate": 0.00011003589072556384, + "loss": 12.1308, + "step": 17775 + }, + { + "epoch": 0.9679742432596162, + "grad_norm": 0.5225541564802234, + "learning_rate": 0.00011002711694912898, + "loss": 12.1828, + "step": 17776 + }, + { + "epoch": 0.9680286972561992, + "grad_norm": 0.5809015328095049, + "learning_rate": 0.00011001834309472157, + "loss": 12.094, + "step": 17777 + }, + { + "epoch": 0.9680831512527822, + "grad_norm": 0.5858089071552731, + "learning_rate": 0.00011000956916240985, + "loss": 12.1914, + "step": 17778 + }, + { + "epoch": 0.9681376052493653, + "grad_norm": 0.5900559454762778, + "learning_rate": 0.00011000079515226204, + "loss": 12.1834, + "step": 17779 + }, + { + "epoch": 0.9681920592459483, + "grad_norm": 0.5748059397943471, + "learning_rate": 0.00010999202106434637, + "loss": 12.1808, + "step": 17780 + }, + { + "epoch": 0.9682465132425313, + "grad_norm": 0.5127534384417431, + "learning_rate": 0.00010998324689873107, + "loss": 11.8834, + "step": 17781 + }, + { + "epoch": 0.9683009672391143, + "grad_norm": 0.5320361643919194, + "learning_rate": 0.00010997447265548437, + "loss": 12.0902, + "step": 17782 + }, + { + "epoch": 0.9683554212356973, + "grad_norm": 0.5377705507502049, + "learning_rate": 0.00010996569833467449, + "loss": 12.1417, + "step": 17783 + }, + { + "epoch": 0.9684098752322803, + "grad_norm": 0.5265742666871742, + "learning_rate": 0.00010995692393636968, + "loss": 12.1022, + "step": 17784 + }, + { + "epoch": 0.9684643292288634, + "grad_norm": 0.5190329800999794, + "learning_rate": 0.00010994814946063816, + "loss": 11.987, + "step": 17785 + }, + { + "epoch": 0.9685187832254464, + "grad_norm": 0.5031755079473749, + "learning_rate": 0.00010993937490754815, + "loss": 12.1866, + "step": 17786 + }, + { + "epoch": 0.9685732372220294, + "grad_norm": 0.5982588842106145, + "learning_rate": 0.00010993060027716791, + "loss": 12.0101, + "step": 17787 + }, + { + "epoch": 0.9686276912186124, + "grad_norm": 0.550467381176369, + "learning_rate": 0.00010992182556956562, + "loss": 12.1632, + "step": 17788 + }, + { + "epoch": 0.9686821452151954, + "grad_norm": 0.6039951623824136, + "learning_rate": 0.00010991305078480957, + "loss": 12.1521, + "step": 17789 + }, + { + "epoch": 0.9687365992117783, + "grad_norm": 0.5398755963835501, + "learning_rate": 0.000109904275922968, + "loss": 12.1698, + "step": 17790 + }, + { + "epoch": 0.9687910532083615, + "grad_norm": 0.5309708003642307, + "learning_rate": 0.0001098955009841091, + "loss": 12.0762, + "step": 17791 + }, + { + "epoch": 0.9688455072049444, + "grad_norm": 0.5860085116735837, + "learning_rate": 0.00010988672596830112, + "loss": 12.3387, + "step": 17792 + }, + { + "epoch": 0.9688999612015274, + "grad_norm": 0.5844423560245651, + "learning_rate": 0.00010987795087561232, + "loss": 12.1434, + "step": 17793 + }, + { + "epoch": 0.9689544151981104, + "grad_norm": 0.6662312473550055, + "learning_rate": 0.0001098691757061109, + "loss": 12.0429, + "step": 17794 + }, + { + "epoch": 0.9690088691946934, + "grad_norm": 0.5643006113425973, + "learning_rate": 0.00010986040045986512, + "loss": 12.1875, + "step": 17795 + }, + { + "epoch": 0.9690633231912764, + "grad_norm": 0.5834323785455291, + "learning_rate": 0.0001098516251369432, + "loss": 12.1732, + "step": 17796 + }, + { + "epoch": 0.9691177771878595, + "grad_norm": 0.5963028530585607, + "learning_rate": 0.0001098428497374134, + "loss": 12.0946, + "step": 17797 + }, + { + "epoch": 0.9691722311844425, + "grad_norm": 0.6125634195562416, + "learning_rate": 0.00010983407426134396, + "loss": 12.1698, + "step": 17798 + }, + { + "epoch": 0.9692266851810255, + "grad_norm": 0.615414593265336, + "learning_rate": 0.0001098252987088031, + "loss": 12.107, + "step": 17799 + }, + { + "epoch": 0.9692811391776085, + "grad_norm": 0.533551237794284, + "learning_rate": 0.0001098165230798591, + "loss": 12.0986, + "step": 17800 + }, + { + "epoch": 0.9693355931741915, + "grad_norm": 0.6055228077578817, + "learning_rate": 0.00010980774737458011, + "loss": 12.2149, + "step": 17801 + }, + { + "epoch": 0.9693900471707746, + "grad_norm": 0.571576525970899, + "learning_rate": 0.00010979897159303447, + "loss": 12.1945, + "step": 17802 + }, + { + "epoch": 0.9694445011673576, + "grad_norm": 0.6362828666064082, + "learning_rate": 0.00010979019573529037, + "loss": 12.0863, + "step": 17803 + }, + { + "epoch": 0.9694989551639406, + "grad_norm": 0.5302447488075314, + "learning_rate": 0.00010978141980141608, + "loss": 12.0461, + "step": 17804 + }, + { + "epoch": 0.9695534091605236, + "grad_norm": 0.4924079645027928, + "learning_rate": 0.00010977264379147985, + "loss": 12.0715, + "step": 17805 + }, + { + "epoch": 0.9696078631571066, + "grad_norm": 0.5327589830801305, + "learning_rate": 0.00010976386770554983, + "loss": 12.1169, + "step": 17806 + }, + { + "epoch": 0.9696623171536896, + "grad_norm": 0.5716778845807983, + "learning_rate": 0.00010975509154369439, + "loss": 12.0048, + "step": 17807 + }, + { + "epoch": 0.9697167711502727, + "grad_norm": 0.5491710740186433, + "learning_rate": 0.00010974631530598171, + "loss": 12.0912, + "step": 17808 + }, + { + "epoch": 0.9697712251468557, + "grad_norm": 0.5898653267786542, + "learning_rate": 0.00010973753899248005, + "loss": 12.2711, + "step": 17809 + }, + { + "epoch": 0.9698256791434386, + "grad_norm": 0.6055769095504616, + "learning_rate": 0.00010972876260325769, + "loss": 12.2608, + "step": 17810 + }, + { + "epoch": 0.9698801331400216, + "grad_norm": 0.6172966019298604, + "learning_rate": 0.0001097199861383828, + "loss": 12.1219, + "step": 17811 + }, + { + "epoch": 0.9699345871366046, + "grad_norm": 0.5145952516661499, + "learning_rate": 0.00010971120959792365, + "loss": 12.1383, + "step": 17812 + }, + { + "epoch": 0.9699890411331876, + "grad_norm": 0.5068824144878783, + "learning_rate": 0.00010970243298194853, + "loss": 12.098, + "step": 17813 + }, + { + "epoch": 0.9700434951297707, + "grad_norm": 0.5364138632936271, + "learning_rate": 0.00010969365629052566, + "loss": 12.0551, + "step": 17814 + }, + { + "epoch": 0.9700979491263537, + "grad_norm": 0.5586029030784768, + "learning_rate": 0.00010968487952372333, + "loss": 12.1015, + "step": 17815 + }, + { + "epoch": 0.9701524031229367, + "grad_norm": 0.5385237767372034, + "learning_rate": 0.0001096761026816097, + "loss": 12.119, + "step": 17816 + }, + { + "epoch": 0.9702068571195197, + "grad_norm": 0.5140552714449329, + "learning_rate": 0.00010966732576425309, + "loss": 12.1271, + "step": 17817 + }, + { + "epoch": 0.9702613111161027, + "grad_norm": 0.6726861153897186, + "learning_rate": 0.00010965854877172172, + "loss": 12.008, + "step": 17818 + }, + { + "epoch": 0.9703157651126857, + "grad_norm": 0.519146769147053, + "learning_rate": 0.00010964977170408387, + "loss": 12.1725, + "step": 17819 + }, + { + "epoch": 0.9703702191092688, + "grad_norm": 0.53818594048876, + "learning_rate": 0.00010964099456140781, + "loss": 12.1311, + "step": 17820 + }, + { + "epoch": 0.9704246731058518, + "grad_norm": 0.5203950796474744, + "learning_rate": 0.00010963221734376172, + "loss": 12.0239, + "step": 17821 + }, + { + "epoch": 0.9704791271024348, + "grad_norm": 0.5622449476707068, + "learning_rate": 0.0001096234400512139, + "loss": 12.2025, + "step": 17822 + }, + { + "epoch": 0.9705335810990178, + "grad_norm": 0.5555495555370853, + "learning_rate": 0.00010961466268383258, + "loss": 12.0297, + "step": 17823 + }, + { + "epoch": 0.9705880350956008, + "grad_norm": 0.5917465536259531, + "learning_rate": 0.00010960588524168604, + "loss": 12.1552, + "step": 17824 + }, + { + "epoch": 0.9706424890921838, + "grad_norm": 0.6715298222792764, + "learning_rate": 0.00010959710772484256, + "loss": 12.1018, + "step": 17825 + }, + { + "epoch": 0.9706969430887669, + "grad_norm": 0.5380566954643132, + "learning_rate": 0.00010958833013337033, + "loss": 11.9396, + "step": 17826 + }, + { + "epoch": 0.9707513970853499, + "grad_norm": 0.5373949699354345, + "learning_rate": 0.00010957955246733764, + "loss": 12.1633, + "step": 17827 + }, + { + "epoch": 0.9708058510819328, + "grad_norm": 0.5680912823299027, + "learning_rate": 0.00010957077472681274, + "loss": 12.1199, + "step": 17828 + }, + { + "epoch": 0.9708603050785158, + "grad_norm": 0.5263261907686924, + "learning_rate": 0.00010956199691186388, + "loss": 12.1293, + "step": 17829 + }, + { + "epoch": 0.9709147590750988, + "grad_norm": 0.581323079354999, + "learning_rate": 0.00010955321902255935, + "loss": 12.2449, + "step": 17830 + }, + { + "epoch": 0.9709692130716818, + "grad_norm": 0.5268762044732026, + "learning_rate": 0.00010954444105896739, + "loss": 12.1004, + "step": 17831 + }, + { + "epoch": 0.9710236670682649, + "grad_norm": 0.5384281904412787, + "learning_rate": 0.00010953566302115625, + "loss": 12.1106, + "step": 17832 + }, + { + "epoch": 0.9710781210648479, + "grad_norm": 0.5466667623989441, + "learning_rate": 0.00010952688490919419, + "loss": 12.1187, + "step": 17833 + }, + { + "epoch": 0.9711325750614309, + "grad_norm": 0.7741320210031013, + "learning_rate": 0.00010951810672314946, + "loss": 12.0845, + "step": 17834 + }, + { + "epoch": 0.9711870290580139, + "grad_norm": 0.5379145355908516, + "learning_rate": 0.00010950932846309034, + "loss": 12.1771, + "step": 17835 + }, + { + "epoch": 0.9712414830545969, + "grad_norm": 0.5433365394397129, + "learning_rate": 0.00010950055012908513, + "loss": 12.1766, + "step": 17836 + }, + { + "epoch": 0.97129593705118, + "grad_norm": 0.6430230755614863, + "learning_rate": 0.00010949177172120202, + "loss": 12.0224, + "step": 17837 + }, + { + "epoch": 0.971350391047763, + "grad_norm": 0.5650864017654597, + "learning_rate": 0.00010948299323950928, + "loss": 11.9911, + "step": 17838 + }, + { + "epoch": 0.971404845044346, + "grad_norm": 0.5924520159138199, + "learning_rate": 0.00010947421468407522, + "loss": 12.0785, + "step": 17839 + }, + { + "epoch": 0.971459299040929, + "grad_norm": 0.5629261714071007, + "learning_rate": 0.00010946543605496806, + "loss": 12.0858, + "step": 17840 + }, + { + "epoch": 0.971513753037512, + "grad_norm": 0.6190367299550407, + "learning_rate": 0.0001094566573522561, + "loss": 12.1683, + "step": 17841 + }, + { + "epoch": 0.971568207034095, + "grad_norm": 0.5187158542715369, + "learning_rate": 0.00010944787857600758, + "loss": 12.137, + "step": 17842 + }, + { + "epoch": 0.9716226610306781, + "grad_norm": 0.5514563108516193, + "learning_rate": 0.00010943909972629078, + "loss": 11.932, + "step": 17843 + }, + { + "epoch": 0.9716771150272611, + "grad_norm": 0.5467067275674893, + "learning_rate": 0.00010943032080317394, + "loss": 12.0348, + "step": 17844 + }, + { + "epoch": 0.971731569023844, + "grad_norm": 0.9845368637661265, + "learning_rate": 0.00010942154180672535, + "loss": 12.1329, + "step": 17845 + }, + { + "epoch": 0.971786023020427, + "grad_norm": 0.5554885818409444, + "learning_rate": 0.00010941276273701328, + "loss": 12.0405, + "step": 17846 + }, + { + "epoch": 0.97184047701701, + "grad_norm": 0.57104996085094, + "learning_rate": 0.00010940398359410598, + "loss": 12.111, + "step": 17847 + }, + { + "epoch": 0.971894931013593, + "grad_norm": 0.5571519314162232, + "learning_rate": 0.00010939520437807174, + "loss": 12.133, + "step": 17848 + }, + { + "epoch": 0.9719493850101761, + "grad_norm": 0.5739596263599989, + "learning_rate": 0.0001093864250889788, + "loss": 12.0966, + "step": 17849 + }, + { + "epoch": 0.9720038390067591, + "grad_norm": 0.5896902198048481, + "learning_rate": 0.00010937764572689544, + "loss": 12.0106, + "step": 17850 + }, + { + "epoch": 0.9720582930033421, + "grad_norm": 0.5185716274990738, + "learning_rate": 0.00010936886629188993, + "loss": 12.0609, + "step": 17851 + }, + { + "epoch": 0.9721127469999251, + "grad_norm": 0.5247556641643809, + "learning_rate": 0.00010936008678403057, + "loss": 12.214, + "step": 17852 + }, + { + "epoch": 0.9721672009965081, + "grad_norm": 0.5813388227502093, + "learning_rate": 0.0001093513072033856, + "loss": 12.1157, + "step": 17853 + }, + { + "epoch": 0.9722216549930911, + "grad_norm": 0.6682536334487343, + "learning_rate": 0.00010934252755002328, + "loss": 12.2233, + "step": 17854 + }, + { + "epoch": 0.9722761089896742, + "grad_norm": 0.8240527314908686, + "learning_rate": 0.00010933374782401191, + "loss": 12.259, + "step": 17855 + }, + { + "epoch": 0.9723305629862572, + "grad_norm": 0.5808292742415025, + "learning_rate": 0.00010932496802541976, + "loss": 12.0409, + "step": 17856 + }, + { + "epoch": 0.9723850169828402, + "grad_norm": 0.6116237421436368, + "learning_rate": 0.00010931618815431508, + "loss": 12.0902, + "step": 17857 + }, + { + "epoch": 0.9724394709794232, + "grad_norm": 0.5712556415067888, + "learning_rate": 0.00010930740821076618, + "loss": 12.1183, + "step": 17858 + }, + { + "epoch": 0.9724939249760062, + "grad_norm": 0.5495703744822813, + "learning_rate": 0.00010929862819484129, + "loss": 12.2186, + "step": 17859 + }, + { + "epoch": 0.9725483789725892, + "grad_norm": 0.53985474865941, + "learning_rate": 0.0001092898481066087, + "loss": 12.0988, + "step": 17860 + }, + { + "epoch": 0.9726028329691723, + "grad_norm": 0.567346809786572, + "learning_rate": 0.0001092810679461367, + "loss": 11.9864, + "step": 17861 + }, + { + "epoch": 0.9726572869657553, + "grad_norm": 0.5352723508895281, + "learning_rate": 0.00010927228771349358, + "loss": 12.0967, + "step": 17862 + }, + { + "epoch": 0.9727117409623383, + "grad_norm": 0.5864110588587573, + "learning_rate": 0.00010926350740874757, + "loss": 12.35, + "step": 17863 + }, + { + "epoch": 0.9727661949589212, + "grad_norm": 0.5903631071486342, + "learning_rate": 0.000109254727031967, + "loss": 12.145, + "step": 17864 + }, + { + "epoch": 0.9728206489555042, + "grad_norm": 0.6475830505747637, + "learning_rate": 0.0001092459465832201, + "loss": 12.252, + "step": 17865 + }, + { + "epoch": 0.9728751029520873, + "grad_norm": 0.5593561081555694, + "learning_rate": 0.00010923716606257517, + "loss": 12.1333, + "step": 17866 + }, + { + "epoch": 0.9729295569486703, + "grad_norm": 0.6374503415738403, + "learning_rate": 0.0001092283854701005, + "loss": 12.0877, + "step": 17867 + }, + { + "epoch": 0.9729840109452533, + "grad_norm": 0.5556889201637897, + "learning_rate": 0.00010921960480586435, + "loss": 12.1325, + "step": 17868 + }, + { + "epoch": 0.9730384649418363, + "grad_norm": 0.6601778675459046, + "learning_rate": 0.00010921082406993502, + "loss": 12.1835, + "step": 17869 + }, + { + "epoch": 0.9730929189384193, + "grad_norm": 0.6017919349112363, + "learning_rate": 0.00010920204326238075, + "loss": 12.1647, + "step": 17870 + }, + { + "epoch": 0.9731473729350023, + "grad_norm": 0.5306760234908892, + "learning_rate": 0.00010919326238326988, + "loss": 12.0787, + "step": 17871 + }, + { + "epoch": 0.9732018269315854, + "grad_norm": 0.5448361215848412, + "learning_rate": 0.00010918448143267062, + "loss": 12.1605, + "step": 17872 + }, + { + "epoch": 0.9732562809281684, + "grad_norm": 0.5346807218112846, + "learning_rate": 0.00010917570041065132, + "loss": 12.1521, + "step": 17873 + }, + { + "epoch": 0.9733107349247514, + "grad_norm": 0.5584559000184194, + "learning_rate": 0.00010916691931728022, + "loss": 12.1005, + "step": 17874 + }, + { + "epoch": 0.9733651889213344, + "grad_norm": 0.5442613405654735, + "learning_rate": 0.00010915813815262564, + "loss": 12.1057, + "step": 17875 + }, + { + "epoch": 0.9734196429179174, + "grad_norm": 0.5360045078582204, + "learning_rate": 0.00010914935691675586, + "loss": 12.136, + "step": 17876 + }, + { + "epoch": 0.9734740969145004, + "grad_norm": 0.5755332612033531, + "learning_rate": 0.00010914057560973909, + "loss": 12.297, + "step": 17877 + }, + { + "epoch": 0.9735285509110835, + "grad_norm": 0.5595959670494073, + "learning_rate": 0.00010913179423164368, + "loss": 12.2594, + "step": 17878 + }, + { + "epoch": 0.9735830049076665, + "grad_norm": 0.47179394787793116, + "learning_rate": 0.00010912301278253793, + "loss": 12.0613, + "step": 17879 + }, + { + "epoch": 0.9736374589042495, + "grad_norm": 0.6288161960743753, + "learning_rate": 0.0001091142312624901, + "loss": 12.1996, + "step": 17880 + }, + { + "epoch": 0.9736919129008325, + "grad_norm": 0.5473793202384529, + "learning_rate": 0.00010910544967156849, + "loss": 12.1201, + "step": 17881 + }, + { + "epoch": 0.9737463668974154, + "grad_norm": 0.5471473747021309, + "learning_rate": 0.00010909666800984136, + "loss": 12.1405, + "step": 17882 + }, + { + "epoch": 0.9738008208939984, + "grad_norm": 0.5209045787765305, + "learning_rate": 0.000109087886277377, + "loss": 12.0672, + "step": 17883 + }, + { + "epoch": 0.9738552748905815, + "grad_norm": 0.5390163506174233, + "learning_rate": 0.00010907910447424373, + "loss": 12.0384, + "step": 17884 + }, + { + "epoch": 0.9739097288871645, + "grad_norm": 0.5157419432595162, + "learning_rate": 0.00010907032260050982, + "loss": 12.0893, + "step": 17885 + }, + { + "epoch": 0.9739641828837475, + "grad_norm": 0.56821905247156, + "learning_rate": 0.00010906154065624356, + "loss": 12.1278, + "step": 17886 + }, + { + "epoch": 0.9740186368803305, + "grad_norm": 0.5938279214721289, + "learning_rate": 0.00010905275864151326, + "loss": 12.1691, + "step": 17887 + }, + { + "epoch": 0.9740730908769135, + "grad_norm": 0.599030570834302, + "learning_rate": 0.00010904397655638717, + "loss": 12.1988, + "step": 17888 + }, + { + "epoch": 0.9741275448734965, + "grad_norm": 0.49742525863148496, + "learning_rate": 0.00010903519440093357, + "loss": 12.0655, + "step": 17889 + }, + { + "epoch": 0.9741819988700796, + "grad_norm": 0.5680825248237323, + "learning_rate": 0.00010902641217522083, + "loss": 12.1229, + "step": 17890 + }, + { + "epoch": 0.9742364528666626, + "grad_norm": 0.5945795928096985, + "learning_rate": 0.00010901762987931718, + "loss": 12.1609, + "step": 17891 + }, + { + "epoch": 0.9742909068632456, + "grad_norm": 0.5741389455881379, + "learning_rate": 0.00010900884751329095, + "loss": 12.1743, + "step": 17892 + }, + { + "epoch": 0.9743453608598286, + "grad_norm": 0.6398993803084989, + "learning_rate": 0.00010900006507721036, + "loss": 12.2676, + "step": 17893 + }, + { + "epoch": 0.9743998148564116, + "grad_norm": 0.6475032989074506, + "learning_rate": 0.00010899128257114377, + "loss": 12.2419, + "step": 17894 + }, + { + "epoch": 0.9744542688529946, + "grad_norm": 0.5941851830064357, + "learning_rate": 0.00010898249999515949, + "loss": 12.2465, + "step": 17895 + }, + { + "epoch": 0.9745087228495777, + "grad_norm": 0.5181394519052931, + "learning_rate": 0.00010897371734932578, + "loss": 12.0279, + "step": 17896 + }, + { + "epoch": 0.9745631768461607, + "grad_norm": 0.5359386713850717, + "learning_rate": 0.00010896493463371092, + "loss": 12.0701, + "step": 17897 + }, + { + "epoch": 0.9746176308427437, + "grad_norm": 0.5777851464649929, + "learning_rate": 0.00010895615184838324, + "loss": 12.2015, + "step": 17898 + }, + { + "epoch": 0.9746720848393267, + "grad_norm": 0.5159998961382526, + "learning_rate": 0.00010894736899341101, + "loss": 11.9164, + "step": 17899 + }, + { + "epoch": 0.9747265388359097, + "grad_norm": 0.5260505807175065, + "learning_rate": 0.00010893858606886251, + "loss": 12.1511, + "step": 17900 + }, + { + "epoch": 0.9747809928324928, + "grad_norm": 0.5636914625494629, + "learning_rate": 0.00010892980307480612, + "loss": 12.0698, + "step": 17901 + }, + { + "epoch": 0.9748354468290757, + "grad_norm": 0.6476801604471305, + "learning_rate": 0.00010892102001131006, + "loss": 12.2996, + "step": 17902 + }, + { + "epoch": 0.9748899008256587, + "grad_norm": 0.5250071769126085, + "learning_rate": 0.00010891223687844266, + "loss": 12.0254, + "step": 17903 + }, + { + "epoch": 0.9749443548222417, + "grad_norm": 0.631637222132833, + "learning_rate": 0.00010890345367627219, + "loss": 12.2131, + "step": 17904 + }, + { + "epoch": 0.9749988088188247, + "grad_norm": 0.6668428762840949, + "learning_rate": 0.00010889467040486699, + "loss": 12.0375, + "step": 17905 + }, + { + "epoch": 0.9750532628154077, + "grad_norm": 0.550951569104823, + "learning_rate": 0.00010888588706429532, + "loss": 12.1446, + "step": 17906 + }, + { + "epoch": 0.9751077168119908, + "grad_norm": 0.5678601252962668, + "learning_rate": 0.00010887710365462554, + "loss": 12.1614, + "step": 17907 + }, + { + "epoch": 0.9751621708085738, + "grad_norm": 0.5540205006570619, + "learning_rate": 0.00010886832017592588, + "loss": 12.1959, + "step": 17908 + }, + { + "epoch": 0.9752166248051568, + "grad_norm": 0.5464565421988227, + "learning_rate": 0.00010885953662826467, + "loss": 12.0261, + "step": 17909 + }, + { + "epoch": 0.9752710788017398, + "grad_norm": 0.575540090418497, + "learning_rate": 0.00010885075301171024, + "loss": 12.0915, + "step": 17910 + }, + { + "epoch": 0.9753255327983228, + "grad_norm": 0.6135515535481806, + "learning_rate": 0.00010884196932633086, + "loss": 12.1961, + "step": 17911 + }, + { + "epoch": 0.9753799867949058, + "grad_norm": 0.6282773902043612, + "learning_rate": 0.00010883318557219486, + "loss": 12.0977, + "step": 17912 + }, + { + "epoch": 0.9754344407914889, + "grad_norm": 0.5265484255082639, + "learning_rate": 0.00010882440174937052, + "loss": 12.0893, + "step": 17913 + }, + { + "epoch": 0.9754888947880719, + "grad_norm": 0.5622450247447242, + "learning_rate": 0.00010881561785792614, + "loss": 12.1413, + "step": 17914 + }, + { + "epoch": 0.9755433487846549, + "grad_norm": 0.6854546710007255, + "learning_rate": 0.00010880683389793005, + "loss": 12.108, + "step": 17915 + }, + { + "epoch": 0.9755978027812379, + "grad_norm": 0.6047047159213987, + "learning_rate": 0.00010879804986945053, + "loss": 12.0257, + "step": 17916 + }, + { + "epoch": 0.9756522567778209, + "grad_norm": 0.6069312642192195, + "learning_rate": 0.0001087892657725559, + "loss": 12.1501, + "step": 17917 + }, + { + "epoch": 0.9757067107744039, + "grad_norm": 0.6288459884045596, + "learning_rate": 0.00010878048160731447, + "loss": 12.0399, + "step": 17918 + }, + { + "epoch": 0.975761164770987, + "grad_norm": 0.548933130048583, + "learning_rate": 0.00010877169737379454, + "loss": 12.2037, + "step": 17919 + }, + { + "epoch": 0.97581561876757, + "grad_norm": 0.5325861854351972, + "learning_rate": 0.00010876291307206444, + "loss": 12.1012, + "step": 17920 + }, + { + "epoch": 0.9758700727641529, + "grad_norm": 0.5778575483593685, + "learning_rate": 0.00010875412870219244, + "loss": 12.1451, + "step": 17921 + }, + { + "epoch": 0.9759245267607359, + "grad_norm": 0.7127994538232065, + "learning_rate": 0.00010874534426424685, + "loss": 12.2466, + "step": 17922 + }, + { + "epoch": 0.9759789807573189, + "grad_norm": 0.6218779145716515, + "learning_rate": 0.00010873655975829601, + "loss": 12.0478, + "step": 17923 + }, + { + "epoch": 0.9760334347539019, + "grad_norm": 0.6401297530367082, + "learning_rate": 0.00010872777518440825, + "loss": 12.1656, + "step": 17924 + }, + { + "epoch": 0.976087888750485, + "grad_norm": 0.5689038896878923, + "learning_rate": 0.00010871899054265179, + "loss": 12.1548, + "step": 17925 + }, + { + "epoch": 0.976142342747068, + "grad_norm": 0.6836915922090367, + "learning_rate": 0.00010871020583309506, + "loss": 12.1765, + "step": 17926 + }, + { + "epoch": 0.976196796743651, + "grad_norm": 0.5045814336424027, + "learning_rate": 0.00010870142105580626, + "loss": 12.05, + "step": 17927 + }, + { + "epoch": 0.976251250740234, + "grad_norm": 0.5209395539749949, + "learning_rate": 0.00010869263621085374, + "loss": 12.1797, + "step": 17928 + }, + { + "epoch": 0.976305704736817, + "grad_norm": 0.6326383548968321, + "learning_rate": 0.00010868385129830587, + "loss": 12.1397, + "step": 17929 + }, + { + "epoch": 0.9763601587334, + "grad_norm": 0.5497160922094432, + "learning_rate": 0.0001086750663182309, + "loss": 12.0658, + "step": 17930 + }, + { + "epoch": 0.9764146127299831, + "grad_norm": 0.6505528718449232, + "learning_rate": 0.00010866628127069716, + "loss": 12.0582, + "step": 17931 + }, + { + "epoch": 0.9764690667265661, + "grad_norm": 0.5561586077183442, + "learning_rate": 0.00010865749615577295, + "loss": 12.0352, + "step": 17932 + }, + { + "epoch": 0.9765235207231491, + "grad_norm": 0.5374969245163094, + "learning_rate": 0.0001086487109735266, + "loss": 12.1648, + "step": 17933 + }, + { + "epoch": 0.9765779747197321, + "grad_norm": 0.5859100683304503, + "learning_rate": 0.00010863992572402642, + "loss": 12.1309, + "step": 17934 + }, + { + "epoch": 0.9766324287163151, + "grad_norm": 0.5627313103501189, + "learning_rate": 0.00010863114040734075, + "loss": 12.1343, + "step": 17935 + }, + { + "epoch": 0.9766868827128982, + "grad_norm": 0.527947713424401, + "learning_rate": 0.00010862235502353788, + "loss": 11.9982, + "step": 17936 + }, + { + "epoch": 0.9767413367094812, + "grad_norm": 0.5412487045573909, + "learning_rate": 0.00010861356957268613, + "loss": 12.1632, + "step": 17937 + }, + { + "epoch": 0.9767957907060641, + "grad_norm": 0.5426660693000421, + "learning_rate": 0.00010860478405485379, + "loss": 12.1363, + "step": 17938 + }, + { + "epoch": 0.9768502447026471, + "grad_norm": 0.5833797058817659, + "learning_rate": 0.00010859599847010922, + "loss": 12.0881, + "step": 17939 + }, + { + "epoch": 0.9769046986992301, + "grad_norm": 0.6033501398873252, + "learning_rate": 0.00010858721281852074, + "loss": 12.2206, + "step": 17940 + }, + { + "epoch": 0.9769591526958131, + "grad_norm": 0.5812379565297366, + "learning_rate": 0.00010857842710015666, + "loss": 12.1085, + "step": 17941 + }, + { + "epoch": 0.9770136066923962, + "grad_norm": 0.575416761901028, + "learning_rate": 0.00010856964131508526, + "loss": 12.0669, + "step": 17942 + }, + { + "epoch": 0.9770680606889792, + "grad_norm": 0.5377194087122438, + "learning_rate": 0.0001085608554633749, + "loss": 12.1132, + "step": 17943 + }, + { + "epoch": 0.9771225146855622, + "grad_norm": 0.6368059974770356, + "learning_rate": 0.00010855206954509391, + "loss": 12.1264, + "step": 17944 + }, + { + "epoch": 0.9771769686821452, + "grad_norm": 0.5726633456620404, + "learning_rate": 0.00010854328356031059, + "loss": 12.1398, + "step": 17945 + }, + { + "epoch": 0.9772314226787282, + "grad_norm": 0.5546276963249545, + "learning_rate": 0.00010853449750909328, + "loss": 11.9444, + "step": 17946 + }, + { + "epoch": 0.9772858766753112, + "grad_norm": 0.5278076544779617, + "learning_rate": 0.00010852571139151027, + "loss": 12.1496, + "step": 17947 + }, + { + "epoch": 0.9773403306718943, + "grad_norm": 0.5866875037092192, + "learning_rate": 0.00010851692520762989, + "loss": 11.8878, + "step": 17948 + }, + { + "epoch": 0.9773947846684773, + "grad_norm": 0.6536364797697218, + "learning_rate": 0.00010850813895752043, + "loss": 12.1626, + "step": 17949 + }, + { + "epoch": 0.9774492386650603, + "grad_norm": 0.500705799452207, + "learning_rate": 0.00010849935264125031, + "loss": 12.1102, + "step": 17950 + }, + { + "epoch": 0.9775036926616433, + "grad_norm": 0.6214715212457284, + "learning_rate": 0.0001084905662588878, + "loss": 12.2064, + "step": 17951 + }, + { + "epoch": 0.9775581466582263, + "grad_norm": 0.5569672393862045, + "learning_rate": 0.00010848177981050123, + "loss": 12.099, + "step": 17952 + }, + { + "epoch": 0.9776126006548093, + "grad_norm": 0.5271052084321294, + "learning_rate": 0.0001084729932961589, + "loss": 12.134, + "step": 17953 + }, + { + "epoch": 0.9776670546513924, + "grad_norm": 0.5813167820322008, + "learning_rate": 0.00010846420671592913, + "loss": 11.932, + "step": 17954 + }, + { + "epoch": 0.9777215086479754, + "grad_norm": 0.5652022496642765, + "learning_rate": 0.00010845542006988028, + "loss": 12.084, + "step": 17955 + }, + { + "epoch": 0.9777759626445583, + "grad_norm": 0.6489469752678734, + "learning_rate": 0.0001084466333580807, + "loss": 12.1296, + "step": 17956 + }, + { + "epoch": 0.9778304166411413, + "grad_norm": 0.5975893786025105, + "learning_rate": 0.00010843784658059865, + "loss": 12.0141, + "step": 17957 + }, + { + "epoch": 0.9778848706377243, + "grad_norm": 0.5519125805054584, + "learning_rate": 0.0001084290597375025, + "loss": 12.2335, + "step": 17958 + }, + { + "epoch": 0.9779393246343073, + "grad_norm": 0.5395481619309116, + "learning_rate": 0.00010842027282886059, + "loss": 11.9542, + "step": 17959 + }, + { + "epoch": 0.9779937786308904, + "grad_norm": 0.6353045346707804, + "learning_rate": 0.00010841148585474117, + "loss": 12.1194, + "step": 17960 + }, + { + "epoch": 0.9780482326274734, + "grad_norm": 0.5823209191961672, + "learning_rate": 0.00010840269881521268, + "loss": 12.1265, + "step": 17961 + }, + { + "epoch": 0.9781026866240564, + "grad_norm": 0.5776965927558751, + "learning_rate": 0.00010839391171034336, + "loss": 12.0926, + "step": 17962 + }, + { + "epoch": 0.9781571406206394, + "grad_norm": 0.6333194385576171, + "learning_rate": 0.0001083851245402016, + "loss": 12.1832, + "step": 17963 + }, + { + "epoch": 0.9782115946172224, + "grad_norm": 0.6352767740368886, + "learning_rate": 0.0001083763373048557, + "loss": 12.0946, + "step": 17964 + }, + { + "epoch": 0.9782660486138054, + "grad_norm": 0.5645207598118537, + "learning_rate": 0.00010836755000437397, + "loss": 12.0763, + "step": 17965 + }, + { + "epoch": 0.9783205026103885, + "grad_norm": 0.5937879276484013, + "learning_rate": 0.00010835876263882481, + "loss": 12.0289, + "step": 17966 + }, + { + "epoch": 0.9783749566069715, + "grad_norm": 0.6074090833865735, + "learning_rate": 0.00010834997520827648, + "loss": 12.1025, + "step": 17967 + }, + { + "epoch": 0.9784294106035545, + "grad_norm": 0.5633156870660759, + "learning_rate": 0.00010834118771279736, + "loss": 12.1154, + "step": 17968 + }, + { + "epoch": 0.9784838646001375, + "grad_norm": 0.5320985531607659, + "learning_rate": 0.00010833240015245573, + "loss": 12.1585, + "step": 17969 + }, + { + "epoch": 0.9785383185967205, + "grad_norm": 0.6191167257525989, + "learning_rate": 0.00010832361252731999, + "loss": 11.9955, + "step": 17970 + }, + { + "epoch": 0.9785927725933036, + "grad_norm": 0.5422379859682265, + "learning_rate": 0.00010831482483745845, + "loss": 12.1218, + "step": 17971 + }, + { + "epoch": 0.9786472265898866, + "grad_norm": 0.5304028876755441, + "learning_rate": 0.00010830603708293942, + "loss": 12.2471, + "step": 17972 + }, + { + "epoch": 0.9787016805864696, + "grad_norm": 0.6449485704565627, + "learning_rate": 0.00010829724926383127, + "loss": 12.225, + "step": 17973 + }, + { + "epoch": 0.9787561345830526, + "grad_norm": 0.5306371704220962, + "learning_rate": 0.0001082884613802023, + "loss": 12.1296, + "step": 17974 + }, + { + "epoch": 0.9788105885796355, + "grad_norm": 0.5167911108564022, + "learning_rate": 0.00010827967343212087, + "loss": 12.0481, + "step": 17975 + }, + { + "epoch": 0.9788650425762185, + "grad_norm": 0.5274856214728664, + "learning_rate": 0.00010827088541965531, + "loss": 12.1757, + "step": 17976 + }, + { + "epoch": 0.9789194965728016, + "grad_norm": 0.5521135327126813, + "learning_rate": 0.00010826209734287396, + "loss": 12.0129, + "step": 17977 + }, + { + "epoch": 0.9789739505693846, + "grad_norm": 0.5761098151717915, + "learning_rate": 0.00010825330920184515, + "loss": 12.2367, + "step": 17978 + }, + { + "epoch": 0.9790284045659676, + "grad_norm": 0.5076363892495853, + "learning_rate": 0.00010824452099663725, + "loss": 12.1218, + "step": 17979 + }, + { + "epoch": 0.9790828585625506, + "grad_norm": 0.5273708890063069, + "learning_rate": 0.00010823573272731855, + "loss": 12.0445, + "step": 17980 + }, + { + "epoch": 0.9791373125591336, + "grad_norm": 0.546803219978149, + "learning_rate": 0.0001082269443939574, + "loss": 12.1183, + "step": 17981 + }, + { + "epoch": 0.9791917665557166, + "grad_norm": 0.5666503969234457, + "learning_rate": 0.00010821815599662217, + "loss": 12.1089, + "step": 17982 + }, + { + "epoch": 0.9792462205522997, + "grad_norm": 0.5976456319310408, + "learning_rate": 0.00010820936753538118, + "loss": 12.2222, + "step": 17983 + }, + { + "epoch": 0.9793006745488827, + "grad_norm": 0.5807534805493791, + "learning_rate": 0.00010820057901030276, + "loss": 12.1417, + "step": 17984 + }, + { + "epoch": 0.9793551285454657, + "grad_norm": 0.5618407017610094, + "learning_rate": 0.00010819179042145527, + "loss": 12.1541, + "step": 17985 + }, + { + "epoch": 0.9794095825420487, + "grad_norm": 0.5399095579670065, + "learning_rate": 0.00010818300176890703, + "loss": 12.165, + "step": 17986 + }, + { + "epoch": 0.9794640365386317, + "grad_norm": 0.5595505093425492, + "learning_rate": 0.00010817421305272642, + "loss": 11.9565, + "step": 17987 + }, + { + "epoch": 0.9795184905352147, + "grad_norm": 0.6173433296027873, + "learning_rate": 0.00010816542427298173, + "loss": 12.1378, + "step": 17988 + }, + { + "epoch": 0.9795729445317978, + "grad_norm": 0.5635028286575519, + "learning_rate": 0.00010815663542974135, + "loss": 12.0596, + "step": 17989 + }, + { + "epoch": 0.9796273985283808, + "grad_norm": 0.5128177697315897, + "learning_rate": 0.00010814784652307361, + "loss": 12.1665, + "step": 17990 + }, + { + "epoch": 0.9796818525249638, + "grad_norm": 0.5451806256854534, + "learning_rate": 0.00010813905755304686, + "loss": 12.0466, + "step": 17991 + }, + { + "epoch": 0.9797363065215468, + "grad_norm": 0.546160671865186, + "learning_rate": 0.00010813026851972941, + "loss": 12.2315, + "step": 17992 + }, + { + "epoch": 0.9797907605181297, + "grad_norm": 0.6423773110763001, + "learning_rate": 0.00010812147942318961, + "loss": 12.0426, + "step": 17993 + }, + { + "epoch": 0.9798452145147127, + "grad_norm": 0.49260296577336354, + "learning_rate": 0.00010811269026349587, + "loss": 12.0204, + "step": 17994 + }, + { + "epoch": 0.9798996685112958, + "grad_norm": 0.5733819918469207, + "learning_rate": 0.00010810390104071646, + "loss": 12.1513, + "step": 17995 + }, + { + "epoch": 0.9799541225078788, + "grad_norm": 0.5574227799978274, + "learning_rate": 0.00010809511175491976, + "loss": 12.0072, + "step": 17996 + }, + { + "epoch": 0.9800085765044618, + "grad_norm": 0.5219117221008631, + "learning_rate": 0.00010808632240617414, + "loss": 12.1725, + "step": 17997 + }, + { + "epoch": 0.9800630305010448, + "grad_norm": 0.5741168714520937, + "learning_rate": 0.00010807753299454789, + "loss": 11.9589, + "step": 17998 + }, + { + "epoch": 0.9801174844976278, + "grad_norm": 0.5677294057678615, + "learning_rate": 0.00010806874352010938, + "loss": 12.0291, + "step": 17999 + }, + { + "epoch": 0.9801719384942109, + "grad_norm": 0.5318485484630929, + "learning_rate": 0.00010805995398292699, + "loss": 12.1372, + "step": 18000 + }, + { + "epoch": 0.9802263924907939, + "grad_norm": 0.5543616701944382, + "learning_rate": 0.00010805116438306903, + "loss": 12.1638, + "step": 18001 + }, + { + "epoch": 0.9802808464873769, + "grad_norm": 0.5477869601113502, + "learning_rate": 0.0001080423747206039, + "loss": 12.0025, + "step": 18002 + }, + { + "epoch": 0.9803353004839599, + "grad_norm": 0.5230337758187195, + "learning_rate": 0.0001080335849955999, + "loss": 12.0703, + "step": 18003 + }, + { + "epoch": 0.9803897544805429, + "grad_norm": 0.5306094902096865, + "learning_rate": 0.00010802479520812536, + "loss": 11.9167, + "step": 18004 + }, + { + "epoch": 0.9804442084771259, + "grad_norm": 0.6366613017358728, + "learning_rate": 0.00010801600535824869, + "loss": 12.1444, + "step": 18005 + }, + { + "epoch": 0.980498662473709, + "grad_norm": 0.5727482572658238, + "learning_rate": 0.00010800721544603822, + "loss": 12.0977, + "step": 18006 + }, + { + "epoch": 0.980553116470292, + "grad_norm": 0.5089284317859405, + "learning_rate": 0.00010799842547156234, + "loss": 12.0473, + "step": 18007 + }, + { + "epoch": 0.980607570466875, + "grad_norm": 0.5667934645021488, + "learning_rate": 0.00010798963543488932, + "loss": 12.137, + "step": 18008 + }, + { + "epoch": 0.980662024463458, + "grad_norm": 0.6196388965212127, + "learning_rate": 0.00010798084533608754, + "loss": 12.2894, + "step": 18009 + }, + { + "epoch": 0.980716478460041, + "grad_norm": 0.562464564711028, + "learning_rate": 0.0001079720551752254, + "loss": 12.1624, + "step": 18010 + }, + { + "epoch": 0.980770932456624, + "grad_norm": 0.5348173101555526, + "learning_rate": 0.0001079632649523712, + "loss": 12.2295, + "step": 18011 + }, + { + "epoch": 0.980825386453207, + "grad_norm": 0.5570937732739201, + "learning_rate": 0.00010795447466759335, + "loss": 12.1079, + "step": 18012 + }, + { + "epoch": 0.98087984044979, + "grad_norm": 0.5647120952256477, + "learning_rate": 0.00010794568432096015, + "loss": 12.1854, + "step": 18013 + }, + { + "epoch": 0.980934294446373, + "grad_norm": 0.5815748623492203, + "learning_rate": 0.00010793689391253996, + "loss": 12.1952, + "step": 18014 + }, + { + "epoch": 0.980988748442956, + "grad_norm": 0.5537602408110703, + "learning_rate": 0.00010792810344240114, + "loss": 12.1408, + "step": 18015 + }, + { + "epoch": 0.981043202439539, + "grad_norm": 0.540086712463153, + "learning_rate": 0.0001079193129106121, + "loss": 12.0253, + "step": 18016 + }, + { + "epoch": 0.981097656436122, + "grad_norm": 0.7435614062524526, + "learning_rate": 0.00010791052231724115, + "loss": 12.1708, + "step": 18017 + }, + { + "epoch": 0.9811521104327051, + "grad_norm": 0.5266529019604518, + "learning_rate": 0.00010790173166235665, + "loss": 12.1274, + "step": 18018 + }, + { + "epoch": 0.9812065644292881, + "grad_norm": 0.5536468972179033, + "learning_rate": 0.00010789294094602694, + "loss": 12.1007, + "step": 18019 + }, + { + "epoch": 0.9812610184258711, + "grad_norm": 0.644183661388461, + "learning_rate": 0.00010788415016832039, + "loss": 12.0889, + "step": 18020 + }, + { + "epoch": 0.9813154724224541, + "grad_norm": 0.5300747203805085, + "learning_rate": 0.00010787535932930537, + "loss": 12.1142, + "step": 18021 + }, + { + "epoch": 0.9813699264190371, + "grad_norm": 0.5795795154313257, + "learning_rate": 0.00010786656842905028, + "loss": 12.0046, + "step": 18022 + }, + { + "epoch": 0.9814243804156201, + "grad_norm": 0.5725988801129808, + "learning_rate": 0.0001078577774676234, + "loss": 12.2399, + "step": 18023 + }, + { + "epoch": 0.9814788344122032, + "grad_norm": 0.5818950412111472, + "learning_rate": 0.00010784898644509313, + "loss": 12.0231, + "step": 18024 + }, + { + "epoch": 0.9815332884087862, + "grad_norm": 0.5411298785123068, + "learning_rate": 0.00010784019536152783, + "loss": 11.8646, + "step": 18025 + }, + { + "epoch": 0.9815877424053692, + "grad_norm": 0.5472834620427034, + "learning_rate": 0.00010783140421699582, + "loss": 11.995, + "step": 18026 + }, + { + "epoch": 0.9816421964019522, + "grad_norm": 0.5295784041111941, + "learning_rate": 0.00010782261301156555, + "loss": 12.1112, + "step": 18027 + }, + { + "epoch": 0.9816966503985352, + "grad_norm": 0.5634509356491804, + "learning_rate": 0.0001078138217453053, + "loss": 12.0938, + "step": 18028 + }, + { + "epoch": 0.9817511043951181, + "grad_norm": 0.5739913712880079, + "learning_rate": 0.00010780503041828347, + "loss": 12.1133, + "step": 18029 + }, + { + "epoch": 0.9818055583917012, + "grad_norm": 0.5719679045022055, + "learning_rate": 0.00010779623903056842, + "loss": 12.183, + "step": 18030 + }, + { + "epoch": 0.9818600123882842, + "grad_norm": 0.5433390891457947, + "learning_rate": 0.0001077874475822285, + "loss": 12.072, + "step": 18031 + }, + { + "epoch": 0.9819144663848672, + "grad_norm": 0.596004488459137, + "learning_rate": 0.00010777865607333208, + "loss": 12.0495, + "step": 18032 + }, + { + "epoch": 0.9819689203814502, + "grad_norm": 0.5773870752684507, + "learning_rate": 0.00010776986450394753, + "loss": 12.2028, + "step": 18033 + }, + { + "epoch": 0.9820233743780332, + "grad_norm": 0.5502718478168531, + "learning_rate": 0.0001077610728741432, + "loss": 12.0119, + "step": 18034 + }, + { + "epoch": 0.9820778283746163, + "grad_norm": 0.6144172766414152, + "learning_rate": 0.00010775228118398748, + "loss": 12.1112, + "step": 18035 + }, + { + "epoch": 0.9821322823711993, + "grad_norm": 0.5632127743324946, + "learning_rate": 0.0001077434894335487, + "loss": 12.0601, + "step": 18036 + }, + { + "epoch": 0.9821867363677823, + "grad_norm": 0.5986532670411304, + "learning_rate": 0.00010773469762289528, + "loss": 12.1375, + "step": 18037 + }, + { + "epoch": 0.9822411903643653, + "grad_norm": 0.5348121444254492, + "learning_rate": 0.00010772590575209553, + "loss": 12.0448, + "step": 18038 + }, + { + "epoch": 0.9822956443609483, + "grad_norm": 0.6977366636723346, + "learning_rate": 0.00010771711382121786, + "loss": 12.0596, + "step": 18039 + }, + { + "epoch": 0.9823500983575313, + "grad_norm": 0.5281573876140327, + "learning_rate": 0.0001077083218303306, + "loss": 12.1564, + "step": 18040 + }, + { + "epoch": 0.9824045523541144, + "grad_norm": 0.6582946415270958, + "learning_rate": 0.00010769952977950215, + "loss": 12.2643, + "step": 18041 + }, + { + "epoch": 0.9824590063506974, + "grad_norm": 0.5929653534678702, + "learning_rate": 0.00010769073766880085, + "loss": 12.0594, + "step": 18042 + }, + { + "epoch": 0.9825134603472804, + "grad_norm": 0.5457434530855422, + "learning_rate": 0.00010768194549829508, + "loss": 12.164, + "step": 18043 + }, + { + "epoch": 0.9825679143438634, + "grad_norm": 0.5500436007280676, + "learning_rate": 0.00010767315326805323, + "loss": 12.2298, + "step": 18044 + }, + { + "epoch": 0.9826223683404464, + "grad_norm": 0.5224746472815626, + "learning_rate": 0.00010766436097814365, + "loss": 12.134, + "step": 18045 + }, + { + "epoch": 0.9826768223370294, + "grad_norm": 0.5619769872720628, + "learning_rate": 0.00010765556862863472, + "loss": 11.9188, + "step": 18046 + }, + { + "epoch": 0.9827312763336125, + "grad_norm": 0.5357558992994437, + "learning_rate": 0.0001076467762195948, + "loss": 12.1788, + "step": 18047 + }, + { + "epoch": 0.9827857303301955, + "grad_norm": 0.5352705390795878, + "learning_rate": 0.00010763798375109226, + "loss": 11.9371, + "step": 18048 + }, + { + "epoch": 0.9828401843267784, + "grad_norm": 0.5181102203185849, + "learning_rate": 0.00010762919122319548, + "loss": 12.072, + "step": 18049 + }, + { + "epoch": 0.9828946383233614, + "grad_norm": 0.5860233741332485, + "learning_rate": 0.00010762039863597284, + "loss": 12.1642, + "step": 18050 + }, + { + "epoch": 0.9829490923199444, + "grad_norm": 0.5729014679436384, + "learning_rate": 0.00010761160598949269, + "loss": 12.2005, + "step": 18051 + }, + { + "epoch": 0.9830035463165274, + "grad_norm": 0.5920741834598858, + "learning_rate": 0.00010760281328382344, + "loss": 12.2143, + "step": 18052 + }, + { + "epoch": 0.9830580003131105, + "grad_norm": 0.5143756884138256, + "learning_rate": 0.0001075940205190334, + "loss": 12.152, + "step": 18053 + }, + { + "epoch": 0.9831124543096935, + "grad_norm": 0.565118066991634, + "learning_rate": 0.000107585227695191, + "loss": 12.0064, + "step": 18054 + }, + { + "epoch": 0.9831669083062765, + "grad_norm": 0.5744740854247854, + "learning_rate": 0.0001075764348123646, + "loss": 11.9919, + "step": 18055 + }, + { + "epoch": 0.9832213623028595, + "grad_norm": 0.6331700601523153, + "learning_rate": 0.00010756764187062257, + "loss": 12.0929, + "step": 18056 + }, + { + "epoch": 0.9832758162994425, + "grad_norm": 0.6835017108603204, + "learning_rate": 0.00010755884887003331, + "loss": 12.2208, + "step": 18057 + }, + { + "epoch": 0.9833302702960255, + "grad_norm": 0.5656699014455033, + "learning_rate": 0.00010755005581066513, + "loss": 12.1017, + "step": 18058 + }, + { + "epoch": 0.9833847242926086, + "grad_norm": 0.5311723015634895, + "learning_rate": 0.00010754126269258647, + "loss": 11.9106, + "step": 18059 + }, + { + "epoch": 0.9834391782891916, + "grad_norm": 0.5720387434249207, + "learning_rate": 0.00010753246951586568, + "loss": 12.0141, + "step": 18060 + }, + { + "epoch": 0.9834936322857746, + "grad_norm": 0.6936595517459607, + "learning_rate": 0.00010752367628057116, + "loss": 12.189, + "step": 18061 + }, + { + "epoch": 0.9835480862823576, + "grad_norm": 0.4982052031486083, + "learning_rate": 0.00010751488298677128, + "loss": 12.0908, + "step": 18062 + }, + { + "epoch": 0.9836025402789406, + "grad_norm": 0.5599579395059051, + "learning_rate": 0.00010750608963453438, + "loss": 12.1435, + "step": 18063 + }, + { + "epoch": 0.9836569942755236, + "grad_norm": 0.5998340544163681, + "learning_rate": 0.00010749729622392888, + "loss": 12.075, + "step": 18064 + }, + { + "epoch": 0.9837114482721067, + "grad_norm": 0.5876829349332333, + "learning_rate": 0.00010748850275502316, + "loss": 12.0845, + "step": 18065 + }, + { + "epoch": 0.9837659022686897, + "grad_norm": 0.5161903199408583, + "learning_rate": 0.00010747970922788557, + "loss": 11.9695, + "step": 18066 + }, + { + "epoch": 0.9838203562652726, + "grad_norm": 0.6270574736815702, + "learning_rate": 0.00010747091564258454, + "loss": 12.1766, + "step": 18067 + }, + { + "epoch": 0.9838748102618556, + "grad_norm": 0.5369537586885043, + "learning_rate": 0.00010746212199918838, + "loss": 12.0638, + "step": 18068 + }, + { + "epoch": 0.9839292642584386, + "grad_norm": 0.5853144281322175, + "learning_rate": 0.00010745332829776548, + "loss": 11.9787, + "step": 18069 + }, + { + "epoch": 0.9839837182550217, + "grad_norm": 0.60185097776034, + "learning_rate": 0.0001074445345383843, + "loss": 12.009, + "step": 18070 + }, + { + "epoch": 0.9840381722516047, + "grad_norm": 0.5639261433739271, + "learning_rate": 0.00010743574072111315, + "loss": 12.104, + "step": 18071 + }, + { + "epoch": 0.9840926262481877, + "grad_norm": 0.5537166577957787, + "learning_rate": 0.00010742694684602046, + "loss": 11.9331, + "step": 18072 + }, + { + "epoch": 0.9841470802447707, + "grad_norm": 0.6591769633023632, + "learning_rate": 0.00010741815291317459, + "loss": 12.3092, + "step": 18073 + }, + { + "epoch": 0.9842015342413537, + "grad_norm": 0.5947384769187573, + "learning_rate": 0.00010740935892264387, + "loss": 12.2152, + "step": 18074 + }, + { + "epoch": 0.9842559882379367, + "grad_norm": 0.5599849847542595, + "learning_rate": 0.00010740056487449674, + "loss": 12.1216, + "step": 18075 + }, + { + "epoch": 0.9843104422345198, + "grad_norm": 0.5517022014958264, + "learning_rate": 0.0001073917707688016, + "loss": 12.0721, + "step": 18076 + }, + { + "epoch": 0.9843648962311028, + "grad_norm": 0.618082533686037, + "learning_rate": 0.00010738297660562682, + "loss": 12.0813, + "step": 18077 + }, + { + "epoch": 0.9844193502276858, + "grad_norm": 0.5772571839574674, + "learning_rate": 0.00010737418238504078, + "loss": 12.195, + "step": 18078 + }, + { + "epoch": 0.9844738042242688, + "grad_norm": 0.5673878961561942, + "learning_rate": 0.00010736538810711184, + "loss": 12.0617, + "step": 18079 + }, + { + "epoch": 0.9845282582208518, + "grad_norm": 0.5546780893853501, + "learning_rate": 0.00010735659377190841, + "loss": 12.0691, + "step": 18080 + }, + { + "epoch": 0.9845827122174348, + "grad_norm": 0.5853216934400048, + "learning_rate": 0.00010734779937949883, + "loss": 12.1178, + "step": 18081 + }, + { + "epoch": 0.9846371662140179, + "grad_norm": 0.5358952268618732, + "learning_rate": 0.0001073390049299516, + "loss": 11.9943, + "step": 18082 + }, + { + "epoch": 0.9846916202106009, + "grad_norm": 0.5755377660157842, + "learning_rate": 0.00010733021042333502, + "loss": 12.1944, + "step": 18083 + }, + { + "epoch": 0.9847460742071839, + "grad_norm": 0.566788817104964, + "learning_rate": 0.00010732141585971747, + "loss": 12.1504, + "step": 18084 + }, + { + "epoch": 0.9848005282037668, + "grad_norm": 0.6196705651253785, + "learning_rate": 0.00010731262123916739, + "loss": 12.1013, + "step": 18085 + }, + { + "epoch": 0.9848549822003498, + "grad_norm": 0.5980150147717634, + "learning_rate": 0.00010730382656175311, + "loss": 11.9677, + "step": 18086 + }, + { + "epoch": 0.9849094361969328, + "grad_norm": 0.5577501627316889, + "learning_rate": 0.00010729503182754308, + "loss": 12.1253, + "step": 18087 + }, + { + "epoch": 0.9849638901935159, + "grad_norm": 0.4919465787672597, + "learning_rate": 0.00010728623703660562, + "loss": 12.0263, + "step": 18088 + }, + { + "epoch": 0.9850183441900989, + "grad_norm": 0.5853269404716969, + "learning_rate": 0.00010727744218900921, + "loss": 12.1506, + "step": 18089 + }, + { + "epoch": 0.9850727981866819, + "grad_norm": 0.6055052020375156, + "learning_rate": 0.00010726864728482215, + "loss": 12.0832, + "step": 18090 + }, + { + "epoch": 0.9851272521832649, + "grad_norm": 0.625309898276744, + "learning_rate": 0.00010725985232411288, + "loss": 12.15, + "step": 18091 + }, + { + "epoch": 0.9851817061798479, + "grad_norm": 0.5676098635808333, + "learning_rate": 0.0001072510573069498, + "loss": 12.0437, + "step": 18092 + }, + { + "epoch": 0.9852361601764309, + "grad_norm": 0.5124787306749896, + "learning_rate": 0.00010724226223340125, + "loss": 12.1212, + "step": 18093 + }, + { + "epoch": 0.985290614173014, + "grad_norm": 0.5933495437373781, + "learning_rate": 0.00010723346710353568, + "loss": 11.9825, + "step": 18094 + }, + { + "epoch": 0.985345068169597, + "grad_norm": 0.6130369637735575, + "learning_rate": 0.00010722467191742144, + "loss": 12.0794, + "step": 18095 + }, + { + "epoch": 0.98539952216618, + "grad_norm": 0.5928905745980033, + "learning_rate": 0.00010721587667512695, + "loss": 12.1596, + "step": 18096 + }, + { + "epoch": 0.985453976162763, + "grad_norm": 0.5369197231079916, + "learning_rate": 0.0001072070813767206, + "loss": 11.9061, + "step": 18097 + }, + { + "epoch": 0.985508430159346, + "grad_norm": 0.520335964142194, + "learning_rate": 0.00010719828602227075, + "loss": 12.0001, + "step": 18098 + }, + { + "epoch": 0.985562884155929, + "grad_norm": 0.5918418697364416, + "learning_rate": 0.00010718949061184585, + "loss": 12.091, + "step": 18099 + }, + { + "epoch": 0.9856173381525121, + "grad_norm": 1.0030124548635482, + "learning_rate": 0.00010718069514551426, + "loss": 12.1456, + "step": 18100 + }, + { + "epoch": 0.9856717921490951, + "grad_norm": 0.6393789302062024, + "learning_rate": 0.00010717189962334437, + "loss": 12.1362, + "step": 18101 + }, + { + "epoch": 0.985726246145678, + "grad_norm": 0.5601102862060541, + "learning_rate": 0.00010716310404540459, + "loss": 12.0762, + "step": 18102 + }, + { + "epoch": 0.985780700142261, + "grad_norm": 0.5786649803546119, + "learning_rate": 0.0001071543084117633, + "loss": 12.0578, + "step": 18103 + }, + { + "epoch": 0.985835154138844, + "grad_norm": 0.5872084322909208, + "learning_rate": 0.00010714551272248891, + "loss": 12.2205, + "step": 18104 + }, + { + "epoch": 0.9858896081354271, + "grad_norm": 0.5060151325768041, + "learning_rate": 0.00010713671697764984, + "loss": 12.0591, + "step": 18105 + }, + { + "epoch": 0.9859440621320101, + "grad_norm": 0.5528163577261255, + "learning_rate": 0.00010712792117731445, + "loss": 12.0899, + "step": 18106 + }, + { + "epoch": 0.9859985161285931, + "grad_norm": 0.565991675505829, + "learning_rate": 0.00010711912532155115, + "loss": 12.1375, + "step": 18107 + }, + { + "epoch": 0.9860529701251761, + "grad_norm": 0.5447176746423495, + "learning_rate": 0.0001071103294104283, + "loss": 12.0478, + "step": 18108 + }, + { + "epoch": 0.9861074241217591, + "grad_norm": 0.696030140774074, + "learning_rate": 0.00010710153344401439, + "loss": 12.3183, + "step": 18109 + }, + { + "epoch": 0.9861618781183421, + "grad_norm": 0.5880121858085362, + "learning_rate": 0.00010709273742237776, + "loss": 12.0449, + "step": 18110 + }, + { + "epoch": 0.9862163321149252, + "grad_norm": 0.49293080395869104, + "learning_rate": 0.00010708394134558678, + "loss": 11.9812, + "step": 18111 + }, + { + "epoch": 0.9862707861115082, + "grad_norm": 0.6154783409642931, + "learning_rate": 0.00010707514521370994, + "loss": 12.0409, + "step": 18112 + }, + { + "epoch": 0.9863252401080912, + "grad_norm": 0.5396766468657854, + "learning_rate": 0.00010706634902681551, + "loss": 12.022, + "step": 18113 + }, + { + "epoch": 0.9863796941046742, + "grad_norm": 0.5503026621440306, + "learning_rate": 0.000107057552784972, + "loss": 12.1035, + "step": 18114 + }, + { + "epoch": 0.9864341481012572, + "grad_norm": 0.5962588816072459, + "learning_rate": 0.00010704875648824777, + "loss": 11.9612, + "step": 18115 + }, + { + "epoch": 0.9864886020978402, + "grad_norm": 0.631680261499398, + "learning_rate": 0.00010703996013671124, + "loss": 12.0747, + "step": 18116 + }, + { + "epoch": 0.9865430560944233, + "grad_norm": 0.542234954428847, + "learning_rate": 0.00010703116373043082, + "loss": 12.1221, + "step": 18117 + }, + { + "epoch": 0.9865975100910063, + "grad_norm": 0.6474710621477797, + "learning_rate": 0.00010702236726947485, + "loss": 12.2001, + "step": 18118 + }, + { + "epoch": 0.9866519640875893, + "grad_norm": 0.5821040606293187, + "learning_rate": 0.00010701357075391178, + "loss": 12.099, + "step": 18119 + }, + { + "epoch": 0.9867064180841723, + "grad_norm": 0.6329945116586946, + "learning_rate": 0.00010700477418381003, + "loss": 12.2078, + "step": 18120 + }, + { + "epoch": 0.9867608720807552, + "grad_norm": 0.5319668920455514, + "learning_rate": 0.00010699597755923796, + "loss": 12.0948, + "step": 18121 + }, + { + "epoch": 0.9868153260773382, + "grad_norm": 0.5900748869649325, + "learning_rate": 0.000106987180880264, + "loss": 12.1813, + "step": 18122 + }, + { + "epoch": 0.9868697800739213, + "grad_norm": 0.6149433502827112, + "learning_rate": 0.00010697838414695657, + "loss": 12.2179, + "step": 18123 + }, + { + "epoch": 0.9869242340705043, + "grad_norm": 0.562985381858628, + "learning_rate": 0.00010696958735938403, + "loss": 12.1519, + "step": 18124 + }, + { + "epoch": 0.9869786880670873, + "grad_norm": 0.5658279954443907, + "learning_rate": 0.00010696079051761483, + "loss": 12.091, + "step": 18125 + }, + { + "epoch": 0.9870331420636703, + "grad_norm": 0.6312905273552373, + "learning_rate": 0.00010695199362171733, + "loss": 12.1555, + "step": 18126 + }, + { + "epoch": 0.9870875960602533, + "grad_norm": 0.5886784688030845, + "learning_rate": 0.00010694319667175998, + "loss": 12.205, + "step": 18127 + }, + { + "epoch": 0.9871420500568363, + "grad_norm": 0.4950797462419129, + "learning_rate": 0.00010693439966781118, + "loss": 11.8538, + "step": 18128 + }, + { + "epoch": 0.9871965040534194, + "grad_norm": 0.5522936025725195, + "learning_rate": 0.00010692560260993931, + "loss": 12.1399, + "step": 18129 + }, + { + "epoch": 0.9872509580500024, + "grad_norm": 0.6427225968953367, + "learning_rate": 0.00010691680549821277, + "loss": 12.07, + "step": 18130 + }, + { + "epoch": 0.9873054120465854, + "grad_norm": 0.5154353236629954, + "learning_rate": 0.00010690800833270002, + "loss": 12.0826, + "step": 18131 + }, + { + "epoch": 0.9873598660431684, + "grad_norm": 0.6143806323295387, + "learning_rate": 0.00010689921111346943, + "loss": 12.1745, + "step": 18132 + }, + { + "epoch": 0.9874143200397514, + "grad_norm": 0.6140008287528899, + "learning_rate": 0.00010689041384058944, + "loss": 12.3005, + "step": 18133 + }, + { + "epoch": 0.9874687740363345, + "grad_norm": 0.5372926978927672, + "learning_rate": 0.00010688161651412843, + "loss": 12.0937, + "step": 18134 + }, + { + "epoch": 0.9875232280329175, + "grad_norm": 0.6016048547083367, + "learning_rate": 0.00010687281913415477, + "loss": 12.0777, + "step": 18135 + }, + { + "epoch": 0.9875776820295005, + "grad_norm": 0.5706361887179785, + "learning_rate": 0.00010686402170073695, + "loss": 12.2219, + "step": 18136 + }, + { + "epoch": 0.9876321360260835, + "grad_norm": 0.5111482874190844, + "learning_rate": 0.00010685522421394334, + "loss": 12.0913, + "step": 18137 + }, + { + "epoch": 0.9876865900226665, + "grad_norm": 0.5407132927064728, + "learning_rate": 0.00010684642667384239, + "loss": 12.2046, + "step": 18138 + }, + { + "epoch": 0.9877410440192494, + "grad_norm": 0.5551703354684117, + "learning_rate": 0.00010683762908050243, + "loss": 12.0568, + "step": 18139 + }, + { + "epoch": 0.9877954980158326, + "grad_norm": 0.556144476717898, + "learning_rate": 0.00010682883143399194, + "loss": 12.1016, + "step": 18140 + }, + { + "epoch": 0.9878499520124155, + "grad_norm": 0.52610775539807, + "learning_rate": 0.0001068200337343793, + "loss": 12.0702, + "step": 18141 + }, + { + "epoch": 0.9879044060089985, + "grad_norm": 0.5839966922936672, + "learning_rate": 0.00010681123598173295, + "loss": 12.2517, + "step": 18142 + }, + { + "epoch": 0.9879588600055815, + "grad_norm": 0.5289322739960025, + "learning_rate": 0.0001068024381761213, + "loss": 12.1082, + "step": 18143 + }, + { + "epoch": 0.9880133140021645, + "grad_norm": 0.5662415515881658, + "learning_rate": 0.00010679364031761273, + "loss": 11.9966, + "step": 18144 + }, + { + "epoch": 0.9880677679987475, + "grad_norm": 0.595066729336303, + "learning_rate": 0.00010678484240627566, + "loss": 12.0954, + "step": 18145 + }, + { + "epoch": 0.9881222219953306, + "grad_norm": 0.756702540809058, + "learning_rate": 0.00010677604444217853, + "loss": 12.0219, + "step": 18146 + }, + { + "epoch": 0.9881766759919136, + "grad_norm": 0.584760628442418, + "learning_rate": 0.00010676724642538973, + "loss": 12.1959, + "step": 18147 + }, + { + "epoch": 0.9882311299884966, + "grad_norm": 0.5955713714862642, + "learning_rate": 0.00010675844835597772, + "loss": 12.1417, + "step": 18148 + }, + { + "epoch": 0.9882855839850796, + "grad_norm": 0.545147343129215, + "learning_rate": 0.00010674965023401087, + "loss": 12.1022, + "step": 18149 + }, + { + "epoch": 0.9883400379816626, + "grad_norm": 0.5710069833773501, + "learning_rate": 0.00010674085205955759, + "loss": 12.1044, + "step": 18150 + }, + { + "epoch": 0.9883944919782456, + "grad_norm": 0.5329622577089048, + "learning_rate": 0.00010673205383268632, + "loss": 12.0673, + "step": 18151 + }, + { + "epoch": 0.9884489459748287, + "grad_norm": 0.5487041818413038, + "learning_rate": 0.00010672325555346545, + "loss": 12.1085, + "step": 18152 + }, + { + "epoch": 0.9885033999714117, + "grad_norm": 0.5415157448620397, + "learning_rate": 0.00010671445722196346, + "loss": 12.1328, + "step": 18153 + }, + { + "epoch": 0.9885578539679947, + "grad_norm": 0.5624921937625975, + "learning_rate": 0.00010670565883824872, + "loss": 12.1548, + "step": 18154 + }, + { + "epoch": 0.9886123079645777, + "grad_norm": 0.6212541456400029, + "learning_rate": 0.00010669686040238964, + "loss": 12.1634, + "step": 18155 + }, + { + "epoch": 0.9886667619611607, + "grad_norm": 0.5194000136149667, + "learning_rate": 0.00010668806191445466, + "loss": 11.9366, + "step": 18156 + }, + { + "epoch": 0.9887212159577436, + "grad_norm": 0.5155042191634405, + "learning_rate": 0.00010667926337451217, + "loss": 12.1547, + "step": 18157 + }, + { + "epoch": 0.9887756699543268, + "grad_norm": 0.5246555161359897, + "learning_rate": 0.0001066704647826306, + "loss": 12.0819, + "step": 18158 + }, + { + "epoch": 0.9888301239509097, + "grad_norm": 0.5078959519576165, + "learning_rate": 0.0001066616661388784, + "loss": 12.1861, + "step": 18159 + }, + { + "epoch": 0.9888845779474927, + "grad_norm": 0.6152159808564477, + "learning_rate": 0.00010665286744332397, + "loss": 12.2136, + "step": 18160 + }, + { + "epoch": 0.9889390319440757, + "grad_norm": 0.5111939428258314, + "learning_rate": 0.00010664406869603572, + "loss": 12.0874, + "step": 18161 + }, + { + "epoch": 0.9889934859406587, + "grad_norm": 0.5119214118859521, + "learning_rate": 0.00010663526989708209, + "loss": 12.0325, + "step": 18162 + }, + { + "epoch": 0.9890479399372417, + "grad_norm": 0.5219377048092392, + "learning_rate": 0.00010662647104653146, + "loss": 11.8204, + "step": 18163 + }, + { + "epoch": 0.9891023939338248, + "grad_norm": 0.5443243507868772, + "learning_rate": 0.00010661767214445229, + "loss": 12.1334, + "step": 18164 + }, + { + "epoch": 0.9891568479304078, + "grad_norm": 0.5892688335860076, + "learning_rate": 0.000106608873190913, + "loss": 12.0278, + "step": 18165 + }, + { + "epoch": 0.9892113019269908, + "grad_norm": 0.5884705795429934, + "learning_rate": 0.00010660007418598199, + "loss": 12.082, + "step": 18166 + }, + { + "epoch": 0.9892657559235738, + "grad_norm": 0.5458190704317927, + "learning_rate": 0.00010659127512972771, + "loss": 12.1603, + "step": 18167 + }, + { + "epoch": 0.9893202099201568, + "grad_norm": 0.6312072324698195, + "learning_rate": 0.00010658247602221855, + "loss": 12.1664, + "step": 18168 + }, + { + "epoch": 0.9893746639167399, + "grad_norm": 0.580064155012794, + "learning_rate": 0.00010657367686352298, + "loss": 12.0865, + "step": 18169 + }, + { + "epoch": 0.9894291179133229, + "grad_norm": 0.5564038444545667, + "learning_rate": 0.0001065648776537094, + "loss": 12.2043, + "step": 18170 + }, + { + "epoch": 0.9894835719099059, + "grad_norm": 0.5270218157556361, + "learning_rate": 0.0001065560783928462, + "loss": 12.0985, + "step": 18171 + }, + { + "epoch": 0.9895380259064889, + "grad_norm": 0.5923197475586851, + "learning_rate": 0.00010654727908100183, + "loss": 12.0379, + "step": 18172 + }, + { + "epoch": 0.9895924799030719, + "grad_norm": 0.5829247033872216, + "learning_rate": 0.00010653847971824476, + "loss": 12.0139, + "step": 18173 + }, + { + "epoch": 0.9896469338996549, + "grad_norm": 0.5516682687822828, + "learning_rate": 0.00010652968030464334, + "loss": 12.0776, + "step": 18174 + }, + { + "epoch": 0.989701387896238, + "grad_norm": 0.5639592853444282, + "learning_rate": 0.00010652088084026606, + "loss": 12.1019, + "step": 18175 + }, + { + "epoch": 0.989755841892821, + "grad_norm": 0.5384906436819519, + "learning_rate": 0.00010651208132518129, + "loss": 11.9862, + "step": 18176 + }, + { + "epoch": 0.989810295889404, + "grad_norm": 0.6070968546473495, + "learning_rate": 0.0001065032817594575, + "loss": 12.0275, + "step": 18177 + }, + { + "epoch": 0.9898647498859869, + "grad_norm": 0.5808379207583481, + "learning_rate": 0.00010649448214316314, + "loss": 12.1359, + "step": 18178 + }, + { + "epoch": 0.9899192038825699, + "grad_norm": 0.4988094591452025, + "learning_rate": 0.00010648568247636653, + "loss": 12.0005, + "step": 18179 + }, + { + "epoch": 0.9899736578791529, + "grad_norm": 0.5838907201723186, + "learning_rate": 0.00010647688275913616, + "loss": 12.172, + "step": 18180 + }, + { + "epoch": 0.990028111875736, + "grad_norm": 0.6488156091959372, + "learning_rate": 0.00010646808299154049, + "loss": 12.1838, + "step": 18181 + }, + { + "epoch": 0.990082565872319, + "grad_norm": 0.53625159665213, + "learning_rate": 0.00010645928317364795, + "loss": 12.1533, + "step": 18182 + }, + { + "epoch": 0.990137019868902, + "grad_norm": 0.5081836936590858, + "learning_rate": 0.00010645048330552692, + "loss": 12.0303, + "step": 18183 + }, + { + "epoch": 0.990191473865485, + "grad_norm": 0.5390404382885886, + "learning_rate": 0.00010644168338724583, + "loss": 12.1266, + "step": 18184 + }, + { + "epoch": 0.990245927862068, + "grad_norm": 0.5996580270093308, + "learning_rate": 0.00010643288341887314, + "loss": 11.9662, + "step": 18185 + }, + { + "epoch": 0.990300381858651, + "grad_norm": 0.4910316126595332, + "learning_rate": 0.00010642408340047728, + "loss": 12.1151, + "step": 18186 + }, + { + "epoch": 0.9903548358552341, + "grad_norm": 0.5270594120462088, + "learning_rate": 0.00010641528333212667, + "loss": 12.1097, + "step": 18187 + }, + { + "epoch": 0.9904092898518171, + "grad_norm": 0.5193287254087474, + "learning_rate": 0.00010640648321388976, + "loss": 12.0473, + "step": 18188 + }, + { + "epoch": 0.9904637438484001, + "grad_norm": 0.5753604760633858, + "learning_rate": 0.00010639768304583493, + "loss": 12.1701, + "step": 18189 + }, + { + "epoch": 0.9905181978449831, + "grad_norm": 0.5739489111380263, + "learning_rate": 0.00010638888282803064, + "loss": 12.1724, + "step": 18190 + }, + { + "epoch": 0.9905726518415661, + "grad_norm": 0.5255039712114434, + "learning_rate": 0.00010638008256054532, + "loss": 11.9917, + "step": 18191 + }, + { + "epoch": 0.9906271058381491, + "grad_norm": 0.5672165055545463, + "learning_rate": 0.00010637128224344743, + "loss": 12.1203, + "step": 18192 + }, + { + "epoch": 0.9906815598347322, + "grad_norm": 0.5776361743452018, + "learning_rate": 0.0001063624818768054, + "loss": 12.0376, + "step": 18193 + }, + { + "epoch": 0.9907360138313152, + "grad_norm": 0.510166317980021, + "learning_rate": 0.00010635368146068763, + "loss": 11.9939, + "step": 18194 + }, + { + "epoch": 0.9907904678278981, + "grad_norm": 0.5041406687352171, + "learning_rate": 0.00010634488099516253, + "loss": 11.9797, + "step": 18195 + }, + { + "epoch": 0.9908449218244811, + "grad_norm": 0.5805858677346144, + "learning_rate": 0.00010633608048029859, + "loss": 11.9986, + "step": 18196 + }, + { + "epoch": 0.9908993758210641, + "grad_norm": 0.5529385195231652, + "learning_rate": 0.00010632727991616425, + "loss": 12.0752, + "step": 18197 + }, + { + "epoch": 0.9909538298176471, + "grad_norm": 0.5922940201271124, + "learning_rate": 0.00010631847930282792, + "loss": 12.1256, + "step": 18198 + }, + { + "epoch": 0.9910082838142302, + "grad_norm": 0.5176892169025592, + "learning_rate": 0.00010630967864035801, + "loss": 12.1186, + "step": 18199 + }, + { + "epoch": 0.9910627378108132, + "grad_norm": 0.5392799211225574, + "learning_rate": 0.00010630087792882299, + "loss": 12.0985, + "step": 18200 + }, + { + "epoch": 0.9911171918073962, + "grad_norm": 0.5741306146407585, + "learning_rate": 0.00010629207716829128, + "loss": 12.2897, + "step": 18201 + }, + { + "epoch": 0.9911716458039792, + "grad_norm": 0.6588728596304212, + "learning_rate": 0.00010628327635883133, + "loss": 12.1349, + "step": 18202 + }, + { + "epoch": 0.9912260998005622, + "grad_norm": 0.5016770517506026, + "learning_rate": 0.0001062744755005116, + "loss": 12.1715, + "step": 18203 + }, + { + "epoch": 0.9912805537971453, + "grad_norm": 0.6018119933793505, + "learning_rate": 0.00010626567459340047, + "loss": 12.0232, + "step": 18204 + }, + { + "epoch": 0.9913350077937283, + "grad_norm": 0.5748866193437475, + "learning_rate": 0.00010625687363756638, + "loss": 12.0883, + "step": 18205 + }, + { + "epoch": 0.9913894617903113, + "grad_norm": 0.5671183525950302, + "learning_rate": 0.0001062480726330778, + "loss": 12.1136, + "step": 18206 + }, + { + "epoch": 0.9914439157868943, + "grad_norm": 0.5512127695258885, + "learning_rate": 0.00010623927158000318, + "loss": 12.1586, + "step": 18207 + }, + { + "epoch": 0.9914983697834773, + "grad_norm": 0.5306245357987395, + "learning_rate": 0.00010623047047841095, + "loss": 12.0574, + "step": 18208 + }, + { + "epoch": 0.9915528237800603, + "grad_norm": 0.7030287098542315, + "learning_rate": 0.0001062216693283695, + "loss": 12.1599, + "step": 18209 + }, + { + "epoch": 0.9916072777766434, + "grad_norm": 0.5550678850782707, + "learning_rate": 0.00010621286812994733, + "loss": 12.0858, + "step": 18210 + }, + { + "epoch": 0.9916617317732264, + "grad_norm": 0.614911713623915, + "learning_rate": 0.00010620406688321285, + "loss": 12.0705, + "step": 18211 + }, + { + "epoch": 0.9917161857698094, + "grad_norm": 0.5689402836679145, + "learning_rate": 0.00010619526558823447, + "loss": 12.0661, + "step": 18212 + }, + { + "epoch": 0.9917706397663923, + "grad_norm": 0.601418914315275, + "learning_rate": 0.00010618646424508072, + "loss": 12.0915, + "step": 18213 + }, + { + "epoch": 0.9918250937629753, + "grad_norm": 0.5290614927342719, + "learning_rate": 0.00010617766285381997, + "loss": 12.0135, + "step": 18214 + }, + { + "epoch": 0.9918795477595583, + "grad_norm": 0.6662437714409607, + "learning_rate": 0.00010616886141452066, + "loss": 12.1376, + "step": 18215 + }, + { + "epoch": 0.9919340017561414, + "grad_norm": 0.5554799207212264, + "learning_rate": 0.00010616005992725127, + "loss": 12.0578, + "step": 18216 + }, + { + "epoch": 0.9919884557527244, + "grad_norm": 0.5894407616829986, + "learning_rate": 0.0001061512583920802, + "loss": 12.1633, + "step": 18217 + }, + { + "epoch": 0.9920429097493074, + "grad_norm": 0.6647002522545795, + "learning_rate": 0.00010614245680907593, + "loss": 12.0707, + "step": 18218 + }, + { + "epoch": 0.9920973637458904, + "grad_norm": 0.5547368028447971, + "learning_rate": 0.0001061336551783069, + "loss": 12.1337, + "step": 18219 + }, + { + "epoch": 0.9921518177424734, + "grad_norm": 0.5780326738983894, + "learning_rate": 0.00010612485349984151, + "loss": 12.1128, + "step": 18220 + }, + { + "epoch": 0.9922062717390564, + "grad_norm": 0.6561917013664642, + "learning_rate": 0.00010611605177374824, + "loss": 12.2249, + "step": 18221 + }, + { + "epoch": 0.9922607257356395, + "grad_norm": 0.5224015433566719, + "learning_rate": 0.00010610725000009551, + "loss": 12.0916, + "step": 18222 + }, + { + "epoch": 0.9923151797322225, + "grad_norm": 0.7322683937666953, + "learning_rate": 0.00010609844817895181, + "loss": 12.0303, + "step": 18223 + }, + { + "epoch": 0.9923696337288055, + "grad_norm": 0.5827006291050971, + "learning_rate": 0.00010608964631038553, + "loss": 12.1149, + "step": 18224 + }, + { + "epoch": 0.9924240877253885, + "grad_norm": 0.6341172939816072, + "learning_rate": 0.00010608084439446517, + "loss": 12.2159, + "step": 18225 + }, + { + "epoch": 0.9924785417219715, + "grad_norm": 0.5055789506140991, + "learning_rate": 0.00010607204243125912, + "loss": 12.0186, + "step": 18226 + }, + { + "epoch": 0.9925329957185545, + "grad_norm": 0.5937480331497839, + "learning_rate": 0.00010606324042083586, + "loss": 12.2173, + "step": 18227 + }, + { + "epoch": 0.9925874497151376, + "grad_norm": 0.5765533212067099, + "learning_rate": 0.0001060544383632638, + "loss": 12.1039, + "step": 18228 + }, + { + "epoch": 0.9926419037117206, + "grad_norm": 0.5856380091206774, + "learning_rate": 0.00010604563625861146, + "loss": 12.1383, + "step": 18229 + }, + { + "epoch": 0.9926963577083036, + "grad_norm": 0.5355559798841263, + "learning_rate": 0.00010603683410694721, + "loss": 12.0303, + "step": 18230 + }, + { + "epoch": 0.9927508117048865, + "grad_norm": 0.5242059372670049, + "learning_rate": 0.00010602803190833952, + "loss": 12.0785, + "step": 18231 + }, + { + "epoch": 0.9928052657014695, + "grad_norm": 0.5529125861572977, + "learning_rate": 0.00010601922966285685, + "loss": 12.1323, + "step": 18232 + }, + { + "epoch": 0.9928597196980525, + "grad_norm": 0.579570952487471, + "learning_rate": 0.00010601042737056763, + "loss": 12.1193, + "step": 18233 + }, + { + "epoch": 0.9929141736946356, + "grad_norm": 0.5615623403966941, + "learning_rate": 0.00010600162503154034, + "loss": 12.063, + "step": 18234 + }, + { + "epoch": 0.9929686276912186, + "grad_norm": 0.5263312129752604, + "learning_rate": 0.00010599282264584338, + "loss": 12.0795, + "step": 18235 + }, + { + "epoch": 0.9930230816878016, + "grad_norm": 0.5741760977770437, + "learning_rate": 0.00010598402021354525, + "loss": 12.1506, + "step": 18236 + }, + { + "epoch": 0.9930775356843846, + "grad_norm": 0.5453528232582568, + "learning_rate": 0.00010597521773471437, + "loss": 12.1111, + "step": 18237 + }, + { + "epoch": 0.9931319896809676, + "grad_norm": 0.5848245007509505, + "learning_rate": 0.00010596641520941921, + "loss": 12.0653, + "step": 18238 + }, + { + "epoch": 0.9931864436775507, + "grad_norm": 0.5734984559056954, + "learning_rate": 0.00010595761263772816, + "loss": 12.0619, + "step": 18239 + }, + { + "epoch": 0.9932408976741337, + "grad_norm": 0.5634072989729753, + "learning_rate": 0.00010594881001970975, + "loss": 11.9756, + "step": 18240 + }, + { + "epoch": 0.9932953516707167, + "grad_norm": 0.5894397628042625, + "learning_rate": 0.00010594000735543239, + "loss": 11.9566, + "step": 18241 + }, + { + "epoch": 0.9933498056672997, + "grad_norm": 0.5321180965263821, + "learning_rate": 0.00010593120464496453, + "loss": 12.0611, + "step": 18242 + }, + { + "epoch": 0.9934042596638827, + "grad_norm": 0.5994931595328526, + "learning_rate": 0.00010592240188837465, + "loss": 12.105, + "step": 18243 + }, + { + "epoch": 0.9934587136604657, + "grad_norm": 0.5939063125313075, + "learning_rate": 0.00010591359908573115, + "loss": 12.0165, + "step": 18244 + }, + { + "epoch": 0.9935131676570488, + "grad_norm": 0.5686234838614362, + "learning_rate": 0.00010590479623710252, + "loss": 12.0928, + "step": 18245 + }, + { + "epoch": 0.9935676216536318, + "grad_norm": 0.5747729490831621, + "learning_rate": 0.00010589599334255722, + "loss": 12.141, + "step": 18246 + }, + { + "epoch": 0.9936220756502148, + "grad_norm": 0.5933524183797961, + "learning_rate": 0.00010588719040216366, + "loss": 12.0861, + "step": 18247 + }, + { + "epoch": 0.9936765296467978, + "grad_norm": 0.5578922147546961, + "learning_rate": 0.00010587838741599037, + "loss": 12.107, + "step": 18248 + }, + { + "epoch": 0.9937309836433807, + "grad_norm": 0.5309388163475087, + "learning_rate": 0.00010586958438410572, + "loss": 12.052, + "step": 18249 + }, + { + "epoch": 0.9937854376399637, + "grad_norm": 0.5308965616404674, + "learning_rate": 0.00010586078130657817, + "loss": 12.0533, + "step": 18250 + }, + { + "epoch": 0.9938398916365468, + "grad_norm": 0.5162989002287177, + "learning_rate": 0.00010585197818347624, + "loss": 11.9956, + "step": 18251 + }, + { + "epoch": 0.9938943456331298, + "grad_norm": 0.5915427121879998, + "learning_rate": 0.00010584317501486833, + "loss": 12.0785, + "step": 18252 + }, + { + "epoch": 0.9939487996297128, + "grad_norm": 0.5787166950197778, + "learning_rate": 0.00010583437180082293, + "loss": 12.0896, + "step": 18253 + }, + { + "epoch": 0.9940032536262958, + "grad_norm": 0.5140441644242902, + "learning_rate": 0.00010582556854140846, + "loss": 12.0014, + "step": 18254 + }, + { + "epoch": 0.9940577076228788, + "grad_norm": 0.5961023277506549, + "learning_rate": 0.00010581676523669339, + "loss": 12.1662, + "step": 18255 + }, + { + "epoch": 0.9941121616194618, + "grad_norm": 0.5563828940257974, + "learning_rate": 0.00010580796188674618, + "loss": 12.09, + "step": 18256 + }, + { + "epoch": 0.9941666156160449, + "grad_norm": 0.527189994076118, + "learning_rate": 0.00010579915849163528, + "loss": 11.9866, + "step": 18257 + }, + { + "epoch": 0.9942210696126279, + "grad_norm": 0.6035817247554295, + "learning_rate": 0.00010579035505142918, + "loss": 12.2226, + "step": 18258 + }, + { + "epoch": 0.9942755236092109, + "grad_norm": 0.5345851780364875, + "learning_rate": 0.00010578155156619629, + "loss": 12.1269, + "step": 18259 + }, + { + "epoch": 0.9943299776057939, + "grad_norm": 0.7405022779870131, + "learning_rate": 0.00010577274803600508, + "loss": 12.0807, + "step": 18260 + }, + { + "epoch": 0.9943844316023769, + "grad_norm": 0.5171137351541494, + "learning_rate": 0.00010576394446092399, + "loss": 12.0452, + "step": 18261 + }, + { + "epoch": 0.9944388855989599, + "grad_norm": 0.5749189436715378, + "learning_rate": 0.00010575514084102151, + "loss": 12.1096, + "step": 18262 + }, + { + "epoch": 0.994493339595543, + "grad_norm": 0.7449553375505649, + "learning_rate": 0.00010574633717636615, + "loss": 12.1099, + "step": 18263 + }, + { + "epoch": 0.994547793592126, + "grad_norm": 0.6323078374538844, + "learning_rate": 0.00010573753346702626, + "loss": 12.2157, + "step": 18264 + }, + { + "epoch": 0.994602247588709, + "grad_norm": 0.5005928734970015, + "learning_rate": 0.00010572872971307035, + "loss": 11.8993, + "step": 18265 + }, + { + "epoch": 0.994656701585292, + "grad_norm": 0.5487925587482815, + "learning_rate": 0.00010571992591456687, + "loss": 12.1289, + "step": 18266 + }, + { + "epoch": 0.994711155581875, + "grad_norm": 0.5645526783870632, + "learning_rate": 0.00010571112207158429, + "loss": 12.1062, + "step": 18267 + }, + { + "epoch": 0.994765609578458, + "grad_norm": 0.5660600084508958, + "learning_rate": 0.00010570231818419106, + "loss": 12.0204, + "step": 18268 + }, + { + "epoch": 0.994820063575041, + "grad_norm": 0.5105849321119167, + "learning_rate": 0.00010569351425245569, + "loss": 12.1643, + "step": 18269 + }, + { + "epoch": 0.994874517571624, + "grad_norm": 0.5828619360755012, + "learning_rate": 0.00010568471027644656, + "loss": 12.1449, + "step": 18270 + }, + { + "epoch": 0.994928971568207, + "grad_norm": 0.5563200390594023, + "learning_rate": 0.00010567590625623219, + "loss": 12.0673, + "step": 18271 + }, + { + "epoch": 0.99498342556479, + "grad_norm": 0.5429151672763892, + "learning_rate": 0.00010566710219188098, + "loss": 12.1319, + "step": 18272 + }, + { + "epoch": 0.995037879561373, + "grad_norm": 0.5584770559140745, + "learning_rate": 0.00010565829808346146, + "loss": 12.1062, + "step": 18273 + }, + { + "epoch": 0.9950923335579561, + "grad_norm": 0.6520903283099112, + "learning_rate": 0.00010564949393104208, + "loss": 12.1104, + "step": 18274 + }, + { + "epoch": 0.9951467875545391, + "grad_norm": 0.5716805206843671, + "learning_rate": 0.00010564068973469128, + "loss": 12.1117, + "step": 18275 + }, + { + "epoch": 0.9952012415511221, + "grad_norm": 1.2849329251298771, + "learning_rate": 0.00010563188549447754, + "loss": 12.2314, + "step": 18276 + }, + { + "epoch": 0.9952556955477051, + "grad_norm": 0.571927857807694, + "learning_rate": 0.00010562308121046929, + "loss": 12.098, + "step": 18277 + }, + { + "epoch": 0.9953101495442881, + "grad_norm": 0.6030794560881759, + "learning_rate": 0.000105614276882735, + "loss": 11.971, + "step": 18278 + }, + { + "epoch": 0.9953646035408711, + "grad_norm": 0.5171068645879546, + "learning_rate": 0.0001056054725113432, + "loss": 12.124, + "step": 18279 + }, + { + "epoch": 0.9954190575374542, + "grad_norm": 0.5842618670819876, + "learning_rate": 0.00010559666809636229, + "loss": 12.0035, + "step": 18280 + }, + { + "epoch": 0.9954735115340372, + "grad_norm": 0.5434906527951233, + "learning_rate": 0.00010558786363786075, + "loss": 12.1142, + "step": 18281 + }, + { + "epoch": 0.9955279655306202, + "grad_norm": 0.49536206412147676, + "learning_rate": 0.00010557905913590704, + "loss": 12.0497, + "step": 18282 + }, + { + "epoch": 0.9955824195272032, + "grad_norm": 0.585149941217784, + "learning_rate": 0.00010557025459056962, + "loss": 12.1341, + "step": 18283 + }, + { + "epoch": 0.9956368735237862, + "grad_norm": 0.631751785017183, + "learning_rate": 0.00010556145000191697, + "loss": 12.0925, + "step": 18284 + }, + { + "epoch": 0.9956913275203692, + "grad_norm": 0.5918976363038114, + "learning_rate": 0.00010555264537001757, + "loss": 12.293, + "step": 18285 + }, + { + "epoch": 0.9957457815169523, + "grad_norm": 0.6808319682134729, + "learning_rate": 0.00010554384069493985, + "loss": 12.1553, + "step": 18286 + }, + { + "epoch": 0.9958002355135352, + "grad_norm": 0.5531333942652283, + "learning_rate": 0.00010553503597675231, + "loss": 12.0735, + "step": 18287 + }, + { + "epoch": 0.9958546895101182, + "grad_norm": 0.5628731217928776, + "learning_rate": 0.00010552623121552339, + "loss": 12.106, + "step": 18288 + }, + { + "epoch": 0.9959091435067012, + "grad_norm": 0.5609251496824058, + "learning_rate": 0.00010551742641132159, + "loss": 12.0956, + "step": 18289 + }, + { + "epoch": 0.9959635975032842, + "grad_norm": 0.568234454533744, + "learning_rate": 0.00010550862156421532, + "loss": 12.0748, + "step": 18290 + }, + { + "epoch": 0.9960180514998672, + "grad_norm": 0.580822952517534, + "learning_rate": 0.00010549981667427312, + "loss": 12.0833, + "step": 18291 + }, + { + "epoch": 0.9960725054964503, + "grad_norm": 0.5632507217341308, + "learning_rate": 0.0001054910117415634, + "loss": 12.1432, + "step": 18292 + }, + { + "epoch": 0.9961269594930333, + "grad_norm": 0.6988838802306254, + "learning_rate": 0.00010548220676615466, + "loss": 12.3317, + "step": 18293 + }, + { + "epoch": 0.9961814134896163, + "grad_norm": 0.6437110816396651, + "learning_rate": 0.00010547340174811538, + "loss": 11.9467, + "step": 18294 + }, + { + "epoch": 0.9962358674861993, + "grad_norm": 0.5156467182743514, + "learning_rate": 0.00010546459668751398, + "loss": 11.8644, + "step": 18295 + }, + { + "epoch": 0.9962903214827823, + "grad_norm": 0.5651016554038113, + "learning_rate": 0.00010545579158441896, + "loss": 12.0975, + "step": 18296 + }, + { + "epoch": 0.9963447754793653, + "grad_norm": 0.547738898561608, + "learning_rate": 0.0001054469864388988, + "loss": 12.0015, + "step": 18297 + }, + { + "epoch": 0.9963992294759484, + "grad_norm": 0.5631191104540555, + "learning_rate": 0.00010543818125102197, + "loss": 12.1064, + "step": 18298 + }, + { + "epoch": 0.9964536834725314, + "grad_norm": 0.5402298000145086, + "learning_rate": 0.00010542937602085692, + "loss": 12.0611, + "step": 18299 + }, + { + "epoch": 0.9965081374691144, + "grad_norm": 0.54779694513099, + "learning_rate": 0.00010542057074847214, + "loss": 12.1614, + "step": 18300 + }, + { + "epoch": 0.9965625914656974, + "grad_norm": 0.5866882211911401, + "learning_rate": 0.00010541176543393607, + "loss": 12.1319, + "step": 18301 + }, + { + "epoch": 0.9966170454622804, + "grad_norm": 0.5580272799539157, + "learning_rate": 0.00010540296007731723, + "loss": 12.1469, + "step": 18302 + }, + { + "epoch": 0.9966714994588635, + "grad_norm": 0.604361212709162, + "learning_rate": 0.00010539415467868406, + "loss": 12.1564, + "step": 18303 + }, + { + "epoch": 0.9967259534554465, + "grad_norm": 0.569177199760398, + "learning_rate": 0.00010538534923810506, + "loss": 12.1793, + "step": 18304 + }, + { + "epoch": 0.9967804074520294, + "grad_norm": 0.5338852867265473, + "learning_rate": 0.00010537654375564862, + "loss": 12.0523, + "step": 18305 + }, + { + "epoch": 0.9968348614486124, + "grad_norm": 0.5389476611207479, + "learning_rate": 0.00010536773823138333, + "loss": 12.1469, + "step": 18306 + }, + { + "epoch": 0.9968893154451954, + "grad_norm": 0.5520982290746783, + "learning_rate": 0.00010535893266537758, + "loss": 12.0407, + "step": 18307 + }, + { + "epoch": 0.9969437694417784, + "grad_norm": 0.5762786107442041, + "learning_rate": 0.00010535012705769989, + "loss": 11.9832, + "step": 18308 + }, + { + "epoch": 0.9969982234383615, + "grad_norm": 0.5447738058091682, + "learning_rate": 0.00010534132140841873, + "loss": 12.2752, + "step": 18309 + }, + { + "epoch": 0.9970526774349445, + "grad_norm": 0.48809500004019324, + "learning_rate": 0.0001053325157176025, + "loss": 12.0917, + "step": 18310 + }, + { + "epoch": 0.9971071314315275, + "grad_norm": 0.5557349364671941, + "learning_rate": 0.00010532370998531976, + "loss": 12.106, + "step": 18311 + }, + { + "epoch": 0.9971615854281105, + "grad_norm": 0.5191673464952495, + "learning_rate": 0.00010531490421163897, + "loss": 12.1023, + "step": 18312 + }, + { + "epoch": 0.9972160394246935, + "grad_norm": 0.5318529763885843, + "learning_rate": 0.00010530609839662857, + "loss": 11.9875, + "step": 18313 + }, + { + "epoch": 0.9972704934212765, + "grad_norm": 0.5989152240444041, + "learning_rate": 0.00010529729254035712, + "loss": 12.1512, + "step": 18314 + }, + { + "epoch": 0.9973249474178596, + "grad_norm": 0.5312593750250028, + "learning_rate": 0.00010528848664289299, + "loss": 12.023, + "step": 18315 + }, + { + "epoch": 0.9973794014144426, + "grad_norm": 0.6016810621628746, + "learning_rate": 0.00010527968070430467, + "loss": 12.1681, + "step": 18316 + }, + { + "epoch": 0.9974338554110256, + "grad_norm": 0.5905204779838882, + "learning_rate": 0.0001052708747246607, + "loss": 11.9806, + "step": 18317 + }, + { + "epoch": 0.9974883094076086, + "grad_norm": 0.5281817931669279, + "learning_rate": 0.00010526206870402952, + "loss": 12.0172, + "step": 18318 + }, + { + "epoch": 0.9975427634041916, + "grad_norm": 0.5710793478121307, + "learning_rate": 0.00010525326264247965, + "loss": 12.0452, + "step": 18319 + }, + { + "epoch": 0.9975972174007746, + "grad_norm": 0.5690343274241164, + "learning_rate": 0.00010524445654007949, + "loss": 12.1181, + "step": 18320 + }, + { + "epoch": 0.9976516713973577, + "grad_norm": 0.6045292433790345, + "learning_rate": 0.00010523565039689753, + "loss": 12.1459, + "step": 18321 + }, + { + "epoch": 0.9977061253939407, + "grad_norm": 0.5120165329316765, + "learning_rate": 0.0001052268442130023, + "loss": 12.0556, + "step": 18322 + }, + { + "epoch": 0.9977605793905236, + "grad_norm": 0.5256668955838202, + "learning_rate": 0.00010521803798846228, + "loss": 12.1167, + "step": 18323 + }, + { + "epoch": 0.9978150333871066, + "grad_norm": 0.5949637649102439, + "learning_rate": 0.00010520923172334592, + "loss": 12.2264, + "step": 18324 + }, + { + "epoch": 0.9978694873836896, + "grad_norm": 0.49523772762042173, + "learning_rate": 0.00010520042541772168, + "loss": 12.122, + "step": 18325 + }, + { + "epoch": 0.9979239413802726, + "grad_norm": 0.5778216801940866, + "learning_rate": 0.00010519161907165806, + "loss": 12.1547, + "step": 18326 + }, + { + "epoch": 0.9979783953768557, + "grad_norm": 0.5770705735686229, + "learning_rate": 0.00010518281268522352, + "loss": 12.0834, + "step": 18327 + }, + { + "epoch": 0.9980328493734387, + "grad_norm": 0.5596684489682436, + "learning_rate": 0.00010517400625848657, + "loss": 12.2114, + "step": 18328 + }, + { + "epoch": 0.9980873033700217, + "grad_norm": 0.5670008931842118, + "learning_rate": 0.00010516519979151572, + "loss": 12.1024, + "step": 18329 + }, + { + "epoch": 0.9981417573666047, + "grad_norm": 0.5357328828073912, + "learning_rate": 0.00010515639328437938, + "loss": 12.0188, + "step": 18330 + }, + { + "epoch": 0.9981962113631877, + "grad_norm": 0.515364442144104, + "learning_rate": 0.00010514758673714607, + "loss": 12.1352, + "step": 18331 + }, + { + "epoch": 0.9982506653597707, + "grad_norm": 0.5324150196422278, + "learning_rate": 0.00010513878014988422, + "loss": 12.0376, + "step": 18332 + }, + { + "epoch": 0.9983051193563538, + "grad_norm": 0.7198694397915397, + "learning_rate": 0.00010512997352266239, + "loss": 12.2379, + "step": 18333 + }, + { + "epoch": 0.9983595733529368, + "grad_norm": 0.7596864730164228, + "learning_rate": 0.00010512116685554904, + "loss": 12.2487, + "step": 18334 + }, + { + "epoch": 0.9984140273495198, + "grad_norm": 0.5273094451393162, + "learning_rate": 0.00010511236014861261, + "loss": 12.0907, + "step": 18335 + }, + { + "epoch": 0.9984684813461028, + "grad_norm": 0.6108561774057857, + "learning_rate": 0.00010510355340192162, + "loss": 12.0449, + "step": 18336 + }, + { + "epoch": 0.9985229353426858, + "grad_norm": 0.5945808666942504, + "learning_rate": 0.00010509474661554454, + "loss": 12.0426, + "step": 18337 + }, + { + "epoch": 0.9985773893392689, + "grad_norm": 0.6075217557210957, + "learning_rate": 0.00010508593978954984, + "loss": 12.0566, + "step": 18338 + }, + { + "epoch": 0.9986318433358519, + "grad_norm": 0.5271617568745623, + "learning_rate": 0.00010507713292400607, + "loss": 12.1659, + "step": 18339 + }, + { + "epoch": 0.9986862973324349, + "grad_norm": 0.6434501091325255, + "learning_rate": 0.00010506832601898162, + "loss": 12.1409, + "step": 18340 + }, + { + "epoch": 0.9987407513290179, + "grad_norm": 0.601408971484078, + "learning_rate": 0.00010505951907454504, + "loss": 12.2454, + "step": 18341 + }, + { + "epoch": 0.9987952053256008, + "grad_norm": 0.5887687221593112, + "learning_rate": 0.00010505071209076478, + "loss": 12.1978, + "step": 18342 + }, + { + "epoch": 0.9988496593221838, + "grad_norm": 0.5492925724499284, + "learning_rate": 0.00010504190506770932, + "loss": 12.0751, + "step": 18343 + }, + { + "epoch": 0.9989041133187669, + "grad_norm": 0.5594962061512018, + "learning_rate": 0.00010503309800544718, + "loss": 12.0993, + "step": 18344 + }, + { + "epoch": 0.9989585673153499, + "grad_norm": 0.537376461163246, + "learning_rate": 0.00010502429090404681, + "loss": 12.1009, + "step": 18345 + }, + { + "epoch": 0.9990130213119329, + "grad_norm": 0.5483466664034863, + "learning_rate": 0.00010501548376357673, + "loss": 12.0368, + "step": 18346 + }, + { + "epoch": 0.9990674753085159, + "grad_norm": 0.5815370462057136, + "learning_rate": 0.00010500667658410541, + "loss": 12.2623, + "step": 18347 + }, + { + "epoch": 0.9991219293050989, + "grad_norm": 0.5212947122431155, + "learning_rate": 0.00010499786936570133, + "loss": 12.0358, + "step": 18348 + }, + { + "epoch": 0.9991763833016819, + "grad_norm": 0.543887849801361, + "learning_rate": 0.00010498906210843296, + "loss": 12.0294, + "step": 18349 + }, + { + "epoch": 0.999230837298265, + "grad_norm": 0.5967475053340418, + "learning_rate": 0.00010498025481236881, + "loss": 12.0394, + "step": 18350 + }, + { + "epoch": 0.999285291294848, + "grad_norm": 0.6079438630113693, + "learning_rate": 0.0001049714474775774, + "loss": 12.112, + "step": 18351 + }, + { + "epoch": 0.999339745291431, + "grad_norm": 0.5832927223163846, + "learning_rate": 0.00010496264010412714, + "loss": 12.1601, + "step": 18352 + }, + { + "epoch": 0.999394199288014, + "grad_norm": 0.5199728927111259, + "learning_rate": 0.00010495383269208656, + "loss": 12.1181, + "step": 18353 + }, + { + "epoch": 0.999448653284597, + "grad_norm": 0.5941050390356781, + "learning_rate": 0.00010494502524152417, + "loss": 12.2261, + "step": 18354 + }, + { + "epoch": 0.99950310728118, + "grad_norm": 0.6533739538334977, + "learning_rate": 0.00010493621775250842, + "loss": 12.1423, + "step": 18355 + }, + { + "epoch": 0.9995575612777631, + "grad_norm": 0.6397750626797618, + "learning_rate": 0.00010492741022510781, + "loss": 11.9695, + "step": 18356 + }, + { + "epoch": 0.9996120152743461, + "grad_norm": 0.615535482329696, + "learning_rate": 0.00010491860265939084, + "loss": 11.9663, + "step": 18357 + }, + { + "epoch": 0.9996664692709291, + "grad_norm": 0.5402599506044298, + "learning_rate": 0.000104909795055426, + "loss": 11.8675, + "step": 18358 + }, + { + "epoch": 0.999720923267512, + "grad_norm": 0.5627393779909873, + "learning_rate": 0.00010490098741328174, + "loss": 12.0418, + "step": 18359 + }, + { + "epoch": 0.999775377264095, + "grad_norm": 0.5455322046325075, + "learning_rate": 0.00010489217973302661, + "loss": 12.0932, + "step": 18360 + }, + { + "epoch": 0.999829831260678, + "grad_norm": 0.5342616361648647, + "learning_rate": 0.00010488337201472905, + "loss": 11.9299, + "step": 18361 + }, + { + "epoch": 0.9998842852572611, + "grad_norm": 0.5799942188983298, + "learning_rate": 0.00010487456425845758, + "loss": 11.969, + "step": 18362 + }, + { + "epoch": 0.9999387392538441, + "grad_norm": 0.583225236752225, + "learning_rate": 0.00010486575646428067, + "loss": 12.1362, + "step": 18363 + }, + { + "epoch": 0.9999931932504271, + "grad_norm": 0.5327575410878634, + "learning_rate": 0.00010485694863226687, + "loss": 12.0343, + "step": 18364 + }, + { + "epoch": 1.0000476472470101, + "grad_norm": 0.6505595880771092, + "learning_rate": 0.00010484814076248455, + "loss": 12.085, + "step": 18365 + }, + { + "epoch": 1.0001021012435931, + "grad_norm": 0.5183267779782971, + "learning_rate": 0.00010483933285500231, + "loss": 11.9666, + "step": 18366 + }, + { + "epoch": 1.000156555240176, + "grad_norm": 0.6106950132925038, + "learning_rate": 0.00010483052490988861, + "loss": 11.988, + "step": 18367 + }, + { + "epoch": 1.000211009236759, + "grad_norm": 0.5487647852412104, + "learning_rate": 0.00010482171692721192, + "loss": 12.043, + "step": 18368 + }, + { + "epoch": 1.000265463233342, + "grad_norm": 0.547568729580315, + "learning_rate": 0.00010481290890704079, + "loss": 12.086, + "step": 18369 + }, + { + "epoch": 1.0003199172299253, + "grad_norm": 0.5356534198690128, + "learning_rate": 0.00010480410084944363, + "loss": 12.1115, + "step": 18370 + }, + { + "epoch": 1.0003743712265083, + "grad_norm": 0.56201989986331, + "learning_rate": 0.00010479529275448899, + "loss": 11.7968, + "step": 18371 + }, + { + "epoch": 1.0004288252230913, + "grad_norm": 0.5862078200930146, + "learning_rate": 0.00010478648462224536, + "loss": 12.098, + "step": 18372 + }, + { + "epoch": 1.0004832792196743, + "grad_norm": 0.5426265159006913, + "learning_rate": 0.0001047776764527812, + "loss": 12.15, + "step": 18373 + }, + { + "epoch": 1.0005377332162573, + "grad_norm": 0.6361834878239455, + "learning_rate": 0.00010476886824616508, + "loss": 12.1512, + "step": 18374 + }, + { + "epoch": 1.0005921872128403, + "grad_norm": 0.643370746183964, + "learning_rate": 0.00010476006000246539, + "loss": 12.2027, + "step": 18375 + }, + { + "epoch": 1.0006466412094233, + "grad_norm": 0.5831748434881041, + "learning_rate": 0.00010475125172175066, + "loss": 12.0607, + "step": 18376 + }, + { + "epoch": 1.0007010952060063, + "grad_norm": 0.5546440422407036, + "learning_rate": 0.00010474244340408943, + "loss": 12.0811, + "step": 18377 + }, + { + "epoch": 1.0007555492025892, + "grad_norm": 0.5409337616191884, + "learning_rate": 0.00010473363504955017, + "loss": 11.9853, + "step": 18378 + }, + { + "epoch": 1.0008100031991722, + "grad_norm": 0.5619493544416494, + "learning_rate": 0.00010472482665820139, + "loss": 12.109, + "step": 18379 + }, + { + "epoch": 1.0008644571957552, + "grad_norm": 0.4948552519989432, + "learning_rate": 0.00010471601823011152, + "loss": 12.121, + "step": 18380 + }, + { + "epoch": 1.0009189111923382, + "grad_norm": 0.6170624624020086, + "learning_rate": 0.00010470720976534913, + "loss": 12.2022, + "step": 18381 + }, + { + "epoch": 1.0009733651889214, + "grad_norm": 0.5417170613625275, + "learning_rate": 0.00010469840126398265, + "loss": 12.1306, + "step": 18382 + }, + { + "epoch": 1.0010278191855044, + "grad_norm": 0.4905086632246415, + "learning_rate": 0.00010468959272608063, + "loss": 12.0468, + "step": 18383 + }, + { + "epoch": 1.0010822731820874, + "grad_norm": 0.6156457670311729, + "learning_rate": 0.00010468078415171159, + "loss": 12.1791, + "step": 18384 + }, + { + "epoch": 1.0011367271786704, + "grad_norm": 0.5629794884599503, + "learning_rate": 0.00010467197554094397, + "loss": 12.2719, + "step": 18385 + }, + { + "epoch": 1.0011911811752534, + "grad_norm": 0.5730873010065706, + "learning_rate": 0.00010466316689384625, + "loss": 12.0505, + "step": 18386 + }, + { + "epoch": 1.0012456351718364, + "grad_norm": 0.5959155320820246, + "learning_rate": 0.00010465435821048697, + "loss": 12.1176, + "step": 18387 + }, + { + "epoch": 1.0013000891684194, + "grad_norm": 0.5252390152725271, + "learning_rate": 0.00010464554949093463, + "loss": 12.0478, + "step": 18388 + }, + { + "epoch": 1.0013545431650024, + "grad_norm": 0.5594494652330558, + "learning_rate": 0.00010463674073525775, + "loss": 12.0372, + "step": 18389 + }, + { + "epoch": 1.0014089971615854, + "grad_norm": 0.5531813407378738, + "learning_rate": 0.00010462793194352478, + "loss": 12.0106, + "step": 18390 + }, + { + "epoch": 1.0014634511581684, + "grad_norm": 0.6363738001080059, + "learning_rate": 0.00010461912311580422, + "loss": 12.1426, + "step": 18391 + }, + { + "epoch": 1.0015179051547514, + "grad_norm": 0.5505332682820571, + "learning_rate": 0.00010461031425216459, + "loss": 12.1038, + "step": 18392 + }, + { + "epoch": 1.0015723591513344, + "grad_norm": 0.5347841437757687, + "learning_rate": 0.00010460150535267436, + "loss": 11.9116, + "step": 18393 + }, + { + "epoch": 1.0016268131479176, + "grad_norm": 0.6161254610542731, + "learning_rate": 0.0001045926964174021, + "loss": 12.1878, + "step": 18394 + }, + { + "epoch": 1.0016812671445006, + "grad_norm": 0.5473230783030792, + "learning_rate": 0.00010458388744641622, + "loss": 11.9945, + "step": 18395 + }, + { + "epoch": 1.0017357211410836, + "grad_norm": 0.5361957484183014, + "learning_rate": 0.00010457507843978529, + "loss": 12.0654, + "step": 18396 + }, + { + "epoch": 1.0017901751376665, + "grad_norm": 0.564843165650648, + "learning_rate": 0.00010456626939757779, + "loss": 12.0611, + "step": 18397 + }, + { + "epoch": 1.0018446291342495, + "grad_norm": 0.49246724425305344, + "learning_rate": 0.00010455746031986215, + "loss": 12.0085, + "step": 18398 + }, + { + "epoch": 1.0018990831308325, + "grad_norm": 0.517952796975952, + "learning_rate": 0.00010454865120670701, + "loss": 11.9902, + "step": 18399 + }, + { + "epoch": 1.0019535371274155, + "grad_norm": 0.5715093319395536, + "learning_rate": 0.00010453984205818078, + "loss": 12.0314, + "step": 18400 + }, + { + "epoch": 1.0020079911239985, + "grad_norm": 0.6381694403156342, + "learning_rate": 0.00010453103287435196, + "loss": 12.1184, + "step": 18401 + }, + { + "epoch": 1.0020624451205815, + "grad_norm": 0.5189112439951066, + "learning_rate": 0.00010452222365528906, + "loss": 12.1006, + "step": 18402 + }, + { + "epoch": 1.0021168991171645, + "grad_norm": 0.5427153553870643, + "learning_rate": 0.00010451341440106059, + "loss": 12.1646, + "step": 18403 + }, + { + "epoch": 1.0021713531137475, + "grad_norm": 0.5508173660075053, + "learning_rate": 0.00010450460511173508, + "loss": 11.9587, + "step": 18404 + }, + { + "epoch": 1.0022258071103307, + "grad_norm": 0.5125258189425049, + "learning_rate": 0.00010449579578738097, + "loss": 12.0517, + "step": 18405 + }, + { + "epoch": 1.0022802611069137, + "grad_norm": 0.5640314476131568, + "learning_rate": 0.00010448698642806682, + "loss": 11.9638, + "step": 18406 + }, + { + "epoch": 1.0023347151034967, + "grad_norm": 0.5247936949494446, + "learning_rate": 0.0001044781770338611, + "loss": 12.1338, + "step": 18407 + }, + { + "epoch": 1.0023891691000797, + "grad_norm": 0.5699248662268073, + "learning_rate": 0.00010446936760483235, + "loss": 12.0554, + "step": 18408 + }, + { + "epoch": 1.0024436230966627, + "grad_norm": 0.5832846353753555, + "learning_rate": 0.00010446055814104903, + "loss": 12.1558, + "step": 18409 + }, + { + "epoch": 1.0024980770932457, + "grad_norm": 0.5888257898048711, + "learning_rate": 0.00010445174864257967, + "loss": 11.9725, + "step": 18410 + }, + { + "epoch": 1.0025525310898287, + "grad_norm": 0.5342655938104394, + "learning_rate": 0.00010444293910949277, + "loss": 12.1343, + "step": 18411 + }, + { + "epoch": 1.0026069850864117, + "grad_norm": 0.5222292673682406, + "learning_rate": 0.00010443412954185681, + "loss": 12.01, + "step": 18412 + }, + { + "epoch": 1.0026614390829947, + "grad_norm": 0.5605641546372595, + "learning_rate": 0.00010442531993974031, + "loss": 12.1621, + "step": 18413 + }, + { + "epoch": 1.0027158930795776, + "grad_norm": 0.5506957085219449, + "learning_rate": 0.0001044165103032118, + "loss": 12.017, + "step": 18414 + }, + { + "epoch": 1.0027703470761606, + "grad_norm": 0.5362068520464105, + "learning_rate": 0.00010440770063233976, + "loss": 12.0291, + "step": 18415 + }, + { + "epoch": 1.0028248010727436, + "grad_norm": 0.6547642284338772, + "learning_rate": 0.00010439889092719271, + "loss": 12.3063, + "step": 18416 + }, + { + "epoch": 1.0028792550693268, + "grad_norm": 0.5953052514900333, + "learning_rate": 0.00010439008118783913, + "loss": 12.0355, + "step": 18417 + }, + { + "epoch": 1.0029337090659098, + "grad_norm": 0.5622270585981601, + "learning_rate": 0.00010438127141434754, + "loss": 12.1138, + "step": 18418 + }, + { + "epoch": 1.0029881630624928, + "grad_norm": 0.6166438221046772, + "learning_rate": 0.00010437246160678647, + "loss": 12.173, + "step": 18419 + }, + { + "epoch": 1.0030426170590758, + "grad_norm": 0.6443790331998598, + "learning_rate": 0.00010436365176522438, + "loss": 12.1626, + "step": 18420 + }, + { + "epoch": 1.0030970710556588, + "grad_norm": 0.5064232929602658, + "learning_rate": 0.00010435484188972982, + "loss": 12.036, + "step": 18421 + }, + { + "epoch": 1.0031515250522418, + "grad_norm": 0.6420988468737352, + "learning_rate": 0.00010434603198037127, + "loss": 12.2683, + "step": 18422 + }, + { + "epoch": 1.0032059790488248, + "grad_norm": 0.5796531182415882, + "learning_rate": 0.00010433722203721725, + "loss": 12.0797, + "step": 18423 + }, + { + "epoch": 1.0032604330454078, + "grad_norm": 0.7296537883602282, + "learning_rate": 0.00010432841206033627, + "loss": 12.3333, + "step": 18424 + }, + { + "epoch": 1.0033148870419908, + "grad_norm": 0.5213973256119997, + "learning_rate": 0.00010431960204979683, + "loss": 12.0892, + "step": 18425 + }, + { + "epoch": 1.0033693410385738, + "grad_norm": 0.5898352075567528, + "learning_rate": 0.00010431079200566745, + "loss": 11.9569, + "step": 18426 + }, + { + "epoch": 1.0034237950351568, + "grad_norm": 0.6532708033340328, + "learning_rate": 0.00010430198192801662, + "loss": 12.0374, + "step": 18427 + }, + { + "epoch": 1.0034782490317398, + "grad_norm": 0.5499115261605702, + "learning_rate": 0.00010429317181691283, + "loss": 11.9641, + "step": 18428 + }, + { + "epoch": 1.003532703028323, + "grad_norm": 0.6449371239041375, + "learning_rate": 0.00010428436167242464, + "loss": 12.0602, + "step": 18429 + }, + { + "epoch": 1.003587157024906, + "grad_norm": 0.7144262127444471, + "learning_rate": 0.00010427555149462053, + "loss": 12.108, + "step": 18430 + }, + { + "epoch": 1.003641611021489, + "grad_norm": 0.577597272843995, + "learning_rate": 0.00010426674128356904, + "loss": 12.0951, + "step": 18431 + }, + { + "epoch": 1.003696065018072, + "grad_norm": 0.5852692927340701, + "learning_rate": 0.0001042579310393386, + "loss": 12.0938, + "step": 18432 + }, + { + "epoch": 1.003750519014655, + "grad_norm": 0.7541386181334564, + "learning_rate": 0.00010424912076199782, + "loss": 12.1857, + "step": 18433 + }, + { + "epoch": 1.003804973011238, + "grad_norm": 0.5903454039399789, + "learning_rate": 0.00010424031045161516, + "loss": 12.0776, + "step": 18434 + }, + { + "epoch": 1.003859427007821, + "grad_norm": 0.5015008043754426, + "learning_rate": 0.00010423150010825915, + "loss": 12.0856, + "step": 18435 + }, + { + "epoch": 1.003913881004404, + "grad_norm": 0.6864607542844916, + "learning_rate": 0.00010422268973199822, + "loss": 12.1423, + "step": 18436 + }, + { + "epoch": 1.003968335000987, + "grad_norm": 0.576314090636686, + "learning_rate": 0.000104213879322901, + "loss": 12.1007, + "step": 18437 + }, + { + "epoch": 1.00402278899757, + "grad_norm": 0.5434246964817178, + "learning_rate": 0.00010420506888103593, + "loss": 11.9121, + "step": 18438 + }, + { + "epoch": 1.004077242994153, + "grad_norm": 0.6407559040439097, + "learning_rate": 0.00010419625840647156, + "loss": 12.1781, + "step": 18439 + }, + { + "epoch": 1.0041316969907361, + "grad_norm": 0.6164183897597848, + "learning_rate": 0.00010418744789927637, + "loss": 12.1224, + "step": 18440 + }, + { + "epoch": 1.0041861509873191, + "grad_norm": 0.57202267820432, + "learning_rate": 0.0001041786373595189, + "loss": 12.081, + "step": 18441 + }, + { + "epoch": 1.004240604983902, + "grad_norm": 0.5641499945102079, + "learning_rate": 0.0001041698267872676, + "loss": 12.1598, + "step": 18442 + }, + { + "epoch": 1.004295058980485, + "grad_norm": 0.6088130538427488, + "learning_rate": 0.00010416101618259104, + "loss": 12.1856, + "step": 18443 + }, + { + "epoch": 1.004349512977068, + "grad_norm": 0.8869373652399809, + "learning_rate": 0.00010415220554555774, + "loss": 12.061, + "step": 18444 + }, + { + "epoch": 1.004403966973651, + "grad_norm": 0.5206873460568164, + "learning_rate": 0.00010414339487623618, + "loss": 12.047, + "step": 18445 + }, + { + "epoch": 1.004458420970234, + "grad_norm": 0.525736502612232, + "learning_rate": 0.00010413458417469491, + "loss": 12.0695, + "step": 18446 + }, + { + "epoch": 1.004512874966817, + "grad_norm": 0.5385043026808839, + "learning_rate": 0.00010412577344100239, + "loss": 12.1558, + "step": 18447 + }, + { + "epoch": 1.0045673289634, + "grad_norm": 0.5321986529369179, + "learning_rate": 0.00010411696267522718, + "loss": 12.0883, + "step": 18448 + }, + { + "epoch": 1.004621782959983, + "grad_norm": 0.5432885165476395, + "learning_rate": 0.00010410815187743776, + "loss": 12.0963, + "step": 18449 + }, + { + "epoch": 1.004676236956566, + "grad_norm": 0.5153007684808616, + "learning_rate": 0.00010409934104770269, + "loss": 12.1426, + "step": 18450 + }, + { + "epoch": 1.004730690953149, + "grad_norm": 0.7251781770255962, + "learning_rate": 0.00010409053018609045, + "loss": 12.091, + "step": 18451 + }, + { + "epoch": 1.0047851449497323, + "grad_norm": 0.5546233647379865, + "learning_rate": 0.00010408171929266954, + "loss": 12.0329, + "step": 18452 + }, + { + "epoch": 1.0048395989463152, + "grad_norm": 0.5691353512399754, + "learning_rate": 0.00010407290836750849, + "loss": 12.0922, + "step": 18453 + }, + { + "epoch": 1.0048940529428982, + "grad_norm": 0.5097120315588252, + "learning_rate": 0.00010406409741067584, + "loss": 12.0595, + "step": 18454 + }, + { + "epoch": 1.0049485069394812, + "grad_norm": 0.5588728055036118, + "learning_rate": 0.00010405528642224011, + "loss": 12.2999, + "step": 18455 + }, + { + "epoch": 1.0050029609360642, + "grad_norm": 0.5455730220948464, + "learning_rate": 0.00010404647540226977, + "loss": 12.0505, + "step": 18456 + }, + { + "epoch": 1.0050574149326472, + "grad_norm": 0.4835546737187294, + "learning_rate": 0.00010403766435083337, + "loss": 12.0044, + "step": 18457 + }, + { + "epoch": 1.0051118689292302, + "grad_norm": 0.5266537781447859, + "learning_rate": 0.0001040288532679994, + "loss": 12.0936, + "step": 18458 + }, + { + "epoch": 1.0051663229258132, + "grad_norm": 0.7211166245977295, + "learning_rate": 0.00010402004215383638, + "loss": 12.2525, + "step": 18459 + }, + { + "epoch": 1.0052207769223962, + "grad_norm": 0.5973374180082196, + "learning_rate": 0.00010401123100841288, + "loss": 12.1424, + "step": 18460 + }, + { + "epoch": 1.0052752309189792, + "grad_norm": 0.5170532918173177, + "learning_rate": 0.00010400241983179735, + "loss": 12.0298, + "step": 18461 + }, + { + "epoch": 1.0053296849155622, + "grad_norm": 0.5375349396750642, + "learning_rate": 0.00010399360862405832, + "loss": 12.1485, + "step": 18462 + }, + { + "epoch": 1.0053841389121452, + "grad_norm": 0.520035242682927, + "learning_rate": 0.00010398479738526434, + "loss": 12.1356, + "step": 18463 + }, + { + "epoch": 1.0054385929087284, + "grad_norm": 0.5432332946684317, + "learning_rate": 0.00010397598611548387, + "loss": 11.8634, + "step": 18464 + }, + { + "epoch": 1.0054930469053114, + "grad_norm": 0.5893141249318277, + "learning_rate": 0.00010396717481478551, + "loss": 12.1584, + "step": 18465 + }, + { + "epoch": 1.0055475009018944, + "grad_norm": 0.5408766358330444, + "learning_rate": 0.00010395836348323771, + "loss": 12.0764, + "step": 18466 + }, + { + "epoch": 1.0056019548984774, + "grad_norm": 0.5598581105351644, + "learning_rate": 0.00010394955212090903, + "loss": 11.7836, + "step": 18467 + }, + { + "epoch": 1.0056564088950604, + "grad_norm": 0.538544943045943, + "learning_rate": 0.00010394074072786794, + "loss": 11.8081, + "step": 18468 + }, + { + "epoch": 1.0057108628916434, + "grad_norm": 0.5679894467120316, + "learning_rate": 0.00010393192930418302, + "loss": 11.9564, + "step": 18469 + }, + { + "epoch": 1.0057653168882263, + "grad_norm": 0.6323405377769352, + "learning_rate": 0.00010392311784992275, + "loss": 12.0496, + "step": 18470 + }, + { + "epoch": 1.0058197708848093, + "grad_norm": 0.5766824622562673, + "learning_rate": 0.00010391430636515565, + "loss": 12.0748, + "step": 18471 + }, + { + "epoch": 1.0058742248813923, + "grad_norm": 0.5564550229807947, + "learning_rate": 0.00010390549484995024, + "loss": 11.993, + "step": 18472 + }, + { + "epoch": 1.0059286788779753, + "grad_norm": 0.6228642070002696, + "learning_rate": 0.00010389668330437507, + "loss": 11.996, + "step": 18473 + }, + { + "epoch": 1.0059831328745583, + "grad_norm": 0.5529858077664811, + "learning_rate": 0.00010388787172849863, + "loss": 12.076, + "step": 18474 + }, + { + "epoch": 1.0060375868711415, + "grad_norm": 0.5923726170221378, + "learning_rate": 0.00010387906012238943, + "loss": 12.1028, + "step": 18475 + }, + { + "epoch": 1.0060920408677245, + "grad_norm": 0.5525567253092625, + "learning_rate": 0.00010387024848611604, + "loss": 11.9582, + "step": 18476 + }, + { + "epoch": 1.0061464948643075, + "grad_norm": 0.5672762174972753, + "learning_rate": 0.00010386143681974692, + "loss": 11.9318, + "step": 18477 + }, + { + "epoch": 1.0062009488608905, + "grad_norm": 0.5536384723037703, + "learning_rate": 0.00010385262512335063, + "loss": 12.0877, + "step": 18478 + }, + { + "epoch": 1.0062554028574735, + "grad_norm": 0.5384526911923556, + "learning_rate": 0.00010384381339699567, + "loss": 12.0977, + "step": 18479 + }, + { + "epoch": 1.0063098568540565, + "grad_norm": 0.5593960088882626, + "learning_rate": 0.00010383500164075059, + "loss": 12.0818, + "step": 18480 + }, + { + "epoch": 1.0063643108506395, + "grad_norm": 0.6027909589961592, + "learning_rate": 0.00010382618985468389, + "loss": 12.0996, + "step": 18481 + }, + { + "epoch": 1.0064187648472225, + "grad_norm": 0.6295374486803064, + "learning_rate": 0.00010381737803886409, + "loss": 12.3242, + "step": 18482 + }, + { + "epoch": 1.0064732188438055, + "grad_norm": 0.5972378296093238, + "learning_rate": 0.00010380856619335973, + "loss": 12.1621, + "step": 18483 + }, + { + "epoch": 1.0065276728403885, + "grad_norm": 0.6392123771019983, + "learning_rate": 0.0001037997543182393, + "loss": 12.1677, + "step": 18484 + }, + { + "epoch": 1.0065821268369715, + "grad_norm": 0.5159467258469377, + "learning_rate": 0.00010379094241357134, + "loss": 12.0815, + "step": 18485 + }, + { + "epoch": 1.0066365808335545, + "grad_norm": 0.5290614198308999, + "learning_rate": 0.0001037821304794244, + "loss": 12.0834, + "step": 18486 + }, + { + "epoch": 1.0066910348301377, + "grad_norm": 0.5618735098496637, + "learning_rate": 0.00010377331851586699, + "loss": 12.0976, + "step": 18487 + }, + { + "epoch": 1.0067454888267207, + "grad_norm": 0.5436998168386749, + "learning_rate": 0.00010376450652296759, + "loss": 12.0281, + "step": 18488 + }, + { + "epoch": 1.0067999428233037, + "grad_norm": 0.5783917074189285, + "learning_rate": 0.00010375569450079476, + "loss": 12.0415, + "step": 18489 + }, + { + "epoch": 1.0068543968198866, + "grad_norm": 0.5613412995931357, + "learning_rate": 0.00010374688244941707, + "loss": 12.0988, + "step": 18490 + }, + { + "epoch": 1.0069088508164696, + "grad_norm": 0.5748374790527079, + "learning_rate": 0.00010373807036890291, + "loss": 11.9973, + "step": 18491 + }, + { + "epoch": 1.0069633048130526, + "grad_norm": 0.512362412059344, + "learning_rate": 0.00010372925825932093, + "loss": 11.956, + "step": 18492 + }, + { + "epoch": 1.0070177588096356, + "grad_norm": 0.6758198926200745, + "learning_rate": 0.00010372044612073961, + "loss": 12.0425, + "step": 18493 + }, + { + "epoch": 1.0070722128062186, + "grad_norm": 0.5579764231238112, + "learning_rate": 0.00010371163395322749, + "loss": 12.06, + "step": 18494 + }, + { + "epoch": 1.0071266668028016, + "grad_norm": 0.5418374459029155, + "learning_rate": 0.00010370282175685308, + "loss": 12.021, + "step": 18495 + }, + { + "epoch": 1.0071811207993846, + "grad_norm": 0.6048706028105221, + "learning_rate": 0.00010369400953168489, + "loss": 12.1112, + "step": 18496 + }, + { + "epoch": 1.0072355747959676, + "grad_norm": 0.5569148287664402, + "learning_rate": 0.00010368519727779147, + "loss": 12.1173, + "step": 18497 + }, + { + "epoch": 1.0072900287925506, + "grad_norm": 0.527313019239876, + "learning_rate": 0.00010367638499524132, + "loss": 12.1017, + "step": 18498 + }, + { + "epoch": 1.0073444827891338, + "grad_norm": 0.5977487624522466, + "learning_rate": 0.00010366757268410302, + "loss": 12.112, + "step": 18499 + }, + { + "epoch": 1.0073989367857168, + "grad_norm": 0.5721359655878242, + "learning_rate": 0.00010365876034444506, + "loss": 12.0808, + "step": 18500 + }, + { + "epoch": 1.0074533907822998, + "grad_norm": 0.5107380598881754, + "learning_rate": 0.00010364994797633594, + "loss": 11.9403, + "step": 18501 + }, + { + "epoch": 1.0075078447788828, + "grad_norm": 0.6051001091539683, + "learning_rate": 0.0001036411355798442, + "loss": 12.0673, + "step": 18502 + }, + { + "epoch": 1.0075622987754658, + "grad_norm": 0.6471543229360646, + "learning_rate": 0.00010363232315503841, + "loss": 12.1068, + "step": 18503 + }, + { + "epoch": 1.0076167527720488, + "grad_norm": 0.564946503043798, + "learning_rate": 0.00010362351070198705, + "loss": 11.9981, + "step": 18504 + }, + { + "epoch": 1.0076712067686318, + "grad_norm": 0.5337812757312993, + "learning_rate": 0.00010361469822075869, + "loss": 12.1306, + "step": 18505 + }, + { + "epoch": 1.0077256607652147, + "grad_norm": 0.6579308245165749, + "learning_rate": 0.0001036058857114218, + "loss": 12.0056, + "step": 18506 + }, + { + "epoch": 1.0077801147617977, + "grad_norm": 0.5874164251985708, + "learning_rate": 0.00010359707317404494, + "loss": 12.1955, + "step": 18507 + }, + { + "epoch": 1.0078345687583807, + "grad_norm": 0.5816155901461761, + "learning_rate": 0.00010358826060869664, + "loss": 12.006, + "step": 18508 + }, + { + "epoch": 1.0078890227549637, + "grad_norm": 0.6768919139281958, + "learning_rate": 0.00010357944801544541, + "loss": 11.9988, + "step": 18509 + }, + { + "epoch": 1.007943476751547, + "grad_norm": 0.6709855862271281, + "learning_rate": 0.00010357063539435985, + "loss": 12.206, + "step": 18510 + }, + { + "epoch": 1.00799793074813, + "grad_norm": 0.5917606577492475, + "learning_rate": 0.00010356182274550838, + "loss": 12.0946, + "step": 18511 + }, + { + "epoch": 1.008052384744713, + "grad_norm": 0.5531900919101767, + "learning_rate": 0.00010355301006895958, + "loss": 12.1235, + "step": 18512 + }, + { + "epoch": 1.008106838741296, + "grad_norm": 0.5723874950368045, + "learning_rate": 0.00010354419736478198, + "loss": 12.0916, + "step": 18513 + }, + { + "epoch": 1.008161292737879, + "grad_norm": 0.6002824915363437, + "learning_rate": 0.00010353538463304411, + "loss": 12.1344, + "step": 18514 + }, + { + "epoch": 1.008215746734462, + "grad_norm": 0.5513697032116269, + "learning_rate": 0.00010352657187381451, + "loss": 12.0326, + "step": 18515 + }, + { + "epoch": 1.008270200731045, + "grad_norm": 0.5574745845879414, + "learning_rate": 0.0001035177590871617, + "loss": 12.0332, + "step": 18516 + }, + { + "epoch": 1.008324654727628, + "grad_norm": 0.5538227245458262, + "learning_rate": 0.00010350894627315417, + "loss": 12.177, + "step": 18517 + }, + { + "epoch": 1.0083791087242109, + "grad_norm": 0.6256207524064054, + "learning_rate": 0.00010350013343186051, + "loss": 12.1336, + "step": 18518 + }, + { + "epoch": 1.0084335627207939, + "grad_norm": 0.544668977596636, + "learning_rate": 0.0001034913205633492, + "loss": 12.1373, + "step": 18519 + }, + { + "epoch": 1.0084880167173769, + "grad_norm": 0.5971507286826666, + "learning_rate": 0.00010348250766768885, + "loss": 12.0926, + "step": 18520 + }, + { + "epoch": 1.0085424707139599, + "grad_norm": 0.5554077923661265, + "learning_rate": 0.00010347369474494791, + "loss": 12.0171, + "step": 18521 + }, + { + "epoch": 1.008596924710543, + "grad_norm": 0.6661562125947798, + "learning_rate": 0.00010346488179519491, + "loss": 12.0987, + "step": 18522 + }, + { + "epoch": 1.008651378707126, + "grad_norm": 0.5218887040607149, + "learning_rate": 0.00010345606881849843, + "loss": 12.1033, + "step": 18523 + }, + { + "epoch": 1.008705832703709, + "grad_norm": 0.5641752186109523, + "learning_rate": 0.00010344725581492695, + "loss": 12.2255, + "step": 18524 + }, + { + "epoch": 1.008760286700292, + "grad_norm": 0.5770632410257892, + "learning_rate": 0.00010343844278454908, + "loss": 12.1212, + "step": 18525 + }, + { + "epoch": 1.008814740696875, + "grad_norm": 0.5254078477638074, + "learning_rate": 0.00010342962972743329, + "loss": 11.8766, + "step": 18526 + }, + { + "epoch": 1.008869194693458, + "grad_norm": 0.5831145330438583, + "learning_rate": 0.00010342081664364811, + "loss": 11.9643, + "step": 18527 + }, + { + "epoch": 1.008923648690041, + "grad_norm": 0.556557972163656, + "learning_rate": 0.00010341200353326211, + "loss": 12.1719, + "step": 18528 + }, + { + "epoch": 1.008978102686624, + "grad_norm": 0.5579480008859713, + "learning_rate": 0.00010340319039634376, + "loss": 12.1567, + "step": 18529 + }, + { + "epoch": 1.009032556683207, + "grad_norm": 0.5204330978413887, + "learning_rate": 0.00010339437723296166, + "loss": 12.0736, + "step": 18530 + }, + { + "epoch": 1.00908701067979, + "grad_norm": 0.5512177356789631, + "learning_rate": 0.0001033855640431843, + "loss": 12.0599, + "step": 18531 + }, + { + "epoch": 1.009141464676373, + "grad_norm": 0.5358028471647213, + "learning_rate": 0.00010337675082708023, + "loss": 11.9859, + "step": 18532 + }, + { + "epoch": 1.009195918672956, + "grad_norm": 0.5478092408227072, + "learning_rate": 0.00010336793758471797, + "loss": 11.9727, + "step": 18533 + }, + { + "epoch": 1.0092503726695392, + "grad_norm": 0.5514440248075692, + "learning_rate": 0.00010335912431616608, + "loss": 12.0721, + "step": 18534 + }, + { + "epoch": 1.0093048266661222, + "grad_norm": 0.659187372167803, + "learning_rate": 0.00010335031102149306, + "loss": 12.1512, + "step": 18535 + }, + { + "epoch": 1.0093592806627052, + "grad_norm": 0.539932060291416, + "learning_rate": 0.00010334149770076747, + "loss": 12.1408, + "step": 18536 + }, + { + "epoch": 1.0094137346592882, + "grad_norm": 0.6274236836397442, + "learning_rate": 0.00010333268435405783, + "loss": 12.2776, + "step": 18537 + }, + { + "epoch": 1.0094681886558712, + "grad_norm": 0.6185688062471368, + "learning_rate": 0.00010332387098143267, + "loss": 12.1801, + "step": 18538 + }, + { + "epoch": 1.0095226426524542, + "grad_norm": 0.5242210851813076, + "learning_rate": 0.00010331505758296054, + "loss": 12.0495, + "step": 18539 + }, + { + "epoch": 1.0095770966490372, + "grad_norm": 0.5033737270364694, + "learning_rate": 0.00010330624415870998, + "loss": 11.9722, + "step": 18540 + }, + { + "epoch": 1.0096315506456202, + "grad_norm": 0.5950935511479065, + "learning_rate": 0.00010329743070874949, + "loss": 12.1449, + "step": 18541 + }, + { + "epoch": 1.0096860046422031, + "grad_norm": 0.5280528480410401, + "learning_rate": 0.00010328861723314763, + "loss": 11.964, + "step": 18542 + }, + { + "epoch": 1.0097404586387861, + "grad_norm": 0.5309463025005629, + "learning_rate": 0.00010327980373197294, + "loss": 12.0875, + "step": 18543 + }, + { + "epoch": 1.0097949126353691, + "grad_norm": 0.5320408860846423, + "learning_rate": 0.00010327099020529393, + "loss": 12.0683, + "step": 18544 + }, + { + "epoch": 1.0098493666319523, + "grad_norm": 0.6038898483784096, + "learning_rate": 0.00010326217665317916, + "loss": 11.9718, + "step": 18545 + }, + { + "epoch": 1.0099038206285353, + "grad_norm": 0.6014477134507331, + "learning_rate": 0.00010325336307569717, + "loss": 12.13, + "step": 18546 + }, + { + "epoch": 1.0099582746251183, + "grad_norm": 0.5495599267243306, + "learning_rate": 0.00010324454947291647, + "loss": 12.045, + "step": 18547 + }, + { + "epoch": 1.0100127286217013, + "grad_norm": 0.5280405649469392, + "learning_rate": 0.00010323573584490561, + "loss": 12.0402, + "step": 18548 + }, + { + "epoch": 1.0100671826182843, + "grad_norm": 0.5590605689991598, + "learning_rate": 0.00010322692219173314, + "loss": 12.1249, + "step": 18549 + }, + { + "epoch": 1.0101216366148673, + "grad_norm": 0.6570155279799302, + "learning_rate": 0.00010321810851346758, + "loss": 12.0431, + "step": 18550 + }, + { + "epoch": 1.0101760906114503, + "grad_norm": 0.5111924942415713, + "learning_rate": 0.00010320929481017743, + "loss": 12.0129, + "step": 18551 + }, + { + "epoch": 1.0102305446080333, + "grad_norm": 0.5239726843256711, + "learning_rate": 0.0001032004810819313, + "loss": 12.0665, + "step": 18552 + }, + { + "epoch": 1.0102849986046163, + "grad_norm": 0.5345809725825313, + "learning_rate": 0.00010319166732879768, + "loss": 12.0353, + "step": 18553 + }, + { + "epoch": 1.0103394526011993, + "grad_norm": 0.630011898885438, + "learning_rate": 0.00010318285355084512, + "loss": 12.1699, + "step": 18554 + }, + { + "epoch": 1.0103939065977823, + "grad_norm": 0.5191116007882293, + "learning_rate": 0.00010317403974814217, + "loss": 11.9667, + "step": 18555 + }, + { + "epoch": 1.0104483605943653, + "grad_norm": 0.4831786103221667, + "learning_rate": 0.00010316522592075734, + "loss": 12.1, + "step": 18556 + }, + { + "epoch": 1.0105028145909485, + "grad_norm": 0.5561397048347925, + "learning_rate": 0.00010315641206875919, + "loss": 12.0128, + "step": 18557 + }, + { + "epoch": 1.0105572685875315, + "grad_norm": 0.6440097086515711, + "learning_rate": 0.00010314759819221624, + "loss": 12.2757, + "step": 18558 + }, + { + "epoch": 1.0106117225841145, + "grad_norm": 0.656110245218873, + "learning_rate": 0.00010313878429119705, + "loss": 12.1454, + "step": 18559 + }, + { + "epoch": 1.0106661765806975, + "grad_norm": 0.5903269479289363, + "learning_rate": 0.00010312997036577014, + "loss": 11.9893, + "step": 18560 + }, + { + "epoch": 1.0107206305772805, + "grad_norm": 0.6022232729710705, + "learning_rate": 0.00010312115641600408, + "loss": 12.0534, + "step": 18561 + }, + { + "epoch": 1.0107750845738634, + "grad_norm": 0.5060077336455366, + "learning_rate": 0.00010311234244196735, + "loss": 11.9423, + "step": 18562 + }, + { + "epoch": 1.0108295385704464, + "grad_norm": 0.5435731159339288, + "learning_rate": 0.00010310352844372855, + "loss": 12.089, + "step": 18563 + }, + { + "epoch": 1.0108839925670294, + "grad_norm": 0.5674002940175599, + "learning_rate": 0.00010309471442135617, + "loss": 12.0693, + "step": 18564 + }, + { + "epoch": 1.0109384465636124, + "grad_norm": 0.5404684656688796, + "learning_rate": 0.0001030859003749188, + "loss": 12.1342, + "step": 18565 + }, + { + "epoch": 1.0109929005601954, + "grad_norm": 0.5961366588591839, + "learning_rate": 0.00010307708630448494, + "loss": 11.9892, + "step": 18566 + }, + { + "epoch": 1.0110473545567784, + "grad_norm": 0.5454753540434794, + "learning_rate": 0.00010306827221012312, + "loss": 12.0481, + "step": 18567 + }, + { + "epoch": 1.0111018085533616, + "grad_norm": 0.566266637646021, + "learning_rate": 0.0001030594580919019, + "loss": 12.1063, + "step": 18568 + }, + { + "epoch": 1.0111562625499446, + "grad_norm": 0.5533092897930931, + "learning_rate": 0.00010305064394988984, + "loss": 12.0457, + "step": 18569 + }, + { + "epoch": 1.0112107165465276, + "grad_norm": 0.5386943447678215, + "learning_rate": 0.00010304182978415544, + "loss": 11.8717, + "step": 18570 + }, + { + "epoch": 1.0112651705431106, + "grad_norm": 0.5470324120297019, + "learning_rate": 0.0001030330155947673, + "loss": 12.0962, + "step": 18571 + }, + { + "epoch": 1.0113196245396936, + "grad_norm": 0.5911071622271485, + "learning_rate": 0.00010302420138179391, + "loss": 12.2112, + "step": 18572 + }, + { + "epoch": 1.0113740785362766, + "grad_norm": 0.5854015202959052, + "learning_rate": 0.00010301538714530379, + "loss": 12.0793, + "step": 18573 + }, + { + "epoch": 1.0114285325328596, + "grad_norm": 0.5698593275883355, + "learning_rate": 0.00010300657288536553, + "loss": 12.0904, + "step": 18574 + }, + { + "epoch": 1.0114829865294426, + "grad_norm": 0.5832187817899874, + "learning_rate": 0.00010299775860204768, + "loss": 12.1363, + "step": 18575 + }, + { + "epoch": 1.0115374405260256, + "grad_norm": 0.5679029416512514, + "learning_rate": 0.00010298894429541874, + "loss": 12.1225, + "step": 18576 + }, + { + "epoch": 1.0115918945226086, + "grad_norm": 0.5673043538062682, + "learning_rate": 0.00010298012996554727, + "loss": 11.9131, + "step": 18577 + }, + { + "epoch": 1.0116463485191916, + "grad_norm": 0.5366736456338547, + "learning_rate": 0.00010297131561250182, + "loss": 11.7861, + "step": 18578 + }, + { + "epoch": 1.0117008025157745, + "grad_norm": 0.6059117847345783, + "learning_rate": 0.00010296250123635087, + "loss": 12.0755, + "step": 18579 + }, + { + "epoch": 1.0117552565123578, + "grad_norm": 0.5766746731730145, + "learning_rate": 0.00010295368683716305, + "loss": 12.1941, + "step": 18580 + }, + { + "epoch": 1.0118097105089408, + "grad_norm": 0.5975853286218374, + "learning_rate": 0.0001029448724150069, + "loss": 12.2667, + "step": 18581 + }, + { + "epoch": 1.0118641645055237, + "grad_norm": 0.5813723381540811, + "learning_rate": 0.0001029360579699509, + "loss": 12.0985, + "step": 18582 + }, + { + "epoch": 1.0119186185021067, + "grad_norm": 0.5144480498398242, + "learning_rate": 0.00010292724350206359, + "loss": 12.1034, + "step": 18583 + }, + { + "epoch": 1.0119730724986897, + "grad_norm": 0.5496274656727532, + "learning_rate": 0.00010291842901141357, + "loss": 11.8125, + "step": 18584 + }, + { + "epoch": 1.0120275264952727, + "grad_norm": 0.5870109527864531, + "learning_rate": 0.00010290961449806935, + "loss": 12.1074, + "step": 18585 + }, + { + "epoch": 1.0120819804918557, + "grad_norm": 0.5517661081173563, + "learning_rate": 0.00010290079996209949, + "loss": 12.1656, + "step": 18586 + }, + { + "epoch": 1.0121364344884387, + "grad_norm": 0.5204688912020631, + "learning_rate": 0.00010289198540357252, + "loss": 12.1468, + "step": 18587 + }, + { + "epoch": 1.0121908884850217, + "grad_norm": 0.6109212134761495, + "learning_rate": 0.00010288317082255698, + "loss": 12.0229, + "step": 18588 + }, + { + "epoch": 1.0122453424816047, + "grad_norm": 0.50168078275934, + "learning_rate": 0.00010287435621912145, + "loss": 12.0959, + "step": 18589 + }, + { + "epoch": 1.0122997964781877, + "grad_norm": 0.5870154397372992, + "learning_rate": 0.00010286554159333439, + "loss": 12.1773, + "step": 18590 + }, + { + "epoch": 1.0123542504747707, + "grad_norm": 0.5661339588060216, + "learning_rate": 0.00010285672694526445, + "loss": 12.1347, + "step": 18591 + }, + { + "epoch": 1.012408704471354, + "grad_norm": 0.5685607270196777, + "learning_rate": 0.0001028479122749801, + "loss": 11.9809, + "step": 18592 + }, + { + "epoch": 1.0124631584679369, + "grad_norm": 0.586910468464697, + "learning_rate": 0.0001028390975825499, + "loss": 12.1174, + "step": 18593 + }, + { + "epoch": 1.0125176124645199, + "grad_norm": 0.5493072224736333, + "learning_rate": 0.00010283028286804241, + "loss": 12.0765, + "step": 18594 + }, + { + "epoch": 1.0125720664611029, + "grad_norm": 0.6860285133901204, + "learning_rate": 0.00010282146813152616, + "loss": 12.1214, + "step": 18595 + }, + { + "epoch": 1.0126265204576859, + "grad_norm": 0.5578553443266105, + "learning_rate": 0.00010281265337306971, + "loss": 12.1132, + "step": 18596 + }, + { + "epoch": 1.0126809744542689, + "grad_norm": 0.760650444662143, + "learning_rate": 0.00010280383859274159, + "loss": 12.1466, + "step": 18597 + }, + { + "epoch": 1.0127354284508518, + "grad_norm": 0.5341634657264303, + "learning_rate": 0.00010279502379061035, + "loss": 12.0617, + "step": 18598 + }, + { + "epoch": 1.0127898824474348, + "grad_norm": 0.598461962481969, + "learning_rate": 0.00010278620896674453, + "loss": 12.0937, + "step": 18599 + }, + { + "epoch": 1.0128443364440178, + "grad_norm": 0.6275053519413482, + "learning_rate": 0.00010277739412121267, + "loss": 12.0502, + "step": 18600 + }, + { + "epoch": 1.0128987904406008, + "grad_norm": 0.5627921254857454, + "learning_rate": 0.00010276857925408337, + "loss": 12.0195, + "step": 18601 + }, + { + "epoch": 1.0129532444371838, + "grad_norm": 0.6277134432781751, + "learning_rate": 0.00010275976436542509, + "loss": 12.0657, + "step": 18602 + }, + { + "epoch": 1.0130076984337668, + "grad_norm": 0.552236707841139, + "learning_rate": 0.00010275094945530645, + "loss": 12.087, + "step": 18603 + }, + { + "epoch": 1.01306215243035, + "grad_norm": 0.5339410393649697, + "learning_rate": 0.00010274213452379595, + "loss": 12.0573, + "step": 18604 + }, + { + "epoch": 1.013116606426933, + "grad_norm": 0.5199770355858867, + "learning_rate": 0.00010273331957096215, + "loss": 12.0424, + "step": 18605 + }, + { + "epoch": 1.013171060423516, + "grad_norm": 0.6059916556226567, + "learning_rate": 0.00010272450459687362, + "loss": 12.1755, + "step": 18606 + }, + { + "epoch": 1.013225514420099, + "grad_norm": 0.6439764951819686, + "learning_rate": 0.00010271568960159887, + "loss": 11.9937, + "step": 18607 + }, + { + "epoch": 1.013279968416682, + "grad_norm": 0.5616246688610501, + "learning_rate": 0.00010270687458520645, + "loss": 12.2604, + "step": 18608 + }, + { + "epoch": 1.013334422413265, + "grad_norm": 0.5976513162085014, + "learning_rate": 0.00010269805954776495, + "loss": 12.0994, + "step": 18609 + }, + { + "epoch": 1.013388876409848, + "grad_norm": 0.5814974973020262, + "learning_rate": 0.00010268924448934285, + "loss": 12.225, + "step": 18610 + }, + { + "epoch": 1.013443330406431, + "grad_norm": 0.5557409579034941, + "learning_rate": 0.00010268042941000874, + "loss": 12.1234, + "step": 18611 + }, + { + "epoch": 1.013497784403014, + "grad_norm": 0.5961948506538783, + "learning_rate": 0.00010267161430983119, + "loss": 12.1489, + "step": 18612 + }, + { + "epoch": 1.013552238399597, + "grad_norm": 0.5275476176618449, + "learning_rate": 0.00010266279918887872, + "loss": 12.0462, + "step": 18613 + }, + { + "epoch": 1.01360669239618, + "grad_norm": 0.46270763533338205, + "learning_rate": 0.00010265398404721984, + "loss": 12.0466, + "step": 18614 + }, + { + "epoch": 1.0136611463927632, + "grad_norm": 0.5396062804897083, + "learning_rate": 0.00010264516888492315, + "loss": 12.1478, + "step": 18615 + }, + { + "epoch": 1.0137156003893462, + "grad_norm": 0.5603888801196434, + "learning_rate": 0.00010263635370205722, + "loss": 12.0761, + "step": 18616 + }, + { + "epoch": 1.0137700543859292, + "grad_norm": 0.5949006213362621, + "learning_rate": 0.00010262753849869051, + "loss": 12.1037, + "step": 18617 + }, + { + "epoch": 1.0138245083825121, + "grad_norm": 0.5478553417772944, + "learning_rate": 0.00010261872327489164, + "loss": 12.0866, + "step": 18618 + }, + { + "epoch": 1.0138789623790951, + "grad_norm": 0.5663167078664715, + "learning_rate": 0.00010260990803072915, + "loss": 12.1534, + "step": 18619 + }, + { + "epoch": 1.0139334163756781, + "grad_norm": 0.5931662652583294, + "learning_rate": 0.00010260109276627154, + "loss": 12.0215, + "step": 18620 + }, + { + "epoch": 1.0139878703722611, + "grad_norm": 0.5409318674949393, + "learning_rate": 0.00010259227748158747, + "loss": 12.1054, + "step": 18621 + }, + { + "epoch": 1.0140423243688441, + "grad_norm": 0.5896010595760806, + "learning_rate": 0.00010258346217674532, + "loss": 12.1124, + "step": 18622 + }, + { + "epoch": 1.014096778365427, + "grad_norm": 0.5106183988735412, + "learning_rate": 0.0001025746468518138, + "loss": 12.1734, + "step": 18623 + }, + { + "epoch": 1.01415123236201, + "grad_norm": 0.5312656153615478, + "learning_rate": 0.00010256583150686136, + "loss": 11.9229, + "step": 18624 + }, + { + "epoch": 1.014205686358593, + "grad_norm": 0.5194900895636492, + "learning_rate": 0.00010255701614195661, + "loss": 12.1093, + "step": 18625 + }, + { + "epoch": 1.014260140355176, + "grad_norm": 0.5859684979836088, + "learning_rate": 0.00010254820075716809, + "loss": 12.1531, + "step": 18626 + }, + { + "epoch": 1.0143145943517593, + "grad_norm": 0.5551928335979637, + "learning_rate": 0.00010253938535256431, + "loss": 12.1345, + "step": 18627 + }, + { + "epoch": 1.0143690483483423, + "grad_norm": 0.5775510465028477, + "learning_rate": 0.00010253056992821382, + "loss": 12.1252, + "step": 18628 + }, + { + "epoch": 1.0144235023449253, + "grad_norm": 0.5427232453071151, + "learning_rate": 0.0001025217544841852, + "loss": 12.0609, + "step": 18629 + }, + { + "epoch": 1.0144779563415083, + "grad_norm": 0.6115163583493086, + "learning_rate": 0.00010251293902054701, + "loss": 12.0571, + "step": 18630 + }, + { + "epoch": 1.0145324103380913, + "grad_norm": 0.5386722267689895, + "learning_rate": 0.0001025041235373678, + "loss": 12.0379, + "step": 18631 + }, + { + "epoch": 1.0145868643346743, + "grad_norm": 0.5506371185172241, + "learning_rate": 0.0001024953080347161, + "loss": 12.0624, + "step": 18632 + }, + { + "epoch": 1.0146413183312573, + "grad_norm": 0.5827098744078735, + "learning_rate": 0.00010248649251266042, + "loss": 11.9455, + "step": 18633 + }, + { + "epoch": 1.0146957723278403, + "grad_norm": 0.592762604673711, + "learning_rate": 0.0001024776769712694, + "loss": 12.078, + "step": 18634 + }, + { + "epoch": 1.0147502263244232, + "grad_norm": 0.5656728108497389, + "learning_rate": 0.00010246886141061154, + "loss": 12.1246, + "step": 18635 + }, + { + "epoch": 1.0148046803210062, + "grad_norm": 0.5517144152095224, + "learning_rate": 0.00010246004583075544, + "loss": 12.0933, + "step": 18636 + }, + { + "epoch": 1.0148591343175892, + "grad_norm": 0.5596194562044612, + "learning_rate": 0.00010245123023176957, + "loss": 12.0172, + "step": 18637 + }, + { + "epoch": 1.0149135883141724, + "grad_norm": 0.5771751091706115, + "learning_rate": 0.00010244241461372252, + "loss": 12.2037, + "step": 18638 + }, + { + "epoch": 1.0149680423107554, + "grad_norm": 0.5211343834215023, + "learning_rate": 0.00010243359897668283, + "loss": 12.0722, + "step": 18639 + }, + { + "epoch": 1.0150224963073384, + "grad_norm": 0.5699645617576364, + "learning_rate": 0.00010242478332071907, + "loss": 12.1202, + "step": 18640 + }, + { + "epoch": 1.0150769503039214, + "grad_norm": 0.5044244164856309, + "learning_rate": 0.00010241596764589985, + "loss": 12.0559, + "step": 18641 + }, + { + "epoch": 1.0151314043005044, + "grad_norm": 0.534380669527365, + "learning_rate": 0.0001024071519522936, + "loss": 11.8102, + "step": 18642 + }, + { + "epoch": 1.0151858582970874, + "grad_norm": 0.5889098771521126, + "learning_rate": 0.00010239833623996895, + "loss": 12.0705, + "step": 18643 + }, + { + "epoch": 1.0152403122936704, + "grad_norm": 0.541823910174673, + "learning_rate": 0.00010238952050899442, + "loss": 12.1835, + "step": 18644 + }, + { + "epoch": 1.0152947662902534, + "grad_norm": 0.6260545936225633, + "learning_rate": 0.00010238070475943857, + "loss": 12.0786, + "step": 18645 + }, + { + "epoch": 1.0153492202868364, + "grad_norm": 0.51878762670923, + "learning_rate": 0.00010237188899137, + "loss": 11.8656, + "step": 18646 + }, + { + "epoch": 1.0154036742834194, + "grad_norm": 0.5707399005959253, + "learning_rate": 0.00010236307320485721, + "loss": 12.0872, + "step": 18647 + }, + { + "epoch": 1.0154581282800024, + "grad_norm": 0.5309342136202445, + "learning_rate": 0.00010235425739996876, + "loss": 12.2071, + "step": 18648 + }, + { + "epoch": 1.0155125822765854, + "grad_norm": 0.5721763583756236, + "learning_rate": 0.00010234544157677322, + "loss": 12.201, + "step": 18649 + }, + { + "epoch": 1.0155670362731686, + "grad_norm": 0.5475461071889699, + "learning_rate": 0.00010233662573533909, + "loss": 12.103, + "step": 18650 + }, + { + "epoch": 1.0156214902697516, + "grad_norm": 0.5219852763402135, + "learning_rate": 0.00010232780987573502, + "loss": 12.017, + "step": 18651 + }, + { + "epoch": 1.0156759442663346, + "grad_norm": 0.5092900645153045, + "learning_rate": 0.0001023189939980295, + "loss": 12.0555, + "step": 18652 + }, + { + "epoch": 1.0157303982629176, + "grad_norm": 0.5388537015061275, + "learning_rate": 0.00010231017810229108, + "loss": 12.0605, + "step": 18653 + }, + { + "epoch": 1.0157848522595005, + "grad_norm": 0.588907815625058, + "learning_rate": 0.00010230136218858832, + "loss": 12.0845, + "step": 18654 + }, + { + "epoch": 1.0158393062560835, + "grad_norm": 0.5230904598736458, + "learning_rate": 0.00010229254625698981, + "loss": 12.1564, + "step": 18655 + }, + { + "epoch": 1.0158937602526665, + "grad_norm": 0.5401076147941122, + "learning_rate": 0.00010228373030756404, + "loss": 12.0952, + "step": 18656 + }, + { + "epoch": 1.0159482142492495, + "grad_norm": 0.5646841759125969, + "learning_rate": 0.00010227491434037963, + "loss": 12.0907, + "step": 18657 + }, + { + "epoch": 1.0160026682458325, + "grad_norm": 0.5290137046555882, + "learning_rate": 0.0001022660983555051, + "loss": 12.0885, + "step": 18658 + }, + { + "epoch": 1.0160571222424155, + "grad_norm": 0.5215509143252841, + "learning_rate": 0.000102257282353009, + "loss": 12.0678, + "step": 18659 + }, + { + "epoch": 1.0161115762389985, + "grad_norm": 0.5440424781375254, + "learning_rate": 0.00010224846633295988, + "loss": 12.1164, + "step": 18660 + }, + { + "epoch": 1.0161660302355815, + "grad_norm": 0.6125029465965208, + "learning_rate": 0.00010223965029542632, + "loss": 12.1944, + "step": 18661 + }, + { + "epoch": 1.0162204842321647, + "grad_norm": 0.5361094072719735, + "learning_rate": 0.00010223083424047689, + "loss": 11.9944, + "step": 18662 + }, + { + "epoch": 1.0162749382287477, + "grad_norm": 0.5382271270974258, + "learning_rate": 0.00010222201816818009, + "loss": 12.0014, + "step": 18663 + }, + { + "epoch": 1.0163293922253307, + "grad_norm": 0.5677736707067792, + "learning_rate": 0.00010221320207860452, + "loss": 11.8384, + "step": 18664 + }, + { + "epoch": 1.0163838462219137, + "grad_norm": 0.5571622510638461, + "learning_rate": 0.0001022043859718187, + "loss": 12.0366, + "step": 18665 + }, + { + "epoch": 1.0164383002184967, + "grad_norm": 0.562284739796698, + "learning_rate": 0.00010219556984789123, + "loss": 12.062, + "step": 18666 + }, + { + "epoch": 1.0164927542150797, + "grad_norm": 0.5145086992467447, + "learning_rate": 0.00010218675370689061, + "loss": 11.8804, + "step": 18667 + }, + { + "epoch": 1.0165472082116627, + "grad_norm": 0.6198656528706378, + "learning_rate": 0.00010217793754888544, + "loss": 12.1277, + "step": 18668 + }, + { + "epoch": 1.0166016622082457, + "grad_norm": 0.5516678091316137, + "learning_rate": 0.00010216912137394428, + "loss": 12.085, + "step": 18669 + }, + { + "epoch": 1.0166561162048287, + "grad_norm": 0.576048071907269, + "learning_rate": 0.00010216030518213564, + "loss": 12.0683, + "step": 18670 + }, + { + "epoch": 1.0167105702014116, + "grad_norm": 0.5530427825086842, + "learning_rate": 0.00010215148897352814, + "loss": 12.06, + "step": 18671 + }, + { + "epoch": 1.0167650241979946, + "grad_norm": 0.5391895765062983, + "learning_rate": 0.00010214267274819027, + "loss": 12.0372, + "step": 18672 + }, + { + "epoch": 1.0168194781945776, + "grad_norm": 0.5363936955345292, + "learning_rate": 0.00010213385650619063, + "loss": 11.9683, + "step": 18673 + }, + { + "epoch": 1.0168739321911608, + "grad_norm": 0.5527397413340591, + "learning_rate": 0.00010212504024759775, + "loss": 12.1035, + "step": 18674 + }, + { + "epoch": 1.0169283861877438, + "grad_norm": 0.5391156205400923, + "learning_rate": 0.00010211622397248022, + "loss": 12.0273, + "step": 18675 + }, + { + "epoch": 1.0169828401843268, + "grad_norm": 0.5325203017680954, + "learning_rate": 0.00010210740768090659, + "loss": 12.134, + "step": 18676 + }, + { + "epoch": 1.0170372941809098, + "grad_norm": 0.5224397247187977, + "learning_rate": 0.00010209859137294535, + "loss": 12.087, + "step": 18677 + }, + { + "epoch": 1.0170917481774928, + "grad_norm": 0.508210641030574, + "learning_rate": 0.00010208977504866514, + "loss": 12.1013, + "step": 18678 + }, + { + "epoch": 1.0171462021740758, + "grad_norm": 0.5594443000847564, + "learning_rate": 0.0001020809587081345, + "loss": 12.2038, + "step": 18679 + }, + { + "epoch": 1.0172006561706588, + "grad_norm": 0.5325988145437173, + "learning_rate": 0.00010207214235142197, + "loss": 12.0305, + "step": 18680 + }, + { + "epoch": 1.0172551101672418, + "grad_norm": 0.46681354969671174, + "learning_rate": 0.00010206332597859614, + "loss": 12.0177, + "step": 18681 + }, + { + "epoch": 1.0173095641638248, + "grad_norm": 0.55466047261685, + "learning_rate": 0.00010205450958972549, + "loss": 12.1903, + "step": 18682 + }, + { + "epoch": 1.0173640181604078, + "grad_norm": 0.5211680837345727, + "learning_rate": 0.00010204569318487867, + "loss": 12.0376, + "step": 18683 + }, + { + "epoch": 1.0174184721569908, + "grad_norm": 0.531284352544976, + "learning_rate": 0.00010203687676412416, + "loss": 12.1672, + "step": 18684 + }, + { + "epoch": 1.017472926153574, + "grad_norm": 0.5212183307590827, + "learning_rate": 0.00010202806032753059, + "loss": 12.1094, + "step": 18685 + }, + { + "epoch": 1.017527380150157, + "grad_norm": 0.5192248511028463, + "learning_rate": 0.00010201924387516648, + "loss": 11.8798, + "step": 18686 + }, + { + "epoch": 1.01758183414674, + "grad_norm": 0.5739008702696496, + "learning_rate": 0.00010201042740710039, + "loss": 12.0077, + "step": 18687 + }, + { + "epoch": 1.017636288143323, + "grad_norm": 0.5127739004057067, + "learning_rate": 0.00010200161092340083, + "loss": 12.0929, + "step": 18688 + }, + { + "epoch": 1.017690742139906, + "grad_norm": 0.5732873555817004, + "learning_rate": 0.00010199279442413645, + "loss": 12.0487, + "step": 18689 + }, + { + "epoch": 1.017745196136489, + "grad_norm": 0.609518732694126, + "learning_rate": 0.00010198397790937577, + "loss": 12.1473, + "step": 18690 + }, + { + "epoch": 1.017799650133072, + "grad_norm": 0.5678661229413832, + "learning_rate": 0.00010197516137918734, + "loss": 11.9684, + "step": 18691 + }, + { + "epoch": 1.017854104129655, + "grad_norm": 0.6481102039488043, + "learning_rate": 0.00010196634483363974, + "loss": 12.0967, + "step": 18692 + }, + { + "epoch": 1.017908558126238, + "grad_norm": 0.6070736689133404, + "learning_rate": 0.00010195752827280149, + "loss": 12.0106, + "step": 18693 + }, + { + "epoch": 1.017963012122821, + "grad_norm": 0.5671805438971573, + "learning_rate": 0.00010194871169674117, + "loss": 12.1093, + "step": 18694 + }, + { + "epoch": 1.018017466119404, + "grad_norm": 0.5596878343691645, + "learning_rate": 0.00010193989510552732, + "loss": 12.1828, + "step": 18695 + }, + { + "epoch": 1.018071920115987, + "grad_norm": 0.5731959237727028, + "learning_rate": 0.00010193107849922859, + "loss": 12.1028, + "step": 18696 + }, + { + "epoch": 1.0181263741125701, + "grad_norm": 0.557951890413181, + "learning_rate": 0.0001019222618779134, + "loss": 12.0865, + "step": 18697 + }, + { + "epoch": 1.0181808281091531, + "grad_norm": 0.5547917029109849, + "learning_rate": 0.00010191344524165043, + "loss": 12.0514, + "step": 18698 + }, + { + "epoch": 1.018235282105736, + "grad_norm": 0.536152548658443, + "learning_rate": 0.00010190462859050813, + "loss": 12.0461, + "step": 18699 + }, + { + "epoch": 1.018289736102319, + "grad_norm": 0.5658531788317116, + "learning_rate": 0.00010189581192455515, + "loss": 12.0915, + "step": 18700 + }, + { + "epoch": 1.018344190098902, + "grad_norm": 0.5396720468549974, + "learning_rate": 0.00010188699524386003, + "loss": 12.14, + "step": 18701 + }, + { + "epoch": 1.018398644095485, + "grad_norm": 0.5452850946768681, + "learning_rate": 0.00010187817854849133, + "loss": 12.179, + "step": 18702 + }, + { + "epoch": 1.018453098092068, + "grad_norm": 0.5293811185081714, + "learning_rate": 0.00010186936183851759, + "loss": 12.0484, + "step": 18703 + }, + { + "epoch": 1.018507552088651, + "grad_norm": 0.5231718692384336, + "learning_rate": 0.00010186054511400735, + "loss": 11.9543, + "step": 18704 + }, + { + "epoch": 1.018562006085234, + "grad_norm": 0.537399970978767, + "learning_rate": 0.00010185172837502921, + "loss": 12.0999, + "step": 18705 + }, + { + "epoch": 1.018616460081817, + "grad_norm": 0.5760471061648441, + "learning_rate": 0.00010184291162165172, + "loss": 12.0845, + "step": 18706 + }, + { + "epoch": 1.0186709140784, + "grad_norm": 0.5970680416849219, + "learning_rate": 0.00010183409485394348, + "loss": 12.0448, + "step": 18707 + }, + { + "epoch": 1.0187253680749833, + "grad_norm": 0.5310220685462248, + "learning_rate": 0.00010182527807197297, + "loss": 12.1472, + "step": 18708 + }, + { + "epoch": 1.0187798220715663, + "grad_norm": 0.4984097673992301, + "learning_rate": 0.0001018164612758088, + "loss": 12.0098, + "step": 18709 + }, + { + "epoch": 1.0188342760681492, + "grad_norm": 0.5165077518009882, + "learning_rate": 0.0001018076444655195, + "loss": 12.1112, + "step": 18710 + }, + { + "epoch": 1.0188887300647322, + "grad_norm": 0.48983985957564, + "learning_rate": 0.00010179882764117368, + "loss": 12.1085, + "step": 18711 + }, + { + "epoch": 1.0189431840613152, + "grad_norm": 0.5714553058215357, + "learning_rate": 0.00010179001080283989, + "loss": 12.0102, + "step": 18712 + }, + { + "epoch": 1.0189976380578982, + "grad_norm": 0.5127940910981188, + "learning_rate": 0.00010178119395058665, + "loss": 12.0016, + "step": 18713 + }, + { + "epoch": 1.0190520920544812, + "grad_norm": 0.5399543294573546, + "learning_rate": 0.00010177237708448255, + "loss": 12.0651, + "step": 18714 + }, + { + "epoch": 1.0191065460510642, + "grad_norm": 0.5470640580069777, + "learning_rate": 0.00010176356020459617, + "loss": 12.1846, + "step": 18715 + }, + { + "epoch": 1.0191610000476472, + "grad_norm": 0.6234479197958068, + "learning_rate": 0.000101754743310996, + "loss": 11.93, + "step": 18716 + }, + { + "epoch": 1.0192154540442302, + "grad_norm": 0.5405988025274492, + "learning_rate": 0.00010174592640375072, + "loss": 12.1231, + "step": 18717 + }, + { + "epoch": 1.0192699080408132, + "grad_norm": 0.5078125142188712, + "learning_rate": 0.00010173710948292878, + "loss": 12.0405, + "step": 18718 + }, + { + "epoch": 1.0193243620373962, + "grad_norm": 0.6023465117020487, + "learning_rate": 0.00010172829254859879, + "loss": 11.965, + "step": 18719 + }, + { + "epoch": 1.0193788160339794, + "grad_norm": 0.5971529182156567, + "learning_rate": 0.0001017194756008293, + "loss": 12.27, + "step": 18720 + }, + { + "epoch": 1.0194332700305624, + "grad_norm": 0.5185990422077906, + "learning_rate": 0.00010171065863968889, + "loss": 12.0221, + "step": 18721 + }, + { + "epoch": 1.0194877240271454, + "grad_norm": 0.5241390327715381, + "learning_rate": 0.00010170184166524612, + "loss": 12.1332, + "step": 18722 + }, + { + "epoch": 1.0195421780237284, + "grad_norm": 0.5010651696477758, + "learning_rate": 0.00010169302467756953, + "loss": 12.0712, + "step": 18723 + }, + { + "epoch": 1.0195966320203114, + "grad_norm": 0.49437860750760554, + "learning_rate": 0.0001016842076767277, + "loss": 12.1145, + "step": 18724 + }, + { + "epoch": 1.0196510860168944, + "grad_norm": 0.6015474759820658, + "learning_rate": 0.00010167539066278919, + "loss": 12.1264, + "step": 18725 + }, + { + "epoch": 1.0197055400134774, + "grad_norm": 0.5337388508705605, + "learning_rate": 0.00010166657363582257, + "loss": 12.0994, + "step": 18726 + }, + { + "epoch": 1.0197599940100603, + "grad_norm": 0.5894897986921385, + "learning_rate": 0.00010165775659589639, + "loss": 12.1271, + "step": 18727 + }, + { + "epoch": 1.0198144480066433, + "grad_norm": 0.5502165812525377, + "learning_rate": 0.00010164893954307919, + "loss": 12.0668, + "step": 18728 + }, + { + "epoch": 1.0198689020032263, + "grad_norm": 0.5448571962174019, + "learning_rate": 0.00010164012247743959, + "loss": 12.0644, + "step": 18729 + }, + { + "epoch": 1.0199233559998093, + "grad_norm": 0.590115771224414, + "learning_rate": 0.0001016313053990461, + "loss": 12.1188, + "step": 18730 + }, + { + "epoch": 1.0199778099963923, + "grad_norm": 0.5500449590839104, + "learning_rate": 0.00010162248830796733, + "loss": 12.0466, + "step": 18731 + }, + { + "epoch": 1.0200322639929755, + "grad_norm": 0.5412151787779529, + "learning_rate": 0.00010161367120427181, + "loss": 12.033, + "step": 18732 + }, + { + "epoch": 1.0200867179895585, + "grad_norm": 0.6254430498979263, + "learning_rate": 0.00010160485408802811, + "loss": 12.2435, + "step": 18733 + }, + { + "epoch": 1.0201411719861415, + "grad_norm": 0.5336524786885203, + "learning_rate": 0.00010159603695930479, + "loss": 12.0075, + "step": 18734 + }, + { + "epoch": 1.0201956259827245, + "grad_norm": 0.5837363686218118, + "learning_rate": 0.00010158721981817044, + "loss": 12.0391, + "step": 18735 + }, + { + "epoch": 1.0202500799793075, + "grad_norm": 0.6079896229766106, + "learning_rate": 0.00010157840266469359, + "loss": 12.0356, + "step": 18736 + }, + { + "epoch": 1.0203045339758905, + "grad_norm": 0.5245603346550712, + "learning_rate": 0.0001015695854989428, + "loss": 12.1001, + "step": 18737 + }, + { + "epoch": 1.0203589879724735, + "grad_norm": 0.5537582378652263, + "learning_rate": 0.00010156076832098666, + "loss": 11.9315, + "step": 18738 + }, + { + "epoch": 1.0204134419690565, + "grad_norm": 0.5868965655185666, + "learning_rate": 0.00010155195113089373, + "loss": 12.0819, + "step": 18739 + }, + { + "epoch": 1.0204678959656395, + "grad_norm": 0.5751484799267724, + "learning_rate": 0.00010154313392873257, + "loss": 12.0688, + "step": 18740 + }, + { + "epoch": 1.0205223499622225, + "grad_norm": 0.5558567163544844, + "learning_rate": 0.00010153431671457174, + "loss": 12.0395, + "step": 18741 + }, + { + "epoch": 1.0205768039588055, + "grad_norm": 0.5417134858944368, + "learning_rate": 0.00010152549948847982, + "loss": 11.9737, + "step": 18742 + }, + { + "epoch": 1.0206312579553887, + "grad_norm": 0.5436549049238402, + "learning_rate": 0.0001015166822505253, + "loss": 12.0564, + "step": 18743 + }, + { + "epoch": 1.0206857119519717, + "grad_norm": 0.5613973847463412, + "learning_rate": 0.00010150786500077687, + "loss": 12.0953, + "step": 18744 + }, + { + "epoch": 1.0207401659485547, + "grad_norm": 0.6256121230482002, + "learning_rate": 0.00010149904773930301, + "loss": 12.0651, + "step": 18745 + }, + { + "epoch": 1.0207946199451376, + "grad_norm": 0.563324983194387, + "learning_rate": 0.0001014902304661723, + "loss": 11.9725, + "step": 18746 + }, + { + "epoch": 1.0208490739417206, + "grad_norm": 0.5449957795585058, + "learning_rate": 0.00010148141318145333, + "loss": 12.0534, + "step": 18747 + }, + { + "epoch": 1.0209035279383036, + "grad_norm": 0.5355158319275114, + "learning_rate": 0.00010147259588521458, + "loss": 12.0776, + "step": 18748 + }, + { + "epoch": 1.0209579819348866, + "grad_norm": 0.5237347803738321, + "learning_rate": 0.00010146377857752472, + "loss": 12.1556, + "step": 18749 + }, + { + "epoch": 1.0210124359314696, + "grad_norm": 0.6142868047989184, + "learning_rate": 0.00010145496125845227, + "loss": 12.0608, + "step": 18750 + }, + { + "epoch": 1.0210668899280526, + "grad_norm": 0.5820946703122036, + "learning_rate": 0.0001014461439280658, + "loss": 11.9877, + "step": 18751 + }, + { + "epoch": 1.0211213439246356, + "grad_norm": 0.5053531335963286, + "learning_rate": 0.0001014373265864339, + "loss": 11.9562, + "step": 18752 + }, + { + "epoch": 1.0211757979212186, + "grad_norm": 0.6387350861190165, + "learning_rate": 0.00010142850923362505, + "loss": 12.2674, + "step": 18753 + }, + { + "epoch": 1.0212302519178016, + "grad_norm": 0.555291955522916, + "learning_rate": 0.0001014196918697079, + "loss": 12.0657, + "step": 18754 + }, + { + "epoch": 1.0212847059143848, + "grad_norm": 0.5306074304395978, + "learning_rate": 0.00010141087449475098, + "loss": 11.9807, + "step": 18755 + }, + { + "epoch": 1.0213391599109678, + "grad_norm": 0.6337599413581059, + "learning_rate": 0.00010140205710882287, + "loss": 12.1726, + "step": 18756 + }, + { + "epoch": 1.0213936139075508, + "grad_norm": 0.5208889541758751, + "learning_rate": 0.00010139323971199216, + "loss": 11.9897, + "step": 18757 + }, + { + "epoch": 1.0214480679041338, + "grad_norm": 0.5865241119324267, + "learning_rate": 0.00010138442230432735, + "loss": 11.9329, + "step": 18758 + }, + { + "epoch": 1.0215025219007168, + "grad_norm": 0.5366785199446498, + "learning_rate": 0.00010137560488589701, + "loss": 12.0962, + "step": 18759 + }, + { + "epoch": 1.0215569758972998, + "grad_norm": 0.5554973337117057, + "learning_rate": 0.00010136678745676977, + "loss": 12.098, + "step": 18760 + }, + { + "epoch": 1.0216114298938828, + "grad_norm": 0.5298973012350927, + "learning_rate": 0.00010135797001701417, + "loss": 12.1539, + "step": 18761 + }, + { + "epoch": 1.0216658838904658, + "grad_norm": 0.589512390220508, + "learning_rate": 0.00010134915256669878, + "loss": 12.0498, + "step": 18762 + }, + { + "epoch": 1.0217203378870487, + "grad_norm": 0.5424917531098438, + "learning_rate": 0.00010134033510589213, + "loss": 11.9644, + "step": 18763 + }, + { + "epoch": 1.0217747918836317, + "grad_norm": 0.5703553316374649, + "learning_rate": 0.00010133151763466282, + "loss": 12.0504, + "step": 18764 + }, + { + "epoch": 1.0218292458802147, + "grad_norm": 0.4877443637565837, + "learning_rate": 0.00010132270015307937, + "loss": 12.0059, + "step": 18765 + }, + { + "epoch": 1.0218836998767977, + "grad_norm": 0.5437625770251306, + "learning_rate": 0.00010131388266121041, + "loss": 12.0672, + "step": 18766 + }, + { + "epoch": 1.021938153873381, + "grad_norm": 0.5800226829041949, + "learning_rate": 0.0001013050651591245, + "loss": 12.0847, + "step": 18767 + }, + { + "epoch": 1.021992607869964, + "grad_norm": 0.5554761728053199, + "learning_rate": 0.00010129624764689016, + "loss": 12.0638, + "step": 18768 + }, + { + "epoch": 1.022047061866547, + "grad_norm": 0.5443212101981026, + "learning_rate": 0.00010128743012457598, + "loss": 11.9379, + "step": 18769 + }, + { + "epoch": 1.02210151586313, + "grad_norm": 0.5293439211571511, + "learning_rate": 0.00010127861259225053, + "loss": 12.105, + "step": 18770 + }, + { + "epoch": 1.022155969859713, + "grad_norm": 0.5387399325359294, + "learning_rate": 0.00010126979504998235, + "loss": 12.1094, + "step": 18771 + }, + { + "epoch": 1.022210423856296, + "grad_norm": 0.5950910700409785, + "learning_rate": 0.00010126097749784007, + "loss": 12.0754, + "step": 18772 + }, + { + "epoch": 1.022264877852879, + "grad_norm": 0.57269072448735, + "learning_rate": 0.0001012521599358922, + "loss": 11.9847, + "step": 18773 + }, + { + "epoch": 1.0223193318494619, + "grad_norm": 0.4811092820120146, + "learning_rate": 0.00010124334236420734, + "loss": 11.9244, + "step": 18774 + }, + { + "epoch": 1.0223737858460449, + "grad_norm": 0.5758684597581273, + "learning_rate": 0.00010123452478285403, + "loss": 12.1405, + "step": 18775 + }, + { + "epoch": 1.0224282398426279, + "grad_norm": 0.5886928417106222, + "learning_rate": 0.0001012257071919008, + "loss": 11.9732, + "step": 18776 + }, + { + "epoch": 1.0224826938392109, + "grad_norm": 0.5781547324240595, + "learning_rate": 0.00010121688959141635, + "loss": 12.1145, + "step": 18777 + }, + { + "epoch": 1.022537147835794, + "grad_norm": 0.6275044757931225, + "learning_rate": 0.00010120807198146914, + "loss": 12.2266, + "step": 18778 + }, + { + "epoch": 1.022591601832377, + "grad_norm": 0.494586053572155, + "learning_rate": 0.00010119925436212772, + "loss": 12.0316, + "step": 18779 + }, + { + "epoch": 1.02264605582896, + "grad_norm": 0.587922190618471, + "learning_rate": 0.00010119043673346071, + "loss": 12.0634, + "step": 18780 + }, + { + "epoch": 1.022700509825543, + "grad_norm": 0.5239084659528589, + "learning_rate": 0.0001011816190955367, + "loss": 12.0887, + "step": 18781 + }, + { + "epoch": 1.022754963822126, + "grad_norm": 0.516350361404884, + "learning_rate": 0.00010117280144842419, + "loss": 12.0924, + "step": 18782 + }, + { + "epoch": 1.022809417818709, + "grad_norm": 0.552523659378862, + "learning_rate": 0.00010116398379219179, + "loss": 12.1637, + "step": 18783 + }, + { + "epoch": 1.022863871815292, + "grad_norm": 0.5703124789098535, + "learning_rate": 0.00010115516612690805, + "loss": 12.0717, + "step": 18784 + }, + { + "epoch": 1.022918325811875, + "grad_norm": 0.5249871002893897, + "learning_rate": 0.00010114634845264155, + "loss": 12.1681, + "step": 18785 + }, + { + "epoch": 1.022972779808458, + "grad_norm": 0.5586287033768093, + "learning_rate": 0.00010113753076946084, + "loss": 12.0204, + "step": 18786 + }, + { + "epoch": 1.023027233805041, + "grad_norm": 0.5640718775388597, + "learning_rate": 0.0001011287130774345, + "loss": 12.1506, + "step": 18787 + }, + { + "epoch": 1.023081687801624, + "grad_norm": 0.6064181594035376, + "learning_rate": 0.0001011198953766311, + "loss": 12.2754, + "step": 18788 + }, + { + "epoch": 1.023136141798207, + "grad_norm": 0.6162582884740045, + "learning_rate": 0.00010111107766711922, + "loss": 12.0792, + "step": 18789 + }, + { + "epoch": 1.0231905957947902, + "grad_norm": 0.5912703805811506, + "learning_rate": 0.0001011022599489674, + "loss": 12.0633, + "step": 18790 + }, + { + "epoch": 1.0232450497913732, + "grad_norm": 0.5679337295473877, + "learning_rate": 0.00010109344222224425, + "loss": 12.0239, + "step": 18791 + }, + { + "epoch": 1.0232995037879562, + "grad_norm": 0.661333811706787, + "learning_rate": 0.00010108462448701827, + "loss": 12.0685, + "step": 18792 + }, + { + "epoch": 1.0233539577845392, + "grad_norm": 0.5829655380196324, + "learning_rate": 0.0001010758067433581, + "loss": 12.16, + "step": 18793 + }, + { + "epoch": 1.0234084117811222, + "grad_norm": 0.5264206204636439, + "learning_rate": 0.00010106698899133227, + "loss": 12.02, + "step": 18794 + }, + { + "epoch": 1.0234628657777052, + "grad_norm": 0.5469072976668753, + "learning_rate": 0.00010105817123100933, + "loss": 11.9848, + "step": 18795 + }, + { + "epoch": 1.0235173197742882, + "grad_norm": 0.5762653358726504, + "learning_rate": 0.0001010493534624579, + "loss": 12.1183, + "step": 18796 + }, + { + "epoch": 1.0235717737708712, + "grad_norm": 0.5336393034489133, + "learning_rate": 0.0001010405356857465, + "loss": 11.9188, + "step": 18797 + }, + { + "epoch": 1.0236262277674542, + "grad_norm": 0.5976938336576505, + "learning_rate": 0.00010103171790094375, + "loss": 12.0602, + "step": 18798 + }, + { + "epoch": 1.0236806817640371, + "grad_norm": 0.5245667416753186, + "learning_rate": 0.00010102290010811816, + "loss": 12.0888, + "step": 18799 + }, + { + "epoch": 1.0237351357606201, + "grad_norm": 0.5591741725535495, + "learning_rate": 0.00010101408230733833, + "loss": 12.0078, + "step": 18800 + }, + { + "epoch": 1.0237895897572034, + "grad_norm": 0.555771938556028, + "learning_rate": 0.00010100526449867284, + "loss": 12.2226, + "step": 18801 + }, + { + "epoch": 1.0238440437537863, + "grad_norm": 0.5595947083970808, + "learning_rate": 0.00010099644668219027, + "loss": 12.1012, + "step": 18802 + }, + { + "epoch": 1.0238984977503693, + "grad_norm": 0.604175891713645, + "learning_rate": 0.00010098762885795909, + "loss": 12.1059, + "step": 18803 + }, + { + "epoch": 1.0239529517469523, + "grad_norm": 0.5114344969036151, + "learning_rate": 0.00010097881102604798, + "loss": 12.0845, + "step": 18804 + }, + { + "epoch": 1.0240074057435353, + "grad_norm": 0.5097780349521573, + "learning_rate": 0.0001009699931865255, + "loss": 12.03, + "step": 18805 + }, + { + "epoch": 1.0240618597401183, + "grad_norm": 0.530155162660626, + "learning_rate": 0.00010096117533946014, + "loss": 12.0368, + "step": 18806 + }, + { + "epoch": 1.0241163137367013, + "grad_norm": 0.589263072886528, + "learning_rate": 0.00010095235748492058, + "loss": 11.9318, + "step": 18807 + }, + { + "epoch": 1.0241707677332843, + "grad_norm": 0.5603419836453702, + "learning_rate": 0.00010094353962297526, + "loss": 12.0558, + "step": 18808 + }, + { + "epoch": 1.0242252217298673, + "grad_norm": 0.5514549651704579, + "learning_rate": 0.00010093472175369286, + "loss": 12.0309, + "step": 18809 + }, + { + "epoch": 1.0242796757264503, + "grad_norm": 0.5324272182449006, + "learning_rate": 0.00010092590387714189, + "loss": 12.1358, + "step": 18810 + }, + { + "epoch": 1.0243341297230333, + "grad_norm": 0.5853805115201112, + "learning_rate": 0.00010091708599339095, + "loss": 11.9852, + "step": 18811 + }, + { + "epoch": 1.0243885837196163, + "grad_norm": 0.4729042685800001, + "learning_rate": 0.00010090826810250862, + "loss": 11.9887, + "step": 18812 + }, + { + "epoch": 1.0244430377161995, + "grad_norm": 0.6148014443636458, + "learning_rate": 0.00010089945020456342, + "loss": 12.0331, + "step": 18813 + }, + { + "epoch": 1.0244974917127825, + "grad_norm": 0.635639106078519, + "learning_rate": 0.0001008906322996239, + "loss": 12.1934, + "step": 18814 + }, + { + "epoch": 1.0245519457093655, + "grad_norm": 0.5633292596161977, + "learning_rate": 0.00010088181438775873, + "loss": 12.0455, + "step": 18815 + }, + { + "epoch": 1.0246063997059485, + "grad_norm": 0.5560741591155846, + "learning_rate": 0.00010087299646903639, + "loss": 12.0702, + "step": 18816 + }, + { + "epoch": 1.0246608537025315, + "grad_norm": 0.5651794701755701, + "learning_rate": 0.00010086417854352552, + "loss": 11.9584, + "step": 18817 + }, + { + "epoch": 1.0247153076991145, + "grad_norm": 0.7044712598019117, + "learning_rate": 0.00010085536061129463, + "loss": 12.2901, + "step": 18818 + }, + { + "epoch": 1.0247697616956974, + "grad_norm": 0.5552101311602395, + "learning_rate": 0.00010084654267241231, + "loss": 12.1134, + "step": 18819 + }, + { + "epoch": 1.0248242156922804, + "grad_norm": 0.5905702158058971, + "learning_rate": 0.00010083772472694713, + "loss": 12.0672, + "step": 18820 + }, + { + "epoch": 1.0248786696888634, + "grad_norm": 0.5573521983669069, + "learning_rate": 0.00010082890677496766, + "loss": 12.0905, + "step": 18821 + }, + { + "epoch": 1.0249331236854464, + "grad_norm": 0.6824700731125648, + "learning_rate": 0.0001008200888165425, + "loss": 12.2232, + "step": 18822 + }, + { + "epoch": 1.0249875776820294, + "grad_norm": 0.5409647664250958, + "learning_rate": 0.00010081127085174019, + "loss": 12.1013, + "step": 18823 + }, + { + "epoch": 1.0250420316786124, + "grad_norm": 0.5736042445755989, + "learning_rate": 0.00010080245288062928, + "loss": 12.0641, + "step": 18824 + }, + { + "epoch": 1.0250964856751956, + "grad_norm": 0.6756140383975451, + "learning_rate": 0.00010079363490327833, + "loss": 12.0934, + "step": 18825 + }, + { + "epoch": 1.0251509396717786, + "grad_norm": 0.536914119748739, + "learning_rate": 0.00010078481691975599, + "loss": 11.9342, + "step": 18826 + }, + { + "epoch": 1.0252053936683616, + "grad_norm": 0.5894137031872616, + "learning_rate": 0.00010077599893013079, + "loss": 12.0131, + "step": 18827 + }, + { + "epoch": 1.0252598476649446, + "grad_norm": 0.5786068936267161, + "learning_rate": 0.00010076718093447126, + "loss": 12.047, + "step": 18828 + }, + { + "epoch": 1.0253143016615276, + "grad_norm": 0.5762402343191538, + "learning_rate": 0.00010075836293284602, + "loss": 12.0729, + "step": 18829 + }, + { + "epoch": 1.0253687556581106, + "grad_norm": 0.6076074032551696, + "learning_rate": 0.00010074954492532362, + "loss": 12.1409, + "step": 18830 + }, + { + "epoch": 1.0254232096546936, + "grad_norm": 0.5985679684566773, + "learning_rate": 0.0001007407269119726, + "loss": 12.1766, + "step": 18831 + }, + { + "epoch": 1.0254776636512766, + "grad_norm": 0.5925485044810468, + "learning_rate": 0.00010073190889286164, + "loss": 12.2808, + "step": 18832 + }, + { + "epoch": 1.0255321176478596, + "grad_norm": 0.5589669324917635, + "learning_rate": 0.00010072309086805918, + "loss": 12.131, + "step": 18833 + }, + { + "epoch": 1.0255865716444426, + "grad_norm": 0.5891691454120911, + "learning_rate": 0.00010071427283763385, + "loss": 12.1005, + "step": 18834 + }, + { + "epoch": 1.0256410256410255, + "grad_norm": 0.5507771665067509, + "learning_rate": 0.00010070545480165422, + "loss": 12.1302, + "step": 18835 + }, + { + "epoch": 1.0256954796376085, + "grad_norm": 0.5988299559898301, + "learning_rate": 0.0001006966367601888, + "loss": 12.0119, + "step": 18836 + }, + { + "epoch": 1.0257499336341918, + "grad_norm": 0.6153467427206969, + "learning_rate": 0.0001006878187133063, + "loss": 12.1839, + "step": 18837 + }, + { + "epoch": 1.0258043876307747, + "grad_norm": 0.5110678266933942, + "learning_rate": 0.00010067900066107519, + "loss": 12.0238, + "step": 18838 + }, + { + "epoch": 1.0258588416273577, + "grad_norm": 0.5701816809723554, + "learning_rate": 0.00010067018260356402, + "loss": 12.0576, + "step": 18839 + }, + { + "epoch": 1.0259132956239407, + "grad_norm": 0.5255673439636622, + "learning_rate": 0.00010066136454084143, + "loss": 11.9858, + "step": 18840 + }, + { + "epoch": 1.0259677496205237, + "grad_norm": 0.5161129919900771, + "learning_rate": 0.00010065254647297595, + "loss": 11.9873, + "step": 18841 + }, + { + "epoch": 1.0260222036171067, + "grad_norm": 0.5421377360540655, + "learning_rate": 0.00010064372840003615, + "loss": 12.1362, + "step": 18842 + }, + { + "epoch": 1.0260766576136897, + "grad_norm": 0.5521988833133786, + "learning_rate": 0.00010063491032209063, + "loss": 12.0913, + "step": 18843 + }, + { + "epoch": 1.0261311116102727, + "grad_norm": 0.5272891891303668, + "learning_rate": 0.00010062609223920791, + "loss": 12.1025, + "step": 18844 + }, + { + "epoch": 1.0261855656068557, + "grad_norm": 0.5198128569718347, + "learning_rate": 0.00010061727415145661, + "loss": 12.0384, + "step": 18845 + }, + { + "epoch": 1.0262400196034387, + "grad_norm": 0.6355999518648248, + "learning_rate": 0.00010060845605890528, + "loss": 12.0885, + "step": 18846 + }, + { + "epoch": 1.0262944736000217, + "grad_norm": 0.5310977995229271, + "learning_rate": 0.00010059963796162251, + "loss": 12.136, + "step": 18847 + }, + { + "epoch": 1.026348927596605, + "grad_norm": 0.5305271542221838, + "learning_rate": 0.00010059081985967682, + "loss": 12.1, + "step": 18848 + }, + { + "epoch": 1.026403381593188, + "grad_norm": 0.5428545717662631, + "learning_rate": 0.00010058200175313684, + "loss": 12.0657, + "step": 18849 + }, + { + "epoch": 1.0264578355897709, + "grad_norm": 0.5361102279776829, + "learning_rate": 0.00010057318364207111, + "loss": 12.0446, + "step": 18850 + }, + { + "epoch": 1.0265122895863539, + "grad_norm": 0.5577618963549263, + "learning_rate": 0.00010056436552654822, + "loss": 12.1281, + "step": 18851 + }, + { + "epoch": 1.0265667435829369, + "grad_norm": 0.5283476355851188, + "learning_rate": 0.0001005555474066367, + "loss": 12.0621, + "step": 18852 + }, + { + "epoch": 1.0266211975795199, + "grad_norm": 0.6257579941479259, + "learning_rate": 0.00010054672928240519, + "loss": 12.1508, + "step": 18853 + }, + { + "epoch": 1.0266756515761029, + "grad_norm": 0.603240107097262, + "learning_rate": 0.0001005379111539222, + "loss": 12.156, + "step": 18854 + }, + { + "epoch": 1.0267301055726858, + "grad_norm": 0.5314831448689977, + "learning_rate": 0.0001005290930212563, + "loss": 12.087, + "step": 18855 + }, + { + "epoch": 1.0267845595692688, + "grad_norm": 0.5783292631175788, + "learning_rate": 0.00010052027488447612, + "loss": 12.0822, + "step": 18856 + }, + { + "epoch": 1.0268390135658518, + "grad_norm": 0.5412065536058184, + "learning_rate": 0.00010051145674365019, + "loss": 12.103, + "step": 18857 + }, + { + "epoch": 1.0268934675624348, + "grad_norm": 0.5276334499808119, + "learning_rate": 0.00010050263859884708, + "loss": 12.1075, + "step": 18858 + }, + { + "epoch": 1.0269479215590178, + "grad_norm": 0.5480083335352859, + "learning_rate": 0.00010049382045013536, + "loss": 12.0958, + "step": 18859 + }, + { + "epoch": 1.027002375555601, + "grad_norm": 0.5588973912889202, + "learning_rate": 0.00010048500229758362, + "loss": 12.0568, + "step": 18860 + }, + { + "epoch": 1.027056829552184, + "grad_norm": 0.6231732824018359, + "learning_rate": 0.0001004761841412604, + "loss": 12.0312, + "step": 18861 + }, + { + "epoch": 1.027111283548767, + "grad_norm": 0.5221917786578817, + "learning_rate": 0.00010046736598123432, + "loss": 11.9448, + "step": 18862 + }, + { + "epoch": 1.02716573754535, + "grad_norm": 0.585005556567255, + "learning_rate": 0.00010045854781757392, + "loss": 12.1332, + "step": 18863 + }, + { + "epoch": 1.027220191541933, + "grad_norm": 0.5552817641655828, + "learning_rate": 0.00010044972965034775, + "loss": 11.9784, + "step": 18864 + }, + { + "epoch": 1.027274645538516, + "grad_norm": 0.5862540149650539, + "learning_rate": 0.00010044091147962442, + "loss": 12.1105, + "step": 18865 + }, + { + "epoch": 1.027329099535099, + "grad_norm": 0.5233113635404432, + "learning_rate": 0.0001004320933054725, + "loss": 12.0754, + "step": 18866 + }, + { + "epoch": 1.027383553531682, + "grad_norm": 0.6066314094699042, + "learning_rate": 0.00010042327512796055, + "loss": 12.1417, + "step": 18867 + }, + { + "epoch": 1.027438007528265, + "grad_norm": 0.5895423082417945, + "learning_rate": 0.00010041445694715716, + "loss": 11.9777, + "step": 18868 + }, + { + "epoch": 1.027492461524848, + "grad_norm": 0.5582136229584224, + "learning_rate": 0.00010040563876313082, + "loss": 12.0938, + "step": 18869 + }, + { + "epoch": 1.027546915521431, + "grad_norm": 0.746659076517137, + "learning_rate": 0.00010039682057595021, + "loss": 12.1253, + "step": 18870 + }, + { + "epoch": 1.0276013695180142, + "grad_norm": 0.5547433612075412, + "learning_rate": 0.00010038800238568384, + "loss": 12.044, + "step": 18871 + }, + { + "epoch": 1.0276558235145972, + "grad_norm": 0.6296806052435052, + "learning_rate": 0.00010037918419240033, + "loss": 12.1935, + "step": 18872 + }, + { + "epoch": 1.0277102775111802, + "grad_norm": 0.5703231434418233, + "learning_rate": 0.0001003703659961682, + "loss": 12.1033, + "step": 18873 + }, + { + "epoch": 1.0277647315077632, + "grad_norm": 0.6215786874718152, + "learning_rate": 0.00010036154779705602, + "loss": 12.0414, + "step": 18874 + }, + { + "epoch": 1.0278191855043461, + "grad_norm": 0.5927630134487893, + "learning_rate": 0.00010035272959513243, + "loss": 12.06, + "step": 18875 + }, + { + "epoch": 1.0278736395009291, + "grad_norm": 0.6420508848971314, + "learning_rate": 0.0001003439113904659, + "loss": 12.0871, + "step": 18876 + }, + { + "epoch": 1.0279280934975121, + "grad_norm": 0.5103978927575662, + "learning_rate": 0.00010033509318312511, + "loss": 12.0255, + "step": 18877 + }, + { + "epoch": 1.0279825474940951, + "grad_norm": 0.552259339180842, + "learning_rate": 0.00010032627497317857, + "loss": 12.0814, + "step": 18878 + }, + { + "epoch": 1.0280370014906781, + "grad_norm": 0.5748723134775683, + "learning_rate": 0.00010031745676069484, + "loss": 12.0785, + "step": 18879 + }, + { + "epoch": 1.028091455487261, + "grad_norm": 0.6033855987766538, + "learning_rate": 0.0001003086385457425, + "loss": 12.0556, + "step": 18880 + }, + { + "epoch": 1.028145909483844, + "grad_norm": 0.5400001704991296, + "learning_rate": 0.00010029982032839016, + "loss": 12.1273, + "step": 18881 + }, + { + "epoch": 1.028200363480427, + "grad_norm": 0.5946102420376946, + "learning_rate": 0.00010029100210870636, + "loss": 12.0104, + "step": 18882 + }, + { + "epoch": 1.0282548174770103, + "grad_norm": 0.5710579850850007, + "learning_rate": 0.0001002821838867597, + "loss": 12.0413, + "step": 18883 + }, + { + "epoch": 1.0283092714735933, + "grad_norm": 0.5396354673695882, + "learning_rate": 0.00010027336566261871, + "loss": 11.9725, + "step": 18884 + }, + { + "epoch": 1.0283637254701763, + "grad_norm": 0.5580340497124632, + "learning_rate": 0.00010026454743635196, + "loss": 12.0463, + "step": 18885 + }, + { + "epoch": 1.0284181794667593, + "grad_norm": 0.5317955365880455, + "learning_rate": 0.00010025572920802808, + "loss": 11.9057, + "step": 18886 + }, + { + "epoch": 1.0284726334633423, + "grad_norm": 0.5387542121345305, + "learning_rate": 0.00010024691097771559, + "loss": 12.178, + "step": 18887 + }, + { + "epoch": 1.0285270874599253, + "grad_norm": 0.5915490733558898, + "learning_rate": 0.0001002380927454831, + "loss": 12.0586, + "step": 18888 + }, + { + "epoch": 1.0285815414565083, + "grad_norm": 0.5333617342298406, + "learning_rate": 0.00010022927451139915, + "loss": 12.0333, + "step": 18889 + }, + { + "epoch": 1.0286359954530913, + "grad_norm": 0.5935844742153724, + "learning_rate": 0.00010022045627553232, + "loss": 12.1081, + "step": 18890 + }, + { + "epoch": 1.0286904494496742, + "grad_norm": 0.6277507616477012, + "learning_rate": 0.00010021163803795115, + "loss": 12.0982, + "step": 18891 + }, + { + "epoch": 1.0287449034462572, + "grad_norm": 0.5572110632178765, + "learning_rate": 0.00010020281979872431, + "loss": 12.1265, + "step": 18892 + }, + { + "epoch": 1.0287993574428402, + "grad_norm": 0.5242220196092897, + "learning_rate": 0.00010019400155792029, + "loss": 12.0825, + "step": 18893 + }, + { + "epoch": 1.0288538114394232, + "grad_norm": 0.5328799708456232, + "learning_rate": 0.00010018518331560767, + "loss": 11.9993, + "step": 18894 + }, + { + "epoch": 1.0289082654360064, + "grad_norm": 0.5325890831838296, + "learning_rate": 0.00010017636507185504, + "loss": 12.1233, + "step": 18895 + }, + { + "epoch": 1.0289627194325894, + "grad_norm": 0.5734016280637351, + "learning_rate": 0.00010016754682673096, + "loss": 11.9025, + "step": 18896 + }, + { + "epoch": 1.0290171734291724, + "grad_norm": 0.6176216690609465, + "learning_rate": 0.00010015872858030397, + "loss": 12.2049, + "step": 18897 + }, + { + "epoch": 1.0290716274257554, + "grad_norm": 0.6092051065623323, + "learning_rate": 0.00010014991033264274, + "loss": 12.2688, + "step": 18898 + }, + { + "epoch": 1.0291260814223384, + "grad_norm": 0.508883902672158, + "learning_rate": 0.00010014109208381577, + "loss": 12.005, + "step": 18899 + }, + { + "epoch": 1.0291805354189214, + "grad_norm": 0.5493388982078555, + "learning_rate": 0.00010013227383389163, + "loss": 12.0163, + "step": 18900 + }, + { + "epoch": 1.0292349894155044, + "grad_norm": 0.5805459862865998, + "learning_rate": 0.00010012345558293892, + "loss": 12.0681, + "step": 18901 + }, + { + "epoch": 1.0292894434120874, + "grad_norm": 0.5456664730677401, + "learning_rate": 0.00010011463733102615, + "loss": 11.9971, + "step": 18902 + }, + { + "epoch": 1.0293438974086704, + "grad_norm": 0.5602136876295947, + "learning_rate": 0.00010010581907822201, + "loss": 12.0039, + "step": 18903 + }, + { + "epoch": 1.0293983514052534, + "grad_norm": 0.5162878149882081, + "learning_rate": 0.00010009700082459496, + "loss": 12.0213, + "step": 18904 + }, + { + "epoch": 1.0294528054018364, + "grad_norm": 0.5679069808387381, + "learning_rate": 0.00010008818257021363, + "loss": 12.2285, + "step": 18905 + }, + { + "epoch": 1.0295072593984194, + "grad_norm": 0.5750738974008015, + "learning_rate": 0.0001000793643151466, + "loss": 12.1621, + "step": 18906 + }, + { + "epoch": 1.0295617133950026, + "grad_norm": 0.5284897092297695, + "learning_rate": 0.00010007054605946237, + "loss": 11.9923, + "step": 18907 + }, + { + "epoch": 1.0296161673915856, + "grad_norm": 0.5325368772512918, + "learning_rate": 0.0001000617278032296, + "loss": 12.012, + "step": 18908 + }, + { + "epoch": 1.0296706213881686, + "grad_norm": 0.6267462450949988, + "learning_rate": 0.00010005290954651681, + "loss": 11.9871, + "step": 18909 + }, + { + "epoch": 1.0297250753847516, + "grad_norm": 0.632236511368181, + "learning_rate": 0.00010004409128939258, + "loss": 12.2269, + "step": 18910 + }, + { + "epoch": 1.0297795293813345, + "grad_norm": 0.6192054578441146, + "learning_rate": 0.0001000352730319255, + "loss": 12.0524, + "step": 18911 + }, + { + "epoch": 1.0298339833779175, + "grad_norm": 0.5384892591463919, + "learning_rate": 0.00010002645477418413, + "loss": 11.9744, + "step": 18912 + }, + { + "epoch": 1.0298884373745005, + "grad_norm": 0.5472096567294882, + "learning_rate": 0.00010001763651623706, + "loss": 11.8775, + "step": 18913 + }, + { + "epoch": 1.0299428913710835, + "grad_norm": 0.5666239111993764, + "learning_rate": 0.00010000881825815283, + "loss": 11.9846, + "step": 18914 + }, + { + "epoch": 1.0299973453676665, + "grad_norm": 0.5516408627753739, + "learning_rate": 0.0001, + "loss": 12.0715, + "step": 18915 + }, + { + "epoch": 1.0300517993642495, + "grad_norm": 0.5649090063726684, + "learning_rate": 9.999118174184721e-05, + "loss": 12.0079, + "step": 18916 + }, + { + "epoch": 1.0301062533608325, + "grad_norm": 0.6835071626704871, + "learning_rate": 9.998236348376298e-05, + "loss": 12.1264, + "step": 18917 + }, + { + "epoch": 1.0301607073574157, + "grad_norm": 0.6434222873091657, + "learning_rate": 9.997354522581588e-05, + "loss": 11.988, + "step": 18918 + }, + { + "epoch": 1.0302151613539987, + "grad_norm": 0.52579603385314, + "learning_rate": 9.996472696807449e-05, + "loss": 12.023, + "step": 18919 + }, + { + "epoch": 1.0302696153505817, + "grad_norm": 0.5374831986440027, + "learning_rate": 9.99559087106074e-05, + "loss": 11.9577, + "step": 18920 + }, + { + "epoch": 1.0303240693471647, + "grad_norm": 0.6623159749593205, + "learning_rate": 9.994709045348323e-05, + "loss": 12.0303, + "step": 18921 + }, + { + "epoch": 1.0303785233437477, + "grad_norm": 0.5375802379687675, + "learning_rate": 9.993827219677044e-05, + "loss": 12.0215, + "step": 18922 + }, + { + "epoch": 1.0304329773403307, + "grad_norm": 0.5363274067085633, + "learning_rate": 9.992945394053764e-05, + "loss": 12.1377, + "step": 18923 + }, + { + "epoch": 1.0304874313369137, + "grad_norm": 0.5387464707574885, + "learning_rate": 9.992063568485344e-05, + "loss": 12.1371, + "step": 18924 + }, + { + "epoch": 1.0305418853334967, + "grad_norm": 0.5193837127488434, + "learning_rate": 9.991181742978638e-05, + "loss": 12.1071, + "step": 18925 + }, + { + "epoch": 1.0305963393300797, + "grad_norm": 0.5706090335186389, + "learning_rate": 9.990299917540506e-05, + "loss": 12.0525, + "step": 18926 + }, + { + "epoch": 1.0306507933266627, + "grad_norm": 0.5737329560214532, + "learning_rate": 9.989418092177801e-05, + "loss": 12.0354, + "step": 18927 + }, + { + "epoch": 1.0307052473232456, + "grad_norm": 0.5472470873820268, + "learning_rate": 9.988536266897384e-05, + "loss": 11.9458, + "step": 18928 + }, + { + "epoch": 1.0307597013198286, + "grad_norm": 0.5789577964659437, + "learning_rate": 9.987654441706112e-05, + "loss": 12.2022, + "step": 18929 + }, + { + "epoch": 1.0308141553164119, + "grad_norm": 0.6027020474997213, + "learning_rate": 9.986772616610838e-05, + "loss": 12.0335, + "step": 18930 + }, + { + "epoch": 1.0308686093129948, + "grad_norm": 0.559245253812309, + "learning_rate": 9.985890791618428e-05, + "loss": 11.9799, + "step": 18931 + }, + { + "epoch": 1.0309230633095778, + "grad_norm": 0.5994855825821289, + "learning_rate": 9.985008966735731e-05, + "loss": 12.1266, + "step": 18932 + }, + { + "epoch": 1.0309775173061608, + "grad_norm": 0.5604127136129978, + "learning_rate": 9.984127141969605e-05, + "loss": 12.0725, + "step": 18933 + }, + { + "epoch": 1.0310319713027438, + "grad_norm": 0.584477924604607, + "learning_rate": 9.983245317326907e-05, + "loss": 12.0621, + "step": 18934 + }, + { + "epoch": 1.0310864252993268, + "grad_norm": 0.543314089318705, + "learning_rate": 9.9823634928145e-05, + "loss": 12.0443, + "step": 18935 + }, + { + "epoch": 1.0311408792959098, + "grad_norm": 0.6018410254304223, + "learning_rate": 9.981481668439236e-05, + "loss": 12.1504, + "step": 18936 + }, + { + "epoch": 1.0311953332924928, + "grad_norm": 0.6363326342899834, + "learning_rate": 9.980599844207974e-05, + "loss": 11.9964, + "step": 18937 + }, + { + "epoch": 1.0312497872890758, + "grad_norm": 0.5265196045905317, + "learning_rate": 9.97971802012757e-05, + "loss": 12.1074, + "step": 18938 + }, + { + "epoch": 1.0313042412856588, + "grad_norm": 0.577653919376557, + "learning_rate": 9.978836196204884e-05, + "loss": 12.0779, + "step": 18939 + }, + { + "epoch": 1.0313586952822418, + "grad_norm": 0.6271410352203819, + "learning_rate": 9.97795437244677e-05, + "loss": 12.024, + "step": 18940 + }, + { + "epoch": 1.031413149278825, + "grad_norm": 0.545593795326789, + "learning_rate": 9.977072548860089e-05, + "loss": 12.2078, + "step": 18941 + }, + { + "epoch": 1.031467603275408, + "grad_norm": 0.7223263909651635, + "learning_rate": 9.976190725451694e-05, + "loss": 12.1842, + "step": 18942 + }, + { + "epoch": 1.031522057271991, + "grad_norm": 0.5921282937729481, + "learning_rate": 9.975308902228444e-05, + "loss": 11.9853, + "step": 18943 + }, + { + "epoch": 1.031576511268574, + "grad_norm": 0.5496087592341936, + "learning_rate": 9.974427079197195e-05, + "loss": 12.0496, + "step": 18944 + }, + { + "epoch": 1.031630965265157, + "grad_norm": 0.5592203365582061, + "learning_rate": 9.973545256364806e-05, + "loss": 12.1286, + "step": 18945 + }, + { + "epoch": 1.03168541926174, + "grad_norm": 0.4917921990182803, + "learning_rate": 9.972663433738132e-05, + "loss": 12.021, + "step": 18946 + }, + { + "epoch": 1.031739873258323, + "grad_norm": 0.6193108216780243, + "learning_rate": 9.971781611324031e-05, + "loss": 12.0472, + "step": 18947 + }, + { + "epoch": 1.031794327254906, + "grad_norm": 0.6255273614029856, + "learning_rate": 9.970899789129365e-05, + "loss": 12.0162, + "step": 18948 + }, + { + "epoch": 1.031848781251489, + "grad_norm": 0.5431678806221594, + "learning_rate": 9.970017967160984e-05, + "loss": 12.0859, + "step": 18949 + }, + { + "epoch": 1.031903235248072, + "grad_norm": 0.5535537159302271, + "learning_rate": 9.96913614542575e-05, + "loss": 12.0325, + "step": 18950 + }, + { + "epoch": 1.031957689244655, + "grad_norm": 0.5821790594525964, + "learning_rate": 9.968254323930521e-05, + "loss": 12.0404, + "step": 18951 + }, + { + "epoch": 1.032012143241238, + "grad_norm": 0.5588029126194511, + "learning_rate": 9.967372502682148e-05, + "loss": 12.0284, + "step": 18952 + }, + { + "epoch": 1.0320665972378211, + "grad_norm": 0.5647439710683829, + "learning_rate": 9.966490681687493e-05, + "loss": 12.0481, + "step": 18953 + }, + { + "epoch": 1.0321210512344041, + "grad_norm": 0.5671428492368455, + "learning_rate": 9.965608860953411e-05, + "loss": 12.0792, + "step": 18954 + }, + { + "epoch": 1.0321755052309871, + "grad_norm": 0.7456889785914095, + "learning_rate": 9.964727040486761e-05, + "loss": 12.1077, + "step": 18955 + }, + { + "epoch": 1.03222995922757, + "grad_norm": 0.6149926470951985, + "learning_rate": 9.963845220294399e-05, + "loss": 12.1788, + "step": 18956 + }, + { + "epoch": 1.032284413224153, + "grad_norm": 0.5729471131619598, + "learning_rate": 9.96296340038318e-05, + "loss": 12.0537, + "step": 18957 + }, + { + "epoch": 1.032338867220736, + "grad_norm": 0.5583344837668531, + "learning_rate": 9.962081580759968e-05, + "loss": 12.2213, + "step": 18958 + }, + { + "epoch": 1.032393321217319, + "grad_norm": 0.5723927892301177, + "learning_rate": 9.961199761431614e-05, + "loss": 12.0922, + "step": 18959 + }, + { + "epoch": 1.032447775213902, + "grad_norm": 0.5593927752480166, + "learning_rate": 9.960317942404978e-05, + "loss": 12.1068, + "step": 18960 + }, + { + "epoch": 1.032502229210485, + "grad_norm": 0.5554236215804222, + "learning_rate": 9.959436123686919e-05, + "loss": 12.114, + "step": 18961 + }, + { + "epoch": 1.032556683207068, + "grad_norm": 0.5022972277314655, + "learning_rate": 9.958554305284289e-05, + "loss": 12.1123, + "step": 18962 + }, + { + "epoch": 1.032611137203651, + "grad_norm": 0.5686257319727192, + "learning_rate": 9.957672487203948e-05, + "loss": 12.0303, + "step": 18963 + }, + { + "epoch": 1.032665591200234, + "grad_norm": 0.5288932357435517, + "learning_rate": 9.956790669452751e-05, + "loss": 12.1181, + "step": 18964 + }, + { + "epoch": 1.0327200451968173, + "grad_norm": 0.5488837938781982, + "learning_rate": 9.955908852037559e-05, + "loss": 12.1645, + "step": 18965 + }, + { + "epoch": 1.0327744991934003, + "grad_norm": 0.5616232895228722, + "learning_rate": 9.955027034965226e-05, + "loss": 12.1661, + "step": 18966 + }, + { + "epoch": 1.0328289531899832, + "grad_norm": 0.7209270444507411, + "learning_rate": 9.954145218242611e-05, + "loss": 12.1788, + "step": 18967 + }, + { + "epoch": 1.0328834071865662, + "grad_norm": 0.5679915308447578, + "learning_rate": 9.953263401876568e-05, + "loss": 11.8683, + "step": 18968 + }, + { + "epoch": 1.0329378611831492, + "grad_norm": 0.5261119171360865, + "learning_rate": 9.952381585873959e-05, + "loss": 12.0068, + "step": 18969 + }, + { + "epoch": 1.0329923151797322, + "grad_norm": 0.5270703981807149, + "learning_rate": 9.95149977024164e-05, + "loss": 12.1285, + "step": 18970 + }, + { + "epoch": 1.0330467691763152, + "grad_norm": 0.5303365022994876, + "learning_rate": 9.950617954986469e-05, + "loss": 12.0673, + "step": 18971 + }, + { + "epoch": 1.0331012231728982, + "grad_norm": 0.5403425930205259, + "learning_rate": 9.949736140115295e-05, + "loss": 12.0178, + "step": 18972 + }, + { + "epoch": 1.0331556771694812, + "grad_norm": 0.5136572749686279, + "learning_rate": 9.948854325634984e-05, + "loss": 12.0827, + "step": 18973 + }, + { + "epoch": 1.0332101311660642, + "grad_norm": 0.5512873047640114, + "learning_rate": 9.94797251155239e-05, + "loss": 11.9257, + "step": 18974 + }, + { + "epoch": 1.0332645851626472, + "grad_norm": 0.5574323799652663, + "learning_rate": 9.94709069787437e-05, + "loss": 12.092, + "step": 18975 + }, + { + "epoch": 1.0333190391592302, + "grad_norm": 0.576200540979803, + "learning_rate": 9.946208884607782e-05, + "loss": 12.2042, + "step": 18976 + }, + { + "epoch": 1.0333734931558134, + "grad_norm": 0.6078327664055824, + "learning_rate": 9.945327071759484e-05, + "loss": 12.1118, + "step": 18977 + }, + { + "epoch": 1.0334279471523964, + "grad_norm": 0.5972974214912442, + "learning_rate": 9.944445259336332e-05, + "loss": 12.0486, + "step": 18978 + }, + { + "epoch": 1.0334824011489794, + "grad_norm": 0.5406042584852266, + "learning_rate": 9.943563447345179e-05, + "loss": 12.1173, + "step": 18979 + }, + { + "epoch": 1.0335368551455624, + "grad_norm": 0.5917761282289588, + "learning_rate": 9.942681635792888e-05, + "loss": 12.2366, + "step": 18980 + }, + { + "epoch": 1.0335913091421454, + "grad_norm": 0.5517644699994737, + "learning_rate": 9.94179982468632e-05, + "loss": 12.1436, + "step": 18981 + }, + { + "epoch": 1.0336457631387284, + "grad_norm": 0.6317805706296912, + "learning_rate": 9.940918014032323e-05, + "loss": 12.1843, + "step": 18982 + }, + { + "epoch": 1.0337002171353113, + "grad_norm": 0.5568818214034049, + "learning_rate": 9.940036203837753e-05, + "loss": 12.0803, + "step": 18983 + }, + { + "epoch": 1.0337546711318943, + "grad_norm": 0.5646030978093064, + "learning_rate": 9.939154394109473e-05, + "loss": 12.0041, + "step": 18984 + }, + { + "epoch": 1.0338091251284773, + "grad_norm": 0.5578569716580203, + "learning_rate": 9.938272584854341e-05, + "loss": 12.1064, + "step": 18985 + }, + { + "epoch": 1.0338635791250603, + "grad_norm": 0.4875324723148824, + "learning_rate": 9.937390776079211e-05, + "loss": 11.9227, + "step": 18986 + }, + { + "epoch": 1.0339180331216433, + "grad_norm": 0.5457733707355378, + "learning_rate": 9.936508967790941e-05, + "loss": 12.1069, + "step": 18987 + }, + { + "epoch": 1.0339724871182265, + "grad_norm": 0.5743029530179137, + "learning_rate": 9.935627159996386e-05, + "loss": 12.0204, + "step": 18988 + }, + { + "epoch": 1.0340269411148095, + "grad_norm": 0.5679480383613924, + "learning_rate": 9.934745352702406e-05, + "loss": 12.0522, + "step": 18989 + }, + { + "epoch": 1.0340813951113925, + "grad_norm": 0.6754449658587147, + "learning_rate": 9.933863545915856e-05, + "loss": 12.2621, + "step": 18990 + }, + { + "epoch": 1.0341358491079755, + "grad_norm": 0.5515893171453582, + "learning_rate": 9.9329817396436e-05, + "loss": 12.139, + "step": 18991 + }, + { + "epoch": 1.0341903031045585, + "grad_norm": 0.544087538047309, + "learning_rate": 9.932099933892486e-05, + "loss": 11.9348, + "step": 18992 + }, + { + "epoch": 1.0342447571011415, + "grad_norm": 0.5359449039020331, + "learning_rate": 9.931218128669373e-05, + "loss": 12.0435, + "step": 18993 + }, + { + "epoch": 1.0342992110977245, + "grad_norm": 0.5110707846928103, + "learning_rate": 9.93033632398112e-05, + "loss": 12.0414, + "step": 18994 + }, + { + "epoch": 1.0343536650943075, + "grad_norm": 0.4865654922492864, + "learning_rate": 9.929454519834581e-05, + "loss": 12.0509, + "step": 18995 + }, + { + "epoch": 1.0344081190908905, + "grad_norm": 0.5841905001779324, + "learning_rate": 9.928572716236616e-05, + "loss": 12.0051, + "step": 18996 + }, + { + "epoch": 1.0344625730874735, + "grad_norm": 0.5291241370961548, + "learning_rate": 9.927690913194085e-05, + "loss": 12.0463, + "step": 18997 + }, + { + "epoch": 1.0345170270840565, + "grad_norm": 0.5640243804897475, + "learning_rate": 9.92680911071384e-05, + "loss": 11.8798, + "step": 18998 + }, + { + "epoch": 1.0345714810806395, + "grad_norm": 0.542482064559819, + "learning_rate": 9.925927308802738e-05, + "loss": 12.117, + "step": 18999 + }, + { + "epoch": 1.0346259350772227, + "grad_norm": 0.5547007371518616, + "learning_rate": 9.92504550746764e-05, + "loss": 12.1063, + "step": 19000 + }, + { + "epoch": 1.0346803890738057, + "grad_norm": 0.5291276435062658, + "learning_rate": 9.924163706715402e-05, + "loss": 12.0121, + "step": 19001 + }, + { + "epoch": 1.0347348430703887, + "grad_norm": 0.54012404073616, + "learning_rate": 9.923281906552877e-05, + "loss": 12.0397, + "step": 19002 + }, + { + "epoch": 1.0347892970669716, + "grad_norm": 0.562145421207624, + "learning_rate": 9.922400106986926e-05, + "loss": 12.1357, + "step": 19003 + }, + { + "epoch": 1.0348437510635546, + "grad_norm": 0.5854892169410031, + "learning_rate": 9.921518308024403e-05, + "loss": 12.1699, + "step": 19004 + }, + { + "epoch": 1.0348982050601376, + "grad_norm": 0.5723611719763151, + "learning_rate": 9.920636509672168e-05, + "loss": 12.0045, + "step": 19005 + }, + { + "epoch": 1.0349526590567206, + "grad_norm": 0.5125566985009139, + "learning_rate": 9.919754711937074e-05, + "loss": 12.0007, + "step": 19006 + }, + { + "epoch": 1.0350071130533036, + "grad_norm": 0.500166941294284, + "learning_rate": 9.918872914825984e-05, + "loss": 11.9961, + "step": 19007 + }, + { + "epoch": 1.0350615670498866, + "grad_norm": 0.5550410627066128, + "learning_rate": 9.917991118345751e-05, + "loss": 11.9788, + "step": 19008 + }, + { + "epoch": 1.0351160210464696, + "grad_norm": 0.5116309465811975, + "learning_rate": 9.917109322503232e-05, + "loss": 12.0869, + "step": 19009 + }, + { + "epoch": 1.0351704750430526, + "grad_norm": 0.5259939680912251, + "learning_rate": 9.916227527305287e-05, + "loss": 12.066, + "step": 19010 + }, + { + "epoch": 1.0352249290396358, + "grad_norm": 0.5247178368056492, + "learning_rate": 9.915345732758772e-05, + "loss": 12.0876, + "step": 19011 + }, + { + "epoch": 1.0352793830362188, + "grad_norm": 0.5336971277089876, + "learning_rate": 9.91446393887054e-05, + "loss": 12.0686, + "step": 19012 + }, + { + "epoch": 1.0353338370328018, + "grad_norm": 0.5337462776018592, + "learning_rate": 9.913582145647452e-05, + "loss": 11.9633, + "step": 19013 + }, + { + "epoch": 1.0353882910293848, + "grad_norm": 0.5420632483238385, + "learning_rate": 9.912700353096362e-05, + "loss": 12.1343, + "step": 19014 + }, + { + "epoch": 1.0354427450259678, + "grad_norm": 0.5160256288543138, + "learning_rate": 9.91181856122413e-05, + "loss": 12.128, + "step": 19015 + }, + { + "epoch": 1.0354971990225508, + "grad_norm": 0.478344528213043, + "learning_rate": 9.91093677003761e-05, + "loss": 12.0558, + "step": 19016 + }, + { + "epoch": 1.0355516530191338, + "grad_norm": 0.4993093672597959, + "learning_rate": 9.910054979543662e-05, + "loss": 12.1166, + "step": 19017 + }, + { + "epoch": 1.0356061070157168, + "grad_norm": 0.6508014470582698, + "learning_rate": 9.90917318974914e-05, + "loss": 12.1159, + "step": 19018 + }, + { + "epoch": 1.0356605610122998, + "grad_norm": 0.532951712132628, + "learning_rate": 9.908291400660906e-05, + "loss": 12.1033, + "step": 19019 + }, + { + "epoch": 1.0357150150088827, + "grad_norm": 0.5157335528993278, + "learning_rate": 9.90740961228581e-05, + "loss": 11.9439, + "step": 19020 + }, + { + "epoch": 1.0357694690054657, + "grad_norm": 0.5211195795040291, + "learning_rate": 9.906527824630715e-05, + "loss": 12.0271, + "step": 19021 + }, + { + "epoch": 1.0358239230020487, + "grad_norm": 0.4964343356487457, + "learning_rate": 9.905646037702475e-05, + "loss": 12.0784, + "step": 19022 + }, + { + "epoch": 1.035878376998632, + "grad_norm": 0.5121292915204849, + "learning_rate": 9.904764251507946e-05, + "loss": 11.8588, + "step": 19023 + }, + { + "epoch": 1.035932830995215, + "grad_norm": 0.5395009713858713, + "learning_rate": 9.903882466053987e-05, + "loss": 12.078, + "step": 19024 + }, + { + "epoch": 1.035987284991798, + "grad_norm": 0.5724794808973698, + "learning_rate": 9.903000681347453e-05, + "loss": 12.1624, + "step": 19025 + }, + { + "epoch": 1.036041738988381, + "grad_norm": 0.528610169588521, + "learning_rate": 9.902118897395203e-05, + "loss": 12.1195, + "step": 19026 + }, + { + "epoch": 1.036096192984964, + "grad_norm": 0.5874990058259184, + "learning_rate": 9.901237114204092e-05, + "loss": 12.0297, + "step": 19027 + }, + { + "epoch": 1.036150646981547, + "grad_norm": 0.5577128810521385, + "learning_rate": 9.900355331780976e-05, + "loss": 12.1688, + "step": 19028 + }, + { + "epoch": 1.03620510097813, + "grad_norm": 0.5359707995224262, + "learning_rate": 9.899473550132717e-05, + "loss": 12.0934, + "step": 19029 + }, + { + "epoch": 1.036259554974713, + "grad_norm": 0.5320585447586443, + "learning_rate": 9.898591769266166e-05, + "loss": 12.0178, + "step": 19030 + }, + { + "epoch": 1.0363140089712959, + "grad_norm": 0.5996530740109022, + "learning_rate": 9.897709989188189e-05, + "loss": 12.1333, + "step": 19031 + }, + { + "epoch": 1.0363684629678789, + "grad_norm": 0.530592100026717, + "learning_rate": 9.896828209905629e-05, + "loss": 11.8981, + "step": 19032 + }, + { + "epoch": 1.0364229169644619, + "grad_norm": 0.5071047991797685, + "learning_rate": 9.895946431425351e-05, + "loss": 12.0599, + "step": 19033 + }, + { + "epoch": 1.036477370961045, + "grad_norm": 0.5045948386650949, + "learning_rate": 9.895064653754212e-05, + "loss": 12.0018, + "step": 19034 + }, + { + "epoch": 1.036531824957628, + "grad_norm": 0.5357652354755498, + "learning_rate": 9.894182876899069e-05, + "loss": 12.106, + "step": 19035 + }, + { + "epoch": 1.036586278954211, + "grad_norm": 0.49056228823431847, + "learning_rate": 9.893301100866777e-05, + "loss": 12.0361, + "step": 19036 + }, + { + "epoch": 1.036640732950794, + "grad_norm": 0.514162932443665, + "learning_rate": 9.892419325664193e-05, + "loss": 11.9886, + "step": 19037 + }, + { + "epoch": 1.036695186947377, + "grad_norm": 0.48106338028908796, + "learning_rate": 9.891537551298175e-05, + "loss": 12.0299, + "step": 19038 + }, + { + "epoch": 1.03674964094396, + "grad_norm": 0.5403447290973654, + "learning_rate": 9.890655777775576e-05, + "loss": 11.9933, + "step": 19039 + }, + { + "epoch": 1.036804094940543, + "grad_norm": 0.5711878205368841, + "learning_rate": 9.889774005103258e-05, + "loss": 12.0667, + "step": 19040 + }, + { + "epoch": 1.036858548937126, + "grad_norm": 0.5227090801811834, + "learning_rate": 9.888892233288081e-05, + "loss": 11.9956, + "step": 19041 + }, + { + "epoch": 1.036913002933709, + "grad_norm": 0.5509038475517687, + "learning_rate": 9.888010462336893e-05, + "loss": 12.0979, + "step": 19042 + }, + { + "epoch": 1.036967456930292, + "grad_norm": 0.5564037998168841, + "learning_rate": 9.887128692256554e-05, + "loss": 12.0903, + "step": 19043 + }, + { + "epoch": 1.037021910926875, + "grad_norm": 0.5489284125232368, + "learning_rate": 9.886246923053918e-05, + "loss": 12.124, + "step": 19044 + }, + { + "epoch": 1.037076364923458, + "grad_norm": 0.48695931999911213, + "learning_rate": 9.885365154735849e-05, + "loss": 11.9784, + "step": 19045 + }, + { + "epoch": 1.0371308189200412, + "grad_norm": 0.5119116175650932, + "learning_rate": 9.884483387309197e-05, + "loss": 12.0831, + "step": 19046 + }, + { + "epoch": 1.0371852729166242, + "grad_norm": 0.5402343458051497, + "learning_rate": 9.883601620780825e-05, + "loss": 12.0994, + "step": 19047 + }, + { + "epoch": 1.0372397269132072, + "grad_norm": 0.5443656623920647, + "learning_rate": 9.882719855157584e-05, + "loss": 12.0989, + "step": 19048 + }, + { + "epoch": 1.0372941809097902, + "grad_norm": 0.5613854849086186, + "learning_rate": 9.881838090446334e-05, + "loss": 11.9348, + "step": 19049 + }, + { + "epoch": 1.0373486349063732, + "grad_norm": 0.5716948056406087, + "learning_rate": 9.88095632665393e-05, + "loss": 11.9324, + "step": 19050 + }, + { + "epoch": 1.0374030889029562, + "grad_norm": 0.5615163943071505, + "learning_rate": 9.880074563787232e-05, + "loss": 12.079, + "step": 19051 + }, + { + "epoch": 1.0374575428995392, + "grad_norm": 0.5416536736159172, + "learning_rate": 9.879192801853093e-05, + "loss": 11.9152, + "step": 19052 + }, + { + "epoch": 1.0375119968961222, + "grad_norm": 0.5257042329245045, + "learning_rate": 9.87831104085837e-05, + "loss": 11.9454, + "step": 19053 + }, + { + "epoch": 1.0375664508927052, + "grad_norm": 0.5153521266220805, + "learning_rate": 9.87742928080992e-05, + "loss": 12.022, + "step": 19054 + }, + { + "epoch": 1.0376209048892882, + "grad_norm": 0.532221667724164, + "learning_rate": 9.8765475217146e-05, + "loss": 12.061, + "step": 19055 + }, + { + "epoch": 1.0376753588858711, + "grad_norm": 0.5538100715675632, + "learning_rate": 9.875665763579269e-05, + "loss": 12.1585, + "step": 19056 + }, + { + "epoch": 1.0377298128824541, + "grad_norm": 0.5054629450823357, + "learning_rate": 9.874784006410782e-05, + "loss": 11.7964, + "step": 19057 + }, + { + "epoch": 1.0377842668790374, + "grad_norm": 0.5457479810413797, + "learning_rate": 9.873902250215994e-05, + "loss": 11.8406, + "step": 19058 + }, + { + "epoch": 1.0378387208756203, + "grad_norm": 0.5415794466342914, + "learning_rate": 9.873020495001765e-05, + "loss": 12.1593, + "step": 19059 + }, + { + "epoch": 1.0378931748722033, + "grad_norm": 0.5725519633500826, + "learning_rate": 9.87213874077495e-05, + "loss": 12.055, + "step": 19060 + }, + { + "epoch": 1.0379476288687863, + "grad_norm": 0.5763961304813658, + "learning_rate": 9.871256987542404e-05, + "loss": 12.1032, + "step": 19061 + }, + { + "epoch": 1.0380020828653693, + "grad_norm": 0.5393060127149656, + "learning_rate": 9.870375235310989e-05, + "loss": 11.9136, + "step": 19062 + }, + { + "epoch": 1.0380565368619523, + "grad_norm": 0.5573715002208469, + "learning_rate": 9.869493484087556e-05, + "loss": 12.0489, + "step": 19063 + }, + { + "epoch": 1.0381109908585353, + "grad_norm": 0.5628184000735703, + "learning_rate": 9.868611733878961e-05, + "loss": 12.0815, + "step": 19064 + }, + { + "epoch": 1.0381654448551183, + "grad_norm": 0.6216920499794681, + "learning_rate": 9.867729984692065e-05, + "loss": 12.0082, + "step": 19065 + }, + { + "epoch": 1.0382198988517013, + "grad_norm": 0.5540984827577488, + "learning_rate": 9.86684823653372e-05, + "loss": 12.0654, + "step": 19066 + }, + { + "epoch": 1.0382743528482843, + "grad_norm": 0.5134707959928382, + "learning_rate": 9.865966489410789e-05, + "loss": 12.0314, + "step": 19067 + }, + { + "epoch": 1.0383288068448673, + "grad_norm": 0.5443577726995507, + "learning_rate": 9.865084743330124e-05, + "loss": 12.0544, + "step": 19068 + }, + { + "epoch": 1.0383832608414503, + "grad_norm": 0.5897961064460123, + "learning_rate": 9.864202998298583e-05, + "loss": 12.0468, + "step": 19069 + }, + { + "epoch": 1.0384377148380335, + "grad_norm": 0.5367005604858965, + "learning_rate": 9.863321254323022e-05, + "loss": 11.9909, + "step": 19070 + }, + { + "epoch": 1.0384921688346165, + "grad_norm": 0.571728376343309, + "learning_rate": 9.862439511410297e-05, + "loss": 11.984, + "step": 19071 + }, + { + "epoch": 1.0385466228311995, + "grad_norm": 0.5030556817084691, + "learning_rate": 9.86155776956727e-05, + "loss": 12.0831, + "step": 19072 + }, + { + "epoch": 1.0386010768277825, + "grad_norm": 0.5938392851995647, + "learning_rate": 9.86067602880079e-05, + "loss": 12.0718, + "step": 19073 + }, + { + "epoch": 1.0386555308243655, + "grad_norm": 0.5332649327413911, + "learning_rate": 9.859794289117716e-05, + "loss": 11.9327, + "step": 19074 + }, + { + "epoch": 1.0387099848209485, + "grad_norm": 0.5293914447218716, + "learning_rate": 9.858912550524903e-05, + "loss": 12.0487, + "step": 19075 + }, + { + "epoch": 1.0387644388175314, + "grad_norm": 0.5238701352832031, + "learning_rate": 9.858030813029214e-05, + "loss": 12.1218, + "step": 19076 + }, + { + "epoch": 1.0388188928141144, + "grad_norm": 0.6132977437798783, + "learning_rate": 9.857149076637496e-05, + "loss": 12.1199, + "step": 19077 + }, + { + "epoch": 1.0388733468106974, + "grad_norm": 0.5851041481376358, + "learning_rate": 9.856267341356613e-05, + "loss": 11.8652, + "step": 19078 + }, + { + "epoch": 1.0389278008072804, + "grad_norm": 0.5249538310080266, + "learning_rate": 9.85538560719342e-05, + "loss": 11.9839, + "step": 19079 + }, + { + "epoch": 1.0389822548038634, + "grad_norm": 0.5606023303923444, + "learning_rate": 9.854503874154773e-05, + "loss": 11.9515, + "step": 19080 + }, + { + "epoch": 1.0390367088004466, + "grad_norm": 0.5288047906674928, + "learning_rate": 9.853622142247528e-05, + "loss": 12.0651, + "step": 19081 + }, + { + "epoch": 1.0390911627970296, + "grad_norm": 0.5558862569835413, + "learning_rate": 9.852740411478543e-05, + "loss": 12.0179, + "step": 19082 + }, + { + "epoch": 1.0391456167936126, + "grad_norm": 0.605587366926134, + "learning_rate": 9.851858681854673e-05, + "loss": 12.1148, + "step": 19083 + }, + { + "epoch": 1.0392000707901956, + "grad_norm": 0.6323635269478186, + "learning_rate": 9.850976953382773e-05, + "loss": 12.0581, + "step": 19084 + }, + { + "epoch": 1.0392545247867786, + "grad_norm": 0.5735279356919927, + "learning_rate": 9.850095226069702e-05, + "loss": 12.2183, + "step": 19085 + }, + { + "epoch": 1.0393089787833616, + "grad_norm": 0.5211586953071619, + "learning_rate": 9.849213499922316e-05, + "loss": 11.8532, + "step": 19086 + }, + { + "epoch": 1.0393634327799446, + "grad_norm": 0.5557305775491236, + "learning_rate": 9.848331774947471e-05, + "loss": 11.9333, + "step": 19087 + }, + { + "epoch": 1.0394178867765276, + "grad_norm": 0.5457468580264888, + "learning_rate": 9.84745005115202e-05, + "loss": 12.0708, + "step": 19088 + }, + { + "epoch": 1.0394723407731106, + "grad_norm": 0.5179863842829507, + "learning_rate": 9.846568328542827e-05, + "loss": 12.0847, + "step": 19089 + }, + { + "epoch": 1.0395267947696936, + "grad_norm": 0.5237475254373459, + "learning_rate": 9.845686607126744e-05, + "loss": 12.0979, + "step": 19090 + }, + { + "epoch": 1.0395812487662766, + "grad_norm": 0.6168798677338446, + "learning_rate": 9.844804886910627e-05, + "loss": 12.062, + "step": 19091 + }, + { + "epoch": 1.0396357027628595, + "grad_norm": 0.5484336017930553, + "learning_rate": 9.843923167901336e-05, + "loss": 12.0649, + "step": 19092 + }, + { + "epoch": 1.0396901567594428, + "grad_norm": 0.5006446798431007, + "learning_rate": 9.843041450105722e-05, + "loss": 12.0172, + "step": 19093 + }, + { + "epoch": 1.0397446107560258, + "grad_norm": 0.5387552497608356, + "learning_rate": 9.842159733530645e-05, + "loss": 12.0331, + "step": 19094 + }, + { + "epoch": 1.0397990647526087, + "grad_norm": 0.5992092342972867, + "learning_rate": 9.841278018182959e-05, + "loss": 12.042, + "step": 19095 + }, + { + "epoch": 1.0398535187491917, + "grad_norm": 0.5518016802549429, + "learning_rate": 9.840396304069522e-05, + "loss": 12.1501, + "step": 19096 + }, + { + "epoch": 1.0399079727457747, + "grad_norm": 0.5231586092041085, + "learning_rate": 9.839514591197191e-05, + "loss": 12.1477, + "step": 19097 + }, + { + "epoch": 1.0399624267423577, + "grad_norm": 0.5250451595601056, + "learning_rate": 9.838632879572821e-05, + "loss": 11.9877, + "step": 19098 + }, + { + "epoch": 1.0400168807389407, + "grad_norm": 0.5751355650493677, + "learning_rate": 9.837751169203268e-05, + "loss": 12.0432, + "step": 19099 + }, + { + "epoch": 1.0400713347355237, + "grad_norm": 0.5475822383731445, + "learning_rate": 9.836869460095388e-05, + "loss": 12.1132, + "step": 19100 + }, + { + "epoch": 1.0401257887321067, + "grad_norm": 0.5095680052981686, + "learning_rate": 9.83598775225604e-05, + "loss": 12.0877, + "step": 19101 + }, + { + "epoch": 1.0401802427286897, + "grad_norm": 0.5203026601536467, + "learning_rate": 9.835106045692084e-05, + "loss": 12.1754, + "step": 19102 + }, + { + "epoch": 1.0402346967252727, + "grad_norm": 0.5393357415466559, + "learning_rate": 9.834224340410366e-05, + "loss": 12.0501, + "step": 19103 + }, + { + "epoch": 1.040289150721856, + "grad_norm": 0.5147226504600757, + "learning_rate": 9.833342636417747e-05, + "loss": 12.2127, + "step": 19104 + }, + { + "epoch": 1.040343604718439, + "grad_norm": 0.5151805541245005, + "learning_rate": 9.832460933721083e-05, + "loss": 12.0432, + "step": 19105 + }, + { + "epoch": 1.040398058715022, + "grad_norm": 0.5298712966251693, + "learning_rate": 9.831579232327231e-05, + "loss": 11.9219, + "step": 19106 + }, + { + "epoch": 1.0404525127116049, + "grad_norm": 0.5028227447111524, + "learning_rate": 9.830697532243049e-05, + "loss": 12.0594, + "step": 19107 + }, + { + "epoch": 1.0405069667081879, + "grad_norm": 0.5437150776845209, + "learning_rate": 9.82981583347539e-05, + "loss": 12.0402, + "step": 19108 + }, + { + "epoch": 1.0405614207047709, + "grad_norm": 0.5714209694702647, + "learning_rate": 9.828934136031113e-05, + "loss": 12.0347, + "step": 19109 + }, + { + "epoch": 1.0406158747013539, + "grad_norm": 0.5090023205310441, + "learning_rate": 9.828052439917071e-05, + "loss": 11.9786, + "step": 19110 + }, + { + "epoch": 1.0406703286979369, + "grad_norm": 0.5052478240697585, + "learning_rate": 9.827170745140121e-05, + "loss": 11.9894, + "step": 19111 + }, + { + "epoch": 1.0407247826945198, + "grad_norm": 0.520875318006523, + "learning_rate": 9.826289051707127e-05, + "loss": 12.0789, + "step": 19112 + }, + { + "epoch": 1.0407792366911028, + "grad_norm": 0.5776766893725211, + "learning_rate": 9.825407359624935e-05, + "loss": 11.9286, + "step": 19113 + }, + { + "epoch": 1.0408336906876858, + "grad_norm": 0.5587302106616825, + "learning_rate": 9.824525668900402e-05, + "loss": 11.995, + "step": 19114 + }, + { + "epoch": 1.0408881446842688, + "grad_norm": 0.5382811136105777, + "learning_rate": 9.823643979540386e-05, + "loss": 11.9325, + "step": 19115 + }, + { + "epoch": 1.040942598680852, + "grad_norm": 0.5148025263395594, + "learning_rate": 9.822762291551746e-05, + "loss": 11.9106, + "step": 19116 + }, + { + "epoch": 1.040997052677435, + "grad_norm": 0.5838594177462733, + "learning_rate": 9.821880604941337e-05, + "loss": 12.2477, + "step": 19117 + }, + { + "epoch": 1.041051506674018, + "grad_norm": 0.5123221411956282, + "learning_rate": 9.820998919716013e-05, + "loss": 12.0669, + "step": 19118 + }, + { + "epoch": 1.041105960670601, + "grad_norm": 0.565955948169352, + "learning_rate": 9.820117235882633e-05, + "loss": 12.0751, + "step": 19119 + }, + { + "epoch": 1.041160414667184, + "grad_norm": 0.5621022639430323, + "learning_rate": 9.81923555344805e-05, + "loss": 12.0712, + "step": 19120 + }, + { + "epoch": 1.041214868663767, + "grad_norm": 0.5499786944827505, + "learning_rate": 9.818353872419121e-05, + "loss": 12.1968, + "step": 19121 + }, + { + "epoch": 1.04126932266035, + "grad_norm": 0.5583103952439287, + "learning_rate": 9.817472192802707e-05, + "loss": 12.1014, + "step": 19122 + }, + { + "epoch": 1.041323776656933, + "grad_norm": 0.5163468250550807, + "learning_rate": 9.816590514605657e-05, + "loss": 11.9431, + "step": 19123 + }, + { + "epoch": 1.041378230653516, + "grad_norm": 0.5665414309500235, + "learning_rate": 9.815708837834829e-05, + "loss": 12.0765, + "step": 19124 + }, + { + "epoch": 1.041432684650099, + "grad_norm": 0.5285343488866202, + "learning_rate": 9.814827162497082e-05, + "loss": 12.055, + "step": 19125 + }, + { + "epoch": 1.041487138646682, + "grad_norm": 0.5315934905705026, + "learning_rate": 9.813945488599266e-05, + "loss": 12.0311, + "step": 19126 + }, + { + "epoch": 1.041541592643265, + "grad_norm": 0.6469743773818489, + "learning_rate": 9.813063816148244e-05, + "loss": 12.1042, + "step": 19127 + }, + { + "epoch": 1.0415960466398482, + "grad_norm": 0.5832980168406593, + "learning_rate": 9.81218214515087e-05, + "loss": 12.1441, + "step": 19128 + }, + { + "epoch": 1.0416505006364312, + "grad_norm": 0.5807553796234487, + "learning_rate": 9.811300475613997e-05, + "loss": 12.1559, + "step": 19129 + }, + { + "epoch": 1.0417049546330142, + "grad_norm": 0.5623617827311373, + "learning_rate": 9.810418807544483e-05, + "loss": 12.1247, + "step": 19130 + }, + { + "epoch": 1.0417594086295972, + "grad_norm": 0.5565142904968856, + "learning_rate": 9.809537140949187e-05, + "loss": 12.0773, + "step": 19131 + }, + { + "epoch": 1.0418138626261801, + "grad_norm": 0.48345586368369114, + "learning_rate": 9.808655475834962e-05, + "loss": 12.0419, + "step": 19132 + }, + { + "epoch": 1.0418683166227631, + "grad_norm": 0.6031622305737758, + "learning_rate": 9.807773812208662e-05, + "loss": 12.1528, + "step": 19133 + }, + { + "epoch": 1.0419227706193461, + "grad_norm": 0.5163447645287528, + "learning_rate": 9.806892150077147e-05, + "loss": 12.037, + "step": 19134 + }, + { + "epoch": 1.0419772246159291, + "grad_norm": 0.5209695862658643, + "learning_rate": 9.80601048944727e-05, + "loss": 12.0789, + "step": 19135 + }, + { + "epoch": 1.0420316786125121, + "grad_norm": 0.5494787522018224, + "learning_rate": 9.805128830325887e-05, + "loss": 12.1595, + "step": 19136 + }, + { + "epoch": 1.042086132609095, + "grad_norm": 0.5891751228617196, + "learning_rate": 9.804247172719854e-05, + "loss": 12.1231, + "step": 19137 + }, + { + "epoch": 1.042140586605678, + "grad_norm": 0.5817300733494469, + "learning_rate": 9.803365516636028e-05, + "loss": 12.1236, + "step": 19138 + }, + { + "epoch": 1.042195040602261, + "grad_norm": 0.5814731823831777, + "learning_rate": 9.802483862081267e-05, + "loss": 12.1709, + "step": 19139 + }, + { + "epoch": 1.0422494945988443, + "grad_norm": 0.5579691487335727, + "learning_rate": 9.801602209062424e-05, + "loss": 12.1035, + "step": 19140 + }, + { + "epoch": 1.0423039485954273, + "grad_norm": 0.5740579702293666, + "learning_rate": 9.800720557586354e-05, + "loss": 11.904, + "step": 19141 + }, + { + "epoch": 1.0423584025920103, + "grad_norm": 0.5928448752546925, + "learning_rate": 9.799838907659918e-05, + "loss": 12.1609, + "step": 19142 + }, + { + "epoch": 1.0424128565885933, + "grad_norm": 0.5212588888353008, + "learning_rate": 9.798957259289966e-05, + "loss": 12.1121, + "step": 19143 + }, + { + "epoch": 1.0424673105851763, + "grad_norm": 0.5754309059415892, + "learning_rate": 9.798075612483356e-05, + "loss": 12.0057, + "step": 19144 + }, + { + "epoch": 1.0425217645817593, + "grad_norm": 0.544196322848585, + "learning_rate": 9.797193967246943e-05, + "loss": 12.0195, + "step": 19145 + }, + { + "epoch": 1.0425762185783423, + "grad_norm": 0.5588146386938192, + "learning_rate": 9.796312323587585e-05, + "loss": 12.0907, + "step": 19146 + }, + { + "epoch": 1.0426306725749253, + "grad_norm": 0.567915320166216, + "learning_rate": 9.795430681512137e-05, + "loss": 12.0485, + "step": 19147 + }, + { + "epoch": 1.0426851265715082, + "grad_norm": 0.5346818621860815, + "learning_rate": 9.794549041027454e-05, + "loss": 12.1685, + "step": 19148 + }, + { + "epoch": 1.0427395805680912, + "grad_norm": 0.5617888190170663, + "learning_rate": 9.793667402140388e-05, + "loss": 12.1387, + "step": 19149 + }, + { + "epoch": 1.0427940345646742, + "grad_norm": 0.524658144114567, + "learning_rate": 9.792785764857802e-05, + "loss": 12.0566, + "step": 19150 + }, + { + "epoch": 1.0428484885612574, + "grad_norm": 0.5713117601184278, + "learning_rate": 9.79190412918655e-05, + "loss": 12.0968, + "step": 19151 + }, + { + "epoch": 1.0429029425578404, + "grad_norm": 0.49936954976258013, + "learning_rate": 9.791022495133489e-05, + "loss": 12.0575, + "step": 19152 + }, + { + "epoch": 1.0429573965544234, + "grad_norm": 0.6129223975829453, + "learning_rate": 9.790140862705468e-05, + "loss": 12.193, + "step": 19153 + }, + { + "epoch": 1.0430118505510064, + "grad_norm": 0.6142847898671409, + "learning_rate": 9.789259231909346e-05, + "loss": 12.1012, + "step": 19154 + }, + { + "epoch": 1.0430663045475894, + "grad_norm": 0.5700271812475715, + "learning_rate": 9.788377602751982e-05, + "loss": 12.015, + "step": 19155 + }, + { + "epoch": 1.0431207585441724, + "grad_norm": 0.4888156128269216, + "learning_rate": 9.787495975240227e-05, + "loss": 11.9991, + "step": 19156 + }, + { + "epoch": 1.0431752125407554, + "grad_norm": 0.5083256571510111, + "learning_rate": 9.78661434938094e-05, + "loss": 11.995, + "step": 19157 + }, + { + "epoch": 1.0432296665373384, + "grad_norm": 0.6171669719782117, + "learning_rate": 9.785732725180977e-05, + "loss": 12.0293, + "step": 19158 + }, + { + "epoch": 1.0432841205339214, + "grad_norm": 0.585761777581188, + "learning_rate": 9.78485110264719e-05, + "loss": 11.9625, + "step": 19159 + }, + { + "epoch": 1.0433385745305044, + "grad_norm": 0.6041261370127157, + "learning_rate": 9.783969481786435e-05, + "loss": 12.0765, + "step": 19160 + }, + { + "epoch": 1.0433930285270874, + "grad_norm": 0.6467978619240164, + "learning_rate": 9.783087862605572e-05, + "loss": 12.1038, + "step": 19161 + }, + { + "epoch": 1.0434474825236704, + "grad_norm": 0.5517345887780721, + "learning_rate": 9.782206245111459e-05, + "loss": 12.184, + "step": 19162 + }, + { + "epoch": 1.0435019365202536, + "grad_norm": 0.5585662895914788, + "learning_rate": 9.781324629310942e-05, + "loss": 12.1267, + "step": 19163 + }, + { + "epoch": 1.0435563905168366, + "grad_norm": 0.5072476663086728, + "learning_rate": 9.780443015210881e-05, + "loss": 12.1028, + "step": 19164 + }, + { + "epoch": 1.0436108445134196, + "grad_norm": 0.5149472102231276, + "learning_rate": 9.779561402818131e-05, + "loss": 12.1073, + "step": 19165 + }, + { + "epoch": 1.0436652985100026, + "grad_norm": 0.5470801062193369, + "learning_rate": 9.778679792139552e-05, + "loss": 12.0336, + "step": 19166 + }, + { + "epoch": 1.0437197525065856, + "grad_norm": 0.4981163730176472, + "learning_rate": 9.777798183181993e-05, + "loss": 12.0814, + "step": 19167 + }, + { + "epoch": 1.0437742065031685, + "grad_norm": 0.5134029700065394, + "learning_rate": 9.776916575952314e-05, + "loss": 12.08, + "step": 19168 + }, + { + "epoch": 1.0438286604997515, + "grad_norm": 0.5832556991628526, + "learning_rate": 9.776034970457369e-05, + "loss": 12.0783, + "step": 19169 + }, + { + "epoch": 1.0438831144963345, + "grad_norm": 0.5377942104959453, + "learning_rate": 9.775153366704013e-05, + "loss": 12.1176, + "step": 19170 + }, + { + "epoch": 1.0439375684929175, + "grad_norm": 0.6218772734868826, + "learning_rate": 9.774271764699101e-05, + "loss": 11.987, + "step": 19171 + }, + { + "epoch": 1.0439920224895005, + "grad_norm": 0.5466535332473306, + "learning_rate": 9.773390164449495e-05, + "loss": 11.9507, + "step": 19172 + }, + { + "epoch": 1.0440464764860835, + "grad_norm": 0.5297762922881801, + "learning_rate": 9.772508565962042e-05, + "loss": 11.9809, + "step": 19173 + }, + { + "epoch": 1.0441009304826667, + "grad_norm": 0.5582817533417825, + "learning_rate": 9.7716269692436e-05, + "loss": 12.1064, + "step": 19174 + }, + { + "epoch": 1.0441553844792497, + "grad_norm": 0.5852479088644624, + "learning_rate": 9.770745374301022e-05, + "loss": 12.076, + "step": 19175 + }, + { + "epoch": 1.0442098384758327, + "grad_norm": 0.5511628923438155, + "learning_rate": 9.76986378114117e-05, + "loss": 12.1843, + "step": 19176 + }, + { + "epoch": 1.0442642924724157, + "grad_norm": 0.5528512384448054, + "learning_rate": 9.768982189770894e-05, + "loss": 12.042, + "step": 19177 + }, + { + "epoch": 1.0443187464689987, + "grad_norm": 0.5579554993906317, + "learning_rate": 9.768100600197053e-05, + "loss": 12.0521, + "step": 19178 + }, + { + "epoch": 1.0443732004655817, + "grad_norm": 0.5920345224342375, + "learning_rate": 9.7672190124265e-05, + "loss": 12.1227, + "step": 19179 + }, + { + "epoch": 1.0444276544621647, + "grad_norm": 0.5402265066864046, + "learning_rate": 9.76633742646609e-05, + "loss": 12.1896, + "step": 19180 + }, + { + "epoch": 1.0444821084587477, + "grad_norm": 0.5230294482354321, + "learning_rate": 9.76545584232268e-05, + "loss": 11.9813, + "step": 19181 + }, + { + "epoch": 1.0445365624553307, + "grad_norm": 0.5513612958958378, + "learning_rate": 9.764574260003128e-05, + "loss": 12.1628, + "step": 19182 + }, + { + "epoch": 1.0445910164519137, + "grad_norm": 0.5647767986230641, + "learning_rate": 9.763692679514284e-05, + "loss": 11.8818, + "step": 19183 + }, + { + "epoch": 1.0446454704484966, + "grad_norm": 0.5521198979994846, + "learning_rate": 9.762811100863003e-05, + "loss": 12.1333, + "step": 19184 + }, + { + "epoch": 1.0446999244450796, + "grad_norm": 0.5230827615522323, + "learning_rate": 9.761929524056145e-05, + "loss": 12.1484, + "step": 19185 + }, + { + "epoch": 1.0447543784416629, + "grad_norm": 0.5349816199825447, + "learning_rate": 9.761047949100558e-05, + "loss": 12.0065, + "step": 19186 + }, + { + "epoch": 1.0448088324382458, + "grad_norm": 0.5682286337147697, + "learning_rate": 9.760166376003107e-05, + "loss": 12.1066, + "step": 19187 + }, + { + "epoch": 1.0448632864348288, + "grad_norm": 0.533617795780643, + "learning_rate": 9.759284804770642e-05, + "loss": 12.0198, + "step": 19188 + }, + { + "epoch": 1.0449177404314118, + "grad_norm": 0.6171098731477254, + "learning_rate": 9.758403235410019e-05, + "loss": 12.1067, + "step": 19189 + }, + { + "epoch": 1.0449721944279948, + "grad_norm": 0.5329021938234568, + "learning_rate": 9.757521667928092e-05, + "loss": 11.9268, + "step": 19190 + }, + { + "epoch": 1.0450266484245778, + "grad_norm": 0.5366953787561417, + "learning_rate": 9.756640102331718e-05, + "loss": 12.0649, + "step": 19191 + }, + { + "epoch": 1.0450811024211608, + "grad_norm": 0.5661101853878987, + "learning_rate": 9.755758538627753e-05, + "loss": 12.1772, + "step": 19192 + }, + { + "epoch": 1.0451355564177438, + "grad_norm": 0.6285858479349945, + "learning_rate": 9.754876976823049e-05, + "loss": 12.073, + "step": 19193 + }, + { + "epoch": 1.0451900104143268, + "grad_norm": 0.5891952748044051, + "learning_rate": 9.753995416924462e-05, + "loss": 11.997, + "step": 19194 + }, + { + "epoch": 1.0452444644109098, + "grad_norm": 0.5354460728775402, + "learning_rate": 9.753113858938847e-05, + "loss": 12.0828, + "step": 19195 + }, + { + "epoch": 1.0452989184074928, + "grad_norm": 0.6363409217279798, + "learning_rate": 9.752232302873061e-05, + "loss": 12.0971, + "step": 19196 + }, + { + "epoch": 1.0453533724040758, + "grad_norm": 0.6408417171455034, + "learning_rate": 9.751350748733959e-05, + "loss": 12.0337, + "step": 19197 + }, + { + "epoch": 1.045407826400659, + "grad_norm": 0.5498651446757598, + "learning_rate": 9.750469196528392e-05, + "loss": 12.0722, + "step": 19198 + }, + { + "epoch": 1.045462280397242, + "grad_norm": 0.6273062786380214, + "learning_rate": 9.749587646263221e-05, + "loss": 11.9887, + "step": 19199 + }, + { + "epoch": 1.045516734393825, + "grad_norm": 0.631159313500319, + "learning_rate": 9.748706097945298e-05, + "loss": 12.0544, + "step": 19200 + }, + { + "epoch": 1.045571188390408, + "grad_norm": 0.5390729842486164, + "learning_rate": 9.74782455158148e-05, + "loss": 12.036, + "step": 19201 + }, + { + "epoch": 1.045625642386991, + "grad_norm": 0.5481233693458728, + "learning_rate": 9.746943007178622e-05, + "loss": 12.1109, + "step": 19202 + }, + { + "epoch": 1.045680096383574, + "grad_norm": 0.6065527736780751, + "learning_rate": 9.746061464743575e-05, + "loss": 12.1451, + "step": 19203 + }, + { + "epoch": 1.045734550380157, + "grad_norm": 0.573328558430312, + "learning_rate": 9.745179924283196e-05, + "loss": 12.1437, + "step": 19204 + }, + { + "epoch": 1.04578900437674, + "grad_norm": 0.5542725518096085, + "learning_rate": 9.744298385804341e-05, + "loss": 11.9381, + "step": 19205 + }, + { + "epoch": 1.045843458373323, + "grad_norm": 0.5887718192429499, + "learning_rate": 9.743416849313866e-05, + "loss": 12.1066, + "step": 19206 + }, + { + "epoch": 1.045897912369906, + "grad_norm": 0.5810414666499816, + "learning_rate": 9.742535314818624e-05, + "loss": 12.1456, + "step": 19207 + }, + { + "epoch": 1.045952366366489, + "grad_norm": 0.5934364387216299, + "learning_rate": 9.74165378232547e-05, + "loss": 12.07, + "step": 19208 + }, + { + "epoch": 1.046006820363072, + "grad_norm": 0.544803947709447, + "learning_rate": 9.740772251841257e-05, + "loss": 11.9487, + "step": 19209 + }, + { + "epoch": 1.0460612743596551, + "grad_norm": 0.6212074752348654, + "learning_rate": 9.739890723372845e-05, + "loss": 12.0921, + "step": 19210 + }, + { + "epoch": 1.0461157283562381, + "grad_norm": 0.6278018299328106, + "learning_rate": 9.739009196927086e-05, + "loss": 12.0741, + "step": 19211 + }, + { + "epoch": 1.046170182352821, + "grad_norm": 0.5454639466797658, + "learning_rate": 9.738127672510836e-05, + "loss": 12.0252, + "step": 19212 + }, + { + "epoch": 1.046224636349404, + "grad_norm": 0.5713008151891188, + "learning_rate": 9.737246150130951e-05, + "loss": 11.949, + "step": 19213 + }, + { + "epoch": 1.046279090345987, + "grad_norm": 0.5813303234196573, + "learning_rate": 9.736364629794283e-05, + "loss": 12.1801, + "step": 19214 + }, + { + "epoch": 1.04633354434257, + "grad_norm": 0.563521297845805, + "learning_rate": 9.735483111507686e-05, + "loss": 11.8658, + "step": 19215 + }, + { + "epoch": 1.046387998339153, + "grad_norm": 0.5257352010378813, + "learning_rate": 9.734601595278018e-05, + "loss": 12.0067, + "step": 19216 + }, + { + "epoch": 1.046442452335736, + "grad_norm": 0.5098668625026491, + "learning_rate": 9.733720081112132e-05, + "loss": 12.052, + "step": 19217 + }, + { + "epoch": 1.046496906332319, + "grad_norm": 0.5296134836923844, + "learning_rate": 9.732838569016884e-05, + "loss": 12.1223, + "step": 19218 + }, + { + "epoch": 1.046551360328902, + "grad_norm": 0.5534850423144859, + "learning_rate": 9.731957058999127e-05, + "loss": 12.0633, + "step": 19219 + }, + { + "epoch": 1.046605814325485, + "grad_norm": 0.5026348269791794, + "learning_rate": 9.731075551065714e-05, + "loss": 12.0089, + "step": 19220 + }, + { + "epoch": 1.0466602683220683, + "grad_norm": 0.5766132676570002, + "learning_rate": 9.730194045223506e-05, + "loss": 12.0963, + "step": 19221 + }, + { + "epoch": 1.0467147223186513, + "grad_norm": 0.5168019245496915, + "learning_rate": 9.729312541479355e-05, + "loss": 12.0861, + "step": 19222 + }, + { + "epoch": 1.0467691763152343, + "grad_norm": 0.6158430966438531, + "learning_rate": 9.728431039840118e-05, + "loss": 12.0681, + "step": 19223 + }, + { + "epoch": 1.0468236303118172, + "grad_norm": 0.5121067618505272, + "learning_rate": 9.72754954031264e-05, + "loss": 11.8811, + "step": 19224 + }, + { + "epoch": 1.0468780843084002, + "grad_norm": 0.5995318782549505, + "learning_rate": 9.726668042903786e-05, + "loss": 12.1476, + "step": 19225 + }, + { + "epoch": 1.0469325383049832, + "grad_norm": 0.5251005441098441, + "learning_rate": 9.725786547620407e-05, + "loss": 12.0328, + "step": 19226 + }, + { + "epoch": 1.0469869923015662, + "grad_norm": 0.5925345936070446, + "learning_rate": 9.724905054469357e-05, + "loss": 12.2029, + "step": 19227 + }, + { + "epoch": 1.0470414462981492, + "grad_norm": 0.5644629534226009, + "learning_rate": 9.724023563457492e-05, + "loss": 12.0922, + "step": 19228 + }, + { + "epoch": 1.0470959002947322, + "grad_norm": 0.5145965892903692, + "learning_rate": 9.723142074591665e-05, + "loss": 12.0152, + "step": 19229 + }, + { + "epoch": 1.0471503542913152, + "grad_norm": 0.617468919722614, + "learning_rate": 9.722260587878734e-05, + "loss": 11.9832, + "step": 19230 + }, + { + "epoch": 1.0472048082878982, + "grad_norm": 0.5774638227556544, + "learning_rate": 9.721379103325548e-05, + "loss": 12.1204, + "step": 19231 + }, + { + "epoch": 1.0472592622844812, + "grad_norm": 0.48206586144768443, + "learning_rate": 9.720497620938965e-05, + "loss": 12.013, + "step": 19232 + }, + { + "epoch": 1.0473137162810644, + "grad_norm": 0.6548010509130806, + "learning_rate": 9.719616140725846e-05, + "loss": 12.1303, + "step": 19233 + }, + { + "epoch": 1.0473681702776474, + "grad_norm": 0.698867251761497, + "learning_rate": 9.718734662693034e-05, + "loss": 12.2486, + "step": 19234 + }, + { + "epoch": 1.0474226242742304, + "grad_norm": 0.5256828253879882, + "learning_rate": 9.717853186847386e-05, + "loss": 12.0381, + "step": 19235 + }, + { + "epoch": 1.0474770782708134, + "grad_norm": 0.6207702632861498, + "learning_rate": 9.716971713195762e-05, + "loss": 12.1398, + "step": 19236 + }, + { + "epoch": 1.0475315322673964, + "grad_norm": 0.5712375661066159, + "learning_rate": 9.716090241745012e-05, + "loss": 12.0401, + "step": 19237 + }, + { + "epoch": 1.0475859862639794, + "grad_norm": 0.4935113312443913, + "learning_rate": 9.715208772501992e-05, + "loss": 12.0417, + "step": 19238 + }, + { + "epoch": 1.0476404402605624, + "grad_norm": 0.6217792785204247, + "learning_rate": 9.714327305473558e-05, + "loss": 12.1523, + "step": 19239 + }, + { + "epoch": 1.0476948942571453, + "grad_norm": 0.5723809438569106, + "learning_rate": 9.713445840666562e-05, + "loss": 12.0664, + "step": 19240 + }, + { + "epoch": 1.0477493482537283, + "grad_norm": 0.5677739931138212, + "learning_rate": 9.712564378087858e-05, + "loss": 12.0922, + "step": 19241 + }, + { + "epoch": 1.0478038022503113, + "grad_norm": 0.5640126414664229, + "learning_rate": 9.7116829177443e-05, + "loss": 12.131, + "step": 19242 + }, + { + "epoch": 1.0478582562468943, + "grad_norm": 0.6640791811526993, + "learning_rate": 9.710801459642751e-05, + "loss": 11.9364, + "step": 19243 + }, + { + "epoch": 1.0479127102434775, + "grad_norm": 0.5250140126089315, + "learning_rate": 9.709920003790054e-05, + "loss": 12.0516, + "step": 19244 + }, + { + "epoch": 1.0479671642400605, + "grad_norm": 0.6142053809536904, + "learning_rate": 9.709038550193068e-05, + "loss": 12.0713, + "step": 19245 + }, + { + "epoch": 1.0480216182366435, + "grad_norm": 0.5986476016795224, + "learning_rate": 9.708157098858645e-05, + "loss": 12.1929, + "step": 19246 + }, + { + "epoch": 1.0480760722332265, + "grad_norm": 0.6491176720117351, + "learning_rate": 9.707275649793642e-05, + "loss": 12.0925, + "step": 19247 + }, + { + "epoch": 1.0481305262298095, + "grad_norm": 0.6034636611147448, + "learning_rate": 9.706394203004914e-05, + "loss": 11.9917, + "step": 19248 + }, + { + "epoch": 1.0481849802263925, + "grad_norm": 0.5886294219976741, + "learning_rate": 9.705512758499313e-05, + "loss": 12.1267, + "step": 19249 + }, + { + "epoch": 1.0482394342229755, + "grad_norm": 0.6028214189036069, + "learning_rate": 9.704631316283695e-05, + "loss": 11.9418, + "step": 19250 + }, + { + "epoch": 1.0482938882195585, + "grad_norm": 0.5716779343276686, + "learning_rate": 9.703749876364913e-05, + "loss": 12.0819, + "step": 19251 + }, + { + "epoch": 1.0483483422161415, + "grad_norm": 0.5664468715715333, + "learning_rate": 9.702868438749822e-05, + "loss": 12.1577, + "step": 19252 + }, + { + "epoch": 1.0484027962127245, + "grad_norm": 0.6202810352599408, + "learning_rate": 9.701987003445278e-05, + "loss": 11.9958, + "step": 19253 + }, + { + "epoch": 1.0484572502093075, + "grad_norm": 0.6287441056132557, + "learning_rate": 9.70110557045813e-05, + "loss": 12.074, + "step": 19254 + }, + { + "epoch": 1.0485117042058905, + "grad_norm": 0.6081310291659571, + "learning_rate": 9.700224139795236e-05, + "loss": 12.0567, + "step": 19255 + }, + { + "epoch": 1.0485661582024737, + "grad_norm": 0.6344891453275684, + "learning_rate": 9.699342711463448e-05, + "loss": 12.0007, + "step": 19256 + }, + { + "epoch": 1.0486206121990567, + "grad_norm": 0.5084448846445512, + "learning_rate": 9.698461285469624e-05, + "loss": 12.0539, + "step": 19257 + }, + { + "epoch": 1.0486750661956397, + "grad_norm": 0.5475980659381137, + "learning_rate": 9.697579861820611e-05, + "loss": 12.1256, + "step": 19258 + }, + { + "epoch": 1.0487295201922227, + "grad_norm": 0.523600751764549, + "learning_rate": 9.696698440523271e-05, + "loss": 12.1141, + "step": 19259 + }, + { + "epoch": 1.0487839741888056, + "grad_norm": 0.6069424425625963, + "learning_rate": 9.695817021584454e-05, + "loss": 12.0484, + "step": 19260 + }, + { + "epoch": 1.0488384281853886, + "grad_norm": 0.5742100551630694, + "learning_rate": 9.694935605011017e-05, + "loss": 12.0797, + "step": 19261 + }, + { + "epoch": 1.0488928821819716, + "grad_norm": 0.5941016295730495, + "learning_rate": 9.69405419080981e-05, + "loss": 12.1006, + "step": 19262 + }, + { + "epoch": 1.0489473361785546, + "grad_norm": 0.554492287310076, + "learning_rate": 9.693172778987692e-05, + "loss": 12.0692, + "step": 19263 + }, + { + "epoch": 1.0490017901751376, + "grad_norm": 0.6211903334798932, + "learning_rate": 9.69229136955151e-05, + "loss": 11.9765, + "step": 19264 + }, + { + "epoch": 1.0490562441717206, + "grad_norm": 0.6354330720514915, + "learning_rate": 9.691409962508124e-05, + "loss": 12.0981, + "step": 19265 + }, + { + "epoch": 1.0491106981683036, + "grad_norm": 0.581561168575633, + "learning_rate": 9.690528557864386e-05, + "loss": 12.0893, + "step": 19266 + }, + { + "epoch": 1.0491651521648866, + "grad_norm": 0.5602093649184966, + "learning_rate": 9.689647155627149e-05, + "loss": 11.992, + "step": 19267 + }, + { + "epoch": 1.0492196061614698, + "grad_norm": 0.541758484464551, + "learning_rate": 9.688765755803268e-05, + "loss": 11.9762, + "step": 19268 + }, + { + "epoch": 1.0492740601580528, + "grad_norm": 0.5567906003713667, + "learning_rate": 9.687884358399594e-05, + "loss": 12.0616, + "step": 19269 + }, + { + "epoch": 1.0493285141546358, + "grad_norm": 0.5468525484934957, + "learning_rate": 9.687002963422986e-05, + "loss": 12.0693, + "step": 19270 + }, + { + "epoch": 1.0493829681512188, + "grad_norm": 0.6748688401673388, + "learning_rate": 9.686121570880294e-05, + "loss": 12.0306, + "step": 19271 + }, + { + "epoch": 1.0494374221478018, + "grad_norm": 0.6129731420684097, + "learning_rate": 9.685240180778376e-05, + "loss": 12.0475, + "step": 19272 + }, + { + "epoch": 1.0494918761443848, + "grad_norm": 0.5224925834604115, + "learning_rate": 9.684358793124084e-05, + "loss": 12.1006, + "step": 19273 + }, + { + "epoch": 1.0495463301409678, + "grad_norm": 0.525354978290664, + "learning_rate": 9.683477407924268e-05, + "loss": 12.0718, + "step": 19274 + }, + { + "epoch": 1.0496007841375508, + "grad_norm": 0.5501231338286172, + "learning_rate": 9.682596025185786e-05, + "loss": 12.037, + "step": 19275 + }, + { + "epoch": 1.0496552381341338, + "grad_norm": 0.5525824228493883, + "learning_rate": 9.68171464491549e-05, + "loss": 11.8471, + "step": 19276 + }, + { + "epoch": 1.0497096921307167, + "grad_norm": 0.5237595322292766, + "learning_rate": 9.680833267120234e-05, + "loss": 12.1528, + "step": 19277 + }, + { + "epoch": 1.0497641461272997, + "grad_norm": 0.547492121864182, + "learning_rate": 9.679951891806873e-05, + "loss": 12.1126, + "step": 19278 + }, + { + "epoch": 1.0498186001238827, + "grad_norm": 0.5423175265745502, + "learning_rate": 9.679070518982259e-05, + "loss": 12.0847, + "step": 19279 + }, + { + "epoch": 1.049873054120466, + "grad_norm": 0.5409640984493687, + "learning_rate": 9.678189148653246e-05, + "loss": 12.018, + "step": 19280 + }, + { + "epoch": 1.049927508117049, + "grad_norm": 0.5342685429477202, + "learning_rate": 9.677307780826687e-05, + "loss": 12.0387, + "step": 19281 + }, + { + "epoch": 1.049981962113632, + "grad_norm": 0.5299288139375267, + "learning_rate": 9.676426415509439e-05, + "loss": 12.0441, + "step": 19282 + }, + { + "epoch": 1.050036416110215, + "grad_norm": 0.664810509725584, + "learning_rate": 9.675545052708358e-05, + "loss": 12.095, + "step": 19283 + }, + { + "epoch": 1.050090870106798, + "grad_norm": 0.5478869296306361, + "learning_rate": 9.674663692430286e-05, + "loss": 12.0243, + "step": 19284 + }, + { + "epoch": 1.050145324103381, + "grad_norm": 0.5575254940100597, + "learning_rate": 9.673782334682085e-05, + "loss": 12.0398, + "step": 19285 + }, + { + "epoch": 1.050199778099964, + "grad_norm": 0.5672195448002826, + "learning_rate": 9.672900979470608e-05, + "loss": 12.0406, + "step": 19286 + }, + { + "epoch": 1.050254232096547, + "grad_norm": 0.5550502418984661, + "learning_rate": 9.672019626802708e-05, + "loss": 12.0973, + "step": 19287 + }, + { + "epoch": 1.0503086860931299, + "grad_norm": 0.5527939947632422, + "learning_rate": 9.671138276685238e-05, + "loss": 12.1604, + "step": 19288 + }, + { + "epoch": 1.0503631400897129, + "grad_norm": 0.524206438490076, + "learning_rate": 9.670256929125053e-05, + "loss": 12.1217, + "step": 19289 + }, + { + "epoch": 1.0504175940862959, + "grad_norm": 0.56160257257029, + "learning_rate": 9.669375584129005e-05, + "loss": 12.0312, + "step": 19290 + }, + { + "epoch": 1.050472048082879, + "grad_norm": 0.5204986254117848, + "learning_rate": 9.668494241703945e-05, + "loss": 12.0738, + "step": 19291 + }, + { + "epoch": 1.050526502079462, + "grad_norm": 0.5324262785130692, + "learning_rate": 9.667612901856732e-05, + "loss": 11.7358, + "step": 19292 + }, + { + "epoch": 1.050580956076045, + "grad_norm": 0.5772614596923383, + "learning_rate": 9.666731564594222e-05, + "loss": 12.0383, + "step": 19293 + }, + { + "epoch": 1.050635410072628, + "grad_norm": 0.5660892623770573, + "learning_rate": 9.665850229923258e-05, + "loss": 12.0707, + "step": 19294 + }, + { + "epoch": 1.050689864069211, + "grad_norm": 0.5317718569733886, + "learning_rate": 9.664968897850695e-05, + "loss": 12.0076, + "step": 19295 + }, + { + "epoch": 1.050744318065794, + "grad_norm": 0.5517253525771574, + "learning_rate": 9.664087568383394e-05, + "loss": 12.115, + "step": 19296 + }, + { + "epoch": 1.050798772062377, + "grad_norm": 0.5468316584293257, + "learning_rate": 9.663206241528204e-05, + "loss": 12.1009, + "step": 19297 + }, + { + "epoch": 1.05085322605896, + "grad_norm": 0.539461242898282, + "learning_rate": 9.662324917291979e-05, + "loss": 12.1092, + "step": 19298 + }, + { + "epoch": 1.050907680055543, + "grad_norm": 0.4899975306952434, + "learning_rate": 9.661443595681573e-05, + "loss": 11.9413, + "step": 19299 + }, + { + "epoch": 1.050962134052126, + "grad_norm": 0.5716632962528633, + "learning_rate": 9.660562276703838e-05, + "loss": 12.0186, + "step": 19300 + }, + { + "epoch": 1.051016588048709, + "grad_norm": 0.531144636239472, + "learning_rate": 9.659680960365626e-05, + "loss": 11.9862, + "step": 19301 + }, + { + "epoch": 1.051071042045292, + "grad_norm": 0.5140450260765325, + "learning_rate": 9.658799646673793e-05, + "loss": 12.1339, + "step": 19302 + }, + { + "epoch": 1.0511254960418752, + "grad_norm": 0.5405131973007933, + "learning_rate": 9.657918335635194e-05, + "loss": 12.002, + "step": 19303 + }, + { + "epoch": 1.0511799500384582, + "grad_norm": 0.5421925293191863, + "learning_rate": 9.657037027256676e-05, + "loss": 12.0472, + "step": 19304 + }, + { + "epoch": 1.0512344040350412, + "grad_norm": 0.5010776098536464, + "learning_rate": 9.656155721545094e-05, + "loss": 12.0539, + "step": 19305 + }, + { + "epoch": 1.0512888580316242, + "grad_norm": 0.5373765483696279, + "learning_rate": 9.655274418507307e-05, + "loss": 12.0694, + "step": 19306 + }, + { + "epoch": 1.0513433120282072, + "grad_norm": 0.5229203708756343, + "learning_rate": 9.654393118150159e-05, + "loss": 11.9288, + "step": 19307 + }, + { + "epoch": 1.0513977660247902, + "grad_norm": 0.523050323587943, + "learning_rate": 9.653511820480511e-05, + "loss": 12.0839, + "step": 19308 + }, + { + "epoch": 1.0514522200213732, + "grad_norm": 0.52587795112077, + "learning_rate": 9.652630525505213e-05, + "loss": 12.0395, + "step": 19309 + }, + { + "epoch": 1.0515066740179562, + "grad_norm": 0.5440393969364299, + "learning_rate": 9.651749233231117e-05, + "loss": 12.0127, + "step": 19310 + }, + { + "epoch": 1.0515611280145392, + "grad_norm": 0.5877520952516384, + "learning_rate": 9.65086794366508e-05, + "loss": 12.1713, + "step": 19311 + }, + { + "epoch": 1.0516155820111222, + "grad_norm": 0.5626253843201025, + "learning_rate": 9.649986656813951e-05, + "loss": 12.0419, + "step": 19312 + }, + { + "epoch": 1.0516700360077051, + "grad_norm": 0.560972630053483, + "learning_rate": 9.649105372684586e-05, + "loss": 11.9527, + "step": 19313 + }, + { + "epoch": 1.0517244900042884, + "grad_norm": 0.5201536054810402, + "learning_rate": 9.648224091283835e-05, + "loss": 11.8398, + "step": 19314 + }, + { + "epoch": 1.0517789440008714, + "grad_norm": 0.5708413242075481, + "learning_rate": 9.647342812618553e-05, + "loss": 12.0981, + "step": 19315 + }, + { + "epoch": 1.0518333979974543, + "grad_norm": 0.538158698770974, + "learning_rate": 9.646461536695591e-05, + "loss": 11.9265, + "step": 19316 + }, + { + "epoch": 1.0518878519940373, + "grad_norm": 0.5379127553390638, + "learning_rate": 9.645580263521805e-05, + "loss": 12.1438, + "step": 19317 + }, + { + "epoch": 1.0519423059906203, + "grad_norm": 0.5904144811653973, + "learning_rate": 9.644698993104044e-05, + "loss": 11.9552, + "step": 19318 + }, + { + "epoch": 1.0519967599872033, + "grad_norm": 0.5484578526745109, + "learning_rate": 9.643817725449163e-05, + "loss": 12.0061, + "step": 19319 + }, + { + "epoch": 1.0520512139837863, + "grad_norm": 0.5452909707857968, + "learning_rate": 9.642936460564019e-05, + "loss": 12.0431, + "step": 19320 + }, + { + "epoch": 1.0521056679803693, + "grad_norm": 0.5896543186751604, + "learning_rate": 9.642055198455457e-05, + "loss": 12.1793, + "step": 19321 + }, + { + "epoch": 1.0521601219769523, + "grad_norm": 0.5424352016462574, + "learning_rate": 9.641173939130337e-05, + "loss": 11.9724, + "step": 19322 + }, + { + "epoch": 1.0522145759735353, + "grad_norm": 0.548154158162294, + "learning_rate": 9.640292682595508e-05, + "loss": 11.9925, + "step": 19323 + }, + { + "epoch": 1.0522690299701183, + "grad_norm": 0.5113758994292995, + "learning_rate": 9.639411428857823e-05, + "loss": 12.145, + "step": 19324 + }, + { + "epoch": 1.0523234839667013, + "grad_norm": 0.5444394344892941, + "learning_rate": 9.638530177924136e-05, + "loss": 12.011, + "step": 19325 + }, + { + "epoch": 1.0523779379632845, + "grad_norm": 0.5504481687682757, + "learning_rate": 9.637648929801297e-05, + "loss": 11.9842, + "step": 19326 + }, + { + "epoch": 1.0524323919598675, + "grad_norm": 0.5950719535763734, + "learning_rate": 9.636767684496162e-05, + "loss": 12.0425, + "step": 19327 + }, + { + "epoch": 1.0524868459564505, + "grad_norm": 0.5807610266139246, + "learning_rate": 9.635886442015582e-05, + "loss": 12.0858, + "step": 19328 + }, + { + "epoch": 1.0525412999530335, + "grad_norm": 0.5517142506957727, + "learning_rate": 9.635005202366407e-05, + "loss": 11.974, + "step": 19329 + }, + { + "epoch": 1.0525957539496165, + "grad_norm": 0.6235125416785774, + "learning_rate": 9.634123965555495e-05, + "loss": 12.0126, + "step": 19330 + }, + { + "epoch": 1.0526502079461995, + "grad_norm": 0.5510346855094569, + "learning_rate": 9.633242731589698e-05, + "loss": 12.0586, + "step": 19331 + }, + { + "epoch": 1.0527046619427824, + "grad_norm": 0.5484858157775377, + "learning_rate": 9.632361500475866e-05, + "loss": 12.1546, + "step": 19332 + }, + { + "epoch": 1.0527591159393654, + "grad_norm": 0.5569662093016531, + "learning_rate": 9.631480272220855e-05, + "loss": 12.0479, + "step": 19333 + }, + { + "epoch": 1.0528135699359484, + "grad_norm": 0.5913187816809812, + "learning_rate": 9.630599046831513e-05, + "loss": 12.048, + "step": 19334 + }, + { + "epoch": 1.0528680239325314, + "grad_norm": 0.5415003726459723, + "learning_rate": 9.629717824314696e-05, + "loss": 12.1657, + "step": 19335 + }, + { + "epoch": 1.0529224779291144, + "grad_norm": 0.5880368853026926, + "learning_rate": 9.628836604677253e-05, + "loss": 11.9169, + "step": 19336 + }, + { + "epoch": 1.0529769319256976, + "grad_norm": 0.6214821322491036, + "learning_rate": 9.627955387926041e-05, + "loss": 11.9572, + "step": 19337 + }, + { + "epoch": 1.0530313859222806, + "grad_norm": 0.5374898237994633, + "learning_rate": 9.627074174067909e-05, + "loss": 12.0442, + "step": 19338 + }, + { + "epoch": 1.0530858399188636, + "grad_norm": 0.5723645363983014, + "learning_rate": 9.62619296310971e-05, + "loss": 12.2039, + "step": 19339 + }, + { + "epoch": 1.0531402939154466, + "grad_norm": 0.5839935285049328, + "learning_rate": 9.625311755058296e-05, + "loss": 11.9861, + "step": 19340 + }, + { + "epoch": 1.0531947479120296, + "grad_norm": 0.5062275376127757, + "learning_rate": 9.624430549920523e-05, + "loss": 11.994, + "step": 19341 + }, + { + "epoch": 1.0532492019086126, + "grad_norm": 0.5318769916007122, + "learning_rate": 9.62354934770324e-05, + "loss": 12.105, + "step": 19342 + }, + { + "epoch": 1.0533036559051956, + "grad_norm": 0.5984575688586973, + "learning_rate": 9.622668148413306e-05, + "loss": 12.076, + "step": 19343 + }, + { + "epoch": 1.0533581099017786, + "grad_norm": 0.5588173460010458, + "learning_rate": 9.621786952057561e-05, + "loss": 11.9369, + "step": 19344 + }, + { + "epoch": 1.0534125638983616, + "grad_norm": 0.5773820580009359, + "learning_rate": 9.620905758642867e-05, + "loss": 12.0063, + "step": 19345 + }, + { + "epoch": 1.0534670178949446, + "grad_norm": 0.5589950202886801, + "learning_rate": 9.620024568176071e-05, + "loss": 12.0991, + "step": 19346 + }, + { + "epoch": 1.0535214718915276, + "grad_norm": 0.6144528426187391, + "learning_rate": 9.61914338066403e-05, + "loss": 11.9937, + "step": 19347 + }, + { + "epoch": 1.0535759258881106, + "grad_norm": 0.5543121445619258, + "learning_rate": 9.618262196113594e-05, + "loss": 12.1332, + "step": 19348 + }, + { + "epoch": 1.0536303798846938, + "grad_norm": 0.5227719779637813, + "learning_rate": 9.617381014531614e-05, + "loss": 12.1155, + "step": 19349 + }, + { + "epoch": 1.0536848338812768, + "grad_norm": 0.47408434348923856, + "learning_rate": 9.616499835924943e-05, + "loss": 12.0586, + "step": 19350 + }, + { + "epoch": 1.0537392878778598, + "grad_norm": 0.5539712618336022, + "learning_rate": 9.615618660300434e-05, + "loss": 12.0447, + "step": 19351 + }, + { + "epoch": 1.0537937418744427, + "grad_norm": 0.5532620334707203, + "learning_rate": 9.614737487664938e-05, + "loss": 12.1215, + "step": 19352 + }, + { + "epoch": 1.0538481958710257, + "grad_norm": 0.4965213600162703, + "learning_rate": 9.613856318025308e-05, + "loss": 11.9894, + "step": 19353 + }, + { + "epoch": 1.0539026498676087, + "grad_norm": 0.5440742933537539, + "learning_rate": 9.612975151388401e-05, + "loss": 12.0088, + "step": 19354 + }, + { + "epoch": 1.0539571038641917, + "grad_norm": 0.6165878924064676, + "learning_rate": 9.61209398776106e-05, + "loss": 11.9843, + "step": 19355 + }, + { + "epoch": 1.0540115578607747, + "grad_norm": 0.5577108598869567, + "learning_rate": 9.61121282715014e-05, + "loss": 12.0244, + "step": 19356 + }, + { + "epoch": 1.0540660118573577, + "grad_norm": 0.5275445677344449, + "learning_rate": 9.610331669562495e-05, + "loss": 12.0602, + "step": 19357 + }, + { + "epoch": 1.0541204658539407, + "grad_norm": 0.5336427825182262, + "learning_rate": 9.609450515004977e-05, + "loss": 12.1088, + "step": 19358 + }, + { + "epoch": 1.0541749198505237, + "grad_norm": 0.5544962996409228, + "learning_rate": 9.608569363484436e-05, + "loss": 12.0743, + "step": 19359 + }, + { + "epoch": 1.0542293738471067, + "grad_norm": 0.5712553238000525, + "learning_rate": 9.607688215007728e-05, + "loss": 12.0085, + "step": 19360 + }, + { + "epoch": 1.05428382784369, + "grad_norm": 0.5872483044221241, + "learning_rate": 9.6068070695817e-05, + "loss": 12.0825, + "step": 19361 + }, + { + "epoch": 1.054338281840273, + "grad_norm": 0.5301262680488378, + "learning_rate": 9.605925927213207e-05, + "loss": 12.0546, + "step": 19362 + }, + { + "epoch": 1.0543927358368559, + "grad_norm": 0.5941182422544317, + "learning_rate": 9.605044787909098e-05, + "loss": 11.9446, + "step": 19363 + }, + { + "epoch": 1.0544471898334389, + "grad_norm": 0.5981957575214851, + "learning_rate": 9.604163651676232e-05, + "loss": 11.9136, + "step": 19364 + }, + { + "epoch": 1.0545016438300219, + "grad_norm": 0.5661166063853473, + "learning_rate": 9.603282518521453e-05, + "loss": 12.0079, + "step": 19365 + }, + { + "epoch": 1.0545560978266049, + "grad_norm": 0.5799740773054626, + "learning_rate": 9.602401388451615e-05, + "loss": 11.96, + "step": 19366 + }, + { + "epoch": 1.0546105518231879, + "grad_norm": 0.5895977707861102, + "learning_rate": 9.60152026147357e-05, + "loss": 12.0868, + "step": 19367 + }, + { + "epoch": 1.0546650058197709, + "grad_norm": 0.5138707350489121, + "learning_rate": 9.60063913759417e-05, + "loss": 12.1091, + "step": 19368 + }, + { + "epoch": 1.0547194598163538, + "grad_norm": 0.5883943166266282, + "learning_rate": 9.599758016820269e-05, + "loss": 11.9937, + "step": 19369 + }, + { + "epoch": 1.0547739138129368, + "grad_norm": 0.5673055198369863, + "learning_rate": 9.598876899158715e-05, + "loss": 12.0495, + "step": 19370 + }, + { + "epoch": 1.0548283678095198, + "grad_norm": 0.5766062835107026, + "learning_rate": 9.597995784616363e-05, + "loss": 12.0353, + "step": 19371 + }, + { + "epoch": 1.0548828218061028, + "grad_norm": 0.6058495704831871, + "learning_rate": 9.597114673200062e-05, + "loss": 12.0471, + "step": 19372 + }, + { + "epoch": 1.054937275802686, + "grad_norm": 0.5255673156686416, + "learning_rate": 9.596233564916665e-05, + "loss": 12.0796, + "step": 19373 + }, + { + "epoch": 1.054991729799269, + "grad_norm": 0.6016153150932537, + "learning_rate": 9.595352459773025e-05, + "loss": 12.1089, + "step": 19374 + }, + { + "epoch": 1.055046183795852, + "grad_norm": 0.556265804799959, + "learning_rate": 9.594471357775993e-05, + "loss": 12.0073, + "step": 19375 + }, + { + "epoch": 1.055100637792435, + "grad_norm": 0.5608704283257167, + "learning_rate": 9.593590258932417e-05, + "loss": 11.8563, + "step": 19376 + }, + { + "epoch": 1.055155091789018, + "grad_norm": 0.5781513393423386, + "learning_rate": 9.592709163249153e-05, + "loss": 12.1517, + "step": 19377 + }, + { + "epoch": 1.055209545785601, + "grad_norm": 0.5657138345497488, + "learning_rate": 9.591828070733047e-05, + "loss": 12.0803, + "step": 19378 + }, + { + "epoch": 1.055263999782184, + "grad_norm": 0.4916113789165632, + "learning_rate": 9.590946981390958e-05, + "loss": 12.1075, + "step": 19379 + }, + { + "epoch": 1.055318453778767, + "grad_norm": 0.5017788759661065, + "learning_rate": 9.590065895229732e-05, + "loss": 12.0225, + "step": 19380 + }, + { + "epoch": 1.05537290777535, + "grad_norm": 0.6423824315364451, + "learning_rate": 9.589184812256225e-05, + "loss": 12.0609, + "step": 19381 + }, + { + "epoch": 1.055427361771933, + "grad_norm": 0.5300731974853954, + "learning_rate": 9.588303732477283e-05, + "loss": 12.0407, + "step": 19382 + }, + { + "epoch": 1.055481815768516, + "grad_norm": 0.533408299576154, + "learning_rate": 9.587422655899762e-05, + "loss": 12.114, + "step": 19383 + }, + { + "epoch": 1.0555362697650992, + "grad_norm": 0.565065946695608, + "learning_rate": 9.586541582530514e-05, + "loss": 12.0338, + "step": 19384 + }, + { + "epoch": 1.0555907237616822, + "grad_norm": 0.5949280397479372, + "learning_rate": 9.585660512376384e-05, + "loss": 11.9603, + "step": 19385 + }, + { + "epoch": 1.0556451777582652, + "grad_norm": 0.5447417047295078, + "learning_rate": 9.58477944544423e-05, + "loss": 11.9043, + "step": 19386 + }, + { + "epoch": 1.0556996317548482, + "grad_norm": 0.6134685780708565, + "learning_rate": 9.583898381740898e-05, + "loss": 12.0248, + "step": 19387 + }, + { + "epoch": 1.0557540857514311, + "grad_norm": 0.5944890783600187, + "learning_rate": 9.583017321273243e-05, + "loss": 11.9876, + "step": 19388 + }, + { + "epoch": 1.0558085397480141, + "grad_norm": 0.5340596628731121, + "learning_rate": 9.582136264048114e-05, + "loss": 12.1036, + "step": 19389 + }, + { + "epoch": 1.0558629937445971, + "grad_norm": 0.6800523990437837, + "learning_rate": 9.581255210072364e-05, + "loss": 12.0343, + "step": 19390 + }, + { + "epoch": 1.0559174477411801, + "grad_norm": 0.5345874595919592, + "learning_rate": 9.580374159352845e-05, + "loss": 12.071, + "step": 19391 + }, + { + "epoch": 1.0559719017377631, + "grad_norm": 0.48246863244610083, + "learning_rate": 9.579493111896406e-05, + "loss": 11.9095, + "step": 19392 + }, + { + "epoch": 1.0560263557343461, + "grad_norm": 0.564840799839841, + "learning_rate": 9.5786120677099e-05, + "loss": 11.8749, + "step": 19393 + }, + { + "epoch": 1.056080809730929, + "grad_norm": 0.5859494509249186, + "learning_rate": 9.577731026800179e-05, + "loss": 12.0816, + "step": 19394 + }, + { + "epoch": 1.056135263727512, + "grad_norm": 0.5356116104743887, + "learning_rate": 9.57684998917409e-05, + "loss": 11.9958, + "step": 19395 + }, + { + "epoch": 1.0561897177240953, + "grad_norm": 0.602266790571894, + "learning_rate": 9.575968954838487e-05, + "loss": 12.1512, + "step": 19396 + }, + { + "epoch": 1.0562441717206783, + "grad_norm": 0.6157757269371643, + "learning_rate": 9.57508792380022e-05, + "loss": 12.1429, + "step": 19397 + }, + { + "epoch": 1.0562986257172613, + "grad_norm": 0.5980005309795724, + "learning_rate": 9.57420689606614e-05, + "loss": 12.0587, + "step": 19398 + }, + { + "epoch": 1.0563530797138443, + "grad_norm": 0.5908034979075625, + "learning_rate": 9.5733258716431e-05, + "loss": 12.1337, + "step": 19399 + }, + { + "epoch": 1.0564075337104273, + "grad_norm": 0.5343902534124586, + "learning_rate": 9.572444850537948e-05, + "loss": 11.9634, + "step": 19400 + }, + { + "epoch": 1.0564619877070103, + "grad_norm": 0.5842238118117028, + "learning_rate": 9.571563832757536e-05, + "loss": 12.0509, + "step": 19401 + }, + { + "epoch": 1.0565164417035933, + "grad_norm": 0.61275874507477, + "learning_rate": 9.570682818308715e-05, + "loss": 12.0451, + "step": 19402 + }, + { + "epoch": 1.0565708957001763, + "grad_norm": 0.5382671440730098, + "learning_rate": 9.56980180719834e-05, + "loss": 12.1103, + "step": 19403 + }, + { + "epoch": 1.0566253496967593, + "grad_norm": 0.5133640065547999, + "learning_rate": 9.568920799433261e-05, + "loss": 11.9877, + "step": 19404 + }, + { + "epoch": 1.0566798036933422, + "grad_norm": 0.5625235678126909, + "learning_rate": 9.568039795020319e-05, + "loss": 12.07, + "step": 19405 + }, + { + "epoch": 1.0567342576899252, + "grad_norm": 0.6284570348095425, + "learning_rate": 9.567158793966374e-05, + "loss": 12.1412, + "step": 19406 + }, + { + "epoch": 1.0567887116865085, + "grad_norm": 0.576328893838415, + "learning_rate": 9.566277796278276e-05, + "loss": 11.9983, + "step": 19407 + }, + { + "epoch": 1.0568431656830914, + "grad_norm": 0.5531351910635469, + "learning_rate": 9.565396801962874e-05, + "loss": 12.0894, + "step": 19408 + }, + { + "epoch": 1.0568976196796744, + "grad_norm": 0.5862286076121059, + "learning_rate": 9.56451581102702e-05, + "loss": 12.0713, + "step": 19409 + }, + { + "epoch": 1.0569520736762574, + "grad_norm": 0.49995967750380066, + "learning_rate": 9.563634823477563e-05, + "loss": 12.0698, + "step": 19410 + }, + { + "epoch": 1.0570065276728404, + "grad_norm": 0.5108491936131714, + "learning_rate": 9.562753839321355e-05, + "loss": 12.0247, + "step": 19411 + }, + { + "epoch": 1.0570609816694234, + "grad_norm": 0.5810846071878304, + "learning_rate": 9.561872858565245e-05, + "loss": 12.1781, + "step": 19412 + }, + { + "epoch": 1.0571154356660064, + "grad_norm": 0.5720799743612933, + "learning_rate": 9.560991881216088e-05, + "loss": 11.911, + "step": 19413 + }, + { + "epoch": 1.0571698896625894, + "grad_norm": 0.5087805100179055, + "learning_rate": 9.560110907280734e-05, + "loss": 11.9394, + "step": 19414 + }, + { + "epoch": 1.0572243436591724, + "grad_norm": 0.5364730959626036, + "learning_rate": 9.559229936766028e-05, + "loss": 12.0287, + "step": 19415 + }, + { + "epoch": 1.0572787976557554, + "grad_norm": 0.5284840458321186, + "learning_rate": 9.558348969678822e-05, + "loss": 11.9812, + "step": 19416 + }, + { + "epoch": 1.0573332516523384, + "grad_norm": 0.5570402363204487, + "learning_rate": 9.55746800602597e-05, + "loss": 12.0449, + "step": 19417 + }, + { + "epoch": 1.0573877056489214, + "grad_norm": 0.6044717380725996, + "learning_rate": 9.556587045814321e-05, + "loss": 12.1534, + "step": 19418 + }, + { + "epoch": 1.0574421596455046, + "grad_norm": 0.5526742651145066, + "learning_rate": 9.555706089050727e-05, + "loss": 12.1023, + "step": 19419 + }, + { + "epoch": 1.0574966136420876, + "grad_norm": 0.5613785787803098, + "learning_rate": 9.554825135742037e-05, + "loss": 12.0306, + "step": 19420 + }, + { + "epoch": 1.0575510676386706, + "grad_norm": 0.5217897384186181, + "learning_rate": 9.553944185895098e-05, + "loss": 11.9021, + "step": 19421 + }, + { + "epoch": 1.0576055216352536, + "grad_norm": 0.5383082673278936, + "learning_rate": 9.553063239516766e-05, + "loss": 12.214, + "step": 19422 + }, + { + "epoch": 1.0576599756318366, + "grad_norm": 0.5102142305765428, + "learning_rate": 9.552182296613888e-05, + "loss": 12.0834, + "step": 19423 + }, + { + "epoch": 1.0577144296284196, + "grad_norm": 0.5830261124845366, + "learning_rate": 9.55130135719332e-05, + "loss": 12.0077, + "step": 19424 + }, + { + "epoch": 1.0577688836250025, + "grad_norm": 0.5591125259697896, + "learning_rate": 9.550420421261905e-05, + "loss": 12.0461, + "step": 19425 + }, + { + "epoch": 1.0578233376215855, + "grad_norm": 0.5584932430347865, + "learning_rate": 9.549539488826497e-05, + "loss": 12.0933, + "step": 19426 + }, + { + "epoch": 1.0578777916181685, + "grad_norm": 0.606281460145651, + "learning_rate": 9.548658559893942e-05, + "loss": 12.1265, + "step": 19427 + }, + { + "epoch": 1.0579322456147515, + "grad_norm": 0.5216271627094825, + "learning_rate": 9.547777634471095e-05, + "loss": 12.0587, + "step": 19428 + }, + { + "epoch": 1.0579866996113345, + "grad_norm": 0.6148456316966616, + "learning_rate": 9.546896712564807e-05, + "loss": 12.3049, + "step": 19429 + }, + { + "epoch": 1.0580411536079175, + "grad_norm": 0.5183933982480431, + "learning_rate": 9.546015794181925e-05, + "loss": 12.1366, + "step": 19430 + }, + { + "epoch": 1.0580956076045007, + "grad_norm": 0.5372059927978043, + "learning_rate": 9.5451348793293e-05, + "loss": 11.9844, + "step": 19431 + }, + { + "epoch": 1.0581500616010837, + "grad_norm": 0.5360406190489272, + "learning_rate": 9.544253968013784e-05, + "loss": 11.9618, + "step": 19432 + }, + { + "epoch": 1.0582045155976667, + "grad_norm": 0.5348052474035279, + "learning_rate": 9.543373060242225e-05, + "loss": 12.1427, + "step": 19433 + }, + { + "epoch": 1.0582589695942497, + "grad_norm": 0.5252043494377767, + "learning_rate": 9.542492156021475e-05, + "loss": 12.0432, + "step": 19434 + }, + { + "epoch": 1.0583134235908327, + "grad_norm": 0.5148590361233946, + "learning_rate": 9.541611255358381e-05, + "loss": 11.8975, + "step": 19435 + }, + { + "epoch": 1.0583678775874157, + "grad_norm": 0.5501657885838557, + "learning_rate": 9.540730358259795e-05, + "loss": 12.0395, + "step": 19436 + }, + { + "epoch": 1.0584223315839987, + "grad_norm": 0.5489019089616707, + "learning_rate": 9.539849464732566e-05, + "loss": 12.0335, + "step": 19437 + }, + { + "epoch": 1.0584767855805817, + "grad_norm": 0.6431949386899316, + "learning_rate": 9.538968574783543e-05, + "loss": 12.0754, + "step": 19438 + }, + { + "epoch": 1.0585312395771647, + "grad_norm": 0.6179053784918723, + "learning_rate": 9.53808768841958e-05, + "loss": 12.1651, + "step": 19439 + }, + { + "epoch": 1.0585856935737477, + "grad_norm": 0.529575356971183, + "learning_rate": 9.537206805647524e-05, + "loss": 11.9781, + "step": 19440 + }, + { + "epoch": 1.0586401475703306, + "grad_norm": 0.5669109622144446, + "learning_rate": 9.536325926474227e-05, + "loss": 12.092, + "step": 19441 + }, + { + "epoch": 1.0586946015669136, + "grad_norm": 0.5637886966334247, + "learning_rate": 9.535445050906536e-05, + "loss": 12.0313, + "step": 19442 + }, + { + "epoch": 1.0587490555634969, + "grad_norm": 0.5354476191036248, + "learning_rate": 9.534564178951302e-05, + "loss": 11.9178, + "step": 19443 + }, + { + "epoch": 1.0588035095600798, + "grad_norm": 0.5809895908083332, + "learning_rate": 9.533683310615378e-05, + "loss": 12.1999, + "step": 19444 + }, + { + "epoch": 1.0588579635566628, + "grad_norm": 0.5156859117911462, + "learning_rate": 9.532802445905608e-05, + "loss": 12.0573, + "step": 19445 + }, + { + "epoch": 1.0589124175532458, + "grad_norm": 0.524969301738632, + "learning_rate": 9.531921584828845e-05, + "loss": 12.1403, + "step": 19446 + }, + { + "epoch": 1.0589668715498288, + "grad_norm": 0.5506294772032159, + "learning_rate": 9.531040727391938e-05, + "loss": 11.8896, + "step": 19447 + }, + { + "epoch": 1.0590213255464118, + "grad_norm": 0.515800456877427, + "learning_rate": 9.530159873601738e-05, + "loss": 12.0135, + "step": 19448 + }, + { + "epoch": 1.0590757795429948, + "grad_norm": 0.5434180280161715, + "learning_rate": 9.529279023465089e-05, + "loss": 12.0846, + "step": 19449 + }, + { + "epoch": 1.0591302335395778, + "grad_norm": 0.49115390638875817, + "learning_rate": 9.528398176988849e-05, + "loss": 11.9628, + "step": 19450 + }, + { + "epoch": 1.0591846875361608, + "grad_norm": 0.5960538189920022, + "learning_rate": 9.527517334179864e-05, + "loss": 12.1663, + "step": 19451 + }, + { + "epoch": 1.0592391415327438, + "grad_norm": 0.6018235777881578, + "learning_rate": 9.526636495044984e-05, + "loss": 12.0909, + "step": 19452 + }, + { + "epoch": 1.0592935955293268, + "grad_norm": 0.5492354738252602, + "learning_rate": 9.525755659591057e-05, + "loss": 12.0349, + "step": 19453 + }, + { + "epoch": 1.05934804952591, + "grad_norm": 0.5318644500366964, + "learning_rate": 9.524874827824936e-05, + "loss": 12.1293, + "step": 19454 + }, + { + "epoch": 1.059402503522493, + "grad_norm": 0.5377986794027186, + "learning_rate": 9.523993999753466e-05, + "loss": 12.0167, + "step": 19455 + }, + { + "epoch": 1.059456957519076, + "grad_norm": 0.5683297293895401, + "learning_rate": 9.523113175383498e-05, + "loss": 12.1201, + "step": 19456 + }, + { + "epoch": 1.059511411515659, + "grad_norm": 0.5281439523592487, + "learning_rate": 9.522232354721882e-05, + "loss": 12.0985, + "step": 19457 + }, + { + "epoch": 1.059565865512242, + "grad_norm": 0.5574099817183141, + "learning_rate": 9.521351537775467e-05, + "loss": 12.0668, + "step": 19458 + }, + { + "epoch": 1.059620319508825, + "grad_norm": 0.5481013815660997, + "learning_rate": 9.520470724551104e-05, + "loss": 12.0165, + "step": 19459 + }, + { + "epoch": 1.059674773505408, + "grad_norm": 0.5258555008954695, + "learning_rate": 9.51958991505564e-05, + "loss": 12.1447, + "step": 19460 + }, + { + "epoch": 1.059729227501991, + "grad_norm": 0.5913586523355205, + "learning_rate": 9.518709109295922e-05, + "loss": 11.9954, + "step": 19461 + }, + { + "epoch": 1.059783681498574, + "grad_norm": 0.5426523361113232, + "learning_rate": 9.517828307278807e-05, + "loss": 12.1235, + "step": 19462 + }, + { + "epoch": 1.059838135495157, + "grad_norm": 0.6265948071629064, + "learning_rate": 9.51694750901114e-05, + "loss": 12.0963, + "step": 19463 + }, + { + "epoch": 1.05989258949174, + "grad_norm": 0.5890047993985151, + "learning_rate": 9.516066714499772e-05, + "loss": 12.1072, + "step": 19464 + }, + { + "epoch": 1.059947043488323, + "grad_norm": 0.5556535862287739, + "learning_rate": 9.515185923751547e-05, + "loss": 12.0839, + "step": 19465 + }, + { + "epoch": 1.0600014974849061, + "grad_norm": 0.5251302373355244, + "learning_rate": 9.514305136773318e-05, + "loss": 12.1077, + "step": 19466 + }, + { + "epoch": 1.0600559514814891, + "grad_norm": 0.5924405189573182, + "learning_rate": 9.513424353571934e-05, + "loss": 12.1408, + "step": 19467 + }, + { + "epoch": 1.0601104054780721, + "grad_norm": 0.533802556666249, + "learning_rate": 9.512543574154245e-05, + "loss": 11.9806, + "step": 19468 + }, + { + "epoch": 1.060164859474655, + "grad_norm": 0.5166721009870676, + "learning_rate": 9.511662798527096e-05, + "loss": 12.1354, + "step": 19469 + }, + { + "epoch": 1.060219313471238, + "grad_norm": 0.5407636437450513, + "learning_rate": 9.510782026697343e-05, + "loss": 11.9844, + "step": 19470 + }, + { + "epoch": 1.060273767467821, + "grad_norm": 0.5331792045706529, + "learning_rate": 9.509901258671827e-05, + "loss": 12.0391, + "step": 19471 + }, + { + "epoch": 1.060328221464404, + "grad_norm": 0.5508183535166898, + "learning_rate": 9.5090204944574e-05, + "loss": 12.0949, + "step": 19472 + }, + { + "epoch": 1.060382675460987, + "grad_norm": 0.5688078405882937, + "learning_rate": 9.508139734060915e-05, + "loss": 12.086, + "step": 19473 + }, + { + "epoch": 1.06043712945757, + "grad_norm": 0.5733624994938668, + "learning_rate": 9.507258977489223e-05, + "loss": 12.0644, + "step": 19474 + }, + { + "epoch": 1.060491583454153, + "grad_norm": 0.5443580440120191, + "learning_rate": 9.506378224749163e-05, + "loss": 12.0725, + "step": 19475 + }, + { + "epoch": 1.060546037450736, + "grad_norm": 0.5061060765819786, + "learning_rate": 9.505497475847586e-05, + "loss": 12.0051, + "step": 19476 + }, + { + "epoch": 1.0606004914473193, + "grad_norm": 0.571105398310208, + "learning_rate": 9.504616730791345e-05, + "loss": 12.1534, + "step": 19477 + }, + { + "epoch": 1.0606549454439023, + "grad_norm": 0.5546744851328956, + "learning_rate": 9.503735989587289e-05, + "loss": 12.0144, + "step": 19478 + }, + { + "epoch": 1.0607093994404853, + "grad_norm": 0.5460818596781322, + "learning_rate": 9.502855252242264e-05, + "loss": 12.0555, + "step": 19479 + }, + { + "epoch": 1.0607638534370682, + "grad_norm": 0.545806618163049, + "learning_rate": 9.50197451876312e-05, + "loss": 11.9989, + "step": 19480 + }, + { + "epoch": 1.0608183074336512, + "grad_norm": 0.5690009421971667, + "learning_rate": 9.501093789156706e-05, + "loss": 12.1087, + "step": 19481 + }, + { + "epoch": 1.0608727614302342, + "grad_norm": 0.5247933770362788, + "learning_rate": 9.50021306342987e-05, + "loss": 12.0453, + "step": 19482 + }, + { + "epoch": 1.0609272154268172, + "grad_norm": 0.5369592685857938, + "learning_rate": 9.49933234158946e-05, + "loss": 11.8929, + "step": 19483 + }, + { + "epoch": 1.0609816694234002, + "grad_norm": 0.5544839201165852, + "learning_rate": 9.49845162364233e-05, + "loss": 12.0165, + "step": 19484 + }, + { + "epoch": 1.0610361234199832, + "grad_norm": 0.5717511692715508, + "learning_rate": 9.497570909595322e-05, + "loss": 12.03, + "step": 19485 + }, + { + "epoch": 1.0610905774165662, + "grad_norm": 0.5651487710115926, + "learning_rate": 9.496690199455286e-05, + "loss": 12.047, + "step": 19486 + }, + { + "epoch": 1.0611450314131492, + "grad_norm": 0.5318155068592443, + "learning_rate": 9.49580949322907e-05, + "loss": 12.0402, + "step": 19487 + }, + { + "epoch": 1.0611994854097322, + "grad_norm": 0.5704015631935876, + "learning_rate": 9.494928790923525e-05, + "loss": 12.0697, + "step": 19488 + }, + { + "epoch": 1.0612539394063154, + "grad_norm": 0.513769491953973, + "learning_rate": 9.494048092545499e-05, + "loss": 11.8972, + "step": 19489 + }, + { + "epoch": 1.0613083934028984, + "grad_norm": 0.5308279098688354, + "learning_rate": 9.49316739810184e-05, + "loss": 11.9496, + "step": 19490 + }, + { + "epoch": 1.0613628473994814, + "grad_norm": 0.5652421659166454, + "learning_rate": 9.492286707599395e-05, + "loss": 12.0402, + "step": 19491 + }, + { + "epoch": 1.0614173013960644, + "grad_norm": 0.6012514551528869, + "learning_rate": 9.491406021045016e-05, + "loss": 12.0058, + "step": 19492 + }, + { + "epoch": 1.0614717553926474, + "grad_norm": 0.6147753858557873, + "learning_rate": 9.490525338445547e-05, + "loss": 12.1222, + "step": 19493 + }, + { + "epoch": 1.0615262093892304, + "grad_norm": 0.4778765331953231, + "learning_rate": 9.489644659807842e-05, + "loss": 12.003, + "step": 19494 + }, + { + "epoch": 1.0615806633858134, + "grad_norm": 0.5490182527436342, + "learning_rate": 9.488763985138742e-05, + "loss": 12.0958, + "step": 19495 + }, + { + "epoch": 1.0616351173823964, + "grad_norm": 0.6310603682268788, + "learning_rate": 9.487883314445101e-05, + "loss": 12.0279, + "step": 19496 + }, + { + "epoch": 1.0616895713789793, + "grad_norm": 0.5287781068237357, + "learning_rate": 9.487002647733763e-05, + "loss": 11.9563, + "step": 19497 + }, + { + "epoch": 1.0617440253755623, + "grad_norm": 0.5139150948204092, + "learning_rate": 9.486121985011579e-05, + "loss": 12.152, + "step": 19498 + }, + { + "epoch": 1.0617984793721453, + "grad_norm": 0.537475871099728, + "learning_rate": 9.485241326285397e-05, + "loss": 11.9787, + "step": 19499 + }, + { + "epoch": 1.0618529333687283, + "grad_norm": 0.5714777270599907, + "learning_rate": 9.484360671562064e-05, + "loss": 11.9879, + "step": 19500 + }, + { + "epoch": 1.0619073873653115, + "grad_norm": 0.6072144152856551, + "learning_rate": 9.483480020848431e-05, + "loss": 12.0891, + "step": 19501 + }, + { + "epoch": 1.0619618413618945, + "grad_norm": 0.5306079632416387, + "learning_rate": 9.482599374151342e-05, + "loss": 12.0473, + "step": 19502 + }, + { + "epoch": 1.0620162953584775, + "grad_norm": 0.571602416194433, + "learning_rate": 9.481718731477648e-05, + "loss": 12.0993, + "step": 19503 + }, + { + "epoch": 1.0620707493550605, + "grad_norm": 0.5274166829278563, + "learning_rate": 9.480838092834196e-05, + "loss": 12.0607, + "step": 19504 + }, + { + "epoch": 1.0621252033516435, + "grad_norm": 0.5308149700387995, + "learning_rate": 9.479957458227836e-05, + "loss": 12.0445, + "step": 19505 + }, + { + "epoch": 1.0621796573482265, + "grad_norm": 0.5719164937398165, + "learning_rate": 9.479076827665413e-05, + "loss": 12.2446, + "step": 19506 + }, + { + "epoch": 1.0622341113448095, + "grad_norm": 0.5484840373867881, + "learning_rate": 9.478196201153775e-05, + "loss": 12.0868, + "step": 19507 + }, + { + "epoch": 1.0622885653413925, + "grad_norm": 0.5174545724860756, + "learning_rate": 9.47731557869977e-05, + "loss": 12.0515, + "step": 19508 + }, + { + "epoch": 1.0623430193379755, + "grad_norm": 0.5325860096749648, + "learning_rate": 9.476434960310248e-05, + "loss": 12.0681, + "step": 19509 + }, + { + "epoch": 1.0623974733345585, + "grad_norm": 0.5766542534429969, + "learning_rate": 9.475554345992052e-05, + "loss": 12.1705, + "step": 19510 + }, + { + "epoch": 1.0624519273311415, + "grad_norm": 0.6170470995755548, + "learning_rate": 9.474673735752038e-05, + "loss": 12.1246, + "step": 19511 + }, + { + "epoch": 1.0625063813277245, + "grad_norm": 0.5433823077381589, + "learning_rate": 9.473793129597047e-05, + "loss": 12.1055, + "step": 19512 + }, + { + "epoch": 1.0625608353243077, + "grad_norm": 0.5693997414772987, + "learning_rate": 9.472912527533929e-05, + "loss": 12.0046, + "step": 19513 + }, + { + "epoch": 1.0626152893208907, + "grad_norm": 0.5397460210874416, + "learning_rate": 9.472031929569533e-05, + "loss": 12.1269, + "step": 19514 + }, + { + "epoch": 1.0626697433174737, + "grad_norm": 0.6686676208401305, + "learning_rate": 9.471151335710706e-05, + "loss": 12.0209, + "step": 19515 + }, + { + "epoch": 1.0627241973140567, + "grad_norm": 0.590749113523507, + "learning_rate": 9.470270745964294e-05, + "loss": 12.106, + "step": 19516 + }, + { + "epoch": 1.0627786513106396, + "grad_norm": 0.5271313705140186, + "learning_rate": 9.469390160337144e-05, + "loss": 12.0992, + "step": 19517 + }, + { + "epoch": 1.0628331053072226, + "grad_norm": 0.623477010706432, + "learning_rate": 9.468509578836105e-05, + "loss": 12.0731, + "step": 19518 + }, + { + "epoch": 1.0628875593038056, + "grad_norm": 0.6143950276182006, + "learning_rate": 9.467629001468025e-05, + "loss": 12.1269, + "step": 19519 + }, + { + "epoch": 1.0629420133003886, + "grad_norm": 0.5382089392313393, + "learning_rate": 9.466748428239753e-05, + "loss": 12.0799, + "step": 19520 + }, + { + "epoch": 1.0629964672969716, + "grad_norm": 0.5141725536001891, + "learning_rate": 9.465867859158131e-05, + "loss": 12.1073, + "step": 19521 + }, + { + "epoch": 1.0630509212935546, + "grad_norm": 0.5260665309132682, + "learning_rate": 9.464987294230012e-05, + "loss": 11.9465, + "step": 19522 + }, + { + "epoch": 1.0631053752901376, + "grad_norm": 0.5607901580338629, + "learning_rate": 9.464106733462242e-05, + "loss": 11.952, + "step": 19523 + }, + { + "epoch": 1.0631598292867208, + "grad_norm": 0.5205969784510579, + "learning_rate": 9.463226176861668e-05, + "loss": 12.0901, + "step": 19524 + }, + { + "epoch": 1.0632142832833038, + "grad_norm": 0.5739424072770057, + "learning_rate": 9.46234562443514e-05, + "loss": 12.0407, + "step": 19525 + }, + { + "epoch": 1.0632687372798868, + "grad_norm": 0.6787278281315261, + "learning_rate": 9.461465076189499e-05, + "loss": 12.1001, + "step": 19526 + }, + { + "epoch": 1.0633231912764698, + "grad_norm": 0.541509223022238, + "learning_rate": 9.460584532131596e-05, + "loss": 12.0837, + "step": 19527 + }, + { + "epoch": 1.0633776452730528, + "grad_norm": 0.5348501043810973, + "learning_rate": 9.45970399226828e-05, + "loss": 12.1429, + "step": 19528 + }, + { + "epoch": 1.0634320992696358, + "grad_norm": 0.5568295755065166, + "learning_rate": 9.458823456606394e-05, + "loss": 11.994, + "step": 19529 + }, + { + "epoch": 1.0634865532662188, + "grad_norm": 0.5496086327124223, + "learning_rate": 9.457942925152788e-05, + "loss": 12.0708, + "step": 19530 + }, + { + "epoch": 1.0635410072628018, + "grad_norm": 0.4676056507372799, + "learning_rate": 9.45706239791431e-05, + "loss": 11.9249, + "step": 19531 + }, + { + "epoch": 1.0635954612593848, + "grad_norm": 0.6022564596587495, + "learning_rate": 9.456181874897803e-05, + "loss": 12.0366, + "step": 19532 + }, + { + "epoch": 1.0636499152559677, + "grad_norm": 0.5478683764357231, + "learning_rate": 9.455301356110119e-05, + "loss": 12.0423, + "step": 19533 + }, + { + "epoch": 1.0637043692525507, + "grad_norm": 0.6722146474969767, + "learning_rate": 9.454420841558103e-05, + "loss": 12.206, + "step": 19534 + }, + { + "epoch": 1.0637588232491337, + "grad_norm": 0.6034418312173684, + "learning_rate": 9.453540331248607e-05, + "loss": 12.155, + "step": 19535 + }, + { + "epoch": 1.063813277245717, + "grad_norm": 0.5906543027720849, + "learning_rate": 9.452659825188466e-05, + "loss": 12.0325, + "step": 19536 + }, + { + "epoch": 1.0638677312423, + "grad_norm": 0.6082275877099078, + "learning_rate": 9.451779323384535e-05, + "loss": 12.0325, + "step": 19537 + }, + { + "epoch": 1.063922185238883, + "grad_norm": 0.5894290320323842, + "learning_rate": 9.450898825843662e-05, + "loss": 12.0888, + "step": 19538 + }, + { + "epoch": 1.063976639235466, + "grad_norm": 0.5957966361615813, + "learning_rate": 9.45001833257269e-05, + "loss": 12.115, + "step": 19539 + }, + { + "epoch": 1.064031093232049, + "grad_norm": 0.5244965725214917, + "learning_rate": 9.449137843578469e-05, + "loss": 12.0886, + "step": 19540 + }, + { + "epoch": 1.064085547228632, + "grad_norm": 0.5694397091625615, + "learning_rate": 9.448257358867845e-05, + "loss": 12.0047, + "step": 19541 + }, + { + "epoch": 1.064140001225215, + "grad_norm": 0.5498986514812435, + "learning_rate": 9.447376878447662e-05, + "loss": 12.0668, + "step": 19542 + }, + { + "epoch": 1.064194455221798, + "grad_norm": 0.5324227496696069, + "learning_rate": 9.44649640232477e-05, + "loss": 12.0771, + "step": 19543 + }, + { + "epoch": 1.064248909218381, + "grad_norm": 0.5567550837917793, + "learning_rate": 9.445615930506014e-05, + "loss": 12.1119, + "step": 19544 + }, + { + "epoch": 1.0643033632149639, + "grad_norm": 0.4977052343332378, + "learning_rate": 9.444735462998248e-05, + "loss": 12.0982, + "step": 19545 + }, + { + "epoch": 1.0643578172115469, + "grad_norm": 0.5370782475408601, + "learning_rate": 9.443854999808305e-05, + "loss": 12.0679, + "step": 19546 + }, + { + "epoch": 1.06441227120813, + "grad_norm": 0.5201187509950764, + "learning_rate": 9.442974540943039e-05, + "loss": 12.0481, + "step": 19547 + }, + { + "epoch": 1.064466725204713, + "grad_norm": 0.5978019918197744, + "learning_rate": 9.442094086409298e-05, + "loss": 12.0072, + "step": 19548 + }, + { + "epoch": 1.064521179201296, + "grad_norm": 0.491584250591522, + "learning_rate": 9.441213636213928e-05, + "loss": 12.0287, + "step": 19549 + }, + { + "epoch": 1.064575633197879, + "grad_norm": 0.5829670045537616, + "learning_rate": 9.440333190363772e-05, + "loss": 12.1143, + "step": 19550 + }, + { + "epoch": 1.064630087194462, + "grad_norm": 0.4979406577192298, + "learning_rate": 9.439452748865682e-05, + "loss": 11.9809, + "step": 19551 + }, + { + "epoch": 1.064684541191045, + "grad_norm": 0.5641474829559183, + "learning_rate": 9.438572311726499e-05, + "loss": 11.993, + "step": 19552 + }, + { + "epoch": 1.064738995187628, + "grad_norm": 0.509601444512635, + "learning_rate": 9.437691878953074e-05, + "loss": 12.0681, + "step": 19553 + }, + { + "epoch": 1.064793449184211, + "grad_norm": 0.5973256500028669, + "learning_rate": 9.436811450552248e-05, + "loss": 12.0173, + "step": 19554 + }, + { + "epoch": 1.064847903180794, + "grad_norm": 0.48301332542954895, + "learning_rate": 9.435931026530876e-05, + "loss": 12.0674, + "step": 19555 + }, + { + "epoch": 1.064902357177377, + "grad_norm": 0.6157513310070719, + "learning_rate": 9.435050606895796e-05, + "loss": 12.0662, + "step": 19556 + }, + { + "epoch": 1.06495681117396, + "grad_norm": 0.5259273499843691, + "learning_rate": 9.434170191653856e-05, + "loss": 12.1298, + "step": 19557 + }, + { + "epoch": 1.065011265170543, + "grad_norm": 0.533012420826072, + "learning_rate": 9.433289780811905e-05, + "loss": 11.9811, + "step": 19558 + }, + { + "epoch": 1.0650657191671262, + "grad_norm": 0.5566711277744718, + "learning_rate": 9.432409374376783e-05, + "loss": 12.0475, + "step": 19559 + }, + { + "epoch": 1.0651201731637092, + "grad_norm": 0.5622517247526756, + "learning_rate": 9.431528972355345e-05, + "loss": 12.0318, + "step": 19560 + }, + { + "epoch": 1.0651746271602922, + "grad_norm": 0.5727603606850494, + "learning_rate": 9.430648574754433e-05, + "loss": 11.9535, + "step": 19561 + }, + { + "epoch": 1.0652290811568752, + "grad_norm": 0.5337964597823531, + "learning_rate": 9.429768181580894e-05, + "loss": 11.9784, + "step": 19562 + }, + { + "epoch": 1.0652835351534582, + "grad_norm": 0.598288206977741, + "learning_rate": 9.428887792841572e-05, + "loss": 12.0688, + "step": 19563 + }, + { + "epoch": 1.0653379891500412, + "grad_norm": 0.5484710879790106, + "learning_rate": 9.428007408543314e-05, + "loss": 12.0379, + "step": 19564 + }, + { + "epoch": 1.0653924431466242, + "grad_norm": 0.6066171274342483, + "learning_rate": 9.42712702869297e-05, + "loss": 12.0907, + "step": 19565 + }, + { + "epoch": 1.0654468971432072, + "grad_norm": 0.5275822158642726, + "learning_rate": 9.426246653297379e-05, + "loss": 12.0258, + "step": 19566 + }, + { + "epoch": 1.0655013511397902, + "grad_norm": 0.5733642999382029, + "learning_rate": 9.42536628236339e-05, + "loss": 12.0058, + "step": 19567 + }, + { + "epoch": 1.0655558051363732, + "grad_norm": 0.47405334283595374, + "learning_rate": 9.42448591589785e-05, + "loss": 12.043, + "step": 19568 + }, + { + "epoch": 1.0656102591329562, + "grad_norm": 0.6533793324086261, + "learning_rate": 9.423605553907603e-05, + "loss": 12.2868, + "step": 19569 + }, + { + "epoch": 1.0656647131295394, + "grad_norm": 0.5733888302247439, + "learning_rate": 9.422725196399495e-05, + "loss": 12.0235, + "step": 19570 + }, + { + "epoch": 1.0657191671261224, + "grad_norm": 0.5723354138462762, + "learning_rate": 9.421844843380375e-05, + "loss": 12.1644, + "step": 19571 + }, + { + "epoch": 1.0657736211227054, + "grad_norm": 0.5991701179514328, + "learning_rate": 9.420964494857085e-05, + "loss": 12.0841, + "step": 19572 + }, + { + "epoch": 1.0658280751192883, + "grad_norm": 0.558710862217451, + "learning_rate": 9.420084150836473e-05, + "loss": 12.0514, + "step": 19573 + }, + { + "epoch": 1.0658825291158713, + "grad_norm": 0.5469144969444083, + "learning_rate": 9.419203811325383e-05, + "loss": 12.2402, + "step": 19574 + }, + { + "epoch": 1.0659369831124543, + "grad_norm": 0.5620895209488639, + "learning_rate": 9.418323476330664e-05, + "loss": 11.9929, + "step": 19575 + }, + { + "epoch": 1.0659914371090373, + "grad_norm": 0.5949008351746472, + "learning_rate": 9.417443145859159e-05, + "loss": 12.1162, + "step": 19576 + }, + { + "epoch": 1.0660458911056203, + "grad_norm": 0.6035968787730213, + "learning_rate": 9.416562819917712e-05, + "loss": 12.0362, + "step": 19577 + }, + { + "epoch": 1.0661003451022033, + "grad_norm": 0.5390471506547698, + "learning_rate": 9.415682498513169e-05, + "loss": 12.1288, + "step": 19578 + }, + { + "epoch": 1.0661547990987863, + "grad_norm": 0.5114014464809856, + "learning_rate": 9.414802181652379e-05, + "loss": 12.0689, + "step": 19579 + }, + { + "epoch": 1.0662092530953693, + "grad_norm": 0.5633583605680194, + "learning_rate": 9.413921869342185e-05, + "loss": 12.0915, + "step": 19580 + }, + { + "epoch": 1.0662637070919523, + "grad_norm": 0.5469194789493036, + "learning_rate": 9.41304156158943e-05, + "loss": 12.1314, + "step": 19581 + }, + { + "epoch": 1.0663181610885353, + "grad_norm": 0.5300734025899072, + "learning_rate": 9.412161258400966e-05, + "loss": 12.055, + "step": 19582 + }, + { + "epoch": 1.0663726150851185, + "grad_norm": 0.5632702110680303, + "learning_rate": 9.411280959783633e-05, + "loss": 12.0251, + "step": 19583 + }, + { + "epoch": 1.0664270690817015, + "grad_norm": 0.5470144743292025, + "learning_rate": 9.410400665744279e-05, + "loss": 12.028, + "step": 19584 + }, + { + "epoch": 1.0664815230782845, + "grad_norm": 0.6196978189107037, + "learning_rate": 9.40952037628975e-05, + "loss": 12.0663, + "step": 19585 + }, + { + "epoch": 1.0665359770748675, + "grad_norm": 0.5865783159570598, + "learning_rate": 9.408640091426887e-05, + "loss": 12.1339, + "step": 19586 + }, + { + "epoch": 1.0665904310714505, + "grad_norm": 0.5862795288555418, + "learning_rate": 9.407759811162539e-05, + "loss": 12.0186, + "step": 19587 + }, + { + "epoch": 1.0666448850680335, + "grad_norm": 0.619000559758314, + "learning_rate": 9.406879535503549e-05, + "loss": 12.2063, + "step": 19588 + }, + { + "epoch": 1.0666993390646164, + "grad_norm": 0.5430701053644917, + "learning_rate": 9.405999264456763e-05, + "loss": 12.0267, + "step": 19589 + }, + { + "epoch": 1.0667537930611994, + "grad_norm": 0.5631622351629713, + "learning_rate": 9.405118998029027e-05, + "loss": 12.1148, + "step": 19590 + }, + { + "epoch": 1.0668082470577824, + "grad_norm": 0.5454245133367549, + "learning_rate": 9.404238736227186e-05, + "loss": 12.12, + "step": 19591 + }, + { + "epoch": 1.0668627010543654, + "grad_norm": 0.5889022062111674, + "learning_rate": 9.403358479058081e-05, + "loss": 12.0937, + "step": 19592 + }, + { + "epoch": 1.0669171550509484, + "grad_norm": 0.5065637392389071, + "learning_rate": 9.402478226528562e-05, + "loss": 12.0159, + "step": 19593 + }, + { + "epoch": 1.0669716090475316, + "grad_norm": 0.5709809758938452, + "learning_rate": 9.401597978645474e-05, + "loss": 12.0441, + "step": 19594 + }, + { + "epoch": 1.0670260630441146, + "grad_norm": 0.5713043581230659, + "learning_rate": 9.400717735415665e-05, + "loss": 12.1235, + "step": 19595 + }, + { + "epoch": 1.0670805170406976, + "grad_norm": 0.5593388188584824, + "learning_rate": 9.399837496845969e-05, + "loss": 12.026, + "step": 19596 + }, + { + "epoch": 1.0671349710372806, + "grad_norm": 0.5719142697400577, + "learning_rate": 9.398957262943238e-05, + "loss": 12.0984, + "step": 19597 + }, + { + "epoch": 1.0671894250338636, + "grad_norm": 0.557749710207554, + "learning_rate": 9.398077033714317e-05, + "loss": 12.1424, + "step": 19598 + }, + { + "epoch": 1.0672438790304466, + "grad_norm": 0.5740652249237118, + "learning_rate": 9.397196809166052e-05, + "loss": 11.9395, + "step": 19599 + }, + { + "epoch": 1.0672983330270296, + "grad_norm": 0.5598491479754933, + "learning_rate": 9.396316589305283e-05, + "loss": 12.0671, + "step": 19600 + }, + { + "epoch": 1.0673527870236126, + "grad_norm": 0.5444547240942728, + "learning_rate": 9.395436374138857e-05, + "loss": 12.0203, + "step": 19601 + }, + { + "epoch": 1.0674072410201956, + "grad_norm": 0.5634520180772729, + "learning_rate": 9.39455616367362e-05, + "loss": 12.0253, + "step": 19602 + }, + { + "epoch": 1.0674616950167786, + "grad_norm": 0.5152166816141512, + "learning_rate": 9.393675957916415e-05, + "loss": 12.0868, + "step": 19603 + }, + { + "epoch": 1.0675161490133616, + "grad_norm": 0.5936970924799284, + "learning_rate": 9.392795756874088e-05, + "loss": 12.1438, + "step": 19604 + }, + { + "epoch": 1.0675706030099446, + "grad_norm": 0.6169697993657006, + "learning_rate": 9.391915560553488e-05, + "loss": 12.1226, + "step": 19605 + }, + { + "epoch": 1.0676250570065278, + "grad_norm": 0.5925611242588401, + "learning_rate": 9.391035368961449e-05, + "loss": 11.9379, + "step": 19606 + }, + { + "epoch": 1.0676795110031108, + "grad_norm": 0.53909222039229, + "learning_rate": 9.390155182104823e-05, + "loss": 11.9986, + "step": 19607 + }, + { + "epoch": 1.0677339649996938, + "grad_norm": 0.532438590720537, + "learning_rate": 9.38927499999045e-05, + "loss": 11.9909, + "step": 19608 + }, + { + "epoch": 1.0677884189962767, + "grad_norm": 0.4970209959955968, + "learning_rate": 9.388394822625179e-05, + "loss": 12.0488, + "step": 19609 + }, + { + "epoch": 1.0678428729928597, + "grad_norm": 0.5998770171857966, + "learning_rate": 9.387514650015851e-05, + "loss": 12.19, + "step": 19610 + }, + { + "epoch": 1.0678973269894427, + "grad_norm": 0.552797692326681, + "learning_rate": 9.386634482169313e-05, + "loss": 12.1439, + "step": 19611 + }, + { + "epoch": 1.0679517809860257, + "grad_norm": 0.501143711536902, + "learning_rate": 9.385754319092409e-05, + "loss": 11.9222, + "step": 19612 + }, + { + "epoch": 1.0680062349826087, + "grad_norm": 0.524867903996969, + "learning_rate": 9.384874160791981e-05, + "loss": 11.9817, + "step": 19613 + }, + { + "epoch": 1.0680606889791917, + "grad_norm": 0.4905934833221118, + "learning_rate": 9.383994007274875e-05, + "loss": 12.0082, + "step": 19614 + }, + { + "epoch": 1.0681151429757747, + "grad_norm": 0.5459065117734525, + "learning_rate": 9.383113858547939e-05, + "loss": 12.0647, + "step": 19615 + }, + { + "epoch": 1.0681695969723577, + "grad_norm": 0.5203237520036577, + "learning_rate": 9.382233714618008e-05, + "loss": 11.8745, + "step": 19616 + }, + { + "epoch": 1.068224050968941, + "grad_norm": 0.5411184863716206, + "learning_rate": 9.381353575491933e-05, + "loss": 12.0235, + "step": 19617 + }, + { + "epoch": 1.068278504965524, + "grad_norm": 0.517806378865651, + "learning_rate": 9.380473441176554e-05, + "loss": 12.0839, + "step": 19618 + }, + { + "epoch": 1.068332958962107, + "grad_norm": 0.5317189778110568, + "learning_rate": 9.379593311678719e-05, + "loss": 12.0236, + "step": 19619 + }, + { + "epoch": 1.0683874129586899, + "grad_norm": 0.6585334192721659, + "learning_rate": 9.378713187005271e-05, + "loss": 12.1009, + "step": 19620 + }, + { + "epoch": 1.0684418669552729, + "grad_norm": 0.5453745591164424, + "learning_rate": 9.377833067163052e-05, + "loss": 12.1599, + "step": 19621 + }, + { + "epoch": 1.0684963209518559, + "grad_norm": 0.5446659967144174, + "learning_rate": 9.376952952158908e-05, + "loss": 12.0764, + "step": 19622 + }, + { + "epoch": 1.0685507749484389, + "grad_norm": 0.6447871707801308, + "learning_rate": 9.376072841999683e-05, + "loss": 12.1909, + "step": 19623 + }, + { + "epoch": 1.0686052289450219, + "grad_norm": 0.5668415135101286, + "learning_rate": 9.37519273669222e-05, + "loss": 11.9781, + "step": 19624 + }, + { + "epoch": 1.0686596829416048, + "grad_norm": 0.5501112084761463, + "learning_rate": 9.374312636243366e-05, + "loss": 12.0072, + "step": 19625 + }, + { + "epoch": 1.0687141369381878, + "grad_norm": 0.5165365292243039, + "learning_rate": 9.373432540659958e-05, + "loss": 11.9746, + "step": 19626 + }, + { + "epoch": 1.0687685909347708, + "grad_norm": 0.5392483614694529, + "learning_rate": 9.372552449948845e-05, + "loss": 12.0356, + "step": 19627 + }, + { + "epoch": 1.0688230449313538, + "grad_norm": 0.5047342672491422, + "learning_rate": 9.37167236411687e-05, + "loss": 12.137, + "step": 19628 + }, + { + "epoch": 1.068877498927937, + "grad_norm": 0.511943413415665, + "learning_rate": 9.370792283170874e-05, + "loss": 12.0181, + "step": 19629 + }, + { + "epoch": 1.06893195292452, + "grad_norm": 0.5675433984161167, + "learning_rate": 9.369912207117702e-05, + "loss": 12.0602, + "step": 19630 + }, + { + "epoch": 1.068986406921103, + "grad_norm": 0.5173814895325556, + "learning_rate": 9.3690321359642e-05, + "loss": 12.0295, + "step": 19631 + }, + { + "epoch": 1.069040860917686, + "grad_norm": 0.5028811051182804, + "learning_rate": 9.368152069717209e-05, + "loss": 11.9977, + "step": 19632 + }, + { + "epoch": 1.069095314914269, + "grad_norm": 0.5410129930454582, + "learning_rate": 9.367272008383575e-05, + "loss": 12.0648, + "step": 19633 + }, + { + "epoch": 1.069149768910852, + "grad_norm": 0.525174263316461, + "learning_rate": 9.36639195197014e-05, + "loss": 12.0833, + "step": 19634 + }, + { + "epoch": 1.069204222907435, + "grad_norm": 0.5199187033738576, + "learning_rate": 9.365511900483749e-05, + "loss": 11.9703, + "step": 19635 + }, + { + "epoch": 1.069258676904018, + "grad_norm": 0.5663549514936985, + "learning_rate": 9.364631853931242e-05, + "loss": 12.0634, + "step": 19636 + }, + { + "epoch": 1.069313130900601, + "grad_norm": 0.539456448316862, + "learning_rate": 9.363751812319463e-05, + "loss": 12.0389, + "step": 19637 + }, + { + "epoch": 1.069367584897184, + "grad_norm": 0.6009446430801567, + "learning_rate": 9.362871775655259e-05, + "loss": 12.1115, + "step": 19638 + }, + { + "epoch": 1.069422038893767, + "grad_norm": 0.5332866736126232, + "learning_rate": 9.36199174394547e-05, + "loss": 12.0605, + "step": 19639 + }, + { + "epoch": 1.0694764928903502, + "grad_norm": 0.4817335547024529, + "learning_rate": 9.361111717196939e-05, + "loss": 11.9135, + "step": 19640 + }, + { + "epoch": 1.0695309468869332, + "grad_norm": 0.6048174574996448, + "learning_rate": 9.360231695416509e-05, + "loss": 11.8708, + "step": 19641 + }, + { + "epoch": 1.0695854008835162, + "grad_norm": 0.6008411408181712, + "learning_rate": 9.359351678611027e-05, + "loss": 12.1587, + "step": 19642 + }, + { + "epoch": 1.0696398548800992, + "grad_norm": 0.5591678637215574, + "learning_rate": 9.358471666787334e-05, + "loss": 12.0818, + "step": 19643 + }, + { + "epoch": 1.0696943088766822, + "grad_norm": 0.5587664758147319, + "learning_rate": 9.357591659952272e-05, + "loss": 12.1245, + "step": 19644 + }, + { + "epoch": 1.0697487628732651, + "grad_norm": 0.5643472269024349, + "learning_rate": 9.356711658112685e-05, + "loss": 12.2275, + "step": 19645 + }, + { + "epoch": 1.0698032168698481, + "grad_norm": 0.5681908835197387, + "learning_rate": 9.355831661275419e-05, + "loss": 12.1445, + "step": 19646 + }, + { + "epoch": 1.0698576708664311, + "grad_norm": 0.5659950063808108, + "learning_rate": 9.354951669447312e-05, + "loss": 12.0002, + "step": 19647 + }, + { + "epoch": 1.0699121248630141, + "grad_norm": 0.5655190560505993, + "learning_rate": 9.354071682635208e-05, + "loss": 12.124, + "step": 19648 + }, + { + "epoch": 1.0699665788595971, + "grad_norm": 0.5745424362482439, + "learning_rate": 9.353191700845952e-05, + "loss": 12.0382, + "step": 19649 + }, + { + "epoch": 1.07002103285618, + "grad_norm": 0.559601469047529, + "learning_rate": 9.352311724086385e-05, + "loss": 12.0086, + "step": 19650 + }, + { + "epoch": 1.070075486852763, + "grad_norm": 0.6435323007808978, + "learning_rate": 9.351431752363351e-05, + "loss": 12.0784, + "step": 19651 + }, + { + "epoch": 1.070129940849346, + "grad_norm": 0.5525163137115525, + "learning_rate": 9.35055178568369e-05, + "loss": 12.0001, + "step": 19652 + }, + { + "epoch": 1.0701843948459293, + "grad_norm": 0.6056573547042106, + "learning_rate": 9.34967182405425e-05, + "loss": 12.1013, + "step": 19653 + }, + { + "epoch": 1.0702388488425123, + "grad_norm": 0.5763798225266978, + "learning_rate": 9.34879186748187e-05, + "loss": 11.9068, + "step": 19654 + }, + { + "epoch": 1.0702933028390953, + "grad_norm": 0.5364143506435711, + "learning_rate": 9.347911915973394e-05, + "loss": 11.9713, + "step": 19655 + }, + { + "epoch": 1.0703477568356783, + "grad_norm": 0.5208839549174432, + "learning_rate": 9.347031969535669e-05, + "loss": 12.0338, + "step": 19656 + }, + { + "epoch": 1.0704022108322613, + "grad_norm": 0.5959918016455371, + "learning_rate": 9.346152028175527e-05, + "loss": 12.0198, + "step": 19657 + }, + { + "epoch": 1.0704566648288443, + "grad_norm": 0.5705087335878948, + "learning_rate": 9.345272091899818e-05, + "loss": 11.9041, + "step": 19658 + }, + { + "epoch": 1.0705111188254273, + "grad_norm": 0.5065772424078389, + "learning_rate": 9.344392160715383e-05, + "loss": 12.0587, + "step": 19659 + }, + { + "epoch": 1.0705655728220103, + "grad_norm": 0.54577926969694, + "learning_rate": 9.343512234629064e-05, + "loss": 12.167, + "step": 19660 + }, + { + "epoch": 1.0706200268185933, + "grad_norm": 0.7992702878849013, + "learning_rate": 9.342632313647703e-05, + "loss": 12.2066, + "step": 19661 + }, + { + "epoch": 1.0706744808151762, + "grad_norm": 0.5313366529744018, + "learning_rate": 9.341752397778146e-05, + "loss": 11.997, + "step": 19662 + }, + { + "epoch": 1.0707289348117595, + "grad_norm": 0.5810677183782393, + "learning_rate": 9.340872487027231e-05, + "loss": 12.1089, + "step": 19663 + }, + { + "epoch": 1.0707833888083425, + "grad_norm": 0.5265076902015842, + "learning_rate": 9.339992581401801e-05, + "loss": 12.012, + "step": 19664 + }, + { + "epoch": 1.0708378428049254, + "grad_norm": 0.5335293416359094, + "learning_rate": 9.339112680908701e-05, + "loss": 12.0478, + "step": 19665 + }, + { + "epoch": 1.0708922968015084, + "grad_norm": 0.6252614505693662, + "learning_rate": 9.338232785554776e-05, + "loss": 12.0812, + "step": 19666 + }, + { + "epoch": 1.0709467507980914, + "grad_norm": 0.5053589716009665, + "learning_rate": 9.337352895346858e-05, + "loss": 12.057, + "step": 19667 + }, + { + "epoch": 1.0710012047946744, + "grad_norm": 0.5472130868707699, + "learning_rate": 9.336473010291795e-05, + "loss": 12.1641, + "step": 19668 + }, + { + "epoch": 1.0710556587912574, + "grad_norm": 0.6263777805974757, + "learning_rate": 9.33559313039643e-05, + "loss": 11.9064, + "step": 19669 + }, + { + "epoch": 1.0711101127878404, + "grad_norm": 0.6581749372028808, + "learning_rate": 9.334713255667606e-05, + "loss": 12.1449, + "step": 19670 + }, + { + "epoch": 1.0711645667844234, + "grad_norm": 0.5375481306366297, + "learning_rate": 9.33383338611216e-05, + "loss": 12.1026, + "step": 19671 + }, + { + "epoch": 1.0712190207810064, + "grad_norm": 0.5738863975371193, + "learning_rate": 9.332953521736941e-05, + "loss": 12.0464, + "step": 19672 + }, + { + "epoch": 1.0712734747775894, + "grad_norm": 0.5699091332528314, + "learning_rate": 9.332073662548784e-05, + "loss": 12.1012, + "step": 19673 + }, + { + "epoch": 1.0713279287741724, + "grad_norm": 0.5551052554943668, + "learning_rate": 9.331193808554538e-05, + "loss": 12.0756, + "step": 19674 + }, + { + "epoch": 1.0713823827707554, + "grad_norm": 0.5517571887946263, + "learning_rate": 9.330313959761035e-05, + "loss": 12.1347, + "step": 19675 + }, + { + "epoch": 1.0714368367673386, + "grad_norm": 0.5738099283189709, + "learning_rate": 9.329434116175132e-05, + "loss": 12.0, + "step": 19676 + }, + { + "epoch": 1.0714912907639216, + "grad_norm": 0.6123625032427639, + "learning_rate": 9.328554277803657e-05, + "loss": 12.1277, + "step": 19677 + }, + { + "epoch": 1.0715457447605046, + "grad_norm": 0.555374077530613, + "learning_rate": 9.327674444653456e-05, + "loss": 12.1037, + "step": 19678 + }, + { + "epoch": 1.0716001987570876, + "grad_norm": 0.5859849512211, + "learning_rate": 9.326794616731369e-05, + "loss": 12.2179, + "step": 19679 + }, + { + "epoch": 1.0716546527536706, + "grad_norm": 0.5651225046719894, + "learning_rate": 9.325914794044243e-05, + "loss": 12.0778, + "step": 19680 + }, + { + "epoch": 1.0717091067502535, + "grad_norm": 0.5450966905251793, + "learning_rate": 9.325034976598916e-05, + "loss": 12.0184, + "step": 19681 + }, + { + "epoch": 1.0717635607468365, + "grad_norm": 0.5909705718224748, + "learning_rate": 9.32415516440223e-05, + "loss": 11.9641, + "step": 19682 + }, + { + "epoch": 1.0718180147434195, + "grad_norm": 0.5573469872729605, + "learning_rate": 9.323275357461028e-05, + "loss": 11.9704, + "step": 19683 + }, + { + "epoch": 1.0718724687400025, + "grad_norm": 0.5780723863034695, + "learning_rate": 9.322395555782148e-05, + "loss": 12.0434, + "step": 19684 + }, + { + "epoch": 1.0719269227365855, + "grad_norm": 0.5458575677248596, + "learning_rate": 9.321515759372436e-05, + "loss": 12.0696, + "step": 19685 + }, + { + "epoch": 1.0719813767331685, + "grad_norm": 0.5381775132127713, + "learning_rate": 9.320635968238732e-05, + "loss": 11.8941, + "step": 19686 + }, + { + "epoch": 1.0720358307297517, + "grad_norm": 0.5852552369318467, + "learning_rate": 9.319756182387876e-05, + "loss": 12.1267, + "step": 19687 + }, + { + "epoch": 1.0720902847263347, + "grad_norm": 0.495255603248781, + "learning_rate": 9.318876401826708e-05, + "loss": 12.0333, + "step": 19688 + }, + { + "epoch": 1.0721447387229177, + "grad_norm": 0.6171089323945832, + "learning_rate": 9.317996626562074e-05, + "loss": 11.9859, + "step": 19689 + }, + { + "epoch": 1.0721991927195007, + "grad_norm": 0.5408401519883336, + "learning_rate": 9.317116856600807e-05, + "loss": 11.9493, + "step": 19690 + }, + { + "epoch": 1.0722536467160837, + "grad_norm": 0.5010606561302176, + "learning_rate": 9.316237091949758e-05, + "loss": 11.9302, + "step": 19691 + }, + { + "epoch": 1.0723081007126667, + "grad_norm": 0.5705628538375778, + "learning_rate": 9.315357332615763e-05, + "loss": 12.0055, + "step": 19692 + }, + { + "epoch": 1.0723625547092497, + "grad_norm": 0.5563152377068212, + "learning_rate": 9.314477578605665e-05, + "loss": 12.1444, + "step": 19693 + }, + { + "epoch": 1.0724170087058327, + "grad_norm": 0.5762005531402592, + "learning_rate": 9.313597829926306e-05, + "loss": 12.1609, + "step": 19694 + }, + { + "epoch": 1.0724714627024157, + "grad_norm": 0.5241867847401332, + "learning_rate": 9.312718086584523e-05, + "loss": 11.9909, + "step": 19695 + }, + { + "epoch": 1.0725259166989987, + "grad_norm": 0.5294744951317413, + "learning_rate": 9.311838348587162e-05, + "loss": 12.113, + "step": 19696 + }, + { + "epoch": 1.0725803706955817, + "grad_norm": 0.48081367114560963, + "learning_rate": 9.31095861594106e-05, + "loss": 12.0423, + "step": 19697 + }, + { + "epoch": 1.0726348246921646, + "grad_norm": 0.529069169475715, + "learning_rate": 9.310078888653059e-05, + "loss": 12.0427, + "step": 19698 + }, + { + "epoch": 1.0726892786887479, + "grad_norm": 0.5197802975156901, + "learning_rate": 9.30919916673e-05, + "loss": 12.0114, + "step": 19699 + }, + { + "epoch": 1.0727437326853309, + "grad_norm": 0.5195023067243941, + "learning_rate": 9.308319450178724e-05, + "loss": 11.9279, + "step": 19700 + }, + { + "epoch": 1.0727981866819138, + "grad_norm": 0.5350785743310953, + "learning_rate": 9.30743973900607e-05, + "loss": 12.0655, + "step": 19701 + }, + { + "epoch": 1.0728526406784968, + "grad_norm": 0.54928519418668, + "learning_rate": 9.306560033218883e-05, + "loss": 12.0851, + "step": 19702 + }, + { + "epoch": 1.0729070946750798, + "grad_norm": 0.5802036630071491, + "learning_rate": 9.305680332824001e-05, + "loss": 12.1587, + "step": 19703 + }, + { + "epoch": 1.0729615486716628, + "grad_norm": 0.6146438445621326, + "learning_rate": 9.304800637828266e-05, + "loss": 11.8956, + "step": 19704 + }, + { + "epoch": 1.0730160026682458, + "grad_norm": 0.519056383374783, + "learning_rate": 9.303920948238518e-05, + "loss": 11.9627, + "step": 19705 + }, + { + "epoch": 1.0730704566648288, + "grad_norm": 0.5166818684415007, + "learning_rate": 9.3030412640616e-05, + "loss": 12.0785, + "step": 19706 + }, + { + "epoch": 1.0731249106614118, + "grad_norm": 0.550748553582585, + "learning_rate": 9.302161585304345e-05, + "loss": 11.9352, + "step": 19707 + }, + { + "epoch": 1.0731793646579948, + "grad_norm": 0.5569639363473886, + "learning_rate": 9.301281911973601e-05, + "loss": 11.9928, + "step": 19708 + }, + { + "epoch": 1.0732338186545778, + "grad_norm": 0.565818920261101, + "learning_rate": 9.300402244076206e-05, + "loss": 12.042, + "step": 19709 + }, + { + "epoch": 1.073288272651161, + "grad_norm": 0.5144223908841118, + "learning_rate": 9.299522581619e-05, + "loss": 11.9998, + "step": 19710 + }, + { + "epoch": 1.073342726647744, + "grad_norm": 0.5705112569405207, + "learning_rate": 9.298642924608824e-05, + "loss": 12.1002, + "step": 19711 + }, + { + "epoch": 1.073397180644327, + "grad_norm": 0.5158525970327664, + "learning_rate": 9.297763273052517e-05, + "loss": 12.0533, + "step": 19712 + }, + { + "epoch": 1.07345163464091, + "grad_norm": 0.5697894517000133, + "learning_rate": 9.29688362695692e-05, + "loss": 11.9829, + "step": 19713 + }, + { + "epoch": 1.073506088637493, + "grad_norm": 0.5632348411887524, + "learning_rate": 9.296003986328875e-05, + "loss": 12.1041, + "step": 19714 + }, + { + "epoch": 1.073560542634076, + "grad_norm": 0.6554950222699993, + "learning_rate": 9.295124351175222e-05, + "loss": 12.1818, + "step": 19715 + }, + { + "epoch": 1.073614996630659, + "grad_norm": 0.5770111055480366, + "learning_rate": 9.294244721502804e-05, + "loss": 11.9797, + "step": 19716 + }, + { + "epoch": 1.073669450627242, + "grad_norm": 0.528664251870247, + "learning_rate": 9.293365097318452e-05, + "loss": 12.0613, + "step": 19717 + }, + { + "epoch": 1.073723904623825, + "grad_norm": 0.6015470481259096, + "learning_rate": 9.292485478629011e-05, + "loss": 12.034, + "step": 19718 + }, + { + "epoch": 1.073778358620408, + "grad_norm": 0.5302372269520177, + "learning_rate": 9.291605865441324e-05, + "loss": 12.0271, + "step": 19719 + }, + { + "epoch": 1.073832812616991, + "grad_norm": 0.5323550202865295, + "learning_rate": 9.290726257762228e-05, + "loss": 12.022, + "step": 19720 + }, + { + "epoch": 1.073887266613574, + "grad_norm": 0.5080055405797005, + "learning_rate": 9.289846655598564e-05, + "loss": 11.9097, + "step": 19721 + }, + { + "epoch": 1.0739417206101571, + "grad_norm": 0.5355357619919747, + "learning_rate": 9.28896705895717e-05, + "loss": 12.0039, + "step": 19722 + }, + { + "epoch": 1.0739961746067401, + "grad_norm": 0.5721031071639637, + "learning_rate": 9.288087467844888e-05, + "loss": 11.9958, + "step": 19723 + }, + { + "epoch": 1.0740506286033231, + "grad_norm": 0.5302677196758343, + "learning_rate": 9.287207882268556e-05, + "loss": 11.977, + "step": 19724 + }, + { + "epoch": 1.0741050825999061, + "grad_norm": 0.513418394157247, + "learning_rate": 9.286328302235015e-05, + "loss": 12.0055, + "step": 19725 + }, + { + "epoch": 1.074159536596489, + "grad_norm": 0.5404659909582347, + "learning_rate": 9.285448727751111e-05, + "loss": 12.1113, + "step": 19726 + }, + { + "epoch": 1.074213990593072, + "grad_norm": 0.5523398496251313, + "learning_rate": 9.284569158823673e-05, + "loss": 11.9773, + "step": 19727 + }, + { + "epoch": 1.074268444589655, + "grad_norm": 0.5637373287061666, + "learning_rate": 9.283689595459544e-05, + "loss": 12.1073, + "step": 19728 + }, + { + "epoch": 1.074322898586238, + "grad_norm": 0.4880685056636543, + "learning_rate": 9.282810037665566e-05, + "loss": 12.0374, + "step": 19729 + }, + { + "epoch": 1.074377352582821, + "grad_norm": 0.5277581048098522, + "learning_rate": 9.281930485448576e-05, + "loss": 12.1272, + "step": 19730 + }, + { + "epoch": 1.074431806579404, + "grad_norm": 0.5349540016588795, + "learning_rate": 9.281050938815416e-05, + "loss": 11.9587, + "step": 19731 + }, + { + "epoch": 1.074486260575987, + "grad_norm": 0.5629341828892994, + "learning_rate": 9.280171397772926e-05, + "loss": 12.0396, + "step": 19732 + }, + { + "epoch": 1.0745407145725703, + "grad_norm": 0.513798232372738, + "learning_rate": 9.279291862327943e-05, + "loss": 12.0225, + "step": 19733 + }, + { + "epoch": 1.0745951685691533, + "grad_norm": 0.5353011570995032, + "learning_rate": 9.278412332487306e-05, + "loss": 12.1895, + "step": 19734 + }, + { + "epoch": 1.0746496225657363, + "grad_norm": 0.5271662973407337, + "learning_rate": 9.277532808257855e-05, + "loss": 12.1193, + "step": 19735 + }, + { + "epoch": 1.0747040765623193, + "grad_norm": 0.5634901604870521, + "learning_rate": 9.276653289646437e-05, + "loss": 12.0464, + "step": 19736 + }, + { + "epoch": 1.0747585305589022, + "grad_norm": 0.5648214953375595, + "learning_rate": 9.275773776659878e-05, + "loss": 12.1405, + "step": 19737 + }, + { + "epoch": 1.0748129845554852, + "grad_norm": 0.5593975129303271, + "learning_rate": 9.274894269305025e-05, + "loss": 11.97, + "step": 19738 + }, + { + "epoch": 1.0748674385520682, + "grad_norm": 0.5080976573341622, + "learning_rate": 9.274014767588714e-05, + "loss": 11.9913, + "step": 19739 + }, + { + "epoch": 1.0749218925486512, + "grad_norm": 0.5214783380460148, + "learning_rate": 9.273135271517787e-05, + "loss": 12.1178, + "step": 19740 + }, + { + "epoch": 1.0749763465452342, + "grad_norm": 0.5308172619512298, + "learning_rate": 9.272255781099082e-05, + "loss": 12.0744, + "step": 19741 + }, + { + "epoch": 1.0750308005418172, + "grad_norm": 0.6115011038726479, + "learning_rate": 9.271376296339439e-05, + "loss": 12.1452, + "step": 19742 + }, + { + "epoch": 1.0750852545384002, + "grad_norm": 0.5792122936650925, + "learning_rate": 9.270496817245696e-05, + "loss": 11.9555, + "step": 19743 + }, + { + "epoch": 1.0751397085349832, + "grad_norm": 0.5638891886366614, + "learning_rate": 9.26961734382469e-05, + "loss": 12.0211, + "step": 19744 + }, + { + "epoch": 1.0751941625315662, + "grad_norm": 0.5014330119294012, + "learning_rate": 9.268737876083265e-05, + "loss": 11.9881, + "step": 19745 + }, + { + "epoch": 1.0752486165281494, + "grad_norm": 0.5313886809459771, + "learning_rate": 9.267858414028257e-05, + "loss": 12.0741, + "step": 19746 + }, + { + "epoch": 1.0753030705247324, + "grad_norm": 0.5412319133024731, + "learning_rate": 9.266978957666504e-05, + "loss": 11.9375, + "step": 19747 + }, + { + "epoch": 1.0753575245213154, + "grad_norm": 0.560446843590823, + "learning_rate": 9.266099507004844e-05, + "loss": 12.0861, + "step": 19748 + }, + { + "epoch": 1.0754119785178984, + "grad_norm": 0.5805220017073658, + "learning_rate": 9.265220062050119e-05, + "loss": 12.1195, + "step": 19749 + }, + { + "epoch": 1.0754664325144814, + "grad_norm": 0.504645492060845, + "learning_rate": 9.264340622809163e-05, + "loss": 11.993, + "step": 19750 + }, + { + "epoch": 1.0755208865110644, + "grad_norm": 0.5617676971852157, + "learning_rate": 9.263461189288819e-05, + "loss": 12.0088, + "step": 19751 + }, + { + "epoch": 1.0755753405076474, + "grad_norm": 0.6004664230819657, + "learning_rate": 9.262581761495926e-05, + "loss": 12.0947, + "step": 19752 + }, + { + "epoch": 1.0756297945042304, + "grad_norm": 0.5362974399259468, + "learning_rate": 9.261702339437319e-05, + "loss": 11.9674, + "step": 19753 + }, + { + "epoch": 1.0756842485008133, + "grad_norm": 0.536708197993625, + "learning_rate": 9.26082292311984e-05, + "loss": 12.1122, + "step": 19754 + }, + { + "epoch": 1.0757387024973963, + "grad_norm": 0.6283518034842255, + "learning_rate": 9.259943512550325e-05, + "loss": 12.0162, + "step": 19755 + }, + { + "epoch": 1.0757931564939793, + "grad_norm": 0.6175713448932344, + "learning_rate": 9.259064107735616e-05, + "loss": 12.0244, + "step": 19756 + }, + { + "epoch": 1.0758476104905625, + "grad_norm": 0.5444838556408125, + "learning_rate": 9.258184708682546e-05, + "loss": 12.0978, + "step": 19757 + }, + { + "epoch": 1.0759020644871455, + "grad_norm": 0.5624415326003953, + "learning_rate": 9.257305315397957e-05, + "loss": 11.9287, + "step": 19758 + }, + { + "epoch": 1.0759565184837285, + "grad_norm": 0.734909947442244, + "learning_rate": 9.256425927888687e-05, + "loss": 12.1097, + "step": 19759 + }, + { + "epoch": 1.0760109724803115, + "grad_norm": 0.52136453544778, + "learning_rate": 9.255546546161573e-05, + "loss": 12.0123, + "step": 19760 + }, + { + "epoch": 1.0760654264768945, + "grad_norm": 0.5185993467141862, + "learning_rate": 9.254667170223453e-05, + "loss": 12.0284, + "step": 19761 + }, + { + "epoch": 1.0761198804734775, + "grad_norm": 0.5748138736745205, + "learning_rate": 9.253787800081163e-05, + "loss": 12.0021, + "step": 19762 + }, + { + "epoch": 1.0761743344700605, + "grad_norm": 0.6109488906780046, + "learning_rate": 9.25290843574155e-05, + "loss": 12.1826, + "step": 19763 + }, + { + "epoch": 1.0762287884666435, + "grad_norm": 0.6420538097109626, + "learning_rate": 9.252029077211444e-05, + "loss": 12.1369, + "step": 19764 + }, + { + "epoch": 1.0762832424632265, + "grad_norm": 0.8507627928391415, + "learning_rate": 9.251149724497685e-05, + "loss": 12.0548, + "step": 19765 + }, + { + "epoch": 1.0763376964598095, + "grad_norm": 0.6035701910113489, + "learning_rate": 9.250270377607114e-05, + "loss": 12.0962, + "step": 19766 + }, + { + "epoch": 1.0763921504563925, + "grad_norm": 0.6450246044835068, + "learning_rate": 9.249391036546564e-05, + "loss": 12.0165, + "step": 19767 + }, + { + "epoch": 1.0764466044529755, + "grad_norm": 0.697372254910513, + "learning_rate": 9.248511701322876e-05, + "loss": 11.9212, + "step": 19768 + }, + { + "epoch": 1.0765010584495587, + "grad_norm": 0.5828957964868062, + "learning_rate": 9.247632371942885e-05, + "loss": 12.078, + "step": 19769 + }, + { + "epoch": 1.0765555124461417, + "grad_norm": 0.6521946979037693, + "learning_rate": 9.246753048413433e-05, + "loss": 11.9964, + "step": 19770 + }, + { + "epoch": 1.0766099664427247, + "grad_norm": 0.6143374194357522, + "learning_rate": 9.245873730741356e-05, + "loss": 11.9747, + "step": 19771 + }, + { + "epoch": 1.0766644204393077, + "grad_norm": 0.5431569085936767, + "learning_rate": 9.24499441893349e-05, + "loss": 12.0608, + "step": 19772 + }, + { + "epoch": 1.0767188744358906, + "grad_norm": 0.54729633266687, + "learning_rate": 9.244115112996671e-05, + "loss": 12.0335, + "step": 19773 + }, + { + "epoch": 1.0767733284324736, + "grad_norm": 0.5452416566310941, + "learning_rate": 9.243235812937743e-05, + "loss": 12.1001, + "step": 19774 + }, + { + "epoch": 1.0768277824290566, + "grad_norm": 0.5694708937075351, + "learning_rate": 9.24235651876354e-05, + "loss": 12.0374, + "step": 19775 + }, + { + "epoch": 1.0768822364256396, + "grad_norm": 0.611623532142073, + "learning_rate": 9.241477230480904e-05, + "loss": 12.0619, + "step": 19776 + }, + { + "epoch": 1.0769366904222226, + "grad_norm": 0.6025778412048199, + "learning_rate": 9.240597948096662e-05, + "loss": 12.1174, + "step": 19777 + }, + { + "epoch": 1.0769911444188056, + "grad_norm": 0.5934774686655594, + "learning_rate": 9.23971867161766e-05, + "loss": 11.984, + "step": 19778 + }, + { + "epoch": 1.0770455984153886, + "grad_norm": 0.5202670150688955, + "learning_rate": 9.238839401050732e-05, + "loss": 12.087, + "step": 19779 + }, + { + "epoch": 1.0771000524119718, + "grad_norm": 0.5626755625868136, + "learning_rate": 9.237960136402718e-05, + "loss": 12.0306, + "step": 19780 + }, + { + "epoch": 1.0771545064085548, + "grad_norm": 0.5285808303053771, + "learning_rate": 9.237080877680453e-05, + "loss": 12.0357, + "step": 19781 + }, + { + "epoch": 1.0772089604051378, + "grad_norm": 0.4884744313035158, + "learning_rate": 9.236201624890776e-05, + "loss": 11.9146, + "step": 19782 + }, + { + "epoch": 1.0772634144017208, + "grad_norm": 0.5199914902622657, + "learning_rate": 9.235322378040522e-05, + "loss": 12.0246, + "step": 19783 + }, + { + "epoch": 1.0773178683983038, + "grad_norm": 0.5091585427844985, + "learning_rate": 9.234443137136529e-05, + "loss": 11.8571, + "step": 19784 + }, + { + "epoch": 1.0773723223948868, + "grad_norm": 0.5342246407419041, + "learning_rate": 9.233563902185633e-05, + "loss": 12.1116, + "step": 19785 + }, + { + "epoch": 1.0774267763914698, + "grad_norm": 0.5571358933957491, + "learning_rate": 9.232684673194676e-05, + "loss": 12.1752, + "step": 19786 + }, + { + "epoch": 1.0774812303880528, + "grad_norm": 0.5312270871426307, + "learning_rate": 9.231805450170495e-05, + "loss": 12.0943, + "step": 19787 + }, + { + "epoch": 1.0775356843846358, + "grad_norm": 0.5564105200114353, + "learning_rate": 9.230926233119918e-05, + "loss": 12.0351, + "step": 19788 + }, + { + "epoch": 1.0775901383812188, + "grad_norm": 0.5238471065822412, + "learning_rate": 9.230047022049788e-05, + "loss": 11.9625, + "step": 19789 + }, + { + "epoch": 1.0776445923778017, + "grad_norm": 0.6448027773800971, + "learning_rate": 9.229167816966943e-05, + "loss": 11.8557, + "step": 19790 + }, + { + "epoch": 1.0776990463743847, + "grad_norm": 0.5475912973962153, + "learning_rate": 9.228288617878217e-05, + "loss": 11.902, + "step": 19791 + }, + { + "epoch": 1.077753500370968, + "grad_norm": 0.48479578615231733, + "learning_rate": 9.227409424790449e-05, + "loss": 11.7981, + "step": 19792 + }, + { + "epoch": 1.077807954367551, + "grad_norm": 0.4969317117219396, + "learning_rate": 9.226530237710474e-05, + "loss": 11.9856, + "step": 19793 + }, + { + "epoch": 1.077862408364134, + "grad_norm": 0.5843041316087823, + "learning_rate": 9.22565105664513e-05, + "loss": 11.9749, + "step": 19794 + }, + { + "epoch": 1.077916862360717, + "grad_norm": 0.4976117563615809, + "learning_rate": 9.224771881601252e-05, + "loss": 12.003, + "step": 19795 + }, + { + "epoch": 1.0779713163573, + "grad_norm": 0.5603380367776644, + "learning_rate": 9.22389271258568e-05, + "loss": 11.9834, + "step": 19796 + }, + { + "epoch": 1.078025770353883, + "grad_norm": 0.5859491969938188, + "learning_rate": 9.223013549605252e-05, + "loss": 12.2125, + "step": 19797 + }, + { + "epoch": 1.078080224350466, + "grad_norm": 0.5718941113825561, + "learning_rate": 9.222134392666797e-05, + "loss": 12.162, + "step": 19798 + }, + { + "epoch": 1.078134678347049, + "grad_norm": 0.4832940013238779, + "learning_rate": 9.221255241777152e-05, + "loss": 12.0352, + "step": 19799 + }, + { + "epoch": 1.078189132343632, + "grad_norm": 0.5301172166326132, + "learning_rate": 9.22037609694316e-05, + "loss": 12.0537, + "step": 19800 + }, + { + "epoch": 1.0782435863402149, + "grad_norm": 0.5360919272697344, + "learning_rate": 9.219496958171656e-05, + "loss": 12.0751, + "step": 19801 + }, + { + "epoch": 1.0782980403367979, + "grad_norm": 0.5142437128335051, + "learning_rate": 9.218617825469471e-05, + "loss": 12.0602, + "step": 19802 + }, + { + "epoch": 1.078352494333381, + "grad_norm": 0.573524430103686, + "learning_rate": 9.217738698843447e-05, + "loss": 12.0742, + "step": 19803 + }, + { + "epoch": 1.078406948329964, + "grad_norm": 0.5571919577366389, + "learning_rate": 9.216859578300418e-05, + "loss": 11.9993, + "step": 19804 + }, + { + "epoch": 1.078461402326547, + "grad_norm": 0.5056902483771932, + "learning_rate": 9.215980463847221e-05, + "loss": 12.0203, + "step": 19805 + }, + { + "epoch": 1.07851585632313, + "grad_norm": 0.5503370149624551, + "learning_rate": 9.215101355490688e-05, + "loss": 12.0564, + "step": 19806 + }, + { + "epoch": 1.078570310319713, + "grad_norm": 0.613107159969414, + "learning_rate": 9.214222253237664e-05, + "loss": 11.9812, + "step": 19807 + }, + { + "epoch": 1.078624764316296, + "grad_norm": 0.5653136291345044, + "learning_rate": 9.213343157094976e-05, + "loss": 12.0614, + "step": 19808 + }, + { + "epoch": 1.078679218312879, + "grad_norm": 0.5599890854147888, + "learning_rate": 9.212464067069464e-05, + "loss": 12.0691, + "step": 19809 + }, + { + "epoch": 1.078733672309462, + "grad_norm": 0.5443108479194124, + "learning_rate": 9.211584983167963e-05, + "loss": 12.139, + "step": 19810 + }, + { + "epoch": 1.078788126306045, + "grad_norm": 0.5315181193854013, + "learning_rate": 9.210705905397307e-05, + "loss": 12.0453, + "step": 19811 + }, + { + "epoch": 1.078842580302628, + "grad_norm": 0.5482503737684693, + "learning_rate": 9.209826833764338e-05, + "loss": 12.0683, + "step": 19812 + }, + { + "epoch": 1.078897034299211, + "grad_norm": 0.513803567111454, + "learning_rate": 9.208947768275886e-05, + "loss": 12.0799, + "step": 19813 + }, + { + "epoch": 1.078951488295794, + "grad_norm": 0.4985326492421896, + "learning_rate": 9.208068708938791e-05, + "loss": 12.0466, + "step": 19814 + }, + { + "epoch": 1.079005942292377, + "grad_norm": 0.525518283907291, + "learning_rate": 9.207189655759885e-05, + "loss": 11.9293, + "step": 19815 + }, + { + "epoch": 1.0790603962889602, + "grad_norm": 0.5482022188079608, + "learning_rate": 9.206310608746005e-05, + "loss": 12.0235, + "step": 19816 + }, + { + "epoch": 1.0791148502855432, + "grad_norm": 0.51705141705295, + "learning_rate": 9.20543156790399e-05, + "loss": 12.1233, + "step": 19817 + }, + { + "epoch": 1.0791693042821262, + "grad_norm": 0.5997363988760009, + "learning_rate": 9.20455253324067e-05, + "loss": 12.0288, + "step": 19818 + }, + { + "epoch": 1.0792237582787092, + "grad_norm": 0.5943740067115404, + "learning_rate": 9.203673504762881e-05, + "loss": 12.0617, + "step": 19819 + }, + { + "epoch": 1.0792782122752922, + "grad_norm": 0.5372814630032169, + "learning_rate": 9.202794482477464e-05, + "loss": 11.9594, + "step": 19820 + }, + { + "epoch": 1.0793326662718752, + "grad_norm": 0.5385223976676541, + "learning_rate": 9.201915466391248e-05, + "loss": 11.9173, + "step": 19821 + }, + { + "epoch": 1.0793871202684582, + "grad_norm": 0.630652030358931, + "learning_rate": 9.201036456511069e-05, + "loss": 12.0707, + "step": 19822 + }, + { + "epoch": 1.0794415742650412, + "grad_norm": 0.5256875959058562, + "learning_rate": 9.200157452843768e-05, + "loss": 12.0294, + "step": 19823 + }, + { + "epoch": 1.0794960282616242, + "grad_norm": 0.5809380055850841, + "learning_rate": 9.199278455396176e-05, + "loss": 12.1731, + "step": 19824 + }, + { + "epoch": 1.0795504822582072, + "grad_norm": 0.4861395432745022, + "learning_rate": 9.19839946417513e-05, + "loss": 11.9302, + "step": 19825 + }, + { + "epoch": 1.0796049362547901, + "grad_norm": 0.5560515010457268, + "learning_rate": 9.197520479187463e-05, + "loss": 12.0977, + "step": 19826 + }, + { + "epoch": 1.0796593902513734, + "grad_norm": 0.5504295389034873, + "learning_rate": 9.196641500440014e-05, + "loss": 11.9997, + "step": 19827 + }, + { + "epoch": 1.0797138442479564, + "grad_norm": 0.5612610027101926, + "learning_rate": 9.195762527939613e-05, + "loss": 12.1131, + "step": 19828 + }, + { + "epoch": 1.0797682982445393, + "grad_norm": 0.5119241540906272, + "learning_rate": 9.194883561693098e-05, + "loss": 11.9476, + "step": 19829 + }, + { + "epoch": 1.0798227522411223, + "grad_norm": 0.5424708040339065, + "learning_rate": 9.194004601707304e-05, + "loss": 11.9638, + "step": 19830 + }, + { + "epoch": 1.0798772062377053, + "grad_norm": 0.5738978063293178, + "learning_rate": 9.193125647989064e-05, + "loss": 12.1109, + "step": 19831 + }, + { + "epoch": 1.0799316602342883, + "grad_norm": 0.6388819616332108, + "learning_rate": 9.192246700545215e-05, + "loss": 12.0414, + "step": 19832 + }, + { + "epoch": 1.0799861142308713, + "grad_norm": 0.5863079235322, + "learning_rate": 9.191367759382587e-05, + "loss": 12.1259, + "step": 19833 + }, + { + "epoch": 1.0800405682274543, + "grad_norm": 0.5865267660978265, + "learning_rate": 9.190488824508024e-05, + "loss": 11.9783, + "step": 19834 + }, + { + "epoch": 1.0800950222240373, + "grad_norm": 0.5942721031349165, + "learning_rate": 9.189609895928353e-05, + "loss": 12.07, + "step": 19835 + }, + { + "epoch": 1.0801494762206203, + "grad_norm": 0.5580497812786545, + "learning_rate": 9.188730973650414e-05, + "loss": 12.0397, + "step": 19836 + }, + { + "epoch": 1.0802039302172033, + "grad_norm": 0.5574042050126845, + "learning_rate": 9.18785205768104e-05, + "loss": 12.1398, + "step": 19837 + }, + { + "epoch": 1.0802583842137863, + "grad_norm": 0.5082332192849419, + "learning_rate": 9.186973148027063e-05, + "loss": 11.935, + "step": 19838 + }, + { + "epoch": 1.0803128382103695, + "grad_norm": 0.506636418655922, + "learning_rate": 9.186094244695318e-05, + "loss": 12.0906, + "step": 19839 + }, + { + "epoch": 1.0803672922069525, + "grad_norm": 0.5550175858574048, + "learning_rate": 9.185215347692641e-05, + "loss": 12.0529, + "step": 19840 + }, + { + "epoch": 1.0804217462035355, + "grad_norm": 0.5306856622699136, + "learning_rate": 9.184336457025866e-05, + "loss": 12.0439, + "step": 19841 + }, + { + "epoch": 1.0804762002001185, + "grad_norm": 0.5751168937192283, + "learning_rate": 9.183457572701828e-05, + "loss": 12.0151, + "step": 19842 + }, + { + "epoch": 1.0805306541967015, + "grad_norm": 0.5028561230990091, + "learning_rate": 9.18257869472736e-05, + "loss": 11.926, + "step": 19843 + }, + { + "epoch": 1.0805851081932845, + "grad_norm": 0.5415314966630953, + "learning_rate": 9.181699823109296e-05, + "loss": 12.0274, + "step": 19844 + }, + { + "epoch": 1.0806395621898675, + "grad_norm": 0.5355038008075436, + "learning_rate": 9.180820957854473e-05, + "loss": 12.0452, + "step": 19845 + }, + { + "epoch": 1.0806940161864504, + "grad_norm": 0.5102761512889469, + "learning_rate": 9.179942098969725e-05, + "loss": 12.0176, + "step": 19846 + }, + { + "epoch": 1.0807484701830334, + "grad_norm": 0.5606942033044104, + "learning_rate": 9.179063246461887e-05, + "loss": 12.0285, + "step": 19847 + }, + { + "epoch": 1.0808029241796164, + "grad_norm": 0.5991012448887261, + "learning_rate": 9.178184400337786e-05, + "loss": 12.1098, + "step": 19848 + }, + { + "epoch": 1.0808573781761994, + "grad_norm": 0.5367535375112495, + "learning_rate": 9.177305560604261e-05, + "loss": 11.9596, + "step": 19849 + }, + { + "epoch": 1.0809118321727826, + "grad_norm": 0.5004396697476243, + "learning_rate": 9.176426727268148e-05, + "loss": 12.0197, + "step": 19850 + }, + { + "epoch": 1.0809662861693656, + "grad_norm": 0.5799104998920007, + "learning_rate": 9.175547900336279e-05, + "loss": 12.2113, + "step": 19851 + }, + { + "epoch": 1.0810207401659486, + "grad_norm": 0.5749851243210843, + "learning_rate": 9.174669079815486e-05, + "loss": 12.086, + "step": 19852 + }, + { + "epoch": 1.0810751941625316, + "grad_norm": 0.5086492814233681, + "learning_rate": 9.173790265712605e-05, + "loss": 11.9306, + "step": 19853 + }, + { + "epoch": 1.0811296481591146, + "grad_norm": 0.5366057011087302, + "learning_rate": 9.172911458034471e-05, + "loss": 12.0747, + "step": 19854 + }, + { + "epoch": 1.0811841021556976, + "grad_norm": 0.5403876564087411, + "learning_rate": 9.172032656787913e-05, + "loss": 11.9863, + "step": 19855 + }, + { + "epoch": 1.0812385561522806, + "grad_norm": 0.5950587307869056, + "learning_rate": 9.171153861979769e-05, + "loss": 12.1185, + "step": 19856 + }, + { + "epoch": 1.0812930101488636, + "grad_norm": 0.6135176279796881, + "learning_rate": 9.170275073616877e-05, + "loss": 12.0065, + "step": 19857 + }, + { + "epoch": 1.0813474641454466, + "grad_norm": 0.5083830167400131, + "learning_rate": 9.169396291706061e-05, + "loss": 11.8831, + "step": 19858 + }, + { + "epoch": 1.0814019181420296, + "grad_norm": 0.5703483834579192, + "learning_rate": 9.168517516254158e-05, + "loss": 12.0295, + "step": 19859 + }, + { + "epoch": 1.0814563721386126, + "grad_norm": 0.6074469083028137, + "learning_rate": 9.167638747268002e-05, + "loss": 12.0883, + "step": 19860 + }, + { + "epoch": 1.0815108261351956, + "grad_norm": 0.5191785966022419, + "learning_rate": 9.166759984754428e-05, + "loss": 12.0195, + "step": 19861 + }, + { + "epoch": 1.0815652801317788, + "grad_norm": 0.5605617016454646, + "learning_rate": 9.165881228720267e-05, + "loss": 12.1311, + "step": 19862 + }, + { + "epoch": 1.0816197341283618, + "grad_norm": 0.5390198212079476, + "learning_rate": 9.165002479172354e-05, + "loss": 12.1054, + "step": 19863 + }, + { + "epoch": 1.0816741881249448, + "grad_norm": 0.5659962448785403, + "learning_rate": 9.164123736117523e-05, + "loss": 12.1215, + "step": 19864 + }, + { + "epoch": 1.0817286421215278, + "grad_norm": 0.5921829218396504, + "learning_rate": 9.163244999562604e-05, + "loss": 12.0794, + "step": 19865 + }, + { + "epoch": 1.0817830961181107, + "grad_norm": 0.5304361683476481, + "learning_rate": 9.162366269514432e-05, + "loss": 12.1065, + "step": 19866 + }, + { + "epoch": 1.0818375501146937, + "grad_norm": 0.5807102749268369, + "learning_rate": 9.161487545979844e-05, + "loss": 12.0703, + "step": 19867 + }, + { + "epoch": 1.0818920041112767, + "grad_norm": 0.5276436081169286, + "learning_rate": 9.160608828965666e-05, + "loss": 11.9785, + "step": 19868 + }, + { + "epoch": 1.0819464581078597, + "grad_norm": 0.5661969343941075, + "learning_rate": 9.159730118478737e-05, + "loss": 12.0878, + "step": 19869 + }, + { + "epoch": 1.0820009121044427, + "grad_norm": 0.5362217126007136, + "learning_rate": 9.158851414525886e-05, + "loss": 11.8735, + "step": 19870 + }, + { + "epoch": 1.0820553661010257, + "grad_norm": 0.5508481037378804, + "learning_rate": 9.157972717113945e-05, + "loss": 12.1429, + "step": 19871 + }, + { + "epoch": 1.0821098200976087, + "grad_norm": 0.562342947545884, + "learning_rate": 9.15709402624975e-05, + "loss": 12.1656, + "step": 19872 + }, + { + "epoch": 1.082164274094192, + "grad_norm": 0.500928640737876, + "learning_rate": 9.156215341940136e-05, + "loss": 12.081, + "step": 19873 + }, + { + "epoch": 1.082218728090775, + "grad_norm": 0.4966378820375146, + "learning_rate": 9.155336664191932e-05, + "loss": 12.1059, + "step": 19874 + }, + { + "epoch": 1.082273182087358, + "grad_norm": 0.5528438165037192, + "learning_rate": 9.154457993011972e-05, + "loss": 11.9159, + "step": 19875 + }, + { + "epoch": 1.082327636083941, + "grad_norm": 0.5484776550801352, + "learning_rate": 9.153579328407088e-05, + "loss": 11.9934, + "step": 19876 + }, + { + "epoch": 1.0823820900805239, + "grad_norm": 0.5575050542309941, + "learning_rate": 9.152700670384116e-05, + "loss": 12.0534, + "step": 19877 + }, + { + "epoch": 1.0824365440771069, + "grad_norm": 0.5426585602757926, + "learning_rate": 9.151822018949881e-05, + "loss": 12.0549, + "step": 19878 + }, + { + "epoch": 1.0824909980736899, + "grad_norm": 0.6123470415390558, + "learning_rate": 9.150943374111222e-05, + "loss": 11.9764, + "step": 19879 + }, + { + "epoch": 1.0825454520702729, + "grad_norm": 0.5821696164344822, + "learning_rate": 9.15006473587497e-05, + "loss": 12.0439, + "step": 19880 + }, + { + "epoch": 1.0825999060668559, + "grad_norm": 0.5430943652819135, + "learning_rate": 9.149186104247958e-05, + "loss": 12.1013, + "step": 19881 + }, + { + "epoch": 1.0826543600634388, + "grad_norm": 0.6327773300958239, + "learning_rate": 9.148307479237014e-05, + "loss": 12.0013, + "step": 19882 + }, + { + "epoch": 1.0827088140600218, + "grad_norm": 0.518913600156964, + "learning_rate": 9.147428860848977e-05, + "loss": 12.094, + "step": 19883 + }, + { + "epoch": 1.0827632680566048, + "grad_norm": 0.5645919083115407, + "learning_rate": 9.146550249090675e-05, + "loss": 11.8218, + "step": 19884 + }, + { + "epoch": 1.0828177220531878, + "grad_norm": 0.5441439766884042, + "learning_rate": 9.145671643968942e-05, + "loss": 12.0403, + "step": 19885 + }, + { + "epoch": 1.082872176049771, + "grad_norm": 0.5940575936371498, + "learning_rate": 9.14479304549061e-05, + "loss": 12.0329, + "step": 19886 + }, + { + "epoch": 1.082926630046354, + "grad_norm": 0.6043211414787795, + "learning_rate": 9.143914453662512e-05, + "loss": 12.0626, + "step": 19887 + }, + { + "epoch": 1.082981084042937, + "grad_norm": 0.541390862693401, + "learning_rate": 9.143035868491476e-05, + "loss": 12.048, + "step": 19888 + }, + { + "epoch": 1.08303553803952, + "grad_norm": 0.531445579848505, + "learning_rate": 9.142157289984337e-05, + "loss": 12.0976, + "step": 19889 + }, + { + "epoch": 1.083089992036103, + "grad_norm": 0.513369597722478, + "learning_rate": 9.141278718147929e-05, + "loss": 11.9035, + "step": 19890 + }, + { + "epoch": 1.083144446032686, + "grad_norm": 0.5891037711902334, + "learning_rate": 9.140400152989079e-05, + "loss": 12.0648, + "step": 19891 + }, + { + "epoch": 1.083198900029269, + "grad_norm": 0.5543595431977636, + "learning_rate": 9.139521594514623e-05, + "loss": 12.0995, + "step": 19892 + }, + { + "epoch": 1.083253354025852, + "grad_norm": 0.5438529356820038, + "learning_rate": 9.138643042731389e-05, + "loss": 12.0272, + "step": 19893 + }, + { + "epoch": 1.083307808022435, + "grad_norm": 0.5467179470834783, + "learning_rate": 9.137764497646213e-05, + "loss": 12.1245, + "step": 19894 + }, + { + "epoch": 1.083362262019018, + "grad_norm": 0.4989757185609027, + "learning_rate": 9.136885959265926e-05, + "loss": 12.0709, + "step": 19895 + }, + { + "epoch": 1.083416716015601, + "grad_norm": 0.54996556335291, + "learning_rate": 9.136007427597358e-05, + "loss": 11.9735, + "step": 19896 + }, + { + "epoch": 1.0834711700121842, + "grad_norm": 0.5670794449956671, + "learning_rate": 9.135128902647344e-05, + "loss": 12.0571, + "step": 19897 + }, + { + "epoch": 1.0835256240087672, + "grad_norm": 0.5164709777065184, + "learning_rate": 9.134250384422708e-05, + "loss": 12.0582, + "step": 19898 + }, + { + "epoch": 1.0835800780053502, + "grad_norm": 0.5613349135182275, + "learning_rate": 9.133371872930288e-05, + "loss": 12.0784, + "step": 19899 + }, + { + "epoch": 1.0836345320019332, + "grad_norm": 0.5563800171494543, + "learning_rate": 9.132493368176913e-05, + "loss": 12.08, + "step": 19900 + }, + { + "epoch": 1.0836889859985162, + "grad_norm": 0.5206282421692374, + "learning_rate": 9.131614870169416e-05, + "loss": 12.1176, + "step": 19901 + }, + { + "epoch": 1.0837434399950991, + "grad_norm": 0.5123339744790743, + "learning_rate": 9.130736378914627e-05, + "loss": 12.1279, + "step": 19902 + }, + { + "epoch": 1.0837978939916821, + "grad_norm": 0.5556191526608115, + "learning_rate": 9.129857894419377e-05, + "loss": 11.9749, + "step": 19903 + }, + { + "epoch": 1.0838523479882651, + "grad_norm": 0.5499176741013869, + "learning_rate": 9.128979416690497e-05, + "loss": 11.9974, + "step": 19904 + }, + { + "epoch": 1.0839068019848481, + "grad_norm": 0.49136687224716824, + "learning_rate": 9.12810094573482e-05, + "loss": 11.9636, + "step": 19905 + }, + { + "epoch": 1.0839612559814311, + "grad_norm": 0.5667764010346874, + "learning_rate": 9.127222481559176e-05, + "loss": 12.1023, + "step": 19906 + }, + { + "epoch": 1.084015709978014, + "grad_norm": 0.5326825165723065, + "learning_rate": 9.126344024170402e-05, + "loss": 12.0809, + "step": 19907 + }, + { + "epoch": 1.084070163974597, + "grad_norm": 0.5370302873191101, + "learning_rate": 9.125465573575316e-05, + "loss": 12.0619, + "step": 19908 + }, + { + "epoch": 1.0841246179711803, + "grad_norm": 0.5560791563957217, + "learning_rate": 9.12458712978076e-05, + "loss": 12.0018, + "step": 19909 + }, + { + "epoch": 1.0841790719677633, + "grad_norm": 0.5058743738074365, + "learning_rate": 9.12370869279356e-05, + "loss": 12.0217, + "step": 19910 + }, + { + "epoch": 1.0842335259643463, + "grad_norm": 0.5990810630139142, + "learning_rate": 9.122830262620547e-05, + "loss": 12.1044, + "step": 19911 + }, + { + "epoch": 1.0842879799609293, + "grad_norm": 0.5410580811988184, + "learning_rate": 9.121951839268556e-05, + "loss": 11.9739, + "step": 19912 + }, + { + "epoch": 1.0843424339575123, + "grad_norm": 0.5568019602049695, + "learning_rate": 9.121073422744413e-05, + "loss": 12.1421, + "step": 19913 + }, + { + "epoch": 1.0843968879540953, + "grad_norm": 0.5569345235690699, + "learning_rate": 9.12019501305495e-05, + "loss": 11.8679, + "step": 19914 + }, + { + "epoch": 1.0844513419506783, + "grad_norm": 0.5183950269777624, + "learning_rate": 9.119316610206998e-05, + "loss": 12.0276, + "step": 19915 + }, + { + "epoch": 1.0845057959472613, + "grad_norm": 0.5064129400906593, + "learning_rate": 9.118438214207387e-05, + "loss": 12.1255, + "step": 19916 + }, + { + "epoch": 1.0845602499438443, + "grad_norm": 0.6021752218151646, + "learning_rate": 9.117559825062953e-05, + "loss": 12.0236, + "step": 19917 + }, + { + "epoch": 1.0846147039404272, + "grad_norm": 0.5157551802055335, + "learning_rate": 9.116681442780519e-05, + "loss": 11.9216, + "step": 19918 + }, + { + "epoch": 1.0846691579370102, + "grad_norm": 0.5436451505997936, + "learning_rate": 9.115803067366918e-05, + "loss": 12.1423, + "step": 19919 + }, + { + "epoch": 1.0847236119335935, + "grad_norm": 0.5973381170371647, + "learning_rate": 9.114924698828978e-05, + "loss": 12.0881, + "step": 19920 + }, + { + "epoch": 1.0847780659301764, + "grad_norm": 0.5206950377698941, + "learning_rate": 9.114046337173534e-05, + "loss": 12.1696, + "step": 19921 + }, + { + "epoch": 1.0848325199267594, + "grad_norm": 0.5343122491422189, + "learning_rate": 9.113167982407414e-05, + "loss": 12.0945, + "step": 19922 + }, + { + "epoch": 1.0848869739233424, + "grad_norm": 0.5405889694086997, + "learning_rate": 9.11228963453745e-05, + "loss": 12.1126, + "step": 19923 + }, + { + "epoch": 1.0849414279199254, + "grad_norm": 0.6113488144750007, + "learning_rate": 9.11141129357047e-05, + "loss": 11.9141, + "step": 19924 + }, + { + "epoch": 1.0849958819165084, + "grad_norm": 0.5452989191660556, + "learning_rate": 9.110532959513304e-05, + "loss": 12.0923, + "step": 19925 + }, + { + "epoch": 1.0850503359130914, + "grad_norm": 0.5544969239297456, + "learning_rate": 9.109654632372784e-05, + "loss": 12.088, + "step": 19926 + }, + { + "epoch": 1.0851047899096744, + "grad_norm": 0.6536920844173818, + "learning_rate": 9.10877631215574e-05, + "loss": 12.079, + "step": 19927 + }, + { + "epoch": 1.0851592439062574, + "grad_norm": 0.5271331003375122, + "learning_rate": 9.107897998869e-05, + "loss": 12.1115, + "step": 19928 + }, + { + "epoch": 1.0852136979028404, + "grad_norm": 0.49071808736334793, + "learning_rate": 9.107019692519393e-05, + "loss": 11.9722, + "step": 19929 + }, + { + "epoch": 1.0852681518994234, + "grad_norm": 0.6372190248928601, + "learning_rate": 9.106141393113752e-05, + "loss": 12.0689, + "step": 19930 + }, + { + "epoch": 1.0853226058960064, + "grad_norm": 0.5980830574240124, + "learning_rate": 9.105263100658902e-05, + "loss": 12.0512, + "step": 19931 + }, + { + "epoch": 1.0853770598925896, + "grad_norm": 0.5493450308605987, + "learning_rate": 9.10438481516168e-05, + "loss": 12.0894, + "step": 19932 + }, + { + "epoch": 1.0854315138891726, + "grad_norm": 0.6556303700259074, + "learning_rate": 9.10350653662891e-05, + "loss": 12.1051, + "step": 19933 + }, + { + "epoch": 1.0854859678857556, + "grad_norm": 0.5177009660696416, + "learning_rate": 9.102628265067425e-05, + "loss": 11.9703, + "step": 19934 + }, + { + "epoch": 1.0855404218823386, + "grad_norm": 0.5046612680720217, + "learning_rate": 9.101750000484052e-05, + "loss": 12.0555, + "step": 19935 + }, + { + "epoch": 1.0855948758789216, + "grad_norm": 0.5705460610848567, + "learning_rate": 9.100871742885622e-05, + "loss": 11.8679, + "step": 19936 + }, + { + "epoch": 1.0856493298755046, + "grad_norm": 0.5619400076838802, + "learning_rate": 9.099993492278965e-05, + "loss": 12.1102, + "step": 19937 + }, + { + "epoch": 1.0857037838720875, + "grad_norm": 0.5665782307003279, + "learning_rate": 9.09911524867091e-05, + "loss": 12.0628, + "step": 19938 + }, + { + "epoch": 1.0857582378686705, + "grad_norm": 0.5035529583625047, + "learning_rate": 9.098237012068286e-05, + "loss": 11.9725, + "step": 19939 + }, + { + "epoch": 1.0858126918652535, + "grad_norm": 0.5110637395954961, + "learning_rate": 9.09735878247792e-05, + "loss": 11.9253, + "step": 19940 + }, + { + "epoch": 1.0858671458618365, + "grad_norm": 0.539311907986121, + "learning_rate": 9.096480559906645e-05, + "loss": 12.0985, + "step": 19941 + }, + { + "epoch": 1.0859215998584195, + "grad_norm": 0.6409624825959144, + "learning_rate": 9.095602344361286e-05, + "loss": 12.023, + "step": 19942 + }, + { + "epoch": 1.0859760538550027, + "grad_norm": 0.5882799322296413, + "learning_rate": 9.094724135848677e-05, + "loss": 12.0073, + "step": 19943 + }, + { + "epoch": 1.0860305078515857, + "grad_norm": 0.546907140635376, + "learning_rate": 9.093845934375645e-05, + "loss": 11.8597, + "step": 19944 + }, + { + "epoch": 1.0860849618481687, + "grad_norm": 0.5879607208007526, + "learning_rate": 9.092967739949019e-05, + "loss": 11.9852, + "step": 19945 + }, + { + "epoch": 1.0861394158447517, + "grad_norm": 0.5450063255763744, + "learning_rate": 9.092089552575628e-05, + "loss": 12.0052, + "step": 19946 + }, + { + "epoch": 1.0861938698413347, + "grad_norm": 0.5314778428648255, + "learning_rate": 9.0912113722623e-05, + "loss": 12.0105, + "step": 19947 + }, + { + "epoch": 1.0862483238379177, + "grad_norm": 0.5585952771522082, + "learning_rate": 9.090333199015868e-05, + "loss": 12.2198, + "step": 19948 + }, + { + "epoch": 1.0863027778345007, + "grad_norm": 0.5732565799501155, + "learning_rate": 9.089455032843155e-05, + "loss": 12.0317, + "step": 19949 + }, + { + "epoch": 1.0863572318310837, + "grad_norm": 0.5765505756080281, + "learning_rate": 9.088576873750992e-05, + "loss": 12.2115, + "step": 19950 + }, + { + "epoch": 1.0864116858276667, + "grad_norm": 0.5513923575328709, + "learning_rate": 9.087698721746209e-05, + "loss": 12.103, + "step": 19951 + }, + { + "epoch": 1.0864661398242497, + "grad_norm": 0.5820435479206023, + "learning_rate": 9.086820576835634e-05, + "loss": 12.0404, + "step": 19952 + }, + { + "epoch": 1.0865205938208327, + "grad_norm": 0.6022803029144533, + "learning_rate": 9.085942439026092e-05, + "loss": 12.0666, + "step": 19953 + }, + { + "epoch": 1.0865750478174157, + "grad_norm": 0.5037362536968005, + "learning_rate": 9.085064308324418e-05, + "loss": 11.9326, + "step": 19954 + }, + { + "epoch": 1.0866295018139986, + "grad_norm": 0.5629619369554842, + "learning_rate": 9.084186184737437e-05, + "loss": 12.075, + "step": 19955 + }, + { + "epoch": 1.0866839558105819, + "grad_norm": 0.7293732390996722, + "learning_rate": 9.083308068271977e-05, + "loss": 12.0475, + "step": 19956 + }, + { + "epoch": 1.0867384098071649, + "grad_norm": 0.6011848720393436, + "learning_rate": 9.082429958934869e-05, + "loss": 11.9534, + "step": 19957 + }, + { + "epoch": 1.0867928638037478, + "grad_norm": 0.6314019833993705, + "learning_rate": 9.08155185673294e-05, + "loss": 12.1621, + "step": 19958 + }, + { + "epoch": 1.0868473178003308, + "grad_norm": 0.5321012509526907, + "learning_rate": 9.080673761673016e-05, + "loss": 11.9822, + "step": 19959 + }, + { + "epoch": 1.0869017717969138, + "grad_norm": 0.5358458161111209, + "learning_rate": 9.079795673761927e-05, + "loss": 12.0879, + "step": 19960 + }, + { + "epoch": 1.0869562257934968, + "grad_norm": 0.5516116169410465, + "learning_rate": 9.078917593006502e-05, + "loss": 12.0031, + "step": 19961 + }, + { + "epoch": 1.0870106797900798, + "grad_norm": 0.5659572430474552, + "learning_rate": 9.078039519413567e-05, + "loss": 12.1379, + "step": 19962 + }, + { + "epoch": 1.0870651337866628, + "grad_norm": 0.5606247963131835, + "learning_rate": 9.077161452989952e-05, + "loss": 11.9215, + "step": 19963 + }, + { + "epoch": 1.0871195877832458, + "grad_norm": 0.5748818529484704, + "learning_rate": 9.076283393742484e-05, + "loss": 11.9057, + "step": 19964 + }, + { + "epoch": 1.0871740417798288, + "grad_norm": 0.6397330396077927, + "learning_rate": 9.075405341677989e-05, + "loss": 12.0713, + "step": 19965 + }, + { + "epoch": 1.087228495776412, + "grad_norm": 0.5587576937613958, + "learning_rate": 9.0745272968033e-05, + "loss": 12.0238, + "step": 19966 + }, + { + "epoch": 1.087282949772995, + "grad_norm": 0.5372686915159008, + "learning_rate": 9.073649259125242e-05, + "loss": 11.7746, + "step": 19967 + }, + { + "epoch": 1.087337403769578, + "grad_norm": 0.5633905760899219, + "learning_rate": 9.072771228650646e-05, + "loss": 11.9728, + "step": 19968 + }, + { + "epoch": 1.087391857766161, + "grad_norm": 0.5949007043908271, + "learning_rate": 9.071893205386331e-05, + "loss": 12.115, + "step": 19969 + }, + { + "epoch": 1.087446311762744, + "grad_norm": 0.5807917281893856, + "learning_rate": 9.071015189339131e-05, + "loss": 11.9429, + "step": 19970 + }, + { + "epoch": 1.087500765759327, + "grad_norm": 0.5756543238824169, + "learning_rate": 9.070137180515875e-05, + "loss": 12.053, + "step": 19971 + }, + { + "epoch": 1.08755521975591, + "grad_norm": 0.5297283885852245, + "learning_rate": 9.069259178923386e-05, + "loss": 11.8516, + "step": 19972 + }, + { + "epoch": 1.087609673752493, + "grad_norm": 0.5551320857290687, + "learning_rate": 9.068381184568494e-05, + "loss": 12.0597, + "step": 19973 + }, + { + "epoch": 1.087664127749076, + "grad_norm": 0.5625803842858186, + "learning_rate": 9.067503197458027e-05, + "loss": 11.9823, + "step": 19974 + }, + { + "epoch": 1.087718581745659, + "grad_norm": 0.5795953180190437, + "learning_rate": 9.066625217598812e-05, + "loss": 12.1733, + "step": 19975 + }, + { + "epoch": 1.087773035742242, + "grad_norm": 0.5473935443239211, + "learning_rate": 9.06574724499767e-05, + "loss": 12.0024, + "step": 19976 + }, + { + "epoch": 1.087827489738825, + "grad_norm": 0.5667668328190282, + "learning_rate": 9.064869279661439e-05, + "loss": 11.9631, + "step": 19977 + }, + { + "epoch": 1.087881943735408, + "grad_norm": 0.5266704522455978, + "learning_rate": 9.063991321596948e-05, + "loss": 12.045, + "step": 19978 + }, + { + "epoch": 1.0879363977319911, + "grad_norm": 0.5844567425175714, + "learning_rate": 9.063113370811009e-05, + "loss": 12.139, + "step": 19979 + }, + { + "epoch": 1.0879908517285741, + "grad_norm": 0.5825710730257588, + "learning_rate": 9.062235427310457e-05, + "loss": 12.0948, + "step": 19980 + }, + { + "epoch": 1.0880453057251571, + "grad_norm": 0.6299771286484094, + "learning_rate": 9.061357491102122e-05, + "loss": 12.0189, + "step": 19981 + }, + { + "epoch": 1.0880997597217401, + "grad_norm": 0.5363310138015335, + "learning_rate": 9.060479562192829e-05, + "loss": 11.8844, + "step": 19982 + }, + { + "epoch": 1.088154213718323, + "grad_norm": 0.5707402502112603, + "learning_rate": 9.059601640589403e-05, + "loss": 12.0814, + "step": 19983 + }, + { + "epoch": 1.088208667714906, + "grad_norm": 0.5639069057481662, + "learning_rate": 9.058723726298673e-05, + "loss": 11.9652, + "step": 19984 + }, + { + "epoch": 1.088263121711489, + "grad_norm": 0.738216354293828, + "learning_rate": 9.057845819327466e-05, + "loss": 12.0537, + "step": 19985 + }, + { + "epoch": 1.088317575708072, + "grad_norm": 0.5441504601844046, + "learning_rate": 9.056967919682608e-05, + "loss": 12.0143, + "step": 19986 + }, + { + "epoch": 1.088372029704655, + "grad_norm": 0.6504410720166462, + "learning_rate": 9.056090027370923e-05, + "loss": 12.1874, + "step": 19987 + }, + { + "epoch": 1.088426483701238, + "grad_norm": 0.6195252656867826, + "learning_rate": 9.055212142399245e-05, + "loss": 12.0596, + "step": 19988 + }, + { + "epoch": 1.088480937697821, + "grad_norm": 0.5673916365171481, + "learning_rate": 9.054334264774394e-05, + "loss": 12.1216, + "step": 19989 + }, + { + "epoch": 1.0885353916944043, + "grad_norm": 0.5562905035399163, + "learning_rate": 9.053456394503197e-05, + "loss": 11.995, + "step": 19990 + }, + { + "epoch": 1.0885898456909873, + "grad_norm": 0.6864049145436131, + "learning_rate": 9.052578531592479e-05, + "loss": 11.9271, + "step": 19991 + }, + { + "epoch": 1.0886442996875703, + "grad_norm": 0.554609993264955, + "learning_rate": 9.051700676049073e-05, + "loss": 11.9662, + "step": 19992 + }, + { + "epoch": 1.0886987536841533, + "grad_norm": 0.5390644716978882, + "learning_rate": 9.050822827879801e-05, + "loss": 11.9687, + "step": 19993 + }, + { + "epoch": 1.0887532076807362, + "grad_norm": 0.6197942093873294, + "learning_rate": 9.04994498709149e-05, + "loss": 11.9846, + "step": 19994 + }, + { + "epoch": 1.0888076616773192, + "grad_norm": 0.5532516314359079, + "learning_rate": 9.049067153690965e-05, + "loss": 12.0499, + "step": 19995 + }, + { + "epoch": 1.0888621156739022, + "grad_norm": 0.5730859665342325, + "learning_rate": 9.048189327685055e-05, + "loss": 12.216, + "step": 19996 + }, + { + "epoch": 1.0889165696704852, + "grad_norm": 0.5500219671907687, + "learning_rate": 9.047311509080584e-05, + "loss": 12.0319, + "step": 19997 + }, + { + "epoch": 1.0889710236670682, + "grad_norm": 0.5921074346229978, + "learning_rate": 9.04643369788438e-05, + "loss": 12.1958, + "step": 19998 + }, + { + "epoch": 1.0890254776636512, + "grad_norm": 0.6028458643295544, + "learning_rate": 9.045555894103265e-05, + "loss": 11.9421, + "step": 19999 + }, + { + "epoch": 1.0890799316602342, + "grad_norm": 0.5073641549341452, + "learning_rate": 9.044678097744068e-05, + "loss": 12.0276, + "step": 20000 + }, + { + "epoch": 1.0891343856568172, + "grad_norm": 0.6065992995903755, + "learning_rate": 9.043800308813614e-05, + "loss": 12.1435, + "step": 20001 + }, + { + "epoch": 1.0891888396534004, + "grad_norm": 0.5643750133310085, + "learning_rate": 9.042922527318728e-05, + "loss": 12.0401, + "step": 20002 + }, + { + "epoch": 1.0892432936499834, + "grad_norm": 0.6550555665516066, + "learning_rate": 9.042044753266238e-05, + "loss": 12.1827, + "step": 20003 + }, + { + "epoch": 1.0892977476465664, + "grad_norm": 0.4899034343825559, + "learning_rate": 9.04116698666297e-05, + "loss": 12.0562, + "step": 20004 + }, + { + "epoch": 1.0893522016431494, + "grad_norm": 0.6185593649271679, + "learning_rate": 9.040289227515745e-05, + "loss": 12.0924, + "step": 20005 + }, + { + "epoch": 1.0894066556397324, + "grad_norm": 0.570663308596088, + "learning_rate": 9.039411475831395e-05, + "loss": 11.9767, + "step": 20006 + }, + { + "epoch": 1.0894611096363154, + "grad_norm": 0.5268150422309706, + "learning_rate": 9.038533731616741e-05, + "loss": 12.0193, + "step": 20007 + }, + { + "epoch": 1.0895155636328984, + "grad_norm": 0.5258947574793097, + "learning_rate": 9.037655994878614e-05, + "loss": 12.1018, + "step": 20008 + }, + { + "epoch": 1.0895700176294814, + "grad_norm": 0.575274588880538, + "learning_rate": 9.036778265623832e-05, + "loss": 11.9754, + "step": 20009 + }, + { + "epoch": 1.0896244716260644, + "grad_norm": 0.5802939253572615, + "learning_rate": 9.035900543859224e-05, + "loss": 12.0168, + "step": 20010 + }, + { + "epoch": 1.0896789256226473, + "grad_norm": 0.5122724331256518, + "learning_rate": 9.035022829591613e-05, + "loss": 11.9511, + "step": 20011 + }, + { + "epoch": 1.0897333796192303, + "grad_norm": 0.4769146188379691, + "learning_rate": 9.03414512282783e-05, + "loss": 11.9888, + "step": 20012 + }, + { + "epoch": 1.0897878336158136, + "grad_norm": 0.5265505779491405, + "learning_rate": 9.033267423574693e-05, + "loss": 11.9005, + "step": 20013 + }, + { + "epoch": 1.0898422876123965, + "grad_norm": 0.5333724765858119, + "learning_rate": 9.032389731839031e-05, + "loss": 12.0755, + "step": 20014 + }, + { + "epoch": 1.0898967416089795, + "grad_norm": 0.5193352601069574, + "learning_rate": 9.03151204762767e-05, + "loss": 12.1701, + "step": 20015 + }, + { + "epoch": 1.0899511956055625, + "grad_norm": 0.49464098071032003, + "learning_rate": 9.030634370947433e-05, + "loss": 12.0579, + "step": 20016 + }, + { + "epoch": 1.0900056496021455, + "grad_norm": 0.568910824507174, + "learning_rate": 9.029756701805147e-05, + "loss": 12.0425, + "step": 20017 + }, + { + "epoch": 1.0900601035987285, + "grad_norm": 0.5670081222423411, + "learning_rate": 9.028879040207638e-05, + "loss": 11.9655, + "step": 20018 + }, + { + "epoch": 1.0901145575953115, + "grad_norm": 0.5209534940742387, + "learning_rate": 9.028001386161724e-05, + "loss": 12.0386, + "step": 20019 + }, + { + "epoch": 1.0901690115918945, + "grad_norm": 0.5257222292295778, + "learning_rate": 9.027123739674236e-05, + "loss": 12.1165, + "step": 20020 + }, + { + "epoch": 1.0902234655884775, + "grad_norm": 0.562692073697708, + "learning_rate": 9.026246100751996e-05, + "loss": 12.1326, + "step": 20021 + }, + { + "epoch": 1.0902779195850605, + "grad_norm": 0.5356455064032006, + "learning_rate": 9.02536846940183e-05, + "loss": 12.1093, + "step": 20022 + }, + { + "epoch": 1.0903323735816435, + "grad_norm": 0.6178913806743136, + "learning_rate": 9.024490845630564e-05, + "loss": 12.0731, + "step": 20023 + }, + { + "epoch": 1.0903868275782265, + "grad_norm": 0.5086643420023287, + "learning_rate": 9.023613229445018e-05, + "loss": 12.0523, + "step": 20024 + }, + { + "epoch": 1.0904412815748097, + "grad_norm": 0.5219656284534158, + "learning_rate": 9.022735620852019e-05, + "loss": 11.7372, + "step": 20025 + }, + { + "epoch": 1.0904957355713927, + "grad_norm": 0.5162106532165133, + "learning_rate": 9.021858019858393e-05, + "loss": 11.9562, + "step": 20026 + }, + { + "epoch": 1.0905501895679757, + "grad_norm": 0.5000648210155602, + "learning_rate": 9.020980426470963e-05, + "loss": 11.9445, + "step": 20027 + }, + { + "epoch": 1.0906046435645587, + "grad_norm": 0.534566777693027, + "learning_rate": 9.020102840696558e-05, + "loss": 12.0463, + "step": 20028 + }, + { + "epoch": 1.0906590975611417, + "grad_norm": 0.5624082349442754, + "learning_rate": 9.01922526254199e-05, + "loss": 11.9304, + "step": 20029 + }, + { + "epoch": 1.0907135515577246, + "grad_norm": 0.551904556369177, + "learning_rate": 9.018347692014095e-05, + "loss": 11.9993, + "step": 20030 + }, + { + "epoch": 1.0907680055543076, + "grad_norm": 0.5942305762390805, + "learning_rate": 9.017470129119692e-05, + "loss": 12.092, + "step": 20031 + }, + { + "epoch": 1.0908224595508906, + "grad_norm": 0.5024243307789622, + "learning_rate": 9.016592573865606e-05, + "loss": 12.0821, + "step": 20032 + }, + { + "epoch": 1.0908769135474736, + "grad_norm": 0.52087478799685, + "learning_rate": 9.01571502625866e-05, + "loss": 12.0875, + "step": 20033 + }, + { + "epoch": 1.0909313675440566, + "grad_norm": 0.49630439482687516, + "learning_rate": 9.014837486305682e-05, + "loss": 12.0044, + "step": 20034 + }, + { + "epoch": 1.0909858215406396, + "grad_norm": 0.5379210037700731, + "learning_rate": 9.013959954013492e-05, + "loss": 12.0714, + "step": 20035 + }, + { + "epoch": 1.0910402755372228, + "grad_norm": 0.5401713498441595, + "learning_rate": 9.01308242938891e-05, + "loss": 11.8857, + "step": 20036 + }, + { + "epoch": 1.0910947295338058, + "grad_norm": 0.5356267210334869, + "learning_rate": 9.012204912438769e-05, + "loss": 12.1584, + "step": 20037 + }, + { + "epoch": 1.0911491835303888, + "grad_norm": 0.5685396376260402, + "learning_rate": 9.011327403169891e-05, + "loss": 12.0256, + "step": 20038 + }, + { + "epoch": 1.0912036375269718, + "grad_norm": 0.6397245614974457, + "learning_rate": 9.010449901589094e-05, + "loss": 12.0091, + "step": 20039 + }, + { + "epoch": 1.0912580915235548, + "grad_norm": 0.5338238370933304, + "learning_rate": 9.009572407703201e-05, + "loss": 11.9764, + "step": 20040 + }, + { + "epoch": 1.0913125455201378, + "grad_norm": 0.5576473803063768, + "learning_rate": 9.008694921519044e-05, + "loss": 11.9742, + "step": 20041 + }, + { + "epoch": 1.0913669995167208, + "grad_norm": 0.5860825346362053, + "learning_rate": 9.00781744304344e-05, + "loss": 12.0667, + "step": 20042 + }, + { + "epoch": 1.0914214535133038, + "grad_norm": 0.5144441272948119, + "learning_rate": 9.006939972283213e-05, + "loss": 11.9995, + "step": 20043 + }, + { + "epoch": 1.0914759075098868, + "grad_norm": 0.5611980194921362, + "learning_rate": 9.006062509245188e-05, + "loss": 12.1328, + "step": 20044 + }, + { + "epoch": 1.0915303615064698, + "grad_norm": 0.5431351711512413, + "learning_rate": 9.005185053936186e-05, + "loss": 12.0958, + "step": 20045 + }, + { + "epoch": 1.0915848155030528, + "grad_norm": 0.6682242249022382, + "learning_rate": 9.004307606363033e-05, + "loss": 12.0687, + "step": 20046 + }, + { + "epoch": 1.0916392694996357, + "grad_norm": 0.5671285226395476, + "learning_rate": 9.00343016653255e-05, + "loss": 11.931, + "step": 20047 + }, + { + "epoch": 1.0916937234962187, + "grad_norm": 0.5021756500744794, + "learning_rate": 9.002552734451566e-05, + "loss": 11.944, + "step": 20048 + }, + { + "epoch": 1.091748177492802, + "grad_norm": 0.5825252285222624, + "learning_rate": 9.001675310126897e-05, + "loss": 12.0314, + "step": 20049 + }, + { + "epoch": 1.091802631489385, + "grad_norm": 0.608147473703928, + "learning_rate": 9.000797893565367e-05, + "loss": 12.222, + "step": 20050 + }, + { + "epoch": 1.091857085485968, + "grad_norm": 0.5639127355682383, + "learning_rate": 8.999920484773798e-05, + "loss": 11.8921, + "step": 20051 + }, + { + "epoch": 1.091911539482551, + "grad_norm": 0.5647725739831364, + "learning_rate": 8.999043083759017e-05, + "loss": 11.9675, + "step": 20052 + }, + { + "epoch": 1.091965993479134, + "grad_norm": 0.5480108093055222, + "learning_rate": 8.998165690527846e-05, + "loss": 12.0654, + "step": 20053 + }, + { + "epoch": 1.092020447475717, + "grad_norm": 0.6067634857647759, + "learning_rate": 8.997288305087104e-05, + "loss": 12.0903, + "step": 20054 + }, + { + "epoch": 1.0920749014723, + "grad_norm": 0.5585022925172852, + "learning_rate": 8.996410927443619e-05, + "loss": 11.9205, + "step": 20055 + }, + { + "epoch": 1.092129355468883, + "grad_norm": 0.5333906347604619, + "learning_rate": 8.99553355760421e-05, + "loss": 12.0811, + "step": 20056 + }, + { + "epoch": 1.092183809465466, + "grad_norm": 0.5919546618318178, + "learning_rate": 8.994656195575699e-05, + "loss": 12.0538, + "step": 20057 + }, + { + "epoch": 1.0922382634620489, + "grad_norm": 0.5386646282360394, + "learning_rate": 8.993778841364915e-05, + "loss": 12.0325, + "step": 20058 + }, + { + "epoch": 1.0922927174586319, + "grad_norm": 0.5627313682717267, + "learning_rate": 8.992901494978671e-05, + "loss": 12.0302, + "step": 20059 + }, + { + "epoch": 1.092347171455215, + "grad_norm": 0.66080896523592, + "learning_rate": 8.992024156423795e-05, + "loss": 12.0468, + "step": 20060 + }, + { + "epoch": 1.092401625451798, + "grad_norm": 0.5483386811294285, + "learning_rate": 8.991146825707107e-05, + "loss": 12.0382, + "step": 20061 + }, + { + "epoch": 1.092456079448381, + "grad_norm": 0.5698024277752847, + "learning_rate": 8.99026950283543e-05, + "loss": 12.0269, + "step": 20062 + }, + { + "epoch": 1.092510533444964, + "grad_norm": 0.5034213710347741, + "learning_rate": 8.989392187815587e-05, + "loss": 11.9569, + "step": 20063 + }, + { + "epoch": 1.092564987441547, + "grad_norm": 0.5252457182569386, + "learning_rate": 8.988514880654402e-05, + "loss": 12.0722, + "step": 20064 + }, + { + "epoch": 1.09261944143813, + "grad_norm": 0.5485424380270981, + "learning_rate": 8.987637581358693e-05, + "loss": 12.1475, + "step": 20065 + }, + { + "epoch": 1.092673895434713, + "grad_norm": 0.5563476844267424, + "learning_rate": 8.986760289935285e-05, + "loss": 12.0159, + "step": 20066 + }, + { + "epoch": 1.092728349431296, + "grad_norm": 0.5586996576538787, + "learning_rate": 8.985883006390999e-05, + "loss": 12.0525, + "step": 20067 + }, + { + "epoch": 1.092782803427879, + "grad_norm": 0.5824339016281458, + "learning_rate": 8.985005730732658e-05, + "loss": 12.0977, + "step": 20068 + }, + { + "epoch": 1.092837257424462, + "grad_norm": 0.5784505890381453, + "learning_rate": 8.984128462967081e-05, + "loss": 12.0047, + "step": 20069 + }, + { + "epoch": 1.092891711421045, + "grad_norm": 0.52087938675949, + "learning_rate": 8.983251203101092e-05, + "loss": 12.0008, + "step": 20070 + }, + { + "epoch": 1.092946165417628, + "grad_norm": 0.6487274880530316, + "learning_rate": 8.982373951141511e-05, + "loss": 12.0405, + "step": 20071 + }, + { + "epoch": 1.0930006194142112, + "grad_norm": 0.5389941667346818, + "learning_rate": 8.981496707095162e-05, + "loss": 12.1138, + "step": 20072 + }, + { + "epoch": 1.0930550734107942, + "grad_norm": 0.5928294869034778, + "learning_rate": 8.980619470968865e-05, + "loss": 11.8825, + "step": 20073 + }, + { + "epoch": 1.0931095274073772, + "grad_norm": 0.5458634722915138, + "learning_rate": 8.97974224276944e-05, + "loss": 11.9042, + "step": 20074 + }, + { + "epoch": 1.0931639814039602, + "grad_norm": 0.5636647294429769, + "learning_rate": 8.978865022503712e-05, + "loss": 12.1298, + "step": 20075 + }, + { + "epoch": 1.0932184354005432, + "grad_norm": 0.5938311159587132, + "learning_rate": 8.9779878101785e-05, + "loss": 12.1066, + "step": 20076 + }, + { + "epoch": 1.0932728893971262, + "grad_norm": 0.5960768360565969, + "learning_rate": 8.977110605800628e-05, + "loss": 12.0255, + "step": 20077 + }, + { + "epoch": 1.0933273433937092, + "grad_norm": 0.576698533575002, + "learning_rate": 8.976233409376916e-05, + "loss": 11.9549, + "step": 20078 + }, + { + "epoch": 1.0933817973902922, + "grad_norm": 0.6343789106509875, + "learning_rate": 8.975356220914184e-05, + "loss": 11.9698, + "step": 20079 + }, + { + "epoch": 1.0934362513868752, + "grad_norm": 0.5420024279986068, + "learning_rate": 8.974479040419254e-05, + "loss": 11.9848, + "step": 20080 + }, + { + "epoch": 1.0934907053834582, + "grad_norm": 0.580584398832918, + "learning_rate": 8.973601867898946e-05, + "loss": 12.1149, + "step": 20081 + }, + { + "epoch": 1.0935451593800412, + "grad_norm": 0.6080394022495313, + "learning_rate": 8.972724703360083e-05, + "loss": 12.0821, + "step": 20082 + }, + { + "epoch": 1.0935996133766244, + "grad_norm": 0.6053854430232528, + "learning_rate": 8.971847546809482e-05, + "loss": 11.8476, + "step": 20083 + }, + { + "epoch": 1.0936540673732074, + "grad_norm": 0.5395166590259115, + "learning_rate": 8.970970398253971e-05, + "loss": 12.0407, + "step": 20084 + }, + { + "epoch": 1.0937085213697904, + "grad_norm": 0.5644145067321656, + "learning_rate": 8.970093257700362e-05, + "loss": 12.0285, + "step": 20085 + }, + { + "epoch": 1.0937629753663733, + "grad_norm": 0.5903288855279883, + "learning_rate": 8.969216125155483e-05, + "loss": 11.9965, + "step": 20086 + }, + { + "epoch": 1.0938174293629563, + "grad_norm": 0.5131401448105591, + "learning_rate": 8.968339000626154e-05, + "loss": 12.0213, + "step": 20087 + }, + { + "epoch": 1.0938718833595393, + "grad_norm": 0.5098649236324263, + "learning_rate": 8.967461884119191e-05, + "loss": 12.0144, + "step": 20088 + }, + { + "epoch": 1.0939263373561223, + "grad_norm": 0.6312531703837619, + "learning_rate": 8.966584775641423e-05, + "loss": 12.0738, + "step": 20089 + }, + { + "epoch": 1.0939807913527053, + "grad_norm": 0.5581019802511759, + "learning_rate": 8.96570767519966e-05, + "loss": 11.9121, + "step": 20090 + }, + { + "epoch": 1.0940352453492883, + "grad_norm": 0.5632930161470753, + "learning_rate": 8.96483058280073e-05, + "loss": 11.974, + "step": 20091 + }, + { + "epoch": 1.0940896993458713, + "grad_norm": 0.6088835188583688, + "learning_rate": 8.963953498451449e-05, + "loss": 12.0442, + "step": 20092 + }, + { + "epoch": 1.0941441533424543, + "grad_norm": 0.5602940826984238, + "learning_rate": 8.963076422158641e-05, + "loss": 12.0847, + "step": 20093 + }, + { + "epoch": 1.0941986073390373, + "grad_norm": 0.5516614152175064, + "learning_rate": 8.962199353929123e-05, + "loss": 12.0685, + "step": 20094 + }, + { + "epoch": 1.0942530613356205, + "grad_norm": 0.5263358727348857, + "learning_rate": 8.961322293769718e-05, + "loss": 11.9879, + "step": 20095 + }, + { + "epoch": 1.0943075153322035, + "grad_norm": 0.5990840897682311, + "learning_rate": 8.960445241687242e-05, + "loss": 12.176, + "step": 20096 + }, + { + "epoch": 1.0943619693287865, + "grad_norm": 0.6522974451215476, + "learning_rate": 8.95956819768852e-05, + "loss": 12.2024, + "step": 20097 + }, + { + "epoch": 1.0944164233253695, + "grad_norm": 0.501404330370195, + "learning_rate": 8.958691161780371e-05, + "loss": 12.0772, + "step": 20098 + }, + { + "epoch": 1.0944708773219525, + "grad_norm": 0.5666126899727636, + "learning_rate": 8.95781413396962e-05, + "loss": 11.9306, + "step": 20099 + }, + { + "epoch": 1.0945253313185355, + "grad_norm": 0.5762612189338195, + "learning_rate": 8.956937114263072e-05, + "loss": 12.0253, + "step": 20100 + }, + { + "epoch": 1.0945797853151185, + "grad_norm": 0.5554150959529307, + "learning_rate": 8.956060102667559e-05, + "loss": 12.0569, + "step": 20101 + }, + { + "epoch": 1.0946342393117015, + "grad_norm": 0.5361180229130249, + "learning_rate": 8.955183099189897e-05, + "loss": 12.0235, + "step": 20102 + }, + { + "epoch": 1.0946886933082844, + "grad_norm": 0.5243410510388125, + "learning_rate": 8.954306103836908e-05, + "loss": 11.9835, + "step": 20103 + }, + { + "epoch": 1.0947431473048674, + "grad_norm": 0.5157913590340827, + "learning_rate": 8.95342911661541e-05, + "loss": 12.0156, + "step": 20104 + }, + { + "epoch": 1.0947976013014504, + "grad_norm": 0.5460772186703869, + "learning_rate": 8.952552137532222e-05, + "loss": 12.1232, + "step": 20105 + }, + { + "epoch": 1.0948520552980336, + "grad_norm": 0.685412340208087, + "learning_rate": 8.951675166594165e-05, + "loss": 12.0335, + "step": 20106 + }, + { + "epoch": 1.0949065092946166, + "grad_norm": 0.6091407511085071, + "learning_rate": 8.950798203808054e-05, + "loss": 11.9644, + "step": 20107 + }, + { + "epoch": 1.0949609632911996, + "grad_norm": 0.6472951916502481, + "learning_rate": 8.949921249180715e-05, + "loss": 12.1011, + "step": 20108 + }, + { + "epoch": 1.0950154172877826, + "grad_norm": 0.5905049314827779, + "learning_rate": 8.94904430271897e-05, + "loss": 12.1145, + "step": 20109 + }, + { + "epoch": 1.0950698712843656, + "grad_norm": 0.5224146984482988, + "learning_rate": 8.948167364429628e-05, + "loss": 12.0614, + "step": 20110 + }, + { + "epoch": 1.0951243252809486, + "grad_norm": 0.619051476455258, + "learning_rate": 8.947290434319508e-05, + "loss": 12.1789, + "step": 20111 + }, + { + "epoch": 1.0951787792775316, + "grad_norm": 0.5208124207888019, + "learning_rate": 8.94641351239544e-05, + "loss": 12.024, + "step": 20112 + }, + { + "epoch": 1.0952332332741146, + "grad_norm": 0.5526687975045929, + "learning_rate": 8.945536598664235e-05, + "loss": 12.1497, + "step": 20113 + }, + { + "epoch": 1.0952876872706976, + "grad_norm": 0.5252404891630289, + "learning_rate": 8.944659693132715e-05, + "loss": 12.0619, + "step": 20114 + }, + { + "epoch": 1.0953421412672806, + "grad_norm": 0.5319056381621993, + "learning_rate": 8.943782795807698e-05, + "loss": 12.0097, + "step": 20115 + }, + { + "epoch": 1.0953965952638636, + "grad_norm": 0.49652797801671555, + "learning_rate": 8.942905906696004e-05, + "loss": 11.9446, + "step": 20116 + }, + { + "epoch": 1.0954510492604466, + "grad_norm": 0.5454351264879337, + "learning_rate": 8.94202902580445e-05, + "loss": 11.8903, + "step": 20117 + }, + { + "epoch": 1.0955055032570296, + "grad_norm": 0.5506251747120456, + "learning_rate": 8.941152153139854e-05, + "loss": 12.1033, + "step": 20118 + }, + { + "epoch": 1.0955599572536128, + "grad_norm": 0.5990370360918348, + "learning_rate": 8.940275288709041e-05, + "loss": 12.1675, + "step": 20119 + }, + { + "epoch": 1.0956144112501958, + "grad_norm": 0.5337433802798721, + "learning_rate": 8.939398432518823e-05, + "loss": 12.0019, + "step": 20120 + }, + { + "epoch": 1.0956688652467788, + "grad_norm": 0.5445019440378159, + "learning_rate": 8.93852158457602e-05, + "loss": 12.0999, + "step": 20121 + }, + { + "epoch": 1.0957233192433617, + "grad_norm": 0.6067101126084533, + "learning_rate": 8.937644744887451e-05, + "loss": 12.1753, + "step": 20122 + }, + { + "epoch": 1.0957777732399447, + "grad_norm": 0.6552684164311637, + "learning_rate": 8.936767913459932e-05, + "loss": 12.1914, + "step": 20123 + }, + { + "epoch": 1.0958322272365277, + "grad_norm": 0.5696952036299877, + "learning_rate": 8.935891090300288e-05, + "loss": 12.0084, + "step": 20124 + }, + { + "epoch": 1.0958866812331107, + "grad_norm": 0.5644830707317485, + "learning_rate": 8.935014275415332e-05, + "loss": 12.0161, + "step": 20125 + }, + { + "epoch": 1.0959411352296937, + "grad_norm": 0.5801519224720506, + "learning_rate": 8.934137468811883e-05, + "loss": 12.0711, + "step": 20126 + }, + { + "epoch": 1.0959955892262767, + "grad_norm": 0.5415007310440003, + "learning_rate": 8.93326067049676e-05, + "loss": 12.0049, + "step": 20127 + }, + { + "epoch": 1.0960500432228597, + "grad_norm": 0.6284333256617635, + "learning_rate": 8.932383880476782e-05, + "loss": 12.0675, + "step": 20128 + }, + { + "epoch": 1.0961044972194427, + "grad_norm": 0.620463081209787, + "learning_rate": 8.931507098758768e-05, + "loss": 12.1533, + "step": 20129 + }, + { + "epoch": 1.096158951216026, + "grad_norm": 0.5116319195529034, + "learning_rate": 8.930630325349531e-05, + "loss": 12.0879, + "step": 20130 + }, + { + "epoch": 1.096213405212609, + "grad_norm": 0.5817236198966054, + "learning_rate": 8.929753560255892e-05, + "loss": 11.973, + "step": 20131 + }, + { + "epoch": 1.096267859209192, + "grad_norm": 0.5274798141762556, + "learning_rate": 8.928876803484669e-05, + "loss": 11.9422, + "step": 20132 + }, + { + "epoch": 1.096322313205775, + "grad_norm": 0.5738998062283146, + "learning_rate": 8.92800005504268e-05, + "loss": 12.0354, + "step": 20133 + }, + { + "epoch": 1.0963767672023579, + "grad_norm": 0.5315282150985224, + "learning_rate": 8.92712331493674e-05, + "loss": 11.9889, + "step": 20134 + }, + { + "epoch": 1.0964312211989409, + "grad_norm": 0.5160836765505249, + "learning_rate": 8.926246583173672e-05, + "loss": 11.7833, + "step": 20135 + }, + { + "epoch": 1.0964856751955239, + "grad_norm": 0.539779799869873, + "learning_rate": 8.925369859760288e-05, + "loss": 12.0612, + "step": 20136 + }, + { + "epoch": 1.0965401291921069, + "grad_norm": 0.5651266964591511, + "learning_rate": 8.924493144703411e-05, + "loss": 12.0528, + "step": 20137 + }, + { + "epoch": 1.0965945831886899, + "grad_norm": 0.606502159416733, + "learning_rate": 8.923616438009855e-05, + "loss": 12.1592, + "step": 20138 + }, + { + "epoch": 1.0966490371852728, + "grad_norm": 0.5998599114540937, + "learning_rate": 8.92273973968644e-05, + "loss": 11.994, + "step": 20139 + }, + { + "epoch": 1.0967034911818558, + "grad_norm": 0.5697227085835541, + "learning_rate": 8.921863049739979e-05, + "loss": 12.0578, + "step": 20140 + }, + { + "epoch": 1.0967579451784388, + "grad_norm": 0.5481736629362859, + "learning_rate": 8.920986368177292e-05, + "loss": 12.0411, + "step": 20141 + }, + { + "epoch": 1.096812399175022, + "grad_norm": 0.5980455811264401, + "learning_rate": 8.920109695005195e-05, + "loss": 12.0798, + "step": 20142 + }, + { + "epoch": 1.096866853171605, + "grad_norm": 0.5560172118990516, + "learning_rate": 8.919233030230507e-05, + "loss": 12.1112, + "step": 20143 + }, + { + "epoch": 1.096921307168188, + "grad_norm": 0.5615406718987744, + "learning_rate": 8.918356373860044e-05, + "loss": 11.9977, + "step": 20144 + }, + { + "epoch": 1.096975761164771, + "grad_norm": 0.5818790254722687, + "learning_rate": 8.917479725900622e-05, + "loss": 12.1381, + "step": 20145 + }, + { + "epoch": 1.097030215161354, + "grad_norm": 0.6037929523246859, + "learning_rate": 8.91660308635906e-05, + "loss": 12.0707, + "step": 20146 + }, + { + "epoch": 1.097084669157937, + "grad_norm": 0.5516240368100374, + "learning_rate": 8.915726455242174e-05, + "loss": 12.0374, + "step": 20147 + }, + { + "epoch": 1.09713912315452, + "grad_norm": 0.5310517505703545, + "learning_rate": 8.914849832556782e-05, + "loss": 12.1701, + "step": 20148 + }, + { + "epoch": 1.097193577151103, + "grad_norm": 0.6207564108102965, + "learning_rate": 8.913973218309702e-05, + "loss": 12.1254, + "step": 20149 + }, + { + "epoch": 1.097248031147686, + "grad_norm": 0.5907823547374166, + "learning_rate": 8.913096612507745e-05, + "loss": 12.0863, + "step": 20150 + }, + { + "epoch": 1.097302485144269, + "grad_norm": 0.5575367026254822, + "learning_rate": 8.912220015157732e-05, + "loss": 11.9744, + "step": 20151 + }, + { + "epoch": 1.097356939140852, + "grad_norm": 0.6751885967052378, + "learning_rate": 8.911343426266478e-05, + "loss": 12.0798, + "step": 20152 + }, + { + "epoch": 1.0974113931374352, + "grad_norm": 0.5988546304632669, + "learning_rate": 8.9104668458408e-05, + "loss": 12.0581, + "step": 20153 + }, + { + "epoch": 1.0974658471340182, + "grad_norm": 0.5312345113132421, + "learning_rate": 8.909590273887516e-05, + "loss": 12.0057, + "step": 20154 + }, + { + "epoch": 1.0975203011306012, + "grad_norm": 0.6099930599315821, + "learning_rate": 8.908713710413438e-05, + "loss": 12.2494, + "step": 20155 + }, + { + "epoch": 1.0975747551271842, + "grad_norm": 0.614170068332695, + "learning_rate": 8.907837155425385e-05, + "loss": 12.0462, + "step": 20156 + }, + { + "epoch": 1.0976292091237672, + "grad_norm": 0.5822291739120788, + "learning_rate": 8.906960608930176e-05, + "loss": 12.0543, + "step": 20157 + }, + { + "epoch": 1.0976836631203502, + "grad_norm": 0.5188234845750623, + "learning_rate": 8.906084070934623e-05, + "loss": 12.0245, + "step": 20158 + }, + { + "epoch": 1.0977381171169331, + "grad_norm": 0.5011474986009845, + "learning_rate": 8.905207541445551e-05, + "loss": 12.0592, + "step": 20159 + }, + { + "epoch": 1.0977925711135161, + "grad_norm": 0.6494712131666551, + "learning_rate": 8.904331020469759e-05, + "loss": 12.1785, + "step": 20160 + }, + { + "epoch": 1.0978470251100991, + "grad_norm": 0.5733330915908963, + "learning_rate": 8.903454508014076e-05, + "loss": 12.0438, + "step": 20161 + }, + { + "epoch": 1.0979014791066821, + "grad_norm": 0.5601787541031239, + "learning_rate": 8.902578004085315e-05, + "loss": 12.0481, + "step": 20162 + }, + { + "epoch": 1.0979559331032651, + "grad_norm": 0.6770310032193882, + "learning_rate": 8.901701508690291e-05, + "loss": 12.0976, + "step": 20163 + }, + { + "epoch": 1.098010387099848, + "grad_norm": 0.6271802627011732, + "learning_rate": 8.900825021835821e-05, + "loss": 12.1657, + "step": 20164 + }, + { + "epoch": 1.0980648410964313, + "grad_norm": 0.5596518808132004, + "learning_rate": 8.89994854352872e-05, + "loss": 11.9854, + "step": 20165 + }, + { + "epoch": 1.0981192950930143, + "grad_norm": 0.5020805826317156, + "learning_rate": 8.899072073775802e-05, + "loss": 11.9333, + "step": 20166 + }, + { + "epoch": 1.0981737490895973, + "grad_norm": 0.6042882510332298, + "learning_rate": 8.898195612583886e-05, + "loss": 12.1209, + "step": 20167 + }, + { + "epoch": 1.0982282030861803, + "grad_norm": 0.652798989112454, + "learning_rate": 8.897319159959783e-05, + "loss": 12.0498, + "step": 20168 + }, + { + "epoch": 1.0982826570827633, + "grad_norm": 0.5933847046017819, + "learning_rate": 8.896442715910317e-05, + "loss": 12.1471, + "step": 20169 + }, + { + "epoch": 1.0983371110793463, + "grad_norm": 0.602944215692327, + "learning_rate": 8.895566280442294e-05, + "loss": 11.9518, + "step": 20170 + }, + { + "epoch": 1.0983915650759293, + "grad_norm": 0.5666090980810462, + "learning_rate": 8.894689853562532e-05, + "loss": 12.0612, + "step": 20171 + }, + { + "epoch": 1.0984460190725123, + "grad_norm": 0.5955716178058724, + "learning_rate": 8.893813435277845e-05, + "loss": 12.0099, + "step": 20172 + }, + { + "epoch": 1.0985004730690953, + "grad_norm": 0.630128554425275, + "learning_rate": 8.892937025595053e-05, + "loss": 12.0716, + "step": 20173 + }, + { + "epoch": 1.0985549270656783, + "grad_norm": 0.56439065043191, + "learning_rate": 8.892060624520968e-05, + "loss": 11.9565, + "step": 20174 + }, + { + "epoch": 1.0986093810622612, + "grad_norm": 0.5866824579384988, + "learning_rate": 8.891184232062405e-05, + "loss": 11.9924, + "step": 20175 + }, + { + "epoch": 1.0986638350588445, + "grad_norm": 0.7665723124085866, + "learning_rate": 8.89030784822618e-05, + "loss": 12.2208, + "step": 20176 + }, + { + "epoch": 1.0987182890554275, + "grad_norm": 0.6276742135752813, + "learning_rate": 8.889431473019108e-05, + "loss": 12.0264, + "step": 20177 + }, + { + "epoch": 1.0987727430520104, + "grad_norm": 0.579773130352321, + "learning_rate": 8.888555106448e-05, + "loss": 12.1391, + "step": 20178 + }, + { + "epoch": 1.0988271970485934, + "grad_norm": 0.5597768164792385, + "learning_rate": 8.88767874851968e-05, + "loss": 12.1392, + "step": 20179 + }, + { + "epoch": 1.0988816510451764, + "grad_norm": 0.5509950255852855, + "learning_rate": 8.886802399240952e-05, + "loss": 11.9973, + "step": 20180 + }, + { + "epoch": 1.0989361050417594, + "grad_norm": 0.5743486501236731, + "learning_rate": 8.885926058618636e-05, + "loss": 12.0458, + "step": 20181 + }, + { + "epoch": 1.0989905590383424, + "grad_norm": 0.5308475307375142, + "learning_rate": 8.885049726659546e-05, + "loss": 11.9559, + "step": 20182 + }, + { + "epoch": 1.0990450130349254, + "grad_norm": 0.5065825279705609, + "learning_rate": 8.884173403370494e-05, + "loss": 11.9881, + "step": 20183 + }, + { + "epoch": 1.0990994670315084, + "grad_norm": 0.5652712077381974, + "learning_rate": 8.883297088758298e-05, + "loss": 12.0564, + "step": 20184 + }, + { + "epoch": 1.0991539210280914, + "grad_norm": 0.5415480303899827, + "learning_rate": 8.882420782829772e-05, + "loss": 12.0772, + "step": 20185 + }, + { + "epoch": 1.0992083750246744, + "grad_norm": 0.5352570980250407, + "learning_rate": 8.881544485591729e-05, + "loss": 12.0056, + "step": 20186 + }, + { + "epoch": 1.0992628290212574, + "grad_norm": 0.5173298551932756, + "learning_rate": 8.880668197050984e-05, + "loss": 12.0798, + "step": 20187 + }, + { + "epoch": 1.0993172830178404, + "grad_norm": 0.48717873604380874, + "learning_rate": 8.87979191721435e-05, + "loss": 12.0184, + "step": 20188 + }, + { + "epoch": 1.0993717370144236, + "grad_norm": 0.5597341952123152, + "learning_rate": 8.878915646088646e-05, + "loss": 11.9295, + "step": 20189 + }, + { + "epoch": 1.0994261910110066, + "grad_norm": 0.5615709176867464, + "learning_rate": 8.878039383680678e-05, + "loss": 12.1507, + "step": 20190 + }, + { + "epoch": 1.0994806450075896, + "grad_norm": 0.5740671509788972, + "learning_rate": 8.877163129997265e-05, + "loss": 12.022, + "step": 20191 + }, + { + "epoch": 1.0995350990041726, + "grad_norm": 0.5352406945605978, + "learning_rate": 8.876286885045218e-05, + "loss": 12.1071, + "step": 20192 + }, + { + "epoch": 1.0995895530007556, + "grad_norm": 0.5373223953618459, + "learning_rate": 8.875410648831355e-05, + "loss": 11.9739, + "step": 20193 + }, + { + "epoch": 1.0996440069973386, + "grad_norm": 0.5296057941726166, + "learning_rate": 8.874534421362484e-05, + "loss": 11.9395, + "step": 20194 + }, + { + "epoch": 1.0996984609939215, + "grad_norm": 0.5577670315621532, + "learning_rate": 8.873658202645424e-05, + "loss": 11.9845, + "step": 20195 + }, + { + "epoch": 1.0997529149905045, + "grad_norm": 0.5335575364798968, + "learning_rate": 8.872781992686987e-05, + "loss": 12.0861, + "step": 20196 + }, + { + "epoch": 1.0998073689870875, + "grad_norm": 0.5508209498312792, + "learning_rate": 8.871905791493987e-05, + "loss": 11.9597, + "step": 20197 + }, + { + "epoch": 1.0998618229836705, + "grad_norm": 0.6250421100736349, + "learning_rate": 8.871029599073235e-05, + "loss": 12.1618, + "step": 20198 + }, + { + "epoch": 1.0999162769802535, + "grad_norm": 0.591381183464453, + "learning_rate": 8.87015341543155e-05, + "loss": 12.1627, + "step": 20199 + }, + { + "epoch": 1.0999707309768367, + "grad_norm": 0.5787438375914769, + "learning_rate": 8.869277240575738e-05, + "loss": 12.0613, + "step": 20200 + }, + { + "epoch": 1.1000251849734197, + "grad_norm": 0.5910180724776823, + "learning_rate": 8.868401074512616e-05, + "loss": 11.9308, + "step": 20201 + }, + { + "epoch": 1.1000796389700027, + "grad_norm": 0.5395654718912095, + "learning_rate": 8.867524917248999e-05, + "loss": 12.0461, + "step": 20202 + }, + { + "epoch": 1.1001340929665857, + "grad_norm": 0.5436651477781418, + "learning_rate": 8.866648768791697e-05, + "loss": 12.1008, + "step": 20203 + }, + { + "epoch": 1.1001885469631687, + "grad_norm": 0.6733763075191992, + "learning_rate": 8.865772629147523e-05, + "loss": 11.9532, + "step": 20204 + }, + { + "epoch": 1.1002430009597517, + "grad_norm": 0.5931005392577273, + "learning_rate": 8.86489649832329e-05, + "loss": 12.0734, + "step": 20205 + }, + { + "epoch": 1.1002974549563347, + "grad_norm": 0.5793400162860026, + "learning_rate": 8.864020376325814e-05, + "loss": 12.0362, + "step": 20206 + }, + { + "epoch": 1.1003519089529177, + "grad_norm": 0.5345822138575493, + "learning_rate": 8.863144263161906e-05, + "loss": 12.0126, + "step": 20207 + }, + { + "epoch": 1.1004063629495007, + "grad_norm": 0.5150319586228077, + "learning_rate": 8.86226815883838e-05, + "loss": 12.0538, + "step": 20208 + }, + { + "epoch": 1.1004608169460837, + "grad_norm": 0.5139144427714485, + "learning_rate": 8.86139206336205e-05, + "loss": 11.9446, + "step": 20209 + }, + { + "epoch": 1.1005152709426667, + "grad_norm": 0.5159596684044622, + "learning_rate": 8.860515976739722e-05, + "loss": 12.0148, + "step": 20210 + }, + { + "epoch": 1.1005697249392496, + "grad_norm": 0.5954957669807992, + "learning_rate": 8.859639898978213e-05, + "loss": 11.9208, + "step": 20211 + }, + { + "epoch": 1.1006241789358329, + "grad_norm": 0.47558291302665023, + "learning_rate": 8.858763830084338e-05, + "loss": 12.0234, + "step": 20212 + }, + { + "epoch": 1.1006786329324159, + "grad_norm": 0.5723861581364824, + "learning_rate": 8.857887770064905e-05, + "loss": 12.1122, + "step": 20213 + }, + { + "epoch": 1.1007330869289988, + "grad_norm": 0.5300311766221824, + "learning_rate": 8.857011718926728e-05, + "loss": 11.9661, + "step": 20214 + }, + { + "epoch": 1.1007875409255818, + "grad_norm": 0.4804128022172825, + "learning_rate": 8.85613567667662e-05, + "loss": 11.9646, + "step": 20215 + }, + { + "epoch": 1.1008419949221648, + "grad_norm": 0.6411895825078691, + "learning_rate": 8.855259643321391e-05, + "loss": 12.0086, + "step": 20216 + }, + { + "epoch": 1.1008964489187478, + "grad_norm": 0.5045659544266304, + "learning_rate": 8.854383618867857e-05, + "loss": 11.9535, + "step": 20217 + }, + { + "epoch": 1.1009509029153308, + "grad_norm": 0.5546269037658237, + "learning_rate": 8.853507603322828e-05, + "loss": 12.0156, + "step": 20218 + }, + { + "epoch": 1.1010053569119138, + "grad_norm": 0.5472239519449394, + "learning_rate": 8.852631596693115e-05, + "loss": 12.0827, + "step": 20219 + }, + { + "epoch": 1.1010598109084968, + "grad_norm": 0.6151265780247128, + "learning_rate": 8.851755598985537e-05, + "loss": 12.1356, + "step": 20220 + }, + { + "epoch": 1.1011142649050798, + "grad_norm": 0.5877863943068412, + "learning_rate": 8.850879610206894e-05, + "loss": 12.0763, + "step": 20221 + }, + { + "epoch": 1.1011687189016628, + "grad_norm": 0.5374816326611753, + "learning_rate": 8.850003630364005e-05, + "loss": 11.9959, + "step": 20222 + }, + { + "epoch": 1.101223172898246, + "grad_norm": 0.6347521513899049, + "learning_rate": 8.849127659463679e-05, + "loss": 11.9279, + "step": 20223 + }, + { + "epoch": 1.101277626894829, + "grad_norm": 0.5497226963963572, + "learning_rate": 8.848251697512732e-05, + "loss": 11.9248, + "step": 20224 + }, + { + "epoch": 1.101332080891412, + "grad_norm": 0.5671760158929101, + "learning_rate": 8.847375744517972e-05, + "loss": 11.9878, + "step": 20225 + }, + { + "epoch": 1.101386534887995, + "grad_norm": 0.505449844315798, + "learning_rate": 8.846499800486211e-05, + "loss": 12.0726, + "step": 20226 + }, + { + "epoch": 1.101440988884578, + "grad_norm": 0.5733129507632411, + "learning_rate": 8.845623865424262e-05, + "loss": 12.1238, + "step": 20227 + }, + { + "epoch": 1.101495442881161, + "grad_norm": 0.5792222536553667, + "learning_rate": 8.844747939338933e-05, + "loss": 12.0608, + "step": 20228 + }, + { + "epoch": 1.101549896877744, + "grad_norm": 0.5357805512260676, + "learning_rate": 8.843872022237039e-05, + "loss": 12.0475, + "step": 20229 + }, + { + "epoch": 1.101604350874327, + "grad_norm": 0.5568764403483965, + "learning_rate": 8.842996114125395e-05, + "loss": 11.9596, + "step": 20230 + }, + { + "epoch": 1.10165880487091, + "grad_norm": 0.5652795661893997, + "learning_rate": 8.842120215010803e-05, + "loss": 12.1226, + "step": 20231 + }, + { + "epoch": 1.101713258867493, + "grad_norm": 0.5696300592102768, + "learning_rate": 8.841244324900076e-05, + "loss": 12.1162, + "step": 20232 + }, + { + "epoch": 1.101767712864076, + "grad_norm": 0.6205151314076434, + "learning_rate": 8.84036844380003e-05, + "loss": 12.1801, + "step": 20233 + }, + { + "epoch": 1.101822166860659, + "grad_norm": 0.5466304713585389, + "learning_rate": 8.839492571717473e-05, + "loss": 11.9161, + "step": 20234 + }, + { + "epoch": 1.1018766208572421, + "grad_norm": 0.5140270883111085, + "learning_rate": 8.838616708659217e-05, + "loss": 12.0871, + "step": 20235 + }, + { + "epoch": 1.1019310748538251, + "grad_norm": 0.5832650218878539, + "learning_rate": 8.83774085463207e-05, + "loss": 12.0507, + "step": 20236 + }, + { + "epoch": 1.1019855288504081, + "grad_norm": 0.6361237961683173, + "learning_rate": 8.836865009642848e-05, + "loss": 12.1832, + "step": 20237 + }, + { + "epoch": 1.1020399828469911, + "grad_norm": 0.5433931408566341, + "learning_rate": 8.835989173698358e-05, + "loss": 12.112, + "step": 20238 + }, + { + "epoch": 1.102094436843574, + "grad_norm": 0.531281605006899, + "learning_rate": 8.835113346805408e-05, + "loss": 12.1103, + "step": 20239 + }, + { + "epoch": 1.102148890840157, + "grad_norm": 0.547987291454372, + "learning_rate": 8.83423752897082e-05, + "loss": 12.0016, + "step": 20240 + }, + { + "epoch": 1.10220334483674, + "grad_norm": 0.6196430697296458, + "learning_rate": 8.833361720201391e-05, + "loss": 12.106, + "step": 20241 + }, + { + "epoch": 1.102257798833323, + "grad_norm": 0.5075520452259732, + "learning_rate": 8.832485920503937e-05, + "loss": 12.064, + "step": 20242 + }, + { + "epoch": 1.102312252829906, + "grad_norm": 0.573113397993567, + "learning_rate": 8.831610129885266e-05, + "loss": 12.1014, + "step": 20243 + }, + { + "epoch": 1.102366706826489, + "grad_norm": 0.5467387171808178, + "learning_rate": 8.830734348352195e-05, + "loss": 12.0631, + "step": 20244 + }, + { + "epoch": 1.102421160823072, + "grad_norm": 0.5386865259364804, + "learning_rate": 8.829858575911527e-05, + "loss": 12.0966, + "step": 20245 + }, + { + "epoch": 1.1024756148196553, + "grad_norm": 0.5252737553681605, + "learning_rate": 8.828982812570075e-05, + "loss": 11.9106, + "step": 20246 + }, + { + "epoch": 1.1025300688162383, + "grad_norm": 0.48756803933840664, + "learning_rate": 8.82810705833465e-05, + "loss": 11.9954, + "step": 20247 + }, + { + "epoch": 1.1025845228128213, + "grad_norm": 0.5584157079026932, + "learning_rate": 8.827231313212061e-05, + "loss": 12.0073, + "step": 20248 + }, + { + "epoch": 1.1026389768094043, + "grad_norm": 0.5613385083370157, + "learning_rate": 8.826355577209118e-05, + "loss": 12.0158, + "step": 20249 + }, + { + "epoch": 1.1026934308059873, + "grad_norm": 0.5457529803685589, + "learning_rate": 8.825479850332633e-05, + "loss": 12.097, + "step": 20250 + }, + { + "epoch": 1.1027478848025702, + "grad_norm": 0.567677049333743, + "learning_rate": 8.824604132589412e-05, + "loss": 12.0287, + "step": 20251 + }, + { + "epoch": 1.1028023387991532, + "grad_norm": 0.4833128160398218, + "learning_rate": 8.823728423986266e-05, + "loss": 12.0636, + "step": 20252 + }, + { + "epoch": 1.1028567927957362, + "grad_norm": 0.573142902524311, + "learning_rate": 8.822852724530004e-05, + "loss": 11.9064, + "step": 20253 + }, + { + "epoch": 1.1029112467923192, + "grad_norm": 0.5058465021018069, + "learning_rate": 8.821977034227435e-05, + "loss": 12.0118, + "step": 20254 + }, + { + "epoch": 1.1029657007889022, + "grad_norm": 0.5663852923558932, + "learning_rate": 8.821101353085374e-05, + "loss": 12.1247, + "step": 20255 + }, + { + "epoch": 1.1030201547854852, + "grad_norm": 0.6070677912259818, + "learning_rate": 8.820225681110624e-05, + "loss": 12.1287, + "step": 20256 + }, + { + "epoch": 1.1030746087820682, + "grad_norm": 0.541915999624938, + "learning_rate": 8.819350018309999e-05, + "loss": 12.1196, + "step": 20257 + }, + { + "epoch": 1.1031290627786514, + "grad_norm": 0.5319353360682426, + "learning_rate": 8.818474364690306e-05, + "loss": 11.7641, + "step": 20258 + }, + { + "epoch": 1.1031835167752344, + "grad_norm": 0.5757522620881451, + "learning_rate": 8.817598720258353e-05, + "loss": 11.9708, + "step": 20259 + }, + { + "epoch": 1.1032379707718174, + "grad_norm": 0.6081238647231033, + "learning_rate": 8.816723085020954e-05, + "loss": 12.0519, + "step": 20260 + }, + { + "epoch": 1.1032924247684004, + "grad_norm": 0.5427343785399987, + "learning_rate": 8.815847458984911e-05, + "loss": 12.0829, + "step": 20261 + }, + { + "epoch": 1.1033468787649834, + "grad_norm": 0.5652734711236881, + "learning_rate": 8.814971842157039e-05, + "loss": 12.0157, + "step": 20262 + }, + { + "epoch": 1.1034013327615664, + "grad_norm": 0.5534906612787915, + "learning_rate": 8.814096234544143e-05, + "loss": 12.0162, + "step": 20263 + }, + { + "epoch": 1.1034557867581494, + "grad_norm": 0.5036252440792517, + "learning_rate": 8.813220636153035e-05, + "loss": 12.009, + "step": 20264 + }, + { + "epoch": 1.1035102407547324, + "grad_norm": 0.5257087999044628, + "learning_rate": 8.812345046990519e-05, + "loss": 11.9534, + "step": 20265 + }, + { + "epoch": 1.1035646947513154, + "grad_norm": 0.49793946160611324, + "learning_rate": 8.81146946706341e-05, + "loss": 12.03, + "step": 20266 + }, + { + "epoch": 1.1036191487478983, + "grad_norm": 0.5546572372244591, + "learning_rate": 8.810593896378513e-05, + "loss": 11.9098, + "step": 20267 + }, + { + "epoch": 1.1036736027444813, + "grad_norm": 0.5424649818390285, + "learning_rate": 8.809718334942639e-05, + "loss": 11.9215, + "step": 20268 + }, + { + "epoch": 1.1037280567410646, + "grad_norm": 0.5866022695322437, + "learning_rate": 8.808842782762592e-05, + "loss": 12.0288, + "step": 20269 + }, + { + "epoch": 1.1037825107376475, + "grad_norm": 0.5200616494258773, + "learning_rate": 8.807967239845187e-05, + "loss": 12.028, + "step": 20270 + }, + { + "epoch": 1.1038369647342305, + "grad_norm": 0.471892641538079, + "learning_rate": 8.807091706197228e-05, + "loss": 11.9984, + "step": 20271 + }, + { + "epoch": 1.1038914187308135, + "grad_norm": 0.5069317963591853, + "learning_rate": 8.806216181825522e-05, + "loss": 12.0616, + "step": 20272 + }, + { + "epoch": 1.1039458727273965, + "grad_norm": 0.5181631910018965, + "learning_rate": 8.805340666736878e-05, + "loss": 12.1398, + "step": 20273 + }, + { + "epoch": 1.1040003267239795, + "grad_norm": 0.5026001587635697, + "learning_rate": 8.804465160938108e-05, + "loss": 12.0038, + "step": 20274 + }, + { + "epoch": 1.1040547807205625, + "grad_norm": 0.5208749547424139, + "learning_rate": 8.803589664436017e-05, + "loss": 11.9452, + "step": 20275 + }, + { + "epoch": 1.1041092347171455, + "grad_norm": 0.5329006436594417, + "learning_rate": 8.802714177237412e-05, + "loss": 12.0868, + "step": 20276 + }, + { + "epoch": 1.1041636887137285, + "grad_norm": 0.6238998929891365, + "learning_rate": 8.801838699349101e-05, + "loss": 12.08, + "step": 20277 + }, + { + "epoch": 1.1042181427103115, + "grad_norm": 0.5713533790144977, + "learning_rate": 8.800963230777896e-05, + "loss": 12.0738, + "step": 20278 + }, + { + "epoch": 1.1042725967068945, + "grad_norm": 0.5067501866499987, + "learning_rate": 8.8000877715306e-05, + "loss": 12.1064, + "step": 20279 + }, + { + "epoch": 1.1043270507034775, + "grad_norm": 0.5696655009372665, + "learning_rate": 8.799212321614029e-05, + "loss": 12.0327, + "step": 20280 + }, + { + "epoch": 1.1043815047000605, + "grad_norm": 0.5907525642848902, + "learning_rate": 8.798336881034976e-05, + "loss": 11.9661, + "step": 20281 + }, + { + "epoch": 1.1044359586966437, + "grad_norm": 0.5029806148791618, + "learning_rate": 8.797461449800262e-05, + "loss": 12.0543, + "step": 20282 + }, + { + "epoch": 1.1044904126932267, + "grad_norm": 0.5740037503429757, + "learning_rate": 8.796586027916686e-05, + "loss": 12.0909, + "step": 20283 + }, + { + "epoch": 1.1045448666898097, + "grad_norm": 0.5359098765190763, + "learning_rate": 8.795710615391061e-05, + "loss": 12.0901, + "step": 20284 + }, + { + "epoch": 1.1045993206863927, + "grad_norm": 0.5381826336912698, + "learning_rate": 8.794835212230193e-05, + "loss": 11.9385, + "step": 20285 + }, + { + "epoch": 1.1046537746829757, + "grad_norm": 0.5164361531574544, + "learning_rate": 8.793959818440887e-05, + "loss": 11.9999, + "step": 20286 + }, + { + "epoch": 1.1047082286795586, + "grad_norm": 0.4952044177832422, + "learning_rate": 8.793084434029952e-05, + "loss": 11.934, + "step": 20287 + }, + { + "epoch": 1.1047626826761416, + "grad_norm": 0.5353442979609403, + "learning_rate": 8.792209059004193e-05, + "loss": 11.9958, + "step": 20288 + }, + { + "epoch": 1.1048171366727246, + "grad_norm": 0.5198804593022757, + "learning_rate": 8.79133369337042e-05, + "loss": 11.8735, + "step": 20289 + }, + { + "epoch": 1.1048715906693076, + "grad_norm": 0.5539164060775061, + "learning_rate": 8.790458337135444e-05, + "loss": 11.9442, + "step": 20290 + }, + { + "epoch": 1.1049260446658906, + "grad_norm": 0.5149487649125211, + "learning_rate": 8.789582990306062e-05, + "loss": 12.0169, + "step": 20291 + }, + { + "epoch": 1.1049804986624736, + "grad_norm": 0.5593324766953811, + "learning_rate": 8.788707652889084e-05, + "loss": 12.0518, + "step": 20292 + }, + { + "epoch": 1.1050349526590568, + "grad_norm": 0.5223788666070568, + "learning_rate": 8.78783232489132e-05, + "loss": 12.0239, + "step": 20293 + }, + { + "epoch": 1.1050894066556398, + "grad_norm": 0.6409900434279258, + "learning_rate": 8.786957006319577e-05, + "loss": 11.9321, + "step": 20294 + }, + { + "epoch": 1.1051438606522228, + "grad_norm": 0.5991656646274143, + "learning_rate": 8.786081697180659e-05, + "loss": 12.1513, + "step": 20295 + }, + { + "epoch": 1.1051983146488058, + "grad_norm": 0.5437244510373981, + "learning_rate": 8.785206397481371e-05, + "loss": 11.9969, + "step": 20296 + }, + { + "epoch": 1.1052527686453888, + "grad_norm": 0.5410626905453222, + "learning_rate": 8.784331107228525e-05, + "loss": 12.117, + "step": 20297 + }, + { + "epoch": 1.1053072226419718, + "grad_norm": 0.5280928701320662, + "learning_rate": 8.783455826428921e-05, + "loss": 12.0798, + "step": 20298 + }, + { + "epoch": 1.1053616766385548, + "grad_norm": 0.5852564727198065, + "learning_rate": 8.782580555089368e-05, + "loss": 11.9185, + "step": 20299 + }, + { + "epoch": 1.1054161306351378, + "grad_norm": 0.5640739042641488, + "learning_rate": 8.78170529321668e-05, + "loss": 12.045, + "step": 20300 + }, + { + "epoch": 1.1054705846317208, + "grad_norm": 0.5711766805877603, + "learning_rate": 8.780830040817651e-05, + "loss": 12.0574, + "step": 20301 + }, + { + "epoch": 1.1055250386283038, + "grad_norm": 0.5487375604738145, + "learning_rate": 8.779954797899091e-05, + "loss": 12.0689, + "step": 20302 + }, + { + "epoch": 1.1055794926248868, + "grad_norm": 0.5463485657321694, + "learning_rate": 8.779079564467807e-05, + "loss": 12.0732, + "step": 20303 + }, + { + "epoch": 1.1056339466214697, + "grad_norm": 0.5958010094516646, + "learning_rate": 8.778204340530606e-05, + "loss": 12.14, + "step": 20304 + }, + { + "epoch": 1.105688400618053, + "grad_norm": 0.555501224415568, + "learning_rate": 8.777329126094292e-05, + "loss": 11.9945, + "step": 20305 + }, + { + "epoch": 1.105742854614636, + "grad_norm": 0.5719709735434838, + "learning_rate": 8.776453921165674e-05, + "loss": 12.0651, + "step": 20306 + }, + { + "epoch": 1.105797308611219, + "grad_norm": 0.5703447668093605, + "learning_rate": 8.775578725751553e-05, + "loss": 11.9732, + "step": 20307 + }, + { + "epoch": 1.105851762607802, + "grad_norm": 0.584597127790104, + "learning_rate": 8.77470353985874e-05, + "loss": 12.0707, + "step": 20308 + }, + { + "epoch": 1.105906216604385, + "grad_norm": 0.5099775442129576, + "learning_rate": 8.773828363494036e-05, + "loss": 11.9749, + "step": 20309 + }, + { + "epoch": 1.105960670600968, + "grad_norm": 0.5249223983386117, + "learning_rate": 8.77295319666425e-05, + "loss": 11.933, + "step": 20310 + }, + { + "epoch": 1.106015124597551, + "grad_norm": 0.5734136019444562, + "learning_rate": 8.772078039376184e-05, + "loss": 11.9393, + "step": 20311 + }, + { + "epoch": 1.106069578594134, + "grad_norm": 0.6383360418108145, + "learning_rate": 8.771202891636646e-05, + "loss": 12.2153, + "step": 20312 + }, + { + "epoch": 1.106124032590717, + "grad_norm": 0.564871378026721, + "learning_rate": 8.770327753452438e-05, + "loss": 12.0592, + "step": 20313 + }, + { + "epoch": 1.1061784865873, + "grad_norm": 0.5003922451111757, + "learning_rate": 8.769452624830367e-05, + "loss": 11.9846, + "step": 20314 + }, + { + "epoch": 1.1062329405838829, + "grad_norm": 0.6299781577491396, + "learning_rate": 8.768577505777242e-05, + "loss": 11.9441, + "step": 20315 + }, + { + "epoch": 1.106287394580466, + "grad_norm": 0.6763607365341504, + "learning_rate": 8.767702396299864e-05, + "loss": 12.0107, + "step": 20316 + }, + { + "epoch": 1.106341848577049, + "grad_norm": 0.5623560295924026, + "learning_rate": 8.766827296405039e-05, + "loss": 12.0481, + "step": 20317 + }, + { + "epoch": 1.106396302573632, + "grad_norm": 0.5965173681242213, + "learning_rate": 8.765952206099572e-05, + "loss": 12.0934, + "step": 20318 + }, + { + "epoch": 1.106450756570215, + "grad_norm": 0.5279141459995098, + "learning_rate": 8.765077125390266e-05, + "loss": 12.0243, + "step": 20319 + }, + { + "epoch": 1.106505210566798, + "grad_norm": 0.5381397115637279, + "learning_rate": 8.76420205428393e-05, + "loss": 12.0341, + "step": 20320 + }, + { + "epoch": 1.106559664563381, + "grad_norm": 0.5484374520949024, + "learning_rate": 8.763326992787365e-05, + "loss": 11.9527, + "step": 20321 + }, + { + "epoch": 1.106614118559964, + "grad_norm": 0.5771416993957864, + "learning_rate": 8.762451940907376e-05, + "loss": 12.0113, + "step": 20322 + }, + { + "epoch": 1.106668572556547, + "grad_norm": 0.5563674089393889, + "learning_rate": 8.761576898650768e-05, + "loss": 11.9658, + "step": 20323 + }, + { + "epoch": 1.10672302655313, + "grad_norm": 0.5536476891827287, + "learning_rate": 8.760701866024347e-05, + "loss": 12.0788, + "step": 20324 + }, + { + "epoch": 1.106777480549713, + "grad_norm": 0.5891056273868874, + "learning_rate": 8.759826843034915e-05, + "loss": 12.0831, + "step": 20325 + }, + { + "epoch": 1.106831934546296, + "grad_norm": 0.599718056726197, + "learning_rate": 8.758951829689275e-05, + "loss": 12.0573, + "step": 20326 + }, + { + "epoch": 1.106886388542879, + "grad_norm": 0.5371038737374279, + "learning_rate": 8.758076825994237e-05, + "loss": 12.0321, + "step": 20327 + }, + { + "epoch": 1.1069408425394622, + "grad_norm": 0.6004552767530613, + "learning_rate": 8.7572018319566e-05, + "loss": 12.0596, + "step": 20328 + }, + { + "epoch": 1.1069952965360452, + "grad_norm": 0.5865194360324422, + "learning_rate": 8.756326847583171e-05, + "loss": 12.0869, + "step": 20329 + }, + { + "epoch": 1.1070497505326282, + "grad_norm": 0.5048031034073145, + "learning_rate": 8.755451872880757e-05, + "loss": 12.0092, + "step": 20330 + }, + { + "epoch": 1.1071042045292112, + "grad_norm": 0.5322675844630659, + "learning_rate": 8.754576907856154e-05, + "loss": 12.0208, + "step": 20331 + }, + { + "epoch": 1.1071586585257942, + "grad_norm": 0.5441646557030124, + "learning_rate": 8.753701952516169e-05, + "loss": 12.0182, + "step": 20332 + }, + { + "epoch": 1.1072131125223772, + "grad_norm": 0.5608449926792899, + "learning_rate": 8.752827006867607e-05, + "loss": 11.9662, + "step": 20333 + }, + { + "epoch": 1.1072675665189602, + "grad_norm": 0.5723882401594803, + "learning_rate": 8.751952070917273e-05, + "loss": 11.997, + "step": 20334 + }, + { + "epoch": 1.1073220205155432, + "grad_norm": 0.5455646117400574, + "learning_rate": 8.751077144671968e-05, + "loss": 11.9964, + "step": 20335 + }, + { + "epoch": 1.1073764745121262, + "grad_norm": 0.5869649085404693, + "learning_rate": 8.750202228138497e-05, + "loss": 12.0137, + "step": 20336 + }, + { + "epoch": 1.1074309285087092, + "grad_norm": 0.4929235067216394, + "learning_rate": 8.749327321323659e-05, + "loss": 11.9886, + "step": 20337 + }, + { + "epoch": 1.1074853825052922, + "grad_norm": 0.542015727952765, + "learning_rate": 8.748452424234266e-05, + "loss": 12.0229, + "step": 20338 + }, + { + "epoch": 1.1075398365018754, + "grad_norm": 0.5877668144850737, + "learning_rate": 8.747577536877117e-05, + "loss": 11.9163, + "step": 20339 + }, + { + "epoch": 1.1075942904984584, + "grad_norm": 0.5608190697937362, + "learning_rate": 8.746702659259017e-05, + "loss": 12.0286, + "step": 20340 + }, + { + "epoch": 1.1076487444950414, + "grad_norm": 0.513750153259443, + "learning_rate": 8.745827791386762e-05, + "loss": 12.0426, + "step": 20341 + }, + { + "epoch": 1.1077031984916244, + "grad_norm": 0.5838775241989291, + "learning_rate": 8.744952933267163e-05, + "loss": 11.9967, + "step": 20342 + }, + { + "epoch": 1.1077576524882073, + "grad_norm": 0.5905911772599802, + "learning_rate": 8.744078084907021e-05, + "loss": 12.1397, + "step": 20343 + }, + { + "epoch": 1.1078121064847903, + "grad_norm": 0.49872395232447336, + "learning_rate": 8.743203246313136e-05, + "loss": 11.8628, + "step": 20344 + }, + { + "epoch": 1.1078665604813733, + "grad_norm": 0.5638960798286776, + "learning_rate": 8.742328417492316e-05, + "loss": 12.0169, + "step": 20345 + }, + { + "epoch": 1.1079210144779563, + "grad_norm": 0.5880102201495002, + "learning_rate": 8.74145359845136e-05, + "loss": 12.0852, + "step": 20346 + }, + { + "epoch": 1.1079754684745393, + "grad_norm": 0.5782451800864982, + "learning_rate": 8.740578789197071e-05, + "loss": 12.0772, + "step": 20347 + }, + { + "epoch": 1.1080299224711223, + "grad_norm": 0.5032096969628987, + "learning_rate": 8.739703989736252e-05, + "loss": 11.8333, + "step": 20348 + }, + { + "epoch": 1.1080843764677053, + "grad_norm": 0.5392127032883857, + "learning_rate": 8.738829200075707e-05, + "loss": 12.1177, + "step": 20349 + }, + { + "epoch": 1.1081388304642883, + "grad_norm": 0.5569545293518601, + "learning_rate": 8.737954420222243e-05, + "loss": 11.9352, + "step": 20350 + }, + { + "epoch": 1.1081932844608713, + "grad_norm": 0.5571004661020185, + "learning_rate": 8.737079650182653e-05, + "loss": 12.1027, + "step": 20351 + }, + { + "epoch": 1.1082477384574545, + "grad_norm": 0.5202757639842748, + "learning_rate": 8.73620488996374e-05, + "loss": 11.9588, + "step": 20352 + }, + { + "epoch": 1.1083021924540375, + "grad_norm": 0.5326218707584636, + "learning_rate": 8.735330139572312e-05, + "loss": 12.1588, + "step": 20353 + }, + { + "epoch": 1.1083566464506205, + "grad_norm": 0.581581715781423, + "learning_rate": 8.73445539901517e-05, + "loss": 12.0124, + "step": 20354 + }, + { + "epoch": 1.1084111004472035, + "grad_norm": 0.5643813781203146, + "learning_rate": 8.733580668299113e-05, + "loss": 12.0239, + "step": 20355 + }, + { + "epoch": 1.1084655544437865, + "grad_norm": 0.4929896961613901, + "learning_rate": 8.732705947430948e-05, + "loss": 11.9148, + "step": 20356 + }, + { + "epoch": 1.1085200084403695, + "grad_norm": 0.5223543752886799, + "learning_rate": 8.731831236417472e-05, + "loss": 11.9633, + "step": 20357 + }, + { + "epoch": 1.1085744624369525, + "grad_norm": 0.5729833090573728, + "learning_rate": 8.73095653526549e-05, + "loss": 12.0722, + "step": 20358 + }, + { + "epoch": 1.1086289164335354, + "grad_norm": 0.5251201813942938, + "learning_rate": 8.7300818439818e-05, + "loss": 12.0149, + "step": 20359 + }, + { + "epoch": 1.1086833704301184, + "grad_norm": 0.558311215109325, + "learning_rate": 8.729207162573214e-05, + "loss": 11.9108, + "step": 20360 + }, + { + "epoch": 1.1087378244267014, + "grad_norm": 0.5365779165147309, + "learning_rate": 8.72833249104652e-05, + "loss": 12.0024, + "step": 20361 + }, + { + "epoch": 1.1087922784232844, + "grad_norm": 0.5151686326710411, + "learning_rate": 8.727457829408527e-05, + "loss": 11.9575, + "step": 20362 + }, + { + "epoch": 1.1088467324198676, + "grad_norm": 0.576146484496607, + "learning_rate": 8.726583177666034e-05, + "loss": 12.146, + "step": 20363 + }, + { + "epoch": 1.1089011864164506, + "grad_norm": 0.5604576831869068, + "learning_rate": 8.725708535825845e-05, + "loss": 12.0227, + "step": 20364 + }, + { + "epoch": 1.1089556404130336, + "grad_norm": 0.5488182443925755, + "learning_rate": 8.724833903894761e-05, + "loss": 12.0595, + "step": 20365 + }, + { + "epoch": 1.1090100944096166, + "grad_norm": 0.5702859955842696, + "learning_rate": 8.72395928187958e-05, + "loss": 12.0216, + "step": 20366 + }, + { + "epoch": 1.1090645484061996, + "grad_norm": 0.630035859859341, + "learning_rate": 8.723084669787107e-05, + "loss": 12.0802, + "step": 20367 + }, + { + "epoch": 1.1091190024027826, + "grad_norm": 0.5346963959972917, + "learning_rate": 8.722210067624143e-05, + "loss": 12.0565, + "step": 20368 + }, + { + "epoch": 1.1091734563993656, + "grad_norm": 0.5638355168601141, + "learning_rate": 8.721335475397486e-05, + "loss": 12.0975, + "step": 20369 + }, + { + "epoch": 1.1092279103959486, + "grad_norm": 0.544604177941654, + "learning_rate": 8.72046089311394e-05, + "loss": 11.9492, + "step": 20370 + }, + { + "epoch": 1.1092823643925316, + "grad_norm": 0.5112490088018798, + "learning_rate": 8.719586320780307e-05, + "loss": 11.96, + "step": 20371 + }, + { + "epoch": 1.1093368183891146, + "grad_norm": 0.5328271332370599, + "learning_rate": 8.718711758403382e-05, + "loss": 11.9347, + "step": 20372 + }, + { + "epoch": 1.1093912723856976, + "grad_norm": 0.5674785663412966, + "learning_rate": 8.717837205989969e-05, + "loss": 11.9964, + "step": 20373 + }, + { + "epoch": 1.1094457263822806, + "grad_norm": 0.5953098610929797, + "learning_rate": 8.71696266354687e-05, + "loss": 12.0257, + "step": 20374 + }, + { + "epoch": 1.1095001803788638, + "grad_norm": 0.5529267917614739, + "learning_rate": 8.716088131080882e-05, + "loss": 11.9219, + "step": 20375 + }, + { + "epoch": 1.1095546343754468, + "grad_norm": 0.612605982175686, + "learning_rate": 8.715213608598811e-05, + "loss": 12.0591, + "step": 20376 + }, + { + "epoch": 1.1096090883720298, + "grad_norm": 0.6502316142012194, + "learning_rate": 8.714339096107454e-05, + "loss": 12.023, + "step": 20377 + }, + { + "epoch": 1.1096635423686128, + "grad_norm": 0.5152773212699467, + "learning_rate": 8.713464593613612e-05, + "loss": 11.94, + "step": 20378 + }, + { + "epoch": 1.1097179963651957, + "grad_norm": 0.5519647029298497, + "learning_rate": 8.712590101124084e-05, + "loss": 12.0978, + "step": 20379 + }, + { + "epoch": 1.1097724503617787, + "grad_norm": 0.5142008477824004, + "learning_rate": 8.711715618645671e-05, + "loss": 12.1463, + "step": 20380 + }, + { + "epoch": 1.1098269043583617, + "grad_norm": 0.5431164123937956, + "learning_rate": 8.710841146185177e-05, + "loss": 12.0085, + "step": 20381 + }, + { + "epoch": 1.1098813583549447, + "grad_norm": 0.5465098198668166, + "learning_rate": 8.709966683749396e-05, + "loss": 11.8521, + "step": 20382 + }, + { + "epoch": 1.1099358123515277, + "grad_norm": 0.5771607951376807, + "learning_rate": 8.70909223134513e-05, + "loss": 12.0074, + "step": 20383 + }, + { + "epoch": 1.1099902663481107, + "grad_norm": 0.5732513893159404, + "learning_rate": 8.70821778897918e-05, + "loss": 12.0577, + "step": 20384 + }, + { + "epoch": 1.1100447203446937, + "grad_norm": 0.5687566895392488, + "learning_rate": 8.707343356658344e-05, + "loss": 12.0488, + "step": 20385 + }, + { + "epoch": 1.110099174341277, + "grad_norm": 0.5436380291903499, + "learning_rate": 8.706468934389421e-05, + "loss": 11.9659, + "step": 20386 + }, + { + "epoch": 1.11015362833786, + "grad_norm": 0.5525146804435588, + "learning_rate": 8.705594522179214e-05, + "loss": 12.0789, + "step": 20387 + }, + { + "epoch": 1.110208082334443, + "grad_norm": 0.5607722918179316, + "learning_rate": 8.704720120034523e-05, + "loss": 12.0246, + "step": 20388 + }, + { + "epoch": 1.110262536331026, + "grad_norm": 0.5643884542944015, + "learning_rate": 8.703845727962144e-05, + "loss": 12.0696, + "step": 20389 + }, + { + "epoch": 1.1103169903276089, + "grad_norm": 0.5559613977033397, + "learning_rate": 8.702971345968879e-05, + "loss": 12.0368, + "step": 20390 + }, + { + "epoch": 1.1103714443241919, + "grad_norm": 0.5926001588421552, + "learning_rate": 8.702096974061527e-05, + "loss": 11.9943, + "step": 20391 + }, + { + "epoch": 1.1104258983207749, + "grad_norm": 0.5782246297613625, + "learning_rate": 8.701222612246887e-05, + "loss": 12.0798, + "step": 20392 + }, + { + "epoch": 1.1104803523173579, + "grad_norm": 0.5650841524659677, + "learning_rate": 8.700348260531756e-05, + "loss": 12.1178, + "step": 20393 + }, + { + "epoch": 1.1105348063139409, + "grad_norm": 0.527666922247384, + "learning_rate": 8.699473918922934e-05, + "loss": 11.9583, + "step": 20394 + }, + { + "epoch": 1.1105892603105239, + "grad_norm": 0.4831398814838757, + "learning_rate": 8.698599587427223e-05, + "loss": 12.0063, + "step": 20395 + }, + { + "epoch": 1.1106437143071068, + "grad_norm": 0.6126566647506558, + "learning_rate": 8.697725266051419e-05, + "loss": 12.1569, + "step": 20396 + }, + { + "epoch": 1.1106981683036898, + "grad_norm": 0.6469344452197978, + "learning_rate": 8.696850954802319e-05, + "loss": 12.0177, + "step": 20397 + }, + { + "epoch": 1.110752622300273, + "grad_norm": 0.5665602869832791, + "learning_rate": 8.695976653686726e-05, + "loss": 12.167, + "step": 20398 + }, + { + "epoch": 1.110807076296856, + "grad_norm": 0.5185074852028833, + "learning_rate": 8.695102362711439e-05, + "loss": 12.0196, + "step": 20399 + }, + { + "epoch": 1.110861530293439, + "grad_norm": 0.5260885913422563, + "learning_rate": 8.694228081883254e-05, + "loss": 12.0168, + "step": 20400 + }, + { + "epoch": 1.110915984290022, + "grad_norm": 0.6708326897002365, + "learning_rate": 8.693353811208973e-05, + "loss": 12.0019, + "step": 20401 + }, + { + "epoch": 1.110970438286605, + "grad_norm": 0.5639695878205355, + "learning_rate": 8.69247955069539e-05, + "loss": 12.1039, + "step": 20402 + }, + { + "epoch": 1.111024892283188, + "grad_norm": 0.5449293478224777, + "learning_rate": 8.691605300349304e-05, + "loss": 12.0094, + "step": 20403 + }, + { + "epoch": 1.111079346279771, + "grad_norm": 0.5740721272416982, + "learning_rate": 8.690731060177515e-05, + "loss": 12.112, + "step": 20404 + }, + { + "epoch": 1.111133800276354, + "grad_norm": 0.4930037984290305, + "learning_rate": 8.68985683018682e-05, + "loss": 12.0775, + "step": 20405 + }, + { + "epoch": 1.111188254272937, + "grad_norm": 0.6027153909312933, + "learning_rate": 8.68898261038402e-05, + "loss": 11.9441, + "step": 20406 + }, + { + "epoch": 1.11124270826952, + "grad_norm": 0.5260394731253033, + "learning_rate": 8.68810840077591e-05, + "loss": 11.9547, + "step": 20407 + }, + { + "epoch": 1.111297162266103, + "grad_norm": 0.604684947929935, + "learning_rate": 8.687234201369287e-05, + "loss": 12.1559, + "step": 20408 + }, + { + "epoch": 1.1113516162626862, + "grad_norm": 0.5853957597714421, + "learning_rate": 8.686360012170954e-05, + "loss": 11.8884, + "step": 20409 + }, + { + "epoch": 1.1114060702592692, + "grad_norm": 0.5204564010716333, + "learning_rate": 8.685485833187706e-05, + "loss": 12.0701, + "step": 20410 + }, + { + "epoch": 1.1114605242558522, + "grad_norm": 0.554571373235644, + "learning_rate": 8.684611664426344e-05, + "loss": 12.0565, + "step": 20411 + }, + { + "epoch": 1.1115149782524352, + "grad_norm": 0.49278774796167696, + "learning_rate": 8.683737505893655e-05, + "loss": 11.9681, + "step": 20412 + }, + { + "epoch": 1.1115694322490182, + "grad_norm": 0.5799618073088181, + "learning_rate": 8.682863357596447e-05, + "loss": 11.8597, + "step": 20413 + }, + { + "epoch": 1.1116238862456012, + "grad_norm": 0.6174819718064491, + "learning_rate": 8.681989219541516e-05, + "loss": 12.0611, + "step": 20414 + }, + { + "epoch": 1.1116783402421841, + "grad_norm": 0.5563880126339784, + "learning_rate": 8.681115091735654e-05, + "loss": 12.0307, + "step": 20415 + }, + { + "epoch": 1.1117327942387671, + "grad_norm": 0.5428688058215628, + "learning_rate": 8.680240974185665e-05, + "loss": 11.9264, + "step": 20416 + }, + { + "epoch": 1.1117872482353501, + "grad_norm": 0.6190313780390646, + "learning_rate": 8.679366866898343e-05, + "loss": 12.1029, + "step": 20417 + }, + { + "epoch": 1.1118417022319331, + "grad_norm": 0.5624661924655168, + "learning_rate": 8.678492769880486e-05, + "loss": 12.0339, + "step": 20418 + }, + { + "epoch": 1.1118961562285161, + "grad_norm": 0.5296270992929764, + "learning_rate": 8.677618683138889e-05, + "loss": 12.0566, + "step": 20419 + }, + { + "epoch": 1.1119506102250991, + "grad_norm": 0.6563129787591534, + "learning_rate": 8.676744606680352e-05, + "loss": 12.0153, + "step": 20420 + }, + { + "epoch": 1.112005064221682, + "grad_norm": 0.47750180191668606, + "learning_rate": 8.675870540511675e-05, + "loss": 12.0877, + "step": 20421 + }, + { + "epoch": 1.1120595182182653, + "grad_norm": 0.5971189211479879, + "learning_rate": 8.674996484639647e-05, + "loss": 12.052, + "step": 20422 + }, + { + "epoch": 1.1121139722148483, + "grad_norm": 0.5647061566072635, + "learning_rate": 8.674122439071069e-05, + "loss": 11.9557, + "step": 20423 + }, + { + "epoch": 1.1121684262114313, + "grad_norm": 0.5718603926874921, + "learning_rate": 8.673248403812735e-05, + "loss": 12.0426, + "step": 20424 + }, + { + "epoch": 1.1122228802080143, + "grad_norm": 0.5862622020485386, + "learning_rate": 8.672374378871445e-05, + "loss": 11.9336, + "step": 20425 + }, + { + "epoch": 1.1122773342045973, + "grad_norm": 0.5319568509209703, + "learning_rate": 8.671500364253995e-05, + "loss": 12.0574, + "step": 20426 + }, + { + "epoch": 1.1123317882011803, + "grad_norm": 0.5581882484166962, + "learning_rate": 8.670626359967181e-05, + "loss": 11.901, + "step": 20427 + }, + { + "epoch": 1.1123862421977633, + "grad_norm": 0.5718910216535872, + "learning_rate": 8.669752366017799e-05, + "loss": 11.9544, + "step": 20428 + }, + { + "epoch": 1.1124406961943463, + "grad_norm": 0.535119723211312, + "learning_rate": 8.668878382412646e-05, + "loss": 11.8905, + "step": 20429 + }, + { + "epoch": 1.1124951501909293, + "grad_norm": 0.5585066425693939, + "learning_rate": 8.668004409158519e-05, + "loss": 12.0553, + "step": 20430 + }, + { + "epoch": 1.1125496041875123, + "grad_norm": 0.5887672763553066, + "learning_rate": 8.667130446262214e-05, + "loss": 12.0295, + "step": 20431 + }, + { + "epoch": 1.1126040581840952, + "grad_norm": 0.5512357232765116, + "learning_rate": 8.666256493730525e-05, + "loss": 12.0979, + "step": 20432 + }, + { + "epoch": 1.1126585121806785, + "grad_norm": 0.5512854382094922, + "learning_rate": 8.665382551570248e-05, + "loss": 12.1492, + "step": 20433 + }, + { + "epoch": 1.1127129661772615, + "grad_norm": 0.5461053454512196, + "learning_rate": 8.664508619788181e-05, + "loss": 12.1001, + "step": 20434 + }, + { + "epoch": 1.1127674201738444, + "grad_norm": 0.6048445467025007, + "learning_rate": 8.663634698391117e-05, + "loss": 12.1017, + "step": 20435 + }, + { + "epoch": 1.1128218741704274, + "grad_norm": 0.5082271795316599, + "learning_rate": 8.662760787385854e-05, + "loss": 12.0915, + "step": 20436 + }, + { + "epoch": 1.1128763281670104, + "grad_norm": 0.5304286711573969, + "learning_rate": 8.661886886779189e-05, + "loss": 12.0, + "step": 20437 + }, + { + "epoch": 1.1129307821635934, + "grad_norm": 0.641587832548868, + "learning_rate": 8.661012996577915e-05, + "loss": 12.0823, + "step": 20438 + }, + { + "epoch": 1.1129852361601764, + "grad_norm": 0.5334425328832866, + "learning_rate": 8.66013911678883e-05, + "loss": 12.0591, + "step": 20439 + }, + { + "epoch": 1.1130396901567594, + "grad_norm": 0.5508297466926332, + "learning_rate": 8.659265247418727e-05, + "loss": 11.9763, + "step": 20440 + }, + { + "epoch": 1.1130941441533424, + "grad_norm": 0.6130747054086031, + "learning_rate": 8.658391388474404e-05, + "loss": 12.0046, + "step": 20441 + }, + { + "epoch": 1.1131485981499254, + "grad_norm": 0.5395795817917738, + "learning_rate": 8.657517539962654e-05, + "loss": 12.0152, + "step": 20442 + }, + { + "epoch": 1.1132030521465084, + "grad_norm": 0.6118794450591564, + "learning_rate": 8.65664370189027e-05, + "loss": 11.8971, + "step": 20443 + }, + { + "epoch": 1.1132575061430914, + "grad_norm": 0.5236304783024495, + "learning_rate": 8.655769874264052e-05, + "loss": 11.9936, + "step": 20444 + }, + { + "epoch": 1.1133119601396746, + "grad_norm": 0.6092612193110597, + "learning_rate": 8.654896057090792e-05, + "loss": 12.0868, + "step": 20445 + }, + { + "epoch": 1.1133664141362576, + "grad_norm": 0.5321936221086799, + "learning_rate": 8.654022250377283e-05, + "loss": 12.0753, + "step": 20446 + }, + { + "epoch": 1.1134208681328406, + "grad_norm": 0.6038692861905606, + "learning_rate": 8.653148454130327e-05, + "loss": 12.0654, + "step": 20447 + }, + { + "epoch": 1.1134753221294236, + "grad_norm": 0.5325827501991652, + "learning_rate": 8.652274668356713e-05, + "loss": 12.0671, + "step": 20448 + }, + { + "epoch": 1.1135297761260066, + "grad_norm": 0.5699565426641002, + "learning_rate": 8.651400893063238e-05, + "loss": 11.9932, + "step": 20449 + }, + { + "epoch": 1.1135842301225896, + "grad_norm": 0.5456973476350043, + "learning_rate": 8.650527128256695e-05, + "loss": 12.0736, + "step": 20450 + }, + { + "epoch": 1.1136386841191726, + "grad_norm": 0.6364466397889185, + "learning_rate": 8.649653373943883e-05, + "loss": 12.1013, + "step": 20451 + }, + { + "epoch": 1.1136931381157555, + "grad_norm": 0.5958478372373719, + "learning_rate": 8.648779630131589e-05, + "loss": 11.9619, + "step": 20452 + }, + { + "epoch": 1.1137475921123385, + "grad_norm": 0.5976355951406671, + "learning_rate": 8.647905896826611e-05, + "loss": 12.0252, + "step": 20453 + }, + { + "epoch": 1.1138020461089215, + "grad_norm": 0.5255235963552749, + "learning_rate": 8.647032174035744e-05, + "loss": 12.146, + "step": 20454 + }, + { + "epoch": 1.1138565001055045, + "grad_norm": 0.6238917781714524, + "learning_rate": 8.646158461765782e-05, + "loss": 12.003, + "step": 20455 + }, + { + "epoch": 1.1139109541020877, + "grad_norm": 0.5990554357556224, + "learning_rate": 8.645284760023519e-05, + "loss": 11.9492, + "step": 20456 + }, + { + "epoch": 1.1139654080986707, + "grad_norm": 0.5639341509059039, + "learning_rate": 8.644411068815747e-05, + "loss": 11.8785, + "step": 20457 + }, + { + "epoch": 1.1140198620952537, + "grad_norm": 0.5225938502442502, + "learning_rate": 8.643537388149263e-05, + "loss": 12.0676, + "step": 20458 + }, + { + "epoch": 1.1140743160918367, + "grad_norm": 0.6080400007278929, + "learning_rate": 8.64266371803086e-05, + "loss": 12.0214, + "step": 20459 + }, + { + "epoch": 1.1141287700884197, + "grad_norm": 0.5962382006537994, + "learning_rate": 8.641790058467332e-05, + "loss": 12.0873, + "step": 20460 + }, + { + "epoch": 1.1141832240850027, + "grad_norm": 0.5414322662812325, + "learning_rate": 8.640916409465474e-05, + "loss": 12.0977, + "step": 20461 + }, + { + "epoch": 1.1142376780815857, + "grad_norm": 0.5442969491055653, + "learning_rate": 8.640042771032076e-05, + "loss": 11.9356, + "step": 20462 + }, + { + "epoch": 1.1142921320781687, + "grad_norm": 0.5857955462241692, + "learning_rate": 8.639169143173934e-05, + "loss": 11.9875, + "step": 20463 + }, + { + "epoch": 1.1143465860747517, + "grad_norm": 0.5722991266071655, + "learning_rate": 8.63829552589784e-05, + "loss": 11.9408, + "step": 20464 + }, + { + "epoch": 1.1144010400713347, + "grad_norm": 0.5745187959531379, + "learning_rate": 8.637421919210588e-05, + "loss": 12.0425, + "step": 20465 + }, + { + "epoch": 1.1144554940679177, + "grad_norm": 0.5737292188169556, + "learning_rate": 8.636548323118974e-05, + "loss": 12.011, + "step": 20466 + }, + { + "epoch": 1.1145099480645007, + "grad_norm": 0.5953915731831864, + "learning_rate": 8.635674737629786e-05, + "loss": 11.9588, + "step": 20467 + }, + { + "epoch": 1.1145644020610839, + "grad_norm": 0.6105710976926062, + "learning_rate": 8.634801162749819e-05, + "loss": 12.0818, + "step": 20468 + }, + { + "epoch": 1.1146188560576669, + "grad_norm": 0.4860264760757827, + "learning_rate": 8.63392759848587e-05, + "loss": 11.9673, + "step": 20469 + }, + { + "epoch": 1.1146733100542499, + "grad_norm": 0.4796813075837099, + "learning_rate": 8.633054044844729e-05, + "loss": 12.024, + "step": 20470 + }, + { + "epoch": 1.1147277640508328, + "grad_norm": 0.5950842721789306, + "learning_rate": 8.632180501833192e-05, + "loss": 12.0347, + "step": 20471 + }, + { + "epoch": 1.1147822180474158, + "grad_norm": 0.6542190907390353, + "learning_rate": 8.631306969458047e-05, + "loss": 12.1151, + "step": 20472 + }, + { + "epoch": 1.1148366720439988, + "grad_norm": 0.5149019425783683, + "learning_rate": 8.630433447726084e-05, + "loss": 12.0373, + "step": 20473 + }, + { + "epoch": 1.1148911260405818, + "grad_norm": 0.5401207092106911, + "learning_rate": 8.629559936644103e-05, + "loss": 12.0794, + "step": 20474 + }, + { + "epoch": 1.1149455800371648, + "grad_norm": 0.5253189533374024, + "learning_rate": 8.628686436218894e-05, + "loss": 11.6059, + "step": 20475 + }, + { + "epoch": 1.1150000340337478, + "grad_norm": 0.5998630256837638, + "learning_rate": 8.627812946457249e-05, + "loss": 12.0531, + "step": 20476 + }, + { + "epoch": 1.1150544880303308, + "grad_norm": 0.5713336693894561, + "learning_rate": 8.626939467365961e-05, + "loss": 12.1183, + "step": 20477 + }, + { + "epoch": 1.1151089420269138, + "grad_norm": 0.6539845428263784, + "learning_rate": 8.626065998951821e-05, + "loss": 12.0565, + "step": 20478 + }, + { + "epoch": 1.115163396023497, + "grad_norm": 0.5795015131651489, + "learning_rate": 8.625192541221623e-05, + "loss": 12.1167, + "step": 20479 + }, + { + "epoch": 1.11521785002008, + "grad_norm": 0.6687968959099096, + "learning_rate": 8.624319094182157e-05, + "loss": 11.9152, + "step": 20480 + }, + { + "epoch": 1.115272304016663, + "grad_norm": 0.5662463661833551, + "learning_rate": 8.623445657840222e-05, + "loss": 12.0687, + "step": 20481 + }, + { + "epoch": 1.115326758013246, + "grad_norm": 0.5569974547007831, + "learning_rate": 8.622572232202599e-05, + "loss": 12.0152, + "step": 20482 + }, + { + "epoch": 1.115381212009829, + "grad_norm": 0.5356611435949286, + "learning_rate": 8.621698817276087e-05, + "loss": 11.987, + "step": 20483 + }, + { + "epoch": 1.115435666006412, + "grad_norm": 0.5740871642686105, + "learning_rate": 8.620825413067473e-05, + "loss": 12.1727, + "step": 20484 + }, + { + "epoch": 1.115490120002995, + "grad_norm": 0.5397304764024144, + "learning_rate": 8.619952019583555e-05, + "loss": 11.9573, + "step": 20485 + }, + { + "epoch": 1.115544573999578, + "grad_norm": 0.5413113226753937, + "learning_rate": 8.61907863683112e-05, + "loss": 12.1108, + "step": 20486 + }, + { + "epoch": 1.115599027996161, + "grad_norm": 0.5224978649881019, + "learning_rate": 8.618205264816962e-05, + "loss": 12.0286, + "step": 20487 + }, + { + "epoch": 1.115653481992744, + "grad_norm": 0.49798079977913895, + "learning_rate": 8.617331903547872e-05, + "loss": 11.9539, + "step": 20488 + }, + { + "epoch": 1.115707935989327, + "grad_norm": 0.5374865498581153, + "learning_rate": 8.616458553030641e-05, + "loss": 11.9546, + "step": 20489 + }, + { + "epoch": 1.11576238998591, + "grad_norm": 0.5421550574793342, + "learning_rate": 8.615585213272059e-05, + "loss": 12.0558, + "step": 20490 + }, + { + "epoch": 1.115816843982493, + "grad_norm": 0.5236490567634463, + "learning_rate": 8.614711884278922e-05, + "loss": 11.924, + "step": 20491 + }, + { + "epoch": 1.1158712979790761, + "grad_norm": 0.5894706823009638, + "learning_rate": 8.613838566058014e-05, + "loss": 12.1306, + "step": 20492 + }, + { + "epoch": 1.1159257519756591, + "grad_norm": 0.4902113230346238, + "learning_rate": 8.612965258616133e-05, + "loss": 11.9747, + "step": 20493 + }, + { + "epoch": 1.1159802059722421, + "grad_norm": 0.5215407998834232, + "learning_rate": 8.612091961960064e-05, + "loss": 12.0714, + "step": 20494 + }, + { + "epoch": 1.1160346599688251, + "grad_norm": 0.5243379429072624, + "learning_rate": 8.611218676096599e-05, + "loss": 12.0208, + "step": 20495 + }, + { + "epoch": 1.116089113965408, + "grad_norm": 0.6471210179145048, + "learning_rate": 8.610345401032532e-05, + "loss": 12.1623, + "step": 20496 + }, + { + "epoch": 1.116143567961991, + "grad_norm": 0.4908431841804648, + "learning_rate": 8.609472136774654e-05, + "loss": 11.8585, + "step": 20497 + }, + { + "epoch": 1.116198021958574, + "grad_norm": 0.5336784621895109, + "learning_rate": 8.608598883329752e-05, + "loss": 11.7578, + "step": 20498 + }, + { + "epoch": 1.116252475955157, + "grad_norm": 0.539822673026537, + "learning_rate": 8.60772564070462e-05, + "loss": 12.0355, + "step": 20499 + }, + { + "epoch": 1.11630692995174, + "grad_norm": 0.5368551058329707, + "learning_rate": 8.606852408906047e-05, + "loss": 11.9642, + "step": 20500 + }, + { + "epoch": 1.116361383948323, + "grad_norm": 0.5519632231326893, + "learning_rate": 8.605979187940823e-05, + "loss": 12.0173, + "step": 20501 + }, + { + "epoch": 1.1164158379449063, + "grad_norm": 0.558058323641093, + "learning_rate": 8.605105977815739e-05, + "loss": 11.9624, + "step": 20502 + }, + { + "epoch": 1.1164702919414893, + "grad_norm": 0.5412411084680292, + "learning_rate": 8.604232778537584e-05, + "loss": 12.0976, + "step": 20503 + }, + { + "epoch": 1.1165247459380723, + "grad_norm": 0.6807011047904982, + "learning_rate": 8.60335959011315e-05, + "loss": 12.1143, + "step": 20504 + }, + { + "epoch": 1.1165791999346553, + "grad_norm": 0.5167678272691687, + "learning_rate": 8.602486412549225e-05, + "loss": 11.9778, + "step": 20505 + }, + { + "epoch": 1.1166336539312383, + "grad_norm": 0.5476282127282881, + "learning_rate": 8.601613245852597e-05, + "loss": 12.1037, + "step": 20506 + }, + { + "epoch": 1.1166881079278212, + "grad_norm": 0.4953667512885385, + "learning_rate": 8.600740090030062e-05, + "loss": 11.962, + "step": 20507 + }, + { + "epoch": 1.1167425619244042, + "grad_norm": 0.5890134000418888, + "learning_rate": 8.599866945088406e-05, + "loss": 12.0408, + "step": 20508 + }, + { + "epoch": 1.1167970159209872, + "grad_norm": 0.5883335810102944, + "learning_rate": 8.59899381103442e-05, + "loss": 12.1385, + "step": 20509 + }, + { + "epoch": 1.1168514699175702, + "grad_norm": 0.5502270776658903, + "learning_rate": 8.598120687874893e-05, + "loss": 11.9543, + "step": 20510 + }, + { + "epoch": 1.1169059239141532, + "grad_norm": 0.5549793460829717, + "learning_rate": 8.597247575616615e-05, + "loss": 11.8679, + "step": 20511 + }, + { + "epoch": 1.1169603779107362, + "grad_norm": 0.5659023150716123, + "learning_rate": 8.596374474266378e-05, + "loss": 12.1417, + "step": 20512 + }, + { + "epoch": 1.1170148319073192, + "grad_norm": 0.5642468541772573, + "learning_rate": 8.595501383830963e-05, + "loss": 12.0823, + "step": 20513 + }, + { + "epoch": 1.1170692859039022, + "grad_norm": 0.5123625037887571, + "learning_rate": 8.594628304317168e-05, + "loss": 12.0234, + "step": 20514 + }, + { + "epoch": 1.1171237399004854, + "grad_norm": 0.47432507338245566, + "learning_rate": 8.593755235731779e-05, + "loss": 11.8732, + "step": 20515 + }, + { + "epoch": 1.1171781938970684, + "grad_norm": 0.5304811960593719, + "learning_rate": 8.592882178081584e-05, + "loss": 11.974, + "step": 20516 + }, + { + "epoch": 1.1172326478936514, + "grad_norm": 0.5517583033909015, + "learning_rate": 8.59200913137337e-05, + "loss": 12.0905, + "step": 20517 + }, + { + "epoch": 1.1172871018902344, + "grad_norm": 0.46713763306512546, + "learning_rate": 8.591136095613934e-05, + "loss": 11.9744, + "step": 20518 + }, + { + "epoch": 1.1173415558868174, + "grad_norm": 0.5486187992330056, + "learning_rate": 8.590263070810058e-05, + "loss": 12.0381, + "step": 20519 + }, + { + "epoch": 1.1173960098834004, + "grad_norm": 0.5527723804788405, + "learning_rate": 8.589390056968534e-05, + "loss": 11.9795, + "step": 20520 + }, + { + "epoch": 1.1174504638799834, + "grad_norm": 0.5018470446533754, + "learning_rate": 8.588517054096147e-05, + "loss": 12.0284, + "step": 20521 + }, + { + "epoch": 1.1175049178765664, + "grad_norm": 0.5263256875552247, + "learning_rate": 8.587644062199694e-05, + "loss": 11.9745, + "step": 20522 + }, + { + "epoch": 1.1175593718731494, + "grad_norm": 0.5830871320910996, + "learning_rate": 8.586771081285952e-05, + "loss": 12.161, + "step": 20523 + }, + { + "epoch": 1.1176138258697323, + "grad_norm": 0.5812253637558997, + "learning_rate": 8.585898111361716e-05, + "loss": 11.9857, + "step": 20524 + }, + { + "epoch": 1.1176682798663153, + "grad_norm": 0.6240263377907601, + "learning_rate": 8.585025152433775e-05, + "loss": 12.0903, + "step": 20525 + }, + { + "epoch": 1.1177227338628986, + "grad_norm": 0.5347853323347552, + "learning_rate": 8.584152204508916e-05, + "loss": 12.0122, + "step": 20526 + }, + { + "epoch": 1.1177771878594815, + "grad_norm": 0.5142875269130167, + "learning_rate": 8.583279267593924e-05, + "loss": 12.0679, + "step": 20527 + }, + { + "epoch": 1.1178316418560645, + "grad_norm": 0.524234472729522, + "learning_rate": 8.582406341695591e-05, + "loss": 11.9171, + "step": 20528 + }, + { + "epoch": 1.1178860958526475, + "grad_norm": 0.6411909173864685, + "learning_rate": 8.581533426820703e-05, + "loss": 11.9259, + "step": 20529 + }, + { + "epoch": 1.1179405498492305, + "grad_norm": 0.5997938318227984, + "learning_rate": 8.580660522976051e-05, + "loss": 12.0067, + "step": 20530 + }, + { + "epoch": 1.1179950038458135, + "grad_norm": 0.5543244796967985, + "learning_rate": 8.57978763016842e-05, + "loss": 11.9591, + "step": 20531 + }, + { + "epoch": 1.1180494578423965, + "grad_norm": 0.511095836273182, + "learning_rate": 8.578914748404603e-05, + "loss": 11.9835, + "step": 20532 + }, + { + "epoch": 1.1181039118389795, + "grad_norm": 0.527984273724554, + "learning_rate": 8.578041877691376e-05, + "loss": 12.0705, + "step": 20533 + }, + { + "epoch": 1.1181583658355625, + "grad_norm": 0.5387854188171488, + "learning_rate": 8.577169018035537e-05, + "loss": 11.9265, + "step": 20534 + }, + { + "epoch": 1.1182128198321455, + "grad_norm": 0.5687718892618461, + "learning_rate": 8.576296169443872e-05, + "loss": 11.8669, + "step": 20535 + }, + { + "epoch": 1.1182672738287285, + "grad_norm": 0.5026518080973875, + "learning_rate": 8.575423331923164e-05, + "loss": 12.0234, + "step": 20536 + }, + { + "epoch": 1.1183217278253115, + "grad_norm": 0.5401622858273115, + "learning_rate": 8.574550505480204e-05, + "loss": 12.0075, + "step": 20537 + }, + { + "epoch": 1.1183761818218947, + "grad_norm": 0.5619456287560358, + "learning_rate": 8.573677690121779e-05, + "loss": 11.9651, + "step": 20538 + }, + { + "epoch": 1.1184306358184777, + "grad_norm": 0.8599802351019422, + "learning_rate": 8.572804885854676e-05, + "loss": 12.0924, + "step": 20539 + }, + { + "epoch": 1.1184850898150607, + "grad_norm": 0.5401716468800819, + "learning_rate": 8.571932092685676e-05, + "loss": 12.0439, + "step": 20540 + }, + { + "epoch": 1.1185395438116437, + "grad_norm": 0.5613658674342403, + "learning_rate": 8.571059310621577e-05, + "loss": 11.9226, + "step": 20541 + }, + { + "epoch": 1.1185939978082267, + "grad_norm": 0.5119920663394302, + "learning_rate": 8.570186539669163e-05, + "loss": 12.002, + "step": 20542 + }, + { + "epoch": 1.1186484518048097, + "grad_norm": 0.5920825355337451, + "learning_rate": 8.569313779835215e-05, + "loss": 12.1506, + "step": 20543 + }, + { + "epoch": 1.1187029058013926, + "grad_norm": 0.6027706362517788, + "learning_rate": 8.568441031126519e-05, + "loss": 11.9428, + "step": 20544 + }, + { + "epoch": 1.1187573597979756, + "grad_norm": 0.5351833490287161, + "learning_rate": 8.56756829354987e-05, + "loss": 12.0721, + "step": 20545 + }, + { + "epoch": 1.1188118137945586, + "grad_norm": 0.5572626660345444, + "learning_rate": 8.56669556711205e-05, + "loss": 11.9794, + "step": 20546 + }, + { + "epoch": 1.1188662677911416, + "grad_norm": 0.5189188827745314, + "learning_rate": 8.565822851819845e-05, + "loss": 11.9398, + "step": 20547 + }, + { + "epoch": 1.1189207217877246, + "grad_norm": 0.5874501284539313, + "learning_rate": 8.564950147680043e-05, + "loss": 12.1981, + "step": 20548 + }, + { + "epoch": 1.1189751757843078, + "grad_norm": 0.5324668275406199, + "learning_rate": 8.564077454699428e-05, + "loss": 11.9317, + "step": 20549 + }, + { + "epoch": 1.1190296297808908, + "grad_norm": 0.6009991727214898, + "learning_rate": 8.563204772884787e-05, + "loss": 12.2401, + "step": 20550 + }, + { + "epoch": 1.1190840837774738, + "grad_norm": 0.5977241246014698, + "learning_rate": 8.562332102242905e-05, + "loss": 12.0362, + "step": 20551 + }, + { + "epoch": 1.1191385377740568, + "grad_norm": 0.6377908932742733, + "learning_rate": 8.561459442780578e-05, + "loss": 12.2194, + "step": 20552 + }, + { + "epoch": 1.1191929917706398, + "grad_norm": 0.6035991811431012, + "learning_rate": 8.560586794504577e-05, + "loss": 12.0193, + "step": 20553 + }, + { + "epoch": 1.1192474457672228, + "grad_norm": 0.5325660866753731, + "learning_rate": 8.559714157421695e-05, + "loss": 12.063, + "step": 20554 + }, + { + "epoch": 1.1193018997638058, + "grad_norm": 0.7824485461261439, + "learning_rate": 8.558841531538715e-05, + "loss": 12.02, + "step": 20555 + }, + { + "epoch": 1.1193563537603888, + "grad_norm": 0.55243015110621, + "learning_rate": 8.557968916862428e-05, + "loss": 12.1466, + "step": 20556 + }, + { + "epoch": 1.1194108077569718, + "grad_norm": 0.5050461586089697, + "learning_rate": 8.557096313399615e-05, + "loss": 11.994, + "step": 20557 + }, + { + "epoch": 1.1194652617535548, + "grad_norm": 0.5906472124260127, + "learning_rate": 8.556223721157064e-05, + "loss": 12.1666, + "step": 20558 + }, + { + "epoch": 1.1195197157501378, + "grad_norm": 0.5649664453950213, + "learning_rate": 8.55535114014156e-05, + "loss": 12.1033, + "step": 20559 + }, + { + "epoch": 1.1195741697467207, + "grad_norm": 0.5679386653316959, + "learning_rate": 8.554478570359887e-05, + "loss": 12.0274, + "step": 20560 + }, + { + "epoch": 1.119628623743304, + "grad_norm": 0.5260415529671344, + "learning_rate": 8.553606011818832e-05, + "loss": 12.1302, + "step": 20561 + }, + { + "epoch": 1.119683077739887, + "grad_norm": 0.48550420063201516, + "learning_rate": 8.55273346452518e-05, + "loss": 11.9974, + "step": 20562 + }, + { + "epoch": 1.11973753173647, + "grad_norm": 0.5596068466457854, + "learning_rate": 8.551860928485715e-05, + "loss": 11.95, + "step": 20563 + }, + { + "epoch": 1.119791985733053, + "grad_norm": 0.5320204569567574, + "learning_rate": 8.550988403707221e-05, + "loss": 11.9891, + "step": 20564 + }, + { + "epoch": 1.119846439729636, + "grad_norm": 0.5272038792017315, + "learning_rate": 8.550115890196484e-05, + "loss": 12.0433, + "step": 20565 + }, + { + "epoch": 1.119900893726219, + "grad_norm": 0.5367106092079147, + "learning_rate": 8.549243387960286e-05, + "loss": 12.0138, + "step": 20566 + }, + { + "epoch": 1.119955347722802, + "grad_norm": 0.5226903680437649, + "learning_rate": 8.548370897005418e-05, + "loss": 11.9826, + "step": 20567 + }, + { + "epoch": 1.120009801719385, + "grad_norm": 0.6286936405364363, + "learning_rate": 8.547498417338661e-05, + "loss": 11.9526, + "step": 20568 + }, + { + "epoch": 1.120064255715968, + "grad_norm": 0.5376852340755028, + "learning_rate": 8.5466259489668e-05, + "loss": 12.0677, + "step": 20569 + }, + { + "epoch": 1.120118709712551, + "grad_norm": 0.5371857089677221, + "learning_rate": 8.54575349189662e-05, + "loss": 11.8697, + "step": 20570 + }, + { + "epoch": 1.120173163709134, + "grad_norm": 0.589717476056914, + "learning_rate": 8.544881046134905e-05, + "loss": 11.8035, + "step": 20571 + }, + { + "epoch": 1.120227617705717, + "grad_norm": 0.521908997324979, + "learning_rate": 8.54400861168844e-05, + "loss": 11.9903, + "step": 20572 + }, + { + "epoch": 1.1202820717023, + "grad_norm": 0.5720192903917372, + "learning_rate": 8.543136188564007e-05, + "loss": 12.0403, + "step": 20573 + }, + { + "epoch": 1.120336525698883, + "grad_norm": 0.6255945238675611, + "learning_rate": 8.542263776768392e-05, + "loss": 12.0523, + "step": 20574 + }, + { + "epoch": 1.120390979695466, + "grad_norm": 0.569480533231531, + "learning_rate": 8.541391376308376e-05, + "loss": 12.0133, + "step": 20575 + }, + { + "epoch": 1.120445433692049, + "grad_norm": 0.533932773695763, + "learning_rate": 8.540518987190746e-05, + "loss": 11.9363, + "step": 20576 + }, + { + "epoch": 1.120499887688632, + "grad_norm": 0.5475647975327231, + "learning_rate": 8.539646609422285e-05, + "loss": 12.0669, + "step": 20577 + }, + { + "epoch": 1.120554341685215, + "grad_norm": 0.5928832029061398, + "learning_rate": 8.538774243009775e-05, + "loss": 11.9212, + "step": 20578 + }, + { + "epoch": 1.120608795681798, + "grad_norm": 0.5038069420499145, + "learning_rate": 8.537901887960004e-05, + "loss": 11.9198, + "step": 20579 + }, + { + "epoch": 1.120663249678381, + "grad_norm": 0.5096724960488348, + "learning_rate": 8.537029544279754e-05, + "loss": 11.927, + "step": 20580 + }, + { + "epoch": 1.120717703674964, + "grad_norm": 0.5598120008559261, + "learning_rate": 8.536157211975806e-05, + "loss": 12.149, + "step": 20581 + }, + { + "epoch": 1.120772157671547, + "grad_norm": 0.564549478719887, + "learning_rate": 8.535284891054947e-05, + "loss": 12.011, + "step": 20582 + }, + { + "epoch": 1.12082661166813, + "grad_norm": 0.5757046759921594, + "learning_rate": 8.534412581523959e-05, + "loss": 12.0404, + "step": 20583 + }, + { + "epoch": 1.120881065664713, + "grad_norm": 0.5195550332959707, + "learning_rate": 8.533540283389621e-05, + "loss": 12.0446, + "step": 20584 + }, + { + "epoch": 1.1209355196612962, + "grad_norm": 0.537669835620897, + "learning_rate": 8.53266799665872e-05, + "loss": 11.986, + "step": 20585 + }, + { + "epoch": 1.1209899736578792, + "grad_norm": 0.5266346732329061, + "learning_rate": 8.531795721338041e-05, + "loss": 12.0639, + "step": 20586 + }, + { + "epoch": 1.1210444276544622, + "grad_norm": 0.6063960080528453, + "learning_rate": 8.530923457434364e-05, + "loss": 12.0502, + "step": 20587 + }, + { + "epoch": 1.1210988816510452, + "grad_norm": 0.5711788447519252, + "learning_rate": 8.530051204954472e-05, + "loss": 12.0723, + "step": 20588 + }, + { + "epoch": 1.1211533356476282, + "grad_norm": 0.5400089161958422, + "learning_rate": 8.529178963905147e-05, + "loss": 12.1502, + "step": 20589 + }, + { + "epoch": 1.1212077896442112, + "grad_norm": 0.5623434008351368, + "learning_rate": 8.528306734293174e-05, + "loss": 11.9905, + "step": 20590 + }, + { + "epoch": 1.1212622436407942, + "grad_norm": 0.5020169980727249, + "learning_rate": 8.527434516125335e-05, + "loss": 12.04, + "step": 20591 + }, + { + "epoch": 1.1213166976373772, + "grad_norm": 0.5083796763533629, + "learning_rate": 8.526562309408417e-05, + "loss": 12.041, + "step": 20592 + }, + { + "epoch": 1.1213711516339602, + "grad_norm": 0.5386733757095622, + "learning_rate": 8.525690114149191e-05, + "loss": 12.0352, + "step": 20593 + }, + { + "epoch": 1.1214256056305432, + "grad_norm": 0.5225178827581798, + "learning_rate": 8.524817930354447e-05, + "loss": 11.9339, + "step": 20594 + }, + { + "epoch": 1.1214800596271262, + "grad_norm": 0.5441174805016625, + "learning_rate": 8.523945758030966e-05, + "loss": 11.9746, + "step": 20595 + }, + { + "epoch": 1.1215345136237094, + "grad_norm": 0.5150008167747349, + "learning_rate": 8.523073597185533e-05, + "loss": 12.0508, + "step": 20596 + }, + { + "epoch": 1.1215889676202924, + "grad_norm": 0.5307939347724793, + "learning_rate": 8.522201447824925e-05, + "loss": 11.5901, + "step": 20597 + }, + { + "epoch": 1.1216434216168754, + "grad_norm": 0.5660549221776581, + "learning_rate": 8.521329309955927e-05, + "loss": 11.9772, + "step": 20598 + }, + { + "epoch": 1.1216978756134584, + "grad_norm": 0.5287968192433967, + "learning_rate": 8.520457183585321e-05, + "loss": 12.0573, + "step": 20599 + }, + { + "epoch": 1.1217523296100413, + "grad_norm": 0.6005800707822906, + "learning_rate": 8.519585068719884e-05, + "loss": 12.1486, + "step": 20600 + }, + { + "epoch": 1.1218067836066243, + "grad_norm": 0.58718529211763, + "learning_rate": 8.518712965366406e-05, + "loss": 11.9406, + "step": 20601 + }, + { + "epoch": 1.1218612376032073, + "grad_norm": 0.5926879722142123, + "learning_rate": 8.517840873531669e-05, + "loss": 12.0648, + "step": 20602 + }, + { + "epoch": 1.1219156915997903, + "grad_norm": 0.555690531355676, + "learning_rate": 8.516968793222443e-05, + "loss": 12.0557, + "step": 20603 + }, + { + "epoch": 1.1219701455963733, + "grad_norm": 0.5334045336469574, + "learning_rate": 8.516096724445516e-05, + "loss": 12.0405, + "step": 20604 + }, + { + "epoch": 1.1220245995929563, + "grad_norm": 0.6051364508916515, + "learning_rate": 8.515224667207671e-05, + "loss": 11.9985, + "step": 20605 + }, + { + "epoch": 1.1220790535895393, + "grad_norm": 0.5152068370069078, + "learning_rate": 8.514352621515689e-05, + "loss": 11.9404, + "step": 20606 + }, + { + "epoch": 1.1221335075861223, + "grad_norm": 0.5700140228021777, + "learning_rate": 8.51348058737635e-05, + "loss": 12.0488, + "step": 20607 + }, + { + "epoch": 1.1221879615827055, + "grad_norm": 0.5311689879860269, + "learning_rate": 8.512608564796435e-05, + "loss": 11.9787, + "step": 20608 + }, + { + "epoch": 1.1222424155792885, + "grad_norm": 0.6004770577091992, + "learning_rate": 8.511736553782725e-05, + "loss": 11.8855, + "step": 20609 + }, + { + "epoch": 1.1222968695758715, + "grad_norm": 0.572080055635242, + "learning_rate": 8.510864554342e-05, + "loss": 11.9507, + "step": 20610 + }, + { + "epoch": 1.1223513235724545, + "grad_norm": 0.5223463898176213, + "learning_rate": 8.509992566481042e-05, + "loss": 11.9969, + "step": 20611 + }, + { + "epoch": 1.1224057775690375, + "grad_norm": 0.5549006948857823, + "learning_rate": 8.509120590206637e-05, + "loss": 12.1299, + "step": 20612 + }, + { + "epoch": 1.1224602315656205, + "grad_norm": 0.568669376047195, + "learning_rate": 8.508248625525557e-05, + "loss": 11.8601, + "step": 20613 + }, + { + "epoch": 1.1225146855622035, + "grad_norm": 0.5619581181952749, + "learning_rate": 8.507376672444585e-05, + "loss": 12.1542, + "step": 20614 + }, + { + "epoch": 1.1225691395587865, + "grad_norm": 0.5640475607492639, + "learning_rate": 8.506504730970501e-05, + "loss": 11.9929, + "step": 20615 + }, + { + "epoch": 1.1226235935553694, + "grad_norm": 0.5495919073592045, + "learning_rate": 8.505632801110087e-05, + "loss": 11.9254, + "step": 20616 + }, + { + "epoch": 1.1226780475519524, + "grad_norm": 0.5614813120214723, + "learning_rate": 8.504760882870124e-05, + "loss": 12.0447, + "step": 20617 + }, + { + "epoch": 1.1227325015485354, + "grad_norm": 0.6016034729577902, + "learning_rate": 8.503888976257392e-05, + "loss": 12.0285, + "step": 20618 + }, + { + "epoch": 1.1227869555451186, + "grad_norm": 0.5867644357431405, + "learning_rate": 8.503017081278668e-05, + "loss": 11.9198, + "step": 20619 + }, + { + "epoch": 1.1228414095417016, + "grad_norm": 0.5127568851116143, + "learning_rate": 8.502145197940736e-05, + "loss": 11.9742, + "step": 20620 + }, + { + "epoch": 1.1228958635382846, + "grad_norm": 0.5337297068088316, + "learning_rate": 8.501273326250374e-05, + "loss": 11.9856, + "step": 20621 + }, + { + "epoch": 1.1229503175348676, + "grad_norm": 0.5461410367404895, + "learning_rate": 8.500401466214364e-05, + "loss": 12.0635, + "step": 20622 + }, + { + "epoch": 1.1230047715314506, + "grad_norm": 0.5617180325957596, + "learning_rate": 8.49952961783948e-05, + "loss": 11.954, + "step": 20623 + }, + { + "epoch": 1.1230592255280336, + "grad_norm": 0.510976442238758, + "learning_rate": 8.498657781132509e-05, + "loss": 11.8703, + "step": 20624 + }, + { + "epoch": 1.1231136795246166, + "grad_norm": 0.5383942493352548, + "learning_rate": 8.497785956100223e-05, + "loss": 12.0471, + "step": 20625 + }, + { + "epoch": 1.1231681335211996, + "grad_norm": 0.5160260825142511, + "learning_rate": 8.496914142749407e-05, + "loss": 11.9466, + "step": 20626 + }, + { + "epoch": 1.1232225875177826, + "grad_norm": 0.5590712471310877, + "learning_rate": 8.496042341086836e-05, + "loss": 12.0509, + "step": 20627 + }, + { + "epoch": 1.1232770415143656, + "grad_norm": 0.5524489743549429, + "learning_rate": 8.495170551119296e-05, + "loss": 11.9801, + "step": 20628 + }, + { + "epoch": 1.1233314955109486, + "grad_norm": 0.5421061679085142, + "learning_rate": 8.49429877285356e-05, + "loss": 11.9793, + "step": 20629 + }, + { + "epoch": 1.1233859495075316, + "grad_norm": 0.5565855504551699, + "learning_rate": 8.49342700629641e-05, + "loss": 12.1138, + "step": 20630 + }, + { + "epoch": 1.1234404035041148, + "grad_norm": 0.5393647732434288, + "learning_rate": 8.492555251454623e-05, + "loss": 12.0277, + "step": 20631 + }, + { + "epoch": 1.1234948575006978, + "grad_norm": 0.5653211907581519, + "learning_rate": 8.491683508334983e-05, + "loss": 12.0809, + "step": 20632 + }, + { + "epoch": 1.1235493114972808, + "grad_norm": 0.5566333487947905, + "learning_rate": 8.490811776944263e-05, + "loss": 11.9899, + "step": 20633 + }, + { + "epoch": 1.1236037654938638, + "grad_norm": 0.5489336043672067, + "learning_rate": 8.489940057289243e-05, + "loss": 12.0761, + "step": 20634 + }, + { + "epoch": 1.1236582194904468, + "grad_norm": 0.5431071514195215, + "learning_rate": 8.489068349376702e-05, + "loss": 12.0434, + "step": 20635 + }, + { + "epoch": 1.1237126734870297, + "grad_norm": 0.537853868515468, + "learning_rate": 8.48819665321342e-05, + "loss": 12.0849, + "step": 20636 + }, + { + "epoch": 1.1237671274836127, + "grad_norm": 0.5511346778647371, + "learning_rate": 8.487324968806173e-05, + "loss": 11.9328, + "step": 20637 + }, + { + "epoch": 1.1238215814801957, + "grad_norm": 0.5667308329210314, + "learning_rate": 8.486453296161739e-05, + "loss": 12.0361, + "step": 20638 + }, + { + "epoch": 1.1238760354767787, + "grad_norm": 0.5471565466242325, + "learning_rate": 8.485581635286901e-05, + "loss": 12.0199, + "step": 20639 + }, + { + "epoch": 1.1239304894733617, + "grad_norm": 0.6623247915833094, + "learning_rate": 8.484709986188433e-05, + "loss": 11.9894, + "step": 20640 + }, + { + "epoch": 1.1239849434699447, + "grad_norm": 0.5511119629493311, + "learning_rate": 8.483838348873116e-05, + "loss": 12.0032, + "step": 20641 + }, + { + "epoch": 1.124039397466528, + "grad_norm": 0.5424432658011794, + "learning_rate": 8.482966723347726e-05, + "loss": 12.0568, + "step": 20642 + }, + { + "epoch": 1.124093851463111, + "grad_norm": 0.5191386336139402, + "learning_rate": 8.48209510961904e-05, + "loss": 11.7916, + "step": 20643 + }, + { + "epoch": 1.124148305459694, + "grad_norm": 0.6078510380658935, + "learning_rate": 8.481223507693838e-05, + "loss": 12.0313, + "step": 20644 + }, + { + "epoch": 1.124202759456277, + "grad_norm": 0.5488151658308369, + "learning_rate": 8.480351917578896e-05, + "loss": 12.1066, + "step": 20645 + }, + { + "epoch": 1.12425721345286, + "grad_norm": 0.5545155623942513, + "learning_rate": 8.479480339280992e-05, + "loss": 12.0689, + "step": 20646 + }, + { + "epoch": 1.1243116674494429, + "grad_norm": 0.5300148673499622, + "learning_rate": 8.478608772806904e-05, + "loss": 11.94, + "step": 20647 + }, + { + "epoch": 1.1243661214460259, + "grad_norm": 0.6296302325770019, + "learning_rate": 8.47773721816341e-05, + "loss": 12.1124, + "step": 20648 + }, + { + "epoch": 1.1244205754426089, + "grad_norm": 0.5481045776550506, + "learning_rate": 8.476865675357284e-05, + "loss": 11.9786, + "step": 20649 + }, + { + "epoch": 1.1244750294391919, + "grad_norm": 0.5632361699445835, + "learning_rate": 8.475994144395307e-05, + "loss": 11.9742, + "step": 20650 + }, + { + "epoch": 1.1245294834357749, + "grad_norm": 0.6013238189165129, + "learning_rate": 8.475122625284257e-05, + "loss": 12.0224, + "step": 20651 + }, + { + "epoch": 1.1245839374323578, + "grad_norm": 0.6487355626787903, + "learning_rate": 8.474251118030912e-05, + "loss": 11.9843, + "step": 20652 + }, + { + "epoch": 1.1246383914289408, + "grad_norm": 0.5609178175685393, + "learning_rate": 8.47337962264204e-05, + "loss": 11.9898, + "step": 20653 + }, + { + "epoch": 1.1246928454255238, + "grad_norm": 0.5067012783158299, + "learning_rate": 8.472508139124426e-05, + "loss": 12.0245, + "step": 20654 + }, + { + "epoch": 1.124747299422107, + "grad_norm": 0.5781508273221143, + "learning_rate": 8.471636667484846e-05, + "loss": 12.078, + "step": 20655 + }, + { + "epoch": 1.12480175341869, + "grad_norm": 0.5277975985823132, + "learning_rate": 8.470765207730075e-05, + "loss": 12.0828, + "step": 20656 + }, + { + "epoch": 1.124856207415273, + "grad_norm": 0.5802282833173706, + "learning_rate": 8.469893759866892e-05, + "loss": 11.9441, + "step": 20657 + }, + { + "epoch": 1.124910661411856, + "grad_norm": 0.5611172893829233, + "learning_rate": 8.46902232390207e-05, + "loss": 12.1023, + "step": 20658 + }, + { + "epoch": 1.124965115408439, + "grad_norm": 0.5192570995241735, + "learning_rate": 8.468150899842387e-05, + "loss": 12.0198, + "step": 20659 + }, + { + "epoch": 1.125019569405022, + "grad_norm": 0.6822053457142676, + "learning_rate": 8.467279487694617e-05, + "loss": 12.0171, + "step": 20660 + }, + { + "epoch": 1.125074023401605, + "grad_norm": 0.5516464625410687, + "learning_rate": 8.46640808746554e-05, + "loss": 12.0902, + "step": 20661 + }, + { + "epoch": 1.125128477398188, + "grad_norm": 0.5654967257334075, + "learning_rate": 8.465536699161934e-05, + "loss": 12.095, + "step": 20662 + }, + { + "epoch": 1.125182931394771, + "grad_norm": 0.5408078521278947, + "learning_rate": 8.464665322790574e-05, + "loss": 12.0886, + "step": 20663 + }, + { + "epoch": 1.125237385391354, + "grad_norm": 0.595730130403624, + "learning_rate": 8.463793958358228e-05, + "loss": 12.0437, + "step": 20664 + }, + { + "epoch": 1.1252918393879372, + "grad_norm": 0.6521803950094307, + "learning_rate": 8.46292260587168e-05, + "loss": 12.06, + "step": 20665 + }, + { + "epoch": 1.1253462933845202, + "grad_norm": 0.4951956217474917, + "learning_rate": 8.462051265337702e-05, + "loss": 11.9553, + "step": 20666 + }, + { + "epoch": 1.1254007473811032, + "grad_norm": 0.5791223500643684, + "learning_rate": 8.461179936763074e-05, + "loss": 11.9232, + "step": 20667 + }, + { + "epoch": 1.1254552013776862, + "grad_norm": 0.5697549963250519, + "learning_rate": 8.460308620154566e-05, + "loss": 11.9406, + "step": 20668 + }, + { + "epoch": 1.1255096553742692, + "grad_norm": 0.6110789914736281, + "learning_rate": 8.459437315518959e-05, + "loss": 12.1223, + "step": 20669 + }, + { + "epoch": 1.1255641093708522, + "grad_norm": 0.5377100319564458, + "learning_rate": 8.458566022863026e-05, + "loss": 12.0876, + "step": 20670 + }, + { + "epoch": 1.1256185633674352, + "grad_norm": 0.5596473554488341, + "learning_rate": 8.457694742193538e-05, + "loss": 12.0281, + "step": 20671 + }, + { + "epoch": 1.1256730173640181, + "grad_norm": 0.6157363408101525, + "learning_rate": 8.456823473517277e-05, + "loss": 12.0079, + "step": 20672 + }, + { + "epoch": 1.1257274713606011, + "grad_norm": 0.5958012714277305, + "learning_rate": 8.45595221684102e-05, + "loss": 12.0952, + "step": 20673 + }, + { + "epoch": 1.1257819253571841, + "grad_norm": 0.5909040796004863, + "learning_rate": 8.455080972171535e-05, + "loss": 11.9936, + "step": 20674 + }, + { + "epoch": 1.1258363793537671, + "grad_norm": 0.5259277364480356, + "learning_rate": 8.454209739515594e-05, + "loss": 11.9397, + "step": 20675 + }, + { + "epoch": 1.1258908333503501, + "grad_norm": 0.6289245786484194, + "learning_rate": 8.453338518879981e-05, + "loss": 12.0402, + "step": 20676 + }, + { + "epoch": 1.125945287346933, + "grad_norm": 0.5463792922285373, + "learning_rate": 8.452467310271467e-05, + "loss": 11.9932, + "step": 20677 + }, + { + "epoch": 1.1259997413435163, + "grad_norm": 0.5498803039109769, + "learning_rate": 8.451596113696827e-05, + "loss": 12.1414, + "step": 20678 + }, + { + "epoch": 1.1260541953400993, + "grad_norm": 0.5195443857787331, + "learning_rate": 8.450724929162834e-05, + "loss": 11.9772, + "step": 20679 + }, + { + "epoch": 1.1261086493366823, + "grad_norm": 0.6088749495130134, + "learning_rate": 8.449853756676265e-05, + "loss": 12.1435, + "step": 20680 + }, + { + "epoch": 1.1261631033332653, + "grad_norm": 0.4979402048421249, + "learning_rate": 8.448982596243893e-05, + "loss": 11.9464, + "step": 20681 + }, + { + "epoch": 1.1262175573298483, + "grad_norm": 0.5471764648632118, + "learning_rate": 8.448111447872493e-05, + "loss": 11.9167, + "step": 20682 + }, + { + "epoch": 1.1262720113264313, + "grad_norm": 0.5347362131973291, + "learning_rate": 8.44724031156884e-05, + "loss": 11.9314, + "step": 20683 + }, + { + "epoch": 1.1263264653230143, + "grad_norm": 0.5300495252964539, + "learning_rate": 8.446369187339704e-05, + "loss": 11.9897, + "step": 20684 + }, + { + "epoch": 1.1263809193195973, + "grad_norm": 0.4901735506020483, + "learning_rate": 8.445498075191863e-05, + "loss": 12.0088, + "step": 20685 + }, + { + "epoch": 1.1264353733161803, + "grad_norm": 0.5439219481947628, + "learning_rate": 8.44462697513209e-05, + "loss": 12.0971, + "step": 20686 + }, + { + "epoch": 1.1264898273127633, + "grad_norm": 0.5318345497439173, + "learning_rate": 8.443755887167155e-05, + "loss": 12.0603, + "step": 20687 + }, + { + "epoch": 1.1265442813093463, + "grad_norm": 0.5827985748435938, + "learning_rate": 8.442884811303837e-05, + "loss": 11.9276, + "step": 20688 + }, + { + "epoch": 1.1265987353059295, + "grad_norm": 0.5364340700027024, + "learning_rate": 8.442013747548909e-05, + "loss": 12.1521, + "step": 20689 + }, + { + "epoch": 1.1266531893025125, + "grad_norm": 0.5170281003949356, + "learning_rate": 8.441142695909143e-05, + "loss": 11.9498, + "step": 20690 + }, + { + "epoch": 1.1267076432990955, + "grad_norm": 0.5055257982311351, + "learning_rate": 8.440271656391313e-05, + "loss": 12.0405, + "step": 20691 + }, + { + "epoch": 1.1267620972956784, + "grad_norm": 0.5428997872547985, + "learning_rate": 8.439400629002192e-05, + "loss": 12.1165, + "step": 20692 + }, + { + "epoch": 1.1268165512922614, + "grad_norm": 0.5820501425587059, + "learning_rate": 8.438529613748556e-05, + "loss": 11.9797, + "step": 20693 + }, + { + "epoch": 1.1268710052888444, + "grad_norm": 0.5406008973033072, + "learning_rate": 8.437658610637172e-05, + "loss": 11.9376, + "step": 20694 + }, + { + "epoch": 1.1269254592854274, + "grad_norm": 0.5758219611985749, + "learning_rate": 8.436787619674819e-05, + "loss": 12.0318, + "step": 20695 + }, + { + "epoch": 1.1269799132820104, + "grad_norm": 0.4994890911792227, + "learning_rate": 8.435916640868266e-05, + "loss": 11.9158, + "step": 20696 + }, + { + "epoch": 1.1270343672785934, + "grad_norm": 0.575546458026418, + "learning_rate": 8.435045674224287e-05, + "loss": 11.7971, + "step": 20697 + }, + { + "epoch": 1.1270888212751764, + "grad_norm": 0.5398853059585484, + "learning_rate": 8.434174719749654e-05, + "loss": 11.9874, + "step": 20698 + }, + { + "epoch": 1.1271432752717594, + "grad_norm": 0.6294282540302789, + "learning_rate": 8.433303777451143e-05, + "loss": 12.1287, + "step": 20699 + }, + { + "epoch": 1.1271977292683424, + "grad_norm": 0.7472791478523968, + "learning_rate": 8.432432847335525e-05, + "loss": 11.9709, + "step": 20700 + }, + { + "epoch": 1.1272521832649254, + "grad_norm": 0.5856051668569312, + "learning_rate": 8.431561929409571e-05, + "loss": 11.9004, + "step": 20701 + }, + { + "epoch": 1.1273066372615086, + "grad_norm": 0.5889737421383973, + "learning_rate": 8.430691023680055e-05, + "loss": 12.041, + "step": 20702 + }, + { + "epoch": 1.1273610912580916, + "grad_norm": 0.5480485563764228, + "learning_rate": 8.429820130153752e-05, + "loss": 12.0473, + "step": 20703 + }, + { + "epoch": 1.1274155452546746, + "grad_norm": 0.5528813501864324, + "learning_rate": 8.428949248837428e-05, + "loss": 12.1747, + "step": 20704 + }, + { + "epoch": 1.1274699992512576, + "grad_norm": 0.5446607362507545, + "learning_rate": 8.428078379737858e-05, + "loss": 11.9402, + "step": 20705 + }, + { + "epoch": 1.1275244532478406, + "grad_norm": 0.584286209918372, + "learning_rate": 8.427207522861813e-05, + "loss": 12.1177, + "step": 20706 + }, + { + "epoch": 1.1275789072444236, + "grad_norm": 0.5559404672854191, + "learning_rate": 8.426336678216066e-05, + "loss": 12.0242, + "step": 20707 + }, + { + "epoch": 1.1276333612410065, + "grad_norm": 0.6205962003625162, + "learning_rate": 8.425465845807392e-05, + "loss": 12.0569, + "step": 20708 + }, + { + "epoch": 1.1276878152375895, + "grad_norm": 0.5098974570827257, + "learning_rate": 8.424595025642555e-05, + "loss": 12.053, + "step": 20709 + }, + { + "epoch": 1.1277422692341725, + "grad_norm": 0.5212660381706562, + "learning_rate": 8.423724217728334e-05, + "loss": 12.102, + "step": 20710 + }, + { + "epoch": 1.1277967232307555, + "grad_norm": 0.5943423464276083, + "learning_rate": 8.422853422071497e-05, + "loss": 12.042, + "step": 20711 + }, + { + "epoch": 1.1278511772273387, + "grad_norm": 0.559493213992451, + "learning_rate": 8.421982638678818e-05, + "loss": 11.9594, + "step": 20712 + }, + { + "epoch": 1.1279056312239217, + "grad_norm": 0.5556647353810512, + "learning_rate": 8.421111867557068e-05, + "loss": 12.1456, + "step": 20713 + }, + { + "epoch": 1.1279600852205047, + "grad_norm": 0.6328214803425803, + "learning_rate": 8.420241108713013e-05, + "loss": 12.1421, + "step": 20714 + }, + { + "epoch": 1.1280145392170877, + "grad_norm": 0.5926666590503891, + "learning_rate": 8.419370362153431e-05, + "loss": 12.0015, + "step": 20715 + }, + { + "epoch": 1.1280689932136707, + "grad_norm": 0.5315547798326242, + "learning_rate": 8.418499627885089e-05, + "loss": 11.9904, + "step": 20716 + }, + { + "epoch": 1.1281234472102537, + "grad_norm": 0.6096039674322927, + "learning_rate": 8.417628905914758e-05, + "loss": 12.0197, + "step": 20717 + }, + { + "epoch": 1.1281779012068367, + "grad_norm": 0.5584602220091498, + "learning_rate": 8.416758196249211e-05, + "loss": 12.0328, + "step": 20718 + }, + { + "epoch": 1.1282323552034197, + "grad_norm": 0.5960657696185826, + "learning_rate": 8.415887498895219e-05, + "loss": 11.9276, + "step": 20719 + }, + { + "epoch": 1.1282868092000027, + "grad_norm": 0.6119079673118717, + "learning_rate": 8.415016813859548e-05, + "loss": 11.9412, + "step": 20720 + }, + { + "epoch": 1.1283412631965857, + "grad_norm": 0.5498559512074462, + "learning_rate": 8.414146141148972e-05, + "loss": 11.9704, + "step": 20721 + }, + { + "epoch": 1.1283957171931687, + "grad_norm": 0.542946266885336, + "learning_rate": 8.413275480770266e-05, + "loss": 12.0014, + "step": 20722 + }, + { + "epoch": 1.1284501711897517, + "grad_norm": 0.5427884476921939, + "learning_rate": 8.412404832730197e-05, + "loss": 12.0812, + "step": 20723 + }, + { + "epoch": 1.1285046251863347, + "grad_norm": 0.5462960786118958, + "learning_rate": 8.411534197035529e-05, + "loss": 11.9141, + "step": 20724 + }, + { + "epoch": 1.1285590791829179, + "grad_norm": 0.565910848925007, + "learning_rate": 8.41066357369304e-05, + "loss": 11.9949, + "step": 20725 + }, + { + "epoch": 1.1286135331795009, + "grad_norm": 0.5239689661926308, + "learning_rate": 8.409792962709497e-05, + "loss": 11.7701, + "step": 20726 + }, + { + "epoch": 1.1286679871760839, + "grad_norm": 0.5462626931379434, + "learning_rate": 8.40892236409167e-05, + "loss": 12.043, + "step": 20727 + }, + { + "epoch": 1.1287224411726668, + "grad_norm": 0.5312671667088449, + "learning_rate": 8.40805177784633e-05, + "loss": 12.0121, + "step": 20728 + }, + { + "epoch": 1.1287768951692498, + "grad_norm": 0.6380397818351647, + "learning_rate": 8.407181203980247e-05, + "loss": 12.1194, + "step": 20729 + }, + { + "epoch": 1.1288313491658328, + "grad_norm": 0.578529542017685, + "learning_rate": 8.406310642500189e-05, + "loss": 12.1208, + "step": 20730 + }, + { + "epoch": 1.1288858031624158, + "grad_norm": 0.5497148691261249, + "learning_rate": 8.405440093412927e-05, + "loss": 11.9895, + "step": 20731 + }, + { + "epoch": 1.1289402571589988, + "grad_norm": 0.5100078242558772, + "learning_rate": 8.404569556725229e-05, + "loss": 12.0079, + "step": 20732 + }, + { + "epoch": 1.1289947111555818, + "grad_norm": 0.5535915308511297, + "learning_rate": 8.403699032443871e-05, + "loss": 12.0246, + "step": 20733 + }, + { + "epoch": 1.1290491651521648, + "grad_norm": 0.6122926920072559, + "learning_rate": 8.402828520575614e-05, + "loss": 11.9979, + "step": 20734 + }, + { + "epoch": 1.129103619148748, + "grad_norm": 0.5094572839458141, + "learning_rate": 8.40195802112723e-05, + "loss": 11.7763, + "step": 20735 + }, + { + "epoch": 1.129158073145331, + "grad_norm": 0.5970999949646029, + "learning_rate": 8.401087534105485e-05, + "loss": 12.0442, + "step": 20736 + }, + { + "epoch": 1.129212527141914, + "grad_norm": 0.5621315848332539, + "learning_rate": 8.400217059517155e-05, + "loss": 12.0884, + "step": 20737 + }, + { + "epoch": 1.129266981138497, + "grad_norm": 0.6562930582146015, + "learning_rate": 8.399346597369005e-05, + "loss": 12.0669, + "step": 20738 + }, + { + "epoch": 1.12932143513508, + "grad_norm": 0.574893916287573, + "learning_rate": 8.398476147667803e-05, + "loss": 12.0122, + "step": 20739 + }, + { + "epoch": 1.129375889131663, + "grad_norm": 0.6457764055167019, + "learning_rate": 8.39760571042032e-05, + "loss": 11.9435, + "step": 20740 + }, + { + "epoch": 1.129430343128246, + "grad_norm": 0.5280157277237386, + "learning_rate": 8.396735285633324e-05, + "loss": 12.0004, + "step": 20741 + }, + { + "epoch": 1.129484797124829, + "grad_norm": 0.5123033419856629, + "learning_rate": 8.395864873313584e-05, + "loss": 11.9824, + "step": 20742 + }, + { + "epoch": 1.129539251121412, + "grad_norm": 0.4986953868214987, + "learning_rate": 8.394994473467869e-05, + "loss": 11.8879, + "step": 20743 + }, + { + "epoch": 1.129593705117995, + "grad_norm": 0.5713935244026771, + "learning_rate": 8.394124086102943e-05, + "loss": 11.8317, + "step": 20744 + }, + { + "epoch": 1.129648159114578, + "grad_norm": 0.5064476313565307, + "learning_rate": 8.393253711225579e-05, + "loss": 11.8824, + "step": 20745 + }, + { + "epoch": 1.129702613111161, + "grad_norm": 0.5383477103141034, + "learning_rate": 8.392383348842543e-05, + "loss": 12.062, + "step": 20746 + }, + { + "epoch": 1.129757067107744, + "grad_norm": 0.6857506568266057, + "learning_rate": 8.3915129989606e-05, + "loss": 12.0129, + "step": 20747 + }, + { + "epoch": 1.1298115211043271, + "grad_norm": 0.6066521891673776, + "learning_rate": 8.390642661586528e-05, + "loss": 12.1521, + "step": 20748 + }, + { + "epoch": 1.1298659751009101, + "grad_norm": 0.604582705656562, + "learning_rate": 8.389772336727084e-05, + "loss": 12.0386, + "step": 20749 + }, + { + "epoch": 1.1299204290974931, + "grad_norm": 0.545887271751831, + "learning_rate": 8.388902024389042e-05, + "loss": 12.0592, + "step": 20750 + }, + { + "epoch": 1.1299748830940761, + "grad_norm": 0.6514439487333721, + "learning_rate": 8.388031724579169e-05, + "loss": 12.2167, + "step": 20751 + }, + { + "epoch": 1.1300293370906591, + "grad_norm": 0.544262019270203, + "learning_rate": 8.387161437304232e-05, + "loss": 11.9321, + "step": 20752 + }, + { + "epoch": 1.130083791087242, + "grad_norm": 0.5831675179655477, + "learning_rate": 8.386291162570998e-05, + "loss": 11.9923, + "step": 20753 + }, + { + "epoch": 1.130138245083825, + "grad_norm": 0.5468630280688331, + "learning_rate": 8.385420900386234e-05, + "loss": 11.8361, + "step": 20754 + }, + { + "epoch": 1.130192699080408, + "grad_norm": 0.5239437358619802, + "learning_rate": 8.384550650756707e-05, + "loss": 11.8683, + "step": 20755 + }, + { + "epoch": 1.130247153076991, + "grad_norm": 0.5770978379625384, + "learning_rate": 8.383680413689186e-05, + "loss": 11.9685, + "step": 20756 + }, + { + "epoch": 1.130301607073574, + "grad_norm": 0.5528615214271068, + "learning_rate": 8.382810189190438e-05, + "loss": 12.027, + "step": 20757 + }, + { + "epoch": 1.1303560610701573, + "grad_norm": 0.5572352510964343, + "learning_rate": 8.381939977267225e-05, + "loss": 11.9764, + "step": 20758 + }, + { + "epoch": 1.1304105150667403, + "grad_norm": 0.5438921781892165, + "learning_rate": 8.38106977792632e-05, + "loss": 11.9362, + "step": 20759 + }, + { + "epoch": 1.1304649690633233, + "grad_norm": 0.6162761724338172, + "learning_rate": 8.38019959117449e-05, + "loss": 11.9992, + "step": 20760 + }, + { + "epoch": 1.1305194230599063, + "grad_norm": 0.5644696651643153, + "learning_rate": 8.379329417018497e-05, + "loss": 11.9646, + "step": 20761 + }, + { + "epoch": 1.1305738770564893, + "grad_norm": 0.661556556635499, + "learning_rate": 8.378459255465112e-05, + "loss": 12.0489, + "step": 20762 + }, + { + "epoch": 1.1306283310530723, + "grad_norm": 0.5901163540090555, + "learning_rate": 8.377589106521101e-05, + "loss": 12.0038, + "step": 20763 + }, + { + "epoch": 1.1306827850496552, + "grad_norm": 0.5887311343504869, + "learning_rate": 8.376718970193229e-05, + "loss": 12.1154, + "step": 20764 + }, + { + "epoch": 1.1307372390462382, + "grad_norm": 0.6280187554043807, + "learning_rate": 8.375848846488262e-05, + "loss": 12.0463, + "step": 20765 + }, + { + "epoch": 1.1307916930428212, + "grad_norm": 0.5887668368648779, + "learning_rate": 8.374978735412965e-05, + "loss": 12.0367, + "step": 20766 + }, + { + "epoch": 1.1308461470394042, + "grad_norm": 0.5647871665134128, + "learning_rate": 8.374108636974107e-05, + "loss": 11.8751, + "step": 20767 + }, + { + "epoch": 1.1309006010359872, + "grad_norm": 0.5375661928587627, + "learning_rate": 8.373238551178453e-05, + "loss": 12.0289, + "step": 20768 + }, + { + "epoch": 1.1309550550325702, + "grad_norm": 0.5365046984815546, + "learning_rate": 8.372368478032765e-05, + "loss": 12.0516, + "step": 20769 + }, + { + "epoch": 1.1310095090291532, + "grad_norm": 0.57074124799673, + "learning_rate": 8.371498417543817e-05, + "loss": 12.1869, + "step": 20770 + }, + { + "epoch": 1.1310639630257362, + "grad_norm": 0.5485043911729385, + "learning_rate": 8.37062836971837e-05, + "loss": 12.1736, + "step": 20771 + }, + { + "epoch": 1.1311184170223194, + "grad_norm": 0.5538806730895031, + "learning_rate": 8.369758334563189e-05, + "loss": 12.04, + "step": 20772 + }, + { + "epoch": 1.1311728710189024, + "grad_norm": 0.5623222575053048, + "learning_rate": 8.368888312085043e-05, + "loss": 12.0119, + "step": 20773 + }, + { + "epoch": 1.1312273250154854, + "grad_norm": 0.5860184149076423, + "learning_rate": 8.368018302290694e-05, + "loss": 12.0345, + "step": 20774 + }, + { + "epoch": 1.1312817790120684, + "grad_norm": 0.5368386044729148, + "learning_rate": 8.367148305186907e-05, + "loss": 12.0709, + "step": 20775 + }, + { + "epoch": 1.1313362330086514, + "grad_norm": 0.6503054542888462, + "learning_rate": 8.366278320780449e-05, + "loss": 12.1014, + "step": 20776 + }, + { + "epoch": 1.1313906870052344, + "grad_norm": 0.5586135419144223, + "learning_rate": 8.365408349078085e-05, + "loss": 12.0123, + "step": 20777 + }, + { + "epoch": 1.1314451410018174, + "grad_norm": 0.5508012796026294, + "learning_rate": 8.36453839008658e-05, + "loss": 11.9524, + "step": 20778 + }, + { + "epoch": 1.1314995949984004, + "grad_norm": 0.5687890564258543, + "learning_rate": 8.3636684438127e-05, + "loss": 12.0393, + "step": 20779 + }, + { + "epoch": 1.1315540489949834, + "grad_norm": 0.5505202177610155, + "learning_rate": 8.362798510263205e-05, + "loss": 11.9455, + "step": 20780 + }, + { + "epoch": 1.1316085029915663, + "grad_norm": 0.5414750858393832, + "learning_rate": 8.361928589444865e-05, + "loss": 11.9601, + "step": 20781 + }, + { + "epoch": 1.1316629569881496, + "grad_norm": 0.5258501960877228, + "learning_rate": 8.361058681364442e-05, + "loss": 11.9788, + "step": 20782 + }, + { + "epoch": 1.1317174109847326, + "grad_norm": 0.5440550629269573, + "learning_rate": 8.360188786028707e-05, + "loss": 11.9825, + "step": 20783 + }, + { + "epoch": 1.1317718649813155, + "grad_norm": 0.5351544035011582, + "learning_rate": 8.359318903444416e-05, + "loss": 12.0982, + "step": 20784 + }, + { + "epoch": 1.1318263189778985, + "grad_norm": 0.5323293259840147, + "learning_rate": 8.358449033618334e-05, + "loss": 11.9676, + "step": 20785 + }, + { + "epoch": 1.1318807729744815, + "grad_norm": 0.5529529749133446, + "learning_rate": 8.357579176557228e-05, + "loss": 11.8974, + "step": 20786 + }, + { + "epoch": 1.1319352269710645, + "grad_norm": 0.590287551039442, + "learning_rate": 8.356709332267863e-05, + "loss": 11.9511, + "step": 20787 + }, + { + "epoch": 1.1319896809676475, + "grad_norm": 0.5125296807104094, + "learning_rate": 8.355839500757002e-05, + "loss": 12.0024, + "step": 20788 + }, + { + "epoch": 1.1320441349642305, + "grad_norm": 0.5804900466222254, + "learning_rate": 8.354969682031407e-05, + "loss": 12.1384, + "step": 20789 + }, + { + "epoch": 1.1320985889608135, + "grad_norm": 0.5368063847193392, + "learning_rate": 8.354099876097845e-05, + "loss": 11.9719, + "step": 20790 + }, + { + "epoch": 1.1321530429573965, + "grad_norm": 0.5779207309703169, + "learning_rate": 8.353230082963078e-05, + "loss": 11.9777, + "step": 20791 + }, + { + "epoch": 1.1322074969539795, + "grad_norm": 0.5739429398866446, + "learning_rate": 8.352360302633868e-05, + "loss": 12.0905, + "step": 20792 + }, + { + "epoch": 1.1322619509505625, + "grad_norm": 0.5300896274934617, + "learning_rate": 8.351490535116987e-05, + "loss": 12.0226, + "step": 20793 + }, + { + "epoch": 1.1323164049471455, + "grad_norm": 0.571155766038941, + "learning_rate": 8.35062078041919e-05, + "loss": 12.0976, + "step": 20794 + }, + { + "epoch": 1.1323708589437287, + "grad_norm": 0.564676725194143, + "learning_rate": 8.349751038547239e-05, + "loss": 11.9845, + "step": 20795 + }, + { + "epoch": 1.1324253129403117, + "grad_norm": 0.5337945908674252, + "learning_rate": 8.3488813095079e-05, + "loss": 11.9669, + "step": 20796 + }, + { + "epoch": 1.1324797669368947, + "grad_norm": 0.5344172893573053, + "learning_rate": 8.348011593307939e-05, + "loss": 11.9563, + "step": 20797 + }, + { + "epoch": 1.1325342209334777, + "grad_norm": 0.5806123347477514, + "learning_rate": 8.347141889954117e-05, + "loss": 12.061, + "step": 20798 + }, + { + "epoch": 1.1325886749300607, + "grad_norm": 0.5690743516430177, + "learning_rate": 8.346272199453196e-05, + "loss": 12.024, + "step": 20799 + }, + { + "epoch": 1.1326431289266436, + "grad_norm": 0.5225427525492036, + "learning_rate": 8.34540252181194e-05, + "loss": 11.9932, + "step": 20800 + }, + { + "epoch": 1.1326975829232266, + "grad_norm": 0.5668045251543957, + "learning_rate": 8.344532857037113e-05, + "loss": 11.9542, + "step": 20801 + }, + { + "epoch": 1.1327520369198096, + "grad_norm": 0.591275603933892, + "learning_rate": 8.343663205135474e-05, + "loss": 12.0476, + "step": 20802 + }, + { + "epoch": 1.1328064909163926, + "grad_norm": 0.5238304872807422, + "learning_rate": 8.342793566113787e-05, + "loss": 12.0662, + "step": 20803 + }, + { + "epoch": 1.1328609449129756, + "grad_norm": 0.5531231281251932, + "learning_rate": 8.341923939978821e-05, + "loss": 11.9729, + "step": 20804 + }, + { + "epoch": 1.1329153989095588, + "grad_norm": 0.5525939622220033, + "learning_rate": 8.341054326737327e-05, + "loss": 11.9551, + "step": 20805 + }, + { + "epoch": 1.1329698529061418, + "grad_norm": 0.5020694928463673, + "learning_rate": 8.340184726396076e-05, + "loss": 11.9396, + "step": 20806 + }, + { + "epoch": 1.1330243069027248, + "grad_norm": 0.4834336628662161, + "learning_rate": 8.339315138961821e-05, + "loss": 11.9399, + "step": 20807 + }, + { + "epoch": 1.1330787608993078, + "grad_norm": 0.5434950420977097, + "learning_rate": 8.338445564441335e-05, + "loss": 12.0291, + "step": 20808 + }, + { + "epoch": 1.1331332148958908, + "grad_norm": 0.5607777847547193, + "learning_rate": 8.337576002841375e-05, + "loss": 12.1495, + "step": 20809 + }, + { + "epoch": 1.1331876688924738, + "grad_norm": 0.5272975127627831, + "learning_rate": 8.336706454168701e-05, + "loss": 12.1168, + "step": 20810 + }, + { + "epoch": 1.1332421228890568, + "grad_norm": 0.5269370953237635, + "learning_rate": 8.335836918430075e-05, + "loss": 12.1105, + "step": 20811 + }, + { + "epoch": 1.1332965768856398, + "grad_norm": 0.5280620247681213, + "learning_rate": 8.334967395632264e-05, + "loss": 12.0228, + "step": 20812 + }, + { + "epoch": 1.1333510308822228, + "grad_norm": 0.5604435021674699, + "learning_rate": 8.334097885782024e-05, + "loss": 12.0042, + "step": 20813 + }, + { + "epoch": 1.1334054848788058, + "grad_norm": 0.4949261064367269, + "learning_rate": 8.333228388886121e-05, + "loss": 11.9785, + "step": 20814 + }, + { + "epoch": 1.1334599388753888, + "grad_norm": 0.5435580938905246, + "learning_rate": 8.33235890495131e-05, + "loss": 12.0545, + "step": 20815 + }, + { + "epoch": 1.1335143928719718, + "grad_norm": 0.5391843622721241, + "learning_rate": 8.331489433984357e-05, + "loss": 12.1, + "step": 20816 + }, + { + "epoch": 1.1335688468685547, + "grad_norm": 0.5577771887455933, + "learning_rate": 8.330619975992021e-05, + "loss": 12.0138, + "step": 20817 + }, + { + "epoch": 1.133623300865138, + "grad_norm": 0.5249232630778278, + "learning_rate": 8.329750530981064e-05, + "loss": 11.9539, + "step": 20818 + }, + { + "epoch": 1.133677754861721, + "grad_norm": 0.5104218897688741, + "learning_rate": 8.328881098958246e-05, + "loss": 12.0681, + "step": 20819 + }, + { + "epoch": 1.133732208858304, + "grad_norm": 0.5857135309788942, + "learning_rate": 8.32801167993033e-05, + "loss": 11.8939, + "step": 20820 + }, + { + "epoch": 1.133786662854887, + "grad_norm": 0.6147682545125815, + "learning_rate": 8.327142273904078e-05, + "loss": 12.0024, + "step": 20821 + }, + { + "epoch": 1.13384111685147, + "grad_norm": 0.5800537403969274, + "learning_rate": 8.326272880886245e-05, + "loss": 12.0978, + "step": 20822 + }, + { + "epoch": 1.133895570848053, + "grad_norm": 0.5212552774853718, + "learning_rate": 8.325403500883597e-05, + "loss": 11.96, + "step": 20823 + }, + { + "epoch": 1.133950024844636, + "grad_norm": 0.5668164666074186, + "learning_rate": 8.324534133902892e-05, + "loss": 11.9927, + "step": 20824 + }, + { + "epoch": 1.134004478841219, + "grad_norm": 0.5557225954644414, + "learning_rate": 8.32366477995089e-05, + "loss": 12.0414, + "step": 20825 + }, + { + "epoch": 1.134058932837802, + "grad_norm": 0.536066984690846, + "learning_rate": 8.322795439034352e-05, + "loss": 12.0549, + "step": 20826 + }, + { + "epoch": 1.134113386834385, + "grad_norm": 0.5094212987549769, + "learning_rate": 8.321926111160038e-05, + "loss": 11.9891, + "step": 20827 + }, + { + "epoch": 1.134167840830968, + "grad_norm": 0.6266210587123217, + "learning_rate": 8.321056796334707e-05, + "loss": 12.0174, + "step": 20828 + }, + { + "epoch": 1.134222294827551, + "grad_norm": 0.5920350735747615, + "learning_rate": 8.320187494565121e-05, + "loss": 12.1067, + "step": 20829 + }, + { + "epoch": 1.134276748824134, + "grad_norm": 0.5157900799792909, + "learning_rate": 8.319318205858036e-05, + "loss": 11.9817, + "step": 20830 + }, + { + "epoch": 1.134331202820717, + "grad_norm": 0.5484848096232319, + "learning_rate": 8.318448930220216e-05, + "loss": 11.996, + "step": 20831 + }, + { + "epoch": 1.1343856568173, + "grad_norm": 0.4911315591287539, + "learning_rate": 8.31757966765842e-05, + "loss": 11.7924, + "step": 20832 + }, + { + "epoch": 1.134440110813883, + "grad_norm": 0.48866603006549925, + "learning_rate": 8.316710418179406e-05, + "loss": 11.8883, + "step": 20833 + }, + { + "epoch": 1.134494564810466, + "grad_norm": 0.5355135372081358, + "learning_rate": 8.315841181789937e-05, + "loss": 11.9627, + "step": 20834 + }, + { + "epoch": 1.134549018807049, + "grad_norm": 0.5275092246816174, + "learning_rate": 8.314971958496766e-05, + "loss": 11.9631, + "step": 20835 + }, + { + "epoch": 1.134603472803632, + "grad_norm": 0.5295825837600786, + "learning_rate": 8.314102748306659e-05, + "loss": 11.9989, + "step": 20836 + }, + { + "epoch": 1.134657926800215, + "grad_norm": 0.5321704154501747, + "learning_rate": 8.313233551226369e-05, + "loss": 12.0102, + "step": 20837 + }, + { + "epoch": 1.134712380796798, + "grad_norm": 0.5254003594455219, + "learning_rate": 8.312364367262658e-05, + "loss": 12.021, + "step": 20838 + }, + { + "epoch": 1.134766834793381, + "grad_norm": 0.5216226383713083, + "learning_rate": 8.311495196422284e-05, + "loss": 11.9733, + "step": 20839 + }, + { + "epoch": 1.134821288789964, + "grad_norm": 0.5459613177918216, + "learning_rate": 8.310626038712007e-05, + "loss": 12.0244, + "step": 20840 + }, + { + "epoch": 1.1348757427865472, + "grad_norm": 0.5404943272558749, + "learning_rate": 8.309756894138583e-05, + "loss": 12.1336, + "step": 20841 + }, + { + "epoch": 1.1349301967831302, + "grad_norm": 0.48839452700605696, + "learning_rate": 8.308887762708776e-05, + "loss": 12.013, + "step": 20842 + }, + { + "epoch": 1.1349846507797132, + "grad_norm": 0.5520083087994961, + "learning_rate": 8.30801864442934e-05, + "loss": 12.0592, + "step": 20843 + }, + { + "epoch": 1.1350391047762962, + "grad_norm": 0.5462084428929227, + "learning_rate": 8.30714953930704e-05, + "loss": 11.9144, + "step": 20844 + }, + { + "epoch": 1.1350935587728792, + "grad_norm": 0.5471507247040064, + "learning_rate": 8.306280447348622e-05, + "loss": 11.8395, + "step": 20845 + }, + { + "epoch": 1.1351480127694622, + "grad_norm": 0.5501453700212303, + "learning_rate": 8.305411368560854e-05, + "loss": 11.9186, + "step": 20846 + }, + { + "epoch": 1.1352024667660452, + "grad_norm": 0.5454816436293293, + "learning_rate": 8.304542302950491e-05, + "loss": 11.9424, + "step": 20847 + }, + { + "epoch": 1.1352569207626282, + "grad_norm": 0.4800147172461387, + "learning_rate": 8.303673250524293e-05, + "loss": 11.9729, + "step": 20848 + }, + { + "epoch": 1.1353113747592112, + "grad_norm": 0.5395301056753729, + "learning_rate": 8.302804211289015e-05, + "loss": 11.9276, + "step": 20849 + }, + { + "epoch": 1.1353658287557942, + "grad_norm": 0.545924304880344, + "learning_rate": 8.301935185251415e-05, + "loss": 11.9468, + "step": 20850 + }, + { + "epoch": 1.1354202827523772, + "grad_norm": 0.5531911619321243, + "learning_rate": 8.301066172418252e-05, + "loss": 12.0504, + "step": 20851 + }, + { + "epoch": 1.1354747367489604, + "grad_norm": 0.5295766318287469, + "learning_rate": 8.300197172796283e-05, + "loss": 12.0351, + "step": 20852 + }, + { + "epoch": 1.1355291907455434, + "grad_norm": 0.5322049728341919, + "learning_rate": 8.299328186392266e-05, + "loss": 11.9966, + "step": 20853 + }, + { + "epoch": 1.1355836447421264, + "grad_norm": 0.5081287053833177, + "learning_rate": 8.298459213212964e-05, + "loss": 11.9673, + "step": 20854 + }, + { + "epoch": 1.1356380987387094, + "grad_norm": 0.6124065673217175, + "learning_rate": 8.297590253265125e-05, + "loss": 11.983, + "step": 20855 + }, + { + "epoch": 1.1356925527352923, + "grad_norm": 0.53898433722122, + "learning_rate": 8.296721306555505e-05, + "loss": 11.9183, + "step": 20856 + }, + { + "epoch": 1.1357470067318753, + "grad_norm": 0.5382650722310977, + "learning_rate": 8.295852373090869e-05, + "loss": 12.0799, + "step": 20857 + }, + { + "epoch": 1.1358014607284583, + "grad_norm": 0.561854909778086, + "learning_rate": 8.294983452877971e-05, + "loss": 11.9602, + "step": 20858 + }, + { + "epoch": 1.1358559147250413, + "grad_norm": 0.6108217648046498, + "learning_rate": 8.294114545923567e-05, + "loss": 11.9697, + "step": 20859 + }, + { + "epoch": 1.1359103687216243, + "grad_norm": 0.5682286780819156, + "learning_rate": 8.293245652234415e-05, + "loss": 12.0881, + "step": 20860 + }, + { + "epoch": 1.1359648227182073, + "grad_norm": 0.6408087879625746, + "learning_rate": 8.29237677181727e-05, + "loss": 12.0791, + "step": 20861 + }, + { + "epoch": 1.1360192767147903, + "grad_norm": 0.5605205521100582, + "learning_rate": 8.291507904678892e-05, + "loss": 12.1151, + "step": 20862 + }, + { + "epoch": 1.1360737307113733, + "grad_norm": 0.5605707280569578, + "learning_rate": 8.29063905082603e-05, + "loss": 12.0089, + "step": 20863 + }, + { + "epoch": 1.1361281847079563, + "grad_norm": 0.5012292664038457, + "learning_rate": 8.289770210265453e-05, + "loss": 11.9938, + "step": 20864 + }, + { + "epoch": 1.1361826387045395, + "grad_norm": 0.5684573322543377, + "learning_rate": 8.288901383003907e-05, + "loss": 12.0373, + "step": 20865 + }, + { + "epoch": 1.1362370927011225, + "grad_norm": 0.6351742069265385, + "learning_rate": 8.288032569048148e-05, + "loss": 12.0249, + "step": 20866 + }, + { + "epoch": 1.1362915466977055, + "grad_norm": 0.5222002852865439, + "learning_rate": 8.287163768404934e-05, + "loss": 12.0471, + "step": 20867 + }, + { + "epoch": 1.1363460006942885, + "grad_norm": 0.5203810318401516, + "learning_rate": 8.286294981081024e-05, + "loss": 11.9294, + "step": 20868 + }, + { + "epoch": 1.1364004546908715, + "grad_norm": 0.57972478770493, + "learning_rate": 8.285426207083171e-05, + "loss": 12.0424, + "step": 20869 + }, + { + "epoch": 1.1364549086874545, + "grad_norm": 0.5544463610879068, + "learning_rate": 8.284557446418133e-05, + "loss": 12.0597, + "step": 20870 + }, + { + "epoch": 1.1365093626840375, + "grad_norm": 0.5035943721139681, + "learning_rate": 8.283688699092662e-05, + "loss": 11.9878, + "step": 20871 + }, + { + "epoch": 1.1365638166806205, + "grad_norm": 0.5344768186437175, + "learning_rate": 8.282819965113516e-05, + "loss": 12.0697, + "step": 20872 + }, + { + "epoch": 1.1366182706772034, + "grad_norm": 0.5616838941972153, + "learning_rate": 8.281951244487452e-05, + "loss": 11.989, + "step": 20873 + }, + { + "epoch": 1.1366727246737864, + "grad_norm": 0.5391340768095988, + "learning_rate": 8.281082537221223e-05, + "loss": 11.9472, + "step": 20874 + }, + { + "epoch": 1.1367271786703697, + "grad_norm": 0.6130211524475987, + "learning_rate": 8.280213843321583e-05, + "loss": 12.1204, + "step": 20875 + }, + { + "epoch": 1.1367816326669526, + "grad_norm": 0.522993663207442, + "learning_rate": 8.279345162795288e-05, + "loss": 12.0767, + "step": 20876 + }, + { + "epoch": 1.1368360866635356, + "grad_norm": 0.5704096775661565, + "learning_rate": 8.278476495649094e-05, + "loss": 12.0927, + "step": 20877 + }, + { + "epoch": 1.1368905406601186, + "grad_norm": 0.5182424211696456, + "learning_rate": 8.277607841889754e-05, + "loss": 11.9097, + "step": 20878 + }, + { + "epoch": 1.1369449946567016, + "grad_norm": 0.5664499558712293, + "learning_rate": 8.276739201524026e-05, + "loss": 11.9551, + "step": 20879 + }, + { + "epoch": 1.1369994486532846, + "grad_norm": 0.5124961895977534, + "learning_rate": 8.275870574558661e-05, + "loss": 11.9613, + "step": 20880 + }, + { + "epoch": 1.1370539026498676, + "grad_norm": 0.49355180719984976, + "learning_rate": 8.275001961000418e-05, + "loss": 12.0667, + "step": 20881 + }, + { + "epoch": 1.1371083566464506, + "grad_norm": 0.5412642353635919, + "learning_rate": 8.27413336085605e-05, + "loss": 12.0697, + "step": 20882 + }, + { + "epoch": 1.1371628106430336, + "grad_norm": 0.5478756978199564, + "learning_rate": 8.273264774132308e-05, + "loss": 11.9211, + "step": 20883 + }, + { + "epoch": 1.1372172646396166, + "grad_norm": 0.5521736725057819, + "learning_rate": 8.272396200835952e-05, + "loss": 12.0213, + "step": 20884 + }, + { + "epoch": 1.1372717186361996, + "grad_norm": 0.5305067708369225, + "learning_rate": 8.27152764097373e-05, + "loss": 11.9613, + "step": 20885 + }, + { + "epoch": 1.1373261726327826, + "grad_norm": 0.5280367029753679, + "learning_rate": 8.270659094552399e-05, + "loss": 12.0564, + "step": 20886 + }, + { + "epoch": 1.1373806266293656, + "grad_norm": 0.523282340775394, + "learning_rate": 8.269790561578713e-05, + "loss": 11.9566, + "step": 20887 + }, + { + "epoch": 1.1374350806259488, + "grad_norm": 0.5073888625187748, + "learning_rate": 8.268922042059426e-05, + "loss": 11.9844, + "step": 20888 + }, + { + "epoch": 1.1374895346225318, + "grad_norm": 0.5630654298574181, + "learning_rate": 8.268053536001291e-05, + "loss": 11.9883, + "step": 20889 + }, + { + "epoch": 1.1375439886191148, + "grad_norm": 0.49276678646697886, + "learning_rate": 8.26718504341106e-05, + "loss": 11.952, + "step": 20890 + }, + { + "epoch": 1.1375984426156978, + "grad_norm": 0.6022868989355634, + "learning_rate": 8.266316564295492e-05, + "loss": 12.0448, + "step": 20891 + }, + { + "epoch": 1.1376528966122808, + "grad_norm": 0.4980652688408364, + "learning_rate": 8.265448098661337e-05, + "loss": 12.0342, + "step": 20892 + }, + { + "epoch": 1.1377073506088637, + "grad_norm": 0.4817094395889994, + "learning_rate": 8.264579646515347e-05, + "loss": 11.9591, + "step": 20893 + }, + { + "epoch": 1.1377618046054467, + "grad_norm": 0.5647523062277882, + "learning_rate": 8.26371120786428e-05, + "loss": 12.1397, + "step": 20894 + }, + { + "epoch": 1.1378162586020297, + "grad_norm": 0.5375555758113099, + "learning_rate": 8.262842782714884e-05, + "loss": 11.9929, + "step": 20895 + }, + { + "epoch": 1.1378707125986127, + "grad_norm": 0.49082208075832834, + "learning_rate": 8.261974371073913e-05, + "loss": 11.9572, + "step": 20896 + }, + { + "epoch": 1.1379251665951957, + "grad_norm": 0.5972417861121767, + "learning_rate": 8.26110597294812e-05, + "loss": 12.2294, + "step": 20897 + }, + { + "epoch": 1.137979620591779, + "grad_norm": 0.5170383175147765, + "learning_rate": 8.26023758834426e-05, + "loss": 12.0637, + "step": 20898 + }, + { + "epoch": 1.138034074588362, + "grad_norm": 0.5175739439103323, + "learning_rate": 8.259369217269084e-05, + "loss": 11.8461, + "step": 20899 + }, + { + "epoch": 1.138088528584945, + "grad_norm": 0.6240727278872582, + "learning_rate": 8.258500859729345e-05, + "loss": 12.1108, + "step": 20900 + }, + { + "epoch": 1.138142982581528, + "grad_norm": 0.5173425148012402, + "learning_rate": 8.257632515731793e-05, + "loss": 12.1129, + "step": 20901 + }, + { + "epoch": 1.138197436578111, + "grad_norm": 0.5384766030862483, + "learning_rate": 8.256764185283184e-05, + "loss": 12.031, + "step": 20902 + }, + { + "epoch": 1.138251890574694, + "grad_norm": 0.4990048614018597, + "learning_rate": 8.25589586839027e-05, + "loss": 11.9201, + "step": 20903 + }, + { + "epoch": 1.1383063445712769, + "grad_norm": 0.5347219549756849, + "learning_rate": 8.255027565059806e-05, + "loss": 12.0611, + "step": 20904 + }, + { + "epoch": 1.1383607985678599, + "grad_norm": 0.523376663435599, + "learning_rate": 8.254159275298533e-05, + "loss": 11.9876, + "step": 20905 + }, + { + "epoch": 1.1384152525644429, + "grad_norm": 0.49233654599900334, + "learning_rate": 8.253290999113215e-05, + "loss": 12.0099, + "step": 20906 + }, + { + "epoch": 1.1384697065610259, + "grad_norm": 0.5117814584178179, + "learning_rate": 8.252422736510597e-05, + "loss": 12.0075, + "step": 20907 + }, + { + "epoch": 1.1385241605576089, + "grad_norm": 0.6113436907536076, + "learning_rate": 8.251554487497436e-05, + "loss": 12.0207, + "step": 20908 + }, + { + "epoch": 1.1385786145541918, + "grad_norm": 0.5237525973981209, + "learning_rate": 8.250686252080478e-05, + "loss": 12.0911, + "step": 20909 + }, + { + "epoch": 1.1386330685507748, + "grad_norm": 0.5453306120977369, + "learning_rate": 8.249818030266476e-05, + "loss": 11.9071, + "step": 20910 + }, + { + "epoch": 1.138687522547358, + "grad_norm": 0.5957861253600286, + "learning_rate": 8.248949822062182e-05, + "loss": 12.0551, + "step": 20911 + }, + { + "epoch": 1.138741976543941, + "grad_norm": 0.5245725217612074, + "learning_rate": 8.248081627474349e-05, + "loss": 12.0185, + "step": 20912 + }, + { + "epoch": 1.138796430540524, + "grad_norm": 0.5930235919132725, + "learning_rate": 8.247213446509728e-05, + "loss": 11.9996, + "step": 20913 + }, + { + "epoch": 1.138850884537107, + "grad_norm": 0.5450954577191609, + "learning_rate": 8.246345279175073e-05, + "loss": 11.9504, + "step": 20914 + }, + { + "epoch": 1.13890533853369, + "grad_norm": 0.5530840617194773, + "learning_rate": 8.245477125477125e-05, + "loss": 12.01, + "step": 20915 + }, + { + "epoch": 1.138959792530273, + "grad_norm": 0.5633213277208992, + "learning_rate": 8.244608985422641e-05, + "loss": 12.0457, + "step": 20916 + }, + { + "epoch": 1.139014246526856, + "grad_norm": 0.6085785006143246, + "learning_rate": 8.243740859018375e-05, + "loss": 12.1239, + "step": 20917 + }, + { + "epoch": 1.139068700523439, + "grad_norm": 0.5330646862283069, + "learning_rate": 8.242872746271073e-05, + "loss": 11.9485, + "step": 20918 + }, + { + "epoch": 1.139123154520022, + "grad_norm": 0.5364267288916039, + "learning_rate": 8.242004647187489e-05, + "loss": 12.0183, + "step": 20919 + }, + { + "epoch": 1.139177608516605, + "grad_norm": 0.5496380663520917, + "learning_rate": 8.24113656177437e-05, + "loss": 12.0552, + "step": 20920 + }, + { + "epoch": 1.139232062513188, + "grad_norm": 0.5371979806699659, + "learning_rate": 8.240268490038469e-05, + "loss": 12.0438, + "step": 20921 + }, + { + "epoch": 1.1392865165097712, + "grad_norm": 0.5498816405632554, + "learning_rate": 8.239400431986535e-05, + "loss": 11.9895, + "step": 20922 + }, + { + "epoch": 1.1393409705063542, + "grad_norm": 0.5625453952844264, + "learning_rate": 8.238532387625315e-05, + "loss": 11.9807, + "step": 20923 + }, + { + "epoch": 1.1393954245029372, + "grad_norm": 0.5611986634358086, + "learning_rate": 8.23766435696157e-05, + "loss": 12.0085, + "step": 20924 + }, + { + "epoch": 1.1394498784995202, + "grad_norm": 0.5377483954179323, + "learning_rate": 8.236796340002038e-05, + "loss": 12.0947, + "step": 20925 + }, + { + "epoch": 1.1395043324961032, + "grad_norm": 0.5262368616779615, + "learning_rate": 8.235928336753475e-05, + "loss": 11.9465, + "step": 20926 + }, + { + "epoch": 1.1395587864926862, + "grad_norm": 0.5223377240065364, + "learning_rate": 8.235060347222625e-05, + "loss": 12.0398, + "step": 20927 + }, + { + "epoch": 1.1396132404892692, + "grad_norm": 0.5642763894207888, + "learning_rate": 8.234192371416245e-05, + "loss": 11.8982, + "step": 20928 + }, + { + "epoch": 1.1396676944858521, + "grad_norm": 0.5019674705192412, + "learning_rate": 8.233324409341081e-05, + "loss": 12.0971, + "step": 20929 + }, + { + "epoch": 1.1397221484824351, + "grad_norm": 0.5285775151462369, + "learning_rate": 8.232456461003882e-05, + "loss": 11.9557, + "step": 20930 + }, + { + "epoch": 1.1397766024790181, + "grad_norm": 0.5729624412586914, + "learning_rate": 8.231588526411398e-05, + "loss": 12.1298, + "step": 20931 + }, + { + "epoch": 1.1398310564756011, + "grad_norm": 0.5606389022362214, + "learning_rate": 8.230720605570379e-05, + "loss": 11.9898, + "step": 20932 + }, + { + "epoch": 1.1398855104721841, + "grad_norm": 0.5903555006233688, + "learning_rate": 8.229852698487572e-05, + "loss": 11.9642, + "step": 20933 + }, + { + "epoch": 1.139939964468767, + "grad_norm": 0.6299819004174931, + "learning_rate": 8.228984805169732e-05, + "loss": 12.1306, + "step": 20934 + }, + { + "epoch": 1.1399944184653503, + "grad_norm": 0.5577541227970585, + "learning_rate": 8.228116925623599e-05, + "loss": 12.029, + "step": 20935 + }, + { + "epoch": 1.1400488724619333, + "grad_norm": 0.5175393691618557, + "learning_rate": 8.227249059855926e-05, + "loss": 12.1277, + "step": 20936 + }, + { + "epoch": 1.1401033264585163, + "grad_norm": 0.5606078093504049, + "learning_rate": 8.226381207873462e-05, + "loss": 12.153, + "step": 20937 + }, + { + "epoch": 1.1401577804550993, + "grad_norm": 0.5168465111345226, + "learning_rate": 8.225513369682954e-05, + "loss": 11.9966, + "step": 20938 + }, + { + "epoch": 1.1402122344516823, + "grad_norm": 0.6631339416457368, + "learning_rate": 8.224645545291151e-05, + "loss": 12.0144, + "step": 20939 + }, + { + "epoch": 1.1402666884482653, + "grad_norm": 0.6520044289618595, + "learning_rate": 8.223777734704804e-05, + "loss": 12.0726, + "step": 20940 + }, + { + "epoch": 1.1403211424448483, + "grad_norm": 0.5234933027330751, + "learning_rate": 8.222909937930658e-05, + "loss": 11.9941, + "step": 20941 + }, + { + "epoch": 1.1403755964414313, + "grad_norm": 0.5772608521354804, + "learning_rate": 8.222042154975464e-05, + "loss": 12.0001, + "step": 20942 + }, + { + "epoch": 1.1404300504380143, + "grad_norm": 0.5533461011161972, + "learning_rate": 8.221174385845967e-05, + "loss": 11.9857, + "step": 20943 + }, + { + "epoch": 1.1404845044345973, + "grad_norm": 0.5251306865625073, + "learning_rate": 8.220306630548917e-05, + "loss": 11.9284, + "step": 20944 + }, + { + "epoch": 1.1405389584311805, + "grad_norm": 0.5049067693066229, + "learning_rate": 8.219438889091062e-05, + "loss": 11.9974, + "step": 20945 + }, + { + "epoch": 1.1405934124277635, + "grad_norm": 0.562717881292853, + "learning_rate": 8.218571161479148e-05, + "loss": 12.0331, + "step": 20946 + }, + { + "epoch": 1.1406478664243465, + "grad_norm": 0.5125144286918268, + "learning_rate": 8.217703447719924e-05, + "loss": 11.9266, + "step": 20947 + }, + { + "epoch": 1.1407023204209294, + "grad_norm": 0.5769333365392472, + "learning_rate": 8.216835747820135e-05, + "loss": 12.0408, + "step": 20948 + }, + { + "epoch": 1.1407567744175124, + "grad_norm": 0.5479203582995349, + "learning_rate": 8.215968061786531e-05, + "loss": 12.1124, + "step": 20949 + }, + { + "epoch": 1.1408112284140954, + "grad_norm": 0.5596161916792809, + "learning_rate": 8.215100389625857e-05, + "loss": 12.0871, + "step": 20950 + }, + { + "epoch": 1.1408656824106784, + "grad_norm": 0.5305381593496411, + "learning_rate": 8.214232731344864e-05, + "loss": 12.0144, + "step": 20951 + }, + { + "epoch": 1.1409201364072614, + "grad_norm": 0.5416221276249467, + "learning_rate": 8.213365086950296e-05, + "loss": 12.0667, + "step": 20952 + }, + { + "epoch": 1.1409745904038444, + "grad_norm": 0.5333267437537554, + "learning_rate": 8.2124974564489e-05, + "loss": 11.9662, + "step": 20953 + }, + { + "epoch": 1.1410290444004274, + "grad_norm": 0.5099745896153601, + "learning_rate": 8.211629839847426e-05, + "loss": 11.9656, + "step": 20954 + }, + { + "epoch": 1.1410834983970104, + "grad_norm": 0.5423766204134551, + "learning_rate": 8.210762237152619e-05, + "loss": 11.9546, + "step": 20955 + }, + { + "epoch": 1.1411379523935934, + "grad_norm": 0.5532803593890093, + "learning_rate": 8.209894648371222e-05, + "loss": 12.0223, + "step": 20956 + }, + { + "epoch": 1.1411924063901764, + "grad_norm": 0.5835951375321685, + "learning_rate": 8.209027073509985e-05, + "loss": 12.0279, + "step": 20957 + }, + { + "epoch": 1.1412468603867596, + "grad_norm": 0.5905217931965142, + "learning_rate": 8.208159512575654e-05, + "loss": 12.1469, + "step": 20958 + }, + { + "epoch": 1.1413013143833426, + "grad_norm": 0.5422334570641386, + "learning_rate": 8.207291965574974e-05, + "loss": 12.1288, + "step": 20959 + }, + { + "epoch": 1.1413557683799256, + "grad_norm": 0.696890680767564, + "learning_rate": 8.206424432514694e-05, + "loss": 12.0775, + "step": 20960 + }, + { + "epoch": 1.1414102223765086, + "grad_norm": 0.5585119847024153, + "learning_rate": 8.205556913401555e-05, + "loss": 11.99, + "step": 20961 + }, + { + "epoch": 1.1414646763730916, + "grad_norm": 0.549365936991758, + "learning_rate": 8.20468940824231e-05, + "loss": 12.0516, + "step": 20962 + }, + { + "epoch": 1.1415191303696746, + "grad_norm": 0.607436255760605, + "learning_rate": 8.2038219170437e-05, + "loss": 12.0653, + "step": 20963 + }, + { + "epoch": 1.1415735843662576, + "grad_norm": 0.5655225055773133, + "learning_rate": 8.202954439812472e-05, + "loss": 12.1366, + "step": 20964 + }, + { + "epoch": 1.1416280383628405, + "grad_norm": 0.5301801390339581, + "learning_rate": 8.202086976555375e-05, + "loss": 12.0433, + "step": 20965 + }, + { + "epoch": 1.1416824923594235, + "grad_norm": 0.4744429712595387, + "learning_rate": 8.201219527279147e-05, + "loss": 11.8752, + "step": 20966 + }, + { + "epoch": 1.1417369463560065, + "grad_norm": 0.507296115025727, + "learning_rate": 8.200352091990539e-05, + "loss": 11.9695, + "step": 20967 + }, + { + "epoch": 1.1417914003525897, + "grad_norm": 0.5290589105091998, + "learning_rate": 8.199484670696295e-05, + "loss": 12.045, + "step": 20968 + }, + { + "epoch": 1.1418458543491727, + "grad_norm": 0.7096101281605665, + "learning_rate": 8.19861726340316e-05, + "loss": 12.1148, + "step": 20969 + }, + { + "epoch": 1.1419003083457557, + "grad_norm": 0.49424842042945244, + "learning_rate": 8.197749870117879e-05, + "loss": 12.0347, + "step": 20970 + }, + { + "epoch": 1.1419547623423387, + "grad_norm": 0.5749252311480237, + "learning_rate": 8.196882490847197e-05, + "loss": 12.0663, + "step": 20971 + }, + { + "epoch": 1.1420092163389217, + "grad_norm": 0.5670819687811587, + "learning_rate": 8.196015125597858e-05, + "loss": 12.0073, + "step": 20972 + }, + { + "epoch": 1.1420636703355047, + "grad_norm": 0.555447687916874, + "learning_rate": 8.195147774376609e-05, + "loss": 12.0688, + "step": 20973 + }, + { + "epoch": 1.1421181243320877, + "grad_norm": 0.5414378867694744, + "learning_rate": 8.194280437190194e-05, + "loss": 12.0326, + "step": 20974 + }, + { + "epoch": 1.1421725783286707, + "grad_norm": 0.5773807343132444, + "learning_rate": 8.19341311404536e-05, + "loss": 12.0903, + "step": 20975 + }, + { + "epoch": 1.1422270323252537, + "grad_norm": 0.5020329669878001, + "learning_rate": 8.192545804948845e-05, + "loss": 11.9684, + "step": 20976 + }, + { + "epoch": 1.1422814863218367, + "grad_norm": 0.48234046727032104, + "learning_rate": 8.191678509907396e-05, + "loss": 11.9321, + "step": 20977 + }, + { + "epoch": 1.1423359403184197, + "grad_norm": 0.567535834836397, + "learning_rate": 8.190811228927761e-05, + "loss": 12.0578, + "step": 20978 + }, + { + "epoch": 1.1423903943150027, + "grad_norm": 0.5327119061845109, + "learning_rate": 8.189943962016679e-05, + "loss": 11.9767, + "step": 20979 + }, + { + "epoch": 1.1424448483115857, + "grad_norm": 0.5632906184815956, + "learning_rate": 8.189076709180898e-05, + "loss": 12.0716, + "step": 20980 + }, + { + "epoch": 1.1424993023081689, + "grad_norm": 0.593895236471705, + "learning_rate": 8.188209470427159e-05, + "loss": 12.1147, + "step": 20981 + }, + { + "epoch": 1.1425537563047519, + "grad_norm": 0.520442931732901, + "learning_rate": 8.187342245762209e-05, + "loss": 11.8542, + "step": 20982 + }, + { + "epoch": 1.1426082103013349, + "grad_norm": 0.5405669907945672, + "learning_rate": 8.186475035192788e-05, + "loss": 12.0936, + "step": 20983 + }, + { + "epoch": 1.1426626642979179, + "grad_norm": 0.5492648899230916, + "learning_rate": 8.185607838725639e-05, + "loss": 11.9907, + "step": 20984 + }, + { + "epoch": 1.1427171182945008, + "grad_norm": 0.5803230662974547, + "learning_rate": 8.184740656367515e-05, + "loss": 12.0568, + "step": 20985 + }, + { + "epoch": 1.1427715722910838, + "grad_norm": 0.46560360122414524, + "learning_rate": 8.183873488125147e-05, + "loss": 11.9337, + "step": 20986 + }, + { + "epoch": 1.1428260262876668, + "grad_norm": 0.49780376200500465, + "learning_rate": 8.183006334005283e-05, + "loss": 12.0872, + "step": 20987 + }, + { + "epoch": 1.1428804802842498, + "grad_norm": 0.6016666680327781, + "learning_rate": 8.182139194014665e-05, + "loss": 12.0181, + "step": 20988 + }, + { + "epoch": 1.1429349342808328, + "grad_norm": 0.6388919810710112, + "learning_rate": 8.18127206816004e-05, + "loss": 12.1541, + "step": 20989 + }, + { + "epoch": 1.1429893882774158, + "grad_norm": 0.5375566325314137, + "learning_rate": 8.180404956448147e-05, + "loss": 12.0993, + "step": 20990 + }, + { + "epoch": 1.1430438422739988, + "grad_norm": 0.5732752135944795, + "learning_rate": 8.179537858885731e-05, + "loss": 12.0784, + "step": 20991 + }, + { + "epoch": 1.143098296270582, + "grad_norm": 0.5518683111626997, + "learning_rate": 8.178670775479534e-05, + "loss": 12.0159, + "step": 20992 + }, + { + "epoch": 1.143152750267165, + "grad_norm": 0.5556360373222765, + "learning_rate": 8.177803706236299e-05, + "loss": 11.9894, + "step": 20993 + }, + { + "epoch": 1.143207204263748, + "grad_norm": 0.5298063978603275, + "learning_rate": 8.176936651162767e-05, + "loss": 11.9655, + "step": 20994 + }, + { + "epoch": 1.143261658260331, + "grad_norm": 0.5793338571832891, + "learning_rate": 8.176069610265684e-05, + "loss": 12.0498, + "step": 20995 + }, + { + "epoch": 1.143316112256914, + "grad_norm": 0.6264256406282895, + "learning_rate": 8.175202583551787e-05, + "loss": 11.9412, + "step": 20996 + }, + { + "epoch": 1.143370566253497, + "grad_norm": 0.5361244133679585, + "learning_rate": 8.174335571027823e-05, + "loss": 12.0836, + "step": 20997 + }, + { + "epoch": 1.14342502025008, + "grad_norm": 0.5556546496980178, + "learning_rate": 8.173468572700529e-05, + "loss": 11.8692, + "step": 20998 + }, + { + "epoch": 1.143479474246663, + "grad_norm": 0.5293104875373195, + "learning_rate": 8.172601588576648e-05, + "loss": 11.9413, + "step": 20999 + }, + { + "epoch": 1.143533928243246, + "grad_norm": 0.5472832595633416, + "learning_rate": 8.171734618662927e-05, + "loss": 12.0546, + "step": 21000 + }, + { + "epoch": 1.143588382239829, + "grad_norm": 0.5230902969254605, + "learning_rate": 8.170867662966102e-05, + "loss": 12.0118, + "step": 21001 + }, + { + "epoch": 1.143642836236412, + "grad_norm": 0.5418456221500304, + "learning_rate": 8.170000721492918e-05, + "loss": 11.9179, + "step": 21002 + }, + { + "epoch": 1.143697290232995, + "grad_norm": 0.5203268480355566, + "learning_rate": 8.169133794250116e-05, + "loss": 12.0247, + "step": 21003 + }, + { + "epoch": 1.143751744229578, + "grad_norm": 0.5229551837980176, + "learning_rate": 8.168266881244436e-05, + "loss": 11.903, + "step": 21004 + }, + { + "epoch": 1.1438061982261611, + "grad_norm": 0.5344584978853519, + "learning_rate": 8.167399982482622e-05, + "loss": 11.9609, + "step": 21005 + }, + { + "epoch": 1.1438606522227441, + "grad_norm": 0.5228388886315154, + "learning_rate": 8.166533097971412e-05, + "loss": 12.05, + "step": 21006 + }, + { + "epoch": 1.1439151062193271, + "grad_norm": 0.5711820104447795, + "learning_rate": 8.165666227717546e-05, + "loss": 11.9169, + "step": 21007 + }, + { + "epoch": 1.1439695602159101, + "grad_norm": 0.5198358688515019, + "learning_rate": 8.164799371727768e-05, + "loss": 11.9617, + "step": 21008 + }, + { + "epoch": 1.1440240142124931, + "grad_norm": 0.5458365819539345, + "learning_rate": 8.163932530008817e-05, + "loss": 12.09, + "step": 21009 + }, + { + "epoch": 1.144078468209076, + "grad_norm": 0.5959144261997894, + "learning_rate": 8.163065702567433e-05, + "loss": 12.0294, + "step": 21010 + }, + { + "epoch": 1.144132922205659, + "grad_norm": 0.5189776173020441, + "learning_rate": 8.162198889410362e-05, + "loss": 12.0019, + "step": 21011 + }, + { + "epoch": 1.144187376202242, + "grad_norm": 0.5838636030572146, + "learning_rate": 8.161332090544339e-05, + "loss": 11.9223, + "step": 21012 + }, + { + "epoch": 1.144241830198825, + "grad_norm": 0.553351221664354, + "learning_rate": 8.160465305976107e-05, + "loss": 12.0005, + "step": 21013 + }, + { + "epoch": 1.144296284195408, + "grad_norm": 0.5587968080469131, + "learning_rate": 8.159598535712405e-05, + "loss": 11.8973, + "step": 21014 + }, + { + "epoch": 1.1443507381919913, + "grad_norm": 0.6187991735080954, + "learning_rate": 8.158731779759975e-05, + "loss": 12.2249, + "step": 21015 + }, + { + "epoch": 1.1444051921885743, + "grad_norm": 0.5533867559389424, + "learning_rate": 8.157865038125552e-05, + "loss": 12.02, + "step": 21016 + }, + { + "epoch": 1.1444596461851573, + "grad_norm": 0.6128098583652383, + "learning_rate": 8.156998310815882e-05, + "loss": 12.2211, + "step": 21017 + }, + { + "epoch": 1.1445141001817403, + "grad_norm": 0.5387386573489695, + "learning_rate": 8.156131597837701e-05, + "loss": 12.0796, + "step": 21018 + }, + { + "epoch": 1.1445685541783233, + "grad_norm": 0.5328584040496055, + "learning_rate": 8.15526489919775e-05, + "loss": 11.837, + "step": 21019 + }, + { + "epoch": 1.1446230081749063, + "grad_norm": 0.5710270942995136, + "learning_rate": 8.154398214902769e-05, + "loss": 12.1224, + "step": 21020 + }, + { + "epoch": 1.1446774621714892, + "grad_norm": 0.5795974385270815, + "learning_rate": 8.153531544959494e-05, + "loss": 12.0274, + "step": 21021 + }, + { + "epoch": 1.1447319161680722, + "grad_norm": 0.5319926295155886, + "learning_rate": 8.15266488937467e-05, + "loss": 11.9125, + "step": 21022 + }, + { + "epoch": 1.1447863701646552, + "grad_norm": 0.5604516086391089, + "learning_rate": 8.151798248155032e-05, + "loss": 11.8868, + "step": 21023 + }, + { + "epoch": 1.1448408241612382, + "grad_norm": 0.5489005774116194, + "learning_rate": 8.150931621307323e-05, + "loss": 11.9741, + "step": 21024 + }, + { + "epoch": 1.1448952781578212, + "grad_norm": 0.5389533652488021, + "learning_rate": 8.150065008838281e-05, + "loss": 12.0303, + "step": 21025 + }, + { + "epoch": 1.1449497321544042, + "grad_norm": 0.5405954217513335, + "learning_rate": 8.149198410754641e-05, + "loss": 11.9794, + "step": 21026 + }, + { + "epoch": 1.1450041861509872, + "grad_norm": 0.5810190019682795, + "learning_rate": 8.148331827063147e-05, + "loss": 11.935, + "step": 21027 + }, + { + "epoch": 1.1450586401475704, + "grad_norm": 0.7319128147122567, + "learning_rate": 8.147465257770532e-05, + "loss": 12.0482, + "step": 21028 + }, + { + "epoch": 1.1451130941441534, + "grad_norm": 0.5822036562162385, + "learning_rate": 8.14659870288354e-05, + "loss": 12.0907, + "step": 21029 + }, + { + "epoch": 1.1451675481407364, + "grad_norm": 0.5532352398438115, + "learning_rate": 8.145732162408907e-05, + "loss": 11.9685, + "step": 21030 + }, + { + "epoch": 1.1452220021373194, + "grad_norm": 0.5712018466987027, + "learning_rate": 8.144865636353371e-05, + "loss": 12.1183, + "step": 21031 + }, + { + "epoch": 1.1452764561339024, + "grad_norm": 0.54110084551623, + "learning_rate": 8.14399912472367e-05, + "loss": 12.0274, + "step": 21032 + }, + { + "epoch": 1.1453309101304854, + "grad_norm": 0.5363305858709243, + "learning_rate": 8.143132627526545e-05, + "loss": 12.0609, + "step": 21033 + }, + { + "epoch": 1.1453853641270684, + "grad_norm": 0.5843397847758113, + "learning_rate": 8.142266144768729e-05, + "loss": 12.0073, + "step": 21034 + }, + { + "epoch": 1.1454398181236514, + "grad_norm": 0.5350188464651414, + "learning_rate": 8.141399676456972e-05, + "loss": 12.0521, + "step": 21035 + }, + { + "epoch": 1.1454942721202344, + "grad_norm": 0.5420875012653492, + "learning_rate": 8.140533222597995e-05, + "loss": 12.0326, + "step": 21036 + }, + { + "epoch": 1.1455487261168174, + "grad_norm": 0.5653638168090159, + "learning_rate": 8.139666783198542e-05, + "loss": 11.9954, + "step": 21037 + }, + { + "epoch": 1.1456031801134006, + "grad_norm": 0.5723196320718857, + "learning_rate": 8.138800358265354e-05, + "loss": 12.0604, + "step": 21038 + }, + { + "epoch": 1.1456576341099836, + "grad_norm": 0.5773998459245281, + "learning_rate": 8.137933947805169e-05, + "loss": 12.0177, + "step": 21039 + }, + { + "epoch": 1.1457120881065666, + "grad_norm": 0.51146925626736, + "learning_rate": 8.13706755182472e-05, + "loss": 12.0252, + "step": 21040 + }, + { + "epoch": 1.1457665421031495, + "grad_norm": 0.6265872521892802, + "learning_rate": 8.136201170330746e-05, + "loss": 12.0123, + "step": 21041 + }, + { + "epoch": 1.1458209960997325, + "grad_norm": 0.4907310093920785, + "learning_rate": 8.135334803329983e-05, + "loss": 11.9729, + "step": 21042 + }, + { + "epoch": 1.1458754500963155, + "grad_norm": 0.5463913693585858, + "learning_rate": 8.134468450829172e-05, + "loss": 11.9928, + "step": 21043 + }, + { + "epoch": 1.1459299040928985, + "grad_norm": 0.5455569596044371, + "learning_rate": 8.133602112835043e-05, + "loss": 11.9847, + "step": 21044 + }, + { + "epoch": 1.1459843580894815, + "grad_norm": 0.5316381424131947, + "learning_rate": 8.132735789354346e-05, + "loss": 11.9869, + "step": 21045 + }, + { + "epoch": 1.1460388120860645, + "grad_norm": 0.5544940775590294, + "learning_rate": 8.131869480393803e-05, + "loss": 12.0745, + "step": 21046 + }, + { + "epoch": 1.1460932660826475, + "grad_norm": 0.6004156000299287, + "learning_rate": 8.131003185960154e-05, + "loss": 12.0626, + "step": 21047 + }, + { + "epoch": 1.1461477200792305, + "grad_norm": 0.5218071904391417, + "learning_rate": 8.130136906060137e-05, + "loss": 12.0363, + "step": 21048 + }, + { + "epoch": 1.1462021740758135, + "grad_norm": 0.5389146853602708, + "learning_rate": 8.129270640700492e-05, + "loss": 12.006, + "step": 21049 + }, + { + "epoch": 1.1462566280723965, + "grad_norm": 0.6203417650257257, + "learning_rate": 8.128404389887953e-05, + "loss": 12.0354, + "step": 21050 + }, + { + "epoch": 1.1463110820689797, + "grad_norm": 0.5902856739990013, + "learning_rate": 8.127538153629253e-05, + "loss": 12.0125, + "step": 21051 + }, + { + "epoch": 1.1463655360655627, + "grad_norm": 0.5306534686815397, + "learning_rate": 8.126671931931131e-05, + "loss": 12.1156, + "step": 21052 + }, + { + "epoch": 1.1464199900621457, + "grad_norm": 0.5373704830394005, + "learning_rate": 8.125805724800323e-05, + "loss": 11.9522, + "step": 21053 + }, + { + "epoch": 1.1464744440587287, + "grad_norm": 0.5377695949027106, + "learning_rate": 8.124939532243564e-05, + "loss": 11.9527, + "step": 21054 + }, + { + "epoch": 1.1465288980553117, + "grad_norm": 0.5911281005314784, + "learning_rate": 8.12407335426759e-05, + "loss": 12.1034, + "step": 21055 + }, + { + "epoch": 1.1465833520518947, + "grad_norm": 0.4977620166418643, + "learning_rate": 8.123207190879136e-05, + "loss": 11.9416, + "step": 21056 + }, + { + "epoch": 1.1466378060484776, + "grad_norm": 0.5116839969984828, + "learning_rate": 8.122341042084938e-05, + "loss": 12.0125, + "step": 21057 + }, + { + "epoch": 1.1466922600450606, + "grad_norm": 0.537829407364425, + "learning_rate": 8.12147490789173e-05, + "loss": 12.0088, + "step": 21058 + }, + { + "epoch": 1.1467467140416436, + "grad_norm": 0.5566024102714832, + "learning_rate": 8.120608788306245e-05, + "loss": 11.992, + "step": 21059 + }, + { + "epoch": 1.1468011680382266, + "grad_norm": 0.5172239653070049, + "learning_rate": 8.119742683335225e-05, + "loss": 12.038, + "step": 21060 + }, + { + "epoch": 1.1468556220348098, + "grad_norm": 0.5081733170513308, + "learning_rate": 8.1188765929854e-05, + "loss": 11.9034, + "step": 21061 + }, + { + "epoch": 1.1469100760313928, + "grad_norm": 0.6079477794406521, + "learning_rate": 8.118010517263506e-05, + "loss": 11.9627, + "step": 21062 + }, + { + "epoch": 1.1469645300279758, + "grad_norm": 0.5826173534014737, + "learning_rate": 8.11714445617628e-05, + "loss": 12.0594, + "step": 21063 + }, + { + "epoch": 1.1470189840245588, + "grad_norm": 0.5632061995460318, + "learning_rate": 8.116278409730452e-05, + "loss": 11.9167, + "step": 21064 + }, + { + "epoch": 1.1470734380211418, + "grad_norm": 0.5485729104136255, + "learning_rate": 8.115412377932762e-05, + "loss": 12.1227, + "step": 21065 + }, + { + "epoch": 1.1471278920177248, + "grad_norm": 0.5528861036138745, + "learning_rate": 8.11454636078994e-05, + "loss": 12.0376, + "step": 21066 + }, + { + "epoch": 1.1471823460143078, + "grad_norm": 0.5574868688652987, + "learning_rate": 8.11368035830872e-05, + "loss": 11.8677, + "step": 21067 + }, + { + "epoch": 1.1472368000108908, + "grad_norm": 0.5818010753479786, + "learning_rate": 8.112814370495839e-05, + "loss": 12.0715, + "step": 21068 + }, + { + "epoch": 1.1472912540074738, + "grad_norm": 0.6798929789492493, + "learning_rate": 8.11194839735803e-05, + "loss": 12.1723, + "step": 21069 + }, + { + "epoch": 1.1473457080040568, + "grad_norm": 0.5422378399543559, + "learning_rate": 8.111082438902025e-05, + "loss": 12.065, + "step": 21070 + }, + { + "epoch": 1.1474001620006398, + "grad_norm": 0.5122511501381658, + "learning_rate": 8.110216495134562e-05, + "loss": 11.9232, + "step": 21071 + }, + { + "epoch": 1.1474546159972228, + "grad_norm": 0.5453350705386038, + "learning_rate": 8.10935056606237e-05, + "loss": 12.0262, + "step": 21072 + }, + { + "epoch": 1.1475090699938058, + "grad_norm": 0.6438196027922032, + "learning_rate": 8.108484651692188e-05, + "loss": 12.0422, + "step": 21073 + }, + { + "epoch": 1.1475635239903887, + "grad_norm": 0.6528455938916412, + "learning_rate": 8.107618752030745e-05, + "loss": 12.0682, + "step": 21074 + }, + { + "epoch": 1.147617977986972, + "grad_norm": 0.5451997878687775, + "learning_rate": 8.10675286708478e-05, + "loss": 12.0776, + "step": 21075 + }, + { + "epoch": 1.147672431983555, + "grad_norm": 0.520908444699066, + "learning_rate": 8.105886996861017e-05, + "loss": 11.988, + "step": 21076 + }, + { + "epoch": 1.147726885980138, + "grad_norm": 0.5852947218268076, + "learning_rate": 8.105021141366196e-05, + "loss": 12.0616, + "step": 21077 + }, + { + "epoch": 1.147781339976721, + "grad_norm": 0.5432544667041285, + "learning_rate": 8.104155300607049e-05, + "loss": 11.9157, + "step": 21078 + }, + { + "epoch": 1.147835793973304, + "grad_norm": 0.4872885499050939, + "learning_rate": 8.103289474590308e-05, + "loss": 11.9543, + "step": 21079 + }, + { + "epoch": 1.147890247969887, + "grad_norm": 0.5648118091473893, + "learning_rate": 8.102423663322704e-05, + "loss": 11.9183, + "step": 21080 + }, + { + "epoch": 1.14794470196647, + "grad_norm": 0.5579987798738413, + "learning_rate": 8.101557866810972e-05, + "loss": 12.1045, + "step": 21081 + }, + { + "epoch": 1.147999155963053, + "grad_norm": 0.5483380124450635, + "learning_rate": 8.100692085061847e-05, + "loss": 12.0214, + "step": 21082 + }, + { + "epoch": 1.148053609959636, + "grad_norm": 0.532668117450102, + "learning_rate": 8.099826318082057e-05, + "loss": 12.1756, + "step": 21083 + }, + { + "epoch": 1.148108063956219, + "grad_norm": 0.5636987922499963, + "learning_rate": 8.098960565878337e-05, + "loss": 11.9614, + "step": 21084 + }, + { + "epoch": 1.148162517952802, + "grad_norm": 0.5606659244042904, + "learning_rate": 8.098094828457424e-05, + "loss": 12.1564, + "step": 21085 + }, + { + "epoch": 1.148216971949385, + "grad_norm": 0.5411349597727095, + "learning_rate": 8.097229105826036e-05, + "loss": 11.9888, + "step": 21086 + }, + { + "epoch": 1.148271425945968, + "grad_norm": 0.6505933994179495, + "learning_rate": 8.096363397990917e-05, + "loss": 11.8961, + "step": 21087 + }, + { + "epoch": 1.148325879942551, + "grad_norm": 0.495973865804788, + "learning_rate": 8.095497704958795e-05, + "loss": 11.969, + "step": 21088 + }, + { + "epoch": 1.148380333939134, + "grad_norm": 0.48686591604475277, + "learning_rate": 8.094632026736403e-05, + "loss": 11.9074, + "step": 21089 + }, + { + "epoch": 1.148434787935717, + "grad_norm": 0.5323346380652924, + "learning_rate": 8.093766363330471e-05, + "loss": 11.9727, + "step": 21090 + }, + { + "epoch": 1.1484892419323, + "grad_norm": 0.5178261071161707, + "learning_rate": 8.092900714747731e-05, + "loss": 11.9483, + "step": 21091 + }, + { + "epoch": 1.148543695928883, + "grad_norm": 0.543103451995716, + "learning_rate": 8.092035080994917e-05, + "loss": 11.9492, + "step": 21092 + }, + { + "epoch": 1.148598149925466, + "grad_norm": 0.5291390005538827, + "learning_rate": 8.091169462078754e-05, + "loss": 12.0143, + "step": 21093 + }, + { + "epoch": 1.148652603922049, + "grad_norm": 0.560070717878981, + "learning_rate": 8.09030385800598e-05, + "loss": 12.1458, + "step": 21094 + }, + { + "epoch": 1.148707057918632, + "grad_norm": 0.5588331491242884, + "learning_rate": 8.089438268783323e-05, + "loss": 12.0294, + "step": 21095 + }, + { + "epoch": 1.148761511915215, + "grad_norm": 0.5007083159102311, + "learning_rate": 8.08857269441752e-05, + "loss": 11.9746, + "step": 21096 + }, + { + "epoch": 1.148815965911798, + "grad_norm": 0.5677994203029244, + "learning_rate": 8.087707134915288e-05, + "loss": 11.9238, + "step": 21097 + }, + { + "epoch": 1.1488704199083812, + "grad_norm": 0.5813209840921494, + "learning_rate": 8.08684159028337e-05, + "loss": 12.088, + "step": 21098 + }, + { + "epoch": 1.1489248739049642, + "grad_norm": 0.5525302957048495, + "learning_rate": 8.085976060528491e-05, + "loss": 12.0606, + "step": 21099 + }, + { + "epoch": 1.1489793279015472, + "grad_norm": 0.5496389271612488, + "learning_rate": 8.085110545657385e-05, + "loss": 12.0706, + "step": 21100 + }, + { + "epoch": 1.1490337818981302, + "grad_norm": 0.5482635604991649, + "learning_rate": 8.084245045676779e-05, + "loss": 12.0023, + "step": 21101 + }, + { + "epoch": 1.1490882358947132, + "grad_norm": 0.5469133095471433, + "learning_rate": 8.083379560593406e-05, + "loss": 12.0108, + "step": 21102 + }, + { + "epoch": 1.1491426898912962, + "grad_norm": 0.5482534608450617, + "learning_rate": 8.082514090413994e-05, + "loss": 12.0259, + "step": 21103 + }, + { + "epoch": 1.1491971438878792, + "grad_norm": 0.5767821949269134, + "learning_rate": 8.081648635145272e-05, + "loss": 12.1142, + "step": 21104 + }, + { + "epoch": 1.1492515978844622, + "grad_norm": 0.5878543688318114, + "learning_rate": 8.080783194793975e-05, + "loss": 11.9575, + "step": 21105 + }, + { + "epoch": 1.1493060518810452, + "grad_norm": 0.537615416805045, + "learning_rate": 8.079917769366833e-05, + "loss": 11.9365, + "step": 21106 + }, + { + "epoch": 1.1493605058776282, + "grad_norm": 0.5902894231903612, + "learning_rate": 8.079052358870568e-05, + "loss": 12.0805, + "step": 21107 + }, + { + "epoch": 1.1494149598742114, + "grad_norm": 0.5797267215268892, + "learning_rate": 8.078186963311912e-05, + "loss": 12.0357, + "step": 21108 + }, + { + "epoch": 1.1494694138707944, + "grad_norm": 0.5578950816383108, + "learning_rate": 8.0773215826976e-05, + "loss": 12.0792, + "step": 21109 + }, + { + "epoch": 1.1495238678673774, + "grad_norm": 0.531064236981189, + "learning_rate": 8.076456217034356e-05, + "loss": 12.0386, + "step": 21110 + }, + { + "epoch": 1.1495783218639604, + "grad_norm": 0.508623729429057, + "learning_rate": 8.075590866328911e-05, + "loss": 12.0248, + "step": 21111 + }, + { + "epoch": 1.1496327758605434, + "grad_norm": 0.5843775193424045, + "learning_rate": 8.074725530587996e-05, + "loss": 12.0742, + "step": 21112 + }, + { + "epoch": 1.1496872298571263, + "grad_norm": 0.4926905519642264, + "learning_rate": 8.073860209818336e-05, + "loss": 11.9017, + "step": 21113 + }, + { + "epoch": 1.1497416838537093, + "grad_norm": 0.5328678072904144, + "learning_rate": 8.072994904026663e-05, + "loss": 11.9381, + "step": 21114 + }, + { + "epoch": 1.1497961378502923, + "grad_norm": 0.526885307689961, + "learning_rate": 8.072129613219703e-05, + "loss": 11.9485, + "step": 21115 + }, + { + "epoch": 1.1498505918468753, + "grad_norm": 0.5497004056654098, + "learning_rate": 8.071264337404192e-05, + "loss": 12.0153, + "step": 21116 + }, + { + "epoch": 1.1499050458434583, + "grad_norm": 0.5863572104234963, + "learning_rate": 8.070399076586849e-05, + "loss": 11.9572, + "step": 21117 + }, + { + "epoch": 1.1499594998400413, + "grad_norm": 0.5615088944244939, + "learning_rate": 8.069533830774407e-05, + "loss": 12.0666, + "step": 21118 + }, + { + "epoch": 1.1500139538366243, + "grad_norm": 0.7323211513084971, + "learning_rate": 8.06866859997359e-05, + "loss": 12.0178, + "step": 21119 + }, + { + "epoch": 1.1500684078332073, + "grad_norm": 0.5678232451995301, + "learning_rate": 8.067803384191133e-05, + "loss": 11.9026, + "step": 21120 + }, + { + "epoch": 1.1501228618297905, + "grad_norm": 0.5446422610668087, + "learning_rate": 8.066938183433762e-05, + "loss": 11.9061, + "step": 21121 + }, + { + "epoch": 1.1501773158263735, + "grad_norm": 0.537432196594231, + "learning_rate": 8.066072997708203e-05, + "loss": 11.9063, + "step": 21122 + }, + { + "epoch": 1.1502317698229565, + "grad_norm": 0.557170520357956, + "learning_rate": 8.065207827021184e-05, + "loss": 11.9433, + "step": 21123 + }, + { + "epoch": 1.1502862238195395, + "grad_norm": 0.7040948029689192, + "learning_rate": 8.064342671379435e-05, + "loss": 12.1554, + "step": 21124 + }, + { + "epoch": 1.1503406778161225, + "grad_norm": 0.6032339334585413, + "learning_rate": 8.063477530789681e-05, + "loss": 11.9106, + "step": 21125 + }, + { + "epoch": 1.1503951318127055, + "grad_norm": 0.5399728029226735, + "learning_rate": 8.062612405258652e-05, + "loss": 11.9057, + "step": 21126 + }, + { + "epoch": 1.1504495858092885, + "grad_norm": 0.4955261796241731, + "learning_rate": 8.061747294793071e-05, + "loss": 12.0238, + "step": 21127 + }, + { + "epoch": 1.1505040398058715, + "grad_norm": 0.5855436553356044, + "learning_rate": 8.06088219939967e-05, + "loss": 12.0454, + "step": 21128 + }, + { + "epoch": 1.1505584938024545, + "grad_norm": 0.605434000788639, + "learning_rate": 8.060017119085173e-05, + "loss": 12.1447, + "step": 21129 + }, + { + "epoch": 1.1506129477990374, + "grad_norm": 0.5567938188227343, + "learning_rate": 8.059152053856307e-05, + "loss": 11.985, + "step": 21130 + }, + { + "epoch": 1.1506674017956207, + "grad_norm": 0.5553514637675814, + "learning_rate": 8.058287003719802e-05, + "loss": 12.0672, + "step": 21131 + }, + { + "epoch": 1.1507218557922037, + "grad_norm": 0.5184625714277747, + "learning_rate": 8.057421968682383e-05, + "loss": 12.0448, + "step": 21132 + }, + { + "epoch": 1.1507763097887866, + "grad_norm": 0.5204256390067515, + "learning_rate": 8.056556948750777e-05, + "loss": 11.9548, + "step": 21133 + }, + { + "epoch": 1.1508307637853696, + "grad_norm": 0.5953661138703211, + "learning_rate": 8.055691943931707e-05, + "loss": 12.0341, + "step": 21134 + }, + { + "epoch": 1.1508852177819526, + "grad_norm": 0.8378579415423998, + "learning_rate": 8.054826954231906e-05, + "loss": 12.039, + "step": 21135 + }, + { + "epoch": 1.1509396717785356, + "grad_norm": 0.5715107785392818, + "learning_rate": 8.053961979658098e-05, + "loss": 12.1688, + "step": 21136 + }, + { + "epoch": 1.1509941257751186, + "grad_norm": 0.563690352731225, + "learning_rate": 8.053097020217006e-05, + "loss": 12.0174, + "step": 21137 + }, + { + "epoch": 1.1510485797717016, + "grad_norm": 0.5488959381633189, + "learning_rate": 8.052232075915359e-05, + "loss": 12.0471, + "step": 21138 + }, + { + "epoch": 1.1511030337682846, + "grad_norm": 0.5663448934394109, + "learning_rate": 8.05136714675988e-05, + "loss": 12.1247, + "step": 21139 + }, + { + "epoch": 1.1511574877648676, + "grad_norm": 0.5685910922950765, + "learning_rate": 8.050502232757297e-05, + "loss": 12.0227, + "step": 21140 + }, + { + "epoch": 1.1512119417614506, + "grad_norm": 0.5900570812695328, + "learning_rate": 8.049637333914336e-05, + "loss": 11.9444, + "step": 21141 + }, + { + "epoch": 1.1512663957580336, + "grad_norm": 0.5252079331707762, + "learning_rate": 8.04877245023772e-05, + "loss": 12.0593, + "step": 21142 + }, + { + "epoch": 1.1513208497546166, + "grad_norm": 0.5511587861500431, + "learning_rate": 8.047907581734178e-05, + "loss": 12.0923, + "step": 21143 + }, + { + "epoch": 1.1513753037511998, + "grad_norm": 0.47982691738561195, + "learning_rate": 8.047042728410436e-05, + "loss": 11.9816, + "step": 21144 + }, + { + "epoch": 1.1514297577477828, + "grad_norm": 0.5415394760446558, + "learning_rate": 8.046177890273216e-05, + "loss": 12.122, + "step": 21145 + }, + { + "epoch": 1.1514842117443658, + "grad_norm": 0.5198899441470982, + "learning_rate": 8.045313067329248e-05, + "loss": 11.7322, + "step": 21146 + }, + { + "epoch": 1.1515386657409488, + "grad_norm": 0.5429851464257295, + "learning_rate": 8.044448259585249e-05, + "loss": 12.0108, + "step": 21147 + }, + { + "epoch": 1.1515931197375318, + "grad_norm": 0.5622638318610981, + "learning_rate": 8.043583467047949e-05, + "loss": 12.1389, + "step": 21148 + }, + { + "epoch": 1.1516475737341147, + "grad_norm": 0.530104001255593, + "learning_rate": 8.042718689724072e-05, + "loss": 11.9547, + "step": 21149 + }, + { + "epoch": 1.1517020277306977, + "grad_norm": 0.5883781883269091, + "learning_rate": 8.041853927620345e-05, + "loss": 12.0862, + "step": 21150 + }, + { + "epoch": 1.1517564817272807, + "grad_norm": 0.5161862523816715, + "learning_rate": 8.040989180743487e-05, + "loss": 11.8887, + "step": 21151 + }, + { + "epoch": 1.1518109357238637, + "grad_norm": 0.5728392481592719, + "learning_rate": 8.040124449100226e-05, + "loss": 12.0205, + "step": 21152 + }, + { + "epoch": 1.1518653897204467, + "grad_norm": 0.5858138955307554, + "learning_rate": 8.039259732697286e-05, + "loss": 12.0949, + "step": 21153 + }, + { + "epoch": 1.1519198437170297, + "grad_norm": 0.5524384241816089, + "learning_rate": 8.038395031541392e-05, + "loss": 12.0203, + "step": 21154 + }, + { + "epoch": 1.151974297713613, + "grad_norm": 0.535014389337564, + "learning_rate": 8.037530345639267e-05, + "loss": 11.9085, + "step": 21155 + }, + { + "epoch": 1.152028751710196, + "grad_norm": 0.6554376807816443, + "learning_rate": 8.036665674997639e-05, + "loss": 11.9767, + "step": 21156 + }, + { + "epoch": 1.152083205706779, + "grad_norm": 0.5947689449903816, + "learning_rate": 8.035801019623224e-05, + "loss": 12.0815, + "step": 21157 + }, + { + "epoch": 1.152137659703362, + "grad_norm": 0.548992716856938, + "learning_rate": 8.034936379522749e-05, + "loss": 11.892, + "step": 21158 + }, + { + "epoch": 1.152192113699945, + "grad_norm": 0.5515032063781492, + "learning_rate": 8.034071754702938e-05, + "loss": 11.9991, + "step": 21159 + }, + { + "epoch": 1.152246567696528, + "grad_norm": 0.5111353487376212, + "learning_rate": 8.033207145170516e-05, + "loss": 11.8936, + "step": 21160 + }, + { + "epoch": 1.1523010216931109, + "grad_norm": 0.6442684090397887, + "learning_rate": 8.032342550932206e-05, + "loss": 12.0178, + "step": 21161 + }, + { + "epoch": 1.1523554756896939, + "grad_norm": 0.5372293081498746, + "learning_rate": 8.03147797199473e-05, + "loss": 11.7815, + "step": 21162 + }, + { + "epoch": 1.1524099296862769, + "grad_norm": 0.531260849518373, + "learning_rate": 8.030613408364812e-05, + "loss": 12.1365, + "step": 21163 + }, + { + "epoch": 1.1524643836828599, + "grad_norm": 0.5124011686686147, + "learning_rate": 8.029748860049168e-05, + "loss": 12.0414, + "step": 21164 + }, + { + "epoch": 1.1525188376794429, + "grad_norm": 0.5123375927676078, + "learning_rate": 8.028884327054534e-05, + "loss": 11.8998, + "step": 21165 + }, + { + "epoch": 1.1525732916760258, + "grad_norm": 0.5245392110966306, + "learning_rate": 8.028019809387629e-05, + "loss": 11.9568, + "step": 21166 + }, + { + "epoch": 1.1526277456726088, + "grad_norm": 0.5271637670116032, + "learning_rate": 8.027155307055167e-05, + "loss": 11.9273, + "step": 21167 + }, + { + "epoch": 1.152682199669192, + "grad_norm": 0.4852455642741703, + "learning_rate": 8.026290820063876e-05, + "loss": 11.9551, + "step": 21168 + }, + { + "epoch": 1.152736653665775, + "grad_norm": 0.5851320866655585, + "learning_rate": 8.02542634842048e-05, + "loss": 11.9567, + "step": 21169 + }, + { + "epoch": 1.152791107662358, + "grad_norm": 0.5351084480055083, + "learning_rate": 8.024561892131699e-05, + "loss": 11.8094, + "step": 21170 + }, + { + "epoch": 1.152845561658941, + "grad_norm": 0.5882382168396529, + "learning_rate": 8.023697451204258e-05, + "loss": 12.1339, + "step": 21171 + }, + { + "epoch": 1.152900015655524, + "grad_norm": 0.6477972692190181, + "learning_rate": 8.022833025644875e-05, + "loss": 12.2011, + "step": 21172 + }, + { + "epoch": 1.152954469652107, + "grad_norm": 0.5532746213673208, + "learning_rate": 8.021968615460275e-05, + "loss": 12.0372, + "step": 21173 + }, + { + "epoch": 1.15300892364869, + "grad_norm": 0.5286103616233467, + "learning_rate": 8.021104220657178e-05, + "loss": 11.9014, + "step": 21174 + }, + { + "epoch": 1.153063377645273, + "grad_norm": 0.57629068999438, + "learning_rate": 8.020239841242305e-05, + "loss": 12.0307, + "step": 21175 + }, + { + "epoch": 1.153117831641856, + "grad_norm": 0.5705025655303789, + "learning_rate": 8.019375477222386e-05, + "loss": 12.0665, + "step": 21176 + }, + { + "epoch": 1.153172285638439, + "grad_norm": 0.5119161902042083, + "learning_rate": 8.01851112860413e-05, + "loss": 12.0077, + "step": 21177 + }, + { + "epoch": 1.1532267396350222, + "grad_norm": 0.5084787593407395, + "learning_rate": 8.017646795394264e-05, + "loss": 11.99, + "step": 21178 + }, + { + "epoch": 1.1532811936316052, + "grad_norm": 0.5625113833150053, + "learning_rate": 8.016782477599507e-05, + "loss": 11.9331, + "step": 21179 + }, + { + "epoch": 1.1533356476281882, + "grad_norm": 0.5683958156635728, + "learning_rate": 8.015918175226584e-05, + "loss": 12.0022, + "step": 21180 + }, + { + "epoch": 1.1533901016247712, + "grad_norm": 0.552218824644041, + "learning_rate": 8.015053888282215e-05, + "loss": 11.9492, + "step": 21181 + }, + { + "epoch": 1.1534445556213542, + "grad_norm": 0.538202368181314, + "learning_rate": 8.014189616773117e-05, + "loss": 11.9867, + "step": 21182 + }, + { + "epoch": 1.1534990096179372, + "grad_norm": 0.5772816653059827, + "learning_rate": 8.013325360706017e-05, + "loss": 11.9636, + "step": 21183 + }, + { + "epoch": 1.1535534636145202, + "grad_norm": 0.5305500868198957, + "learning_rate": 8.012461120087631e-05, + "loss": 11.9883, + "step": 21184 + }, + { + "epoch": 1.1536079176111032, + "grad_norm": 0.6269183943377432, + "learning_rate": 8.01159689492468e-05, + "loss": 12.0116, + "step": 21185 + }, + { + "epoch": 1.1536623716076861, + "grad_norm": 0.5043439783473882, + "learning_rate": 8.010732685223888e-05, + "loss": 11.9738, + "step": 21186 + }, + { + "epoch": 1.1537168256042691, + "grad_norm": 0.5478230975249527, + "learning_rate": 8.009868490991969e-05, + "loss": 12.1364, + "step": 21187 + }, + { + "epoch": 1.1537712796008521, + "grad_norm": 0.5167076642854395, + "learning_rate": 8.009004312235648e-05, + "loss": 11.9932, + "step": 21188 + }, + { + "epoch": 1.1538257335974351, + "grad_norm": 0.591628232371505, + "learning_rate": 8.008140148961641e-05, + "loss": 12.0235, + "step": 21189 + }, + { + "epoch": 1.1538801875940181, + "grad_norm": 0.5232239089879509, + "learning_rate": 8.007276001176672e-05, + "loss": 12.0338, + "step": 21190 + }, + { + "epoch": 1.1539346415906013, + "grad_norm": 0.5218900289117727, + "learning_rate": 8.006411868887456e-05, + "loss": 12.0667, + "step": 21191 + }, + { + "epoch": 1.1539890955871843, + "grad_norm": 0.5002955849688884, + "learning_rate": 8.005547752100718e-05, + "loss": 12.0313, + "step": 21192 + }, + { + "epoch": 1.1540435495837673, + "grad_norm": 0.6418628227847089, + "learning_rate": 8.004683650823175e-05, + "loss": 12.0277, + "step": 21193 + }, + { + "epoch": 1.1540980035803503, + "grad_norm": 0.5421520459976056, + "learning_rate": 8.003819565061548e-05, + "loss": 12.0242, + "step": 21194 + }, + { + "epoch": 1.1541524575769333, + "grad_norm": 0.5854748086041517, + "learning_rate": 8.002955494822553e-05, + "loss": 11.9826, + "step": 21195 + }, + { + "epoch": 1.1542069115735163, + "grad_norm": 0.5257715022701855, + "learning_rate": 8.002091440112914e-05, + "loss": 11.9302, + "step": 21196 + }, + { + "epoch": 1.1542613655700993, + "grad_norm": 0.546327156552604, + "learning_rate": 8.001227400939345e-05, + "loss": 11.973, + "step": 21197 + }, + { + "epoch": 1.1543158195666823, + "grad_norm": 0.594397566181094, + "learning_rate": 8.000363377308566e-05, + "loss": 12.0479, + "step": 21198 + }, + { + "epoch": 1.1543702735632653, + "grad_norm": 0.5339239788357752, + "learning_rate": 7.999499369227298e-05, + "loss": 11.9646, + "step": 21199 + }, + { + "epoch": 1.1544247275598483, + "grad_norm": 0.5500296085494902, + "learning_rate": 7.998635376702257e-05, + "loss": 11.9813, + "step": 21200 + }, + { + "epoch": 1.1544791815564315, + "grad_norm": 0.5488862208409117, + "learning_rate": 7.997771399740163e-05, + "loss": 12.0284, + "step": 21201 + }, + { + "epoch": 1.1545336355530145, + "grad_norm": 0.5354108211709937, + "learning_rate": 7.996907438347734e-05, + "loss": 11.8456, + "step": 21202 + }, + { + "epoch": 1.1545880895495975, + "grad_norm": 0.5203275691147052, + "learning_rate": 7.99604349253169e-05, + "loss": 12.0175, + "step": 21203 + }, + { + "epoch": 1.1546425435461805, + "grad_norm": 0.5707911160642656, + "learning_rate": 7.995179562298746e-05, + "loss": 11.8751, + "step": 21204 + }, + { + "epoch": 1.1546969975427634, + "grad_norm": 0.5927001165041338, + "learning_rate": 7.994315647655624e-05, + "loss": 12.0275, + "step": 21205 + }, + { + "epoch": 1.1547514515393464, + "grad_norm": 0.5277214276087671, + "learning_rate": 7.993451748609042e-05, + "loss": 11.9848, + "step": 21206 + }, + { + "epoch": 1.1548059055359294, + "grad_norm": 0.5769471969029595, + "learning_rate": 7.992587865165713e-05, + "loss": 11.9488, + "step": 21207 + }, + { + "epoch": 1.1548603595325124, + "grad_norm": 0.5496266310399047, + "learning_rate": 7.991723997332358e-05, + "loss": 12.0007, + "step": 21208 + }, + { + "epoch": 1.1549148135290954, + "grad_norm": 0.6594062965515542, + "learning_rate": 7.990860145115694e-05, + "loss": 12.0828, + "step": 21209 + }, + { + "epoch": 1.1549692675256784, + "grad_norm": 0.5677854047819902, + "learning_rate": 7.989996308522437e-05, + "loss": 12.0297, + "step": 21210 + }, + { + "epoch": 1.1550237215222614, + "grad_norm": 0.551883355920494, + "learning_rate": 7.989132487559307e-05, + "loss": 12.0298, + "step": 21211 + }, + { + "epoch": 1.1550781755188444, + "grad_norm": 0.5323634440159634, + "learning_rate": 7.98826868223302e-05, + "loss": 11.9089, + "step": 21212 + }, + { + "epoch": 1.1551326295154274, + "grad_norm": 0.5037670116250631, + "learning_rate": 7.987404892550289e-05, + "loss": 11.9592, + "step": 21213 + }, + { + "epoch": 1.1551870835120106, + "grad_norm": 0.5199204506116677, + "learning_rate": 7.98654111851784e-05, + "loss": 11.9533, + "step": 21214 + }, + { + "epoch": 1.1552415375085936, + "grad_norm": 0.6107623890788886, + "learning_rate": 7.985677360142384e-05, + "loss": 12.0708, + "step": 21215 + }, + { + "epoch": 1.1552959915051766, + "grad_norm": 0.5511599905622395, + "learning_rate": 7.984813617430644e-05, + "loss": 12.1261, + "step": 21216 + }, + { + "epoch": 1.1553504455017596, + "grad_norm": 0.5115863465619226, + "learning_rate": 7.983949890389322e-05, + "loss": 11.9577, + "step": 21217 + }, + { + "epoch": 1.1554048994983426, + "grad_norm": 0.5637422061740728, + "learning_rate": 7.983086179025148e-05, + "loss": 12.0071, + "step": 21218 + }, + { + "epoch": 1.1554593534949256, + "grad_norm": 0.4981769207547077, + "learning_rate": 7.982222483344834e-05, + "loss": 11.9628, + "step": 21219 + }, + { + "epoch": 1.1555138074915086, + "grad_norm": 0.5414525482918751, + "learning_rate": 7.981358803355095e-05, + "loss": 11.975, + "step": 21220 + }, + { + "epoch": 1.1555682614880916, + "grad_norm": 0.5530668572217571, + "learning_rate": 7.980495139062649e-05, + "loss": 12.0245, + "step": 21221 + }, + { + "epoch": 1.1556227154846745, + "grad_norm": 0.5786923404746341, + "learning_rate": 7.979631490474213e-05, + "loss": 11.9814, + "step": 21222 + }, + { + "epoch": 1.1556771694812575, + "grad_norm": 0.6240826110107114, + "learning_rate": 7.978767857596499e-05, + "loss": 12.1326, + "step": 21223 + }, + { + "epoch": 1.1557316234778405, + "grad_norm": 0.49542183088818065, + "learning_rate": 7.977904240436224e-05, + "loss": 11.7954, + "step": 21224 + }, + { + "epoch": 1.1557860774744237, + "grad_norm": 0.556239010444873, + "learning_rate": 7.977040639000107e-05, + "loss": 12.0706, + "step": 21225 + }, + { + "epoch": 1.1558405314710067, + "grad_norm": 0.5166307537336219, + "learning_rate": 7.976177053294867e-05, + "loss": 11.9495, + "step": 21226 + }, + { + "epoch": 1.1558949854675897, + "grad_norm": 0.5862827674070573, + "learning_rate": 7.975313483327206e-05, + "loss": 12.0701, + "step": 21227 + }, + { + "epoch": 1.1559494394641727, + "grad_norm": 0.5799460978782308, + "learning_rate": 7.974449929103847e-05, + "loss": 11.9343, + "step": 21228 + }, + { + "epoch": 1.1560038934607557, + "grad_norm": 0.5799848983304122, + "learning_rate": 7.973586390631508e-05, + "loss": 11.9795, + "step": 21229 + }, + { + "epoch": 1.1560583474573387, + "grad_norm": 0.5479810690267198, + "learning_rate": 7.9727228679169e-05, + "loss": 12.0038, + "step": 21230 + }, + { + "epoch": 1.1561128014539217, + "grad_norm": 0.5268258755091737, + "learning_rate": 7.971859360966739e-05, + "loss": 11.9109, + "step": 21231 + }, + { + "epoch": 1.1561672554505047, + "grad_norm": 0.5399960113306216, + "learning_rate": 7.970995869787738e-05, + "loss": 12.1343, + "step": 21232 + }, + { + "epoch": 1.1562217094470877, + "grad_norm": 0.5416405261216394, + "learning_rate": 7.970132394386616e-05, + "loss": 11.9387, + "step": 21233 + }, + { + "epoch": 1.1562761634436707, + "grad_norm": 0.4994698027199779, + "learning_rate": 7.969268934770084e-05, + "loss": 11.9743, + "step": 21234 + }, + { + "epoch": 1.1563306174402537, + "grad_norm": 0.5605141229754358, + "learning_rate": 7.968405490944855e-05, + "loss": 11.9807, + "step": 21235 + }, + { + "epoch": 1.1563850714368367, + "grad_norm": 0.5164187375636196, + "learning_rate": 7.967542062917648e-05, + "loss": 11.9675, + "step": 21236 + }, + { + "epoch": 1.1564395254334197, + "grad_norm": 0.6047170965949826, + "learning_rate": 7.966678650695179e-05, + "loss": 12.0729, + "step": 21237 + }, + { + "epoch": 1.1564939794300029, + "grad_norm": 0.5670926295472087, + "learning_rate": 7.965815254284152e-05, + "loss": 12.0425, + "step": 21238 + }, + { + "epoch": 1.1565484334265859, + "grad_norm": 0.5591787087438296, + "learning_rate": 7.964951873691289e-05, + "loss": 12.0772, + "step": 21239 + }, + { + "epoch": 1.1566028874231689, + "grad_norm": 0.5418566161904471, + "learning_rate": 7.964088508923297e-05, + "loss": 12.067, + "step": 21240 + }, + { + "epoch": 1.1566573414197518, + "grad_norm": 0.60068859839871, + "learning_rate": 7.963225159986899e-05, + "loss": 12.1196, + "step": 21241 + }, + { + "epoch": 1.1567117954163348, + "grad_norm": 0.6292034462025742, + "learning_rate": 7.962361826888802e-05, + "loss": 12.1183, + "step": 21242 + }, + { + "epoch": 1.1567662494129178, + "grad_norm": 0.5440244217598444, + "learning_rate": 7.961498509635722e-05, + "loss": 11.9312, + "step": 21243 + }, + { + "epoch": 1.1568207034095008, + "grad_norm": 0.6557745520530629, + "learning_rate": 7.96063520823437e-05, + "loss": 12.0664, + "step": 21244 + }, + { + "epoch": 1.1568751574060838, + "grad_norm": 0.51055205754822, + "learning_rate": 7.959771922691463e-05, + "loss": 12.0452, + "step": 21245 + }, + { + "epoch": 1.1569296114026668, + "grad_norm": 0.5914900864544089, + "learning_rate": 7.95890865301371e-05, + "loss": 11.9728, + "step": 21246 + }, + { + "epoch": 1.1569840653992498, + "grad_norm": 0.5354573851902568, + "learning_rate": 7.958045399207827e-05, + "loss": 11.9417, + "step": 21247 + }, + { + "epoch": 1.157038519395833, + "grad_norm": 0.5204337816259289, + "learning_rate": 7.957182161280526e-05, + "loss": 12.0514, + "step": 21248 + }, + { + "epoch": 1.157092973392416, + "grad_norm": 0.5252946339072717, + "learning_rate": 7.956318939238517e-05, + "loss": 11.989, + "step": 21249 + }, + { + "epoch": 1.157147427388999, + "grad_norm": 0.5091175709706328, + "learning_rate": 7.955455733088516e-05, + "loss": 12.0704, + "step": 21250 + }, + { + "epoch": 1.157201881385582, + "grad_norm": 0.5174848837304137, + "learning_rate": 7.954592542837229e-05, + "loss": 12.0009, + "step": 21251 + }, + { + "epoch": 1.157256335382165, + "grad_norm": 0.5023318740844385, + "learning_rate": 7.953729368491378e-05, + "loss": 11.8673, + "step": 21252 + }, + { + "epoch": 1.157310789378748, + "grad_norm": 0.6140836736504344, + "learning_rate": 7.95286621005767e-05, + "loss": 11.9944, + "step": 21253 + }, + { + "epoch": 1.157365243375331, + "grad_norm": 0.5182919984248456, + "learning_rate": 7.952003067542818e-05, + "loss": 12.1002, + "step": 21254 + }, + { + "epoch": 1.157419697371914, + "grad_norm": 0.5287698514499525, + "learning_rate": 7.951139940953533e-05, + "loss": 12.0127, + "step": 21255 + }, + { + "epoch": 1.157474151368497, + "grad_norm": 0.5765625554102265, + "learning_rate": 7.950276830296527e-05, + "loss": 12.0541, + "step": 21256 + }, + { + "epoch": 1.15752860536508, + "grad_norm": 0.5937487107232046, + "learning_rate": 7.949413735578517e-05, + "loss": 12.071, + "step": 21257 + }, + { + "epoch": 1.157583059361663, + "grad_norm": 0.5589663206974158, + "learning_rate": 7.948550656806205e-05, + "loss": 11.8881, + "step": 21258 + }, + { + "epoch": 1.157637513358246, + "grad_norm": 0.666793727274845, + "learning_rate": 7.947687593986308e-05, + "loss": 12.0645, + "step": 21259 + }, + { + "epoch": 1.157691967354829, + "grad_norm": 0.6111311427044318, + "learning_rate": 7.946824547125536e-05, + "loss": 11.9878, + "step": 21260 + }, + { + "epoch": 1.1577464213514121, + "grad_norm": 0.5227747121282392, + "learning_rate": 7.945961516230601e-05, + "loss": 11.9656, + "step": 21261 + }, + { + "epoch": 1.1578008753479951, + "grad_norm": 0.5919891131184604, + "learning_rate": 7.94509850130821e-05, + "loss": 11.9137, + "step": 21262 + }, + { + "epoch": 1.1578553293445781, + "grad_norm": 0.5526761338976475, + "learning_rate": 7.944235502365083e-05, + "loss": 11.9691, + "step": 21263 + }, + { + "epoch": 1.1579097833411611, + "grad_norm": 0.5191002217963598, + "learning_rate": 7.943372519407924e-05, + "loss": 11.9662, + "step": 21264 + }, + { + "epoch": 1.1579642373377441, + "grad_norm": 0.5784555927342834, + "learning_rate": 7.942509552443445e-05, + "loss": 12.0409, + "step": 21265 + }, + { + "epoch": 1.158018691334327, + "grad_norm": 0.5888502247943441, + "learning_rate": 7.941646601478357e-05, + "loss": 12.0134, + "step": 21266 + }, + { + "epoch": 1.15807314533091, + "grad_norm": 0.5631901823699048, + "learning_rate": 7.940783666519372e-05, + "loss": 11.9675, + "step": 21267 + }, + { + "epoch": 1.158127599327493, + "grad_norm": 0.5360246408255919, + "learning_rate": 7.939920747573195e-05, + "loss": 12.0723, + "step": 21268 + }, + { + "epoch": 1.158182053324076, + "grad_norm": 0.5450568280593008, + "learning_rate": 7.939057844646542e-05, + "loss": 11.9873, + "step": 21269 + }, + { + "epoch": 1.158236507320659, + "grad_norm": 0.7460199505110907, + "learning_rate": 7.93819495774612e-05, + "loss": 12.0658, + "step": 21270 + }, + { + "epoch": 1.1582909613172423, + "grad_norm": 0.6202295260594721, + "learning_rate": 7.937332086878639e-05, + "loss": 12.0706, + "step": 21271 + }, + { + "epoch": 1.1583454153138253, + "grad_norm": 0.5186259869149088, + "learning_rate": 7.93646923205081e-05, + "loss": 12.0008, + "step": 21272 + }, + { + "epoch": 1.1583998693104083, + "grad_norm": 0.5389499880625561, + "learning_rate": 7.935606393269341e-05, + "loss": 12.0292, + "step": 21273 + }, + { + "epoch": 1.1584543233069913, + "grad_norm": 0.6026633532668297, + "learning_rate": 7.934743570540944e-05, + "loss": 11.9039, + "step": 21274 + }, + { + "epoch": 1.1585087773035743, + "grad_norm": 0.5740250909525888, + "learning_rate": 7.933880763872328e-05, + "loss": 12.0317, + "step": 21275 + }, + { + "epoch": 1.1585632313001573, + "grad_norm": 0.5241001314895766, + "learning_rate": 7.933017973270202e-05, + "loss": 12.0434, + "step": 21276 + }, + { + "epoch": 1.1586176852967403, + "grad_norm": 0.5617167280592977, + "learning_rate": 7.932155198741276e-05, + "loss": 12.1402, + "step": 21277 + }, + { + "epoch": 1.1586721392933232, + "grad_norm": 0.5273417370388322, + "learning_rate": 7.931292440292258e-05, + "loss": 11.9136, + "step": 21278 + }, + { + "epoch": 1.1587265932899062, + "grad_norm": 0.5187379134264419, + "learning_rate": 7.930429697929855e-05, + "loss": 12.0329, + "step": 21279 + }, + { + "epoch": 1.1587810472864892, + "grad_norm": 0.5208629925119909, + "learning_rate": 7.929566971660777e-05, + "loss": 11.9753, + "step": 21280 + }, + { + "epoch": 1.1588355012830722, + "grad_norm": 0.5019973934481978, + "learning_rate": 7.928704261491735e-05, + "loss": 11.9225, + "step": 21281 + }, + { + "epoch": 1.1588899552796552, + "grad_norm": 0.5206683386613279, + "learning_rate": 7.927841567429435e-05, + "loss": 12.0117, + "step": 21282 + }, + { + "epoch": 1.1589444092762382, + "grad_norm": 0.5106389297711984, + "learning_rate": 7.926978889480587e-05, + "loss": 12.0175, + "step": 21283 + }, + { + "epoch": 1.1589988632728214, + "grad_norm": 0.5863089332292762, + "learning_rate": 7.926116227651896e-05, + "loss": 12.0695, + "step": 21284 + }, + { + "epoch": 1.1590533172694044, + "grad_norm": 0.5314155203167451, + "learning_rate": 7.925253581950077e-05, + "loss": 11.9583, + "step": 21285 + }, + { + "epoch": 1.1591077712659874, + "grad_norm": 0.5579858506104858, + "learning_rate": 7.924390952381832e-05, + "loss": 12.0124, + "step": 21286 + }, + { + "epoch": 1.1591622252625704, + "grad_norm": 0.5483341561174498, + "learning_rate": 7.923528338953876e-05, + "loss": 12.0567, + "step": 21287 + }, + { + "epoch": 1.1592166792591534, + "grad_norm": 0.5681173540784744, + "learning_rate": 7.922665741672905e-05, + "loss": 12.078, + "step": 21288 + }, + { + "epoch": 1.1592711332557364, + "grad_norm": 0.5228607695356412, + "learning_rate": 7.921803160545637e-05, + "loss": 12.0183, + "step": 21289 + }, + { + "epoch": 1.1593255872523194, + "grad_norm": 0.5857548834210604, + "learning_rate": 7.920940595578775e-05, + "loss": 12.0019, + "step": 21290 + }, + { + "epoch": 1.1593800412489024, + "grad_norm": 0.523380359704805, + "learning_rate": 7.920078046779028e-05, + "loss": 12.037, + "step": 21291 + }, + { + "epoch": 1.1594344952454854, + "grad_norm": 0.5778553722466274, + "learning_rate": 7.919215514153103e-05, + "loss": 12.0122, + "step": 21292 + }, + { + "epoch": 1.1594889492420684, + "grad_norm": 0.5528023586460237, + "learning_rate": 7.918352997707708e-05, + "loss": 12.0816, + "step": 21293 + }, + { + "epoch": 1.1595434032386516, + "grad_norm": 0.5587668613138254, + "learning_rate": 7.917490497449547e-05, + "loss": 11.8789, + "step": 21294 + }, + { + "epoch": 1.1595978572352346, + "grad_norm": 0.555394070804435, + "learning_rate": 7.916628013385331e-05, + "loss": 12.0878, + "step": 21295 + }, + { + "epoch": 1.1596523112318176, + "grad_norm": 0.5488890422522659, + "learning_rate": 7.915765545521761e-05, + "loss": 12.0984, + "step": 21296 + }, + { + "epoch": 1.1597067652284005, + "grad_norm": 0.546272098096521, + "learning_rate": 7.914903093865555e-05, + "loss": 11.9192, + "step": 21297 + }, + { + "epoch": 1.1597612192249835, + "grad_norm": 0.6543254243942339, + "learning_rate": 7.91404065842341e-05, + "loss": 12.0496, + "step": 21298 + }, + { + "epoch": 1.1598156732215665, + "grad_norm": 0.6017006398399886, + "learning_rate": 7.913178239202032e-05, + "loss": 12.1064, + "step": 21299 + }, + { + "epoch": 1.1598701272181495, + "grad_norm": 0.5230995730140959, + "learning_rate": 7.912315836208128e-05, + "loss": 12.0087, + "step": 21300 + }, + { + "epoch": 1.1599245812147325, + "grad_norm": 0.5614196050577605, + "learning_rate": 7.911453449448409e-05, + "loss": 12.0527, + "step": 21301 + }, + { + "epoch": 1.1599790352113155, + "grad_norm": 0.4539513549397306, + "learning_rate": 7.910591078929578e-05, + "loss": 12.0398, + "step": 21302 + }, + { + "epoch": 1.1600334892078985, + "grad_norm": 0.5356462699840768, + "learning_rate": 7.909728724658342e-05, + "loss": 12.0782, + "step": 21303 + }, + { + "epoch": 1.1600879432044815, + "grad_norm": 0.5066520142058429, + "learning_rate": 7.908866386641404e-05, + "loss": 11.807, + "step": 21304 + }, + { + "epoch": 1.1601423972010645, + "grad_norm": 0.5668869110468674, + "learning_rate": 7.908004064885475e-05, + "loss": 12.0233, + "step": 21305 + }, + { + "epoch": 1.1601968511976475, + "grad_norm": 0.5285924976149727, + "learning_rate": 7.907141759397255e-05, + "loss": 12.1045, + "step": 21306 + }, + { + "epoch": 1.1602513051942305, + "grad_norm": 0.546543507842577, + "learning_rate": 7.906279470183453e-05, + "loss": 12.0654, + "step": 21307 + }, + { + "epoch": 1.1603057591908137, + "grad_norm": 0.5385015810317186, + "learning_rate": 7.905417197250772e-05, + "loss": 12.1201, + "step": 21308 + }, + { + "epoch": 1.1603602131873967, + "grad_norm": 0.5577951474519199, + "learning_rate": 7.904554940605918e-05, + "loss": 11.9945, + "step": 21309 + }, + { + "epoch": 1.1604146671839797, + "grad_norm": 0.5469466081823766, + "learning_rate": 7.903692700255596e-05, + "loss": 12.0259, + "step": 21310 + }, + { + "epoch": 1.1604691211805627, + "grad_norm": 0.5346637133607479, + "learning_rate": 7.902830476206509e-05, + "loss": 11.9273, + "step": 21311 + }, + { + "epoch": 1.1605235751771457, + "grad_norm": 0.5918326644363558, + "learning_rate": 7.901968268465366e-05, + "loss": 12.0651, + "step": 21312 + }, + { + "epoch": 1.1605780291737287, + "grad_norm": 0.5384818229036822, + "learning_rate": 7.90110607703887e-05, + "loss": 11.9657, + "step": 21313 + }, + { + "epoch": 1.1606324831703116, + "grad_norm": 0.6664454169675025, + "learning_rate": 7.900243901933726e-05, + "loss": 12.014, + "step": 21314 + }, + { + "epoch": 1.1606869371668946, + "grad_norm": 0.5937989905265902, + "learning_rate": 7.899381743156636e-05, + "loss": 12.1148, + "step": 21315 + }, + { + "epoch": 1.1607413911634776, + "grad_norm": 0.5574320242817628, + "learning_rate": 7.898519600714304e-05, + "loss": 11.9253, + "step": 21316 + }, + { + "epoch": 1.1607958451600606, + "grad_norm": 0.669922264965814, + "learning_rate": 7.897657474613442e-05, + "loss": 11.993, + "step": 21317 + }, + { + "epoch": 1.1608502991566438, + "grad_norm": 0.5635964816746843, + "learning_rate": 7.896795364860743e-05, + "loss": 11.9512, + "step": 21318 + }, + { + "epoch": 1.1609047531532268, + "grad_norm": 0.5431889182245252, + "learning_rate": 7.895933271462919e-05, + "loss": 12.103, + "step": 21319 + }, + { + "epoch": 1.1609592071498098, + "grad_norm": 0.5465032975915614, + "learning_rate": 7.895071194426669e-05, + "loss": 11.995, + "step": 21320 + }, + { + "epoch": 1.1610136611463928, + "grad_norm": 0.6067239504686526, + "learning_rate": 7.894209133758698e-05, + "loss": 12.0264, + "step": 21321 + }, + { + "epoch": 1.1610681151429758, + "grad_norm": 0.5230913732754815, + "learning_rate": 7.893347089465707e-05, + "loss": 11.9428, + "step": 21322 + }, + { + "epoch": 1.1611225691395588, + "grad_norm": 0.554245263055647, + "learning_rate": 7.892485061554407e-05, + "loss": 11.9681, + "step": 21323 + }, + { + "epoch": 1.1611770231361418, + "grad_norm": 0.5982935869902283, + "learning_rate": 7.891623050031495e-05, + "loss": 11.9538, + "step": 21324 + }, + { + "epoch": 1.1612314771327248, + "grad_norm": 0.6120429744966326, + "learning_rate": 7.890761054903675e-05, + "loss": 11.9878, + "step": 21325 + }, + { + "epoch": 1.1612859311293078, + "grad_norm": 0.5260012624394336, + "learning_rate": 7.889899076177651e-05, + "loss": 12.005, + "step": 21326 + }, + { + "epoch": 1.1613403851258908, + "grad_norm": 0.5592786523396323, + "learning_rate": 7.88903711386013e-05, + "loss": 11.9804, + "step": 21327 + }, + { + "epoch": 1.1613948391224738, + "grad_norm": 0.641811322354292, + "learning_rate": 7.888175167957807e-05, + "loss": 12.0675, + "step": 21328 + }, + { + "epoch": 1.1614492931190568, + "grad_norm": 0.5965510411636518, + "learning_rate": 7.887313238477387e-05, + "loss": 11.9095, + "step": 21329 + }, + { + "epoch": 1.1615037471156398, + "grad_norm": 0.5483269925942232, + "learning_rate": 7.886451325425574e-05, + "loss": 12.0477, + "step": 21330 + }, + { + "epoch": 1.161558201112223, + "grad_norm": 0.5454992293707681, + "learning_rate": 7.88558942880907e-05, + "loss": 11.9792, + "step": 21331 + }, + { + "epoch": 1.161612655108806, + "grad_norm": 0.5596179934384802, + "learning_rate": 7.884727548634578e-05, + "loss": 12.11, + "step": 21332 + }, + { + "epoch": 1.161667109105389, + "grad_norm": 0.6544838598092451, + "learning_rate": 7.883865684908797e-05, + "loss": 11.9894, + "step": 21333 + }, + { + "epoch": 1.161721563101972, + "grad_norm": 0.5976716142108864, + "learning_rate": 7.883003837638433e-05, + "loss": 12.1143, + "step": 21334 + }, + { + "epoch": 1.161776017098555, + "grad_norm": 0.6820796469692685, + "learning_rate": 7.882142006830186e-05, + "loss": 12.1575, + "step": 21335 + }, + { + "epoch": 1.161830471095138, + "grad_norm": 0.5891571395628682, + "learning_rate": 7.881280192490759e-05, + "loss": 11.9877, + "step": 21336 + }, + { + "epoch": 1.161884925091721, + "grad_norm": 0.6234461921548158, + "learning_rate": 7.880418394626852e-05, + "loss": 12.0753, + "step": 21337 + }, + { + "epoch": 1.161939379088304, + "grad_norm": 0.5503187924707625, + "learning_rate": 7.879556613245168e-05, + "loss": 11.9641, + "step": 21338 + }, + { + "epoch": 1.161993833084887, + "grad_norm": 0.5876566938983689, + "learning_rate": 7.878694848352406e-05, + "loss": 12.0456, + "step": 21339 + }, + { + "epoch": 1.16204828708147, + "grad_norm": 0.5923178089296206, + "learning_rate": 7.877833099955269e-05, + "loss": 12.075, + "step": 21340 + }, + { + "epoch": 1.1621027410780531, + "grad_norm": 0.5391073852811503, + "learning_rate": 7.876971368060457e-05, + "loss": 11.9976, + "step": 21341 + }, + { + "epoch": 1.162157195074636, + "grad_norm": 0.5005271848541565, + "learning_rate": 7.876109652674672e-05, + "loss": 11.874, + "step": 21342 + }, + { + "epoch": 1.162211649071219, + "grad_norm": 0.6154958135301781, + "learning_rate": 7.875247953804615e-05, + "loss": 12.0391, + "step": 21343 + }, + { + "epoch": 1.162266103067802, + "grad_norm": 0.5388557819299863, + "learning_rate": 7.874386271456986e-05, + "loss": 11.933, + "step": 21344 + }, + { + "epoch": 1.162320557064385, + "grad_norm": 0.576313255775912, + "learning_rate": 7.873524605638483e-05, + "loss": 12.0524, + "step": 21345 + }, + { + "epoch": 1.162375011060968, + "grad_norm": 0.5233283920183274, + "learning_rate": 7.872662956355812e-05, + "loss": 12.0269, + "step": 21346 + }, + { + "epoch": 1.162429465057551, + "grad_norm": 0.5535799199008398, + "learning_rate": 7.871801323615675e-05, + "loss": 12.0384, + "step": 21347 + }, + { + "epoch": 1.162483919054134, + "grad_norm": 0.5528278765162272, + "learning_rate": 7.870939707424762e-05, + "loss": 12.034, + "step": 21348 + }, + { + "epoch": 1.162538373050717, + "grad_norm": 0.5317526454774004, + "learning_rate": 7.870078107789778e-05, + "loss": 12.0472, + "step": 21349 + }, + { + "epoch": 1.1625928270473, + "grad_norm": 0.5714501615963714, + "learning_rate": 7.869216524717426e-05, + "loss": 11.973, + "step": 21350 + }, + { + "epoch": 1.162647281043883, + "grad_norm": 0.5215241127983764, + "learning_rate": 7.868354958214404e-05, + "loss": 11.9012, + "step": 21351 + }, + { + "epoch": 1.162701735040466, + "grad_norm": 0.4966222070787855, + "learning_rate": 7.867493408287412e-05, + "loss": 11.972, + "step": 21352 + }, + { + "epoch": 1.162756189037049, + "grad_norm": 0.580673890672549, + "learning_rate": 7.866631874943148e-05, + "loss": 11.9781, + "step": 21353 + }, + { + "epoch": 1.1628106430336322, + "grad_norm": 0.6109334189838569, + "learning_rate": 7.865770358188312e-05, + "loss": 12.0469, + "step": 21354 + }, + { + "epoch": 1.1628650970302152, + "grad_norm": 0.6331103900132437, + "learning_rate": 7.864908858029604e-05, + "loss": 11.9192, + "step": 21355 + }, + { + "epoch": 1.1629195510267982, + "grad_norm": 0.5767142908888633, + "learning_rate": 7.86404737447372e-05, + "loss": 12.213, + "step": 21356 + }, + { + "epoch": 1.1629740050233812, + "grad_norm": 0.6159866567963135, + "learning_rate": 7.863185907527369e-05, + "loss": 12.0352, + "step": 21357 + }, + { + "epoch": 1.1630284590199642, + "grad_norm": 0.5060434680232894, + "learning_rate": 7.862324457197237e-05, + "loss": 11.8988, + "step": 21358 + }, + { + "epoch": 1.1630829130165472, + "grad_norm": 0.660432155580146, + "learning_rate": 7.86146302349003e-05, + "loss": 12.1318, + "step": 21359 + }, + { + "epoch": 1.1631373670131302, + "grad_norm": 0.6432927532773697, + "learning_rate": 7.860601606412444e-05, + "loss": 11.8871, + "step": 21360 + }, + { + "epoch": 1.1631918210097132, + "grad_norm": 0.5569170269303703, + "learning_rate": 7.859740205971178e-05, + "loss": 12.0168, + "step": 21361 + }, + { + "epoch": 1.1632462750062962, + "grad_norm": 0.5579488273407743, + "learning_rate": 7.858878822172933e-05, + "loss": 12.0225, + "step": 21362 + }, + { + "epoch": 1.1633007290028792, + "grad_norm": 0.5476491036133607, + "learning_rate": 7.858017455024405e-05, + "loss": 11.8573, + "step": 21363 + }, + { + "epoch": 1.1633551829994624, + "grad_norm": 0.5435103348402449, + "learning_rate": 7.857156104532293e-05, + "loss": 11.998, + "step": 21364 + }, + { + "epoch": 1.1634096369960454, + "grad_norm": 0.5526049723870691, + "learning_rate": 7.856294770703292e-05, + "loss": 11.9178, + "step": 21365 + }, + { + "epoch": 1.1634640909926284, + "grad_norm": 0.6471772261019824, + "learning_rate": 7.855433453544105e-05, + "loss": 12.1013, + "step": 21366 + }, + { + "epoch": 1.1635185449892114, + "grad_norm": 0.5763607212315821, + "learning_rate": 7.854572153061428e-05, + "loss": 11.945, + "step": 21367 + }, + { + "epoch": 1.1635729989857944, + "grad_norm": 0.5876161848674404, + "learning_rate": 7.853710869261957e-05, + "loss": 11.9471, + "step": 21368 + }, + { + "epoch": 1.1636274529823774, + "grad_norm": 0.5490189329583168, + "learning_rate": 7.85284960215239e-05, + "loss": 11.9542, + "step": 21369 + }, + { + "epoch": 1.1636819069789603, + "grad_norm": 0.5655948032052398, + "learning_rate": 7.851988351739423e-05, + "loss": 12.0385, + "step": 21370 + }, + { + "epoch": 1.1637363609755433, + "grad_norm": 0.6149743281232586, + "learning_rate": 7.851127118029753e-05, + "loss": 11.9412, + "step": 21371 + }, + { + "epoch": 1.1637908149721263, + "grad_norm": 0.5610191324150361, + "learning_rate": 7.850265901030081e-05, + "loss": 11.9229, + "step": 21372 + }, + { + "epoch": 1.1638452689687093, + "grad_norm": 0.5474582047453633, + "learning_rate": 7.849404700747103e-05, + "loss": 11.9991, + "step": 21373 + }, + { + "epoch": 1.1638997229652923, + "grad_norm": 0.5261407730884615, + "learning_rate": 7.848543517187514e-05, + "loss": 11.9588, + "step": 21374 + }, + { + "epoch": 1.1639541769618753, + "grad_norm": 0.6201605935308045, + "learning_rate": 7.847682350358012e-05, + "loss": 12.0262, + "step": 21375 + }, + { + "epoch": 1.1640086309584583, + "grad_norm": 0.5875726654817186, + "learning_rate": 7.846821200265292e-05, + "loss": 12.1394, + "step": 21376 + }, + { + "epoch": 1.1640630849550415, + "grad_norm": 0.545933145885558, + "learning_rate": 7.845960066916052e-05, + "loss": 11.8759, + "step": 21377 + }, + { + "epoch": 1.1641175389516245, + "grad_norm": 0.5308942247364341, + "learning_rate": 7.845098950316991e-05, + "loss": 12.0471, + "step": 21378 + }, + { + "epoch": 1.1641719929482075, + "grad_norm": 0.5420715926627245, + "learning_rate": 7.844237850474798e-05, + "loss": 11.8491, + "step": 21379 + }, + { + "epoch": 1.1642264469447905, + "grad_norm": 0.5368358898850524, + "learning_rate": 7.843376767396174e-05, + "loss": 12.0462, + "step": 21380 + }, + { + "epoch": 1.1642809009413735, + "grad_norm": 0.5109476748719622, + "learning_rate": 7.842515701087813e-05, + "loss": 12.0705, + "step": 21381 + }, + { + "epoch": 1.1643353549379565, + "grad_norm": 0.528363230321364, + "learning_rate": 7.841654651556409e-05, + "loss": 11.8376, + "step": 21382 + }, + { + "epoch": 1.1643898089345395, + "grad_norm": 0.5139255861100533, + "learning_rate": 7.840793618808664e-05, + "loss": 11.8097, + "step": 21383 + }, + { + "epoch": 1.1644442629311225, + "grad_norm": 0.5236125064547095, + "learning_rate": 7.839932602851269e-05, + "loss": 12.0464, + "step": 21384 + }, + { + "epoch": 1.1644987169277055, + "grad_norm": 0.5277906866132869, + "learning_rate": 7.839071603690922e-05, + "loss": 12.0775, + "step": 21385 + }, + { + "epoch": 1.1645531709242884, + "grad_norm": 0.5543228153366013, + "learning_rate": 7.838210621334316e-05, + "loss": 12.0554, + "step": 21386 + }, + { + "epoch": 1.1646076249208714, + "grad_norm": 0.5684060371693637, + "learning_rate": 7.837349655788146e-05, + "loss": 11.8817, + "step": 21387 + }, + { + "epoch": 1.1646620789174547, + "grad_norm": 0.5108695795591632, + "learning_rate": 7.836488707059109e-05, + "loss": 11.7814, + "step": 21388 + }, + { + "epoch": 1.1647165329140376, + "grad_norm": 0.5589237735708477, + "learning_rate": 7.835627775153899e-05, + "loss": 11.9772, + "step": 21389 + }, + { + "epoch": 1.1647709869106206, + "grad_norm": 0.5192688732247119, + "learning_rate": 7.834766860079208e-05, + "loss": 11.8846, + "step": 21390 + }, + { + "epoch": 1.1648254409072036, + "grad_norm": 0.6108269002553003, + "learning_rate": 7.833905961841734e-05, + "loss": 11.9929, + "step": 21391 + }, + { + "epoch": 1.1648798949037866, + "grad_norm": 0.5471500568396419, + "learning_rate": 7.833045080448172e-05, + "loss": 12.1352, + "step": 21392 + }, + { + "epoch": 1.1649343489003696, + "grad_norm": 0.6343554048826865, + "learning_rate": 7.832184215905211e-05, + "loss": 11.9481, + "step": 21393 + }, + { + "epoch": 1.1649888028969526, + "grad_norm": 0.5065980631943604, + "learning_rate": 7.831323368219551e-05, + "loss": 11.9507, + "step": 21394 + }, + { + "epoch": 1.1650432568935356, + "grad_norm": 0.5531899671964808, + "learning_rate": 7.830462537397884e-05, + "loss": 11.8348, + "step": 21395 + }, + { + "epoch": 1.1650977108901186, + "grad_norm": 0.564211332494212, + "learning_rate": 7.829601723446904e-05, + "loss": 11.9069, + "step": 21396 + }, + { + "epoch": 1.1651521648867016, + "grad_norm": 0.508681001390616, + "learning_rate": 7.828740926373307e-05, + "loss": 12.025, + "step": 21397 + }, + { + "epoch": 1.1652066188832846, + "grad_norm": 0.5496652300910502, + "learning_rate": 7.827880146183785e-05, + "loss": 11.9943, + "step": 21398 + }, + { + "epoch": 1.1652610728798676, + "grad_norm": 0.5495858512184852, + "learning_rate": 7.82701938288503e-05, + "loss": 12.0881, + "step": 21399 + }, + { + "epoch": 1.1653155268764506, + "grad_norm": 0.558278490805566, + "learning_rate": 7.826158636483736e-05, + "loss": 11.9001, + "step": 21400 + }, + { + "epoch": 1.1653699808730338, + "grad_norm": 0.5387885611485658, + "learning_rate": 7.825297906986597e-05, + "loss": 12.0711, + "step": 21401 + }, + { + "epoch": 1.1654244348696168, + "grad_norm": 0.4769160878218986, + "learning_rate": 7.824437194400307e-05, + "loss": 11.9543, + "step": 21402 + }, + { + "epoch": 1.1654788888661998, + "grad_norm": 0.5122308867949037, + "learning_rate": 7.823576498731557e-05, + "loss": 12.0465, + "step": 21403 + }, + { + "epoch": 1.1655333428627828, + "grad_norm": 0.5677880671335251, + "learning_rate": 7.822715819987042e-05, + "loss": 11.9772, + "step": 21404 + }, + { + "epoch": 1.1655877968593658, + "grad_norm": 0.49972644574935027, + "learning_rate": 7.821855158173453e-05, + "loss": 12.0451, + "step": 21405 + }, + { + "epoch": 1.1656422508559487, + "grad_norm": 0.5070277872774278, + "learning_rate": 7.820994513297484e-05, + "loss": 12.0071, + "step": 21406 + }, + { + "epoch": 1.1656967048525317, + "grad_norm": 0.5061112900393233, + "learning_rate": 7.820133885365827e-05, + "loss": 11.9472, + "step": 21407 + }, + { + "epoch": 1.1657511588491147, + "grad_norm": 0.546498177730851, + "learning_rate": 7.819273274385179e-05, + "loss": 12.0763, + "step": 21408 + }, + { + "epoch": 1.1658056128456977, + "grad_norm": 0.5124945710238951, + "learning_rate": 7.818412680362222e-05, + "loss": 12.0112, + "step": 21409 + }, + { + "epoch": 1.1658600668422807, + "grad_norm": 0.5119162206929523, + "learning_rate": 7.817552103303654e-05, + "loss": 11.9779, + "step": 21410 + }, + { + "epoch": 1.165914520838864, + "grad_norm": 0.5696860492088878, + "learning_rate": 7.81669154321617e-05, + "loss": 11.923, + "step": 21411 + }, + { + "epoch": 1.165968974835447, + "grad_norm": 0.5315858939238846, + "learning_rate": 7.815831000106457e-05, + "loss": 12.0212, + "step": 21412 + }, + { + "epoch": 1.16602342883203, + "grad_norm": 0.5426692120256107, + "learning_rate": 7.814970473981208e-05, + "loss": 11.9583, + "step": 21413 + }, + { + "epoch": 1.166077882828613, + "grad_norm": 0.5719014979707006, + "learning_rate": 7.814109964847115e-05, + "loss": 11.8939, + "step": 21414 + }, + { + "epoch": 1.166132336825196, + "grad_norm": 0.5993287668117351, + "learning_rate": 7.81324947271087e-05, + "loss": 11.9861, + "step": 21415 + }, + { + "epoch": 1.166186790821779, + "grad_norm": 0.5455525230575496, + "learning_rate": 7.812388997579161e-05, + "loss": 11.9907, + "step": 21416 + }, + { + "epoch": 1.1662412448183619, + "grad_norm": 0.5466803659336565, + "learning_rate": 7.811528539458686e-05, + "loss": 11.9992, + "step": 21417 + }, + { + "epoch": 1.1662956988149449, + "grad_norm": 0.5648320572273463, + "learning_rate": 7.810668098356134e-05, + "loss": 11.9194, + "step": 21418 + }, + { + "epoch": 1.1663501528115279, + "grad_norm": 0.5427405229639615, + "learning_rate": 7.809807674278191e-05, + "loss": 11.7762, + "step": 21419 + }, + { + "epoch": 1.1664046068081109, + "grad_norm": 0.5612093245163017, + "learning_rate": 7.808947267231549e-05, + "loss": 12.1771, + "step": 21420 + }, + { + "epoch": 1.1664590608046939, + "grad_norm": 0.5351677144029716, + "learning_rate": 7.808086877222903e-05, + "loss": 11.9764, + "step": 21421 + }, + { + "epoch": 1.1665135148012769, + "grad_norm": 0.6642511035229242, + "learning_rate": 7.80722650425894e-05, + "loss": 12.0197, + "step": 21422 + }, + { + "epoch": 1.1665679687978598, + "grad_norm": 0.5569335625516744, + "learning_rate": 7.806366148346352e-05, + "loss": 12.0209, + "step": 21423 + }, + { + "epoch": 1.166622422794443, + "grad_norm": 0.5508548278692025, + "learning_rate": 7.805505809491828e-05, + "loss": 11.9193, + "step": 21424 + }, + { + "epoch": 1.166676876791026, + "grad_norm": 0.5410126673008115, + "learning_rate": 7.804645487702058e-05, + "loss": 12.0918, + "step": 21425 + }, + { + "epoch": 1.166731330787609, + "grad_norm": 0.5194402193255873, + "learning_rate": 7.803785182983735e-05, + "loss": 12.0509, + "step": 21426 + }, + { + "epoch": 1.166785784784192, + "grad_norm": 0.5470937659480506, + "learning_rate": 7.802924895343543e-05, + "loss": 12.0272, + "step": 21427 + }, + { + "epoch": 1.166840238780775, + "grad_norm": 0.5389157927800435, + "learning_rate": 7.802064624788183e-05, + "loss": 12.1264, + "step": 21428 + }, + { + "epoch": 1.166894692777358, + "grad_norm": 0.5848160886153482, + "learning_rate": 7.801204371324332e-05, + "loss": 12.0118, + "step": 21429 + }, + { + "epoch": 1.166949146773941, + "grad_norm": 0.489528793970145, + "learning_rate": 7.800344134958685e-05, + "loss": 11.9481, + "step": 21430 + }, + { + "epoch": 1.167003600770524, + "grad_norm": 0.51385469906854, + "learning_rate": 7.799483915697928e-05, + "loss": 11.8606, + "step": 21431 + }, + { + "epoch": 1.167058054767107, + "grad_norm": 0.6091393168735681, + "learning_rate": 7.798623713548757e-05, + "loss": 11.9637, + "step": 21432 + }, + { + "epoch": 1.16711250876369, + "grad_norm": 0.5699559473013444, + "learning_rate": 7.797763528517855e-05, + "loss": 12.0566, + "step": 21433 + }, + { + "epoch": 1.1671669627602732, + "grad_norm": 0.4878021996624549, + "learning_rate": 7.796903360611915e-05, + "loss": 11.9056, + "step": 21434 + }, + { + "epoch": 1.1672214167568562, + "grad_norm": 0.5638929882084674, + "learning_rate": 7.796043209837622e-05, + "loss": 11.765, + "step": 21435 + }, + { + "epoch": 1.1672758707534392, + "grad_norm": 0.5080449196689862, + "learning_rate": 7.795183076201669e-05, + "loss": 11.9961, + "step": 21436 + }, + { + "epoch": 1.1673303247500222, + "grad_norm": 0.5320629353679006, + "learning_rate": 7.794322959710741e-05, + "loss": 11.9381, + "step": 21437 + }, + { + "epoch": 1.1673847787466052, + "grad_norm": 0.5284954434762772, + "learning_rate": 7.79346286037153e-05, + "loss": 11.8977, + "step": 21438 + }, + { + "epoch": 1.1674392327431882, + "grad_norm": 0.5356057188016835, + "learning_rate": 7.792602778190717e-05, + "loss": 11.9503, + "step": 21439 + }, + { + "epoch": 1.1674936867397712, + "grad_norm": 0.5816844473371946, + "learning_rate": 7.791742713174998e-05, + "loss": 11.9861, + "step": 21440 + }, + { + "epoch": 1.1675481407363542, + "grad_norm": 0.5891457658875092, + "learning_rate": 7.790882665331057e-05, + "loss": 12.0884, + "step": 21441 + }, + { + "epoch": 1.1676025947329371, + "grad_norm": 0.5551175021627227, + "learning_rate": 7.790022634665584e-05, + "loss": 11.8205, + "step": 21442 + }, + { + "epoch": 1.1676570487295201, + "grad_norm": 0.5438903206830921, + "learning_rate": 7.789162621185263e-05, + "loss": 12.1324, + "step": 21443 + }, + { + "epoch": 1.1677115027261031, + "grad_norm": 0.5401067958423126, + "learning_rate": 7.788302624896784e-05, + "loss": 11.8797, + "step": 21444 + }, + { + "epoch": 1.1677659567226861, + "grad_norm": 0.5592458681040414, + "learning_rate": 7.787442645806837e-05, + "loss": 11.9851, + "step": 21445 + }, + { + "epoch": 1.1678204107192691, + "grad_norm": 0.575958398770358, + "learning_rate": 7.786582683922107e-05, + "loss": 11.9914, + "step": 21446 + }, + { + "epoch": 1.1678748647158523, + "grad_norm": 0.5036211915335588, + "learning_rate": 7.78572273924928e-05, + "loss": 11.9179, + "step": 21447 + }, + { + "epoch": 1.1679293187124353, + "grad_norm": 0.5415017112148952, + "learning_rate": 7.784862811795048e-05, + "loss": 12.0637, + "step": 21448 + }, + { + "epoch": 1.1679837727090183, + "grad_norm": 0.6061407429736834, + "learning_rate": 7.784002901566091e-05, + "loss": 12.0024, + "step": 21449 + }, + { + "epoch": 1.1680382267056013, + "grad_norm": 0.6181644418935153, + "learning_rate": 7.783143008569099e-05, + "loss": 12.0362, + "step": 21450 + }, + { + "epoch": 1.1680926807021843, + "grad_norm": 0.5514534086004671, + "learning_rate": 7.782283132810759e-05, + "loss": 12.0524, + "step": 21451 + }, + { + "epoch": 1.1681471346987673, + "grad_norm": 0.5047071015530673, + "learning_rate": 7.781423274297757e-05, + "loss": 11.9453, + "step": 21452 + }, + { + "epoch": 1.1682015886953503, + "grad_norm": 0.52471953065119, + "learning_rate": 7.78056343303678e-05, + "loss": 11.9401, + "step": 21453 + }, + { + "epoch": 1.1682560426919333, + "grad_norm": 0.6706785428933535, + "learning_rate": 7.779703609034514e-05, + "loss": 11.9634, + "step": 21454 + }, + { + "epoch": 1.1683104966885163, + "grad_norm": 0.5778686607698854, + "learning_rate": 7.778843802297645e-05, + "loss": 12.1274, + "step": 21455 + }, + { + "epoch": 1.1683649506850993, + "grad_norm": 0.5590734244778683, + "learning_rate": 7.777984012832859e-05, + "loss": 11.9344, + "step": 21456 + }, + { + "epoch": 1.1684194046816823, + "grad_norm": 0.5578065982592294, + "learning_rate": 7.777124240646842e-05, + "loss": 12.0206, + "step": 21457 + }, + { + "epoch": 1.1684738586782655, + "grad_norm": 0.5428385181545224, + "learning_rate": 7.77626448574628e-05, + "loss": 12.0342, + "step": 21458 + }, + { + "epoch": 1.1685283126748485, + "grad_norm": 0.6147564819772413, + "learning_rate": 7.77540474813786e-05, + "loss": 12.1248, + "step": 21459 + }, + { + "epoch": 1.1685827666714315, + "grad_norm": 0.5909814840434413, + "learning_rate": 7.774545027828265e-05, + "loss": 11.9908, + "step": 21460 + }, + { + "epoch": 1.1686372206680145, + "grad_norm": 0.5652429534110385, + "learning_rate": 7.773685324824178e-05, + "loss": 11.8524, + "step": 21461 + }, + { + "epoch": 1.1686916746645974, + "grad_norm": 0.6087531963234392, + "learning_rate": 7.77282563913229e-05, + "loss": 11.9695, + "step": 21462 + }, + { + "epoch": 1.1687461286611804, + "grad_norm": 0.5512132667893598, + "learning_rate": 7.771965970759281e-05, + "loss": 12.0036, + "step": 21463 + }, + { + "epoch": 1.1688005826577634, + "grad_norm": 0.5260982059537934, + "learning_rate": 7.771106319711841e-05, + "loss": 11.9306, + "step": 21464 + }, + { + "epoch": 1.1688550366543464, + "grad_norm": 0.579546662874138, + "learning_rate": 7.770246685996648e-05, + "loss": 11.9179, + "step": 21465 + }, + { + "epoch": 1.1689094906509294, + "grad_norm": 0.5552546566089984, + "learning_rate": 7.769387069620394e-05, + "loss": 12.0308, + "step": 21466 + }, + { + "epoch": 1.1689639446475124, + "grad_norm": 0.5320251798810329, + "learning_rate": 7.76852747058976e-05, + "loss": 11.9921, + "step": 21467 + }, + { + "epoch": 1.1690183986440954, + "grad_norm": 0.5515518523649794, + "learning_rate": 7.767667888911434e-05, + "loss": 12.0567, + "step": 21468 + }, + { + "epoch": 1.1690728526406784, + "grad_norm": 0.5240191542993571, + "learning_rate": 7.766808324592091e-05, + "loss": 11.94, + "step": 21469 + }, + { + "epoch": 1.1691273066372614, + "grad_norm": 0.5596577161314998, + "learning_rate": 7.765948777638423e-05, + "loss": 11.9289, + "step": 21470 + }, + { + "epoch": 1.1691817606338446, + "grad_norm": 0.5554941562208867, + "learning_rate": 7.765089248057114e-05, + "loss": 11.9961, + "step": 21471 + }, + { + "epoch": 1.1692362146304276, + "grad_norm": 0.5683779647278253, + "learning_rate": 7.764229735854843e-05, + "loss": 12.1373, + "step": 21472 + }, + { + "epoch": 1.1692906686270106, + "grad_norm": 0.5305858502239906, + "learning_rate": 7.763370241038297e-05, + "loss": 12.0558, + "step": 21473 + }, + { + "epoch": 1.1693451226235936, + "grad_norm": 0.5020052882068247, + "learning_rate": 7.762510763614161e-05, + "loss": 11.8852, + "step": 21474 + }, + { + "epoch": 1.1693995766201766, + "grad_norm": 0.5547686098086152, + "learning_rate": 7.761651303589114e-05, + "loss": 11.9912, + "step": 21475 + }, + { + "epoch": 1.1694540306167596, + "grad_norm": 0.5308770890161167, + "learning_rate": 7.76079186096984e-05, + "loss": 11.9896, + "step": 21476 + }, + { + "epoch": 1.1695084846133426, + "grad_norm": 0.5203878529266085, + "learning_rate": 7.759932435763027e-05, + "loss": 11.8653, + "step": 21477 + }, + { + "epoch": 1.1695629386099256, + "grad_norm": 0.5600808721754098, + "learning_rate": 7.75907302797536e-05, + "loss": 12.0303, + "step": 21478 + }, + { + "epoch": 1.1696173926065085, + "grad_norm": 0.5237632925496599, + "learning_rate": 7.758213637613512e-05, + "loss": 11.7995, + "step": 21479 + }, + { + "epoch": 1.1696718466030915, + "grad_norm": 0.5309656644064694, + "learning_rate": 7.757354264684168e-05, + "loss": 12.0239, + "step": 21480 + }, + { + "epoch": 1.1697263005996748, + "grad_norm": 0.6012737482998177, + "learning_rate": 7.756494909194016e-05, + "loss": 12.0147, + "step": 21481 + }, + { + "epoch": 1.1697807545962577, + "grad_norm": 0.5278493531239027, + "learning_rate": 7.755635571149735e-05, + "loss": 11.9673, + "step": 21482 + }, + { + "epoch": 1.1698352085928407, + "grad_norm": 0.6371868709985388, + "learning_rate": 7.75477625055801e-05, + "loss": 11.9891, + "step": 21483 + }, + { + "epoch": 1.1698896625894237, + "grad_norm": 0.5384168711632088, + "learning_rate": 7.75391694742552e-05, + "loss": 11.9979, + "step": 21484 + }, + { + "epoch": 1.1699441165860067, + "grad_norm": 0.7231925061144907, + "learning_rate": 7.753057661758949e-05, + "loss": 11.9619, + "step": 21485 + }, + { + "epoch": 1.1699985705825897, + "grad_norm": 0.6527073810977893, + "learning_rate": 7.752198393564977e-05, + "loss": 11.9728, + "step": 21486 + }, + { + "epoch": 1.1700530245791727, + "grad_norm": 0.5188787105760849, + "learning_rate": 7.751339142850288e-05, + "loss": 12.0277, + "step": 21487 + }, + { + "epoch": 1.1701074785757557, + "grad_norm": 0.4975516801920765, + "learning_rate": 7.750479909621568e-05, + "loss": 12.0173, + "step": 21488 + }, + { + "epoch": 1.1701619325723387, + "grad_norm": 0.5568290775559681, + "learning_rate": 7.749620693885489e-05, + "loss": 11.9154, + "step": 21489 + }, + { + "epoch": 1.1702163865689217, + "grad_norm": 0.592378744097072, + "learning_rate": 7.748761495648736e-05, + "loss": 11.9838, + "step": 21490 + }, + { + "epoch": 1.1702708405655047, + "grad_norm": 0.5392249836439629, + "learning_rate": 7.74790231491799e-05, + "loss": 11.8422, + "step": 21491 + }, + { + "epoch": 1.1703252945620877, + "grad_norm": 0.5828539964433967, + "learning_rate": 7.747043151699935e-05, + "loss": 12.0813, + "step": 21492 + }, + { + "epoch": 1.1703797485586707, + "grad_norm": 0.5842192642892001, + "learning_rate": 7.74618400600125e-05, + "loss": 12.0662, + "step": 21493 + }, + { + "epoch": 1.1704342025552539, + "grad_norm": 0.49210379808848115, + "learning_rate": 7.745324877828617e-05, + "loss": 11.9989, + "step": 21494 + }, + { + "epoch": 1.1704886565518369, + "grad_norm": 0.5979440408864463, + "learning_rate": 7.744465767188715e-05, + "loss": 11.8663, + "step": 21495 + }, + { + "epoch": 1.1705431105484199, + "grad_norm": 0.5539942505207179, + "learning_rate": 7.743606674088227e-05, + "loss": 12.0314, + "step": 21496 + }, + { + "epoch": 1.1705975645450029, + "grad_norm": 0.548218170252454, + "learning_rate": 7.74274759853383e-05, + "loss": 12.1344, + "step": 21497 + }, + { + "epoch": 1.1706520185415858, + "grad_norm": 0.5625011843254378, + "learning_rate": 7.74188854053221e-05, + "loss": 12.1762, + "step": 21498 + }, + { + "epoch": 1.1707064725381688, + "grad_norm": 0.5460425827148898, + "learning_rate": 7.74102950009004e-05, + "loss": 12.0292, + "step": 21499 + }, + { + "epoch": 1.1707609265347518, + "grad_norm": 0.5408142850430533, + "learning_rate": 7.740170477214003e-05, + "loss": 12.0924, + "step": 21500 + }, + { + "epoch": 1.1708153805313348, + "grad_norm": 0.5781869069480234, + "learning_rate": 7.739311471910781e-05, + "loss": 12.073, + "step": 21501 + }, + { + "epoch": 1.1708698345279178, + "grad_norm": 0.5392614362905274, + "learning_rate": 7.738452484187052e-05, + "loss": 11.9822, + "step": 21502 + }, + { + "epoch": 1.1709242885245008, + "grad_norm": 0.5686113171382908, + "learning_rate": 7.737593514049492e-05, + "loss": 11.9301, + "step": 21503 + }, + { + "epoch": 1.170978742521084, + "grad_norm": 0.6093378513520353, + "learning_rate": 7.736734561504787e-05, + "loss": 11.9898, + "step": 21504 + }, + { + "epoch": 1.171033196517667, + "grad_norm": 0.5521503110204157, + "learning_rate": 7.735875626559614e-05, + "loss": 12.0431, + "step": 21505 + }, + { + "epoch": 1.17108765051425, + "grad_norm": 0.5503166172575223, + "learning_rate": 7.735016709220652e-05, + "loss": 12.0454, + "step": 21506 + }, + { + "epoch": 1.171142104510833, + "grad_norm": 0.5399224442906748, + "learning_rate": 7.73415780949458e-05, + "loss": 11.8754, + "step": 21507 + }, + { + "epoch": 1.171196558507416, + "grad_norm": 0.6172229676242614, + "learning_rate": 7.733298927388077e-05, + "loss": 12.1151, + "step": 21508 + }, + { + "epoch": 1.171251012503999, + "grad_norm": 0.5221733846267943, + "learning_rate": 7.732440062907822e-05, + "loss": 12.1128, + "step": 21509 + }, + { + "epoch": 1.171305466500582, + "grad_norm": 0.5535158775303224, + "learning_rate": 7.731581216060492e-05, + "loss": 11.9696, + "step": 21510 + }, + { + "epoch": 1.171359920497165, + "grad_norm": 0.5784357095092382, + "learning_rate": 7.730722386852768e-05, + "loss": 12.0313, + "step": 21511 + }, + { + "epoch": 1.171414374493748, + "grad_norm": 0.5411237223474381, + "learning_rate": 7.729863575291326e-05, + "loss": 11.9306, + "step": 21512 + }, + { + "epoch": 1.171468828490331, + "grad_norm": 0.5534941810446697, + "learning_rate": 7.729004781382847e-05, + "loss": 11.8688, + "step": 21513 + }, + { + "epoch": 1.171523282486914, + "grad_norm": 0.5361318082877433, + "learning_rate": 7.728146005134005e-05, + "loss": 12.0888, + "step": 21514 + }, + { + "epoch": 1.171577736483497, + "grad_norm": 0.5330867725931275, + "learning_rate": 7.727287246551482e-05, + "loss": 11.9465, + "step": 21515 + }, + { + "epoch": 1.17163219048008, + "grad_norm": 0.6008539322265298, + "learning_rate": 7.726428505641955e-05, + "loss": 12.0014, + "step": 21516 + }, + { + "epoch": 1.1716866444766632, + "grad_norm": 0.5886242340830706, + "learning_rate": 7.725569782412102e-05, + "loss": 11.9946, + "step": 21517 + }, + { + "epoch": 1.1717410984732461, + "grad_norm": 0.5120914005227445, + "learning_rate": 7.7247110768686e-05, + "loss": 11.9638, + "step": 21518 + }, + { + "epoch": 1.1717955524698291, + "grad_norm": 0.5123307090634064, + "learning_rate": 7.723852389018126e-05, + "loss": 11.9975, + "step": 21519 + }, + { + "epoch": 1.1718500064664121, + "grad_norm": 0.5193406394637023, + "learning_rate": 7.722993718867357e-05, + "loss": 11.958, + "step": 21520 + }, + { + "epoch": 1.1719044604629951, + "grad_norm": 0.5594636417028016, + "learning_rate": 7.72213506642297e-05, + "loss": 11.9261, + "step": 21521 + }, + { + "epoch": 1.1719589144595781, + "grad_norm": 0.5427898649075381, + "learning_rate": 7.721276431691643e-05, + "loss": 12.0003, + "step": 21522 + }, + { + "epoch": 1.172013368456161, + "grad_norm": 0.5094372058973505, + "learning_rate": 7.720417814680052e-05, + "loss": 12.0733, + "step": 21523 + }, + { + "epoch": 1.172067822452744, + "grad_norm": 0.5052121062928325, + "learning_rate": 7.719559215394875e-05, + "loss": 12.0188, + "step": 21524 + }, + { + "epoch": 1.172122276449327, + "grad_norm": 0.5406966806318048, + "learning_rate": 7.718700633842787e-05, + "loss": 11.9344, + "step": 21525 + }, + { + "epoch": 1.17217673044591, + "grad_norm": 0.5726195194524218, + "learning_rate": 7.717842070030467e-05, + "loss": 11.9564, + "step": 21526 + }, + { + "epoch": 1.172231184442493, + "grad_norm": 0.49501450545394016, + "learning_rate": 7.71698352396459e-05, + "loss": 11.8853, + "step": 21527 + }, + { + "epoch": 1.1722856384390763, + "grad_norm": 0.5695613901911343, + "learning_rate": 7.71612499565183e-05, + "loss": 12.0181, + "step": 21528 + }, + { + "epoch": 1.1723400924356593, + "grad_norm": 0.537949886667884, + "learning_rate": 7.715266485098868e-05, + "loss": 11.9117, + "step": 21529 + }, + { + "epoch": 1.1723945464322423, + "grad_norm": 0.5456325926834007, + "learning_rate": 7.714407992312376e-05, + "loss": 11.9595, + "step": 21530 + }, + { + "epoch": 1.1724490004288253, + "grad_norm": 0.600319765366184, + "learning_rate": 7.71354951729903e-05, + "loss": 12.0551, + "step": 21531 + }, + { + "epoch": 1.1725034544254083, + "grad_norm": 0.6105793578343066, + "learning_rate": 7.712691060065507e-05, + "loss": 11.9822, + "step": 21532 + }, + { + "epoch": 1.1725579084219913, + "grad_norm": 0.505282344779516, + "learning_rate": 7.711832620618482e-05, + "loss": 11.9356, + "step": 21533 + }, + { + "epoch": 1.1726123624185743, + "grad_norm": 0.5079079065072811, + "learning_rate": 7.710974198964629e-05, + "loss": 11.9677, + "step": 21534 + }, + { + "epoch": 1.1726668164151572, + "grad_norm": 0.5928270465117728, + "learning_rate": 7.710115795110625e-05, + "loss": 11.9835, + "step": 21535 + }, + { + "epoch": 1.1727212704117402, + "grad_norm": 0.5729532760680833, + "learning_rate": 7.709257409063142e-05, + "loss": 11.918, + "step": 21536 + }, + { + "epoch": 1.1727757244083232, + "grad_norm": 0.6133392232219819, + "learning_rate": 7.70839904082886e-05, + "loss": 12.0699, + "step": 21537 + }, + { + "epoch": 1.1728301784049062, + "grad_norm": 0.5242315398613062, + "learning_rate": 7.707540690414452e-05, + "loss": 12.1721, + "step": 21538 + }, + { + "epoch": 1.1728846324014892, + "grad_norm": 0.48245773539151876, + "learning_rate": 7.706682357826595e-05, + "loss": 11.9284, + "step": 21539 + }, + { + "epoch": 1.1729390863980722, + "grad_norm": 0.5321526831624755, + "learning_rate": 7.705824043071957e-05, + "loss": 12.0253, + "step": 21540 + }, + { + "epoch": 1.1729935403946554, + "grad_norm": 0.5277659031414554, + "learning_rate": 7.704965746157215e-05, + "loss": 11.9888, + "step": 21541 + }, + { + "epoch": 1.1730479943912384, + "grad_norm": 0.5525367153991201, + "learning_rate": 7.704107467089045e-05, + "loss": 11.9003, + "step": 21542 + }, + { + "epoch": 1.1731024483878214, + "grad_norm": 0.5292540902969628, + "learning_rate": 7.703249205874121e-05, + "loss": 11.7963, + "step": 21543 + }, + { + "epoch": 1.1731569023844044, + "grad_norm": 0.4988636316086766, + "learning_rate": 7.702390962519117e-05, + "loss": 11.9743, + "step": 21544 + }, + { + "epoch": 1.1732113563809874, + "grad_norm": 0.5287904304911054, + "learning_rate": 7.701532737030706e-05, + "loss": 11.9094, + "step": 21545 + }, + { + "epoch": 1.1732658103775704, + "grad_norm": 0.5412664155879475, + "learning_rate": 7.70067452941556e-05, + "loss": 11.9634, + "step": 21546 + }, + { + "epoch": 1.1733202643741534, + "grad_norm": 0.5867668093510661, + "learning_rate": 7.699816339680357e-05, + "loss": 12.0047, + "step": 21547 + }, + { + "epoch": 1.1733747183707364, + "grad_norm": 0.5837210487251758, + "learning_rate": 7.698958167831763e-05, + "loss": 12.0087, + "step": 21548 + }, + { + "epoch": 1.1734291723673194, + "grad_norm": 0.5097681136664706, + "learning_rate": 7.698100013876465e-05, + "loss": 12.0246, + "step": 21549 + }, + { + "epoch": 1.1734836263639024, + "grad_norm": 0.5253903708182228, + "learning_rate": 7.697241877821121e-05, + "loss": 11.9348, + "step": 21550 + }, + { + "epoch": 1.1735380803604856, + "grad_norm": 0.6293028844089644, + "learning_rate": 7.696383759672412e-05, + "loss": 11.9706, + "step": 21551 + }, + { + "epoch": 1.1735925343570686, + "grad_norm": 0.5576519320191654, + "learning_rate": 7.695525659437006e-05, + "loss": 12.0808, + "step": 21552 + }, + { + "epoch": 1.1736469883536516, + "grad_norm": 0.5181540665008444, + "learning_rate": 7.694667577121582e-05, + "loss": 12.0329, + "step": 21553 + }, + { + "epoch": 1.1737014423502345, + "grad_norm": 0.5389763408112704, + "learning_rate": 7.69380951273281e-05, + "loss": 12.0049, + "step": 21554 + }, + { + "epoch": 1.1737558963468175, + "grad_norm": 0.5413564993798322, + "learning_rate": 7.69295146627736e-05, + "loss": 12.0269, + "step": 21555 + }, + { + "epoch": 1.1738103503434005, + "grad_norm": 0.7198102054487848, + "learning_rate": 7.692093437761908e-05, + "loss": 12.1316, + "step": 21556 + }, + { + "epoch": 1.1738648043399835, + "grad_norm": 0.5786289001723626, + "learning_rate": 7.691235427193123e-05, + "loss": 12.0126, + "step": 21557 + }, + { + "epoch": 1.1739192583365665, + "grad_norm": 0.513170917273, + "learning_rate": 7.690377434577681e-05, + "loss": 11.8932, + "step": 21558 + }, + { + "epoch": 1.1739737123331495, + "grad_norm": 0.5374406423491991, + "learning_rate": 7.68951945992225e-05, + "loss": 12.0512, + "step": 21559 + }, + { + "epoch": 1.1740281663297325, + "grad_norm": 0.5724407425139194, + "learning_rate": 7.688661503233503e-05, + "loss": 12.0738, + "step": 21560 + }, + { + "epoch": 1.1740826203263155, + "grad_norm": 0.5209037313444472, + "learning_rate": 7.687803564518112e-05, + "loss": 11.9837, + "step": 21561 + }, + { + "epoch": 1.1741370743228985, + "grad_norm": 0.5103557591440533, + "learning_rate": 7.686945643782747e-05, + "loss": 11.9613, + "step": 21562 + }, + { + "epoch": 1.1741915283194815, + "grad_norm": 0.5317941678247606, + "learning_rate": 7.68608774103408e-05, + "loss": 11.9773, + "step": 21563 + }, + { + "epoch": 1.1742459823160647, + "grad_norm": 0.5251832866116968, + "learning_rate": 7.685229856278784e-05, + "loss": 11.9801, + "step": 21564 + }, + { + "epoch": 1.1743004363126477, + "grad_norm": 0.5508580766149924, + "learning_rate": 7.684371989523528e-05, + "loss": 12.0116, + "step": 21565 + }, + { + "epoch": 1.1743548903092307, + "grad_norm": 0.541710490167256, + "learning_rate": 7.683514140774985e-05, + "loss": 12.0006, + "step": 21566 + }, + { + "epoch": 1.1744093443058137, + "grad_norm": 0.5776584803130658, + "learning_rate": 7.682656310039826e-05, + "loss": 11.9537, + "step": 21567 + }, + { + "epoch": 1.1744637983023967, + "grad_norm": 0.5629310797797762, + "learning_rate": 7.681798497324716e-05, + "loss": 11.9284, + "step": 21568 + }, + { + "epoch": 1.1745182522989797, + "grad_norm": 0.5587841189061926, + "learning_rate": 7.680940702636335e-05, + "loss": 11.9966, + "step": 21569 + }, + { + "epoch": 1.1745727062955627, + "grad_norm": 0.6449658253645129, + "learning_rate": 7.680082925981346e-05, + "loss": 12.0528, + "step": 21570 + }, + { + "epoch": 1.1746271602921456, + "grad_norm": 0.5749180238364583, + "learning_rate": 7.67922516736642e-05, + "loss": 12.0674, + "step": 21571 + }, + { + "epoch": 1.1746816142887286, + "grad_norm": 0.6287072461563051, + "learning_rate": 7.678367426798228e-05, + "loss": 12.0589, + "step": 21572 + }, + { + "epoch": 1.1747360682853116, + "grad_norm": 0.6296938676638881, + "learning_rate": 7.67750970428344e-05, + "loss": 12.1065, + "step": 21573 + }, + { + "epoch": 1.1747905222818948, + "grad_norm": 0.5412884507051381, + "learning_rate": 7.676651999828726e-05, + "loss": 11.9356, + "step": 21574 + }, + { + "epoch": 1.1748449762784778, + "grad_norm": 0.5511691378546619, + "learning_rate": 7.675794313440756e-05, + "loss": 11.9952, + "step": 21575 + }, + { + "epoch": 1.1748994302750608, + "grad_norm": 0.6065834705687223, + "learning_rate": 7.6749366451262e-05, + "loss": 11.9431, + "step": 21576 + }, + { + "epoch": 1.1749538842716438, + "grad_norm": 0.5623202627458568, + "learning_rate": 7.674078994891727e-05, + "loss": 12.0089, + "step": 21577 + }, + { + "epoch": 1.1750083382682268, + "grad_norm": 0.5780561703387903, + "learning_rate": 7.673221362744005e-05, + "loss": 11.9446, + "step": 21578 + }, + { + "epoch": 1.1750627922648098, + "grad_norm": 0.502136544401422, + "learning_rate": 7.672363748689706e-05, + "loss": 11.9293, + "step": 21579 + }, + { + "epoch": 1.1751172462613928, + "grad_norm": 0.494545000401578, + "learning_rate": 7.671506152735495e-05, + "loss": 11.9548, + "step": 21580 + }, + { + "epoch": 1.1751717002579758, + "grad_norm": 0.5151063631831978, + "learning_rate": 7.670648574888042e-05, + "loss": 11.9267, + "step": 21581 + }, + { + "epoch": 1.1752261542545588, + "grad_norm": 0.6079219359619696, + "learning_rate": 7.669791015154017e-05, + "loss": 11.9529, + "step": 21582 + }, + { + "epoch": 1.1752806082511418, + "grad_norm": 0.5665895568151611, + "learning_rate": 7.668933473540087e-05, + "loss": 11.9499, + "step": 21583 + }, + { + "epoch": 1.1753350622477248, + "grad_norm": 0.5909458742546837, + "learning_rate": 7.668075950052922e-05, + "loss": 12.0102, + "step": 21584 + }, + { + "epoch": 1.1753895162443078, + "grad_norm": 0.5969256567130299, + "learning_rate": 7.667218444699187e-05, + "loss": 11.8862, + "step": 21585 + }, + { + "epoch": 1.1754439702408908, + "grad_norm": 0.5237406253917345, + "learning_rate": 7.666360957485554e-05, + "loss": 11.9454, + "step": 21586 + }, + { + "epoch": 1.175498424237474, + "grad_norm": 0.5921875596250283, + "learning_rate": 7.66550348841869e-05, + "loss": 12.0029, + "step": 21587 + }, + { + "epoch": 1.175552878234057, + "grad_norm": 0.5465794547164644, + "learning_rate": 7.664646037505263e-05, + "loss": 11.8206, + "step": 21588 + }, + { + "epoch": 1.17560733223064, + "grad_norm": 0.5875587815375578, + "learning_rate": 7.663788604751943e-05, + "loss": 12.0814, + "step": 21589 + }, + { + "epoch": 1.175661786227223, + "grad_norm": 0.5642222664525203, + "learning_rate": 7.66293119016539e-05, + "loss": 12.0113, + "step": 21590 + }, + { + "epoch": 1.175716240223806, + "grad_norm": 0.5204404510690026, + "learning_rate": 7.662073793752278e-05, + "loss": 11.9023, + "step": 21591 + }, + { + "epoch": 1.175770694220389, + "grad_norm": 0.5556397579875381, + "learning_rate": 7.661216415519273e-05, + "loss": 11.9804, + "step": 21592 + }, + { + "epoch": 1.175825148216972, + "grad_norm": 0.5640496495315787, + "learning_rate": 7.660359055473039e-05, + "loss": 12.0748, + "step": 21593 + }, + { + "epoch": 1.175879602213555, + "grad_norm": 0.49745883549555725, + "learning_rate": 7.659501713620246e-05, + "loss": 11.9385, + "step": 21594 + }, + { + "epoch": 1.175934056210138, + "grad_norm": 0.5404957254428031, + "learning_rate": 7.65864438996756e-05, + "loss": 11.916, + "step": 21595 + }, + { + "epoch": 1.175988510206721, + "grad_norm": 0.5242902139557931, + "learning_rate": 7.657787084521649e-05, + "loss": 12.1823, + "step": 21596 + }, + { + "epoch": 1.1760429642033041, + "grad_norm": 0.5657329966808164, + "learning_rate": 7.656929797289177e-05, + "loss": 12.0118, + "step": 21597 + }, + { + "epoch": 1.1760974181998871, + "grad_norm": 0.5700382513723844, + "learning_rate": 7.656072528276811e-05, + "loss": 12.0433, + "step": 21598 + }, + { + "epoch": 1.17615187219647, + "grad_norm": 0.5633965987301115, + "learning_rate": 7.655215277491225e-05, + "loss": 12.0989, + "step": 21599 + }, + { + "epoch": 1.176206326193053, + "grad_norm": 0.5158507634434333, + "learning_rate": 7.654358044939073e-05, + "loss": 12.009, + "step": 21600 + }, + { + "epoch": 1.176260780189636, + "grad_norm": 0.5670445956130047, + "learning_rate": 7.653500830627023e-05, + "loss": 12.0958, + "step": 21601 + }, + { + "epoch": 1.176315234186219, + "grad_norm": 0.5698455091294612, + "learning_rate": 7.652643634561748e-05, + "loss": 12.001, + "step": 21602 + }, + { + "epoch": 1.176369688182802, + "grad_norm": 0.5565254874646226, + "learning_rate": 7.651786456749908e-05, + "loss": 12.017, + "step": 21603 + }, + { + "epoch": 1.176424142179385, + "grad_norm": 0.5624231475396476, + "learning_rate": 7.650929297198171e-05, + "loss": 11.9315, + "step": 21604 + }, + { + "epoch": 1.176478596175968, + "grad_norm": 0.6829559344536329, + "learning_rate": 7.650072155913203e-05, + "loss": 11.9368, + "step": 21605 + }, + { + "epoch": 1.176533050172551, + "grad_norm": 0.5266904437228023, + "learning_rate": 7.649215032901666e-05, + "loss": 12.1225, + "step": 21606 + }, + { + "epoch": 1.176587504169134, + "grad_norm": 0.5355267032043709, + "learning_rate": 7.648357928170228e-05, + "loss": 11.8722, + "step": 21607 + }, + { + "epoch": 1.176641958165717, + "grad_norm": 0.5974199310957993, + "learning_rate": 7.647500841725553e-05, + "loss": 11.9625, + "step": 21608 + }, + { + "epoch": 1.1766964121623, + "grad_norm": 0.5469317279833944, + "learning_rate": 7.646643773574309e-05, + "loss": 11.9221, + "step": 21609 + }, + { + "epoch": 1.176750866158883, + "grad_norm": 0.5394380790937495, + "learning_rate": 7.645786723723156e-05, + "loss": 11.8321, + "step": 21610 + }, + { + "epoch": 1.1768053201554662, + "grad_norm": 0.5701565621650369, + "learning_rate": 7.644929692178758e-05, + "loss": 12.1174, + "step": 21611 + }, + { + "epoch": 1.1768597741520492, + "grad_norm": 0.5291252400811727, + "learning_rate": 7.644072678947781e-05, + "loss": 12.0025, + "step": 21612 + }, + { + "epoch": 1.1769142281486322, + "grad_norm": 0.5223918746204956, + "learning_rate": 7.643215684036891e-05, + "loss": 12.0317, + "step": 21613 + }, + { + "epoch": 1.1769686821452152, + "grad_norm": 0.6607752055855449, + "learning_rate": 7.642358707452752e-05, + "loss": 12.2902, + "step": 21614 + }, + { + "epoch": 1.1770231361417982, + "grad_norm": 0.5726191469782652, + "learning_rate": 7.641501749202028e-05, + "loss": 12.0658, + "step": 21615 + }, + { + "epoch": 1.1770775901383812, + "grad_norm": 0.6062855462084796, + "learning_rate": 7.640644809291381e-05, + "loss": 12.0044, + "step": 21616 + }, + { + "epoch": 1.1771320441349642, + "grad_norm": 0.5062403174202712, + "learning_rate": 7.639787887727474e-05, + "loss": 11.9501, + "step": 21617 + }, + { + "epoch": 1.1771864981315472, + "grad_norm": 0.5440039427500988, + "learning_rate": 7.638930984516975e-05, + "loss": 12.0606, + "step": 21618 + }, + { + "epoch": 1.1772409521281302, + "grad_norm": 0.6159668397552996, + "learning_rate": 7.638074099666545e-05, + "loss": 12.1079, + "step": 21619 + }, + { + "epoch": 1.1772954061247132, + "grad_norm": 0.5651141397183372, + "learning_rate": 7.637217233182845e-05, + "loss": 12.1082, + "step": 21620 + }, + { + "epoch": 1.1773498601212964, + "grad_norm": 0.5454780345859144, + "learning_rate": 7.63636038507254e-05, + "loss": 11.9098, + "step": 21621 + }, + { + "epoch": 1.1774043141178794, + "grad_norm": 0.5504587016802348, + "learning_rate": 7.635503555342294e-05, + "loss": 11.9108, + "step": 21622 + }, + { + "epoch": 1.1774587681144624, + "grad_norm": 0.5042810541540745, + "learning_rate": 7.634646743998765e-05, + "loss": 12.0666, + "step": 21623 + }, + { + "epoch": 1.1775132221110454, + "grad_norm": 0.5973572988736263, + "learning_rate": 7.633789951048622e-05, + "loss": 12.0896, + "step": 21624 + }, + { + "epoch": 1.1775676761076284, + "grad_norm": 0.5247466352235615, + "learning_rate": 7.632933176498527e-05, + "loss": 11.9067, + "step": 21625 + }, + { + "epoch": 1.1776221301042114, + "grad_norm": 0.6310328034642694, + "learning_rate": 7.632076420355139e-05, + "loss": 12.0383, + "step": 21626 + }, + { + "epoch": 1.1776765841007943, + "grad_norm": 0.515311925378349, + "learning_rate": 7.631219682625123e-05, + "loss": 12.053, + "step": 21627 + }, + { + "epoch": 1.1777310380973773, + "grad_norm": 0.6070465029960924, + "learning_rate": 7.630362963315138e-05, + "loss": 12.006, + "step": 21628 + }, + { + "epoch": 1.1777854920939603, + "grad_norm": 0.5652235188360191, + "learning_rate": 7.629506262431852e-05, + "loss": 11.9344, + "step": 21629 + }, + { + "epoch": 1.1778399460905433, + "grad_norm": 0.5773397818903766, + "learning_rate": 7.62864957998192e-05, + "loss": 12.0128, + "step": 21630 + }, + { + "epoch": 1.1778944000871263, + "grad_norm": 0.6079772690009896, + "learning_rate": 7.627792915972006e-05, + "loss": 12.0603, + "step": 21631 + }, + { + "epoch": 1.1779488540837093, + "grad_norm": 0.5497045741866009, + "learning_rate": 7.626936270408774e-05, + "loss": 11.927, + "step": 21632 + }, + { + "epoch": 1.1780033080802923, + "grad_norm": 0.5537199405150324, + "learning_rate": 7.626079643298882e-05, + "loss": 12.0391, + "step": 21633 + }, + { + "epoch": 1.1780577620768755, + "grad_norm": 0.5762763130172146, + "learning_rate": 7.62522303464899e-05, + "loss": 12.0694, + "step": 21634 + }, + { + "epoch": 1.1781122160734585, + "grad_norm": 0.5431762987688716, + "learning_rate": 7.624366444465764e-05, + "loss": 11.958, + "step": 21635 + }, + { + "epoch": 1.1781666700700415, + "grad_norm": 0.5582454172540878, + "learning_rate": 7.623509872755866e-05, + "loss": 12.1113, + "step": 21636 + }, + { + "epoch": 1.1782211240666245, + "grad_norm": 0.5281912605122874, + "learning_rate": 7.622653319525951e-05, + "loss": 12.1244, + "step": 21637 + }, + { + "epoch": 1.1782755780632075, + "grad_norm": 0.5317912076200442, + "learning_rate": 7.621796784782683e-05, + "loss": 11.9328, + "step": 21638 + }, + { + "epoch": 1.1783300320597905, + "grad_norm": 0.5226098405298144, + "learning_rate": 7.620940268532724e-05, + "loss": 11.9744, + "step": 21639 + }, + { + "epoch": 1.1783844860563735, + "grad_norm": 0.5211359218380267, + "learning_rate": 7.620083770782731e-05, + "loss": 11.7664, + "step": 21640 + }, + { + "epoch": 1.1784389400529565, + "grad_norm": 0.5195296694323369, + "learning_rate": 7.619227291539364e-05, + "loss": 11.9052, + "step": 21641 + }, + { + "epoch": 1.1784933940495395, + "grad_norm": 0.5125149024680569, + "learning_rate": 7.618370830809287e-05, + "loss": 12.0138, + "step": 21642 + }, + { + "epoch": 1.1785478480461224, + "grad_norm": 0.5779339418846191, + "learning_rate": 7.617514388599158e-05, + "loss": 12.0039, + "step": 21643 + }, + { + "epoch": 1.1786023020427057, + "grad_norm": 0.6368765557088946, + "learning_rate": 7.616657964915634e-05, + "loss": 12.0219, + "step": 21644 + }, + { + "epoch": 1.1786567560392887, + "grad_norm": 0.6600037004525121, + "learning_rate": 7.615801559765378e-05, + "loss": 11.9651, + "step": 21645 + }, + { + "epoch": 1.1787112100358716, + "grad_norm": 0.5504352862620744, + "learning_rate": 7.614945173155049e-05, + "loss": 12.0223, + "step": 21646 + }, + { + "epoch": 1.1787656640324546, + "grad_norm": 0.5182725241016743, + "learning_rate": 7.614088805091308e-05, + "loss": 12.006, + "step": 21647 + }, + { + "epoch": 1.1788201180290376, + "grad_norm": 0.5330077841931283, + "learning_rate": 7.613232455580811e-05, + "loss": 11.8409, + "step": 21648 + }, + { + "epoch": 1.1788745720256206, + "grad_norm": 0.5525951312144873, + "learning_rate": 7.612376124630224e-05, + "loss": 12.0381, + "step": 21649 + }, + { + "epoch": 1.1789290260222036, + "grad_norm": 0.5190346790686747, + "learning_rate": 7.611519812246194e-05, + "loss": 11.9219, + "step": 21650 + }, + { + "epoch": 1.1789834800187866, + "grad_norm": 0.5979838501324751, + "learning_rate": 7.610663518435388e-05, + "loss": 12.0164, + "step": 21651 + }, + { + "epoch": 1.1790379340153696, + "grad_norm": 0.5470158345829987, + "learning_rate": 7.609807243204464e-05, + "loss": 12.0324, + "step": 21652 + }, + { + "epoch": 1.1790923880119526, + "grad_norm": 0.5308636105202947, + "learning_rate": 7.60895098656008e-05, + "loss": 12.0483, + "step": 21653 + }, + { + "epoch": 1.1791468420085356, + "grad_norm": 0.500336870950848, + "learning_rate": 7.608094748508893e-05, + "loss": 11.7895, + "step": 21654 + }, + { + "epoch": 1.1792012960051186, + "grad_norm": 0.5666076552703959, + "learning_rate": 7.607238529057563e-05, + "loss": 11.9856, + "step": 21655 + }, + { + "epoch": 1.1792557500017016, + "grad_norm": 0.5528134284284965, + "learning_rate": 7.606382328212748e-05, + "loss": 12.012, + "step": 21656 + }, + { + "epoch": 1.1793102039982848, + "grad_norm": 0.5439674157238209, + "learning_rate": 7.605526145981103e-05, + "loss": 11.9978, + "step": 21657 + }, + { + "epoch": 1.1793646579948678, + "grad_norm": 0.5572339329752212, + "learning_rate": 7.604669982369289e-05, + "loss": 11.9457, + "step": 21658 + }, + { + "epoch": 1.1794191119914508, + "grad_norm": 0.5019183247288584, + "learning_rate": 7.603813837383968e-05, + "loss": 12.0303, + "step": 21659 + }, + { + "epoch": 1.1794735659880338, + "grad_norm": 0.5234298019560105, + "learning_rate": 7.602957711031788e-05, + "loss": 12.0421, + "step": 21660 + }, + { + "epoch": 1.1795280199846168, + "grad_norm": 0.5988081752598906, + "learning_rate": 7.60210160331941e-05, + "loss": 12.0151, + "step": 21661 + }, + { + "epoch": 1.1795824739811998, + "grad_norm": 0.5503929227153116, + "learning_rate": 7.601245514253494e-05, + "loss": 12.022, + "step": 21662 + }, + { + "epoch": 1.1796369279777827, + "grad_norm": 0.5875432886014211, + "learning_rate": 7.600389443840694e-05, + "loss": 12.1746, + "step": 21663 + }, + { + "epoch": 1.1796913819743657, + "grad_norm": 0.5548592242777497, + "learning_rate": 7.599533392087667e-05, + "loss": 12.0658, + "step": 21664 + }, + { + "epoch": 1.1797458359709487, + "grad_norm": 0.5441249909569249, + "learning_rate": 7.598677359001074e-05, + "loss": 12.0069, + "step": 21665 + }, + { + "epoch": 1.1798002899675317, + "grad_norm": 0.5411189061569485, + "learning_rate": 7.597821344587566e-05, + "loss": 11.9706, + "step": 21666 + }, + { + "epoch": 1.179854743964115, + "grad_norm": 0.5401778057740976, + "learning_rate": 7.596965348853804e-05, + "loss": 11.9824, + "step": 21667 + }, + { + "epoch": 1.179909197960698, + "grad_norm": 0.5358619510051359, + "learning_rate": 7.59610937180644e-05, + "loss": 11.9398, + "step": 21668 + }, + { + "epoch": 1.179963651957281, + "grad_norm": 0.5755059795425265, + "learning_rate": 7.595253413452133e-05, + "loss": 11.9483, + "step": 21669 + }, + { + "epoch": 1.180018105953864, + "grad_norm": 0.6660613258187639, + "learning_rate": 7.594397473797545e-05, + "loss": 11.9965, + "step": 21670 + }, + { + "epoch": 1.180072559950447, + "grad_norm": 0.5245059523204031, + "learning_rate": 7.593541552849319e-05, + "loss": 11.9452, + "step": 21671 + }, + { + "epoch": 1.18012701394703, + "grad_norm": 0.5805989717286782, + "learning_rate": 7.592685650614118e-05, + "loss": 12.0659, + "step": 21672 + }, + { + "epoch": 1.180181467943613, + "grad_norm": 0.5281261993262514, + "learning_rate": 7.591829767098598e-05, + "loss": 11.9192, + "step": 21673 + }, + { + "epoch": 1.1802359219401959, + "grad_norm": 0.5152599095149386, + "learning_rate": 7.590973902309413e-05, + "loss": 12.0361, + "step": 21674 + }, + { + "epoch": 1.1802903759367789, + "grad_norm": 0.5273053352868992, + "learning_rate": 7.59011805625322e-05, + "loss": 11.8206, + "step": 21675 + }, + { + "epoch": 1.1803448299333619, + "grad_norm": 0.6876622776737558, + "learning_rate": 7.589262228936674e-05, + "loss": 11.9064, + "step": 21676 + }, + { + "epoch": 1.1803992839299449, + "grad_norm": 0.517094720883418, + "learning_rate": 7.588406420366427e-05, + "loss": 11.9869, + "step": 21677 + }, + { + "epoch": 1.1804537379265279, + "grad_norm": 0.6099536909062926, + "learning_rate": 7.587550630549136e-05, + "loss": 11.8845, + "step": 21678 + }, + { + "epoch": 1.1805081919231109, + "grad_norm": 0.6056931162179835, + "learning_rate": 7.586694859491455e-05, + "loss": 12.1122, + "step": 21679 + }, + { + "epoch": 1.180562645919694, + "grad_norm": 0.5603632374089449, + "learning_rate": 7.585839107200046e-05, + "loss": 11.96, + "step": 21680 + }, + { + "epoch": 1.180617099916277, + "grad_norm": 0.5319769323960366, + "learning_rate": 7.584983373681552e-05, + "loss": 11.9465, + "step": 21681 + }, + { + "epoch": 1.18067155391286, + "grad_norm": 0.5019727975318679, + "learning_rate": 7.584127658942632e-05, + "loss": 11.7859, + "step": 21682 + }, + { + "epoch": 1.180726007909443, + "grad_norm": 0.5351146581359351, + "learning_rate": 7.58327196298994e-05, + "loss": 12.0504, + "step": 21683 + }, + { + "epoch": 1.180780461906026, + "grad_norm": 0.5645373231965284, + "learning_rate": 7.582416285830132e-05, + "loss": 12.011, + "step": 21684 + }, + { + "epoch": 1.180834915902609, + "grad_norm": 0.5979440946985238, + "learning_rate": 7.58156062746986e-05, + "loss": 12.0611, + "step": 21685 + }, + { + "epoch": 1.180889369899192, + "grad_norm": 0.5055138933136387, + "learning_rate": 7.580704987915777e-05, + "loss": 12.0173, + "step": 21686 + }, + { + "epoch": 1.180943823895775, + "grad_norm": 0.4761881341162523, + "learning_rate": 7.579849367174539e-05, + "loss": 11.9772, + "step": 21687 + }, + { + "epoch": 1.180998277892358, + "grad_norm": 0.6189338825141933, + "learning_rate": 7.578993765252798e-05, + "loss": 11.8785, + "step": 21688 + }, + { + "epoch": 1.181052731888941, + "grad_norm": 0.656185572498181, + "learning_rate": 7.578138182157208e-05, + "loss": 12.0732, + "step": 21689 + }, + { + "epoch": 1.181107185885524, + "grad_norm": 0.5553444220352391, + "learning_rate": 7.577282617894423e-05, + "loss": 11.8689, + "step": 21690 + }, + { + "epoch": 1.1811616398821072, + "grad_norm": 0.6184382676376969, + "learning_rate": 7.576427072471093e-05, + "loss": 12.1293, + "step": 21691 + }, + { + "epoch": 1.1812160938786902, + "grad_norm": 0.5651102836980537, + "learning_rate": 7.575571545893873e-05, + "loss": 12.1049, + "step": 21692 + }, + { + "epoch": 1.1812705478752732, + "grad_norm": 0.5838191518166306, + "learning_rate": 7.574716038169414e-05, + "loss": 12.0283, + "step": 21693 + }, + { + "epoch": 1.1813250018718562, + "grad_norm": 0.5161780662387591, + "learning_rate": 7.573860549304368e-05, + "loss": 11.9599, + "step": 21694 + }, + { + "epoch": 1.1813794558684392, + "grad_norm": 0.5102349093103432, + "learning_rate": 7.573005079305392e-05, + "loss": 11.8245, + "step": 21695 + }, + { + "epoch": 1.1814339098650222, + "grad_norm": 0.6195600594406752, + "learning_rate": 7.572149628179135e-05, + "loss": 12.0737, + "step": 21696 + }, + { + "epoch": 1.1814883638616052, + "grad_norm": 0.639473472962265, + "learning_rate": 7.57129419593225e-05, + "loss": 12.0344, + "step": 21697 + }, + { + "epoch": 1.1815428178581882, + "grad_norm": 0.495696446874667, + "learning_rate": 7.570438782571388e-05, + "loss": 11.9619, + "step": 21698 + }, + { + "epoch": 1.1815972718547711, + "grad_norm": 0.5520409378708824, + "learning_rate": 7.569583388103201e-05, + "loss": 11.9519, + "step": 21699 + }, + { + "epoch": 1.1816517258513541, + "grad_norm": 0.5965402705952677, + "learning_rate": 7.568728012534345e-05, + "loss": 12.1446, + "step": 21700 + }, + { + "epoch": 1.1817061798479371, + "grad_norm": 0.5520543923776201, + "learning_rate": 7.567872655871464e-05, + "loss": 11.9793, + "step": 21701 + }, + { + "epoch": 1.1817606338445201, + "grad_norm": 0.512609167119209, + "learning_rate": 7.567017318121214e-05, + "loss": 11.9533, + "step": 21702 + }, + { + "epoch": 1.1818150878411031, + "grad_norm": 0.5471941486705536, + "learning_rate": 7.566161999290246e-05, + "loss": 11.9136, + "step": 21703 + }, + { + "epoch": 1.1818695418376863, + "grad_norm": 0.7079993458364057, + "learning_rate": 7.565306699385208e-05, + "loss": 11.9342, + "step": 21704 + }, + { + "epoch": 1.1819239958342693, + "grad_norm": 0.560453505923944, + "learning_rate": 7.564451418412756e-05, + "loss": 11.854, + "step": 21705 + }, + { + "epoch": 1.1819784498308523, + "grad_norm": 0.566423257294691, + "learning_rate": 7.563596156379536e-05, + "loss": 12.0045, + "step": 21706 + }, + { + "epoch": 1.1820329038274353, + "grad_norm": 0.5686640800327653, + "learning_rate": 7.562740913292201e-05, + "loss": 11.8475, + "step": 21707 + }, + { + "epoch": 1.1820873578240183, + "grad_norm": 0.5770219785563593, + "learning_rate": 7.561885689157402e-05, + "loss": 11.9858, + "step": 21708 + }, + { + "epoch": 1.1821418118206013, + "grad_norm": 0.530537262039549, + "learning_rate": 7.56103048398179e-05, + "loss": 11.9835, + "step": 21709 + }, + { + "epoch": 1.1821962658171843, + "grad_norm": 0.5525937311384657, + "learning_rate": 7.560175297772016e-05, + "loss": 11.9754, + "step": 21710 + }, + { + "epoch": 1.1822507198137673, + "grad_norm": 0.5695581815367907, + "learning_rate": 7.559320130534724e-05, + "loss": 12.0579, + "step": 21711 + }, + { + "epoch": 1.1823051738103503, + "grad_norm": 0.5483632717658697, + "learning_rate": 7.558464982276569e-05, + "loss": 11.8674, + "step": 21712 + }, + { + "epoch": 1.1823596278069333, + "grad_norm": 0.6059415138577536, + "learning_rate": 7.5576098530042e-05, + "loss": 11.7938, + "step": 21713 + }, + { + "epoch": 1.1824140818035165, + "grad_norm": 0.5610916969377091, + "learning_rate": 7.556754742724267e-05, + "loss": 11.7845, + "step": 21714 + }, + { + "epoch": 1.1824685358000995, + "grad_norm": 0.5227998543254345, + "learning_rate": 7.555899651443417e-05, + "loss": 12.0602, + "step": 21715 + }, + { + "epoch": 1.1825229897966825, + "grad_norm": 0.5181757698245862, + "learning_rate": 7.555044579168303e-05, + "loss": 11.9774, + "step": 21716 + }, + { + "epoch": 1.1825774437932655, + "grad_norm": 0.5710367409027894, + "learning_rate": 7.554189525905569e-05, + "loss": 12.0563, + "step": 21717 + }, + { + "epoch": 1.1826318977898485, + "grad_norm": 0.46707603485019517, + "learning_rate": 7.553334491661871e-05, + "loss": 11.9401, + "step": 21718 + }, + { + "epoch": 1.1826863517864314, + "grad_norm": 0.557364000105476, + "learning_rate": 7.552479476443854e-05, + "loss": 12.0401, + "step": 21719 + }, + { + "epoch": 1.1827408057830144, + "grad_norm": 0.5968173852566319, + "learning_rate": 7.55162448025817e-05, + "loss": 11.9308, + "step": 21720 + }, + { + "epoch": 1.1827952597795974, + "grad_norm": 0.6053471368379904, + "learning_rate": 7.550769503111459e-05, + "loss": 12.148, + "step": 21721 + }, + { + "epoch": 1.1828497137761804, + "grad_norm": 0.5733610961834373, + "learning_rate": 7.549914545010377e-05, + "loss": 11.9982, + "step": 21722 + }, + { + "epoch": 1.1829041677727634, + "grad_norm": 0.521571558531158, + "learning_rate": 7.54905960596157e-05, + "loss": 12.0328, + "step": 21723 + }, + { + "epoch": 1.1829586217693464, + "grad_norm": 0.563622430367173, + "learning_rate": 7.548204685971688e-05, + "loss": 12.0217, + "step": 21724 + }, + { + "epoch": 1.1830130757659294, + "grad_norm": 0.5433801146102819, + "learning_rate": 7.547349785047376e-05, + "loss": 11.926, + "step": 21725 + }, + { + "epoch": 1.1830675297625124, + "grad_norm": 0.5531502300311864, + "learning_rate": 7.546494903195284e-05, + "loss": 11.7903, + "step": 21726 + }, + { + "epoch": 1.1831219837590956, + "grad_norm": 0.5920247200071598, + "learning_rate": 7.54564004042206e-05, + "loss": 12.1427, + "step": 21727 + }, + { + "epoch": 1.1831764377556786, + "grad_norm": 0.5443411208538833, + "learning_rate": 7.544785196734347e-05, + "loss": 12.0634, + "step": 21728 + }, + { + "epoch": 1.1832308917522616, + "grad_norm": 0.6103023491678193, + "learning_rate": 7.543930372138799e-05, + "loss": 12.0133, + "step": 21729 + }, + { + "epoch": 1.1832853457488446, + "grad_norm": 0.5284839116957317, + "learning_rate": 7.543075566642063e-05, + "loss": 11.9542, + "step": 21730 + }, + { + "epoch": 1.1833397997454276, + "grad_norm": 0.5370732737599623, + "learning_rate": 7.542220780250781e-05, + "loss": 11.9737, + "step": 21731 + }, + { + "epoch": 1.1833942537420106, + "grad_norm": 0.5081119754958391, + "learning_rate": 7.5413660129716e-05, + "loss": 11.9504, + "step": 21732 + }, + { + "epoch": 1.1834487077385936, + "grad_norm": 0.5198359969619085, + "learning_rate": 7.540511264811172e-05, + "loss": 11.9739, + "step": 21733 + }, + { + "epoch": 1.1835031617351766, + "grad_norm": 0.5294198800766511, + "learning_rate": 7.53965653577614e-05, + "loss": 12.0196, + "step": 21734 + }, + { + "epoch": 1.1835576157317595, + "grad_norm": 0.6708523712972554, + "learning_rate": 7.538801825873151e-05, + "loss": 12.0584, + "step": 21735 + }, + { + "epoch": 1.1836120697283425, + "grad_norm": 0.5905873501532606, + "learning_rate": 7.537947135108852e-05, + "loss": 12.0055, + "step": 21736 + }, + { + "epoch": 1.1836665237249258, + "grad_norm": 0.5280514428250044, + "learning_rate": 7.537092463489888e-05, + "loss": 11.9186, + "step": 21737 + }, + { + "epoch": 1.1837209777215087, + "grad_norm": 0.511630308174885, + "learning_rate": 7.536237811022908e-05, + "loss": 11.8857, + "step": 21738 + }, + { + "epoch": 1.1837754317180917, + "grad_norm": 0.5773849467589317, + "learning_rate": 7.535383177714553e-05, + "loss": 12.0339, + "step": 21739 + }, + { + "epoch": 1.1838298857146747, + "grad_norm": 0.5902880598466377, + "learning_rate": 7.534528563571478e-05, + "loss": 12.0105, + "step": 21740 + }, + { + "epoch": 1.1838843397112577, + "grad_norm": 0.5576658102997276, + "learning_rate": 7.533673968600317e-05, + "loss": 11.8993, + "step": 21741 + }, + { + "epoch": 1.1839387937078407, + "grad_norm": 0.5011873542817606, + "learning_rate": 7.532819392807723e-05, + "loss": 11.9773, + "step": 21742 + }, + { + "epoch": 1.1839932477044237, + "grad_norm": 0.5056615027385634, + "learning_rate": 7.531964836200336e-05, + "loss": 11.9591, + "step": 21743 + }, + { + "epoch": 1.1840477017010067, + "grad_norm": 0.5902346509462059, + "learning_rate": 7.531110298784807e-05, + "loss": 12.0789, + "step": 21744 + }, + { + "epoch": 1.1841021556975897, + "grad_norm": 0.5357587708500191, + "learning_rate": 7.530255780567777e-05, + "loss": 11.9819, + "step": 21745 + }, + { + "epoch": 1.1841566096941727, + "grad_norm": 0.5714466400676492, + "learning_rate": 7.529401281555892e-05, + "loss": 11.9447, + "step": 21746 + }, + { + "epoch": 1.1842110636907557, + "grad_norm": 0.6919689987784267, + "learning_rate": 7.528546801755799e-05, + "loss": 12.066, + "step": 21747 + }, + { + "epoch": 1.1842655176873387, + "grad_norm": 0.569462809235025, + "learning_rate": 7.52769234117414e-05, + "loss": 12.0948, + "step": 21748 + }, + { + "epoch": 1.1843199716839217, + "grad_norm": 0.5582325725628782, + "learning_rate": 7.526837899817559e-05, + "loss": 11.9375, + "step": 21749 + }, + { + "epoch": 1.1843744256805049, + "grad_norm": 0.5830476014734609, + "learning_rate": 7.525983477692703e-05, + "loss": 12.0411, + "step": 21750 + }, + { + "epoch": 1.1844288796770879, + "grad_norm": 0.6654803020952056, + "learning_rate": 7.525129074806213e-05, + "loss": 12.0759, + "step": 21751 + }, + { + "epoch": 1.1844833336736709, + "grad_norm": 0.5128129143192934, + "learning_rate": 7.524274691164734e-05, + "loss": 11.9515, + "step": 21752 + }, + { + "epoch": 1.1845377876702539, + "grad_norm": 0.6029074652028825, + "learning_rate": 7.523420326774911e-05, + "loss": 11.9985, + "step": 21753 + }, + { + "epoch": 1.1845922416668369, + "grad_norm": 0.5067885544790296, + "learning_rate": 7.522565981643387e-05, + "loss": 11.8589, + "step": 21754 + }, + { + "epoch": 1.1846466956634198, + "grad_norm": 0.5158502405510174, + "learning_rate": 7.521711655776802e-05, + "loss": 11.9687, + "step": 21755 + }, + { + "epoch": 1.1847011496600028, + "grad_norm": 0.5380068678218914, + "learning_rate": 7.520857349181806e-05, + "loss": 12.0553, + "step": 21756 + }, + { + "epoch": 1.1847556036565858, + "grad_norm": 0.5258209740523851, + "learning_rate": 7.520003061865038e-05, + "loss": 11.9678, + "step": 21757 + }, + { + "epoch": 1.1848100576531688, + "grad_norm": 0.5674122484543469, + "learning_rate": 7.519148793833143e-05, + "loss": 12.027, + "step": 21758 + }, + { + "epoch": 1.1848645116497518, + "grad_norm": 0.5415370607754529, + "learning_rate": 7.518294545092763e-05, + "loss": 11.8541, + "step": 21759 + }, + { + "epoch": 1.1849189656463348, + "grad_norm": 0.5215861042928728, + "learning_rate": 7.517440315650542e-05, + "loss": 11.8873, + "step": 21760 + }, + { + "epoch": 1.184973419642918, + "grad_norm": 0.7521342107961826, + "learning_rate": 7.51658610551312e-05, + "loss": 12.246, + "step": 21761 + }, + { + "epoch": 1.185027873639501, + "grad_norm": 0.5522051040985905, + "learning_rate": 7.51573191468714e-05, + "loss": 11.9465, + "step": 21762 + }, + { + "epoch": 1.185082327636084, + "grad_norm": 0.5986971920610508, + "learning_rate": 7.514877743179248e-05, + "loss": 12.0501, + "step": 21763 + }, + { + "epoch": 1.185136781632667, + "grad_norm": 0.6577498856361116, + "learning_rate": 7.514023590996081e-05, + "loss": 12.0744, + "step": 21764 + }, + { + "epoch": 1.18519123562925, + "grad_norm": 0.5709753816871418, + "learning_rate": 7.513169458144284e-05, + "loss": 11.8767, + "step": 21765 + }, + { + "epoch": 1.185245689625833, + "grad_norm": 0.5424432614333545, + "learning_rate": 7.512315344630496e-05, + "loss": 11.942, + "step": 21766 + }, + { + "epoch": 1.185300143622416, + "grad_norm": 0.5524916153826994, + "learning_rate": 7.511461250461364e-05, + "loss": 12.0522, + "step": 21767 + }, + { + "epoch": 1.185354597618999, + "grad_norm": 0.5386178542301641, + "learning_rate": 7.510607175643525e-05, + "loss": 12.0999, + "step": 21768 + }, + { + "epoch": 1.185409051615582, + "grad_norm": 0.5302354868198255, + "learning_rate": 7.509753120183624e-05, + "loss": 12.0432, + "step": 21769 + }, + { + "epoch": 1.185463505612165, + "grad_norm": 0.5581106828952281, + "learning_rate": 7.508899084088301e-05, + "loss": 12.0233, + "step": 21770 + }, + { + "epoch": 1.185517959608748, + "grad_norm": 0.5425977101660957, + "learning_rate": 7.508045067364194e-05, + "loss": 12.047, + "step": 21771 + }, + { + "epoch": 1.185572413605331, + "grad_norm": 0.593887575565652, + "learning_rate": 7.507191070017948e-05, + "loss": 11.974, + "step": 21772 + }, + { + "epoch": 1.185626867601914, + "grad_norm": 0.5225432690410278, + "learning_rate": 7.506337092056202e-05, + "loss": 12.0737, + "step": 21773 + }, + { + "epoch": 1.1856813215984972, + "grad_norm": 0.6143012266055953, + "learning_rate": 7.505483133485595e-05, + "loss": 11.9907, + "step": 21774 + }, + { + "epoch": 1.1857357755950801, + "grad_norm": 0.5629345433031676, + "learning_rate": 7.504629194312773e-05, + "loss": 11.9683, + "step": 21775 + }, + { + "epoch": 1.1857902295916631, + "grad_norm": 0.581947804657824, + "learning_rate": 7.50377527454437e-05, + "loss": 11.9701, + "step": 21776 + }, + { + "epoch": 1.1858446835882461, + "grad_norm": 0.5147110297629652, + "learning_rate": 7.502921374187029e-05, + "loss": 11.9844, + "step": 21777 + }, + { + "epoch": 1.1858991375848291, + "grad_norm": 0.5407679106530497, + "learning_rate": 7.50206749324739e-05, + "loss": 11.8965, + "step": 21778 + }, + { + "epoch": 1.1859535915814121, + "grad_norm": 0.5643712912136952, + "learning_rate": 7.501213631732095e-05, + "loss": 12.0641, + "step": 21779 + }, + { + "epoch": 1.186008045577995, + "grad_norm": 0.6000764496420661, + "learning_rate": 7.500359789647785e-05, + "loss": 12.1549, + "step": 21780 + }, + { + "epoch": 1.186062499574578, + "grad_norm": 0.5520801822640712, + "learning_rate": 7.49950596700109e-05, + "loss": 11.9133, + "step": 21781 + }, + { + "epoch": 1.186116953571161, + "grad_norm": 0.6109658539324219, + "learning_rate": 7.498652163798658e-05, + "loss": 11.9836, + "step": 21782 + }, + { + "epoch": 1.186171407567744, + "grad_norm": 0.5252949920892036, + "learning_rate": 7.497798380047127e-05, + "loss": 12.1093, + "step": 21783 + }, + { + "epoch": 1.1862258615643273, + "grad_norm": 0.6179409760128146, + "learning_rate": 7.496944615753136e-05, + "loss": 12.0782, + "step": 21784 + }, + { + "epoch": 1.1862803155609103, + "grad_norm": 0.5840217304614371, + "learning_rate": 7.496090870923323e-05, + "loss": 11.9846, + "step": 21785 + }, + { + "epoch": 1.1863347695574933, + "grad_norm": 0.594856080206936, + "learning_rate": 7.495237145564327e-05, + "loss": 12.0495, + "step": 21786 + }, + { + "epoch": 1.1863892235540763, + "grad_norm": 0.5039334530956467, + "learning_rate": 7.494383439682787e-05, + "loss": 11.902, + "step": 21787 + }, + { + "epoch": 1.1864436775506593, + "grad_norm": 0.48203965829591866, + "learning_rate": 7.493529753285339e-05, + "loss": 11.7758, + "step": 21788 + }, + { + "epoch": 1.1864981315472423, + "grad_norm": 0.5000380138956201, + "learning_rate": 7.492676086378626e-05, + "loss": 11.9807, + "step": 21789 + }, + { + "epoch": 1.1865525855438253, + "grad_norm": 0.5560242956456776, + "learning_rate": 7.491822438969289e-05, + "loss": 11.9622, + "step": 21790 + }, + { + "epoch": 1.1866070395404082, + "grad_norm": 0.561093459286848, + "learning_rate": 7.490968811063956e-05, + "loss": 11.9649, + "step": 21791 + }, + { + "epoch": 1.1866614935369912, + "grad_norm": 0.569981991409385, + "learning_rate": 7.49011520266927e-05, + "loss": 12.2138, + "step": 21792 + }, + { + "epoch": 1.1867159475335742, + "grad_norm": 0.5575985421141544, + "learning_rate": 7.489261613791871e-05, + "loss": 11.9822, + "step": 21793 + }, + { + "epoch": 1.1867704015301572, + "grad_norm": 0.4961472956487318, + "learning_rate": 7.488408044438393e-05, + "loss": 12.0297, + "step": 21794 + }, + { + "epoch": 1.1868248555267402, + "grad_norm": 0.5311876448727464, + "learning_rate": 7.487554494615476e-05, + "loss": 12.059, + "step": 21795 + }, + { + "epoch": 1.1868793095233232, + "grad_norm": 0.5626886924773438, + "learning_rate": 7.486700964329758e-05, + "loss": 11.9969, + "step": 21796 + }, + { + "epoch": 1.1869337635199064, + "grad_norm": 0.5843387177835994, + "learning_rate": 7.485847453587873e-05, + "loss": 12.0714, + "step": 21797 + }, + { + "epoch": 1.1869882175164894, + "grad_norm": 0.5189952502279995, + "learning_rate": 7.48499396239646e-05, + "loss": 12.1539, + "step": 21798 + }, + { + "epoch": 1.1870426715130724, + "grad_norm": 0.5209743899056666, + "learning_rate": 7.484140490762158e-05, + "loss": 11.8418, + "step": 21799 + }, + { + "epoch": 1.1870971255096554, + "grad_norm": 0.4994690385578024, + "learning_rate": 7.483287038691601e-05, + "loss": 11.8865, + "step": 21800 + }, + { + "epoch": 1.1871515795062384, + "grad_norm": 0.5424976862399479, + "learning_rate": 7.482433606191426e-05, + "loss": 12.0755, + "step": 21801 + }, + { + "epoch": 1.1872060335028214, + "grad_norm": 0.5428216072561727, + "learning_rate": 7.481580193268267e-05, + "loss": 11.8406, + "step": 21802 + }, + { + "epoch": 1.1872604874994044, + "grad_norm": 0.611819389257739, + "learning_rate": 7.480726799928764e-05, + "loss": 11.9912, + "step": 21803 + }, + { + "epoch": 1.1873149414959874, + "grad_norm": 0.5115133464397964, + "learning_rate": 7.47987342617955e-05, + "loss": 12.0168, + "step": 21804 + }, + { + "epoch": 1.1873693954925704, + "grad_norm": 0.6358308408573634, + "learning_rate": 7.479020072027266e-05, + "loss": 12.166, + "step": 21805 + }, + { + "epoch": 1.1874238494891534, + "grad_norm": 0.5419860484051422, + "learning_rate": 7.478166737478543e-05, + "loss": 12.1022, + "step": 21806 + }, + { + "epoch": 1.1874783034857366, + "grad_norm": 0.50538323000821, + "learning_rate": 7.477313422540017e-05, + "loss": 11.9044, + "step": 21807 + }, + { + "epoch": 1.1875327574823196, + "grad_norm": 0.5468604627982997, + "learning_rate": 7.476460127218328e-05, + "loss": 12.081, + "step": 21808 + }, + { + "epoch": 1.1875872114789026, + "grad_norm": 0.5568421073854486, + "learning_rate": 7.475606851520107e-05, + "loss": 12.0435, + "step": 21809 + }, + { + "epoch": 1.1876416654754856, + "grad_norm": 0.6510994234584724, + "learning_rate": 7.474753595451992e-05, + "loss": 12.0943, + "step": 21810 + }, + { + "epoch": 1.1876961194720685, + "grad_norm": 0.5522041966154742, + "learning_rate": 7.473900359020615e-05, + "loss": 12.0387, + "step": 21811 + }, + { + "epoch": 1.1877505734686515, + "grad_norm": 0.6119962614850255, + "learning_rate": 7.473047142232611e-05, + "loss": 12.0829, + "step": 21812 + }, + { + "epoch": 1.1878050274652345, + "grad_norm": 0.5305426444618092, + "learning_rate": 7.472193945094619e-05, + "loss": 12.0808, + "step": 21813 + }, + { + "epoch": 1.1878594814618175, + "grad_norm": 0.6337365180981813, + "learning_rate": 7.47134076761327e-05, + "loss": 12.129, + "step": 21814 + }, + { + "epoch": 1.1879139354584005, + "grad_norm": 0.5250699532031685, + "learning_rate": 7.470487609795197e-05, + "loss": 11.983, + "step": 21815 + }, + { + "epoch": 1.1879683894549835, + "grad_norm": 0.5471512499498226, + "learning_rate": 7.469634471647037e-05, + "loss": 12.1291, + "step": 21816 + }, + { + "epoch": 1.1880228434515665, + "grad_norm": 0.5467733525912786, + "learning_rate": 7.468781353175425e-05, + "loss": 11.9967, + "step": 21817 + }, + { + "epoch": 1.1880772974481495, + "grad_norm": 0.50643789381646, + "learning_rate": 7.467928254386993e-05, + "loss": 11.8694, + "step": 21818 + }, + { + "epoch": 1.1881317514447325, + "grad_norm": 0.6046764353226523, + "learning_rate": 7.467075175288377e-05, + "loss": 12.0005, + "step": 21819 + }, + { + "epoch": 1.1881862054413157, + "grad_norm": 0.5516677753495105, + "learning_rate": 7.466222115886208e-05, + "loss": 11.8673, + "step": 21820 + }, + { + "epoch": 1.1882406594378987, + "grad_norm": 0.5172722454661293, + "learning_rate": 7.465369076187124e-05, + "loss": 11.9521, + "step": 21821 + }, + { + "epoch": 1.1882951134344817, + "grad_norm": 0.49449399673176725, + "learning_rate": 7.46451605619775e-05, + "loss": 11.9785, + "step": 21822 + }, + { + "epoch": 1.1883495674310647, + "grad_norm": 0.6243888753730924, + "learning_rate": 7.463663055924728e-05, + "loss": 11.9585, + "step": 21823 + }, + { + "epoch": 1.1884040214276477, + "grad_norm": 0.5698569554828076, + "learning_rate": 7.462810075374685e-05, + "loss": 11.8915, + "step": 21824 + }, + { + "epoch": 1.1884584754242307, + "grad_norm": 0.5130897372640837, + "learning_rate": 7.461957114554256e-05, + "loss": 12.0606, + "step": 21825 + }, + { + "epoch": 1.1885129294208137, + "grad_norm": 0.585214140568694, + "learning_rate": 7.461104173470075e-05, + "loss": 12.0466, + "step": 21826 + }, + { + "epoch": 1.1885673834173967, + "grad_norm": 0.8057724923497462, + "learning_rate": 7.460251252128774e-05, + "loss": 12.0085, + "step": 21827 + }, + { + "epoch": 1.1886218374139796, + "grad_norm": 0.5206311767301525, + "learning_rate": 7.459398350536985e-05, + "loss": 12.0089, + "step": 21828 + }, + { + "epoch": 1.1886762914105626, + "grad_norm": 0.5250316044303713, + "learning_rate": 7.458545468701341e-05, + "loss": 12.0066, + "step": 21829 + }, + { + "epoch": 1.1887307454071459, + "grad_norm": 0.5309614924213232, + "learning_rate": 7.457692606628473e-05, + "loss": 11.9824, + "step": 21830 + }, + { + "epoch": 1.1887851994037288, + "grad_norm": 0.6201426355334324, + "learning_rate": 7.456839764325016e-05, + "loss": 11.9513, + "step": 21831 + }, + { + "epoch": 1.1888396534003118, + "grad_norm": 0.6264038379079032, + "learning_rate": 7.455986941797597e-05, + "loss": 12.1016, + "step": 21832 + }, + { + "epoch": 1.1888941073968948, + "grad_norm": 0.5442003806645258, + "learning_rate": 7.45513413905285e-05, + "loss": 12.0651, + "step": 21833 + }, + { + "epoch": 1.1889485613934778, + "grad_norm": 0.5446554848843562, + "learning_rate": 7.454281356097407e-05, + "loss": 11.9691, + "step": 21834 + }, + { + "epoch": 1.1890030153900608, + "grad_norm": 0.6018851851438473, + "learning_rate": 7.453428592937901e-05, + "loss": 12.0177, + "step": 21835 + }, + { + "epoch": 1.1890574693866438, + "grad_norm": 0.5436708939061847, + "learning_rate": 7.45257584958096e-05, + "loss": 11.8384, + "step": 21836 + }, + { + "epoch": 1.1891119233832268, + "grad_norm": 0.5096974488486813, + "learning_rate": 7.451723126033214e-05, + "loss": 11.8501, + "step": 21837 + }, + { + "epoch": 1.1891663773798098, + "grad_norm": 0.6681927843726951, + "learning_rate": 7.450870422301298e-05, + "loss": 12.0757, + "step": 21838 + }, + { + "epoch": 1.1892208313763928, + "grad_norm": 0.5688654042324359, + "learning_rate": 7.450017738391841e-05, + "loss": 11.7817, + "step": 21839 + }, + { + "epoch": 1.1892752853729758, + "grad_norm": 0.5095351042546655, + "learning_rate": 7.449165074311475e-05, + "loss": 11.9059, + "step": 21840 + }, + { + "epoch": 1.1893297393695588, + "grad_norm": 0.5143100203772302, + "learning_rate": 7.448312430066831e-05, + "loss": 12.0267, + "step": 21841 + }, + { + "epoch": 1.1893841933661418, + "grad_norm": 0.5155825069420525, + "learning_rate": 7.447459805664534e-05, + "loss": 11.8781, + "step": 21842 + }, + { + "epoch": 1.1894386473627248, + "grad_norm": 0.5223028192969036, + "learning_rate": 7.446607201111219e-05, + "loss": 12.0149, + "step": 21843 + }, + { + "epoch": 1.189493101359308, + "grad_norm": 0.5689111220899553, + "learning_rate": 7.445754616413514e-05, + "loss": 11.7965, + "step": 21844 + }, + { + "epoch": 1.189547555355891, + "grad_norm": 0.5745430197939587, + "learning_rate": 7.444902051578049e-05, + "loss": 12.1652, + "step": 21845 + }, + { + "epoch": 1.189602009352474, + "grad_norm": 0.5448431461543952, + "learning_rate": 7.444049506611454e-05, + "loss": 11.8951, + "step": 21846 + }, + { + "epoch": 1.189656463349057, + "grad_norm": 0.5872918584595909, + "learning_rate": 7.44319698152036e-05, + "loss": 12.0168, + "step": 21847 + }, + { + "epoch": 1.18971091734564, + "grad_norm": 0.5226448915531637, + "learning_rate": 7.442344476311393e-05, + "loss": 12.0349, + "step": 21848 + }, + { + "epoch": 1.189765371342223, + "grad_norm": 0.5432356020572908, + "learning_rate": 7.441491990991185e-05, + "loss": 11.9819, + "step": 21849 + }, + { + "epoch": 1.189819825338806, + "grad_norm": 0.5628765091838128, + "learning_rate": 7.440639525566365e-05, + "loss": 12.068, + "step": 21850 + }, + { + "epoch": 1.189874279335389, + "grad_norm": 0.531129064044447, + "learning_rate": 7.439787080043565e-05, + "loss": 11.977, + "step": 21851 + }, + { + "epoch": 1.189928733331972, + "grad_norm": 0.5306473030453958, + "learning_rate": 7.438934654429407e-05, + "loss": 11.9767, + "step": 21852 + }, + { + "epoch": 1.189983187328555, + "grad_norm": 0.6468496902013098, + "learning_rate": 7.438082248730521e-05, + "loss": 12.0752, + "step": 21853 + }, + { + "epoch": 1.1900376413251381, + "grad_norm": 0.662395989981953, + "learning_rate": 7.437229862953536e-05, + "loss": 12.1451, + "step": 21854 + }, + { + "epoch": 1.190092095321721, + "grad_norm": 0.5634303787723975, + "learning_rate": 7.436377497105085e-05, + "loss": 11.8939, + "step": 21855 + }, + { + "epoch": 1.190146549318304, + "grad_norm": 0.5284039031704981, + "learning_rate": 7.43552515119179e-05, + "loss": 12.0463, + "step": 21856 + }, + { + "epoch": 1.190201003314887, + "grad_norm": 0.5481202976593118, + "learning_rate": 7.434672825220283e-05, + "loss": 12.0324, + "step": 21857 + }, + { + "epoch": 1.19025545731147, + "grad_norm": 0.5671166962659867, + "learning_rate": 7.433820519197189e-05, + "loss": 12.0328, + "step": 21858 + }, + { + "epoch": 1.190309911308053, + "grad_norm": 0.552763651588234, + "learning_rate": 7.432968233129139e-05, + "loss": 12.0551, + "step": 21859 + }, + { + "epoch": 1.190364365304636, + "grad_norm": 0.5391150052421613, + "learning_rate": 7.432115967022754e-05, + "loss": 11.999, + "step": 21860 + }, + { + "epoch": 1.190418819301219, + "grad_norm": 0.5515344211903856, + "learning_rate": 7.431263720884674e-05, + "loss": 11.9639, + "step": 21861 + }, + { + "epoch": 1.190473273297802, + "grad_norm": 0.4927736099365677, + "learning_rate": 7.430411494721512e-05, + "loss": 12.0217, + "step": 21862 + }, + { + "epoch": 1.190527727294385, + "grad_norm": 0.6616589330031137, + "learning_rate": 7.429559288539903e-05, + "loss": 12.0721, + "step": 21863 + }, + { + "epoch": 1.190582181290968, + "grad_norm": 0.5585836406806164, + "learning_rate": 7.428707102346469e-05, + "loss": 11.963, + "step": 21864 + }, + { + "epoch": 1.190636635287551, + "grad_norm": 0.5778445956640136, + "learning_rate": 7.427854936147841e-05, + "loss": 12.1283, + "step": 21865 + }, + { + "epoch": 1.190691089284134, + "grad_norm": 0.5773331365814527, + "learning_rate": 7.427002789950645e-05, + "loss": 12.102, + "step": 21866 + }, + { + "epoch": 1.1907455432807172, + "grad_norm": 0.6090155689074539, + "learning_rate": 7.426150663761508e-05, + "loss": 12.058, + "step": 21867 + }, + { + "epoch": 1.1907999972773002, + "grad_norm": 0.5268971066809944, + "learning_rate": 7.425298557587054e-05, + "loss": 11.9267, + "step": 21868 + }, + { + "epoch": 1.1908544512738832, + "grad_norm": 0.5352024391379133, + "learning_rate": 7.42444647143391e-05, + "loss": 11.8219, + "step": 21869 + }, + { + "epoch": 1.1909089052704662, + "grad_norm": 0.5267535550203765, + "learning_rate": 7.423594405308703e-05, + "loss": 11.9724, + "step": 21870 + }, + { + "epoch": 1.1909633592670492, + "grad_norm": 0.5316513047581957, + "learning_rate": 7.42274235921806e-05, + "loss": 12.0106, + "step": 21871 + }, + { + "epoch": 1.1910178132636322, + "grad_norm": 0.5899821522090289, + "learning_rate": 7.421890333168602e-05, + "loss": 12.1346, + "step": 21872 + }, + { + "epoch": 1.1910722672602152, + "grad_norm": 0.5992321160668765, + "learning_rate": 7.421038327166958e-05, + "loss": 12.0992, + "step": 21873 + }, + { + "epoch": 1.1911267212567982, + "grad_norm": 0.5220059104085887, + "learning_rate": 7.420186341219751e-05, + "loss": 12.0391, + "step": 21874 + }, + { + "epoch": 1.1911811752533812, + "grad_norm": 0.5483952568272, + "learning_rate": 7.419334375333606e-05, + "loss": 12.0126, + "step": 21875 + }, + { + "epoch": 1.1912356292499642, + "grad_norm": 0.5386623185595957, + "learning_rate": 7.418482429515152e-05, + "loss": 12.078, + "step": 21876 + }, + { + "epoch": 1.1912900832465474, + "grad_norm": 0.5439532173277934, + "learning_rate": 7.417630503771013e-05, + "loss": 12.0004, + "step": 21877 + }, + { + "epoch": 1.1913445372431304, + "grad_norm": 0.5736166552730142, + "learning_rate": 7.41677859810781e-05, + "loss": 12.0908, + "step": 21878 + }, + { + "epoch": 1.1913989912397134, + "grad_norm": 0.5648809372426857, + "learning_rate": 7.41592671253217e-05, + "loss": 11.9778, + "step": 21879 + }, + { + "epoch": 1.1914534452362964, + "grad_norm": 0.48510183035173565, + "learning_rate": 7.41507484705072e-05, + "loss": 11.9495, + "step": 21880 + }, + { + "epoch": 1.1915078992328794, + "grad_norm": 0.5211179210723154, + "learning_rate": 7.41422300167008e-05, + "loss": 11.9781, + "step": 21881 + }, + { + "epoch": 1.1915623532294624, + "grad_norm": 0.5299213249093413, + "learning_rate": 7.413371176396876e-05, + "loss": 12.0785, + "step": 21882 + }, + { + "epoch": 1.1916168072260453, + "grad_norm": 0.49308108682965807, + "learning_rate": 7.41251937123773e-05, + "loss": 11.9841, + "step": 21883 + }, + { + "epoch": 1.1916712612226283, + "grad_norm": 0.5342328033936905, + "learning_rate": 7.41166758619927e-05, + "loss": 11.8857, + "step": 21884 + }, + { + "epoch": 1.1917257152192113, + "grad_norm": 0.5273168195257198, + "learning_rate": 7.410815821288113e-05, + "loss": 12.0555, + "step": 21885 + }, + { + "epoch": 1.1917801692157943, + "grad_norm": 0.5528610747944571, + "learning_rate": 7.409964076510886e-05, + "loss": 11.9563, + "step": 21886 + }, + { + "epoch": 1.1918346232123773, + "grad_norm": 0.5396212993556745, + "learning_rate": 7.409112351874214e-05, + "loss": 11.9265, + "step": 21887 + }, + { + "epoch": 1.1918890772089603, + "grad_norm": 0.5604629835030344, + "learning_rate": 7.40826064738472e-05, + "loss": 11.9565, + "step": 21888 + }, + { + "epoch": 1.1919435312055433, + "grad_norm": 0.5256247292380115, + "learning_rate": 7.407408963049027e-05, + "loss": 11.9664, + "step": 21889 + }, + { + "epoch": 1.1919979852021265, + "grad_norm": 0.5554092422140184, + "learning_rate": 7.406557298873754e-05, + "loss": 11.9756, + "step": 21890 + }, + { + "epoch": 1.1920524391987095, + "grad_norm": 0.5385375597876597, + "learning_rate": 7.40570565486553e-05, + "loss": 11.9556, + "step": 21891 + }, + { + "epoch": 1.1921068931952925, + "grad_norm": 0.5466236276669121, + "learning_rate": 7.404854031030971e-05, + "loss": 11.9882, + "step": 21892 + }, + { + "epoch": 1.1921613471918755, + "grad_norm": 0.610434010639925, + "learning_rate": 7.404002427376703e-05, + "loss": 11.9038, + "step": 21893 + }, + { + "epoch": 1.1922158011884585, + "grad_norm": 0.5481300770223017, + "learning_rate": 7.403150843909348e-05, + "loss": 11.7977, + "step": 21894 + }, + { + "epoch": 1.1922702551850415, + "grad_norm": 0.5432539067052593, + "learning_rate": 7.402299280635526e-05, + "loss": 11.8553, + "step": 21895 + }, + { + "epoch": 1.1923247091816245, + "grad_norm": 0.5326296538994771, + "learning_rate": 7.401447737561862e-05, + "loss": 12.0045, + "step": 21896 + }, + { + "epoch": 1.1923791631782075, + "grad_norm": 0.48761894352387997, + "learning_rate": 7.400596214694973e-05, + "loss": 11.9545, + "step": 21897 + }, + { + "epoch": 1.1924336171747905, + "grad_norm": 0.5777590971349343, + "learning_rate": 7.399744712041485e-05, + "loss": 12.0196, + "step": 21898 + }, + { + "epoch": 1.1924880711713735, + "grad_norm": 0.5979944522655282, + "learning_rate": 7.398893229608019e-05, + "loss": 11.9432, + "step": 21899 + }, + { + "epoch": 1.1925425251679567, + "grad_norm": 0.5351970404514349, + "learning_rate": 7.398041767401196e-05, + "loss": 11.9262, + "step": 21900 + }, + { + "epoch": 1.1925969791645397, + "grad_norm": 0.5851772163331859, + "learning_rate": 7.397190325427641e-05, + "loss": 12.0415, + "step": 21901 + }, + { + "epoch": 1.1926514331611227, + "grad_norm": 0.5520233932528713, + "learning_rate": 7.396338903693964e-05, + "loss": 11.9481, + "step": 21902 + }, + { + "epoch": 1.1927058871577056, + "grad_norm": 0.5674022158261073, + "learning_rate": 7.395487502206794e-05, + "loss": 11.8746, + "step": 21903 + }, + { + "epoch": 1.1927603411542886, + "grad_norm": 0.5574839517552891, + "learning_rate": 7.394636120972749e-05, + "loss": 11.9236, + "step": 21904 + }, + { + "epoch": 1.1928147951508716, + "grad_norm": 0.5526424355538554, + "learning_rate": 7.393784759998452e-05, + "loss": 11.8955, + "step": 21905 + }, + { + "epoch": 1.1928692491474546, + "grad_norm": 0.5830260492488549, + "learning_rate": 7.392933419290522e-05, + "loss": 11.9274, + "step": 21906 + }, + { + "epoch": 1.1929237031440376, + "grad_norm": 0.5597230753997932, + "learning_rate": 7.392082098855577e-05, + "loss": 12.0131, + "step": 21907 + }, + { + "epoch": 1.1929781571406206, + "grad_norm": 0.6554421652823247, + "learning_rate": 7.391230798700242e-05, + "loss": 12.031, + "step": 21908 + }, + { + "epoch": 1.1930326111372036, + "grad_norm": 0.5903428092902078, + "learning_rate": 7.390379518831129e-05, + "loss": 12.0472, + "step": 21909 + }, + { + "epoch": 1.1930870651337866, + "grad_norm": 0.5491708507929751, + "learning_rate": 7.389528259254866e-05, + "loss": 11.9285, + "step": 21910 + }, + { + "epoch": 1.1931415191303696, + "grad_norm": 0.5961736138488625, + "learning_rate": 7.388677019978072e-05, + "loss": 11.9876, + "step": 21911 + }, + { + "epoch": 1.1931959731269526, + "grad_norm": 0.5362989632073789, + "learning_rate": 7.387825801007359e-05, + "loss": 11.9254, + "step": 21912 + }, + { + "epoch": 1.1932504271235358, + "grad_norm": 0.59063688744146, + "learning_rate": 7.38697460234935e-05, + "loss": 12.0482, + "step": 21913 + }, + { + "epoch": 1.1933048811201188, + "grad_norm": 0.538421744485904, + "learning_rate": 7.386123424010667e-05, + "loss": 11.9666, + "step": 21914 + }, + { + "epoch": 1.1933593351167018, + "grad_norm": 0.5636419177521339, + "learning_rate": 7.385272265997924e-05, + "loss": 11.8589, + "step": 21915 + }, + { + "epoch": 1.1934137891132848, + "grad_norm": 0.5653417044781114, + "learning_rate": 7.384421128317746e-05, + "loss": 11.9675, + "step": 21916 + }, + { + "epoch": 1.1934682431098678, + "grad_norm": 0.5327041947186542, + "learning_rate": 7.383570010976746e-05, + "loss": 11.9112, + "step": 21917 + }, + { + "epoch": 1.1935226971064508, + "grad_norm": 0.5864609024597708, + "learning_rate": 7.382718913981543e-05, + "loss": 11.9947, + "step": 21918 + }, + { + "epoch": 1.1935771511030338, + "grad_norm": 0.6286781208678304, + "learning_rate": 7.381867837338758e-05, + "loss": 11.9866, + "step": 21919 + }, + { + "epoch": 1.1936316050996167, + "grad_norm": 0.5571969791110689, + "learning_rate": 7.381016781055007e-05, + "loss": 11.8934, + "step": 21920 + }, + { + "epoch": 1.1936860590961997, + "grad_norm": 0.5391164813349384, + "learning_rate": 7.380165745136914e-05, + "loss": 11.9263, + "step": 21921 + }, + { + "epoch": 1.1937405130927827, + "grad_norm": 0.5685581972310029, + "learning_rate": 7.379314729591086e-05, + "loss": 11.9015, + "step": 21922 + }, + { + "epoch": 1.1937949670893657, + "grad_norm": 0.5347922702222191, + "learning_rate": 7.378463734424148e-05, + "loss": 11.9034, + "step": 21923 + }, + { + "epoch": 1.193849421085949, + "grad_norm": 0.5334756761875965, + "learning_rate": 7.377612759642714e-05, + "loss": 11.8916, + "step": 21924 + }, + { + "epoch": 1.193903875082532, + "grad_norm": 0.5378746969796901, + "learning_rate": 7.376761805253402e-05, + "loss": 11.9487, + "step": 21925 + }, + { + "epoch": 1.193958329079115, + "grad_norm": 0.5632364163634869, + "learning_rate": 7.375910871262832e-05, + "loss": 11.9742, + "step": 21926 + }, + { + "epoch": 1.194012783075698, + "grad_norm": 0.5509891768269785, + "learning_rate": 7.37505995767762e-05, + "loss": 11.9957, + "step": 21927 + }, + { + "epoch": 1.194067237072281, + "grad_norm": 0.5028240155737436, + "learning_rate": 7.37420906450438e-05, + "loss": 11.9756, + "step": 21928 + }, + { + "epoch": 1.194121691068864, + "grad_norm": 0.5700726798776007, + "learning_rate": 7.373358191749732e-05, + "loss": 11.9226, + "step": 21929 + }, + { + "epoch": 1.194176145065447, + "grad_norm": 0.640326226880041, + "learning_rate": 7.372507339420291e-05, + "loss": 12.0049, + "step": 21930 + }, + { + "epoch": 1.1942305990620299, + "grad_norm": 0.5734539135195217, + "learning_rate": 7.371656507522676e-05, + "loss": 12.0835, + "step": 21931 + }, + { + "epoch": 1.1942850530586129, + "grad_norm": 0.5266682529185589, + "learning_rate": 7.370805696063499e-05, + "loss": 11.9226, + "step": 21932 + }, + { + "epoch": 1.1943395070551959, + "grad_norm": 0.5321589207592013, + "learning_rate": 7.369954905049376e-05, + "loss": 11.8316, + "step": 21933 + }, + { + "epoch": 1.1943939610517789, + "grad_norm": 0.6018036908920806, + "learning_rate": 7.369104134486926e-05, + "loss": 11.9235, + "step": 21934 + }, + { + "epoch": 1.1944484150483619, + "grad_norm": 0.5212117869171325, + "learning_rate": 7.368253384382761e-05, + "loss": 12.0144, + "step": 21935 + }, + { + "epoch": 1.1945028690449448, + "grad_norm": 0.55760427495541, + "learning_rate": 7.367402654743503e-05, + "loss": 11.8912, + "step": 21936 + }, + { + "epoch": 1.194557323041528, + "grad_norm": 0.5723866896650784, + "learning_rate": 7.36655194557576e-05, + "loss": 12.0242, + "step": 21937 + }, + { + "epoch": 1.194611777038111, + "grad_norm": 0.6439026464612314, + "learning_rate": 7.365701256886152e-05, + "loss": 12.0879, + "step": 21938 + }, + { + "epoch": 1.194666231034694, + "grad_norm": 0.6360830880730395, + "learning_rate": 7.364850588681293e-05, + "loss": 11.77, + "step": 21939 + }, + { + "epoch": 1.194720685031277, + "grad_norm": 0.5790733971628051, + "learning_rate": 7.363999940967799e-05, + "loss": 12.0057, + "step": 21940 + }, + { + "epoch": 1.19477513902786, + "grad_norm": 0.5885949248156588, + "learning_rate": 7.363149313752284e-05, + "loss": 11.979, + "step": 21941 + }, + { + "epoch": 1.194829593024443, + "grad_norm": 0.5387787434352426, + "learning_rate": 7.362298707041361e-05, + "loss": 11.9989, + "step": 21942 + }, + { + "epoch": 1.194884047021026, + "grad_norm": 0.5671488692927811, + "learning_rate": 7.361448120841645e-05, + "loss": 11.9839, + "step": 21943 + }, + { + "epoch": 1.194938501017609, + "grad_norm": 0.5458994071998429, + "learning_rate": 7.360597555159752e-05, + "loss": 12.0155, + "step": 21944 + }, + { + "epoch": 1.194992955014192, + "grad_norm": 0.5285626015856262, + "learning_rate": 7.359747010002294e-05, + "loss": 11.8917, + "step": 21945 + }, + { + "epoch": 1.195047409010775, + "grad_norm": 0.5591984973781156, + "learning_rate": 7.358896485375883e-05, + "loss": 11.9563, + "step": 21946 + }, + { + "epoch": 1.1951018630073582, + "grad_norm": 0.5750925795714951, + "learning_rate": 7.358045981287141e-05, + "loss": 12.0029, + "step": 21947 + }, + { + "epoch": 1.1951563170039412, + "grad_norm": 0.5418965446382998, + "learning_rate": 7.357195497742673e-05, + "loss": 11.9467, + "step": 21948 + }, + { + "epoch": 1.1952107710005242, + "grad_norm": 0.539027786051126, + "learning_rate": 7.356345034749098e-05, + "loss": 11.9311, + "step": 21949 + }, + { + "epoch": 1.1952652249971072, + "grad_norm": 0.5456273462070516, + "learning_rate": 7.355494592313026e-05, + "loss": 12.0312, + "step": 21950 + }, + { + "epoch": 1.1953196789936902, + "grad_norm": 0.5476710821233999, + "learning_rate": 7.354644170441075e-05, + "loss": 11.9479, + "step": 21951 + }, + { + "epoch": 1.1953741329902732, + "grad_norm": 0.5823357085415194, + "learning_rate": 7.353793769139851e-05, + "loss": 12.0686, + "step": 21952 + }, + { + "epoch": 1.1954285869868562, + "grad_norm": 0.5753955765326135, + "learning_rate": 7.352943388415973e-05, + "loss": 12.0287, + "step": 21953 + }, + { + "epoch": 1.1954830409834392, + "grad_norm": 0.5315100817541516, + "learning_rate": 7.35209302827605e-05, + "loss": 11.9147, + "step": 21954 + }, + { + "epoch": 1.1955374949800222, + "grad_norm": 0.541929125728084, + "learning_rate": 7.351242688726693e-05, + "loss": 11.989, + "step": 21955 + }, + { + "epoch": 1.1955919489766051, + "grad_norm": 0.58048912274076, + "learning_rate": 7.350392369774521e-05, + "loss": 12.0564, + "step": 21956 + }, + { + "epoch": 1.1956464029731881, + "grad_norm": 0.5415460020106003, + "learning_rate": 7.34954207142614e-05, + "loss": 11.9536, + "step": 21957 + }, + { + "epoch": 1.1957008569697711, + "grad_norm": 0.5395732840520236, + "learning_rate": 7.348691793688162e-05, + "loss": 12.0654, + "step": 21958 + }, + { + "epoch": 1.1957553109663541, + "grad_norm": 0.5234919368691676, + "learning_rate": 7.347841536567205e-05, + "loss": 11.7763, + "step": 21959 + }, + { + "epoch": 1.1958097649629373, + "grad_norm": 0.5268100722474017, + "learning_rate": 7.346991300069876e-05, + "loss": 11.9582, + "step": 21960 + }, + { + "epoch": 1.1958642189595203, + "grad_norm": 0.6006650936852125, + "learning_rate": 7.346141084202787e-05, + "loss": 12.0406, + "step": 21961 + }, + { + "epoch": 1.1959186729561033, + "grad_norm": 0.5727018594013691, + "learning_rate": 7.345290888972554e-05, + "loss": 12.0073, + "step": 21962 + }, + { + "epoch": 1.1959731269526863, + "grad_norm": 0.5304001919462159, + "learning_rate": 7.344440714385781e-05, + "loss": 12.0074, + "step": 21963 + }, + { + "epoch": 1.1960275809492693, + "grad_norm": 0.5307128137436281, + "learning_rate": 7.34359056044908e-05, + "loss": 12.0686, + "step": 21964 + }, + { + "epoch": 1.1960820349458523, + "grad_norm": 0.6126776099286598, + "learning_rate": 7.342740427169068e-05, + "loss": 12.0333, + "step": 21965 + }, + { + "epoch": 1.1961364889424353, + "grad_norm": 0.5468384048153477, + "learning_rate": 7.34189031455235e-05, + "loss": 11.9079, + "step": 21966 + }, + { + "epoch": 1.1961909429390183, + "grad_norm": 0.5427062902505094, + "learning_rate": 7.341040222605539e-05, + "loss": 11.9996, + "step": 21967 + }, + { + "epoch": 1.1962453969356013, + "grad_norm": 0.5923454382568167, + "learning_rate": 7.340190151335245e-05, + "loss": 12.1062, + "step": 21968 + }, + { + "epoch": 1.1962998509321843, + "grad_norm": 0.5215597390807284, + "learning_rate": 7.339340100748078e-05, + "loss": 12.0299, + "step": 21969 + }, + { + "epoch": 1.1963543049287675, + "grad_norm": 0.5252418576237954, + "learning_rate": 7.338490070850649e-05, + "loss": 11.9611, + "step": 21970 + }, + { + "epoch": 1.1964087589253505, + "grad_norm": 0.5642377871979222, + "learning_rate": 7.337640061649566e-05, + "loss": 12.0417, + "step": 21971 + }, + { + "epoch": 1.1964632129219335, + "grad_norm": 0.5532303926034008, + "learning_rate": 7.336790073151447e-05, + "loss": 11.9488, + "step": 21972 + }, + { + "epoch": 1.1965176669185165, + "grad_norm": 0.5866435522997976, + "learning_rate": 7.335940105362888e-05, + "loss": 12.0359, + "step": 21973 + }, + { + "epoch": 1.1965721209150995, + "grad_norm": 0.5965417508798888, + "learning_rate": 7.33509015829051e-05, + "loss": 12.0077, + "step": 21974 + }, + { + "epoch": 1.1966265749116825, + "grad_norm": 0.5517370879178353, + "learning_rate": 7.334240231940914e-05, + "loss": 11.9778, + "step": 21975 + }, + { + "epoch": 1.1966810289082654, + "grad_norm": 0.5736355459938184, + "learning_rate": 7.333390326320715e-05, + "loss": 11.9716, + "step": 21976 + }, + { + "epoch": 1.1967354829048484, + "grad_norm": 0.6115971430366769, + "learning_rate": 7.332540441436519e-05, + "loss": 12.0461, + "step": 21977 + }, + { + "epoch": 1.1967899369014314, + "grad_norm": 0.5074682437089225, + "learning_rate": 7.331690577294936e-05, + "loss": 11.7933, + "step": 21978 + }, + { + "epoch": 1.1968443908980144, + "grad_norm": 0.5400402430781936, + "learning_rate": 7.330840733902575e-05, + "loss": 11.9894, + "step": 21979 + }, + { + "epoch": 1.1968988448945974, + "grad_norm": 0.5863845049108035, + "learning_rate": 7.329990911266043e-05, + "loss": 11.9397, + "step": 21980 + }, + { + "epoch": 1.1969532988911804, + "grad_norm": 0.5672006204768686, + "learning_rate": 7.32914110939195e-05, + "loss": 12.021, + "step": 21981 + }, + { + "epoch": 1.1970077528877634, + "grad_norm": 0.5649091205380448, + "learning_rate": 7.32829132828691e-05, + "loss": 11.9162, + "step": 21982 + }, + { + "epoch": 1.1970622068843466, + "grad_norm": 0.5193042287535758, + "learning_rate": 7.327441567957518e-05, + "loss": 11.8243, + "step": 21983 + }, + { + "epoch": 1.1971166608809296, + "grad_norm": 0.6211147748830858, + "learning_rate": 7.326591828410388e-05, + "loss": 12.1122, + "step": 21984 + }, + { + "epoch": 1.1971711148775126, + "grad_norm": 0.5599456933873733, + "learning_rate": 7.32574210965213e-05, + "loss": 12.0533, + "step": 21985 + }, + { + "epoch": 1.1972255688740956, + "grad_norm": 0.584362125676465, + "learning_rate": 7.324892411689348e-05, + "loss": 11.9424, + "step": 21986 + }, + { + "epoch": 1.1972800228706786, + "grad_norm": 0.6199258810366467, + "learning_rate": 7.324042734528653e-05, + "loss": 11.9043, + "step": 21987 + }, + { + "epoch": 1.1973344768672616, + "grad_norm": 0.566877086814113, + "learning_rate": 7.32319307817665e-05, + "loss": 11.9762, + "step": 21988 + }, + { + "epoch": 1.1973889308638446, + "grad_norm": 0.5703254437265531, + "learning_rate": 7.322343442639948e-05, + "loss": 12.0712, + "step": 21989 + }, + { + "epoch": 1.1974433848604276, + "grad_norm": 0.6513271552859072, + "learning_rate": 7.32149382792515e-05, + "loss": 11.9653, + "step": 21990 + }, + { + "epoch": 1.1974978388570106, + "grad_norm": 0.5497910689981529, + "learning_rate": 7.320644234038865e-05, + "loss": 12.0185, + "step": 21991 + }, + { + "epoch": 1.1975522928535935, + "grad_norm": 0.543022473176367, + "learning_rate": 7.319794660987704e-05, + "loss": 12.0448, + "step": 21992 + }, + { + "epoch": 1.1976067468501765, + "grad_norm": 0.5516768398291159, + "learning_rate": 7.318945108778267e-05, + "loss": 12.062, + "step": 21993 + }, + { + "epoch": 1.1976612008467598, + "grad_norm": 0.5261862423612065, + "learning_rate": 7.318095577417161e-05, + "loss": 11.9819, + "step": 21994 + }, + { + "epoch": 1.1977156548433427, + "grad_norm": 0.5705506301206956, + "learning_rate": 7.317246066910992e-05, + "loss": 11.9227, + "step": 21995 + }, + { + "epoch": 1.1977701088399257, + "grad_norm": 0.5288704172122437, + "learning_rate": 7.31639657726637e-05, + "loss": 12.0418, + "step": 21996 + }, + { + "epoch": 1.1978245628365087, + "grad_norm": 0.5464414712091592, + "learning_rate": 7.315547108489897e-05, + "loss": 11.859, + "step": 21997 + }, + { + "epoch": 1.1978790168330917, + "grad_norm": 0.5154675847793856, + "learning_rate": 7.314697660588181e-05, + "loss": 11.9805, + "step": 21998 + }, + { + "epoch": 1.1979334708296747, + "grad_norm": 0.54948897474018, + "learning_rate": 7.313848233567826e-05, + "loss": 11.9353, + "step": 21999 + }, + { + "epoch": 1.1979879248262577, + "grad_norm": 0.5794043846753852, + "learning_rate": 7.312998827435438e-05, + "loss": 11.8999, + "step": 22000 + }, + { + "epoch": 1.1980423788228407, + "grad_norm": 0.5547427345883039, + "learning_rate": 7.312149442197623e-05, + "loss": 11.9735, + "step": 22001 + }, + { + "epoch": 1.1980968328194237, + "grad_norm": 0.5325978083539448, + "learning_rate": 7.311300077860986e-05, + "loss": 12.0776, + "step": 22002 + }, + { + "epoch": 1.1981512868160067, + "grad_norm": 0.6224752343369995, + "learning_rate": 7.310450734432125e-05, + "loss": 12.2011, + "step": 22003 + }, + { + "epoch": 1.1982057408125897, + "grad_norm": 0.5198020646625031, + "learning_rate": 7.309601411917655e-05, + "loss": 11.8813, + "step": 22004 + }, + { + "epoch": 1.1982601948091727, + "grad_norm": 0.5493944469541123, + "learning_rate": 7.308752110324173e-05, + "loss": 11.9463, + "step": 22005 + }, + { + "epoch": 1.1983146488057557, + "grad_norm": 0.6173523081537328, + "learning_rate": 7.307902829658286e-05, + "loss": 12.0457, + "step": 22006 + }, + { + "epoch": 1.1983691028023389, + "grad_norm": 0.567660112391515, + "learning_rate": 7.307053569926597e-05, + "loss": 12.0803, + "step": 22007 + }, + { + "epoch": 1.1984235567989219, + "grad_norm": 0.5880222453600811, + "learning_rate": 7.30620433113571e-05, + "loss": 12.0014, + "step": 22008 + }, + { + "epoch": 1.1984780107955049, + "grad_norm": 0.5611166320327582, + "learning_rate": 7.305355113292233e-05, + "loss": 11.9751, + "step": 22009 + }, + { + "epoch": 1.1985324647920879, + "grad_norm": 0.5648240770131326, + "learning_rate": 7.304505916402766e-05, + "loss": 12.0712, + "step": 22010 + }, + { + "epoch": 1.1985869187886709, + "grad_norm": 0.5093535437447665, + "learning_rate": 7.303656740473914e-05, + "loss": 11.9333, + "step": 22011 + }, + { + "epoch": 1.1986413727852538, + "grad_norm": 0.5327392268729846, + "learning_rate": 7.302807585512281e-05, + "loss": 11.957, + "step": 22012 + }, + { + "epoch": 1.1986958267818368, + "grad_norm": 0.559389433152678, + "learning_rate": 7.301958451524464e-05, + "loss": 12.0161, + "step": 22013 + }, + { + "epoch": 1.1987502807784198, + "grad_norm": 0.5470471857605839, + "learning_rate": 7.301109338517074e-05, + "loss": 11.9553, + "step": 22014 + }, + { + "epoch": 1.1988047347750028, + "grad_norm": 0.47837328899612946, + "learning_rate": 7.300260246496708e-05, + "loss": 11.8601, + "step": 22015 + }, + { + "epoch": 1.1988591887715858, + "grad_norm": 0.5899237141298945, + "learning_rate": 7.299411175469972e-05, + "loss": 11.977, + "step": 22016 + }, + { + "epoch": 1.198913642768169, + "grad_norm": 0.5375233053095325, + "learning_rate": 7.298562125443466e-05, + "loss": 11.9505, + "step": 22017 + }, + { + "epoch": 1.198968096764752, + "grad_norm": 0.48675226641449054, + "learning_rate": 7.297713096423794e-05, + "loss": 12.0649, + "step": 22018 + }, + { + "epoch": 1.199022550761335, + "grad_norm": 0.5555814473334991, + "learning_rate": 7.296864088417559e-05, + "loss": 12.0215, + "step": 22019 + }, + { + "epoch": 1.199077004757918, + "grad_norm": 0.5859006106638064, + "learning_rate": 7.296015101431362e-05, + "loss": 12.0142, + "step": 22020 + }, + { + "epoch": 1.199131458754501, + "grad_norm": 0.5139311204066148, + "learning_rate": 7.295166135471807e-05, + "loss": 11.8954, + "step": 22021 + }, + { + "epoch": 1.199185912751084, + "grad_norm": 0.55179853803515, + "learning_rate": 7.294317190545494e-05, + "loss": 12.0594, + "step": 22022 + }, + { + "epoch": 1.199240366747667, + "grad_norm": 0.5353701993641484, + "learning_rate": 7.293468266659023e-05, + "loss": 11.9732, + "step": 22023 + }, + { + "epoch": 1.19929482074425, + "grad_norm": 0.5464661093525545, + "learning_rate": 7.292619363818995e-05, + "loss": 11.9928, + "step": 22024 + }, + { + "epoch": 1.199349274740833, + "grad_norm": 0.6882576058037276, + "learning_rate": 7.291770482032014e-05, + "loss": 12.1068, + "step": 22025 + }, + { + "epoch": 1.199403728737416, + "grad_norm": 0.5362359921119237, + "learning_rate": 7.29092162130468e-05, + "loss": 11.9675, + "step": 22026 + }, + { + "epoch": 1.199458182733999, + "grad_norm": 0.560517441415461, + "learning_rate": 7.290072781643595e-05, + "loss": 11.9185, + "step": 22027 + }, + { + "epoch": 1.199512636730582, + "grad_norm": 0.57787623729744, + "learning_rate": 7.289223963055357e-05, + "loss": 12.0093, + "step": 22028 + }, + { + "epoch": 1.199567090727165, + "grad_norm": 0.5177218457333005, + "learning_rate": 7.288375165546567e-05, + "loss": 11.8983, + "step": 22029 + }, + { + "epoch": 1.1996215447237482, + "grad_norm": 0.637719324734241, + "learning_rate": 7.287526389123827e-05, + "loss": 12.0996, + "step": 22030 + }, + { + "epoch": 1.1996759987203311, + "grad_norm": 0.5255124147758452, + "learning_rate": 7.286677633793737e-05, + "loss": 11.9215, + "step": 22031 + }, + { + "epoch": 1.1997304527169141, + "grad_norm": 0.48575137124148565, + "learning_rate": 7.285828899562902e-05, + "loss": 11.8506, + "step": 22032 + }, + { + "epoch": 1.1997849067134971, + "grad_norm": 0.6130569886618931, + "learning_rate": 7.28498018643791e-05, + "loss": 11.9346, + "step": 22033 + }, + { + "epoch": 1.1998393607100801, + "grad_norm": 0.5802945288033231, + "learning_rate": 7.284131494425369e-05, + "loss": 12.0002, + "step": 22034 + }, + { + "epoch": 1.1998938147066631, + "grad_norm": 0.5819325035949174, + "learning_rate": 7.283282823531877e-05, + "loss": 12.0333, + "step": 22035 + }, + { + "epoch": 1.1999482687032461, + "grad_norm": 0.5236939960500671, + "learning_rate": 7.282434173764035e-05, + "loss": 11.9424, + "step": 22036 + }, + { + "epoch": 1.200002722699829, + "grad_norm": 0.5966003358843548, + "learning_rate": 7.281585545128438e-05, + "loss": 11.9131, + "step": 22037 + }, + { + "epoch": 1.200057176696412, + "grad_norm": 0.5546101784497532, + "learning_rate": 7.28073693763169e-05, + "loss": 12.0133, + "step": 22038 + }, + { + "epoch": 1.200111630692995, + "grad_norm": 0.5156972528773146, + "learning_rate": 7.279888351280386e-05, + "loss": 12.0682, + "step": 22039 + }, + { + "epoch": 1.2001660846895783, + "grad_norm": 0.5116502255674945, + "learning_rate": 7.279039786081124e-05, + "loss": 12.0056, + "step": 22040 + }, + { + "epoch": 1.2002205386861613, + "grad_norm": 0.55062344944069, + "learning_rate": 7.278191242040508e-05, + "loss": 12.006, + "step": 22041 + }, + { + "epoch": 1.2002749926827443, + "grad_norm": 0.5618602417553507, + "learning_rate": 7.277342719165137e-05, + "loss": 12.0006, + "step": 22042 + }, + { + "epoch": 1.2003294466793273, + "grad_norm": 0.5445278229766826, + "learning_rate": 7.276494217461602e-05, + "loss": 12.0291, + "step": 22043 + }, + { + "epoch": 1.2003839006759103, + "grad_norm": 0.600400346533125, + "learning_rate": 7.275645736936503e-05, + "loss": 12.0862, + "step": 22044 + }, + { + "epoch": 1.2004383546724933, + "grad_norm": 0.568299995987044, + "learning_rate": 7.27479727759644e-05, + "loss": 12.0462, + "step": 22045 + }, + { + "epoch": 1.2004928086690763, + "grad_norm": 0.5273174219750538, + "learning_rate": 7.273948839448011e-05, + "loss": 12.0002, + "step": 22046 + }, + { + "epoch": 1.2005472626656593, + "grad_norm": 0.568077861293959, + "learning_rate": 7.273100422497813e-05, + "loss": 11.8807, + "step": 22047 + }, + { + "epoch": 1.2006017166622422, + "grad_norm": 0.5055822204763117, + "learning_rate": 7.272252026752444e-05, + "loss": 11.9585, + "step": 22048 + }, + { + "epoch": 1.2006561706588252, + "grad_norm": 0.5276861482917721, + "learning_rate": 7.271403652218501e-05, + "loss": 11.9588, + "step": 22049 + }, + { + "epoch": 1.2007106246554082, + "grad_norm": 0.4972628498060447, + "learning_rate": 7.27055529890258e-05, + "loss": 11.8181, + "step": 22050 + }, + { + "epoch": 1.2007650786519912, + "grad_norm": 0.6470355406641413, + "learning_rate": 7.269706966811278e-05, + "loss": 11.9526, + "step": 22051 + }, + { + "epoch": 1.2008195326485742, + "grad_norm": 0.556983911002417, + "learning_rate": 7.268858655951196e-05, + "loss": 11.9277, + "step": 22052 + }, + { + "epoch": 1.2008739866451574, + "grad_norm": 0.5454720828432393, + "learning_rate": 7.268010366328926e-05, + "loss": 11.9055, + "step": 22053 + }, + { + "epoch": 1.2009284406417404, + "grad_norm": 0.5822493836480481, + "learning_rate": 7.267162097951063e-05, + "loss": 12.0546, + "step": 22054 + }, + { + "epoch": 1.2009828946383234, + "grad_norm": 0.5324998231694357, + "learning_rate": 7.266313850824209e-05, + "loss": 11.9716, + "step": 22055 + }, + { + "epoch": 1.2010373486349064, + "grad_norm": 0.5819054785121974, + "learning_rate": 7.26546562495495e-05, + "loss": 12.0384, + "step": 22056 + }, + { + "epoch": 1.2010918026314894, + "grad_norm": 0.5133980500777363, + "learning_rate": 7.264617420349895e-05, + "loss": 11.9353, + "step": 22057 + }, + { + "epoch": 1.2011462566280724, + "grad_norm": 0.5658558153461438, + "learning_rate": 7.263769237015631e-05, + "loss": 11.9319, + "step": 22058 + }, + { + "epoch": 1.2012007106246554, + "grad_norm": 0.47454789496767735, + "learning_rate": 7.26292107495876e-05, + "loss": 11.975, + "step": 22059 + }, + { + "epoch": 1.2012551646212384, + "grad_norm": 0.5440066283558274, + "learning_rate": 7.262072934185871e-05, + "loss": 11.9386, + "step": 22060 + }, + { + "epoch": 1.2013096186178214, + "grad_norm": 0.5391034435579533, + "learning_rate": 7.261224814703562e-05, + "loss": 12.0825, + "step": 22061 + }, + { + "epoch": 1.2013640726144044, + "grad_norm": 0.584625249614327, + "learning_rate": 7.260376716518431e-05, + "loss": 11.8243, + "step": 22062 + }, + { + "epoch": 1.2014185266109874, + "grad_norm": 0.5094079871757436, + "learning_rate": 7.259528639637068e-05, + "loss": 12.0372, + "step": 22063 + }, + { + "epoch": 1.2014729806075706, + "grad_norm": 0.7009737554586154, + "learning_rate": 7.258680584066069e-05, + "loss": 12.017, + "step": 22064 + }, + { + "epoch": 1.2015274346041536, + "grad_norm": 0.5222235715445752, + "learning_rate": 7.25783254981203e-05, + "loss": 12.0611, + "step": 22065 + }, + { + "epoch": 1.2015818886007366, + "grad_norm": 0.6015498109875128, + "learning_rate": 7.256984536881545e-05, + "loss": 11.9995, + "step": 22066 + }, + { + "epoch": 1.2016363425973196, + "grad_norm": 0.5289823743578922, + "learning_rate": 7.256136545281207e-05, + "loss": 11.8069, + "step": 22067 + }, + { + "epoch": 1.2016907965939025, + "grad_norm": 0.5015003605636306, + "learning_rate": 7.255288575017612e-05, + "loss": 11.9141, + "step": 22068 + }, + { + "epoch": 1.2017452505904855, + "grad_norm": 0.5378912269149257, + "learning_rate": 7.254440626097354e-05, + "loss": 11.9026, + "step": 22069 + }, + { + "epoch": 1.2017997045870685, + "grad_norm": 0.6546457360339119, + "learning_rate": 7.253592698527025e-05, + "loss": 11.9585, + "step": 22070 + }, + { + "epoch": 1.2018541585836515, + "grad_norm": 0.5534231465095167, + "learning_rate": 7.252744792313223e-05, + "loss": 11.9595, + "step": 22071 + }, + { + "epoch": 1.2019086125802345, + "grad_norm": 0.5249925229316503, + "learning_rate": 7.251896907462537e-05, + "loss": 11.9611, + "step": 22072 + }, + { + "epoch": 1.2019630665768175, + "grad_norm": 0.5746386431126431, + "learning_rate": 7.25104904398156e-05, + "loss": 12.0867, + "step": 22073 + }, + { + "epoch": 1.2020175205734005, + "grad_norm": 0.5710153979610945, + "learning_rate": 7.250201201876888e-05, + "loss": 11.9604, + "step": 22074 + }, + { + "epoch": 1.2020719745699835, + "grad_norm": 0.5717775825974136, + "learning_rate": 7.249353381155111e-05, + "loss": 12.0821, + "step": 22075 + }, + { + "epoch": 1.2021264285665665, + "grad_norm": 0.5998918160103405, + "learning_rate": 7.248505581822825e-05, + "loss": 12.0195, + "step": 22076 + }, + { + "epoch": 1.2021808825631497, + "grad_norm": 0.5254032594783522, + "learning_rate": 7.247657803886619e-05, + "loss": 11.8963, + "step": 22077 + }, + { + "epoch": 1.2022353365597327, + "grad_norm": 0.5537859402263627, + "learning_rate": 7.246810047353087e-05, + "loss": 11.9807, + "step": 22078 + }, + { + "epoch": 1.2022897905563157, + "grad_norm": 0.5903931312597357, + "learning_rate": 7.245962312228823e-05, + "loss": 11.9017, + "step": 22079 + }, + { + "epoch": 1.2023442445528987, + "grad_norm": 0.5417485964314547, + "learning_rate": 7.245114598520419e-05, + "loss": 11.9962, + "step": 22080 + }, + { + "epoch": 1.2023986985494817, + "grad_norm": 0.5291549933040715, + "learning_rate": 7.244266906234465e-05, + "loss": 11.8927, + "step": 22081 + }, + { + "epoch": 1.2024531525460647, + "grad_norm": 0.609881952281548, + "learning_rate": 7.243419235377556e-05, + "loss": 11.9386, + "step": 22082 + }, + { + "epoch": 1.2025076065426477, + "grad_norm": 0.5623469278131573, + "learning_rate": 7.242571585956279e-05, + "loss": 11.9458, + "step": 22083 + }, + { + "epoch": 1.2025620605392306, + "grad_norm": 0.547022649623105, + "learning_rate": 7.241723957977229e-05, + "loss": 11.9944, + "step": 22084 + }, + { + "epoch": 1.2026165145358136, + "grad_norm": 0.5352473947782485, + "learning_rate": 7.240876351446995e-05, + "loss": 11.8539, + "step": 22085 + }, + { + "epoch": 1.2026709685323966, + "grad_norm": 0.6625965168116763, + "learning_rate": 7.240028766372168e-05, + "loss": 12.0785, + "step": 22086 + }, + { + "epoch": 1.2027254225289798, + "grad_norm": 0.718071770523711, + "learning_rate": 7.239181202759342e-05, + "loss": 12.1267, + "step": 22087 + }, + { + "epoch": 1.2027798765255628, + "grad_norm": 0.5455341048641585, + "learning_rate": 7.238333660615105e-05, + "loss": 11.9934, + "step": 22088 + }, + { + "epoch": 1.2028343305221458, + "grad_norm": 0.5202972777844755, + "learning_rate": 7.237486139946046e-05, + "loss": 11.9052, + "step": 22089 + }, + { + "epoch": 1.2028887845187288, + "grad_norm": 0.595001783655921, + "learning_rate": 7.236638640758761e-05, + "loss": 12.0565, + "step": 22090 + }, + { + "epoch": 1.2029432385153118, + "grad_norm": 0.5074587400834651, + "learning_rate": 7.235791163059839e-05, + "loss": 12.0901, + "step": 22091 + }, + { + "epoch": 1.2029976925118948, + "grad_norm": 0.7264258267848539, + "learning_rate": 7.23494370685587e-05, + "loss": 11.8703, + "step": 22092 + }, + { + "epoch": 1.2030521465084778, + "grad_norm": 0.5209063478833043, + "learning_rate": 7.234096272153438e-05, + "loss": 11.948, + "step": 22093 + }, + { + "epoch": 1.2031066005050608, + "grad_norm": 0.5365000809312424, + "learning_rate": 7.233248858959139e-05, + "loss": 11.9564, + "step": 22094 + }, + { + "epoch": 1.2031610545016438, + "grad_norm": 0.5794642254238294, + "learning_rate": 7.232401467279559e-05, + "loss": 11.9069, + "step": 22095 + }, + { + "epoch": 1.2032155084982268, + "grad_norm": 0.5413611195574631, + "learning_rate": 7.231554097121291e-05, + "loss": 12.0539, + "step": 22096 + }, + { + "epoch": 1.2032699624948098, + "grad_norm": 0.5224053850158283, + "learning_rate": 7.230706748490923e-05, + "loss": 11.934, + "step": 22097 + }, + { + "epoch": 1.2033244164913928, + "grad_norm": 0.509226677967835, + "learning_rate": 7.229859421395042e-05, + "loss": 11.9446, + "step": 22098 + }, + { + "epoch": 1.2033788704879758, + "grad_norm": 0.5373899283698169, + "learning_rate": 7.22901211584024e-05, + "loss": 12.1334, + "step": 22099 + }, + { + "epoch": 1.203433324484559, + "grad_norm": 0.5349429933355131, + "learning_rate": 7.228164831833102e-05, + "loss": 11.9687, + "step": 22100 + }, + { + "epoch": 1.203487778481142, + "grad_norm": 0.5473400868933365, + "learning_rate": 7.22731756938022e-05, + "loss": 12.0422, + "step": 22101 + }, + { + "epoch": 1.203542232477725, + "grad_norm": 0.5280914671988042, + "learning_rate": 7.226470328488184e-05, + "loss": 11.9748, + "step": 22102 + }, + { + "epoch": 1.203596686474308, + "grad_norm": 0.5490625735743139, + "learning_rate": 7.225623109163584e-05, + "loss": 11.9687, + "step": 22103 + }, + { + "epoch": 1.203651140470891, + "grad_norm": 0.5523105319659944, + "learning_rate": 7.224775911412996e-05, + "loss": 11.7874, + "step": 22104 + }, + { + "epoch": 1.203705594467474, + "grad_norm": 0.5269022681343605, + "learning_rate": 7.223928735243019e-05, + "loss": 11.9339, + "step": 22105 + }, + { + "epoch": 1.203760048464057, + "grad_norm": 0.5386341462858246, + "learning_rate": 7.223081580660236e-05, + "loss": 12.1385, + "step": 22106 + }, + { + "epoch": 1.20381450246064, + "grad_norm": 0.5050447802853469, + "learning_rate": 7.222234447671239e-05, + "loss": 11.9921, + "step": 22107 + }, + { + "epoch": 1.203868956457223, + "grad_norm": 0.5736470925018637, + "learning_rate": 7.22138733628261e-05, + "loss": 12.0373, + "step": 22108 + }, + { + "epoch": 1.203923410453806, + "grad_norm": 0.5264139579520998, + "learning_rate": 7.22054024650094e-05, + "loss": 11.9781, + "step": 22109 + }, + { + "epoch": 1.2039778644503891, + "grad_norm": 0.5299022741525633, + "learning_rate": 7.219693178332816e-05, + "loss": 11.8533, + "step": 22110 + }, + { + "epoch": 1.2040323184469721, + "grad_norm": 0.5799698616022124, + "learning_rate": 7.218846131784824e-05, + "loss": 12.0601, + "step": 22111 + }, + { + "epoch": 1.204086772443555, + "grad_norm": 0.5806676006289738, + "learning_rate": 7.217999106863549e-05, + "loss": 11.9913, + "step": 22112 + }, + { + "epoch": 1.204141226440138, + "grad_norm": 0.5501912701320303, + "learning_rate": 7.217152103575584e-05, + "loss": 11.9818, + "step": 22113 + }, + { + "epoch": 1.204195680436721, + "grad_norm": 0.5655415207289628, + "learning_rate": 7.216305121927508e-05, + "loss": 11.9584, + "step": 22114 + }, + { + "epoch": 1.204250134433304, + "grad_norm": 0.592532109262495, + "learning_rate": 7.21545816192591e-05, + "loss": 11.9461, + "step": 22115 + }, + { + "epoch": 1.204304588429887, + "grad_norm": 0.48483512667346723, + "learning_rate": 7.214611223577375e-05, + "loss": 12.1258, + "step": 22116 + }, + { + "epoch": 1.20435904242647, + "grad_norm": 0.6021119418504816, + "learning_rate": 7.213764306888492e-05, + "loss": 11.9288, + "step": 22117 + }, + { + "epoch": 1.204413496423053, + "grad_norm": 0.5611011265865317, + "learning_rate": 7.212917411865844e-05, + "loss": 12.0514, + "step": 22118 + }, + { + "epoch": 1.204467950419636, + "grad_norm": 0.5305076937015334, + "learning_rate": 7.212070538516017e-05, + "loss": 11.9955, + "step": 22119 + }, + { + "epoch": 1.204522404416219, + "grad_norm": 0.5534774058940248, + "learning_rate": 7.2112236868456e-05, + "loss": 12.0304, + "step": 22120 + }, + { + "epoch": 1.204576858412802, + "grad_norm": 0.5845141801582963, + "learning_rate": 7.210376856861175e-05, + "loss": 12.0212, + "step": 22121 + }, + { + "epoch": 1.204631312409385, + "grad_norm": 0.5383471489402331, + "learning_rate": 7.209530048569325e-05, + "loss": 11.99, + "step": 22122 + }, + { + "epoch": 1.2046857664059683, + "grad_norm": 0.5803132674896115, + "learning_rate": 7.208683261976641e-05, + "loss": 12.0276, + "step": 22123 + }, + { + "epoch": 1.2047402204025512, + "grad_norm": 0.5633793886050223, + "learning_rate": 7.207836497089701e-05, + "loss": 12.0332, + "step": 22124 + }, + { + "epoch": 1.2047946743991342, + "grad_norm": 0.5597780825700696, + "learning_rate": 7.206989753915092e-05, + "loss": 12.1111, + "step": 22125 + }, + { + "epoch": 1.2048491283957172, + "grad_norm": 0.5462479356952539, + "learning_rate": 7.2061430324594e-05, + "loss": 12.0149, + "step": 22126 + }, + { + "epoch": 1.2049035823923002, + "grad_norm": 0.5657041967252572, + "learning_rate": 7.205296332729206e-05, + "loss": 11.9446, + "step": 22127 + }, + { + "epoch": 1.2049580363888832, + "grad_norm": 0.560924352421728, + "learning_rate": 7.2044496547311e-05, + "loss": 12.0223, + "step": 22128 + }, + { + "epoch": 1.2050124903854662, + "grad_norm": 0.5336889827650598, + "learning_rate": 7.203602998471661e-05, + "loss": 11.9153, + "step": 22129 + }, + { + "epoch": 1.2050669443820492, + "grad_norm": 0.5347407201619592, + "learning_rate": 7.202756363957473e-05, + "loss": 11.9628, + "step": 22130 + }, + { + "epoch": 1.2051213983786322, + "grad_norm": 0.5443814156617037, + "learning_rate": 7.201909751195122e-05, + "loss": 11.9808, + "step": 22131 + }, + { + "epoch": 1.2051758523752152, + "grad_norm": 0.5346787885470732, + "learning_rate": 7.201063160191191e-05, + "loss": 11.9713, + "step": 22132 + }, + { + "epoch": 1.2052303063717984, + "grad_norm": 0.5654284677731318, + "learning_rate": 7.200216590952262e-05, + "loss": 11.8919, + "step": 22133 + }, + { + "epoch": 1.2052847603683814, + "grad_norm": 0.5169820110637174, + "learning_rate": 7.199370043484917e-05, + "loss": 11.9123, + "step": 22134 + }, + { + "epoch": 1.2053392143649644, + "grad_norm": 0.5622142007958477, + "learning_rate": 7.198523517795741e-05, + "loss": 11.9989, + "step": 22135 + }, + { + "epoch": 1.2053936683615474, + "grad_norm": 0.5471605534669074, + "learning_rate": 7.197677013891315e-05, + "loss": 11.9656, + "step": 22136 + }, + { + "epoch": 1.2054481223581304, + "grad_norm": 0.555817997447976, + "learning_rate": 7.196830531778222e-05, + "loss": 12.0999, + "step": 22137 + }, + { + "epoch": 1.2055025763547134, + "grad_norm": 0.5182694496488273, + "learning_rate": 7.195984071463045e-05, + "loss": 11.8154, + "step": 22138 + }, + { + "epoch": 1.2055570303512964, + "grad_norm": 0.6090367274971842, + "learning_rate": 7.195137632952367e-05, + "loss": 12.0171, + "step": 22139 + }, + { + "epoch": 1.2056114843478793, + "grad_norm": 0.529902810752365, + "learning_rate": 7.194291216252769e-05, + "loss": 11.9217, + "step": 22140 + }, + { + "epoch": 1.2056659383444623, + "grad_norm": 0.6266915580456607, + "learning_rate": 7.193444821370833e-05, + "loss": 12.1702, + "step": 22141 + }, + { + "epoch": 1.2057203923410453, + "grad_norm": 0.5324727183403394, + "learning_rate": 7.192598448313141e-05, + "loss": 12.0653, + "step": 22142 + }, + { + "epoch": 1.2057748463376283, + "grad_norm": 0.5717854460507498, + "learning_rate": 7.191752097086275e-05, + "loss": 11.9419, + "step": 22143 + }, + { + "epoch": 1.2058293003342113, + "grad_norm": 0.5500219032678746, + "learning_rate": 7.190905767696816e-05, + "loss": 11.9897, + "step": 22144 + }, + { + "epoch": 1.2058837543307943, + "grad_norm": 0.6058155988404833, + "learning_rate": 7.190059460151342e-05, + "loss": 11.987, + "step": 22145 + }, + { + "epoch": 1.2059382083273773, + "grad_norm": 0.5239431508726817, + "learning_rate": 7.189213174456439e-05, + "loss": 12.0315, + "step": 22146 + }, + { + "epoch": 1.2059926623239605, + "grad_norm": 0.5485371883024152, + "learning_rate": 7.188366910618684e-05, + "loss": 11.9058, + "step": 22147 + }, + { + "epoch": 1.2060471163205435, + "grad_norm": 0.5739704399628821, + "learning_rate": 7.187520668644662e-05, + "loss": 11.917, + "step": 22148 + }, + { + "epoch": 1.2061015703171265, + "grad_norm": 0.5965933753043204, + "learning_rate": 7.186674448540947e-05, + "loss": 12.0307, + "step": 22149 + }, + { + "epoch": 1.2061560243137095, + "grad_norm": 0.5515978531681733, + "learning_rate": 7.185828250314126e-05, + "loss": 12.0179, + "step": 22150 + }, + { + "epoch": 1.2062104783102925, + "grad_norm": 0.5232068825150631, + "learning_rate": 7.184982073970776e-05, + "loss": 12.0243, + "step": 22151 + }, + { + "epoch": 1.2062649323068755, + "grad_norm": 0.558244250566568, + "learning_rate": 7.184135919517479e-05, + "loss": 11.8912, + "step": 22152 + }, + { + "epoch": 1.2063193863034585, + "grad_norm": 0.5243481543708333, + "learning_rate": 7.183289786960813e-05, + "loss": 12.0872, + "step": 22153 + }, + { + "epoch": 1.2063738403000415, + "grad_norm": 0.5432256391753869, + "learning_rate": 7.182443676307357e-05, + "loss": 12.0196, + "step": 22154 + }, + { + "epoch": 1.2064282942966245, + "grad_norm": 0.5113051679382367, + "learning_rate": 7.181597587563691e-05, + "loss": 11.969, + "step": 22155 + }, + { + "epoch": 1.2064827482932075, + "grad_norm": 0.5187867259613202, + "learning_rate": 7.180751520736395e-05, + "loss": 11.9699, + "step": 22156 + }, + { + "epoch": 1.2065372022897907, + "grad_norm": 0.536965619191231, + "learning_rate": 7.17990547583205e-05, + "loss": 11.9784, + "step": 22157 + }, + { + "epoch": 1.2065916562863737, + "grad_norm": 0.6097347835717006, + "learning_rate": 7.17905945285723e-05, + "loss": 12.1263, + "step": 22158 + }, + { + "epoch": 1.2066461102829567, + "grad_norm": 0.5507974083113052, + "learning_rate": 7.178213451818519e-05, + "loss": 11.9288, + "step": 22159 + }, + { + "epoch": 1.2067005642795396, + "grad_norm": 0.5705949664999904, + "learning_rate": 7.177367472722492e-05, + "loss": 11.9414, + "step": 22160 + }, + { + "epoch": 1.2067550182761226, + "grad_norm": 0.8062485571839195, + "learning_rate": 7.176521515575725e-05, + "loss": 11.9337, + "step": 22161 + }, + { + "epoch": 1.2068094722727056, + "grad_norm": 0.5864169340682323, + "learning_rate": 7.175675580384806e-05, + "loss": 11.9364, + "step": 22162 + }, + { + "epoch": 1.2068639262692886, + "grad_norm": 0.5337505307724522, + "learning_rate": 7.17482966715631e-05, + "loss": 11.997, + "step": 22163 + }, + { + "epoch": 1.2069183802658716, + "grad_norm": 0.5436177172422525, + "learning_rate": 7.173983775896807e-05, + "loss": 12.0018, + "step": 22164 + }, + { + "epoch": 1.2069728342624546, + "grad_norm": 0.5648042686347058, + "learning_rate": 7.17313790661288e-05, + "loss": 12.0179, + "step": 22165 + }, + { + "epoch": 1.2070272882590376, + "grad_norm": 0.5700943268038059, + "learning_rate": 7.172292059311108e-05, + "loss": 12.0263, + "step": 22166 + }, + { + "epoch": 1.2070817422556206, + "grad_norm": 0.5838511277616565, + "learning_rate": 7.171446233998067e-05, + "loss": 11.9884, + "step": 22167 + }, + { + "epoch": 1.2071361962522036, + "grad_norm": 0.5887753007205588, + "learning_rate": 7.170600430680335e-05, + "loss": 12.0172, + "step": 22168 + }, + { + "epoch": 1.2071906502487866, + "grad_norm": 0.5278608586343595, + "learning_rate": 7.169754649364487e-05, + "loss": 11.9377, + "step": 22169 + }, + { + "epoch": 1.2072451042453698, + "grad_norm": 0.5147691660591388, + "learning_rate": 7.168908890057102e-05, + "loss": 11.9707, + "step": 22170 + }, + { + "epoch": 1.2072995582419528, + "grad_norm": 0.5332753264935212, + "learning_rate": 7.168063152764756e-05, + "loss": 11.8652, + "step": 22171 + }, + { + "epoch": 1.2073540122385358, + "grad_norm": 0.5971722709092365, + "learning_rate": 7.167217437494024e-05, + "loss": 11.9451, + "step": 22172 + }, + { + "epoch": 1.2074084662351188, + "grad_norm": 0.5479329384980722, + "learning_rate": 7.166371744251492e-05, + "loss": 11.9568, + "step": 22173 + }, + { + "epoch": 1.2074629202317018, + "grad_norm": 0.5476576886552974, + "learning_rate": 7.165526073043723e-05, + "loss": 11.9433, + "step": 22174 + }, + { + "epoch": 1.2075173742282848, + "grad_norm": 0.5637277927302408, + "learning_rate": 7.164680423877299e-05, + "loss": 12.0464, + "step": 22175 + }, + { + "epoch": 1.2075718282248677, + "grad_norm": 0.6468205685455182, + "learning_rate": 7.163834796758794e-05, + "loss": 12.0958, + "step": 22176 + }, + { + "epoch": 1.2076262822214507, + "grad_norm": 0.5717610794146486, + "learning_rate": 7.162989191694787e-05, + "loss": 12.0327, + "step": 22177 + }, + { + "epoch": 1.2076807362180337, + "grad_norm": 0.5335832001968199, + "learning_rate": 7.162143608691851e-05, + "loss": 12.0202, + "step": 22178 + }, + { + "epoch": 1.2077351902146167, + "grad_norm": 0.6300336057485512, + "learning_rate": 7.161298047756561e-05, + "loss": 11.9623, + "step": 22179 + }, + { + "epoch": 1.2077896442112, + "grad_norm": 0.7084449429386043, + "learning_rate": 7.160452508895497e-05, + "loss": 11.8882, + "step": 22180 + }, + { + "epoch": 1.207844098207783, + "grad_norm": 0.6087104624616091, + "learning_rate": 7.15960699211523e-05, + "loss": 12.0099, + "step": 22181 + }, + { + "epoch": 1.207898552204366, + "grad_norm": 0.550174407470673, + "learning_rate": 7.158761497422331e-05, + "loss": 11.918, + "step": 22182 + }, + { + "epoch": 1.207953006200949, + "grad_norm": 0.5466120104557948, + "learning_rate": 7.157916024823386e-05, + "loss": 11.9994, + "step": 22183 + }, + { + "epoch": 1.208007460197532, + "grad_norm": 0.541649744902221, + "learning_rate": 7.157070574324957e-05, + "loss": 12.0657, + "step": 22184 + }, + { + "epoch": 1.208061914194115, + "grad_norm": 0.6087705722689702, + "learning_rate": 7.156225145933625e-05, + "loss": 11.9982, + "step": 22185 + }, + { + "epoch": 1.208116368190698, + "grad_norm": 0.5725121504350018, + "learning_rate": 7.155379739655965e-05, + "loss": 12.0643, + "step": 22186 + }, + { + "epoch": 1.208170822187281, + "grad_norm": 0.6066352387285385, + "learning_rate": 7.154534355498545e-05, + "loss": 12.0684, + "step": 22187 + }, + { + "epoch": 1.2082252761838639, + "grad_norm": 0.57731847318181, + "learning_rate": 7.153688993467946e-05, + "loss": 11.9755, + "step": 22188 + }, + { + "epoch": 1.2082797301804469, + "grad_norm": 0.6293896024435822, + "learning_rate": 7.15284365357074e-05, + "loss": 12.1648, + "step": 22189 + }, + { + "epoch": 1.2083341841770299, + "grad_norm": 0.4920126587746231, + "learning_rate": 7.151998335813497e-05, + "loss": 12.0314, + "step": 22190 + }, + { + "epoch": 1.2083886381736129, + "grad_norm": 0.5472210948432139, + "learning_rate": 7.151153040202794e-05, + "loss": 12.0244, + "step": 22191 + }, + { + "epoch": 1.2084430921701959, + "grad_norm": 0.5024160976264329, + "learning_rate": 7.150307766745202e-05, + "loss": 11.8616, + "step": 22192 + }, + { + "epoch": 1.208497546166779, + "grad_norm": 0.5253886343420642, + "learning_rate": 7.149462515447296e-05, + "loss": 11.9704, + "step": 22193 + }, + { + "epoch": 1.208552000163362, + "grad_norm": 0.607827313687678, + "learning_rate": 7.148617286315646e-05, + "loss": 11.9671, + "step": 22194 + }, + { + "epoch": 1.208606454159945, + "grad_norm": 0.5251209836074804, + "learning_rate": 7.147772079356827e-05, + "loss": 11.8412, + "step": 22195 + }, + { + "epoch": 1.208660908156528, + "grad_norm": 0.67730418799882, + "learning_rate": 7.14692689457741e-05, + "loss": 12.0243, + "step": 22196 + }, + { + "epoch": 1.208715362153111, + "grad_norm": 0.5731436442116857, + "learning_rate": 7.14608173198397e-05, + "loss": 11.9742, + "step": 22197 + }, + { + "epoch": 1.208769816149694, + "grad_norm": 0.645929462247328, + "learning_rate": 7.145236591583072e-05, + "loss": 12.0492, + "step": 22198 + }, + { + "epoch": 1.208824270146277, + "grad_norm": 0.6119640998397153, + "learning_rate": 7.144391473381296e-05, + "loss": 12.057, + "step": 22199 + }, + { + "epoch": 1.20887872414286, + "grad_norm": 0.5738906898356876, + "learning_rate": 7.143546377385211e-05, + "loss": 12.0368, + "step": 22200 + }, + { + "epoch": 1.208933178139443, + "grad_norm": 0.6179311492166093, + "learning_rate": 7.142701303601388e-05, + "loss": 11.8716, + "step": 22201 + }, + { + "epoch": 1.208987632136026, + "grad_norm": 0.5294809640433394, + "learning_rate": 7.1418562520364e-05, + "loss": 11.9695, + "step": 22202 + }, + { + "epoch": 1.2090420861326092, + "grad_norm": 0.5285947080877262, + "learning_rate": 7.141011222696818e-05, + "loss": 11.887, + "step": 22203 + }, + { + "epoch": 1.2090965401291922, + "grad_norm": 0.5063114473792395, + "learning_rate": 7.14016621558921e-05, + "loss": 11.9013, + "step": 22204 + }, + { + "epoch": 1.2091509941257752, + "grad_norm": 0.578170449929612, + "learning_rate": 7.139321230720151e-05, + "loss": 11.9329, + "step": 22205 + }, + { + "epoch": 1.2092054481223582, + "grad_norm": 0.5967577483165094, + "learning_rate": 7.138476268096208e-05, + "loss": 12.0208, + "step": 22206 + }, + { + "epoch": 1.2092599021189412, + "grad_norm": 0.5141582545857023, + "learning_rate": 7.137631327723952e-05, + "loss": 11.9588, + "step": 22207 + }, + { + "epoch": 1.2093143561155242, + "grad_norm": 0.5621523592159323, + "learning_rate": 7.136786409609957e-05, + "loss": 11.9533, + "step": 22208 + }, + { + "epoch": 1.2093688101121072, + "grad_norm": 0.5581547531966375, + "learning_rate": 7.135941513760792e-05, + "loss": 11.8932, + "step": 22209 + }, + { + "epoch": 1.2094232641086902, + "grad_norm": 0.5387537878154146, + "learning_rate": 7.135096640183022e-05, + "loss": 12.1501, + "step": 22210 + }, + { + "epoch": 1.2094777181052732, + "grad_norm": 0.6273124702833844, + "learning_rate": 7.134251788883224e-05, + "loss": 11.9529, + "step": 22211 + }, + { + "epoch": 1.2095321721018562, + "grad_norm": 0.6198131612178198, + "learning_rate": 7.133406959867965e-05, + "loss": 12.0887, + "step": 22212 + }, + { + "epoch": 1.2095866260984391, + "grad_norm": 0.5908403742200561, + "learning_rate": 7.132562153143819e-05, + "loss": 11.9164, + "step": 22213 + }, + { + "epoch": 1.2096410800950221, + "grad_norm": 0.508816991417017, + "learning_rate": 7.131717368717342e-05, + "loss": 11.9498, + "step": 22214 + }, + { + "epoch": 1.2096955340916051, + "grad_norm": 0.5326215958832874, + "learning_rate": 7.130872606595114e-05, + "loss": 11.9998, + "step": 22215 + }, + { + "epoch": 1.2097499880881883, + "grad_norm": 0.5395288580198045, + "learning_rate": 7.130027866783703e-05, + "loss": 12.0351, + "step": 22216 + }, + { + "epoch": 1.2098044420847713, + "grad_norm": 0.5535370494809941, + "learning_rate": 7.129183149289677e-05, + "loss": 12.0239, + "step": 22217 + }, + { + "epoch": 1.2098588960813543, + "grad_norm": 0.5553002147851895, + "learning_rate": 7.128338454119603e-05, + "loss": 12.0336, + "step": 22218 + }, + { + "epoch": 1.2099133500779373, + "grad_norm": 0.5585794211770257, + "learning_rate": 7.127493781280052e-05, + "loss": 11.8861, + "step": 22219 + }, + { + "epoch": 1.2099678040745203, + "grad_norm": 0.5464541689008439, + "learning_rate": 7.12664913077759e-05, + "loss": 11.9994, + "step": 22220 + }, + { + "epoch": 1.2100222580711033, + "grad_norm": 0.5440864382681991, + "learning_rate": 7.125804502618784e-05, + "loss": 12.0383, + "step": 22221 + }, + { + "epoch": 1.2100767120676863, + "grad_norm": 0.6148614300199156, + "learning_rate": 7.124959896810207e-05, + "loss": 12.1397, + "step": 22222 + }, + { + "epoch": 1.2101311660642693, + "grad_norm": 0.5157349583589322, + "learning_rate": 7.124115313358428e-05, + "loss": 12.0791, + "step": 22223 + }, + { + "epoch": 1.2101856200608523, + "grad_norm": 0.5429489920842572, + "learning_rate": 7.123270752270005e-05, + "loss": 12.0118, + "step": 22224 + }, + { + "epoch": 1.2102400740574353, + "grad_norm": 0.5254134460917859, + "learning_rate": 7.122426213551513e-05, + "loss": 11.9898, + "step": 22225 + }, + { + "epoch": 1.2102945280540183, + "grad_norm": 0.5446806538940857, + "learning_rate": 7.121581697209516e-05, + "loss": 11.9785, + "step": 22226 + }, + { + "epoch": 1.2103489820506015, + "grad_norm": 0.5197203705656492, + "learning_rate": 7.120737203250582e-05, + "loss": 11.9456, + "step": 22227 + }, + { + "epoch": 1.2104034360471845, + "grad_norm": 0.5630397593922395, + "learning_rate": 7.11989273168128e-05, + "loss": 11.984, + "step": 22228 + }, + { + "epoch": 1.2104578900437675, + "grad_norm": 0.6025670215102922, + "learning_rate": 7.119048282508176e-05, + "loss": 12.1266, + "step": 22229 + }, + { + "epoch": 1.2105123440403505, + "grad_norm": 0.5154841225457294, + "learning_rate": 7.118203855737833e-05, + "loss": 11.9637, + "step": 22230 + }, + { + "epoch": 1.2105667980369335, + "grad_norm": 0.5638156237527165, + "learning_rate": 7.117359451376822e-05, + "loss": 11.9354, + "step": 22231 + }, + { + "epoch": 1.2106212520335164, + "grad_norm": 0.5660231263125333, + "learning_rate": 7.116515069431704e-05, + "loss": 11.9912, + "step": 22232 + }, + { + "epoch": 1.2106757060300994, + "grad_norm": 0.6107715028659775, + "learning_rate": 7.115670709909056e-05, + "loss": 11.9319, + "step": 22233 + }, + { + "epoch": 1.2107301600266824, + "grad_norm": 0.6017035343736808, + "learning_rate": 7.114826372815432e-05, + "loss": 11.9558, + "step": 22234 + }, + { + "epoch": 1.2107846140232654, + "grad_norm": 0.5229652175836073, + "learning_rate": 7.113982058157402e-05, + "loss": 11.8972, + "step": 22235 + }, + { + "epoch": 1.2108390680198484, + "grad_norm": 0.5701160898836272, + "learning_rate": 7.113137765941528e-05, + "loss": 11.9585, + "step": 22236 + }, + { + "epoch": 1.2108935220164314, + "grad_norm": 0.553893458828715, + "learning_rate": 7.112293496174381e-05, + "loss": 11.9503, + "step": 22237 + }, + { + "epoch": 1.2109479760130144, + "grad_norm": 0.4892356897325319, + "learning_rate": 7.111449248862525e-05, + "loss": 11.9463, + "step": 22238 + }, + { + "epoch": 1.2110024300095974, + "grad_norm": 0.5822877339074625, + "learning_rate": 7.110605024012524e-05, + "loss": 11.9796, + "step": 22239 + }, + { + "epoch": 1.2110568840061806, + "grad_norm": 0.5853273380034315, + "learning_rate": 7.109760821630943e-05, + "loss": 12.045, + "step": 22240 + }, + { + "epoch": 1.2111113380027636, + "grad_norm": 0.5656663684305093, + "learning_rate": 7.108916641724345e-05, + "loss": 12.0361, + "step": 22241 + }, + { + "epoch": 1.2111657919993466, + "grad_norm": 0.5540103396672027, + "learning_rate": 7.108072484299299e-05, + "loss": 11.9424, + "step": 22242 + }, + { + "epoch": 1.2112202459959296, + "grad_norm": 0.5130192052115216, + "learning_rate": 7.107228349362368e-05, + "loss": 11.9045, + "step": 22243 + }, + { + "epoch": 1.2112746999925126, + "grad_norm": 0.563121879580242, + "learning_rate": 7.106384236920109e-05, + "loss": 12.0127, + "step": 22244 + }, + { + "epoch": 1.2113291539890956, + "grad_norm": 0.5140509319795811, + "learning_rate": 7.105540146979095e-05, + "loss": 11.9238, + "step": 22245 + }, + { + "epoch": 1.2113836079856786, + "grad_norm": 0.5607506092057514, + "learning_rate": 7.104696079545886e-05, + "loss": 11.9958, + "step": 22246 + }, + { + "epoch": 1.2114380619822616, + "grad_norm": 0.5316034142149118, + "learning_rate": 7.10385203462704e-05, + "loss": 11.6815, + "step": 22247 + }, + { + "epoch": 1.2114925159788446, + "grad_norm": 0.5238273300317361, + "learning_rate": 7.10300801222913e-05, + "loss": 12.1356, + "step": 22248 + }, + { + "epoch": 1.2115469699754275, + "grad_norm": 0.5737696562662198, + "learning_rate": 7.102164012358719e-05, + "loss": 11.9107, + "step": 22249 + }, + { + "epoch": 1.2116014239720108, + "grad_norm": 0.4843855987877133, + "learning_rate": 7.101320035022363e-05, + "loss": 11.9216, + "step": 22250 + }, + { + "epoch": 1.2116558779685938, + "grad_norm": 0.5321854172161506, + "learning_rate": 7.10047608022663e-05, + "loss": 12.0449, + "step": 22251 + }, + { + "epoch": 1.2117103319651767, + "grad_norm": 0.542024674210292, + "learning_rate": 7.099632147978081e-05, + "loss": 11.9542, + "step": 22252 + }, + { + "epoch": 1.2117647859617597, + "grad_norm": 0.5624187728852066, + "learning_rate": 7.098788238283278e-05, + "loss": 11.9009, + "step": 22253 + }, + { + "epoch": 1.2118192399583427, + "grad_norm": 0.545018263851282, + "learning_rate": 7.097944351148787e-05, + "loss": 11.9881, + "step": 22254 + }, + { + "epoch": 1.2118736939549257, + "grad_norm": 0.547322436937603, + "learning_rate": 7.097100486581165e-05, + "loss": 12.1616, + "step": 22255 + }, + { + "epoch": 1.2119281479515087, + "grad_norm": 0.5854550211063693, + "learning_rate": 7.096256644586976e-05, + "loss": 12.0093, + "step": 22256 + }, + { + "epoch": 1.2119826019480917, + "grad_norm": 0.5294357935560317, + "learning_rate": 7.095412825172784e-05, + "loss": 11.9794, + "step": 22257 + }, + { + "epoch": 1.2120370559446747, + "grad_norm": 0.6303869108387259, + "learning_rate": 7.094569028345146e-05, + "loss": 12.0509, + "step": 22258 + }, + { + "epoch": 1.2120915099412577, + "grad_norm": 0.5067586615831682, + "learning_rate": 7.093725254110627e-05, + "loss": 11.9189, + "step": 22259 + }, + { + "epoch": 1.2121459639378407, + "grad_norm": 0.5578965016345673, + "learning_rate": 7.09288150247579e-05, + "loss": 11.9744, + "step": 22260 + }, + { + "epoch": 1.2122004179344237, + "grad_norm": 0.7391896317516873, + "learning_rate": 7.092037773447193e-05, + "loss": 11.9811, + "step": 22261 + }, + { + "epoch": 1.2122548719310067, + "grad_norm": 0.5437859337146264, + "learning_rate": 7.091194067031398e-05, + "loss": 11.9437, + "step": 22262 + }, + { + "epoch": 1.2123093259275899, + "grad_norm": 0.5742713919066256, + "learning_rate": 7.090350383234966e-05, + "loss": 11.929, + "step": 22263 + }, + { + "epoch": 1.2123637799241729, + "grad_norm": 0.5562476907318756, + "learning_rate": 7.08950672206446e-05, + "loss": 11.9208, + "step": 22264 + }, + { + "epoch": 1.2124182339207559, + "grad_norm": 0.5611820438255171, + "learning_rate": 7.088663083526434e-05, + "loss": 11.9286, + "step": 22265 + }, + { + "epoch": 1.2124726879173389, + "grad_norm": 0.5372397433175323, + "learning_rate": 7.087819467627454e-05, + "loss": 11.978, + "step": 22266 + }, + { + "epoch": 1.2125271419139219, + "grad_norm": 0.5401658270660569, + "learning_rate": 7.086975874374077e-05, + "loss": 12.0468, + "step": 22267 + }, + { + "epoch": 1.2125815959105049, + "grad_norm": 0.5468530284162061, + "learning_rate": 7.086132303772864e-05, + "loss": 11.9912, + "step": 22268 + }, + { + "epoch": 1.2126360499070878, + "grad_norm": 0.5289231617253853, + "learning_rate": 7.085288755830375e-05, + "loss": 11.9488, + "step": 22269 + }, + { + "epoch": 1.2126905039036708, + "grad_norm": 0.4974327342037205, + "learning_rate": 7.084445230553167e-05, + "loss": 12.022, + "step": 22270 + }, + { + "epoch": 1.2127449579002538, + "grad_norm": 0.5586721468309652, + "learning_rate": 7.083601727947806e-05, + "loss": 12.0494, + "step": 22271 + }, + { + "epoch": 1.2127994118968368, + "grad_norm": 0.5432637061339359, + "learning_rate": 7.082758248020844e-05, + "loss": 12.0238, + "step": 22272 + }, + { + "epoch": 1.21285386589342, + "grad_norm": 0.5474105843631708, + "learning_rate": 7.081914790778845e-05, + "loss": 11.9483, + "step": 22273 + }, + { + "epoch": 1.212908319890003, + "grad_norm": 0.5318146688778155, + "learning_rate": 7.081071356228368e-05, + "loss": 11.7935, + "step": 22274 + }, + { + "epoch": 1.212962773886586, + "grad_norm": 0.5240733562038657, + "learning_rate": 7.080227944375968e-05, + "loss": 11.9969, + "step": 22275 + }, + { + "epoch": 1.213017227883169, + "grad_norm": 0.5855271049242533, + "learning_rate": 7.079384555228204e-05, + "loss": 12.1057, + "step": 22276 + }, + { + "epoch": 1.213071681879752, + "grad_norm": 0.5257915104221932, + "learning_rate": 7.078541188791636e-05, + "loss": 11.9402, + "step": 22277 + }, + { + "epoch": 1.213126135876335, + "grad_norm": 0.5357458275395512, + "learning_rate": 7.077697845072821e-05, + "loss": 11.8799, + "step": 22278 + }, + { + "epoch": 1.213180589872918, + "grad_norm": 0.5892448472689967, + "learning_rate": 7.076854524078318e-05, + "loss": 12.0626, + "step": 22279 + }, + { + "epoch": 1.213235043869501, + "grad_norm": 0.5561518650997543, + "learning_rate": 7.076011225814685e-05, + "loss": 11.994, + "step": 22280 + }, + { + "epoch": 1.213289497866084, + "grad_norm": 0.5671959921232491, + "learning_rate": 7.075167950288477e-05, + "loss": 11.9708, + "step": 22281 + }, + { + "epoch": 1.213343951862667, + "grad_norm": 0.5241208392779223, + "learning_rate": 7.074324697506255e-05, + "loss": 11.8656, + "step": 22282 + }, + { + "epoch": 1.21339840585925, + "grad_norm": 0.6521234757186432, + "learning_rate": 7.073481467474575e-05, + "loss": 12.013, + "step": 22283 + }, + { + "epoch": 1.213452859855833, + "grad_norm": 0.48788085413657317, + "learning_rate": 7.072638260199997e-05, + "loss": 12.0519, + "step": 22284 + }, + { + "epoch": 1.213507313852416, + "grad_norm": 0.563264647372284, + "learning_rate": 7.07179507568907e-05, + "loss": 11.8737, + "step": 22285 + }, + { + "epoch": 1.2135617678489992, + "grad_norm": 0.594163666289845, + "learning_rate": 7.070951913948359e-05, + "loss": 12.0171, + "step": 22286 + }, + { + "epoch": 1.2136162218455822, + "grad_norm": 0.5633618600797079, + "learning_rate": 7.070108774984415e-05, + "loss": 12.028, + "step": 22287 + }, + { + "epoch": 1.2136706758421651, + "grad_norm": 0.5191886984552105, + "learning_rate": 7.069265658803796e-05, + "loss": 11.9681, + "step": 22288 + }, + { + "epoch": 1.2137251298387481, + "grad_norm": 0.537810626953105, + "learning_rate": 7.06842256541306e-05, + "loss": 12.0174, + "step": 22289 + }, + { + "epoch": 1.2137795838353311, + "grad_norm": 0.5692931708589785, + "learning_rate": 7.067579494818761e-05, + "loss": 11.9745, + "step": 22290 + }, + { + "epoch": 1.2138340378319141, + "grad_norm": 0.6202625084016253, + "learning_rate": 7.066736447027455e-05, + "loss": 11.9476, + "step": 22291 + }, + { + "epoch": 1.2138884918284971, + "grad_norm": 0.5324484456463837, + "learning_rate": 7.065893422045698e-05, + "loss": 12.1948, + "step": 22292 + }, + { + "epoch": 1.21394294582508, + "grad_norm": 0.5021182461547399, + "learning_rate": 7.065050419880046e-05, + "loss": 11.9584, + "step": 22293 + }, + { + "epoch": 1.213997399821663, + "grad_norm": 0.5939087172169644, + "learning_rate": 7.064207440537061e-05, + "loss": 12.0118, + "step": 22294 + }, + { + "epoch": 1.214051853818246, + "grad_norm": 0.5634578786794503, + "learning_rate": 7.063364484023285e-05, + "loss": 11.9904, + "step": 22295 + }, + { + "epoch": 1.214106307814829, + "grad_norm": 0.5484709194305013, + "learning_rate": 7.062521550345277e-05, + "loss": 11.8776, + "step": 22296 + }, + { + "epoch": 1.2141607618114123, + "grad_norm": 0.5421099908982828, + "learning_rate": 7.061678639509596e-05, + "loss": 12.0019, + "step": 22297 + }, + { + "epoch": 1.2142152158079953, + "grad_norm": 0.4991751723091259, + "learning_rate": 7.060835751522797e-05, + "loss": 11.9418, + "step": 22298 + }, + { + "epoch": 1.2142696698045783, + "grad_norm": 0.577553957827333, + "learning_rate": 7.059992886391428e-05, + "loss": 12.0586, + "step": 22299 + }, + { + "epoch": 1.2143241238011613, + "grad_norm": 0.5255331978796294, + "learning_rate": 7.059150044122052e-05, + "loss": 11.9671, + "step": 22300 + }, + { + "epoch": 1.2143785777977443, + "grad_norm": 0.5196347588231244, + "learning_rate": 7.058307224721216e-05, + "loss": 11.8502, + "step": 22301 + }, + { + "epoch": 1.2144330317943273, + "grad_norm": 0.5355706344964161, + "learning_rate": 7.057464428195476e-05, + "loss": 11.8549, + "step": 22302 + }, + { + "epoch": 1.2144874857909103, + "grad_norm": 0.4955177547160181, + "learning_rate": 7.056621654551385e-05, + "loss": 11.8426, + "step": 22303 + }, + { + "epoch": 1.2145419397874933, + "grad_norm": 0.523880147256863, + "learning_rate": 7.055778903795502e-05, + "loss": 11.9664, + "step": 22304 + }, + { + "epoch": 1.2145963937840762, + "grad_norm": 0.5350504152097167, + "learning_rate": 7.054936175934375e-05, + "loss": 11.8836, + "step": 22305 + }, + { + "epoch": 1.2146508477806592, + "grad_norm": 0.5548101050832339, + "learning_rate": 7.054093470974557e-05, + "loss": 11.903, + "step": 22306 + }, + { + "epoch": 1.2147053017772422, + "grad_norm": 0.6393004098557902, + "learning_rate": 7.053250788922599e-05, + "loss": 12.1723, + "step": 22307 + }, + { + "epoch": 1.2147597557738252, + "grad_norm": 0.5315787203549713, + "learning_rate": 7.05240812978506e-05, + "loss": 12.0701, + "step": 22308 + }, + { + "epoch": 1.2148142097704082, + "grad_norm": 0.5644005983604934, + "learning_rate": 7.051565493568488e-05, + "loss": 11.8484, + "step": 22309 + }, + { + "epoch": 1.2148686637669914, + "grad_norm": 0.5338038664794075, + "learning_rate": 7.050722880279439e-05, + "loss": 11.9557, + "step": 22310 + }, + { + "epoch": 1.2149231177635744, + "grad_norm": 0.5581254134054306, + "learning_rate": 7.049880289924464e-05, + "loss": 11.9729, + "step": 22311 + }, + { + "epoch": 1.2149775717601574, + "grad_norm": 0.5450984713680336, + "learning_rate": 7.049037722510113e-05, + "loss": 11.8956, + "step": 22312 + }, + { + "epoch": 1.2150320257567404, + "grad_norm": 0.6100547288983047, + "learning_rate": 7.04819517804294e-05, + "loss": 12.0668, + "step": 22313 + }, + { + "epoch": 1.2150864797533234, + "grad_norm": 0.5259666863737034, + "learning_rate": 7.047352656529498e-05, + "loss": 12.1377, + "step": 22314 + }, + { + "epoch": 1.2151409337499064, + "grad_norm": 0.5224156702911438, + "learning_rate": 7.046510157976336e-05, + "loss": 11.9721, + "step": 22315 + }, + { + "epoch": 1.2151953877464894, + "grad_norm": 0.5203314355568164, + "learning_rate": 7.045667682390004e-05, + "loss": 11.9608, + "step": 22316 + }, + { + "epoch": 1.2152498417430724, + "grad_norm": 0.5803029169990991, + "learning_rate": 7.04482522977706e-05, + "loss": 11.9736, + "step": 22317 + }, + { + "epoch": 1.2153042957396554, + "grad_norm": 0.55320366865983, + "learning_rate": 7.043982800144046e-05, + "loss": 11.9466, + "step": 22318 + }, + { + "epoch": 1.2153587497362384, + "grad_norm": 0.5532418365233676, + "learning_rate": 7.043140393497518e-05, + "loss": 12.005, + "step": 22319 + }, + { + "epoch": 1.2154132037328216, + "grad_norm": 0.5154581249122987, + "learning_rate": 7.042298009844027e-05, + "loss": 11.7698, + "step": 22320 + }, + { + "epoch": 1.2154676577294046, + "grad_norm": 0.5570806186394629, + "learning_rate": 7.041455649190123e-05, + "loss": 12.0179, + "step": 22321 + }, + { + "epoch": 1.2155221117259876, + "grad_norm": 0.6075919078474695, + "learning_rate": 7.040613311542357e-05, + "loss": 12.0396, + "step": 22322 + }, + { + "epoch": 1.2155765657225706, + "grad_norm": 0.5666390942113456, + "learning_rate": 7.039770996907277e-05, + "loss": 11.9011, + "step": 22323 + }, + { + "epoch": 1.2156310197191535, + "grad_norm": 0.5092742503468421, + "learning_rate": 7.038928705291436e-05, + "loss": 11.926, + "step": 22324 + }, + { + "epoch": 1.2156854737157365, + "grad_norm": 0.5501225754048922, + "learning_rate": 7.038086436701381e-05, + "loss": 11.8644, + "step": 22325 + }, + { + "epoch": 1.2157399277123195, + "grad_norm": 0.5719510745672549, + "learning_rate": 7.037244191143661e-05, + "loss": 12.0941, + "step": 22326 + }, + { + "epoch": 1.2157943817089025, + "grad_norm": 0.5577233311053958, + "learning_rate": 7.03640196862483e-05, + "loss": 11.9658, + "step": 22327 + }, + { + "epoch": 1.2158488357054855, + "grad_norm": 0.554705498761114, + "learning_rate": 7.035559769151432e-05, + "loss": 11.8951, + "step": 22328 + }, + { + "epoch": 1.2159032897020685, + "grad_norm": 0.6049096053195613, + "learning_rate": 7.034717592730018e-05, + "loss": 12.1224, + "step": 22329 + }, + { + "epoch": 1.2159577436986515, + "grad_norm": 0.5216970000678234, + "learning_rate": 7.033875439367137e-05, + "loss": 11.9001, + "step": 22330 + }, + { + "epoch": 1.2160121976952345, + "grad_norm": 0.5569509677271596, + "learning_rate": 7.033033309069339e-05, + "loss": 12.0039, + "step": 22331 + }, + { + "epoch": 1.2160666516918175, + "grad_norm": 0.5508933269962223, + "learning_rate": 7.03219120184317e-05, + "loss": 11.99, + "step": 22332 + }, + { + "epoch": 1.2161211056884007, + "grad_norm": 0.5963243297822097, + "learning_rate": 7.031349117695183e-05, + "loss": 11.9617, + "step": 22333 + }, + { + "epoch": 1.2161755596849837, + "grad_norm": 0.5379930064043972, + "learning_rate": 7.030507056631923e-05, + "loss": 11.9727, + "step": 22334 + }, + { + "epoch": 1.2162300136815667, + "grad_norm": 0.5386747676124668, + "learning_rate": 7.029665018659936e-05, + "loss": 11.9948, + "step": 22335 + }, + { + "epoch": 1.2162844676781497, + "grad_norm": 0.5723447515807047, + "learning_rate": 7.028823003785774e-05, + "loss": 11.9454, + "step": 22336 + }, + { + "epoch": 1.2163389216747327, + "grad_norm": 0.583893023561163, + "learning_rate": 7.027981012015981e-05, + "loss": 11.9514, + "step": 22337 + }, + { + "epoch": 1.2163933756713157, + "grad_norm": 0.5887967027117306, + "learning_rate": 7.027139043357106e-05, + "loss": 12.0417, + "step": 22338 + }, + { + "epoch": 1.2164478296678987, + "grad_norm": 0.5689338247182932, + "learning_rate": 7.026297097815697e-05, + "loss": 12.0591, + "step": 22339 + }, + { + "epoch": 1.2165022836644817, + "grad_norm": 0.5141804373708683, + "learning_rate": 7.025455175398299e-05, + "loss": 12.0649, + "step": 22340 + }, + { + "epoch": 1.2165567376610646, + "grad_norm": 0.7292763687760996, + "learning_rate": 7.02461327611146e-05, + "loss": 12.0871, + "step": 22341 + }, + { + "epoch": 1.2166111916576476, + "grad_norm": 0.5518115603447546, + "learning_rate": 7.02377139996173e-05, + "loss": 12.036, + "step": 22342 + }, + { + "epoch": 1.2166656456542309, + "grad_norm": 0.5506875059426861, + "learning_rate": 7.02292954695565e-05, + "loss": 11.9148, + "step": 22343 + }, + { + "epoch": 1.2167200996508138, + "grad_norm": 0.7162991465665424, + "learning_rate": 7.022087717099775e-05, + "loss": 11.9591, + "step": 22344 + }, + { + "epoch": 1.2167745536473968, + "grad_norm": 0.5522233152721466, + "learning_rate": 7.021245910400638e-05, + "loss": 11.9098, + "step": 22345 + }, + { + "epoch": 1.2168290076439798, + "grad_norm": 0.5137823090818378, + "learning_rate": 7.020404126864794e-05, + "loss": 11.9703, + "step": 22346 + }, + { + "epoch": 1.2168834616405628, + "grad_norm": 0.5967318474798641, + "learning_rate": 7.01956236649879e-05, + "loss": 12.0042, + "step": 22347 + }, + { + "epoch": 1.2169379156371458, + "grad_norm": 0.510429647743093, + "learning_rate": 7.018720629309167e-05, + "loss": 12.0425, + "step": 22348 + }, + { + "epoch": 1.2169923696337288, + "grad_norm": 0.6660538355641454, + "learning_rate": 7.017878915302471e-05, + "loss": 11.9772, + "step": 22349 + }, + { + "epoch": 1.2170468236303118, + "grad_norm": 0.5156984135756433, + "learning_rate": 7.017037224485251e-05, + "loss": 11.9955, + "step": 22350 + }, + { + "epoch": 1.2171012776268948, + "grad_norm": 0.5515831340977696, + "learning_rate": 7.016195556864049e-05, + "loss": 12.0198, + "step": 22351 + }, + { + "epoch": 1.2171557316234778, + "grad_norm": 0.5009435038304422, + "learning_rate": 7.015353912445408e-05, + "loss": 11.9356, + "step": 22352 + }, + { + "epoch": 1.2172101856200608, + "grad_norm": 0.5027011911111109, + "learning_rate": 7.01451229123588e-05, + "loss": 11.8627, + "step": 22353 + }, + { + "epoch": 1.2172646396166438, + "grad_norm": 0.5668826387820898, + "learning_rate": 7.013670693242005e-05, + "loss": 12.0749, + "step": 22354 + }, + { + "epoch": 1.2173190936132268, + "grad_norm": 0.5515281227037341, + "learning_rate": 7.012829118470328e-05, + "loss": 12.139, + "step": 22355 + }, + { + "epoch": 1.21737354760981, + "grad_norm": 0.4810495926302193, + "learning_rate": 7.011987566927387e-05, + "loss": 11.9472, + "step": 22356 + }, + { + "epoch": 1.217428001606393, + "grad_norm": 0.5723272624958875, + "learning_rate": 7.011146038619735e-05, + "loss": 11.8951, + "step": 22357 + }, + { + "epoch": 1.217482455602976, + "grad_norm": 0.5549262552776699, + "learning_rate": 7.010304533553913e-05, + "loss": 12.0505, + "step": 22358 + }, + { + "epoch": 1.217536909599559, + "grad_norm": 0.5405482614457372, + "learning_rate": 7.009463051736465e-05, + "loss": 11.9682, + "step": 22359 + }, + { + "epoch": 1.217591363596142, + "grad_norm": 0.5412814365790568, + "learning_rate": 7.008621593173932e-05, + "loss": 11.9465, + "step": 22360 + }, + { + "epoch": 1.217645817592725, + "grad_norm": 0.5467583588804531, + "learning_rate": 7.00778015787286e-05, + "loss": 12.0006, + "step": 22361 + }, + { + "epoch": 1.217700271589308, + "grad_norm": 0.4916924452821014, + "learning_rate": 7.006938745839792e-05, + "loss": 12.0715, + "step": 22362 + }, + { + "epoch": 1.217754725585891, + "grad_norm": 0.5508482146314773, + "learning_rate": 7.006097357081269e-05, + "loss": 12.0779, + "step": 22363 + }, + { + "epoch": 1.217809179582474, + "grad_norm": 0.5041038578527925, + "learning_rate": 7.005255991603838e-05, + "loss": 11.9602, + "step": 22364 + }, + { + "epoch": 1.217863633579057, + "grad_norm": 0.5695379925221715, + "learning_rate": 7.004414649414037e-05, + "loss": 12.0427, + "step": 22365 + }, + { + "epoch": 1.2179180875756401, + "grad_norm": 0.525114739195111, + "learning_rate": 7.003573330518409e-05, + "loss": 11.8928, + "step": 22366 + }, + { + "epoch": 1.2179725415722231, + "grad_norm": 0.5566647811168054, + "learning_rate": 7.002732034923499e-05, + "loss": 12.0557, + "step": 22367 + }, + { + "epoch": 1.2180269955688061, + "grad_norm": 0.49262671405552383, + "learning_rate": 7.001890762635844e-05, + "loss": 11.9055, + "step": 22368 + }, + { + "epoch": 1.218081449565389, + "grad_norm": 0.5578951026885405, + "learning_rate": 7.00104951366199e-05, + "loss": 11.9872, + "step": 22369 + }, + { + "epoch": 1.218135903561972, + "grad_norm": 0.5683924026785109, + "learning_rate": 7.000208288008478e-05, + "loss": 11.9625, + "step": 22370 + }, + { + "epoch": 1.218190357558555, + "grad_norm": 0.5086219925298866, + "learning_rate": 6.999367085681852e-05, + "loss": 11.9946, + "step": 22371 + }, + { + "epoch": 1.218244811555138, + "grad_norm": 0.515937880076869, + "learning_rate": 6.998525906688649e-05, + "loss": 12.0035, + "step": 22372 + }, + { + "epoch": 1.218299265551721, + "grad_norm": 0.5588427097572062, + "learning_rate": 6.997684751035411e-05, + "loss": 11.9378, + "step": 22373 + }, + { + "epoch": 1.218353719548304, + "grad_norm": 0.5279204411786923, + "learning_rate": 6.996843618728684e-05, + "loss": 11.9883, + "step": 22374 + }, + { + "epoch": 1.218408173544887, + "grad_norm": 0.5212889295302302, + "learning_rate": 6.996002509775001e-05, + "loss": 11.9559, + "step": 22375 + }, + { + "epoch": 1.21846262754147, + "grad_norm": 0.5420376501032255, + "learning_rate": 6.995161424180908e-05, + "loss": 11.8928, + "step": 22376 + }, + { + "epoch": 1.218517081538053, + "grad_norm": 0.5166400531308509, + "learning_rate": 6.994320361952943e-05, + "loss": 11.8835, + "step": 22377 + }, + { + "epoch": 1.218571535534636, + "grad_norm": 0.5292885786067021, + "learning_rate": 6.993479323097647e-05, + "loss": 11.993, + "step": 22378 + }, + { + "epoch": 1.218625989531219, + "grad_norm": 0.583737031027247, + "learning_rate": 6.992638307621557e-05, + "loss": 11.8614, + "step": 22379 + }, + { + "epoch": 1.2186804435278022, + "grad_norm": 0.561478994551792, + "learning_rate": 6.99179731553122e-05, + "loss": 12.0964, + "step": 22380 + }, + { + "epoch": 1.2187348975243852, + "grad_norm": 0.5640655878626899, + "learning_rate": 6.990956346833168e-05, + "loss": 11.9127, + "step": 22381 + }, + { + "epoch": 1.2187893515209682, + "grad_norm": 0.5762987678441256, + "learning_rate": 6.990115401533946e-05, + "loss": 12.0166, + "step": 22382 + }, + { + "epoch": 1.2188438055175512, + "grad_norm": 0.6597124157320646, + "learning_rate": 6.98927447964009e-05, + "loss": 12.0442, + "step": 22383 + }, + { + "epoch": 1.2188982595141342, + "grad_norm": 0.5609973227230155, + "learning_rate": 6.988433581158145e-05, + "loss": 11.9126, + "step": 22384 + }, + { + "epoch": 1.2189527135107172, + "grad_norm": 0.5440578199406028, + "learning_rate": 6.987592706094643e-05, + "loss": 11.8648, + "step": 22385 + }, + { + "epoch": 1.2190071675073002, + "grad_norm": 0.5859657659305496, + "learning_rate": 6.986751854456124e-05, + "loss": 12.0417, + "step": 22386 + }, + { + "epoch": 1.2190616215038832, + "grad_norm": 0.5791757679621246, + "learning_rate": 6.98591102624913e-05, + "loss": 11.8141, + "step": 22387 + }, + { + "epoch": 1.2191160755004662, + "grad_norm": 0.580928826714245, + "learning_rate": 6.985070221480195e-05, + "loss": 11.9734, + "step": 22388 + }, + { + "epoch": 1.2191705294970492, + "grad_norm": 0.523511068552816, + "learning_rate": 6.98422944015586e-05, + "loss": 11.9273, + "step": 22389 + }, + { + "epoch": 1.2192249834936324, + "grad_norm": 0.5280995417782323, + "learning_rate": 6.98338868228266e-05, + "loss": 11.9703, + "step": 22390 + }, + { + "epoch": 1.2192794374902154, + "grad_norm": 0.554149458968344, + "learning_rate": 6.982547947867141e-05, + "loss": 11.9068, + "step": 22391 + }, + { + "epoch": 1.2193338914867984, + "grad_norm": 0.5710116531873979, + "learning_rate": 6.981707236915833e-05, + "loss": 11.856, + "step": 22392 + }, + { + "epoch": 1.2193883454833814, + "grad_norm": 0.5646006857839734, + "learning_rate": 6.980866549435275e-05, + "loss": 11.9466, + "step": 22393 + }, + { + "epoch": 1.2194427994799644, + "grad_norm": 0.5457992022543512, + "learning_rate": 6.980025885432006e-05, + "loss": 12.0449, + "step": 22394 + }, + { + "epoch": 1.2194972534765474, + "grad_norm": 0.6146281492599752, + "learning_rate": 6.979185244912563e-05, + "loss": 11.9747, + "step": 22395 + }, + { + "epoch": 1.2195517074731304, + "grad_norm": 0.589197910788417, + "learning_rate": 6.978344627883482e-05, + "loss": 12.0342, + "step": 22396 + }, + { + "epoch": 1.2196061614697133, + "grad_norm": 0.5438373320570249, + "learning_rate": 6.977504034351297e-05, + "loss": 12.0001, + "step": 22397 + }, + { + "epoch": 1.2196606154662963, + "grad_norm": 0.5923276252131006, + "learning_rate": 6.97666346432255e-05, + "loss": 12.0653, + "step": 22398 + }, + { + "epoch": 1.2197150694628793, + "grad_norm": 0.6069501760450577, + "learning_rate": 6.975822917803773e-05, + "loss": 12.0088, + "step": 22399 + }, + { + "epoch": 1.2197695234594623, + "grad_norm": 0.5136480609945389, + "learning_rate": 6.974982394801505e-05, + "loss": 12.0383, + "step": 22400 + }, + { + "epoch": 1.2198239774560453, + "grad_norm": 0.5076437212057705, + "learning_rate": 6.974141895322282e-05, + "loss": 11.9397, + "step": 22401 + }, + { + "epoch": 1.2198784314526283, + "grad_norm": 0.5724996968219022, + "learning_rate": 6.973301419372637e-05, + "loss": 11.965, + "step": 22402 + }, + { + "epoch": 1.2199328854492115, + "grad_norm": 0.4995089364803861, + "learning_rate": 6.97246096695911e-05, + "loss": 11.975, + "step": 22403 + }, + { + "epoch": 1.2199873394457945, + "grad_norm": 0.580369141175968, + "learning_rate": 6.971620538088233e-05, + "loss": 11.9444, + "step": 22404 + }, + { + "epoch": 1.2200417934423775, + "grad_norm": 0.49372070185979805, + "learning_rate": 6.970780132766545e-05, + "loss": 11.7919, + "step": 22405 + }, + { + "epoch": 1.2200962474389605, + "grad_norm": 0.6917361868930043, + "learning_rate": 6.969939751000577e-05, + "loss": 12.0882, + "step": 22406 + }, + { + "epoch": 1.2201507014355435, + "grad_norm": 0.6322136547605418, + "learning_rate": 6.969099392796865e-05, + "loss": 12.0858, + "step": 22407 + }, + { + "epoch": 1.2202051554321265, + "grad_norm": 0.5223013259575535, + "learning_rate": 6.968259058161946e-05, + "loss": 11.8596, + "step": 22408 + }, + { + "epoch": 1.2202596094287095, + "grad_norm": 0.5512881492846489, + "learning_rate": 6.967418747102351e-05, + "loss": 12.0115, + "step": 22409 + }, + { + "epoch": 1.2203140634252925, + "grad_norm": 0.5173212222099689, + "learning_rate": 6.966578459624617e-05, + "loss": 11.8679, + "step": 22410 + }, + { + "epoch": 1.2203685174218755, + "grad_norm": 0.5708016938914064, + "learning_rate": 6.965738195735279e-05, + "loss": 12.0618, + "step": 22411 + }, + { + "epoch": 1.2204229714184585, + "grad_norm": 0.5448068878427165, + "learning_rate": 6.964897955440868e-05, + "loss": 11.8162, + "step": 22412 + }, + { + "epoch": 1.2204774254150417, + "grad_norm": 0.5529709578796422, + "learning_rate": 6.964057738747918e-05, + "loss": 12.0061, + "step": 22413 + }, + { + "epoch": 1.2205318794116247, + "grad_norm": 0.5358876828902283, + "learning_rate": 6.963217545662965e-05, + "loss": 11.9137, + "step": 22414 + }, + { + "epoch": 1.2205863334082077, + "grad_norm": 0.5647036952014872, + "learning_rate": 6.962377376192548e-05, + "loss": 12.0191, + "step": 22415 + }, + { + "epoch": 1.2206407874047907, + "grad_norm": 0.5940657194824316, + "learning_rate": 6.961537230343188e-05, + "loss": 12.184, + "step": 22416 + }, + { + "epoch": 1.2206952414013736, + "grad_norm": 0.5790464934404309, + "learning_rate": 6.960697108121423e-05, + "loss": 12.0073, + "step": 22417 + }, + { + "epoch": 1.2207496953979566, + "grad_norm": 0.5045579535013146, + "learning_rate": 6.959857009533787e-05, + "loss": 11.9809, + "step": 22418 + }, + { + "epoch": 1.2208041493945396, + "grad_norm": 0.5530522189101298, + "learning_rate": 6.959016934586816e-05, + "loss": 12.039, + "step": 22419 + }, + { + "epoch": 1.2208586033911226, + "grad_norm": 0.5335794066957925, + "learning_rate": 6.958176883287037e-05, + "loss": 11.9908, + "step": 22420 + }, + { + "epoch": 1.2209130573877056, + "grad_norm": 0.5344672194234419, + "learning_rate": 6.957336855640986e-05, + "loss": 11.9482, + "step": 22421 + }, + { + "epoch": 1.2209675113842886, + "grad_norm": 0.5409941895580455, + "learning_rate": 6.956496851655194e-05, + "loss": 12.055, + "step": 22422 + }, + { + "epoch": 1.2210219653808716, + "grad_norm": 0.5162961278211933, + "learning_rate": 6.955656871336191e-05, + "loss": 11.9066, + "step": 22423 + }, + { + "epoch": 1.2210764193774546, + "grad_norm": 0.5731886725647957, + "learning_rate": 6.954816914690509e-05, + "loss": 11.9437, + "step": 22424 + }, + { + "epoch": 1.2211308733740376, + "grad_norm": 0.5375378154163946, + "learning_rate": 6.95397698172469e-05, + "loss": 11.897, + "step": 22425 + }, + { + "epoch": 1.2211853273706208, + "grad_norm": 0.5308922702002112, + "learning_rate": 6.95313707244525e-05, + "loss": 11.8796, + "step": 22426 + }, + { + "epoch": 1.2212397813672038, + "grad_norm": 0.5496555211264113, + "learning_rate": 6.952297186858728e-05, + "loss": 11.9544, + "step": 22427 + }, + { + "epoch": 1.2212942353637868, + "grad_norm": 0.5110784431805737, + "learning_rate": 6.951457324971653e-05, + "loss": 11.971, + "step": 22428 + }, + { + "epoch": 1.2213486893603698, + "grad_norm": 0.549558745358558, + "learning_rate": 6.950617486790558e-05, + "loss": 11.9023, + "step": 22429 + }, + { + "epoch": 1.2214031433569528, + "grad_norm": 0.554236279059842, + "learning_rate": 6.949777672321973e-05, + "loss": 12.076, + "step": 22430 + }, + { + "epoch": 1.2214575973535358, + "grad_norm": 0.5544054192583099, + "learning_rate": 6.948937881572428e-05, + "loss": 11.9806, + "step": 22431 + }, + { + "epoch": 1.2215120513501188, + "grad_norm": 0.5659384279633011, + "learning_rate": 6.948098114548454e-05, + "loss": 11.8784, + "step": 22432 + }, + { + "epoch": 1.2215665053467017, + "grad_norm": 0.5623758176488516, + "learning_rate": 6.947258371256582e-05, + "loss": 11.9009, + "step": 22433 + }, + { + "epoch": 1.2216209593432847, + "grad_norm": 0.517444801061649, + "learning_rate": 6.94641865170334e-05, + "loss": 12.0179, + "step": 22434 + }, + { + "epoch": 1.2216754133398677, + "grad_norm": 0.5516104639008468, + "learning_rate": 6.945578955895259e-05, + "loss": 11.8954, + "step": 22435 + }, + { + "epoch": 1.221729867336451, + "grad_norm": 0.5662922139479862, + "learning_rate": 6.944739283838868e-05, + "loss": 11.9934, + "step": 22436 + }, + { + "epoch": 1.221784321333034, + "grad_norm": 0.5264966002719521, + "learning_rate": 6.943899635540697e-05, + "loss": 11.8817, + "step": 22437 + }, + { + "epoch": 1.221838775329617, + "grad_norm": 0.5127025445740278, + "learning_rate": 6.943060011007274e-05, + "loss": 11.8937, + "step": 22438 + }, + { + "epoch": 1.2218932293262, + "grad_norm": 0.5271368637409032, + "learning_rate": 6.942220410245128e-05, + "loss": 11.9038, + "step": 22439 + }, + { + "epoch": 1.221947683322783, + "grad_norm": 0.5469080513964029, + "learning_rate": 6.94138083326079e-05, + "loss": 12.1692, + "step": 22440 + }, + { + "epoch": 1.222002137319366, + "grad_norm": 0.4922272766274975, + "learning_rate": 6.940541280060788e-05, + "loss": 11.9012, + "step": 22441 + }, + { + "epoch": 1.222056591315949, + "grad_norm": 0.5071699516397944, + "learning_rate": 6.93970175065165e-05, + "loss": 11.9217, + "step": 22442 + }, + { + "epoch": 1.222111045312532, + "grad_norm": 0.5443291129826455, + "learning_rate": 6.938862245039904e-05, + "loss": 12.0067, + "step": 22443 + }, + { + "epoch": 1.2221654993091149, + "grad_norm": 0.5463068505259102, + "learning_rate": 6.938022763232079e-05, + "loss": 11.8455, + "step": 22444 + }, + { + "epoch": 1.2222199533056979, + "grad_norm": 0.5480977159865835, + "learning_rate": 6.937183305234705e-05, + "loss": 11.9707, + "step": 22445 + }, + { + "epoch": 1.2222744073022809, + "grad_norm": 0.5634515913441669, + "learning_rate": 6.936343871054304e-05, + "loss": 11.9748, + "step": 22446 + }, + { + "epoch": 1.2223288612988639, + "grad_norm": 0.571882199790371, + "learning_rate": 6.935504460697407e-05, + "loss": 12.0599, + "step": 22447 + }, + { + "epoch": 1.2223833152954469, + "grad_norm": 0.528124344999855, + "learning_rate": 6.934665074170542e-05, + "loss": 11.8866, + "step": 22448 + }, + { + "epoch": 1.2224377692920299, + "grad_norm": 0.5836550455259616, + "learning_rate": 6.933825711480236e-05, + "loss": 11.9151, + "step": 22449 + }, + { + "epoch": 1.222492223288613, + "grad_norm": 0.5056020976244936, + "learning_rate": 6.932986372633012e-05, + "loss": 11.9975, + "step": 22450 + }, + { + "epoch": 1.222546677285196, + "grad_norm": 0.48986824900253506, + "learning_rate": 6.932147057635403e-05, + "loss": 12.004, + "step": 22451 + }, + { + "epoch": 1.222601131281779, + "grad_norm": 0.5660171207746736, + "learning_rate": 6.931307766493933e-05, + "loss": 12.0485, + "step": 22452 + }, + { + "epoch": 1.222655585278362, + "grad_norm": 0.6416172517087058, + "learning_rate": 6.930468499215128e-05, + "loss": 11.9987, + "step": 22453 + }, + { + "epoch": 1.222710039274945, + "grad_norm": 0.5803503295819011, + "learning_rate": 6.929629255805514e-05, + "loss": 11.8998, + "step": 22454 + }, + { + "epoch": 1.222764493271528, + "grad_norm": 0.524470433421093, + "learning_rate": 6.928790036271622e-05, + "loss": 12.0086, + "step": 22455 + }, + { + "epoch": 1.222818947268111, + "grad_norm": 0.5916510426763025, + "learning_rate": 6.92795084061997e-05, + "loss": 11.8913, + "step": 22456 + }, + { + "epoch": 1.222873401264694, + "grad_norm": 0.6499721457360714, + "learning_rate": 6.927111668857088e-05, + "loss": 12.0046, + "step": 22457 + }, + { + "epoch": 1.222927855261277, + "grad_norm": 0.5795488762817579, + "learning_rate": 6.926272520989501e-05, + "loss": 12.0773, + "step": 22458 + }, + { + "epoch": 1.22298230925786, + "grad_norm": 0.4982720873340564, + "learning_rate": 6.925433397023734e-05, + "loss": 11.9238, + "step": 22459 + }, + { + "epoch": 1.2230367632544432, + "grad_norm": 0.7991892105498443, + "learning_rate": 6.924594296966313e-05, + "loss": 11.9147, + "step": 22460 + }, + { + "epoch": 1.2230912172510262, + "grad_norm": 0.5443048221822511, + "learning_rate": 6.923755220823759e-05, + "loss": 11.9642, + "step": 22461 + }, + { + "epoch": 1.2231456712476092, + "grad_norm": 0.5778090281931294, + "learning_rate": 6.922916168602604e-05, + "loss": 12.0044, + "step": 22462 + }, + { + "epoch": 1.2232001252441922, + "grad_norm": 0.644367087982054, + "learning_rate": 6.922077140309368e-05, + "loss": 11.9334, + "step": 22463 + }, + { + "epoch": 1.2232545792407752, + "grad_norm": 0.529193243856524, + "learning_rate": 6.921238135950578e-05, + "loss": 11.8468, + "step": 22464 + }, + { + "epoch": 1.2233090332373582, + "grad_norm": 0.5449996993842378, + "learning_rate": 6.92039915553276e-05, + "loss": 11.8906, + "step": 22465 + }, + { + "epoch": 1.2233634872339412, + "grad_norm": 0.5335279736405514, + "learning_rate": 6.919560199062426e-05, + "loss": 12.024, + "step": 22466 + }, + { + "epoch": 1.2234179412305242, + "grad_norm": 0.7067743752537503, + "learning_rate": 6.918721266546113e-05, + "loss": 12.007, + "step": 22467 + }, + { + "epoch": 1.2234723952271072, + "grad_norm": 0.6387622983406572, + "learning_rate": 6.91788235799034e-05, + "loss": 12.0687, + "step": 22468 + }, + { + "epoch": 1.2235268492236901, + "grad_norm": 0.6232709815148468, + "learning_rate": 6.917043473401631e-05, + "loss": 11.9692, + "step": 22469 + }, + { + "epoch": 1.2235813032202731, + "grad_norm": 0.5561707891930123, + "learning_rate": 6.916204612786508e-05, + "loss": 11.9515, + "step": 22470 + }, + { + "epoch": 1.2236357572168561, + "grad_norm": 0.4914024925290726, + "learning_rate": 6.915365776151495e-05, + "loss": 11.924, + "step": 22471 + }, + { + "epoch": 1.2236902112134391, + "grad_norm": 0.6152761621250927, + "learning_rate": 6.914526963503116e-05, + "loss": 12.0788, + "step": 22472 + }, + { + "epoch": 1.2237446652100223, + "grad_norm": 0.5532885451751817, + "learning_rate": 6.91368817484789e-05, + "loss": 11.9162, + "step": 22473 + }, + { + "epoch": 1.2237991192066053, + "grad_norm": 0.5519537101500993, + "learning_rate": 6.912849410192343e-05, + "loss": 11.8547, + "step": 22474 + }, + { + "epoch": 1.2238535732031883, + "grad_norm": 0.5261638613155791, + "learning_rate": 6.912010669543003e-05, + "loss": 11.9965, + "step": 22475 + }, + { + "epoch": 1.2239080271997713, + "grad_norm": 0.5554353324154613, + "learning_rate": 6.911171952906381e-05, + "loss": 11.9629, + "step": 22476 + }, + { + "epoch": 1.2239624811963543, + "grad_norm": 0.5097970921595514, + "learning_rate": 6.910333260289002e-05, + "loss": 12.0675, + "step": 22477 + }, + { + "epoch": 1.2240169351929373, + "grad_norm": 0.570485128699268, + "learning_rate": 6.909494591697391e-05, + "loss": 11.9794, + "step": 22478 + }, + { + "epoch": 1.2240713891895203, + "grad_norm": 0.5508712214127788, + "learning_rate": 6.908655947138069e-05, + "loss": 11.9146, + "step": 22479 + }, + { + "epoch": 1.2241258431861033, + "grad_norm": 0.583189624217757, + "learning_rate": 6.907817326617558e-05, + "loss": 12.0606, + "step": 22480 + }, + { + "epoch": 1.2241802971826863, + "grad_norm": 0.5304849872975674, + "learning_rate": 6.906978730142377e-05, + "loss": 11.942, + "step": 22481 + }, + { + "epoch": 1.2242347511792693, + "grad_norm": 0.5421844094714305, + "learning_rate": 6.906140157719048e-05, + "loss": 11.887, + "step": 22482 + }, + { + "epoch": 1.2242892051758525, + "grad_norm": 0.5693422574327641, + "learning_rate": 6.90530160935409e-05, + "loss": 11.9978, + "step": 22483 + }, + { + "epoch": 1.2243436591724355, + "grad_norm": 0.5567753428968648, + "learning_rate": 6.904463085054028e-05, + "loss": 11.9522, + "step": 22484 + }, + { + "epoch": 1.2243981131690185, + "grad_norm": 0.5357986277554075, + "learning_rate": 6.903624584825382e-05, + "loss": 12.0713, + "step": 22485 + }, + { + "epoch": 1.2244525671656015, + "grad_norm": 0.5721509995507145, + "learning_rate": 6.902786108674668e-05, + "loss": 12.022, + "step": 22486 + }, + { + "epoch": 1.2245070211621845, + "grad_norm": 0.5412860233253918, + "learning_rate": 6.901947656608409e-05, + "loss": 11.8419, + "step": 22487 + }, + { + "epoch": 1.2245614751587675, + "grad_norm": 0.5522774571518956, + "learning_rate": 6.901109228633123e-05, + "loss": 11.9114, + "step": 22488 + }, + { + "epoch": 1.2246159291553504, + "grad_norm": 0.5264038505000138, + "learning_rate": 6.900270824755333e-05, + "loss": 11.9053, + "step": 22489 + }, + { + "epoch": 1.2246703831519334, + "grad_norm": 0.5416035535578183, + "learning_rate": 6.899432444981555e-05, + "loss": 12.0428, + "step": 22490 + }, + { + "epoch": 1.2247248371485164, + "grad_norm": 0.5632192382301364, + "learning_rate": 6.898594089318313e-05, + "loss": 11.7867, + "step": 22491 + }, + { + "epoch": 1.2247792911450994, + "grad_norm": 0.5267971404154178, + "learning_rate": 6.897755757772122e-05, + "loss": 11.8979, + "step": 22492 + }, + { + "epoch": 1.2248337451416824, + "grad_norm": 0.5483848947228833, + "learning_rate": 6.896917450349502e-05, + "loss": 12.0011, + "step": 22493 + }, + { + "epoch": 1.2248881991382654, + "grad_norm": 0.5449668670560219, + "learning_rate": 6.896079167056973e-05, + "loss": 11.9352, + "step": 22494 + }, + { + "epoch": 1.2249426531348484, + "grad_norm": 0.6690299927337976, + "learning_rate": 6.895240907901056e-05, + "loss": 12.0275, + "step": 22495 + }, + { + "epoch": 1.2249971071314316, + "grad_norm": 0.5341100907560594, + "learning_rate": 6.894402672888263e-05, + "loss": 11.8851, + "step": 22496 + }, + { + "epoch": 1.2250515611280146, + "grad_norm": 0.6020366719678959, + "learning_rate": 6.893564462025116e-05, + "loss": 11.8098, + "step": 22497 + }, + { + "epoch": 1.2251060151245976, + "grad_norm": 0.6167733876103798, + "learning_rate": 6.892726275318133e-05, + "loss": 12.0455, + "step": 22498 + }, + { + "epoch": 1.2251604691211806, + "grad_norm": 0.5388056884271457, + "learning_rate": 6.891888112773828e-05, + "loss": 11.9592, + "step": 22499 + }, + { + "epoch": 1.2252149231177636, + "grad_norm": 0.552119657574981, + "learning_rate": 6.891049974398727e-05, + "loss": 11.9421, + "step": 22500 + }, + { + "epoch": 1.2252693771143466, + "grad_norm": 0.5919754129219378, + "learning_rate": 6.89021186019934e-05, + "loss": 11.9928, + "step": 22501 + }, + { + "epoch": 1.2253238311109296, + "grad_norm": 0.6203640101726013, + "learning_rate": 6.889373770182189e-05, + "loss": 12.0415, + "step": 22502 + }, + { + "epoch": 1.2253782851075126, + "grad_norm": 0.6101859476466426, + "learning_rate": 6.888535704353789e-05, + "loss": 12.0055, + "step": 22503 + }, + { + "epoch": 1.2254327391040956, + "grad_norm": 0.5680737713519539, + "learning_rate": 6.887697662720655e-05, + "loss": 11.971, + "step": 22504 + }, + { + "epoch": 1.2254871931006786, + "grad_norm": 0.5394158768390048, + "learning_rate": 6.886859645289312e-05, + "loss": 11.9425, + "step": 22505 + }, + { + "epoch": 1.2255416470972618, + "grad_norm": 0.5569811013637291, + "learning_rate": 6.886021652066266e-05, + "loss": 12.0418, + "step": 22506 + }, + { + "epoch": 1.2255961010938448, + "grad_norm": 0.4913095262555458, + "learning_rate": 6.885183683058037e-05, + "loss": 11.9588, + "step": 22507 + }, + { + "epoch": 1.2256505550904278, + "grad_norm": 0.5106557013586046, + "learning_rate": 6.884345738271144e-05, + "loss": 11.8817, + "step": 22508 + }, + { + "epoch": 1.2257050090870107, + "grad_norm": 0.5293005486103188, + "learning_rate": 6.883507817712099e-05, + "loss": 11.9469, + "step": 22509 + }, + { + "epoch": 1.2257594630835937, + "grad_norm": 0.5521016993068111, + "learning_rate": 6.882669921387419e-05, + "loss": 12.1155, + "step": 22510 + }, + { + "epoch": 1.2258139170801767, + "grad_norm": 0.5996763306083485, + "learning_rate": 6.881832049303622e-05, + "loss": 11.9832, + "step": 22511 + }, + { + "epoch": 1.2258683710767597, + "grad_norm": 0.49781124861363907, + "learning_rate": 6.880994201467224e-05, + "loss": 12.0111, + "step": 22512 + }, + { + "epoch": 1.2259228250733427, + "grad_norm": 0.568234994183213, + "learning_rate": 6.880156377884736e-05, + "loss": 12.0161, + "step": 22513 + }, + { + "epoch": 1.2259772790699257, + "grad_norm": 0.5972351646468848, + "learning_rate": 6.879318578562674e-05, + "loss": 11.9595, + "step": 22514 + }, + { + "epoch": 1.2260317330665087, + "grad_norm": 0.6252271107783179, + "learning_rate": 6.878480803507559e-05, + "loss": 12.1492, + "step": 22515 + }, + { + "epoch": 1.2260861870630917, + "grad_norm": 0.6310811557791951, + "learning_rate": 6.877643052725898e-05, + "loss": 11.9716, + "step": 22516 + }, + { + "epoch": 1.2261406410596747, + "grad_norm": 0.5824550513346468, + "learning_rate": 6.876805326224207e-05, + "loss": 12.0157, + "step": 22517 + }, + { + "epoch": 1.2261950950562577, + "grad_norm": 0.5659324624616336, + "learning_rate": 6.875967624009002e-05, + "loss": 11.9627, + "step": 22518 + }, + { + "epoch": 1.226249549052841, + "grad_norm": 0.5534296008852613, + "learning_rate": 6.875129946086797e-05, + "loss": 11.8946, + "step": 22519 + }, + { + "epoch": 1.2263040030494239, + "grad_norm": 0.5444844881677916, + "learning_rate": 6.874292292464105e-05, + "loss": 11.9176, + "step": 22520 + }, + { + "epoch": 1.2263584570460069, + "grad_norm": 0.5676441010144988, + "learning_rate": 6.87345466314744e-05, + "loss": 11.958, + "step": 22521 + }, + { + "epoch": 1.2264129110425899, + "grad_norm": 0.5618143800630517, + "learning_rate": 6.872617058143315e-05, + "loss": 11.8106, + "step": 22522 + }, + { + "epoch": 1.2264673650391729, + "grad_norm": 0.5839632123162697, + "learning_rate": 6.871779477458246e-05, + "loss": 11.8015, + "step": 22523 + }, + { + "epoch": 1.2265218190357559, + "grad_norm": 0.5790251698032695, + "learning_rate": 6.870941921098745e-05, + "loss": 11.9906, + "step": 22524 + }, + { + "epoch": 1.2265762730323388, + "grad_norm": 0.5921096649894817, + "learning_rate": 6.870104389071328e-05, + "loss": 12.1944, + "step": 22525 + }, + { + "epoch": 1.2266307270289218, + "grad_norm": 0.549904217895622, + "learning_rate": 6.869266881382497e-05, + "loss": 12.0126, + "step": 22526 + }, + { + "epoch": 1.2266851810255048, + "grad_norm": 0.5672576721021367, + "learning_rate": 6.868429398038774e-05, + "loss": 11.9439, + "step": 22527 + }, + { + "epoch": 1.2267396350220878, + "grad_norm": 0.5552647166083431, + "learning_rate": 6.86759193904667e-05, + "loss": 11.9606, + "step": 22528 + }, + { + "epoch": 1.2267940890186708, + "grad_norm": 0.5234237128835707, + "learning_rate": 6.866754504412696e-05, + "loss": 11.9319, + "step": 22529 + }, + { + "epoch": 1.226848543015254, + "grad_norm": 0.5150598652908518, + "learning_rate": 6.865917094143365e-05, + "loss": 11.8037, + "step": 22530 + }, + { + "epoch": 1.226902997011837, + "grad_norm": 0.5196617226235651, + "learning_rate": 6.865079708245188e-05, + "loss": 12.0576, + "step": 22531 + }, + { + "epoch": 1.22695745100842, + "grad_norm": 0.5557178074468829, + "learning_rate": 6.864242346724677e-05, + "loss": 11.9688, + "step": 22532 + }, + { + "epoch": 1.227011905005003, + "grad_norm": 0.5535336575135944, + "learning_rate": 6.86340500958834e-05, + "loss": 12.0447, + "step": 22533 + }, + { + "epoch": 1.227066359001586, + "grad_norm": 0.5763363214579711, + "learning_rate": 6.862567696842694e-05, + "loss": 12.0194, + "step": 22534 + }, + { + "epoch": 1.227120812998169, + "grad_norm": 0.569721398763426, + "learning_rate": 6.861730408494254e-05, + "loss": 12.0445, + "step": 22535 + }, + { + "epoch": 1.227175266994752, + "grad_norm": 0.5694185737185509, + "learning_rate": 6.860893144549519e-05, + "loss": 11.8702, + "step": 22536 + }, + { + "epoch": 1.227229720991335, + "grad_norm": 0.524479808326079, + "learning_rate": 6.860055905015003e-05, + "loss": 11.8846, + "step": 22537 + }, + { + "epoch": 1.227284174987918, + "grad_norm": 0.5434479798074577, + "learning_rate": 6.85921868989722e-05, + "loss": 11.9942, + "step": 22538 + }, + { + "epoch": 1.227338628984501, + "grad_norm": 0.5475471972796995, + "learning_rate": 6.858381499202682e-05, + "loss": 11.9149, + "step": 22539 + }, + { + "epoch": 1.227393082981084, + "grad_norm": 0.578268790355292, + "learning_rate": 6.857544332937894e-05, + "loss": 11.9532, + "step": 22540 + }, + { + "epoch": 1.227447536977667, + "grad_norm": 0.5972913130155101, + "learning_rate": 6.85670719110937e-05, + "loss": 12.0852, + "step": 22541 + }, + { + "epoch": 1.22750199097425, + "grad_norm": 0.5799477358588745, + "learning_rate": 6.855870073723616e-05, + "loss": 12.0768, + "step": 22542 + }, + { + "epoch": 1.2275564449708332, + "grad_norm": 0.6316007411233439, + "learning_rate": 6.855032980787146e-05, + "loss": 12.0178, + "step": 22543 + }, + { + "epoch": 1.2276108989674162, + "grad_norm": 0.5529893064266413, + "learning_rate": 6.854195912306463e-05, + "loss": 11.7676, + "step": 22544 + }, + { + "epoch": 1.2276653529639991, + "grad_norm": 0.5453360472387511, + "learning_rate": 6.853358868288082e-05, + "loss": 11.8291, + "step": 22545 + }, + { + "epoch": 1.2277198069605821, + "grad_norm": 0.5922061215016341, + "learning_rate": 6.852521848738515e-05, + "loss": 11.9044, + "step": 22546 + }, + { + "epoch": 1.2277742609571651, + "grad_norm": 0.5260442546307095, + "learning_rate": 6.851684853664262e-05, + "loss": 11.9625, + "step": 22547 + }, + { + "epoch": 1.2278287149537481, + "grad_norm": 0.6544268062678831, + "learning_rate": 6.850847883071832e-05, + "loss": 11.9651, + "step": 22548 + }, + { + "epoch": 1.2278831689503311, + "grad_norm": 0.5514728833294227, + "learning_rate": 6.850010936967742e-05, + "loss": 11.8761, + "step": 22549 + }, + { + "epoch": 1.227937622946914, + "grad_norm": 0.534652900630826, + "learning_rate": 6.849174015358493e-05, + "loss": 11.9961, + "step": 22550 + }, + { + "epoch": 1.227992076943497, + "grad_norm": 0.5620353073739606, + "learning_rate": 6.848337118250596e-05, + "loss": 12.0729, + "step": 22551 + }, + { + "epoch": 1.22804653094008, + "grad_norm": 0.5092306169240086, + "learning_rate": 6.847500245650558e-05, + "loss": 11.8961, + "step": 22552 + }, + { + "epoch": 1.2281009849366633, + "grad_norm": 0.5395441372993375, + "learning_rate": 6.846663397564885e-05, + "loss": 11.9984, + "step": 22553 + }, + { + "epoch": 1.2281554389332463, + "grad_norm": 0.555298253458449, + "learning_rate": 6.845826574000089e-05, + "loss": 11.9878, + "step": 22554 + }, + { + "epoch": 1.2282098929298293, + "grad_norm": 0.5385689252278301, + "learning_rate": 6.844989774962671e-05, + "loss": 11.8666, + "step": 22555 + }, + { + "epoch": 1.2282643469264123, + "grad_norm": 0.6204148714636181, + "learning_rate": 6.844153000459147e-05, + "loss": 11.9007, + "step": 22556 + }, + { + "epoch": 1.2283188009229953, + "grad_norm": 0.5367444221664323, + "learning_rate": 6.843316250496017e-05, + "loss": 11.9166, + "step": 22557 + }, + { + "epoch": 1.2283732549195783, + "grad_norm": 0.5213410503064586, + "learning_rate": 6.842479525079787e-05, + "loss": 12.0957, + "step": 22558 + }, + { + "epoch": 1.2284277089161613, + "grad_norm": 0.5080702699495391, + "learning_rate": 6.841642824216965e-05, + "loss": 11.8988, + "step": 22559 + }, + { + "epoch": 1.2284821629127443, + "grad_norm": 0.48398772506905424, + "learning_rate": 6.840806147914058e-05, + "loss": 11.9082, + "step": 22560 + }, + { + "epoch": 1.2285366169093273, + "grad_norm": 0.5325699704306123, + "learning_rate": 6.839969496177574e-05, + "loss": 11.9781, + "step": 22561 + }, + { + "epoch": 1.2285910709059102, + "grad_norm": 0.5933961282585728, + "learning_rate": 6.839132869014017e-05, + "loss": 12.106, + "step": 22562 + }, + { + "epoch": 1.2286455249024932, + "grad_norm": 0.5650542432538881, + "learning_rate": 6.838296266429893e-05, + "loss": 11.9166, + "step": 22563 + }, + { + "epoch": 1.2286999788990762, + "grad_norm": 0.5643171401506263, + "learning_rate": 6.837459688431707e-05, + "loss": 11.9447, + "step": 22564 + }, + { + "epoch": 1.2287544328956592, + "grad_norm": 0.5603946865686427, + "learning_rate": 6.836623135025964e-05, + "loss": 11.9263, + "step": 22565 + }, + { + "epoch": 1.2288088868922424, + "grad_norm": 0.5456345787971714, + "learning_rate": 6.835786606219174e-05, + "loss": 12.0358, + "step": 22566 + }, + { + "epoch": 1.2288633408888254, + "grad_norm": 0.535565212057959, + "learning_rate": 6.834950102017834e-05, + "loss": 11.9318, + "step": 22567 + }, + { + "epoch": 1.2289177948854084, + "grad_norm": 0.610236385075074, + "learning_rate": 6.834113622428455e-05, + "loss": 11.9999, + "step": 22568 + }, + { + "epoch": 1.2289722488819914, + "grad_norm": 0.579675211951449, + "learning_rate": 6.833277167457536e-05, + "loss": 12.0322, + "step": 22569 + }, + { + "epoch": 1.2290267028785744, + "grad_norm": 0.5672237234485954, + "learning_rate": 6.832440737111589e-05, + "loss": 11.8957, + "step": 22570 + }, + { + "epoch": 1.2290811568751574, + "grad_norm": 0.6164650484087583, + "learning_rate": 6.831604331397109e-05, + "loss": 11.9038, + "step": 22571 + }, + { + "epoch": 1.2291356108717404, + "grad_norm": 0.5123542532218338, + "learning_rate": 6.830767950320608e-05, + "loss": 11.9202, + "step": 22572 + }, + { + "epoch": 1.2291900648683234, + "grad_norm": 0.6149653410400626, + "learning_rate": 6.829931593888588e-05, + "loss": 11.9309, + "step": 22573 + }, + { + "epoch": 1.2292445188649064, + "grad_norm": 0.49101752482147454, + "learning_rate": 6.82909526210755e-05, + "loss": 11.8955, + "step": 22574 + }, + { + "epoch": 1.2292989728614894, + "grad_norm": 0.5521005653498695, + "learning_rate": 6.828258954984e-05, + "loss": 11.9297, + "step": 22575 + }, + { + "epoch": 1.2293534268580726, + "grad_norm": 0.5256936737272673, + "learning_rate": 6.82742267252444e-05, + "loss": 11.9761, + "step": 22576 + }, + { + "epoch": 1.2294078808546556, + "grad_norm": 0.5728446845923955, + "learning_rate": 6.826586414735374e-05, + "loss": 11.9677, + "step": 22577 + }, + { + "epoch": 1.2294623348512386, + "grad_norm": 0.5371594106111425, + "learning_rate": 6.825750181623302e-05, + "loss": 12.0253, + "step": 22578 + }, + { + "epoch": 1.2295167888478216, + "grad_norm": 0.5341225782071433, + "learning_rate": 6.82491397319473e-05, + "loss": 11.9273, + "step": 22579 + }, + { + "epoch": 1.2295712428444046, + "grad_norm": 0.5874889172319521, + "learning_rate": 6.82407778945616e-05, + "loss": 11.8723, + "step": 22580 + }, + { + "epoch": 1.2296256968409875, + "grad_norm": 0.5269104334742466, + "learning_rate": 6.823241630414095e-05, + "loss": 11.9875, + "step": 22581 + }, + { + "epoch": 1.2296801508375705, + "grad_norm": 0.5422815228069731, + "learning_rate": 6.822405496075031e-05, + "loss": 12.0433, + "step": 22582 + }, + { + "epoch": 1.2297346048341535, + "grad_norm": 0.568900395477977, + "learning_rate": 6.821569386445478e-05, + "loss": 11.9877, + "step": 22583 + }, + { + "epoch": 1.2297890588307365, + "grad_norm": 0.48023974666047436, + "learning_rate": 6.820733301531935e-05, + "loss": 11.8652, + "step": 22584 + }, + { + "epoch": 1.2298435128273195, + "grad_norm": 0.5149114235042755, + "learning_rate": 6.819897241340902e-05, + "loss": 11.9277, + "step": 22585 + }, + { + "epoch": 1.2298979668239025, + "grad_norm": 0.5066632032578524, + "learning_rate": 6.819061205878883e-05, + "loss": 11.9283, + "step": 22586 + }, + { + "epoch": 1.2299524208204855, + "grad_norm": 0.5299972732180512, + "learning_rate": 6.818225195152376e-05, + "loss": 11.9157, + "step": 22587 + }, + { + "epoch": 1.2300068748170685, + "grad_norm": 0.4802202635590141, + "learning_rate": 6.817389209167883e-05, + "loss": 11.8945, + "step": 22588 + }, + { + "epoch": 1.2300613288136517, + "grad_norm": 0.5332295352909948, + "learning_rate": 6.816553247931907e-05, + "loss": 11.9258, + "step": 22589 + }, + { + "epoch": 1.2301157828102347, + "grad_norm": 0.5719248548778227, + "learning_rate": 6.815717311450947e-05, + "loss": 11.8852, + "step": 22590 + }, + { + "epoch": 1.2301702368068177, + "grad_norm": 0.6258691862808952, + "learning_rate": 6.814881399731501e-05, + "loss": 12.0193, + "step": 22591 + }, + { + "epoch": 1.2302246908034007, + "grad_norm": 0.5201591560747867, + "learning_rate": 6.814045512780072e-05, + "loss": 11.9122, + "step": 22592 + }, + { + "epoch": 1.2302791447999837, + "grad_norm": 0.5890835281633671, + "learning_rate": 6.813209650603158e-05, + "loss": 11.9035, + "step": 22593 + }, + { + "epoch": 1.2303335987965667, + "grad_norm": 0.6592927364606244, + "learning_rate": 6.812373813207262e-05, + "loss": 11.9232, + "step": 22594 + }, + { + "epoch": 1.2303880527931497, + "grad_norm": 0.5377431673788448, + "learning_rate": 6.81153800059888e-05, + "loss": 11.8767, + "step": 22595 + }, + { + "epoch": 1.2304425067897327, + "grad_norm": 0.4924899511594738, + "learning_rate": 6.810702212784521e-05, + "loss": 11.8827, + "step": 22596 + }, + { + "epoch": 1.2304969607863157, + "grad_norm": 0.5719182262177606, + "learning_rate": 6.809866449770668e-05, + "loss": 11.8717, + "step": 22597 + }, + { + "epoch": 1.2305514147828986, + "grad_norm": 0.5529520024374325, + "learning_rate": 6.809030711563831e-05, + "loss": 11.9094, + "step": 22598 + }, + { + "epoch": 1.2306058687794816, + "grad_norm": 0.5762758018285343, + "learning_rate": 6.808194998170505e-05, + "loss": 11.9475, + "step": 22599 + }, + { + "epoch": 1.2306603227760649, + "grad_norm": 0.4990905240149167, + "learning_rate": 6.807359309597192e-05, + "loss": 11.9709, + "step": 22600 + }, + { + "epoch": 1.2307147767726478, + "grad_norm": 0.5124147148330038, + "learning_rate": 6.806523645850387e-05, + "loss": 11.9831, + "step": 22601 + }, + { + "epoch": 1.2307692307692308, + "grad_norm": 0.5695992152209984, + "learning_rate": 6.805688006936591e-05, + "loss": 11.8572, + "step": 22602 + }, + { + "epoch": 1.2308236847658138, + "grad_norm": 0.6755224310305814, + "learning_rate": 6.804852392862299e-05, + "loss": 11.8961, + "step": 22603 + }, + { + "epoch": 1.2308781387623968, + "grad_norm": 0.546045519852647, + "learning_rate": 6.80401680363401e-05, + "loss": 12.0278, + "step": 22604 + }, + { + "epoch": 1.2309325927589798, + "grad_norm": 0.5436839481613359, + "learning_rate": 6.803181239258223e-05, + "loss": 11.9962, + "step": 22605 + }, + { + "epoch": 1.2309870467555628, + "grad_norm": 0.5958397610471744, + "learning_rate": 6.80234569974144e-05, + "loss": 12.0153, + "step": 22606 + }, + { + "epoch": 1.2310415007521458, + "grad_norm": 0.5994537104580747, + "learning_rate": 6.801510185090148e-05, + "loss": 12.0225, + "step": 22607 + }, + { + "epoch": 1.2310959547487288, + "grad_norm": 0.5257403281060432, + "learning_rate": 6.800674695310848e-05, + "loss": 11.9313, + "step": 22608 + }, + { + "epoch": 1.2311504087453118, + "grad_norm": 0.5503936262250163, + "learning_rate": 6.79983923041004e-05, + "loss": 12.1216, + "step": 22609 + }, + { + "epoch": 1.2312048627418948, + "grad_norm": 0.515757488444049, + "learning_rate": 6.799003790394218e-05, + "loss": 11.9001, + "step": 22610 + }, + { + "epoch": 1.2312593167384778, + "grad_norm": 0.6445319175777956, + "learning_rate": 6.798168375269881e-05, + "loss": 11.8449, + "step": 22611 + }, + { + "epoch": 1.2313137707350608, + "grad_norm": 0.6110454760683547, + "learning_rate": 6.797332985043524e-05, + "loss": 12.0362, + "step": 22612 + }, + { + "epoch": 1.231368224731644, + "grad_norm": 0.5364761671587409, + "learning_rate": 6.796497619721641e-05, + "loss": 11.9972, + "step": 22613 + }, + { + "epoch": 1.231422678728227, + "grad_norm": 0.6156344582695163, + "learning_rate": 6.79566227931073e-05, + "loss": 11.9898, + "step": 22614 + }, + { + "epoch": 1.23147713272481, + "grad_norm": 0.5757783441910294, + "learning_rate": 6.794826963817284e-05, + "loss": 12.0149, + "step": 22615 + }, + { + "epoch": 1.231531586721393, + "grad_norm": 0.5549759110624761, + "learning_rate": 6.793991673247808e-05, + "loss": 11.9959, + "step": 22616 + }, + { + "epoch": 1.231586040717976, + "grad_norm": 0.4844406136239094, + "learning_rate": 6.793156407608788e-05, + "loss": 11.8264, + "step": 22617 + }, + { + "epoch": 1.231640494714559, + "grad_norm": 0.5388303063570763, + "learning_rate": 6.792321166906718e-05, + "loss": 11.9343, + "step": 22618 + }, + { + "epoch": 1.231694948711142, + "grad_norm": 0.5448056529738672, + "learning_rate": 6.791485951148098e-05, + "loss": 11.9791, + "step": 22619 + }, + { + "epoch": 1.231749402707725, + "grad_norm": 0.5034495799225918, + "learning_rate": 6.79065076033942e-05, + "loss": 11.9938, + "step": 22620 + }, + { + "epoch": 1.231803856704308, + "grad_norm": 0.5739551857827964, + "learning_rate": 6.789815594487181e-05, + "loss": 11.9997, + "step": 22621 + }, + { + "epoch": 1.231858310700891, + "grad_norm": 0.5570374439432412, + "learning_rate": 6.788980453597874e-05, + "loss": 11.9231, + "step": 22622 + }, + { + "epoch": 1.2319127646974741, + "grad_norm": 0.5414628401592906, + "learning_rate": 6.788145337677996e-05, + "loss": 12.04, + "step": 22623 + }, + { + "epoch": 1.2319672186940571, + "grad_norm": 0.592957815956549, + "learning_rate": 6.787310246734036e-05, + "loss": 12.0072, + "step": 22624 + }, + { + "epoch": 1.2320216726906401, + "grad_norm": 0.5401153412064662, + "learning_rate": 6.786475180772492e-05, + "loss": 12.0457, + "step": 22625 + }, + { + "epoch": 1.232076126687223, + "grad_norm": 0.5082280335715815, + "learning_rate": 6.785640139799856e-05, + "loss": 11.9912, + "step": 22626 + }, + { + "epoch": 1.232130580683806, + "grad_norm": 0.6033043900846138, + "learning_rate": 6.78480512382262e-05, + "loss": 12.0248, + "step": 22627 + }, + { + "epoch": 1.232185034680389, + "grad_norm": 0.5695370069701157, + "learning_rate": 6.78397013284728e-05, + "loss": 11.861, + "step": 22628 + }, + { + "epoch": 1.232239488676972, + "grad_norm": 0.5128287328816599, + "learning_rate": 6.783135166880326e-05, + "loss": 11.9754, + "step": 22629 + }, + { + "epoch": 1.232293942673555, + "grad_norm": 0.5195913340414557, + "learning_rate": 6.782300225928253e-05, + "loss": 11.8701, + "step": 22630 + }, + { + "epoch": 1.232348396670138, + "grad_norm": 0.5027984525122968, + "learning_rate": 6.78146530999755e-05, + "loss": 11.8952, + "step": 22631 + }, + { + "epoch": 1.232402850666721, + "grad_norm": 0.5218699466640082, + "learning_rate": 6.780630419094715e-05, + "loss": 11.858, + "step": 22632 + }, + { + "epoch": 1.232457304663304, + "grad_norm": 0.5555496894314489, + "learning_rate": 6.779795553226239e-05, + "loss": 11.9742, + "step": 22633 + }, + { + "epoch": 1.232511758659887, + "grad_norm": 0.5483631381595321, + "learning_rate": 6.778960712398612e-05, + "loss": 11.8262, + "step": 22634 + }, + { + "epoch": 1.23256621265647, + "grad_norm": 0.6204895092534077, + "learning_rate": 6.778125896618326e-05, + "loss": 12.127, + "step": 22635 + }, + { + "epoch": 1.2326206666530533, + "grad_norm": 0.5007419609052064, + "learning_rate": 6.777291105891876e-05, + "loss": 11.8728, + "step": 22636 + }, + { + "epoch": 1.2326751206496362, + "grad_norm": 0.5545802228894495, + "learning_rate": 6.77645634022575e-05, + "loss": 11.7296, + "step": 22637 + }, + { + "epoch": 1.2327295746462192, + "grad_norm": 0.5471509780195176, + "learning_rate": 6.775621599626438e-05, + "loss": 12.0449, + "step": 22638 + }, + { + "epoch": 1.2327840286428022, + "grad_norm": 0.5134534463126355, + "learning_rate": 6.774786884100435e-05, + "loss": 11.8806, + "step": 22639 + }, + { + "epoch": 1.2328384826393852, + "grad_norm": 0.5137325719406078, + "learning_rate": 6.773952193654228e-05, + "loss": 11.828, + "step": 22640 + }, + { + "epoch": 1.2328929366359682, + "grad_norm": 0.5495636242292773, + "learning_rate": 6.77311752829431e-05, + "loss": 12.1211, + "step": 22641 + }, + { + "epoch": 1.2329473906325512, + "grad_norm": 0.4992174580434379, + "learning_rate": 6.772282888027171e-05, + "loss": 11.7942, + "step": 22642 + }, + { + "epoch": 1.2330018446291342, + "grad_norm": 0.5063635264465519, + "learning_rate": 6.771448272859302e-05, + "loss": 11.9813, + "step": 22643 + }, + { + "epoch": 1.2330562986257172, + "grad_norm": 0.5161442399417954, + "learning_rate": 6.770613682797193e-05, + "loss": 11.9526, + "step": 22644 + }, + { + "epoch": 1.2331107526223002, + "grad_norm": 0.5498698779672171, + "learning_rate": 6.769779117847335e-05, + "loss": 11.9349, + "step": 22645 + }, + { + "epoch": 1.2331652066188834, + "grad_norm": 0.5908461002668517, + "learning_rate": 6.768944578016217e-05, + "loss": 12.0561, + "step": 22646 + }, + { + "epoch": 1.2332196606154664, + "grad_norm": 0.5286359439660577, + "learning_rate": 6.768110063310325e-05, + "loss": 11.79, + "step": 22647 + }, + { + "epoch": 1.2332741146120494, + "grad_norm": 0.5591648237871142, + "learning_rate": 6.767275573736152e-05, + "loss": 11.9224, + "step": 22648 + }, + { + "epoch": 1.2333285686086324, + "grad_norm": 0.5916860885641544, + "learning_rate": 6.766441109300187e-05, + "loss": 12.0007, + "step": 22649 + }, + { + "epoch": 1.2333830226052154, + "grad_norm": 0.5530588051323196, + "learning_rate": 6.765606670008917e-05, + "loss": 11.9313, + "step": 22650 + }, + { + "epoch": 1.2334374766017984, + "grad_norm": 0.5291355069703004, + "learning_rate": 6.764772255868834e-05, + "loss": 12.0127, + "step": 22651 + }, + { + "epoch": 1.2334919305983814, + "grad_norm": 0.5586532283395358, + "learning_rate": 6.763937866886422e-05, + "loss": 11.9683, + "step": 22652 + }, + { + "epoch": 1.2335463845949644, + "grad_norm": 0.631528253563068, + "learning_rate": 6.763103503068171e-05, + "loss": 11.8984, + "step": 22653 + }, + { + "epoch": 1.2336008385915473, + "grad_norm": 0.5216279564163208, + "learning_rate": 6.762269164420572e-05, + "loss": 11.8951, + "step": 22654 + }, + { + "epoch": 1.2336552925881303, + "grad_norm": 0.5698621996284772, + "learning_rate": 6.76143485095011e-05, + "loss": 11.7775, + "step": 22655 + }, + { + "epoch": 1.2337097465847133, + "grad_norm": 0.5700367434440197, + "learning_rate": 6.760600562663279e-05, + "loss": 11.982, + "step": 22656 + }, + { + "epoch": 1.2337642005812963, + "grad_norm": 0.5284410018572115, + "learning_rate": 6.759766299566554e-05, + "loss": 11.8915, + "step": 22657 + }, + { + "epoch": 1.2338186545778793, + "grad_norm": 0.587121349461119, + "learning_rate": 6.758932061666431e-05, + "loss": 12.0293, + "step": 22658 + }, + { + "epoch": 1.2338731085744625, + "grad_norm": 0.5086158498825752, + "learning_rate": 6.758097848969397e-05, + "loss": 11.8235, + "step": 22659 + }, + { + "epoch": 1.2339275625710455, + "grad_norm": 0.5458134683974727, + "learning_rate": 6.75726366148194e-05, + "loss": 11.7725, + "step": 22660 + }, + { + "epoch": 1.2339820165676285, + "grad_norm": 0.516343793279754, + "learning_rate": 6.756429499210541e-05, + "loss": 11.9901, + "step": 22661 + }, + { + "epoch": 1.2340364705642115, + "grad_norm": 0.5301177127299678, + "learning_rate": 6.755595362161693e-05, + "loss": 11.984, + "step": 22662 + }, + { + "epoch": 1.2340909245607945, + "grad_norm": 0.5508190832135074, + "learning_rate": 6.754761250341879e-05, + "loss": 11.9713, + "step": 22663 + }, + { + "epoch": 1.2341453785573775, + "grad_norm": 0.5372543443815975, + "learning_rate": 6.753927163757581e-05, + "loss": 11.9204, + "step": 22664 + }, + { + "epoch": 1.2341998325539605, + "grad_norm": 0.5476867763403248, + "learning_rate": 6.753093102415295e-05, + "loss": 11.9879, + "step": 22665 + }, + { + "epoch": 1.2342542865505435, + "grad_norm": 0.5621986188289442, + "learning_rate": 6.752259066321504e-05, + "loss": 12.0198, + "step": 22666 + }, + { + "epoch": 1.2343087405471265, + "grad_norm": 0.5251636266109724, + "learning_rate": 6.751425055482688e-05, + "loss": 12.0163, + "step": 22667 + }, + { + "epoch": 1.2343631945437095, + "grad_norm": 0.5277914289651588, + "learning_rate": 6.750591069905334e-05, + "loss": 11.8987, + "step": 22668 + }, + { + "epoch": 1.2344176485402927, + "grad_norm": 0.5784950361547624, + "learning_rate": 6.74975710959593e-05, + "loss": 11.8538, + "step": 22669 + }, + { + "epoch": 1.2344721025368757, + "grad_norm": 0.5752232794584748, + "learning_rate": 6.748923174560958e-05, + "loss": 11.9443, + "step": 22670 + }, + { + "epoch": 1.2345265565334587, + "grad_norm": 0.4871636448689952, + "learning_rate": 6.748089264806907e-05, + "loss": 11.984, + "step": 22671 + }, + { + "epoch": 1.2345810105300417, + "grad_norm": 0.5971699616649676, + "learning_rate": 6.747255380340257e-05, + "loss": 11.897, + "step": 22672 + }, + { + "epoch": 1.2346354645266246, + "grad_norm": 0.5349004302448748, + "learning_rate": 6.746421521167497e-05, + "loss": 12.0665, + "step": 22673 + }, + { + "epoch": 1.2346899185232076, + "grad_norm": 0.5500605269688773, + "learning_rate": 6.745587687295108e-05, + "loss": 11.8755, + "step": 22674 + }, + { + "epoch": 1.2347443725197906, + "grad_norm": 0.590098038149082, + "learning_rate": 6.744753878729574e-05, + "loss": 11.855, + "step": 22675 + }, + { + "epoch": 1.2347988265163736, + "grad_norm": 0.524694545292558, + "learning_rate": 6.743920095477383e-05, + "loss": 11.8882, + "step": 22676 + }, + { + "epoch": 1.2348532805129566, + "grad_norm": 0.5093109039751735, + "learning_rate": 6.743086337545012e-05, + "loss": 11.7996, + "step": 22677 + }, + { + "epoch": 1.2349077345095396, + "grad_norm": 0.5393763795827721, + "learning_rate": 6.742252604938949e-05, + "loss": 11.9541, + "step": 22678 + }, + { + "epoch": 1.2349621885061226, + "grad_norm": 0.5886730619525234, + "learning_rate": 6.741418897665675e-05, + "loss": 11.9881, + "step": 22679 + }, + { + "epoch": 1.2350166425027056, + "grad_norm": 0.5932752272406046, + "learning_rate": 6.740585215731674e-05, + "loss": 12.0575, + "step": 22680 + }, + { + "epoch": 1.2350710964992886, + "grad_norm": 0.5251342499524592, + "learning_rate": 6.73975155914343e-05, + "loss": 12.0279, + "step": 22681 + }, + { + "epoch": 1.2351255504958716, + "grad_norm": 0.5798823011980077, + "learning_rate": 6.738917927907424e-05, + "loss": 11.9904, + "step": 22682 + }, + { + "epoch": 1.2351800044924548, + "grad_norm": 0.5514062392490326, + "learning_rate": 6.73808432203014e-05, + "loss": 11.9444, + "step": 22683 + }, + { + "epoch": 1.2352344584890378, + "grad_norm": 0.5154261983976824, + "learning_rate": 6.737250741518058e-05, + "loss": 12.0411, + "step": 22684 + }, + { + "epoch": 1.2352889124856208, + "grad_norm": 0.5192587201413797, + "learning_rate": 6.736417186377663e-05, + "loss": 11.9051, + "step": 22685 + }, + { + "epoch": 1.2353433664822038, + "grad_norm": 0.5458591462569395, + "learning_rate": 6.735583656615434e-05, + "loss": 11.8986, + "step": 22686 + }, + { + "epoch": 1.2353978204787868, + "grad_norm": 0.5108083682756132, + "learning_rate": 6.734750152237856e-05, + "loss": 11.988, + "step": 22687 + }, + { + "epoch": 1.2354522744753698, + "grad_norm": 0.5506524263833593, + "learning_rate": 6.733916673251408e-05, + "loss": 12.0789, + "step": 22688 + }, + { + "epoch": 1.2355067284719528, + "grad_norm": 0.5131917728864864, + "learning_rate": 6.73308321966257e-05, + "loss": 11.963, + "step": 22689 + }, + { + "epoch": 1.2355611824685357, + "grad_norm": 0.5439987044445266, + "learning_rate": 6.732249791477825e-05, + "loss": 11.9902, + "step": 22690 + }, + { + "epoch": 1.2356156364651187, + "grad_norm": 0.5099899001935355, + "learning_rate": 6.731416388703652e-05, + "loss": 12.0014, + "step": 22691 + }, + { + "epoch": 1.2356700904617017, + "grad_norm": 0.5273773552932229, + "learning_rate": 6.730583011346536e-05, + "loss": 11.9103, + "step": 22692 + }, + { + "epoch": 1.235724544458285, + "grad_norm": 0.530539161167646, + "learning_rate": 6.729749659412954e-05, + "loss": 11.9131, + "step": 22693 + }, + { + "epoch": 1.235778998454868, + "grad_norm": 0.500679073640153, + "learning_rate": 6.728916332909387e-05, + "loss": 11.9508, + "step": 22694 + }, + { + "epoch": 1.235833452451451, + "grad_norm": 0.5657946855188445, + "learning_rate": 6.728083031842315e-05, + "loss": 11.8095, + "step": 22695 + }, + { + "epoch": 1.235887906448034, + "grad_norm": 0.5454305197094377, + "learning_rate": 6.727249756218219e-05, + "loss": 11.8639, + "step": 22696 + }, + { + "epoch": 1.235942360444617, + "grad_norm": 0.5170051698920389, + "learning_rate": 6.726416506043578e-05, + "loss": 11.9299, + "step": 22697 + }, + { + "epoch": 1.2359968144412, + "grad_norm": 0.5403322015639688, + "learning_rate": 6.725583281324871e-05, + "loss": 11.993, + "step": 22698 + }, + { + "epoch": 1.236051268437783, + "grad_norm": 0.5114468184722819, + "learning_rate": 6.724750082068576e-05, + "loss": 11.9085, + "step": 22699 + }, + { + "epoch": 1.236105722434366, + "grad_norm": 0.5425623936775001, + "learning_rate": 6.723916908281174e-05, + "loss": 11.9386, + "step": 22700 + }, + { + "epoch": 1.2361601764309489, + "grad_norm": 0.5399263853903555, + "learning_rate": 6.723083759969145e-05, + "loss": 11.9577, + "step": 22701 + }, + { + "epoch": 1.2362146304275319, + "grad_norm": 0.5799004326323731, + "learning_rate": 6.722250637138963e-05, + "loss": 11.8526, + "step": 22702 + }, + { + "epoch": 1.2362690844241149, + "grad_norm": 0.5418963677157963, + "learning_rate": 6.721417539797113e-05, + "loss": 11.9665, + "step": 22703 + }, + { + "epoch": 1.2363235384206979, + "grad_norm": 0.5320204572101112, + "learning_rate": 6.720584467950068e-05, + "loss": 11.8885, + "step": 22704 + }, + { + "epoch": 1.2363779924172809, + "grad_norm": 0.5013538756593339, + "learning_rate": 6.719751421604309e-05, + "loss": 12.0037, + "step": 22705 + }, + { + "epoch": 1.236432446413864, + "grad_norm": 0.6402306975556139, + "learning_rate": 6.718918400766312e-05, + "loss": 11.9172, + "step": 22706 + }, + { + "epoch": 1.236486900410447, + "grad_norm": 0.5376631435166885, + "learning_rate": 6.71808540544256e-05, + "loss": 11.9102, + "step": 22707 + }, + { + "epoch": 1.23654135440703, + "grad_norm": 0.5384022363175526, + "learning_rate": 6.717252435639523e-05, + "loss": 11.887, + "step": 22708 + }, + { + "epoch": 1.236595808403613, + "grad_norm": 0.5905386507610483, + "learning_rate": 6.716419491363681e-05, + "loss": 12.0708, + "step": 22709 + }, + { + "epoch": 1.236650262400196, + "grad_norm": 0.5622731857639514, + "learning_rate": 6.715586572621512e-05, + "loss": 11.9543, + "step": 22710 + }, + { + "epoch": 1.236704716396779, + "grad_norm": 0.5143368789981935, + "learning_rate": 6.714753679419495e-05, + "loss": 11.9082, + "step": 22711 + }, + { + "epoch": 1.236759170393362, + "grad_norm": 0.5170015509220544, + "learning_rate": 6.713920811764101e-05, + "loss": 11.7931, + "step": 22712 + }, + { + "epoch": 1.236813624389945, + "grad_norm": 0.5738164528129192, + "learning_rate": 6.713087969661808e-05, + "loss": 11.9441, + "step": 22713 + }, + { + "epoch": 1.236868078386528, + "grad_norm": 0.5163665854909195, + "learning_rate": 6.712255153119098e-05, + "loss": 12.0418, + "step": 22714 + }, + { + "epoch": 1.236922532383111, + "grad_norm": 0.5418832478842345, + "learning_rate": 6.711422362142443e-05, + "loss": 11.9186, + "step": 22715 + }, + { + "epoch": 1.2369769863796942, + "grad_norm": 0.5630243743266894, + "learning_rate": 6.710589596738319e-05, + "loss": 11.8954, + "step": 22716 + }, + { + "epoch": 1.2370314403762772, + "grad_norm": 0.5540732424339679, + "learning_rate": 6.709756856913203e-05, + "loss": 12.0347, + "step": 22717 + }, + { + "epoch": 1.2370858943728602, + "grad_norm": 0.5835943891047622, + "learning_rate": 6.70892414267357e-05, + "loss": 11.9687, + "step": 22718 + }, + { + "epoch": 1.2371403483694432, + "grad_norm": 0.638789111867418, + "learning_rate": 6.708091454025891e-05, + "loss": 12.0823, + "step": 22719 + }, + { + "epoch": 1.2371948023660262, + "grad_norm": 0.5060120714308841, + "learning_rate": 6.707258790976647e-05, + "loss": 11.8976, + "step": 22720 + }, + { + "epoch": 1.2372492563626092, + "grad_norm": 0.5404010664963349, + "learning_rate": 6.706426153532311e-05, + "loss": 11.9189, + "step": 22721 + }, + { + "epoch": 1.2373037103591922, + "grad_norm": 0.5313752114208501, + "learning_rate": 6.705593541699358e-05, + "loss": 11.8907, + "step": 22722 + }, + { + "epoch": 1.2373581643557752, + "grad_norm": 0.5405744362055556, + "learning_rate": 6.704760955484262e-05, + "loss": 11.9977, + "step": 22723 + }, + { + "epoch": 1.2374126183523582, + "grad_norm": 0.5053750280482343, + "learning_rate": 6.703928394893496e-05, + "loss": 11.916, + "step": 22724 + }, + { + "epoch": 1.2374670723489412, + "grad_norm": 0.6332462231421013, + "learning_rate": 6.703095859933534e-05, + "loss": 12.0191, + "step": 22725 + }, + { + "epoch": 1.2375215263455241, + "grad_norm": 0.5620715208301164, + "learning_rate": 6.702263350610853e-05, + "loss": 11.9745, + "step": 22726 + }, + { + "epoch": 1.2375759803421071, + "grad_norm": 0.523419185547234, + "learning_rate": 6.70143086693193e-05, + "loss": 12.0299, + "step": 22727 + }, + { + "epoch": 1.2376304343386901, + "grad_norm": 0.536929624789274, + "learning_rate": 6.700598408903231e-05, + "loss": 12.017, + "step": 22728 + }, + { + "epoch": 1.2376848883352733, + "grad_norm": 0.5240904329169529, + "learning_rate": 6.69976597653123e-05, + "loss": 12.0377, + "step": 22729 + }, + { + "epoch": 1.2377393423318563, + "grad_norm": 0.5417193831271322, + "learning_rate": 6.698933569822401e-05, + "loss": 11.9908, + "step": 22730 + }, + { + "epoch": 1.2377937963284393, + "grad_norm": 0.5075968817592331, + "learning_rate": 6.698101188783222e-05, + "loss": 12.0386, + "step": 22731 + }, + { + "epoch": 1.2378482503250223, + "grad_norm": 0.5439818920805917, + "learning_rate": 6.697268833420159e-05, + "loss": 11.9398, + "step": 22732 + }, + { + "epoch": 1.2379027043216053, + "grad_norm": 0.540981612610113, + "learning_rate": 6.69643650373969e-05, + "loss": 11.966, + "step": 22733 + }, + { + "epoch": 1.2379571583181883, + "grad_norm": 0.5410089949216403, + "learning_rate": 6.695604199748282e-05, + "loss": 11.9297, + "step": 22734 + }, + { + "epoch": 1.2380116123147713, + "grad_norm": 0.5080515709221153, + "learning_rate": 6.694771921452411e-05, + "loss": 11.9432, + "step": 22735 + }, + { + "epoch": 1.2380660663113543, + "grad_norm": 0.5701795735195642, + "learning_rate": 6.693939668858547e-05, + "loss": 11.9448, + "step": 22736 + }, + { + "epoch": 1.2381205203079373, + "grad_norm": 0.5625459367180433, + "learning_rate": 6.693107441973166e-05, + "loss": 12.0541, + "step": 22737 + }, + { + "epoch": 1.2381749743045203, + "grad_norm": 0.5421907134881676, + "learning_rate": 6.692275240802734e-05, + "loss": 11.8146, + "step": 22738 + }, + { + "epoch": 1.2382294283011035, + "grad_norm": 0.5157314531188918, + "learning_rate": 6.691443065353724e-05, + "loss": 11.9038, + "step": 22739 + }, + { + "epoch": 1.2382838822976865, + "grad_norm": 0.5451684293546573, + "learning_rate": 6.690610915632605e-05, + "loss": 11.9046, + "step": 22740 + }, + { + "epoch": 1.2383383362942695, + "grad_norm": 0.5315778944147992, + "learning_rate": 6.689778791645854e-05, + "loss": 11.9785, + "step": 22741 + }, + { + "epoch": 1.2383927902908525, + "grad_norm": 0.6178763847409804, + "learning_rate": 6.688946693399938e-05, + "loss": 11.9117, + "step": 22742 + }, + { + "epoch": 1.2384472442874355, + "grad_norm": 0.6453867300968816, + "learning_rate": 6.688114620901327e-05, + "loss": 11.9758, + "step": 22743 + }, + { + "epoch": 1.2385016982840185, + "grad_norm": 0.5725942618275108, + "learning_rate": 6.687282574156492e-05, + "loss": 11.9988, + "step": 22744 + }, + { + "epoch": 1.2385561522806015, + "grad_norm": 0.5849737479201819, + "learning_rate": 6.686450553171904e-05, + "loss": 11.944, + "step": 22745 + }, + { + "epoch": 1.2386106062771844, + "grad_norm": 0.5283609630593522, + "learning_rate": 6.685618557954031e-05, + "loss": 11.8822, + "step": 22746 + }, + { + "epoch": 1.2386650602737674, + "grad_norm": 0.5740219832501973, + "learning_rate": 6.684786588509346e-05, + "loss": 11.9405, + "step": 22747 + }, + { + "epoch": 1.2387195142703504, + "grad_norm": 0.4982715936990555, + "learning_rate": 6.683954644844316e-05, + "loss": 11.8322, + "step": 22748 + }, + { + "epoch": 1.2387739682669334, + "grad_norm": 0.6233930846331157, + "learning_rate": 6.683122726965409e-05, + "loss": 11.988, + "step": 22749 + }, + { + "epoch": 1.2388284222635164, + "grad_norm": 0.5632371456358553, + "learning_rate": 6.682290834879096e-05, + "loss": 11.8357, + "step": 22750 + }, + { + "epoch": 1.2388828762600994, + "grad_norm": 0.5964121805149151, + "learning_rate": 6.681458968591846e-05, + "loss": 11.9755, + "step": 22751 + }, + { + "epoch": 1.2389373302566826, + "grad_norm": 0.5756759926454542, + "learning_rate": 6.680627128110129e-05, + "loss": 11.9904, + "step": 22752 + }, + { + "epoch": 1.2389917842532656, + "grad_norm": 0.5681878427286643, + "learning_rate": 6.679795313440412e-05, + "loss": 11.8841, + "step": 22753 + }, + { + "epoch": 1.2390462382498486, + "grad_norm": 0.5422053691231631, + "learning_rate": 6.678963524589162e-05, + "loss": 11.959, + "step": 22754 + }, + { + "epoch": 1.2391006922464316, + "grad_norm": 0.6214782310646312, + "learning_rate": 6.67813176156285e-05, + "loss": 12.027, + "step": 22755 + }, + { + "epoch": 1.2391551462430146, + "grad_norm": 0.5894579910660742, + "learning_rate": 6.67730002436794e-05, + "loss": 11.9687, + "step": 22756 + }, + { + "epoch": 1.2392096002395976, + "grad_norm": 0.48326707282042, + "learning_rate": 6.676468313010907e-05, + "loss": 11.9699, + "step": 22757 + }, + { + "epoch": 1.2392640542361806, + "grad_norm": 0.5369041904412026, + "learning_rate": 6.67563662749821e-05, + "loss": 12.0289, + "step": 22758 + }, + { + "epoch": 1.2393185082327636, + "grad_norm": 0.5745372613026676, + "learning_rate": 6.674804967836321e-05, + "loss": 11.8012, + "step": 22759 + }, + { + "epoch": 1.2393729622293466, + "grad_norm": 0.5116956700294779, + "learning_rate": 6.673973334031707e-05, + "loss": 11.855, + "step": 22760 + }, + { + "epoch": 1.2394274162259296, + "grad_norm": 0.5551668780314658, + "learning_rate": 6.673141726090833e-05, + "loss": 12.1515, + "step": 22761 + }, + { + "epoch": 1.2394818702225125, + "grad_norm": 0.5104546891451653, + "learning_rate": 6.672310144020163e-05, + "loss": 11.8827, + "step": 22762 + }, + { + "epoch": 1.2395363242190958, + "grad_norm": 0.5674099124673071, + "learning_rate": 6.671478587826173e-05, + "loss": 11.9037, + "step": 22763 + }, + { + "epoch": 1.2395907782156788, + "grad_norm": 0.5461280995964353, + "learning_rate": 6.670647057515322e-05, + "loss": 11.9623, + "step": 22764 + }, + { + "epoch": 1.2396452322122617, + "grad_norm": 0.5709435991393212, + "learning_rate": 6.669815553094079e-05, + "loss": 11.8487, + "step": 22765 + }, + { + "epoch": 1.2396996862088447, + "grad_norm": 0.48453856223587394, + "learning_rate": 6.668984074568908e-05, + "loss": 11.881, + "step": 22766 + }, + { + "epoch": 1.2397541402054277, + "grad_norm": 0.6409868095410967, + "learning_rate": 6.668152621946276e-05, + "loss": 11.9496, + "step": 22767 + }, + { + "epoch": 1.2398085942020107, + "grad_norm": 0.5477782554657079, + "learning_rate": 6.667321195232648e-05, + "loss": 12.0103, + "step": 22768 + }, + { + "epoch": 1.2398630481985937, + "grad_norm": 0.5040437193973005, + "learning_rate": 6.666489794434487e-05, + "loss": 11.8298, + "step": 22769 + }, + { + "epoch": 1.2399175021951767, + "grad_norm": 0.5680760203946319, + "learning_rate": 6.665658419558262e-05, + "loss": 11.9437, + "step": 22770 + }, + { + "epoch": 1.2399719561917597, + "grad_norm": 0.528085507850227, + "learning_rate": 6.664827070610436e-05, + "loss": 11.9805, + "step": 22771 + }, + { + "epoch": 1.2400264101883427, + "grad_norm": 0.5444655791242297, + "learning_rate": 6.663995747597475e-05, + "loss": 11.8826, + "step": 22772 + }, + { + "epoch": 1.2400808641849257, + "grad_norm": 0.503196129850154, + "learning_rate": 6.66316445052584e-05, + "loss": 11.9569, + "step": 22773 + }, + { + "epoch": 1.2401353181815087, + "grad_norm": 0.4988060771487552, + "learning_rate": 6.662333179401998e-05, + "loss": 11.804, + "step": 22774 + }, + { + "epoch": 1.2401897721780917, + "grad_norm": 0.5450478647484439, + "learning_rate": 6.661501934232414e-05, + "loss": 12.018, + "step": 22775 + }, + { + "epoch": 1.240244226174675, + "grad_norm": 0.5498181916634691, + "learning_rate": 6.660670715023551e-05, + "loss": 11.8347, + "step": 22776 + }, + { + "epoch": 1.2402986801712579, + "grad_norm": 0.5582544321055958, + "learning_rate": 6.659839521781879e-05, + "loss": 12.0097, + "step": 22777 + }, + { + "epoch": 1.2403531341678409, + "grad_norm": 0.5152176707698568, + "learning_rate": 6.659008354513844e-05, + "loss": 11.7026, + "step": 22778 + }, + { + "epoch": 1.2404075881644239, + "grad_norm": 0.5728070721846744, + "learning_rate": 6.658177213225927e-05, + "loss": 11.87, + "step": 22779 + }, + { + "epoch": 1.2404620421610069, + "grad_norm": 0.5975612798629394, + "learning_rate": 6.657346097924581e-05, + "loss": 11.8497, + "step": 22780 + }, + { + "epoch": 1.2405164961575899, + "grad_norm": 0.5670380370392075, + "learning_rate": 6.656515008616275e-05, + "loss": 12.0412, + "step": 22781 + }, + { + "epoch": 1.2405709501541728, + "grad_norm": 0.5194776893845515, + "learning_rate": 6.655683945307467e-05, + "loss": 11.9909, + "step": 22782 + }, + { + "epoch": 1.2406254041507558, + "grad_norm": 0.546511889568762, + "learning_rate": 6.654852908004623e-05, + "loss": 11.9428, + "step": 22783 + }, + { + "epoch": 1.2406798581473388, + "grad_norm": 0.5292786096158895, + "learning_rate": 6.654021896714204e-05, + "loss": 11.9456, + "step": 22784 + }, + { + "epoch": 1.2407343121439218, + "grad_norm": 0.5195446486772057, + "learning_rate": 6.653190911442669e-05, + "loss": 12.0358, + "step": 22785 + }, + { + "epoch": 1.240788766140505, + "grad_norm": 0.5578130700424351, + "learning_rate": 6.652359952196483e-05, + "loss": 12.1051, + "step": 22786 + }, + { + "epoch": 1.240843220137088, + "grad_norm": 0.5627174905235722, + "learning_rate": 6.651529018982113e-05, + "loss": 11.8365, + "step": 22787 + }, + { + "epoch": 1.240897674133671, + "grad_norm": 0.5522797222027749, + "learning_rate": 6.65069811180601e-05, + "loss": 12.0269, + "step": 22788 + }, + { + "epoch": 1.240952128130254, + "grad_norm": 0.5400125516162491, + "learning_rate": 6.64986723067464e-05, + "loss": 12.0219, + "step": 22789 + }, + { + "epoch": 1.241006582126837, + "grad_norm": 0.5535811517532464, + "learning_rate": 6.649036375594466e-05, + "loss": 11.9918, + "step": 22790 + }, + { + "epoch": 1.24106103612342, + "grad_norm": 0.5981970822688325, + "learning_rate": 6.648205546571947e-05, + "loss": 11.8923, + "step": 22791 + }, + { + "epoch": 1.241115490120003, + "grad_norm": 0.49991588153224736, + "learning_rate": 6.647374743613542e-05, + "loss": 11.9389, + "step": 22792 + }, + { + "epoch": 1.241169944116586, + "grad_norm": 0.5064208515861376, + "learning_rate": 6.646543966725715e-05, + "loss": 12.0333, + "step": 22793 + }, + { + "epoch": 1.241224398113169, + "grad_norm": 0.5021847292174838, + "learning_rate": 6.645713215914924e-05, + "loss": 11.8361, + "step": 22794 + }, + { + "epoch": 1.241278852109752, + "grad_norm": 0.5778181717137493, + "learning_rate": 6.64488249118763e-05, + "loss": 11.8868, + "step": 22795 + }, + { + "epoch": 1.241333306106335, + "grad_norm": 0.47529978052258676, + "learning_rate": 6.644051792550288e-05, + "loss": 11.9687, + "step": 22796 + }, + { + "epoch": 1.241387760102918, + "grad_norm": 0.5428177435372803, + "learning_rate": 6.643221120009371e-05, + "loss": 11.9814, + "step": 22797 + }, + { + "epoch": 1.241442214099501, + "grad_norm": 0.5526512328224183, + "learning_rate": 6.642390473571324e-05, + "loss": 11.892, + "step": 22798 + }, + { + "epoch": 1.2414966680960842, + "grad_norm": 0.5800933044027282, + "learning_rate": 6.641559853242612e-05, + "loss": 12.0973, + "step": 22799 + }, + { + "epoch": 1.2415511220926672, + "grad_norm": 0.6170253827013614, + "learning_rate": 6.640729259029692e-05, + "loss": 11.9002, + "step": 22800 + }, + { + "epoch": 1.2416055760892502, + "grad_norm": 0.5343101617382512, + "learning_rate": 6.639898690939025e-05, + "loss": 11.9431, + "step": 22801 + }, + { + "epoch": 1.2416600300858331, + "grad_norm": 0.516671138329763, + "learning_rate": 6.639068148977072e-05, + "loss": 11.9978, + "step": 22802 + }, + { + "epoch": 1.2417144840824161, + "grad_norm": 0.5553064455594036, + "learning_rate": 6.638237633150288e-05, + "loss": 12.0286, + "step": 22803 + }, + { + "epoch": 1.2417689380789991, + "grad_norm": 0.5912290720511462, + "learning_rate": 6.637407143465131e-05, + "loss": 11.8816, + "step": 22804 + }, + { + "epoch": 1.2418233920755821, + "grad_norm": 0.5343535316534611, + "learning_rate": 6.63657667992806e-05, + "loss": 11.7998, + "step": 22805 + }, + { + "epoch": 1.2418778460721651, + "grad_norm": 0.5206141984023502, + "learning_rate": 6.635746242545532e-05, + "loss": 11.9438, + "step": 22806 + }, + { + "epoch": 1.241932300068748, + "grad_norm": 0.6804279007851276, + "learning_rate": 6.63491583132401e-05, + "loss": 11.9349, + "step": 22807 + }, + { + "epoch": 1.241986754065331, + "grad_norm": 0.5287261857381383, + "learning_rate": 6.634085446269944e-05, + "loss": 11.8697, + "step": 22808 + }, + { + "epoch": 1.2420412080619143, + "grad_norm": 0.5142994957197141, + "learning_rate": 6.633255087389793e-05, + "loss": 11.9309, + "step": 22809 + }, + { + "epoch": 1.2420956620584973, + "grad_norm": 0.6111271483717513, + "learning_rate": 6.632424754690017e-05, + "loss": 11.9755, + "step": 22810 + }, + { + "epoch": 1.2421501160550803, + "grad_norm": 0.5223875794535703, + "learning_rate": 6.631594448177066e-05, + "loss": 11.9082, + "step": 22811 + }, + { + "epoch": 1.2422045700516633, + "grad_norm": 0.5995704030578055, + "learning_rate": 6.630764167857405e-05, + "loss": 11.9782, + "step": 22812 + }, + { + "epoch": 1.2422590240482463, + "grad_norm": 0.505948768567176, + "learning_rate": 6.629933913737486e-05, + "loss": 11.9421, + "step": 22813 + }, + { + "epoch": 1.2423134780448293, + "grad_norm": 0.54299871443734, + "learning_rate": 6.629103685823767e-05, + "loss": 11.9526, + "step": 22814 + }, + { + "epoch": 1.2423679320414123, + "grad_norm": 0.493709313312357, + "learning_rate": 6.628273484122703e-05, + "loss": 11.8863, + "step": 22815 + }, + { + "epoch": 1.2424223860379953, + "grad_norm": 0.5821463873061075, + "learning_rate": 6.62744330864075e-05, + "loss": 11.8817, + "step": 22816 + }, + { + "epoch": 1.2424768400345783, + "grad_norm": 0.5618459142456248, + "learning_rate": 6.626613159384366e-05, + "loss": 11.9891, + "step": 22817 + }, + { + "epoch": 1.2425312940311612, + "grad_norm": 0.5702148586567803, + "learning_rate": 6.62578303636e-05, + "loss": 11.9193, + "step": 22818 + }, + { + "epoch": 1.2425857480277442, + "grad_norm": 0.5470756108060282, + "learning_rate": 6.624952939574111e-05, + "loss": 12.0095, + "step": 22819 + }, + { + "epoch": 1.2426402020243272, + "grad_norm": 0.5709138327143086, + "learning_rate": 6.624122869033154e-05, + "loss": 11.872, + "step": 22820 + }, + { + "epoch": 1.2426946560209102, + "grad_norm": 0.5554455275669883, + "learning_rate": 6.623292824743585e-05, + "loss": 11.9505, + "step": 22821 + }, + { + "epoch": 1.2427491100174934, + "grad_norm": 0.5779973836898594, + "learning_rate": 6.622462806711857e-05, + "loss": 12.0125, + "step": 22822 + }, + { + "epoch": 1.2428035640140764, + "grad_norm": 0.5528729451071411, + "learning_rate": 6.621632814944421e-05, + "loss": 11.9637, + "step": 22823 + }, + { + "epoch": 1.2428580180106594, + "grad_norm": 0.6124236927248429, + "learning_rate": 6.620802849447738e-05, + "loss": 12.025, + "step": 22824 + }, + { + "epoch": 1.2429124720072424, + "grad_norm": 0.5262678844303744, + "learning_rate": 6.61997291022826e-05, + "loss": 11.8004, + "step": 22825 + }, + { + "epoch": 1.2429669260038254, + "grad_norm": 0.5208419089742101, + "learning_rate": 6.619142997292437e-05, + "loss": 11.86, + "step": 22826 + }, + { + "epoch": 1.2430213800004084, + "grad_norm": 0.5465260354290734, + "learning_rate": 6.618313110646728e-05, + "loss": 11.9544, + "step": 22827 + }, + { + "epoch": 1.2430758339969914, + "grad_norm": 0.5774287225303286, + "learning_rate": 6.617483250297582e-05, + "loss": 12.0348, + "step": 22828 + }, + { + "epoch": 1.2431302879935744, + "grad_norm": 0.6067544016343792, + "learning_rate": 6.616653416251454e-05, + "loss": 11.9491, + "step": 22829 + }, + { + "epoch": 1.2431847419901574, + "grad_norm": 0.49681862261512644, + "learning_rate": 6.615823608514794e-05, + "loss": 12.0043, + "step": 22830 + }, + { + "epoch": 1.2432391959867404, + "grad_norm": 0.5031150136579183, + "learning_rate": 6.61499382709406e-05, + "loss": 11.9967, + "step": 22831 + }, + { + "epoch": 1.2432936499833234, + "grad_norm": 0.5081850415140976, + "learning_rate": 6.6141640719957e-05, + "loss": 11.9372, + "step": 22832 + }, + { + "epoch": 1.2433481039799066, + "grad_norm": 0.6067099171071557, + "learning_rate": 6.61333434322617e-05, + "loss": 12.0013, + "step": 22833 + }, + { + "epoch": 1.2434025579764896, + "grad_norm": 0.4989131758117803, + "learning_rate": 6.612504640791914e-05, + "loss": 11.8733, + "step": 22834 + }, + { + "epoch": 1.2434570119730726, + "grad_norm": 0.5458558129514194, + "learning_rate": 6.611674964699396e-05, + "loss": 12.0634, + "step": 22835 + }, + { + "epoch": 1.2435114659696556, + "grad_norm": 0.5371725161108272, + "learning_rate": 6.61084531495506e-05, + "loss": 11.7361, + "step": 22836 + }, + { + "epoch": 1.2435659199662386, + "grad_norm": 0.5092776703288111, + "learning_rate": 6.610015691565359e-05, + "loss": 12.0028, + "step": 22837 + }, + { + "epoch": 1.2436203739628215, + "grad_norm": 0.555539266294269, + "learning_rate": 6.609186094536746e-05, + "loss": 12.0336, + "step": 22838 + }, + { + "epoch": 1.2436748279594045, + "grad_norm": 0.5857264935503956, + "learning_rate": 6.60835652387567e-05, + "loss": 12.0261, + "step": 22839 + }, + { + "epoch": 1.2437292819559875, + "grad_norm": 0.5883447922003333, + "learning_rate": 6.607526979588583e-05, + "loss": 12.0262, + "step": 22840 + }, + { + "epoch": 1.2437837359525705, + "grad_norm": 0.5495328691077903, + "learning_rate": 6.606697461681934e-05, + "loss": 11.9726, + "step": 22841 + }, + { + "epoch": 1.2438381899491535, + "grad_norm": 0.6093131901575297, + "learning_rate": 6.605867970162174e-05, + "loss": 11.9135, + "step": 22842 + }, + { + "epoch": 1.2438926439457365, + "grad_norm": 0.542856367570383, + "learning_rate": 6.605038505035754e-05, + "loss": 12.0268, + "step": 22843 + }, + { + "epoch": 1.2439470979423195, + "grad_norm": 0.5403828018426662, + "learning_rate": 6.604209066309124e-05, + "loss": 11.9622, + "step": 22844 + }, + { + "epoch": 1.2440015519389025, + "grad_norm": 0.5417773340777237, + "learning_rate": 6.603379653988732e-05, + "loss": 12.0037, + "step": 22845 + }, + { + "epoch": 1.2440560059354857, + "grad_norm": 0.6233297521403925, + "learning_rate": 6.602550268081031e-05, + "loss": 12.0848, + "step": 22846 + }, + { + "epoch": 1.2441104599320687, + "grad_norm": 0.5563424335303409, + "learning_rate": 6.601720908592471e-05, + "loss": 11.9075, + "step": 22847 + }, + { + "epoch": 1.2441649139286517, + "grad_norm": 0.5498742965493167, + "learning_rate": 6.600891575529501e-05, + "loss": 11.9492, + "step": 22848 + }, + { + "epoch": 1.2442193679252347, + "grad_norm": 0.5495562028184575, + "learning_rate": 6.600062268898563e-05, + "loss": 11.9896, + "step": 22849 + }, + { + "epoch": 1.2442738219218177, + "grad_norm": 0.5960210216426902, + "learning_rate": 6.599232988706112e-05, + "loss": 12.032, + "step": 22850 + }, + { + "epoch": 1.2443282759184007, + "grad_norm": 0.5405426947554883, + "learning_rate": 6.598403734958596e-05, + "loss": 11.881, + "step": 22851 + }, + { + "epoch": 1.2443827299149837, + "grad_norm": 0.6180780284103252, + "learning_rate": 6.597574507662463e-05, + "loss": 12.0612, + "step": 22852 + }, + { + "epoch": 1.2444371839115667, + "grad_norm": 0.5053292031113816, + "learning_rate": 6.596745306824162e-05, + "loss": 11.999, + "step": 22853 + }, + { + "epoch": 1.2444916379081497, + "grad_norm": 0.6042403860038301, + "learning_rate": 6.595916132450139e-05, + "loss": 11.9975, + "step": 22854 + }, + { + "epoch": 1.2445460919047326, + "grad_norm": 0.6292996176273324, + "learning_rate": 6.595086984546844e-05, + "loss": 11.965, + "step": 22855 + }, + { + "epoch": 1.2446005459013159, + "grad_norm": 0.5859200357053135, + "learning_rate": 6.594257863120721e-05, + "loss": 12.0571, + "step": 22856 + }, + { + "epoch": 1.2446549998978989, + "grad_norm": 0.5907369634070254, + "learning_rate": 6.593428768178223e-05, + "loss": 12.0495, + "step": 22857 + }, + { + "epoch": 1.2447094538944818, + "grad_norm": 0.9430568370294133, + "learning_rate": 6.592599699725796e-05, + "loss": 11.966, + "step": 22858 + }, + { + "epoch": 1.2447639078910648, + "grad_norm": 0.5372757406434971, + "learning_rate": 6.591770657769884e-05, + "loss": 12.0155, + "step": 22859 + }, + { + "epoch": 1.2448183618876478, + "grad_norm": 0.5540848802763377, + "learning_rate": 6.590941642316931e-05, + "loss": 12.0618, + "step": 22860 + }, + { + "epoch": 1.2448728158842308, + "grad_norm": 0.5110306054951284, + "learning_rate": 6.590112653373391e-05, + "loss": 11.9085, + "step": 22861 + }, + { + "epoch": 1.2449272698808138, + "grad_norm": 0.5478302796397301, + "learning_rate": 6.589283690945704e-05, + "loss": 12.1195, + "step": 22862 + }, + { + "epoch": 1.2449817238773968, + "grad_norm": 0.5953012209816627, + "learning_rate": 6.588454755040322e-05, + "loss": 11.8825, + "step": 22863 + }, + { + "epoch": 1.2450361778739798, + "grad_norm": 0.6720573249958113, + "learning_rate": 6.587625845663687e-05, + "loss": 11.8381, + "step": 22864 + }, + { + "epoch": 1.2450906318705628, + "grad_norm": 0.5689001755365769, + "learning_rate": 6.586796962822247e-05, + "loss": 11.9777, + "step": 22865 + }, + { + "epoch": 1.2451450858671458, + "grad_norm": 0.5345723488025051, + "learning_rate": 6.585968106522443e-05, + "loss": 11.9733, + "step": 22866 + }, + { + "epoch": 1.2451995398637288, + "grad_norm": 0.5711652832127669, + "learning_rate": 6.585139276770724e-05, + "loss": 11.9361, + "step": 22867 + }, + { + "epoch": 1.2452539938603118, + "grad_norm": 0.6009085892003865, + "learning_rate": 6.58431047357354e-05, + "loss": 11.9393, + "step": 22868 + }, + { + "epoch": 1.245308447856895, + "grad_norm": 0.5404850062028494, + "learning_rate": 6.583481696937326e-05, + "loss": 11.9827, + "step": 22869 + }, + { + "epoch": 1.245362901853478, + "grad_norm": 0.5653755541851823, + "learning_rate": 6.582652946868532e-05, + "loss": 11.9453, + "step": 22870 + }, + { + "epoch": 1.245417355850061, + "grad_norm": 0.5379684103837694, + "learning_rate": 6.5818242233736e-05, + "loss": 12.0339, + "step": 22871 + }, + { + "epoch": 1.245471809846644, + "grad_norm": 0.5248870771654717, + "learning_rate": 6.580995526458977e-05, + "loss": 12.0152, + "step": 22872 + }, + { + "epoch": 1.245526263843227, + "grad_norm": 0.5737092984332973, + "learning_rate": 6.580166856131106e-05, + "loss": 11.9311, + "step": 22873 + }, + { + "epoch": 1.24558071783981, + "grad_norm": 0.48237775427526935, + "learning_rate": 6.579338212396432e-05, + "loss": 11.9401, + "step": 22874 + }, + { + "epoch": 1.245635171836393, + "grad_norm": 0.5559850193376921, + "learning_rate": 6.578509595261397e-05, + "loss": 11.8752, + "step": 22875 + }, + { + "epoch": 1.245689625832976, + "grad_norm": 0.4930299259343834, + "learning_rate": 6.577681004732445e-05, + "loss": 11.7788, + "step": 22876 + }, + { + "epoch": 1.245744079829559, + "grad_norm": 0.5179040222610088, + "learning_rate": 6.576852440816022e-05, + "loss": 11.9134, + "step": 22877 + }, + { + "epoch": 1.245798533826142, + "grad_norm": 0.510126945991866, + "learning_rate": 6.576023903518568e-05, + "loss": 11.8957, + "step": 22878 + }, + { + "epoch": 1.2458529878227251, + "grad_norm": 0.5671174775740991, + "learning_rate": 6.575195392846525e-05, + "loss": 11.969, + "step": 22879 + }, + { + "epoch": 1.2459074418193081, + "grad_norm": 0.5147618181834822, + "learning_rate": 6.574366908806337e-05, + "loss": 11.9845, + "step": 22880 + }, + { + "epoch": 1.2459618958158911, + "grad_norm": 0.507930474647851, + "learning_rate": 6.573538451404446e-05, + "loss": 11.9498, + "step": 22881 + }, + { + "epoch": 1.2460163498124741, + "grad_norm": 0.5310853360787483, + "learning_rate": 6.572710020647295e-05, + "loss": 11.9354, + "step": 22882 + }, + { + "epoch": 1.246070803809057, + "grad_norm": 0.5177735012089201, + "learning_rate": 6.571881616541323e-05, + "loss": 11.8733, + "step": 22883 + }, + { + "epoch": 1.24612525780564, + "grad_norm": 0.5360896782995199, + "learning_rate": 6.571053239092977e-05, + "loss": 12.0134, + "step": 22884 + }, + { + "epoch": 1.246179711802223, + "grad_norm": 0.5017990803176361, + "learning_rate": 6.570224888308695e-05, + "loss": 11.938, + "step": 22885 + }, + { + "epoch": 1.246234165798806, + "grad_norm": 0.5250885074961562, + "learning_rate": 6.569396564194921e-05, + "loss": 11.9224, + "step": 22886 + }, + { + "epoch": 1.246288619795389, + "grad_norm": 0.5691015680339959, + "learning_rate": 6.568568266758094e-05, + "loss": 11.8836, + "step": 22887 + }, + { + "epoch": 1.246343073791972, + "grad_norm": 0.5500882464683317, + "learning_rate": 6.567739996004658e-05, + "loss": 12.0345, + "step": 22888 + }, + { + "epoch": 1.246397527788555, + "grad_norm": 0.5518441169966526, + "learning_rate": 6.56691175194105e-05, + "loss": 11.9901, + "step": 22889 + }, + { + "epoch": 1.246451981785138, + "grad_norm": 0.5058913121367379, + "learning_rate": 6.56608353457371e-05, + "loss": 11.9948, + "step": 22890 + }, + { + "epoch": 1.246506435781721, + "grad_norm": 0.5543399473024002, + "learning_rate": 6.565255343909081e-05, + "loss": 11.9444, + "step": 22891 + }, + { + "epoch": 1.2465608897783043, + "grad_norm": 0.4886695931854353, + "learning_rate": 6.564427179953603e-05, + "loss": 11.8646, + "step": 22892 + }, + { + "epoch": 1.2466153437748873, + "grad_norm": 0.5083409658185977, + "learning_rate": 6.563599042713715e-05, + "loss": 11.9316, + "step": 22893 + }, + { + "epoch": 1.2466697977714702, + "grad_norm": 0.509851838688398, + "learning_rate": 6.562770932195856e-05, + "loss": 12.0863, + "step": 22894 + }, + { + "epoch": 1.2467242517680532, + "grad_norm": 0.5570191696101516, + "learning_rate": 6.561942848406469e-05, + "loss": 12.0591, + "step": 22895 + }, + { + "epoch": 1.2467787057646362, + "grad_norm": 0.5679042787084743, + "learning_rate": 6.561114791351987e-05, + "loss": 12.0477, + "step": 22896 + }, + { + "epoch": 1.2468331597612192, + "grad_norm": 0.5306661509851105, + "learning_rate": 6.560286761038858e-05, + "loss": 11.938, + "step": 22897 + }, + { + "epoch": 1.2468876137578022, + "grad_norm": 0.5204155469272607, + "learning_rate": 6.559458757473516e-05, + "loss": 11.8639, + "step": 22898 + }, + { + "epoch": 1.2469420677543852, + "grad_norm": 0.4769915772493378, + "learning_rate": 6.558630780662397e-05, + "loss": 11.9881, + "step": 22899 + }, + { + "epoch": 1.2469965217509682, + "grad_norm": 0.552021418415857, + "learning_rate": 6.557802830611943e-05, + "loss": 11.9603, + "step": 22900 + }, + { + "epoch": 1.2470509757475512, + "grad_norm": 0.6544919271195204, + "learning_rate": 6.556974907328591e-05, + "loss": 12.048, + "step": 22901 + }, + { + "epoch": 1.2471054297441344, + "grad_norm": 0.5398390438960115, + "learning_rate": 6.55614701081878e-05, + "loss": 11.8334, + "step": 22902 + }, + { + "epoch": 1.2471598837407174, + "grad_norm": 0.5215423436765481, + "learning_rate": 6.555319141088947e-05, + "loss": 11.908, + "step": 22903 + }, + { + "epoch": 1.2472143377373004, + "grad_norm": 0.5968230781626701, + "learning_rate": 6.554491298145531e-05, + "loss": 11.9538, + "step": 22904 + }, + { + "epoch": 1.2472687917338834, + "grad_norm": 0.5614518888979998, + "learning_rate": 6.553663481994965e-05, + "loss": 11.9059, + "step": 22905 + }, + { + "epoch": 1.2473232457304664, + "grad_norm": 0.5533441516707198, + "learning_rate": 6.552835692643693e-05, + "loss": 11.9552, + "step": 22906 + }, + { + "epoch": 1.2473776997270494, + "grad_norm": 0.49192685322037805, + "learning_rate": 6.552007930098147e-05, + "loss": 11.905, + "step": 22907 + }, + { + "epoch": 1.2474321537236324, + "grad_norm": 0.5075322530471847, + "learning_rate": 6.551180194364772e-05, + "loss": 11.9061, + "step": 22908 + }, + { + "epoch": 1.2474866077202154, + "grad_norm": 0.5567091267623645, + "learning_rate": 6.550352485449991e-05, + "loss": 11.9821, + "step": 22909 + }, + { + "epoch": 1.2475410617167983, + "grad_norm": 0.543098851357432, + "learning_rate": 6.549524803360248e-05, + "loss": 11.9562, + "step": 22910 + }, + { + "epoch": 1.2475955157133813, + "grad_norm": 0.6312837386891761, + "learning_rate": 6.54869714810198e-05, + "loss": 11.8395, + "step": 22911 + }, + { + "epoch": 1.2476499697099643, + "grad_norm": 0.736972893975821, + "learning_rate": 6.547869519681622e-05, + "loss": 12.0243, + "step": 22912 + }, + { + "epoch": 1.2477044237065473, + "grad_norm": 0.5898809579991051, + "learning_rate": 6.54704191810561e-05, + "loss": 12.0607, + "step": 22913 + }, + { + "epoch": 1.2477588777031303, + "grad_norm": 0.5352698872447551, + "learning_rate": 6.546214343380379e-05, + "loss": 11.928, + "step": 22914 + }, + { + "epoch": 1.2478133316997133, + "grad_norm": 0.5886613853761997, + "learning_rate": 6.545386795512364e-05, + "loss": 11.8943, + "step": 22915 + }, + { + "epoch": 1.2478677856962965, + "grad_norm": 0.5940386684191961, + "learning_rate": 6.544559274507998e-05, + "loss": 11.8677, + "step": 22916 + }, + { + "epoch": 1.2479222396928795, + "grad_norm": 0.6104365727146253, + "learning_rate": 6.543731780373721e-05, + "loss": 11.9247, + "step": 22917 + }, + { + "epoch": 1.2479766936894625, + "grad_norm": 0.5376032921813544, + "learning_rate": 6.542904313115971e-05, + "loss": 12.0488, + "step": 22918 + }, + { + "epoch": 1.2480311476860455, + "grad_norm": 0.5762837312993669, + "learning_rate": 6.542076872741172e-05, + "loss": 12.0017, + "step": 22919 + }, + { + "epoch": 1.2480856016826285, + "grad_norm": 0.5363121824057173, + "learning_rate": 6.54124945925576e-05, + "loss": 11.9132, + "step": 22920 + }, + { + "epoch": 1.2481400556792115, + "grad_norm": 0.5791488075391481, + "learning_rate": 6.540422072666175e-05, + "loss": 11.7995, + "step": 22921 + }, + { + "epoch": 1.2481945096757945, + "grad_norm": 0.5395242927333442, + "learning_rate": 6.539594712978848e-05, + "loss": 11.955, + "step": 22922 + }, + { + "epoch": 1.2482489636723775, + "grad_norm": 0.5487681436831426, + "learning_rate": 6.538767380200212e-05, + "loss": 11.9998, + "step": 22923 + }, + { + "epoch": 1.2483034176689605, + "grad_norm": 0.5203020553522513, + "learning_rate": 6.537940074336701e-05, + "loss": 11.9544, + "step": 22924 + }, + { + "epoch": 1.2483578716655435, + "grad_norm": 0.5889912883526086, + "learning_rate": 6.537112795394751e-05, + "loss": 11.9886, + "step": 22925 + }, + { + "epoch": 1.2484123256621267, + "grad_norm": 0.5290888907053558, + "learning_rate": 6.536285543380791e-05, + "loss": 11.8611, + "step": 22926 + }, + { + "epoch": 1.2484667796587097, + "grad_norm": 0.528132850681092, + "learning_rate": 6.535458318301255e-05, + "loss": 11.9693, + "step": 22927 + }, + { + "epoch": 1.2485212336552927, + "grad_norm": 0.555364238149745, + "learning_rate": 6.53463112016258e-05, + "loss": 11.9385, + "step": 22928 + }, + { + "epoch": 1.2485756876518757, + "grad_norm": 0.49825336231402734, + "learning_rate": 6.53380394897119e-05, + "loss": 11.8099, + "step": 22929 + }, + { + "epoch": 1.2486301416484586, + "grad_norm": 0.5050365700015226, + "learning_rate": 6.532976804733524e-05, + "loss": 11.9691, + "step": 22930 + }, + { + "epoch": 1.2486845956450416, + "grad_norm": 0.5237797915712181, + "learning_rate": 6.532149687456011e-05, + "loss": 11.8852, + "step": 22931 + }, + { + "epoch": 1.2487390496416246, + "grad_norm": 0.5184891802825093, + "learning_rate": 6.531322597145081e-05, + "loss": 12.0183, + "step": 22932 + }, + { + "epoch": 1.2487935036382076, + "grad_norm": 0.5059313801881709, + "learning_rate": 6.530495533807171e-05, + "loss": 11.9459, + "step": 22933 + }, + { + "epoch": 1.2488479576347906, + "grad_norm": 0.5441516367093713, + "learning_rate": 6.529668497448709e-05, + "loss": 12.0306, + "step": 22934 + }, + { + "epoch": 1.2489024116313736, + "grad_norm": 0.5734294262571994, + "learning_rate": 6.528841488076127e-05, + "loss": 11.9553, + "step": 22935 + }, + { + "epoch": 1.2489568656279566, + "grad_norm": 0.5639771296590832, + "learning_rate": 6.528014505695856e-05, + "loss": 11.9432, + "step": 22936 + }, + { + "epoch": 1.2490113196245396, + "grad_norm": 0.549302656599705, + "learning_rate": 6.527187550314327e-05, + "loss": 12.0203, + "step": 22937 + }, + { + "epoch": 1.2490657736211226, + "grad_norm": 0.5319682895932197, + "learning_rate": 6.52636062193797e-05, + "loss": 11.8934, + "step": 22938 + }, + { + "epoch": 1.2491202276177058, + "grad_norm": 0.5681112030552473, + "learning_rate": 6.525533720573214e-05, + "loss": 11.8764, + "step": 22939 + }, + { + "epoch": 1.2491746816142888, + "grad_norm": 0.5331700864501003, + "learning_rate": 6.524706846226492e-05, + "loss": 11.8594, + "step": 22940 + }, + { + "epoch": 1.2492291356108718, + "grad_norm": 0.5368385453299201, + "learning_rate": 6.52387999890423e-05, + "loss": 12.0113, + "step": 22941 + }, + { + "epoch": 1.2492835896074548, + "grad_norm": 0.561377539879999, + "learning_rate": 6.523053178612861e-05, + "loss": 11.8856, + "step": 22942 + }, + { + "epoch": 1.2493380436040378, + "grad_norm": 0.5632375007956382, + "learning_rate": 6.522226385358813e-05, + "loss": 12.0217, + "step": 22943 + }, + { + "epoch": 1.2493924976006208, + "grad_norm": 0.5442323860301268, + "learning_rate": 6.521399619148517e-05, + "loss": 11.972, + "step": 22944 + }, + { + "epoch": 1.2494469515972038, + "grad_norm": 0.524902231818379, + "learning_rate": 6.5205728799884e-05, + "loss": 11.9334, + "step": 22945 + }, + { + "epoch": 1.2495014055937868, + "grad_norm": 0.7055924414692931, + "learning_rate": 6.519746167884892e-05, + "loss": 12.0036, + "step": 22946 + }, + { + "epoch": 1.2495558595903697, + "grad_norm": 0.590997549458098, + "learning_rate": 6.518919482844423e-05, + "loss": 11.7153, + "step": 22947 + }, + { + "epoch": 1.2496103135869527, + "grad_norm": 0.5579959947996137, + "learning_rate": 6.518092824873421e-05, + "loss": 11.8822, + "step": 22948 + }, + { + "epoch": 1.249664767583536, + "grad_norm": 0.5963325373914566, + "learning_rate": 6.51726619397831e-05, + "loss": 11.8778, + "step": 22949 + }, + { + "epoch": 1.249719221580119, + "grad_norm": 0.585942340167395, + "learning_rate": 6.516439590165522e-05, + "loss": 11.9765, + "step": 22950 + }, + { + "epoch": 1.249773675576702, + "grad_norm": 0.5401228626095188, + "learning_rate": 6.515613013441485e-05, + "loss": 11.9633, + "step": 22951 + }, + { + "epoch": 1.249828129573285, + "grad_norm": 0.5686170809381587, + "learning_rate": 6.514786463812624e-05, + "loss": 11.8138, + "step": 22952 + }, + { + "epoch": 1.249882583569868, + "grad_norm": 0.5419393697444189, + "learning_rate": 6.513959941285368e-05, + "loss": 11.8725, + "step": 22953 + }, + { + "epoch": 1.249937037566451, + "grad_norm": 0.5487208114712231, + "learning_rate": 6.513133445866143e-05, + "loss": 11.8953, + "step": 22954 + }, + { + "epoch": 1.249991491563034, + "grad_norm": 0.5181968881859137, + "learning_rate": 6.512306977561379e-05, + "loss": 11.8141, + "step": 22955 + }, + { + "epoch": 1.250045945559617, + "grad_norm": 0.5156325427064138, + "learning_rate": 6.5114805363775e-05, + "loss": 11.8736, + "step": 22956 + }, + { + "epoch": 1.2501003995562, + "grad_norm": 0.5216489790880355, + "learning_rate": 6.510654122320934e-05, + "loss": 11.9813, + "step": 22957 + }, + { + "epoch": 1.2501548535527829, + "grad_norm": 0.5144461249999991, + "learning_rate": 6.509827735398108e-05, + "loss": 11.9433, + "step": 22958 + }, + { + "epoch": 1.2502093075493659, + "grad_norm": 0.5668710856366023, + "learning_rate": 6.509001375615446e-05, + "loss": 12.0624, + "step": 22959 + }, + { + "epoch": 1.2502637615459489, + "grad_norm": 0.5463624945385009, + "learning_rate": 6.508175042979374e-05, + "loss": 11.9766, + "step": 22960 + }, + { + "epoch": 1.2503182155425319, + "grad_norm": 0.489156747425386, + "learning_rate": 6.507348737496319e-05, + "loss": 11.9146, + "step": 22961 + }, + { + "epoch": 1.2503726695391149, + "grad_norm": 0.5718373374121186, + "learning_rate": 6.506522459172705e-05, + "loss": 12.0009, + "step": 22962 + }, + { + "epoch": 1.250427123535698, + "grad_norm": 0.52519505403957, + "learning_rate": 6.505696208014959e-05, + "loss": 11.9269, + "step": 22963 + }, + { + "epoch": 1.250481577532281, + "grad_norm": 0.6130330578240278, + "learning_rate": 6.504869984029504e-05, + "loss": 11.9778, + "step": 22964 + }, + { + "epoch": 1.250536031528864, + "grad_norm": 0.5223733110557156, + "learning_rate": 6.504043787222767e-05, + "loss": 11.8907, + "step": 22965 + }, + { + "epoch": 1.250590485525447, + "grad_norm": 0.5227047815433213, + "learning_rate": 6.503217617601171e-05, + "loss": 12.0034, + "step": 22966 + }, + { + "epoch": 1.25064493952203, + "grad_norm": 0.5840137043222837, + "learning_rate": 6.502391475171142e-05, + "loss": 11.9016, + "step": 22967 + }, + { + "epoch": 1.250699393518613, + "grad_norm": 0.6283189772930491, + "learning_rate": 6.501565359939108e-05, + "loss": 12.0354, + "step": 22968 + }, + { + "epoch": 1.250753847515196, + "grad_norm": 0.5581576060407885, + "learning_rate": 6.500739271911482e-05, + "loss": 11.9421, + "step": 22969 + }, + { + "epoch": 1.250808301511779, + "grad_norm": 0.5156956675751961, + "learning_rate": 6.499913211094697e-05, + "loss": 11.9535, + "step": 22970 + }, + { + "epoch": 1.250862755508362, + "grad_norm": 0.5931791196620244, + "learning_rate": 6.499087177495173e-05, + "loss": 11.9979, + "step": 22971 + }, + { + "epoch": 1.2509172095049452, + "grad_norm": 0.4876625214204427, + "learning_rate": 6.498261171119333e-05, + "loss": 11.7944, + "step": 22972 + }, + { + "epoch": 1.2509716635015282, + "grad_norm": 0.5471450507710717, + "learning_rate": 6.497435191973605e-05, + "loss": 11.9792, + "step": 22973 + }, + { + "epoch": 1.2510261174981112, + "grad_norm": 0.5485752705540134, + "learning_rate": 6.496609240064404e-05, + "loss": 12.1258, + "step": 22974 + }, + { + "epoch": 1.2510805714946942, + "grad_norm": 0.5385059712319659, + "learning_rate": 6.495783315398159e-05, + "loss": 11.9627, + "step": 22975 + }, + { + "epoch": 1.2511350254912772, + "grad_norm": 0.5762942386677303, + "learning_rate": 6.494957417981291e-05, + "loss": 12.0554, + "step": 22976 + }, + { + "epoch": 1.2511894794878602, + "grad_norm": 0.530072852836688, + "learning_rate": 6.49413154782022e-05, + "loss": 11.8623, + "step": 22977 + }, + { + "epoch": 1.2512439334844432, + "grad_norm": 0.5243166244014196, + "learning_rate": 6.493305704921371e-05, + "loss": 11.972, + "step": 22978 + }, + { + "epoch": 1.2512983874810262, + "grad_norm": 0.5025990193059422, + "learning_rate": 6.49247988929117e-05, + "loss": 11.9838, + "step": 22979 + }, + { + "epoch": 1.2513528414776092, + "grad_norm": 0.5799394634592401, + "learning_rate": 6.491654100936029e-05, + "loss": 12.1679, + "step": 22980 + }, + { + "epoch": 1.2514072954741922, + "grad_norm": 0.5624798675759112, + "learning_rate": 6.490828339862372e-05, + "loss": 11.9008, + "step": 22981 + }, + { + "epoch": 1.2514617494707752, + "grad_norm": 0.5181483739429259, + "learning_rate": 6.490002606076625e-05, + "loss": 11.9532, + "step": 22982 + }, + { + "epoch": 1.2515162034673581, + "grad_norm": 0.6048615842854305, + "learning_rate": 6.489176899585205e-05, + "loss": 12.0913, + "step": 22983 + }, + { + "epoch": 1.2515706574639411, + "grad_norm": 0.5168002705232881, + "learning_rate": 6.488351220394536e-05, + "loss": 11.9505, + "step": 22984 + }, + { + "epoch": 1.2516251114605241, + "grad_norm": 0.5534263194047239, + "learning_rate": 6.487525568511036e-05, + "loss": 11.9204, + "step": 22985 + }, + { + "epoch": 1.2516795654571073, + "grad_norm": 0.5006750619220163, + "learning_rate": 6.486699943941126e-05, + "loss": 11.9048, + "step": 22986 + }, + { + "epoch": 1.2517340194536903, + "grad_norm": 0.5385279523589465, + "learning_rate": 6.485874346691227e-05, + "loss": 11.9173, + "step": 22987 + }, + { + "epoch": 1.2517884734502733, + "grad_norm": 0.5664077290372833, + "learning_rate": 6.485048776767754e-05, + "loss": 11.9994, + "step": 22988 + }, + { + "epoch": 1.2518429274468563, + "grad_norm": 0.6190079342912472, + "learning_rate": 6.48422323417714e-05, + "loss": 11.8936, + "step": 22989 + }, + { + "epoch": 1.2518973814434393, + "grad_norm": 0.5514339130229768, + "learning_rate": 6.483397718925792e-05, + "loss": 11.9804, + "step": 22990 + }, + { + "epoch": 1.2519518354400223, + "grad_norm": 0.5127695023833323, + "learning_rate": 6.482572231020132e-05, + "loss": 11.8427, + "step": 22991 + }, + { + "epoch": 1.2520062894366053, + "grad_norm": 0.5620487969975653, + "learning_rate": 6.481746770466577e-05, + "loss": 11.9001, + "step": 22992 + }, + { + "epoch": 1.2520607434331883, + "grad_norm": 0.49963670589846776, + "learning_rate": 6.480921337271553e-05, + "loss": 11.8697, + "step": 22993 + }, + { + "epoch": 1.2521151974297713, + "grad_norm": 0.5557818379722875, + "learning_rate": 6.480095931441475e-05, + "loss": 11.8505, + "step": 22994 + }, + { + "epoch": 1.2521696514263545, + "grad_norm": 0.5568201837159433, + "learning_rate": 6.47927055298276e-05, + "loss": 12.027, + "step": 22995 + }, + { + "epoch": 1.2522241054229375, + "grad_norm": 0.5674997406215048, + "learning_rate": 6.478445201901827e-05, + "loss": 12.0055, + "step": 22996 + }, + { + "epoch": 1.2522785594195205, + "grad_norm": 0.5519263533010362, + "learning_rate": 6.477619878205095e-05, + "loss": 11.9743, + "step": 22997 + }, + { + "epoch": 1.2523330134161035, + "grad_norm": 0.49660196651081107, + "learning_rate": 6.476794581898983e-05, + "loss": 11.9608, + "step": 22998 + }, + { + "epoch": 1.2523874674126865, + "grad_norm": 0.5599330299498402, + "learning_rate": 6.475969312989909e-05, + "loss": 11.9485, + "step": 22999 + }, + { + "epoch": 1.2524419214092695, + "grad_norm": 0.5344015870015618, + "learning_rate": 6.475144071484285e-05, + "loss": 11.8989, + "step": 23000 + }, + { + "epoch": 1.2524963754058525, + "grad_norm": 0.5197967280637122, + "learning_rate": 6.474318857388532e-05, + "loss": 11.8861, + "step": 23001 + }, + { + "epoch": 1.2525508294024355, + "grad_norm": 0.5200498758350924, + "learning_rate": 6.473493670709068e-05, + "loss": 11.8961, + "step": 23002 + }, + { + "epoch": 1.2526052833990184, + "grad_norm": 0.5206830469370257, + "learning_rate": 6.472668511452307e-05, + "loss": 11.9804, + "step": 23003 + }, + { + "epoch": 1.2526597373956014, + "grad_norm": 0.5131759205830891, + "learning_rate": 6.471843379624669e-05, + "loss": 11.9574, + "step": 23004 + }, + { + "epoch": 1.2527141913921844, + "grad_norm": 0.513270765710931, + "learning_rate": 6.471018275232568e-05, + "loss": 11.9623, + "step": 23005 + }, + { + "epoch": 1.2527686453887674, + "grad_norm": 0.5124940921030762, + "learning_rate": 6.47019319828242e-05, + "loss": 11.8931, + "step": 23006 + }, + { + "epoch": 1.2528230993853504, + "grad_norm": 0.5672971005079075, + "learning_rate": 6.469368148780641e-05, + "loss": 11.9584, + "step": 23007 + }, + { + "epoch": 1.2528775533819334, + "grad_norm": 0.6092767991292602, + "learning_rate": 6.468543126733651e-05, + "loss": 12.1172, + "step": 23008 + }, + { + "epoch": 1.2529320073785164, + "grad_norm": 0.5113985421572147, + "learning_rate": 6.46771813214786e-05, + "loss": 11.9211, + "step": 23009 + }, + { + "epoch": 1.2529864613750996, + "grad_norm": 0.5015182630609952, + "learning_rate": 6.466893165029685e-05, + "loss": 11.9121, + "step": 23010 + }, + { + "epoch": 1.2530409153716826, + "grad_norm": 0.5314409037250363, + "learning_rate": 6.466068225385542e-05, + "loss": 11.8639, + "step": 23011 + }, + { + "epoch": 1.2530953693682656, + "grad_norm": 0.545839570387426, + "learning_rate": 6.465243313221842e-05, + "loss": 11.9892, + "step": 23012 + }, + { + "epoch": 1.2531498233648486, + "grad_norm": 0.4882250111572915, + "learning_rate": 6.464418428545006e-05, + "loss": 11.9823, + "step": 23013 + }, + { + "epoch": 1.2532042773614316, + "grad_norm": 0.5030383229180341, + "learning_rate": 6.463593571361441e-05, + "loss": 11.9142, + "step": 23014 + }, + { + "epoch": 1.2532587313580146, + "grad_norm": 0.5949180837371855, + "learning_rate": 6.46276874167757e-05, + "loss": 11.9972, + "step": 23015 + }, + { + "epoch": 1.2533131853545976, + "grad_norm": 0.5167607919930672, + "learning_rate": 6.4619439394998e-05, + "loss": 11.7229, + "step": 23016 + }, + { + "epoch": 1.2533676393511806, + "grad_norm": 0.5033872352171551, + "learning_rate": 6.461119164834548e-05, + "loss": 11.8811, + "step": 23017 + }, + { + "epoch": 1.2534220933477638, + "grad_norm": 0.5670707557320989, + "learning_rate": 6.460294417688227e-05, + "loss": 12.1498, + "step": 23018 + }, + { + "epoch": 1.2534765473443468, + "grad_norm": 0.5405924893289286, + "learning_rate": 6.459469698067253e-05, + "loss": 11.9884, + "step": 23019 + }, + { + "epoch": 1.2535310013409298, + "grad_norm": 0.604217689453398, + "learning_rate": 6.458645005978033e-05, + "loss": 12.097, + "step": 23020 + }, + { + "epoch": 1.2535854553375128, + "grad_norm": 0.49637233258769686, + "learning_rate": 6.457820341426985e-05, + "loss": 11.8859, + "step": 23021 + }, + { + "epoch": 1.2536399093340957, + "grad_norm": 0.5607773866657347, + "learning_rate": 6.456995704420518e-05, + "loss": 12.0768, + "step": 23022 + }, + { + "epoch": 1.2536943633306787, + "grad_norm": 0.5030846259925406, + "learning_rate": 6.456171094965049e-05, + "loss": 11.9584, + "step": 23023 + }, + { + "epoch": 1.2537488173272617, + "grad_norm": 0.571124339836725, + "learning_rate": 6.455346513066985e-05, + "loss": 11.9904, + "step": 23024 + }, + { + "epoch": 1.2538032713238447, + "grad_norm": 0.515989313493274, + "learning_rate": 6.454521958732743e-05, + "loss": 12.0152, + "step": 23025 + }, + { + "epoch": 1.2538577253204277, + "grad_norm": 0.5371035254206465, + "learning_rate": 6.45369743196873e-05, + "loss": 11.8703, + "step": 23026 + }, + { + "epoch": 1.2539121793170107, + "grad_norm": 0.5229543350872914, + "learning_rate": 6.452872932781363e-05, + "loss": 11.9173, + "step": 23027 + }, + { + "epoch": 1.2539666333135937, + "grad_norm": 0.5932582677642733, + "learning_rate": 6.45204846117705e-05, + "loss": 11.8756, + "step": 23028 + }, + { + "epoch": 1.2540210873101767, + "grad_norm": 0.5891383987686949, + "learning_rate": 6.451224017162209e-05, + "loss": 11.965, + "step": 23029 + }, + { + "epoch": 1.2540755413067597, + "grad_norm": 0.4903832737201092, + "learning_rate": 6.450399600743238e-05, + "loss": 11.9513, + "step": 23030 + }, + { + "epoch": 1.2541299953033427, + "grad_norm": 0.5674561792441308, + "learning_rate": 6.449575211926556e-05, + "loss": 11.8607, + "step": 23031 + }, + { + "epoch": 1.2541844492999257, + "grad_norm": 0.5579231146428775, + "learning_rate": 6.448750850718575e-05, + "loss": 12.0236, + "step": 23032 + }, + { + "epoch": 1.254238903296509, + "grad_norm": 0.5181541147940405, + "learning_rate": 6.4479265171257e-05, + "loss": 11.8999, + "step": 23033 + }, + { + "epoch": 1.2542933572930919, + "grad_norm": 0.5489469574332925, + "learning_rate": 6.447102211154346e-05, + "loss": 11.9517, + "step": 23034 + }, + { + "epoch": 1.2543478112896749, + "grad_norm": 0.5369419671946504, + "learning_rate": 6.44627793281092e-05, + "loss": 12.0059, + "step": 23035 + }, + { + "epoch": 1.2544022652862579, + "grad_norm": 0.5083090277377812, + "learning_rate": 6.445453682101835e-05, + "loss": 12.0114, + "step": 23036 + }, + { + "epoch": 1.2544567192828409, + "grad_norm": 0.506253196690573, + "learning_rate": 6.444629459033493e-05, + "loss": 12.0002, + "step": 23037 + }, + { + "epoch": 1.2545111732794239, + "grad_norm": 0.5636252841187461, + "learning_rate": 6.443805263612313e-05, + "loss": 12.0145, + "step": 23038 + }, + { + "epoch": 1.2545656272760068, + "grad_norm": 0.5842933617248807, + "learning_rate": 6.442981095844702e-05, + "loss": 11.812, + "step": 23039 + }, + { + "epoch": 1.2546200812725898, + "grad_norm": 0.5633163761859511, + "learning_rate": 6.442156955737064e-05, + "loss": 11.9461, + "step": 23040 + }, + { + "epoch": 1.2546745352691728, + "grad_norm": 0.5499040542405221, + "learning_rate": 6.441332843295807e-05, + "loss": 12.1107, + "step": 23041 + }, + { + "epoch": 1.254728989265756, + "grad_norm": 0.5445402995069029, + "learning_rate": 6.440508758527344e-05, + "loss": 11.9543, + "step": 23042 + }, + { + "epoch": 1.254783443262339, + "grad_norm": 0.5453971556779926, + "learning_rate": 6.439684701438085e-05, + "loss": 12.0705, + "step": 23043 + }, + { + "epoch": 1.254837897258922, + "grad_norm": 0.5205463045709341, + "learning_rate": 6.438860672034433e-05, + "loss": 11.8273, + "step": 23044 + }, + { + "epoch": 1.254892351255505, + "grad_norm": 0.5060498204368865, + "learning_rate": 6.438036670322795e-05, + "loss": 11.8163, + "step": 23045 + }, + { + "epoch": 1.254946805252088, + "grad_norm": 0.5179264361560095, + "learning_rate": 6.437212696309585e-05, + "loss": 11.8962, + "step": 23046 + }, + { + "epoch": 1.255001259248671, + "grad_norm": 0.5623083269078556, + "learning_rate": 6.436388750001205e-05, + "loss": 12.0951, + "step": 23047 + }, + { + "epoch": 1.255055713245254, + "grad_norm": 0.523609816078576, + "learning_rate": 6.435564831404061e-05, + "loss": 11.7966, + "step": 23048 + }, + { + "epoch": 1.255110167241837, + "grad_norm": 0.5472175230259042, + "learning_rate": 6.434740940524569e-05, + "loss": 11.8966, + "step": 23049 + }, + { + "epoch": 1.25516462123842, + "grad_norm": 0.5651524739962391, + "learning_rate": 6.433917077369127e-05, + "loss": 11.9129, + "step": 23050 + }, + { + "epoch": 1.255219075235003, + "grad_norm": 0.6087141538496955, + "learning_rate": 6.433093241944141e-05, + "loss": 11.8583, + "step": 23051 + }, + { + "epoch": 1.255273529231586, + "grad_norm": 0.5504794378725869, + "learning_rate": 6.432269434256021e-05, + "loss": 11.9824, + "step": 23052 + }, + { + "epoch": 1.255327983228169, + "grad_norm": 0.5703693768570628, + "learning_rate": 6.431445654311173e-05, + "loss": 11.7509, + "step": 23053 + }, + { + "epoch": 1.255382437224752, + "grad_norm": 0.5131923723032333, + "learning_rate": 6.430621902116e-05, + "loss": 12.0155, + "step": 23054 + }, + { + "epoch": 1.255436891221335, + "grad_norm": 0.5824598217731434, + "learning_rate": 6.429798177676913e-05, + "loss": 12.092, + "step": 23055 + }, + { + "epoch": 1.2554913452179182, + "grad_norm": 0.5023407872327654, + "learning_rate": 6.428974481000312e-05, + "loss": 11.9702, + "step": 23056 + }, + { + "epoch": 1.2555457992145012, + "grad_norm": 0.5230683133346614, + "learning_rate": 6.428150812092606e-05, + "loss": 12.0129, + "step": 23057 + }, + { + "epoch": 1.2556002532110841, + "grad_norm": 0.5839902902678337, + "learning_rate": 6.427327170960197e-05, + "loss": 11.9333, + "step": 23058 + }, + { + "epoch": 1.2556547072076671, + "grad_norm": 0.5607862559971485, + "learning_rate": 6.426503557609494e-05, + "loss": 11.8152, + "step": 23059 + }, + { + "epoch": 1.2557091612042501, + "grad_norm": 0.5186679038061481, + "learning_rate": 6.425679972046895e-05, + "loss": 11.9656, + "step": 23060 + }, + { + "epoch": 1.2557636152008331, + "grad_norm": 0.5383825335095569, + "learning_rate": 6.424856414278809e-05, + "loss": 11.9613, + "step": 23061 + }, + { + "epoch": 1.2558180691974161, + "grad_norm": 0.5256951533910921, + "learning_rate": 6.424032884311639e-05, + "loss": 11.9378, + "step": 23062 + }, + { + "epoch": 1.2558725231939991, + "grad_norm": 0.5566260473565764, + "learning_rate": 6.423209382151787e-05, + "loss": 11.9813, + "step": 23063 + }, + { + "epoch": 1.255926977190582, + "grad_norm": 0.5047489211992726, + "learning_rate": 6.422385907805661e-05, + "loss": 11.7673, + "step": 23064 + }, + { + "epoch": 1.2559814311871653, + "grad_norm": 0.5319156804588298, + "learning_rate": 6.421562461279662e-05, + "loss": 11.9302, + "step": 23065 + }, + { + "epoch": 1.2560358851837483, + "grad_norm": 0.5469902777045331, + "learning_rate": 6.420739042580192e-05, + "loss": 11.8669, + "step": 23066 + }, + { + "epoch": 1.2560903391803313, + "grad_norm": 0.5895496587336508, + "learning_rate": 6.419915651713657e-05, + "loss": 11.8581, + "step": 23067 + }, + { + "epoch": 1.2561447931769143, + "grad_norm": 0.4797947992699836, + "learning_rate": 6.419092288686458e-05, + "loss": 11.9102, + "step": 23068 + }, + { + "epoch": 1.2561992471734973, + "grad_norm": 0.5577967038474159, + "learning_rate": 6.418268953505e-05, + "loss": 11.9772, + "step": 23069 + }, + { + "epoch": 1.2562537011700803, + "grad_norm": 0.5434598292455445, + "learning_rate": 6.417445646175682e-05, + "loss": 11.9336, + "step": 23070 + }, + { + "epoch": 1.2563081551666633, + "grad_norm": 0.5360937510040483, + "learning_rate": 6.416622366704906e-05, + "loss": 11.8435, + "step": 23071 + }, + { + "epoch": 1.2563626091632463, + "grad_norm": 0.5290102951434025, + "learning_rate": 6.415799115099075e-05, + "loss": 12.0287, + "step": 23072 + }, + { + "epoch": 1.2564170631598293, + "grad_norm": 0.4780939803977824, + "learning_rate": 6.414975891364591e-05, + "loss": 11.8835, + "step": 23073 + }, + { + "epoch": 1.2564715171564123, + "grad_norm": 0.5697258236466961, + "learning_rate": 6.414152695507855e-05, + "loss": 11.9116, + "step": 23074 + }, + { + "epoch": 1.2565259711529952, + "grad_norm": 0.546484044969463, + "learning_rate": 6.413329527535272e-05, + "loss": 11.888, + "step": 23075 + }, + { + "epoch": 1.2565804251495782, + "grad_norm": 0.6539474260009979, + "learning_rate": 6.412506387453239e-05, + "loss": 12.0349, + "step": 23076 + }, + { + "epoch": 1.2566348791461612, + "grad_norm": 0.5893862166396721, + "learning_rate": 6.411683275268157e-05, + "loss": 11.8892, + "step": 23077 + }, + { + "epoch": 1.2566893331427442, + "grad_norm": 0.5086572468603361, + "learning_rate": 6.410860190986428e-05, + "loss": 11.8784, + "step": 23078 + }, + { + "epoch": 1.2567437871393274, + "grad_norm": 0.5472500138255146, + "learning_rate": 6.410037134614455e-05, + "loss": 11.8455, + "step": 23079 + }, + { + "epoch": 1.2567982411359104, + "grad_norm": 0.544630527948792, + "learning_rate": 6.409214106158633e-05, + "loss": 12.0407, + "step": 23080 + }, + { + "epoch": 1.2568526951324934, + "grad_norm": 0.5492984380849422, + "learning_rate": 6.408391105625365e-05, + "loss": 11.9997, + "step": 23081 + }, + { + "epoch": 1.2569071491290764, + "grad_norm": 0.5511564523785917, + "learning_rate": 6.407568133021048e-05, + "loss": 12.0376, + "step": 23082 + }, + { + "epoch": 1.2569616031256594, + "grad_norm": 0.6279874531789499, + "learning_rate": 6.406745188352085e-05, + "loss": 11.9121, + "step": 23083 + }, + { + "epoch": 1.2570160571222424, + "grad_norm": 0.6220941261245012, + "learning_rate": 6.405922271624874e-05, + "loss": 11.9591, + "step": 23084 + }, + { + "epoch": 1.2570705111188254, + "grad_norm": 0.5370646846868051, + "learning_rate": 6.405099382845814e-05, + "loss": 11.8795, + "step": 23085 + }, + { + "epoch": 1.2571249651154084, + "grad_norm": 0.5544988273277008, + "learning_rate": 6.404276522021301e-05, + "loss": 12.0876, + "step": 23086 + }, + { + "epoch": 1.2571794191119914, + "grad_norm": 0.6192562302073945, + "learning_rate": 6.40345368915774e-05, + "loss": 11.9662, + "step": 23087 + }, + { + "epoch": 1.2572338731085746, + "grad_norm": 0.575416522526096, + "learning_rate": 6.402630884261526e-05, + "loss": 11.9524, + "step": 23088 + }, + { + "epoch": 1.2572883271051576, + "grad_norm": 0.5161518379421237, + "learning_rate": 6.401808107339062e-05, + "loss": 11.9907, + "step": 23089 + }, + { + "epoch": 1.2573427811017406, + "grad_norm": 0.5524182548409718, + "learning_rate": 6.400985358396733e-05, + "loss": 11.9012, + "step": 23090 + }, + { + "epoch": 1.2573972350983236, + "grad_norm": 0.5543905883301882, + "learning_rate": 6.40016263744095e-05, + "loss": 11.9165, + "step": 23091 + }, + { + "epoch": 1.2574516890949066, + "grad_norm": 0.5261536283933002, + "learning_rate": 6.399339944478107e-05, + "loss": 11.7749, + "step": 23092 + }, + { + "epoch": 1.2575061430914896, + "grad_norm": 0.5374076723199579, + "learning_rate": 6.398517279514598e-05, + "loss": 11.9491, + "step": 23093 + }, + { + "epoch": 1.2575605970880726, + "grad_norm": 0.5399215177823272, + "learning_rate": 6.397694642556824e-05, + "loss": 11.9369, + "step": 23094 + }, + { + "epoch": 1.2576150510846555, + "grad_norm": 0.5332425503379244, + "learning_rate": 6.396872033611179e-05, + "loss": 11.9261, + "step": 23095 + }, + { + "epoch": 1.2576695050812385, + "grad_norm": 0.5709318959269863, + "learning_rate": 6.396049452684062e-05, + "loss": 11.9361, + "step": 23096 + }, + { + "epoch": 1.2577239590778215, + "grad_norm": 0.6037910756924191, + "learning_rate": 6.395226899781868e-05, + "loss": 11.9538, + "step": 23097 + }, + { + "epoch": 1.2577784130744045, + "grad_norm": 0.5358524572601534, + "learning_rate": 6.394404374910996e-05, + "loss": 11.9424, + "step": 23098 + }, + { + "epoch": 1.2578328670709875, + "grad_norm": 0.6004955658989394, + "learning_rate": 6.393581878077844e-05, + "loss": 11.9896, + "step": 23099 + }, + { + "epoch": 1.2578873210675705, + "grad_norm": 0.5485408023650694, + "learning_rate": 6.392759409288799e-05, + "loss": 11.9138, + "step": 23100 + }, + { + "epoch": 1.2579417750641535, + "grad_norm": 0.5666292944739462, + "learning_rate": 6.391936968550261e-05, + "loss": 11.931, + "step": 23101 + }, + { + "epoch": 1.2579962290607365, + "grad_norm": 0.5597404353871231, + "learning_rate": 6.391114555868627e-05, + "loss": 11.9859, + "step": 23102 + }, + { + "epoch": 1.2580506830573197, + "grad_norm": 0.5670522544599553, + "learning_rate": 6.390292171250291e-05, + "loss": 11.8759, + "step": 23103 + }, + { + "epoch": 1.2581051370539027, + "grad_norm": 0.5995999828066928, + "learning_rate": 6.389469814701651e-05, + "loss": 11.9792, + "step": 23104 + }, + { + "epoch": 1.2581595910504857, + "grad_norm": 0.5782929205834054, + "learning_rate": 6.388647486229097e-05, + "loss": 11.9888, + "step": 23105 + }, + { + "epoch": 1.2582140450470687, + "grad_norm": 0.5484559404524344, + "learning_rate": 6.387825185839026e-05, + "loss": 11.8832, + "step": 23106 + }, + { + "epoch": 1.2582684990436517, + "grad_norm": 0.598937853919029, + "learning_rate": 6.387002913537834e-05, + "loss": 11.9739, + "step": 23107 + }, + { + "epoch": 1.2583229530402347, + "grad_norm": 0.4995317441061127, + "learning_rate": 6.38618066933191e-05, + "loss": 11.9355, + "step": 23108 + }, + { + "epoch": 1.2583774070368177, + "grad_norm": 0.5618095843570149, + "learning_rate": 6.385358453227657e-05, + "loss": 12.1109, + "step": 23109 + }, + { + "epoch": 1.2584318610334007, + "grad_norm": 0.5526007604359351, + "learning_rate": 6.384536265231457e-05, + "loss": 11.8914, + "step": 23110 + }, + { + "epoch": 1.2584863150299836, + "grad_norm": 0.5531099195759396, + "learning_rate": 6.383714105349712e-05, + "loss": 12.0769, + "step": 23111 + }, + { + "epoch": 1.2585407690265669, + "grad_norm": 0.4985342167243614, + "learning_rate": 6.382891973588809e-05, + "loss": 11.9721, + "step": 23112 + }, + { + "epoch": 1.2585952230231499, + "grad_norm": 0.556810224128595, + "learning_rate": 6.382069869955149e-05, + "loss": 11.8717, + "step": 23113 + }, + { + "epoch": 1.2586496770197328, + "grad_norm": 0.6270485080778043, + "learning_rate": 6.381247794455118e-05, + "loss": 11.9444, + "step": 23114 + }, + { + "epoch": 1.2587041310163158, + "grad_norm": 0.5523639002205809, + "learning_rate": 6.380425747095111e-05, + "loss": 11.9966, + "step": 23115 + }, + { + "epoch": 1.2587585850128988, + "grad_norm": 0.5353121475579924, + "learning_rate": 6.379603727881522e-05, + "loss": 11.8434, + "step": 23116 + }, + { + "epoch": 1.2588130390094818, + "grad_norm": 0.5446101326768436, + "learning_rate": 6.37878173682074e-05, + "loss": 11.8856, + "step": 23117 + }, + { + "epoch": 1.2588674930060648, + "grad_norm": 0.6029760601144726, + "learning_rate": 6.37795977391916e-05, + "loss": 12.0094, + "step": 23118 + }, + { + "epoch": 1.2589219470026478, + "grad_norm": 0.5865087141597802, + "learning_rate": 6.37713783918317e-05, + "loss": 11.9017, + "step": 23119 + }, + { + "epoch": 1.2589764009992308, + "grad_norm": 0.5477665226348177, + "learning_rate": 6.376315932619169e-05, + "loss": 11.9262, + "step": 23120 + }, + { + "epoch": 1.2590308549958138, + "grad_norm": 0.5282841499264371, + "learning_rate": 6.37549405423354e-05, + "loss": 11.9528, + "step": 23121 + }, + { + "epoch": 1.2590853089923968, + "grad_norm": 0.5412124065498328, + "learning_rate": 6.374672204032675e-05, + "loss": 12.0454, + "step": 23122 + }, + { + "epoch": 1.2591397629889798, + "grad_norm": 0.5553462550966848, + "learning_rate": 6.373850382022965e-05, + "loss": 11.959, + "step": 23123 + }, + { + "epoch": 1.2591942169855628, + "grad_norm": 0.6113007116293697, + "learning_rate": 6.373028588210808e-05, + "loss": 12.0259, + "step": 23124 + }, + { + "epoch": 1.2592486709821458, + "grad_norm": 0.5830883374819321, + "learning_rate": 6.372206822602586e-05, + "loss": 11.8463, + "step": 23125 + }, + { + "epoch": 1.259303124978729, + "grad_norm": 0.5504992518560115, + "learning_rate": 6.371385085204693e-05, + "loss": 11.9425, + "step": 23126 + }, + { + "epoch": 1.259357578975312, + "grad_norm": 0.576981204386988, + "learning_rate": 6.370563376023517e-05, + "loss": 11.8694, + "step": 23127 + }, + { + "epoch": 1.259412032971895, + "grad_norm": 0.5528066597285911, + "learning_rate": 6.36974169506545e-05, + "loss": 11.9209, + "step": 23128 + }, + { + "epoch": 1.259466486968478, + "grad_norm": 0.5578907904421315, + "learning_rate": 6.36892004233688e-05, + "loss": 12.0281, + "step": 23129 + }, + { + "epoch": 1.259520940965061, + "grad_norm": 0.49908705921764723, + "learning_rate": 6.368098417844199e-05, + "loss": 11.9853, + "step": 23130 + }, + { + "epoch": 1.259575394961644, + "grad_norm": 0.6456895948428514, + "learning_rate": 6.367276821593791e-05, + "loss": 11.9007, + "step": 23131 + }, + { + "epoch": 1.259629848958227, + "grad_norm": 0.5393688064228819, + "learning_rate": 6.366455253592048e-05, + "loss": 11.8418, + "step": 23132 + }, + { + "epoch": 1.25968430295481, + "grad_norm": 0.5367421126036882, + "learning_rate": 6.365633713845358e-05, + "loss": 11.9011, + "step": 23133 + }, + { + "epoch": 1.259738756951393, + "grad_norm": 0.5310563904804384, + "learning_rate": 6.364812202360111e-05, + "loss": 11.8744, + "step": 23134 + }, + { + "epoch": 1.2597932109479761, + "grad_norm": 0.545635062163532, + "learning_rate": 6.363990719142691e-05, + "loss": 12.103, + "step": 23135 + }, + { + "epoch": 1.2598476649445591, + "grad_norm": 0.529964804335486, + "learning_rate": 6.363169264199491e-05, + "loss": 12.0641, + "step": 23136 + }, + { + "epoch": 1.2599021189411421, + "grad_norm": 0.5454530713208232, + "learning_rate": 6.362347837536898e-05, + "loss": 11.8868, + "step": 23137 + }, + { + "epoch": 1.2599565729377251, + "grad_norm": 0.5138547367416849, + "learning_rate": 6.361526439161297e-05, + "loss": 11.9207, + "step": 23138 + }, + { + "epoch": 1.260011026934308, + "grad_norm": 0.5379980430562593, + "learning_rate": 6.360705069079076e-05, + "loss": 11.9224, + "step": 23139 + }, + { + "epoch": 1.260065480930891, + "grad_norm": 0.5570539254887212, + "learning_rate": 6.359883727296625e-05, + "loss": 12.0006, + "step": 23140 + }, + { + "epoch": 1.260119934927474, + "grad_norm": 0.5660768674787063, + "learning_rate": 6.359062413820327e-05, + "loss": 12.0522, + "step": 23141 + }, + { + "epoch": 1.260174388924057, + "grad_norm": 0.49642938404356757, + "learning_rate": 6.35824112865657e-05, + "loss": 11.9169, + "step": 23142 + }, + { + "epoch": 1.26022884292064, + "grad_norm": 0.5354280412353992, + "learning_rate": 6.357419871811741e-05, + "loss": 11.9543, + "step": 23143 + }, + { + "epoch": 1.260283296917223, + "grad_norm": 0.5997684021188936, + "learning_rate": 6.356598643292225e-05, + "loss": 11.9438, + "step": 23144 + }, + { + "epoch": 1.260337750913806, + "grad_norm": 0.5336866194677516, + "learning_rate": 6.355777443104409e-05, + "loss": 11.9164, + "step": 23145 + }, + { + "epoch": 1.260392204910389, + "grad_norm": 0.5803367431008287, + "learning_rate": 6.354956271254678e-05, + "loss": 12.1357, + "step": 23146 + }, + { + "epoch": 1.260446658906972, + "grad_norm": 0.5217981905140658, + "learning_rate": 6.35413512774942e-05, + "loss": 11.9656, + "step": 23147 + }, + { + "epoch": 1.260501112903555, + "grad_norm": 0.5283137024171627, + "learning_rate": 6.353314012595018e-05, + "loss": 11.9016, + "step": 23148 + }, + { + "epoch": 1.2605555669001383, + "grad_norm": 0.47655219752417416, + "learning_rate": 6.352492925797859e-05, + "loss": 11.9532, + "step": 23149 + }, + { + "epoch": 1.2606100208967213, + "grad_norm": 0.517452982016707, + "learning_rate": 6.351671867364327e-05, + "loss": 11.9848, + "step": 23150 + }, + { + "epoch": 1.2606644748933042, + "grad_norm": 0.518384238667633, + "learning_rate": 6.350850837300805e-05, + "loss": 11.9969, + "step": 23151 + }, + { + "epoch": 1.2607189288898872, + "grad_norm": 0.5072064806068445, + "learning_rate": 6.35002983561368e-05, + "loss": 12.031, + "step": 23152 + }, + { + "epoch": 1.2607733828864702, + "grad_norm": 0.49020800588185137, + "learning_rate": 6.349208862309334e-05, + "loss": 11.8534, + "step": 23153 + }, + { + "epoch": 1.2608278368830532, + "grad_norm": 0.46912186730601757, + "learning_rate": 6.348387917394152e-05, + "loss": 11.8691, + "step": 23154 + }, + { + "epoch": 1.2608822908796362, + "grad_norm": 0.5524909289000592, + "learning_rate": 6.347567000874519e-05, + "loss": 12.024, + "step": 23155 + }, + { + "epoch": 1.2609367448762192, + "grad_norm": 0.5420559822279098, + "learning_rate": 6.346746112756816e-05, + "loss": 11.7939, + "step": 23156 + }, + { + "epoch": 1.2609911988728022, + "grad_norm": 0.6005430952423993, + "learning_rate": 6.345925253047426e-05, + "loss": 11.8845, + "step": 23157 + }, + { + "epoch": 1.2610456528693854, + "grad_norm": 0.584496432963144, + "learning_rate": 6.345104421752737e-05, + "loss": 12.0256, + "step": 23158 + }, + { + "epoch": 1.2611001068659684, + "grad_norm": 0.5289213329822015, + "learning_rate": 6.344283618879128e-05, + "loss": 11.8286, + "step": 23159 + }, + { + "epoch": 1.2611545608625514, + "grad_norm": 0.5215239341627839, + "learning_rate": 6.343462844432988e-05, + "loss": 11.9474, + "step": 23160 + }, + { + "epoch": 1.2612090148591344, + "grad_norm": 0.5349334059457926, + "learning_rate": 6.342642098420688e-05, + "loss": 12.0584, + "step": 23161 + }, + { + "epoch": 1.2612634688557174, + "grad_norm": 0.5446407112239008, + "learning_rate": 6.341821380848618e-05, + "loss": 12.0434, + "step": 23162 + }, + { + "epoch": 1.2613179228523004, + "grad_norm": 0.564088965609529, + "learning_rate": 6.341000691723158e-05, + "loss": 11.9337, + "step": 23163 + }, + { + "epoch": 1.2613723768488834, + "grad_norm": 0.5340390154873355, + "learning_rate": 6.340180031050691e-05, + "loss": 11.8474, + "step": 23164 + }, + { + "epoch": 1.2614268308454664, + "grad_norm": 0.47720631400330055, + "learning_rate": 6.339359398837596e-05, + "loss": 11.8776, + "step": 23165 + }, + { + "epoch": 1.2614812848420494, + "grad_norm": 0.5736225944813493, + "learning_rate": 6.338538795090258e-05, + "loss": 12.0683, + "step": 23166 + }, + { + "epoch": 1.2615357388386323, + "grad_norm": 0.5591859961628993, + "learning_rate": 6.337718219815057e-05, + "loss": 12.0328, + "step": 23167 + }, + { + "epoch": 1.2615901928352153, + "grad_norm": 0.539813660015235, + "learning_rate": 6.336897673018369e-05, + "loss": 12.006, + "step": 23168 + }, + { + "epoch": 1.2616446468317983, + "grad_norm": 0.47370333657589964, + "learning_rate": 6.336077154706581e-05, + "loss": 11.9422, + "step": 23169 + }, + { + "epoch": 1.2616991008283813, + "grad_norm": 0.5175435505840981, + "learning_rate": 6.335256664886078e-05, + "loss": 11.8468, + "step": 23170 + }, + { + "epoch": 1.2617535548249643, + "grad_norm": 0.5364831212928655, + "learning_rate": 6.334436203563228e-05, + "loss": 11.9393, + "step": 23171 + }, + { + "epoch": 1.2618080088215473, + "grad_norm": 0.5821503420255064, + "learning_rate": 6.333615770744414e-05, + "loss": 11.9126, + "step": 23172 + }, + { + "epoch": 1.2618624628181305, + "grad_norm": 0.5733777960952744, + "learning_rate": 6.332795366436024e-05, + "loss": 12.055, + "step": 23173 + }, + { + "epoch": 1.2619169168147135, + "grad_norm": 0.5389332093998159, + "learning_rate": 6.33197499064443e-05, + "loss": 11.9563, + "step": 23174 + }, + { + "epoch": 1.2619713708112965, + "grad_norm": 0.5275144965712444, + "learning_rate": 6.331154643376012e-05, + "loss": 11.7789, + "step": 23175 + }, + { + "epoch": 1.2620258248078795, + "grad_norm": 0.5537466378450545, + "learning_rate": 6.330334324637153e-05, + "loss": 11.935, + "step": 23176 + }, + { + "epoch": 1.2620802788044625, + "grad_norm": 0.5470867083695272, + "learning_rate": 6.329514034434229e-05, + "loss": 11.9791, + "step": 23177 + }, + { + "epoch": 1.2621347328010455, + "grad_norm": 0.5287713218484725, + "learning_rate": 6.328693772773619e-05, + "loss": 12.1183, + "step": 23178 + }, + { + "epoch": 1.2621891867976285, + "grad_norm": 0.5675339930776699, + "learning_rate": 6.327873539661701e-05, + "loss": 11.777, + "step": 23179 + }, + { + "epoch": 1.2622436407942115, + "grad_norm": 0.5937655479464573, + "learning_rate": 6.327053335104858e-05, + "loss": 11.9364, + "step": 23180 + }, + { + "epoch": 1.2622980947907947, + "grad_norm": 0.5768170937460548, + "learning_rate": 6.326233159109462e-05, + "loss": 11.8478, + "step": 23181 + }, + { + "epoch": 1.2623525487873777, + "grad_norm": 0.5116243289974295, + "learning_rate": 6.325413011681893e-05, + "loss": 11.8969, + "step": 23182 + }, + { + "epoch": 1.2624070027839607, + "grad_norm": 0.5474243620780744, + "learning_rate": 6.32459289282853e-05, + "loss": 11.9277, + "step": 23183 + }, + { + "epoch": 1.2624614567805437, + "grad_norm": 0.5677173261618479, + "learning_rate": 6.323772802555745e-05, + "loss": 11.9387, + "step": 23184 + }, + { + "epoch": 1.2625159107771267, + "grad_norm": 0.5112451267947925, + "learning_rate": 6.32295274086992e-05, + "loss": 11.8689, + "step": 23185 + }, + { + "epoch": 1.2625703647737097, + "grad_norm": 0.512827670829193, + "learning_rate": 6.322132707777433e-05, + "loss": 11.971, + "step": 23186 + }, + { + "epoch": 1.2626248187702926, + "grad_norm": 0.5207663536707516, + "learning_rate": 6.321312703284658e-05, + "loss": 11.8929, + "step": 23187 + }, + { + "epoch": 1.2626792727668756, + "grad_norm": 0.5155169795566987, + "learning_rate": 6.320492727397974e-05, + "loss": 11.9738, + "step": 23188 + }, + { + "epoch": 1.2627337267634586, + "grad_norm": 0.5314580142077816, + "learning_rate": 6.319672780123755e-05, + "loss": 11.9145, + "step": 23189 + }, + { + "epoch": 1.2627881807600416, + "grad_norm": 0.5421368749543253, + "learning_rate": 6.318852861468378e-05, + "loss": 12.0622, + "step": 23190 + }, + { + "epoch": 1.2628426347566246, + "grad_norm": 0.5179067735758253, + "learning_rate": 6.31803297143822e-05, + "loss": 11.9269, + "step": 23191 + }, + { + "epoch": 1.2628970887532076, + "grad_norm": 0.5736250693930064, + "learning_rate": 6.317213110039651e-05, + "loss": 11.7545, + "step": 23192 + }, + { + "epoch": 1.2629515427497906, + "grad_norm": 0.5697848875943401, + "learning_rate": 6.316393277279053e-05, + "loss": 12.0713, + "step": 23193 + }, + { + "epoch": 1.2630059967463736, + "grad_norm": 0.4869568018442, + "learning_rate": 6.315573473162797e-05, + "loss": 11.9277, + "step": 23194 + }, + { + "epoch": 1.2630604507429566, + "grad_norm": 0.5616724562357337, + "learning_rate": 6.314753697697258e-05, + "loss": 12.0319, + "step": 23195 + }, + { + "epoch": 1.2631149047395398, + "grad_norm": 0.5267218874973129, + "learning_rate": 6.313933950888815e-05, + "loss": 12.0089, + "step": 23196 + }, + { + "epoch": 1.2631693587361228, + "grad_norm": 0.5520135585587579, + "learning_rate": 6.31311423274384e-05, + "loss": 12.0212, + "step": 23197 + }, + { + "epoch": 1.2632238127327058, + "grad_norm": 0.5351385629875579, + "learning_rate": 6.312294543268706e-05, + "loss": 11.7936, + "step": 23198 + }, + { + "epoch": 1.2632782667292888, + "grad_norm": 0.5610677188971568, + "learning_rate": 6.31147488246979e-05, + "loss": 11.9184, + "step": 23199 + }, + { + "epoch": 1.2633327207258718, + "grad_norm": 0.5747318834051146, + "learning_rate": 6.310655250353464e-05, + "loss": 11.9075, + "step": 23200 + }, + { + "epoch": 1.2633871747224548, + "grad_norm": 0.6582293748444622, + "learning_rate": 6.3098356469261e-05, + "loss": 12.006, + "step": 23201 + }, + { + "epoch": 1.2634416287190378, + "grad_norm": 0.5305480900275891, + "learning_rate": 6.309016072194071e-05, + "loss": 11.9112, + "step": 23202 + }, + { + "epoch": 1.2634960827156207, + "grad_norm": 0.5357069806269253, + "learning_rate": 6.308196526163755e-05, + "loss": 11.9746, + "step": 23203 + }, + { + "epoch": 1.2635505367122037, + "grad_norm": 0.5747193317278483, + "learning_rate": 6.30737700884152e-05, + "loss": 12.0809, + "step": 23204 + }, + { + "epoch": 1.263604990708787, + "grad_norm": 0.5651868305719783, + "learning_rate": 6.306557520233741e-05, + "loss": 11.8314, + "step": 23205 + }, + { + "epoch": 1.26365944470537, + "grad_norm": 0.5116818528225225, + "learning_rate": 6.305738060346788e-05, + "loss": 11.9867, + "step": 23206 + }, + { + "epoch": 1.263713898701953, + "grad_norm": 0.6191881391424359, + "learning_rate": 6.304918629187037e-05, + "loss": 12.0998, + "step": 23207 + }, + { + "epoch": 1.263768352698536, + "grad_norm": 0.5084801937641094, + "learning_rate": 6.30409922676086e-05, + "loss": 11.8947, + "step": 23208 + }, + { + "epoch": 1.263822806695119, + "grad_norm": 0.5234002530945163, + "learning_rate": 6.303279853074626e-05, + "loss": 11.805, + "step": 23209 + }, + { + "epoch": 1.263877260691702, + "grad_norm": 0.5706221736923324, + "learning_rate": 6.302460508134711e-05, + "loss": 11.9956, + "step": 23210 + }, + { + "epoch": 1.263931714688285, + "grad_norm": 0.5396563639855002, + "learning_rate": 6.30164119194748e-05, + "loss": 11.9397, + "step": 23211 + }, + { + "epoch": 1.263986168684868, + "grad_norm": 0.6354073308334479, + "learning_rate": 6.300821904519308e-05, + "loss": 12.1244, + "step": 23212 + }, + { + "epoch": 1.264040622681451, + "grad_norm": 0.5845475176699747, + "learning_rate": 6.300002645856566e-05, + "loss": 11.9064, + "step": 23213 + }, + { + "epoch": 1.264095076678034, + "grad_norm": 0.512881774459477, + "learning_rate": 6.299183415965622e-05, + "loss": 11.9788, + "step": 23214 + }, + { + "epoch": 1.2641495306746169, + "grad_norm": 0.5672581737919049, + "learning_rate": 6.298364214852849e-05, + "loss": 11.9306, + "step": 23215 + }, + { + "epoch": 1.2642039846711999, + "grad_norm": 0.6185248005041978, + "learning_rate": 6.297545042524617e-05, + "loss": 11.9911, + "step": 23216 + }, + { + "epoch": 1.2642584386677829, + "grad_norm": 0.579008583626923, + "learning_rate": 6.296725898987292e-05, + "loss": 11.9292, + "step": 23217 + }, + { + "epoch": 1.2643128926643659, + "grad_norm": 0.5554296521119939, + "learning_rate": 6.295906784247252e-05, + "loss": 12.0232, + "step": 23218 + }, + { + "epoch": 1.264367346660949, + "grad_norm": 0.6114492731324929, + "learning_rate": 6.295087698310861e-05, + "loss": 11.9058, + "step": 23219 + }, + { + "epoch": 1.264421800657532, + "grad_norm": 0.5274101672933743, + "learning_rate": 6.294268641184493e-05, + "loss": 11.9656, + "step": 23220 + }, + { + "epoch": 1.264476254654115, + "grad_norm": 0.5238667204154219, + "learning_rate": 6.293449612874508e-05, + "loss": 11.8131, + "step": 23221 + }, + { + "epoch": 1.264530708650698, + "grad_norm": 0.5434777512564144, + "learning_rate": 6.292630613387282e-05, + "loss": 11.9128, + "step": 23222 + }, + { + "epoch": 1.264585162647281, + "grad_norm": 0.5432222398151255, + "learning_rate": 6.291811642729182e-05, + "loss": 11.8331, + "step": 23223 + }, + { + "epoch": 1.264639616643864, + "grad_norm": 0.5524449538973034, + "learning_rate": 6.290992700906577e-05, + "loss": 11.972, + "step": 23224 + }, + { + "epoch": 1.264694070640447, + "grad_norm": 0.5445788443924797, + "learning_rate": 6.290173787925835e-05, + "loss": 11.9811, + "step": 23225 + }, + { + "epoch": 1.26474852463703, + "grad_norm": 0.5215192081338217, + "learning_rate": 6.289354903793324e-05, + "loss": 11.9049, + "step": 23226 + }, + { + "epoch": 1.264802978633613, + "grad_norm": 0.5496417411242858, + "learning_rate": 6.28853604851541e-05, + "loss": 11.9841, + "step": 23227 + }, + { + "epoch": 1.2648574326301962, + "grad_norm": 0.5736958780394876, + "learning_rate": 6.287717222098464e-05, + "loss": 12.0088, + "step": 23228 + }, + { + "epoch": 1.2649118866267792, + "grad_norm": 0.5553454283394794, + "learning_rate": 6.286898424548848e-05, + "loss": 11.9605, + "step": 23229 + }, + { + "epoch": 1.2649663406233622, + "grad_norm": 0.49554090269372214, + "learning_rate": 6.286079655872938e-05, + "loss": 11.9359, + "step": 23230 + }, + { + "epoch": 1.2650207946199452, + "grad_norm": 0.5995344148429932, + "learning_rate": 6.285260916077093e-05, + "loss": 11.8937, + "step": 23231 + }, + { + "epoch": 1.2650752486165282, + "grad_norm": 0.5470788013975341, + "learning_rate": 6.284442205167681e-05, + "loss": 12.0121, + "step": 23232 + }, + { + "epoch": 1.2651297026131112, + "grad_norm": 0.5481015507073892, + "learning_rate": 6.283623523151068e-05, + "loss": 11.9989, + "step": 23233 + }, + { + "epoch": 1.2651841566096942, + "grad_norm": 0.6058346904021367, + "learning_rate": 6.282804870033623e-05, + "loss": 11.7442, + "step": 23234 + }, + { + "epoch": 1.2652386106062772, + "grad_norm": 0.5694201142122483, + "learning_rate": 6.281986245821712e-05, + "loss": 12.051, + "step": 23235 + }, + { + "epoch": 1.2652930646028602, + "grad_norm": 0.5331388845783178, + "learning_rate": 6.281167650521699e-05, + "loss": 11.9646, + "step": 23236 + }, + { + "epoch": 1.2653475185994432, + "grad_norm": 0.5898842206056572, + "learning_rate": 6.28034908413995e-05, + "loss": 12.0569, + "step": 23237 + }, + { + "epoch": 1.2654019725960262, + "grad_norm": 0.5033871143616334, + "learning_rate": 6.27953054668283e-05, + "loss": 11.8935, + "step": 23238 + }, + { + "epoch": 1.2654564265926092, + "grad_norm": 0.5025364947010356, + "learning_rate": 6.278712038156704e-05, + "loss": 11.8969, + "step": 23239 + }, + { + "epoch": 1.2655108805891921, + "grad_norm": 0.5339284807788707, + "learning_rate": 6.27789355856794e-05, + "loss": 11.727, + "step": 23240 + }, + { + "epoch": 1.2655653345857751, + "grad_norm": 0.5831331266534203, + "learning_rate": 6.277075107922899e-05, + "loss": 11.9637, + "step": 23241 + }, + { + "epoch": 1.2656197885823581, + "grad_norm": 0.5602101162288189, + "learning_rate": 6.276256686227944e-05, + "loss": 11.963, + "step": 23242 + }, + { + "epoch": 1.2656742425789413, + "grad_norm": 0.6348367389306971, + "learning_rate": 6.275438293489442e-05, + "loss": 12.0841, + "step": 23243 + }, + { + "epoch": 1.2657286965755243, + "grad_norm": 0.5662549810181848, + "learning_rate": 6.274619929713755e-05, + "loss": 11.9062, + "step": 23244 + }, + { + "epoch": 1.2657831505721073, + "grad_norm": 0.5880688167587791, + "learning_rate": 6.273801594907249e-05, + "loss": 12.0878, + "step": 23245 + }, + { + "epoch": 1.2658376045686903, + "grad_norm": 0.5658460549330208, + "learning_rate": 6.272983289076288e-05, + "loss": 12.0015, + "step": 23246 + }, + { + "epoch": 1.2658920585652733, + "grad_norm": 0.5288370428761704, + "learning_rate": 6.272165012227235e-05, + "loss": 11.8035, + "step": 23247 + }, + { + "epoch": 1.2659465125618563, + "grad_norm": 0.5091781328094663, + "learning_rate": 6.271346764366451e-05, + "loss": 12.032, + "step": 23248 + }, + { + "epoch": 1.2660009665584393, + "grad_norm": 0.6019770426441612, + "learning_rate": 6.270528545500298e-05, + "loss": 11.9276, + "step": 23249 + }, + { + "epoch": 1.2660554205550223, + "grad_norm": 0.5592074457787859, + "learning_rate": 6.269710355635145e-05, + "loss": 11.8501, + "step": 23250 + }, + { + "epoch": 1.2661098745516055, + "grad_norm": 0.5581442322272826, + "learning_rate": 6.268892194777348e-05, + "loss": 11.9014, + "step": 23251 + }, + { + "epoch": 1.2661643285481885, + "grad_norm": 0.5455366191873362, + "learning_rate": 6.268074062933269e-05, + "loss": 11.9663, + "step": 23252 + }, + { + "epoch": 1.2662187825447715, + "grad_norm": 0.6696918915359905, + "learning_rate": 6.267255960109273e-05, + "loss": 11.9712, + "step": 23253 + }, + { + "epoch": 1.2662732365413545, + "grad_norm": 0.5379251392148051, + "learning_rate": 6.26643788631172e-05, + "loss": 12.0345, + "step": 23254 + }, + { + "epoch": 1.2663276905379375, + "grad_norm": 0.5931605972582454, + "learning_rate": 6.26561984154697e-05, + "loss": 12.0742, + "step": 23255 + }, + { + "epoch": 1.2663821445345205, + "grad_norm": 0.5017334888256585, + "learning_rate": 6.26480182582139e-05, + "loss": 11.9426, + "step": 23256 + }, + { + "epoch": 1.2664365985311035, + "grad_norm": 0.5068977268219558, + "learning_rate": 6.263983839141335e-05, + "loss": 11.9662, + "step": 23257 + }, + { + "epoch": 1.2664910525276865, + "grad_norm": 0.5356184965527556, + "learning_rate": 6.26316588151317e-05, + "loss": 11.7663, + "step": 23258 + }, + { + "epoch": 1.2665455065242694, + "grad_norm": 0.6047985115412942, + "learning_rate": 6.262347952943253e-05, + "loss": 11.9585, + "step": 23259 + }, + { + "epoch": 1.2665999605208524, + "grad_norm": 0.5935450485177922, + "learning_rate": 6.261530053437946e-05, + "loss": 12.0218, + "step": 23260 + }, + { + "epoch": 1.2666544145174354, + "grad_norm": 0.5792616729946286, + "learning_rate": 6.26071218300361e-05, + "loss": 11.942, + "step": 23261 + }, + { + "epoch": 1.2667088685140184, + "grad_norm": 0.4911537276124451, + "learning_rate": 6.2598943416466e-05, + "loss": 11.8469, + "step": 23262 + }, + { + "epoch": 1.2667633225106014, + "grad_norm": 0.5888120572545132, + "learning_rate": 6.25907652937328e-05, + "loss": 12.0768, + "step": 23263 + }, + { + "epoch": 1.2668177765071844, + "grad_norm": 0.5842572999363864, + "learning_rate": 6.258258746190008e-05, + "loss": 11.8555, + "step": 23264 + }, + { + "epoch": 1.2668722305037674, + "grad_norm": 0.5770417131764191, + "learning_rate": 6.257440992103143e-05, + "loss": 11.9088, + "step": 23265 + }, + { + "epoch": 1.2669266845003506, + "grad_norm": 0.5123296797440976, + "learning_rate": 6.256623267119043e-05, + "loss": 11.9103, + "step": 23266 + }, + { + "epoch": 1.2669811384969336, + "grad_norm": 0.5641928636700514, + "learning_rate": 6.25580557124407e-05, + "loss": 11.9741, + "step": 23267 + }, + { + "epoch": 1.2670355924935166, + "grad_norm": 0.5655096911419298, + "learning_rate": 6.25498790448458e-05, + "loss": 12.0109, + "step": 23268 + }, + { + "epoch": 1.2670900464900996, + "grad_norm": 0.5428229007565369, + "learning_rate": 6.254170266846933e-05, + "loss": 11.9479, + "step": 23269 + }, + { + "epoch": 1.2671445004866826, + "grad_norm": 0.5706404677280447, + "learning_rate": 6.253352658337487e-05, + "loss": 12.0651, + "step": 23270 + }, + { + "epoch": 1.2671989544832656, + "grad_norm": 0.5262681530829951, + "learning_rate": 6.2525350789626e-05, + "loss": 11.8401, + "step": 23271 + }, + { + "epoch": 1.2672534084798486, + "grad_norm": 0.6176674300786359, + "learning_rate": 6.251717528728627e-05, + "loss": 11.9482, + "step": 23272 + }, + { + "epoch": 1.2673078624764316, + "grad_norm": 0.5983821018764575, + "learning_rate": 6.250900007641927e-05, + "loss": 11.9959, + "step": 23273 + }, + { + "epoch": 1.2673623164730146, + "grad_norm": 0.5273196446127435, + "learning_rate": 6.250082515708857e-05, + "loss": 11.8718, + "step": 23274 + }, + { + "epoch": 1.2674167704695978, + "grad_norm": 0.5228535750345598, + "learning_rate": 6.249265052935774e-05, + "loss": 11.754, + "step": 23275 + }, + { + "epoch": 1.2674712244661808, + "grad_norm": 0.5031082126493903, + "learning_rate": 6.248447619329036e-05, + "loss": 11.9304, + "step": 23276 + }, + { + "epoch": 1.2675256784627638, + "grad_norm": 0.5775344306743875, + "learning_rate": 6.247630214894995e-05, + "loss": 11.9764, + "step": 23277 + }, + { + "epoch": 1.2675801324593468, + "grad_norm": 0.5511462710066333, + "learning_rate": 6.246812839640013e-05, + "loss": 12.0393, + "step": 23278 + }, + { + "epoch": 1.2676345864559297, + "grad_norm": 0.5546271233692556, + "learning_rate": 6.245995493570445e-05, + "loss": 11.8554, + "step": 23279 + }, + { + "epoch": 1.2676890404525127, + "grad_norm": 0.5542476229734516, + "learning_rate": 6.245178176692645e-05, + "loss": 11.9874, + "step": 23280 + }, + { + "epoch": 1.2677434944490957, + "grad_norm": 0.5601960937182989, + "learning_rate": 6.244360889012973e-05, + "loss": 11.974, + "step": 23281 + }, + { + "epoch": 1.2677979484456787, + "grad_norm": 0.536459073035721, + "learning_rate": 6.243543630537775e-05, + "loss": 12.0649, + "step": 23282 + }, + { + "epoch": 1.2678524024422617, + "grad_norm": 0.5326910175065769, + "learning_rate": 6.242726401273414e-05, + "loss": 11.9427, + "step": 23283 + }, + { + "epoch": 1.2679068564388447, + "grad_norm": 0.6847770682281887, + "learning_rate": 6.241909201226242e-05, + "loss": 12.0452, + "step": 23284 + }, + { + "epoch": 1.2679613104354277, + "grad_norm": 0.4851762226565081, + "learning_rate": 6.241092030402614e-05, + "loss": 11.9293, + "step": 23285 + }, + { + "epoch": 1.2680157644320107, + "grad_norm": 0.5076819760948074, + "learning_rate": 6.240274888808883e-05, + "loss": 12.0396, + "step": 23286 + }, + { + "epoch": 1.2680702184285937, + "grad_norm": 0.5597168326451334, + "learning_rate": 6.239457776451409e-05, + "loss": 12.0031, + "step": 23287 + }, + { + "epoch": 1.2681246724251767, + "grad_norm": 0.6254650905558556, + "learning_rate": 6.238640693336539e-05, + "loss": 11.8565, + "step": 23288 + }, + { + "epoch": 1.26817912642176, + "grad_norm": 0.5411474727450071, + "learning_rate": 6.237823639470628e-05, + "loss": 11.9987, + "step": 23289 + }, + { + "epoch": 1.2682335804183429, + "grad_norm": 0.5686613732412673, + "learning_rate": 6.237006614860035e-05, + "loss": 11.9573, + "step": 23290 + }, + { + "epoch": 1.2682880344149259, + "grad_norm": 0.5273454910767795, + "learning_rate": 6.236189619511113e-05, + "loss": 12.0214, + "step": 23291 + }, + { + "epoch": 1.2683424884115089, + "grad_norm": 0.5251569727347625, + "learning_rate": 6.235372653430207e-05, + "loss": 11.8177, + "step": 23292 + }, + { + "epoch": 1.2683969424080919, + "grad_norm": 0.5271534590038707, + "learning_rate": 6.234555716623672e-05, + "loss": 11.8267, + "step": 23293 + }, + { + "epoch": 1.2684513964046749, + "grad_norm": 0.6089298643908899, + "learning_rate": 6.233738809097866e-05, + "loss": 12.0401, + "step": 23294 + }, + { + "epoch": 1.2685058504012579, + "grad_norm": 0.5535058687639198, + "learning_rate": 6.23292193085914e-05, + "loss": 12.0141, + "step": 23295 + }, + { + "epoch": 1.2685603043978408, + "grad_norm": 0.57154260724941, + "learning_rate": 6.232105081913841e-05, + "loss": 11.9038, + "step": 23296 + }, + { + "epoch": 1.2686147583944238, + "grad_norm": 0.5788602074094982, + "learning_rate": 6.231288262268328e-05, + "loss": 12.0593, + "step": 23297 + }, + { + "epoch": 1.268669212391007, + "grad_norm": 0.5300962947667739, + "learning_rate": 6.23047147192895e-05, + "loss": 11.9953, + "step": 23298 + }, + { + "epoch": 1.26872366638759, + "grad_norm": 0.6186935749669186, + "learning_rate": 6.229654710902055e-05, + "loss": 11.9587, + "step": 23299 + }, + { + "epoch": 1.268778120384173, + "grad_norm": 0.5494811757785866, + "learning_rate": 6.228837979193997e-05, + "loss": 12.0647, + "step": 23300 + }, + { + "epoch": 1.268832574380756, + "grad_norm": 0.5447030181196109, + "learning_rate": 6.228021276811134e-05, + "loss": 12.0203, + "step": 23301 + }, + { + "epoch": 1.268887028377339, + "grad_norm": 0.5637450092619816, + "learning_rate": 6.227204603759805e-05, + "loss": 11.9965, + "step": 23302 + }, + { + "epoch": 1.268941482373922, + "grad_norm": 0.5638522258794966, + "learning_rate": 6.226387960046367e-05, + "loss": 11.7628, + "step": 23303 + }, + { + "epoch": 1.268995936370505, + "grad_norm": 0.5467926557255012, + "learning_rate": 6.225571345677165e-05, + "loss": 11.9061, + "step": 23304 + }, + { + "epoch": 1.269050390367088, + "grad_norm": 0.5593070649001599, + "learning_rate": 6.224754760658558e-05, + "loss": 12.0445, + "step": 23305 + }, + { + "epoch": 1.269104844363671, + "grad_norm": 0.5343540127281311, + "learning_rate": 6.223938204996889e-05, + "loss": 11.9997, + "step": 23306 + }, + { + "epoch": 1.269159298360254, + "grad_norm": 0.5801851478240748, + "learning_rate": 6.223121678698509e-05, + "loss": 12.0018, + "step": 23307 + }, + { + "epoch": 1.269213752356837, + "grad_norm": 0.5533776677930815, + "learning_rate": 6.222305181769769e-05, + "loss": 11.923, + "step": 23308 + }, + { + "epoch": 1.26926820635342, + "grad_norm": 0.5789373933804217, + "learning_rate": 6.221488714217019e-05, + "loss": 12.0229, + "step": 23309 + }, + { + "epoch": 1.269322660350003, + "grad_norm": 0.5536744500147756, + "learning_rate": 6.220672276046604e-05, + "loss": 11.883, + "step": 23310 + }, + { + "epoch": 1.269377114346586, + "grad_norm": 0.5783446621668239, + "learning_rate": 6.219855867264878e-05, + "loss": 11.9313, + "step": 23311 + }, + { + "epoch": 1.269431568343169, + "grad_norm": 0.5404254904073282, + "learning_rate": 6.219039487878187e-05, + "loss": 12.0112, + "step": 23312 + }, + { + "epoch": 1.2694860223397522, + "grad_norm": 0.5262018222211831, + "learning_rate": 6.218223137892876e-05, + "loss": 11.8799, + "step": 23313 + }, + { + "epoch": 1.2695404763363352, + "grad_norm": 0.540015923190593, + "learning_rate": 6.217406817315297e-05, + "loss": 11.9449, + "step": 23314 + }, + { + "epoch": 1.2695949303329181, + "grad_norm": 0.5976045275009173, + "learning_rate": 6.216590526151795e-05, + "loss": 12.1262, + "step": 23315 + }, + { + "epoch": 1.2696493843295011, + "grad_norm": 0.5368482700177268, + "learning_rate": 6.215774264408723e-05, + "loss": 11.9444, + "step": 23316 + }, + { + "epoch": 1.2697038383260841, + "grad_norm": 0.5558777228267541, + "learning_rate": 6.214958032092423e-05, + "loss": 12.0168, + "step": 23317 + }, + { + "epoch": 1.2697582923226671, + "grad_norm": 0.5077506313370604, + "learning_rate": 6.214141829209245e-05, + "loss": 11.9611, + "step": 23318 + }, + { + "epoch": 1.2698127463192501, + "grad_norm": 0.5487927148954824, + "learning_rate": 6.213325655765534e-05, + "loss": 11.9458, + "step": 23319 + }, + { + "epoch": 1.2698672003158331, + "grad_norm": 0.595899878814489, + "learning_rate": 6.21250951176764e-05, + "loss": 11.9533, + "step": 23320 + }, + { + "epoch": 1.2699216543124163, + "grad_norm": 0.5506273459385893, + "learning_rate": 6.211693397221908e-05, + "loss": 11.7498, + "step": 23321 + }, + { + "epoch": 1.2699761083089993, + "grad_norm": 0.538278177967009, + "learning_rate": 6.210877312134679e-05, + "loss": 11.9185, + "step": 23322 + }, + { + "epoch": 1.2700305623055823, + "grad_norm": 0.5227067430337492, + "learning_rate": 6.210061256512306e-05, + "loss": 11.9251, + "step": 23323 + }, + { + "epoch": 1.2700850163021653, + "grad_norm": 0.5872058249802069, + "learning_rate": 6.209245230361131e-05, + "loss": 11.9939, + "step": 23324 + }, + { + "epoch": 1.2701394702987483, + "grad_norm": 0.524340618375271, + "learning_rate": 6.208429233687503e-05, + "loss": 12.0238, + "step": 23325 + }, + { + "epoch": 1.2701939242953313, + "grad_norm": 0.5671513504042198, + "learning_rate": 6.20761326649776e-05, + "loss": 11.8688, + "step": 23326 + }, + { + "epoch": 1.2702483782919143, + "grad_norm": 0.5589026914595644, + "learning_rate": 6.206797328798257e-05, + "loss": 12.0015, + "step": 23327 + }, + { + "epoch": 1.2703028322884973, + "grad_norm": 0.5103027645540704, + "learning_rate": 6.205981420595332e-05, + "loss": 11.9565, + "step": 23328 + }, + { + "epoch": 1.2703572862850803, + "grad_norm": 0.5709819143252606, + "learning_rate": 6.205165541895334e-05, + "loss": 11.9735, + "step": 23329 + }, + { + "epoch": 1.2704117402816633, + "grad_norm": 0.6095574741860947, + "learning_rate": 6.204349692704604e-05, + "loss": 12.035, + "step": 23330 + }, + { + "epoch": 1.2704661942782463, + "grad_norm": 0.5646974626640744, + "learning_rate": 6.20353387302949e-05, + "loss": 11.8901, + "step": 23331 + }, + { + "epoch": 1.2705206482748292, + "grad_norm": 0.5210950415666447, + "learning_rate": 6.20271808287633e-05, + "loss": 12.0456, + "step": 23332 + }, + { + "epoch": 1.2705751022714122, + "grad_norm": 0.5811755441916826, + "learning_rate": 6.201902322251471e-05, + "loss": 11.9576, + "step": 23333 + }, + { + "epoch": 1.2706295562679952, + "grad_norm": 0.5897526476423185, + "learning_rate": 6.201086591161255e-05, + "loss": 11.8529, + "step": 23334 + }, + { + "epoch": 1.2706840102645782, + "grad_norm": 0.6421405117499367, + "learning_rate": 6.200270889612029e-05, + "loss": 12.0423, + "step": 23335 + }, + { + "epoch": 1.2707384642611614, + "grad_norm": 0.5818935058785432, + "learning_rate": 6.199455217610135e-05, + "loss": 11.9691, + "step": 23336 + }, + { + "epoch": 1.2707929182577444, + "grad_norm": 0.6004240477887385, + "learning_rate": 6.198639575161914e-05, + "loss": 11.946, + "step": 23337 + }, + { + "epoch": 1.2708473722543274, + "grad_norm": 0.539171698349238, + "learning_rate": 6.197823962273705e-05, + "loss": 12.0062, + "step": 23338 + }, + { + "epoch": 1.2709018262509104, + "grad_norm": 0.6258640458409044, + "learning_rate": 6.197008378951858e-05, + "loss": 11.7681, + "step": 23339 + }, + { + "epoch": 1.2709562802474934, + "grad_norm": 0.5848841771074273, + "learning_rate": 6.196192825202711e-05, + "loss": 12.032, + "step": 23340 + }, + { + "epoch": 1.2710107342440764, + "grad_norm": 0.5291610932441859, + "learning_rate": 6.195377301032611e-05, + "loss": 11.7731, + "step": 23341 + }, + { + "epoch": 1.2710651882406594, + "grad_norm": 0.4886445273656309, + "learning_rate": 6.19456180644789e-05, + "loss": 11.9044, + "step": 23342 + }, + { + "epoch": 1.2711196422372424, + "grad_norm": 0.5475517712648182, + "learning_rate": 6.193746341454894e-05, + "loss": 12.0123, + "step": 23343 + }, + { + "epoch": 1.2711740962338254, + "grad_norm": 0.5115837751985072, + "learning_rate": 6.192930906059966e-05, + "loss": 11.8949, + "step": 23344 + }, + { + "epoch": 1.2712285502304086, + "grad_norm": 0.5539929387810759, + "learning_rate": 6.192115500269447e-05, + "loss": 11.9995, + "step": 23345 + }, + { + "epoch": 1.2712830042269916, + "grad_norm": 0.4909185350024668, + "learning_rate": 6.191300124089675e-05, + "loss": 11.916, + "step": 23346 + }, + { + "epoch": 1.2713374582235746, + "grad_norm": 0.49260331961665, + "learning_rate": 6.190484777526993e-05, + "loss": 11.8449, + "step": 23347 + }, + { + "epoch": 1.2713919122201576, + "grad_norm": 0.631466451668857, + "learning_rate": 6.189669460587739e-05, + "loss": 11.8119, + "step": 23348 + }, + { + "epoch": 1.2714463662167406, + "grad_norm": 0.6089354795289456, + "learning_rate": 6.188854173278254e-05, + "loss": 12.0137, + "step": 23349 + }, + { + "epoch": 1.2715008202133236, + "grad_norm": 0.6371413460409984, + "learning_rate": 6.188038915604877e-05, + "loss": 11.9834, + "step": 23350 + }, + { + "epoch": 1.2715552742099065, + "grad_norm": 0.5769881919737412, + "learning_rate": 6.187223687573956e-05, + "loss": 11.9342, + "step": 23351 + }, + { + "epoch": 1.2716097282064895, + "grad_norm": 0.6152241490133966, + "learning_rate": 6.186408489191818e-05, + "loss": 11.9962, + "step": 23352 + }, + { + "epoch": 1.2716641822030725, + "grad_norm": 0.6116610121964597, + "learning_rate": 6.185593320464805e-05, + "loss": 11.973, + "step": 23353 + }, + { + "epoch": 1.2717186361996555, + "grad_norm": 0.5256129458254312, + "learning_rate": 6.184778181399258e-05, + "loss": 11.8692, + "step": 23354 + }, + { + "epoch": 1.2717730901962385, + "grad_norm": 0.4920780804838193, + "learning_rate": 6.183963072001517e-05, + "loss": 11.9996, + "step": 23355 + }, + { + "epoch": 1.2718275441928215, + "grad_norm": 0.5127263926711845, + "learning_rate": 6.18314799227792e-05, + "loss": 11.9083, + "step": 23356 + }, + { + "epoch": 1.2718819981894045, + "grad_norm": 0.6217008392542402, + "learning_rate": 6.182332942234804e-05, + "loss": 12.036, + "step": 23357 + }, + { + "epoch": 1.2719364521859875, + "grad_norm": 0.5130976735521793, + "learning_rate": 6.181517921878508e-05, + "loss": 11.8585, + "step": 23358 + }, + { + "epoch": 1.2719909061825707, + "grad_norm": 0.5202591226152474, + "learning_rate": 6.180702931215367e-05, + "loss": 11.9987, + "step": 23359 + }, + { + "epoch": 1.2720453601791537, + "grad_norm": 0.6111920306799264, + "learning_rate": 6.17988797025172e-05, + "loss": 12.0882, + "step": 23360 + }, + { + "epoch": 1.2720998141757367, + "grad_norm": 0.6293271982113089, + "learning_rate": 6.17907303899391e-05, + "loss": 11.9575, + "step": 23361 + }, + { + "epoch": 1.2721542681723197, + "grad_norm": 0.5099706333577905, + "learning_rate": 6.178258137448265e-05, + "loss": 12.0043, + "step": 23362 + }, + { + "epoch": 1.2722087221689027, + "grad_norm": 0.6005071181171349, + "learning_rate": 6.177443265621127e-05, + "loss": 11.9809, + "step": 23363 + }, + { + "epoch": 1.2722631761654857, + "grad_norm": 0.48955023102144846, + "learning_rate": 6.176628423518827e-05, + "loss": 11.8855, + "step": 23364 + }, + { + "epoch": 1.2723176301620687, + "grad_norm": 0.5513402720154954, + "learning_rate": 6.17581361114771e-05, + "loss": 11.9205, + "step": 23365 + }, + { + "epoch": 1.2723720841586517, + "grad_norm": 0.5271349303137202, + "learning_rate": 6.174998828514106e-05, + "loss": 11.9024, + "step": 23366 + }, + { + "epoch": 1.2724265381552347, + "grad_norm": 0.5440387096455964, + "learning_rate": 6.174184075624352e-05, + "loss": 11.8205, + "step": 23367 + }, + { + "epoch": 1.2724809921518179, + "grad_norm": 0.5316353551943467, + "learning_rate": 6.173369352484786e-05, + "loss": 11.9881, + "step": 23368 + }, + { + "epoch": 1.2725354461484009, + "grad_norm": 0.5195114242919787, + "learning_rate": 6.17255465910174e-05, + "loss": 11.8855, + "step": 23369 + }, + { + "epoch": 1.2725899001449839, + "grad_norm": 0.6067601046513386, + "learning_rate": 6.171739995481551e-05, + "loss": 12.0578, + "step": 23370 + }, + { + "epoch": 1.2726443541415668, + "grad_norm": 0.5922350874741872, + "learning_rate": 6.170925361630557e-05, + "loss": 11.955, + "step": 23371 + }, + { + "epoch": 1.2726988081381498, + "grad_norm": 0.523548089866187, + "learning_rate": 6.170110757555088e-05, + "loss": 11.9278, + "step": 23372 + }, + { + "epoch": 1.2727532621347328, + "grad_norm": 0.5592176843160478, + "learning_rate": 6.169296183261477e-05, + "loss": 11.8571, + "step": 23373 + }, + { + "epoch": 1.2728077161313158, + "grad_norm": 0.6185745649402246, + "learning_rate": 6.168481638756064e-05, + "loss": 12.0033, + "step": 23374 + }, + { + "epoch": 1.2728621701278988, + "grad_norm": 0.5580615287086415, + "learning_rate": 6.167667124045178e-05, + "loss": 11.9415, + "step": 23375 + }, + { + "epoch": 1.2729166241244818, + "grad_norm": 0.5315801320232145, + "learning_rate": 6.166852639135156e-05, + "loss": 11.8717, + "step": 23376 + }, + { + "epoch": 1.2729710781210648, + "grad_norm": 0.5221864854326923, + "learning_rate": 6.16603818403233e-05, + "loss": 11.9166, + "step": 23377 + }, + { + "epoch": 1.2730255321176478, + "grad_norm": 0.49489873083951413, + "learning_rate": 6.165223758743037e-05, + "loss": 11.9756, + "step": 23378 + }, + { + "epoch": 1.2730799861142308, + "grad_norm": 0.5284182535982023, + "learning_rate": 6.164409363273604e-05, + "loss": 11.8718, + "step": 23379 + }, + { + "epoch": 1.2731344401108138, + "grad_norm": 0.5700556655297687, + "learning_rate": 6.163594997630369e-05, + "loss": 11.7334, + "step": 23380 + }, + { + "epoch": 1.2731888941073968, + "grad_norm": 0.5452737708328596, + "learning_rate": 6.162780661819665e-05, + "loss": 12.0551, + "step": 23381 + }, + { + "epoch": 1.27324334810398, + "grad_norm": 0.5649806032124066, + "learning_rate": 6.161966355847819e-05, + "loss": 12.0111, + "step": 23382 + }, + { + "epoch": 1.273297802100563, + "grad_norm": 0.5035865394764981, + "learning_rate": 6.161152079721166e-05, + "loss": 12.0424, + "step": 23383 + }, + { + "epoch": 1.273352256097146, + "grad_norm": 0.5401260476633886, + "learning_rate": 6.16033783344604e-05, + "loss": 11.9739, + "step": 23384 + }, + { + "epoch": 1.273406710093729, + "grad_norm": 0.5715111184080219, + "learning_rate": 6.159523617028768e-05, + "loss": 12.0213, + "step": 23385 + }, + { + "epoch": 1.273461164090312, + "grad_norm": 0.656240407456829, + "learning_rate": 6.158709430475687e-05, + "loss": 11.9437, + "step": 23386 + }, + { + "epoch": 1.273515618086895, + "grad_norm": 0.5331603550570894, + "learning_rate": 6.157895273793123e-05, + "loss": 11.8895, + "step": 23387 + }, + { + "epoch": 1.273570072083478, + "grad_norm": 0.5571948358317733, + "learning_rate": 6.15708114698741e-05, + "loss": 11.9225, + "step": 23388 + }, + { + "epoch": 1.273624526080061, + "grad_norm": 0.576153893707623, + "learning_rate": 6.156267050064881e-05, + "loss": 11.9145, + "step": 23389 + }, + { + "epoch": 1.273678980076644, + "grad_norm": 0.580075660793452, + "learning_rate": 6.155452983031862e-05, + "loss": 11.9521, + "step": 23390 + }, + { + "epoch": 1.2737334340732271, + "grad_norm": 0.5433488848620138, + "learning_rate": 6.154638945894689e-05, + "loss": 11.9118, + "step": 23391 + }, + { + "epoch": 1.2737878880698101, + "grad_norm": 0.5541799106192558, + "learning_rate": 6.153824938659684e-05, + "loss": 12.0682, + "step": 23392 + }, + { + "epoch": 1.2738423420663931, + "grad_norm": 0.8937971582716784, + "learning_rate": 6.153010961333184e-05, + "loss": 12.0163, + "step": 23393 + }, + { + "epoch": 1.2738967960629761, + "grad_norm": 0.5796018149701783, + "learning_rate": 6.152197013921515e-05, + "loss": 11.8705, + "step": 23394 + }, + { + "epoch": 1.2739512500595591, + "grad_norm": 0.5156819839981832, + "learning_rate": 6.151383096431008e-05, + "loss": 11.792, + "step": 23395 + }, + { + "epoch": 1.274005704056142, + "grad_norm": 0.5247084374996511, + "learning_rate": 6.150569208867989e-05, + "loss": 11.9977, + "step": 23396 + }, + { + "epoch": 1.274060158052725, + "grad_norm": 0.5045128890404776, + "learning_rate": 6.149755351238791e-05, + "loss": 11.734, + "step": 23397 + }, + { + "epoch": 1.274114612049308, + "grad_norm": 0.5428727538934164, + "learning_rate": 6.148941523549739e-05, + "loss": 11.7835, + "step": 23398 + }, + { + "epoch": 1.274169066045891, + "grad_norm": 0.5639829723044368, + "learning_rate": 6.148127725807166e-05, + "loss": 11.9327, + "step": 23399 + }, + { + "epoch": 1.274223520042474, + "grad_norm": 0.5416806279693785, + "learning_rate": 6.147313958017398e-05, + "loss": 11.9288, + "step": 23400 + }, + { + "epoch": 1.274277974039057, + "grad_norm": 0.5457388335316171, + "learning_rate": 6.146500220186766e-05, + "loss": 12.0199, + "step": 23401 + }, + { + "epoch": 1.27433242803564, + "grad_norm": 0.5179846459378248, + "learning_rate": 6.145686512321588e-05, + "loss": 11.9786, + "step": 23402 + }, + { + "epoch": 1.274386882032223, + "grad_norm": 0.5599616162200108, + "learning_rate": 6.1448728344282e-05, + "loss": 11.7671, + "step": 23403 + }, + { + "epoch": 1.274441336028806, + "grad_norm": 0.5396860211024469, + "learning_rate": 6.144059186512928e-05, + "loss": 11.9588, + "step": 23404 + }, + { + "epoch": 1.274495790025389, + "grad_norm": 0.5213496629497232, + "learning_rate": 6.143245568582099e-05, + "loss": 11.9472, + "step": 23405 + }, + { + "epoch": 1.2745502440219723, + "grad_norm": 0.5279588003393272, + "learning_rate": 6.142431980642039e-05, + "loss": 11.8882, + "step": 23406 + }, + { + "epoch": 1.2746046980185552, + "grad_norm": 0.555959952393999, + "learning_rate": 6.141618422699074e-05, + "loss": 11.8884, + "step": 23407 + }, + { + "epoch": 1.2746591520151382, + "grad_norm": 0.566584123832805, + "learning_rate": 6.14080489475953e-05, + "loss": 11.8963, + "step": 23408 + }, + { + "epoch": 1.2747136060117212, + "grad_norm": 0.5402824653181659, + "learning_rate": 6.139991396829735e-05, + "loss": 12.0018, + "step": 23409 + }, + { + "epoch": 1.2747680600083042, + "grad_norm": 0.557910135170613, + "learning_rate": 6.139177928916016e-05, + "loss": 11.9609, + "step": 23410 + }, + { + "epoch": 1.2748225140048872, + "grad_norm": 0.5543819070305888, + "learning_rate": 6.138364491024696e-05, + "loss": 11.8824, + "step": 23411 + }, + { + "epoch": 1.2748769680014702, + "grad_norm": 0.5733588974557553, + "learning_rate": 6.137551083162105e-05, + "loss": 12.017, + "step": 23412 + }, + { + "epoch": 1.2749314219980532, + "grad_norm": 0.48920246956431995, + "learning_rate": 6.136737705334559e-05, + "loss": 11.9517, + "step": 23413 + }, + { + "epoch": 1.2749858759946364, + "grad_norm": 0.552769775479179, + "learning_rate": 6.13592435754839e-05, + "loss": 11.9836, + "step": 23414 + }, + { + "epoch": 1.2750403299912194, + "grad_norm": 0.5824338554355197, + "learning_rate": 6.135111039809922e-05, + "loss": 11.9281, + "step": 23415 + }, + { + "epoch": 1.2750947839878024, + "grad_norm": 0.5714441455621442, + "learning_rate": 6.134297752125477e-05, + "loss": 11.9724, + "step": 23416 + }, + { + "epoch": 1.2751492379843854, + "grad_norm": 0.4878833198075495, + "learning_rate": 6.133484494501382e-05, + "loss": 11.9296, + "step": 23417 + }, + { + "epoch": 1.2752036919809684, + "grad_norm": 0.5395389807250964, + "learning_rate": 6.132671266943962e-05, + "loss": 11.9294, + "step": 23418 + }, + { + "epoch": 1.2752581459775514, + "grad_norm": 0.6499568272746115, + "learning_rate": 6.131858069459537e-05, + "loss": 12.0162, + "step": 23419 + }, + { + "epoch": 1.2753125999741344, + "grad_norm": 0.5111275170025651, + "learning_rate": 6.13104490205443e-05, + "loss": 11.9132, + "step": 23420 + }, + { + "epoch": 1.2753670539707174, + "grad_norm": 0.5512976507197156, + "learning_rate": 6.130231764734968e-05, + "loss": 11.8927, + "step": 23421 + }, + { + "epoch": 1.2754215079673004, + "grad_norm": 0.5757738157915058, + "learning_rate": 6.129418657507478e-05, + "loss": 11.9277, + "step": 23422 + }, + { + "epoch": 1.2754759619638834, + "grad_norm": 0.5713838120879103, + "learning_rate": 6.128605580378273e-05, + "loss": 11.8534, + "step": 23423 + }, + { + "epoch": 1.2755304159604663, + "grad_norm": 0.5810350740042042, + "learning_rate": 6.127792533353681e-05, + "loss": 11.9139, + "step": 23424 + }, + { + "epoch": 1.2755848699570493, + "grad_norm": 0.5520984846109018, + "learning_rate": 6.126979516440021e-05, + "loss": 11.8544, + "step": 23425 + }, + { + "epoch": 1.2756393239536323, + "grad_norm": 0.6195542466138565, + "learning_rate": 6.12616652964362e-05, + "loss": 12.0248, + "step": 23426 + }, + { + "epoch": 1.2756937779502153, + "grad_norm": 0.6065463181157293, + "learning_rate": 6.125353572970798e-05, + "loss": 12.0263, + "step": 23427 + }, + { + "epoch": 1.2757482319467983, + "grad_norm": 0.6022890840270189, + "learning_rate": 6.124540646427877e-05, + "loss": 11.9128, + "step": 23428 + }, + { + "epoch": 1.2758026859433815, + "grad_norm": 0.5505979578393485, + "learning_rate": 6.123727750021177e-05, + "loss": 11.9789, + "step": 23429 + }, + { + "epoch": 1.2758571399399645, + "grad_norm": 0.5313454778922818, + "learning_rate": 6.122914883757019e-05, + "loss": 11.8778, + "step": 23430 + }, + { + "epoch": 1.2759115939365475, + "grad_norm": 0.5165311898873544, + "learning_rate": 6.122102047641725e-05, + "loss": 11.7385, + "step": 23431 + }, + { + "epoch": 1.2759660479331305, + "grad_norm": 0.5619170786324981, + "learning_rate": 6.121289241681621e-05, + "loss": 12.0393, + "step": 23432 + }, + { + "epoch": 1.2760205019297135, + "grad_norm": 0.5371501925842642, + "learning_rate": 6.120476465883018e-05, + "loss": 11.9275, + "step": 23433 + }, + { + "epoch": 1.2760749559262965, + "grad_norm": 0.5539806629730009, + "learning_rate": 6.11966372025224e-05, + "loss": 11.9057, + "step": 23434 + }, + { + "epoch": 1.2761294099228795, + "grad_norm": 0.5254273132834162, + "learning_rate": 6.11885100479561e-05, + "loss": 11.9598, + "step": 23435 + }, + { + "epoch": 1.2761838639194625, + "grad_norm": 0.5256322934788941, + "learning_rate": 6.118038319519441e-05, + "loss": 11.8457, + "step": 23436 + }, + { + "epoch": 1.2762383179160455, + "grad_norm": 0.5742182452294878, + "learning_rate": 6.11722566443006e-05, + "loss": 12.0103, + "step": 23437 + }, + { + "epoch": 1.2762927719126287, + "grad_norm": 0.5619782065172815, + "learning_rate": 6.116413039533784e-05, + "loss": 11.9429, + "step": 23438 + }, + { + "epoch": 1.2763472259092117, + "grad_norm": 0.5912072790665202, + "learning_rate": 6.11560044483693e-05, + "loss": 11.9045, + "step": 23439 + }, + { + "epoch": 1.2764016799057947, + "grad_norm": 0.640642372825217, + "learning_rate": 6.114787880345818e-05, + "loss": 11.9004, + "step": 23440 + }, + { + "epoch": 1.2764561339023777, + "grad_norm": 0.529558915606228, + "learning_rate": 6.113975346066769e-05, + "loss": 11.945, + "step": 23441 + }, + { + "epoch": 1.2765105878989607, + "grad_norm": 0.5491331705510023, + "learning_rate": 6.1131628420061e-05, + "loss": 11.865, + "step": 23442 + }, + { + "epoch": 1.2765650418955437, + "grad_norm": 0.5521629736406414, + "learning_rate": 6.112350368170128e-05, + "loss": 11.9299, + "step": 23443 + }, + { + "epoch": 1.2766194958921266, + "grad_norm": 0.5413581163713285, + "learning_rate": 6.11153792456517e-05, + "loss": 11.999, + "step": 23444 + }, + { + "epoch": 1.2766739498887096, + "grad_norm": 0.5127209632085405, + "learning_rate": 6.110725511197546e-05, + "loss": 11.9061, + "step": 23445 + }, + { + "epoch": 1.2767284038852926, + "grad_norm": 0.5368385954467395, + "learning_rate": 6.109913128073572e-05, + "loss": 11.9514, + "step": 23446 + }, + { + "epoch": 1.2767828578818756, + "grad_norm": 0.5673990511629734, + "learning_rate": 6.109100775199565e-05, + "loss": 12.0544, + "step": 23447 + }, + { + "epoch": 1.2768373118784586, + "grad_norm": 0.5432429153901958, + "learning_rate": 6.108288452581844e-05, + "loss": 11.9729, + "step": 23448 + }, + { + "epoch": 1.2768917658750416, + "grad_norm": 0.531720712228885, + "learning_rate": 6.107476160226725e-05, + "loss": 12.0446, + "step": 23449 + }, + { + "epoch": 1.2769462198716246, + "grad_norm": 0.5586792284313403, + "learning_rate": 6.106663898140524e-05, + "loss": 12.2062, + "step": 23450 + }, + { + "epoch": 1.2770006738682076, + "grad_norm": 0.5147979941718789, + "learning_rate": 6.105851666329557e-05, + "loss": 11.9341, + "step": 23451 + }, + { + "epoch": 1.2770551278647908, + "grad_norm": 0.5359541791723327, + "learning_rate": 6.105039464800143e-05, + "loss": 11.8957, + "step": 23452 + }, + { + "epoch": 1.2771095818613738, + "grad_norm": 0.5456329212624745, + "learning_rate": 6.104227293558593e-05, + "loss": 11.9419, + "step": 23453 + }, + { + "epoch": 1.2771640358579568, + "grad_norm": 0.692260373025588, + "learning_rate": 6.103415152611225e-05, + "loss": 12.0249, + "step": 23454 + }, + { + "epoch": 1.2772184898545398, + "grad_norm": 0.5121736932820726, + "learning_rate": 6.1026030419643544e-05, + "loss": 11.8515, + "step": 23455 + }, + { + "epoch": 1.2772729438511228, + "grad_norm": 0.534931595755875, + "learning_rate": 6.1017909616242966e-05, + "loss": 11.9126, + "step": 23456 + }, + { + "epoch": 1.2773273978477058, + "grad_norm": 0.6172589535524825, + "learning_rate": 6.100978911597366e-05, + "loss": 11.936, + "step": 23457 + }, + { + "epoch": 1.2773818518442888, + "grad_norm": 0.5175136620150501, + "learning_rate": 6.1001668918898734e-05, + "loss": 11.8191, + "step": 23458 + }, + { + "epoch": 1.2774363058408718, + "grad_norm": 0.5067862787806349, + "learning_rate": 6.099354902508141e-05, + "loss": 11.9255, + "step": 23459 + }, + { + "epoch": 1.2774907598374547, + "grad_norm": 0.5599109908899864, + "learning_rate": 6.098542943458478e-05, + "loss": 12.0136, + "step": 23460 + }, + { + "epoch": 1.277545213834038, + "grad_norm": 0.5299940958146621, + "learning_rate": 6.0977310147472e-05, + "loss": 11.8644, + "step": 23461 + }, + { + "epoch": 1.277599667830621, + "grad_norm": 0.5700600588235133, + "learning_rate": 6.096919116380622e-05, + "loss": 11.9795, + "step": 23462 + }, + { + "epoch": 1.277654121827204, + "grad_norm": 0.5638360644329294, + "learning_rate": 6.0961072483650526e-05, + "loss": 11.9326, + "step": 23463 + }, + { + "epoch": 1.277708575823787, + "grad_norm": 0.6578693654611429, + "learning_rate": 6.095295410706809e-05, + "loss": 12.0909, + "step": 23464 + }, + { + "epoch": 1.27776302982037, + "grad_norm": 0.533684078825207, + "learning_rate": 6.094483603412203e-05, + "loss": 11.9293, + "step": 23465 + }, + { + "epoch": 1.277817483816953, + "grad_norm": 0.47515698786075217, + "learning_rate": 6.093671826487547e-05, + "loss": 11.9598, + "step": 23466 + }, + { + "epoch": 1.277871937813536, + "grad_norm": 0.5637803768239379, + "learning_rate": 6.092860079939154e-05, + "loss": 12.0752, + "step": 23467 + }, + { + "epoch": 1.277926391810119, + "grad_norm": 0.574273764631386, + "learning_rate": 6.092048363773337e-05, + "loss": 11.9568, + "step": 23468 + }, + { + "epoch": 1.277980845806702, + "grad_norm": 0.522232082491125, + "learning_rate": 6.0912366779964035e-05, + "loss": 11.9929, + "step": 23469 + }, + { + "epoch": 1.278035299803285, + "grad_norm": 0.5076968821147885, + "learning_rate": 6.0904250226146734e-05, + "loss": 11.9391, + "step": 23470 + }, + { + "epoch": 1.278089753799868, + "grad_norm": 0.5842175089414539, + "learning_rate": 6.0896133976344526e-05, + "loss": 11.8999, + "step": 23471 + }, + { + "epoch": 1.2781442077964509, + "grad_norm": 0.5444197913072716, + "learning_rate": 6.0888018030620586e-05, + "loss": 11.8363, + "step": 23472 + }, + { + "epoch": 1.2781986617930339, + "grad_norm": 0.6741627689583077, + "learning_rate": 6.0879902389037915e-05, + "loss": 12.0643, + "step": 23473 + }, + { + "epoch": 1.2782531157896169, + "grad_norm": 0.5012801147988735, + "learning_rate": 6.087178705165969e-05, + "loss": 11.8394, + "step": 23474 + }, + { + "epoch": 1.2783075697861999, + "grad_norm": 0.5118881877419978, + "learning_rate": 6.086367201854902e-05, + "loss": 11.9598, + "step": 23475 + }, + { + "epoch": 1.278362023782783, + "grad_norm": 0.564274403607142, + "learning_rate": 6.085555728976899e-05, + "loss": 11.8439, + "step": 23476 + }, + { + "epoch": 1.278416477779366, + "grad_norm": 0.5169678361466552, + "learning_rate": 6.084744286538273e-05, + "loss": 11.8106, + "step": 23477 + }, + { + "epoch": 1.278470931775949, + "grad_norm": 0.5286873537986246, + "learning_rate": 6.08393287454533e-05, + "loss": 11.986, + "step": 23478 + }, + { + "epoch": 1.278525385772532, + "grad_norm": 0.5170143293378469, + "learning_rate": 6.083121493004384e-05, + "loss": 11.962, + "step": 23479 + }, + { + "epoch": 1.278579839769115, + "grad_norm": 0.5290861828173461, + "learning_rate": 6.082310141921739e-05, + "loss": 11.9349, + "step": 23480 + }, + { + "epoch": 1.278634293765698, + "grad_norm": 0.5574460050941807, + "learning_rate": 6.0814988213037094e-05, + "loss": 11.9356, + "step": 23481 + }, + { + "epoch": 1.278688747762281, + "grad_norm": 0.7228777916774161, + "learning_rate": 6.080687531156606e-05, + "loss": 11.964, + "step": 23482 + }, + { + "epoch": 1.278743201758864, + "grad_norm": 0.5450852058212383, + "learning_rate": 6.079876271486729e-05, + "loss": 11.9233, + "step": 23483 + }, + { + "epoch": 1.2787976557554472, + "grad_norm": 0.5047807597759899, + "learning_rate": 6.079065042300393e-05, + "loss": 11.7822, + "step": 23484 + }, + { + "epoch": 1.2788521097520302, + "grad_norm": 0.5826234685577726, + "learning_rate": 6.0782538436039e-05, + "loss": 11.8873, + "step": 23485 + }, + { + "epoch": 1.2789065637486132, + "grad_norm": 0.4916929922109629, + "learning_rate": 6.0774426754035685e-05, + "loss": 11.9222, + "step": 23486 + }, + { + "epoch": 1.2789610177451962, + "grad_norm": 0.5386901618287323, + "learning_rate": 6.076631537705698e-05, + "loss": 11.9255, + "step": 23487 + }, + { + "epoch": 1.2790154717417792, + "grad_norm": 0.5870239611376995, + "learning_rate": 6.075820430516599e-05, + "loss": 12.0858, + "step": 23488 + }, + { + "epoch": 1.2790699257383622, + "grad_norm": 0.4760090413465526, + "learning_rate": 6.0750093538425804e-05, + "loss": 11.8747, + "step": 23489 + }, + { + "epoch": 1.2791243797349452, + "grad_norm": 0.5780090479800983, + "learning_rate": 6.074198307689945e-05, + "loss": 11.9545, + "step": 23490 + }, + { + "epoch": 1.2791788337315282, + "grad_norm": 0.5252310941109722, + "learning_rate": 6.0733872920650026e-05, + "loss": 11.9644, + "step": 23491 + }, + { + "epoch": 1.2792332877281112, + "grad_norm": 0.5351756410748505, + "learning_rate": 6.0725763069740606e-05, + "loss": 11.8312, + "step": 23492 + }, + { + "epoch": 1.2792877417246942, + "grad_norm": 0.661062772247031, + "learning_rate": 6.071765352423422e-05, + "loss": 11.9972, + "step": 23493 + }, + { + "epoch": 1.2793421957212772, + "grad_norm": 0.49745495292634456, + "learning_rate": 6.070954428419395e-05, + "loss": 11.8081, + "step": 23494 + }, + { + "epoch": 1.2793966497178602, + "grad_norm": 0.5352121245957874, + "learning_rate": 6.070143534968286e-05, + "loss": 12.01, + "step": 23495 + }, + { + "epoch": 1.2794511037144431, + "grad_norm": 0.5423569846107363, + "learning_rate": 6.069332672076398e-05, + "loss": 12.0703, + "step": 23496 + }, + { + "epoch": 1.2795055577110261, + "grad_norm": 0.5854606957132008, + "learning_rate": 6.068521839750039e-05, + "loss": 11.911, + "step": 23497 + }, + { + "epoch": 1.2795600117076091, + "grad_norm": 0.5446942177758455, + "learning_rate": 6.067711037995514e-05, + "loss": 11.9375, + "step": 23498 + }, + { + "epoch": 1.2796144657041923, + "grad_norm": 0.6168174619334406, + "learning_rate": 6.066900266819127e-05, + "loss": 12.051, + "step": 23499 + }, + { + "epoch": 1.2796689197007753, + "grad_norm": 0.6167618173501145, + "learning_rate": 6.066089526227183e-05, + "loss": 12.0398, + "step": 23500 + }, + { + "epoch": 1.2797233736973583, + "grad_norm": 0.5456757129491338, + "learning_rate": 6.065278816225988e-05, + "loss": 11.9841, + "step": 23501 + }, + { + "epoch": 1.2797778276939413, + "grad_norm": 0.5606503968145746, + "learning_rate": 6.0644681368218457e-05, + "loss": 11.976, + "step": 23502 + }, + { + "epoch": 1.2798322816905243, + "grad_norm": 0.6417888688006007, + "learning_rate": 6.0636574880210574e-05, + "loss": 11.9576, + "step": 23503 + }, + { + "epoch": 1.2798867356871073, + "grad_norm": 0.536462312518082, + "learning_rate": 6.062846869829929e-05, + "loss": 11.868, + "step": 23504 + }, + { + "epoch": 1.2799411896836903, + "grad_norm": 0.4967739744330492, + "learning_rate": 6.062036282254764e-05, + "loss": 11.857, + "step": 23505 + }, + { + "epoch": 1.2799956436802733, + "grad_norm": 0.5456281349254622, + "learning_rate": 6.0612257253018646e-05, + "loss": 12.0076, + "step": 23506 + }, + { + "epoch": 1.2800500976768563, + "grad_norm": 0.5479946047060017, + "learning_rate": 6.0604151989775316e-05, + "loss": 11.9893, + "step": 23507 + }, + { + "epoch": 1.2801045516734395, + "grad_norm": 0.5749185320388244, + "learning_rate": 6.059604703288073e-05, + "loss": 11.8828, + "step": 23508 + }, + { + "epoch": 1.2801590056700225, + "grad_norm": 0.5171287018473631, + "learning_rate": 6.05879423823979e-05, + "loss": 11.8257, + "step": 23509 + }, + { + "epoch": 1.2802134596666055, + "grad_norm": 0.6543870163652833, + "learning_rate": 6.0579838038389826e-05, + "loss": 11.9345, + "step": 23510 + }, + { + "epoch": 1.2802679136631885, + "grad_norm": 0.49841729897115733, + "learning_rate": 6.0571734000919554e-05, + "loss": 11.9877, + "step": 23511 + }, + { + "epoch": 1.2803223676597715, + "grad_norm": 0.5220794510578373, + "learning_rate": 6.056363027005011e-05, + "loss": 11.9932, + "step": 23512 + }, + { + "epoch": 1.2803768216563545, + "grad_norm": 0.6025757275996647, + "learning_rate": 6.055552684584447e-05, + "loss": 11.9358, + "step": 23513 + }, + { + "epoch": 1.2804312756529375, + "grad_norm": 0.5372976783892274, + "learning_rate": 6.054742372836566e-05, + "loss": 11.9075, + "step": 23514 + }, + { + "epoch": 1.2804857296495205, + "grad_norm": 0.5521924338851903, + "learning_rate": 6.0539320917676714e-05, + "loss": 11.9101, + "step": 23515 + }, + { + "epoch": 1.2805401836461034, + "grad_norm": 0.5861845568906207, + "learning_rate": 6.0531218413840616e-05, + "loss": 11.8921, + "step": 23516 + }, + { + "epoch": 1.2805946376426864, + "grad_norm": 0.558562405022393, + "learning_rate": 6.0523116216920374e-05, + "loss": 11.8053, + "step": 23517 + }, + { + "epoch": 1.2806490916392694, + "grad_norm": 0.517342550968619, + "learning_rate": 6.0515014326978994e-05, + "loss": 11.9603, + "step": 23518 + }, + { + "epoch": 1.2807035456358524, + "grad_norm": 0.5778837259537967, + "learning_rate": 6.05069127440795e-05, + "loss": 11.9863, + "step": 23519 + }, + { + "epoch": 1.2807579996324354, + "grad_norm": 0.5400045352346342, + "learning_rate": 6.0498811468284876e-05, + "loss": 11.9889, + "step": 23520 + }, + { + "epoch": 1.2808124536290184, + "grad_norm": 0.5218498590662102, + "learning_rate": 6.049071049965811e-05, + "loss": 11.7868, + "step": 23521 + }, + { + "epoch": 1.2808669076256016, + "grad_norm": 0.5186413862549286, + "learning_rate": 6.048260983826224e-05, + "loss": 12.0402, + "step": 23522 + }, + { + "epoch": 1.2809213616221846, + "grad_norm": 0.5169947844750505, + "learning_rate": 6.04745094841602e-05, + "loss": 11.9166, + "step": 23523 + }, + { + "epoch": 1.2809758156187676, + "grad_norm": 0.5430486455984835, + "learning_rate": 6.0466409437414996e-05, + "loss": 11.91, + "step": 23524 + }, + { + "epoch": 1.2810302696153506, + "grad_norm": 0.5519218148989543, + "learning_rate": 6.045830969808963e-05, + "loss": 11.9521, + "step": 23525 + }, + { + "epoch": 1.2810847236119336, + "grad_norm": 0.6091077701215787, + "learning_rate": 6.045021026624707e-05, + "loss": 11.9696, + "step": 23526 + }, + { + "epoch": 1.2811391776085166, + "grad_norm": 0.5080581539476212, + "learning_rate": 6.0442111141950306e-05, + "loss": 11.9114, + "step": 23527 + }, + { + "epoch": 1.2811936316050996, + "grad_norm": 0.5620295368624172, + "learning_rate": 6.0434012325262336e-05, + "loss": 11.9139, + "step": 23528 + }, + { + "epoch": 1.2812480856016826, + "grad_norm": 0.5600735699797295, + "learning_rate": 6.042591381624608e-05, + "loss": 11.8936, + "step": 23529 + }, + { + "epoch": 1.2813025395982656, + "grad_norm": 0.5644602481955032, + "learning_rate": 6.041781561496458e-05, + "loss": 11.8653, + "step": 23530 + }, + { + "epoch": 1.2813569935948488, + "grad_norm": 0.5071749127778168, + "learning_rate": 6.0409717721480796e-05, + "loss": 11.93, + "step": 23531 + }, + { + "epoch": 1.2814114475914318, + "grad_norm": 0.5515682328891715, + "learning_rate": 6.040162013585772e-05, + "loss": 11.8772, + "step": 23532 + }, + { + "epoch": 1.2814659015880148, + "grad_norm": 0.5675425006480583, + "learning_rate": 6.039352285815823e-05, + "loss": 11.881, + "step": 23533 + }, + { + "epoch": 1.2815203555845978, + "grad_norm": 0.5471934437434559, + "learning_rate": 6.038542588844536e-05, + "loss": 12.0435, + "step": 23534 + }, + { + "epoch": 1.2815748095811808, + "grad_norm": 0.5565368557901031, + "learning_rate": 6.037732922678206e-05, + "loss": 11.98, + "step": 23535 + }, + { + "epoch": 1.2816292635777637, + "grad_norm": 0.5070680059614715, + "learning_rate": 6.036923287323131e-05, + "loss": 11.7727, + "step": 23536 + }, + { + "epoch": 1.2816837175743467, + "grad_norm": 0.553493808862405, + "learning_rate": 6.0361136827856025e-05, + "loss": 12.0505, + "step": 23537 + }, + { + "epoch": 1.2817381715709297, + "grad_norm": 0.5608070685866844, + "learning_rate": 6.0353041090719196e-05, + "loss": 12.0352, + "step": 23538 + }, + { + "epoch": 1.2817926255675127, + "grad_norm": 0.5491546897844471, + "learning_rate": 6.034494566188378e-05, + "loss": 11.9842, + "step": 23539 + }, + { + "epoch": 1.2818470795640957, + "grad_norm": 0.5405475520046354, + "learning_rate": 6.033685054141272e-05, + "loss": 11.9077, + "step": 23540 + }, + { + "epoch": 1.2819015335606787, + "grad_norm": 0.5240052472356858, + "learning_rate": 6.0328755729368925e-05, + "loss": 11.8298, + "step": 23541 + }, + { + "epoch": 1.2819559875572617, + "grad_norm": 0.4917994796912182, + "learning_rate": 6.032066122581545e-05, + "loss": 11.8881, + "step": 23542 + }, + { + "epoch": 1.2820104415538447, + "grad_norm": 0.5027360717309111, + "learning_rate": 6.031256703081511e-05, + "loss": 11.9357, + "step": 23543 + }, + { + "epoch": 1.2820648955504277, + "grad_norm": 0.5170751853146255, + "learning_rate": 6.0304473144430926e-05, + "loss": 11.8487, + "step": 23544 + }, + { + "epoch": 1.2821193495470107, + "grad_norm": 0.5896824757083037, + "learning_rate": 6.0296379566725794e-05, + "loss": 11.8997, + "step": 23545 + }, + { + "epoch": 1.282173803543594, + "grad_norm": 0.5906842332292388, + "learning_rate": 6.028828629776269e-05, + "loss": 11.8785, + "step": 23546 + }, + { + "epoch": 1.2822282575401769, + "grad_norm": 0.5480852917853959, + "learning_rate": 6.028019333760452e-05, + "loss": 11.8501, + "step": 23547 + }, + { + "epoch": 1.2822827115367599, + "grad_norm": 0.5572639996001882, + "learning_rate": 6.0272100686314234e-05, + "loss": 11.9747, + "step": 23548 + }, + { + "epoch": 1.2823371655333429, + "grad_norm": 0.5369151321080103, + "learning_rate": 6.0264008343954757e-05, + "loss": 11.894, + "step": 23549 + }, + { + "epoch": 1.2823916195299259, + "grad_norm": 0.5091076563718924, + "learning_rate": 6.025591631058901e-05, + "loss": 11.925, + "step": 23550 + }, + { + "epoch": 1.2824460735265089, + "grad_norm": 0.6046272320357335, + "learning_rate": 6.0247824586279934e-05, + "loss": 11.9582, + "step": 23551 + }, + { + "epoch": 1.2825005275230918, + "grad_norm": 0.5397308357040135, + "learning_rate": 6.023973317109042e-05, + "loss": 11.9748, + "step": 23552 + }, + { + "epoch": 1.2825549815196748, + "grad_norm": 0.5633565540476361, + "learning_rate": 6.0231642065083446e-05, + "loss": 11.9608, + "step": 23553 + }, + { + "epoch": 1.282609435516258, + "grad_norm": 0.5766835774345143, + "learning_rate": 6.0223551268321874e-05, + "loss": 12.1016, + "step": 23554 + }, + { + "epoch": 1.282663889512841, + "grad_norm": 0.5828712749179817, + "learning_rate": 6.021546078086864e-05, + "loss": 11.944, + "step": 23555 + }, + { + "epoch": 1.282718343509424, + "grad_norm": 0.5719930660253305, + "learning_rate": 6.0207370602786626e-05, + "loss": 11.9348, + "step": 23556 + }, + { + "epoch": 1.282772797506007, + "grad_norm": 0.4884218997155032, + "learning_rate": 6.019928073413879e-05, + "loss": 11.8262, + "step": 23557 + }, + { + "epoch": 1.28282725150259, + "grad_norm": 0.5204375143460239, + "learning_rate": 6.019119117498802e-05, + "loss": 11.9146, + "step": 23558 + }, + { + "epoch": 1.282881705499173, + "grad_norm": 0.5678449367458687, + "learning_rate": 6.018310192539722e-05, + "loss": 11.979, + "step": 23559 + }, + { + "epoch": 1.282936159495756, + "grad_norm": 0.5339692036640276, + "learning_rate": 6.0175012985429313e-05, + "loss": 11.9012, + "step": 23560 + }, + { + "epoch": 1.282990613492339, + "grad_norm": 0.5594405997241506, + "learning_rate": 6.0166924355147166e-05, + "loss": 11.9855, + "step": 23561 + }, + { + "epoch": 1.283045067488922, + "grad_norm": 0.5583463392382488, + "learning_rate": 6.015883603461372e-05, + "loss": 11.9826, + "step": 23562 + }, + { + "epoch": 1.283099521485505, + "grad_norm": 0.529926138611232, + "learning_rate": 6.015074802389184e-05, + "loss": 11.8863, + "step": 23563 + }, + { + "epoch": 1.283153975482088, + "grad_norm": 0.5203733800981577, + "learning_rate": 6.0142660323044434e-05, + "loss": 11.8016, + "step": 23564 + }, + { + "epoch": 1.283208429478671, + "grad_norm": 0.5748523056640842, + "learning_rate": 6.0134572932134356e-05, + "loss": 11.8061, + "step": 23565 + }, + { + "epoch": 1.283262883475254, + "grad_norm": 0.5779499015108761, + "learning_rate": 6.0126485851224534e-05, + "loss": 11.9713, + "step": 23566 + }, + { + "epoch": 1.283317337471837, + "grad_norm": 0.5401640476135271, + "learning_rate": 6.011839908037783e-05, + "loss": 11.9529, + "step": 23567 + }, + { + "epoch": 1.28337179146842, + "grad_norm": 0.538671360492073, + "learning_rate": 6.011031261965716e-05, + "loss": 11.8232, + "step": 23568 + }, + { + "epoch": 1.2834262454650032, + "grad_norm": 0.5492423700879108, + "learning_rate": 6.010222646912539e-05, + "loss": 11.7709, + "step": 23569 + }, + { + "epoch": 1.2834806994615862, + "grad_norm": 0.5787543441007559, + "learning_rate": 6.00941406288454e-05, + "loss": 11.8725, + "step": 23570 + }, + { + "epoch": 1.2835351534581692, + "grad_norm": 0.6230576628149644, + "learning_rate": 6.0086055098880055e-05, + "loss": 11.9372, + "step": 23571 + }, + { + "epoch": 1.2835896074547521, + "grad_norm": 0.5521267877334615, + "learning_rate": 6.007796987929225e-05, + "loss": 11.9578, + "step": 23572 + }, + { + "epoch": 1.2836440614513351, + "grad_norm": 0.5963668936277664, + "learning_rate": 6.0069884970144865e-05, + "loss": 11.9397, + "step": 23573 + }, + { + "epoch": 1.2836985154479181, + "grad_norm": 0.5510056906815245, + "learning_rate": 6.006180037150073e-05, + "loss": 11.6585, + "step": 23574 + }, + { + "epoch": 1.2837529694445011, + "grad_norm": 0.5483257883956336, + "learning_rate": 6.005371608342272e-05, + "loss": 11.8992, + "step": 23575 + }, + { + "epoch": 1.2838074234410841, + "grad_norm": 0.5198488024735248, + "learning_rate": 6.004563210597372e-05, + "loss": 11.9707, + "step": 23576 + }, + { + "epoch": 1.283861877437667, + "grad_norm": 0.5488059841016077, + "learning_rate": 6.0037548439216596e-05, + "loss": 11.9824, + "step": 23577 + }, + { + "epoch": 1.2839163314342503, + "grad_norm": 0.5917836802938782, + "learning_rate": 6.0029465083214166e-05, + "loss": 11.8574, + "step": 23578 + }, + { + "epoch": 1.2839707854308333, + "grad_norm": 0.5013777123048397, + "learning_rate": 6.002138203802934e-05, + "loss": 11.8769, + "step": 23579 + }, + { + "epoch": 1.2840252394274163, + "grad_norm": 0.5305015348335042, + "learning_rate": 6.0013299303724955e-05, + "loss": 11.9521, + "step": 23580 + }, + { + "epoch": 1.2840796934239993, + "grad_norm": 0.5501595583170282, + "learning_rate": 6.0005216880363866e-05, + "loss": 11.9786, + "step": 23581 + }, + { + "epoch": 1.2841341474205823, + "grad_norm": 0.4893357817304148, + "learning_rate": 5.99971347680089e-05, + "loss": 11.8602, + "step": 23582 + }, + { + "epoch": 1.2841886014171653, + "grad_norm": 0.5931966388465176, + "learning_rate": 5.998905296672296e-05, + "loss": 12.0242, + "step": 23583 + }, + { + "epoch": 1.2842430554137483, + "grad_norm": 0.502447089109366, + "learning_rate": 5.9980971476568825e-05, + "loss": 11.9041, + "step": 23584 + }, + { + "epoch": 1.2842975094103313, + "grad_norm": 0.5075812170370226, + "learning_rate": 5.997289029760937e-05, + "loss": 11.8359, + "step": 23585 + }, + { + "epoch": 1.2843519634069143, + "grad_norm": 0.5448849983072174, + "learning_rate": 5.996480942990743e-05, + "loss": 11.8758, + "step": 23586 + }, + { + "epoch": 1.2844064174034973, + "grad_norm": 0.553796465616994, + "learning_rate": 5.995672887352586e-05, + "loss": 11.9681, + "step": 23587 + }, + { + "epoch": 1.2844608714000803, + "grad_norm": 0.5487255265970418, + "learning_rate": 5.994864862852746e-05, + "loss": 11.8359, + "step": 23588 + }, + { + "epoch": 1.2845153253966632, + "grad_norm": 0.573686709985358, + "learning_rate": 5.9940568694975096e-05, + "loss": 11.9695, + "step": 23589 + }, + { + "epoch": 1.2845697793932462, + "grad_norm": 0.5637612380220137, + "learning_rate": 5.993248907293156e-05, + "loss": 11.7809, + "step": 23590 + }, + { + "epoch": 1.2846242333898292, + "grad_norm": 0.4951625100799867, + "learning_rate": 5.9924409762459746e-05, + "loss": 11.815, + "step": 23591 + }, + { + "epoch": 1.2846786873864124, + "grad_norm": 0.5436142416323838, + "learning_rate": 5.991633076362242e-05, + "loss": 12.0503, + "step": 23592 + }, + { + "epoch": 1.2847331413829954, + "grad_norm": 0.5323256001549563, + "learning_rate": 5.990825207648247e-05, + "loss": 12.0098, + "step": 23593 + }, + { + "epoch": 1.2847875953795784, + "grad_norm": 0.5898551135137177, + "learning_rate": 5.9900173701102634e-05, + "loss": 11.9454, + "step": 23594 + }, + { + "epoch": 1.2848420493761614, + "grad_norm": 0.4723549336831795, + "learning_rate": 5.9892095637545784e-05, + "loss": 11.9479, + "step": 23595 + }, + { + "epoch": 1.2848965033727444, + "grad_norm": 0.4998330178812302, + "learning_rate": 5.988401788587472e-05, + "loss": 11.88, + "step": 23596 + }, + { + "epoch": 1.2849509573693274, + "grad_norm": 0.5051779881653149, + "learning_rate": 5.987594044615225e-05, + "loss": 11.9079, + "step": 23597 + }, + { + "epoch": 1.2850054113659104, + "grad_norm": 0.5245486970219934, + "learning_rate": 5.986786331844122e-05, + "loss": 11.889, + "step": 23598 + }, + { + "epoch": 1.2850598653624934, + "grad_norm": 0.5615451894349467, + "learning_rate": 5.98597865028044e-05, + "loss": 11.7919, + "step": 23599 + }, + { + "epoch": 1.2851143193590764, + "grad_norm": 0.4951813094482244, + "learning_rate": 5.9851709999304615e-05, + "loss": 12.0349, + "step": 23600 + }, + { + "epoch": 1.2851687733556596, + "grad_norm": 0.5609478608796767, + "learning_rate": 5.984363380800465e-05, + "loss": 12.0273, + "step": 23601 + }, + { + "epoch": 1.2852232273522426, + "grad_norm": 0.5654504929409612, + "learning_rate": 5.9835557928967335e-05, + "loss": 11.9728, + "step": 23602 + }, + { + "epoch": 1.2852776813488256, + "grad_norm": 0.5180004123328328, + "learning_rate": 5.982748236225551e-05, + "loss": 11.9134, + "step": 23603 + }, + { + "epoch": 1.2853321353454086, + "grad_norm": 0.5284728078042654, + "learning_rate": 5.9819407107931856e-05, + "loss": 11.8346, + "step": 23604 + }, + { + "epoch": 1.2853865893419916, + "grad_norm": 0.5193660021771949, + "learning_rate": 5.981133216605923e-05, + "loss": 11.9533, + "step": 23605 + }, + { + "epoch": 1.2854410433385746, + "grad_norm": 0.5128027865249251, + "learning_rate": 5.9803257536700444e-05, + "loss": 11.9465, + "step": 23606 + }, + { + "epoch": 1.2854954973351576, + "grad_norm": 0.5292023147523693, + "learning_rate": 5.979518321991826e-05, + "loss": 11.9244, + "step": 23607 + }, + { + "epoch": 1.2855499513317405, + "grad_norm": 0.5840133205707407, + "learning_rate": 5.9787109215775484e-05, + "loss": 11.9686, + "step": 23608 + }, + { + "epoch": 1.2856044053283235, + "grad_norm": 0.5453332728022575, + "learning_rate": 5.977903552433488e-05, + "loss": 11.8874, + "step": 23609 + }, + { + "epoch": 1.2856588593249065, + "grad_norm": 0.5694304844855177, + "learning_rate": 5.9770962145659245e-05, + "loss": 12.0776, + "step": 23610 + }, + { + "epoch": 1.2857133133214895, + "grad_norm": 0.5776448964429606, + "learning_rate": 5.9762889079811354e-05, + "loss": 11.9809, + "step": 23611 + }, + { + "epoch": 1.2857677673180725, + "grad_norm": 0.6017741977456825, + "learning_rate": 5.975481632685396e-05, + "loss": 11.8373, + "step": 23612 + }, + { + "epoch": 1.2858222213146555, + "grad_norm": 0.5445144755282323, + "learning_rate": 5.974674388684993e-05, + "loss": 11.8613, + "step": 23613 + }, + { + "epoch": 1.2858766753112385, + "grad_norm": 0.5497804284953607, + "learning_rate": 5.973867175986193e-05, + "loss": 11.9679, + "step": 23614 + }, + { + "epoch": 1.2859311293078217, + "grad_norm": 0.5528742445411363, + "learning_rate": 5.973059994595277e-05, + "loss": 11.9977, + "step": 23615 + }, + { + "epoch": 1.2859855833044047, + "grad_norm": 0.6365557765121772, + "learning_rate": 5.97225284451852e-05, + "loss": 11.8794, + "step": 23616 + }, + { + "epoch": 1.2860400373009877, + "grad_norm": 0.5608041996493477, + "learning_rate": 5.9714457257622016e-05, + "loss": 12.0025, + "step": 23617 + }, + { + "epoch": 1.2860944912975707, + "grad_norm": 0.4649658802703738, + "learning_rate": 5.9706386383325976e-05, + "loss": 11.8543, + "step": 23618 + }, + { + "epoch": 1.2861489452941537, + "grad_norm": 0.5246046723573995, + "learning_rate": 5.969831582235983e-05, + "loss": 11.7959, + "step": 23619 + }, + { + "epoch": 1.2862033992907367, + "grad_norm": 0.481864440579084, + "learning_rate": 5.969024557478633e-05, + "loss": 11.9564, + "step": 23620 + }, + { + "epoch": 1.2862578532873197, + "grad_norm": 0.5037851902097632, + "learning_rate": 5.9682175640668244e-05, + "loss": 11.7368, + "step": 23621 + }, + { + "epoch": 1.2863123072839027, + "grad_norm": 0.5100480487418915, + "learning_rate": 5.967410602006833e-05, + "loss": 11.8962, + "step": 23622 + }, + { + "epoch": 1.2863667612804857, + "grad_norm": 0.5635853340470395, + "learning_rate": 5.9666036713049356e-05, + "loss": 11.7806, + "step": 23623 + }, + { + "epoch": 1.2864212152770689, + "grad_norm": 0.5291879304710763, + "learning_rate": 5.9657967719674015e-05, + "loss": 11.9973, + "step": 23624 + }, + { + "epoch": 1.2864756692736519, + "grad_norm": 0.5879224653664377, + "learning_rate": 5.9649899040005085e-05, + "loss": 11.9562, + "step": 23625 + }, + { + "epoch": 1.2865301232702349, + "grad_norm": 0.4965507218558734, + "learning_rate": 5.9641830674105294e-05, + "loss": 11.877, + "step": 23626 + }, + { + "epoch": 1.2865845772668179, + "grad_norm": 0.6443914294268037, + "learning_rate": 5.9633762622037396e-05, + "loss": 12.0468, + "step": 23627 + }, + { + "epoch": 1.2866390312634008, + "grad_norm": 0.5358332881599275, + "learning_rate": 5.962569488386413e-05, + "loss": 11.9733, + "step": 23628 + }, + { + "epoch": 1.2866934852599838, + "grad_norm": 0.4888460672702682, + "learning_rate": 5.9617627459648253e-05, + "loss": 11.755, + "step": 23629 + }, + { + "epoch": 1.2867479392565668, + "grad_norm": 0.5055719708645405, + "learning_rate": 5.960956034945245e-05, + "loss": 11.8508, + "step": 23630 + }, + { + "epoch": 1.2868023932531498, + "grad_norm": 0.5385345014820144, + "learning_rate": 5.96014935533395e-05, + "loss": 11.8001, + "step": 23631 + }, + { + "epoch": 1.2868568472497328, + "grad_norm": 0.5515374747980404, + "learning_rate": 5.95934270713721e-05, + "loss": 11.989, + "step": 23632 + }, + { + "epoch": 1.2869113012463158, + "grad_norm": 0.5364470059610076, + "learning_rate": 5.958536090361302e-05, + "loss": 11.8877, + "step": 23633 + }, + { + "epoch": 1.2869657552428988, + "grad_norm": 0.6145460880505714, + "learning_rate": 5.9577295050124926e-05, + "loss": 12.0312, + "step": 23634 + }, + { + "epoch": 1.2870202092394818, + "grad_norm": 0.6416609088364478, + "learning_rate": 5.956922951097055e-05, + "loss": 11.8751, + "step": 23635 + }, + { + "epoch": 1.2870746632360648, + "grad_norm": 0.5836040120019552, + "learning_rate": 5.956116428621263e-05, + "loss": 11.9816, + "step": 23636 + }, + { + "epoch": 1.2871291172326478, + "grad_norm": 0.5161969708347284, + "learning_rate": 5.955309937591389e-05, + "loss": 11.936, + "step": 23637 + }, + { + "epoch": 1.2871835712292308, + "grad_norm": 0.5620302967688622, + "learning_rate": 5.954503478013703e-05, + "loss": 11.9001, + "step": 23638 + }, + { + "epoch": 1.287238025225814, + "grad_norm": 0.6406360918722982, + "learning_rate": 5.9536970498944745e-05, + "loss": 11.8731, + "step": 23639 + }, + { + "epoch": 1.287292479222397, + "grad_norm": 0.5256318177011917, + "learning_rate": 5.9528906532399776e-05, + "loss": 11.9375, + "step": 23640 + }, + { + "epoch": 1.28734693321898, + "grad_norm": 0.5221750789978684, + "learning_rate": 5.952084288056482e-05, + "loss": 11.951, + "step": 23641 + }, + { + "epoch": 1.287401387215563, + "grad_norm": 0.5805291395444275, + "learning_rate": 5.951277954350259e-05, + "loss": 11.9437, + "step": 23642 + }, + { + "epoch": 1.287455841212146, + "grad_norm": 0.5745203857240743, + "learning_rate": 5.950471652127578e-05, + "loss": 11.9151, + "step": 23643 + }, + { + "epoch": 1.287510295208729, + "grad_norm": 0.5976754697185809, + "learning_rate": 5.949665381394707e-05, + "loss": 12.0609, + "step": 23644 + }, + { + "epoch": 1.287564749205312, + "grad_norm": 0.5771545026760481, + "learning_rate": 5.948859142157917e-05, + "loss": 11.8742, + "step": 23645 + }, + { + "epoch": 1.287619203201895, + "grad_norm": 0.5279876863213004, + "learning_rate": 5.948052934423478e-05, + "loss": 11.8626, + "step": 23646 + }, + { + "epoch": 1.287673657198478, + "grad_norm": 0.48604035686339353, + "learning_rate": 5.947246758197658e-05, + "loss": 11.9474, + "step": 23647 + }, + { + "epoch": 1.2877281111950611, + "grad_norm": 0.6153703921357986, + "learning_rate": 5.946440613486728e-05, + "loss": 11.9791, + "step": 23648 + }, + { + "epoch": 1.2877825651916441, + "grad_norm": 0.49441011159873605, + "learning_rate": 5.945634500296955e-05, + "loss": 11.8251, + "step": 23649 + }, + { + "epoch": 1.2878370191882271, + "grad_norm": 0.5557404908758066, + "learning_rate": 5.944828418634607e-05, + "loss": 11.8788, + "step": 23650 + }, + { + "epoch": 1.2878914731848101, + "grad_norm": 0.5616918914548961, + "learning_rate": 5.9440223685059536e-05, + "loss": 11.9618, + "step": 23651 + }, + { + "epoch": 1.2879459271813931, + "grad_norm": 0.5308384410030225, + "learning_rate": 5.9432163499172645e-05, + "loss": 11.8682, + "step": 23652 + }, + { + "epoch": 1.288000381177976, + "grad_norm": 0.5100486469580383, + "learning_rate": 5.942410362874807e-05, + "loss": 11.93, + "step": 23653 + }, + { + "epoch": 1.288054835174559, + "grad_norm": 0.5627076991753851, + "learning_rate": 5.941604407384842e-05, + "loss": 11.8506, + "step": 23654 + }, + { + "epoch": 1.288109289171142, + "grad_norm": 0.5809962828595369, + "learning_rate": 5.940798483453645e-05, + "loss": 12.1106, + "step": 23655 + }, + { + "epoch": 1.288163743167725, + "grad_norm": 0.5761211825251635, + "learning_rate": 5.939992591087478e-05, + "loss": 11.9726, + "step": 23656 + }, + { + "epoch": 1.288218197164308, + "grad_norm": 0.5176080305630002, + "learning_rate": 5.939186730292611e-05, + "loss": 11.832, + "step": 23657 + }, + { + "epoch": 1.288272651160891, + "grad_norm": 0.5168934673239702, + "learning_rate": 5.938380901075308e-05, + "loss": 11.9764, + "step": 23658 + }, + { + "epoch": 1.288327105157474, + "grad_norm": 0.5240142885989021, + "learning_rate": 5.937575103441836e-05, + "loss": 11.9576, + "step": 23659 + }, + { + "epoch": 1.288381559154057, + "grad_norm": 0.5473455566039885, + "learning_rate": 5.936769337398462e-05, + "loss": 12.0432, + "step": 23660 + }, + { + "epoch": 1.28843601315064, + "grad_norm": 0.533416332353287, + "learning_rate": 5.935963602951449e-05, + "loss": 11.8169, + "step": 23661 + }, + { + "epoch": 1.2884904671472233, + "grad_norm": 0.5651284474850365, + "learning_rate": 5.9351579001070655e-05, + "loss": 12.0078, + "step": 23662 + }, + { + "epoch": 1.2885449211438063, + "grad_norm": 0.6008388714869841, + "learning_rate": 5.93435222887158e-05, + "loss": 11.9991, + "step": 23663 + }, + { + "epoch": 1.2885993751403892, + "grad_norm": 0.5592068078442981, + "learning_rate": 5.933546589251251e-05, + "loss": 11.9556, + "step": 23664 + }, + { + "epoch": 1.2886538291369722, + "grad_norm": 0.6156076208952148, + "learning_rate": 5.9327409812523424e-05, + "loss": 11.9975, + "step": 23665 + }, + { + "epoch": 1.2887082831335552, + "grad_norm": 0.5952681026970403, + "learning_rate": 5.9319354048811237e-05, + "loss": 11.9256, + "step": 23666 + }, + { + "epoch": 1.2887627371301382, + "grad_norm": 0.5675388295157449, + "learning_rate": 5.931129860143858e-05, + "loss": 11.8337, + "step": 23667 + }, + { + "epoch": 1.2888171911267212, + "grad_norm": 0.5549848730510045, + "learning_rate": 5.9303243470468095e-05, + "loss": 11.9471, + "step": 23668 + }, + { + "epoch": 1.2888716451233042, + "grad_norm": 0.5394915762639385, + "learning_rate": 5.929518865596241e-05, + "loss": 11.9956, + "step": 23669 + }, + { + "epoch": 1.2889260991198872, + "grad_norm": 0.5430149223289498, + "learning_rate": 5.928713415798416e-05, + "loss": 11.9668, + "step": 23670 + }, + { + "epoch": 1.2889805531164704, + "grad_norm": 0.5864954645272095, + "learning_rate": 5.927907997659598e-05, + "loss": 11.9191, + "step": 23671 + }, + { + "epoch": 1.2890350071130534, + "grad_norm": 0.5578452271628115, + "learning_rate": 5.9271026111860486e-05, + "loss": 11.8341, + "step": 23672 + }, + { + "epoch": 1.2890894611096364, + "grad_norm": 0.5312293951369268, + "learning_rate": 5.926297256384039e-05, + "loss": 11.844, + "step": 23673 + }, + { + "epoch": 1.2891439151062194, + "grad_norm": 0.5332665994177112, + "learning_rate": 5.92549193325982e-05, + "loss": 11.982, + "step": 23674 + }, + { + "epoch": 1.2891983691028024, + "grad_norm": 0.5135015601352471, + "learning_rate": 5.92468664181966e-05, + "loss": 12.0161, + "step": 23675 + }, + { + "epoch": 1.2892528230993854, + "grad_norm": 0.5983764378190438, + "learning_rate": 5.923881382069818e-05, + "loss": 11.7917, + "step": 23676 + }, + { + "epoch": 1.2893072770959684, + "grad_norm": 0.5557263800671538, + "learning_rate": 5.923076154016559e-05, + "loss": 11.9336, + "step": 23677 + }, + { + "epoch": 1.2893617310925514, + "grad_norm": 0.5779450898848552, + "learning_rate": 5.922270957666145e-05, + "loss": 12.007, + "step": 23678 + }, + { + "epoch": 1.2894161850891344, + "grad_norm": 0.5638874342151753, + "learning_rate": 5.921465793024834e-05, + "loss": 11.9898, + "step": 23679 + }, + { + "epoch": 1.2894706390857174, + "grad_norm": 0.5401848903416356, + "learning_rate": 5.9206606600988904e-05, + "loss": 12.068, + "step": 23680 + }, + { + "epoch": 1.2895250930823003, + "grad_norm": 0.5063761932526026, + "learning_rate": 5.919855558894574e-05, + "loss": 11.9837, + "step": 23681 + }, + { + "epoch": 1.2895795470788833, + "grad_norm": 0.5055177441847434, + "learning_rate": 5.919050489418143e-05, + "loss": 11.9499, + "step": 23682 + }, + { + "epoch": 1.2896340010754663, + "grad_norm": 0.572799586197494, + "learning_rate": 5.918245451675863e-05, + "loss": 12.0118, + "step": 23683 + }, + { + "epoch": 1.2896884550720493, + "grad_norm": 0.5243849526515628, + "learning_rate": 5.9174404456739896e-05, + "loss": 11.9704, + "step": 23684 + }, + { + "epoch": 1.2897429090686325, + "grad_norm": 0.5067721466066325, + "learning_rate": 5.9166354714187834e-05, + "loss": 11.9171, + "step": 23685 + }, + { + "epoch": 1.2897973630652155, + "grad_norm": 0.5837874081527563, + "learning_rate": 5.915830528916504e-05, + "loss": 11.9104, + "step": 23686 + }, + { + "epoch": 1.2898518170617985, + "grad_norm": 0.6207793209349238, + "learning_rate": 5.9150256181734085e-05, + "loss": 11.9239, + "step": 23687 + }, + { + "epoch": 1.2899062710583815, + "grad_norm": 0.5254442650792589, + "learning_rate": 5.914220739195763e-05, + "loss": 11.9123, + "step": 23688 + }, + { + "epoch": 1.2899607250549645, + "grad_norm": 0.49713421448790956, + "learning_rate": 5.913415891989821e-05, + "loss": 11.9195, + "step": 23689 + }, + { + "epoch": 1.2900151790515475, + "grad_norm": 0.534137301964781, + "learning_rate": 5.912611076561843e-05, + "loss": 11.7454, + "step": 23690 + }, + { + "epoch": 1.2900696330481305, + "grad_norm": 0.4969835572591292, + "learning_rate": 5.9118062929180854e-05, + "loss": 11.985, + "step": 23691 + }, + { + "epoch": 1.2901240870447135, + "grad_norm": 0.5544418764078004, + "learning_rate": 5.911001541064809e-05, + "loss": 11.9663, + "step": 23692 + }, + { + "epoch": 1.2901785410412965, + "grad_norm": 0.601756892019909, + "learning_rate": 5.9101968210082714e-05, + "loss": 11.9447, + "step": 23693 + }, + { + "epoch": 1.2902329950378797, + "grad_norm": 0.5287257994580373, + "learning_rate": 5.909392132754727e-05, + "loss": 11.8107, + "step": 23694 + }, + { + "epoch": 1.2902874490344627, + "grad_norm": 0.5607373362066157, + "learning_rate": 5.908587476310437e-05, + "loss": 11.9615, + "step": 23695 + }, + { + "epoch": 1.2903419030310457, + "grad_norm": 0.5465967799122933, + "learning_rate": 5.907782851681656e-05, + "loss": 11.9678, + "step": 23696 + }, + { + "epoch": 1.2903963570276287, + "grad_norm": 0.572141024917695, + "learning_rate": 5.9069782588746424e-05, + "loss": 11.9509, + "step": 23697 + }, + { + "epoch": 1.2904508110242117, + "grad_norm": 0.5471142728851983, + "learning_rate": 5.9061736978956515e-05, + "loss": 11.8992, + "step": 23698 + }, + { + "epoch": 1.2905052650207947, + "grad_norm": 0.5580420481713758, + "learning_rate": 5.9053691687509385e-05, + "loss": 11.8991, + "step": 23699 + }, + { + "epoch": 1.2905597190173776, + "grad_norm": 0.6032433260272864, + "learning_rate": 5.904564671446763e-05, + "loss": 11.9893, + "step": 23700 + }, + { + "epoch": 1.2906141730139606, + "grad_norm": 0.5273241292063136, + "learning_rate": 5.903760205989379e-05, + "loss": 11.8925, + "step": 23701 + }, + { + "epoch": 1.2906686270105436, + "grad_norm": 0.5173306857448301, + "learning_rate": 5.902955772385045e-05, + "loss": 12.0058, + "step": 23702 + }, + { + "epoch": 1.2907230810071266, + "grad_norm": 0.6058203988463428, + "learning_rate": 5.902151370640012e-05, + "loss": 11.9569, + "step": 23703 + }, + { + "epoch": 1.2907775350037096, + "grad_norm": 0.5844213336884051, + "learning_rate": 5.90134700076054e-05, + "loss": 12.0116, + "step": 23704 + }, + { + "epoch": 1.2908319890002926, + "grad_norm": 0.5443874397600219, + "learning_rate": 5.900542662752879e-05, + "loss": 11.9553, + "step": 23705 + }, + { + "epoch": 1.2908864429968756, + "grad_norm": 0.5647633404967047, + "learning_rate": 5.899738356623286e-05, + "loss": 11.9905, + "step": 23706 + }, + { + "epoch": 1.2909408969934586, + "grad_norm": 0.501919395815344, + "learning_rate": 5.8989340823780135e-05, + "loss": 11.9122, + "step": 23707 + }, + { + "epoch": 1.2909953509900416, + "grad_norm": 0.48842413756295855, + "learning_rate": 5.898129840023319e-05, + "loss": 11.8905, + "step": 23708 + }, + { + "epoch": 1.2910498049866248, + "grad_norm": 0.6030129318770379, + "learning_rate": 5.8973256295654546e-05, + "loss": 11.9723, + "step": 23709 + }, + { + "epoch": 1.2911042589832078, + "grad_norm": 0.5126006741755619, + "learning_rate": 5.8965214510106715e-05, + "loss": 11.9802, + "step": 23710 + }, + { + "epoch": 1.2911587129797908, + "grad_norm": 0.6057647348014951, + "learning_rate": 5.895717304365228e-05, + "loss": 11.811, + "step": 23711 + }, + { + "epoch": 1.2912131669763738, + "grad_norm": 0.5512818310963955, + "learning_rate": 5.894913189635375e-05, + "loss": 11.9218, + "step": 23712 + }, + { + "epoch": 1.2912676209729568, + "grad_norm": 0.5559498739751749, + "learning_rate": 5.894109106827366e-05, + "loss": 11.8989, + "step": 23713 + }, + { + "epoch": 1.2913220749695398, + "grad_norm": 0.5551262936257649, + "learning_rate": 5.8933050559474535e-05, + "loss": 12.003, + "step": 23714 + }, + { + "epoch": 1.2913765289661228, + "grad_norm": 0.5136157492964187, + "learning_rate": 5.89250103700189e-05, + "loss": 12.0017, + "step": 23715 + }, + { + "epoch": 1.2914309829627058, + "grad_norm": 0.531313460374404, + "learning_rate": 5.891697049996925e-05, + "loss": 11.9446, + "step": 23716 + }, + { + "epoch": 1.291485436959289, + "grad_norm": 0.48825477336853895, + "learning_rate": 5.890893094938814e-05, + "loss": 11.9323, + "step": 23717 + }, + { + "epoch": 1.291539890955872, + "grad_norm": 0.5101215598549473, + "learning_rate": 5.890089171833808e-05, + "loss": 11.972, + "step": 23718 + }, + { + "epoch": 1.291594344952455, + "grad_norm": 0.5541828303361535, + "learning_rate": 5.889285280688156e-05, + "loss": 11.953, + "step": 23719 + }, + { + "epoch": 1.291648798949038, + "grad_norm": 0.5143235778991876, + "learning_rate": 5.888481421508112e-05, + "loss": 11.9354, + "step": 23720 + }, + { + "epoch": 1.291703252945621, + "grad_norm": 0.5063198474722517, + "learning_rate": 5.887677594299924e-05, + "loss": 11.9112, + "step": 23721 + }, + { + "epoch": 1.291757706942204, + "grad_norm": 0.5603941868171852, + "learning_rate": 5.8868737990698465e-05, + "loss": 11.8923, + "step": 23722 + }, + { + "epoch": 1.291812160938787, + "grad_norm": 0.5081302596193433, + "learning_rate": 5.8860700358241286e-05, + "loss": 11.8388, + "step": 23723 + }, + { + "epoch": 1.29186661493537, + "grad_norm": 0.5332331312804888, + "learning_rate": 5.8852663045690236e-05, + "loss": 11.9585, + "step": 23724 + }, + { + "epoch": 1.291921068931953, + "grad_norm": 0.6407102059965988, + "learning_rate": 5.884462605310772e-05, + "loss": 12.1426, + "step": 23725 + }, + { + "epoch": 1.291975522928536, + "grad_norm": 0.5282749642282629, + "learning_rate": 5.8836589380556306e-05, + "loss": 11.9303, + "step": 23726 + }, + { + "epoch": 1.292029976925119, + "grad_norm": 0.5010477551716277, + "learning_rate": 5.882855302809849e-05, + "loss": 11.8868, + "step": 23727 + }, + { + "epoch": 1.2920844309217019, + "grad_norm": 0.5263521332847538, + "learning_rate": 5.8820516995796735e-05, + "loss": 11.686, + "step": 23728 + }, + { + "epoch": 1.2921388849182849, + "grad_norm": 0.5315598524409277, + "learning_rate": 5.881248128371355e-05, + "loss": 11.8769, + "step": 23729 + }, + { + "epoch": 1.2921933389148679, + "grad_norm": 0.5113617335711895, + "learning_rate": 5.880444589191143e-05, + "loss": 11.9031, + "step": 23730 + }, + { + "epoch": 1.2922477929114509, + "grad_norm": 0.5513426784972582, + "learning_rate": 5.879641082045284e-05, + "loss": 11.9405, + "step": 23731 + }, + { + "epoch": 1.292302246908034, + "grad_norm": 0.5547387914133479, + "learning_rate": 5.8788376069400244e-05, + "loss": 11.8762, + "step": 23732 + }, + { + "epoch": 1.292356700904617, + "grad_norm": 0.5883632763509766, + "learning_rate": 5.878034163881616e-05, + "loss": 11.9423, + "step": 23733 + }, + { + "epoch": 1.2924111549012, + "grad_norm": 0.5147257702675143, + "learning_rate": 5.87723075287631e-05, + "loss": 11.9103, + "step": 23734 + }, + { + "epoch": 1.292465608897783, + "grad_norm": 0.5376391013574864, + "learning_rate": 5.8764273739303464e-05, + "loss": 11.7991, + "step": 23735 + }, + { + "epoch": 1.292520062894366, + "grad_norm": 0.547459391107265, + "learning_rate": 5.8756240270499715e-05, + "loss": 11.882, + "step": 23736 + }, + { + "epoch": 1.292574516890949, + "grad_norm": 0.5494695018731429, + "learning_rate": 5.874820712241438e-05, + "loss": 11.9814, + "step": 23737 + }, + { + "epoch": 1.292628970887532, + "grad_norm": 0.5471057445038715, + "learning_rate": 5.874017429510992e-05, + "loss": 11.9549, + "step": 23738 + }, + { + "epoch": 1.292683424884115, + "grad_norm": 0.6807750576173539, + "learning_rate": 5.873214178864877e-05, + "loss": 11.8596, + "step": 23739 + }, + { + "epoch": 1.292737878880698, + "grad_norm": 0.5740114418723041, + "learning_rate": 5.87241096030934e-05, + "loss": 11.9223, + "step": 23740 + }, + { + "epoch": 1.2927923328772812, + "grad_norm": 0.5288712231651108, + "learning_rate": 5.8716077738506284e-05, + "loss": 11.8839, + "step": 23741 + }, + { + "epoch": 1.2928467868738642, + "grad_norm": 0.5266905118122426, + "learning_rate": 5.870804619494987e-05, + "loss": 11.9494, + "step": 23742 + }, + { + "epoch": 1.2929012408704472, + "grad_norm": 0.5449159386048019, + "learning_rate": 5.8700014972486606e-05, + "loss": 11.8494, + "step": 23743 + }, + { + "epoch": 1.2929556948670302, + "grad_norm": 0.5882369463915113, + "learning_rate": 5.869198407117897e-05, + "loss": 11.9339, + "step": 23744 + }, + { + "epoch": 1.2930101488636132, + "grad_norm": 0.5461902077565499, + "learning_rate": 5.8683953491089395e-05, + "loss": 11.988, + "step": 23745 + }, + { + "epoch": 1.2930646028601962, + "grad_norm": 0.5408887115112508, + "learning_rate": 5.867592323228031e-05, + "loss": 11.9151, + "step": 23746 + }, + { + "epoch": 1.2931190568567792, + "grad_norm": 0.5891116599950841, + "learning_rate": 5.866789329481418e-05, + "loss": 11.9295, + "step": 23747 + }, + { + "epoch": 1.2931735108533622, + "grad_norm": 0.5673280691471253, + "learning_rate": 5.865986367875342e-05, + "loss": 11.9891, + "step": 23748 + }, + { + "epoch": 1.2932279648499452, + "grad_norm": 0.5540971147823334, + "learning_rate": 5.865183438416051e-05, + "loss": 11.9701, + "step": 23749 + }, + { + "epoch": 1.2932824188465282, + "grad_norm": 0.5066621558352484, + "learning_rate": 5.864380541109788e-05, + "loss": 11.9131, + "step": 23750 + }, + { + "epoch": 1.2933368728431112, + "grad_norm": 0.5779754619780152, + "learning_rate": 5.863577675962795e-05, + "loss": 11.9878, + "step": 23751 + }, + { + "epoch": 1.2933913268396942, + "grad_norm": 0.5818797433262762, + "learning_rate": 5.862774842981315e-05, + "loss": 11.9, + "step": 23752 + }, + { + "epoch": 1.2934457808362771, + "grad_norm": 0.5821332362538245, + "learning_rate": 5.861972042171593e-05, + "loss": 11.967, + "step": 23753 + }, + { + "epoch": 1.2935002348328601, + "grad_norm": 0.5601375137464693, + "learning_rate": 5.861169273539871e-05, + "loss": 11.81, + "step": 23754 + }, + { + "epoch": 1.2935546888294434, + "grad_norm": 0.5738173837500221, + "learning_rate": 5.86036653709239e-05, + "loss": 11.8963, + "step": 23755 + }, + { + "epoch": 1.2936091428260263, + "grad_norm": 0.6331948374525299, + "learning_rate": 5.859563832835393e-05, + "loss": 11.9184, + "step": 23756 + }, + { + "epoch": 1.2936635968226093, + "grad_norm": 0.5269610859734193, + "learning_rate": 5.858761160775121e-05, + "loss": 11.9363, + "step": 23757 + }, + { + "epoch": 1.2937180508191923, + "grad_norm": 0.5604407469730615, + "learning_rate": 5.8579585209178164e-05, + "loss": 12.0051, + "step": 23758 + }, + { + "epoch": 1.2937725048157753, + "grad_norm": 0.529801134839837, + "learning_rate": 5.8571559132697206e-05, + "loss": 11.9784, + "step": 23759 + }, + { + "epoch": 1.2938269588123583, + "grad_norm": 0.5561562143215054, + "learning_rate": 5.856353337837076e-05, + "loss": 11.9694, + "step": 23760 + }, + { + "epoch": 1.2938814128089413, + "grad_norm": 0.4916798146514856, + "learning_rate": 5.8555507946261235e-05, + "loss": 11.8015, + "step": 23761 + }, + { + "epoch": 1.2939358668055243, + "grad_norm": 0.5454294032558833, + "learning_rate": 5.8547482836431014e-05, + "loss": 11.9613, + "step": 23762 + }, + { + "epoch": 1.2939903208021073, + "grad_norm": 0.566705039358307, + "learning_rate": 5.853945804894254e-05, + "loss": 11.8301, + "step": 23763 + }, + { + "epoch": 1.2940447747986905, + "grad_norm": 0.5164305283152058, + "learning_rate": 5.8531433583858195e-05, + "loss": 11.8817, + "step": 23764 + }, + { + "epoch": 1.2940992287952735, + "grad_norm": 0.5587180282690472, + "learning_rate": 5.8523409441240375e-05, + "loss": 11.9838, + "step": 23765 + }, + { + "epoch": 1.2941536827918565, + "grad_norm": 0.5042120080470417, + "learning_rate": 5.8515385621151464e-05, + "loss": 12.0209, + "step": 23766 + }, + { + "epoch": 1.2942081367884395, + "grad_norm": 0.5293438762147187, + "learning_rate": 5.8507362123653865e-05, + "loss": 11.926, + "step": 23767 + }, + { + "epoch": 1.2942625907850225, + "grad_norm": 0.5307943430357466, + "learning_rate": 5.849933894880999e-05, + "loss": 11.8703, + "step": 23768 + }, + { + "epoch": 1.2943170447816055, + "grad_norm": 0.5962872100297739, + "learning_rate": 5.8491316096682215e-05, + "loss": 11.9902, + "step": 23769 + }, + { + "epoch": 1.2943714987781885, + "grad_norm": 0.4928160541912051, + "learning_rate": 5.848329356733291e-05, + "loss": 11.823, + "step": 23770 + }, + { + "epoch": 1.2944259527747715, + "grad_norm": 0.47542862553071874, + "learning_rate": 5.847527136082449e-05, + "loss": 11.8341, + "step": 23771 + }, + { + "epoch": 1.2944804067713545, + "grad_norm": 0.5182900495334072, + "learning_rate": 5.8467249477219313e-05, + "loss": 11.8392, + "step": 23772 + }, + { + "epoch": 1.2945348607679374, + "grad_norm": 0.5317771082088423, + "learning_rate": 5.8459227916579785e-05, + "loss": 11.8587, + "step": 23773 + }, + { + "epoch": 1.2945893147645204, + "grad_norm": 0.5421895251981557, + "learning_rate": 5.8451206678968285e-05, + "loss": 11.8498, + "step": 23774 + }, + { + "epoch": 1.2946437687611034, + "grad_norm": 0.48099212544232717, + "learning_rate": 5.844318576444714e-05, + "loss": 11.7839, + "step": 23775 + }, + { + "epoch": 1.2946982227576864, + "grad_norm": 0.6457133927383553, + "learning_rate": 5.8435165173078765e-05, + "loss": 11.9891, + "step": 23776 + }, + { + "epoch": 1.2947526767542694, + "grad_norm": 0.581678108408296, + "learning_rate": 5.842714490492551e-05, + "loss": 12.0298, + "step": 23777 + }, + { + "epoch": 1.2948071307508524, + "grad_norm": 0.5619604634688051, + "learning_rate": 5.8419124960049745e-05, + "loss": 12.0587, + "step": 23778 + }, + { + "epoch": 1.2948615847474356, + "grad_norm": 0.5902723998282311, + "learning_rate": 5.841110533851384e-05, + "loss": 11.9813, + "step": 23779 + }, + { + "epoch": 1.2949160387440186, + "grad_norm": 0.5290632394977075, + "learning_rate": 5.840308604038015e-05, + "loss": 11.9145, + "step": 23780 + }, + { + "epoch": 1.2949704927406016, + "grad_norm": 0.5630728018540572, + "learning_rate": 5.839506706571103e-05, + "loss": 11.872, + "step": 23781 + }, + { + "epoch": 1.2950249467371846, + "grad_norm": 0.49309045151418357, + "learning_rate": 5.838704841456886e-05, + "loss": 11.8864, + "step": 23782 + }, + { + "epoch": 1.2950794007337676, + "grad_norm": 0.5205485808217177, + "learning_rate": 5.837903008701599e-05, + "loss": 11.8321, + "step": 23783 + }, + { + "epoch": 1.2951338547303506, + "grad_norm": 0.5734610177021967, + "learning_rate": 5.8371012083114776e-05, + "loss": 11.9272, + "step": 23784 + }, + { + "epoch": 1.2951883087269336, + "grad_norm": 0.61766529693796, + "learning_rate": 5.836299440292751e-05, + "loss": 12.018, + "step": 23785 + }, + { + "epoch": 1.2952427627235166, + "grad_norm": 0.5309879938683016, + "learning_rate": 5.835497704651662e-05, + "loss": 11.8067, + "step": 23786 + }, + { + "epoch": 1.2952972167200998, + "grad_norm": 0.5115278956316054, + "learning_rate": 5.834696001394437e-05, + "loss": 11.9181, + "step": 23787 + }, + { + "epoch": 1.2953516707166828, + "grad_norm": 0.5730103687587895, + "learning_rate": 5.8338943305273184e-05, + "loss": 12.0184, + "step": 23788 + }, + { + "epoch": 1.2954061247132658, + "grad_norm": 0.5128052120920888, + "learning_rate": 5.833092692056532e-05, + "loss": 11.7312, + "step": 23789 + }, + { + "epoch": 1.2954605787098488, + "grad_norm": 0.5364698494166938, + "learning_rate": 5.832291085988316e-05, + "loss": 11.9624, + "step": 23790 + }, + { + "epoch": 1.2955150327064318, + "grad_norm": 0.5467671699510431, + "learning_rate": 5.8314895123289074e-05, + "loss": 12.015, + "step": 23791 + }, + { + "epoch": 1.2955694867030147, + "grad_norm": 0.5578199902118518, + "learning_rate": 5.830687971084532e-05, + "loss": 11.8048, + "step": 23792 + }, + { + "epoch": 1.2956239406995977, + "grad_norm": 0.5275092571653156, + "learning_rate": 5.829886462261429e-05, + "loss": 11.8615, + "step": 23793 + }, + { + "epoch": 1.2956783946961807, + "grad_norm": 0.545961655763849, + "learning_rate": 5.829084985865829e-05, + "loss": 11.9292, + "step": 23794 + }, + { + "epoch": 1.2957328486927637, + "grad_norm": 0.5126942762884821, + "learning_rate": 5.828283541903959e-05, + "loss": 11.8929, + "step": 23795 + }, + { + "epoch": 1.2957873026893467, + "grad_norm": 0.6318212840121846, + "learning_rate": 5.8274821303820605e-05, + "loss": 12.0266, + "step": 23796 + }, + { + "epoch": 1.2958417566859297, + "grad_norm": 0.5517809070225351, + "learning_rate": 5.826680751306357e-05, + "loss": 11.8974, + "step": 23797 + }, + { + "epoch": 1.2958962106825127, + "grad_norm": 0.5450332140338545, + "learning_rate": 5.825879404683087e-05, + "loss": 12.0884, + "step": 23798 + }, + { + "epoch": 1.2959506646790957, + "grad_norm": 0.5179500438583599, + "learning_rate": 5.825078090518474e-05, + "loss": 11.829, + "step": 23799 + }, + { + "epoch": 1.2960051186756787, + "grad_norm": 0.4967420919153505, + "learning_rate": 5.824276808818756e-05, + "loss": 11.9634, + "step": 23800 + }, + { + "epoch": 1.2960595726722617, + "grad_norm": 0.5318850930840494, + "learning_rate": 5.8234755595901635e-05, + "loss": 11.9274, + "step": 23801 + }, + { + "epoch": 1.296114026668845, + "grad_norm": 0.5272746189101266, + "learning_rate": 5.822674342838923e-05, + "loss": 11.8998, + "step": 23802 + }, + { + "epoch": 1.296168480665428, + "grad_norm": 0.504146223444168, + "learning_rate": 5.8218731585712716e-05, + "loss": 11.9453, + "step": 23803 + }, + { + "epoch": 1.2962229346620109, + "grad_norm": 0.5549635152298085, + "learning_rate": 5.821072006793434e-05, + "loss": 12.0533, + "step": 23804 + }, + { + "epoch": 1.2962773886585939, + "grad_norm": 0.5818619237050583, + "learning_rate": 5.8202708875116385e-05, + "loss": 11.8858, + "step": 23805 + }, + { + "epoch": 1.2963318426551769, + "grad_norm": 0.6054106256274053, + "learning_rate": 5.8194698007321205e-05, + "loss": 11.9465, + "step": 23806 + }, + { + "epoch": 1.2963862966517599, + "grad_norm": 0.5968692808356221, + "learning_rate": 5.818668746461102e-05, + "loss": 12.0636, + "step": 23807 + }, + { + "epoch": 1.2964407506483429, + "grad_norm": 0.5005143996104131, + "learning_rate": 5.817867724704822e-05, + "loss": 11.8417, + "step": 23808 + }, + { + "epoch": 1.2964952046449258, + "grad_norm": 0.5110394529088859, + "learning_rate": 5.8170667354694984e-05, + "loss": 11.887, + "step": 23809 + }, + { + "epoch": 1.2965496586415088, + "grad_norm": 0.5864714130318646, + "learning_rate": 5.8162657787613715e-05, + "loss": 11.8843, + "step": 23810 + }, + { + "epoch": 1.296604112638092, + "grad_norm": 0.5275926335063034, + "learning_rate": 5.815464854586657e-05, + "loss": 11.9233, + "step": 23811 + }, + { + "epoch": 1.296658566634675, + "grad_norm": 0.5586000244429157, + "learning_rate": 5.814663962951592e-05, + "loss": 11.8799, + "step": 23812 + }, + { + "epoch": 1.296713020631258, + "grad_norm": 0.530092297639588, + "learning_rate": 5.8138631038624036e-05, + "loss": 11.8257, + "step": 23813 + }, + { + "epoch": 1.296767474627841, + "grad_norm": 0.5328622060712425, + "learning_rate": 5.813062277325319e-05, + "loss": 11.7601, + "step": 23814 + }, + { + "epoch": 1.296821928624424, + "grad_norm": 0.5578334577050502, + "learning_rate": 5.812261483346564e-05, + "loss": 11.9954, + "step": 23815 + }, + { + "epoch": 1.296876382621007, + "grad_norm": 0.5591524005561552, + "learning_rate": 5.8114607219323623e-05, + "loss": 11.8942, + "step": 23816 + }, + { + "epoch": 1.29693083661759, + "grad_norm": 0.5014435898895022, + "learning_rate": 5.810659993088944e-05, + "loss": 11.9737, + "step": 23817 + }, + { + "epoch": 1.296985290614173, + "grad_norm": 0.5155678054343346, + "learning_rate": 5.8098592968225396e-05, + "loss": 11.8323, + "step": 23818 + }, + { + "epoch": 1.297039744610756, + "grad_norm": 0.5822572248388822, + "learning_rate": 5.809058633139369e-05, + "loss": 12.0131, + "step": 23819 + }, + { + "epoch": 1.297094198607339, + "grad_norm": 0.5669873017065186, + "learning_rate": 5.8082580020456656e-05, + "loss": 11.9765, + "step": 23820 + }, + { + "epoch": 1.297148652603922, + "grad_norm": 0.565166966704871, + "learning_rate": 5.807457403547646e-05, + "loss": 11.9164, + "step": 23821 + }, + { + "epoch": 1.297203106600505, + "grad_norm": 0.6252835077462645, + "learning_rate": 5.806656837651545e-05, + "loss": 11.9195, + "step": 23822 + }, + { + "epoch": 1.297257560597088, + "grad_norm": 0.5209635888907312, + "learning_rate": 5.8058563043635796e-05, + "loss": 11.868, + "step": 23823 + }, + { + "epoch": 1.297312014593671, + "grad_norm": 0.5397714261569933, + "learning_rate": 5.8050558036899826e-05, + "loss": 11.922, + "step": 23824 + }, + { + "epoch": 1.2973664685902542, + "grad_norm": 0.5615187512027605, + "learning_rate": 5.804255335636976e-05, + "loss": 12.0561, + "step": 23825 + }, + { + "epoch": 1.2974209225868372, + "grad_norm": 0.5356667127732464, + "learning_rate": 5.8034549002107786e-05, + "loss": 11.9817, + "step": 23826 + }, + { + "epoch": 1.2974753765834202, + "grad_norm": 0.5363407231181678, + "learning_rate": 5.802654497417621e-05, + "loss": 11.8955, + "step": 23827 + }, + { + "epoch": 1.2975298305800032, + "grad_norm": 0.5562681698490525, + "learning_rate": 5.8018541272637286e-05, + "loss": 11.9367, + "step": 23828 + }, + { + "epoch": 1.2975842845765861, + "grad_norm": 0.5507851988821135, + "learning_rate": 5.8010537897553195e-05, + "loss": 11.8209, + "step": 23829 + }, + { + "epoch": 1.2976387385731691, + "grad_norm": 0.5723448092638935, + "learning_rate": 5.800253484898623e-05, + "loss": 11.934, + "step": 23830 + }, + { + "epoch": 1.2976931925697521, + "grad_norm": 0.5014592242239699, + "learning_rate": 5.7994532126998566e-05, + "loss": 12.0612, + "step": 23831 + }, + { + "epoch": 1.2977476465663351, + "grad_norm": 0.4921722432223852, + "learning_rate": 5.798652973165251e-05, + "loss": 11.8723, + "step": 23832 + }, + { + "epoch": 1.2978021005629181, + "grad_norm": 0.5835628798465374, + "learning_rate": 5.7978527663010196e-05, + "loss": 12.0561, + "step": 23833 + }, + { + "epoch": 1.2978565545595013, + "grad_norm": 0.5568174068152698, + "learning_rate": 5.797052592113394e-05, + "loss": 11.8702, + "step": 23834 + }, + { + "epoch": 1.2979110085560843, + "grad_norm": 0.5443639620474775, + "learning_rate": 5.796252450608592e-05, + "loss": 11.9998, + "step": 23835 + }, + { + "epoch": 1.2979654625526673, + "grad_norm": 0.5243651156283624, + "learning_rate": 5.795452341792833e-05, + "loss": 11.8549, + "step": 23836 + }, + { + "epoch": 1.2980199165492503, + "grad_norm": 0.5552627226889993, + "learning_rate": 5.7946522656723447e-05, + "loss": 11.9421, + "step": 23837 + }, + { + "epoch": 1.2980743705458333, + "grad_norm": 0.5577937602476003, + "learning_rate": 5.793852222253342e-05, + "loss": 11.9975, + "step": 23838 + }, + { + "epoch": 1.2981288245424163, + "grad_norm": 0.5406367919214983, + "learning_rate": 5.7930522115420495e-05, + "loss": 11.9579, + "step": 23839 + }, + { + "epoch": 1.2981832785389993, + "grad_norm": 0.5159964460721704, + "learning_rate": 5.792252233544693e-05, + "loss": 11.9685, + "step": 23840 + }, + { + "epoch": 1.2982377325355823, + "grad_norm": 0.5752622332443564, + "learning_rate": 5.791452288267483e-05, + "loss": 12.0587, + "step": 23841 + }, + { + "epoch": 1.2982921865321653, + "grad_norm": 0.5143233796983403, + "learning_rate": 5.790652375716652e-05, + "loss": 11.828, + "step": 23842 + }, + { + "epoch": 1.2983466405287483, + "grad_norm": 0.5817519099039418, + "learning_rate": 5.789852495898409e-05, + "loss": 11.948, + "step": 23843 + }, + { + "epoch": 1.2984010945253313, + "grad_norm": 0.5981241894383442, + "learning_rate": 5.789052648818982e-05, + "loss": 12.0364, + "step": 23844 + }, + { + "epoch": 1.2984555485219142, + "grad_norm": 0.55008481899298, + "learning_rate": 5.78825283448459e-05, + "loss": 11.9335, + "step": 23845 + }, + { + "epoch": 1.2985100025184972, + "grad_norm": 0.5705241028715168, + "learning_rate": 5.787453052901446e-05, + "loss": 11.9181, + "step": 23846 + }, + { + "epoch": 1.2985644565150802, + "grad_norm": 0.5941401412513208, + "learning_rate": 5.786653304075775e-05, + "loss": 12.0423, + "step": 23847 + }, + { + "epoch": 1.2986189105116632, + "grad_norm": 0.5608612963629043, + "learning_rate": 5.785853588013793e-05, + "loss": 11.9988, + "step": 23848 + }, + { + "epoch": 1.2986733645082464, + "grad_norm": 0.5342991395931518, + "learning_rate": 5.78505390472172e-05, + "loss": 11.8275, + "step": 23849 + }, + { + "epoch": 1.2987278185048294, + "grad_norm": 0.592120336158684, + "learning_rate": 5.784254254205779e-05, + "loss": 11.9297, + "step": 23850 + }, + { + "epoch": 1.2987822725014124, + "grad_norm": 0.5739279250310847, + "learning_rate": 5.7834546364721785e-05, + "loss": 12.0212, + "step": 23851 + }, + { + "epoch": 1.2988367264979954, + "grad_norm": 0.5394185905729064, + "learning_rate": 5.782655051527146e-05, + "loss": 11.9065, + "step": 23852 + }, + { + "epoch": 1.2988911804945784, + "grad_norm": 0.5220110030213705, + "learning_rate": 5.781855499376891e-05, + "loss": 11.8859, + "step": 23853 + }, + { + "epoch": 1.2989456344911614, + "grad_norm": 0.7403391556608028, + "learning_rate": 5.7810559800276396e-05, + "loss": 11.9985, + "step": 23854 + }, + { + "epoch": 1.2990000884877444, + "grad_norm": 0.5237367908716927, + "learning_rate": 5.780256493485605e-05, + "loss": 11.9219, + "step": 23855 + }, + { + "epoch": 1.2990545424843274, + "grad_norm": 0.5028134170372256, + "learning_rate": 5.779457039756999e-05, + "loss": 11.9866, + "step": 23856 + }, + { + "epoch": 1.2991089964809106, + "grad_norm": 0.5407719520119866, + "learning_rate": 5.7786576188480465e-05, + "loss": 11.8646, + "step": 23857 + }, + { + "epoch": 1.2991634504774936, + "grad_norm": 0.5432444500769634, + "learning_rate": 5.777858230764956e-05, + "loss": 11.799, + "step": 23858 + }, + { + "epoch": 1.2992179044740766, + "grad_norm": 0.556584499302641, + "learning_rate": 5.777058875513952e-05, + "loss": 11.9791, + "step": 23859 + }, + { + "epoch": 1.2992723584706596, + "grad_norm": 0.5299547991900019, + "learning_rate": 5.7762595531012444e-05, + "loss": 11.9898, + "step": 23860 + }, + { + "epoch": 1.2993268124672426, + "grad_norm": 0.5567155647967426, + "learning_rate": 5.775460263533048e-05, + "loss": 11.829, + "step": 23861 + }, + { + "epoch": 1.2993812664638256, + "grad_norm": 0.5485088855415148, + "learning_rate": 5.774661006815587e-05, + "loss": 12.0006, + "step": 23862 + }, + { + "epoch": 1.2994357204604086, + "grad_norm": 0.6038910920864603, + "learning_rate": 5.773861782955066e-05, + "loss": 11.9628, + "step": 23863 + }, + { + "epoch": 1.2994901744569916, + "grad_norm": 0.5907919696422618, + "learning_rate": 5.773062591957708e-05, + "loss": 12.1342, + "step": 23864 + }, + { + "epoch": 1.2995446284535745, + "grad_norm": 0.56804388009057, + "learning_rate": 5.7722634338297256e-05, + "loss": 11.9145, + "step": 23865 + }, + { + "epoch": 1.2995990824501575, + "grad_norm": 0.5934006202192055, + "learning_rate": 5.7714643085773256e-05, + "loss": 11.9192, + "step": 23866 + }, + { + "epoch": 1.2996535364467405, + "grad_norm": 0.521596306869903, + "learning_rate": 5.770665216206734e-05, + "loss": 11.9393, + "step": 23867 + }, + { + "epoch": 1.2997079904433235, + "grad_norm": 0.5476409909677536, + "learning_rate": 5.769866156724154e-05, + "loss": 11.8477, + "step": 23868 + }, + { + "epoch": 1.2997624444399065, + "grad_norm": 0.5489940155779739, + "learning_rate": 5.769067130135808e-05, + "loss": 11.8936, + "step": 23869 + }, + { + "epoch": 1.2998168984364895, + "grad_norm": 0.5687623535481318, + "learning_rate": 5.7682681364479027e-05, + "loss": 11.9452, + "step": 23870 + }, + { + "epoch": 1.2998713524330725, + "grad_norm": 0.49442109312059335, + "learning_rate": 5.7674691756666574e-05, + "loss": 11.8604, + "step": 23871 + }, + { + "epoch": 1.2999258064296557, + "grad_norm": 0.5268642892099454, + "learning_rate": 5.766670247798277e-05, + "loss": 11.9767, + "step": 23872 + }, + { + "epoch": 1.2999802604262387, + "grad_norm": 0.5900355269341435, + "learning_rate": 5.7658713528489795e-05, + "loss": 11.9823, + "step": 23873 + }, + { + "epoch": 1.3000347144228217, + "grad_norm": 0.5524735410150464, + "learning_rate": 5.76507249082498e-05, + "loss": 12.0053, + "step": 23874 + }, + { + "epoch": 1.3000891684194047, + "grad_norm": 0.5957451169779749, + "learning_rate": 5.7642736617324864e-05, + "loss": 11.9088, + "step": 23875 + }, + { + "epoch": 1.3001436224159877, + "grad_norm": 0.5603659799437561, + "learning_rate": 5.763474865577708e-05, + "loss": 11.9087, + "step": 23876 + }, + { + "epoch": 1.3001980764125707, + "grad_norm": 0.5687007741173038, + "learning_rate": 5.7626761023668644e-05, + "loss": 11.9719, + "step": 23877 + }, + { + "epoch": 1.3002525304091537, + "grad_norm": 0.523834443595407, + "learning_rate": 5.761877372106157e-05, + "loss": 11.9838, + "step": 23878 + }, + { + "epoch": 1.3003069844057367, + "grad_norm": 0.5162018241912517, + "learning_rate": 5.761078674801805e-05, + "loss": 11.8398, + "step": 23879 + }, + { + "epoch": 1.3003614384023197, + "grad_norm": 0.5216713171190042, + "learning_rate": 5.760280010460013e-05, + "loss": 12.0186, + "step": 23880 + }, + { + "epoch": 1.3004158923989029, + "grad_norm": 0.5385634257076888, + "learning_rate": 5.759481379086998e-05, + "loss": 11.9314, + "step": 23881 + }, + { + "epoch": 1.3004703463954859, + "grad_norm": 0.5015222462913387, + "learning_rate": 5.758682780688964e-05, + "loss": 11.8739, + "step": 23882 + }, + { + "epoch": 1.3005248003920689, + "grad_norm": 0.5829663844393825, + "learning_rate": 5.757884215272123e-05, + "loss": 11.9699, + "step": 23883 + }, + { + "epoch": 1.3005792543886519, + "grad_norm": 0.5987181288521144, + "learning_rate": 5.7570856828426914e-05, + "loss": 11.7919, + "step": 23884 + }, + { + "epoch": 1.3006337083852348, + "grad_norm": 0.5477114358282926, + "learning_rate": 5.756287183406872e-05, + "loss": 11.9544, + "step": 23885 + }, + { + "epoch": 1.3006881623818178, + "grad_norm": 0.5637247972403945, + "learning_rate": 5.755488716970874e-05, + "loss": 11.9266, + "step": 23886 + }, + { + "epoch": 1.3007426163784008, + "grad_norm": 0.5424718020020158, + "learning_rate": 5.754690283540904e-05, + "loss": 11.864, + "step": 23887 + }, + { + "epoch": 1.3007970703749838, + "grad_norm": 0.5247746464249025, + "learning_rate": 5.753891883123176e-05, + "loss": 11.9365, + "step": 23888 + }, + { + "epoch": 1.3008515243715668, + "grad_norm": 0.5233613628141781, + "learning_rate": 5.7530935157239e-05, + "loss": 12.0639, + "step": 23889 + }, + { + "epoch": 1.3009059783681498, + "grad_norm": 0.5323720067503869, + "learning_rate": 5.7522951813492764e-05, + "loss": 11.9468, + "step": 23890 + }, + { + "epoch": 1.3009604323647328, + "grad_norm": 0.481733645939565, + "learning_rate": 5.7514968800055226e-05, + "loss": 11.8236, + "step": 23891 + }, + { + "epoch": 1.3010148863613158, + "grad_norm": 0.5137549636562518, + "learning_rate": 5.750698611698837e-05, + "loss": 11.92, + "step": 23892 + }, + { + "epoch": 1.3010693403578988, + "grad_norm": 0.5909328738816363, + "learning_rate": 5.7499003764354354e-05, + "loss": 11.988, + "step": 23893 + }, + { + "epoch": 1.3011237943544818, + "grad_norm": 0.576355762077326, + "learning_rate": 5.749102174221517e-05, + "loss": 11.999, + "step": 23894 + }, + { + "epoch": 1.301178248351065, + "grad_norm": 0.5370711057197125, + "learning_rate": 5.7483040050632974e-05, + "loss": 11.8079, + "step": 23895 + }, + { + "epoch": 1.301232702347648, + "grad_norm": 0.5246186221614892, + "learning_rate": 5.747505868966979e-05, + "loss": 11.9394, + "step": 23896 + }, + { + "epoch": 1.301287156344231, + "grad_norm": 0.5614223842972061, + "learning_rate": 5.746707765938764e-05, + "loss": 11.8673, + "step": 23897 + }, + { + "epoch": 1.301341610340814, + "grad_norm": 0.5842289670745541, + "learning_rate": 5.745909695984864e-05, + "loss": 11.9609, + "step": 23898 + }, + { + "epoch": 1.301396064337397, + "grad_norm": 0.5484661025083538, + "learning_rate": 5.745111659111485e-05, + "loss": 11.8219, + "step": 23899 + }, + { + "epoch": 1.30145051833398, + "grad_norm": 0.485387211382524, + "learning_rate": 5.744313655324829e-05, + "loss": 11.9354, + "step": 23900 + }, + { + "epoch": 1.301504972330563, + "grad_norm": 0.5524606598815619, + "learning_rate": 5.743515684631107e-05, + "loss": 11.8696, + "step": 23901 + }, + { + "epoch": 1.301559426327146, + "grad_norm": 0.563018059080123, + "learning_rate": 5.742717747036517e-05, + "loss": 12.0242, + "step": 23902 + }, + { + "epoch": 1.301613880323729, + "grad_norm": 0.5186934091779591, + "learning_rate": 5.741919842547272e-05, + "loss": 11.9867, + "step": 23903 + }, + { + "epoch": 1.3016683343203121, + "grad_norm": 0.5546046245961251, + "learning_rate": 5.741121971169567e-05, + "loss": 11.9718, + "step": 23904 + }, + { + "epoch": 1.3017227883168951, + "grad_norm": 0.5722461246795527, + "learning_rate": 5.740324132909618e-05, + "loss": 11.9853, + "step": 23905 + }, + { + "epoch": 1.3017772423134781, + "grad_norm": 0.56459150025772, + "learning_rate": 5.739526327773622e-05, + "loss": 12.0055, + "step": 23906 + }, + { + "epoch": 1.3018316963100611, + "grad_norm": 0.5408282463553108, + "learning_rate": 5.73872855576778e-05, + "loss": 11.9205, + "step": 23907 + }, + { + "epoch": 1.3018861503066441, + "grad_norm": 0.5474122335268758, + "learning_rate": 5.737930816898303e-05, + "loss": 12.0301, + "step": 23908 + }, + { + "epoch": 1.3019406043032271, + "grad_norm": 0.5187639001840174, + "learning_rate": 5.737133111171386e-05, + "loss": 11.9641, + "step": 23909 + }, + { + "epoch": 1.30199505829981, + "grad_norm": 0.6272055705797348, + "learning_rate": 5.736335438593239e-05, + "loss": 12.128, + "step": 23910 + }, + { + "epoch": 1.302049512296393, + "grad_norm": 0.5272254647433033, + "learning_rate": 5.7355377991700645e-05, + "loss": 11.9563, + "step": 23911 + }, + { + "epoch": 1.302103966292976, + "grad_norm": 0.5549107036377859, + "learning_rate": 5.734740192908059e-05, + "loss": 11.8946, + "step": 23912 + }, + { + "epoch": 1.302158420289559, + "grad_norm": 0.5343595563170288, + "learning_rate": 5.733942619813435e-05, + "loss": 11.9768, + "step": 23913 + }, + { + "epoch": 1.302212874286142, + "grad_norm": 0.49984534310555945, + "learning_rate": 5.7331450798923835e-05, + "loss": 11.8715, + "step": 23914 + }, + { + "epoch": 1.302267328282725, + "grad_norm": 0.5395153645029583, + "learning_rate": 5.732347573151116e-05, + "loss": 11.958, + "step": 23915 + }, + { + "epoch": 1.302321782279308, + "grad_norm": 0.48031310327344234, + "learning_rate": 5.7315500995958284e-05, + "loss": 11.91, + "step": 23916 + }, + { + "epoch": 1.302376236275891, + "grad_norm": 0.4942641907324273, + "learning_rate": 5.73075265923272e-05, + "loss": 11.8832, + "step": 23917 + }, + { + "epoch": 1.3024306902724743, + "grad_norm": 0.5386913802218604, + "learning_rate": 5.729955252067998e-05, + "loss": 11.9891, + "step": 23918 + }, + { + "epoch": 1.3024851442690573, + "grad_norm": 0.5376796334453726, + "learning_rate": 5.7291578781078556e-05, + "loss": 11.9751, + "step": 23919 + }, + { + "epoch": 1.3025395982656403, + "grad_norm": 0.5565333856111213, + "learning_rate": 5.7283605373585035e-05, + "loss": 11.8746, + "step": 23920 + }, + { + "epoch": 1.3025940522622232, + "grad_norm": 0.5851019026703895, + "learning_rate": 5.727563229826132e-05, + "loss": 11.941, + "step": 23921 + }, + { + "epoch": 1.3026485062588062, + "grad_norm": 0.4855587166916286, + "learning_rate": 5.726765955516944e-05, + "loss": 11.9242, + "step": 23922 + }, + { + "epoch": 1.3027029602553892, + "grad_norm": 0.5296173452222003, + "learning_rate": 5.7259687144371454e-05, + "loss": 11.8905, + "step": 23923 + }, + { + "epoch": 1.3027574142519722, + "grad_norm": 0.581536295566947, + "learning_rate": 5.725171506592927e-05, + "loss": 11.7452, + "step": 23924 + }, + { + "epoch": 1.3028118682485552, + "grad_norm": 0.5704231691825496, + "learning_rate": 5.724374331990495e-05, + "loss": 11.8212, + "step": 23925 + }, + { + "epoch": 1.3028663222451382, + "grad_norm": 0.6077926864510211, + "learning_rate": 5.723577190636047e-05, + "loss": 11.9739, + "step": 23926 + }, + { + "epoch": 1.3029207762417214, + "grad_norm": 0.539981475425696, + "learning_rate": 5.722780082535775e-05, + "loss": 11.9746, + "step": 23927 + }, + { + "epoch": 1.3029752302383044, + "grad_norm": 0.5626468766958902, + "learning_rate": 5.721983007695886e-05, + "loss": 11.9676, + "step": 23928 + }, + { + "epoch": 1.3030296842348874, + "grad_norm": 0.5107973326796912, + "learning_rate": 5.72118596612257e-05, + "loss": 11.9145, + "step": 23929 + }, + { + "epoch": 1.3030841382314704, + "grad_norm": 0.5337323240886189, + "learning_rate": 5.720388957822034e-05, + "loss": 11.822, + "step": 23930 + }, + { + "epoch": 1.3031385922280534, + "grad_norm": 0.6045486064607628, + "learning_rate": 5.7195919828004675e-05, + "loss": 11.9822, + "step": 23931 + }, + { + "epoch": 1.3031930462246364, + "grad_norm": 0.5117673129782551, + "learning_rate": 5.718795041064071e-05, + "loss": 11.9986, + "step": 23932 + }, + { + "epoch": 1.3032475002212194, + "grad_norm": 0.5175239048840924, + "learning_rate": 5.717998132619047e-05, + "loss": 11.9173, + "step": 23933 + }, + { + "epoch": 1.3033019542178024, + "grad_norm": 0.5182460916005857, + "learning_rate": 5.7172012574715826e-05, + "loss": 11.8938, + "step": 23934 + }, + { + "epoch": 1.3033564082143854, + "grad_norm": 0.5171902548628802, + "learning_rate": 5.716404415627887e-05, + "loss": 11.8766, + "step": 23935 + }, + { + "epoch": 1.3034108622109684, + "grad_norm": 0.6326446928776895, + "learning_rate": 5.715607607094142e-05, + "loss": 11.9336, + "step": 23936 + }, + { + "epoch": 1.3034653162075513, + "grad_norm": 0.5356317795730359, + "learning_rate": 5.714810831876551e-05, + "loss": 11.9468, + "step": 23937 + }, + { + "epoch": 1.3035197702041343, + "grad_norm": 0.5355944319998637, + "learning_rate": 5.7140140899813144e-05, + "loss": 11.8743, + "step": 23938 + }, + { + "epoch": 1.3035742242007173, + "grad_norm": 0.576354179545349, + "learning_rate": 5.7132173814146176e-05, + "loss": 11.9397, + "step": 23939 + }, + { + "epoch": 1.3036286781973003, + "grad_norm": 0.7450549003939333, + "learning_rate": 5.712420706182666e-05, + "loss": 12.009, + "step": 23940 + }, + { + "epoch": 1.3036831321938833, + "grad_norm": 0.8013479299433881, + "learning_rate": 5.7116240642916455e-05, + "loss": 12.0488, + "step": 23941 + }, + { + "epoch": 1.3037375861904665, + "grad_norm": 0.5770703440886297, + "learning_rate": 5.7108274557477606e-05, + "loss": 11.8351, + "step": 23942 + }, + { + "epoch": 1.3037920401870495, + "grad_norm": 0.5448431251972765, + "learning_rate": 5.7100308805571946e-05, + "loss": 11.942, + "step": 23943 + }, + { + "epoch": 1.3038464941836325, + "grad_norm": 0.56651186123102, + "learning_rate": 5.709234338726149e-05, + "loss": 11.9969, + "step": 23944 + }, + { + "epoch": 1.3039009481802155, + "grad_norm": 0.5477211928099325, + "learning_rate": 5.7084378302608245e-05, + "loss": 11.869, + "step": 23945 + }, + { + "epoch": 1.3039554021767985, + "grad_norm": 0.5792967807923594, + "learning_rate": 5.707641355167399e-05, + "loss": 11.993, + "step": 23946 + }, + { + "epoch": 1.3040098561733815, + "grad_norm": 0.6211499004231313, + "learning_rate": 5.706844913452075e-05, + "loss": 11.8483, + "step": 23947 + }, + { + "epoch": 1.3040643101699645, + "grad_norm": 0.5835205538992685, + "learning_rate": 5.7060485051210486e-05, + "loss": 11.8844, + "step": 23948 + }, + { + "epoch": 1.3041187641665475, + "grad_norm": 0.5406301439325446, + "learning_rate": 5.705252130180505e-05, + "loss": 11.8945, + "step": 23949 + }, + { + "epoch": 1.3041732181631307, + "grad_norm": 0.510877086515905, + "learning_rate": 5.7044557886366455e-05, + "loss": 11.9438, + "step": 23950 + }, + { + "epoch": 1.3042276721597137, + "grad_norm": 0.6818922851058039, + "learning_rate": 5.703659480495653e-05, + "loss": 12.0, + "step": 23951 + }, + { + "epoch": 1.3042821261562967, + "grad_norm": 0.5291035509334728, + "learning_rate": 5.702863205763731e-05, + "loss": 11.9178, + "step": 23952 + }, + { + "epoch": 1.3043365801528797, + "grad_norm": 0.6095401079017516, + "learning_rate": 5.702066964447059e-05, + "loss": 11.9598, + "step": 23953 + }, + { + "epoch": 1.3043910341494627, + "grad_norm": 0.5731255467056443, + "learning_rate": 5.7012707565518365e-05, + "loss": 11.964, + "step": 23954 + }, + { + "epoch": 1.3044454881460457, + "grad_norm": 0.5698033834226252, + "learning_rate": 5.700474582084261e-05, + "loss": 11.9513, + "step": 23955 + }, + { + "epoch": 1.3044999421426287, + "grad_norm": 0.5418096243886533, + "learning_rate": 5.699678441050508e-05, + "loss": 11.9278, + "step": 23956 + }, + { + "epoch": 1.3045543961392116, + "grad_norm": 0.5285863962434048, + "learning_rate": 5.698882333456781e-05, + "loss": 11.8176, + "step": 23957 + }, + { + "epoch": 1.3046088501357946, + "grad_norm": 0.5437655748493991, + "learning_rate": 5.698086259309261e-05, + "loss": 11.8714, + "step": 23958 + }, + { + "epoch": 1.3046633041323776, + "grad_norm": 0.5249223565079, + "learning_rate": 5.697290218614145e-05, + "loss": 11.9625, + "step": 23959 + }, + { + "epoch": 1.3047177581289606, + "grad_norm": 0.5897176494325972, + "learning_rate": 5.696494211377625e-05, + "loss": 11.9968, + "step": 23960 + }, + { + "epoch": 1.3047722121255436, + "grad_norm": 0.5744789564895469, + "learning_rate": 5.695698237605885e-05, + "loss": 11.9869, + "step": 23961 + }, + { + "epoch": 1.3048266661221266, + "grad_norm": 0.5416320966784032, + "learning_rate": 5.69490229730512e-05, + "loss": 11.9474, + "step": 23962 + }, + { + "epoch": 1.3048811201187096, + "grad_norm": 0.5158342676255641, + "learning_rate": 5.694106390481514e-05, + "loss": 11.7993, + "step": 23963 + }, + { + "epoch": 1.3049355741152926, + "grad_norm": 0.5253323981802934, + "learning_rate": 5.693310517141261e-05, + "loss": 12.033, + "step": 23964 + }, + { + "epoch": 1.3049900281118758, + "grad_norm": 0.5401077524287262, + "learning_rate": 5.69251467729055e-05, + "loss": 12.0019, + "step": 23965 + }, + { + "epoch": 1.3050444821084588, + "grad_norm": 0.575819176008194, + "learning_rate": 5.691718870935562e-05, + "loss": 11.9024, + "step": 23966 + }, + { + "epoch": 1.3050989361050418, + "grad_norm": 0.5009735122171262, + "learning_rate": 5.690923098082495e-05, + "loss": 12.007, + "step": 23967 + }, + { + "epoch": 1.3051533901016248, + "grad_norm": 0.5469346786144591, + "learning_rate": 5.690127358737527e-05, + "loss": 12.0547, + "step": 23968 + }, + { + "epoch": 1.3052078440982078, + "grad_norm": 0.6110871644336969, + "learning_rate": 5.689331652906857e-05, + "loss": 11.9572, + "step": 23969 + }, + { + "epoch": 1.3052622980947908, + "grad_norm": 0.4965294515940343, + "learning_rate": 5.688535980596663e-05, + "loss": 11.8601, + "step": 23970 + }, + { + "epoch": 1.3053167520913738, + "grad_norm": 0.5301775928037475, + "learning_rate": 5.687740341813135e-05, + "loss": 11.8822, + "step": 23971 + }, + { + "epoch": 1.3053712060879568, + "grad_norm": 0.9010601067024929, + "learning_rate": 5.6869447365624675e-05, + "loss": 11.9948, + "step": 23972 + }, + { + "epoch": 1.3054256600845398, + "grad_norm": 0.5239675187616208, + "learning_rate": 5.6861491648508346e-05, + "loss": 11.7696, + "step": 23973 + }, + { + "epoch": 1.305480114081123, + "grad_norm": 0.6069009527012706, + "learning_rate": 5.6853536266844346e-05, + "loss": 12.0514, + "step": 23974 + }, + { + "epoch": 1.305534568077706, + "grad_norm": 0.5187344931404261, + "learning_rate": 5.684558122069448e-05, + "loss": 11.8903, + "step": 23975 + }, + { + "epoch": 1.305589022074289, + "grad_norm": 0.545560513923747, + "learning_rate": 5.683762651012057e-05, + "loss": 11.8777, + "step": 23976 + }, + { + "epoch": 1.305643476070872, + "grad_norm": 0.5995544898189384, + "learning_rate": 5.682967213518455e-05, + "loss": 11.9604, + "step": 23977 + }, + { + "epoch": 1.305697930067455, + "grad_norm": 0.5321000437618817, + "learning_rate": 5.682171809594821e-05, + "loss": 11.8635, + "step": 23978 + }, + { + "epoch": 1.305752384064038, + "grad_norm": 0.5049934564198645, + "learning_rate": 5.6813764392473464e-05, + "loss": 12.0295, + "step": 23979 + }, + { + "epoch": 1.305806838060621, + "grad_norm": 0.4867020493912554, + "learning_rate": 5.68058110248221e-05, + "loss": 11.889, + "step": 23980 + }, + { + "epoch": 1.305861292057204, + "grad_norm": 0.5488352103955727, + "learning_rate": 5.679785799305597e-05, + "loss": 11.9764, + "step": 23981 + }, + { + "epoch": 1.305915746053787, + "grad_norm": 0.5984157365757806, + "learning_rate": 5.6789905297237e-05, + "loss": 12.0128, + "step": 23982 + }, + { + "epoch": 1.30597020005037, + "grad_norm": 0.5676370800152911, + "learning_rate": 5.678195293742693e-05, + "loss": 11.9651, + "step": 23983 + }, + { + "epoch": 1.306024654046953, + "grad_norm": 0.5449524338822279, + "learning_rate": 5.677400091368767e-05, + "loss": 11.8785, + "step": 23984 + }, + { + "epoch": 1.3060791080435359, + "grad_norm": 0.5984049980477618, + "learning_rate": 5.6766049226081e-05, + "loss": 11.8982, + "step": 23985 + }, + { + "epoch": 1.3061335620401189, + "grad_norm": 0.5300111201265589, + "learning_rate": 5.675809787466883e-05, + "loss": 11.7741, + "step": 23986 + }, + { + "epoch": 1.3061880160367019, + "grad_norm": 0.4962737380666011, + "learning_rate": 5.675014685951292e-05, + "loss": 11.8057, + "step": 23987 + }, + { + "epoch": 1.306242470033285, + "grad_norm": 0.6006448553922471, + "learning_rate": 5.67421961806751e-05, + "loss": 12.1342, + "step": 23988 + }, + { + "epoch": 1.306296924029868, + "grad_norm": 0.5084978759646408, + "learning_rate": 5.673424583821725e-05, + "loss": 11.9022, + "step": 23989 + }, + { + "epoch": 1.306351378026451, + "grad_norm": 0.5722836251395635, + "learning_rate": 5.6726295832201115e-05, + "loss": 11.922, + "step": 23990 + }, + { + "epoch": 1.306405832023034, + "grad_norm": 0.5830201612688393, + "learning_rate": 5.671834616268861e-05, + "loss": 11.972, + "step": 23991 + }, + { + "epoch": 1.306460286019617, + "grad_norm": 0.5242589673921513, + "learning_rate": 5.6710396829741455e-05, + "loss": 11.9899, + "step": 23992 + }, + { + "epoch": 1.3065147400162, + "grad_norm": 0.5522634693503423, + "learning_rate": 5.670244783342151e-05, + "loss": 11.9227, + "step": 23993 + }, + { + "epoch": 1.306569194012783, + "grad_norm": 0.5975844373618137, + "learning_rate": 5.669449917379064e-05, + "loss": 11.9588, + "step": 23994 + }, + { + "epoch": 1.306623648009366, + "grad_norm": 0.5658906806418614, + "learning_rate": 5.6686550850910566e-05, + "loss": 11.9769, + "step": 23995 + }, + { + "epoch": 1.306678102005949, + "grad_norm": 0.576029591485249, + "learning_rate": 5.6678602864843186e-05, + "loss": 11.9018, + "step": 23996 + }, + { + "epoch": 1.3067325560025322, + "grad_norm": 0.5397198944961555, + "learning_rate": 5.667065521565025e-05, + "loss": 11.9782, + "step": 23997 + }, + { + "epoch": 1.3067870099991152, + "grad_norm": 0.5318343941425525, + "learning_rate": 5.666270790339352e-05, + "loss": 11.8726, + "step": 23998 + }, + { + "epoch": 1.3068414639956982, + "grad_norm": 0.4897478737466349, + "learning_rate": 5.665476092813488e-05, + "loss": 11.8749, + "step": 23999 + }, + { + "epoch": 1.3068959179922812, + "grad_norm": 0.5184081515081808, + "learning_rate": 5.664681428993604e-05, + "loss": 11.8335, + "step": 24000 + }, + { + "epoch": 1.3069503719888642, + "grad_norm": 0.5645208913521276, + "learning_rate": 5.663886798885889e-05, + "loss": 11.8736, + "step": 24001 + }, + { + "epoch": 1.3070048259854472, + "grad_norm": 0.5654316704582217, + "learning_rate": 5.663092202496514e-05, + "loss": 11.983, + "step": 24002 + }, + { + "epoch": 1.3070592799820302, + "grad_norm": 0.5317254218554905, + "learning_rate": 5.662297639831662e-05, + "loss": 11.9351, + "step": 24003 + }, + { + "epoch": 1.3071137339786132, + "grad_norm": 0.5035033916689757, + "learning_rate": 5.661503110897512e-05, + "loss": 11.8796, + "step": 24004 + }, + { + "epoch": 1.3071681879751962, + "grad_norm": 0.5551355566986512, + "learning_rate": 5.660708615700241e-05, + "loss": 11.9708, + "step": 24005 + }, + { + "epoch": 1.3072226419717792, + "grad_norm": 0.5316190494277161, + "learning_rate": 5.6599141542460334e-05, + "loss": 11.9273, + "step": 24006 + }, + { + "epoch": 1.3072770959683622, + "grad_norm": 0.5489959422403061, + "learning_rate": 5.659119726541054e-05, + "loss": 11.9331, + "step": 24007 + }, + { + "epoch": 1.3073315499649452, + "grad_norm": 0.5314437492337467, + "learning_rate": 5.658325332591486e-05, + "loss": 11.9833, + "step": 24008 + }, + { + "epoch": 1.3073860039615282, + "grad_norm": 0.5465914577428189, + "learning_rate": 5.657530972403514e-05, + "loss": 11.7488, + "step": 24009 + }, + { + "epoch": 1.3074404579581111, + "grad_norm": 0.5306735013351034, + "learning_rate": 5.656736645983305e-05, + "loss": 11.9382, + "step": 24010 + }, + { + "epoch": 1.3074949119546941, + "grad_norm": 0.5726449187442675, + "learning_rate": 5.6559423533370435e-05, + "loss": 11.819, + "step": 24011 + }, + { + "epoch": 1.3075493659512774, + "grad_norm": 0.5486158402846787, + "learning_rate": 5.655148094470899e-05, + "loss": 11.8961, + "step": 24012 + }, + { + "epoch": 1.3076038199478603, + "grad_norm": 0.5426016652965927, + "learning_rate": 5.654353869391057e-05, + "loss": 11.9445, + "step": 24013 + }, + { + "epoch": 1.3076582739444433, + "grad_norm": 0.5245954095774398, + "learning_rate": 5.6535596781036826e-05, + "loss": 11.8985, + "step": 24014 + }, + { + "epoch": 1.3077127279410263, + "grad_norm": 0.5068428886140711, + "learning_rate": 5.652765520614958e-05, + "loss": 12.0234, + "step": 24015 + }, + { + "epoch": 1.3077671819376093, + "grad_norm": 0.5968747370915668, + "learning_rate": 5.651971396931064e-05, + "loss": 11.9745, + "step": 24016 + }, + { + "epoch": 1.3078216359341923, + "grad_norm": 0.5904562558238576, + "learning_rate": 5.651177307058163e-05, + "loss": 11.829, + "step": 24017 + }, + { + "epoch": 1.3078760899307753, + "grad_norm": 0.6144528212683565, + "learning_rate": 5.650383251002437e-05, + "loss": 12.0584, + "step": 24018 + }, + { + "epoch": 1.3079305439273583, + "grad_norm": 0.5637354895837392, + "learning_rate": 5.649589228770062e-05, + "loss": 12.0088, + "step": 24019 + }, + { + "epoch": 1.3079849979239415, + "grad_norm": 0.5174112427094429, + "learning_rate": 5.64879524036721e-05, + "loss": 11.859, + "step": 24020 + }, + { + "epoch": 1.3080394519205245, + "grad_norm": 0.5722180502695624, + "learning_rate": 5.6480012858000575e-05, + "loss": 11.8917, + "step": 24021 + }, + { + "epoch": 1.3080939059171075, + "grad_norm": 0.5278915893715153, + "learning_rate": 5.6472073650747736e-05, + "loss": 11.841, + "step": 24022 + }, + { + "epoch": 1.3081483599136905, + "grad_norm": 0.5434381151015685, + "learning_rate": 5.646413478197539e-05, + "loss": 12.0007, + "step": 24023 + }, + { + "epoch": 1.3082028139102735, + "grad_norm": 0.57750000746015, + "learning_rate": 5.64561962517452e-05, + "loss": 11.8534, + "step": 24024 + }, + { + "epoch": 1.3082572679068565, + "grad_norm": 0.5459021214604409, + "learning_rate": 5.6448258060118954e-05, + "loss": 11.9986, + "step": 24025 + }, + { + "epoch": 1.3083117219034395, + "grad_norm": 0.5416026288585469, + "learning_rate": 5.644032020715837e-05, + "loss": 12.0973, + "step": 24026 + }, + { + "epoch": 1.3083661759000225, + "grad_norm": 0.5158199941024781, + "learning_rate": 5.6432382692925124e-05, + "loss": 11.9428, + "step": 24027 + }, + { + "epoch": 1.3084206298966055, + "grad_norm": 0.4821028621826143, + "learning_rate": 5.642444551748101e-05, + "loss": 11.8582, + "step": 24028 + }, + { + "epoch": 1.3084750838931885, + "grad_norm": 0.5817373794363259, + "learning_rate": 5.641650868088768e-05, + "loss": 11.84, + "step": 24029 + }, + { + "epoch": 1.3085295378897714, + "grad_norm": 0.5238270562827404, + "learning_rate": 5.6408572183206874e-05, + "loss": 11.8774, + "step": 24030 + }, + { + "epoch": 1.3085839918863544, + "grad_norm": 0.5662106271145493, + "learning_rate": 5.6400636024500376e-05, + "loss": 11.8612, + "step": 24031 + }, + { + "epoch": 1.3086384458829374, + "grad_norm": 0.4658411699306566, + "learning_rate": 5.63927002048298e-05, + "loss": 11.8203, + "step": 24032 + }, + { + "epoch": 1.3086928998795204, + "grad_norm": 0.5369833044130452, + "learning_rate": 5.6384764724256935e-05, + "loss": 11.7989, + "step": 24033 + }, + { + "epoch": 1.3087473538761034, + "grad_norm": 0.6215086146378541, + "learning_rate": 5.6376829582843416e-05, + "loss": 11.9815, + "step": 24034 + }, + { + "epoch": 1.3088018078726866, + "grad_norm": 0.575460996600585, + "learning_rate": 5.636889478065103e-05, + "loss": 11.8371, + "step": 24035 + }, + { + "epoch": 1.3088562618692696, + "grad_norm": 0.5267694407632315, + "learning_rate": 5.636096031774143e-05, + "loss": 11.8948, + "step": 24036 + }, + { + "epoch": 1.3089107158658526, + "grad_norm": 0.9051984971751704, + "learning_rate": 5.6353026194176284e-05, + "loss": 11.8355, + "step": 24037 + }, + { + "epoch": 1.3089651698624356, + "grad_norm": 0.5451482067920245, + "learning_rate": 5.6345092410017366e-05, + "loss": 11.8367, + "step": 24038 + }, + { + "epoch": 1.3090196238590186, + "grad_norm": 0.556870168449486, + "learning_rate": 5.633715896532629e-05, + "loss": 12.0332, + "step": 24039 + }, + { + "epoch": 1.3090740778556016, + "grad_norm": 0.5546790452992627, + "learning_rate": 5.632922586016482e-05, + "loss": 11.9491, + "step": 24040 + }, + { + "epoch": 1.3091285318521846, + "grad_norm": 0.548344602851646, + "learning_rate": 5.6321293094594586e-05, + "loss": 11.9903, + "step": 24041 + }, + { + "epoch": 1.3091829858487676, + "grad_norm": 0.5352260763084332, + "learning_rate": 5.63133606686773e-05, + "loss": 11.7799, + "step": 24042 + }, + { + "epoch": 1.3092374398453506, + "grad_norm": 0.5511868697841315, + "learning_rate": 5.6305428582474676e-05, + "loss": 12.0075, + "step": 24043 + }, + { + "epoch": 1.3092918938419338, + "grad_norm": 0.5807556727173098, + "learning_rate": 5.6297496836048336e-05, + "loss": 11.9363, + "step": 24044 + }, + { + "epoch": 1.3093463478385168, + "grad_norm": 0.5416019456363944, + "learning_rate": 5.6289565429460024e-05, + "loss": 11.9437, + "step": 24045 + }, + { + "epoch": 1.3094008018350998, + "grad_norm": 0.5568288851944784, + "learning_rate": 5.6281634362771383e-05, + "loss": 11.9437, + "step": 24046 + }, + { + "epoch": 1.3094552558316828, + "grad_norm": 0.5165909000398059, + "learning_rate": 5.627370363604405e-05, + "loss": 11.9533, + "step": 24047 + }, + { + "epoch": 1.3095097098282658, + "grad_norm": 0.4648065800283373, + "learning_rate": 5.626577324933977e-05, + "loss": 11.9777, + "step": 24048 + }, + { + "epoch": 1.3095641638248487, + "grad_norm": 0.5402151427444775, + "learning_rate": 5.625784320272014e-05, + "loss": 11.9627, + "step": 24049 + }, + { + "epoch": 1.3096186178214317, + "grad_norm": 0.5151015954640977, + "learning_rate": 5.6249913496246884e-05, + "loss": 11.8636, + "step": 24050 + }, + { + "epoch": 1.3096730718180147, + "grad_norm": 0.7071352397373822, + "learning_rate": 5.6241984129981594e-05, + "loss": 11.9924, + "step": 24051 + }, + { + "epoch": 1.3097275258145977, + "grad_norm": 0.5532174252587672, + "learning_rate": 5.623405510398598e-05, + "loss": 11.8778, + "step": 24052 + }, + { + "epoch": 1.3097819798111807, + "grad_norm": 0.5289146410485642, + "learning_rate": 5.6226126418321734e-05, + "loss": 11.926, + "step": 24053 + }, + { + "epoch": 1.3098364338077637, + "grad_norm": 0.5334503954840796, + "learning_rate": 5.621819807305043e-05, + "loss": 11.9385, + "step": 24054 + }, + { + "epoch": 1.3098908878043467, + "grad_norm": 0.5675098184871193, + "learning_rate": 5.621027006823381e-05, + "loss": 11.91, + "step": 24055 + }, + { + "epoch": 1.3099453418009297, + "grad_norm": 0.5520081124621085, + "learning_rate": 5.620234240393345e-05, + "loss": 11.8502, + "step": 24056 + }, + { + "epoch": 1.3099997957975127, + "grad_norm": 0.5103938780111638, + "learning_rate": 5.619441508021099e-05, + "loss": 11.9141, + "step": 24057 + }, + { + "epoch": 1.310054249794096, + "grad_norm": 0.6273218532787944, + "learning_rate": 5.618648809712815e-05, + "loss": 12.0362, + "step": 24058 + }, + { + "epoch": 1.310108703790679, + "grad_norm": 0.6317001937888972, + "learning_rate": 5.6178561454746495e-05, + "loss": 11.962, + "step": 24059 + }, + { + "epoch": 1.310163157787262, + "grad_norm": 0.5852638614045123, + "learning_rate": 5.6170635153127703e-05, + "loss": 11.9447, + "step": 24060 + }, + { + "epoch": 1.3102176117838449, + "grad_norm": 0.5165607266868081, + "learning_rate": 5.6162709192333394e-05, + "loss": 11.9877, + "step": 24061 + }, + { + "epoch": 1.3102720657804279, + "grad_norm": 0.541180216936752, + "learning_rate": 5.615478357242523e-05, + "loss": 11.7278, + "step": 24062 + }, + { + "epoch": 1.3103265197770109, + "grad_norm": 0.6027574576790281, + "learning_rate": 5.61468582934648e-05, + "loss": 11.7352, + "step": 24063 + }, + { + "epoch": 1.3103809737735939, + "grad_norm": 0.561972172242614, + "learning_rate": 5.6138933355513744e-05, + "loss": 11.9482, + "step": 24064 + }, + { + "epoch": 1.3104354277701769, + "grad_norm": 0.540328313228767, + "learning_rate": 5.613100875863373e-05, + "loss": 11.8044, + "step": 24065 + }, + { + "epoch": 1.3104898817667598, + "grad_norm": 0.5062809733534139, + "learning_rate": 5.6123084502886355e-05, + "loss": 11.4273, + "step": 24066 + }, + { + "epoch": 1.310544335763343, + "grad_norm": 0.5369482911576979, + "learning_rate": 5.611516058833319e-05, + "loss": 11.9504, + "step": 24067 + }, + { + "epoch": 1.310598789759926, + "grad_norm": 0.5616688717766474, + "learning_rate": 5.610723701503595e-05, + "loss": 11.8698, + "step": 24068 + }, + { + "epoch": 1.310653243756509, + "grad_norm": 0.5599362798095469, + "learning_rate": 5.609931378305615e-05, + "loss": 11.7793, + "step": 24069 + }, + { + "epoch": 1.310707697753092, + "grad_norm": 0.5818904319739762, + "learning_rate": 5.609139089245548e-05, + "loss": 12.0357, + "step": 24070 + }, + { + "epoch": 1.310762151749675, + "grad_norm": 0.5087136123233187, + "learning_rate": 5.608346834329547e-05, + "loss": 11.8868, + "step": 24071 + }, + { + "epoch": 1.310816605746258, + "grad_norm": 0.5162082398924985, + "learning_rate": 5.607554613563783e-05, + "loss": 11.928, + "step": 24072 + }, + { + "epoch": 1.310871059742841, + "grad_norm": 0.5629835155002205, + "learning_rate": 5.606762426954408e-05, + "loss": 11.8736, + "step": 24073 + }, + { + "epoch": 1.310925513739424, + "grad_norm": 0.5890715222779773, + "learning_rate": 5.605970274507588e-05, + "loss": 11.9634, + "step": 24074 + }, + { + "epoch": 1.310979967736007, + "grad_norm": 0.5558526509435125, + "learning_rate": 5.605178156229476e-05, + "loss": 11.8771, + "step": 24075 + }, + { + "epoch": 1.31103442173259, + "grad_norm": 0.5009418577530341, + "learning_rate": 5.604386072126239e-05, + "loss": 11.9972, + "step": 24076 + }, + { + "epoch": 1.311088875729173, + "grad_norm": 0.5493961153888932, + "learning_rate": 5.603594022204033e-05, + "loss": 11.7846, + "step": 24077 + }, + { + "epoch": 1.311143329725756, + "grad_norm": 0.5293464153718507, + "learning_rate": 5.602802006469014e-05, + "loss": 11.8235, + "step": 24078 + }, + { + "epoch": 1.311197783722339, + "grad_norm": 0.47494824965195975, + "learning_rate": 5.602010024927343e-05, + "loss": 11.9343, + "step": 24079 + }, + { + "epoch": 1.311252237718922, + "grad_norm": 0.5400570253824287, + "learning_rate": 5.601218077585184e-05, + "loss": 11.9424, + "step": 24080 + }, + { + "epoch": 1.311306691715505, + "grad_norm": 0.6292446865449202, + "learning_rate": 5.600426164448688e-05, + "loss": 11.9883, + "step": 24081 + }, + { + "epoch": 1.3113611457120882, + "grad_norm": 0.5384956763460513, + "learning_rate": 5.5996342855240194e-05, + "loss": 12.0204, + "step": 24082 + }, + { + "epoch": 1.3114155997086712, + "grad_norm": 0.4949942337977581, + "learning_rate": 5.598842440817328e-05, + "loss": 11.8607, + "step": 24083 + }, + { + "epoch": 1.3114700537052542, + "grad_norm": 0.5495955408059275, + "learning_rate": 5.59805063033478e-05, + "loss": 11.8031, + "step": 24084 + }, + { + "epoch": 1.3115245077018372, + "grad_norm": 0.5227917745529473, + "learning_rate": 5.5972588540825245e-05, + "loss": 11.9676, + "step": 24085 + }, + { + "epoch": 1.3115789616984201, + "grad_norm": 0.5493167528344649, + "learning_rate": 5.596467112066727e-05, + "loss": 12.0129, + "step": 24086 + }, + { + "epoch": 1.3116334156950031, + "grad_norm": 0.5369586038681113, + "learning_rate": 5.595675404293539e-05, + "loss": 11.934, + "step": 24087 + }, + { + "epoch": 1.3116878696915861, + "grad_norm": 0.5692549213598999, + "learning_rate": 5.594883730769114e-05, + "loss": 12.0284, + "step": 24088 + }, + { + "epoch": 1.3117423236881691, + "grad_norm": 0.5246548461312474, + "learning_rate": 5.594092091499617e-05, + "loss": 12.104, + "step": 24089 + }, + { + "epoch": 1.3117967776847523, + "grad_norm": 0.5422912381640886, + "learning_rate": 5.593300486491193e-05, + "loss": 11.8926, + "step": 24090 + }, + { + "epoch": 1.3118512316813353, + "grad_norm": 0.5507037008770228, + "learning_rate": 5.592508915750003e-05, + "loss": 11.8453, + "step": 24091 + }, + { + "epoch": 1.3119056856779183, + "grad_norm": 0.6109536311951776, + "learning_rate": 5.5917173792822086e-05, + "loss": 11.8903, + "step": 24092 + }, + { + "epoch": 1.3119601396745013, + "grad_norm": 0.5667929815034383, + "learning_rate": 5.590925877093954e-05, + "loss": 11.925, + "step": 24093 + }, + { + "epoch": 1.3120145936710843, + "grad_norm": 0.5151352515239541, + "learning_rate": 5.590134409191403e-05, + "loss": 11.7802, + "step": 24094 + }, + { + "epoch": 1.3120690476676673, + "grad_norm": 0.5312198409149739, + "learning_rate": 5.589342975580703e-05, + "loss": 11.8378, + "step": 24095 + }, + { + "epoch": 1.3121235016642503, + "grad_norm": 0.51001384620566, + "learning_rate": 5.588551576268016e-05, + "loss": 11.7902, + "step": 24096 + }, + { + "epoch": 1.3121779556608333, + "grad_norm": 0.5474836460471255, + "learning_rate": 5.587760211259492e-05, + "loss": 12.0006, + "step": 24097 + }, + { + "epoch": 1.3122324096574163, + "grad_norm": 0.5813697004632432, + "learning_rate": 5.5869688805612784e-05, + "loss": 11.9648, + "step": 24098 + }, + { + "epoch": 1.3122868636539993, + "grad_norm": 0.5495130797855269, + "learning_rate": 5.5861775841795417e-05, + "loss": 11.8442, + "step": 24099 + }, + { + "epoch": 1.3123413176505823, + "grad_norm": 0.5563958357476311, + "learning_rate": 5.585386322120423e-05, + "loss": 11.9653, + "step": 24100 + }, + { + "epoch": 1.3123957716471653, + "grad_norm": 0.5601210651888593, + "learning_rate": 5.584595094390079e-05, + "loss": 11.9562, + "step": 24101 + }, + { + "epoch": 1.3124502256437482, + "grad_norm": 0.5085020324736595, + "learning_rate": 5.583803900994671e-05, + "loss": 11.974, + "step": 24102 + }, + { + "epoch": 1.3125046796403312, + "grad_norm": 0.5350322377035194, + "learning_rate": 5.583012741940339e-05, + "loss": 12.0051, + "step": 24103 + }, + { + "epoch": 1.3125591336369142, + "grad_norm": 0.5307092152711169, + "learning_rate": 5.5822216172332444e-05, + "loss": 11.9561, + "step": 24104 + }, + { + "epoch": 1.3126135876334974, + "grad_norm": 0.5560314482779806, + "learning_rate": 5.581430526879532e-05, + "loss": 11.8514, + "step": 24105 + }, + { + "epoch": 1.3126680416300804, + "grad_norm": 0.5246609139397009, + "learning_rate": 5.58063947088536e-05, + "loss": 11.9932, + "step": 24106 + }, + { + "epoch": 1.3127224956266634, + "grad_norm": 0.5446374354321504, + "learning_rate": 5.579848449256877e-05, + "loss": 11.8102, + "step": 24107 + }, + { + "epoch": 1.3127769496232464, + "grad_norm": 0.557710650956272, + "learning_rate": 5.57905746200023e-05, + "loss": 11.9275, + "step": 24108 + }, + { + "epoch": 1.3128314036198294, + "grad_norm": 0.569629619957673, + "learning_rate": 5.5782665091215766e-05, + "loss": 11.9469, + "step": 24109 + }, + { + "epoch": 1.3128858576164124, + "grad_norm": 0.5452258413061813, + "learning_rate": 5.5774755906270613e-05, + "loss": 12.0616, + "step": 24110 + }, + { + "epoch": 1.3129403116129954, + "grad_norm": 0.549681639306198, + "learning_rate": 5.576684706522841e-05, + "loss": 11.814, + "step": 24111 + }, + { + "epoch": 1.3129947656095784, + "grad_norm": 0.8968357936170647, + "learning_rate": 5.5758938568150585e-05, + "loss": 12.0231, + "step": 24112 + }, + { + "epoch": 1.3130492196061614, + "grad_norm": 0.6114355622916864, + "learning_rate": 5.575103041509867e-05, + "loss": 11.9277, + "step": 24113 + }, + { + "epoch": 1.3131036736027446, + "grad_norm": 0.5632441548114183, + "learning_rate": 5.5743122606134214e-05, + "loss": 11.8432, + "step": 24114 + }, + { + "epoch": 1.3131581275993276, + "grad_norm": 0.6229341802493858, + "learning_rate": 5.5735215141318607e-05, + "loss": 12.0, + "step": 24115 + }, + { + "epoch": 1.3132125815959106, + "grad_norm": 0.5325063396721057, + "learning_rate": 5.572730802071343e-05, + "loss": 11.888, + "step": 24116 + }, + { + "epoch": 1.3132670355924936, + "grad_norm": 0.570325299049395, + "learning_rate": 5.571940124438013e-05, + "loss": 11.9914, + "step": 24117 + }, + { + "epoch": 1.3133214895890766, + "grad_norm": 0.5685258266474903, + "learning_rate": 5.571149481238016e-05, + "loss": 11.9793, + "step": 24118 + }, + { + "epoch": 1.3133759435856596, + "grad_norm": 0.5074342320921148, + "learning_rate": 5.570358872477507e-05, + "loss": 11.8027, + "step": 24119 + }, + { + "epoch": 1.3134303975822426, + "grad_norm": 0.5158155993323239, + "learning_rate": 5.5695682981626266e-05, + "loss": 11.8288, + "step": 24120 + }, + { + "epoch": 1.3134848515788256, + "grad_norm": 0.5378069571240769, + "learning_rate": 5.5687777582995284e-05, + "loss": 11.7794, + "step": 24121 + }, + { + "epoch": 1.3135393055754085, + "grad_norm": 0.5895099475242283, + "learning_rate": 5.567987252894355e-05, + "loss": 12.0278, + "step": 24122 + }, + { + "epoch": 1.3135937595719915, + "grad_norm": 0.5474999267889047, + "learning_rate": 5.56719678195326e-05, + "loss": 11.9065, + "step": 24123 + }, + { + "epoch": 1.3136482135685745, + "grad_norm": 0.6210183064545841, + "learning_rate": 5.566406345482382e-05, + "loss": 12.0531, + "step": 24124 + }, + { + "epoch": 1.3137026675651575, + "grad_norm": 0.5584809577507116, + "learning_rate": 5.565615943487872e-05, + "loss": 11.88, + "step": 24125 + }, + { + "epoch": 1.3137571215617405, + "grad_norm": 0.5620921521487785, + "learning_rate": 5.564825575975883e-05, + "loss": 11.8425, + "step": 24126 + }, + { + "epoch": 1.3138115755583235, + "grad_norm": 0.6000775354646204, + "learning_rate": 5.5640352429525466e-05, + "loss": 11.9881, + "step": 24127 + }, + { + "epoch": 1.3138660295549067, + "grad_norm": 0.5725438406253455, + "learning_rate": 5.5632449444240165e-05, + "loss": 11.8342, + "step": 24128 + }, + { + "epoch": 1.3139204835514897, + "grad_norm": 0.5747252928413819, + "learning_rate": 5.5624546803964414e-05, + "loss": 11.9359, + "step": 24129 + }, + { + "epoch": 1.3139749375480727, + "grad_norm": 0.5580723502020485, + "learning_rate": 5.5616644508759585e-05, + "loss": 11.8348, + "step": 24130 + }, + { + "epoch": 1.3140293915446557, + "grad_norm": 0.5206431814318879, + "learning_rate": 5.560874255868722e-05, + "loss": 11.994, + "step": 24131 + }, + { + "epoch": 1.3140838455412387, + "grad_norm": 0.5139602020182866, + "learning_rate": 5.5600840953808675e-05, + "loss": 11.8856, + "step": 24132 + }, + { + "epoch": 1.3141382995378217, + "grad_norm": 0.5273997701362135, + "learning_rate": 5.5592939694185473e-05, + "loss": 11.9026, + "step": 24133 + }, + { + "epoch": 1.3141927535344047, + "grad_norm": 0.5255371617969886, + "learning_rate": 5.5585038779878984e-05, + "loss": 11.869, + "step": 24134 + }, + { + "epoch": 1.3142472075309877, + "grad_norm": 0.5605092161408242, + "learning_rate": 5.557713821095067e-05, + "loss": 11.9114, + "step": 24135 + }, + { + "epoch": 1.3143016615275707, + "grad_norm": 0.5323861289247597, + "learning_rate": 5.556923798746204e-05, + "loss": 11.8343, + "step": 24136 + }, + { + "epoch": 1.3143561155241539, + "grad_norm": 0.5361199374183426, + "learning_rate": 5.556133810947446e-05, + "loss": 11.9019, + "step": 24137 + }, + { + "epoch": 1.3144105695207369, + "grad_norm": 0.5515330296315245, + "learning_rate": 5.5553438577049354e-05, + "loss": 11.9666, + "step": 24138 + }, + { + "epoch": 1.3144650235173199, + "grad_norm": 0.532408575148832, + "learning_rate": 5.5545539390248135e-05, + "loss": 12.028, + "step": 24139 + }, + { + "epoch": 1.3145194775139029, + "grad_norm": 0.5354823885175315, + "learning_rate": 5.553764054913225e-05, + "loss": 11.9947, + "step": 24140 + }, + { + "epoch": 1.3145739315104858, + "grad_norm": 0.5659012633566877, + "learning_rate": 5.552974205376318e-05, + "loss": 11.8438, + "step": 24141 + }, + { + "epoch": 1.3146283855070688, + "grad_norm": 0.5160737526667496, + "learning_rate": 5.552184390420223e-05, + "loss": 11.8927, + "step": 24142 + }, + { + "epoch": 1.3146828395036518, + "grad_norm": 0.523950389511685, + "learning_rate": 5.551394610051095e-05, + "loss": 11.8478, + "step": 24143 + }, + { + "epoch": 1.3147372935002348, + "grad_norm": 0.5525784865107607, + "learning_rate": 5.550604864275062e-05, + "loss": 11.9292, + "step": 24144 + }, + { + "epoch": 1.3147917474968178, + "grad_norm": 0.5705633641822888, + "learning_rate": 5.5498151530982765e-05, + "loss": 11.9413, + "step": 24145 + }, + { + "epoch": 1.3148462014934008, + "grad_norm": 0.5416032511796919, + "learning_rate": 5.549025476526872e-05, + "loss": 11.989, + "step": 24146 + }, + { + "epoch": 1.3149006554899838, + "grad_norm": 0.5547615693651402, + "learning_rate": 5.548235834566995e-05, + "loss": 11.8103, + "step": 24147 + }, + { + "epoch": 1.3149551094865668, + "grad_norm": 0.529193120232034, + "learning_rate": 5.5474462272247816e-05, + "loss": 11.8657, + "step": 24148 + }, + { + "epoch": 1.3150095634831498, + "grad_norm": 0.5883229261148182, + "learning_rate": 5.5466566545063705e-05, + "loss": 11.8602, + "step": 24149 + }, + { + "epoch": 1.3150640174797328, + "grad_norm": 0.5435475930209137, + "learning_rate": 5.545867116417903e-05, + "loss": 11.9026, + "step": 24150 + }, + { + "epoch": 1.315118471476316, + "grad_norm": 0.5343514848972449, + "learning_rate": 5.545077612965525e-05, + "loss": 11.9724, + "step": 24151 + }, + { + "epoch": 1.315172925472899, + "grad_norm": 0.5512682603081, + "learning_rate": 5.544288144155365e-05, + "loss": 11.9269, + "step": 24152 + }, + { + "epoch": 1.315227379469482, + "grad_norm": 0.5353583863282051, + "learning_rate": 5.543498709993573e-05, + "loss": 11.957, + "step": 24153 + }, + { + "epoch": 1.315281833466065, + "grad_norm": 0.5511459412465701, + "learning_rate": 5.5427093104862785e-05, + "loss": 11.9519, + "step": 24154 + }, + { + "epoch": 1.315336287462648, + "grad_norm": 0.5174468989461876, + "learning_rate": 5.541919945639626e-05, + "loss": 11.8569, + "step": 24155 + }, + { + "epoch": 1.315390741459231, + "grad_norm": 0.4994135320521054, + "learning_rate": 5.54113061545975e-05, + "loss": 11.927, + "step": 24156 + }, + { + "epoch": 1.315445195455814, + "grad_norm": 0.509100283757051, + "learning_rate": 5.540341319952793e-05, + "loss": 11.8701, + "step": 24157 + }, + { + "epoch": 1.315499649452397, + "grad_norm": 0.5681110353943868, + "learning_rate": 5.5395520591248896e-05, + "loss": 11.9094, + "step": 24158 + }, + { + "epoch": 1.31555410344898, + "grad_norm": 0.5072899762004406, + "learning_rate": 5.5387628329821736e-05, + "loss": 11.9164, + "step": 24159 + }, + { + "epoch": 1.3156085574455632, + "grad_norm": 0.5677614542523862, + "learning_rate": 5.537973641530789e-05, + "loss": 12.0422, + "step": 24160 + }, + { + "epoch": 1.3156630114421461, + "grad_norm": 0.5821230787364737, + "learning_rate": 5.537184484776867e-05, + "loss": 11.8967, + "step": 24161 + }, + { + "epoch": 1.3157174654387291, + "grad_norm": 0.5055793912300793, + "learning_rate": 5.536395362726548e-05, + "loss": 11.9621, + "step": 24162 + }, + { + "epoch": 1.3157719194353121, + "grad_norm": 0.5134850537248695, + "learning_rate": 5.535606275385968e-05, + "loss": 11.7587, + "step": 24163 + }, + { + "epoch": 1.3158263734318951, + "grad_norm": 0.5419693543723557, + "learning_rate": 5.53481722276126e-05, + "loss": 11.9077, + "step": 24164 + }, + { + "epoch": 1.3158808274284781, + "grad_norm": 0.5173375709901956, + "learning_rate": 5.5340282048585654e-05, + "loss": 12.0389, + "step": 24165 + }, + { + "epoch": 1.315935281425061, + "grad_norm": 0.5504550053957293, + "learning_rate": 5.533239221684015e-05, + "loss": 11.9779, + "step": 24166 + }, + { + "epoch": 1.315989735421644, + "grad_norm": 0.5451182986736287, + "learning_rate": 5.5324502732437454e-05, + "loss": 11.9775, + "step": 24167 + }, + { + "epoch": 1.316044189418227, + "grad_norm": 0.503295735563272, + "learning_rate": 5.531661359543895e-05, + "loss": 11.9263, + "step": 24168 + }, + { + "epoch": 1.31609864341481, + "grad_norm": 0.5789356449436505, + "learning_rate": 5.53087248059059e-05, + "loss": 11.9394, + "step": 24169 + }, + { + "epoch": 1.316153097411393, + "grad_norm": 0.5325233102245421, + "learning_rate": 5.5300836363899736e-05, + "loss": 12.0002, + "step": 24170 + }, + { + "epoch": 1.316207551407976, + "grad_norm": 0.5495782431769719, + "learning_rate": 5.529294826948173e-05, + "loss": 11.6923, + "step": 24171 + }, + { + "epoch": 1.316262005404559, + "grad_norm": 0.512139927840057, + "learning_rate": 5.528506052271325e-05, + "loss": 11.8159, + "step": 24172 + }, + { + "epoch": 1.316316459401142, + "grad_norm": 0.48019392667061545, + "learning_rate": 5.527717312365569e-05, + "loss": 11.8032, + "step": 24173 + }, + { + "epoch": 1.316370913397725, + "grad_norm": 0.539613227120157, + "learning_rate": 5.526928607237028e-05, + "loss": 11.8105, + "step": 24174 + }, + { + "epoch": 1.3164253673943083, + "grad_norm": 0.5114423431699913, + "learning_rate": 5.5261399368918454e-05, + "loss": 11.7604, + "step": 24175 + }, + { + "epoch": 1.3164798213908913, + "grad_norm": 0.5524076486521812, + "learning_rate": 5.5253513013361434e-05, + "loss": 11.8275, + "step": 24176 + }, + { + "epoch": 1.3165342753874743, + "grad_norm": 0.5331113835805055, + "learning_rate": 5.5245627005760656e-05, + "loss": 11.7874, + "step": 24177 + }, + { + "epoch": 1.3165887293840572, + "grad_norm": 0.5454035226612589, + "learning_rate": 5.5237741346177385e-05, + "loss": 11.9775, + "step": 24178 + }, + { + "epoch": 1.3166431833806402, + "grad_norm": 0.5283015966927866, + "learning_rate": 5.52298560346729e-05, + "loss": 11.8976, + "step": 24179 + }, + { + "epoch": 1.3166976373772232, + "grad_norm": 0.5871850154321271, + "learning_rate": 5.5221971071308595e-05, + "loss": 11.9353, + "step": 24180 + }, + { + "epoch": 1.3167520913738062, + "grad_norm": 0.6136044105531907, + "learning_rate": 5.521408645614572e-05, + "loss": 11.8291, + "step": 24181 + }, + { + "epoch": 1.3168065453703892, + "grad_norm": 0.5487668700359158, + "learning_rate": 5.520620218924566e-05, + "loss": 12.0251, + "step": 24182 + }, + { + "epoch": 1.3168609993669722, + "grad_norm": 0.5327122282694071, + "learning_rate": 5.519831827066965e-05, + "loss": 11.9015, + "step": 24183 + }, + { + "epoch": 1.3169154533635554, + "grad_norm": 0.5624584878840289, + "learning_rate": 5.519043470047902e-05, + "loss": 11.9414, + "step": 24184 + }, + { + "epoch": 1.3169699073601384, + "grad_norm": 0.5313704514202723, + "learning_rate": 5.518255147873512e-05, + "loss": 11.8792, + "step": 24185 + }, + { + "epoch": 1.3170243613567214, + "grad_norm": 0.6087527962046092, + "learning_rate": 5.517466860549919e-05, + "loss": 11.7326, + "step": 24186 + }, + { + "epoch": 1.3170788153533044, + "grad_norm": 0.5545308368584826, + "learning_rate": 5.5166786080832635e-05, + "loss": 11.9461, + "step": 24187 + }, + { + "epoch": 1.3171332693498874, + "grad_norm": 0.5577385464793357, + "learning_rate": 5.5158903904796576e-05, + "loss": 11.8884, + "step": 24188 + }, + { + "epoch": 1.3171877233464704, + "grad_norm": 0.5656530028652127, + "learning_rate": 5.515102207745241e-05, + "loss": 11.9084, + "step": 24189 + }, + { + "epoch": 1.3172421773430534, + "grad_norm": 0.5910234974929589, + "learning_rate": 5.5143140598861455e-05, + "loss": 11.8402, + "step": 24190 + }, + { + "epoch": 1.3172966313396364, + "grad_norm": 0.5865373586103833, + "learning_rate": 5.513525946908491e-05, + "loss": 11.9717, + "step": 24191 + }, + { + "epoch": 1.3173510853362194, + "grad_norm": 0.5158602067307513, + "learning_rate": 5.512737868818416e-05, + "loss": 12.0789, + "step": 24192 + }, + { + "epoch": 1.3174055393328024, + "grad_norm": 0.5260099951371222, + "learning_rate": 5.51194982562204e-05, + "loss": 11.8863, + "step": 24193 + }, + { + "epoch": 1.3174599933293853, + "grad_norm": 0.5960360865287289, + "learning_rate": 5.5111618173254984e-05, + "loss": 11.8579, + "step": 24194 + }, + { + "epoch": 1.3175144473259683, + "grad_norm": 0.5974298738793148, + "learning_rate": 5.510373843934913e-05, + "loss": 11.9543, + "step": 24195 + }, + { + "epoch": 1.3175689013225513, + "grad_norm": 0.5251629446111129, + "learning_rate": 5.509585905456411e-05, + "loss": 11.8436, + "step": 24196 + }, + { + "epoch": 1.3176233553191343, + "grad_norm": 0.5550405248967265, + "learning_rate": 5.508798001896132e-05, + "loss": 11.9545, + "step": 24197 + }, + { + "epoch": 1.3176778093157175, + "grad_norm": 0.48351700828475436, + "learning_rate": 5.508010133260184e-05, + "loss": 11.8354, + "step": 24198 + }, + { + "epoch": 1.3177322633123005, + "grad_norm": 0.5464638364304195, + "learning_rate": 5.5072222995547026e-05, + "loss": 11.8961, + "step": 24199 + }, + { + "epoch": 1.3177867173088835, + "grad_norm": 0.5637109969518448, + "learning_rate": 5.506434500785817e-05, + "loss": 12.0174, + "step": 24200 + }, + { + "epoch": 1.3178411713054665, + "grad_norm": 0.5533901495275418, + "learning_rate": 5.5056467369596484e-05, + "loss": 11.8048, + "step": 24201 + }, + { + "epoch": 1.3178956253020495, + "grad_norm": 0.5677288863780411, + "learning_rate": 5.504859008082328e-05, + "loss": 11.8226, + "step": 24202 + }, + { + "epoch": 1.3179500792986325, + "grad_norm": 0.6358236306367222, + "learning_rate": 5.504071314159973e-05, + "loss": 11.9235, + "step": 24203 + }, + { + "epoch": 1.3180045332952155, + "grad_norm": 0.5985184796954829, + "learning_rate": 5.503283655198718e-05, + "loss": 11.8566, + "step": 24204 + }, + { + "epoch": 1.3180589872917985, + "grad_norm": 0.5934497309754219, + "learning_rate": 5.502496031204678e-05, + "loss": 11.9266, + "step": 24205 + }, + { + "epoch": 1.3181134412883815, + "grad_norm": 0.5245202950791581, + "learning_rate": 5.501708442183985e-05, + "loss": 11.8096, + "step": 24206 + }, + { + "epoch": 1.3181678952849647, + "grad_norm": 0.5235175834459708, + "learning_rate": 5.5009208881427686e-05, + "loss": 11.9094, + "step": 24207 + }, + { + "epoch": 1.3182223492815477, + "grad_norm": 0.5299367809455927, + "learning_rate": 5.500133369087137e-05, + "loss": 11.9188, + "step": 24208 + }, + { + "epoch": 1.3182768032781307, + "grad_norm": 0.5295516934968243, + "learning_rate": 5.4993458850232285e-05, + "loss": 11.8589, + "step": 24209 + }, + { + "epoch": 1.3183312572747137, + "grad_norm": 0.5885540068868155, + "learning_rate": 5.4985584359571565e-05, + "loss": 12.0495, + "step": 24210 + }, + { + "epoch": 1.3183857112712967, + "grad_norm": 0.6380428176961217, + "learning_rate": 5.497771021895048e-05, + "loss": 11.8852, + "step": 24211 + }, + { + "epoch": 1.3184401652678797, + "grad_norm": 0.5672198964296776, + "learning_rate": 5.496983642843033e-05, + "loss": 12.014, + "step": 24212 + }, + { + "epoch": 1.3184946192644627, + "grad_norm": 0.8291922617570414, + "learning_rate": 5.4961962988072237e-05, + "loss": 11.9553, + "step": 24213 + }, + { + "epoch": 1.3185490732610456, + "grad_norm": 0.5886373816221026, + "learning_rate": 5.495408989793751e-05, + "loss": 11.9391, + "step": 24214 + }, + { + "epoch": 1.3186035272576286, + "grad_norm": 0.5320349527412451, + "learning_rate": 5.49462171580873e-05, + "loss": 11.9512, + "step": 24215 + }, + { + "epoch": 1.3186579812542116, + "grad_norm": 0.4912981698863273, + "learning_rate": 5.4938344768582886e-05, + "loss": 11.8482, + "step": 24216 + }, + { + "epoch": 1.3187124352507946, + "grad_norm": 0.5738559075083566, + "learning_rate": 5.493047272948547e-05, + "loss": 11.9387, + "step": 24217 + }, + { + "epoch": 1.3187668892473776, + "grad_norm": 0.6180322146989785, + "learning_rate": 5.492260104085621e-05, + "loss": 11.9237, + "step": 24218 + }, + { + "epoch": 1.3188213432439606, + "grad_norm": 0.5092424770447668, + "learning_rate": 5.491472970275642e-05, + "loss": 11.8272, + "step": 24219 + }, + { + "epoch": 1.3188757972405436, + "grad_norm": 0.6367962032492092, + "learning_rate": 5.490685871524719e-05, + "loss": 12.0051, + "step": 24220 + }, + { + "epoch": 1.3189302512371268, + "grad_norm": 0.537058284768866, + "learning_rate": 5.48989880783898e-05, + "loss": 11.8841, + "step": 24221 + }, + { + "epoch": 1.3189847052337098, + "grad_norm": 0.5582860930165967, + "learning_rate": 5.4891117792245494e-05, + "loss": 11.9276, + "step": 24222 + }, + { + "epoch": 1.3190391592302928, + "grad_norm": 0.6171494646409815, + "learning_rate": 5.488324785687537e-05, + "loss": 11.9321, + "step": 24223 + }, + { + "epoch": 1.3190936132268758, + "grad_norm": 0.5364331402214078, + "learning_rate": 5.48753782723407e-05, + "loss": 11.8419, + "step": 24224 + }, + { + "epoch": 1.3191480672234588, + "grad_norm": 0.5715325938987341, + "learning_rate": 5.4867509038702644e-05, + "loss": 12.0285, + "step": 24225 + }, + { + "epoch": 1.3192025212200418, + "grad_norm": 0.5696662353364319, + "learning_rate": 5.485964015602243e-05, + "loss": 11.8765, + "step": 24226 + }, + { + "epoch": 1.3192569752166248, + "grad_norm": 0.553330927101105, + "learning_rate": 5.4851771624361236e-05, + "loss": 11.9422, + "step": 24227 + }, + { + "epoch": 1.3193114292132078, + "grad_norm": 0.6435706481748136, + "learning_rate": 5.4843903443780185e-05, + "loss": 12.1121, + "step": 24228 + }, + { + "epoch": 1.3193658832097908, + "grad_norm": 0.5227215737264612, + "learning_rate": 5.4836035614340566e-05, + "loss": 11.9436, + "step": 24229 + }, + { + "epoch": 1.319420337206374, + "grad_norm": 0.7070280218748599, + "learning_rate": 5.482816813610345e-05, + "loss": 11.7973, + "step": 24230 + }, + { + "epoch": 1.319474791202957, + "grad_norm": 0.5108586704723379, + "learning_rate": 5.482030100913014e-05, + "loss": 11.7991, + "step": 24231 + }, + { + "epoch": 1.31952924519954, + "grad_norm": 0.6067175139272265, + "learning_rate": 5.481243423348168e-05, + "loss": 11.8569, + "step": 24232 + }, + { + "epoch": 1.319583699196123, + "grad_norm": 0.5096432546957752, + "learning_rate": 5.480456780921932e-05, + "loss": 11.9371, + "step": 24233 + }, + { + "epoch": 1.319638153192706, + "grad_norm": 0.568550815802961, + "learning_rate": 5.479670173640426e-05, + "loss": 11.9072, + "step": 24234 + }, + { + "epoch": 1.319692607189289, + "grad_norm": 0.5512807238669505, + "learning_rate": 5.478883601509759e-05, + "loss": 11.9034, + "step": 24235 + }, + { + "epoch": 1.319747061185872, + "grad_norm": 0.5151034126730991, + "learning_rate": 5.478097064536054e-05, + "loss": 11.9168, + "step": 24236 + }, + { + "epoch": 1.319801515182455, + "grad_norm": 0.5536047071293094, + "learning_rate": 5.477310562725426e-05, + "loss": 11.9195, + "step": 24237 + }, + { + "epoch": 1.319855969179038, + "grad_norm": 0.5049946864835259, + "learning_rate": 5.476524096083986e-05, + "loss": 11.7928, + "step": 24238 + }, + { + "epoch": 1.319910423175621, + "grad_norm": 0.5610898251218176, + "learning_rate": 5.475737664617856e-05, + "loss": 11.7958, + "step": 24239 + }, + { + "epoch": 1.319964877172204, + "grad_norm": 0.5647761995622971, + "learning_rate": 5.474951268333145e-05, + "loss": 11.8977, + "step": 24240 + }, + { + "epoch": 1.320019331168787, + "grad_norm": 0.5679213557007524, + "learning_rate": 5.474164907235976e-05, + "loss": 11.83, + "step": 24241 + }, + { + "epoch": 1.3200737851653699, + "grad_norm": 0.5218434301393386, + "learning_rate": 5.473378581332456e-05, + "loss": 11.928, + "step": 24242 + }, + { + "epoch": 1.3201282391619529, + "grad_norm": 0.5110787526488226, + "learning_rate": 5.472592290628707e-05, + "loss": 11.8448, + "step": 24243 + }, + { + "epoch": 1.3201826931585359, + "grad_norm": 0.5731643963526519, + "learning_rate": 5.471806035130834e-05, + "loss": 11.8394, + "step": 24244 + }, + { + "epoch": 1.320237147155119, + "grad_norm": 0.5569903728328777, + "learning_rate": 5.4710198148449576e-05, + "loss": 12.0095, + "step": 24245 + }, + { + "epoch": 1.320291601151702, + "grad_norm": 0.5234051767866967, + "learning_rate": 5.470233629777195e-05, + "loss": 11.9206, + "step": 24246 + }, + { + "epoch": 1.320346055148285, + "grad_norm": 0.5198547765554428, + "learning_rate": 5.469447479933656e-05, + "loss": 11.9631, + "step": 24247 + }, + { + "epoch": 1.320400509144868, + "grad_norm": 0.5586794934699644, + "learning_rate": 5.468661365320449e-05, + "loss": 12.0107, + "step": 24248 + }, + { + "epoch": 1.320454963141451, + "grad_norm": 0.5489873444126732, + "learning_rate": 5.467875285943693e-05, + "loss": 11.9004, + "step": 24249 + }, + { + "epoch": 1.320509417138034, + "grad_norm": 0.5008034757135255, + "learning_rate": 5.467089241809498e-05, + "loss": 11.891, + "step": 24250 + }, + { + "epoch": 1.320563871134617, + "grad_norm": 0.532421244376691, + "learning_rate": 5.466303232923979e-05, + "loss": 11.8025, + "step": 24251 + }, + { + "epoch": 1.3206183251312, + "grad_norm": 0.5069687974078494, + "learning_rate": 5.465517259293242e-05, + "loss": 11.7175, + "step": 24252 + }, + { + "epoch": 1.3206727791277832, + "grad_norm": 0.5436007210662951, + "learning_rate": 5.464731320923409e-05, + "loss": 11.9452, + "step": 24253 + }, + { + "epoch": 1.3207272331243662, + "grad_norm": 0.537046876651317, + "learning_rate": 5.463945417820582e-05, + "loss": 11.7874, + "step": 24254 + }, + { + "epoch": 1.3207816871209492, + "grad_norm": 0.5993244268052655, + "learning_rate": 5.463159549990875e-05, + "loss": 11.9102, + "step": 24255 + }, + { + "epoch": 1.3208361411175322, + "grad_norm": 0.617724157337032, + "learning_rate": 5.4623737174404034e-05, + "loss": 11.8869, + "step": 24256 + }, + { + "epoch": 1.3208905951141152, + "grad_norm": 0.549806880415778, + "learning_rate": 5.461587920175276e-05, + "loss": 11.9112, + "step": 24257 + }, + { + "epoch": 1.3209450491106982, + "grad_norm": 0.5788189893060056, + "learning_rate": 5.460802158201601e-05, + "loss": 11.9908, + "step": 24258 + }, + { + "epoch": 1.3209995031072812, + "grad_norm": 0.57018458798926, + "learning_rate": 5.4600164315254876e-05, + "loss": 11.8401, + "step": 24259 + }, + { + "epoch": 1.3210539571038642, + "grad_norm": 0.5149251024593897, + "learning_rate": 5.459230740153046e-05, + "loss": 11.9464, + "step": 24260 + }, + { + "epoch": 1.3211084111004472, + "grad_norm": 0.6838468707258352, + "learning_rate": 5.458445084090392e-05, + "loss": 11.927, + "step": 24261 + }, + { + "epoch": 1.3211628650970302, + "grad_norm": 0.5518764799362267, + "learning_rate": 5.4576594633436275e-05, + "loss": 11.803, + "step": 24262 + }, + { + "epoch": 1.3212173190936132, + "grad_norm": 0.5660078351865124, + "learning_rate": 5.456873877918868e-05, + "loss": 11.8962, + "step": 24263 + }, + { + "epoch": 1.3212717730901962, + "grad_norm": 0.5327815934793932, + "learning_rate": 5.4560883278222156e-05, + "loss": 11.8649, + "step": 24264 + }, + { + "epoch": 1.3213262270867792, + "grad_norm": 0.575616894847649, + "learning_rate": 5.455302813059786e-05, + "loss": 12.0352, + "step": 24265 + }, + { + "epoch": 1.3213806810833622, + "grad_norm": 0.5414418027524431, + "learning_rate": 5.454517333637679e-05, + "loss": 11.9338, + "step": 24266 + }, + { + "epoch": 1.3214351350799451, + "grad_norm": 0.5228913502066164, + "learning_rate": 5.4537318895620126e-05, + "loss": 11.9759, + "step": 24267 + }, + { + "epoch": 1.3214895890765284, + "grad_norm": 0.5037217239524571, + "learning_rate": 5.4529464808388894e-05, + "loss": 11.9221, + "step": 24268 + }, + { + "epoch": 1.3215440430731114, + "grad_norm": 0.6567086658448069, + "learning_rate": 5.4521611074744125e-05, + "loss": 11.8346, + "step": 24269 + }, + { + "epoch": 1.3215984970696943, + "grad_norm": 0.5419176707417476, + "learning_rate": 5.4513757694746935e-05, + "loss": 11.9521, + "step": 24270 + }, + { + "epoch": 1.3216529510662773, + "grad_norm": 0.47978489373554256, + "learning_rate": 5.450590466845844e-05, + "loss": 11.9456, + "step": 24271 + }, + { + "epoch": 1.3217074050628603, + "grad_norm": 0.5356943236231767, + "learning_rate": 5.449805199593962e-05, + "loss": 11.9803, + "step": 24272 + }, + { + "epoch": 1.3217618590594433, + "grad_norm": 0.5273473014510761, + "learning_rate": 5.449019967725161e-05, + "loss": 11.8619, + "step": 24273 + }, + { + "epoch": 1.3218163130560263, + "grad_norm": 0.5501787690953788, + "learning_rate": 5.44823477124554e-05, + "loss": 12.0075, + "step": 24274 + }, + { + "epoch": 1.3218707670526093, + "grad_norm": 0.5408211223098347, + "learning_rate": 5.447449610161213e-05, + "loss": 11.9897, + "step": 24275 + }, + { + "epoch": 1.3219252210491923, + "grad_norm": 0.5398280967216086, + "learning_rate": 5.4466644844782765e-05, + "loss": 11.9916, + "step": 24276 + }, + { + "epoch": 1.3219796750457755, + "grad_norm": 0.5126606641286382, + "learning_rate": 5.445879394202846e-05, + "loss": 11.9482, + "step": 24277 + }, + { + "epoch": 1.3220341290423585, + "grad_norm": 0.5223280025625218, + "learning_rate": 5.44509433934102e-05, + "loss": 11.8561, + "step": 24278 + }, + { + "epoch": 1.3220885830389415, + "grad_norm": 0.5009270176350861, + "learning_rate": 5.444309319898901e-05, + "loss": 11.7228, + "step": 24279 + }, + { + "epoch": 1.3221430370355245, + "grad_norm": 0.5563132453475934, + "learning_rate": 5.443524335882601e-05, + "loss": 11.9657, + "step": 24280 + }, + { + "epoch": 1.3221974910321075, + "grad_norm": 0.5256210233397686, + "learning_rate": 5.442739387298215e-05, + "loss": 11.9307, + "step": 24281 + }, + { + "epoch": 1.3222519450286905, + "grad_norm": 0.5373238434806185, + "learning_rate": 5.441954474151851e-05, + "loss": 11.9033, + "step": 24282 + }, + { + "epoch": 1.3223063990252735, + "grad_norm": 0.5461130735811333, + "learning_rate": 5.44116959644962e-05, + "loss": 11.9797, + "step": 24283 + }, + { + "epoch": 1.3223608530218565, + "grad_norm": 0.5347686060063062, + "learning_rate": 5.440384754197613e-05, + "loss": 11.8984, + "step": 24284 + }, + { + "epoch": 1.3224153070184395, + "grad_norm": 0.5468334467073818, + "learning_rate": 5.4395999474019434e-05, + "loss": 11.8378, + "step": 24285 + }, + { + "epoch": 1.3224697610150224, + "grad_norm": 0.5826262109080343, + "learning_rate": 5.4388151760687033e-05, + "loss": 12.0424, + "step": 24286 + }, + { + "epoch": 1.3225242150116054, + "grad_norm": 0.5224605350270002, + "learning_rate": 5.438030440204007e-05, + "loss": 11.9409, + "step": 24287 + }, + { + "epoch": 1.3225786690081884, + "grad_norm": 0.4771661678720687, + "learning_rate": 5.437245739813952e-05, + "loss": 11.8821, + "step": 24288 + }, + { + "epoch": 1.3226331230047714, + "grad_norm": 0.5457161085290173, + "learning_rate": 5.436461074904634e-05, + "loss": 11.8993, + "step": 24289 + }, + { + "epoch": 1.3226875770013544, + "grad_norm": 0.5260466621327258, + "learning_rate": 5.435676445482165e-05, + "loss": 12.0321, + "step": 24290 + }, + { + "epoch": 1.3227420309979376, + "grad_norm": 0.5487822149381587, + "learning_rate": 5.4348918515526356e-05, + "loss": 11.9641, + "step": 24291 + }, + { + "epoch": 1.3227964849945206, + "grad_norm": 0.5397681819089222, + "learning_rate": 5.434107293122158e-05, + "loss": 11.8594, + "step": 24292 + }, + { + "epoch": 1.3228509389911036, + "grad_norm": 0.5507672832446914, + "learning_rate": 5.4333227701968246e-05, + "loss": 11.9825, + "step": 24293 + }, + { + "epoch": 1.3229053929876866, + "grad_norm": 0.7380789884234731, + "learning_rate": 5.4325382827827374e-05, + "loss": 12.0395, + "step": 24294 + }, + { + "epoch": 1.3229598469842696, + "grad_norm": 0.5656365995695363, + "learning_rate": 5.431753830886004e-05, + "loss": 11.9359, + "step": 24295 + }, + { + "epoch": 1.3230143009808526, + "grad_norm": 0.5887100710632452, + "learning_rate": 5.4309694145127145e-05, + "loss": 11.9868, + "step": 24296 + }, + { + "epoch": 1.3230687549774356, + "grad_norm": 0.5049209463935754, + "learning_rate": 5.4301850336689755e-05, + "loss": 11.8749, + "step": 24297 + }, + { + "epoch": 1.3231232089740186, + "grad_norm": 0.544689104187778, + "learning_rate": 5.429400688360886e-05, + "loss": 11.9015, + "step": 24298 + }, + { + "epoch": 1.3231776629706016, + "grad_norm": 0.5438795568555299, + "learning_rate": 5.428616378594538e-05, + "loss": 11.82, + "step": 24299 + }, + { + "epoch": 1.3232321169671848, + "grad_norm": 0.5328578054970738, + "learning_rate": 5.427832104376041e-05, + "loss": 11.7133, + "step": 24300 + }, + { + "epoch": 1.3232865709637678, + "grad_norm": 0.5500810269527809, + "learning_rate": 5.427047865711483e-05, + "loss": 11.9908, + "step": 24301 + }, + { + "epoch": 1.3233410249603508, + "grad_norm": 0.6258240133814889, + "learning_rate": 5.4262636626069715e-05, + "loss": 11.9089, + "step": 24302 + }, + { + "epoch": 1.3233954789569338, + "grad_norm": 0.5670219671761658, + "learning_rate": 5.425479495068597e-05, + "loss": 11.8998, + "step": 24303 + }, + { + "epoch": 1.3234499329535168, + "grad_norm": 0.5993249992658796, + "learning_rate": 5.4246953631024633e-05, + "loss": 12.1534, + "step": 24304 + }, + { + "epoch": 1.3235043869500998, + "grad_norm": 0.5772207235736112, + "learning_rate": 5.4239112667146676e-05, + "loss": 11.945, + "step": 24305 + }, + { + "epoch": 1.3235588409466827, + "grad_norm": 0.5207581196920541, + "learning_rate": 5.4231272059113014e-05, + "loss": 11.925, + "step": 24306 + }, + { + "epoch": 1.3236132949432657, + "grad_norm": 0.6237959400309706, + "learning_rate": 5.4223431806984706e-05, + "loss": 11.9534, + "step": 24307 + }, + { + "epoch": 1.3236677489398487, + "grad_norm": 0.6336839598475581, + "learning_rate": 5.421559191082266e-05, + "loss": 12.0141, + "step": 24308 + }, + { + "epoch": 1.3237222029364317, + "grad_norm": 0.5943396579206478, + "learning_rate": 5.42077523706878e-05, + "loss": 12.0129, + "step": 24309 + }, + { + "epoch": 1.3237766569330147, + "grad_norm": 0.5404487076541026, + "learning_rate": 5.4199913186641194e-05, + "loss": 11.9076, + "step": 24310 + }, + { + "epoch": 1.3238311109295977, + "grad_norm": 0.5890314787145994, + "learning_rate": 5.4192074358743696e-05, + "loss": 11.961, + "step": 24311 + }, + { + "epoch": 1.3238855649261807, + "grad_norm": 0.5241183180791975, + "learning_rate": 5.418423588705634e-05, + "loss": 11.8418, + "step": 24312 + }, + { + "epoch": 1.3239400189227637, + "grad_norm": 0.5657800702247222, + "learning_rate": 5.417639777164003e-05, + "loss": 11.9448, + "step": 24313 + }, + { + "epoch": 1.3239944729193467, + "grad_norm": 0.5704038597285788, + "learning_rate": 5.416856001255577e-05, + "loss": 11.9572, + "step": 24314 + }, + { + "epoch": 1.32404892691593, + "grad_norm": 0.5161054332427804, + "learning_rate": 5.4160722609864425e-05, + "loss": 11.8315, + "step": 24315 + }, + { + "epoch": 1.324103380912513, + "grad_norm": 0.5530415399281876, + "learning_rate": 5.4152885563626985e-05, + "loss": 11.7823, + "step": 24316 + }, + { + "epoch": 1.3241578349090959, + "grad_norm": 0.6341196816585852, + "learning_rate": 5.414504887390446e-05, + "loss": 12.083, + "step": 24317 + }, + { + "epoch": 1.3242122889056789, + "grad_norm": 0.5866654115954333, + "learning_rate": 5.41372125407577e-05, + "loss": 12.0122, + "step": 24318 + }, + { + "epoch": 1.3242667429022619, + "grad_norm": 0.5358296736449896, + "learning_rate": 5.4129376564247635e-05, + "loss": 11.8467, + "step": 24319 + }, + { + "epoch": 1.3243211968988449, + "grad_norm": 0.5614961187626117, + "learning_rate": 5.412154094443527e-05, + "loss": 12.1306, + "step": 24320 + }, + { + "epoch": 1.3243756508954279, + "grad_norm": 0.5591502289964349, + "learning_rate": 5.4113705681381455e-05, + "loss": 11.8171, + "step": 24321 + }, + { + "epoch": 1.3244301048920109, + "grad_norm": 0.5498758234731576, + "learning_rate": 5.41058707751472e-05, + "loss": 11.8002, + "step": 24322 + }, + { + "epoch": 1.324484558888594, + "grad_norm": 0.5196810290051165, + "learning_rate": 5.409803622579335e-05, + "loss": 11.9014, + "step": 24323 + }, + { + "epoch": 1.324539012885177, + "grad_norm": 0.4875341647826073, + "learning_rate": 5.409020203338091e-05, + "loss": 11.9946, + "step": 24324 + }, + { + "epoch": 1.32459346688176, + "grad_norm": 0.5962898438983585, + "learning_rate": 5.4082368197970724e-05, + "loss": 11.8129, + "step": 24325 + }, + { + "epoch": 1.324647920878343, + "grad_norm": 0.5242946635270267, + "learning_rate": 5.407453471962377e-05, + "loss": 12.0071, + "step": 24326 + }, + { + "epoch": 1.324702374874926, + "grad_norm": 0.5151795242568236, + "learning_rate": 5.4066701598400904e-05, + "loss": 11.8575, + "step": 24327 + }, + { + "epoch": 1.324756828871509, + "grad_norm": 0.508533860739261, + "learning_rate": 5.405886883436311e-05, + "loss": 11.9134, + "step": 24328 + }, + { + "epoch": 1.324811282868092, + "grad_norm": 0.5317603989694534, + "learning_rate": 5.4051036427571254e-05, + "loss": 11.8262, + "step": 24329 + }, + { + "epoch": 1.324865736864675, + "grad_norm": 0.5590911167033477, + "learning_rate": 5.404320437808621e-05, + "loss": 11.9181, + "step": 24330 + }, + { + "epoch": 1.324920190861258, + "grad_norm": 0.5598679141628443, + "learning_rate": 5.40353726859689e-05, + "loss": 11.9615, + "step": 24331 + }, + { + "epoch": 1.324974644857841, + "grad_norm": 0.5136271129954619, + "learning_rate": 5.40275413512803e-05, + "loss": 11.8343, + "step": 24332 + }, + { + "epoch": 1.325029098854424, + "grad_norm": 0.5734043586137014, + "learning_rate": 5.401971037408119e-05, + "loss": 11.9757, + "step": 24333 + }, + { + "epoch": 1.325083552851007, + "grad_norm": 0.5598120208934344, + "learning_rate": 5.401187975443257e-05, + "loss": 11.9283, + "step": 24334 + }, + { + "epoch": 1.32513800684759, + "grad_norm": 0.5448110291910014, + "learning_rate": 5.400404949239525e-05, + "loss": 11.8949, + "step": 24335 + }, + { + "epoch": 1.325192460844173, + "grad_norm": 0.5642133229271152, + "learning_rate": 5.399621958803018e-05, + "loss": 11.9411, + "step": 24336 + }, + { + "epoch": 1.325246914840756, + "grad_norm": 0.5177282489550481, + "learning_rate": 5.39883900413982e-05, + "loss": 11.9441, + "step": 24337 + }, + { + "epoch": 1.3253013688373392, + "grad_norm": 0.5626101259603494, + "learning_rate": 5.3980560852560244e-05, + "loss": 11.9748, + "step": 24338 + }, + { + "epoch": 1.3253558228339222, + "grad_norm": 0.524196397718115, + "learning_rate": 5.3972732021577175e-05, + "loss": 11.857, + "step": 24339 + }, + { + "epoch": 1.3254102768305052, + "grad_norm": 0.5694045290272125, + "learning_rate": 5.3964903548509816e-05, + "loss": 11.9094, + "step": 24340 + }, + { + "epoch": 1.3254647308270882, + "grad_norm": 0.5012759947368628, + "learning_rate": 5.3957075433419126e-05, + "loss": 11.9684, + "step": 24341 + }, + { + "epoch": 1.3255191848236711, + "grad_norm": 0.5424842903304663, + "learning_rate": 5.394924767636589e-05, + "loss": 11.8677, + "step": 24342 + }, + { + "epoch": 1.3255736388202541, + "grad_norm": 0.5094963885154813, + "learning_rate": 5.3941420277411035e-05, + "loss": 11.9124, + "step": 24343 + }, + { + "epoch": 1.3256280928168371, + "grad_norm": 0.5695293612183323, + "learning_rate": 5.3933593236615465e-05, + "loss": 11.7505, + "step": 24344 + }, + { + "epoch": 1.3256825468134201, + "grad_norm": 0.5244664486741112, + "learning_rate": 5.392576655403996e-05, + "loss": 11.8237, + "step": 24345 + }, + { + "epoch": 1.3257370008100031, + "grad_norm": 0.5433440505139582, + "learning_rate": 5.3917940229745446e-05, + "loss": 11.9967, + "step": 24346 + }, + { + "epoch": 1.3257914548065863, + "grad_norm": 0.5418608453655283, + "learning_rate": 5.3910114263792734e-05, + "loss": 11.9847, + "step": 24347 + }, + { + "epoch": 1.3258459088031693, + "grad_norm": 0.5146203601815554, + "learning_rate": 5.3902288656242714e-05, + "loss": 12.0123, + "step": 24348 + }, + { + "epoch": 1.3259003627997523, + "grad_norm": 0.6105695683271954, + "learning_rate": 5.389446340715626e-05, + "loss": 11.8639, + "step": 24349 + }, + { + "epoch": 1.3259548167963353, + "grad_norm": 0.5043270843253015, + "learning_rate": 5.3886638516594136e-05, + "loss": 11.9484, + "step": 24350 + }, + { + "epoch": 1.3260092707929183, + "grad_norm": 0.6502463641638829, + "learning_rate": 5.3878813984617294e-05, + "loss": 12.0225, + "step": 24351 + }, + { + "epoch": 1.3260637247895013, + "grad_norm": 0.5707764957435898, + "learning_rate": 5.387098981128648e-05, + "loss": 11.9455, + "step": 24352 + }, + { + "epoch": 1.3261181787860843, + "grad_norm": 0.5853274073453641, + "learning_rate": 5.386316599666259e-05, + "loss": 11.9512, + "step": 24353 + }, + { + "epoch": 1.3261726327826673, + "grad_norm": 0.5331348033249079, + "learning_rate": 5.385534254080649e-05, + "loss": 11.8601, + "step": 24354 + }, + { + "epoch": 1.3262270867792503, + "grad_norm": 0.535285651695918, + "learning_rate": 5.3847519443778946e-05, + "loss": 11.7711, + "step": 24355 + }, + { + "epoch": 1.3262815407758333, + "grad_norm": 0.5094922793687584, + "learning_rate": 5.3839696705640876e-05, + "loss": 11.8656, + "step": 24356 + }, + { + "epoch": 1.3263359947724163, + "grad_norm": 0.549080849299552, + "learning_rate": 5.383187432645303e-05, + "loss": 11.8211, + "step": 24357 + }, + { + "epoch": 1.3263904487689993, + "grad_norm": 0.5365808510333939, + "learning_rate": 5.382405230627629e-05, + "loss": 11.9175, + "step": 24358 + }, + { + "epoch": 1.3264449027655822, + "grad_norm": 0.6795797989768289, + "learning_rate": 5.3816230645171494e-05, + "loss": 12.0065, + "step": 24359 + }, + { + "epoch": 1.3264993567621652, + "grad_norm": 0.6255954293734473, + "learning_rate": 5.3808409343199376e-05, + "loss": 11.967, + "step": 24360 + }, + { + "epoch": 1.3265538107587485, + "grad_norm": 0.6366357279444079, + "learning_rate": 5.380058840042085e-05, + "loss": 11.8996, + "step": 24361 + }, + { + "epoch": 1.3266082647553314, + "grad_norm": 0.6546017019244861, + "learning_rate": 5.379276781689666e-05, + "loss": 12.0854, + "step": 24362 + }, + { + "epoch": 1.3266627187519144, + "grad_norm": 0.5896448603649662, + "learning_rate": 5.3784947592687706e-05, + "loss": 11.9614, + "step": 24363 + }, + { + "epoch": 1.3267171727484974, + "grad_norm": 0.5935185979095843, + "learning_rate": 5.3777127727854704e-05, + "loss": 11.9514, + "step": 24364 + }, + { + "epoch": 1.3267716267450804, + "grad_norm": 0.5664400696177851, + "learning_rate": 5.3769308222458495e-05, + "loss": 11.9728, + "step": 24365 + }, + { + "epoch": 1.3268260807416634, + "grad_norm": 0.6030275831908156, + "learning_rate": 5.3761489076559954e-05, + "loss": 11.9886, + "step": 24366 + }, + { + "epoch": 1.3268805347382464, + "grad_norm": 0.5607611995842933, + "learning_rate": 5.3753670290219784e-05, + "loss": 11.8956, + "step": 24367 + }, + { + "epoch": 1.3269349887348294, + "grad_norm": 0.6031829565266845, + "learning_rate": 5.374585186349888e-05, + "loss": 12.0342, + "step": 24368 + }, + { + "epoch": 1.3269894427314124, + "grad_norm": 0.5511490707745242, + "learning_rate": 5.373803379645797e-05, + "loss": 11.9302, + "step": 24369 + }, + { + "epoch": 1.3270438967279956, + "grad_norm": 0.5238988219315068, + "learning_rate": 5.373021608915783e-05, + "loss": 11.92, + "step": 24370 + }, + { + "epoch": 1.3270983507245786, + "grad_norm": 0.6318560091882891, + "learning_rate": 5.372239874165934e-05, + "loss": 11.8764, + "step": 24371 + }, + { + "epoch": 1.3271528047211616, + "grad_norm": 0.5305198440793832, + "learning_rate": 5.3714581754023184e-05, + "loss": 11.9477, + "step": 24372 + }, + { + "epoch": 1.3272072587177446, + "grad_norm": 0.5510571124139215, + "learning_rate": 5.3706765126310254e-05, + "loss": 12.007, + "step": 24373 + }, + { + "epoch": 1.3272617127143276, + "grad_norm": 0.5449207675209723, + "learning_rate": 5.3698948858581245e-05, + "loss": 11.8913, + "step": 24374 + }, + { + "epoch": 1.3273161667109106, + "grad_norm": 0.5610527251170293, + "learning_rate": 5.369113295089696e-05, + "loss": 11.868, + "step": 24375 + }, + { + "epoch": 1.3273706207074936, + "grad_norm": 0.4940865860536345, + "learning_rate": 5.368331740331824e-05, + "loss": 11.8369, + "step": 24376 + }, + { + "epoch": 1.3274250747040766, + "grad_norm": 0.5277221793351025, + "learning_rate": 5.367550221590576e-05, + "loss": 11.8139, + "step": 24377 + }, + { + "epoch": 1.3274795287006596, + "grad_norm": 0.5952348535984634, + "learning_rate": 5.3667687388720434e-05, + "loss": 11.9857, + "step": 24378 + }, + { + "epoch": 1.3275339826972425, + "grad_norm": 0.5445620574401022, + "learning_rate": 5.365987292182286e-05, + "loss": 11.8433, + "step": 24379 + }, + { + "epoch": 1.3275884366938255, + "grad_norm": 0.5296148059847346, + "learning_rate": 5.365205881527389e-05, + "loss": 11.8902, + "step": 24380 + }, + { + "epoch": 1.3276428906904085, + "grad_norm": 0.5251909452213419, + "learning_rate": 5.364424506913433e-05, + "loss": 11.9295, + "step": 24381 + }, + { + "epoch": 1.3276973446869915, + "grad_norm": 0.530573351604225, + "learning_rate": 5.363643168346484e-05, + "loss": 11.9302, + "step": 24382 + }, + { + "epoch": 1.3277517986835745, + "grad_norm": 0.54045335631268, + "learning_rate": 5.362861865832627e-05, + "loss": 11.9627, + "step": 24383 + }, + { + "epoch": 1.3278062526801575, + "grad_norm": 0.612712364336341, + "learning_rate": 5.36208059937793e-05, + "loss": 12.139, + "step": 24384 + }, + { + "epoch": 1.3278607066767407, + "grad_norm": 0.5417724530265201, + "learning_rate": 5.361299368988477e-05, + "loss": 12.0772, + "step": 24385 + }, + { + "epoch": 1.3279151606733237, + "grad_norm": 0.6751105267403107, + "learning_rate": 5.3605181746703325e-05, + "loss": 11.9898, + "step": 24386 + }, + { + "epoch": 1.3279696146699067, + "grad_norm": 0.615979215688675, + "learning_rate": 5.3597370164295776e-05, + "loss": 11.9471, + "step": 24387 + }, + { + "epoch": 1.3280240686664897, + "grad_norm": 0.5665303733879382, + "learning_rate": 5.358955894272295e-05, + "loss": 11.9218, + "step": 24388 + }, + { + "epoch": 1.3280785226630727, + "grad_norm": 0.5351548006752035, + "learning_rate": 5.3581748082045394e-05, + "loss": 11.9376, + "step": 24389 + }, + { + "epoch": 1.3281329766596557, + "grad_norm": 0.5269036829983818, + "learning_rate": 5.3573937582324004e-05, + "loss": 11.9498, + "step": 24390 + }, + { + "epoch": 1.3281874306562387, + "grad_norm": 0.5086667444009628, + "learning_rate": 5.356612744361942e-05, + "loss": 12.0167, + "step": 24391 + }, + { + "epoch": 1.3282418846528217, + "grad_norm": 0.5647739110462783, + "learning_rate": 5.355831766599242e-05, + "loss": 11.8642, + "step": 24392 + }, + { + "epoch": 1.3282963386494049, + "grad_norm": 0.5607725796111824, + "learning_rate": 5.355050824950376e-05, + "loss": 11.9575, + "step": 24393 + }, + { + "epoch": 1.3283507926459879, + "grad_norm": 0.5592590286363692, + "learning_rate": 5.35426991942141e-05, + "loss": 12.0412, + "step": 24394 + }, + { + "epoch": 1.3284052466425709, + "grad_norm": 0.5414799946184244, + "learning_rate": 5.353489050018423e-05, + "loss": 11.8695, + "step": 24395 + }, + { + "epoch": 1.3284597006391539, + "grad_norm": 0.5355594770259369, + "learning_rate": 5.352708216747482e-05, + "loss": 11.9433, + "step": 24396 + }, + { + "epoch": 1.3285141546357369, + "grad_norm": 0.5112299339077214, + "learning_rate": 5.351927419614665e-05, + "loss": 11.8644, + "step": 24397 + }, + { + "epoch": 1.3285686086323198, + "grad_norm": 0.6137036904474334, + "learning_rate": 5.351146658626041e-05, + "loss": 11.9186, + "step": 24398 + }, + { + "epoch": 1.3286230626289028, + "grad_norm": 0.6907624157770516, + "learning_rate": 5.3503659337876735e-05, + "loss": 11.9553, + "step": 24399 + }, + { + "epoch": 1.3286775166254858, + "grad_norm": 0.5539291518076767, + "learning_rate": 5.349585245105646e-05, + "loss": 11.9558, + "step": 24400 + }, + { + "epoch": 1.3287319706220688, + "grad_norm": 0.51189330897039, + "learning_rate": 5.348804592586019e-05, + "loss": 11.8123, + "step": 24401 + }, + { + "epoch": 1.3287864246186518, + "grad_norm": 0.5444129357563464, + "learning_rate": 5.3480239762348684e-05, + "loss": 11.8758, + "step": 24402 + }, + { + "epoch": 1.3288408786152348, + "grad_norm": 0.6037881277554985, + "learning_rate": 5.347243396058266e-05, + "loss": 12.0076, + "step": 24403 + }, + { + "epoch": 1.3288953326118178, + "grad_norm": 0.546067934881867, + "learning_rate": 5.346462852062277e-05, + "loss": 12.0116, + "step": 24404 + }, + { + "epoch": 1.3289497866084008, + "grad_norm": 0.48186776905170187, + "learning_rate": 5.345682344252976e-05, + "loss": 11.8478, + "step": 24405 + }, + { + "epoch": 1.3290042406049838, + "grad_norm": 0.5434736762770173, + "learning_rate": 5.344901872636425e-05, + "loss": 11.8911, + "step": 24406 + }, + { + "epoch": 1.3290586946015668, + "grad_norm": 0.6060622428524457, + "learning_rate": 5.3441214372187033e-05, + "loss": 11.9495, + "step": 24407 + }, + { + "epoch": 1.32911314859815, + "grad_norm": 0.5304292446815432, + "learning_rate": 5.343341038005873e-05, + "loss": 12.0005, + "step": 24408 + }, + { + "epoch": 1.329167602594733, + "grad_norm": 0.677423001758076, + "learning_rate": 5.342560675004e-05, + "loss": 12.0136, + "step": 24409 + }, + { + "epoch": 1.329222056591316, + "grad_norm": 0.5518555001602644, + "learning_rate": 5.341780348219161e-05, + "loss": 12.0136, + "step": 24410 + }, + { + "epoch": 1.329276510587899, + "grad_norm": 0.500025501783482, + "learning_rate": 5.341000057657415e-05, + "loss": 11.8849, + "step": 24411 + }, + { + "epoch": 1.329330964584482, + "grad_norm": 0.5974749784099989, + "learning_rate": 5.340219803324838e-05, + "loss": 11.8318, + "step": 24412 + }, + { + "epoch": 1.329385418581065, + "grad_norm": 0.5510432659653153, + "learning_rate": 5.339439585227488e-05, + "loss": 11.807, + "step": 24413 + }, + { + "epoch": 1.329439872577648, + "grad_norm": 0.5955238957795417, + "learning_rate": 5.3386594033714376e-05, + "loss": 11.9354, + "step": 24414 + }, + { + "epoch": 1.329494326574231, + "grad_norm": 0.5705872812703834, + "learning_rate": 5.337879257762758e-05, + "loss": 11.9356, + "step": 24415 + }, + { + "epoch": 1.329548780570814, + "grad_norm": 0.5612735588298833, + "learning_rate": 5.337099148407507e-05, + "loss": 11.856, + "step": 24416 + }, + { + "epoch": 1.3296032345673972, + "grad_norm": 0.5406940615655401, + "learning_rate": 5.336319075311759e-05, + "loss": 11.9474, + "step": 24417 + }, + { + "epoch": 1.3296576885639801, + "grad_norm": 0.6178418823145054, + "learning_rate": 5.335539038481575e-05, + "loss": 12.0239, + "step": 24418 + }, + { + "epoch": 1.3297121425605631, + "grad_norm": 0.5790966834105574, + "learning_rate": 5.3347590379230186e-05, + "loss": 11.9535, + "step": 24419 + }, + { + "epoch": 1.3297665965571461, + "grad_norm": 0.5336972842510217, + "learning_rate": 5.3339790736421625e-05, + "loss": 11.9026, + "step": 24420 + }, + { + "epoch": 1.3298210505537291, + "grad_norm": 0.5112016277825095, + "learning_rate": 5.333199145645064e-05, + "loss": 11.9398, + "step": 24421 + }, + { + "epoch": 1.3298755045503121, + "grad_norm": 0.5677668095965251, + "learning_rate": 5.332419253937795e-05, + "loss": 11.8973, + "step": 24422 + }, + { + "epoch": 1.329929958546895, + "grad_norm": 0.5846641795364168, + "learning_rate": 5.3316393985264114e-05, + "loss": 11.867, + "step": 24423 + }, + { + "epoch": 1.329984412543478, + "grad_norm": 0.5615385904841783, + "learning_rate": 5.3308595794169844e-05, + "loss": 11.9397, + "step": 24424 + }, + { + "epoch": 1.330038866540061, + "grad_norm": 0.5154131337766664, + "learning_rate": 5.330079796615579e-05, + "loss": 11.8674, + "step": 24425 + }, + { + "epoch": 1.330093320536644, + "grad_norm": 0.5112974569783938, + "learning_rate": 5.329300050128254e-05, + "loss": 11.8693, + "step": 24426 + }, + { + "epoch": 1.330147774533227, + "grad_norm": 0.547558573878345, + "learning_rate": 5.328520339961078e-05, + "loss": 11.8623, + "step": 24427 + }, + { + "epoch": 1.33020222852981, + "grad_norm": 0.562825292170743, + "learning_rate": 5.327740666120107e-05, + "loss": 11.8996, + "step": 24428 + }, + { + "epoch": 1.330256682526393, + "grad_norm": 0.5582074920459502, + "learning_rate": 5.3269610286114126e-05, + "loss": 11.7927, + "step": 24429 + }, + { + "epoch": 1.330311136522976, + "grad_norm": 0.5180893200089595, + "learning_rate": 5.3261814274410523e-05, + "loss": 11.8159, + "step": 24430 + }, + { + "epoch": 1.3303655905195593, + "grad_norm": 0.6306627129274233, + "learning_rate": 5.325401862615087e-05, + "loss": 11.8643, + "step": 24431 + }, + { + "epoch": 1.3304200445161423, + "grad_norm": 0.5500720777771656, + "learning_rate": 5.324622334139583e-05, + "loss": 11.8704, + "step": 24432 + }, + { + "epoch": 1.3304744985127253, + "grad_norm": 0.53183988508379, + "learning_rate": 5.323842842020597e-05, + "loss": 11.8969, + "step": 24433 + }, + { + "epoch": 1.3305289525093082, + "grad_norm": 0.5738645551365829, + "learning_rate": 5.323063386264196e-05, + "loss": 11.8925, + "step": 24434 + }, + { + "epoch": 1.3305834065058912, + "grad_norm": 0.5545923361344481, + "learning_rate": 5.3222839668764335e-05, + "loss": 11.8049, + "step": 24435 + }, + { + "epoch": 1.3306378605024742, + "grad_norm": 0.534822476559011, + "learning_rate": 5.321504583863377e-05, + "loss": 11.8954, + "step": 24436 + }, + { + "epoch": 1.3306923144990572, + "grad_norm": 0.5072876371324885, + "learning_rate": 5.320725237231089e-05, + "loss": 11.9209, + "step": 24437 + }, + { + "epoch": 1.3307467684956402, + "grad_norm": 0.5403614508507687, + "learning_rate": 5.319945926985622e-05, + "loss": 11.897, + "step": 24438 + }, + { + "epoch": 1.3308012224922232, + "grad_norm": 0.5495669396723336, + "learning_rate": 5.319166653133049e-05, + "loss": 11.9898, + "step": 24439 + }, + { + "epoch": 1.3308556764888064, + "grad_norm": 0.49418161704933866, + "learning_rate": 5.318387415679413e-05, + "loss": 11.7318, + "step": 24440 + }, + { + "epoch": 1.3309101304853894, + "grad_norm": 0.5258276724310946, + "learning_rate": 5.31760821463078e-05, + "loss": 11.7775, + "step": 24441 + }, + { + "epoch": 1.3309645844819724, + "grad_norm": 0.5363831551349146, + "learning_rate": 5.316829049993217e-05, + "loss": 11.9428, + "step": 24442 + }, + { + "epoch": 1.3310190384785554, + "grad_norm": 0.5348364804923785, + "learning_rate": 5.316049921772772e-05, + "loss": 11.85, + "step": 24443 + }, + { + "epoch": 1.3310734924751384, + "grad_norm": 0.5216952354541312, + "learning_rate": 5.315270829975512e-05, + "loss": 11.9201, + "step": 24444 + }, + { + "epoch": 1.3311279464717214, + "grad_norm": 0.6610065133816551, + "learning_rate": 5.314491774607487e-05, + "loss": 11.9976, + "step": 24445 + }, + { + "epoch": 1.3311824004683044, + "grad_norm": 0.5636125963597513, + "learning_rate": 5.3137127556747645e-05, + "loss": 12.0046, + "step": 24446 + }, + { + "epoch": 1.3312368544648874, + "grad_norm": 0.5023090693827555, + "learning_rate": 5.3129337731833926e-05, + "loss": 11.9616, + "step": 24447 + }, + { + "epoch": 1.3312913084614704, + "grad_norm": 0.6430240866603419, + "learning_rate": 5.312154827139435e-05, + "loss": 11.9319, + "step": 24448 + }, + { + "epoch": 1.3313457624580534, + "grad_norm": 0.5633246444295514, + "learning_rate": 5.3113759175489554e-05, + "loss": 11.9111, + "step": 24449 + }, + { + "epoch": 1.3314002164546364, + "grad_norm": 0.5242721961133227, + "learning_rate": 5.310597044417995e-05, + "loss": 11.9042, + "step": 24450 + }, + { + "epoch": 1.3314546704512193, + "grad_norm": 0.5296018505627887, + "learning_rate": 5.309818207752617e-05, + "loss": 11.9196, + "step": 24451 + }, + { + "epoch": 1.3315091244478023, + "grad_norm": 0.6046831151338544, + "learning_rate": 5.309039407558885e-05, + "loss": 11.9763, + "step": 24452 + }, + { + "epoch": 1.3315635784443853, + "grad_norm": 0.5651034819341412, + "learning_rate": 5.3082606438428437e-05, + "loss": 11.8477, + "step": 24453 + }, + { + "epoch": 1.3316180324409685, + "grad_norm": 0.5141442487505451, + "learning_rate": 5.3074819166105594e-05, + "loss": 11.6833, + "step": 24454 + }, + { + "epoch": 1.3316724864375515, + "grad_norm": 0.49895548784066357, + "learning_rate": 5.306703225868079e-05, + "loss": 11.7769, + "step": 24455 + }, + { + "epoch": 1.3317269404341345, + "grad_norm": 0.5204968497412266, + "learning_rate": 5.305924571621464e-05, + "loss": 11.8639, + "step": 24456 + }, + { + "epoch": 1.3317813944307175, + "grad_norm": 0.5857602081241274, + "learning_rate": 5.3051459538767645e-05, + "loss": 11.9054, + "step": 24457 + }, + { + "epoch": 1.3318358484273005, + "grad_norm": 0.5659551784629318, + "learning_rate": 5.304367372640035e-05, + "loss": 12.0636, + "step": 24458 + }, + { + "epoch": 1.3318903024238835, + "grad_norm": 0.5773041733355975, + "learning_rate": 5.303588827917343e-05, + "loss": 12.0934, + "step": 24459 + }, + { + "epoch": 1.3319447564204665, + "grad_norm": 0.5280070943189968, + "learning_rate": 5.3028103197147226e-05, + "loss": 11.9716, + "step": 24460 + }, + { + "epoch": 1.3319992104170495, + "grad_norm": 0.5735275222348536, + "learning_rate": 5.3020318480382404e-05, + "loss": 11.8713, + "step": 24461 + }, + { + "epoch": 1.3320536644136325, + "grad_norm": 0.48462536419252333, + "learning_rate": 5.301253412893943e-05, + "loss": 11.8588, + "step": 24462 + }, + { + "epoch": 1.3321081184102157, + "grad_norm": 0.5338053619619285, + "learning_rate": 5.300475014287887e-05, + "loss": 11.8625, + "step": 24463 + }, + { + "epoch": 1.3321625724067987, + "grad_norm": 0.5613163297630709, + "learning_rate": 5.299696652226129e-05, + "loss": 11.9371, + "step": 24464 + }, + { + "epoch": 1.3322170264033817, + "grad_norm": 0.5626807947280761, + "learning_rate": 5.298918326714715e-05, + "loss": 11.8772, + "step": 24465 + }, + { + "epoch": 1.3322714803999647, + "grad_norm": 0.601793737070652, + "learning_rate": 5.2981400377597036e-05, + "loss": 11.9692, + "step": 24466 + }, + { + "epoch": 1.3323259343965477, + "grad_norm": 0.5741701224189307, + "learning_rate": 5.29736178536714e-05, + "loss": 11.901, + "step": 24467 + }, + { + "epoch": 1.3323803883931307, + "grad_norm": 0.5736657088276385, + "learning_rate": 5.296583569543083e-05, + "loss": 11.9024, + "step": 24468 + }, + { + "epoch": 1.3324348423897137, + "grad_norm": 0.5978380892139845, + "learning_rate": 5.295805390293582e-05, + "loss": 11.9477, + "step": 24469 + }, + { + "epoch": 1.3324892963862967, + "grad_norm": 0.5819745812226959, + "learning_rate": 5.295027247624683e-05, + "loss": 12.0241, + "step": 24470 + }, + { + "epoch": 1.3325437503828796, + "grad_norm": 0.5016597715091647, + "learning_rate": 5.294249141542444e-05, + "loss": 11.8833, + "step": 24471 + }, + { + "epoch": 1.3325982043794626, + "grad_norm": 0.5297508849514344, + "learning_rate": 5.293471072052911e-05, + "loss": 11.8493, + "step": 24472 + }, + { + "epoch": 1.3326526583760456, + "grad_norm": 0.5317063762016192, + "learning_rate": 5.292693039162135e-05, + "loss": 11.8721, + "step": 24473 + }, + { + "epoch": 1.3327071123726286, + "grad_norm": 0.5432951166550877, + "learning_rate": 5.291915042876172e-05, + "loss": 11.8806, + "step": 24474 + }, + { + "epoch": 1.3327615663692116, + "grad_norm": 0.5898378762569593, + "learning_rate": 5.291137083201062e-05, + "loss": 11.969, + "step": 24475 + }, + { + "epoch": 1.3328160203657946, + "grad_norm": 0.571938426440639, + "learning_rate": 5.290359160142864e-05, + "loss": 11.8397, + "step": 24476 + }, + { + "epoch": 1.3328704743623776, + "grad_norm": 0.5815525163283235, + "learning_rate": 5.28958127370762e-05, + "loss": 11.9098, + "step": 24477 + }, + { + "epoch": 1.3329249283589608, + "grad_norm": 0.5968767462739033, + "learning_rate": 5.288803423901385e-05, + "loss": 11.9672, + "step": 24478 + }, + { + "epoch": 1.3329793823555438, + "grad_norm": 0.5754187193247214, + "learning_rate": 5.288025610730205e-05, + "loss": 11.9174, + "step": 24479 + }, + { + "epoch": 1.3330338363521268, + "grad_norm": 0.5898023123021086, + "learning_rate": 5.287247834200124e-05, + "loss": 11.9645, + "step": 24480 + }, + { + "epoch": 1.3330882903487098, + "grad_norm": 0.5287901927630678, + "learning_rate": 5.2864700943171975e-05, + "loss": 11.9177, + "step": 24481 + }, + { + "epoch": 1.3331427443452928, + "grad_norm": 0.6187442586926514, + "learning_rate": 5.285692391087467e-05, + "loss": 11.9769, + "step": 24482 + }, + { + "epoch": 1.3331971983418758, + "grad_norm": 0.5660317088857969, + "learning_rate": 5.2849147245169875e-05, + "loss": 12.0203, + "step": 24483 + }, + { + "epoch": 1.3332516523384588, + "grad_norm": 0.5753054490269349, + "learning_rate": 5.2841370946117963e-05, + "loss": 11.9575, + "step": 24484 + }, + { + "epoch": 1.3333061063350418, + "grad_norm": 0.4944736485581836, + "learning_rate": 5.2833595013779466e-05, + "loss": 11.9116, + "step": 24485 + }, + { + "epoch": 1.333360560331625, + "grad_norm": 0.4940920220088117, + "learning_rate": 5.282581944821489e-05, + "loss": 11.8072, + "step": 24486 + }, + { + "epoch": 1.333415014328208, + "grad_norm": 0.5380199073446132, + "learning_rate": 5.28180442494846e-05, + "loss": 11.9904, + "step": 24487 + }, + { + "epoch": 1.333469468324791, + "grad_norm": 0.6578582201421894, + "learning_rate": 5.281026941764916e-05, + "loss": 12.122, + "step": 24488 + }, + { + "epoch": 1.333523922321374, + "grad_norm": 0.5837883037327447, + "learning_rate": 5.280249495276896e-05, + "loss": 11.8773, + "step": 24489 + }, + { + "epoch": 1.333578376317957, + "grad_norm": 0.5433843728532075, + "learning_rate": 5.2794720854904464e-05, + "loss": 11.9688, + "step": 24490 + }, + { + "epoch": 1.33363283031454, + "grad_norm": 0.560986084298655, + "learning_rate": 5.278694712411616e-05, + "loss": 12.0565, + "step": 24491 + }, + { + "epoch": 1.333687284311123, + "grad_norm": 0.515790404395713, + "learning_rate": 5.277917376046444e-05, + "loss": 11.9032, + "step": 24492 + }, + { + "epoch": 1.333741738307706, + "grad_norm": 0.55056638242684, + "learning_rate": 5.2771400764009814e-05, + "loss": 11.9398, + "step": 24493 + }, + { + "epoch": 1.333796192304289, + "grad_norm": 0.5499523177854556, + "learning_rate": 5.276362813481266e-05, + "loss": 11.8566, + "step": 24494 + }, + { + "epoch": 1.333850646300872, + "grad_norm": 0.5444431428316789, + "learning_rate": 5.2755855872933505e-05, + "loss": 11.9437, + "step": 24495 + }, + { + "epoch": 1.333905100297455, + "grad_norm": 0.5484030889653387, + "learning_rate": 5.274808397843267e-05, + "loss": 11.78, + "step": 24496 + }, + { + "epoch": 1.333959554294038, + "grad_norm": 0.5904625367617514, + "learning_rate": 5.274031245137068e-05, + "loss": 11.8786, + "step": 24497 + }, + { + "epoch": 1.334014008290621, + "grad_norm": 0.5597318931763078, + "learning_rate": 5.273254129180798e-05, + "loss": 11.9085, + "step": 24498 + }, + { + "epoch": 1.3340684622872039, + "grad_norm": 0.5368922913995148, + "learning_rate": 5.2724770499804975e-05, + "loss": 11.8652, + "step": 24499 + }, + { + "epoch": 1.3341229162837869, + "grad_norm": 0.5684830044239768, + "learning_rate": 5.2717000075422026e-05, + "loss": 11.9427, + "step": 24500 + }, + { + "epoch": 1.33417737028037, + "grad_norm": 0.5291246322300477, + "learning_rate": 5.270923001871967e-05, + "loss": 11.8035, + "step": 24501 + }, + { + "epoch": 1.334231824276953, + "grad_norm": 0.7121779788468687, + "learning_rate": 5.270146032975822e-05, + "loss": 12.0363, + "step": 24502 + }, + { + "epoch": 1.334286278273536, + "grad_norm": 0.6311172161278445, + "learning_rate": 5.26936910085982e-05, + "loss": 11.7624, + "step": 24503 + }, + { + "epoch": 1.334340732270119, + "grad_norm": 0.5395628372007741, + "learning_rate": 5.268592205529992e-05, + "loss": 11.9868, + "step": 24504 + }, + { + "epoch": 1.334395186266702, + "grad_norm": 0.5465804213449359, + "learning_rate": 5.267815346992388e-05, + "loss": 11.9089, + "step": 24505 + }, + { + "epoch": 1.334449640263285, + "grad_norm": 0.5342797119889956, + "learning_rate": 5.267038525253043e-05, + "loss": 11.8063, + "step": 24506 + }, + { + "epoch": 1.334504094259868, + "grad_norm": 0.5615334252366635, + "learning_rate": 5.266261740317999e-05, + "loss": 11.9759, + "step": 24507 + }, + { + "epoch": 1.334558548256451, + "grad_norm": 0.5544772304893303, + "learning_rate": 5.265484992193301e-05, + "loss": 11.7509, + "step": 24508 + }, + { + "epoch": 1.334613002253034, + "grad_norm": 0.6759601781059053, + "learning_rate": 5.264708280884988e-05, + "loss": 12.0361, + "step": 24509 + }, + { + "epoch": 1.3346674562496172, + "grad_norm": 0.5188243734877167, + "learning_rate": 5.2639316063990954e-05, + "loss": 11.8241, + "step": 24510 + }, + { + "epoch": 1.3347219102462002, + "grad_norm": 0.6528429173010141, + "learning_rate": 5.2631549687416615e-05, + "loss": 11.9826, + "step": 24511 + }, + { + "epoch": 1.3347763642427832, + "grad_norm": 0.55452836762981, + "learning_rate": 5.26237836791873e-05, + "loss": 11.8725, + "step": 24512 + }, + { + "epoch": 1.3348308182393662, + "grad_norm": 0.5459149363313589, + "learning_rate": 5.261601803936341e-05, + "loss": 11.8963, + "step": 24513 + }, + { + "epoch": 1.3348852722359492, + "grad_norm": 0.5294156458948727, + "learning_rate": 5.2608252768005286e-05, + "loss": 11.869, + "step": 24514 + }, + { + "epoch": 1.3349397262325322, + "grad_norm": 0.602535210597049, + "learning_rate": 5.260048786517337e-05, + "loss": 11.9743, + "step": 24515 + }, + { + "epoch": 1.3349941802291152, + "grad_norm": 0.6139798693690568, + "learning_rate": 5.2592723330927965e-05, + "loss": 12.0177, + "step": 24516 + }, + { + "epoch": 1.3350486342256982, + "grad_norm": 0.5571415481736457, + "learning_rate": 5.258495916532953e-05, + "loss": 11.8239, + "step": 24517 + }, + { + "epoch": 1.3351030882222812, + "grad_norm": 0.6025306949219089, + "learning_rate": 5.257719536843837e-05, + "loss": 12.0201, + "step": 24518 + }, + { + "epoch": 1.3351575422188642, + "grad_norm": 0.5926428550507084, + "learning_rate": 5.256943194031494e-05, + "loss": 11.9947, + "step": 24519 + }, + { + "epoch": 1.3352119962154472, + "grad_norm": 0.547831804283872, + "learning_rate": 5.256166888101954e-05, + "loss": 11.8195, + "step": 24520 + }, + { + "epoch": 1.3352664502120302, + "grad_norm": 0.5741966033625567, + "learning_rate": 5.2553906190612535e-05, + "loss": 11.9355, + "step": 24521 + }, + { + "epoch": 1.3353209042086132, + "grad_norm": 0.5688103106950454, + "learning_rate": 5.254614386915431e-05, + "loss": 11.8386, + "step": 24522 + }, + { + "epoch": 1.3353753582051962, + "grad_norm": 0.5438147681305743, + "learning_rate": 5.253838191670527e-05, + "loss": 11.9718, + "step": 24523 + }, + { + "epoch": 1.3354298122017794, + "grad_norm": 0.5413384242111259, + "learning_rate": 5.2530620333325696e-05, + "loss": 11.7993, + "step": 24524 + }, + { + "epoch": 1.3354842661983624, + "grad_norm": 0.5437024314337768, + "learning_rate": 5.2522859119076006e-05, + "loss": 11.9408, + "step": 24525 + }, + { + "epoch": 1.3355387201949454, + "grad_norm": 0.5641061542872094, + "learning_rate": 5.251509827401648e-05, + "loss": 11.9104, + "step": 24526 + }, + { + "epoch": 1.3355931741915283, + "grad_norm": 0.5531141663828754, + "learning_rate": 5.250733779820758e-05, + "loss": 11.9219, + "step": 24527 + }, + { + "epoch": 1.3356476281881113, + "grad_norm": 0.584910086915897, + "learning_rate": 5.2499577691709526e-05, + "loss": 11.9262, + "step": 24528 + }, + { + "epoch": 1.3357020821846943, + "grad_norm": 0.4837542733785465, + "learning_rate": 5.249181795458276e-05, + "loss": 11.7622, + "step": 24529 + }, + { + "epoch": 1.3357565361812773, + "grad_norm": 0.5236927571332141, + "learning_rate": 5.248405858688759e-05, + "loss": 12.0457, + "step": 24530 + }, + { + "epoch": 1.3358109901778603, + "grad_norm": 0.5643534582442321, + "learning_rate": 5.247629958868432e-05, + "loss": 11.8375, + "step": 24531 + }, + { + "epoch": 1.3358654441744433, + "grad_norm": 0.6102868601131803, + "learning_rate": 5.246854096003333e-05, + "loss": 11.8655, + "step": 24532 + }, + { + "epoch": 1.3359198981710265, + "grad_norm": 0.5241006028585222, + "learning_rate": 5.2460782700994914e-05, + "loss": 11.9341, + "step": 24533 + }, + { + "epoch": 1.3359743521676095, + "grad_norm": 0.5393948666480911, + "learning_rate": 5.2453024811629416e-05, + "loss": 11.897, + "step": 24534 + }, + { + "epoch": 1.3360288061641925, + "grad_norm": 0.5685925549442461, + "learning_rate": 5.2445267291997214e-05, + "loss": 11.9586, + "step": 24535 + }, + { + "epoch": 1.3360832601607755, + "grad_norm": 0.5566302805644933, + "learning_rate": 5.243751014215855e-05, + "loss": 11.9093, + "step": 24536 + }, + { + "epoch": 1.3361377141573585, + "grad_norm": 0.5696849007858308, + "learning_rate": 5.2429753362173814e-05, + "loss": 11.7861, + "step": 24537 + }, + { + "epoch": 1.3361921681539415, + "grad_norm": 0.5224432631826207, + "learning_rate": 5.242199695210328e-05, + "loss": 11.8364, + "step": 24538 + }, + { + "epoch": 1.3362466221505245, + "grad_norm": 0.5013398649851833, + "learning_rate": 5.2414240912007286e-05, + "loss": 11.9152, + "step": 24539 + }, + { + "epoch": 1.3363010761471075, + "grad_norm": 0.6143408768650449, + "learning_rate": 5.240648524194616e-05, + "loss": 12.0068, + "step": 24540 + }, + { + "epoch": 1.3363555301436905, + "grad_norm": 0.5643609576473816, + "learning_rate": 5.239872994198012e-05, + "loss": 11.8676, + "step": 24541 + }, + { + "epoch": 1.3364099841402735, + "grad_norm": 0.4948973319808279, + "learning_rate": 5.23909750121696e-05, + "loss": 11.9777, + "step": 24542 + }, + { + "epoch": 1.3364644381368564, + "grad_norm": 0.5487869439216826, + "learning_rate": 5.238322045257479e-05, + "loss": 12.0445, + "step": 24543 + }, + { + "epoch": 1.3365188921334394, + "grad_norm": 0.5121733353384509, + "learning_rate": 5.237546626325609e-05, + "loss": 11.8944, + "step": 24544 + }, + { + "epoch": 1.3365733461300224, + "grad_norm": 0.5654297358283139, + "learning_rate": 5.2367712444273697e-05, + "loss": 11.9342, + "step": 24545 + }, + { + "epoch": 1.3366278001266054, + "grad_norm": 0.6073104836902603, + "learning_rate": 5.2359958995687973e-05, + "loss": 11.9483, + "step": 24546 + }, + { + "epoch": 1.3366822541231884, + "grad_norm": 0.5533297583586704, + "learning_rate": 5.235220591755925e-05, + "loss": 11.8096, + "step": 24547 + }, + { + "epoch": 1.3367367081197716, + "grad_norm": 0.5467902640031804, + "learning_rate": 5.23444532099477e-05, + "loss": 11.8904, + "step": 24548 + }, + { + "epoch": 1.3367911621163546, + "grad_norm": 0.523885299781789, + "learning_rate": 5.2336700872913724e-05, + "loss": 11.8243, + "step": 24549 + }, + { + "epoch": 1.3368456161129376, + "grad_norm": 0.5399534978432189, + "learning_rate": 5.232894890651756e-05, + "loss": 11.8559, + "step": 24550 + }, + { + "epoch": 1.3369000701095206, + "grad_norm": 0.5301881372557181, + "learning_rate": 5.232119731081944e-05, + "loss": 11.9029, + "step": 24551 + }, + { + "epoch": 1.3369545241061036, + "grad_norm": 0.5376157825698882, + "learning_rate": 5.2313446085879714e-05, + "loss": 11.9368, + "step": 24552 + }, + { + "epoch": 1.3370089781026866, + "grad_norm": 0.602887367693488, + "learning_rate": 5.23056952317586e-05, + "loss": 11.889, + "step": 24553 + }, + { + "epoch": 1.3370634320992696, + "grad_norm": 0.5694006198900216, + "learning_rate": 5.2297944748516436e-05, + "loss": 11.9248, + "step": 24554 + }, + { + "epoch": 1.3371178860958526, + "grad_norm": 0.5864370331877468, + "learning_rate": 5.2290194636213406e-05, + "loss": 11.8373, + "step": 24555 + }, + { + "epoch": 1.3371723400924358, + "grad_norm": 0.656334369441371, + "learning_rate": 5.228244489490983e-05, + "loss": 12.0397, + "step": 24556 + }, + { + "epoch": 1.3372267940890188, + "grad_norm": 0.5181307474081975, + "learning_rate": 5.227469552466602e-05, + "loss": 11.9691, + "step": 24557 + }, + { + "epoch": 1.3372812480856018, + "grad_norm": 0.6116110610278155, + "learning_rate": 5.226694652554211e-05, + "loss": 11.8194, + "step": 24558 + }, + { + "epoch": 1.3373357020821848, + "grad_norm": 0.5458583824834689, + "learning_rate": 5.225919789759853e-05, + "loss": 11.8655, + "step": 24559 + }, + { + "epoch": 1.3373901560787678, + "grad_norm": 0.5168599012507781, + "learning_rate": 5.225144964089536e-05, + "loss": 11.7887, + "step": 24560 + }, + { + "epoch": 1.3374446100753508, + "grad_norm": 0.5947057044254064, + "learning_rate": 5.2243701755492916e-05, + "loss": 12.0628, + "step": 24561 + }, + { + "epoch": 1.3374990640719338, + "grad_norm": 0.5383409990700289, + "learning_rate": 5.223595424145149e-05, + "loss": 11.8655, + "step": 24562 + }, + { + "epoch": 1.3375535180685167, + "grad_norm": 0.5026192462405705, + "learning_rate": 5.222820709883125e-05, + "loss": 11.9324, + "step": 24563 + }, + { + "epoch": 1.3376079720650997, + "grad_norm": 0.5398532142020857, + "learning_rate": 5.2220460327692534e-05, + "loss": 11.8372, + "step": 24564 + }, + { + "epoch": 1.3376624260616827, + "grad_norm": 0.5478062871579805, + "learning_rate": 5.221271392809548e-05, + "loss": 11.8825, + "step": 24565 + }, + { + "epoch": 1.3377168800582657, + "grad_norm": 0.5492161498882498, + "learning_rate": 5.220496790010043e-05, + "loss": 11.8732, + "step": 24566 + }, + { + "epoch": 1.3377713340548487, + "grad_norm": 0.5711176807493772, + "learning_rate": 5.219722224376752e-05, + "loss": 12.0181, + "step": 24567 + }, + { + "epoch": 1.3378257880514317, + "grad_norm": 0.5198765195672812, + "learning_rate": 5.218947695915702e-05, + "loss": 12.0444, + "step": 24568 + }, + { + "epoch": 1.3378802420480147, + "grad_norm": 0.56607808304362, + "learning_rate": 5.2181732046329216e-05, + "loss": 11.9568, + "step": 24569 + }, + { + "epoch": 1.3379346960445977, + "grad_norm": 0.5950939842943866, + "learning_rate": 5.217398750534428e-05, + "loss": 11.8325, + "step": 24570 + }, + { + "epoch": 1.337989150041181, + "grad_norm": 0.5148668023660602, + "learning_rate": 5.216624333626239e-05, + "loss": 11.9365, + "step": 24571 + }, + { + "epoch": 1.338043604037764, + "grad_norm": 0.5832434549780464, + "learning_rate": 5.215849953914386e-05, + "loss": 11.9594, + "step": 24572 + }, + { + "epoch": 1.338098058034347, + "grad_norm": 0.5323926007781992, + "learning_rate": 5.2150756114048814e-05, + "loss": 11.8446, + "step": 24573 + }, + { + "epoch": 1.3381525120309299, + "grad_norm": 0.6715059857263452, + "learning_rate": 5.214301306103756e-05, + "loss": 11.7965, + "step": 24574 + }, + { + "epoch": 1.3382069660275129, + "grad_norm": 0.5972905943475129, + "learning_rate": 5.213527038017022e-05, + "loss": 11.9489, + "step": 24575 + }, + { + "epoch": 1.3382614200240959, + "grad_norm": 0.6272140179190515, + "learning_rate": 5.212752807150708e-05, + "loss": 11.8413, + "step": 24576 + }, + { + "epoch": 1.3383158740206789, + "grad_norm": 0.5213794721947207, + "learning_rate": 5.2119786135108284e-05, + "loss": 11.8303, + "step": 24577 + }, + { + "epoch": 1.3383703280172619, + "grad_norm": 0.6199835343926031, + "learning_rate": 5.211204457103404e-05, + "loss": 11.9136, + "step": 24578 + }, + { + "epoch": 1.3384247820138448, + "grad_norm": 0.6456791515026141, + "learning_rate": 5.2104303379344624e-05, + "loss": 11.9072, + "step": 24579 + }, + { + "epoch": 1.338479236010428, + "grad_norm": 0.49702494056767493, + "learning_rate": 5.209656256010016e-05, + "loss": 11.7111, + "step": 24580 + }, + { + "epoch": 1.338533690007011, + "grad_norm": 0.5736092594306474, + "learning_rate": 5.208882211336088e-05, + "loss": 11.918, + "step": 24581 + }, + { + "epoch": 1.338588144003594, + "grad_norm": 0.5551138604712236, + "learning_rate": 5.2081082039186904e-05, + "loss": 11.782, + "step": 24582 + }, + { + "epoch": 1.338642598000177, + "grad_norm": 0.532812278004759, + "learning_rate": 5.2073342337638476e-05, + "loss": 11.829, + "step": 24583 + }, + { + "epoch": 1.33869705199676, + "grad_norm": 0.6295566813729274, + "learning_rate": 5.206560300877581e-05, + "loss": 11.8954, + "step": 24584 + }, + { + "epoch": 1.338751505993343, + "grad_norm": 0.5912823516384009, + "learning_rate": 5.2057864052659e-05, + "loss": 11.9551, + "step": 24585 + }, + { + "epoch": 1.338805959989926, + "grad_norm": 0.5743786285938248, + "learning_rate": 5.205012546934833e-05, + "loss": 11.971, + "step": 24586 + }, + { + "epoch": 1.338860413986509, + "grad_norm": 0.5508310161518909, + "learning_rate": 5.2042387258903894e-05, + "loss": 12.0477, + "step": 24587 + }, + { + "epoch": 1.338914867983092, + "grad_norm": 0.5812276817649266, + "learning_rate": 5.203464942138594e-05, + "loss": 11.8536, + "step": 24588 + }, + { + "epoch": 1.338969321979675, + "grad_norm": 0.5663361545925268, + "learning_rate": 5.202691195685454e-05, + "loss": 11.8048, + "step": 24589 + }, + { + "epoch": 1.339023775976258, + "grad_norm": 0.5225567662720166, + "learning_rate": 5.201917486536996e-05, + "loss": 11.7475, + "step": 24590 + }, + { + "epoch": 1.339078229972841, + "grad_norm": 0.5407205251730083, + "learning_rate": 5.201143814699233e-05, + "loss": 11.9106, + "step": 24591 + }, + { + "epoch": 1.339132683969424, + "grad_norm": 0.53190134231552, + "learning_rate": 5.200370180178176e-05, + "loss": 11.9709, + "step": 24592 + }, + { + "epoch": 1.339187137966007, + "grad_norm": 0.5828162426608439, + "learning_rate": 5.1995965829798496e-05, + "loss": 11.9461, + "step": 24593 + }, + { + "epoch": 1.3392415919625902, + "grad_norm": 0.5606408360367208, + "learning_rate": 5.198823023110262e-05, + "loss": 11.9083, + "step": 24594 + }, + { + "epoch": 1.3392960459591732, + "grad_norm": 0.5499153178025608, + "learning_rate": 5.19804950057543e-05, + "loss": 12.0599, + "step": 24595 + }, + { + "epoch": 1.3393504999557562, + "grad_norm": 0.6272253214515026, + "learning_rate": 5.197276015381376e-05, + "loss": 12.0533, + "step": 24596 + }, + { + "epoch": 1.3394049539523392, + "grad_norm": 0.5874620400508778, + "learning_rate": 5.196502567534105e-05, + "loss": 11.934, + "step": 24597 + }, + { + "epoch": 1.3394594079489222, + "grad_norm": 0.5537663911405211, + "learning_rate": 5.1957291570396385e-05, + "loss": 11.8273, + "step": 24598 + }, + { + "epoch": 1.3395138619455051, + "grad_norm": 0.5634268633033013, + "learning_rate": 5.194955783903984e-05, + "loss": 11.9272, + "step": 24599 + }, + { + "epoch": 1.3395683159420881, + "grad_norm": 0.5515821492661436, + "learning_rate": 5.1941824481331626e-05, + "loss": 11.9599, + "step": 24600 + }, + { + "epoch": 1.3396227699386711, + "grad_norm": 0.5012151590987814, + "learning_rate": 5.193409149733186e-05, + "loss": 11.928, + "step": 24601 + }, + { + "epoch": 1.3396772239352541, + "grad_norm": 0.5688783355733719, + "learning_rate": 5.1926358887100604e-05, + "loss": 11.8969, + "step": 24602 + }, + { + "epoch": 1.3397316779318373, + "grad_norm": 0.4964984174350957, + "learning_rate": 5.191862665069809e-05, + "loss": 11.7818, + "step": 24603 + }, + { + "epoch": 1.3397861319284203, + "grad_norm": 0.5021339874537967, + "learning_rate": 5.191089478818436e-05, + "loss": 11.96, + "step": 24604 + }, + { + "epoch": 1.3398405859250033, + "grad_norm": 0.5401406439005793, + "learning_rate": 5.1903163299619575e-05, + "loss": 11.8579, + "step": 24605 + }, + { + "epoch": 1.3398950399215863, + "grad_norm": 0.5401028643326528, + "learning_rate": 5.18954321850639e-05, + "loss": 11.9232, + "step": 24606 + }, + { + "epoch": 1.3399494939181693, + "grad_norm": 0.5118698735519258, + "learning_rate": 5.1887701444577376e-05, + "loss": 11.9735, + "step": 24607 + }, + { + "epoch": 1.3400039479147523, + "grad_norm": 0.5270790401729122, + "learning_rate": 5.187997107822018e-05, + "loss": 11.9389, + "step": 24608 + }, + { + "epoch": 1.3400584019113353, + "grad_norm": 0.5570540128271331, + "learning_rate": 5.1872241086052374e-05, + "loss": 11.8259, + "step": 24609 + }, + { + "epoch": 1.3401128559079183, + "grad_norm": 0.5356794580221736, + "learning_rate": 5.186451146813411e-05, + "loss": 12.009, + "step": 24610 + }, + { + "epoch": 1.3401673099045013, + "grad_norm": 0.6214559074683523, + "learning_rate": 5.18567822245255e-05, + "loss": 11.8368, + "step": 24611 + }, + { + "epoch": 1.3402217639010843, + "grad_norm": 0.5544359717823504, + "learning_rate": 5.1849053355286583e-05, + "loss": 11.8034, + "step": 24612 + }, + { + "epoch": 1.3402762178976673, + "grad_norm": 0.5700299153115338, + "learning_rate": 5.184132486047752e-05, + "loss": 11.9758, + "step": 24613 + }, + { + "epoch": 1.3403306718942503, + "grad_norm": 0.5951283432976049, + "learning_rate": 5.1833596740158376e-05, + "loss": 11.8935, + "step": 24614 + }, + { + "epoch": 1.3403851258908333, + "grad_norm": 0.5615266933171653, + "learning_rate": 5.1825868994389304e-05, + "loss": 11.8921, + "step": 24615 + }, + { + "epoch": 1.3404395798874162, + "grad_norm": 0.4976954408349177, + "learning_rate": 5.1818141623230285e-05, + "loss": 11.7621, + "step": 24616 + }, + { + "epoch": 1.3404940338839992, + "grad_norm": 0.5422445813314399, + "learning_rate": 5.18104146267415e-05, + "loss": 11.9329, + "step": 24617 + }, + { + "epoch": 1.3405484878805825, + "grad_norm": 0.580614160854788, + "learning_rate": 5.1802688004983046e-05, + "loss": 12.011, + "step": 24618 + }, + { + "epoch": 1.3406029418771654, + "grad_norm": 0.5532611306209777, + "learning_rate": 5.1794961758014936e-05, + "loss": 11.9552, + "step": 24619 + }, + { + "epoch": 1.3406573958737484, + "grad_norm": 0.5203892391062399, + "learning_rate": 5.178723588589732e-05, + "loss": 11.9495, + "step": 24620 + }, + { + "epoch": 1.3407118498703314, + "grad_norm": 0.6274415942007482, + "learning_rate": 5.177951038869024e-05, + "loss": 11.9656, + "step": 24621 + }, + { + "epoch": 1.3407663038669144, + "grad_norm": 0.5545298279893404, + "learning_rate": 5.177178526645375e-05, + "loss": 11.8779, + "step": 24622 + }, + { + "epoch": 1.3408207578634974, + "grad_norm": 0.5475076096938358, + "learning_rate": 5.176406051924798e-05, + "loss": 11.8929, + "step": 24623 + }, + { + "epoch": 1.3408752118600804, + "grad_norm": 0.5193635150414153, + "learning_rate": 5.175633614713292e-05, + "loss": 11.8949, + "step": 24624 + }, + { + "epoch": 1.3409296658566634, + "grad_norm": 0.5310161217107139, + "learning_rate": 5.174861215016873e-05, + "loss": 11.8812, + "step": 24625 + }, + { + "epoch": 1.3409841198532466, + "grad_norm": 0.5776940149513465, + "learning_rate": 5.174088852841536e-05, + "loss": 12.0292, + "step": 24626 + }, + { + "epoch": 1.3410385738498296, + "grad_norm": 0.5795239369137751, + "learning_rate": 5.173316528193295e-05, + "loss": 11.8541, + "step": 24627 + }, + { + "epoch": 1.3410930278464126, + "grad_norm": 0.6115089736970947, + "learning_rate": 5.172544241078159e-05, + "loss": 11.9109, + "step": 24628 + }, + { + "epoch": 1.3411474818429956, + "grad_norm": 0.5783162807753064, + "learning_rate": 5.171771991502123e-05, + "loss": 11.7701, + "step": 24629 + }, + { + "epoch": 1.3412019358395786, + "grad_norm": 0.546458977271874, + "learning_rate": 5.1709997794712064e-05, + "loss": 11.7898, + "step": 24630 + }, + { + "epoch": 1.3412563898361616, + "grad_norm": 0.5056918842436856, + "learning_rate": 5.170227604991398e-05, + "loss": 11.8176, + "step": 24631 + }, + { + "epoch": 1.3413108438327446, + "grad_norm": 0.47401625281647586, + "learning_rate": 5.169455468068709e-05, + "loss": 11.878, + "step": 24632 + }, + { + "epoch": 1.3413652978293276, + "grad_norm": 0.5009397159064994, + "learning_rate": 5.1686833687091475e-05, + "loss": 11.758, + "step": 24633 + }, + { + "epoch": 1.3414197518259106, + "grad_norm": 0.5628233913388982, + "learning_rate": 5.1679113069187115e-05, + "loss": 12.0128, + "step": 24634 + }, + { + "epoch": 1.3414742058224935, + "grad_norm": 0.498647738732315, + "learning_rate": 5.1671392827034106e-05, + "loss": 11.9058, + "step": 24635 + }, + { + "epoch": 1.3415286598190765, + "grad_norm": 0.49257969311586597, + "learning_rate": 5.166367296069241e-05, + "loss": 11.9904, + "step": 24636 + }, + { + "epoch": 1.3415831138156595, + "grad_norm": 0.5506269605856507, + "learning_rate": 5.1655953470222154e-05, + "loss": 11.857, + "step": 24637 + }, + { + "epoch": 1.3416375678122425, + "grad_norm": 0.559868816327295, + "learning_rate": 5.164823435568327e-05, + "loss": 12.0076, + "step": 24638 + }, + { + "epoch": 1.3416920218088255, + "grad_norm": 0.5731561395593245, + "learning_rate": 5.164051561713581e-05, + "loss": 11.9248, + "step": 24639 + }, + { + "epoch": 1.3417464758054085, + "grad_norm": 0.5540462796840516, + "learning_rate": 5.16327972546399e-05, + "loss": 11.8274, + "step": 24640 + }, + { + "epoch": 1.3418009298019917, + "grad_norm": 0.5320036454013334, + "learning_rate": 5.1625079268255386e-05, + "loss": 11.9924, + "step": 24641 + }, + { + "epoch": 1.3418553837985747, + "grad_norm": 0.5241473322774366, + "learning_rate": 5.1617361658042405e-05, + "loss": 11.8711, + "step": 24642 + }, + { + "epoch": 1.3419098377951577, + "grad_norm": 0.5055339413936103, + "learning_rate": 5.16096444240609e-05, + "loss": 11.9069, + "step": 24643 + }, + { + "epoch": 1.3419642917917407, + "grad_norm": 0.5048298227752752, + "learning_rate": 5.160192756637091e-05, + "loss": 11.9758, + "step": 24644 + }, + { + "epoch": 1.3420187457883237, + "grad_norm": 0.5124961733519724, + "learning_rate": 5.1594211085032484e-05, + "loss": 11.7453, + "step": 24645 + }, + { + "epoch": 1.3420731997849067, + "grad_norm": 0.5360412781099567, + "learning_rate": 5.158649498010557e-05, + "loss": 11.8183, + "step": 24646 + }, + { + "epoch": 1.3421276537814897, + "grad_norm": 0.5217600581621155, + "learning_rate": 5.157877925165021e-05, + "loss": 11.8677, + "step": 24647 + }, + { + "epoch": 1.3421821077780727, + "grad_norm": 0.5520672183749687, + "learning_rate": 5.157106389972635e-05, + "loss": 11.852, + "step": 24648 + }, + { + "epoch": 1.3422365617746557, + "grad_norm": 0.5344003240530755, + "learning_rate": 5.156334892439405e-05, + "loss": 11.978, + "step": 24649 + }, + { + "epoch": 1.3422910157712389, + "grad_norm": 0.5756585787262845, + "learning_rate": 5.1555634325713284e-05, + "loss": 11.8156, + "step": 24650 + }, + { + "epoch": 1.3423454697678219, + "grad_norm": 0.5104893397762217, + "learning_rate": 5.154792010374399e-05, + "loss": 11.8454, + "step": 24651 + }, + { + "epoch": 1.3423999237644049, + "grad_norm": 0.5863480953662468, + "learning_rate": 5.154020625854623e-05, + "loss": 11.9262, + "step": 24652 + }, + { + "epoch": 1.3424543777609879, + "grad_norm": 0.492592261822517, + "learning_rate": 5.153249279017991e-05, + "loss": 11.9287, + "step": 24653 + }, + { + "epoch": 1.3425088317575709, + "grad_norm": 0.5872998914003171, + "learning_rate": 5.152477969870506e-05, + "loss": 11.908, + "step": 24654 + }, + { + "epoch": 1.3425632857541538, + "grad_norm": 0.5605485116233468, + "learning_rate": 5.1517066984181685e-05, + "loss": 11.8801, + "step": 24655 + }, + { + "epoch": 1.3426177397507368, + "grad_norm": 0.5990806711376991, + "learning_rate": 5.1509354646669695e-05, + "loss": 11.9292, + "step": 24656 + }, + { + "epoch": 1.3426721937473198, + "grad_norm": 0.7279063077741952, + "learning_rate": 5.1501642686229126e-05, + "loss": 12.01, + "step": 24657 + }, + { + "epoch": 1.3427266477439028, + "grad_norm": 0.5294510462086448, + "learning_rate": 5.1493931102919893e-05, + "loss": 11.9327, + "step": 24658 + }, + { + "epoch": 1.3427811017404858, + "grad_norm": 0.5012113477080508, + "learning_rate": 5.1486219896802e-05, + "loss": 11.979, + "step": 24659 + }, + { + "epoch": 1.3428355557370688, + "grad_norm": 0.5208283316466333, + "learning_rate": 5.147850906793542e-05, + "loss": 11.8707, + "step": 24660 + }, + { + "epoch": 1.3428900097336518, + "grad_norm": 0.582335091795844, + "learning_rate": 5.1470798616380045e-05, + "loss": 12.0025, + "step": 24661 + }, + { + "epoch": 1.3429444637302348, + "grad_norm": 0.6343594526751137, + "learning_rate": 5.146308854219591e-05, + "loss": 12.1038, + "step": 24662 + }, + { + "epoch": 1.3429989177268178, + "grad_norm": 0.5244064185971522, + "learning_rate": 5.145537884544291e-05, + "loss": 11.9433, + "step": 24663 + }, + { + "epoch": 1.343053371723401, + "grad_norm": 0.7054565994001688, + "learning_rate": 5.1447669526181055e-05, + "loss": 12.0494, + "step": 24664 + }, + { + "epoch": 1.343107825719984, + "grad_norm": 0.570649830549098, + "learning_rate": 5.143996058447023e-05, + "loss": 11.9568, + "step": 24665 + }, + { + "epoch": 1.343162279716567, + "grad_norm": 0.6754946316357943, + "learning_rate": 5.143225202037042e-05, + "loss": 12.0417, + "step": 24666 + }, + { + "epoch": 1.34321673371315, + "grad_norm": 0.5069186991077531, + "learning_rate": 5.14245438339416e-05, + "loss": 11.9761, + "step": 24667 + }, + { + "epoch": 1.343271187709733, + "grad_norm": 0.5415640747473932, + "learning_rate": 5.141683602524363e-05, + "loss": 11.8968, + "step": 24668 + }, + { + "epoch": 1.343325641706316, + "grad_norm": 0.5194004817624808, + "learning_rate": 5.140912859433653e-05, + "loss": 11.9379, + "step": 24669 + }, + { + "epoch": 1.343380095702899, + "grad_norm": 0.5227581353768905, + "learning_rate": 5.140142154128018e-05, + "loss": 11.8972, + "step": 24670 + }, + { + "epoch": 1.343434549699482, + "grad_norm": 0.6213626815070582, + "learning_rate": 5.139371486613451e-05, + "loss": 11.9509, + "step": 24671 + }, + { + "epoch": 1.343489003696065, + "grad_norm": 0.5427038923186286, + "learning_rate": 5.13860085689595e-05, + "loss": 11.887, + "step": 24672 + }, + { + "epoch": 1.3435434576926482, + "grad_norm": 0.536390506451965, + "learning_rate": 5.1378302649814994e-05, + "loss": 11.7878, + "step": 24673 + }, + { + "epoch": 1.3435979116892312, + "grad_norm": 0.5814214936671521, + "learning_rate": 5.1370597108761e-05, + "loss": 12.0254, + "step": 24674 + }, + { + "epoch": 1.3436523656858141, + "grad_norm": 0.6034499203431405, + "learning_rate": 5.136289194585736e-05, + "loss": 12.001, + "step": 24675 + }, + { + "epoch": 1.3437068196823971, + "grad_norm": 0.5879422515014885, + "learning_rate": 5.1355187161164034e-05, + "loss": 11.9203, + "step": 24676 + }, + { + "epoch": 1.3437612736789801, + "grad_norm": 0.5096279975774792, + "learning_rate": 5.134748275474096e-05, + "loss": 11.9085, + "step": 24677 + }, + { + "epoch": 1.3438157276755631, + "grad_norm": 0.5210584951397081, + "learning_rate": 5.133977872664799e-05, + "loss": 11.9083, + "step": 24678 + }, + { + "epoch": 1.3438701816721461, + "grad_norm": 0.5695970056479077, + "learning_rate": 5.1332075076945084e-05, + "loss": 11.7609, + "step": 24679 + }, + { + "epoch": 1.343924635668729, + "grad_norm": 0.5433632179172042, + "learning_rate": 5.132437180569214e-05, + "loss": 11.8395, + "step": 24680 + }, + { + "epoch": 1.343979089665312, + "grad_norm": 0.5408495754745256, + "learning_rate": 5.131666891294899e-05, + "loss": 11.8961, + "step": 24681 + }, + { + "epoch": 1.344033543661895, + "grad_norm": 0.5367184083183594, + "learning_rate": 5.130896639877565e-05, + "loss": 11.92, + "step": 24682 + }, + { + "epoch": 1.344087997658478, + "grad_norm": 0.6222809867914364, + "learning_rate": 5.13012642632319e-05, + "loss": 11.8826, + "step": 24683 + }, + { + "epoch": 1.344142451655061, + "grad_norm": 0.5244961821012735, + "learning_rate": 5.129356250637771e-05, + "loss": 11.989, + "step": 24684 + }, + { + "epoch": 1.344196905651644, + "grad_norm": 0.548225762241452, + "learning_rate": 5.1285861128272924e-05, + "loss": 11.8568, + "step": 24685 + }, + { + "epoch": 1.344251359648227, + "grad_norm": 0.5935253842941441, + "learning_rate": 5.1278160128977494e-05, + "loss": 11.9319, + "step": 24686 + }, + { + "epoch": 1.3443058136448103, + "grad_norm": 0.5671982583908727, + "learning_rate": 5.1270459508551207e-05, + "loss": 11.9201, + "step": 24687 + }, + { + "epoch": 1.3443602676413933, + "grad_norm": 0.5610219671353242, + "learning_rate": 5.126275926705402e-05, + "loss": 11.9223, + "step": 24688 + }, + { + "epoch": 1.3444147216379763, + "grad_norm": 0.5069111973494803, + "learning_rate": 5.12550594045458e-05, + "loss": 11.8724, + "step": 24689 + }, + { + "epoch": 1.3444691756345593, + "grad_norm": 0.5620002209813412, + "learning_rate": 5.1247359921086444e-05, + "loss": 12.0272, + "step": 24690 + }, + { + "epoch": 1.3445236296311422, + "grad_norm": 0.6137102404689122, + "learning_rate": 5.1239660816735765e-05, + "loss": 12.0232, + "step": 24691 + }, + { + "epoch": 1.3445780836277252, + "grad_norm": 0.5901212641170603, + "learning_rate": 5.123196209155364e-05, + "loss": 11.9088, + "step": 24692 + }, + { + "epoch": 1.3446325376243082, + "grad_norm": 0.5769018309674903, + "learning_rate": 5.1224263745599955e-05, + "loss": 11.8818, + "step": 24693 + }, + { + "epoch": 1.3446869916208912, + "grad_norm": 0.5005582799986129, + "learning_rate": 5.121656577893461e-05, + "loss": 11.7601, + "step": 24694 + }, + { + "epoch": 1.3447414456174742, + "grad_norm": 0.5932355413713493, + "learning_rate": 5.120886819161739e-05, + "loss": 12.0082, + "step": 24695 + }, + { + "epoch": 1.3447958996140574, + "grad_norm": 0.5305838490879435, + "learning_rate": 5.120117098370824e-05, + "loss": 12.0842, + "step": 24696 + }, + { + "epoch": 1.3448503536106404, + "grad_norm": 0.5598975906879371, + "learning_rate": 5.119347415526692e-05, + "loss": 11.8764, + "step": 24697 + }, + { + "epoch": 1.3449048076072234, + "grad_norm": 0.5366234881540021, + "learning_rate": 5.118577770635337e-05, + "loss": 11.9488, + "step": 24698 + }, + { + "epoch": 1.3449592616038064, + "grad_norm": 0.5326388231089194, + "learning_rate": 5.1178081637027356e-05, + "loss": 11.8126, + "step": 24699 + }, + { + "epoch": 1.3450137156003894, + "grad_norm": 0.5264614507788572, + "learning_rate": 5.117038594734882e-05, + "loss": 11.9151, + "step": 24700 + }, + { + "epoch": 1.3450681695969724, + "grad_norm": 0.5568658150801595, + "learning_rate": 5.116269063737753e-05, + "loss": 12.0364, + "step": 24701 + }, + { + "epoch": 1.3451226235935554, + "grad_norm": 0.5426277076062659, + "learning_rate": 5.115499570717333e-05, + "loss": 11.7796, + "step": 24702 + }, + { + "epoch": 1.3451770775901384, + "grad_norm": 0.5405251899144996, + "learning_rate": 5.1147301156796065e-05, + "loss": 12.0124, + "step": 24703 + }, + { + "epoch": 1.3452315315867214, + "grad_norm": 0.5650397200967617, + "learning_rate": 5.113960698630562e-05, + "loss": 11.94, + "step": 24704 + }, + { + "epoch": 1.3452859855833044, + "grad_norm": 0.6118572302047037, + "learning_rate": 5.113191319576174e-05, + "loss": 11.9214, + "step": 24705 + }, + { + "epoch": 1.3453404395798874, + "grad_norm": 0.5286389829934046, + "learning_rate": 5.112421978522435e-05, + "loss": 11.743, + "step": 24706 + }, + { + "epoch": 1.3453948935764704, + "grad_norm": 0.5937446151677802, + "learning_rate": 5.111652675475317e-05, + "loss": 11.9297, + "step": 24707 + }, + { + "epoch": 1.3454493475730533, + "grad_norm": 0.540039344359391, + "learning_rate": 5.110883410440812e-05, + "loss": 11.9706, + "step": 24708 + }, + { + "epoch": 1.3455038015696363, + "grad_norm": 0.5318002732872339, + "learning_rate": 5.110114183424893e-05, + "loss": 11.8151, + "step": 24709 + }, + { + "epoch": 1.3455582555662193, + "grad_norm": 0.5701680828712842, + "learning_rate": 5.109344994433547e-05, + "loss": 11.9632, + "step": 24710 + }, + { + "epoch": 1.3456127095628025, + "grad_norm": 0.5322747869200056, + "learning_rate": 5.108575843472763e-05, + "loss": 11.953, + "step": 24711 + }, + { + "epoch": 1.3456671635593855, + "grad_norm": 0.6386986840153218, + "learning_rate": 5.107806730548503e-05, + "loss": 11.9134, + "step": 24712 + }, + { + "epoch": 1.3457216175559685, + "grad_norm": 0.5787294346188318, + "learning_rate": 5.107037655666765e-05, + "loss": 11.8885, + "step": 24713 + }, + { + "epoch": 1.3457760715525515, + "grad_norm": 0.48484950700769913, + "learning_rate": 5.1062686188335184e-05, + "loss": 11.911, + "step": 24714 + }, + { + "epoch": 1.3458305255491345, + "grad_norm": 0.560911470456335, + "learning_rate": 5.1054996200547465e-05, + "loss": 11.9997, + "step": 24715 + }, + { + "epoch": 1.3458849795457175, + "grad_norm": 0.5227067169613693, + "learning_rate": 5.104730659336435e-05, + "loss": 11.8588, + "step": 24716 + }, + { + "epoch": 1.3459394335423005, + "grad_norm": 0.5328178749192024, + "learning_rate": 5.103961736684555e-05, + "loss": 11.8337, + "step": 24717 + }, + { + "epoch": 1.3459938875388835, + "grad_norm": 0.574303286229096, + "learning_rate": 5.1031928521050933e-05, + "loss": 11.9587, + "step": 24718 + }, + { + "epoch": 1.3460483415354665, + "grad_norm": 0.5469256303813594, + "learning_rate": 5.102424005604021e-05, + "loss": 11.8126, + "step": 24719 + }, + { + "epoch": 1.3461027955320497, + "grad_norm": 0.5502648367970351, + "learning_rate": 5.101655197187325e-05, + "loss": 11.8713, + "step": 24720 + }, + { + "epoch": 1.3461572495286327, + "grad_norm": 0.5825838781402476, + "learning_rate": 5.10088642686098e-05, + "loss": 11.9476, + "step": 24721 + }, + { + "epoch": 1.3462117035252157, + "grad_norm": 0.5312492793536634, + "learning_rate": 5.100117694630959e-05, + "loss": 11.9215, + "step": 24722 + }, + { + "epoch": 1.3462661575217987, + "grad_norm": 0.5925804114597782, + "learning_rate": 5.0993490005032506e-05, + "loss": 11.9656, + "step": 24723 + }, + { + "epoch": 1.3463206115183817, + "grad_norm": 0.597512351118672, + "learning_rate": 5.098580344483821e-05, + "loss": 11.857, + "step": 24724 + }, + { + "epoch": 1.3463750655149647, + "grad_norm": 0.5527618332523438, + "learning_rate": 5.097811726578652e-05, + "loss": 12.0091, + "step": 24725 + }, + { + "epoch": 1.3464295195115477, + "grad_norm": 0.5815921715331781, + "learning_rate": 5.0970431467937254e-05, + "loss": 11.9153, + "step": 24726 + }, + { + "epoch": 1.3464839735081306, + "grad_norm": 0.49469894164678824, + "learning_rate": 5.09627460513501e-05, + "loss": 11.874, + "step": 24727 + }, + { + "epoch": 1.3465384275047136, + "grad_norm": 0.5202688863510311, + "learning_rate": 5.095506101608488e-05, + "loss": 11.915, + "step": 24728 + }, + { + "epoch": 1.3465928815012966, + "grad_norm": 0.5437772142738702, + "learning_rate": 5.0947376362201304e-05, + "loss": 11.7232, + "step": 24729 + }, + { + "epoch": 1.3466473354978796, + "grad_norm": 0.538512507411147, + "learning_rate": 5.0939692089759195e-05, + "loss": 11.9361, + "step": 24730 + }, + { + "epoch": 1.3467017894944626, + "grad_norm": 0.5852451636107295, + "learning_rate": 5.093200819881827e-05, + "loss": 11.8578, + "step": 24731 + }, + { + "epoch": 1.3467562434910456, + "grad_norm": 0.5744333870393465, + "learning_rate": 5.092432468943823e-05, + "loss": 11.9063, + "step": 24732 + }, + { + "epoch": 1.3468106974876286, + "grad_norm": 0.5660255027197925, + "learning_rate": 5.091664156167892e-05, + "loss": 11.9843, + "step": 24733 + }, + { + "epoch": 1.3468651514842118, + "grad_norm": 0.8385179784822178, + "learning_rate": 5.090895881559998e-05, + "loss": 11.9528, + "step": 24734 + }, + { + "epoch": 1.3469196054807948, + "grad_norm": 0.5644388973902699, + "learning_rate": 5.090127645126127e-05, + "loss": 11.9393, + "step": 24735 + }, + { + "epoch": 1.3469740594773778, + "grad_norm": 0.5061500219352385, + "learning_rate": 5.089359446872242e-05, + "loss": 11.8778, + "step": 24736 + }, + { + "epoch": 1.3470285134739608, + "grad_norm": 0.6064606475805742, + "learning_rate": 5.088591286804322e-05, + "loss": 12.0032, + "step": 24737 + }, + { + "epoch": 1.3470829674705438, + "grad_norm": 0.6085706595767844, + "learning_rate": 5.0878231649283424e-05, + "loss": 12.0709, + "step": 24738 + }, + { + "epoch": 1.3471374214671268, + "grad_norm": 0.5112846789243923, + "learning_rate": 5.0870550812502704e-05, + "loss": 11.8724, + "step": 24739 + }, + { + "epoch": 1.3471918754637098, + "grad_norm": 0.5642057804212833, + "learning_rate": 5.086287035776086e-05, + "loss": 11.9766, + "step": 24740 + }, + { + "epoch": 1.3472463294602928, + "grad_norm": 0.538498722557541, + "learning_rate": 5.0855190285117585e-05, + "loss": 11.9649, + "step": 24741 + }, + { + "epoch": 1.3473007834568758, + "grad_norm": 0.5167847240960812, + "learning_rate": 5.084751059463253e-05, + "loss": 11.85, + "step": 24742 + }, + { + "epoch": 1.347355237453459, + "grad_norm": 0.5688071269020584, + "learning_rate": 5.0839831286365535e-05, + "loss": 11.9503, + "step": 24743 + }, + { + "epoch": 1.347409691450042, + "grad_norm": 0.6078205857455574, + "learning_rate": 5.083215236037622e-05, + "loss": 11.8844, + "step": 24744 + }, + { + "epoch": 1.347464145446625, + "grad_norm": 0.5504748636150579, + "learning_rate": 5.082447381672435e-05, + "loss": 11.9087, + "step": 24745 + }, + { + "epoch": 1.347518599443208, + "grad_norm": 0.556827012989121, + "learning_rate": 5.081679565546959e-05, + "loss": 11.8018, + "step": 24746 + }, + { + "epoch": 1.347573053439791, + "grad_norm": 0.5326846761653032, + "learning_rate": 5.080911787667173e-05, + "loss": 11.8898, + "step": 24747 + }, + { + "epoch": 1.347627507436374, + "grad_norm": 0.4771338407625155, + "learning_rate": 5.080144048039036e-05, + "loss": 11.9173, + "step": 24748 + }, + { + "epoch": 1.347681961432957, + "grad_norm": 0.5514342083191643, + "learning_rate": 5.079376346668525e-05, + "loss": 11.9127, + "step": 24749 + }, + { + "epoch": 1.34773641542954, + "grad_norm": 0.49419029608704157, + "learning_rate": 5.078608683561612e-05, + "loss": 11.8876, + "step": 24750 + }, + { + "epoch": 1.347790869426123, + "grad_norm": 0.5668823733875155, + "learning_rate": 5.077841058724263e-05, + "loss": 11.8694, + "step": 24751 + }, + { + "epoch": 1.347845323422706, + "grad_norm": 0.532912246970959, + "learning_rate": 5.077073472162444e-05, + "loss": 12.0189, + "step": 24752 + }, + { + "epoch": 1.347899777419289, + "grad_norm": 0.5671877456159174, + "learning_rate": 5.076305923882131e-05, + "loss": 11.974, + "step": 24753 + }, + { + "epoch": 1.347954231415872, + "grad_norm": 0.5616592638243587, + "learning_rate": 5.0755384138892846e-05, + "loss": 11.8129, + "step": 24754 + }, + { + "epoch": 1.3480086854124549, + "grad_norm": 0.570931293347498, + "learning_rate": 5.074770942189882e-05, + "loss": 11.9228, + "step": 24755 + }, + { + "epoch": 1.3480631394090379, + "grad_norm": 0.5067208853009946, + "learning_rate": 5.074003508789882e-05, + "loss": 11.9231, + "step": 24756 + }, + { + "epoch": 1.348117593405621, + "grad_norm": 0.48252121944302095, + "learning_rate": 5.073236113695261e-05, + "loss": 11.7249, + "step": 24757 + }, + { + "epoch": 1.348172047402204, + "grad_norm": 0.517549680877548, + "learning_rate": 5.072468756911979e-05, + "loss": 11.9628, + "step": 24758 + }, + { + "epoch": 1.348226501398787, + "grad_norm": 0.6033185712884095, + "learning_rate": 5.0717014384460036e-05, + "loss": 11.9381, + "step": 24759 + }, + { + "epoch": 1.34828095539537, + "grad_norm": 0.5521904788131673, + "learning_rate": 5.070934158303311e-05, + "loss": 11.8499, + "step": 24760 + }, + { + "epoch": 1.348335409391953, + "grad_norm": 0.5511601673095677, + "learning_rate": 5.0701669164898603e-05, + "loss": 11.8565, + "step": 24761 + }, + { + "epoch": 1.348389863388536, + "grad_norm": 0.5607754924702814, + "learning_rate": 5.069399713011618e-05, + "loss": 11.8313, + "step": 24762 + }, + { + "epoch": 1.348444317385119, + "grad_norm": 0.5987504656166703, + "learning_rate": 5.0686325478745466e-05, + "loss": 11.9435, + "step": 24763 + }, + { + "epoch": 1.348498771381702, + "grad_norm": 0.5803073742557056, + "learning_rate": 5.067865421084615e-05, + "loss": 12.02, + "step": 24764 + }, + { + "epoch": 1.348553225378285, + "grad_norm": 0.5411527019282852, + "learning_rate": 5.0670983326477936e-05, + "loss": 12.0168, + "step": 24765 + }, + { + "epoch": 1.3486076793748683, + "grad_norm": 0.5220430512587575, + "learning_rate": 5.0663312825700395e-05, + "loss": 11.9947, + "step": 24766 + }, + { + "epoch": 1.3486621333714512, + "grad_norm": 0.5189351798939571, + "learning_rate": 5.065564270857323e-05, + "loss": 11.868, + "step": 24767 + }, + { + "epoch": 1.3487165873680342, + "grad_norm": 0.6134265449723086, + "learning_rate": 5.064797297515604e-05, + "loss": 12.0574, + "step": 24768 + }, + { + "epoch": 1.3487710413646172, + "grad_norm": 0.5799543714779022, + "learning_rate": 5.0640303625508514e-05, + "loss": 11.7574, + "step": 24769 + }, + { + "epoch": 1.3488254953612002, + "grad_norm": 0.47761785916124516, + "learning_rate": 5.063263465969024e-05, + "loss": 11.93, + "step": 24770 + }, + { + "epoch": 1.3488799493577832, + "grad_norm": 0.532400116807748, + "learning_rate": 5.06249660777609e-05, + "loss": 11.9961, + "step": 24771 + }, + { + "epoch": 1.3489344033543662, + "grad_norm": 0.5422298339313437, + "learning_rate": 5.0617297879780114e-05, + "loss": 11.8649, + "step": 24772 + }, + { + "epoch": 1.3489888573509492, + "grad_norm": 0.5578831490030823, + "learning_rate": 5.060963006580746e-05, + "loss": 11.8773, + "step": 24773 + }, + { + "epoch": 1.3490433113475322, + "grad_norm": 0.5969063133631078, + "learning_rate": 5.06019626359026e-05, + "loss": 12.0546, + "step": 24774 + }, + { + "epoch": 1.3490977653441152, + "grad_norm": 0.5065166455132252, + "learning_rate": 5.059429559012521e-05, + "loss": 11.8178, + "step": 24775 + }, + { + "epoch": 1.3491522193406982, + "grad_norm": 0.531827767695221, + "learning_rate": 5.058662892853483e-05, + "loss": 12.0171, + "step": 24776 + }, + { + "epoch": 1.3492066733372812, + "grad_norm": 0.502000655939235, + "learning_rate": 5.057896265119113e-05, + "loss": 11.8882, + "step": 24777 + }, + { + "epoch": 1.3492611273338642, + "grad_norm": 0.5363887226520877, + "learning_rate": 5.057129675815368e-05, + "loss": 11.7719, + "step": 24778 + }, + { + "epoch": 1.3493155813304472, + "grad_norm": 0.5046527835178513, + "learning_rate": 5.056363124948216e-05, + "loss": 11.9025, + "step": 24779 + }, + { + "epoch": 1.3493700353270301, + "grad_norm": 0.5511538817038009, + "learning_rate": 5.055596612523607e-05, + "loss": 11.9255, + "step": 24780 + }, + { + "epoch": 1.3494244893236134, + "grad_norm": 0.508313966078211, + "learning_rate": 5.054830138547514e-05, + "loss": 11.8887, + "step": 24781 + }, + { + "epoch": 1.3494789433201964, + "grad_norm": 0.5247460253492393, + "learning_rate": 5.054063703025891e-05, + "loss": 11.8504, + "step": 24782 + }, + { + "epoch": 1.3495333973167793, + "grad_norm": 0.5344680484674968, + "learning_rate": 5.053297305964693e-05, + "loss": 11.851, + "step": 24783 + }, + { + "epoch": 1.3495878513133623, + "grad_norm": 0.5471176781384924, + "learning_rate": 5.0525309473698887e-05, + "loss": 11.8643, + "step": 24784 + }, + { + "epoch": 1.3496423053099453, + "grad_norm": 0.5463936967650441, + "learning_rate": 5.051764627247431e-05, + "loss": 12.0349, + "step": 24785 + }, + { + "epoch": 1.3496967593065283, + "grad_norm": 0.526339112385976, + "learning_rate": 5.05099834560328e-05, + "loss": 11.8445, + "step": 24786 + }, + { + "epoch": 1.3497512133031113, + "grad_norm": 0.5630772598984861, + "learning_rate": 5.0502321024434e-05, + "loss": 11.9418, + "step": 24787 + }, + { + "epoch": 1.3498056672996943, + "grad_norm": 0.551988855827672, + "learning_rate": 5.049465897773742e-05, + "loss": 11.9497, + "step": 24788 + }, + { + "epoch": 1.3498601212962775, + "grad_norm": 0.5761189634907042, + "learning_rate": 5.04869973160027e-05, + "loss": 11.9864, + "step": 24789 + }, + { + "epoch": 1.3499145752928605, + "grad_norm": 0.5447156986558133, + "learning_rate": 5.047933603928936e-05, + "loss": 11.9061, + "step": 24790 + }, + { + "epoch": 1.3499690292894435, + "grad_norm": 0.5088697949684696, + "learning_rate": 5.047167514765705e-05, + "loss": 11.9659, + "step": 24791 + }, + { + "epoch": 1.3500234832860265, + "grad_norm": 0.5166478617367919, + "learning_rate": 5.0464014641165304e-05, + "loss": 11.9017, + "step": 24792 + }, + { + "epoch": 1.3500779372826095, + "grad_norm": 0.49868874735038105, + "learning_rate": 5.045635451987365e-05, + "loss": 11.9322, + "step": 24793 + }, + { + "epoch": 1.3501323912791925, + "grad_norm": 0.5614526172360104, + "learning_rate": 5.044869478384171e-05, + "loss": 12.0084, + "step": 24794 + }, + { + "epoch": 1.3501868452757755, + "grad_norm": 0.5314844398684915, + "learning_rate": 5.044103543312901e-05, + "loss": 11.7961, + "step": 24795 + }, + { + "epoch": 1.3502412992723585, + "grad_norm": 0.5552772230831415, + "learning_rate": 5.043337646779516e-05, + "loss": 11.9476, + "step": 24796 + }, + { + "epoch": 1.3502957532689415, + "grad_norm": 0.5023316306806409, + "learning_rate": 5.042571788789965e-05, + "loss": 11.8229, + "step": 24797 + }, + { + "epoch": 1.3503502072655245, + "grad_norm": 0.5011525564571289, + "learning_rate": 5.0418059693502065e-05, + "loss": 11.8788, + "step": 24798 + }, + { + "epoch": 1.3504046612621075, + "grad_norm": 0.5200250815765821, + "learning_rate": 5.041040188466202e-05, + "loss": 11.9296, + "step": 24799 + }, + { + "epoch": 1.3504591152586904, + "grad_norm": 0.6615102104803817, + "learning_rate": 5.040274446143896e-05, + "loss": 12.0247, + "step": 24800 + }, + { + "epoch": 1.3505135692552734, + "grad_norm": 0.595575885434631, + "learning_rate": 5.0395087423892494e-05, + "loss": 11.8824, + "step": 24801 + }, + { + "epoch": 1.3505680232518564, + "grad_norm": 0.5113084742992144, + "learning_rate": 5.0387430772082166e-05, + "loss": 11.819, + "step": 24802 + }, + { + "epoch": 1.3506224772484394, + "grad_norm": 0.5640317571968053, + "learning_rate": 5.0379774506067455e-05, + "loss": 11.7749, + "step": 24803 + }, + { + "epoch": 1.3506769312450226, + "grad_norm": 0.661235023433339, + "learning_rate": 5.0372118625907984e-05, + "loss": 12.0037, + "step": 24804 + }, + { + "epoch": 1.3507313852416056, + "grad_norm": 0.5357440654739187, + "learning_rate": 5.036446313166319e-05, + "loss": 11.9756, + "step": 24805 + }, + { + "epoch": 1.3507858392381886, + "grad_norm": 0.5308581111306975, + "learning_rate": 5.03568080233927e-05, + "loss": 11.8394, + "step": 24806 + }, + { + "epoch": 1.3508402932347716, + "grad_norm": 0.4946856917540988, + "learning_rate": 5.034915330115595e-05, + "loss": 11.8253, + "step": 24807 + }, + { + "epoch": 1.3508947472313546, + "grad_norm": 0.5021633837792272, + "learning_rate": 5.034149896501252e-05, + "loss": 11.9361, + "step": 24808 + }, + { + "epoch": 1.3509492012279376, + "grad_norm": 0.5608247897623255, + "learning_rate": 5.0333845015021966e-05, + "loss": 11.9191, + "step": 24809 + }, + { + "epoch": 1.3510036552245206, + "grad_norm": 0.5198315437772559, + "learning_rate": 5.032619145124371e-05, + "loss": 11.9982, + "step": 24810 + }, + { + "epoch": 1.3510581092211036, + "grad_norm": 0.6440230274453169, + "learning_rate": 5.0318538273737406e-05, + "loss": 11.9214, + "step": 24811 + }, + { + "epoch": 1.3511125632176866, + "grad_norm": 0.5506894777910234, + "learning_rate": 5.031088548256239e-05, + "loss": 11.9466, + "step": 24812 + }, + { + "epoch": 1.3511670172142698, + "grad_norm": 0.5293822800229081, + "learning_rate": 5.0303233077778265e-05, + "loss": 11.9223, + "step": 24813 + }, + { + "epoch": 1.3512214712108528, + "grad_norm": 0.5591197450370005, + "learning_rate": 5.0295581059444584e-05, + "loss": 11.8849, + "step": 24814 + }, + { + "epoch": 1.3512759252074358, + "grad_norm": 0.5050360193004697, + "learning_rate": 5.028792942762075e-05, + "loss": 11.8091, + "step": 24815 + }, + { + "epoch": 1.3513303792040188, + "grad_norm": 0.49635674747590736, + "learning_rate": 5.0280278182366356e-05, + "loss": 11.9033, + "step": 24816 + }, + { + "epoch": 1.3513848332006018, + "grad_norm": 0.5362261060313038, + "learning_rate": 5.0272627323740815e-05, + "loss": 11.9254, + "step": 24817 + }, + { + "epoch": 1.3514392871971848, + "grad_norm": 0.5232223570308987, + "learning_rate": 5.0264976851803716e-05, + "loss": 11.8598, + "step": 24818 + }, + { + "epoch": 1.3514937411937678, + "grad_norm": 0.5556135801606398, + "learning_rate": 5.025732676661444e-05, + "loss": 11.9191, + "step": 24819 + }, + { + "epoch": 1.3515481951903507, + "grad_norm": 0.5240704904942667, + "learning_rate": 5.0249677068232534e-05, + "loss": 11.9336, + "step": 24820 + }, + { + "epoch": 1.3516026491869337, + "grad_norm": 0.5573539217851793, + "learning_rate": 5.024202775671758e-05, + "loss": 11.9374, + "step": 24821 + }, + { + "epoch": 1.3516571031835167, + "grad_norm": 0.5501697310148302, + "learning_rate": 5.023437883212887e-05, + "loss": 11.9706, + "step": 24822 + }, + { + "epoch": 1.3517115571800997, + "grad_norm": 0.5063270064138293, + "learning_rate": 5.022673029452598e-05, + "loss": 11.9344, + "step": 24823 + }, + { + "epoch": 1.3517660111766827, + "grad_norm": 0.49663763397517213, + "learning_rate": 5.021908214396842e-05, + "loss": 11.9286, + "step": 24824 + }, + { + "epoch": 1.3518204651732657, + "grad_norm": 0.670876281443096, + "learning_rate": 5.021143438051559e-05, + "loss": 11.8892, + "step": 24825 + }, + { + "epoch": 1.3518749191698487, + "grad_norm": 0.5606586921665635, + "learning_rate": 5.020378700422702e-05, + "loss": 12.0507, + "step": 24826 + }, + { + "epoch": 1.351929373166432, + "grad_norm": 0.5107979228761818, + "learning_rate": 5.019614001516213e-05, + "loss": 11.947, + "step": 24827 + }, + { + "epoch": 1.351983827163015, + "grad_norm": 0.5142782902402404, + "learning_rate": 5.018849341338043e-05, + "loss": 11.9033, + "step": 24828 + }, + { + "epoch": 1.352038281159598, + "grad_norm": 0.5959443943528853, + "learning_rate": 5.018084719894134e-05, + "loss": 11.8944, + "step": 24829 + }, + { + "epoch": 1.352092735156181, + "grad_norm": 0.5487928972675663, + "learning_rate": 5.0173201371904336e-05, + "loss": 11.912, + "step": 24830 + }, + { + "epoch": 1.3521471891527639, + "grad_norm": 0.5716065146438394, + "learning_rate": 5.016555593232893e-05, + "loss": 11.9627, + "step": 24831 + }, + { + "epoch": 1.3522016431493469, + "grad_norm": 0.5838625403119779, + "learning_rate": 5.015791088027446e-05, + "loss": 11.8686, + "step": 24832 + }, + { + "epoch": 1.3522560971459299, + "grad_norm": 0.5147912815954611, + "learning_rate": 5.0150266215800454e-05, + "loss": 11.9449, + "step": 24833 + }, + { + "epoch": 1.3523105511425129, + "grad_norm": 0.5475568976877669, + "learning_rate": 5.014262193896632e-05, + "loss": 11.887, + "step": 24834 + }, + { + "epoch": 1.3523650051390959, + "grad_norm": 0.5511623497369028, + "learning_rate": 5.013497804983152e-05, + "loss": 11.9286, + "step": 24835 + }, + { + "epoch": 1.352419459135679, + "grad_norm": 0.5407169117373076, + "learning_rate": 5.0127334548455504e-05, + "loss": 11.8254, + "step": 24836 + }, + { + "epoch": 1.352473913132262, + "grad_norm": 0.5617974496305375, + "learning_rate": 5.0119691434897685e-05, + "loss": 11.8755, + "step": 24837 + }, + { + "epoch": 1.352528367128845, + "grad_norm": 0.5490719035581467, + "learning_rate": 5.011204870921754e-05, + "loss": 11.8921, + "step": 24838 + }, + { + "epoch": 1.352582821125428, + "grad_norm": 0.5316405625088215, + "learning_rate": 5.0104406371474435e-05, + "loss": 11.9682, + "step": 24839 + }, + { + "epoch": 1.352637275122011, + "grad_norm": 0.5493005504319712, + "learning_rate": 5.009676442172787e-05, + "loss": 11.8559, + "step": 24840 + }, + { + "epoch": 1.352691729118594, + "grad_norm": 0.5042634568948384, + "learning_rate": 5.008912286003723e-05, + "loss": 11.8346, + "step": 24841 + }, + { + "epoch": 1.352746183115177, + "grad_norm": 0.6517204678807134, + "learning_rate": 5.008148168646191e-05, + "loss": 11.9003, + "step": 24842 + }, + { + "epoch": 1.35280063711176, + "grad_norm": 0.5750013340571146, + "learning_rate": 5.0073840901061395e-05, + "loss": 11.9398, + "step": 24843 + }, + { + "epoch": 1.352855091108343, + "grad_norm": 0.5844993004803447, + "learning_rate": 5.006620050389502e-05, + "loss": 11.9265, + "step": 24844 + }, + { + "epoch": 1.352909545104926, + "grad_norm": 0.5277049449750013, + "learning_rate": 5.005856049502229e-05, + "loss": 11.9073, + "step": 24845 + }, + { + "epoch": 1.352963999101509, + "grad_norm": 0.4876604087834955, + "learning_rate": 5.005092087450254e-05, + "loss": 11.8304, + "step": 24846 + }, + { + "epoch": 1.353018453098092, + "grad_norm": 0.5619475806379481, + "learning_rate": 5.004328164239518e-05, + "loss": 11.9532, + "step": 24847 + }, + { + "epoch": 1.353072907094675, + "grad_norm": 0.53215760304349, + "learning_rate": 5.003564279875969e-05, + "loss": 11.8062, + "step": 24848 + }, + { + "epoch": 1.353127361091258, + "grad_norm": 0.5662691808049553, + "learning_rate": 5.00280043436554e-05, + "loss": 11.9444, + "step": 24849 + }, + { + "epoch": 1.353181815087841, + "grad_norm": 0.5046826535678688, + "learning_rate": 5.0020366277141754e-05, + "loss": 11.8574, + "step": 24850 + }, + { + "epoch": 1.3532362690844242, + "grad_norm": 0.5465399341865931, + "learning_rate": 5.0012728599278114e-05, + "loss": 11.9395, + "step": 24851 + }, + { + "epoch": 1.3532907230810072, + "grad_norm": 0.5409542996884615, + "learning_rate": 5.000509131012384e-05, + "loss": 11.8603, + "step": 24852 + }, + { + "epoch": 1.3533451770775902, + "grad_norm": 0.587445380148168, + "learning_rate": 4.999745440973841e-05, + "loss": 11.8569, + "step": 24853 + }, + { + "epoch": 1.3533996310741732, + "grad_norm": 0.5277849863970896, + "learning_rate": 4.998981789818111e-05, + "loss": 11.8868, + "step": 24854 + }, + { + "epoch": 1.3534540850707562, + "grad_norm": 0.5301040362510373, + "learning_rate": 4.998218177551143e-05, + "loss": 11.8316, + "step": 24855 + }, + { + "epoch": 1.3535085390673391, + "grad_norm": 0.5613774863843418, + "learning_rate": 4.997454604178864e-05, + "loss": 11.9655, + "step": 24856 + }, + { + "epoch": 1.3535629930639221, + "grad_norm": 0.544793573707874, + "learning_rate": 4.9966910697072166e-05, + "loss": 11.9052, + "step": 24857 + }, + { + "epoch": 1.3536174470605051, + "grad_norm": 0.5252444204166443, + "learning_rate": 4.995927574142143e-05, + "loss": 11.898, + "step": 24858 + }, + { + "epoch": 1.3536719010570883, + "grad_norm": 0.5068669077648342, + "learning_rate": 4.995164117489571e-05, + "loss": 11.7729, + "step": 24859 + }, + { + "epoch": 1.3537263550536713, + "grad_norm": 0.502792964628515, + "learning_rate": 4.9944006997554474e-05, + "loss": 11.9333, + "step": 24860 + }, + { + "epoch": 1.3537808090502543, + "grad_norm": 0.5630809332803862, + "learning_rate": 4.993637320945699e-05, + "loss": 11.9314, + "step": 24861 + }, + { + "epoch": 1.3538352630468373, + "grad_norm": 0.5063112968886009, + "learning_rate": 4.9928739810662696e-05, + "loss": 11.8642, + "step": 24862 + }, + { + "epoch": 1.3538897170434203, + "grad_norm": 0.6370058690335387, + "learning_rate": 4.992110680123092e-05, + "loss": 11.953, + "step": 24863 + }, + { + "epoch": 1.3539441710400033, + "grad_norm": 0.611175174799429, + "learning_rate": 4.991347418122099e-05, + "loss": 11.8584, + "step": 24864 + }, + { + "epoch": 1.3539986250365863, + "grad_norm": 0.562780777127195, + "learning_rate": 4.990584195069231e-05, + "loss": 11.8797, + "step": 24865 + }, + { + "epoch": 1.3540530790331693, + "grad_norm": 0.5177942871062793, + "learning_rate": 4.989821010970417e-05, + "loss": 11.8644, + "step": 24866 + }, + { + "epoch": 1.3541075330297523, + "grad_norm": 0.5616015574802201, + "learning_rate": 4.989057865831599e-05, + "loss": 11.9686, + "step": 24867 + }, + { + "epoch": 1.3541619870263353, + "grad_norm": 0.5276231084759514, + "learning_rate": 4.988294759658703e-05, + "loss": 11.8057, + "step": 24868 + }, + { + "epoch": 1.3542164410229183, + "grad_norm": 0.550768685905666, + "learning_rate": 4.9875316924576676e-05, + "loss": 11.9847, + "step": 24869 + }, + { + "epoch": 1.3542708950195013, + "grad_norm": 0.5744872109531516, + "learning_rate": 4.9867686642344304e-05, + "loss": 11.9256, + "step": 24870 + }, + { + "epoch": 1.3543253490160843, + "grad_norm": 0.5274101657090788, + "learning_rate": 4.986005674994917e-05, + "loss": 11.9319, + "step": 24871 + }, + { + "epoch": 1.3543798030126672, + "grad_norm": 0.5618023876300211, + "learning_rate": 4.985242724745067e-05, + "loss": 11.9709, + "step": 24872 + }, + { + "epoch": 1.3544342570092502, + "grad_norm": 0.5078380684729866, + "learning_rate": 4.984479813490812e-05, + "loss": 11.7467, + "step": 24873 + }, + { + "epoch": 1.3544887110058335, + "grad_norm": 0.5867966059024414, + "learning_rate": 4.983716941238079e-05, + "loss": 11.7997, + "step": 24874 + }, + { + "epoch": 1.3545431650024164, + "grad_norm": 0.5338787637271132, + "learning_rate": 4.9829541079928075e-05, + "loss": 11.7774, + "step": 24875 + }, + { + "epoch": 1.3545976189989994, + "grad_norm": 0.5902886660661296, + "learning_rate": 4.9821913137609224e-05, + "loss": 11.956, + "step": 24876 + }, + { + "epoch": 1.3546520729955824, + "grad_norm": 0.5348857228702834, + "learning_rate": 4.9814285585483634e-05, + "loss": 11.9083, + "step": 24877 + }, + { + "epoch": 1.3547065269921654, + "grad_norm": 0.5916120896647244, + "learning_rate": 4.980665842361054e-05, + "loss": 11.9277, + "step": 24878 + }, + { + "epoch": 1.3547609809887484, + "grad_norm": 0.5844802679823214, + "learning_rate": 4.9799031652049277e-05, + "loss": 11.8835, + "step": 24879 + }, + { + "epoch": 1.3548154349853314, + "grad_norm": 0.5590944803749194, + "learning_rate": 4.9791405270859215e-05, + "loss": 11.9702, + "step": 24880 + }, + { + "epoch": 1.3548698889819144, + "grad_norm": 0.6653605419238408, + "learning_rate": 4.9783779280099564e-05, + "loss": 12.0233, + "step": 24881 + }, + { + "epoch": 1.3549243429784974, + "grad_norm": 0.5345790397368297, + "learning_rate": 4.977615367982973e-05, + "loss": 11.8585, + "step": 24882 + }, + { + "epoch": 1.3549787969750806, + "grad_norm": 0.4899892796727021, + "learning_rate": 4.976852847010889e-05, + "loss": 11.6916, + "step": 24883 + }, + { + "epoch": 1.3550332509716636, + "grad_norm": 0.5972743588836793, + "learning_rate": 4.976090365099638e-05, + "loss": 11.9908, + "step": 24884 + }, + { + "epoch": 1.3550877049682466, + "grad_norm": 0.6000639830695981, + "learning_rate": 4.975327922255155e-05, + "loss": 11.8087, + "step": 24885 + }, + { + "epoch": 1.3551421589648296, + "grad_norm": 0.5487570979590168, + "learning_rate": 4.974565518483362e-05, + "loss": 11.9484, + "step": 24886 + }, + { + "epoch": 1.3551966129614126, + "grad_norm": 0.5623411375132161, + "learning_rate": 4.973803153790193e-05, + "loss": 11.8157, + "step": 24887 + }, + { + "epoch": 1.3552510669579956, + "grad_norm": 0.606979887834055, + "learning_rate": 4.97304082818157e-05, + "loss": 11.8969, + "step": 24888 + }, + { + "epoch": 1.3553055209545786, + "grad_norm": 0.5377885831843022, + "learning_rate": 4.972278541663429e-05, + "loss": 11.8924, + "step": 24889 + }, + { + "epoch": 1.3553599749511616, + "grad_norm": 0.5205384980636952, + "learning_rate": 4.9715162942416884e-05, + "loss": 11.928, + "step": 24890 + }, + { + "epoch": 1.3554144289477446, + "grad_norm": 0.614298542184706, + "learning_rate": 4.97075408592228e-05, + "loss": 12.0008, + "step": 24891 + }, + { + "epoch": 1.3554688829443275, + "grad_norm": 0.5766063853548465, + "learning_rate": 4.9699919167111395e-05, + "loss": 11.9444, + "step": 24892 + }, + { + "epoch": 1.3555233369409105, + "grad_norm": 0.5616804705502019, + "learning_rate": 4.969229786614179e-05, + "loss": 11.9267, + "step": 24893 + }, + { + "epoch": 1.3555777909374935, + "grad_norm": 0.5489564411038799, + "learning_rate": 4.968467695637333e-05, + "loss": 11.9348, + "step": 24894 + }, + { + "epoch": 1.3556322449340765, + "grad_norm": 0.6474272376209784, + "learning_rate": 4.9677056437865244e-05, + "loss": 11.5415, + "step": 24895 + }, + { + "epoch": 1.3556866989306595, + "grad_norm": 0.5156865616846723, + "learning_rate": 4.9669436310676785e-05, + "loss": 11.8739, + "step": 24896 + }, + { + "epoch": 1.3557411529272427, + "grad_norm": 0.5824334064447036, + "learning_rate": 4.966181657486728e-05, + "loss": 12.044, + "step": 24897 + }, + { + "epoch": 1.3557956069238257, + "grad_norm": 0.5951441337550208, + "learning_rate": 4.9654197230495895e-05, + "loss": 11.8968, + "step": 24898 + }, + { + "epoch": 1.3558500609204087, + "grad_norm": 0.5427233998533754, + "learning_rate": 4.9646578277621947e-05, + "loss": 11.7835, + "step": 24899 + }, + { + "epoch": 1.3559045149169917, + "grad_norm": 0.5475569697171353, + "learning_rate": 4.963895971630461e-05, + "loss": 11.8824, + "step": 24900 + }, + { + "epoch": 1.3559589689135747, + "grad_norm": 0.5679944901373409, + "learning_rate": 4.9631341546603215e-05, + "loss": 11.8764, + "step": 24901 + }, + { + "epoch": 1.3560134229101577, + "grad_norm": 0.5977108933058346, + "learning_rate": 4.962372376857695e-05, + "loss": 12.0388, + "step": 24902 + }, + { + "epoch": 1.3560678769067407, + "grad_norm": 0.5859425763958728, + "learning_rate": 4.961610638228502e-05, + "loss": 11.8136, + "step": 24903 + }, + { + "epoch": 1.3561223309033237, + "grad_norm": 0.5281229218351752, + "learning_rate": 4.960848938778673e-05, + "loss": 11.867, + "step": 24904 + }, + { + "epoch": 1.3561767848999067, + "grad_norm": 0.5318152722254618, + "learning_rate": 4.960087278514124e-05, + "loss": 11.8967, + "step": 24905 + }, + { + "epoch": 1.3562312388964899, + "grad_norm": 0.5261775820020697, + "learning_rate": 4.959325657440782e-05, + "loss": 11.8881, + "step": 24906 + }, + { + "epoch": 1.3562856928930729, + "grad_norm": 0.6243009110779695, + "learning_rate": 4.958564075564573e-05, + "loss": 11.922, + "step": 24907 + }, + { + "epoch": 1.3563401468896559, + "grad_norm": 0.5097660088059349, + "learning_rate": 4.95780253289141e-05, + "loss": 11.9009, + "step": 24908 + }, + { + "epoch": 1.3563946008862389, + "grad_norm": 0.5416975746395439, + "learning_rate": 4.957041029427225e-05, + "loss": 11.917, + "step": 24909 + }, + { + "epoch": 1.3564490548828219, + "grad_norm": 0.5849221474800707, + "learning_rate": 4.9562795651779316e-05, + "loss": 11.9707, + "step": 24910 + }, + { + "epoch": 1.3565035088794049, + "grad_norm": 0.5897685853806195, + "learning_rate": 4.955518140149456e-05, + "loss": 11.9131, + "step": 24911 + }, + { + "epoch": 1.3565579628759878, + "grad_norm": 0.5436011638671914, + "learning_rate": 4.954756754347719e-05, + "loss": 11.7322, + "step": 24912 + }, + { + "epoch": 1.3566124168725708, + "grad_norm": 0.6011848819011538, + "learning_rate": 4.9539954077786353e-05, + "loss": 11.8981, + "step": 24913 + }, + { + "epoch": 1.3566668708691538, + "grad_norm": 0.5648134955862548, + "learning_rate": 4.953234100448133e-05, + "loss": 12.0249, + "step": 24914 + }, + { + "epoch": 1.3567213248657368, + "grad_norm": 0.5168500769792862, + "learning_rate": 4.952472832362125e-05, + "loss": 11.8804, + "step": 24915 + }, + { + "epoch": 1.3567757788623198, + "grad_norm": 0.6142474939863247, + "learning_rate": 4.951711603526539e-05, + "loss": 11.8979, + "step": 24916 + }, + { + "epoch": 1.3568302328589028, + "grad_norm": 0.5684425158241186, + "learning_rate": 4.9509504139472854e-05, + "loss": 11.9123, + "step": 24917 + }, + { + "epoch": 1.3568846868554858, + "grad_norm": 0.5667423344177543, + "learning_rate": 4.950189263630288e-05, + "loss": 11.9643, + "step": 24918 + }, + { + "epoch": 1.3569391408520688, + "grad_norm": 0.5771960361685287, + "learning_rate": 4.949428152581471e-05, + "loss": 11.859, + "step": 24919 + }, + { + "epoch": 1.3569935948486518, + "grad_norm": 0.5392846440373192, + "learning_rate": 4.9486670808067426e-05, + "loss": 11.9325, + "step": 24920 + }, + { + "epoch": 1.357048048845235, + "grad_norm": 0.5666340357531485, + "learning_rate": 4.947906048312031e-05, + "loss": 12.0368, + "step": 24921 + }, + { + "epoch": 1.357102502841818, + "grad_norm": 0.5274722567663747, + "learning_rate": 4.9471450551032485e-05, + "loss": 11.851, + "step": 24922 + }, + { + "epoch": 1.357156956838401, + "grad_norm": 0.5371877229027102, + "learning_rate": 4.94638410118631e-05, + "loss": 11.8442, + "step": 24923 + }, + { + "epoch": 1.357211410834984, + "grad_norm": 0.5163432852791142, + "learning_rate": 4.9456231865671386e-05, + "loss": 12.0214, + "step": 24924 + }, + { + "epoch": 1.357265864831567, + "grad_norm": 0.5429702860581987, + "learning_rate": 4.944862311251647e-05, + "loss": 11.8162, + "step": 24925 + }, + { + "epoch": 1.35732031882815, + "grad_norm": 0.49390150469090427, + "learning_rate": 4.944101475245756e-05, + "loss": 11.7471, + "step": 24926 + }, + { + "epoch": 1.357374772824733, + "grad_norm": 0.5073739712088102, + "learning_rate": 4.943340678555377e-05, + "loss": 11.9097, + "step": 24927 + }, + { + "epoch": 1.357429226821316, + "grad_norm": 0.5357613198551402, + "learning_rate": 4.9425799211864286e-05, + "loss": 11.8786, + "step": 24928 + }, + { + "epoch": 1.3574836808178992, + "grad_norm": 0.5262640462521961, + "learning_rate": 4.9418192031448306e-05, + "loss": 11.9509, + "step": 24929 + }, + { + "epoch": 1.3575381348144822, + "grad_norm": 0.595498353326288, + "learning_rate": 4.9410585244364916e-05, + "loss": 11.9361, + "step": 24930 + }, + { + "epoch": 1.3575925888110651, + "grad_norm": 0.6056796594259529, + "learning_rate": 4.940297885067333e-05, + "loss": 11.8909, + "step": 24931 + }, + { + "epoch": 1.3576470428076481, + "grad_norm": 0.5782904163323127, + "learning_rate": 4.939537285043266e-05, + "loss": 12.0208, + "step": 24932 + }, + { + "epoch": 1.3577014968042311, + "grad_norm": 0.49495929222631285, + "learning_rate": 4.938776724370203e-05, + "loss": 11.7721, + "step": 24933 + }, + { + "epoch": 1.3577559508008141, + "grad_norm": 0.5133983141652042, + "learning_rate": 4.938016203054064e-05, + "loss": 11.861, + "step": 24934 + }, + { + "epoch": 1.3578104047973971, + "grad_norm": 0.4951558290143576, + "learning_rate": 4.9372557211007574e-05, + "loss": 11.9239, + "step": 24935 + }, + { + "epoch": 1.3578648587939801, + "grad_norm": 0.5733830050411346, + "learning_rate": 4.936495278516202e-05, + "loss": 11.8048, + "step": 24936 + }, + { + "epoch": 1.357919312790563, + "grad_norm": 0.5362533987137719, + "learning_rate": 4.935734875306305e-05, + "loss": 11.9157, + "step": 24937 + }, + { + "epoch": 1.357973766787146, + "grad_norm": 0.6084958145610728, + "learning_rate": 4.9349745114769863e-05, + "loss": 11.7731, + "step": 24938 + }, + { + "epoch": 1.358028220783729, + "grad_norm": 0.5651082642116758, + "learning_rate": 4.934214187034152e-05, + "loss": 11.86, + "step": 24939 + }, + { + "epoch": 1.358082674780312, + "grad_norm": 0.5388066316508631, + "learning_rate": 4.933453901983718e-05, + "loss": 11.8609, + "step": 24940 + }, + { + "epoch": 1.358137128776895, + "grad_norm": 0.550791723540091, + "learning_rate": 4.9326936563315984e-05, + "loss": 11.9139, + "step": 24941 + }, + { + "epoch": 1.358191582773478, + "grad_norm": 0.5582596246482534, + "learning_rate": 4.931933450083704e-05, + "loss": 11.908, + "step": 24942 + }, + { + "epoch": 1.358246036770061, + "grad_norm": 0.5569203634323466, + "learning_rate": 4.9311732832459446e-05, + "loss": 11.9891, + "step": 24943 + }, + { + "epoch": 1.3583004907666443, + "grad_norm": 0.5156216176427671, + "learning_rate": 4.930413155824228e-05, + "loss": 11.7841, + "step": 24944 + }, + { + "epoch": 1.3583549447632273, + "grad_norm": 0.5196163341381659, + "learning_rate": 4.929653067824469e-05, + "loss": 11.9316, + "step": 24945 + }, + { + "epoch": 1.3584093987598103, + "grad_norm": 0.5008788675494064, + "learning_rate": 4.9288930192525815e-05, + "loss": 11.9397, + "step": 24946 + }, + { + "epoch": 1.3584638527563933, + "grad_norm": 0.5955207579212103, + "learning_rate": 4.928133010114469e-05, + "loss": 11.9151, + "step": 24947 + }, + { + "epoch": 1.3585183067529762, + "grad_norm": 0.6049074577657653, + "learning_rate": 4.927373040416049e-05, + "loss": 11.7953, + "step": 24948 + }, + { + "epoch": 1.3585727607495592, + "grad_norm": 0.5651218456736897, + "learning_rate": 4.926613110163222e-05, + "loss": 11.9555, + "step": 24949 + }, + { + "epoch": 1.3586272147461422, + "grad_norm": 0.5609675285351408, + "learning_rate": 4.9258532193619064e-05, + "loss": 11.8516, + "step": 24950 + }, + { + "epoch": 1.3586816687427252, + "grad_norm": 0.5203702149403598, + "learning_rate": 4.925093368018004e-05, + "loss": 11.8554, + "step": 24951 + }, + { + "epoch": 1.3587361227393082, + "grad_norm": 0.5292728328043207, + "learning_rate": 4.924333556137429e-05, + "loss": 11.9143, + "step": 24952 + }, + { + "epoch": 1.3587905767358914, + "grad_norm": 0.5660082640547254, + "learning_rate": 4.9235737837260885e-05, + "loss": 12.0576, + "step": 24953 + }, + { + "epoch": 1.3588450307324744, + "grad_norm": 0.5166251485491004, + "learning_rate": 4.922814050789886e-05, + "loss": 11.8077, + "step": 24954 + }, + { + "epoch": 1.3588994847290574, + "grad_norm": 0.5195651561494627, + "learning_rate": 4.9220543573347324e-05, + "loss": 11.809, + "step": 24955 + }, + { + "epoch": 1.3589539387256404, + "grad_norm": 0.56371074239359, + "learning_rate": 4.921294703366539e-05, + "loss": 11.8172, + "step": 24956 + }, + { + "epoch": 1.3590083927222234, + "grad_norm": 0.5142443462087007, + "learning_rate": 4.920535088891207e-05, + "loss": 11.727, + "step": 24957 + }, + { + "epoch": 1.3590628467188064, + "grad_norm": 0.5345036061525329, + "learning_rate": 4.9197755139146495e-05, + "loss": 11.8688, + "step": 24958 + }, + { + "epoch": 1.3591173007153894, + "grad_norm": 0.613562091302831, + "learning_rate": 4.919015978442765e-05, + "loss": 11.9773, + "step": 24959 + }, + { + "epoch": 1.3591717547119724, + "grad_norm": 0.5397915387932382, + "learning_rate": 4.9182564824814684e-05, + "loss": 11.7113, + "step": 24960 + }, + { + "epoch": 1.3592262087085554, + "grad_norm": 0.549886462065191, + "learning_rate": 4.9174970260366585e-05, + "loss": 11.946, + "step": 24961 + }, + { + "epoch": 1.3592806627051384, + "grad_norm": 0.550303995844349, + "learning_rate": 4.916737609114247e-05, + "loss": 11.8847, + "step": 24962 + }, + { + "epoch": 1.3593351167017214, + "grad_norm": 0.5478981988353996, + "learning_rate": 4.9159782317201364e-05, + "loss": 11.9462, + "step": 24963 + }, + { + "epoch": 1.3593895706983044, + "grad_norm": 0.5773689655458489, + "learning_rate": 4.9152188938602285e-05, + "loss": 11.759, + "step": 24964 + }, + { + "epoch": 1.3594440246948873, + "grad_norm": 0.5539399400396439, + "learning_rate": 4.914459595540435e-05, + "loss": 11.8204, + "step": 24965 + }, + { + "epoch": 1.3594984786914703, + "grad_norm": 0.5791466590168848, + "learning_rate": 4.9137003367666525e-05, + "loss": 12.0109, + "step": 24966 + }, + { + "epoch": 1.3595529326880536, + "grad_norm": 0.5757685910387154, + "learning_rate": 4.912941117544788e-05, + "loss": 11.9751, + "step": 24967 + }, + { + "epoch": 1.3596073866846365, + "grad_norm": 0.545043819119108, + "learning_rate": 4.912181937880752e-05, + "loss": 11.8309, + "step": 24968 + }, + { + "epoch": 1.3596618406812195, + "grad_norm": 0.527299133856043, + "learning_rate": 4.9114227977804384e-05, + "loss": 11.8493, + "step": 24969 + }, + { + "epoch": 1.3597162946778025, + "grad_norm": 0.5551793271739268, + "learning_rate": 4.9106636972497575e-05, + "loss": 12.0106, + "step": 24970 + }, + { + "epoch": 1.3597707486743855, + "grad_norm": 0.5471731565250786, + "learning_rate": 4.9099046362946053e-05, + "loss": 11.8008, + "step": 24971 + }, + { + "epoch": 1.3598252026709685, + "grad_norm": 0.5184502680272319, + "learning_rate": 4.909145614920893e-05, + "loss": 11.9077, + "step": 24972 + }, + { + "epoch": 1.3598796566675515, + "grad_norm": 0.5701095237909425, + "learning_rate": 4.9083866331345174e-05, + "loss": 11.8327, + "step": 24973 + }, + { + "epoch": 1.3599341106641345, + "grad_norm": 0.6190660343408305, + "learning_rate": 4.907627690941378e-05, + "loss": 11.9276, + "step": 24974 + }, + { + "epoch": 1.3599885646607175, + "grad_norm": 0.5447161710312252, + "learning_rate": 4.906868788347383e-05, + "loss": 11.9611, + "step": 24975 + }, + { + "epoch": 1.3600430186573007, + "grad_norm": 0.58208473872352, + "learning_rate": 4.9061099253584264e-05, + "loss": 11.8794, + "step": 24976 + }, + { + "epoch": 1.3600974726538837, + "grad_norm": 0.5814764443183652, + "learning_rate": 4.9053511019804136e-05, + "loss": 11.9842, + "step": 24977 + }, + { + "epoch": 1.3601519266504667, + "grad_norm": 0.5274231067978153, + "learning_rate": 4.904592318219249e-05, + "loss": 11.9158, + "step": 24978 + }, + { + "epoch": 1.3602063806470497, + "grad_norm": 0.5660687616682143, + "learning_rate": 4.903833574080825e-05, + "loss": 11.879, + "step": 24979 + }, + { + "epoch": 1.3602608346436327, + "grad_norm": 0.5969691407107678, + "learning_rate": 4.9030748695710494e-05, + "loss": 11.8296, + "step": 24980 + }, + { + "epoch": 1.3603152886402157, + "grad_norm": 0.532669920349359, + "learning_rate": 4.902316204695815e-05, + "loss": 11.8981, + "step": 24981 + }, + { + "epoch": 1.3603697426367987, + "grad_norm": 0.5229960340859549, + "learning_rate": 4.901557579461028e-05, + "loss": 11.9678, + "step": 24982 + }, + { + "epoch": 1.3604241966333817, + "grad_norm": 0.5684134918572549, + "learning_rate": 4.900798993872584e-05, + "loss": 11.9296, + "step": 24983 + }, + { + "epoch": 1.3604786506299646, + "grad_norm": 0.5987188117018615, + "learning_rate": 4.900040447936379e-05, + "loss": 11.9985, + "step": 24984 + }, + { + "epoch": 1.3605331046265476, + "grad_norm": 0.7762637705718533, + "learning_rate": 4.899281941658319e-05, + "loss": 11.8949, + "step": 24985 + }, + { + "epoch": 1.3605875586231306, + "grad_norm": 0.5754549163370596, + "learning_rate": 4.8985234750442934e-05, + "loss": 11.9371, + "step": 24986 + }, + { + "epoch": 1.3606420126197136, + "grad_norm": 0.5003681898766952, + "learning_rate": 4.897765048100209e-05, + "loss": 11.7886, + "step": 24987 + }, + { + "epoch": 1.3606964666162966, + "grad_norm": 0.4882107238229703, + "learning_rate": 4.897006660831954e-05, + "loss": 11.8665, + "step": 24988 + }, + { + "epoch": 1.3607509206128796, + "grad_norm": 0.716362334980051, + "learning_rate": 4.896248313245433e-05, + "loss": 11.7667, + "step": 24989 + }, + { + "epoch": 1.3608053746094628, + "grad_norm": 0.5407693854731415, + "learning_rate": 4.895490005346543e-05, + "loss": 11.914, + "step": 24990 + }, + { + "epoch": 1.3608598286060458, + "grad_norm": 0.5903030712307776, + "learning_rate": 4.894731737141177e-05, + "loss": 11.8986, + "step": 24991 + }, + { + "epoch": 1.3609142826026288, + "grad_norm": 0.5072839435812865, + "learning_rate": 4.89397350863524e-05, + "loss": 11.892, + "step": 24992 + }, + { + "epoch": 1.3609687365992118, + "grad_norm": 0.501903336905804, + "learning_rate": 4.893215319834613e-05, + "loss": 11.8784, + "step": 24993 + }, + { + "epoch": 1.3610231905957948, + "grad_norm": 0.5636339992920806, + "learning_rate": 4.8924571707452004e-05, + "loss": 12.0286, + "step": 24994 + }, + { + "epoch": 1.3610776445923778, + "grad_norm": 0.5393615922258653, + "learning_rate": 4.891699061372901e-05, + "loss": 11.8979, + "step": 24995 + }, + { + "epoch": 1.3611320985889608, + "grad_norm": 0.6009854715854602, + "learning_rate": 4.890940991723603e-05, + "loss": 12.016, + "step": 24996 + }, + { + "epoch": 1.3611865525855438, + "grad_norm": 0.5298412273781102, + "learning_rate": 4.890182961803208e-05, + "loss": 11.9466, + "step": 24997 + }, + { + "epoch": 1.3612410065821268, + "grad_norm": 0.568971366230619, + "learning_rate": 4.8894249716176044e-05, + "loss": 11.9406, + "step": 24998 + }, + { + "epoch": 1.36129546057871, + "grad_norm": 0.4855817437367864, + "learning_rate": 4.8886670211726916e-05, + "loss": 11.8565, + "step": 24999 + }, + { + "epoch": 1.361349914575293, + "grad_norm": 0.5258545939766616, + "learning_rate": 4.887909110474358e-05, + "loss": 11.778, + "step": 25000 + }, + { + "epoch": 1.361404368571876, + "grad_norm": 0.5644727999056377, + "learning_rate": 4.8871512395285015e-05, + "loss": 11.906, + "step": 25001 + }, + { + "epoch": 1.361458822568459, + "grad_norm": 0.5631330583033027, + "learning_rate": 4.8863934083410165e-05, + "loss": 11.9083, + "step": 25002 + }, + { + "epoch": 1.361513276565042, + "grad_norm": 0.5928746067366175, + "learning_rate": 4.885635616917795e-05, + "loss": 11.8918, + "step": 25003 + }, + { + "epoch": 1.361567730561625, + "grad_norm": 0.5687408818579682, + "learning_rate": 4.884877865264724e-05, + "loss": 11.764, + "step": 25004 + }, + { + "epoch": 1.361622184558208, + "grad_norm": 0.5547182565620317, + "learning_rate": 4.884120153387706e-05, + "loss": 11.7977, + "step": 25005 + }, + { + "epoch": 1.361676638554791, + "grad_norm": 0.5406958966746962, + "learning_rate": 4.8833624812926215e-05, + "loss": 11.9221, + "step": 25006 + }, + { + "epoch": 1.361731092551374, + "grad_norm": 0.6347223954139799, + "learning_rate": 4.882604848985374e-05, + "loss": 11.9636, + "step": 25007 + }, + { + "epoch": 1.361785546547957, + "grad_norm": 0.5340752072264334, + "learning_rate": 4.881847256471844e-05, + "loss": 11.8655, + "step": 25008 + }, + { + "epoch": 1.36184000054454, + "grad_norm": 0.5216907951302637, + "learning_rate": 4.881089703757934e-05, + "loss": 11.9513, + "step": 25009 + }, + { + "epoch": 1.361894454541123, + "grad_norm": 0.5954455984659678, + "learning_rate": 4.8803321908495235e-05, + "loss": 11.9156, + "step": 25010 + }, + { + "epoch": 1.361948908537706, + "grad_norm": 0.5497393167262156, + "learning_rate": 4.879574717752509e-05, + "loss": 11.8963, + "step": 25011 + }, + { + "epoch": 1.3620033625342889, + "grad_norm": 0.6359159377486772, + "learning_rate": 4.878817284472783e-05, + "loss": 11.9282, + "step": 25012 + }, + { + "epoch": 1.3620578165308719, + "grad_norm": 0.5206324241828418, + "learning_rate": 4.8780598910162346e-05, + "loss": 11.8689, + "step": 25013 + }, + { + "epoch": 1.362112270527455, + "grad_norm": 0.5272454399438127, + "learning_rate": 4.8773025373887496e-05, + "loss": 11.8483, + "step": 25014 + }, + { + "epoch": 1.362166724524038, + "grad_norm": 0.5282543903076978, + "learning_rate": 4.8765452235962164e-05, + "loss": 11.9176, + "step": 25015 + }, + { + "epoch": 1.362221178520621, + "grad_norm": 0.559768472267573, + "learning_rate": 4.875787949644526e-05, + "loss": 11.8522, + "step": 25016 + }, + { + "epoch": 1.362275632517204, + "grad_norm": 0.4938271457560536, + "learning_rate": 4.875030715539572e-05, + "loss": 11.9746, + "step": 25017 + }, + { + "epoch": 1.362330086513787, + "grad_norm": 0.5901155786033425, + "learning_rate": 4.874273521287235e-05, + "loss": 12.0275, + "step": 25018 + }, + { + "epoch": 1.36238454051037, + "grad_norm": 0.5128795830427856, + "learning_rate": 4.8735163668934105e-05, + "loss": 11.8649, + "step": 25019 + }, + { + "epoch": 1.362438994506953, + "grad_norm": 0.5243891263339125, + "learning_rate": 4.872759252363979e-05, + "loss": 12.0431, + "step": 25020 + }, + { + "epoch": 1.362493448503536, + "grad_norm": 0.5440153012281209, + "learning_rate": 4.872002177704834e-05, + "loss": 11.8967, + "step": 25021 + }, + { + "epoch": 1.3625479025001193, + "grad_norm": 0.5309573831848331, + "learning_rate": 4.871245142921856e-05, + "loss": 11.8478, + "step": 25022 + }, + { + "epoch": 1.3626023564967022, + "grad_norm": 0.6028129225695973, + "learning_rate": 4.870488148020941e-05, + "loss": 12.0187, + "step": 25023 + }, + { + "epoch": 1.3626568104932852, + "grad_norm": 0.5359101509285282, + "learning_rate": 4.8697311930079705e-05, + "loss": 11.8899, + "step": 25024 + }, + { + "epoch": 1.3627112644898682, + "grad_norm": 0.5397611360516368, + "learning_rate": 4.8689742778888256e-05, + "loss": 11.7515, + "step": 25025 + }, + { + "epoch": 1.3627657184864512, + "grad_norm": 0.524039195851762, + "learning_rate": 4.868217402669398e-05, + "loss": 11.8701, + "step": 25026 + }, + { + "epoch": 1.3628201724830342, + "grad_norm": 0.5795503373074804, + "learning_rate": 4.8674605673555765e-05, + "loss": 11.9561, + "step": 25027 + }, + { + "epoch": 1.3628746264796172, + "grad_norm": 0.5932548255088791, + "learning_rate": 4.866703771953238e-05, + "loss": 11.9275, + "step": 25028 + }, + { + "epoch": 1.3629290804762002, + "grad_norm": 0.4853918644768155, + "learning_rate": 4.865947016468275e-05, + "loss": 11.9024, + "step": 25029 + }, + { + "epoch": 1.3629835344727832, + "grad_norm": 0.5411828980209702, + "learning_rate": 4.865190300906567e-05, + "loss": 11.8984, + "step": 25030 + }, + { + "epoch": 1.3630379884693662, + "grad_norm": 0.5102095294681734, + "learning_rate": 4.864433625274003e-05, + "loss": 11.8875, + "step": 25031 + }, + { + "epoch": 1.3630924424659492, + "grad_norm": 0.5811649139961952, + "learning_rate": 4.86367698957646e-05, + "loss": 11.9619, + "step": 25032 + }, + { + "epoch": 1.3631468964625322, + "grad_norm": 0.5633498008262142, + "learning_rate": 4.862920393819831e-05, + "loss": 11.889, + "step": 25033 + }, + { + "epoch": 1.3632013504591152, + "grad_norm": 0.5285322694474356, + "learning_rate": 4.862163838009993e-05, + "loss": 11.8194, + "step": 25034 + }, + { + "epoch": 1.3632558044556982, + "grad_norm": 0.5347508240447599, + "learning_rate": 4.8614073221528286e-05, + "loss": 11.9274, + "step": 25035 + }, + { + "epoch": 1.3633102584522812, + "grad_norm": 0.5788175449431141, + "learning_rate": 4.8606508462542254e-05, + "loss": 11.8674, + "step": 25036 + }, + { + "epoch": 1.3633647124488644, + "grad_norm": 0.5147804669991923, + "learning_rate": 4.85989441032006e-05, + "loss": 11.824, + "step": 25037 + }, + { + "epoch": 1.3634191664454474, + "grad_norm": 0.5149866176731968, + "learning_rate": 4.859138014356217e-05, + "loss": 11.915, + "step": 25038 + }, + { + "epoch": 1.3634736204420304, + "grad_norm": 0.5697637018498767, + "learning_rate": 4.858381658368583e-05, + "loss": 11.8232, + "step": 25039 + }, + { + "epoch": 1.3635280744386133, + "grad_norm": 0.5000018076758913, + "learning_rate": 4.857625342363031e-05, + "loss": 11.793, + "step": 25040 + }, + { + "epoch": 1.3635825284351963, + "grad_norm": 0.5272847581622794, + "learning_rate": 4.856869066345452e-05, + "loss": 11.8191, + "step": 25041 + }, + { + "epoch": 1.3636369824317793, + "grad_norm": 0.5357826017991316, + "learning_rate": 4.856112830321716e-05, + "loss": 11.9674, + "step": 25042 + }, + { + "epoch": 1.3636914364283623, + "grad_norm": 0.5568912482017461, + "learning_rate": 4.855356634297714e-05, + "loss": 11.8877, + "step": 25043 + }, + { + "epoch": 1.3637458904249453, + "grad_norm": 0.5041668188356421, + "learning_rate": 4.854600478279322e-05, + "loss": 11.9672, + "step": 25044 + }, + { + "epoch": 1.3638003444215283, + "grad_norm": 0.5459840938632199, + "learning_rate": 4.853844362272415e-05, + "loss": 11.941, + "step": 25045 + }, + { + "epoch": 1.3638547984181115, + "grad_norm": 0.7894134375331131, + "learning_rate": 4.853088286282882e-05, + "loss": 12.0943, + "step": 25046 + }, + { + "epoch": 1.3639092524146945, + "grad_norm": 0.572334369362874, + "learning_rate": 4.8523322503165926e-05, + "loss": 11.9471, + "step": 25047 + }, + { + "epoch": 1.3639637064112775, + "grad_norm": 0.5214567336702625, + "learning_rate": 4.851576254379435e-05, + "loss": 11.8591, + "step": 25048 + }, + { + "epoch": 1.3640181604078605, + "grad_norm": 0.49154675703009915, + "learning_rate": 4.850820298477281e-05, + "loss": 11.8707, + "step": 25049 + }, + { + "epoch": 1.3640726144044435, + "grad_norm": 0.5325034004871464, + "learning_rate": 4.850064382616011e-05, + "loss": 11.8024, + "step": 25050 + }, + { + "epoch": 1.3641270684010265, + "grad_norm": 0.5189007373633713, + "learning_rate": 4.849308506801509e-05, + "loss": 11.8567, + "step": 25051 + }, + { + "epoch": 1.3641815223976095, + "grad_norm": 0.5972202800016274, + "learning_rate": 4.848552671039642e-05, + "loss": 11.76, + "step": 25052 + }, + { + "epoch": 1.3642359763941925, + "grad_norm": 0.5665482075263434, + "learning_rate": 4.847796875336298e-05, + "loss": 11.8383, + "step": 25053 + }, + { + "epoch": 1.3642904303907755, + "grad_norm": 0.5531479756601952, + "learning_rate": 4.84704111969735e-05, + "loss": 11.774, + "step": 25054 + }, + { + "epoch": 1.3643448843873585, + "grad_norm": 0.5419518727756614, + "learning_rate": 4.84628540412867e-05, + "loss": 11.8875, + "step": 25055 + }, + { + "epoch": 1.3643993383839415, + "grad_norm": 0.5558341957921653, + "learning_rate": 4.845529728636143e-05, + "loss": 11.8634, + "step": 25056 + }, + { + "epoch": 1.3644537923805244, + "grad_norm": 0.614957087424945, + "learning_rate": 4.844774093225638e-05, + "loss": 11.7651, + "step": 25057 + }, + { + "epoch": 1.3645082463771074, + "grad_norm": 0.580647493413801, + "learning_rate": 4.844018497903038e-05, + "loss": 11.8813, + "step": 25058 + }, + { + "epoch": 1.3645627003736904, + "grad_norm": 0.49358035332314476, + "learning_rate": 4.8432629426742104e-05, + "loss": 11.9072, + "step": 25059 + }, + { + "epoch": 1.3646171543702736, + "grad_norm": 0.5582621331307084, + "learning_rate": 4.8425074275450344e-05, + "loss": 11.9912, + "step": 25060 + }, + { + "epoch": 1.3646716083668566, + "grad_norm": 0.5544974137118196, + "learning_rate": 4.8417519525213906e-05, + "loss": 11.925, + "step": 25061 + }, + { + "epoch": 1.3647260623634396, + "grad_norm": 0.5267663796662333, + "learning_rate": 4.8409965176091445e-05, + "loss": 11.8615, + "step": 25062 + }, + { + "epoch": 1.3647805163600226, + "grad_norm": 0.52441552411104, + "learning_rate": 4.840241122814181e-05, + "loss": 11.7422, + "step": 25063 + }, + { + "epoch": 1.3648349703566056, + "grad_norm": 0.5639695274470757, + "learning_rate": 4.839485768142361e-05, + "loss": 11.8825, + "step": 25064 + }, + { + "epoch": 1.3648894243531886, + "grad_norm": 0.5510800558494531, + "learning_rate": 4.838730453599566e-05, + "loss": 11.8495, + "step": 25065 + }, + { + "epoch": 1.3649438783497716, + "grad_norm": 0.5248793378922084, + "learning_rate": 4.8379751791916716e-05, + "loss": 11.8267, + "step": 25066 + }, + { + "epoch": 1.3649983323463546, + "grad_norm": 0.5712846925018582, + "learning_rate": 4.837219944924544e-05, + "loss": 11.7966, + "step": 25067 + }, + { + "epoch": 1.3650527863429376, + "grad_norm": 0.49709859810052864, + "learning_rate": 4.836464750804064e-05, + "loss": 11.929, + "step": 25068 + }, + { + "epoch": 1.3651072403395208, + "grad_norm": 0.587437006672066, + "learning_rate": 4.8357095968360955e-05, + "loss": 11.9191, + "step": 25069 + }, + { + "epoch": 1.3651616943361038, + "grad_norm": 0.522421371357169, + "learning_rate": 4.83495448302652e-05, + "loss": 11.9529, + "step": 25070 + }, + { + "epoch": 1.3652161483326868, + "grad_norm": 0.5547914272280554, + "learning_rate": 4.8341994093812005e-05, + "loss": 11.8742, + "step": 25071 + }, + { + "epoch": 1.3652706023292698, + "grad_norm": 0.5628350573401014, + "learning_rate": 4.833444375906012e-05, + "loss": 11.8248, + "step": 25072 + }, + { + "epoch": 1.3653250563258528, + "grad_norm": 0.5454799580843143, + "learning_rate": 4.8326893826068354e-05, + "loss": 11.8551, + "step": 25073 + }, + { + "epoch": 1.3653795103224358, + "grad_norm": 0.5680693707193344, + "learning_rate": 4.831934429489524e-05, + "loss": 11.8932, + "step": 25074 + }, + { + "epoch": 1.3654339643190188, + "grad_norm": 0.5389331220279573, + "learning_rate": 4.831179516559958e-05, + "loss": 11.9218, + "step": 25075 + }, + { + "epoch": 1.3654884183156017, + "grad_norm": 0.6550773298623871, + "learning_rate": 4.83042464382401e-05, + "loss": 11.8776, + "step": 25076 + }, + { + "epoch": 1.3655428723121847, + "grad_norm": 0.5385274700697291, + "learning_rate": 4.829669811287544e-05, + "loss": 11.9966, + "step": 25077 + }, + { + "epoch": 1.3655973263087677, + "grad_norm": 0.5645934639414184, + "learning_rate": 4.828915018956435e-05, + "loss": 12.0673, + "step": 25078 + }, + { + "epoch": 1.3656517803053507, + "grad_norm": 0.526032775194877, + "learning_rate": 4.828160266836547e-05, + "loss": 11.8598, + "step": 25079 + }, + { + "epoch": 1.3657062343019337, + "grad_norm": 0.5167559172748148, + "learning_rate": 4.827405554933756e-05, + "loss": 11.9208, + "step": 25080 + }, + { + "epoch": 1.3657606882985167, + "grad_norm": 0.5437637163621964, + "learning_rate": 4.826650883253923e-05, + "loss": 11.8471, + "step": 25081 + }, + { + "epoch": 1.3658151422950997, + "grad_norm": 0.588792792656736, + "learning_rate": 4.8258962518029205e-05, + "loss": 11.7768, + "step": 25082 + }, + { + "epoch": 1.3658695962916827, + "grad_norm": 0.5382521194921016, + "learning_rate": 4.825141660586623e-05, + "loss": 11.8772, + "step": 25083 + }, + { + "epoch": 1.365924050288266, + "grad_norm": 0.5069262275811791, + "learning_rate": 4.824387109610885e-05, + "loss": 11.9157, + "step": 25084 + }, + { + "epoch": 1.365978504284849, + "grad_norm": 0.6846066968569539, + "learning_rate": 4.823632598881585e-05, + "loss": 11.6901, + "step": 25085 + }, + { + "epoch": 1.366032958281432, + "grad_norm": 0.5400538581503838, + "learning_rate": 4.822878128404581e-05, + "loss": 11.9139, + "step": 25086 + }, + { + "epoch": 1.366087412278015, + "grad_norm": 0.548696516380634, + "learning_rate": 4.8221236981857455e-05, + "loss": 11.8388, + "step": 25087 + }, + { + "epoch": 1.3661418662745979, + "grad_norm": 0.6096702113572322, + "learning_rate": 4.821369308230949e-05, + "loss": 12.0497, + "step": 25088 + }, + { + "epoch": 1.3661963202711809, + "grad_norm": 0.5265305841633061, + "learning_rate": 4.820614958546048e-05, + "loss": 11.8539, + "step": 25089 + }, + { + "epoch": 1.3662507742677639, + "grad_norm": 0.5527700762439866, + "learning_rate": 4.819860649136919e-05, + "loss": 11.9079, + "step": 25090 + }, + { + "epoch": 1.3663052282643469, + "grad_norm": 0.5337874650232489, + "learning_rate": 4.8191063800094175e-05, + "loss": 11.9632, + "step": 25091 + }, + { + "epoch": 1.36635968226093, + "grad_norm": 0.5312908667458723, + "learning_rate": 4.818352151169418e-05, + "loss": 11.9241, + "step": 25092 + }, + { + "epoch": 1.366414136257513, + "grad_norm": 0.5200527242232768, + "learning_rate": 4.8175979626227806e-05, + "loss": 11.8154, + "step": 25093 + }, + { + "epoch": 1.366468590254096, + "grad_norm": 0.6659346802966777, + "learning_rate": 4.816843814375367e-05, + "loss": 12.0484, + "step": 25094 + }, + { + "epoch": 1.366523044250679, + "grad_norm": 0.5748616208321881, + "learning_rate": 4.816089706433048e-05, + "loss": 11.8186, + "step": 25095 + }, + { + "epoch": 1.366577498247262, + "grad_norm": 0.59562321821412, + "learning_rate": 4.8153356388016827e-05, + "loss": 11.965, + "step": 25096 + }, + { + "epoch": 1.366631952243845, + "grad_norm": 0.5705926117797742, + "learning_rate": 4.814581611487139e-05, + "loss": 11.8715, + "step": 25097 + }, + { + "epoch": 1.366686406240428, + "grad_norm": 0.5407234790532826, + "learning_rate": 4.8138276244952754e-05, + "loss": 11.7471, + "step": 25098 + }, + { + "epoch": 1.366740860237011, + "grad_norm": 0.6501824942662773, + "learning_rate": 4.813073677831958e-05, + "loss": 11.9337, + "step": 25099 + }, + { + "epoch": 1.366795314233594, + "grad_norm": 0.5784681159521441, + "learning_rate": 4.812319771503053e-05, + "loss": 11.9465, + "step": 25100 + }, + { + "epoch": 1.366849768230177, + "grad_norm": 0.6056400943832342, + "learning_rate": 4.811565905514416e-05, + "loss": 12.0192, + "step": 25101 + }, + { + "epoch": 1.36690422222676, + "grad_norm": 0.5396109459761824, + "learning_rate": 4.810812079871916e-05, + "loss": 11.9325, + "step": 25102 + }, + { + "epoch": 1.366958676223343, + "grad_norm": 0.5546552113436038, + "learning_rate": 4.810058294581412e-05, + "loss": 11.9535, + "step": 25103 + }, + { + "epoch": 1.367013130219926, + "grad_norm": 0.5599864368610015, + "learning_rate": 4.809304549648761e-05, + "loss": 11.9501, + "step": 25104 + }, + { + "epoch": 1.367067584216509, + "grad_norm": 0.49451505886383473, + "learning_rate": 4.808550845079832e-05, + "loss": 11.9211, + "step": 25105 + }, + { + "epoch": 1.367122038213092, + "grad_norm": 0.55109309273116, + "learning_rate": 4.807797180880479e-05, + "loss": 11.9217, + "step": 25106 + }, + { + "epoch": 1.3671764922096752, + "grad_norm": 0.537961764596016, + "learning_rate": 4.8070435570565685e-05, + "loss": 11.798, + "step": 25107 + }, + { + "epoch": 1.3672309462062582, + "grad_norm": 0.541574789119973, + "learning_rate": 4.806289973613956e-05, + "loss": 11.7621, + "step": 25108 + }, + { + "epoch": 1.3672854002028412, + "grad_norm": 0.5324905199750999, + "learning_rate": 4.8055364305585014e-05, + "loss": 11.9363, + "step": 25109 + }, + { + "epoch": 1.3673398541994242, + "grad_norm": 0.5219113301890548, + "learning_rate": 4.804782927896072e-05, + "loss": 11.8879, + "step": 25110 + }, + { + "epoch": 1.3673943081960072, + "grad_norm": 0.5669892893288252, + "learning_rate": 4.804029465632518e-05, + "loss": 11.9929, + "step": 25111 + }, + { + "epoch": 1.3674487621925902, + "grad_norm": 0.5728257842324002, + "learning_rate": 4.8032760437737055e-05, + "loss": 11.8895, + "step": 25112 + }, + { + "epoch": 1.3675032161891731, + "grad_norm": 0.5028461713954278, + "learning_rate": 4.8025226623254906e-05, + "loss": 11.8547, + "step": 25113 + }, + { + "epoch": 1.3675576701857561, + "grad_norm": 0.51435943829678, + "learning_rate": 4.8017693212937266e-05, + "loss": 11.8178, + "step": 25114 + }, + { + "epoch": 1.3676121241823391, + "grad_norm": 0.5929600916261433, + "learning_rate": 4.8010160206842803e-05, + "loss": 11.7993, + "step": 25115 + }, + { + "epoch": 1.3676665781789223, + "grad_norm": 0.5382834120972866, + "learning_rate": 4.800262760503002e-05, + "loss": 11.9301, + "step": 25116 + }, + { + "epoch": 1.3677210321755053, + "grad_norm": 0.49008064164726056, + "learning_rate": 4.799509540755754e-05, + "loss": 11.935, + "step": 25117 + }, + { + "epoch": 1.3677754861720883, + "grad_norm": 0.5619016132843284, + "learning_rate": 4.798756361448391e-05, + "loss": 11.8332, + "step": 25118 + }, + { + "epoch": 1.3678299401686713, + "grad_norm": 0.4842402516284035, + "learning_rate": 4.798003222586773e-05, + "loss": 11.9025, + "step": 25119 + }, + { + "epoch": 1.3678843941652543, + "grad_norm": 0.5270994832602205, + "learning_rate": 4.797250124176751e-05, + "loss": 11.7359, + "step": 25120 + }, + { + "epoch": 1.3679388481618373, + "grad_norm": 0.5546253579220396, + "learning_rate": 4.796497066224184e-05, + "loss": 11.8169, + "step": 25121 + }, + { + "epoch": 1.3679933021584203, + "grad_norm": 0.5895691163162081, + "learning_rate": 4.795744048734932e-05, + "loss": 11.8789, + "step": 25122 + }, + { + "epoch": 1.3680477561550033, + "grad_norm": 0.5342975863541174, + "learning_rate": 4.794991071714848e-05, + "loss": 11.9294, + "step": 25123 + }, + { + "epoch": 1.3681022101515863, + "grad_norm": 0.5366704933948835, + "learning_rate": 4.794238135169783e-05, + "loss": 11.8725, + "step": 25124 + }, + { + "epoch": 1.3681566641481693, + "grad_norm": 0.5003059131680908, + "learning_rate": 4.7934852391055985e-05, + "loss": 11.9016, + "step": 25125 + }, + { + "epoch": 1.3682111181447523, + "grad_norm": 0.5766781270961125, + "learning_rate": 4.792732383528141e-05, + "loss": 12.0251, + "step": 25126 + }, + { + "epoch": 1.3682655721413353, + "grad_norm": 0.5609779498089513, + "learning_rate": 4.791979568443274e-05, + "loss": 11.8133, + "step": 25127 + }, + { + "epoch": 1.3683200261379183, + "grad_norm": 0.5411245033745566, + "learning_rate": 4.7912267938568445e-05, + "loss": 12.0362, + "step": 25128 + }, + { + "epoch": 1.3683744801345012, + "grad_norm": 0.5237389445196476, + "learning_rate": 4.7904740597747124e-05, + "loss": 11.9776, + "step": 25129 + }, + { + "epoch": 1.3684289341310845, + "grad_norm": 0.7018697693221225, + "learning_rate": 4.789721366202724e-05, + "loss": 12.1431, + "step": 25130 + }, + { + "epoch": 1.3684833881276675, + "grad_norm": 0.5315774511077166, + "learning_rate": 4.7889687131467355e-05, + "loss": 11.8995, + "step": 25131 + }, + { + "epoch": 1.3685378421242504, + "grad_norm": 0.5699725123374426, + "learning_rate": 4.7882161006126034e-05, + "loss": 12.0593, + "step": 25132 + }, + { + "epoch": 1.3685922961208334, + "grad_norm": 0.5555549938187669, + "learning_rate": 4.7874635286061776e-05, + "loss": 11.9752, + "step": 25133 + }, + { + "epoch": 1.3686467501174164, + "grad_norm": 0.5380393157728871, + "learning_rate": 4.78671099713331e-05, + "loss": 11.9835, + "step": 25134 + }, + { + "epoch": 1.3687012041139994, + "grad_norm": 0.5505625833372509, + "learning_rate": 4.785958506199848e-05, + "loss": 11.8858, + "step": 25135 + }, + { + "epoch": 1.3687556581105824, + "grad_norm": 0.5378844181894424, + "learning_rate": 4.7852060558116466e-05, + "loss": 11.8883, + "step": 25136 + }, + { + "epoch": 1.3688101121071654, + "grad_norm": 0.5239550100816478, + "learning_rate": 4.7844536459745615e-05, + "loss": 11.9211, + "step": 25137 + }, + { + "epoch": 1.3688645661037484, + "grad_norm": 0.5158765324469426, + "learning_rate": 4.783701276694436e-05, + "loss": 11.774, + "step": 25138 + }, + { + "epoch": 1.3689190201003316, + "grad_norm": 0.544013191229186, + "learning_rate": 4.782948947977127e-05, + "loss": 11.8005, + "step": 25139 + }, + { + "epoch": 1.3689734740969146, + "grad_norm": 0.6405344375573496, + "learning_rate": 4.7821966598284786e-05, + "loss": 11.8133, + "step": 25140 + }, + { + "epoch": 1.3690279280934976, + "grad_norm": 0.5154339333499073, + "learning_rate": 4.7814444122543476e-05, + "loss": 11.8936, + "step": 25141 + }, + { + "epoch": 1.3690823820900806, + "grad_norm": 0.5362485119681301, + "learning_rate": 4.7806922052605765e-05, + "loss": 11.8353, + "step": 25142 + }, + { + "epoch": 1.3691368360866636, + "grad_norm": 0.5109631313283256, + "learning_rate": 4.779940038853018e-05, + "loss": 12.0095, + "step": 25143 + }, + { + "epoch": 1.3691912900832466, + "grad_norm": 0.5810936133366243, + "learning_rate": 4.7791879130375286e-05, + "loss": 11.9121, + "step": 25144 + }, + { + "epoch": 1.3692457440798296, + "grad_norm": 0.5453383983930444, + "learning_rate": 4.778435827819943e-05, + "loss": 11.8733, + "step": 25145 + }, + { + "epoch": 1.3693001980764126, + "grad_norm": 0.6126937632389491, + "learning_rate": 4.777683783206118e-05, + "loss": 11.8779, + "step": 25146 + }, + { + "epoch": 1.3693546520729956, + "grad_norm": 0.5910559879078056, + "learning_rate": 4.7769317792018974e-05, + "loss": 11.9472, + "step": 25147 + }, + { + "epoch": 1.3694091060695786, + "grad_norm": 0.5774818513328772, + "learning_rate": 4.77617981581313e-05, + "loss": 11.918, + "step": 25148 + }, + { + "epoch": 1.3694635600661615, + "grad_norm": 0.5975863467060338, + "learning_rate": 4.7754278930456685e-05, + "loss": 12.0273, + "step": 25149 + }, + { + "epoch": 1.3695180140627445, + "grad_norm": 0.5685265374387557, + "learning_rate": 4.774676010905353e-05, + "loss": 11.905, + "step": 25150 + }, + { + "epoch": 1.3695724680593275, + "grad_norm": 0.5739833439041315, + "learning_rate": 4.7739241693980366e-05, + "loss": 11.9733, + "step": 25151 + }, + { + "epoch": 1.3696269220559105, + "grad_norm": 0.5565029887800292, + "learning_rate": 4.773172368529558e-05, + "loss": 11.8743, + "step": 25152 + }, + { + "epoch": 1.3696813760524935, + "grad_norm": 0.5897144964944023, + "learning_rate": 4.772420608305771e-05, + "loss": 12.0013, + "step": 25153 + }, + { + "epoch": 1.3697358300490767, + "grad_norm": 0.5215098716775922, + "learning_rate": 4.771668888732519e-05, + "loss": 11.794, + "step": 25154 + }, + { + "epoch": 1.3697902840456597, + "grad_norm": 0.5405015588673393, + "learning_rate": 4.770917209815642e-05, + "loss": 11.9028, + "step": 25155 + }, + { + "epoch": 1.3698447380422427, + "grad_norm": 0.5862248229033459, + "learning_rate": 4.770165571560994e-05, + "loss": 11.9305, + "step": 25156 + }, + { + "epoch": 1.3698991920388257, + "grad_norm": 0.5012201539882242, + "learning_rate": 4.769413973974412e-05, + "loss": 11.8819, + "step": 25157 + }, + { + "epoch": 1.3699536460354087, + "grad_norm": 0.6094271616706446, + "learning_rate": 4.768662417061743e-05, + "loss": 12.0085, + "step": 25158 + }, + { + "epoch": 1.3700081000319917, + "grad_norm": 0.5649822763892307, + "learning_rate": 4.767910900828837e-05, + "loss": 11.9952, + "step": 25159 + }, + { + "epoch": 1.3700625540285747, + "grad_norm": 0.5411521046562873, + "learning_rate": 4.76715942528153e-05, + "loss": 11.9949, + "step": 25160 + }, + { + "epoch": 1.3701170080251577, + "grad_norm": 0.5488463848978068, + "learning_rate": 4.766407990425671e-05, + "loss": 11.9352, + "step": 25161 + }, + { + "epoch": 1.370171462021741, + "grad_norm": 0.5654767948971301, + "learning_rate": 4.765656596267099e-05, + "loss": 11.8449, + "step": 25162 + }, + { + "epoch": 1.3702259160183239, + "grad_norm": 0.5506834463189413, + "learning_rate": 4.7649052428116614e-05, + "loss": 11.8977, + "step": 25163 + }, + { + "epoch": 1.3702803700149069, + "grad_norm": 0.5271581165985557, + "learning_rate": 4.7641539300651994e-05, + "loss": 11.9711, + "step": 25164 + }, + { + "epoch": 1.3703348240114899, + "grad_norm": 0.568354222805055, + "learning_rate": 4.763402658033551e-05, + "loss": 11.7845, + "step": 25165 + }, + { + "epoch": 1.3703892780080729, + "grad_norm": 0.5373291371622491, + "learning_rate": 4.7626514267225654e-05, + "loss": 11.9826, + "step": 25166 + }, + { + "epoch": 1.3704437320046559, + "grad_norm": 0.562625605928785, + "learning_rate": 4.761900236138076e-05, + "loss": 11.9212, + "step": 25167 + }, + { + "epoch": 1.3704981860012388, + "grad_norm": 0.4849888466500203, + "learning_rate": 4.7611490862859333e-05, + "loss": 11.84, + "step": 25168 + }, + { + "epoch": 1.3705526399978218, + "grad_norm": 0.634616885300659, + "learning_rate": 4.76039797717197e-05, + "loss": 11.8696, + "step": 25169 + }, + { + "epoch": 1.3706070939944048, + "grad_norm": 0.654571175472763, + "learning_rate": 4.75964690880203e-05, + "loss": 11.9599, + "step": 25170 + }, + { + "epoch": 1.3706615479909878, + "grad_norm": 0.5840094561144437, + "learning_rate": 4.758895881181959e-05, + "loss": 11.9393, + "step": 25171 + }, + { + "epoch": 1.3707160019875708, + "grad_norm": 0.5412491234691605, + "learning_rate": 4.7581448943175886e-05, + "loss": 12.0059, + "step": 25172 + }, + { + "epoch": 1.3707704559841538, + "grad_norm": 0.5806772752957869, + "learning_rate": 4.757393948214767e-05, + "loss": 11.7347, + "step": 25173 + }, + { + "epoch": 1.3708249099807368, + "grad_norm": 0.5187746714760103, + "learning_rate": 4.756643042879329e-05, + "loss": 11.8312, + "step": 25174 + }, + { + "epoch": 1.3708793639773198, + "grad_norm": 0.526855635336909, + "learning_rate": 4.7558921783171095e-05, + "loss": 11.8599, + "step": 25175 + }, + { + "epoch": 1.3709338179739028, + "grad_norm": 0.5579671947534771, + "learning_rate": 4.755141354533956e-05, + "loss": 11.8686, + "step": 25176 + }, + { + "epoch": 1.370988271970486, + "grad_norm": 0.5205755938172167, + "learning_rate": 4.7543905715356984e-05, + "loss": 11.8199, + "step": 25177 + }, + { + "epoch": 1.371042725967069, + "grad_norm": 0.5795151769743531, + "learning_rate": 4.753639829328185e-05, + "loss": 11.8959, + "step": 25178 + }, + { + "epoch": 1.371097179963652, + "grad_norm": 0.5063909628788978, + "learning_rate": 4.7528891279172424e-05, + "loss": 11.8671, + "step": 25179 + }, + { + "epoch": 1.371151633960235, + "grad_norm": 0.5889294962063523, + "learning_rate": 4.752138467308714e-05, + "loss": 11.9673, + "step": 25180 + }, + { + "epoch": 1.371206087956818, + "grad_norm": 0.49894799623537534, + "learning_rate": 4.7513878475084414e-05, + "loss": 11.681, + "step": 25181 + }, + { + "epoch": 1.371260541953401, + "grad_norm": 0.5264034405288752, + "learning_rate": 4.750637268522254e-05, + "loss": 11.7979, + "step": 25182 + }, + { + "epoch": 1.371314995949984, + "grad_norm": 0.5741823667048461, + "learning_rate": 4.749886730355995e-05, + "loss": 11.9187, + "step": 25183 + }, + { + "epoch": 1.371369449946567, + "grad_norm": 0.5478759188318844, + "learning_rate": 4.7491362330154964e-05, + "loss": 11.9207, + "step": 25184 + }, + { + "epoch": 1.37142390394315, + "grad_norm": 0.5566758328287349, + "learning_rate": 4.748385776506592e-05, + "loss": 11.8836, + "step": 25185 + }, + { + "epoch": 1.3714783579397332, + "grad_norm": 0.5954818195244415, + "learning_rate": 4.747635360835124e-05, + "loss": 11.9352, + "step": 25186 + }, + { + "epoch": 1.3715328119363162, + "grad_norm": 0.5044565346058839, + "learning_rate": 4.74688498600692e-05, + "loss": 11.8805, + "step": 25187 + }, + { + "epoch": 1.3715872659328991, + "grad_norm": 0.5042383277607629, + "learning_rate": 4.7461346520278236e-05, + "loss": 11.8401, + "step": 25188 + }, + { + "epoch": 1.3716417199294821, + "grad_norm": 0.4840766664645052, + "learning_rate": 4.745384358903662e-05, + "loss": 11.9215, + "step": 25189 + }, + { + "epoch": 1.3716961739260651, + "grad_norm": 0.5489749929948173, + "learning_rate": 4.7446341066402754e-05, + "loss": 11.908, + "step": 25190 + }, + { + "epoch": 1.3717506279226481, + "grad_norm": 0.526247010486432, + "learning_rate": 4.7438838952434916e-05, + "loss": 11.8905, + "step": 25191 + }, + { + "epoch": 1.3718050819192311, + "grad_norm": 0.6014050591841199, + "learning_rate": 4.743133724719149e-05, + "loss": 12.0415, + "step": 25192 + }, + { + "epoch": 1.371859535915814, + "grad_norm": 0.5231203855374652, + "learning_rate": 4.7423835950730834e-05, + "loss": 11.8379, + "step": 25193 + }, + { + "epoch": 1.371913989912397, + "grad_norm": 0.5238528836274371, + "learning_rate": 4.741633506311125e-05, + "loss": 12.0402, + "step": 25194 + }, + { + "epoch": 1.37196844390898, + "grad_norm": 0.4942266262421094, + "learning_rate": 4.740883458439106e-05, + "loss": 11.8109, + "step": 25195 + }, + { + "epoch": 1.372022897905563, + "grad_norm": 0.5534674871706498, + "learning_rate": 4.740133451462855e-05, + "loss": 11.9346, + "step": 25196 + }, + { + "epoch": 1.372077351902146, + "grad_norm": 0.5036089615406288, + "learning_rate": 4.739383485388209e-05, + "loss": 11.8729, + "step": 25197 + }, + { + "epoch": 1.372131805898729, + "grad_norm": 0.5676058382743236, + "learning_rate": 4.738633560221003e-05, + "loss": 11.9334, + "step": 25198 + }, + { + "epoch": 1.372186259895312, + "grad_norm": 0.5308956539066537, + "learning_rate": 4.7378836759670606e-05, + "loss": 11.9188, + "step": 25199 + }, + { + "epoch": 1.3722407138918953, + "grad_norm": 0.5227422629580815, + "learning_rate": 4.737133832632221e-05, + "loss": 11.892, + "step": 25200 + }, + { + "epoch": 1.3722951678884783, + "grad_norm": 0.5778140275861255, + "learning_rate": 4.736384030222308e-05, + "loss": 11.9294, + "step": 25201 + }, + { + "epoch": 1.3723496218850613, + "grad_norm": 0.600801053051945, + "learning_rate": 4.7356342687431585e-05, + "loss": 11.8202, + "step": 25202 + }, + { + "epoch": 1.3724040758816443, + "grad_norm": 0.6657469160372627, + "learning_rate": 4.734884548200597e-05, + "loss": 12.0397, + "step": 25203 + }, + { + "epoch": 1.3724585298782273, + "grad_norm": 0.5863296049708091, + "learning_rate": 4.7341348686004596e-05, + "loss": 11.8541, + "step": 25204 + }, + { + "epoch": 1.3725129838748102, + "grad_norm": 0.571748109594584, + "learning_rate": 4.733385229948572e-05, + "loss": 11.8864, + "step": 25205 + }, + { + "epoch": 1.3725674378713932, + "grad_norm": 0.5720556581573786, + "learning_rate": 4.7326356322507606e-05, + "loss": 11.9563, + "step": 25206 + }, + { + "epoch": 1.3726218918679762, + "grad_norm": 0.6269365995903576, + "learning_rate": 4.731886075512858e-05, + "loss": 11.8724, + "step": 25207 + }, + { + "epoch": 1.3726763458645592, + "grad_norm": 0.5166449411134307, + "learning_rate": 4.7311365597406964e-05, + "loss": 11.7918, + "step": 25208 + }, + { + "epoch": 1.3727307998611424, + "grad_norm": 0.5363266455847857, + "learning_rate": 4.7303870849400964e-05, + "loss": 11.8909, + "step": 25209 + }, + { + "epoch": 1.3727852538577254, + "grad_norm": 0.5368339221215567, + "learning_rate": 4.729637651116895e-05, + "loss": 11.9264, + "step": 25210 + }, + { + "epoch": 1.3728397078543084, + "grad_norm": 0.56968156954211, + "learning_rate": 4.72888825827691e-05, + "loss": 11.9953, + "step": 25211 + }, + { + "epoch": 1.3728941618508914, + "grad_norm": 0.5571619729912212, + "learning_rate": 4.728138906425978e-05, + "loss": 11.9766, + "step": 25212 + }, + { + "epoch": 1.3729486158474744, + "grad_norm": 0.5685060197223936, + "learning_rate": 4.7273895955699185e-05, + "loss": 11.6695, + "step": 25213 + }, + { + "epoch": 1.3730030698440574, + "grad_norm": 0.5598016315734076, + "learning_rate": 4.726640325714565e-05, + "loss": 11.7932, + "step": 25214 + }, + { + "epoch": 1.3730575238406404, + "grad_norm": 0.5451234135253817, + "learning_rate": 4.725891096865742e-05, + "loss": 11.8479, + "step": 25215 + }, + { + "epoch": 1.3731119778372234, + "grad_norm": 0.5907122942209788, + "learning_rate": 4.7251419090292694e-05, + "loss": 11.9886, + "step": 25216 + }, + { + "epoch": 1.3731664318338064, + "grad_norm": 0.59937953453529, + "learning_rate": 4.724392762210982e-05, + "loss": 11.925, + "step": 25217 + }, + { + "epoch": 1.3732208858303894, + "grad_norm": 0.6053097495226107, + "learning_rate": 4.723643656416698e-05, + "loss": 11.8532, + "step": 25218 + }, + { + "epoch": 1.3732753398269724, + "grad_norm": 0.5266506477173108, + "learning_rate": 4.722894591652244e-05, + "loss": 11.8468, + "step": 25219 + }, + { + "epoch": 1.3733297938235554, + "grad_norm": 0.5443021348448976, + "learning_rate": 4.722145567923452e-05, + "loss": 11.8475, + "step": 25220 + }, + { + "epoch": 1.3733842478201383, + "grad_norm": 0.5543951736863234, + "learning_rate": 4.7213965852361364e-05, + "loss": 11.8765, + "step": 25221 + }, + { + "epoch": 1.3734387018167213, + "grad_norm": 0.6131721022895514, + "learning_rate": 4.72064764359613e-05, + "loss": 11.8663, + "step": 25222 + }, + { + "epoch": 1.3734931558133046, + "grad_norm": 0.5172387160024928, + "learning_rate": 4.71989874300925e-05, + "loss": 11.9415, + "step": 25223 + }, + { + "epoch": 1.3735476098098875, + "grad_norm": 0.5210427710252998, + "learning_rate": 4.719149883481326e-05, + "loss": 11.9812, + "step": 25224 + }, + { + "epoch": 1.3736020638064705, + "grad_norm": 0.5658738284479276, + "learning_rate": 4.718401065018179e-05, + "loss": 11.9707, + "step": 25225 + }, + { + "epoch": 1.3736565178030535, + "grad_norm": 0.6153394309268462, + "learning_rate": 4.717652287625626e-05, + "loss": 11.9621, + "step": 25226 + }, + { + "epoch": 1.3737109717996365, + "grad_norm": 0.50846970213708, + "learning_rate": 4.716903551309498e-05, + "loss": 11.8379, + "step": 25227 + }, + { + "epoch": 1.3737654257962195, + "grad_norm": 0.555380332133304, + "learning_rate": 4.7161548560756116e-05, + "loss": 11.9199, + "step": 25228 + }, + { + "epoch": 1.3738198797928025, + "grad_norm": 0.5489522499044746, + "learning_rate": 4.71540620192979e-05, + "loss": 11.8861, + "step": 25229 + }, + { + "epoch": 1.3738743337893855, + "grad_norm": 0.552731690999242, + "learning_rate": 4.714657588877861e-05, + "loss": 11.9407, + "step": 25230 + }, + { + "epoch": 1.3739287877859685, + "grad_norm": 0.6860486499702981, + "learning_rate": 4.713909016925637e-05, + "loss": 12.0791, + "step": 25231 + }, + { + "epoch": 1.3739832417825517, + "grad_norm": 0.5805692979565028, + "learning_rate": 4.713160486078946e-05, + "loss": 11.894, + "step": 25232 + }, + { + "epoch": 1.3740376957791347, + "grad_norm": 0.5302705322568467, + "learning_rate": 4.7124119963436034e-05, + "loss": 11.8835, + "step": 25233 + }, + { + "epoch": 1.3740921497757177, + "grad_norm": 0.5372320335109962, + "learning_rate": 4.7116635477254336e-05, + "loss": 11.9224, + "step": 25234 + }, + { + "epoch": 1.3741466037723007, + "grad_norm": 0.6281229387489039, + "learning_rate": 4.710915140230255e-05, + "loss": 11.8337, + "step": 25235 + }, + { + "epoch": 1.3742010577688837, + "grad_norm": 0.6253712785976908, + "learning_rate": 4.710166773863885e-05, + "loss": 11.819, + "step": 25236 + }, + { + "epoch": 1.3742555117654667, + "grad_norm": 0.5015754396671326, + "learning_rate": 4.7094184486321476e-05, + "loss": 11.97, + "step": 25237 + }, + { + "epoch": 1.3743099657620497, + "grad_norm": 0.5246357859823589, + "learning_rate": 4.708670164540857e-05, + "loss": 11.921, + "step": 25238 + }, + { + "epoch": 1.3743644197586327, + "grad_norm": 0.5362420472950458, + "learning_rate": 4.707921921595838e-05, + "loss": 11.9285, + "step": 25239 + }, + { + "epoch": 1.3744188737552157, + "grad_norm": 0.5114109675130338, + "learning_rate": 4.707173719802902e-05, + "loss": 11.9729, + "step": 25240 + }, + { + "epoch": 1.3744733277517986, + "grad_norm": 0.5685769139154091, + "learning_rate": 4.70642555916787e-05, + "loss": 12.0068, + "step": 25241 + }, + { + "epoch": 1.3745277817483816, + "grad_norm": 0.5272460377539192, + "learning_rate": 4.705677439696565e-05, + "loss": 11.8695, + "step": 25242 + }, + { + "epoch": 1.3745822357449646, + "grad_norm": 0.5174468059830823, + "learning_rate": 4.704929361394795e-05, + "loss": 11.8628, + "step": 25243 + }, + { + "epoch": 1.3746366897415476, + "grad_norm": 0.5454333648435192, + "learning_rate": 4.7041813242683874e-05, + "loss": 11.8649, + "step": 25244 + }, + { + "epoch": 1.3746911437381306, + "grad_norm": 0.5325871754894, + "learning_rate": 4.703433328323155e-05, + "loss": 11.9209, + "step": 25245 + }, + { + "epoch": 1.3747455977347136, + "grad_norm": 0.5632304771927739, + "learning_rate": 4.702685373564907e-05, + "loss": 11.95, + "step": 25246 + }, + { + "epoch": 1.3748000517312968, + "grad_norm": 0.5144207844755943, + "learning_rate": 4.701937459999471e-05, + "loss": 11.9093, + "step": 25247 + }, + { + "epoch": 1.3748545057278798, + "grad_norm": 0.5334950450649695, + "learning_rate": 4.701189587632654e-05, + "loss": 11.9731, + "step": 25248 + }, + { + "epoch": 1.3749089597244628, + "grad_norm": 0.5165965965441954, + "learning_rate": 4.7004417564702785e-05, + "loss": 11.7983, + "step": 25249 + }, + { + "epoch": 1.3749634137210458, + "grad_norm": 0.5635104825476014, + "learning_rate": 4.699693966518154e-05, + "loss": 11.9499, + "step": 25250 + }, + { + "epoch": 1.3750178677176288, + "grad_norm": 0.5991284988665184, + "learning_rate": 4.698946217782101e-05, + "loss": 11.8144, + "step": 25251 + }, + { + "epoch": 1.3750723217142118, + "grad_norm": 0.5224801381198569, + "learning_rate": 4.698198510267928e-05, + "loss": 11.6778, + "step": 25252 + }, + { + "epoch": 1.3751267757107948, + "grad_norm": 0.5569192678137207, + "learning_rate": 4.6974508439814523e-05, + "loss": 11.8938, + "step": 25253 + }, + { + "epoch": 1.3751812297073778, + "grad_norm": 0.6074234566892095, + "learning_rate": 4.6967032189284955e-05, + "loss": 11.9493, + "step": 25254 + }, + { + "epoch": 1.3752356837039608, + "grad_norm": 0.5195477621291372, + "learning_rate": 4.695955635114856e-05, + "loss": 11.9062, + "step": 25255 + }, + { + "epoch": 1.375290137700544, + "grad_norm": 0.5788947863090088, + "learning_rate": 4.695208092546355e-05, + "loss": 11.9226, + "step": 25256 + }, + { + "epoch": 1.375344591697127, + "grad_norm": 0.5659844363225056, + "learning_rate": 4.69446059122881e-05, + "loss": 11.6769, + "step": 25257 + }, + { + "epoch": 1.37539904569371, + "grad_norm": 0.5824028162796828, + "learning_rate": 4.693713131168024e-05, + "loss": 11.9505, + "step": 25258 + }, + { + "epoch": 1.375453499690293, + "grad_norm": 0.581703679716766, + "learning_rate": 4.69296571236982e-05, + "loss": 11.8626, + "step": 25259 + }, + { + "epoch": 1.375507953686876, + "grad_norm": 0.5528773091074491, + "learning_rate": 4.6922183348399996e-05, + "loss": 11.8795, + "step": 25260 + }, + { + "epoch": 1.375562407683459, + "grad_norm": 0.5539793709418636, + "learning_rate": 4.6914709985843844e-05, + "loss": 12.0209, + "step": 25261 + }, + { + "epoch": 1.375616861680042, + "grad_norm": 0.557919650522969, + "learning_rate": 4.6907237036087756e-05, + "loss": 11.9375, + "step": 25262 + }, + { + "epoch": 1.375671315676625, + "grad_norm": 0.5279663707755494, + "learning_rate": 4.689976449918991e-05, + "loss": 11.8639, + "step": 25263 + }, + { + "epoch": 1.375725769673208, + "grad_norm": 0.524774750805883, + "learning_rate": 4.6892292375208467e-05, + "loss": 11.8669, + "step": 25264 + }, + { + "epoch": 1.375780223669791, + "grad_norm": 0.5125114864252607, + "learning_rate": 4.6884820664201404e-05, + "loss": 11.7979, + "step": 25265 + }, + { + "epoch": 1.375834677666374, + "grad_norm": 0.5581420293347898, + "learning_rate": 4.6877349366226906e-05, + "loss": 11.7847, + "step": 25266 + }, + { + "epoch": 1.375889131662957, + "grad_norm": 0.5485033093475268, + "learning_rate": 4.686987848134301e-05, + "loss": 11.9629, + "step": 25267 + }, + { + "epoch": 1.37594358565954, + "grad_norm": 0.5842899444584857, + "learning_rate": 4.686240800960786e-05, + "loss": 11.8865, + "step": 25268 + }, + { + "epoch": 1.3759980396561229, + "grad_norm": 0.5167122307149978, + "learning_rate": 4.6854937951079566e-05, + "loss": 11.9847, + "step": 25269 + }, + { + "epoch": 1.376052493652706, + "grad_norm": 0.5384273414088226, + "learning_rate": 4.6847468305816144e-05, + "loss": 11.9743, + "step": 25270 + }, + { + "epoch": 1.376106947649289, + "grad_norm": 0.537810582793952, + "learning_rate": 4.683999907387577e-05, + "loss": 11.9524, + "step": 25271 + }, + { + "epoch": 1.376161401645872, + "grad_norm": 0.5046327608039041, + "learning_rate": 4.683253025531644e-05, + "loss": 11.8534, + "step": 25272 + }, + { + "epoch": 1.376215855642455, + "grad_norm": 0.529295704030328, + "learning_rate": 4.6825061850196304e-05, + "loss": 11.8482, + "step": 25273 + }, + { + "epoch": 1.376270309639038, + "grad_norm": 0.6293232892194328, + "learning_rate": 4.68175938585734e-05, + "loss": 12.0959, + "step": 25274 + }, + { + "epoch": 1.376324763635621, + "grad_norm": 0.5299306994355352, + "learning_rate": 4.681012628050578e-05, + "loss": 11.8061, + "step": 25275 + }, + { + "epoch": 1.376379217632204, + "grad_norm": 0.5470687913712922, + "learning_rate": 4.680265911605157e-05, + "loss": 11.961, + "step": 25276 + }, + { + "epoch": 1.376433671628787, + "grad_norm": 0.6534864760950344, + "learning_rate": 4.679519236526877e-05, + "loss": 11.8893, + "step": 25277 + }, + { + "epoch": 1.37648812562537, + "grad_norm": 0.5708609657460811, + "learning_rate": 4.678772602821547e-05, + "loss": 11.8014, + "step": 25278 + }, + { + "epoch": 1.3765425796219533, + "grad_norm": 0.515604461607012, + "learning_rate": 4.678026010494977e-05, + "loss": 11.9886, + "step": 25279 + }, + { + "epoch": 1.3765970336185362, + "grad_norm": 0.5606195302778902, + "learning_rate": 4.6772794595529665e-05, + "loss": 11.8297, + "step": 25280 + }, + { + "epoch": 1.3766514876151192, + "grad_norm": 0.6045793687423382, + "learning_rate": 4.676532950001327e-05, + "loss": 11.8623, + "step": 25281 + }, + { + "epoch": 1.3767059416117022, + "grad_norm": 0.5499582869822852, + "learning_rate": 4.6757864818458565e-05, + "loss": 11.9649, + "step": 25282 + }, + { + "epoch": 1.3767603956082852, + "grad_norm": 0.586820199174203, + "learning_rate": 4.675040055092366e-05, + "loss": 11.8915, + "step": 25283 + }, + { + "epoch": 1.3768148496048682, + "grad_norm": 0.5161861001214637, + "learning_rate": 4.6742936697466574e-05, + "loss": 11.8473, + "step": 25284 + }, + { + "epoch": 1.3768693036014512, + "grad_norm": 0.5552569018298428, + "learning_rate": 4.673547325814531e-05, + "loss": 11.8344, + "step": 25285 + }, + { + "epoch": 1.3769237575980342, + "grad_norm": 0.5176177753705812, + "learning_rate": 4.672801023301797e-05, + "loss": 11.9716, + "step": 25286 + }, + { + "epoch": 1.3769782115946172, + "grad_norm": 0.530667070775309, + "learning_rate": 4.672054762214253e-05, + "loss": 11.8369, + "step": 25287 + }, + { + "epoch": 1.3770326655912002, + "grad_norm": 0.5371237937745807, + "learning_rate": 4.671308542557707e-05, + "loss": 11.8416, + "step": 25288 + }, + { + "epoch": 1.3770871195877832, + "grad_norm": 0.5502273005808457, + "learning_rate": 4.670562364337957e-05, + "loss": 11.7173, + "step": 25289 + }, + { + "epoch": 1.3771415735843662, + "grad_norm": 0.5168596563174641, + "learning_rate": 4.669816227560807e-05, + "loss": 11.8472, + "step": 25290 + }, + { + "epoch": 1.3771960275809492, + "grad_norm": 0.5011662809415851, + "learning_rate": 4.669070132232063e-05, + "loss": 11.9003, + "step": 25291 + }, + { + "epoch": 1.3772504815775322, + "grad_norm": 0.4688201579640735, + "learning_rate": 4.668324078357521e-05, + "loss": 11.8583, + "step": 25292 + }, + { + "epoch": 1.3773049355741154, + "grad_norm": 0.5398657184161583, + "learning_rate": 4.667578065942989e-05, + "loss": 11.8276, + "step": 25293 + }, + { + "epoch": 1.3773593895706984, + "grad_norm": 0.5140614422749251, + "learning_rate": 4.666832094994259e-05, + "loss": 11.7812, + "step": 25294 + }, + { + "epoch": 1.3774138435672814, + "grad_norm": 0.5605245670159629, + "learning_rate": 4.666086165517142e-05, + "loss": 11.984, + "step": 25295 + }, + { + "epoch": 1.3774682975638644, + "grad_norm": 0.5837910327272393, + "learning_rate": 4.665340277517434e-05, + "loss": 11.8923, + "step": 25296 + }, + { + "epoch": 1.3775227515604473, + "grad_norm": 0.5181840213998935, + "learning_rate": 4.6645944310009295e-05, + "loss": 11.8295, + "step": 25297 + }, + { + "epoch": 1.3775772055570303, + "grad_norm": 0.5549549871853204, + "learning_rate": 4.663848625973438e-05, + "loss": 11.8531, + "step": 25298 + }, + { + "epoch": 1.3776316595536133, + "grad_norm": 0.5703559806526367, + "learning_rate": 4.66310286244075e-05, + "loss": 11.8954, + "step": 25299 + }, + { + "epoch": 1.3776861135501963, + "grad_norm": 0.5370731923959247, + "learning_rate": 4.662357140408673e-05, + "loss": 11.915, + "step": 25300 + }, + { + "epoch": 1.3777405675467793, + "grad_norm": 0.544693483049611, + "learning_rate": 4.6616114598829994e-05, + "loss": 11.8466, + "step": 25301 + }, + { + "epoch": 1.3777950215433625, + "grad_norm": 0.5423966923917084, + "learning_rate": 4.660865820869529e-05, + "loss": 11.8965, + "step": 25302 + }, + { + "epoch": 1.3778494755399455, + "grad_norm": 0.6431719711641443, + "learning_rate": 4.660120223374066e-05, + "loss": 11.924, + "step": 25303 + }, + { + "epoch": 1.3779039295365285, + "grad_norm": 0.5159256627432793, + "learning_rate": 4.6593746674023994e-05, + "loss": 11.8755, + "step": 25304 + }, + { + "epoch": 1.3779583835331115, + "grad_norm": 0.5051576987891798, + "learning_rate": 4.658629152960335e-05, + "loss": 11.8992, + "step": 25305 + }, + { + "epoch": 1.3780128375296945, + "grad_norm": 0.5555260791708165, + "learning_rate": 4.657883680053666e-05, + "loss": 11.9193, + "step": 25306 + }, + { + "epoch": 1.3780672915262775, + "grad_norm": 0.605734829380128, + "learning_rate": 4.657138248688185e-05, + "loss": 11.8852, + "step": 25307 + }, + { + "epoch": 1.3781217455228605, + "grad_norm": 0.5981299656884339, + "learning_rate": 4.656392858869698e-05, + "loss": 11.737, + "step": 25308 + }, + { + "epoch": 1.3781761995194435, + "grad_norm": 0.5605431575308834, + "learning_rate": 4.655647510603991e-05, + "loss": 11.8532, + "step": 25309 + }, + { + "epoch": 1.3782306535160265, + "grad_norm": 0.6098572930186241, + "learning_rate": 4.6549022038968704e-05, + "loss": 11.8812, + "step": 25310 + }, + { + "epoch": 1.3782851075126095, + "grad_norm": 0.5475993808226356, + "learning_rate": 4.654156938754122e-05, + "loss": 11.8025, + "step": 25311 + }, + { + "epoch": 1.3783395615091925, + "grad_norm": 0.6035442108574312, + "learning_rate": 4.653411715181546e-05, + "loss": 11.8948, + "step": 25312 + }, + { + "epoch": 1.3783940155057754, + "grad_norm": 0.6582446198534526, + "learning_rate": 4.65266653318494e-05, + "loss": 11.9297, + "step": 25313 + }, + { + "epoch": 1.3784484695023584, + "grad_norm": 0.5647195182100125, + "learning_rate": 4.651921392770093e-05, + "loss": 11.7704, + "step": 25314 + }, + { + "epoch": 1.3785029234989414, + "grad_norm": 0.5096170369971617, + "learning_rate": 4.651176293942811e-05, + "loss": 11.884, + "step": 25315 + }, + { + "epoch": 1.3785573774955244, + "grad_norm": 0.5158635753513805, + "learning_rate": 4.65043123670887e-05, + "loss": 11.9498, + "step": 25316 + }, + { + "epoch": 1.3786118314921076, + "grad_norm": 0.5335481980621415, + "learning_rate": 4.649686221074072e-05, + "loss": 11.9132, + "step": 25317 + }, + { + "epoch": 1.3786662854886906, + "grad_norm": 0.6399638169629862, + "learning_rate": 4.648941247044216e-05, + "loss": 11.9258, + "step": 25318 + }, + { + "epoch": 1.3787207394852736, + "grad_norm": 0.6079666643576195, + "learning_rate": 4.648196314625086e-05, + "loss": 11.9187, + "step": 25319 + }, + { + "epoch": 1.3787751934818566, + "grad_norm": 0.5271068845853558, + "learning_rate": 4.647451423822484e-05, + "loss": 11.8874, + "step": 25320 + }, + { + "epoch": 1.3788296474784396, + "grad_norm": 0.5557132555366495, + "learning_rate": 4.6467065746421925e-05, + "loss": 11.8037, + "step": 25321 + }, + { + "epoch": 1.3788841014750226, + "grad_norm": 0.6228311442435652, + "learning_rate": 4.645961767090012e-05, + "loss": 12.0277, + "step": 25322 + }, + { + "epoch": 1.3789385554716056, + "grad_norm": 0.5514131563069321, + "learning_rate": 4.645217001171728e-05, + "loss": 11.9728, + "step": 25323 + }, + { + "epoch": 1.3789930094681886, + "grad_norm": 0.49910688224173033, + "learning_rate": 4.644472276893134e-05, + "loss": 11.8547, + "step": 25324 + }, + { + "epoch": 1.3790474634647718, + "grad_norm": 0.5045112355743566, + "learning_rate": 4.643727594260029e-05, + "loss": 11.9359, + "step": 25325 + }, + { + "epoch": 1.3791019174613548, + "grad_norm": 0.5641923857059223, + "learning_rate": 4.642982953278189e-05, + "loss": 12.0442, + "step": 25326 + }, + { + "epoch": 1.3791563714579378, + "grad_norm": 0.564463834160809, + "learning_rate": 4.642238353953412e-05, + "loss": 11.8735, + "step": 25327 + }, + { + "epoch": 1.3792108254545208, + "grad_norm": 0.570900474626188, + "learning_rate": 4.641493796291492e-05, + "loss": 11.9399, + "step": 25328 + }, + { + "epoch": 1.3792652794511038, + "grad_norm": 0.5315766014289637, + "learning_rate": 4.64074928029821e-05, + "loss": 11.8108, + "step": 25329 + }, + { + "epoch": 1.3793197334476868, + "grad_norm": 0.5351653281513066, + "learning_rate": 4.6400048059793656e-05, + "loss": 11.9468, + "step": 25330 + }, + { + "epoch": 1.3793741874442698, + "grad_norm": 0.5941142858822028, + "learning_rate": 4.639260373340738e-05, + "loss": 11.8205, + "step": 25331 + }, + { + "epoch": 1.3794286414408528, + "grad_norm": 0.5827751609890155, + "learning_rate": 4.638515982388125e-05, + "loss": 11.8947, + "step": 25332 + }, + { + "epoch": 1.3794830954374357, + "grad_norm": 0.5521555797331066, + "learning_rate": 4.6377716331273066e-05, + "loss": 11.909, + "step": 25333 + }, + { + "epoch": 1.3795375494340187, + "grad_norm": 0.5263839897161878, + "learning_rate": 4.637027325564076e-05, + "loss": 11.8809, + "step": 25334 + }, + { + "epoch": 1.3795920034306017, + "grad_norm": 0.5406684908264028, + "learning_rate": 4.636283059704227e-05, + "loss": 11.9414, + "step": 25335 + }, + { + "epoch": 1.3796464574271847, + "grad_norm": 0.5338849962676493, + "learning_rate": 4.635538835553533e-05, + "loss": 11.8371, + "step": 25336 + }, + { + "epoch": 1.3797009114237677, + "grad_norm": 0.5699664733897476, + "learning_rate": 4.6347946531177935e-05, + "loss": 11.9682, + "step": 25337 + }, + { + "epoch": 1.3797553654203507, + "grad_norm": 0.5638315708518855, + "learning_rate": 4.634050512402786e-05, + "loss": 11.8624, + "step": 25338 + }, + { + "epoch": 1.3798098194169337, + "grad_norm": 0.502264483104613, + "learning_rate": 4.633306413414301e-05, + "loss": 11.8048, + "step": 25339 + }, + { + "epoch": 1.379864273413517, + "grad_norm": 0.5765177970339285, + "learning_rate": 4.63256235615813e-05, + "loss": 11.8838, + "step": 25340 + }, + { + "epoch": 1.3799187274101, + "grad_norm": 0.5291928412229079, + "learning_rate": 4.631818340640049e-05, + "loss": 11.7628, + "step": 25341 + }, + { + "epoch": 1.379973181406683, + "grad_norm": 0.5935222709828194, + "learning_rate": 4.631074366865855e-05, + "loss": 11.9262, + "step": 25342 + }, + { + "epoch": 1.380027635403266, + "grad_norm": 0.5485942271044094, + "learning_rate": 4.630330434841321e-05, + "loss": 11.8891, + "step": 25343 + }, + { + "epoch": 1.3800820893998489, + "grad_norm": 0.6240681136806207, + "learning_rate": 4.629586544572243e-05, + "loss": 12.0713, + "step": 25344 + }, + { + "epoch": 1.3801365433964319, + "grad_norm": 0.5251818277570874, + "learning_rate": 4.6288426960644006e-05, + "loss": 11.9282, + "step": 25345 + }, + { + "epoch": 1.3801909973930149, + "grad_norm": 0.6202449369655912, + "learning_rate": 4.628098889323574e-05, + "loss": 12.031, + "step": 25346 + }, + { + "epoch": 1.3802454513895979, + "grad_norm": 0.5865949498063342, + "learning_rate": 4.627355124355556e-05, + "loss": 11.914, + "step": 25347 + }, + { + "epoch": 1.3802999053861809, + "grad_norm": 0.6780010259118175, + "learning_rate": 4.626611401166121e-05, + "loss": 11.8229, + "step": 25348 + }, + { + "epoch": 1.380354359382764, + "grad_norm": 0.5327655092422529, + "learning_rate": 4.62586771976106e-05, + "loss": 11.9316, + "step": 25349 + }, + { + "epoch": 1.380408813379347, + "grad_norm": 0.6142761650187462, + "learning_rate": 4.62512408014615e-05, + "loss": 11.9944, + "step": 25350 + }, + { + "epoch": 1.38046326737593, + "grad_norm": 0.5161875552161428, + "learning_rate": 4.6243804823271766e-05, + "loss": 11.8983, + "step": 25351 + }, + { + "epoch": 1.380517721372513, + "grad_norm": 0.5663489760836916, + "learning_rate": 4.6236369263099254e-05, + "loss": 11.8626, + "step": 25352 + }, + { + "epoch": 1.380572175369096, + "grad_norm": 0.5621841637946295, + "learning_rate": 4.622893412100171e-05, + "loss": 12.046, + "step": 25353 + }, + { + "epoch": 1.380626629365679, + "grad_norm": 0.5500645122406784, + "learning_rate": 4.622149939703704e-05, + "loss": 11.7661, + "step": 25354 + }, + { + "epoch": 1.380681083362262, + "grad_norm": 0.5699335645189085, + "learning_rate": 4.6214065091263e-05, + "loss": 11.8789, + "step": 25355 + }, + { + "epoch": 1.380735537358845, + "grad_norm": 0.5879388378839406, + "learning_rate": 4.620663120373738e-05, + "loss": 11.9825, + "step": 25356 + }, + { + "epoch": 1.380789991355428, + "grad_norm": 0.6442632725935946, + "learning_rate": 4.619919773451805e-05, + "loss": 11.9544, + "step": 25357 + }, + { + "epoch": 1.380844445352011, + "grad_norm": 0.5388417844431643, + "learning_rate": 4.6191764683662744e-05, + "loss": 11.9933, + "step": 25358 + }, + { + "epoch": 1.380898899348594, + "grad_norm": 0.5419684114350882, + "learning_rate": 4.618433205122933e-05, + "loss": 11.987, + "step": 25359 + }, + { + "epoch": 1.380953353345177, + "grad_norm": 0.5144860434373913, + "learning_rate": 4.617689983727555e-05, + "loss": 11.9195, + "step": 25360 + }, + { + "epoch": 1.38100780734176, + "grad_norm": 0.5200277033184444, + "learning_rate": 4.616946804185921e-05, + "loss": 11.9442, + "step": 25361 + }, + { + "epoch": 1.381062261338343, + "grad_norm": 0.5905218274944044, + "learning_rate": 4.6162036665038155e-05, + "loss": 11.8349, + "step": 25362 + }, + { + "epoch": 1.3811167153349262, + "grad_norm": 0.5034152148165404, + "learning_rate": 4.61546057068701e-05, + "loss": 11.8981, + "step": 25363 + }, + { + "epoch": 1.3811711693315092, + "grad_norm": 0.5545798446381608, + "learning_rate": 4.614717516741289e-05, + "loss": 11.9117, + "step": 25364 + }, + { + "epoch": 1.3812256233280922, + "grad_norm": 0.5808052724832219, + "learning_rate": 4.6139745046724294e-05, + "loss": 11.8981, + "step": 25365 + }, + { + "epoch": 1.3812800773246752, + "grad_norm": 0.5312349226885734, + "learning_rate": 4.613231534486202e-05, + "loss": 11.8773, + "step": 25366 + }, + { + "epoch": 1.3813345313212582, + "grad_norm": 0.5482902407992518, + "learning_rate": 4.6124886061883934e-05, + "loss": 11.963, + "step": 25367 + }, + { + "epoch": 1.3813889853178412, + "grad_norm": 0.5904775818451609, + "learning_rate": 4.6117457197847736e-05, + "loss": 11.9922, + "step": 25368 + }, + { + "epoch": 1.3814434393144241, + "grad_norm": 0.5438179334951186, + "learning_rate": 4.6110028752811266e-05, + "loss": 11.934, + "step": 25369 + }, + { + "epoch": 1.3814978933110071, + "grad_norm": 0.5615586565282236, + "learning_rate": 4.6102600726832204e-05, + "loss": 11.8404, + "step": 25370 + }, + { + "epoch": 1.3815523473075901, + "grad_norm": 0.5757624633721171, + "learning_rate": 4.609517311996839e-05, + "loss": 11.9536, + "step": 25371 + }, + { + "epoch": 1.3816068013041733, + "grad_norm": 0.5315687733262756, + "learning_rate": 4.608774593227753e-05, + "loss": 11.7931, + "step": 25372 + }, + { + "epoch": 1.3816612553007563, + "grad_norm": 0.510926070633032, + "learning_rate": 4.608031916381739e-05, + "loss": 11.7279, + "step": 25373 + }, + { + "epoch": 1.3817157092973393, + "grad_norm": 0.5413068769685286, + "learning_rate": 4.607289281464578e-05, + "loss": 11.9082, + "step": 25374 + }, + { + "epoch": 1.3817701632939223, + "grad_norm": 0.5720049337632385, + "learning_rate": 4.6065466884820376e-05, + "loss": 11.8959, + "step": 25375 + }, + { + "epoch": 1.3818246172905053, + "grad_norm": 0.5415679879386224, + "learning_rate": 4.605804137439892e-05, + "loss": 11.702, + "step": 25376 + }, + { + "epoch": 1.3818790712870883, + "grad_norm": 0.519743435854354, + "learning_rate": 4.605061628343922e-05, + "loss": 11.9134, + "step": 25377 + }, + { + "epoch": 1.3819335252836713, + "grad_norm": 0.5362190930944459, + "learning_rate": 4.604319161199894e-05, + "loss": 11.9347, + "step": 25378 + }, + { + "epoch": 1.3819879792802543, + "grad_norm": 0.5978464665510768, + "learning_rate": 4.603576736013587e-05, + "loss": 11.8299, + "step": 25379 + }, + { + "epoch": 1.3820424332768373, + "grad_norm": 0.601051854487191, + "learning_rate": 4.60283435279077e-05, + "loss": 11.8637, + "step": 25380 + }, + { + "epoch": 1.3820968872734203, + "grad_norm": 0.5364435772542022, + "learning_rate": 4.602092011537222e-05, + "loss": 11.9527, + "step": 25381 + }, + { + "epoch": 1.3821513412700033, + "grad_norm": 0.5385852669288471, + "learning_rate": 4.601349712258708e-05, + "loss": 11.8286, + "step": 25382 + }, + { + "epoch": 1.3822057952665863, + "grad_norm": 0.5261441609401453, + "learning_rate": 4.600607454961004e-05, + "loss": 11.8859, + "step": 25383 + }, + { + "epoch": 1.3822602492631693, + "grad_norm": 0.5543163456264253, + "learning_rate": 4.599865239649885e-05, + "loss": 11.8436, + "step": 25384 + }, + { + "epoch": 1.3823147032597523, + "grad_norm": 0.5696395854086732, + "learning_rate": 4.59912306633112e-05, + "loss": 11.8876, + "step": 25385 + }, + { + "epoch": 1.3823691572563352, + "grad_norm": 0.5372364461671898, + "learning_rate": 4.59838093501048e-05, + "loss": 11.7971, + "step": 25386 + }, + { + "epoch": 1.3824236112529185, + "grad_norm": 0.5336524515688087, + "learning_rate": 4.597638845693733e-05, + "loss": 11.9275, + "step": 25387 + }, + { + "epoch": 1.3824780652495015, + "grad_norm": 0.5475560823657954, + "learning_rate": 4.5968967983866495e-05, + "loss": 11.8402, + "step": 25388 + }, + { + "epoch": 1.3825325192460844, + "grad_norm": 0.5369080152016062, + "learning_rate": 4.5961547930950086e-05, + "loss": 11.911, + "step": 25389 + }, + { + "epoch": 1.3825869732426674, + "grad_norm": 0.5314198399045027, + "learning_rate": 4.595412829824569e-05, + "loss": 11.819, + "step": 25390 + }, + { + "epoch": 1.3826414272392504, + "grad_norm": 0.5615156119415065, + "learning_rate": 4.59467090858111e-05, + "loss": 11.8952, + "step": 25391 + }, + { + "epoch": 1.3826958812358334, + "grad_norm": 0.49312984592688724, + "learning_rate": 4.5939290293703926e-05, + "loss": 11.7849, + "step": 25392 + }, + { + "epoch": 1.3827503352324164, + "grad_norm": 0.5322878051254277, + "learning_rate": 4.593187192198195e-05, + "loss": 11.9094, + "step": 25393 + }, + { + "epoch": 1.3828047892289994, + "grad_norm": 0.5838399952260754, + "learning_rate": 4.5924453970702755e-05, + "loss": 11.9369, + "step": 25394 + }, + { + "epoch": 1.3828592432255826, + "grad_norm": 0.5053029548424226, + "learning_rate": 4.591703643992411e-05, + "loss": 11.7684, + "step": 25395 + }, + { + "epoch": 1.3829136972221656, + "grad_norm": 0.5615231519740667, + "learning_rate": 4.5909619329703655e-05, + "loss": 11.9489, + "step": 25396 + }, + { + "epoch": 1.3829681512187486, + "grad_norm": 0.5421039509312398, + "learning_rate": 4.590220264009903e-05, + "loss": 11.9166, + "step": 25397 + }, + { + "epoch": 1.3830226052153316, + "grad_norm": 0.5251245200652788, + "learning_rate": 4.589478637116801e-05, + "loss": 11.7721, + "step": 25398 + }, + { + "epoch": 1.3830770592119146, + "grad_norm": 0.518286402281066, + "learning_rate": 4.588737052296815e-05, + "loss": 11.8004, + "step": 25399 + }, + { + "epoch": 1.3831315132084976, + "grad_norm": 0.5281743691607864, + "learning_rate": 4.587995509555717e-05, + "loss": 11.8995, + "step": 25400 + }, + { + "epoch": 1.3831859672050806, + "grad_norm": 0.5350429109521715, + "learning_rate": 4.587254008899278e-05, + "loss": 11.805, + "step": 25401 + }, + { + "epoch": 1.3832404212016636, + "grad_norm": 0.5528001096401002, + "learning_rate": 4.586512550333255e-05, + "loss": 11.9221, + "step": 25402 + }, + { + "epoch": 1.3832948751982466, + "grad_norm": 0.7356239817084284, + "learning_rate": 4.5857711338634235e-05, + "loss": 11.975, + "step": 25403 + }, + { + "epoch": 1.3833493291948296, + "grad_norm": 0.5268355168398967, + "learning_rate": 4.585029759495538e-05, + "loss": 11.951, + "step": 25404 + }, + { + "epoch": 1.3834037831914126, + "grad_norm": 0.5754927109925986, + "learning_rate": 4.5842884272353745e-05, + "loss": 12.0442, + "step": 25405 + }, + { + "epoch": 1.3834582371879955, + "grad_norm": 0.5323192851534655, + "learning_rate": 4.583547137088692e-05, + "loss": 11.9776, + "step": 25406 + }, + { + "epoch": 1.3835126911845785, + "grad_norm": 0.6187177938805167, + "learning_rate": 4.5828058890612516e-05, + "loss": 11.9483, + "step": 25407 + }, + { + "epoch": 1.3835671451811615, + "grad_norm": 0.5396065121654304, + "learning_rate": 4.582064683158823e-05, + "loss": 11.8958, + "step": 25408 + }, + { + "epoch": 1.3836215991777445, + "grad_norm": 0.5310752884032458, + "learning_rate": 4.5813235193871665e-05, + "loss": 11.8103, + "step": 25409 + }, + { + "epoch": 1.3836760531743277, + "grad_norm": 0.5781549138458341, + "learning_rate": 4.580582397752046e-05, + "loss": 12.0116, + "step": 25410 + }, + { + "epoch": 1.3837305071709107, + "grad_norm": 0.5857054738959128, + "learning_rate": 4.5798413182592305e-05, + "loss": 11.7495, + "step": 25411 + }, + { + "epoch": 1.3837849611674937, + "grad_norm": 0.5771792869696822, + "learning_rate": 4.5791002809144724e-05, + "loss": 11.9045, + "step": 25412 + }, + { + "epoch": 1.3838394151640767, + "grad_norm": 0.5173491338387444, + "learning_rate": 4.5783592857235444e-05, + "loss": 11.8329, + "step": 25413 + }, + { + "epoch": 1.3838938691606597, + "grad_norm": 0.6159445884622708, + "learning_rate": 4.577618332692199e-05, + "loss": 12.0177, + "step": 25414 + }, + { + "epoch": 1.3839483231572427, + "grad_norm": 0.6027146149136118, + "learning_rate": 4.576877421826208e-05, + "loss": 11.9161, + "step": 25415 + }, + { + "epoch": 1.3840027771538257, + "grad_norm": 0.5776180683335476, + "learning_rate": 4.576136553131327e-05, + "loss": 11.8918, + "step": 25416 + }, + { + "epoch": 1.3840572311504087, + "grad_norm": 0.5812378036134579, + "learning_rate": 4.575395726613314e-05, + "loss": 11.9817, + "step": 25417 + }, + { + "epoch": 1.3841116851469917, + "grad_norm": 0.5221685414790718, + "learning_rate": 4.574654942277937e-05, + "loss": 11.901, + "step": 25418 + }, + { + "epoch": 1.384166139143575, + "grad_norm": 0.5835118401881975, + "learning_rate": 4.57391420013095e-05, + "loss": 11.9185, + "step": 25419 + }, + { + "epoch": 1.3842205931401579, + "grad_norm": 0.5535971253639945, + "learning_rate": 4.573173500178119e-05, + "loss": 11.8291, + "step": 25420 + }, + { + "epoch": 1.3842750471367409, + "grad_norm": 0.5632179748324755, + "learning_rate": 4.5724328424251985e-05, + "loss": 11.932, + "step": 25421 + }, + { + "epoch": 1.3843295011333239, + "grad_norm": 0.565666884742778, + "learning_rate": 4.5716922268779495e-05, + "loss": 11.8911, + "step": 25422 + }, + { + "epoch": 1.3843839551299069, + "grad_norm": 0.567112815335102, + "learning_rate": 4.570951653542136e-05, + "loss": 12.0588, + "step": 25423 + }, + { + "epoch": 1.3844384091264899, + "grad_norm": 0.5168100326162838, + "learning_rate": 4.570211122423509e-05, + "loss": 11.8124, + "step": 25424 + }, + { + "epoch": 1.3844928631230728, + "grad_norm": 0.4863290188853976, + "learning_rate": 4.5694706335278346e-05, + "loss": 11.8577, + "step": 25425 + }, + { + "epoch": 1.3845473171196558, + "grad_norm": 0.6330527557283268, + "learning_rate": 4.568730186860867e-05, + "loss": 12.1193, + "step": 25426 + }, + { + "epoch": 1.3846017711162388, + "grad_norm": 0.5254447852363948, + "learning_rate": 4.5679897824283615e-05, + "loss": 11.9341, + "step": 25427 + }, + { + "epoch": 1.3846562251128218, + "grad_norm": 0.6192901503995081, + "learning_rate": 4.56724942023608e-05, + "loss": 11.846, + "step": 25428 + }, + { + "epoch": 1.3847106791094048, + "grad_norm": 0.5601019083569937, + "learning_rate": 4.566509100289777e-05, + "loss": 12.0017, + "step": 25429 + }, + { + "epoch": 1.3847651331059878, + "grad_norm": 0.6557075919599537, + "learning_rate": 4.565768822595213e-05, + "loss": 11.9712, + "step": 25430 + }, + { + "epoch": 1.3848195871025708, + "grad_norm": 0.5268907779907597, + "learning_rate": 4.5650285871581376e-05, + "loss": 11.873, + "step": 25431 + }, + { + "epoch": 1.3848740410991538, + "grad_norm": 0.623219367007664, + "learning_rate": 4.564288393984313e-05, + "loss": 11.9674, + "step": 25432 + }, + { + "epoch": 1.384928495095737, + "grad_norm": 0.5609954238970658, + "learning_rate": 4.563548243079495e-05, + "loss": 11.8812, + "step": 25433 + }, + { + "epoch": 1.38498294909232, + "grad_norm": 0.5168749717472204, + "learning_rate": 4.562808134449436e-05, + "loss": 11.9109, + "step": 25434 + }, + { + "epoch": 1.385037403088903, + "grad_norm": 0.545321160679276, + "learning_rate": 4.562068068099895e-05, + "loss": 11.8895, + "step": 25435 + }, + { + "epoch": 1.385091857085486, + "grad_norm": 0.6123168173423389, + "learning_rate": 4.561328044036625e-05, + "loss": 11.9163, + "step": 25436 + }, + { + "epoch": 1.385146311082069, + "grad_norm": 0.5544430228111364, + "learning_rate": 4.5605880622653766e-05, + "loss": 11.8053, + "step": 25437 + }, + { + "epoch": 1.385200765078652, + "grad_norm": 0.5243571797546914, + "learning_rate": 4.559848122791911e-05, + "loss": 12.0372, + "step": 25438 + }, + { + "epoch": 1.385255219075235, + "grad_norm": 0.5226354727460679, + "learning_rate": 4.559108225621975e-05, + "loss": 11.8838, + "step": 25439 + }, + { + "epoch": 1.385309673071818, + "grad_norm": 0.5399147634701176, + "learning_rate": 4.55836837076133e-05, + "loss": 11.9785, + "step": 25440 + }, + { + "epoch": 1.385364127068401, + "grad_norm": 0.5556679512831352, + "learning_rate": 4.557628558215722e-05, + "loss": 11.7897, + "step": 25441 + }, + { + "epoch": 1.3854185810649842, + "grad_norm": 0.5826198731737683, + "learning_rate": 4.5568887879909096e-05, + "loss": 11.6895, + "step": 25442 + }, + { + "epoch": 1.3854730350615672, + "grad_norm": 0.5721627565551967, + "learning_rate": 4.556149060092639e-05, + "loss": 11.9325, + "step": 25443 + }, + { + "epoch": 1.3855274890581502, + "grad_norm": 0.627853917146858, + "learning_rate": 4.555409374526668e-05, + "loss": 11.8464, + "step": 25444 + }, + { + "epoch": 1.3855819430547331, + "grad_norm": 0.5387585089526937, + "learning_rate": 4.5546697312987484e-05, + "loss": 11.9004, + "step": 25445 + }, + { + "epoch": 1.3856363970513161, + "grad_norm": 0.5306238776071194, + "learning_rate": 4.553930130414631e-05, + "loss": 11.9283, + "step": 25446 + }, + { + "epoch": 1.3856908510478991, + "grad_norm": 0.5280528107632034, + "learning_rate": 4.5531905718800627e-05, + "loss": 11.858, + "step": 25447 + }, + { + "epoch": 1.3857453050444821, + "grad_norm": 0.5283147579328331, + "learning_rate": 4.5524510557008014e-05, + "loss": 11.7974, + "step": 25448 + }, + { + "epoch": 1.3857997590410651, + "grad_norm": 0.5155778636904704, + "learning_rate": 4.551711581882591e-05, + "loss": 11.8523, + "step": 25449 + }, + { + "epoch": 1.385854213037648, + "grad_norm": 0.5445239226906861, + "learning_rate": 4.5509721504311877e-05, + "loss": 11.909, + "step": 25450 + }, + { + "epoch": 1.385908667034231, + "grad_norm": 0.5471540714237298, + "learning_rate": 4.550232761352335e-05, + "loss": 11.9284, + "step": 25451 + }, + { + "epoch": 1.385963121030814, + "grad_norm": 0.7100869057455694, + "learning_rate": 4.5494934146517906e-05, + "loss": 12.0101, + "step": 25452 + }, + { + "epoch": 1.386017575027397, + "grad_norm": 0.5891335356799535, + "learning_rate": 4.548754110335297e-05, + "loss": 11.8092, + "step": 25453 + }, + { + "epoch": 1.38607202902398, + "grad_norm": 0.5924728538463313, + "learning_rate": 4.548014848408607e-05, + "loss": 11.883, + "step": 25454 + }, + { + "epoch": 1.386126483020563, + "grad_norm": 0.5510637713760189, + "learning_rate": 4.5472756288774656e-05, + "loss": 11.8986, + "step": 25455 + }, + { + "epoch": 1.386180937017146, + "grad_norm": 0.5799269518433228, + "learning_rate": 4.5465364517476275e-05, + "loss": 12.1175, + "step": 25456 + }, + { + "epoch": 1.3862353910137293, + "grad_norm": 0.500171227649584, + "learning_rate": 4.545797317024835e-05, + "loss": 11.8863, + "step": 25457 + }, + { + "epoch": 1.3862898450103123, + "grad_norm": 0.5859140594240135, + "learning_rate": 4.545058224714834e-05, + "loss": 11.8102, + "step": 25458 + }, + { + "epoch": 1.3863442990068953, + "grad_norm": 0.5806878009529896, + "learning_rate": 4.544319174823376e-05, + "loss": 11.9901, + "step": 25459 + }, + { + "epoch": 1.3863987530034783, + "grad_norm": 0.5696105103622016, + "learning_rate": 4.5435801673562096e-05, + "loss": 11.9582, + "step": 25460 + }, + { + "epoch": 1.3864532070000612, + "grad_norm": 0.5470078970069159, + "learning_rate": 4.542841202319076e-05, + "loss": 11.8457, + "step": 25461 + }, + { + "epoch": 1.3865076609966442, + "grad_norm": 0.6089801746010085, + "learning_rate": 4.542102279717727e-05, + "loss": 11.8616, + "step": 25462 + }, + { + "epoch": 1.3865621149932272, + "grad_norm": 0.5430565653973743, + "learning_rate": 4.541363399557903e-05, + "loss": 11.8596, + "step": 25463 + }, + { + "epoch": 1.3866165689898102, + "grad_norm": 0.5840183025092082, + "learning_rate": 4.540624561845356e-05, + "loss": 11.7728, + "step": 25464 + }, + { + "epoch": 1.3866710229863934, + "grad_norm": 0.5835431124442011, + "learning_rate": 4.5398857665858243e-05, + "loss": 11.8373, + "step": 25465 + }, + { + "epoch": 1.3867254769829764, + "grad_norm": 0.503293458535758, + "learning_rate": 4.53914701378506e-05, + "loss": 11.7757, + "step": 25466 + }, + { + "epoch": 1.3867799309795594, + "grad_norm": 0.5566424047691468, + "learning_rate": 4.538408303448804e-05, + "loss": 11.7665, + "step": 25467 + }, + { + "epoch": 1.3868343849761424, + "grad_norm": 0.6272897259346766, + "learning_rate": 4.537669635582799e-05, + "loss": 11.9857, + "step": 25468 + }, + { + "epoch": 1.3868888389727254, + "grad_norm": 0.6602007550727755, + "learning_rate": 4.5369310101927933e-05, + "loss": 11.8863, + "step": 25469 + }, + { + "epoch": 1.3869432929693084, + "grad_norm": 0.5598081238967012, + "learning_rate": 4.5361924272845246e-05, + "loss": 11.9909, + "step": 25470 + }, + { + "epoch": 1.3869977469658914, + "grad_norm": 0.5666801433389889, + "learning_rate": 4.5354538868637395e-05, + "loss": 11.8528, + "step": 25471 + }, + { + "epoch": 1.3870522009624744, + "grad_norm": 0.5043592237807832, + "learning_rate": 4.534715388936186e-05, + "loss": 11.9294, + "step": 25472 + }, + { + "epoch": 1.3871066549590574, + "grad_norm": 0.5636143197585487, + "learning_rate": 4.5339769335075986e-05, + "loss": 11.9502, + "step": 25473 + }, + { + "epoch": 1.3871611089556404, + "grad_norm": 0.5240660368865463, + "learning_rate": 4.533238520583726e-05, + "loss": 11.9469, + "step": 25474 + }, + { + "epoch": 1.3872155629522234, + "grad_norm": 0.5128105974233724, + "learning_rate": 4.532500150170305e-05, + "loss": 11.9357, + "step": 25475 + }, + { + "epoch": 1.3872700169488064, + "grad_norm": 0.5404955144886905, + "learning_rate": 4.531761822273082e-05, + "loss": 11.9008, + "step": 25476 + }, + { + "epoch": 1.3873244709453894, + "grad_norm": 0.5251588220746668, + "learning_rate": 4.531023536897797e-05, + "loss": 11.8544, + "step": 25477 + }, + { + "epoch": 1.3873789249419723, + "grad_norm": 0.5430300012882553, + "learning_rate": 4.530285294050186e-05, + "loss": 11.9175, + "step": 25478 + }, + { + "epoch": 1.3874333789385553, + "grad_norm": 0.5686834287718434, + "learning_rate": 4.5295470937359976e-05, + "loss": 12.0042, + "step": 25479 + }, + { + "epoch": 1.3874878329351386, + "grad_norm": 0.5858173784688535, + "learning_rate": 4.528808935960964e-05, + "loss": 11.9313, + "step": 25480 + }, + { + "epoch": 1.3875422869317215, + "grad_norm": 0.5593095346896827, + "learning_rate": 4.528070820730831e-05, + "loss": 11.8404, + "step": 25481 + }, + { + "epoch": 1.3875967409283045, + "grad_norm": 0.5063867083757851, + "learning_rate": 4.5273327480513395e-05, + "loss": 11.8159, + "step": 25482 + }, + { + "epoch": 1.3876511949248875, + "grad_norm": 0.6199157314152283, + "learning_rate": 4.526594717928223e-05, + "loss": 11.986, + "step": 25483 + }, + { + "epoch": 1.3877056489214705, + "grad_norm": 0.5319134109511767, + "learning_rate": 4.5258567303672286e-05, + "loss": 11.9429, + "step": 25484 + }, + { + "epoch": 1.3877601029180535, + "grad_norm": 0.583256809519084, + "learning_rate": 4.525118785374085e-05, + "loss": 11.8272, + "step": 25485 + }, + { + "epoch": 1.3878145569146365, + "grad_norm": 0.5252811924613185, + "learning_rate": 4.52438088295454e-05, + "loss": 11.9259, + "step": 25486 + }, + { + "epoch": 1.3878690109112195, + "grad_norm": 0.6129171070906239, + "learning_rate": 4.523643023114328e-05, + "loss": 11.8242, + "step": 25487 + }, + { + "epoch": 1.3879234649078025, + "grad_norm": 0.5827912163869879, + "learning_rate": 4.522905205859182e-05, + "loss": 12.0187, + "step": 25488 + }, + { + "epoch": 1.3879779189043857, + "grad_norm": 0.48887177592055336, + "learning_rate": 4.522167431194848e-05, + "loss": 11.9158, + "step": 25489 + }, + { + "epoch": 1.3880323729009687, + "grad_norm": 0.5907938633320445, + "learning_rate": 4.521429699127054e-05, + "loss": 11.9518, + "step": 25490 + }, + { + "epoch": 1.3880868268975517, + "grad_norm": 0.540746337114204, + "learning_rate": 4.5206920096615455e-05, + "loss": 11.9923, + "step": 25491 + }, + { + "epoch": 1.3881412808941347, + "grad_norm": 0.5476902998471366, + "learning_rate": 4.519954362804052e-05, + "loss": 11.8386, + "step": 25492 + }, + { + "epoch": 1.3881957348907177, + "grad_norm": 0.5363041472656379, + "learning_rate": 4.519216758560312e-05, + "loss": 11.8305, + "step": 25493 + }, + { + "epoch": 1.3882501888873007, + "grad_norm": 0.5791260720557126, + "learning_rate": 4.518479196936064e-05, + "loss": 11.9111, + "step": 25494 + }, + { + "epoch": 1.3883046428838837, + "grad_norm": 0.6012232839727382, + "learning_rate": 4.5177416779370386e-05, + "loss": 11.9442, + "step": 25495 + }, + { + "epoch": 1.3883590968804667, + "grad_norm": 0.559152603766884, + "learning_rate": 4.5170042015689765e-05, + "loss": 11.9554, + "step": 25496 + }, + { + "epoch": 1.3884135508770497, + "grad_norm": 0.5294207091331442, + "learning_rate": 4.5162667678376094e-05, + "loss": 11.8791, + "step": 25497 + }, + { + "epoch": 1.3884680048736326, + "grad_norm": 0.5144270454198645, + "learning_rate": 4.5155293767486686e-05, + "loss": 11.8537, + "step": 25498 + }, + { + "epoch": 1.3885224588702156, + "grad_norm": 0.5470857206995124, + "learning_rate": 4.5147920283078936e-05, + "loss": 11.8058, + "step": 25499 + }, + { + "epoch": 1.3885769128667986, + "grad_norm": 0.5702107730197208, + "learning_rate": 4.514054722521013e-05, + "loss": 11.9726, + "step": 25500 + }, + { + "epoch": 1.3886313668633816, + "grad_norm": 0.5766661552214912, + "learning_rate": 4.513317459393765e-05, + "loss": 11.8671, + "step": 25501 + }, + { + "epoch": 1.3886858208599646, + "grad_norm": 0.5540396004218782, + "learning_rate": 4.5125802389318785e-05, + "loss": 11.9179, + "step": 25502 + }, + { + "epoch": 1.3887402748565478, + "grad_norm": 0.5613776022593168, + "learning_rate": 4.511843061141091e-05, + "loss": 11.9476, + "step": 25503 + }, + { + "epoch": 1.3887947288531308, + "grad_norm": 0.524968666061809, + "learning_rate": 4.5111059260271294e-05, + "loss": 11.9534, + "step": 25504 + }, + { + "epoch": 1.3888491828497138, + "grad_norm": 0.5134529876106015, + "learning_rate": 4.5103688335957276e-05, + "loss": 11.8218, + "step": 25505 + }, + { + "epoch": 1.3889036368462968, + "grad_norm": 0.550685330143858, + "learning_rate": 4.509631783852626e-05, + "loss": 11.9059, + "step": 25506 + }, + { + "epoch": 1.3889580908428798, + "grad_norm": 0.5379171415541388, + "learning_rate": 4.508894776803542e-05, + "loss": 11.9005, + "step": 25507 + }, + { + "epoch": 1.3890125448394628, + "grad_norm": 0.5284340806057991, + "learning_rate": 4.5081578124542126e-05, + "loss": 11.952, + "step": 25508 + }, + { + "epoch": 1.3890669988360458, + "grad_norm": 0.5385966255964703, + "learning_rate": 4.507420890810372e-05, + "loss": 11.7745, + "step": 25509 + }, + { + "epoch": 1.3891214528326288, + "grad_norm": 0.513349547827117, + "learning_rate": 4.506684011877744e-05, + "loss": 11.8494, + "step": 25510 + }, + { + "epoch": 1.3891759068292118, + "grad_norm": 0.5573192302187278, + "learning_rate": 4.505947175662066e-05, + "loss": 11.959, + "step": 25511 + }, + { + "epoch": 1.389230360825795, + "grad_norm": 0.5856179498725235, + "learning_rate": 4.505210382169062e-05, + "loss": 12.0187, + "step": 25512 + }, + { + "epoch": 1.389284814822378, + "grad_norm": 0.5847391135503383, + "learning_rate": 4.504473631404465e-05, + "loss": 11.9218, + "step": 25513 + }, + { + "epoch": 1.389339268818961, + "grad_norm": 0.5553770178542953, + "learning_rate": 4.503736923374e-05, + "loss": 11.8982, + "step": 25514 + }, + { + "epoch": 1.389393722815544, + "grad_norm": 0.5086326480257842, + "learning_rate": 4.5030002580833985e-05, + "loss": 11.6923, + "step": 25515 + }, + { + "epoch": 1.389448176812127, + "grad_norm": 0.5409528637701126, + "learning_rate": 4.5022636355383966e-05, + "loss": 11.9011, + "step": 25516 + }, + { + "epoch": 1.38950263080871, + "grad_norm": 0.5161787951550454, + "learning_rate": 4.501527055744707e-05, + "loss": 11.8725, + "step": 25517 + }, + { + "epoch": 1.389557084805293, + "grad_norm": 0.5369030135149553, + "learning_rate": 4.500790518708068e-05, + "loss": 11.6291, + "step": 25518 + }, + { + "epoch": 1.389611538801876, + "grad_norm": 0.5515006321898562, + "learning_rate": 4.5000540244342015e-05, + "loss": 11.858, + "step": 25519 + }, + { + "epoch": 1.389665992798459, + "grad_norm": 0.6431150513188725, + "learning_rate": 4.4993175729288374e-05, + "loss": 11.751, + "step": 25520 + }, + { + "epoch": 1.389720446795042, + "grad_norm": 0.6076053213737065, + "learning_rate": 4.498581164197705e-05, + "loss": 12.0461, + "step": 25521 + }, + { + "epoch": 1.389774900791625, + "grad_norm": 0.46667096650716783, + "learning_rate": 4.4978447982465247e-05, + "loss": 11.7802, + "step": 25522 + }, + { + "epoch": 1.389829354788208, + "grad_norm": 0.5114626162584534, + "learning_rate": 4.49710847508103e-05, + "loss": 12.0273, + "step": 25523 + }, + { + "epoch": 1.389883808784791, + "grad_norm": 0.6418240926117168, + "learning_rate": 4.4963721947069383e-05, + "loss": 11.9599, + "step": 25524 + }, + { + "epoch": 1.389938262781374, + "grad_norm": 0.4839788863961615, + "learning_rate": 4.495635957129983e-05, + "loss": 11.7584, + "step": 25525 + }, + { + "epoch": 1.389992716777957, + "grad_norm": 0.5026526852372909, + "learning_rate": 4.494899762355887e-05, + "loss": 11.8142, + "step": 25526 + }, + { + "epoch": 1.39004717077454, + "grad_norm": 0.5296149944903044, + "learning_rate": 4.4941636103903684e-05, + "loss": 11.9144, + "step": 25527 + }, + { + "epoch": 1.390101624771123, + "grad_norm": 0.515495589090639, + "learning_rate": 4.493427501239161e-05, + "loss": 11.8679, + "step": 25528 + }, + { + "epoch": 1.390156078767706, + "grad_norm": 0.5398518962699593, + "learning_rate": 4.492691434907982e-05, + "loss": 11.8615, + "step": 25529 + }, + { + "epoch": 1.390210532764289, + "grad_norm": 0.581374360757756, + "learning_rate": 4.491955411402557e-05, + "loss": 11.9026, + "step": 25530 + }, + { + "epoch": 1.390264986760872, + "grad_norm": 0.5838567757464395, + "learning_rate": 4.491219430728615e-05, + "loss": 11.9498, + "step": 25531 + }, + { + "epoch": 1.390319440757455, + "grad_norm": 0.5434824934602308, + "learning_rate": 4.4904834928918696e-05, + "loss": 11.8908, + "step": 25532 + }, + { + "epoch": 1.390373894754038, + "grad_norm": 0.5400376119511892, + "learning_rate": 4.489747597898053e-05, + "loss": 11.8023, + "step": 25533 + }, + { + "epoch": 1.390428348750621, + "grad_norm": 0.5228692699215824, + "learning_rate": 4.489011745752879e-05, + "loss": 11.87, + "step": 25534 + }, + { + "epoch": 1.3904828027472043, + "grad_norm": 0.5350715212943529, + "learning_rate": 4.4882759364620787e-05, + "loss": 11.944, + "step": 25535 + }, + { + "epoch": 1.3905372567437873, + "grad_norm": 0.5297107854174508, + "learning_rate": 4.487540170031368e-05, + "loss": 11.9025, + "step": 25536 + }, + { + "epoch": 1.3905917107403702, + "grad_norm": 0.5217943928297886, + "learning_rate": 4.486804446466465e-05, + "loss": 11.8636, + "step": 25537 + }, + { + "epoch": 1.3906461647369532, + "grad_norm": 0.5646540885623449, + "learning_rate": 4.4860687657731004e-05, + "loss": 11.9166, + "step": 25538 + }, + { + "epoch": 1.3907006187335362, + "grad_norm": 0.5172199674405743, + "learning_rate": 4.4853331279569856e-05, + "loss": 11.7803, + "step": 25539 + }, + { + "epoch": 1.3907550727301192, + "grad_norm": 0.5196236659981314, + "learning_rate": 4.484597533023849e-05, + "loss": 11.8839, + "step": 25540 + }, + { + "epoch": 1.3908095267267022, + "grad_norm": 0.5585562010892676, + "learning_rate": 4.4838619809794025e-05, + "loss": 11.8597, + "step": 25541 + }, + { + "epoch": 1.3908639807232852, + "grad_norm": 0.511576594652112, + "learning_rate": 4.483126471829371e-05, + "loss": 11.8981, + "step": 25542 + }, + { + "epoch": 1.3909184347198682, + "grad_norm": 0.6139840837053527, + "learning_rate": 4.482391005579476e-05, + "loss": 12.0498, + "step": 25543 + }, + { + "epoch": 1.3909728887164512, + "grad_norm": 0.5784039918772768, + "learning_rate": 4.4816555822354314e-05, + "loss": 11.9092, + "step": 25544 + }, + { + "epoch": 1.3910273427130342, + "grad_norm": 0.5074897671565245, + "learning_rate": 4.480920201802961e-05, + "loss": 11.6981, + "step": 25545 + }, + { + "epoch": 1.3910817967096172, + "grad_norm": 0.4931749930730538, + "learning_rate": 4.480184864287781e-05, + "loss": 11.872, + "step": 25546 + }, + { + "epoch": 1.3911362507062002, + "grad_norm": 0.5373058663523753, + "learning_rate": 4.479449569695606e-05, + "loss": 11.7687, + "step": 25547 + }, + { + "epoch": 1.3911907047027832, + "grad_norm": 0.5148041142482348, + "learning_rate": 4.47871431803216e-05, + "loss": 11.8017, + "step": 25548 + }, + { + "epoch": 1.3912451586993662, + "grad_norm": 0.5340044989149939, + "learning_rate": 4.477979109303154e-05, + "loss": 11.9403, + "step": 25549 + }, + { + "epoch": 1.3912996126959494, + "grad_norm": 0.4967906977063719, + "learning_rate": 4.4772439435143124e-05, + "loss": 11.8576, + "step": 25550 + }, + { + "epoch": 1.3913540666925324, + "grad_norm": 0.5141829250771999, + "learning_rate": 4.4765088206713434e-05, + "loss": 11.969, + "step": 25551 + }, + { + "epoch": 1.3914085206891154, + "grad_norm": 0.5594723020273837, + "learning_rate": 4.4757737407799724e-05, + "loss": 11.8893, + "step": 25552 + }, + { + "epoch": 1.3914629746856984, + "grad_norm": 0.5806813743109116, + "learning_rate": 4.4750387038459086e-05, + "loss": 11.8356, + "step": 25553 + }, + { + "epoch": 1.3915174286822813, + "grad_norm": 0.568429998278889, + "learning_rate": 4.4743037098748696e-05, + "loss": 11.8732, + "step": 25554 + }, + { + "epoch": 1.3915718826788643, + "grad_norm": 0.5419538497127687, + "learning_rate": 4.473568758872575e-05, + "loss": 11.7468, + "step": 25555 + }, + { + "epoch": 1.3916263366754473, + "grad_norm": 0.5225619019455237, + "learning_rate": 4.472833850844736e-05, + "loss": 11.8665, + "step": 25556 + }, + { + "epoch": 1.3916807906720303, + "grad_norm": 0.5396102597901121, + "learning_rate": 4.4720989857970654e-05, + "loss": 12.0524, + "step": 25557 + }, + { + "epoch": 1.3917352446686133, + "grad_norm": 0.5108951221640262, + "learning_rate": 4.471364163735283e-05, + "loss": 11.9733, + "step": 25558 + }, + { + "epoch": 1.3917896986651965, + "grad_norm": 0.5130039467516576, + "learning_rate": 4.470629384665096e-05, + "loss": 11.9014, + "step": 25559 + }, + { + "epoch": 1.3918441526617795, + "grad_norm": 0.5247712073740278, + "learning_rate": 4.469894648592227e-05, + "loss": 11.9558, + "step": 25560 + }, + { + "epoch": 1.3918986066583625, + "grad_norm": 0.5626381865461848, + "learning_rate": 4.46915995552238e-05, + "loss": 11.9597, + "step": 25561 + }, + { + "epoch": 1.3919530606549455, + "grad_norm": 0.5176142223681709, + "learning_rate": 4.468425305461277e-05, + "loss": 11.8549, + "step": 25562 + }, + { + "epoch": 1.3920075146515285, + "grad_norm": 0.5361655587293035, + "learning_rate": 4.4676906984146225e-05, + "loss": 11.8692, + "step": 25563 + }, + { + "epoch": 1.3920619686481115, + "grad_norm": 0.5196121113509619, + "learning_rate": 4.4669561343881326e-05, + "loss": 11.7564, + "step": 25564 + }, + { + "epoch": 1.3921164226446945, + "grad_norm": 0.5965802711646296, + "learning_rate": 4.466221613387524e-05, + "loss": 11.9678, + "step": 25565 + }, + { + "epoch": 1.3921708766412775, + "grad_norm": 0.5278279479966292, + "learning_rate": 4.465487135418504e-05, + "loss": 11.8423, + "step": 25566 + }, + { + "epoch": 1.3922253306378605, + "grad_norm": 0.4899969182386487, + "learning_rate": 4.464752700486784e-05, + "loss": 11.8754, + "step": 25567 + }, + { + "epoch": 1.3922797846344435, + "grad_norm": 0.5544441426716407, + "learning_rate": 4.4640183085980715e-05, + "loss": 11.9249, + "step": 25568 + }, + { + "epoch": 1.3923342386310265, + "grad_norm": 0.534695084533497, + "learning_rate": 4.463283959758081e-05, + "loss": 11.8792, + "step": 25569 + }, + { + "epoch": 1.3923886926276094, + "grad_norm": 0.5690192789312117, + "learning_rate": 4.4625496539725264e-05, + "loss": 11.8757, + "step": 25570 + }, + { + "epoch": 1.3924431466241924, + "grad_norm": 0.4945283277132608, + "learning_rate": 4.461815391247112e-05, + "loss": 11.809, + "step": 25571 + }, + { + "epoch": 1.3924976006207754, + "grad_norm": 0.6311904744090591, + "learning_rate": 4.461081171587552e-05, + "loss": 11.9291, + "step": 25572 + }, + { + "epoch": 1.3925520546173586, + "grad_norm": 0.5131269660543281, + "learning_rate": 4.4603469949995504e-05, + "loss": 11.861, + "step": 25573 + }, + { + "epoch": 1.3926065086139416, + "grad_norm": 0.5299962043621316, + "learning_rate": 4.4596128614888235e-05, + "loss": 11.7927, + "step": 25574 + }, + { + "epoch": 1.3926609626105246, + "grad_norm": 0.4829217822093533, + "learning_rate": 4.458878771061072e-05, + "loss": 11.8651, + "step": 25575 + }, + { + "epoch": 1.3927154166071076, + "grad_norm": 0.5485101424678609, + "learning_rate": 4.458144723722013e-05, + "loss": 11.9357, + "step": 25576 + }, + { + "epoch": 1.3927698706036906, + "grad_norm": 0.5766981103581194, + "learning_rate": 4.4574107194773495e-05, + "loss": 11.898, + "step": 25577 + }, + { + "epoch": 1.3928243246002736, + "grad_norm": 0.5593629114152067, + "learning_rate": 4.456676758332785e-05, + "loss": 12.0337, + "step": 25578 + }, + { + "epoch": 1.3928787785968566, + "grad_norm": 0.5440644568553376, + "learning_rate": 4.455942840294033e-05, + "loss": 11.9586, + "step": 25579 + }, + { + "epoch": 1.3929332325934396, + "grad_norm": 0.5471786288865869, + "learning_rate": 4.455208965366803e-05, + "loss": 11.9155, + "step": 25580 + }, + { + "epoch": 1.3929876865900226, + "grad_norm": 0.5732036331337723, + "learning_rate": 4.454475133556794e-05, + "loss": 11.8596, + "step": 25581 + }, + { + "epoch": 1.3930421405866058, + "grad_norm": 0.510982090028107, + "learning_rate": 4.45374134486972e-05, + "loss": 11.9077, + "step": 25582 + }, + { + "epoch": 1.3930965945831888, + "grad_norm": 0.5482976943136708, + "learning_rate": 4.4530075993112785e-05, + "loss": 11.9974, + "step": 25583 + }, + { + "epoch": 1.3931510485797718, + "grad_norm": 0.5461209763693049, + "learning_rate": 4.4522738968871844e-05, + "loss": 11.9263, + "step": 25584 + }, + { + "epoch": 1.3932055025763548, + "grad_norm": 0.5476284641533394, + "learning_rate": 4.451540237603137e-05, + "loss": 11.8504, + "step": 25585 + }, + { + "epoch": 1.3932599565729378, + "grad_norm": 0.5555034223480412, + "learning_rate": 4.450806621464842e-05, + "loss": 11.8776, + "step": 25586 + }, + { + "epoch": 1.3933144105695208, + "grad_norm": 0.5415518693860845, + "learning_rate": 4.450073048478012e-05, + "loss": 11.9524, + "step": 25587 + }, + { + "epoch": 1.3933688645661038, + "grad_norm": 0.4836607640817616, + "learning_rate": 4.449339518648338e-05, + "loss": 11.8876, + "step": 25588 + }, + { + "epoch": 1.3934233185626868, + "grad_norm": 0.5806205706248562, + "learning_rate": 4.448606031981534e-05, + "loss": 11.9442, + "step": 25589 + }, + { + "epoch": 1.3934777725592697, + "grad_norm": 0.5245357843627159, + "learning_rate": 4.447872588483296e-05, + "loss": 11.884, + "step": 25590 + }, + { + "epoch": 1.3935322265558527, + "grad_norm": 0.5933993341539028, + "learning_rate": 4.447139188159334e-05, + "loss": 11.8883, + "step": 25591 + }, + { + "epoch": 1.3935866805524357, + "grad_norm": 0.5626036087049634, + "learning_rate": 4.446405831015351e-05, + "loss": 11.9668, + "step": 25592 + }, + { + "epoch": 1.3936411345490187, + "grad_norm": 0.5854899098641337, + "learning_rate": 4.4456725170570446e-05, + "loss": 11.9953, + "step": 25593 + }, + { + "epoch": 1.3936955885456017, + "grad_norm": 0.583069753177118, + "learning_rate": 4.444939246290123e-05, + "loss": 11.967, + "step": 25594 + }, + { + "epoch": 1.3937500425421847, + "grad_norm": 0.5900050905800227, + "learning_rate": 4.444206018720283e-05, + "loss": 11.8586, + "step": 25595 + }, + { + "epoch": 1.393804496538768, + "grad_norm": 0.4992737059483989, + "learning_rate": 4.443472834353232e-05, + "loss": 11.8711, + "step": 25596 + }, + { + "epoch": 1.393858950535351, + "grad_norm": 0.5004623240886439, + "learning_rate": 4.4427396931946675e-05, + "loss": 11.9301, + "step": 25597 + }, + { + "epoch": 1.393913404531934, + "grad_norm": 0.5464915720685761, + "learning_rate": 4.442006595250288e-05, + "loss": 11.7774, + "step": 25598 + }, + { + "epoch": 1.393967858528517, + "grad_norm": 0.5200105712903359, + "learning_rate": 4.4412735405258e-05, + "loss": 11.9007, + "step": 25599 + }, + { + "epoch": 1.3940223125251, + "grad_norm": 0.5065329383440792, + "learning_rate": 4.4405405290268976e-05, + "loss": 11.952, + "step": 25600 + }, + { + "epoch": 1.3940767665216829, + "grad_norm": 0.5755515418314147, + "learning_rate": 4.439807560759285e-05, + "loss": 11.9608, + "step": 25601 + }, + { + "epoch": 1.3941312205182659, + "grad_norm": 0.5944838060345434, + "learning_rate": 4.439074635728665e-05, + "loss": 11.9533, + "step": 25602 + }, + { + "epoch": 1.3941856745148489, + "grad_norm": 0.5493298611399842, + "learning_rate": 4.438341753940729e-05, + "loss": 11.8226, + "step": 25603 + }, + { + "epoch": 1.3942401285114319, + "grad_norm": 0.5235303374651344, + "learning_rate": 4.437608915401185e-05, + "loss": 11.9139, + "step": 25604 + }, + { + "epoch": 1.394294582508015, + "grad_norm": 0.5791529929868455, + "learning_rate": 4.436876120115723e-05, + "loss": 11.7544, + "step": 25605 + }, + { + "epoch": 1.394349036504598, + "grad_norm": 0.5406835856521723, + "learning_rate": 4.4361433680900474e-05, + "loss": 11.9002, + "step": 25606 + }, + { + "epoch": 1.394403490501181, + "grad_norm": 0.5226489783085283, + "learning_rate": 4.435410659329855e-05, + "loss": 11.8162, + "step": 25607 + }, + { + "epoch": 1.394457944497764, + "grad_norm": 0.5366347793195804, + "learning_rate": 4.434677993840839e-05, + "loss": 11.8039, + "step": 25608 + }, + { + "epoch": 1.394512398494347, + "grad_norm": 0.5243448582888759, + "learning_rate": 4.433945371628704e-05, + "loss": 11.8087, + "step": 25609 + }, + { + "epoch": 1.39456685249093, + "grad_norm": 0.5581672782464008, + "learning_rate": 4.433212792699138e-05, + "loss": 11.8108, + "step": 25610 + }, + { + "epoch": 1.394621306487513, + "grad_norm": 0.5777969188109933, + "learning_rate": 4.432480257057848e-05, + "loss": 12.0268, + "step": 25611 + }, + { + "epoch": 1.394675760484096, + "grad_norm": 0.5438379840429651, + "learning_rate": 4.431747764710522e-05, + "loss": 11.8112, + "step": 25612 + }, + { + "epoch": 1.394730214480679, + "grad_norm": 0.5495210033207965, + "learning_rate": 4.431015315662859e-05, + "loss": 11.909, + "step": 25613 + }, + { + "epoch": 1.394784668477262, + "grad_norm": 0.5553615982274764, + "learning_rate": 4.430282909920558e-05, + "loss": 11.9504, + "step": 25614 + }, + { + "epoch": 1.394839122473845, + "grad_norm": 0.5296519106752849, + "learning_rate": 4.429550547489307e-05, + "loss": 11.948, + "step": 25615 + }, + { + "epoch": 1.394893576470428, + "grad_norm": 0.5662093547186087, + "learning_rate": 4.4288182283748094e-05, + "loss": 11.9523, + "step": 25616 + }, + { + "epoch": 1.394948030467011, + "grad_norm": 0.5814211401726735, + "learning_rate": 4.4280859525827557e-05, + "loss": 11.9414, + "step": 25617 + }, + { + "epoch": 1.395002484463594, + "grad_norm": 0.5176920390380298, + "learning_rate": 4.427353720118835e-05, + "loss": 11.8059, + "step": 25618 + }, + { + "epoch": 1.395056938460177, + "grad_norm": 0.5637400044293797, + "learning_rate": 4.426621530988751e-05, + "loss": 11.9364, + "step": 25619 + }, + { + "epoch": 1.3951113924567602, + "grad_norm": 0.5891127385535817, + "learning_rate": 4.4258893851981895e-05, + "loss": 11.7332, + "step": 25620 + }, + { + "epoch": 1.3951658464533432, + "grad_norm": 0.5876437351777504, + "learning_rate": 4.42515728275285e-05, + "loss": 11.9318, + "step": 25621 + }, + { + "epoch": 1.3952203004499262, + "grad_norm": 0.49966714443086996, + "learning_rate": 4.424425223658419e-05, + "loss": 11.9052, + "step": 25622 + }, + { + "epoch": 1.3952747544465092, + "grad_norm": 0.5713599956178447, + "learning_rate": 4.423693207920596e-05, + "loss": 11.9294, + "step": 25623 + }, + { + "epoch": 1.3953292084430922, + "grad_norm": 0.5477477694312434, + "learning_rate": 4.4229612355450655e-05, + "loss": 11.756, + "step": 25624 + }, + { + "epoch": 1.3953836624396752, + "grad_norm": 0.48363716283238173, + "learning_rate": 4.422229306537524e-05, + "loss": 11.7877, + "step": 25625 + }, + { + "epoch": 1.3954381164362581, + "grad_norm": 0.5237849668974416, + "learning_rate": 4.421497420903668e-05, + "loss": 11.8695, + "step": 25626 + }, + { + "epoch": 1.3954925704328411, + "grad_norm": 0.5289670155708505, + "learning_rate": 4.420765578649182e-05, + "loss": 11.6797, + "step": 25627 + }, + { + "epoch": 1.3955470244294244, + "grad_norm": 0.5385830800928001, + "learning_rate": 4.420033779779754e-05, + "loss": 11.8105, + "step": 25628 + }, + { + "epoch": 1.3956014784260073, + "grad_norm": 0.5452827851664649, + "learning_rate": 4.419302024301084e-05, + "loss": 11.9292, + "step": 25629 + }, + { + "epoch": 1.3956559324225903, + "grad_norm": 0.5027393070915539, + "learning_rate": 4.4185703122188536e-05, + "loss": 11.8437, + "step": 25630 + }, + { + "epoch": 1.3957103864191733, + "grad_norm": 0.5452691469643637, + "learning_rate": 4.417838643538761e-05, + "loss": 11.9122, + "step": 25631 + }, + { + "epoch": 1.3957648404157563, + "grad_norm": 0.572446812699789, + "learning_rate": 4.417107018266486e-05, + "loss": 11.8799, + "step": 25632 + }, + { + "epoch": 1.3958192944123393, + "grad_norm": 0.5232701785438433, + "learning_rate": 4.416375436407727e-05, + "loss": 11.6885, + "step": 25633 + }, + { + "epoch": 1.3958737484089223, + "grad_norm": 0.5650317891925306, + "learning_rate": 4.415643897968166e-05, + "loss": 11.8378, + "step": 25634 + }, + { + "epoch": 1.3959282024055053, + "grad_norm": 0.597295783221329, + "learning_rate": 4.414912402953494e-05, + "loss": 11.8465, + "step": 25635 + }, + { + "epoch": 1.3959826564020883, + "grad_norm": 0.5439921230625333, + "learning_rate": 4.4141809513694043e-05, + "loss": 11.8836, + "step": 25636 + }, + { + "epoch": 1.3960371103986713, + "grad_norm": 0.5133879306520974, + "learning_rate": 4.4134495432215795e-05, + "loss": 11.894, + "step": 25637 + }, + { + "epoch": 1.3960915643952543, + "grad_norm": 0.5308147457633371, + "learning_rate": 4.4127181785157077e-05, + "loss": 11.8861, + "step": 25638 + }, + { + "epoch": 1.3961460183918373, + "grad_norm": 0.580976142919681, + "learning_rate": 4.411986857257473e-05, + "loss": 11.8624, + "step": 25639 + }, + { + "epoch": 1.3962004723884203, + "grad_norm": 0.5221320733890685, + "learning_rate": 4.411255579452566e-05, + "loss": 11.8193, + "step": 25640 + }, + { + "epoch": 1.3962549263850033, + "grad_norm": 0.5638440618118854, + "learning_rate": 4.410524345106676e-05, + "loss": 11.9277, + "step": 25641 + }, + { + "epoch": 1.3963093803815863, + "grad_norm": 0.5243922082073832, + "learning_rate": 4.409793154225482e-05, + "loss": 11.8344, + "step": 25642 + }, + { + "epoch": 1.3963638343781695, + "grad_norm": 0.5442168463416602, + "learning_rate": 4.4090620068146785e-05, + "loss": 11.8964, + "step": 25643 + }, + { + "epoch": 1.3964182883747525, + "grad_norm": 0.5371329213804302, + "learning_rate": 4.408330902879942e-05, + "loss": 11.7654, + "step": 25644 + }, + { + "epoch": 1.3964727423713355, + "grad_norm": 0.640722081839925, + "learning_rate": 4.407599842426967e-05, + "loss": 12.1038, + "step": 25645 + }, + { + "epoch": 1.3965271963679184, + "grad_norm": 0.5837988396414286, + "learning_rate": 4.406868825461429e-05, + "loss": 11.963, + "step": 25646 + }, + { + "epoch": 1.3965816503645014, + "grad_norm": 0.515178023263635, + "learning_rate": 4.4061378519890206e-05, + "loss": 11.874, + "step": 25647 + }, + { + "epoch": 1.3966361043610844, + "grad_norm": 0.5772878515993738, + "learning_rate": 4.4054069220154215e-05, + "loss": 11.7675, + "step": 25648 + }, + { + "epoch": 1.3966905583576674, + "grad_norm": 0.5585453623940094, + "learning_rate": 4.404676035546313e-05, + "loss": 11.896, + "step": 25649 + }, + { + "epoch": 1.3967450123542504, + "grad_norm": 0.5446295610336994, + "learning_rate": 4.403945192587383e-05, + "loss": 11.9455, + "step": 25650 + }, + { + "epoch": 1.3967994663508334, + "grad_norm": 0.520477903605035, + "learning_rate": 4.403214393144317e-05, + "loss": 11.984, + "step": 25651 + }, + { + "epoch": 1.3968539203474166, + "grad_norm": 0.7631798390604274, + "learning_rate": 4.402483637222791e-05, + "loss": 11.8636, + "step": 25652 + }, + { + "epoch": 1.3969083743439996, + "grad_norm": 0.5357976761801222, + "learning_rate": 4.4017529248284936e-05, + "loss": 11.844, + "step": 25653 + }, + { + "epoch": 1.3969628283405826, + "grad_norm": 0.5375128494981319, + "learning_rate": 4.4010222559671024e-05, + "loss": 11.8929, + "step": 25654 + }, + { + "epoch": 1.3970172823371656, + "grad_norm": 0.5684473019634566, + "learning_rate": 4.400291630644303e-05, + "loss": 11.8925, + "step": 25655 + }, + { + "epoch": 1.3970717363337486, + "grad_norm": 0.6145092032176454, + "learning_rate": 4.399561048865772e-05, + "loss": 11.8634, + "step": 25656 + }, + { + "epoch": 1.3971261903303316, + "grad_norm": 0.5871747327630277, + "learning_rate": 4.398830510637198e-05, + "loss": 12.017, + "step": 25657 + }, + { + "epoch": 1.3971806443269146, + "grad_norm": 0.5408892166409712, + "learning_rate": 4.398100015964256e-05, + "loss": 11.9226, + "step": 25658 + }, + { + "epoch": 1.3972350983234976, + "grad_norm": 0.6011747196050518, + "learning_rate": 4.397369564852624e-05, + "loss": 11.846, + "step": 25659 + }, + { + "epoch": 1.3972895523200806, + "grad_norm": 0.5365215992691857, + "learning_rate": 4.39663915730799e-05, + "loss": 11.9564, + "step": 25660 + }, + { + "epoch": 1.3973440063166636, + "grad_norm": 0.5578866734708908, + "learning_rate": 4.395908793336026e-05, + "loss": 11.9316, + "step": 25661 + }, + { + "epoch": 1.3973984603132465, + "grad_norm": 0.5684847679081799, + "learning_rate": 4.395178472942415e-05, + "loss": 11.9762, + "step": 25662 + }, + { + "epoch": 1.3974529143098295, + "grad_norm": 0.5737354362562226, + "learning_rate": 4.394448196132839e-05, + "loss": 11.8986, + "step": 25663 + }, + { + "epoch": 1.3975073683064125, + "grad_norm": 0.5643940556914142, + "learning_rate": 4.393717962912971e-05, + "loss": 11.8787, + "step": 25664 + }, + { + "epoch": 1.3975618223029955, + "grad_norm": 0.5163855351759655, + "learning_rate": 4.392987773288496e-05, + "loss": 11.9327, + "step": 25665 + }, + { + "epoch": 1.3976162762995787, + "grad_norm": 0.5165131419827805, + "learning_rate": 4.392257627265084e-05, + "loss": 11.8745, + "step": 25666 + }, + { + "epoch": 1.3976707302961617, + "grad_norm": 0.5623763370391212, + "learning_rate": 4.3915275248484224e-05, + "loss": 11.8753, + "step": 25667 + }, + { + "epoch": 1.3977251842927447, + "grad_norm": 0.5021095954282441, + "learning_rate": 4.390797466044182e-05, + "loss": 11.8867, + "step": 25668 + }, + { + "epoch": 1.3977796382893277, + "grad_norm": 0.5343072695118061, + "learning_rate": 4.3900674508580374e-05, + "loss": 11.9957, + "step": 25669 + }, + { + "epoch": 1.3978340922859107, + "grad_norm": 0.5840461650399251, + "learning_rate": 4.389337479295673e-05, + "loss": 11.9446, + "step": 25670 + }, + { + "epoch": 1.3978885462824937, + "grad_norm": 0.5555837544430504, + "learning_rate": 4.388607551362758e-05, + "loss": 11.8412, + "step": 25671 + }, + { + "epoch": 1.3979430002790767, + "grad_norm": 0.5567206681928537, + "learning_rate": 4.3878776670649746e-05, + "loss": 11.841, + "step": 25672 + }, + { + "epoch": 1.3979974542756597, + "grad_norm": 0.5194923352163608, + "learning_rate": 4.3871478264079924e-05, + "loss": 11.7848, + "step": 25673 + }, + { + "epoch": 1.3980519082722427, + "grad_norm": 0.49589168254954724, + "learning_rate": 4.38641802939749e-05, + "loss": 11.8045, + "step": 25674 + }, + { + "epoch": 1.398106362268826, + "grad_norm": 0.5753414999346085, + "learning_rate": 4.385688276039146e-05, + "loss": 11.8016, + "step": 25675 + }, + { + "epoch": 1.398160816265409, + "grad_norm": 0.5054072524763868, + "learning_rate": 4.384958566338627e-05, + "loss": 11.9053, + "step": 25676 + }, + { + "epoch": 1.3982152702619919, + "grad_norm": 0.5207491949903695, + "learning_rate": 4.384228900301617e-05, + "loss": 11.877, + "step": 25677 + }, + { + "epoch": 1.3982697242585749, + "grad_norm": 0.6050903200238807, + "learning_rate": 4.3834992779337845e-05, + "loss": 11.9729, + "step": 25678 + }, + { + "epoch": 1.3983241782551579, + "grad_norm": 0.5035147841598213, + "learning_rate": 4.3827696992407995e-05, + "loss": 11.8971, + "step": 25679 + }, + { + "epoch": 1.3983786322517409, + "grad_norm": 0.5849917897173539, + "learning_rate": 4.382040164228343e-05, + "loss": 11.9739, + "step": 25680 + }, + { + "epoch": 1.3984330862483239, + "grad_norm": 0.5173595634324137, + "learning_rate": 4.3813106729020806e-05, + "loss": 11.7193, + "step": 25681 + }, + { + "epoch": 1.3984875402449068, + "grad_norm": 0.5088712814321042, + "learning_rate": 4.380581225267693e-05, + "loss": 11.8223, + "step": 25682 + }, + { + "epoch": 1.3985419942414898, + "grad_norm": 0.5513836839263556, + "learning_rate": 4.379851821330843e-05, + "loss": 11.8423, + "step": 25683 + }, + { + "epoch": 1.3985964482380728, + "grad_norm": 0.5057797012944087, + "learning_rate": 4.379122461097208e-05, + "loss": 11.6728, + "step": 25684 + }, + { + "epoch": 1.3986509022346558, + "grad_norm": 0.5718664348820757, + "learning_rate": 4.3783931445724645e-05, + "loss": 11.9907, + "step": 25685 + }, + { + "epoch": 1.3987053562312388, + "grad_norm": 0.5910400993603159, + "learning_rate": 4.377663871762274e-05, + "loss": 12.0681, + "step": 25686 + }, + { + "epoch": 1.3987598102278218, + "grad_norm": 0.8504708164787771, + "learning_rate": 4.376934642672319e-05, + "loss": 11.9959, + "step": 25687 + }, + { + "epoch": 1.3988142642244048, + "grad_norm": 0.5088988067425663, + "learning_rate": 4.376205457308257e-05, + "loss": 11.8203, + "step": 25688 + }, + { + "epoch": 1.3988687182209878, + "grad_norm": 0.5316068582897, + "learning_rate": 4.375476315675764e-05, + "loss": 11.8087, + "step": 25689 + }, + { + "epoch": 1.398923172217571, + "grad_norm": 0.5454832715356545, + "learning_rate": 4.3747472177805135e-05, + "loss": 11.8985, + "step": 25690 + }, + { + "epoch": 1.398977626214154, + "grad_norm": 0.5422557431517581, + "learning_rate": 4.37401816362817e-05, + "loss": 11.9926, + "step": 25691 + }, + { + "epoch": 1.399032080210737, + "grad_norm": 0.5232367087176085, + "learning_rate": 4.373289153224407e-05, + "loss": 11.794, + "step": 25692 + }, + { + "epoch": 1.39908653420732, + "grad_norm": 0.5764699835647442, + "learning_rate": 4.372560186574887e-05, + "loss": 11.8392, + "step": 25693 + }, + { + "epoch": 1.399140988203903, + "grad_norm": 0.5653583270220415, + "learning_rate": 4.371831263685288e-05, + "loss": 11.8948, + "step": 25694 + }, + { + "epoch": 1.399195442200486, + "grad_norm": 0.5332046554670394, + "learning_rate": 4.3711023845612675e-05, + "loss": 11.964, + "step": 25695 + }, + { + "epoch": 1.399249896197069, + "grad_norm": 0.5613374131471114, + "learning_rate": 4.3703735492085e-05, + "loss": 11.922, + "step": 25696 + }, + { + "epoch": 1.399304350193652, + "grad_norm": 0.5404182515945336, + "learning_rate": 4.369644757632658e-05, + "loss": 11.9681, + "step": 25697 + }, + { + "epoch": 1.3993588041902352, + "grad_norm": 0.4801664996309418, + "learning_rate": 4.368916009839397e-05, + "loss": 11.8808, + "step": 25698 + }, + { + "epoch": 1.3994132581868182, + "grad_norm": 0.5792402197427226, + "learning_rate": 4.368187305834388e-05, + "loss": 11.7865, + "step": 25699 + }, + { + "epoch": 1.3994677121834012, + "grad_norm": 0.5203970169492241, + "learning_rate": 4.367458645623304e-05, + "loss": 11.7728, + "step": 25700 + }, + { + "epoch": 1.3995221661799842, + "grad_norm": 0.581598290117175, + "learning_rate": 4.366730029211801e-05, + "loss": 11.8598, + "step": 25701 + }, + { + "epoch": 1.3995766201765671, + "grad_norm": 0.6091364758528232, + "learning_rate": 4.366001456605554e-05, + "loss": 11.893, + "step": 25702 + }, + { + "epoch": 1.3996310741731501, + "grad_norm": 0.5477307382500277, + "learning_rate": 4.365272927810221e-05, + "loss": 11.8992, + "step": 25703 + }, + { + "epoch": 1.3996855281697331, + "grad_norm": 0.5498687874056694, + "learning_rate": 4.3645444428314754e-05, + "loss": 11.7799, + "step": 25704 + }, + { + "epoch": 1.3997399821663161, + "grad_norm": 0.5837738167710438, + "learning_rate": 4.363816001674972e-05, + "loss": 12.0774, + "step": 25705 + }, + { + "epoch": 1.3997944361628991, + "grad_norm": 0.5492987014768775, + "learning_rate": 4.363087604346385e-05, + "loss": 11.9124, + "step": 25706 + }, + { + "epoch": 1.399848890159482, + "grad_norm": 0.51453150083243, + "learning_rate": 4.362359250851374e-05, + "loss": 11.8891, + "step": 25707 + }, + { + "epoch": 1.399903344156065, + "grad_norm": 0.5271191344921754, + "learning_rate": 4.3616309411955994e-05, + "loss": 11.8725, + "step": 25708 + }, + { + "epoch": 1.399957798152648, + "grad_norm": 0.5335090745817481, + "learning_rate": 4.3609026753847315e-05, + "loss": 11.9013, + "step": 25709 + }, + { + "epoch": 1.400012252149231, + "grad_norm": 0.5408831310830207, + "learning_rate": 4.360174453424427e-05, + "loss": 11.8668, + "step": 25710 + }, + { + "epoch": 1.400066706145814, + "grad_norm": 0.5873790115469334, + "learning_rate": 4.3594462753203516e-05, + "loss": 11.8959, + "step": 25711 + }, + { + "epoch": 1.400121160142397, + "grad_norm": 0.5084114241267758, + "learning_rate": 4.358718141078171e-05, + "loss": 11.7836, + "step": 25712 + }, + { + "epoch": 1.4001756141389803, + "grad_norm": 0.5610339364949798, + "learning_rate": 4.357990050703541e-05, + "loss": 11.8994, + "step": 25713 + }, + { + "epoch": 1.4002300681355633, + "grad_norm": 0.5326563329470386, + "learning_rate": 4.3572620042021296e-05, + "loss": 11.9368, + "step": 25714 + }, + { + "epoch": 1.4002845221321463, + "grad_norm": 0.48618825218399014, + "learning_rate": 4.356534001579592e-05, + "loss": 11.7222, + "step": 25715 + }, + { + "epoch": 1.4003389761287293, + "grad_norm": 0.5149899727645597, + "learning_rate": 4.355806042841596e-05, + "loss": 11.9233, + "step": 25716 + }, + { + "epoch": 1.4003934301253123, + "grad_norm": 0.5748589060599986, + "learning_rate": 4.355078127993798e-05, + "loss": 11.9294, + "step": 25717 + }, + { + "epoch": 1.4004478841218952, + "grad_norm": 0.683915970408038, + "learning_rate": 4.354350257041857e-05, + "loss": 11.9369, + "step": 25718 + }, + { + "epoch": 1.4005023381184782, + "grad_norm": 0.5652671862783292, + "learning_rate": 4.3536224299914374e-05, + "loss": 11.8888, + "step": 25719 + }, + { + "epoch": 1.4005567921150612, + "grad_norm": 0.5390889528715983, + "learning_rate": 4.352894646848194e-05, + "loss": 11.9011, + "step": 25720 + }, + { + "epoch": 1.4006112461116442, + "grad_norm": 0.5569542491738358, + "learning_rate": 4.352166907617792e-05, + "loss": 11.8275, + "step": 25721 + }, + { + "epoch": 1.4006657001082274, + "grad_norm": 0.526211258247391, + "learning_rate": 4.351439212305883e-05, + "loss": 11.989, + "step": 25722 + }, + { + "epoch": 1.4007201541048104, + "grad_norm": 0.5817649898617785, + "learning_rate": 4.3507115609181315e-05, + "loss": 11.7951, + "step": 25723 + }, + { + "epoch": 1.4007746081013934, + "grad_norm": 0.5849301463570367, + "learning_rate": 4.349983953460197e-05, + "loss": 11.8035, + "step": 25724 + }, + { + "epoch": 1.4008290620979764, + "grad_norm": 0.5743438742601729, + "learning_rate": 4.3492563899377316e-05, + "loss": 11.8936, + "step": 25725 + }, + { + "epoch": 1.4008835160945594, + "grad_norm": 0.5725940876010246, + "learning_rate": 4.3485288703563986e-05, + "loss": 11.9187, + "step": 25726 + }, + { + "epoch": 1.4009379700911424, + "grad_norm": 0.533858575078255, + "learning_rate": 4.347801394721851e-05, + "loss": 11.8609, + "step": 25727 + }, + { + "epoch": 1.4009924240877254, + "grad_norm": 0.5972171361554862, + "learning_rate": 4.347073963039749e-05, + "loss": 11.8956, + "step": 25728 + }, + { + "epoch": 1.4010468780843084, + "grad_norm": 0.5431114513833376, + "learning_rate": 4.3463465753157505e-05, + "loss": 11.8938, + "step": 25729 + }, + { + "epoch": 1.4011013320808914, + "grad_norm": 0.5755390994591522, + "learning_rate": 4.3456192315555034e-05, + "loss": 11.9397, + "step": 25730 + }, + { + "epoch": 1.4011557860774744, + "grad_norm": 0.5730787357320373, + "learning_rate": 4.344891931764674e-05, + "loss": 11.8986, + "step": 25731 + }, + { + "epoch": 1.4012102400740574, + "grad_norm": 0.5007471459092729, + "learning_rate": 4.34416467594891e-05, + "loss": 11.8538, + "step": 25732 + }, + { + "epoch": 1.4012646940706404, + "grad_norm": 0.5371171533501721, + "learning_rate": 4.343437464113869e-05, + "loss": 11.8889, + "step": 25733 + }, + { + "epoch": 1.4013191480672234, + "grad_norm": 0.5478632832461535, + "learning_rate": 4.342710296265211e-05, + "loss": 11.8248, + "step": 25734 + }, + { + "epoch": 1.4013736020638063, + "grad_norm": 0.5631985721648314, + "learning_rate": 4.341983172408582e-05, + "loss": 11.875, + "step": 25735 + }, + { + "epoch": 1.4014280560603896, + "grad_norm": 0.5449914502765917, + "learning_rate": 4.341256092549646e-05, + "loss": 11.8051, + "step": 25736 + }, + { + "epoch": 1.4014825100569726, + "grad_norm": 0.5375566491885477, + "learning_rate": 4.340529056694047e-05, + "loss": 11.9043, + "step": 25737 + }, + { + "epoch": 1.4015369640535555, + "grad_norm": 0.5365139119787555, + "learning_rate": 4.339802064847447e-05, + "loss": 11.9476, + "step": 25738 + }, + { + "epoch": 1.4015914180501385, + "grad_norm": 0.5766863127162225, + "learning_rate": 4.339075117015495e-05, + "loss": 11.8478, + "step": 25739 + }, + { + "epoch": 1.4016458720467215, + "grad_norm": 0.6210547963280137, + "learning_rate": 4.338348213203841e-05, + "loss": 11.8514, + "step": 25740 + }, + { + "epoch": 1.4017003260433045, + "grad_norm": 0.5459567670103419, + "learning_rate": 4.3376213534181445e-05, + "loss": 11.7452, + "step": 25741 + }, + { + "epoch": 1.4017547800398875, + "grad_norm": 0.5594405762582951, + "learning_rate": 4.33689453766405e-05, + "loss": 11.9725, + "step": 25742 + }, + { + "epoch": 1.4018092340364705, + "grad_norm": 0.5454613069963479, + "learning_rate": 4.3361677659472164e-05, + "loss": 11.9589, + "step": 25743 + }, + { + "epoch": 1.4018636880330535, + "grad_norm": 0.5481028587273373, + "learning_rate": 4.335441038273289e-05, + "loss": 11.8762, + "step": 25744 + }, + { + "epoch": 1.4019181420296367, + "grad_norm": 0.5606037842650509, + "learning_rate": 4.334714354647923e-05, + "loss": 11.8584, + "step": 25745 + }, + { + "epoch": 1.4019725960262197, + "grad_norm": 0.5185782378086978, + "learning_rate": 4.33398771507677e-05, + "loss": 11.8792, + "step": 25746 + }, + { + "epoch": 1.4020270500228027, + "grad_norm": 0.5281439600207601, + "learning_rate": 4.333261119565476e-05, + "loss": 12.0006, + "step": 25747 + }, + { + "epoch": 1.4020815040193857, + "grad_norm": 0.6277291921330076, + "learning_rate": 4.332534568119698e-05, + "loss": 11.8947, + "step": 25748 + }, + { + "epoch": 1.4021359580159687, + "grad_norm": 0.534869803590322, + "learning_rate": 4.33180806074508e-05, + "loss": 11.9372, + "step": 25749 + }, + { + "epoch": 1.4021904120125517, + "grad_norm": 0.5720715166334139, + "learning_rate": 4.331081597447272e-05, + "loss": 12.0421, + "step": 25750 + }, + { + "epoch": 1.4022448660091347, + "grad_norm": 0.6700032731573184, + "learning_rate": 4.330355178231926e-05, + "loss": 11.8688, + "step": 25751 + }, + { + "epoch": 1.4022993200057177, + "grad_norm": 0.6422941064293342, + "learning_rate": 4.329628803104685e-05, + "loss": 11.9984, + "step": 25752 + }, + { + "epoch": 1.4023537740023007, + "grad_norm": 0.5218657818654752, + "learning_rate": 4.3289024720712055e-05, + "loss": 11.8448, + "step": 25753 + }, + { + "epoch": 1.4024082279988836, + "grad_norm": 0.5567331107952798, + "learning_rate": 4.3281761851371284e-05, + "loss": 11.8074, + "step": 25754 + }, + { + "epoch": 1.4024626819954666, + "grad_norm": 0.553640112261936, + "learning_rate": 4.327449942308107e-05, + "loss": 11.6889, + "step": 25755 + }, + { + "epoch": 1.4025171359920496, + "grad_norm": 0.5626446887770098, + "learning_rate": 4.326723743589783e-05, + "loss": 11.9616, + "step": 25756 + }, + { + "epoch": 1.4025715899886326, + "grad_norm": 0.5404421380628863, + "learning_rate": 4.325997588987808e-05, + "loss": 11.9401, + "step": 25757 + }, + { + "epoch": 1.4026260439852156, + "grad_norm": 0.5994161744225048, + "learning_rate": 4.325271478507833e-05, + "loss": 11.8843, + "step": 25758 + }, + { + "epoch": 1.4026804979817986, + "grad_norm": 0.507601738856093, + "learning_rate": 4.324545412155492e-05, + "loss": 11.8573, + "step": 25759 + }, + { + "epoch": 1.4027349519783818, + "grad_norm": 0.5399060464408709, + "learning_rate": 4.323819389936437e-05, + "loss": 11.8914, + "step": 25760 + }, + { + "epoch": 1.4027894059749648, + "grad_norm": 0.4985717580031476, + "learning_rate": 4.3230934118563185e-05, + "loss": 11.9473, + "step": 25761 + }, + { + "epoch": 1.4028438599715478, + "grad_norm": 0.5098343717376963, + "learning_rate": 4.322367477920773e-05, + "loss": 11.8673, + "step": 25762 + }, + { + "epoch": 1.4028983139681308, + "grad_norm": 0.5031072887679476, + "learning_rate": 4.321641588135454e-05, + "loss": 11.9143, + "step": 25763 + }, + { + "epoch": 1.4029527679647138, + "grad_norm": 0.5128637907285493, + "learning_rate": 4.320915742506e-05, + "loss": 11.806, + "step": 25764 + }, + { + "epoch": 1.4030072219612968, + "grad_norm": 0.5643419053060134, + "learning_rate": 4.32018994103806e-05, + "loss": 11.9542, + "step": 25765 + }, + { + "epoch": 1.4030616759578798, + "grad_norm": 0.5496149552258993, + "learning_rate": 4.319464183737272e-05, + "loss": 11.9774, + "step": 25766 + }, + { + "epoch": 1.4031161299544628, + "grad_norm": 0.5385685349043441, + "learning_rate": 4.318738470609284e-05, + "loss": 11.7526, + "step": 25767 + }, + { + "epoch": 1.403170583951046, + "grad_norm": 0.5051635585643108, + "learning_rate": 4.318012801659744e-05, + "loss": 11.8489, + "step": 25768 + }, + { + "epoch": 1.403225037947629, + "grad_norm": 0.5528945036488232, + "learning_rate": 4.3172871768942834e-05, + "loss": 12.0119, + "step": 25769 + }, + { + "epoch": 1.403279491944212, + "grad_norm": 0.5778477693792172, + "learning_rate": 4.3165615963185545e-05, + "loss": 11.9029, + "step": 25770 + }, + { + "epoch": 1.403333945940795, + "grad_norm": 0.5397905335023614, + "learning_rate": 4.315836059938191e-05, + "loss": 11.828, + "step": 25771 + }, + { + "epoch": 1.403388399937378, + "grad_norm": 0.5438297685566836, + "learning_rate": 4.31511056775884e-05, + "loss": 11.8823, + "step": 25772 + }, + { + "epoch": 1.403442853933961, + "grad_norm": 0.5701260681333727, + "learning_rate": 4.314385119786146e-05, + "loss": 11.8675, + "step": 25773 + }, + { + "epoch": 1.403497307930544, + "grad_norm": 0.5539240777978596, + "learning_rate": 4.313659716025744e-05, + "loss": 11.9092, + "step": 25774 + }, + { + "epoch": 1.403551761927127, + "grad_norm": 0.5696074381941687, + "learning_rate": 4.312934356483281e-05, + "loss": 11.845, + "step": 25775 + }, + { + "epoch": 1.40360621592371, + "grad_norm": 0.6578220433315457, + "learning_rate": 4.31220904116439e-05, + "loss": 12.0892, + "step": 25776 + }, + { + "epoch": 1.403660669920293, + "grad_norm": 0.5337505459524566, + "learning_rate": 4.3114837700747203e-05, + "loss": 11.8102, + "step": 25777 + }, + { + "epoch": 1.403715123916876, + "grad_norm": 0.5252660110524694, + "learning_rate": 4.310758543219905e-05, + "loss": 11.8788, + "step": 25778 + }, + { + "epoch": 1.403769577913459, + "grad_norm": 0.5426806737980335, + "learning_rate": 4.310033360605582e-05, + "loss": 11.8705, + "step": 25779 + }, + { + "epoch": 1.403824031910042, + "grad_norm": 0.5849980006069722, + "learning_rate": 4.309308222237398e-05, + "loss": 11.8774, + "step": 25780 + }, + { + "epoch": 1.403878485906625, + "grad_norm": 0.5298430039427744, + "learning_rate": 4.308583128120984e-05, + "loss": 11.9699, + "step": 25781 + }, + { + "epoch": 1.4039329399032079, + "grad_norm": 0.5711868513615855, + "learning_rate": 4.307858078261983e-05, + "loss": 11.8563, + "step": 25782 + }, + { + "epoch": 1.403987393899791, + "grad_norm": 0.6034781661497142, + "learning_rate": 4.307133072666034e-05, + "loss": 11.9382, + "step": 25783 + }, + { + "epoch": 1.404041847896374, + "grad_norm": 0.5275655011643641, + "learning_rate": 4.30640811133877e-05, + "loss": 11.883, + "step": 25784 + }, + { + "epoch": 1.404096301892957, + "grad_norm": 0.6132439151156021, + "learning_rate": 4.305683194285834e-05, + "loss": 11.9922, + "step": 25785 + }, + { + "epoch": 1.40415075588954, + "grad_norm": 0.4809888890409843, + "learning_rate": 4.304958321512858e-05, + "loss": 11.8524, + "step": 25786 + }, + { + "epoch": 1.404205209886123, + "grad_norm": 0.5915868691239101, + "learning_rate": 4.3042334930254855e-05, + "loss": 11.9001, + "step": 25787 + }, + { + "epoch": 1.404259663882706, + "grad_norm": 0.4780354993578049, + "learning_rate": 4.303508708829347e-05, + "loss": 11.8547, + "step": 25788 + }, + { + "epoch": 1.404314117879289, + "grad_norm": 0.5796849881484587, + "learning_rate": 4.302783968930078e-05, + "loss": 11.8499, + "step": 25789 + }, + { + "epoch": 1.404368571875872, + "grad_norm": 0.5537851104142394, + "learning_rate": 4.3020592733333185e-05, + "loss": 11.8688, + "step": 25790 + }, + { + "epoch": 1.404423025872455, + "grad_norm": 0.5442721735305788, + "learning_rate": 4.301334622044698e-05, + "loss": 11.8563, + "step": 25791 + }, + { + "epoch": 1.4044774798690383, + "grad_norm": 0.5955860010807744, + "learning_rate": 4.3006100150698594e-05, + "loss": 11.9449, + "step": 25792 + }, + { + "epoch": 1.4045319338656213, + "grad_norm": 0.5395442100716481, + "learning_rate": 4.299885452414429e-05, + "loss": 11.7314, + "step": 25793 + }, + { + "epoch": 1.4045863878622042, + "grad_norm": 0.5194883904364771, + "learning_rate": 4.2991609340840455e-05, + "loss": 11.916, + "step": 25794 + }, + { + "epoch": 1.4046408418587872, + "grad_norm": 0.5643779043933354, + "learning_rate": 4.298436460084346e-05, + "loss": 11.814, + "step": 25795 + }, + { + "epoch": 1.4046952958553702, + "grad_norm": 0.6175041382935421, + "learning_rate": 4.2977120304209586e-05, + "loss": 11.7237, + "step": 25796 + }, + { + "epoch": 1.4047497498519532, + "grad_norm": 0.5257130784124201, + "learning_rate": 4.29698764509952e-05, + "loss": 11.9171, + "step": 25797 + }, + { + "epoch": 1.4048042038485362, + "grad_norm": 0.5466752839396776, + "learning_rate": 4.296263304125664e-05, + "loss": 11.8934, + "step": 25798 + }, + { + "epoch": 1.4048586578451192, + "grad_norm": 0.5403470529263875, + "learning_rate": 4.295539007505016e-05, + "loss": 11.9722, + "step": 25799 + }, + { + "epoch": 1.4049131118417022, + "grad_norm": 0.5531022681001917, + "learning_rate": 4.294814755243217e-05, + "loss": 11.9278, + "step": 25800 + }, + { + "epoch": 1.4049675658382852, + "grad_norm": 0.5067067110711966, + "learning_rate": 4.2940905473458926e-05, + "loss": 11.8181, + "step": 25801 + }, + { + "epoch": 1.4050220198348682, + "grad_norm": 0.5813067824610553, + "learning_rate": 4.29336638381868e-05, + "loss": 11.9637, + "step": 25802 + }, + { + "epoch": 1.4050764738314512, + "grad_norm": 0.5669991039890107, + "learning_rate": 4.2926422646672036e-05, + "loss": 11.8125, + "step": 25803 + }, + { + "epoch": 1.4051309278280342, + "grad_norm": 0.49300588130870826, + "learning_rate": 4.2919181898970984e-05, + "loss": 11.8649, + "step": 25804 + }, + { + "epoch": 1.4051853818246172, + "grad_norm": 0.5076407580270612, + "learning_rate": 4.291194159513998e-05, + "loss": 11.8819, + "step": 25805 + }, + { + "epoch": 1.4052398358212004, + "grad_norm": 0.5236553023447134, + "learning_rate": 4.2904701735235256e-05, + "loss": 11.7725, + "step": 25806 + }, + { + "epoch": 1.4052942898177834, + "grad_norm": 0.5537238593380521, + "learning_rate": 4.289746231931319e-05, + "loss": 11.8104, + "step": 25807 + }, + { + "epoch": 1.4053487438143664, + "grad_norm": 0.6155129506674, + "learning_rate": 4.289022334743001e-05, + "loss": 11.8991, + "step": 25808 + }, + { + "epoch": 1.4054031978109494, + "grad_norm": 0.5394076255226601, + "learning_rate": 4.2882984819642015e-05, + "loss": 11.9354, + "step": 25809 + }, + { + "epoch": 1.4054576518075323, + "grad_norm": 0.5782024250137687, + "learning_rate": 4.287574673600554e-05, + "loss": 11.8426, + "step": 25810 + }, + { + "epoch": 1.4055121058041153, + "grad_norm": 0.5313240044500805, + "learning_rate": 4.28685090965768e-05, + "loss": 11.8289, + "step": 25811 + }, + { + "epoch": 1.4055665598006983, + "grad_norm": 0.5398556323000993, + "learning_rate": 4.286127190141216e-05, + "loss": 11.9861, + "step": 25812 + }, + { + "epoch": 1.4056210137972813, + "grad_norm": 0.5323480588440391, + "learning_rate": 4.285403515056781e-05, + "loss": 11.9367, + "step": 25813 + }, + { + "epoch": 1.4056754677938643, + "grad_norm": 0.5714705974394412, + "learning_rate": 4.28467988441001e-05, + "loss": 11.8651, + "step": 25814 + }, + { + "epoch": 1.4057299217904475, + "grad_norm": 0.5258662331210137, + "learning_rate": 4.283956298206523e-05, + "loss": 12.0163, + "step": 25815 + }, + { + "epoch": 1.4057843757870305, + "grad_norm": 0.5249059355857912, + "learning_rate": 4.28323275645195e-05, + "loss": 11.933, + "step": 25816 + }, + { + "epoch": 1.4058388297836135, + "grad_norm": 0.5088191200406901, + "learning_rate": 4.282509259151922e-05, + "loss": 11.9727, + "step": 25817 + }, + { + "epoch": 1.4058932837801965, + "grad_norm": 0.6366457523019411, + "learning_rate": 4.281785806312061e-05, + "loss": 11.6813, + "step": 25818 + }, + { + "epoch": 1.4059477377767795, + "grad_norm": 0.5581525711871052, + "learning_rate": 4.281062397937992e-05, + "loss": 11.836, + "step": 25819 + }, + { + "epoch": 1.4060021917733625, + "grad_norm": 0.6347471647572209, + "learning_rate": 4.280339034035338e-05, + "loss": 11.9899, + "step": 25820 + }, + { + "epoch": 1.4060566457699455, + "grad_norm": 0.5325272664915921, + "learning_rate": 4.279615714609726e-05, + "loss": 11.829, + "step": 25821 + }, + { + "epoch": 1.4061110997665285, + "grad_norm": 0.48389713357782344, + "learning_rate": 4.278892439666785e-05, + "loss": 11.8101, + "step": 25822 + }, + { + "epoch": 1.4061655537631115, + "grad_norm": 0.5360783902279086, + "learning_rate": 4.278169209212133e-05, + "loss": 11.8839, + "step": 25823 + }, + { + "epoch": 1.4062200077596945, + "grad_norm": 0.5481278820150643, + "learning_rate": 4.2774460232514e-05, + "loss": 11.8488, + "step": 25824 + }, + { + "epoch": 1.4062744617562775, + "grad_norm": 0.5720749791879448, + "learning_rate": 4.276722881790203e-05, + "loss": 11.8759, + "step": 25825 + }, + { + "epoch": 1.4063289157528605, + "grad_norm": 0.5443309638373263, + "learning_rate": 4.275999784834172e-05, + "loss": 11.7165, + "step": 25826 + }, + { + "epoch": 1.4063833697494434, + "grad_norm": 0.5959614687964457, + "learning_rate": 4.275276732388923e-05, + "loss": 11.933, + "step": 25827 + }, + { + "epoch": 1.4064378237460264, + "grad_norm": 0.5179583093528469, + "learning_rate": 4.274553724460086e-05, + "loss": 11.8332, + "step": 25828 + }, + { + "epoch": 1.4064922777426097, + "grad_norm": 0.5274418774361285, + "learning_rate": 4.273830761053279e-05, + "loss": 11.8372, + "step": 25829 + }, + { + "epoch": 1.4065467317391926, + "grad_norm": 0.5491854188103588, + "learning_rate": 4.273107842174121e-05, + "loss": 11.862, + "step": 25830 + }, + { + "epoch": 1.4066011857357756, + "grad_norm": 0.560334495452463, + "learning_rate": 4.272384967828237e-05, + "loss": 11.8708, + "step": 25831 + }, + { + "epoch": 1.4066556397323586, + "grad_norm": 0.4878984289510161, + "learning_rate": 4.2716621380212505e-05, + "loss": 11.8301, + "step": 25832 + }, + { + "epoch": 1.4067100937289416, + "grad_norm": 0.5708076057091399, + "learning_rate": 4.270939352758778e-05, + "loss": 11.7839, + "step": 25833 + }, + { + "epoch": 1.4067645477255246, + "grad_norm": 0.5787300159516624, + "learning_rate": 4.270216612046445e-05, + "loss": 11.909, + "step": 25834 + }, + { + "epoch": 1.4068190017221076, + "grad_norm": 0.5941794725826334, + "learning_rate": 4.269493915889864e-05, + "loss": 12.0, + "step": 25835 + }, + { + "epoch": 1.4068734557186906, + "grad_norm": 0.5648772748489747, + "learning_rate": 4.2687712642946634e-05, + "loss": 11.8915, + "step": 25836 + }, + { + "epoch": 1.4069279097152736, + "grad_norm": 0.5066468622749574, + "learning_rate": 4.268048657266455e-05, + "loss": 11.8217, + "step": 25837 + }, + { + "epoch": 1.4069823637118568, + "grad_norm": 0.613763671049065, + "learning_rate": 4.267326094810865e-05, + "loss": 12.0097, + "step": 25838 + }, + { + "epoch": 1.4070368177084398, + "grad_norm": 0.5151996896175168, + "learning_rate": 4.2666035769335086e-05, + "loss": 11.8486, + "step": 25839 + }, + { + "epoch": 1.4070912717050228, + "grad_norm": 0.5229846377104254, + "learning_rate": 4.265881103640002e-05, + "loss": 11.9894, + "step": 25840 + }, + { + "epoch": 1.4071457257016058, + "grad_norm": 0.5552971388629591, + "learning_rate": 4.265158674935967e-05, + "loss": 11.8026, + "step": 25841 + }, + { + "epoch": 1.4072001796981888, + "grad_norm": 0.5820783232430011, + "learning_rate": 4.2644362908270174e-05, + "loss": 11.8772, + "step": 25842 + }, + { + "epoch": 1.4072546336947718, + "grad_norm": 0.5743819396700995, + "learning_rate": 4.263713951318773e-05, + "loss": 11.8406, + "step": 25843 + }, + { + "epoch": 1.4073090876913548, + "grad_norm": 0.5078189515427319, + "learning_rate": 4.262991656416854e-05, + "loss": 11.6802, + "step": 25844 + }, + { + "epoch": 1.4073635416879378, + "grad_norm": 0.5683440894347913, + "learning_rate": 4.262269406126871e-05, + "loss": 11.8316, + "step": 25845 + }, + { + "epoch": 1.4074179956845208, + "grad_norm": 0.5744332619359597, + "learning_rate": 4.261547200454446e-05, + "loss": 11.9266, + "step": 25846 + }, + { + "epoch": 1.4074724496811037, + "grad_norm": 0.5612854370753293, + "learning_rate": 4.26082503940519e-05, + "loss": 11.8786, + "step": 25847 + }, + { + "epoch": 1.4075269036776867, + "grad_norm": 0.5525059653758136, + "learning_rate": 4.2601029229847235e-05, + "loss": 11.777, + "step": 25848 + }, + { + "epoch": 1.4075813576742697, + "grad_norm": 0.6006601390818188, + "learning_rate": 4.25938085119866e-05, + "loss": 11.9093, + "step": 25849 + }, + { + "epoch": 1.4076358116708527, + "grad_norm": 0.5156693231260574, + "learning_rate": 4.2586588240526095e-05, + "loss": 11.9375, + "step": 25850 + }, + { + "epoch": 1.4076902656674357, + "grad_norm": 0.5669870649255772, + "learning_rate": 4.2579368415521946e-05, + "loss": 11.9091, + "step": 25851 + }, + { + "epoch": 1.4077447196640187, + "grad_norm": 0.5197345446943886, + "learning_rate": 4.2572149037030226e-05, + "loss": 11.839, + "step": 25852 + }, + { + "epoch": 1.407799173660602, + "grad_norm": 0.5070070801694782, + "learning_rate": 4.2564930105107104e-05, + "loss": 11.9568, + "step": 25853 + }, + { + "epoch": 1.407853627657185, + "grad_norm": 0.5339323222051076, + "learning_rate": 4.255771161980875e-05, + "loss": 11.812, + "step": 25854 + }, + { + "epoch": 1.407908081653768, + "grad_norm": 0.5444994776815661, + "learning_rate": 4.255049358119123e-05, + "loss": 11.913, + "step": 25855 + }, + { + "epoch": 1.407962535650351, + "grad_norm": 0.5496574992041923, + "learning_rate": 4.254327598931074e-05, + "loss": 11.9443, + "step": 25856 + }, + { + "epoch": 1.408016989646934, + "grad_norm": 0.5914788405976005, + "learning_rate": 4.253605884422334e-05, + "loss": 11.8381, + "step": 25857 + }, + { + "epoch": 1.4080714436435169, + "grad_norm": 0.5455994167009618, + "learning_rate": 4.2528842145985214e-05, + "loss": 11.8733, + "step": 25858 + }, + { + "epoch": 1.4081258976400999, + "grad_norm": 0.5345820288709208, + "learning_rate": 4.252162589465245e-05, + "loss": 12.0014, + "step": 25859 + }, + { + "epoch": 1.4081803516366829, + "grad_norm": 0.5798557355366813, + "learning_rate": 4.251441009028113e-05, + "loss": 11.958, + "step": 25860 + }, + { + "epoch": 1.408234805633266, + "grad_norm": 0.5207074577882463, + "learning_rate": 4.2507194732927433e-05, + "loss": 11.8906, + "step": 25861 + }, + { + "epoch": 1.408289259629849, + "grad_norm": 0.5041419379473353, + "learning_rate": 4.2499979822647396e-05, + "loss": 11.8203, + "step": 25862 + }, + { + "epoch": 1.408343713626432, + "grad_norm": 0.5565637241578298, + "learning_rate": 4.2492765359497187e-05, + "loss": 11.7416, + "step": 25863 + }, + { + "epoch": 1.408398167623015, + "grad_norm": 0.5591302357801531, + "learning_rate": 4.248555134353285e-05, + "loss": 12.0768, + "step": 25864 + }, + { + "epoch": 1.408452621619598, + "grad_norm": 0.5502567109883069, + "learning_rate": 4.2478337774810504e-05, + "loss": 11.9551, + "step": 25865 + }, + { + "epoch": 1.408507075616181, + "grad_norm": 0.5295241005744646, + "learning_rate": 4.2471124653386294e-05, + "loss": 11.8909, + "step": 25866 + }, + { + "epoch": 1.408561529612764, + "grad_norm": 0.5834154258213458, + "learning_rate": 4.2463911979316226e-05, + "loss": 11.6616, + "step": 25867 + }, + { + "epoch": 1.408615983609347, + "grad_norm": 0.5363876180307716, + "learning_rate": 4.245669975265646e-05, + "loss": 11.9225, + "step": 25868 + }, + { + "epoch": 1.40867043760593, + "grad_norm": 0.5021256883576487, + "learning_rate": 4.2449487973463055e-05, + "loss": 11.9048, + "step": 25869 + }, + { + "epoch": 1.408724891602513, + "grad_norm": 0.5526527474962228, + "learning_rate": 4.2442276641792046e-05, + "loss": 11.9494, + "step": 25870 + }, + { + "epoch": 1.408779345599096, + "grad_norm": 0.5518407773921247, + "learning_rate": 4.2435065757699575e-05, + "loss": 11.8001, + "step": 25871 + }, + { + "epoch": 1.408833799595679, + "grad_norm": 0.535865054136345, + "learning_rate": 4.242785532124165e-05, + "loss": 11.7701, + "step": 25872 + }, + { + "epoch": 1.408888253592262, + "grad_norm": 0.5547459065259486, + "learning_rate": 4.242064533247442e-05, + "loss": 11.9321, + "step": 25873 + }, + { + "epoch": 1.408942707588845, + "grad_norm": 0.5226768792334109, + "learning_rate": 4.241343579145387e-05, + "loss": 11.8714, + "step": 25874 + }, + { + "epoch": 1.408997161585428, + "grad_norm": 0.5393516411682427, + "learning_rate": 4.240622669823614e-05, + "loss": 11.9472, + "step": 25875 + }, + { + "epoch": 1.4090516155820112, + "grad_norm": 0.49869323998245224, + "learning_rate": 4.2399018052877205e-05, + "loss": 11.8296, + "step": 25876 + }, + { + "epoch": 1.4091060695785942, + "grad_norm": 0.5525337755186532, + "learning_rate": 4.2391809855433164e-05, + "loss": 11.8329, + "step": 25877 + }, + { + "epoch": 1.4091605235751772, + "grad_norm": 0.5040937477148583, + "learning_rate": 4.2384602105960115e-05, + "loss": 11.9391, + "step": 25878 + }, + { + "epoch": 1.4092149775717602, + "grad_norm": 0.5958430697037971, + "learning_rate": 4.237739480451406e-05, + "loss": 12.0315, + "step": 25879 + }, + { + "epoch": 1.4092694315683432, + "grad_norm": 0.5141700211675545, + "learning_rate": 4.237018795115101e-05, + "loss": 11.8403, + "step": 25880 + }, + { + "epoch": 1.4093238855649262, + "grad_norm": 0.5118231921700256, + "learning_rate": 4.236298154592707e-05, + "loss": 11.9423, + "step": 25881 + }, + { + "epoch": 1.4093783395615092, + "grad_norm": 0.5595852370792733, + "learning_rate": 4.235577558889823e-05, + "loss": 11.9391, + "step": 25882 + }, + { + "epoch": 1.4094327935580921, + "grad_norm": 0.5730760624709044, + "learning_rate": 4.2348570080120576e-05, + "loss": 11.8369, + "step": 25883 + }, + { + "epoch": 1.4094872475546751, + "grad_norm": 0.5328544162840606, + "learning_rate": 4.234136501965007e-05, + "loss": 11.8821, + "step": 25884 + }, + { + "epoch": 1.4095417015512584, + "grad_norm": 0.49246478701004204, + "learning_rate": 4.2334160407542824e-05, + "loss": 11.9199, + "step": 25885 + }, + { + "epoch": 1.4095961555478413, + "grad_norm": 0.598958754766261, + "learning_rate": 4.232695624385478e-05, + "loss": 11.7857, + "step": 25886 + }, + { + "epoch": 1.4096506095444243, + "grad_norm": 0.522251487365959, + "learning_rate": 4.2319752528641986e-05, + "loss": 11.8622, + "step": 25887 + }, + { + "epoch": 1.4097050635410073, + "grad_norm": 0.5077513731704751, + "learning_rate": 4.231254926196051e-05, + "loss": 11.8541, + "step": 25888 + }, + { + "epoch": 1.4097595175375903, + "grad_norm": 0.5783091568956663, + "learning_rate": 4.230534644386632e-05, + "loss": 11.8103, + "step": 25889 + }, + { + "epoch": 1.4098139715341733, + "grad_norm": 0.5622060053448952, + "learning_rate": 4.229814407441544e-05, + "loss": 11.8969, + "step": 25890 + }, + { + "epoch": 1.4098684255307563, + "grad_norm": 0.6072125634276111, + "learning_rate": 4.2290942153663825e-05, + "loss": 11.9061, + "step": 25891 + }, + { + "epoch": 1.4099228795273393, + "grad_norm": 0.6973086392010021, + "learning_rate": 4.228374068166753e-05, + "loss": 11.8754, + "step": 25892 + }, + { + "epoch": 1.4099773335239223, + "grad_norm": 0.513304898126486, + "learning_rate": 4.2276539658482574e-05, + "loss": 11.8946, + "step": 25893 + }, + { + "epoch": 1.4100317875205053, + "grad_norm": 0.5527366824529903, + "learning_rate": 4.226933908416489e-05, + "loss": 11.7546, + "step": 25894 + }, + { + "epoch": 1.4100862415170883, + "grad_norm": 0.5491514441669649, + "learning_rate": 4.226213895877056e-05, + "loss": 11.9177, + "step": 25895 + }, + { + "epoch": 1.4101406955136713, + "grad_norm": 0.550730264638003, + "learning_rate": 4.2254939282355466e-05, + "loss": 11.8666, + "step": 25896 + }, + { + "epoch": 1.4101951495102543, + "grad_norm": 0.5456382860175364, + "learning_rate": 4.224774005497568e-05, + "loss": 11.753, + "step": 25897 + }, + { + "epoch": 1.4102496035068373, + "grad_norm": 0.5516554284156222, + "learning_rate": 4.224054127668713e-05, + "loss": 11.8482, + "step": 25898 + }, + { + "epoch": 1.4103040575034205, + "grad_norm": 0.5625259871003233, + "learning_rate": 4.223334294754585e-05, + "loss": 11.904, + "step": 25899 + }, + { + "epoch": 1.4103585115000035, + "grad_norm": 0.5618478905726333, + "learning_rate": 4.222614506760778e-05, + "loss": 11.9644, + "step": 25900 + }, + { + "epoch": 1.4104129654965865, + "grad_norm": 0.6460636520761004, + "learning_rate": 4.221894763692886e-05, + "loss": 12.0559, + "step": 25901 + }, + { + "epoch": 1.4104674194931694, + "grad_norm": 0.5273121768935696, + "learning_rate": 4.22117506555651e-05, + "loss": 11.9369, + "step": 25902 + }, + { + "epoch": 1.4105218734897524, + "grad_norm": 0.5162378960358935, + "learning_rate": 4.2204554123572485e-05, + "loss": 11.9836, + "step": 25903 + }, + { + "epoch": 1.4105763274863354, + "grad_norm": 0.5196327394712076, + "learning_rate": 4.219735804100691e-05, + "loss": 11.7405, + "step": 25904 + }, + { + "epoch": 1.4106307814829184, + "grad_norm": 0.5254282810437462, + "learning_rate": 4.219016240792442e-05, + "loss": 11.9094, + "step": 25905 + }, + { + "epoch": 1.4106852354795014, + "grad_norm": 0.5216154572046777, + "learning_rate": 4.218296722438087e-05, + "loss": 11.8347, + "step": 25906 + }, + { + "epoch": 1.4107396894760844, + "grad_norm": 0.541167348260566, + "learning_rate": 4.217577249043231e-05, + "loss": 11.9124, + "step": 25907 + }, + { + "epoch": 1.4107941434726676, + "grad_norm": 0.5565774346147542, + "learning_rate": 4.21685782061346e-05, + "loss": 11.8609, + "step": 25908 + }, + { + "epoch": 1.4108485974692506, + "grad_norm": 0.51847801428641, + "learning_rate": 4.216138437154377e-05, + "loss": 11.862, + "step": 25909 + }, + { + "epoch": 1.4109030514658336, + "grad_norm": 0.5120099100789547, + "learning_rate": 4.2154190986715704e-05, + "loss": 11.8856, + "step": 25910 + }, + { + "epoch": 1.4109575054624166, + "grad_norm": 0.5283512903397058, + "learning_rate": 4.214699805170632e-05, + "loss": 11.864, + "step": 25911 + }, + { + "epoch": 1.4110119594589996, + "grad_norm": 0.5636639989040957, + "learning_rate": 4.213980556657161e-05, + "loss": 11.9639, + "step": 25912 + }, + { + "epoch": 1.4110664134555826, + "grad_norm": 0.6077813526368897, + "learning_rate": 4.213261353136745e-05, + "loss": 11.934, + "step": 25913 + }, + { + "epoch": 1.4111208674521656, + "grad_norm": 0.5086606259692638, + "learning_rate": 4.212542194614978e-05, + "loss": 11.7002, + "step": 25914 + }, + { + "epoch": 1.4111753214487486, + "grad_norm": 0.5502609041887185, + "learning_rate": 4.211823081097458e-05, + "loss": 11.8365, + "step": 25915 + }, + { + "epoch": 1.4112297754453316, + "grad_norm": 0.5273037271101029, + "learning_rate": 4.2111040125897685e-05, + "loss": 11.8961, + "step": 25916 + }, + { + "epoch": 1.4112842294419146, + "grad_norm": 0.5133770616820434, + "learning_rate": 4.210384989097508e-05, + "loss": 11.8726, + "step": 25917 + }, + { + "epoch": 1.4113386834384976, + "grad_norm": 0.5029753219698895, + "learning_rate": 4.209666010626262e-05, + "loss": 11.79, + "step": 25918 + }, + { + "epoch": 1.4113931374350805, + "grad_norm": 0.4950874803758819, + "learning_rate": 4.208947077181626e-05, + "loss": 11.8046, + "step": 25919 + }, + { + "epoch": 1.4114475914316635, + "grad_norm": 0.6203688283204638, + "learning_rate": 4.208228188769191e-05, + "loss": 11.9855, + "step": 25920 + }, + { + "epoch": 1.4115020454282465, + "grad_norm": 0.5162160457790488, + "learning_rate": 4.20750934539454e-05, + "loss": 11.8382, + "step": 25921 + }, + { + "epoch": 1.4115564994248295, + "grad_norm": 0.5355800112870308, + "learning_rate": 4.206790547063272e-05, + "loss": 11.8719, + "step": 25922 + }, + { + "epoch": 1.4116109534214127, + "grad_norm": 0.6020767351640445, + "learning_rate": 4.206071793780968e-05, + "loss": 11.9148, + "step": 25923 + }, + { + "epoch": 1.4116654074179957, + "grad_norm": 0.5947597914235936, + "learning_rate": 4.2053530855532254e-05, + "loss": 11.9118, + "step": 25924 + }, + { + "epoch": 1.4117198614145787, + "grad_norm": 0.5909367902531036, + "learning_rate": 4.204634422385625e-05, + "loss": 11.84, + "step": 25925 + }, + { + "epoch": 1.4117743154111617, + "grad_norm": 0.5570109662277771, + "learning_rate": 4.203915804283759e-05, + "loss": 11.8911, + "step": 25926 + }, + { + "epoch": 1.4118287694077447, + "grad_norm": 0.5541344179528458, + "learning_rate": 4.2031972312532196e-05, + "loss": 11.8311, + "step": 25927 + }, + { + "epoch": 1.4118832234043277, + "grad_norm": 0.5637549961939454, + "learning_rate": 4.202478703299587e-05, + "loss": 11.8737, + "step": 25928 + }, + { + "epoch": 1.4119376774009107, + "grad_norm": 0.5726764453654672, + "learning_rate": 4.201760220428457e-05, + "loss": 11.9924, + "step": 25929 + }, + { + "epoch": 1.4119921313974937, + "grad_norm": 0.5894911664127516, + "learning_rate": 4.2010417826454106e-05, + "loss": 11.8918, + "step": 25930 + }, + { + "epoch": 1.412046585394077, + "grad_norm": 0.6004226055445582, + "learning_rate": 4.200323389956032e-05, + "loss": 11.8862, + "step": 25931 + }, + { + "epoch": 1.41210103939066, + "grad_norm": 0.5289710695479329, + "learning_rate": 4.199605042365916e-05, + "loss": 11.7657, + "step": 25932 + }, + { + "epoch": 1.4121554933872429, + "grad_norm": 0.5841983179479884, + "learning_rate": 4.19888673988064e-05, + "loss": 11.9163, + "step": 25933 + }, + { + "epoch": 1.4122099473838259, + "grad_norm": 0.5469696232190195, + "learning_rate": 4.1981684825057966e-05, + "loss": 11.9362, + "step": 25934 + }, + { + "epoch": 1.4122644013804089, + "grad_norm": 0.5852481285258428, + "learning_rate": 4.197450270246964e-05, + "loss": 11.9623, + "step": 25935 + }, + { + "epoch": 1.4123188553769919, + "grad_norm": 0.5803512784379835, + "learning_rate": 4.196732103109733e-05, + "loss": 11.9014, + "step": 25936 + }, + { + "epoch": 1.4123733093735749, + "grad_norm": 0.5560973915181886, + "learning_rate": 4.196013981099689e-05, + "loss": 11.9202, + "step": 25937 + }, + { + "epoch": 1.4124277633701579, + "grad_norm": 0.5695512447169855, + "learning_rate": 4.1952959042224095e-05, + "loss": 11.9226, + "step": 25938 + }, + { + "epoch": 1.4124822173667408, + "grad_norm": 0.555477549408239, + "learning_rate": 4.1945778724834906e-05, + "loss": 11.9163, + "step": 25939 + }, + { + "epoch": 1.4125366713633238, + "grad_norm": 0.5212130144156074, + "learning_rate": 4.193859885888499e-05, + "loss": 11.9148, + "step": 25940 + }, + { + "epoch": 1.4125911253599068, + "grad_norm": 0.5884439021591624, + "learning_rate": 4.1931419444430274e-05, + "loss": 11.9955, + "step": 25941 + }, + { + "epoch": 1.4126455793564898, + "grad_norm": 0.5246729502820188, + "learning_rate": 4.192424048152662e-05, + "loss": 11.9317, + "step": 25942 + }, + { + "epoch": 1.4127000333530728, + "grad_norm": 0.5588712101795598, + "learning_rate": 4.1917061970229775e-05, + "loss": 11.8533, + "step": 25943 + }, + { + "epoch": 1.4127544873496558, + "grad_norm": 0.5507920691027859, + "learning_rate": 4.1909883910595625e-05, + "loss": 11.9232, + "step": 25944 + }, + { + "epoch": 1.4128089413462388, + "grad_norm": 0.6211357304721636, + "learning_rate": 4.190270630267994e-05, + "loss": 11.8887, + "step": 25945 + }, + { + "epoch": 1.412863395342822, + "grad_norm": 0.5255692057499124, + "learning_rate": 4.189552914653857e-05, + "loss": 11.8578, + "step": 25946 + }, + { + "epoch": 1.412917849339405, + "grad_norm": 0.5233315789125356, + "learning_rate": 4.188835244222729e-05, + "loss": 12.0373, + "step": 25947 + }, + { + "epoch": 1.412972303335988, + "grad_norm": 0.6191500565628775, + "learning_rate": 4.188117618980192e-05, + "loss": 11.9274, + "step": 25948 + }, + { + "epoch": 1.413026757332571, + "grad_norm": 0.5176264051522861, + "learning_rate": 4.187400038931834e-05, + "loss": 11.8133, + "step": 25949 + }, + { + "epoch": 1.413081211329154, + "grad_norm": 0.5517724031078244, + "learning_rate": 4.1866825040832205e-05, + "loss": 11.953, + "step": 25950 + }, + { + "epoch": 1.413135665325737, + "grad_norm": 0.5731983108006287, + "learning_rate": 4.185965014439939e-05, + "loss": 11.9859, + "step": 25951 + }, + { + "epoch": 1.41319011932232, + "grad_norm": 0.5820963108441654, + "learning_rate": 4.1852475700075724e-05, + "loss": 11.7734, + "step": 25952 + }, + { + "epoch": 1.413244573318903, + "grad_norm": 0.5010189682805626, + "learning_rate": 4.184530170791692e-05, + "loss": 11.8814, + "step": 25953 + }, + { + "epoch": 1.413299027315486, + "grad_norm": 0.5401379374152437, + "learning_rate": 4.183812816797884e-05, + "loss": 11.9156, + "step": 25954 + }, + { + "epoch": 1.4133534813120692, + "grad_norm": 0.5478793243519098, + "learning_rate": 4.183095508031719e-05, + "loss": 11.7476, + "step": 25955 + }, + { + "epoch": 1.4134079353086522, + "grad_norm": 0.5464283418929661, + "learning_rate": 4.182378244498784e-05, + "loss": 11.9163, + "step": 25956 + }, + { + "epoch": 1.4134623893052352, + "grad_norm": 0.4842141536250966, + "learning_rate": 4.1816610262046464e-05, + "loss": 11.7685, + "step": 25957 + }, + { + "epoch": 1.4135168433018181, + "grad_norm": 0.5477737922724731, + "learning_rate": 4.180943853154888e-05, + "loss": 12.0035, + "step": 25958 + }, + { + "epoch": 1.4135712972984011, + "grad_norm": 0.538179849185479, + "learning_rate": 4.1802267253550945e-05, + "loss": 11.8145, + "step": 25959 + }, + { + "epoch": 1.4136257512949841, + "grad_norm": 0.5399542451935301, + "learning_rate": 4.179509642810826e-05, + "loss": 11.7071, + "step": 25960 + }, + { + "epoch": 1.4136802052915671, + "grad_norm": 0.5188807013594945, + "learning_rate": 4.178792605527669e-05, + "loss": 11.9183, + "step": 25961 + }, + { + "epoch": 1.4137346592881501, + "grad_norm": 0.5547543279310141, + "learning_rate": 4.178075613511195e-05, + "loss": 11.8354, + "step": 25962 + }, + { + "epoch": 1.4137891132847331, + "grad_norm": 0.5450098727823062, + "learning_rate": 4.1773586667669815e-05, + "loss": 11.9785, + "step": 25963 + }, + { + "epoch": 1.413843567281316, + "grad_norm": 0.5382920333937433, + "learning_rate": 4.1766417653006065e-05, + "loss": 11.7307, + "step": 25964 + }, + { + "epoch": 1.413898021277899, + "grad_norm": 0.5608936529144182, + "learning_rate": 4.175924909117638e-05, + "loss": 11.9767, + "step": 25965 + }, + { + "epoch": 1.413952475274482, + "grad_norm": 0.5389868238399339, + "learning_rate": 4.175208098223657e-05, + "loss": 11.7917, + "step": 25966 + }, + { + "epoch": 1.414006929271065, + "grad_norm": 0.5091692630679351, + "learning_rate": 4.174491332624233e-05, + "loss": 11.8441, + "step": 25967 + }, + { + "epoch": 1.414061383267648, + "grad_norm": 0.6467762865340976, + "learning_rate": 4.173774612324943e-05, + "loss": 11.9332, + "step": 25968 + }, + { + "epoch": 1.4141158372642313, + "grad_norm": 0.5567841750221176, + "learning_rate": 4.17305793733136e-05, + "loss": 11.9145, + "step": 25969 + }, + { + "epoch": 1.4141702912608143, + "grad_norm": 0.5222396009735883, + "learning_rate": 4.1723413076490514e-05, + "loss": 11.8906, + "step": 25970 + }, + { + "epoch": 1.4142247452573973, + "grad_norm": 0.5485755108908761, + "learning_rate": 4.171624723283598e-05, + "loss": 11.913, + "step": 25971 + }, + { + "epoch": 1.4142791992539803, + "grad_norm": 0.5922561876464869, + "learning_rate": 4.170908184240565e-05, + "loss": 11.9357, + "step": 25972 + }, + { + "epoch": 1.4143336532505633, + "grad_norm": 0.5856668848009153, + "learning_rate": 4.17019169052553e-05, + "loss": 11.8688, + "step": 25973 + }, + { + "epoch": 1.4143881072471463, + "grad_norm": 0.538227228542079, + "learning_rate": 4.169475242144059e-05, + "loss": 11.9368, + "step": 25974 + }, + { + "epoch": 1.4144425612437292, + "grad_norm": 0.5515683291958238, + "learning_rate": 4.168758839101726e-05, + "loss": 11.8369, + "step": 25975 + }, + { + "epoch": 1.4144970152403122, + "grad_norm": 0.5537332162780446, + "learning_rate": 4.168042481404106e-05, + "loss": 11.9316, + "step": 25976 + }, + { + "epoch": 1.4145514692368952, + "grad_norm": 0.5196901118528175, + "learning_rate": 4.1673261690567625e-05, + "loss": 11.9138, + "step": 25977 + }, + { + "epoch": 1.4146059232334784, + "grad_norm": 0.5744457087145551, + "learning_rate": 4.1666099020652725e-05, + "loss": 11.9619, + "step": 25978 + }, + { + "epoch": 1.4146603772300614, + "grad_norm": 0.5410766570338509, + "learning_rate": 4.1658936804352014e-05, + "loss": 11.8747, + "step": 25979 + }, + { + "epoch": 1.4147148312266444, + "grad_norm": 0.4945232811183512, + "learning_rate": 4.165177504172116e-05, + "loss": 11.8752, + "step": 25980 + }, + { + "epoch": 1.4147692852232274, + "grad_norm": 0.508991236122, + "learning_rate": 4.164461373281593e-05, + "loss": 11.9007, + "step": 25981 + }, + { + "epoch": 1.4148237392198104, + "grad_norm": 0.5330165501366706, + "learning_rate": 4.1637452877691926e-05, + "loss": 11.8423, + "step": 25982 + }, + { + "epoch": 1.4148781932163934, + "grad_norm": 0.5652449436507545, + "learning_rate": 4.163029247640492e-05, + "loss": 11.6582, + "step": 25983 + }, + { + "epoch": 1.4149326472129764, + "grad_norm": 0.519014294380056, + "learning_rate": 4.16231325290105e-05, + "loss": 11.8555, + "step": 25984 + }, + { + "epoch": 1.4149871012095594, + "grad_norm": 0.5537374027393267, + "learning_rate": 4.16159730355644e-05, + "loss": 11.7105, + "step": 25985 + }, + { + "epoch": 1.4150415552061424, + "grad_norm": 0.570714286819854, + "learning_rate": 4.160881399612232e-05, + "loss": 12.0244, + "step": 25986 + }, + { + "epoch": 1.4150960092027254, + "grad_norm": 0.5083293404096292, + "learning_rate": 4.160165541073986e-05, + "loss": 11.8749, + "step": 25987 + }, + { + "epoch": 1.4151504631993084, + "grad_norm": 0.5373861650093892, + "learning_rate": 4.1594497279472756e-05, + "loss": 11.9579, + "step": 25988 + }, + { + "epoch": 1.4152049171958914, + "grad_norm": 0.5370248081639699, + "learning_rate": 4.158733960237664e-05, + "loss": 11.8867, + "step": 25989 + }, + { + "epoch": 1.4152593711924744, + "grad_norm": 0.5386214998025568, + "learning_rate": 4.1580182379507125e-05, + "loss": 11.8309, + "step": 25990 + }, + { + "epoch": 1.4153138251890574, + "grad_norm": 0.5192645532576364, + "learning_rate": 4.157302561091995e-05, + "loss": 11.8792, + "step": 25991 + }, + { + "epoch": 1.4153682791856403, + "grad_norm": 0.5721741265348732, + "learning_rate": 4.156586929667068e-05, + "loss": 11.7963, + "step": 25992 + }, + { + "epoch": 1.4154227331822236, + "grad_norm": 0.5601051246503809, + "learning_rate": 4.1558713436815055e-05, + "loss": 11.9062, + "step": 25993 + }, + { + "epoch": 1.4154771871788066, + "grad_norm": 0.5503199851394696, + "learning_rate": 4.1551558031408634e-05, + "loss": 11.7497, + "step": 25994 + }, + { + "epoch": 1.4155316411753895, + "grad_norm": 0.5728285074879068, + "learning_rate": 4.154440308050713e-05, + "loss": 11.8929, + "step": 25995 + }, + { + "epoch": 1.4155860951719725, + "grad_norm": 0.5737624825764145, + "learning_rate": 4.1537248584166136e-05, + "loss": 11.8988, + "step": 25996 + }, + { + "epoch": 1.4156405491685555, + "grad_norm": 0.5604210694310348, + "learning_rate": 4.1530094542441276e-05, + "loss": 11.6098, + "step": 25997 + }, + { + "epoch": 1.4156950031651385, + "grad_norm": 0.5971442269481954, + "learning_rate": 4.152294095538826e-05, + "loss": 11.632, + "step": 25998 + }, + { + "epoch": 1.4157494571617215, + "grad_norm": 0.587712392055818, + "learning_rate": 4.151578782306266e-05, + "loss": 11.8529, + "step": 25999 + }, + { + "epoch": 1.4158039111583045, + "grad_norm": 0.5199201438969665, + "learning_rate": 4.150863514552006e-05, + "loss": 11.8342, + "step": 26000 + }, + { + "epoch": 1.4158583651548877, + "grad_norm": 0.5491790085603228, + "learning_rate": 4.150148292281615e-05, + "loss": 11.8861, + "step": 26001 + }, + { + "epoch": 1.4159128191514707, + "grad_norm": 0.5391689353760124, + "learning_rate": 4.149433115500649e-05, + "loss": 11.9343, + "step": 26002 + }, + { + "epoch": 1.4159672731480537, + "grad_norm": 0.5539038199046, + "learning_rate": 4.148717984214676e-05, + "loss": 11.8852, + "step": 26003 + }, + { + "epoch": 1.4160217271446367, + "grad_norm": 0.5887859534750126, + "learning_rate": 4.1480028984292494e-05, + "loss": 11.8545, + "step": 26004 + }, + { + "epoch": 1.4160761811412197, + "grad_norm": 0.4961945883089661, + "learning_rate": 4.147287858149939e-05, + "loss": 11.819, + "step": 26005 + }, + { + "epoch": 1.4161306351378027, + "grad_norm": 0.5518306064887253, + "learning_rate": 4.1465728633822945e-05, + "loss": 11.9726, + "step": 26006 + }, + { + "epoch": 1.4161850891343857, + "grad_norm": 0.5842209688541331, + "learning_rate": 4.1458579141318806e-05, + "loss": 11.8569, + "step": 26007 + }, + { + "epoch": 1.4162395431309687, + "grad_norm": 0.5350962990333932, + "learning_rate": 4.145143010404261e-05, + "loss": 11.9312, + "step": 26008 + }, + { + "epoch": 1.4162939971275517, + "grad_norm": 0.5791220006522387, + "learning_rate": 4.144428152204992e-05, + "loss": 11.9225, + "step": 26009 + }, + { + "epoch": 1.4163484511241347, + "grad_norm": 0.5465158073386612, + "learning_rate": 4.143713339539631e-05, + "loss": 11.9973, + "step": 26010 + }, + { + "epoch": 1.4164029051207176, + "grad_norm": 0.54725218969395, + "learning_rate": 4.1429985724137344e-05, + "loss": 11.8318, + "step": 26011 + }, + { + "epoch": 1.4164573591173006, + "grad_norm": 0.5809625208062601, + "learning_rate": 4.142283850832862e-05, + "loss": 11.957, + "step": 26012 + }, + { + "epoch": 1.4165118131138836, + "grad_norm": 0.522191329054975, + "learning_rate": 4.141569174802577e-05, + "loss": 11.8639, + "step": 26013 + }, + { + "epoch": 1.4165662671104666, + "grad_norm": 0.5063744605868702, + "learning_rate": 4.1408545443284296e-05, + "loss": 11.8261, + "step": 26014 + }, + { + "epoch": 1.4166207211070496, + "grad_norm": 0.5846019839427633, + "learning_rate": 4.140139959415983e-05, + "loss": 11.8435, + "step": 26015 + }, + { + "epoch": 1.4166751751036328, + "grad_norm": 0.5365235964460565, + "learning_rate": 4.139425420070786e-05, + "loss": 11.9106, + "step": 26016 + }, + { + "epoch": 1.4167296291002158, + "grad_norm": 0.5778544788992478, + "learning_rate": 4.1387109262984045e-05, + "loss": 11.7469, + "step": 26017 + }, + { + "epoch": 1.4167840830967988, + "grad_norm": 0.553398277724836, + "learning_rate": 4.137996478104386e-05, + "loss": 11.8099, + "step": 26018 + }, + { + "epoch": 1.4168385370933818, + "grad_norm": 0.5741702152478732, + "learning_rate": 4.13728207549429e-05, + "loss": 11.8734, + "step": 26019 + }, + { + "epoch": 1.4168929910899648, + "grad_norm": 0.550900619627361, + "learning_rate": 4.136567718473678e-05, + "loss": 11.8239, + "step": 26020 + }, + { + "epoch": 1.4169474450865478, + "grad_norm": 0.544200301115898, + "learning_rate": 4.135853407048093e-05, + "loss": 11.8117, + "step": 26021 + }, + { + "epoch": 1.4170018990831308, + "grad_norm": 0.5174507075260631, + "learning_rate": 4.135139141223098e-05, + "loss": 11.908, + "step": 26022 + }, + { + "epoch": 1.4170563530797138, + "grad_norm": 0.6257017394501336, + "learning_rate": 4.1344249210042416e-05, + "loss": 12.0665, + "step": 26023 + }, + { + "epoch": 1.4171108070762968, + "grad_norm": 0.533852899698103, + "learning_rate": 4.13371074639708e-05, + "loss": 11.7009, + "step": 26024 + }, + { + "epoch": 1.41716526107288, + "grad_norm": 0.5477964478698966, + "learning_rate": 4.132996617407171e-05, + "loss": 11.8385, + "step": 26025 + }, + { + "epoch": 1.417219715069463, + "grad_norm": 0.6206393240749204, + "learning_rate": 4.132282534040062e-05, + "loss": 11.9334, + "step": 26026 + }, + { + "epoch": 1.417274169066046, + "grad_norm": 0.5392711500843311, + "learning_rate": 4.13156849630131e-05, + "loss": 11.5982, + "step": 26027 + }, + { + "epoch": 1.417328623062629, + "grad_norm": 0.5773436095740883, + "learning_rate": 4.130854504196463e-05, + "loss": 11.9154, + "step": 26028 + }, + { + "epoch": 1.417383077059212, + "grad_norm": 0.614142876669388, + "learning_rate": 4.130140557731078e-05, + "loss": 11.7, + "step": 26029 + }, + { + "epoch": 1.417437531055795, + "grad_norm": 0.6222823179624032, + "learning_rate": 4.1294266569107054e-05, + "loss": 11.9589, + "step": 26030 + }, + { + "epoch": 1.417491985052378, + "grad_norm": 0.4657596046349355, + "learning_rate": 4.1287128017408915e-05, + "loss": 11.8332, + "step": 26031 + }, + { + "epoch": 1.417546439048961, + "grad_norm": 0.533174547148453, + "learning_rate": 4.127998992227196e-05, + "loss": 11.8108, + "step": 26032 + }, + { + "epoch": 1.417600893045544, + "grad_norm": 0.5240046027988714, + "learning_rate": 4.1272852283751603e-05, + "loss": 11.9401, + "step": 26033 + }, + { + "epoch": 1.417655347042127, + "grad_norm": 0.5544403069218031, + "learning_rate": 4.1265715101903405e-05, + "loss": 11.8395, + "step": 26034 + }, + { + "epoch": 1.41770980103871, + "grad_norm": 0.4970618399510174, + "learning_rate": 4.1258578376782886e-05, + "loss": 11.7907, + "step": 26035 + }, + { + "epoch": 1.417764255035293, + "grad_norm": 0.5300108850664907, + "learning_rate": 4.125144210844547e-05, + "loss": 11.8877, + "step": 26036 + }, + { + "epoch": 1.417818709031876, + "grad_norm": 0.5711334236326334, + "learning_rate": 4.1244306296946736e-05, + "loss": 11.9022, + "step": 26037 + }, + { + "epoch": 1.417873163028459, + "grad_norm": 0.6409867052380667, + "learning_rate": 4.1237170942342094e-05, + "loss": 11.9016, + "step": 26038 + }, + { + "epoch": 1.417927617025042, + "grad_norm": 0.5461814100789183, + "learning_rate": 4.12300360446871e-05, + "loss": 11.8645, + "step": 26039 + }, + { + "epoch": 1.417982071021625, + "grad_norm": 0.5403477338091139, + "learning_rate": 4.12229016040372e-05, + "loss": 11.8089, + "step": 26040 + }, + { + "epoch": 1.418036525018208, + "grad_norm": 0.5330103235941679, + "learning_rate": 4.121576762044784e-05, + "loss": 11.8497, + "step": 26041 + }, + { + "epoch": 1.418090979014791, + "grad_norm": 0.5678020052658446, + "learning_rate": 4.1208634093974575e-05, + "loss": 11.9499, + "step": 26042 + }, + { + "epoch": 1.418145433011374, + "grad_norm": 0.5869124138748899, + "learning_rate": 4.120150102467279e-05, + "loss": 11.8834, + "step": 26043 + }, + { + "epoch": 1.418199887007957, + "grad_norm": 0.5267235374511102, + "learning_rate": 4.1194368412598015e-05, + "loss": 11.9181, + "step": 26044 + }, + { + "epoch": 1.41825434100454, + "grad_norm": 0.5408424887123142, + "learning_rate": 4.118723625780566e-05, + "loss": 11.907, + "step": 26045 + }, + { + "epoch": 1.418308795001123, + "grad_norm": 0.5820581437565443, + "learning_rate": 4.118010456035123e-05, + "loss": 11.8812, + "step": 26046 + }, + { + "epoch": 1.418363248997706, + "grad_norm": 0.5774985774803503, + "learning_rate": 4.11729733202902e-05, + "loss": 11.9813, + "step": 26047 + }, + { + "epoch": 1.4184177029942893, + "grad_norm": 0.5343610725050975, + "learning_rate": 4.1165842537677954e-05, + "loss": 11.8822, + "step": 26048 + }, + { + "epoch": 1.4184721569908723, + "grad_norm": 0.537262384378862, + "learning_rate": 4.1158712212570036e-05, + "loss": 11.9668, + "step": 26049 + }, + { + "epoch": 1.4185266109874552, + "grad_norm": 0.5063228528453032, + "learning_rate": 4.115158234502182e-05, + "loss": 11.7703, + "step": 26050 + }, + { + "epoch": 1.4185810649840382, + "grad_norm": 0.5652784886923571, + "learning_rate": 4.114445293508875e-05, + "loss": 11.8963, + "step": 26051 + }, + { + "epoch": 1.4186355189806212, + "grad_norm": 0.565569947237846, + "learning_rate": 4.1137323982826314e-05, + "loss": 11.8755, + "step": 26052 + }, + { + "epoch": 1.4186899729772042, + "grad_norm": 0.5304698072295083, + "learning_rate": 4.113019548828988e-05, + "loss": 11.7958, + "step": 26053 + }, + { + "epoch": 1.4187444269737872, + "grad_norm": 0.6203083568848862, + "learning_rate": 4.1123067451534944e-05, + "loss": 11.9796, + "step": 26054 + }, + { + "epoch": 1.4187988809703702, + "grad_norm": 0.5354760052231521, + "learning_rate": 4.111593987261689e-05, + "loss": 11.9106, + "step": 26055 + }, + { + "epoch": 1.4188533349669532, + "grad_norm": 0.5340381212757845, + "learning_rate": 4.110881275159115e-05, + "loss": 11.9029, + "step": 26056 + }, + { + "epoch": 1.4189077889635362, + "grad_norm": 0.6101841736073859, + "learning_rate": 4.11016860885132e-05, + "loss": 11.9062, + "step": 26057 + }, + { + "epoch": 1.4189622429601192, + "grad_norm": 0.5232982904949554, + "learning_rate": 4.109455988343839e-05, + "loss": 11.8734, + "step": 26058 + }, + { + "epoch": 1.4190166969567022, + "grad_norm": 0.5467383575943048, + "learning_rate": 4.108743413642219e-05, + "loss": 11.7749, + "step": 26059 + }, + { + "epoch": 1.4190711509532852, + "grad_norm": 0.5555725119473592, + "learning_rate": 4.1080308847519966e-05, + "loss": 11.7675, + "step": 26060 + }, + { + "epoch": 1.4191256049498682, + "grad_norm": 0.524627134341648, + "learning_rate": 4.1073184016787124e-05, + "loss": 11.8669, + "step": 26061 + }, + { + "epoch": 1.4191800589464514, + "grad_norm": 0.5357802924944004, + "learning_rate": 4.106605964427911e-05, + "loss": 11.8418, + "step": 26062 + }, + { + "epoch": 1.4192345129430344, + "grad_norm": 0.5722948634309796, + "learning_rate": 4.105893573005128e-05, + "loss": 11.8218, + "step": 26063 + }, + { + "epoch": 1.4192889669396174, + "grad_norm": 0.5262608179823776, + "learning_rate": 4.1051812274159064e-05, + "loss": 11.9163, + "step": 26064 + }, + { + "epoch": 1.4193434209362004, + "grad_norm": 0.5398488565451185, + "learning_rate": 4.104468927665782e-05, + "loss": 11.9241, + "step": 26065 + }, + { + "epoch": 1.4193978749327834, + "grad_norm": 0.5687012735162117, + "learning_rate": 4.1037566737602994e-05, + "loss": 11.8976, + "step": 26066 + }, + { + "epoch": 1.4194523289293663, + "grad_norm": 0.5131670006623822, + "learning_rate": 4.1030444657049894e-05, + "loss": 11.8278, + "step": 26067 + }, + { + "epoch": 1.4195067829259493, + "grad_norm": 0.5906771309699338, + "learning_rate": 4.102332303505395e-05, + "loss": 11.9589, + "step": 26068 + }, + { + "epoch": 1.4195612369225323, + "grad_norm": 0.5221242590418391, + "learning_rate": 4.1016201871670566e-05, + "loss": 11.7565, + "step": 26069 + }, + { + "epoch": 1.4196156909191153, + "grad_norm": 0.5080847532759977, + "learning_rate": 4.10090811669551e-05, + "loss": 11.8155, + "step": 26070 + }, + { + "epoch": 1.4196701449156985, + "grad_norm": 0.6013787933184735, + "learning_rate": 4.10019609209629e-05, + "loss": 12.0543, + "step": 26071 + }, + { + "epoch": 1.4197245989122815, + "grad_norm": 0.5656051014291049, + "learning_rate": 4.0994841133749316e-05, + "loss": 11.9374, + "step": 26072 + }, + { + "epoch": 1.4197790529088645, + "grad_norm": 0.5337381619939967, + "learning_rate": 4.098772180536975e-05, + "loss": 11.8808, + "step": 26073 + }, + { + "epoch": 1.4198335069054475, + "grad_norm": 0.5726554967655715, + "learning_rate": 4.098060293587957e-05, + "loss": 11.8528, + "step": 26074 + }, + { + "epoch": 1.4198879609020305, + "grad_norm": 0.6861360714871024, + "learning_rate": 4.09734845253341e-05, + "loss": 11.9746, + "step": 26075 + }, + { + "epoch": 1.4199424148986135, + "grad_norm": 0.5730838285555493, + "learning_rate": 4.096636657378875e-05, + "loss": 11.8877, + "step": 26076 + }, + { + "epoch": 1.4199968688951965, + "grad_norm": 0.5645379395460011, + "learning_rate": 4.09592490812988e-05, + "loss": 11.9075, + "step": 26077 + }, + { + "epoch": 1.4200513228917795, + "grad_norm": 0.5888163244518616, + "learning_rate": 4.0952132047919654e-05, + "loss": 11.8095, + "step": 26078 + }, + { + "epoch": 1.4201057768883625, + "grad_norm": 0.5114499624367852, + "learning_rate": 4.094501547370661e-05, + "loss": 11.8477, + "step": 26079 + }, + { + "epoch": 1.4201602308849455, + "grad_norm": 0.5129660993821542, + "learning_rate": 4.093789935871505e-05, + "loss": 11.847, + "step": 26080 + }, + { + "epoch": 1.4202146848815285, + "grad_norm": 0.5390772687294157, + "learning_rate": 4.09307837030003e-05, + "loss": 11.9161, + "step": 26081 + }, + { + "epoch": 1.4202691388781115, + "grad_norm": 0.5380222504227737, + "learning_rate": 4.092366850661763e-05, + "loss": 11.8749, + "step": 26082 + }, + { + "epoch": 1.4203235928746945, + "grad_norm": 0.5877424641458174, + "learning_rate": 4.0916553769622435e-05, + "loss": 11.9353, + "step": 26083 + }, + { + "epoch": 1.4203780468712774, + "grad_norm": 0.5670860176310908, + "learning_rate": 4.0909439492070054e-05, + "loss": 11.8937, + "step": 26084 + }, + { + "epoch": 1.4204325008678604, + "grad_norm": 0.6083928727948095, + "learning_rate": 4.090232567401576e-05, + "loss": 11.9563, + "step": 26085 + }, + { + "epoch": 1.4204869548644437, + "grad_norm": 0.5287215448878694, + "learning_rate": 4.089521231551492e-05, + "loss": 11.8916, + "step": 26086 + }, + { + "epoch": 1.4205414088610266, + "grad_norm": 0.509914956110607, + "learning_rate": 4.088809941662278e-05, + "loss": 11.7701, + "step": 26087 + }, + { + "epoch": 1.4205958628576096, + "grad_norm": 0.6305841139139768, + "learning_rate": 4.088098697739473e-05, + "loss": 11.903, + "step": 26088 + }, + { + "epoch": 1.4206503168541926, + "grad_norm": 0.5456601598513064, + "learning_rate": 4.087387499788601e-05, + "loss": 11.8674, + "step": 26089 + }, + { + "epoch": 1.4207047708507756, + "grad_norm": 0.5477823649061792, + "learning_rate": 4.086676347815198e-05, + "loss": 11.8448, + "step": 26090 + }, + { + "epoch": 1.4207592248473586, + "grad_norm": 0.56747306327434, + "learning_rate": 4.0859652418247915e-05, + "loss": 11.9006, + "step": 26091 + }, + { + "epoch": 1.4208136788439416, + "grad_norm": 0.566420837252433, + "learning_rate": 4.0852541818229085e-05, + "loss": 11.8488, + "step": 26092 + }, + { + "epoch": 1.4208681328405246, + "grad_norm": 0.5619605691291363, + "learning_rate": 4.084543167815084e-05, + "loss": 11.9379, + "step": 26093 + }, + { + "epoch": 1.4209225868371076, + "grad_norm": 0.5967933155463008, + "learning_rate": 4.08383219980684e-05, + "loss": 11.9458, + "step": 26094 + }, + { + "epoch": 1.4209770408336908, + "grad_norm": 0.5697095771538786, + "learning_rate": 4.083121277803709e-05, + "loss": 11.9208, + "step": 26095 + }, + { + "epoch": 1.4210314948302738, + "grad_norm": 0.5351231640418428, + "learning_rate": 4.082410401811222e-05, + "loss": 11.8725, + "step": 26096 + }, + { + "epoch": 1.4210859488268568, + "grad_norm": 0.6268998248893082, + "learning_rate": 4.081699571834902e-05, + "loss": 11.958, + "step": 26097 + }, + { + "epoch": 1.4211404028234398, + "grad_norm": 0.5329125590064157, + "learning_rate": 4.080988787880281e-05, + "loss": 11.8002, + "step": 26098 + }, + { + "epoch": 1.4211948568200228, + "grad_norm": 0.6251433177643696, + "learning_rate": 4.0802780499528804e-05, + "loss": 12.0663, + "step": 26099 + }, + { + "epoch": 1.4212493108166058, + "grad_norm": 0.5002374462493898, + "learning_rate": 4.079567358058235e-05, + "loss": 11.7771, + "step": 26100 + }, + { + "epoch": 1.4213037648131888, + "grad_norm": 0.5244976714655124, + "learning_rate": 4.078856712201866e-05, + "loss": 11.8338, + "step": 26101 + }, + { + "epoch": 1.4213582188097718, + "grad_norm": 0.5511202951548435, + "learning_rate": 4.0781461123892964e-05, + "loss": 11.8922, + "step": 26102 + }, + { + "epoch": 1.4214126728063547, + "grad_norm": 0.5225968468841047, + "learning_rate": 4.077435558626059e-05, + "loss": 11.8962, + "step": 26103 + }, + { + "epoch": 1.4214671268029377, + "grad_norm": 0.5870296286405391, + "learning_rate": 4.0767250509176734e-05, + "loss": 11.9209, + "step": 26104 + }, + { + "epoch": 1.4215215807995207, + "grad_norm": 0.5483696163688081, + "learning_rate": 4.076014589269667e-05, + "loss": 11.854, + "step": 26105 + }, + { + "epoch": 1.4215760347961037, + "grad_norm": 0.5629190365951552, + "learning_rate": 4.0753041736875675e-05, + "loss": 11.9133, + "step": 26106 + }, + { + "epoch": 1.4216304887926867, + "grad_norm": 0.5403040801153737, + "learning_rate": 4.0745938041768926e-05, + "loss": 11.9471, + "step": 26107 + }, + { + "epoch": 1.4216849427892697, + "grad_norm": 0.6256786439182609, + "learning_rate": 4.0738834807431734e-05, + "loss": 12.0484, + "step": 26108 + }, + { + "epoch": 1.421739396785853, + "grad_norm": 0.5477870347909671, + "learning_rate": 4.073173203391928e-05, + "loss": 11.8385, + "step": 26109 + }, + { + "epoch": 1.421793850782436, + "grad_norm": 0.5396897595736808, + "learning_rate": 4.0724629721286846e-05, + "loss": 11.9866, + "step": 26110 + }, + { + "epoch": 1.421848304779019, + "grad_norm": 0.548165056983725, + "learning_rate": 4.071752786958962e-05, + "loss": 11.9578, + "step": 26111 + }, + { + "epoch": 1.421902758775602, + "grad_norm": 0.5067345279142136, + "learning_rate": 4.071042647888281e-05, + "loss": 11.9181, + "step": 26112 + }, + { + "epoch": 1.421957212772185, + "grad_norm": 0.5302774756390238, + "learning_rate": 4.0703325549221715e-05, + "loss": 11.7496, + "step": 26113 + }, + { + "epoch": 1.422011666768768, + "grad_norm": 0.5373970293476852, + "learning_rate": 4.069622508066146e-05, + "loss": 11.9389, + "step": 26114 + }, + { + "epoch": 1.4220661207653509, + "grad_norm": 0.5824346892358903, + "learning_rate": 4.068912507325733e-05, + "loss": 11.7986, + "step": 26115 + }, + { + "epoch": 1.4221205747619339, + "grad_norm": 0.5117474109183592, + "learning_rate": 4.0682025527064486e-05, + "loss": 11.8509, + "step": 26116 + }, + { + "epoch": 1.4221750287585169, + "grad_norm": 0.5371351655839012, + "learning_rate": 4.067492644213815e-05, + "loss": 11.8704, + "step": 26117 + }, + { + "epoch": 1.4222294827551, + "grad_norm": 0.548480832880369, + "learning_rate": 4.066782781853358e-05, + "loss": 11.9652, + "step": 26118 + }, + { + "epoch": 1.422283936751683, + "grad_norm": 0.5413346781788357, + "learning_rate": 4.066072965630588e-05, + "loss": 11.8309, + "step": 26119 + }, + { + "epoch": 1.422338390748266, + "grad_norm": 0.5582722841388086, + "learning_rate": 4.065363195551039e-05, + "loss": 11.9146, + "step": 26120 + }, + { + "epoch": 1.422392844744849, + "grad_norm": 0.6060756787871406, + "learning_rate": 4.064653471620213e-05, + "loss": 12.0039, + "step": 26121 + }, + { + "epoch": 1.422447298741432, + "grad_norm": 0.5455678881802932, + "learning_rate": 4.0639437938436365e-05, + "loss": 11.8754, + "step": 26122 + }, + { + "epoch": 1.422501752738015, + "grad_norm": 0.5312607281569431, + "learning_rate": 4.063234162226832e-05, + "loss": 11.8576, + "step": 26123 + }, + { + "epoch": 1.422556206734598, + "grad_norm": 0.5362934715754557, + "learning_rate": 4.062524576775312e-05, + "loss": 11.8552, + "step": 26124 + }, + { + "epoch": 1.422610660731181, + "grad_norm": 0.5407476284695961, + "learning_rate": 4.0618150374946e-05, + "loss": 11.9082, + "step": 26125 + }, + { + "epoch": 1.422665114727764, + "grad_norm": 0.5480359721976515, + "learning_rate": 4.061105544390205e-05, + "loss": 11.9509, + "step": 26126 + }, + { + "epoch": 1.422719568724347, + "grad_norm": 0.5530294624497818, + "learning_rate": 4.060396097467653e-05, + "loss": 11.952, + "step": 26127 + }, + { + "epoch": 1.42277402272093, + "grad_norm": 0.5398715923136687, + "learning_rate": 4.0596866967324546e-05, + "loss": 11.8422, + "step": 26128 + }, + { + "epoch": 1.422828476717513, + "grad_norm": 0.4983930270551854, + "learning_rate": 4.058977342190128e-05, + "loss": 11.8372, + "step": 26129 + }, + { + "epoch": 1.422882930714096, + "grad_norm": 0.6456253775489371, + "learning_rate": 4.058268033846198e-05, + "loss": 11.8939, + "step": 26130 + }, + { + "epoch": 1.422937384710679, + "grad_norm": 0.5009297295174231, + "learning_rate": 4.057558771706163e-05, + "loss": 11.7764, + "step": 26131 + }, + { + "epoch": 1.4229918387072622, + "grad_norm": 0.5131048532860858, + "learning_rate": 4.056849555775549e-05, + "loss": 11.8219, + "step": 26132 + }, + { + "epoch": 1.4230462927038452, + "grad_norm": 0.5496043825935185, + "learning_rate": 4.056140386059873e-05, + "loss": 11.8435, + "step": 26133 + }, + { + "epoch": 1.4231007467004282, + "grad_norm": 0.5249219292855265, + "learning_rate": 4.055431262564643e-05, + "loss": 11.9183, + "step": 26134 + }, + { + "epoch": 1.4231552006970112, + "grad_norm": 0.541743106037878, + "learning_rate": 4.054722185295379e-05, + "loss": 11.886, + "step": 26135 + }, + { + "epoch": 1.4232096546935942, + "grad_norm": 0.4990124456180477, + "learning_rate": 4.0540131542575885e-05, + "loss": 11.9758, + "step": 26136 + }, + { + "epoch": 1.4232641086901772, + "grad_norm": 0.5689709929035197, + "learning_rate": 4.0533041694567944e-05, + "loss": 12.0183, + "step": 26137 + }, + { + "epoch": 1.4233185626867602, + "grad_norm": 0.5319986108079289, + "learning_rate": 4.0525952308984995e-05, + "loss": 11.93, + "step": 26138 + }, + { + "epoch": 1.4233730166833432, + "grad_norm": 0.5119422973923216, + "learning_rate": 4.051886338588221e-05, + "loss": 11.8446, + "step": 26139 + }, + { + "epoch": 1.4234274706799261, + "grad_norm": 0.529031137841042, + "learning_rate": 4.051177492531479e-05, + "loss": 11.8661, + "step": 26140 + }, + { + "epoch": 1.4234819246765094, + "grad_norm": 0.5263093224068561, + "learning_rate": 4.050468692733772e-05, + "loss": 11.9608, + "step": 26141 + }, + { + "epoch": 1.4235363786730924, + "grad_norm": 0.5701329276668913, + "learning_rate": 4.049759939200621e-05, + "loss": 11.7892, + "step": 26142 + }, + { + "epoch": 1.4235908326696753, + "grad_norm": 0.5224754995736903, + "learning_rate": 4.049051231937532e-05, + "loss": 11.8852, + "step": 26143 + }, + { + "epoch": 1.4236452866662583, + "grad_norm": 0.6701563146020101, + "learning_rate": 4.0483425709500175e-05, + "loss": 11.981, + "step": 26144 + }, + { + "epoch": 1.4236997406628413, + "grad_norm": 0.5642484403765652, + "learning_rate": 4.047633956243594e-05, + "loss": 11.8037, + "step": 26145 + }, + { + "epoch": 1.4237541946594243, + "grad_norm": 0.519367438488522, + "learning_rate": 4.046925387823762e-05, + "loss": 11.8752, + "step": 26146 + }, + { + "epoch": 1.4238086486560073, + "grad_norm": 0.6012658325997082, + "learning_rate": 4.046216865696041e-05, + "loss": 11.9951, + "step": 26147 + }, + { + "epoch": 1.4238631026525903, + "grad_norm": 0.5626756090497997, + "learning_rate": 4.045508389865933e-05, + "loss": 11.9036, + "step": 26148 + }, + { + "epoch": 1.4239175566491733, + "grad_norm": 0.5470869524108306, + "learning_rate": 4.044799960338953e-05, + "loss": 11.8585, + "step": 26149 + }, + { + "epoch": 1.4239720106457563, + "grad_norm": 0.5231068677699756, + "learning_rate": 4.044091577120608e-05, + "loss": 11.9079, + "step": 26150 + }, + { + "epoch": 1.4240264646423393, + "grad_norm": 0.5193856497954444, + "learning_rate": 4.043383240216402e-05, + "loss": 11.8574, + "step": 26151 + }, + { + "epoch": 1.4240809186389223, + "grad_norm": 0.5775055306382821, + "learning_rate": 4.042674949631849e-05, + "loss": 11.8931, + "step": 26152 + }, + { + "epoch": 1.4241353726355053, + "grad_norm": 0.5401020807876373, + "learning_rate": 4.0419667053724533e-05, + "loss": 11.8396, + "step": 26153 + }, + { + "epoch": 1.4241898266320883, + "grad_norm": 0.5886578061616173, + "learning_rate": 4.0412585074437214e-05, + "loss": 11.8322, + "step": 26154 + }, + { + "epoch": 1.4242442806286713, + "grad_norm": 0.630354039105725, + "learning_rate": 4.040550355851168e-05, + "loss": 11.8991, + "step": 26155 + }, + { + "epoch": 1.4242987346252545, + "grad_norm": 0.5108081604678174, + "learning_rate": 4.0398422506002896e-05, + "loss": 11.8605, + "step": 26156 + }, + { + "epoch": 1.4243531886218375, + "grad_norm": 0.5493105690597134, + "learning_rate": 4.039134191696602e-05, + "loss": 11.8864, + "step": 26157 + }, + { + "epoch": 1.4244076426184205, + "grad_norm": 0.5939040435766363, + "learning_rate": 4.0384261791456024e-05, + "loss": 11.8893, + "step": 26158 + }, + { + "epoch": 1.4244620966150034, + "grad_norm": 0.5848380633593319, + "learning_rate": 4.037718212952805e-05, + "loss": 11.8828, + "step": 26159 + }, + { + "epoch": 1.4245165506115864, + "grad_norm": 0.603612017642678, + "learning_rate": 4.0370102931237055e-05, + "loss": 11.9141, + "step": 26160 + }, + { + "epoch": 1.4245710046081694, + "grad_norm": 0.6001998674268149, + "learning_rate": 4.036302419663819e-05, + "loss": 11.8094, + "step": 26161 + }, + { + "epoch": 1.4246254586047524, + "grad_norm": 0.5414205918706573, + "learning_rate": 4.035594592578644e-05, + "loss": 11.8223, + "step": 26162 + }, + { + "epoch": 1.4246799126013354, + "grad_norm": 0.5063186603290926, + "learning_rate": 4.034886811873683e-05, + "loss": 11.7757, + "step": 26163 + }, + { + "epoch": 1.4247343665979186, + "grad_norm": 0.5185572655618436, + "learning_rate": 4.034179077554445e-05, + "loss": 11.8154, + "step": 26164 + }, + { + "epoch": 1.4247888205945016, + "grad_norm": 0.5204732265212506, + "learning_rate": 4.0334713896264285e-05, + "loss": 11.7669, + "step": 26165 + }, + { + "epoch": 1.4248432745910846, + "grad_norm": 0.5523725407549351, + "learning_rate": 4.0327637480951384e-05, + "loss": 11.8258, + "step": 26166 + }, + { + "epoch": 1.4248977285876676, + "grad_norm": 0.5470036729652743, + "learning_rate": 4.0320561529660816e-05, + "loss": 11.8053, + "step": 26167 + }, + { + "epoch": 1.4249521825842506, + "grad_norm": 0.5282009126328882, + "learning_rate": 4.031348604244753e-05, + "loss": 11.8806, + "step": 26168 + }, + { + "epoch": 1.4250066365808336, + "grad_norm": 0.6393707360575117, + "learning_rate": 4.030641101936663e-05, + "loss": 11.7239, + "step": 26169 + }, + { + "epoch": 1.4250610905774166, + "grad_norm": 0.49075736873806214, + "learning_rate": 4.029933646047306e-05, + "loss": 11.8649, + "step": 26170 + }, + { + "epoch": 1.4251155445739996, + "grad_norm": 0.587060484404055, + "learning_rate": 4.0292262365821885e-05, + "loss": 11.8673, + "step": 26171 + }, + { + "epoch": 1.4251699985705826, + "grad_norm": 0.5549308115617355, + "learning_rate": 4.0285188735468095e-05, + "loss": 11.7866, + "step": 26172 + }, + { + "epoch": 1.4252244525671656, + "grad_norm": 0.6082517650070302, + "learning_rate": 4.0278115569466645e-05, + "loss": 12.0176, + "step": 26173 + }, + { + "epoch": 1.4252789065637486, + "grad_norm": 0.5425571203776478, + "learning_rate": 4.027104286787263e-05, + "loss": 11.8957, + "step": 26174 + }, + { + "epoch": 1.4253333605603316, + "grad_norm": 0.5535651829595233, + "learning_rate": 4.026397063074097e-05, + "loss": 11.6618, + "step": 26175 + }, + { + "epoch": 1.4253878145569145, + "grad_norm": 0.5535735271816487, + "learning_rate": 4.0256898858126726e-05, + "loss": 11.8162, + "step": 26176 + }, + { + "epoch": 1.4254422685534975, + "grad_norm": 0.5490468139436682, + "learning_rate": 4.0249827550084815e-05, + "loss": 11.8025, + "step": 26177 + }, + { + "epoch": 1.4254967225500805, + "grad_norm": 0.5263919424235999, + "learning_rate": 4.024275670667027e-05, + "loss": 11.784, + "step": 26178 + }, + { + "epoch": 1.4255511765466637, + "grad_norm": 0.6019793120904708, + "learning_rate": 4.0235686327938096e-05, + "loss": 11.7365, + "step": 26179 + }, + { + "epoch": 1.4256056305432467, + "grad_norm": 0.5459217845914554, + "learning_rate": 4.022861641394322e-05, + "loss": 11.8549, + "step": 26180 + }, + { + "epoch": 1.4256600845398297, + "grad_norm": 0.5721077614789521, + "learning_rate": 4.0221546964740684e-05, + "loss": 11.8439, + "step": 26181 + }, + { + "epoch": 1.4257145385364127, + "grad_norm": 0.5334336493246139, + "learning_rate": 4.0214477980385427e-05, + "loss": 11.9349, + "step": 26182 + }, + { + "epoch": 1.4257689925329957, + "grad_norm": 0.5811059264341433, + "learning_rate": 4.020740946093237e-05, + "loss": 11.7375, + "step": 26183 + }, + { + "epoch": 1.4258234465295787, + "grad_norm": 0.5033816827030118, + "learning_rate": 4.0200341406436557e-05, + "loss": 11.8887, + "step": 26184 + }, + { + "epoch": 1.4258779005261617, + "grad_norm": 0.6284397320310644, + "learning_rate": 4.019327381695289e-05, + "loss": 11.6962, + "step": 26185 + }, + { + "epoch": 1.4259323545227447, + "grad_norm": 0.529838968932736, + "learning_rate": 4.018620669253639e-05, + "loss": 11.7569, + "step": 26186 + }, + { + "epoch": 1.4259868085193277, + "grad_norm": 0.5636641361792095, + "learning_rate": 4.0179140033241936e-05, + "loss": 11.7074, + "step": 26187 + }, + { + "epoch": 1.426041262515911, + "grad_norm": 0.5397557261826808, + "learning_rate": 4.0172073839124527e-05, + "loss": 11.9393, + "step": 26188 + }, + { + "epoch": 1.426095716512494, + "grad_norm": 0.7155158038755675, + "learning_rate": 4.0165008110239144e-05, + "loss": 12.053, + "step": 26189 + }, + { + "epoch": 1.4261501705090769, + "grad_norm": 0.6202564568682207, + "learning_rate": 4.015794284664065e-05, + "loss": 11.8824, + "step": 26190 + }, + { + "epoch": 1.4262046245056599, + "grad_norm": 0.5361782999426571, + "learning_rate": 4.01508780483841e-05, + "loss": 11.8593, + "step": 26191 + }, + { + "epoch": 1.4262590785022429, + "grad_norm": 0.51339325970096, + "learning_rate": 4.0143813715524295e-05, + "loss": 11.7959, + "step": 26192 + }, + { + "epoch": 1.4263135324988259, + "grad_norm": 0.5247329542683532, + "learning_rate": 4.013674984811622e-05, + "loss": 11.8474, + "step": 26193 + }, + { + "epoch": 1.4263679864954089, + "grad_norm": 0.5310358060783806, + "learning_rate": 4.012968644621487e-05, + "loss": 11.8446, + "step": 26194 + }, + { + "epoch": 1.4264224404919918, + "grad_norm": 0.5698145778999093, + "learning_rate": 4.012262350987507e-05, + "loss": 11.9861, + "step": 26195 + }, + { + "epoch": 1.4264768944885748, + "grad_norm": 0.5218084305810632, + "learning_rate": 4.011556103915183e-05, + "loss": 11.7827, + "step": 26196 + }, + { + "epoch": 1.4265313484851578, + "grad_norm": 0.5828391582600716, + "learning_rate": 4.010849903409999e-05, + "loss": 11.8251, + "step": 26197 + }, + { + "epoch": 1.4265858024817408, + "grad_norm": 0.5927900006822193, + "learning_rate": 4.010143749477454e-05, + "loss": 11.8264, + "step": 26198 + }, + { + "epoch": 1.4266402564783238, + "grad_norm": 0.5523033674032058, + "learning_rate": 4.0094376421230326e-05, + "loss": 11.8712, + "step": 26199 + }, + { + "epoch": 1.4266947104749068, + "grad_norm": 0.48796831006239316, + "learning_rate": 4.0087315813522283e-05, + "loss": 11.7577, + "step": 26200 + }, + { + "epoch": 1.4267491644714898, + "grad_norm": 0.6614840011013181, + "learning_rate": 4.00802556717054e-05, + "loss": 11.8877, + "step": 26201 + }, + { + "epoch": 1.426803618468073, + "grad_norm": 0.509287481742404, + "learning_rate": 4.0073195995834424e-05, + "loss": 11.7933, + "step": 26202 + }, + { + "epoch": 1.426858072464656, + "grad_norm": 0.5295338816705573, + "learning_rate": 4.0066136785964316e-05, + "loss": 11.8831, + "step": 26203 + }, + { + "epoch": 1.426912526461239, + "grad_norm": 0.5209799435848458, + "learning_rate": 4.0059078042150024e-05, + "loss": 11.6572, + "step": 26204 + }, + { + "epoch": 1.426966980457822, + "grad_norm": 0.5310987449037925, + "learning_rate": 4.0052019764446355e-05, + "loss": 11.8869, + "step": 26205 + }, + { + "epoch": 1.427021434454405, + "grad_norm": 0.5326269055056083, + "learning_rate": 4.0044961952908264e-05, + "loss": 11.7507, + "step": 26206 + }, + { + "epoch": 1.427075888450988, + "grad_norm": 0.5432080036948125, + "learning_rate": 4.003790460759058e-05, + "loss": 11.9245, + "step": 26207 + }, + { + "epoch": 1.427130342447571, + "grad_norm": 0.5271225762138729, + "learning_rate": 4.0030847728548235e-05, + "loss": 11.9062, + "step": 26208 + }, + { + "epoch": 1.427184796444154, + "grad_norm": 0.5309945912159396, + "learning_rate": 4.002379131583603e-05, + "loss": 11.9904, + "step": 26209 + }, + { + "epoch": 1.427239250440737, + "grad_norm": 0.5184897846958649, + "learning_rate": 4.0016735369508896e-05, + "loss": 11.8469, + "step": 26210 + }, + { + "epoch": 1.4272937044373202, + "grad_norm": 0.5304836564332092, + "learning_rate": 4.000967988962176e-05, + "loss": 11.8576, + "step": 26211 + }, + { + "epoch": 1.4273481584339032, + "grad_norm": 0.5057999406033876, + "learning_rate": 4.000262487622932e-05, + "loss": 11.8753, + "step": 26212 + }, + { + "epoch": 1.4274026124304862, + "grad_norm": 0.6491732972027163, + "learning_rate": 3.999557032938659e-05, + "loss": 11.9121, + "step": 26213 + }, + { + "epoch": 1.4274570664270692, + "grad_norm": 0.5739909975227664, + "learning_rate": 3.998851624914832e-05, + "loss": 11.9992, + "step": 26214 + }, + { + "epoch": 1.4275115204236521, + "grad_norm": 0.4940367374012289, + "learning_rate": 3.998146263556941e-05, + "loss": 11.7942, + "step": 26215 + }, + { + "epoch": 1.4275659744202351, + "grad_norm": 0.544345263463131, + "learning_rate": 3.997440948870476e-05, + "loss": 11.7832, + "step": 26216 + }, + { + "epoch": 1.4276204284168181, + "grad_norm": 0.5316257964238552, + "learning_rate": 3.9967356808609126e-05, + "loss": 11.9227, + "step": 26217 + }, + { + "epoch": 1.4276748824134011, + "grad_norm": 0.49500755455490225, + "learning_rate": 3.9960304595337414e-05, + "loss": 11.8202, + "step": 26218 + }, + { + "epoch": 1.4277293364099841, + "grad_norm": 0.5218596838066738, + "learning_rate": 3.995325284894442e-05, + "loss": 11.8303, + "step": 26219 + }, + { + "epoch": 1.427783790406567, + "grad_norm": 0.529783229415229, + "learning_rate": 3.994620156948504e-05, + "loss": 11.8972, + "step": 26220 + }, + { + "epoch": 1.42783824440315, + "grad_norm": 0.5265848539579019, + "learning_rate": 3.993915075701407e-05, + "loss": 11.8772, + "step": 26221 + }, + { + "epoch": 1.427892698399733, + "grad_norm": 0.5954826891122208, + "learning_rate": 3.9932100411586296e-05, + "loss": 11.9406, + "step": 26222 + }, + { + "epoch": 1.427947152396316, + "grad_norm": 0.5807564782298249, + "learning_rate": 3.992505053325662e-05, + "loss": 11.7988, + "step": 26223 + }, + { + "epoch": 1.428001606392899, + "grad_norm": 0.5196498650807065, + "learning_rate": 3.9918001122079785e-05, + "loss": 11.9653, + "step": 26224 + }, + { + "epoch": 1.428056060389482, + "grad_norm": 0.5158191391345669, + "learning_rate": 3.9910952178110686e-05, + "loss": 11.8003, + "step": 26225 + }, + { + "epoch": 1.4281105143860653, + "grad_norm": 0.5334710572942586, + "learning_rate": 3.9903903701404054e-05, + "loss": 11.8635, + "step": 26226 + }, + { + "epoch": 1.4281649683826483, + "grad_norm": 0.5423864235929465, + "learning_rate": 3.9896855692014755e-05, + "loss": 11.8995, + "step": 26227 + }, + { + "epoch": 1.4282194223792313, + "grad_norm": 0.5497611115481867, + "learning_rate": 3.988980814999763e-05, + "loss": 11.9004, + "step": 26228 + }, + { + "epoch": 1.4282738763758143, + "grad_norm": 0.611807334997668, + "learning_rate": 3.9882761075407396e-05, + "loss": 11.6557, + "step": 26229 + }, + { + "epoch": 1.4283283303723973, + "grad_norm": 0.5336473333260907, + "learning_rate": 3.9875714468298916e-05, + "loss": 11.7722, + "step": 26230 + }, + { + "epoch": 1.4283827843689803, + "grad_norm": 0.5725208809146682, + "learning_rate": 3.9868668328726974e-05, + "loss": 11.9451, + "step": 26231 + }, + { + "epoch": 1.4284372383655632, + "grad_norm": 0.5715020903163366, + "learning_rate": 3.986162265674632e-05, + "loss": 11.9609, + "step": 26232 + }, + { + "epoch": 1.4284916923621462, + "grad_norm": 0.6011111369036927, + "learning_rate": 3.98545774524118e-05, + "loss": 11.9879, + "step": 26233 + }, + { + "epoch": 1.4285461463587295, + "grad_norm": 0.5837201780273968, + "learning_rate": 3.9847532715778147e-05, + "loss": 11.8455, + "step": 26234 + }, + { + "epoch": 1.4286006003553124, + "grad_norm": 0.5816848722841986, + "learning_rate": 3.984048844690019e-05, + "loss": 11.9353, + "step": 26235 + }, + { + "epoch": 1.4286550543518954, + "grad_norm": 0.5700366504916987, + "learning_rate": 3.983344464583266e-05, + "loss": 11.9536, + "step": 26236 + }, + { + "epoch": 1.4287095083484784, + "grad_norm": 0.539345981134819, + "learning_rate": 3.9826401312630345e-05, + "loss": 11.7942, + "step": 26237 + }, + { + "epoch": 1.4287639623450614, + "grad_norm": 0.5475082507150271, + "learning_rate": 3.981935844734806e-05, + "loss": 11.7893, + "step": 26238 + }, + { + "epoch": 1.4288184163416444, + "grad_norm": 0.5207017972419579, + "learning_rate": 3.981231605004051e-05, + "loss": 11.8262, + "step": 26239 + }, + { + "epoch": 1.4288728703382274, + "grad_norm": 0.5933324562536302, + "learning_rate": 3.9805274120762516e-05, + "loss": 11.9662, + "step": 26240 + }, + { + "epoch": 1.4289273243348104, + "grad_norm": 0.5644267583206891, + "learning_rate": 3.97982326595688e-05, + "loss": 11.8991, + "step": 26241 + }, + { + "epoch": 1.4289817783313934, + "grad_norm": 0.5959134431198703, + "learning_rate": 3.9791191666514093e-05, + "loss": 11.891, + "step": 26242 + }, + { + "epoch": 1.4290362323279764, + "grad_norm": 0.5069766114074111, + "learning_rate": 3.9784151141653206e-05, + "loss": 11.83, + "step": 26243 + }, + { + "epoch": 1.4290906863245594, + "grad_norm": 0.500081980322355, + "learning_rate": 3.977711108504083e-05, + "loss": 11.8952, + "step": 26244 + }, + { + "epoch": 1.4291451403211424, + "grad_norm": 0.5509883593483207, + "learning_rate": 3.977007149673177e-05, + "loss": 11.9447, + "step": 26245 + }, + { + "epoch": 1.4291995943177254, + "grad_norm": 0.5105273151697763, + "learning_rate": 3.97630323767807e-05, + "loss": 11.8086, + "step": 26246 + }, + { + "epoch": 1.4292540483143084, + "grad_norm": 0.5205523131391283, + "learning_rate": 3.9755993725242414e-05, + "loss": 11.9408, + "step": 26247 + }, + { + "epoch": 1.4293085023108913, + "grad_norm": 0.499158442176314, + "learning_rate": 3.9748955542171605e-05, + "loss": 11.8502, + "step": 26248 + }, + { + "epoch": 1.4293629563074746, + "grad_norm": 0.4944646452886164, + "learning_rate": 3.9741917827623024e-05, + "loss": 11.9085, + "step": 26249 + }, + { + "epoch": 1.4294174103040576, + "grad_norm": 0.5931700142739695, + "learning_rate": 3.973488058165141e-05, + "loss": 11.7838, + "step": 26250 + }, + { + "epoch": 1.4294718643006405, + "grad_norm": 0.5208407011360756, + "learning_rate": 3.972784380431149e-05, + "loss": 11.7544, + "step": 26251 + }, + { + "epoch": 1.4295263182972235, + "grad_norm": 0.5181320276905301, + "learning_rate": 3.972080749565792e-05, + "loss": 11.8213, + "step": 26252 + }, + { + "epoch": 1.4295807722938065, + "grad_norm": 0.5528504837602145, + "learning_rate": 3.9713771655745504e-05, + "loss": 11.8798, + "step": 26253 + }, + { + "epoch": 1.4296352262903895, + "grad_norm": 0.5721125616895366, + "learning_rate": 3.970673628462886e-05, + "loss": 11.8666, + "step": 26254 + }, + { + "epoch": 1.4296896802869725, + "grad_norm": 0.5634194346236857, + "learning_rate": 3.96997013823628e-05, + "loss": 11.809, + "step": 26255 + }, + { + "epoch": 1.4297441342835555, + "grad_norm": 0.5339333376747236, + "learning_rate": 3.9692666949001925e-05, + "loss": 11.8948, + "step": 26256 + }, + { + "epoch": 1.4297985882801385, + "grad_norm": 0.6087385653395961, + "learning_rate": 3.968563298460102e-05, + "loss": 11.8149, + "step": 26257 + }, + { + "epoch": 1.4298530422767217, + "grad_norm": 0.4913557122914008, + "learning_rate": 3.967859948921472e-05, + "loss": 11.7974, + "step": 26258 + }, + { + "epoch": 1.4299074962733047, + "grad_norm": 0.5510039420406786, + "learning_rate": 3.9671566462897734e-05, + "loss": 11.9316, + "step": 26259 + }, + { + "epoch": 1.4299619502698877, + "grad_norm": 0.6037638777716818, + "learning_rate": 3.9664533905704815e-05, + "loss": 11.9781, + "step": 26260 + }, + { + "epoch": 1.4300164042664707, + "grad_norm": 0.5375655881978909, + "learning_rate": 3.965750181769059e-05, + "loss": 11.9076, + "step": 26261 + }, + { + "epoch": 1.4300708582630537, + "grad_norm": 0.5605225482339553, + "learning_rate": 3.965047019890975e-05, + "loss": 11.898, + "step": 26262 + }, + { + "epoch": 1.4301253122596367, + "grad_norm": 0.5463242407483168, + "learning_rate": 3.964343904941694e-05, + "loss": 11.8452, + "step": 26263 + }, + { + "epoch": 1.4301797662562197, + "grad_norm": 0.5270700342324047, + "learning_rate": 3.9636408369266874e-05, + "loss": 11.8032, + "step": 26264 + }, + { + "epoch": 1.4302342202528027, + "grad_norm": 0.5304016085082199, + "learning_rate": 3.962937815851425e-05, + "loss": 11.7853, + "step": 26265 + }, + { + "epoch": 1.4302886742493857, + "grad_norm": 0.5134334150262553, + "learning_rate": 3.9622348417213674e-05, + "loss": 11.8255, + "step": 26266 + }, + { + "epoch": 1.4303431282459687, + "grad_norm": 0.5402625393626279, + "learning_rate": 3.961531914541987e-05, + "loss": 11.863, + "step": 26267 + }, + { + "epoch": 1.4303975822425516, + "grad_norm": 0.5673066880455563, + "learning_rate": 3.960829034318745e-05, + "loss": 11.7563, + "step": 26268 + }, + { + "epoch": 1.4304520362391346, + "grad_norm": 0.5133852115553672, + "learning_rate": 3.960126201057112e-05, + "loss": 11.8093, + "step": 26269 + }, + { + "epoch": 1.4305064902357176, + "grad_norm": 0.5717736604225998, + "learning_rate": 3.959423414762546e-05, + "loss": 11.9245, + "step": 26270 + }, + { + "epoch": 1.4305609442323006, + "grad_norm": 0.5379993862072009, + "learning_rate": 3.9587206754405215e-05, + "loss": 11.7369, + "step": 26271 + }, + { + "epoch": 1.4306153982288838, + "grad_norm": 0.5560623358336024, + "learning_rate": 3.958017983096497e-05, + "loss": 11.9136, + "step": 26272 + }, + { + "epoch": 1.4306698522254668, + "grad_norm": 0.5375427123651487, + "learning_rate": 3.957315337735935e-05, + "loss": 11.7954, + "step": 26273 + }, + { + "epoch": 1.4307243062220498, + "grad_norm": 0.5535774098794369, + "learning_rate": 3.956612739364306e-05, + "loss": 11.7769, + "step": 26274 + }, + { + "epoch": 1.4307787602186328, + "grad_norm": 0.5391283028540276, + "learning_rate": 3.955910187987066e-05, + "loss": 11.9049, + "step": 26275 + }, + { + "epoch": 1.4308332142152158, + "grad_norm": 0.5617764524621167, + "learning_rate": 3.955207683609682e-05, + "loss": 11.8965, + "step": 26276 + }, + { + "epoch": 1.4308876682117988, + "grad_norm": 0.561202540887149, + "learning_rate": 3.9545052262376205e-05, + "loss": 11.9329, + "step": 26277 + }, + { + "epoch": 1.4309421222083818, + "grad_norm": 0.5495423846866626, + "learning_rate": 3.953802815876336e-05, + "loss": 11.8629, + "step": 26278 + }, + { + "epoch": 1.4309965762049648, + "grad_norm": 0.6215470131896877, + "learning_rate": 3.9531004525312984e-05, + "loss": 12.0592, + "step": 26279 + }, + { + "epoch": 1.4310510302015478, + "grad_norm": 0.5042639495011494, + "learning_rate": 3.9523981362079633e-05, + "loss": 11.8585, + "step": 26280 + }, + { + "epoch": 1.431105484198131, + "grad_norm": 0.5553416594210632, + "learning_rate": 3.951695866911798e-05, + "loss": 11.7984, + "step": 26281 + }, + { + "epoch": 1.431159938194714, + "grad_norm": 0.5139239586395046, + "learning_rate": 3.9509936446482584e-05, + "loss": 11.9491, + "step": 26282 + }, + { + "epoch": 1.431214392191297, + "grad_norm": 0.5871534059786202, + "learning_rate": 3.9502914694228043e-05, + "loss": 11.9945, + "step": 26283 + }, + { + "epoch": 1.43126884618788, + "grad_norm": 0.5534052479716304, + "learning_rate": 3.9495893412409015e-05, + "loss": 11.8266, + "step": 26284 + }, + { + "epoch": 1.431323300184463, + "grad_norm": 0.5597620717468003, + "learning_rate": 3.948887260108003e-05, + "loss": 11.9313, + "step": 26285 + }, + { + "epoch": 1.431377754181046, + "grad_norm": 0.5563427268608134, + "learning_rate": 3.948185226029571e-05, + "loss": 11.8254, + "step": 26286 + }, + { + "epoch": 1.431432208177629, + "grad_norm": 0.5759137649375125, + "learning_rate": 3.9474832390110705e-05, + "loss": 11.9367, + "step": 26287 + }, + { + "epoch": 1.431486662174212, + "grad_norm": 0.49215373796281114, + "learning_rate": 3.9467812990579514e-05, + "loss": 11.8003, + "step": 26288 + }, + { + "epoch": 1.431541116170795, + "grad_norm": 0.5960469170428271, + "learning_rate": 3.94607940617568e-05, + "loss": 11.9812, + "step": 26289 + }, + { + "epoch": 1.431595570167378, + "grad_norm": 0.5400588528318553, + "learning_rate": 3.945377560369706e-05, + "loss": 11.8169, + "step": 26290 + }, + { + "epoch": 1.431650024163961, + "grad_norm": 0.5116560040426905, + "learning_rate": 3.944675761645495e-05, + "loss": 11.8077, + "step": 26291 + }, + { + "epoch": 1.431704478160544, + "grad_norm": 0.5938105752669841, + "learning_rate": 3.9439740100085024e-05, + "loss": 11.8301, + "step": 26292 + }, + { + "epoch": 1.431758932157127, + "grad_norm": 0.5676136162209637, + "learning_rate": 3.9432723054641786e-05, + "loss": 11.7587, + "step": 26293 + }, + { + "epoch": 1.43181338615371, + "grad_norm": 0.6049528042386776, + "learning_rate": 3.942570648017988e-05, + "loss": 11.9591, + "step": 26294 + }, + { + "epoch": 1.431867840150293, + "grad_norm": 0.5602248595932456, + "learning_rate": 3.9418690376753806e-05, + "loss": 11.8574, + "step": 26295 + }, + { + "epoch": 1.431922294146876, + "grad_norm": 0.5735933686420074, + "learning_rate": 3.94116747444182e-05, + "loss": 11.9627, + "step": 26296 + }, + { + "epoch": 1.431976748143459, + "grad_norm": 0.5344121507458838, + "learning_rate": 3.940465958322753e-05, + "loss": 11.8407, + "step": 26297 + }, + { + "epoch": 1.432031202140042, + "grad_norm": 0.5387036759812672, + "learning_rate": 3.9397644893236396e-05, + "loss": 11.941, + "step": 26298 + }, + { + "epoch": 1.432085656136625, + "grad_norm": 0.5520957462342303, + "learning_rate": 3.939063067449936e-05, + "loss": 11.83, + "step": 26299 + }, + { + "epoch": 1.432140110133208, + "grad_norm": 0.5798661274176112, + "learning_rate": 3.9383616927070924e-05, + "loss": 11.9121, + "step": 26300 + }, + { + "epoch": 1.432194564129791, + "grad_norm": 0.5537868892540694, + "learning_rate": 3.937660365100567e-05, + "loss": 12.0007, + "step": 26301 + }, + { + "epoch": 1.432249018126374, + "grad_norm": 0.6438595153869893, + "learning_rate": 3.9369590846358115e-05, + "loss": 11.8297, + "step": 26302 + }, + { + "epoch": 1.432303472122957, + "grad_norm": 0.5848865296047472, + "learning_rate": 3.9362578513182766e-05, + "loss": 12.022, + "step": 26303 + }, + { + "epoch": 1.4323579261195403, + "grad_norm": 0.5646558160250067, + "learning_rate": 3.9355566651534206e-05, + "loss": 11.7747, + "step": 26304 + }, + { + "epoch": 1.4324123801161233, + "grad_norm": 0.5127284651778233, + "learning_rate": 3.934855526146689e-05, + "loss": 11.8638, + "step": 26305 + }, + { + "epoch": 1.4324668341127063, + "grad_norm": 0.5319451536490147, + "learning_rate": 3.934154434303541e-05, + "loss": 11.9292, + "step": 26306 + }, + { + "epoch": 1.4325212881092892, + "grad_norm": 0.546971837514057, + "learning_rate": 3.9334533896294226e-05, + "loss": 11.9256, + "step": 26307 + }, + { + "epoch": 1.4325757421058722, + "grad_norm": 0.5490548675384516, + "learning_rate": 3.9327523921297884e-05, + "loss": 11.8369, + "step": 26308 + }, + { + "epoch": 1.4326301961024552, + "grad_norm": 0.5982828939345142, + "learning_rate": 3.932051441810092e-05, + "loss": 11.8974, + "step": 26309 + }, + { + "epoch": 1.4326846500990382, + "grad_norm": 0.5310811719517157, + "learning_rate": 3.931350538675777e-05, + "loss": 11.8493, + "step": 26310 + }, + { + "epoch": 1.4327391040956212, + "grad_norm": 0.5493572506616808, + "learning_rate": 3.930649682732302e-05, + "loss": 11.7759, + "step": 26311 + }, + { + "epoch": 1.4327935580922042, + "grad_norm": 0.5325032859555068, + "learning_rate": 3.929948873985113e-05, + "loss": 11.9618, + "step": 26312 + }, + { + "epoch": 1.4328480120887872, + "grad_norm": 0.5283287961537859, + "learning_rate": 3.9292481124396565e-05, + "loss": 11.8385, + "step": 26313 + }, + { + "epoch": 1.4329024660853702, + "grad_norm": 0.5244158558104707, + "learning_rate": 3.9285473981013876e-05, + "loss": 11.6144, + "step": 26314 + }, + { + "epoch": 1.4329569200819532, + "grad_norm": 0.5108402266768652, + "learning_rate": 3.9278467309757485e-05, + "loss": 11.8209, + "step": 26315 + }, + { + "epoch": 1.4330113740785362, + "grad_norm": 0.5963943287252947, + "learning_rate": 3.927146111068196e-05, + "loss": 11.8848, + "step": 26316 + }, + { + "epoch": 1.4330658280751192, + "grad_norm": 0.5553531840371618, + "learning_rate": 3.9264455383841694e-05, + "loss": 11.8934, + "step": 26317 + }, + { + "epoch": 1.4331202820717022, + "grad_norm": 0.5009744804337983, + "learning_rate": 3.925745012929125e-05, + "loss": 11.81, + "step": 26318 + }, + { + "epoch": 1.4331747360682854, + "grad_norm": 0.5576770648042227, + "learning_rate": 3.925044534708502e-05, + "loss": 11.8416, + "step": 26319 + }, + { + "epoch": 1.4332291900648684, + "grad_norm": 0.5287430877263224, + "learning_rate": 3.924344103727752e-05, + "loss": 11.9291, + "step": 26320 + }, + { + "epoch": 1.4332836440614514, + "grad_norm": 0.5399687620022304, + "learning_rate": 3.923643719992324e-05, + "loss": 11.9051, + "step": 26321 + }, + { + "epoch": 1.4333380980580344, + "grad_norm": 0.553287743124179, + "learning_rate": 3.922943383507662e-05, + "loss": 11.6551, + "step": 26322 + }, + { + "epoch": 1.4333925520546174, + "grad_norm": 0.5524811046205452, + "learning_rate": 3.922243094279211e-05, + "loss": 11.6851, + "step": 26323 + }, + { + "epoch": 1.4334470060512003, + "grad_norm": 0.5337478631145688, + "learning_rate": 3.9215428523124134e-05, + "loss": 11.9709, + "step": 26324 + }, + { + "epoch": 1.4335014600477833, + "grad_norm": 0.5352497624881862, + "learning_rate": 3.920842657612718e-05, + "loss": 11.8142, + "step": 26325 + }, + { + "epoch": 1.4335559140443663, + "grad_norm": 0.5726735856035453, + "learning_rate": 3.9201425101855734e-05, + "loss": 11.9576, + "step": 26326 + }, + { + "epoch": 1.4336103680409493, + "grad_norm": 0.5719626561301574, + "learning_rate": 3.9194424100364166e-05, + "loss": 11.8859, + "step": 26327 + }, + { + "epoch": 1.4336648220375325, + "grad_norm": 0.515771477977669, + "learning_rate": 3.918742357170698e-05, + "loss": 11.848, + "step": 26328 + }, + { + "epoch": 1.4337192760341155, + "grad_norm": 0.5041864435515397, + "learning_rate": 3.918042351593857e-05, + "loss": 11.9247, + "step": 26329 + }, + { + "epoch": 1.4337737300306985, + "grad_norm": 0.5749954366515224, + "learning_rate": 3.9173423933113405e-05, + "loss": 11.7697, + "step": 26330 + }, + { + "epoch": 1.4338281840272815, + "grad_norm": 0.5694306716716669, + "learning_rate": 3.916642482328586e-05, + "loss": 11.9713, + "step": 26331 + }, + { + "epoch": 1.4338826380238645, + "grad_norm": 0.579256836282674, + "learning_rate": 3.915942618651045e-05, + "loss": 11.896, + "step": 26332 + }, + { + "epoch": 1.4339370920204475, + "grad_norm": 0.5563756159888272, + "learning_rate": 3.915242802284152e-05, + "loss": 11.9364, + "step": 26333 + }, + { + "epoch": 1.4339915460170305, + "grad_norm": 0.6281289022665371, + "learning_rate": 3.91454303323335e-05, + "loss": 11.9363, + "step": 26334 + }, + { + "epoch": 1.4340460000136135, + "grad_norm": 0.5904478348938093, + "learning_rate": 3.9138433115040804e-05, + "loss": 11.7967, + "step": 26335 + }, + { + "epoch": 1.4341004540101965, + "grad_norm": 0.5799892327547969, + "learning_rate": 3.91314363710179e-05, + "loss": 11.8623, + "step": 26336 + }, + { + "epoch": 1.4341549080067795, + "grad_norm": 0.5405389470553, + "learning_rate": 3.9124440100319123e-05, + "loss": 11.9452, + "step": 26337 + }, + { + "epoch": 1.4342093620033625, + "grad_norm": 0.6783720474388546, + "learning_rate": 3.911744430299895e-05, + "loss": 11.9873, + "step": 26338 + }, + { + "epoch": 1.4342638159999455, + "grad_norm": 0.6327982039133199, + "learning_rate": 3.911044897911169e-05, + "loss": 11.9228, + "step": 26339 + }, + { + "epoch": 1.4343182699965284, + "grad_norm": 0.5590534285869779, + "learning_rate": 3.910345412871184e-05, + "loss": 11.891, + "step": 26340 + }, + { + "epoch": 1.4343727239931114, + "grad_norm": 0.5534170392644077, + "learning_rate": 3.90964597518537e-05, + "loss": 11.9337, + "step": 26341 + }, + { + "epoch": 1.4344271779896947, + "grad_norm": 0.5288421244190704, + "learning_rate": 3.9089465848591735e-05, + "loss": 11.8466, + "step": 26342 + }, + { + "epoch": 1.4344816319862776, + "grad_norm": 0.5962069198937746, + "learning_rate": 3.90824724189803e-05, + "loss": 11.8624, + "step": 26343 + }, + { + "epoch": 1.4345360859828606, + "grad_norm": 0.5750638101000574, + "learning_rate": 3.907547946307374e-05, + "loss": 11.9572, + "step": 26344 + }, + { + "epoch": 1.4345905399794436, + "grad_norm": 0.5350056698965392, + "learning_rate": 3.90684869809265e-05, + "loss": 11.9767, + "step": 26345 + }, + { + "epoch": 1.4346449939760266, + "grad_norm": 0.5163007456316633, + "learning_rate": 3.906149497259289e-05, + "loss": 11.7562, + "step": 26346 + }, + { + "epoch": 1.4346994479726096, + "grad_norm": 0.5973698289624069, + "learning_rate": 3.905450343812732e-05, + "loss": 11.8655, + "step": 26347 + }, + { + "epoch": 1.4347539019691926, + "grad_norm": 0.5816683959688256, + "learning_rate": 3.904751237758418e-05, + "loss": 12.0081, + "step": 26348 + }, + { + "epoch": 1.4348083559657756, + "grad_norm": 0.5248681375402489, + "learning_rate": 3.904052179101778e-05, + "loss": 11.8766, + "step": 26349 + }, + { + "epoch": 1.4348628099623586, + "grad_norm": 0.5227799303859438, + "learning_rate": 3.9033531678482535e-05, + "loss": 11.8141, + "step": 26350 + }, + { + "epoch": 1.4349172639589418, + "grad_norm": 0.6169144903374221, + "learning_rate": 3.9026542040032733e-05, + "loss": 11.8988, + "step": 26351 + }, + { + "epoch": 1.4349717179555248, + "grad_norm": 0.5674696223899558, + "learning_rate": 3.901955287572281e-05, + "loss": 11.9535, + "step": 26352 + }, + { + "epoch": 1.4350261719521078, + "grad_norm": 0.5495261823381598, + "learning_rate": 3.901256418560706e-05, + "loss": 11.7811, + "step": 26353 + }, + { + "epoch": 1.4350806259486908, + "grad_norm": 0.5634648186678949, + "learning_rate": 3.900557596973981e-05, + "loss": 11.9454, + "step": 26354 + }, + { + "epoch": 1.4351350799452738, + "grad_norm": 0.5225187981987457, + "learning_rate": 3.899858822817546e-05, + "loss": 11.8261, + "step": 26355 + }, + { + "epoch": 1.4351895339418568, + "grad_norm": 0.4900501069485667, + "learning_rate": 3.8991600960968285e-05, + "loss": 11.8859, + "step": 26356 + }, + { + "epoch": 1.4352439879384398, + "grad_norm": 0.5506176697346621, + "learning_rate": 3.898461416817265e-05, + "loss": 11.9595, + "step": 26357 + }, + { + "epoch": 1.4352984419350228, + "grad_norm": 0.5510950018951903, + "learning_rate": 3.8977627849842926e-05, + "loss": 11.9026, + "step": 26358 + }, + { + "epoch": 1.4353528959316058, + "grad_norm": 0.5596867563777409, + "learning_rate": 3.8970642006033366e-05, + "loss": 11.9269, + "step": 26359 + }, + { + "epoch": 1.4354073499281887, + "grad_norm": 0.5554123574041167, + "learning_rate": 3.896365663679836e-05, + "loss": 11.6873, + "step": 26360 + }, + { + "epoch": 1.4354618039247717, + "grad_norm": 0.4755923426377247, + "learning_rate": 3.895667174219216e-05, + "loss": 11.8354, + "step": 26361 + }, + { + "epoch": 1.4355162579213547, + "grad_norm": 0.5352883198171227, + "learning_rate": 3.894968732226916e-05, + "loss": 11.7626, + "step": 26362 + }, + { + "epoch": 1.4355707119179377, + "grad_norm": 0.5174955441146005, + "learning_rate": 3.8942703377083636e-05, + "loss": 11.836, + "step": 26363 + }, + { + "epoch": 1.4356251659145207, + "grad_norm": 0.5788548460555466, + "learning_rate": 3.8935719906689836e-05, + "loss": 11.9306, + "step": 26364 + }, + { + "epoch": 1.435679619911104, + "grad_norm": 0.5036639279723619, + "learning_rate": 3.892873691114216e-05, + "loss": 11.9272, + "step": 26365 + }, + { + "epoch": 1.435734073907687, + "grad_norm": 0.5320876843083453, + "learning_rate": 3.892175439049484e-05, + "loss": 11.7054, + "step": 26366 + }, + { + "epoch": 1.43578852790427, + "grad_norm": 0.5157691623536944, + "learning_rate": 3.891477234480223e-05, + "loss": 11.9383, + "step": 26367 + }, + { + "epoch": 1.435842981900853, + "grad_norm": 0.5954104878448915, + "learning_rate": 3.8907790774118555e-05, + "loss": 11.7849, + "step": 26368 + }, + { + "epoch": 1.435897435897436, + "grad_norm": 0.6063313458480252, + "learning_rate": 3.8900809678498155e-05, + "loss": 11.6839, + "step": 26369 + }, + { + "epoch": 1.435951889894019, + "grad_norm": 0.5522728385786745, + "learning_rate": 3.8893829057995326e-05, + "loss": 11.861, + "step": 26370 + }, + { + "epoch": 1.4360063438906019, + "grad_norm": 0.5285990004560783, + "learning_rate": 3.8886848912664306e-05, + "loss": 11.8667, + "step": 26371 + }, + { + "epoch": 1.4360607978871849, + "grad_norm": 0.5034565597361829, + "learning_rate": 3.887986924255946e-05, + "loss": 11.7953, + "step": 26372 + }, + { + "epoch": 1.4361152518837679, + "grad_norm": 0.5746892136454145, + "learning_rate": 3.887289004773493e-05, + "loss": 11.9137, + "step": 26373 + }, + { + "epoch": 1.436169705880351, + "grad_norm": 0.6797035662605884, + "learning_rate": 3.886591132824506e-05, + "loss": 11.9601, + "step": 26374 + }, + { + "epoch": 1.436224159876934, + "grad_norm": 0.6305808313574126, + "learning_rate": 3.885893308414417e-05, + "loss": 11.9089, + "step": 26375 + }, + { + "epoch": 1.436278613873517, + "grad_norm": 0.5573557777157498, + "learning_rate": 3.885195531548641e-05, + "loss": 11.8543, + "step": 26376 + }, + { + "epoch": 1.4363330678701, + "grad_norm": 0.6008072377443651, + "learning_rate": 3.884497802232614e-05, + "loss": 11.9674, + "step": 26377 + }, + { + "epoch": 1.436387521866683, + "grad_norm": 0.5162928348344781, + "learning_rate": 3.883800120471754e-05, + "loss": 11.8639, + "step": 26378 + }, + { + "epoch": 1.436441975863266, + "grad_norm": 0.5573660627832177, + "learning_rate": 3.883102486271495e-05, + "loss": 11.9008, + "step": 26379 + }, + { + "epoch": 1.436496429859849, + "grad_norm": 0.6070489743820368, + "learning_rate": 3.882404899637252e-05, + "loss": 11.9311, + "step": 26380 + }, + { + "epoch": 1.436550883856432, + "grad_norm": 0.5135783551282166, + "learning_rate": 3.8817073605744544e-05, + "loss": 11.8667, + "step": 26381 + }, + { + "epoch": 1.436605337853015, + "grad_norm": 0.5885763562446317, + "learning_rate": 3.881009869088534e-05, + "loss": 11.8776, + "step": 26382 + }, + { + "epoch": 1.436659791849598, + "grad_norm": 0.5819358618645047, + "learning_rate": 3.8803124251849e-05, + "loss": 11.7464, + "step": 26383 + }, + { + "epoch": 1.436714245846181, + "grad_norm": 0.525154708840969, + "learning_rate": 3.8796150288689824e-05, + "loss": 11.8576, + "step": 26384 + }, + { + "epoch": 1.436768699842764, + "grad_norm": 0.5921139619493957, + "learning_rate": 3.878917680146208e-05, + "loss": 11.8638, + "step": 26385 + }, + { + "epoch": 1.436823153839347, + "grad_norm": 0.718740564888115, + "learning_rate": 3.878220379021993e-05, + "loss": 11.9646, + "step": 26386 + }, + { + "epoch": 1.43687760783593, + "grad_norm": 0.5153265648328553, + "learning_rate": 3.877523125501767e-05, + "loss": 11.9061, + "step": 26387 + }, + { + "epoch": 1.436932061832513, + "grad_norm": 0.5316327421807723, + "learning_rate": 3.876825919590944e-05, + "loss": 11.9404, + "step": 26388 + }, + { + "epoch": 1.4369865158290962, + "grad_norm": 0.5156205903919038, + "learning_rate": 3.8761287612949526e-05, + "loss": 11.9662, + "step": 26389 + }, + { + "epoch": 1.4370409698256792, + "grad_norm": 0.5184465002568762, + "learning_rate": 3.875431650619208e-05, + "loss": 11.8669, + "step": 26390 + }, + { + "epoch": 1.4370954238222622, + "grad_norm": 0.5099609041209606, + "learning_rate": 3.874734587569134e-05, + "loss": 11.8613, + "step": 26391 + }, + { + "epoch": 1.4371498778188452, + "grad_norm": 0.5144846205085002, + "learning_rate": 3.874037572150158e-05, + "loss": 11.7146, + "step": 26392 + }, + { + "epoch": 1.4372043318154282, + "grad_norm": 0.5811415516372281, + "learning_rate": 3.8733406043676855e-05, + "loss": 11.8797, + "step": 26393 + }, + { + "epoch": 1.4372587858120112, + "grad_norm": 0.5806101047973045, + "learning_rate": 3.8726436842271485e-05, + "loss": 11.9977, + "step": 26394 + }, + { + "epoch": 1.4373132398085942, + "grad_norm": 0.5283532018391751, + "learning_rate": 3.8719468117339576e-05, + "loss": 11.8021, + "step": 26395 + }, + { + "epoch": 1.4373676938051771, + "grad_norm": 0.5078686479502453, + "learning_rate": 3.871249986893536e-05, + "loss": 11.9617, + "step": 26396 + }, + { + "epoch": 1.4374221478017604, + "grad_norm": 0.6796389836641767, + "learning_rate": 3.8705532097113064e-05, + "loss": 11.91, + "step": 26397 + }, + { + "epoch": 1.4374766017983434, + "grad_norm": 0.5635833071322215, + "learning_rate": 3.869856480192679e-05, + "loss": 11.8082, + "step": 26398 + }, + { + "epoch": 1.4375310557949263, + "grad_norm": 0.6260701241950422, + "learning_rate": 3.8691597983430784e-05, + "loss": 12.012, + "step": 26399 + }, + { + "epoch": 1.4375855097915093, + "grad_norm": 0.5743393146103634, + "learning_rate": 3.868463164167916e-05, + "loss": 11.9071, + "step": 26400 + }, + { + "epoch": 1.4376399637880923, + "grad_norm": 0.5868376642187261, + "learning_rate": 3.867766577672617e-05, + "loss": 11.9936, + "step": 26401 + }, + { + "epoch": 1.4376944177846753, + "grad_norm": 0.6182174557983814, + "learning_rate": 3.867070038862592e-05, + "loss": 11.759, + "step": 26402 + }, + { + "epoch": 1.4377488717812583, + "grad_norm": 0.5351313167747331, + "learning_rate": 3.866373547743256e-05, + "loss": 11.7878, + "step": 26403 + }, + { + "epoch": 1.4378033257778413, + "grad_norm": 0.5716894709738916, + "learning_rate": 3.8656771043200327e-05, + "loss": 11.9386, + "step": 26404 + }, + { + "epoch": 1.4378577797744243, + "grad_norm": 0.5221652512282031, + "learning_rate": 3.864980708598328e-05, + "loss": 11.8306, + "step": 26405 + }, + { + "epoch": 1.4379122337710073, + "grad_norm": 0.7000787175088273, + "learning_rate": 3.864284360583562e-05, + "loss": 11.7572, + "step": 26406 + }, + { + "epoch": 1.4379666877675903, + "grad_norm": 0.5853533856279501, + "learning_rate": 3.8635880602811535e-05, + "loss": 11.7812, + "step": 26407 + }, + { + "epoch": 1.4380211417641733, + "grad_norm": 0.5571842516686745, + "learning_rate": 3.86289180769651e-05, + "loss": 11.8221, + "step": 26408 + }, + { + "epoch": 1.4380755957607563, + "grad_norm": 0.6136674336037315, + "learning_rate": 3.862195602835053e-05, + "loss": 11.9586, + "step": 26409 + }, + { + "epoch": 1.4381300497573393, + "grad_norm": 0.492538472415326, + "learning_rate": 3.861499445702188e-05, + "loss": 11.771, + "step": 26410 + }, + { + "epoch": 1.4381845037539223, + "grad_norm": 0.6849348467466794, + "learning_rate": 3.860803336303337e-05, + "loss": 11.8212, + "step": 26411 + }, + { + "epoch": 1.4382389577505055, + "grad_norm": 0.567441722797438, + "learning_rate": 3.860107274643908e-05, + "loss": 11.9102, + "step": 26412 + }, + { + "epoch": 1.4382934117470885, + "grad_norm": 0.5350889633818435, + "learning_rate": 3.859411260729311e-05, + "loss": 11.7683, + "step": 26413 + }, + { + "epoch": 1.4383478657436715, + "grad_norm": 0.5794163374270717, + "learning_rate": 3.8587152945649664e-05, + "loss": 11.9231, + "step": 26414 + }, + { + "epoch": 1.4384023197402545, + "grad_norm": 0.5222541612633167, + "learning_rate": 3.8580193761562764e-05, + "loss": 11.9064, + "step": 26415 + }, + { + "epoch": 1.4384567737368374, + "grad_norm": 0.5449803229054676, + "learning_rate": 3.857323505508663e-05, + "loss": 11.8881, + "step": 26416 + }, + { + "epoch": 1.4385112277334204, + "grad_norm": 0.5909731847978851, + "learning_rate": 3.856627682627527e-05, + "loss": 11.8117, + "step": 26417 + }, + { + "epoch": 1.4385656817300034, + "grad_norm": 0.5943101138668914, + "learning_rate": 3.8559319075182855e-05, + "loss": 11.8565, + "step": 26418 + }, + { + "epoch": 1.4386201357265864, + "grad_norm": 0.48160549429955773, + "learning_rate": 3.855236180186351e-05, + "loss": 11.8283, + "step": 26419 + }, + { + "epoch": 1.4386745897231694, + "grad_norm": 0.5934331987570598, + "learning_rate": 3.854540500637127e-05, + "loss": 11.9252, + "step": 26420 + }, + { + "epoch": 1.4387290437197526, + "grad_norm": 0.5368310818555759, + "learning_rate": 3.853844868876031e-05, + "loss": 11.8327, + "step": 26421 + }, + { + "epoch": 1.4387834977163356, + "grad_norm": 0.5534315381139365, + "learning_rate": 3.853149284908466e-05, + "loss": 11.8371, + "step": 26422 + }, + { + "epoch": 1.4388379517129186, + "grad_norm": 0.5518777788488775, + "learning_rate": 3.852453748739841e-05, + "loss": 11.9097, + "step": 26423 + }, + { + "epoch": 1.4388924057095016, + "grad_norm": 0.5599390786515479, + "learning_rate": 3.8517582603755696e-05, + "loss": 12.0061, + "step": 26424 + }, + { + "epoch": 1.4389468597060846, + "grad_norm": 0.5794946414300994, + "learning_rate": 3.851062819821054e-05, + "loss": 11.8319, + "step": 26425 + }, + { + "epoch": 1.4390013137026676, + "grad_norm": 0.5266736447050263, + "learning_rate": 3.850367427081708e-05, + "loss": 11.7306, + "step": 26426 + }, + { + "epoch": 1.4390557676992506, + "grad_norm": 0.5496319407583486, + "learning_rate": 3.8496720821629326e-05, + "loss": 11.915, + "step": 26427 + }, + { + "epoch": 1.4391102216958336, + "grad_norm": 0.5414984675723962, + "learning_rate": 3.848976785070143e-05, + "loss": 11.8731, + "step": 26428 + }, + { + "epoch": 1.4391646756924166, + "grad_norm": 0.5293628663317904, + "learning_rate": 3.848281535808738e-05, + "loss": 11.7896, + "step": 26429 + }, + { + "epoch": 1.4392191296889996, + "grad_norm": 0.6275777157507865, + "learning_rate": 3.8475863343841255e-05, + "loss": 11.9586, + "step": 26430 + }, + { + "epoch": 1.4392735836855826, + "grad_norm": 0.5526135047547414, + "learning_rate": 3.8468911808017184e-05, + "loss": 11.8154, + "step": 26431 + }, + { + "epoch": 1.4393280376821656, + "grad_norm": 0.5011062321628569, + "learning_rate": 3.846196075066917e-05, + "loss": 11.9022, + "step": 26432 + }, + { + "epoch": 1.4393824916787485, + "grad_norm": 0.5148144995516717, + "learning_rate": 3.845501017185123e-05, + "loss": 11.795, + "step": 26433 + }, + { + "epoch": 1.4394369456753315, + "grad_norm": 0.5343934689759077, + "learning_rate": 3.8448060071617496e-05, + "loss": 11.7295, + "step": 26434 + }, + { + "epoch": 1.4394913996719148, + "grad_norm": 0.5310400119548662, + "learning_rate": 3.844111045002193e-05, + "loss": 11.7665, + "step": 26435 + }, + { + "epoch": 1.4395458536684977, + "grad_norm": 0.5262723138066365, + "learning_rate": 3.8434161307118655e-05, + "loss": 11.796, + "step": 26436 + }, + { + "epoch": 1.4396003076650807, + "grad_norm": 0.5262234443805276, + "learning_rate": 3.842721264296162e-05, + "loss": 11.7942, + "step": 26437 + }, + { + "epoch": 1.4396547616616637, + "grad_norm": 0.5552265090171387, + "learning_rate": 3.8420264457604946e-05, + "loss": 11.858, + "step": 26438 + }, + { + "epoch": 1.4397092156582467, + "grad_norm": 0.5284576041256092, + "learning_rate": 3.841331675110259e-05, + "loss": 11.9226, + "step": 26439 + }, + { + "epoch": 1.4397636696548297, + "grad_norm": 0.528154978750724, + "learning_rate": 3.84063695235086e-05, + "loss": 11.8791, + "step": 26440 + }, + { + "epoch": 1.4398181236514127, + "grad_norm": 0.5375114336456656, + "learning_rate": 3.839942277487706e-05, + "loss": 11.9747, + "step": 26441 + }, + { + "epoch": 1.4398725776479957, + "grad_norm": 0.6520639507404585, + "learning_rate": 3.839247650526192e-05, + "loss": 11.9064, + "step": 26442 + }, + { + "epoch": 1.4399270316445787, + "grad_norm": 0.4965413455352172, + "learning_rate": 3.8385530714717235e-05, + "loss": 11.8684, + "step": 26443 + }, + { + "epoch": 1.439981485641162, + "grad_norm": 0.5642819911632675, + "learning_rate": 3.837858540329694e-05, + "loss": 11.9912, + "step": 26444 + }, + { + "epoch": 1.440035939637745, + "grad_norm": 0.5705476875390401, + "learning_rate": 3.837164057105511e-05, + "loss": 12.0032, + "step": 26445 + }, + { + "epoch": 1.440090393634328, + "grad_norm": 0.516073449236866, + "learning_rate": 3.836469621804578e-05, + "loss": 11.6778, + "step": 26446 + }, + { + "epoch": 1.4401448476309109, + "grad_norm": 0.5273706663052106, + "learning_rate": 3.835775234432286e-05, + "loss": 11.8991, + "step": 26447 + }, + { + "epoch": 1.4401993016274939, + "grad_norm": 0.516940943447397, + "learning_rate": 3.8350808949940444e-05, + "loss": 11.9498, + "step": 26448 + }, + { + "epoch": 1.4402537556240769, + "grad_norm": 0.6129870866541962, + "learning_rate": 3.8343866034952426e-05, + "loss": 11.9421, + "step": 26449 + }, + { + "epoch": 1.4403082096206599, + "grad_norm": 0.5000039200400039, + "learning_rate": 3.8336923599412886e-05, + "loss": 11.9062, + "step": 26450 + }, + { + "epoch": 1.4403626636172429, + "grad_norm": 0.5895570357586272, + "learning_rate": 3.832998164337574e-05, + "loss": 11.8764, + "step": 26451 + }, + { + "epoch": 1.4404171176138258, + "grad_norm": 0.5096744817429743, + "learning_rate": 3.8323040166894996e-05, + "loss": 11.9575, + "step": 26452 + }, + { + "epoch": 1.4404715716104088, + "grad_norm": 0.5210919444623549, + "learning_rate": 3.83160991700247e-05, + "loss": 11.8493, + "step": 26453 + }, + { + "epoch": 1.4405260256069918, + "grad_norm": 0.566028201220997, + "learning_rate": 3.83091586528187e-05, + "loss": 11.9164, + "step": 26454 + }, + { + "epoch": 1.4405804796035748, + "grad_norm": 0.5592746555967717, + "learning_rate": 3.830221861533102e-05, + "loss": 11.7384, + "step": 26455 + }, + { + "epoch": 1.4406349336001578, + "grad_norm": 0.49807439105786255, + "learning_rate": 3.829527905761567e-05, + "loss": 11.8128, + "step": 26456 + }, + { + "epoch": 1.4406893875967408, + "grad_norm": 0.4993852580956869, + "learning_rate": 3.828833997972655e-05, + "loss": 11.9153, + "step": 26457 + }, + { + "epoch": 1.4407438415933238, + "grad_norm": 0.5091481273852589, + "learning_rate": 3.828140138171767e-05, + "loss": 11.7583, + "step": 26458 + }, + { + "epoch": 1.440798295589907, + "grad_norm": 0.5316267220699452, + "learning_rate": 3.827446326364295e-05, + "loss": 11.8889, + "step": 26459 + }, + { + "epoch": 1.44085274958649, + "grad_norm": 0.5049445355096555, + "learning_rate": 3.8267525625556363e-05, + "loss": 11.7802, + "step": 26460 + }, + { + "epoch": 1.440907203583073, + "grad_norm": 0.4884318562691344, + "learning_rate": 3.8260588467511824e-05, + "loss": 11.8209, + "step": 26461 + }, + { + "epoch": 1.440961657579656, + "grad_norm": 0.5591009650182845, + "learning_rate": 3.8253651789563316e-05, + "loss": 11.7407, + "step": 26462 + }, + { + "epoch": 1.441016111576239, + "grad_norm": 0.5713114332985573, + "learning_rate": 3.8246715591764825e-05, + "loss": 11.8233, + "step": 26463 + }, + { + "epoch": 1.441070565572822, + "grad_norm": 0.5529993521007889, + "learning_rate": 3.823977987417016e-05, + "loss": 11.793, + "step": 26464 + }, + { + "epoch": 1.441125019569405, + "grad_norm": 0.5700467073177319, + "learning_rate": 3.8232844636833364e-05, + "loss": 11.8365, + "step": 26465 + }, + { + "epoch": 1.441179473565988, + "grad_norm": 0.557722646863616, + "learning_rate": 3.8225909879808285e-05, + "loss": 11.7904, + "step": 26466 + }, + { + "epoch": 1.4412339275625712, + "grad_norm": 0.5416506466773395, + "learning_rate": 3.8218975603148885e-05, + "loss": 11.9212, + "step": 26467 + }, + { + "epoch": 1.4412883815591542, + "grad_norm": 0.4982781364953499, + "learning_rate": 3.821204180690914e-05, + "loss": 11.7878, + "step": 26468 + }, + { + "epoch": 1.4413428355557372, + "grad_norm": 0.5143642027104275, + "learning_rate": 3.820510849114288e-05, + "loss": 11.8249, + "step": 26469 + }, + { + "epoch": 1.4413972895523202, + "grad_norm": 0.5456973299290206, + "learning_rate": 3.819817565590409e-05, + "loss": 11.7678, + "step": 26470 + }, + { + "epoch": 1.4414517435489032, + "grad_norm": 0.4926600743195348, + "learning_rate": 3.8191243301246616e-05, + "loss": 11.8152, + "step": 26471 + }, + { + "epoch": 1.4415061975454861, + "grad_norm": 0.5032933729527044, + "learning_rate": 3.818431142722444e-05, + "loss": 11.8554, + "step": 26472 + }, + { + "epoch": 1.4415606515420691, + "grad_norm": 0.5550752795060577, + "learning_rate": 3.817738003389142e-05, + "loss": 11.8841, + "step": 26473 + }, + { + "epoch": 1.4416151055386521, + "grad_norm": 0.5282346410220012, + "learning_rate": 3.817044912130143e-05, + "loss": 11.8373, + "step": 26474 + }, + { + "epoch": 1.4416695595352351, + "grad_norm": 0.6017067112650445, + "learning_rate": 3.8163518689508425e-05, + "loss": 11.9226, + "step": 26475 + }, + { + "epoch": 1.4417240135318181, + "grad_norm": 0.4762906312380366, + "learning_rate": 3.8156588738566245e-05, + "loss": 11.8273, + "step": 26476 + }, + { + "epoch": 1.441778467528401, + "grad_norm": 0.5889751542683911, + "learning_rate": 3.8149659268528824e-05, + "loss": 11.9261, + "step": 26477 + }, + { + "epoch": 1.441832921524984, + "grad_norm": 0.5465175591594423, + "learning_rate": 3.814273027945e-05, + "loss": 11.9285, + "step": 26478 + }, + { + "epoch": 1.441887375521567, + "grad_norm": 0.5498044555217565, + "learning_rate": 3.8135801771383674e-05, + "loss": 11.845, + "step": 26479 + }, + { + "epoch": 1.44194182951815, + "grad_norm": 0.5569685373284927, + "learning_rate": 3.812887374438376e-05, + "loss": 11.8335, + "step": 26480 + }, + { + "epoch": 1.441996283514733, + "grad_norm": 0.5465969630641462, + "learning_rate": 3.8121946198504066e-05, + "loss": 11.9757, + "step": 26481 + }, + { + "epoch": 1.4420507375113163, + "grad_norm": 0.49155857069869185, + "learning_rate": 3.811501913379853e-05, + "loss": 11.8822, + "step": 26482 + }, + { + "epoch": 1.4421051915078993, + "grad_norm": 0.5603115641034294, + "learning_rate": 3.8108092550320985e-05, + "loss": 11.9908, + "step": 26483 + }, + { + "epoch": 1.4421596455044823, + "grad_norm": 0.5073353673802056, + "learning_rate": 3.810116644812526e-05, + "loss": 11.9003, + "step": 26484 + }, + { + "epoch": 1.4422140995010653, + "grad_norm": 0.5392727103105533, + "learning_rate": 3.809424082726528e-05, + "loss": 11.9213, + "step": 26485 + }, + { + "epoch": 1.4422685534976483, + "grad_norm": 0.5492288241804698, + "learning_rate": 3.8087315687794824e-05, + "loss": 11.8974, + "step": 26486 + }, + { + "epoch": 1.4423230074942313, + "grad_norm": 0.5193820078847496, + "learning_rate": 3.8080391029767825e-05, + "loss": 11.9137, + "step": 26487 + }, + { + "epoch": 1.4423774614908142, + "grad_norm": 0.5612989517853024, + "learning_rate": 3.807346685323805e-05, + "loss": 11.8743, + "step": 26488 + }, + { + "epoch": 1.4424319154873972, + "grad_norm": 0.5516156638909567, + "learning_rate": 3.806654315825938e-05, + "loss": 11.9698, + "step": 26489 + }, + { + "epoch": 1.4424863694839802, + "grad_norm": 0.533090466164547, + "learning_rate": 3.805961994488569e-05, + "loss": 11.8209, + "step": 26490 + }, + { + "epoch": 1.4425408234805634, + "grad_norm": 0.5513457231363742, + "learning_rate": 3.8052697213170763e-05, + "loss": 11.8851, + "step": 26491 + }, + { + "epoch": 1.4425952774771464, + "grad_norm": 0.5166375002092177, + "learning_rate": 3.8045774963168465e-05, + "loss": 11.9074, + "step": 26492 + }, + { + "epoch": 1.4426497314737294, + "grad_norm": 0.5589315034614036, + "learning_rate": 3.8038853194932636e-05, + "loss": 11.8397, + "step": 26493 + }, + { + "epoch": 1.4427041854703124, + "grad_norm": 0.6089769791437563, + "learning_rate": 3.803193190851702e-05, + "loss": 12.0076, + "step": 26494 + }, + { + "epoch": 1.4427586394668954, + "grad_norm": 0.5698836597910661, + "learning_rate": 3.802501110397553e-05, + "loss": 11.8266, + "step": 26495 + }, + { + "epoch": 1.4428130934634784, + "grad_norm": 0.5435638151684871, + "learning_rate": 3.8018090781361914e-05, + "loss": 11.887, + "step": 26496 + }, + { + "epoch": 1.4428675474600614, + "grad_norm": 0.5469715169926146, + "learning_rate": 3.8011170940730056e-05, + "loss": 11.8753, + "step": 26497 + }, + { + "epoch": 1.4429220014566444, + "grad_norm": 0.518215807742917, + "learning_rate": 3.80042515821337e-05, + "loss": 11.8149, + "step": 26498 + }, + { + "epoch": 1.4429764554532274, + "grad_norm": 0.5426434446045338, + "learning_rate": 3.799733270562671e-05, + "loss": 11.9739, + "step": 26499 + }, + { + "epoch": 1.4430309094498104, + "grad_norm": 0.5954430547312034, + "learning_rate": 3.7990414311262815e-05, + "loss": 11.8652, + "step": 26500 + }, + { + "epoch": 1.4430853634463934, + "grad_norm": 0.5290342827964081, + "learning_rate": 3.7983496399095865e-05, + "loss": 11.8137, + "step": 26501 + }, + { + "epoch": 1.4431398174429764, + "grad_norm": 0.5430989247657333, + "learning_rate": 3.797657896917968e-05, + "loss": 11.9709, + "step": 26502 + }, + { + "epoch": 1.4431942714395594, + "grad_norm": 0.547745476687581, + "learning_rate": 3.796966202156802e-05, + "loss": 11.9644, + "step": 26503 + }, + { + "epoch": 1.4432487254361424, + "grad_norm": 0.5779703008891022, + "learning_rate": 3.7962745556314636e-05, + "loss": 11.9251, + "step": 26504 + }, + { + "epoch": 1.4433031794327256, + "grad_norm": 0.5508320217686239, + "learning_rate": 3.795582957347338e-05, + "loss": 11.8728, + "step": 26505 + }, + { + "epoch": 1.4433576334293086, + "grad_norm": 0.5561825859949718, + "learning_rate": 3.7948914073097964e-05, + "loss": 11.9296, + "step": 26506 + }, + { + "epoch": 1.4434120874258916, + "grad_norm": 0.5901528433694065, + "learning_rate": 3.7941999055242236e-05, + "loss": 11.757, + "step": 26507 + }, + { + "epoch": 1.4434665414224745, + "grad_norm": 0.5434915409575033, + "learning_rate": 3.793508451995989e-05, + "loss": 11.8442, + "step": 26508 + }, + { + "epoch": 1.4435209954190575, + "grad_norm": 0.5755886356664981, + "learning_rate": 3.792817046730477e-05, + "loss": 11.9183, + "step": 26509 + }, + { + "epoch": 1.4435754494156405, + "grad_norm": 0.5414810281444997, + "learning_rate": 3.792125689733057e-05, + "loss": 11.7676, + "step": 26510 + }, + { + "epoch": 1.4436299034122235, + "grad_norm": 0.5362016108857978, + "learning_rate": 3.791434381009109e-05, + "loss": 11.839, + "step": 26511 + }, + { + "epoch": 1.4436843574088065, + "grad_norm": 0.5387616562075546, + "learning_rate": 3.790743120564012e-05, + "loss": 11.8144, + "step": 26512 + }, + { + "epoch": 1.4437388114053895, + "grad_norm": 0.5298039663680805, + "learning_rate": 3.790051908403138e-05, + "loss": 11.8176, + "step": 26513 + }, + { + "epoch": 1.4437932654019727, + "grad_norm": 0.6147624955791243, + "learning_rate": 3.789360744531861e-05, + "loss": 11.812, + "step": 26514 + }, + { + "epoch": 1.4438477193985557, + "grad_norm": 0.5303930286967303, + "learning_rate": 3.788669628955554e-05, + "loss": 11.7855, + "step": 26515 + }, + { + "epoch": 1.4439021733951387, + "grad_norm": 0.5383850553182316, + "learning_rate": 3.787978561679593e-05, + "loss": 11.8885, + "step": 26516 + }, + { + "epoch": 1.4439566273917217, + "grad_norm": 0.5290483551962385, + "learning_rate": 3.7872875427093554e-05, + "loss": 11.9994, + "step": 26517 + }, + { + "epoch": 1.4440110813883047, + "grad_norm": 0.5625168397314703, + "learning_rate": 3.786596572050209e-05, + "loss": 11.8345, + "step": 26518 + }, + { + "epoch": 1.4440655353848877, + "grad_norm": 0.5108397956744112, + "learning_rate": 3.7859056497075326e-05, + "loss": 11.8848, + "step": 26519 + }, + { + "epoch": 1.4441199893814707, + "grad_norm": 0.5683020133212698, + "learning_rate": 3.785214775686693e-05, + "loss": 11.7156, + "step": 26520 + }, + { + "epoch": 1.4441744433780537, + "grad_norm": 0.548609603558456, + "learning_rate": 3.78452394999307e-05, + "loss": 11.7671, + "step": 26521 + }, + { + "epoch": 1.4442288973746367, + "grad_norm": 0.5811885727393167, + "learning_rate": 3.7838331726320254e-05, + "loss": 11.8624, + "step": 26522 + }, + { + "epoch": 1.4442833513712197, + "grad_norm": 0.5426540756261988, + "learning_rate": 3.7831424436089415e-05, + "loss": 11.8728, + "step": 26523 + }, + { + "epoch": 1.4443378053678027, + "grad_norm": 0.5602572778176816, + "learning_rate": 3.7824517629291835e-05, + "loss": 11.8197, + "step": 26524 + }, + { + "epoch": 1.4443922593643856, + "grad_norm": 0.5192908691092869, + "learning_rate": 3.7817611305981205e-05, + "loss": 11.8315, + "step": 26525 + }, + { + "epoch": 1.4444467133609686, + "grad_norm": 0.534263257596746, + "learning_rate": 3.781070546621129e-05, + "loss": 11.889, + "step": 26526 + }, + { + "epoch": 1.4445011673575516, + "grad_norm": 0.5953627627965478, + "learning_rate": 3.7803800110035725e-05, + "loss": 11.9301, + "step": 26527 + }, + { + "epoch": 1.4445556213541346, + "grad_norm": 0.6921158756832816, + "learning_rate": 3.7796895237508245e-05, + "loss": 12.0371, + "step": 26528 + }, + { + "epoch": 1.4446100753507178, + "grad_norm": 0.5317652565629187, + "learning_rate": 3.778999084868257e-05, + "loss": 11.9459, + "step": 26529 + }, + { + "epoch": 1.4446645293473008, + "grad_norm": 0.5907446369727426, + "learning_rate": 3.7783086943612324e-05, + "loss": 11.8588, + "step": 26530 + }, + { + "epoch": 1.4447189833438838, + "grad_norm": 0.5225637083776867, + "learning_rate": 3.777618352235125e-05, + "loss": 11.8373, + "step": 26531 + }, + { + "epoch": 1.4447734373404668, + "grad_norm": 0.5645536273713637, + "learning_rate": 3.7769280584952994e-05, + "loss": 11.9279, + "step": 26532 + }, + { + "epoch": 1.4448278913370498, + "grad_norm": 0.519573391850562, + "learning_rate": 3.7762378131471266e-05, + "loss": 11.6787, + "step": 26533 + }, + { + "epoch": 1.4448823453336328, + "grad_norm": 0.5483649445711375, + "learning_rate": 3.7755476161959736e-05, + "loss": 11.9166, + "step": 26534 + }, + { + "epoch": 1.4449367993302158, + "grad_norm": 0.5154928719869454, + "learning_rate": 3.7748574676472016e-05, + "loss": 11.8932, + "step": 26535 + }, + { + "epoch": 1.4449912533267988, + "grad_norm": 0.581986857093285, + "learning_rate": 3.7741673675061865e-05, + "loss": 11.9404, + "step": 26536 + }, + { + "epoch": 1.445045707323382, + "grad_norm": 0.5947792692617778, + "learning_rate": 3.7734773157782854e-05, + "loss": 11.8795, + "step": 26537 + }, + { + "epoch": 1.445100161319965, + "grad_norm": 0.5449383458016724, + "learning_rate": 3.772787312468869e-05, + "loss": 12.0347, + "step": 26538 + }, + { + "epoch": 1.445154615316548, + "grad_norm": 0.5352143213393263, + "learning_rate": 3.7720973575833075e-05, + "loss": 11.8536, + "step": 26539 + }, + { + "epoch": 1.445209069313131, + "grad_norm": 0.545495224596101, + "learning_rate": 3.771407451126957e-05, + "loss": 11.8957, + "step": 26540 + }, + { + "epoch": 1.445263523309714, + "grad_norm": 0.5775567875249777, + "learning_rate": 3.7707175931051896e-05, + "loss": 11.9274, + "step": 26541 + }, + { + "epoch": 1.445317977306297, + "grad_norm": 0.574280040621081, + "learning_rate": 3.770027783523364e-05, + "loss": 11.9951, + "step": 26542 + }, + { + "epoch": 1.44537243130288, + "grad_norm": 0.5640495140559315, + "learning_rate": 3.769338022386851e-05, + "loss": 11.8683, + "step": 26543 + }, + { + "epoch": 1.445426885299463, + "grad_norm": 0.5200569374135725, + "learning_rate": 3.768648309701009e-05, + "loss": 11.8488, + "step": 26544 + }, + { + "epoch": 1.445481339296046, + "grad_norm": 0.5270282307754426, + "learning_rate": 3.767958645471201e-05, + "loss": 11.9137, + "step": 26545 + }, + { + "epoch": 1.445535793292629, + "grad_norm": 0.5133276504647482, + "learning_rate": 3.767269029702795e-05, + "loss": 11.8536, + "step": 26546 + }, + { + "epoch": 1.445590247289212, + "grad_norm": 0.5383081779076734, + "learning_rate": 3.766579462401146e-05, + "loss": 11.96, + "step": 26547 + }, + { + "epoch": 1.445644701285795, + "grad_norm": 0.5721004487225891, + "learning_rate": 3.7658899435716245e-05, + "loss": 11.8655, + "step": 26548 + }, + { + "epoch": 1.445699155282378, + "grad_norm": 0.46913941220405814, + "learning_rate": 3.7652004732195834e-05, + "loss": 11.6833, + "step": 26549 + }, + { + "epoch": 1.445753609278961, + "grad_norm": 0.5752380497709699, + "learning_rate": 3.76451105135039e-05, + "loss": 11.8896, + "step": 26550 + }, + { + "epoch": 1.445808063275544, + "grad_norm": 0.5650594853578803, + "learning_rate": 3.763821677969408e-05, + "loss": 11.835, + "step": 26551 + }, + { + "epoch": 1.4458625172721271, + "grad_norm": 0.5662700850432301, + "learning_rate": 3.7631323530819905e-05, + "loss": 11.8166, + "step": 26552 + }, + { + "epoch": 1.44591697126871, + "grad_norm": 0.5807502724892819, + "learning_rate": 3.762443076693506e-05, + "loss": 11.746, + "step": 26553 + }, + { + "epoch": 1.445971425265293, + "grad_norm": 0.5565076886180103, + "learning_rate": 3.7617538488093094e-05, + "loss": 11.881, + "step": 26554 + }, + { + "epoch": 1.446025879261876, + "grad_norm": 0.5800596085095544, + "learning_rate": 3.761064669434758e-05, + "loss": 11.9344, + "step": 26555 + }, + { + "epoch": 1.446080333258459, + "grad_norm": 0.5521810673268931, + "learning_rate": 3.7603755385752185e-05, + "loss": 11.94, + "step": 26556 + }, + { + "epoch": 1.446134787255042, + "grad_norm": 0.523797652176005, + "learning_rate": 3.75968645623604e-05, + "loss": 11.9031, + "step": 26557 + }, + { + "epoch": 1.446189241251625, + "grad_norm": 0.5597867020012961, + "learning_rate": 3.75899742242259e-05, + "loss": 11.8146, + "step": 26558 + }, + { + "epoch": 1.446243695248208, + "grad_norm": 0.5392480061645936, + "learning_rate": 3.758308437140219e-05, + "loss": 11.7328, + "step": 26559 + }, + { + "epoch": 1.446298149244791, + "grad_norm": 0.6091671909538555, + "learning_rate": 3.757619500394289e-05, + "loss": 12.0145, + "step": 26560 + }, + { + "epoch": 1.4463526032413743, + "grad_norm": 0.5692700371527594, + "learning_rate": 3.756930612190159e-05, + "loss": 11.8579, + "step": 26561 + }, + { + "epoch": 1.4464070572379573, + "grad_norm": 0.6063908307754797, + "learning_rate": 3.7562417725331814e-05, + "loss": 11.8455, + "step": 26562 + }, + { + "epoch": 1.4464615112345403, + "grad_norm": 0.594514892602684, + "learning_rate": 3.755552981428722e-05, + "loss": 11.8091, + "step": 26563 + }, + { + "epoch": 1.4465159652311232, + "grad_norm": 0.5557790256658615, + "learning_rate": 3.754864238882121e-05, + "loss": 11.8945, + "step": 26564 + }, + { + "epoch": 1.4465704192277062, + "grad_norm": 0.521261547004267, + "learning_rate": 3.754175544898745e-05, + "loss": 11.8221, + "step": 26565 + }, + { + "epoch": 1.4466248732242892, + "grad_norm": 0.5929063166523696, + "learning_rate": 3.753486899483949e-05, + "loss": 11.9031, + "step": 26566 + }, + { + "epoch": 1.4466793272208722, + "grad_norm": 0.4929755068670029, + "learning_rate": 3.7527983026430834e-05, + "loss": 12.0085, + "step": 26567 + }, + { + "epoch": 1.4467337812174552, + "grad_norm": 0.5439263161014277, + "learning_rate": 3.752109754381511e-05, + "loss": 11.9281, + "step": 26568 + }, + { + "epoch": 1.4467882352140382, + "grad_norm": 0.5373762935560514, + "learning_rate": 3.751421254704576e-05, + "loss": 11.9186, + "step": 26569 + }, + { + "epoch": 1.4468426892106212, + "grad_norm": 0.5807048933876086, + "learning_rate": 3.75073280361764e-05, + "loss": 11.9166, + "step": 26570 + }, + { + "epoch": 1.4468971432072042, + "grad_norm": 0.5481809421062326, + "learning_rate": 3.7500444011260515e-05, + "loss": 11.8524, + "step": 26571 + }, + { + "epoch": 1.4469515972037872, + "grad_norm": 0.587755210628643, + "learning_rate": 3.749356047235165e-05, + "loss": 12.0914, + "step": 26572 + }, + { + "epoch": 1.4470060512003702, + "grad_norm": 0.5665313473285608, + "learning_rate": 3.7486677419503427e-05, + "loss": 11.8461, + "step": 26573 + }, + { + "epoch": 1.4470605051969532, + "grad_norm": 0.5106968799287922, + "learning_rate": 3.74797948527692e-05, + "loss": 11.8505, + "step": 26574 + }, + { + "epoch": 1.4471149591935364, + "grad_norm": 0.5304620584865829, + "learning_rate": 3.7472912772202605e-05, + "loss": 11.8942, + "step": 26575 + }, + { + "epoch": 1.4471694131901194, + "grad_norm": 0.5264843733801304, + "learning_rate": 3.74660311778571e-05, + "loss": 11.8378, + "step": 26576 + }, + { + "epoch": 1.4472238671867024, + "grad_norm": 0.529979963182147, + "learning_rate": 3.7459150069786216e-05, + "loss": 11.838, + "step": 26577 + }, + { + "epoch": 1.4472783211832854, + "grad_norm": 0.5467720129741784, + "learning_rate": 3.745226944804352e-05, + "loss": 11.8257, + "step": 26578 + }, + { + "epoch": 1.4473327751798684, + "grad_norm": 0.533493764804981, + "learning_rate": 3.744538931268241e-05, + "loss": 11.8858, + "step": 26579 + }, + { + "epoch": 1.4473872291764514, + "grad_norm": 0.5378775567100834, + "learning_rate": 3.7438509663756494e-05, + "loss": 11.8686, + "step": 26580 + }, + { + "epoch": 1.4474416831730343, + "grad_norm": 0.5566027107870536, + "learning_rate": 3.743163050131917e-05, + "loss": 11.8811, + "step": 26581 + }, + { + "epoch": 1.4474961371696173, + "grad_norm": 0.5515944992876728, + "learning_rate": 3.742475182542403e-05, + "loss": 11.8411, + "step": 26582 + }, + { + "epoch": 1.4475505911662003, + "grad_norm": 0.552049968058611, + "learning_rate": 3.74178736361245e-05, + "loss": 11.8082, + "step": 26583 + }, + { + "epoch": 1.4476050451627835, + "grad_norm": 0.537822660831308, + "learning_rate": 3.741099593347406e-05, + "loss": 11.9451, + "step": 26584 + }, + { + "epoch": 1.4476594991593665, + "grad_norm": 0.5853675462032129, + "learning_rate": 3.740411871752622e-05, + "loss": 11.9511, + "step": 26585 + }, + { + "epoch": 1.4477139531559495, + "grad_norm": 0.5226096967266199, + "learning_rate": 3.739724198833444e-05, + "loss": 11.8439, + "step": 26586 + }, + { + "epoch": 1.4477684071525325, + "grad_norm": 0.5312786117885538, + "learning_rate": 3.739036574595221e-05, + "loss": 11.9, + "step": 26587 + }, + { + "epoch": 1.4478228611491155, + "grad_norm": 0.6276620068702384, + "learning_rate": 3.7383489990433005e-05, + "loss": 11.8458, + "step": 26588 + }, + { + "epoch": 1.4478773151456985, + "grad_norm": 0.680144299531745, + "learning_rate": 3.737661472183026e-05, + "loss": 12.0234, + "step": 26589 + }, + { + "epoch": 1.4479317691422815, + "grad_norm": 0.5355976618891843, + "learning_rate": 3.73697399401975e-05, + "loss": 11.8629, + "step": 26590 + }, + { + "epoch": 1.4479862231388645, + "grad_norm": 0.5919818661633061, + "learning_rate": 3.736286564558811e-05, + "loss": 11.9697, + "step": 26591 + }, + { + "epoch": 1.4480406771354475, + "grad_norm": 0.5310696870018928, + "learning_rate": 3.7355991838055606e-05, + "loss": 11.8087, + "step": 26592 + }, + { + "epoch": 1.4480951311320305, + "grad_norm": 0.5297474909597147, + "learning_rate": 3.734911851765339e-05, + "loss": 11.8513, + "step": 26593 + }, + { + "epoch": 1.4481495851286135, + "grad_norm": 0.5349907359447504, + "learning_rate": 3.7342245684434964e-05, + "loss": 11.8563, + "step": 26594 + }, + { + "epoch": 1.4482040391251965, + "grad_norm": 0.6776675951913991, + "learning_rate": 3.733537333845375e-05, + "loss": 11.9932, + "step": 26595 + }, + { + "epoch": 1.4482584931217795, + "grad_norm": 0.506046570760118, + "learning_rate": 3.7328501479763144e-05, + "loss": 11.8887, + "step": 26596 + }, + { + "epoch": 1.4483129471183624, + "grad_norm": 0.5844919760727451, + "learning_rate": 3.732163010841665e-05, + "loss": 11.7822, + "step": 26597 + }, + { + "epoch": 1.4483674011149457, + "grad_norm": 0.5726426211448638, + "learning_rate": 3.7314759224467646e-05, + "loss": 11.8709, + "step": 26598 + }, + { + "epoch": 1.4484218551115287, + "grad_norm": 0.6322767415967914, + "learning_rate": 3.730788882796957e-05, + "loss": 11.7881, + "step": 26599 + }, + { + "epoch": 1.4484763091081116, + "grad_norm": 0.5457326375300916, + "learning_rate": 3.730101891897592e-05, + "loss": 11.8803, + "step": 26600 + }, + { + "epoch": 1.4485307631046946, + "grad_norm": 0.5716956568315725, + "learning_rate": 3.729414949754001e-05, + "loss": 11.7994, + "step": 26601 + }, + { + "epoch": 1.4485852171012776, + "grad_norm": 0.5718924603303069, + "learning_rate": 3.728728056371536e-05, + "loss": 11.8763, + "step": 26602 + }, + { + "epoch": 1.4486396710978606, + "grad_norm": 0.5960681913198139, + "learning_rate": 3.728041211755529e-05, + "loss": 11.85, + "step": 26603 + }, + { + "epoch": 1.4486941250944436, + "grad_norm": 0.5632622976026171, + "learning_rate": 3.7273544159113284e-05, + "loss": 11.9303, + "step": 26604 + }, + { + "epoch": 1.4487485790910266, + "grad_norm": 0.6080133805513952, + "learning_rate": 3.726667668844271e-05, + "loss": 11.8867, + "step": 26605 + }, + { + "epoch": 1.4488030330876096, + "grad_norm": 0.5636445259452803, + "learning_rate": 3.725980970559696e-05, + "loss": 11.9338, + "step": 26606 + }, + { + "epoch": 1.4488574870841928, + "grad_norm": 0.5272925428562419, + "learning_rate": 3.7252943210629475e-05, + "loss": 11.8475, + "step": 26607 + }, + { + "epoch": 1.4489119410807758, + "grad_norm": 0.5407317561188552, + "learning_rate": 3.72460772035936e-05, + "loss": 11.8126, + "step": 26608 + }, + { + "epoch": 1.4489663950773588, + "grad_norm": 0.5934215016958524, + "learning_rate": 3.723921168454275e-05, + "loss": 11.9487, + "step": 26609 + }, + { + "epoch": 1.4490208490739418, + "grad_norm": 0.6207775714699713, + "learning_rate": 3.723234665353035e-05, + "loss": 11.9526, + "step": 26610 + }, + { + "epoch": 1.4490753030705248, + "grad_norm": 0.5380609919260102, + "learning_rate": 3.7225482110609725e-05, + "loss": 11.932, + "step": 26611 + }, + { + "epoch": 1.4491297570671078, + "grad_norm": 0.5668187977897106, + "learning_rate": 3.72186180558343e-05, + "loss": 11.7625, + "step": 26612 + }, + { + "epoch": 1.4491842110636908, + "grad_norm": 0.6256438118898571, + "learning_rate": 3.721175448925739e-05, + "loss": 11.9884, + "step": 26613 + }, + { + "epoch": 1.4492386650602738, + "grad_norm": 0.5845515425396075, + "learning_rate": 3.720489141093245e-05, + "loss": 11.7293, + "step": 26614 + }, + { + "epoch": 1.4492931190568568, + "grad_norm": 0.5545123865862376, + "learning_rate": 3.71980288209128e-05, + "loss": 11.6521, + "step": 26615 + }, + { + "epoch": 1.4493475730534398, + "grad_norm": 0.5884598664294155, + "learning_rate": 3.719116671925178e-05, + "loss": 11.8231, + "step": 26616 + }, + { + "epoch": 1.4494020270500227, + "grad_norm": 0.5254250350294184, + "learning_rate": 3.7184305106002815e-05, + "loss": 11.8132, + "step": 26617 + }, + { + "epoch": 1.4494564810466057, + "grad_norm": 0.5477490901207946, + "learning_rate": 3.717744398121919e-05, + "loss": 11.8797, + "step": 26618 + }, + { + "epoch": 1.4495109350431887, + "grad_norm": 0.5908517690865315, + "learning_rate": 3.717058334495432e-05, + "loss": 11.8457, + "step": 26619 + }, + { + "epoch": 1.4495653890397717, + "grad_norm": 0.51775420041226, + "learning_rate": 3.716372319726151e-05, + "loss": 11.6898, + "step": 26620 + }, + { + "epoch": 1.4496198430363547, + "grad_norm": 0.4816928323939602, + "learning_rate": 3.715686353819413e-05, + "loss": 11.718, + "step": 26621 + }, + { + "epoch": 1.449674297032938, + "grad_norm": 0.5499302883225158, + "learning_rate": 3.7150004367805544e-05, + "loss": 11.7975, + "step": 26622 + }, + { + "epoch": 1.449728751029521, + "grad_norm": 0.5083234784909617, + "learning_rate": 3.714314568614904e-05, + "loss": 11.9855, + "step": 26623 + }, + { + "epoch": 1.449783205026104, + "grad_norm": 0.5703637258080845, + "learning_rate": 3.713628749327803e-05, + "loss": 11.9224, + "step": 26624 + }, + { + "epoch": 1.449837659022687, + "grad_norm": 0.521465439116705, + "learning_rate": 3.712942978924573e-05, + "loss": 11.911, + "step": 26625 + }, + { + "epoch": 1.44989211301927, + "grad_norm": 0.5457540952526343, + "learning_rate": 3.712257257410553e-05, + "loss": 11.8902, + "step": 26626 + }, + { + "epoch": 1.449946567015853, + "grad_norm": 0.6187793111457757, + "learning_rate": 3.711571584791078e-05, + "loss": 12.0318, + "step": 26627 + }, + { + "epoch": 1.4500010210124359, + "grad_norm": 0.5475059480060864, + "learning_rate": 3.710885961071473e-05, + "loss": 11.7676, + "step": 26628 + }, + { + "epoch": 1.4500554750090189, + "grad_norm": 0.5617799550263257, + "learning_rate": 3.710200386257078e-05, + "loss": 11.8396, + "step": 26629 + }, + { + "epoch": 1.4501099290056019, + "grad_norm": 0.5459667987203591, + "learning_rate": 3.7095148603532145e-05, + "loss": 11.9124, + "step": 26630 + }, + { + "epoch": 1.450164383002185, + "grad_norm": 0.4810094622553554, + "learning_rate": 3.7088293833652235e-05, + "loss": 11.9806, + "step": 26631 + }, + { + "epoch": 1.450218836998768, + "grad_norm": 0.5419299697593373, + "learning_rate": 3.708143955298427e-05, + "loss": 11.9082, + "step": 26632 + }, + { + "epoch": 1.450273290995351, + "grad_norm": 0.5519326139380485, + "learning_rate": 3.707458576158157e-05, + "loss": 11.843, + "step": 26633 + }, + { + "epoch": 1.450327744991934, + "grad_norm": 0.5701770708516417, + "learning_rate": 3.706773245949752e-05, + "loss": 11.872, + "step": 26634 + }, + { + "epoch": 1.450382198988517, + "grad_norm": 0.5318936987815376, + "learning_rate": 3.7060879646785263e-05, + "loss": 11.8942, + "step": 26635 + }, + { + "epoch": 1.4504366529851, + "grad_norm": 0.5432286107373698, + "learning_rate": 3.7054027323498154e-05, + "loss": 11.9042, + "step": 26636 + }, + { + "epoch": 1.450491106981683, + "grad_norm": 0.5471231654232496, + "learning_rate": 3.704717548968953e-05, + "loss": 11.8201, + "step": 26637 + }, + { + "epoch": 1.450545560978266, + "grad_norm": 0.5292156627920321, + "learning_rate": 3.704032414541258e-05, + "loss": 11.9441, + "step": 26638 + }, + { + "epoch": 1.450600014974849, + "grad_norm": 0.5307262638375472, + "learning_rate": 3.703347329072068e-05, + "loss": 11.7009, + "step": 26639 + }, + { + "epoch": 1.450654468971432, + "grad_norm": 0.5031604460807896, + "learning_rate": 3.702662292566701e-05, + "loss": 11.8017, + "step": 26640 + }, + { + "epoch": 1.450708922968015, + "grad_norm": 0.5135695826525555, + "learning_rate": 3.701977305030492e-05, + "loss": 11.9195, + "step": 26641 + }, + { + "epoch": 1.450763376964598, + "grad_norm": 0.523123755323666, + "learning_rate": 3.701292366468759e-05, + "loss": 11.8516, + "step": 26642 + }, + { + "epoch": 1.450817830961181, + "grad_norm": 0.5273212093046554, + "learning_rate": 3.700607476886834e-05, + "loss": 11.7828, + "step": 26643 + }, + { + "epoch": 1.450872284957764, + "grad_norm": 0.5765967455236608, + "learning_rate": 3.699922636290047e-05, + "loss": 11.7969, + "step": 26644 + }, + { + "epoch": 1.4509267389543472, + "grad_norm": 0.5088291676098585, + "learning_rate": 3.699237844683713e-05, + "loss": 11.7438, + "step": 26645 + }, + { + "epoch": 1.4509811929509302, + "grad_norm": 0.619557520415979, + "learning_rate": 3.6985531020731645e-05, + "loss": 11.8212, + "step": 26646 + }, + { + "epoch": 1.4510356469475132, + "grad_norm": 0.563658842852241, + "learning_rate": 3.697868408463721e-05, + "loss": 11.8958, + "step": 26647 + }, + { + "epoch": 1.4510901009440962, + "grad_norm": 0.5518264658910694, + "learning_rate": 3.6971837638607086e-05, + "loss": 11.829, + "step": 26648 + }, + { + "epoch": 1.4511445549406792, + "grad_norm": 0.5108141829811468, + "learning_rate": 3.696499168269456e-05, + "loss": 11.715, + "step": 26649 + }, + { + "epoch": 1.4511990089372622, + "grad_norm": 0.5799726850046848, + "learning_rate": 3.6958146216952806e-05, + "loss": 11.8591, + "step": 26650 + }, + { + "epoch": 1.4512534629338452, + "grad_norm": 0.6435227694286262, + "learning_rate": 3.6951301241435096e-05, + "loss": 11.8321, + "step": 26651 + }, + { + "epoch": 1.4513079169304282, + "grad_norm": 0.547431749579825, + "learning_rate": 3.6944456756194625e-05, + "loss": 11.8445, + "step": 26652 + }, + { + "epoch": 1.4513623709270111, + "grad_norm": 0.5623785425870065, + "learning_rate": 3.6937612761284654e-05, + "loss": 11.9257, + "step": 26653 + }, + { + "epoch": 1.4514168249235944, + "grad_norm": 0.6250343284096292, + "learning_rate": 3.693076925675839e-05, + "loss": 12.0168, + "step": 26654 + }, + { + "epoch": 1.4514712789201774, + "grad_norm": 0.5355238706229178, + "learning_rate": 3.6923926242669e-05, + "loss": 11.8364, + "step": 26655 + }, + { + "epoch": 1.4515257329167603, + "grad_norm": 0.5228373184359539, + "learning_rate": 3.6917083719069775e-05, + "loss": 11.7161, + "step": 26656 + }, + { + "epoch": 1.4515801869133433, + "grad_norm": 0.5314041809686707, + "learning_rate": 3.691024168601386e-05, + "loss": 11.9017, + "step": 26657 + }, + { + "epoch": 1.4516346409099263, + "grad_norm": 0.504874262963168, + "learning_rate": 3.6903400143554476e-05, + "loss": 11.8323, + "step": 26658 + }, + { + "epoch": 1.4516890949065093, + "grad_norm": 0.5325775240786298, + "learning_rate": 3.6896559091744866e-05, + "loss": 11.9752, + "step": 26659 + }, + { + "epoch": 1.4517435489030923, + "grad_norm": 0.5333450132962121, + "learning_rate": 3.6889718530638165e-05, + "loss": 11.8763, + "step": 26660 + }, + { + "epoch": 1.4517980028996753, + "grad_norm": 0.5229551905724732, + "learning_rate": 3.688287846028763e-05, + "loss": 11.8127, + "step": 26661 + }, + { + "epoch": 1.4518524568962583, + "grad_norm": 0.5546493919859037, + "learning_rate": 3.687603888074638e-05, + "loss": 11.8888, + "step": 26662 + }, + { + "epoch": 1.4519069108928413, + "grad_norm": 0.6208436977953069, + "learning_rate": 3.686919979206768e-05, + "loss": 11.8377, + "step": 26663 + }, + { + "epoch": 1.4519613648894243, + "grad_norm": 0.6029887441206228, + "learning_rate": 3.6862361194304663e-05, + "loss": 11.8657, + "step": 26664 + }, + { + "epoch": 1.4520158188860073, + "grad_norm": 0.5307189853439686, + "learning_rate": 3.685552308751048e-05, + "loss": 11.8823, + "step": 26665 + }, + { + "epoch": 1.4520702728825903, + "grad_norm": 0.567352885839594, + "learning_rate": 3.6848685471738375e-05, + "loss": 11.8497, + "step": 26666 + }, + { + "epoch": 1.4521247268791733, + "grad_norm": 0.5181823060831318, + "learning_rate": 3.684184834704144e-05, + "loss": 11.9856, + "step": 26667 + }, + { + "epoch": 1.4521791808757565, + "grad_norm": 0.49518589676387914, + "learning_rate": 3.683501171347292e-05, + "loss": 11.9018, + "step": 26668 + }, + { + "epoch": 1.4522336348723395, + "grad_norm": 0.5724251951825899, + "learning_rate": 3.682817557108592e-05, + "loss": 11.9484, + "step": 26669 + }, + { + "epoch": 1.4522880888689225, + "grad_norm": 0.4889244917085866, + "learning_rate": 3.68213399199336e-05, + "loss": 11.8869, + "step": 26670 + }, + { + "epoch": 1.4523425428655055, + "grad_norm": 0.5889861321989771, + "learning_rate": 3.681450476006919e-05, + "loss": 11.8546, + "step": 26671 + }, + { + "epoch": 1.4523969968620885, + "grad_norm": 0.5525937298946881, + "learning_rate": 3.680767009154574e-05, + "loss": 11.8557, + "step": 26672 + }, + { + "epoch": 1.4524514508586714, + "grad_norm": 0.5281489612725787, + "learning_rate": 3.680083591441649e-05, + "loss": 11.8243, + "step": 26673 + }, + { + "epoch": 1.4525059048552544, + "grad_norm": 0.5640452792677921, + "learning_rate": 3.679400222873454e-05, + "loss": 12.0197, + "step": 26674 + }, + { + "epoch": 1.4525603588518374, + "grad_norm": 0.5531250123575686, + "learning_rate": 3.678716903455298e-05, + "loss": 11.909, + "step": 26675 + }, + { + "epoch": 1.4526148128484204, + "grad_norm": 0.516898248232284, + "learning_rate": 3.6780336331925035e-05, + "loss": 11.9252, + "step": 26676 + }, + { + "epoch": 1.4526692668450036, + "grad_norm": 0.5915812540439752, + "learning_rate": 3.677350412090377e-05, + "loss": 11.8243, + "step": 26677 + }, + { + "epoch": 1.4527237208415866, + "grad_norm": 0.5657633122548344, + "learning_rate": 3.676667240154236e-05, + "loss": 11.8663, + "step": 26678 + }, + { + "epoch": 1.4527781748381696, + "grad_norm": 0.5015793374178141, + "learning_rate": 3.6759841173893884e-05, + "loss": 11.7767, + "step": 26679 + }, + { + "epoch": 1.4528326288347526, + "grad_norm": 0.5368184892830379, + "learning_rate": 3.6753010438011524e-05, + "loss": 11.9265, + "step": 26680 + }, + { + "epoch": 1.4528870828313356, + "grad_norm": 0.578257864527884, + "learning_rate": 3.674618019394832e-05, + "loss": 12.0579, + "step": 26681 + }, + { + "epoch": 1.4529415368279186, + "grad_norm": 0.550452838930861, + "learning_rate": 3.6739350441757436e-05, + "loss": 11.8322, + "step": 26682 + }, + { + "epoch": 1.4529959908245016, + "grad_norm": 0.5818041465059394, + "learning_rate": 3.6732521181492e-05, + "loss": 11.957, + "step": 26683 + }, + { + "epoch": 1.4530504448210846, + "grad_norm": 0.49841726471481923, + "learning_rate": 3.672569241320509e-05, + "loss": 11.9075, + "step": 26684 + }, + { + "epoch": 1.4531048988176676, + "grad_norm": 0.5062898364607856, + "learning_rate": 3.671886413694977e-05, + "loss": 11.7822, + "step": 26685 + }, + { + "epoch": 1.4531593528142506, + "grad_norm": 0.5812687075027585, + "learning_rate": 3.671203635277921e-05, + "loss": 11.9392, + "step": 26686 + }, + { + "epoch": 1.4532138068108336, + "grad_norm": 0.5438822429058972, + "learning_rate": 3.670520906074644e-05, + "loss": 12.0322, + "step": 26687 + }, + { + "epoch": 1.4532682608074166, + "grad_norm": 0.6587171183868992, + "learning_rate": 3.6698382260904605e-05, + "loss": 11.8866, + "step": 26688 + }, + { + "epoch": 1.4533227148039995, + "grad_norm": 0.6229946243171511, + "learning_rate": 3.669155595330673e-05, + "loss": 11.8018, + "step": 26689 + }, + { + "epoch": 1.4533771688005825, + "grad_norm": 0.5237312266741596, + "learning_rate": 3.668473013800599e-05, + "loss": 11.7315, + "step": 26690 + }, + { + "epoch": 1.4534316227971655, + "grad_norm": 0.5458746431351207, + "learning_rate": 3.667790481505534e-05, + "loss": 11.8983, + "step": 26691 + }, + { + "epoch": 1.4534860767937487, + "grad_norm": 0.5293096620609764, + "learning_rate": 3.667107998450794e-05, + "loss": 11.8849, + "step": 26692 + }, + { + "epoch": 1.4535405307903317, + "grad_norm": 0.5801932421878874, + "learning_rate": 3.666425564641687e-05, + "loss": 11.8451, + "step": 26693 + }, + { + "epoch": 1.4535949847869147, + "grad_norm": 0.5618362862651421, + "learning_rate": 3.665743180083517e-05, + "loss": 11.9924, + "step": 26694 + }, + { + "epoch": 1.4536494387834977, + "grad_norm": 0.5225101256088879, + "learning_rate": 3.6650608447815904e-05, + "loss": 11.7398, + "step": 26695 + }, + { + "epoch": 1.4537038927800807, + "grad_norm": 0.560091871139057, + "learning_rate": 3.6643785587412106e-05, + "loss": 11.764, + "step": 26696 + }, + { + "epoch": 1.4537583467766637, + "grad_norm": 0.49389084916806575, + "learning_rate": 3.6636963219676843e-05, + "loss": 11.7378, + "step": 26697 + }, + { + "epoch": 1.4538128007732467, + "grad_norm": 0.5167474388162381, + "learning_rate": 3.6630141344663214e-05, + "loss": 11.8895, + "step": 26698 + }, + { + "epoch": 1.4538672547698297, + "grad_norm": 0.5112341266163132, + "learning_rate": 3.662331996242421e-05, + "loss": 11.7585, + "step": 26699 + }, + { + "epoch": 1.453921708766413, + "grad_norm": 0.6146450847183496, + "learning_rate": 3.661649907301292e-05, + "loss": 11.8389, + "step": 26700 + }, + { + "epoch": 1.453976162762996, + "grad_norm": 0.5976576909184441, + "learning_rate": 3.660967867648234e-05, + "loss": 11.934, + "step": 26701 + }, + { + "epoch": 1.454030616759579, + "grad_norm": 0.5418991571391522, + "learning_rate": 3.660285877288555e-05, + "loss": 12.0553, + "step": 26702 + }, + { + "epoch": 1.454085070756162, + "grad_norm": 0.515063518740225, + "learning_rate": 3.659603936227554e-05, + "loss": 11.8825, + "step": 26703 + }, + { + "epoch": 1.4541395247527449, + "grad_norm": 0.6161357065413106, + "learning_rate": 3.658922044470538e-05, + "loss": 11.9571, + "step": 26704 + }, + { + "epoch": 1.4541939787493279, + "grad_norm": 0.5561470406601793, + "learning_rate": 3.658240202022809e-05, + "loss": 11.8942, + "step": 26705 + }, + { + "epoch": 1.4542484327459109, + "grad_norm": 0.5795350217089544, + "learning_rate": 3.657558408889664e-05, + "loss": 11.9671, + "step": 26706 + }, + { + "epoch": 1.4543028867424939, + "grad_norm": 0.6153799742189493, + "learning_rate": 3.6568766650764085e-05, + "loss": 11.8301, + "step": 26707 + }, + { + "epoch": 1.4543573407390769, + "grad_norm": 0.5661994072207569, + "learning_rate": 3.656194970588347e-05, + "loss": 11.7437, + "step": 26708 + }, + { + "epoch": 1.4544117947356598, + "grad_norm": 0.5755120456972027, + "learning_rate": 3.655513325430774e-05, + "loss": 11.8081, + "step": 26709 + }, + { + "epoch": 1.4544662487322428, + "grad_norm": 0.5819433422622243, + "learning_rate": 3.6548317296089966e-05, + "loss": 11.7607, + "step": 26710 + }, + { + "epoch": 1.4545207027288258, + "grad_norm": 0.5758021353347903, + "learning_rate": 3.6541501831283086e-05, + "loss": 11.8294, + "step": 26711 + }, + { + "epoch": 1.4545751567254088, + "grad_norm": 0.5337121951230486, + "learning_rate": 3.653468685994016e-05, + "loss": 12.0003, + "step": 26712 + }, + { + "epoch": 1.4546296107219918, + "grad_norm": 0.5368404474628399, + "learning_rate": 3.6527872382114116e-05, + "loss": 11.5876, + "step": 26713 + }, + { + "epoch": 1.4546840647185748, + "grad_norm": 0.5330001295476007, + "learning_rate": 3.652105839785802e-05, + "loss": 11.7357, + "step": 26714 + }, + { + "epoch": 1.454738518715158, + "grad_norm": 0.5481479255822557, + "learning_rate": 3.651424490722481e-05, + "loss": 11.8336, + "step": 26715 + }, + { + "epoch": 1.454792972711741, + "grad_norm": 0.5863006525049337, + "learning_rate": 3.6507431910267454e-05, + "loss": 12.0167, + "step": 26716 + }, + { + "epoch": 1.454847426708324, + "grad_norm": 0.5850871674472374, + "learning_rate": 3.650061940703899e-05, + "loss": 11.8807, + "step": 26717 + }, + { + "epoch": 1.454901880704907, + "grad_norm": 0.5020447523837402, + "learning_rate": 3.649380739759232e-05, + "loss": 11.9299, + "step": 26718 + }, + { + "epoch": 1.45495633470149, + "grad_norm": 0.5603533755896191, + "learning_rate": 3.6486995881980454e-05, + "loss": 11.7836, + "step": 26719 + }, + { + "epoch": 1.455010788698073, + "grad_norm": 0.49994528514717607, + "learning_rate": 3.648018486025639e-05, + "loss": 11.8348, + "step": 26720 + }, + { + "epoch": 1.455065242694656, + "grad_norm": 0.5114811811329905, + "learning_rate": 3.647337433247304e-05, + "loss": 11.7676, + "step": 26721 + }, + { + "epoch": 1.455119696691239, + "grad_norm": 0.4767098339040379, + "learning_rate": 3.64665642986834e-05, + "loss": 11.8318, + "step": 26722 + }, + { + "epoch": 1.455174150687822, + "grad_norm": 0.5873454360190589, + "learning_rate": 3.645975475894039e-05, + "loss": 11.8242, + "step": 26723 + }, + { + "epoch": 1.4552286046844052, + "grad_norm": 0.7365266586157547, + "learning_rate": 3.645294571329702e-05, + "loss": 11.8758, + "step": 26724 + }, + { + "epoch": 1.4552830586809882, + "grad_norm": 0.6174881721673229, + "learning_rate": 3.6446137161806194e-05, + "loss": 12.0069, + "step": 26725 + }, + { + "epoch": 1.4553375126775712, + "grad_norm": 0.5699336506367176, + "learning_rate": 3.6439329104520824e-05, + "loss": 11.9938, + "step": 26726 + }, + { + "epoch": 1.4553919666741542, + "grad_norm": 0.5293799174951075, + "learning_rate": 3.6432521541493924e-05, + "loss": 11.9106, + "step": 26727 + }, + { + "epoch": 1.4554464206707372, + "grad_norm": 0.5537992367636208, + "learning_rate": 3.642571447277837e-05, + "loss": 11.7465, + "step": 26728 + }, + { + "epoch": 1.4555008746673201, + "grad_norm": 0.4849929869678965, + "learning_rate": 3.6418907898427156e-05, + "loss": 11.8867, + "step": 26729 + }, + { + "epoch": 1.4555553286639031, + "grad_norm": 0.585787101802056, + "learning_rate": 3.641210181849314e-05, + "loss": 11.8157, + "step": 26730 + }, + { + "epoch": 1.4556097826604861, + "grad_norm": 0.6370423223181031, + "learning_rate": 3.6405296233029285e-05, + "loss": 11.9242, + "step": 26731 + }, + { + "epoch": 1.4556642366570691, + "grad_norm": 0.5208407888828812, + "learning_rate": 3.639849114208854e-05, + "loss": 11.7121, + "step": 26732 + }, + { + "epoch": 1.4557186906536521, + "grad_norm": 0.5123049212917185, + "learning_rate": 3.6391686545723766e-05, + "loss": 11.7233, + "step": 26733 + }, + { + "epoch": 1.455773144650235, + "grad_norm": 0.5498272494814008, + "learning_rate": 3.638488244398795e-05, + "loss": 11.8731, + "step": 26734 + }, + { + "epoch": 1.455827598646818, + "grad_norm": 0.5633624620278386, + "learning_rate": 3.637807883693394e-05, + "loss": 11.8209, + "step": 26735 + }, + { + "epoch": 1.455882052643401, + "grad_norm": 0.5101358751773256, + "learning_rate": 3.6371275724614616e-05, + "loss": 11.6467, + "step": 26736 + }, + { + "epoch": 1.455936506639984, + "grad_norm": 0.6661543826322869, + "learning_rate": 3.6364473107082976e-05, + "loss": 11.8434, + "step": 26737 + }, + { + "epoch": 1.4559909606365673, + "grad_norm": 0.5377742590857267, + "learning_rate": 3.6357670984391825e-05, + "loss": 11.9791, + "step": 26738 + }, + { + "epoch": 1.4560454146331503, + "grad_norm": 0.6069873937588747, + "learning_rate": 3.635086935659412e-05, + "loss": 11.985, + "step": 26739 + }, + { + "epoch": 1.4560998686297333, + "grad_norm": 0.5805923894623883, + "learning_rate": 3.634406822374271e-05, + "loss": 11.7486, + "step": 26740 + }, + { + "epoch": 1.4561543226263163, + "grad_norm": 0.5034230426158893, + "learning_rate": 3.6337267585890486e-05, + "loss": 11.9536, + "step": 26741 + }, + { + "epoch": 1.4562087766228993, + "grad_norm": 0.5110394749315411, + "learning_rate": 3.63304674430904e-05, + "loss": 11.7933, + "step": 26742 + }, + { + "epoch": 1.4562632306194823, + "grad_norm": 0.5787534043271866, + "learning_rate": 3.632366779539522e-05, + "loss": 12.0233, + "step": 26743 + }, + { + "epoch": 1.4563176846160653, + "grad_norm": 0.542646975531092, + "learning_rate": 3.631686864285793e-05, + "loss": 11.8384, + "step": 26744 + }, + { + "epoch": 1.4563721386126482, + "grad_norm": 0.5042075821464146, + "learning_rate": 3.631006998553134e-05, + "loss": 11.8898, + "step": 26745 + }, + { + "epoch": 1.4564265926092312, + "grad_norm": 0.5677960509359683, + "learning_rate": 3.63032718234683e-05, + "loss": 11.8293, + "step": 26746 + }, + { + "epoch": 1.4564810466058145, + "grad_norm": 0.49353872529268183, + "learning_rate": 3.6296474156721725e-05, + "loss": 11.8039, + "step": 26747 + }, + { + "epoch": 1.4565355006023974, + "grad_norm": 0.5477240079817448, + "learning_rate": 3.628967698534441e-05, + "loss": 11.7792, + "step": 26748 + }, + { + "epoch": 1.4565899545989804, + "grad_norm": 0.5096398289991971, + "learning_rate": 3.62828803093893e-05, + "loss": 11.8037, + "step": 26749 + }, + { + "epoch": 1.4566444085955634, + "grad_norm": 0.5487537163745176, + "learning_rate": 3.627608412890916e-05, + "loss": 12.0059, + "step": 26750 + }, + { + "epoch": 1.4566988625921464, + "grad_norm": 0.5507765976045028, + "learning_rate": 3.6269288443956906e-05, + "loss": 11.9403, + "step": 26751 + }, + { + "epoch": 1.4567533165887294, + "grad_norm": 0.5494549162198724, + "learning_rate": 3.626249325458533e-05, + "loss": 11.8644, + "step": 26752 + }, + { + "epoch": 1.4568077705853124, + "grad_norm": 0.4974389737999597, + "learning_rate": 3.625569856084728e-05, + "loss": 11.7748, + "step": 26753 + }, + { + "epoch": 1.4568622245818954, + "grad_norm": 0.5486646637385234, + "learning_rate": 3.624890436279565e-05, + "loss": 11.8773, + "step": 26754 + }, + { + "epoch": 1.4569166785784784, + "grad_norm": 0.5552403145496668, + "learning_rate": 3.624211066048323e-05, + "loss": 11.8685, + "step": 26755 + }, + { + "epoch": 1.4569711325750614, + "grad_norm": 0.5307731225341632, + "learning_rate": 3.623531745396282e-05, + "loss": 11.8372, + "step": 26756 + }, + { + "epoch": 1.4570255865716444, + "grad_norm": 0.5385312329365134, + "learning_rate": 3.6228524743287294e-05, + "loss": 11.742, + "step": 26757 + }, + { + "epoch": 1.4570800405682274, + "grad_norm": 0.5731116936757621, + "learning_rate": 3.622173252850943e-05, + "loss": 11.869, + "step": 26758 + }, + { + "epoch": 1.4571344945648104, + "grad_norm": 0.5654345221512659, + "learning_rate": 3.621494080968211e-05, + "loss": 11.9001, + "step": 26759 + }, + { + "epoch": 1.4571889485613934, + "grad_norm": 0.5589460771174387, + "learning_rate": 3.620814958685807e-05, + "loss": 11.9373, + "step": 26760 + }, + { + "epoch": 1.4572434025579764, + "grad_norm": 0.5212287814667456, + "learning_rate": 3.6201358860090184e-05, + "loss": 11.8175, + "step": 26761 + }, + { + "epoch": 1.4572978565545596, + "grad_norm": 0.5560803817828982, + "learning_rate": 3.61945686294312e-05, + "loss": 12.0346, + "step": 26762 + }, + { + "epoch": 1.4573523105511426, + "grad_norm": 0.4815342156419329, + "learning_rate": 3.618777889493394e-05, + "loss": 11.8207, + "step": 26763 + }, + { + "epoch": 1.4574067645477256, + "grad_norm": 0.5340409393213523, + "learning_rate": 3.618098965665126e-05, + "loss": 11.8581, + "step": 26764 + }, + { + "epoch": 1.4574612185443085, + "grad_norm": 0.5808746622743467, + "learning_rate": 3.6174200914635904e-05, + "loss": 11.7477, + "step": 26765 + }, + { + "epoch": 1.4575156725408915, + "grad_norm": 0.5066700160259542, + "learning_rate": 3.616741266894067e-05, + "loss": 11.899, + "step": 26766 + }, + { + "epoch": 1.4575701265374745, + "grad_norm": 0.5024104816794057, + "learning_rate": 3.6160624919618304e-05, + "loss": 11.8207, + "step": 26767 + }, + { + "epoch": 1.4576245805340575, + "grad_norm": 0.6249859002216991, + "learning_rate": 3.6153837666721616e-05, + "loss": 11.919, + "step": 26768 + }, + { + "epoch": 1.4576790345306405, + "grad_norm": 0.5393837181351506, + "learning_rate": 3.614705091030344e-05, + "loss": 11.675, + "step": 26769 + }, + { + "epoch": 1.4577334885272237, + "grad_norm": 0.5552178948971118, + "learning_rate": 3.614026465041645e-05, + "loss": 11.8879, + "step": 26770 + }, + { + "epoch": 1.4577879425238067, + "grad_norm": 0.5082658302359733, + "learning_rate": 3.6133478887113525e-05, + "loss": 11.8018, + "step": 26771 + }, + { + "epoch": 1.4578423965203897, + "grad_norm": 0.5895837781272815, + "learning_rate": 3.612669362044734e-05, + "loss": 11.9849, + "step": 26772 + }, + { + "epoch": 1.4578968505169727, + "grad_norm": 0.5581416663970848, + "learning_rate": 3.611990885047073e-05, + "loss": 11.9058, + "step": 26773 + }, + { + "epoch": 1.4579513045135557, + "grad_norm": 0.5629753655164212, + "learning_rate": 3.6113124577236376e-05, + "loss": 11.7372, + "step": 26774 + }, + { + "epoch": 1.4580057585101387, + "grad_norm": 0.6344005842001699, + "learning_rate": 3.610634080079713e-05, + "loss": 12.0204, + "step": 26775 + }, + { + "epoch": 1.4580602125067217, + "grad_norm": 0.5377170799900914, + "learning_rate": 3.609955752120568e-05, + "loss": 11.9078, + "step": 26776 + }, + { + "epoch": 1.4581146665033047, + "grad_norm": 0.518073274344814, + "learning_rate": 3.609277473851477e-05, + "loss": 11.7801, + "step": 26777 + }, + { + "epoch": 1.4581691204998877, + "grad_norm": 0.5436874352859177, + "learning_rate": 3.6085992452777184e-05, + "loss": 11.8592, + "step": 26778 + }, + { + "epoch": 1.4582235744964707, + "grad_norm": 0.5310667102839995, + "learning_rate": 3.6079210664045607e-05, + "loss": 11.8141, + "step": 26779 + }, + { + "epoch": 1.4582780284930537, + "grad_norm": 0.586173361703436, + "learning_rate": 3.60724293723728e-05, + "loss": 11.759, + "step": 26780 + }, + { + "epoch": 1.4583324824896367, + "grad_norm": 0.5070016668348178, + "learning_rate": 3.606564857781154e-05, + "loss": 11.8323, + "step": 26781 + }, + { + "epoch": 1.4583869364862196, + "grad_norm": 0.5335407611867968, + "learning_rate": 3.605886828041449e-05, + "loss": 11.7087, + "step": 26782 + }, + { + "epoch": 1.4584413904828026, + "grad_norm": 0.6227259365432857, + "learning_rate": 3.605208848023444e-05, + "loss": 11.8679, + "step": 26783 + }, + { + "epoch": 1.4584958444793856, + "grad_norm": 0.4915145450358055, + "learning_rate": 3.6045309177324027e-05, + "loss": 11.7542, + "step": 26784 + }, + { + "epoch": 1.4585502984759688, + "grad_norm": 0.5394648993560738, + "learning_rate": 3.6038530371736055e-05, + "loss": 11.8166, + "step": 26785 + }, + { + "epoch": 1.4586047524725518, + "grad_norm": 0.5687555144585736, + "learning_rate": 3.603175206352321e-05, + "loss": 11.9645, + "step": 26786 + }, + { + "epoch": 1.4586592064691348, + "grad_norm": 0.5606517627161273, + "learning_rate": 3.602497425273813e-05, + "loss": 11.8242, + "step": 26787 + }, + { + "epoch": 1.4587136604657178, + "grad_norm": 0.5292711113321935, + "learning_rate": 3.6018196939433626e-05, + "loss": 11.8848, + "step": 26788 + }, + { + "epoch": 1.4587681144623008, + "grad_norm": 0.5183681193963549, + "learning_rate": 3.601142012366232e-05, + "loss": 11.9129, + "step": 26789 + }, + { + "epoch": 1.4588225684588838, + "grad_norm": 0.5621041526731029, + "learning_rate": 3.6004643805476943e-05, + "loss": 11.8582, + "step": 26790 + }, + { + "epoch": 1.4588770224554668, + "grad_norm": 0.5960845339392585, + "learning_rate": 3.599786798493021e-05, + "loss": 11.9627, + "step": 26791 + }, + { + "epoch": 1.4589314764520498, + "grad_norm": 0.5491270168168505, + "learning_rate": 3.5991092662074765e-05, + "loss": 11.7948, + "step": 26792 + }, + { + "epoch": 1.4589859304486328, + "grad_norm": 0.5597381496196345, + "learning_rate": 3.598431783696335e-05, + "loss": 11.8733, + "step": 26793 + }, + { + "epoch": 1.459040384445216, + "grad_norm": 0.5240365781155311, + "learning_rate": 3.597754350964858e-05, + "loss": 11.8348, + "step": 26794 + }, + { + "epoch": 1.459094838441799, + "grad_norm": 0.6225060645311038, + "learning_rate": 3.597076968018321e-05, + "loss": 11.9037, + "step": 26795 + }, + { + "epoch": 1.459149292438382, + "grad_norm": 0.5528228808901301, + "learning_rate": 3.596399634861987e-05, + "loss": 11.7331, + "step": 26796 + }, + { + "epoch": 1.459203746434965, + "grad_norm": 0.5245412508166607, + "learning_rate": 3.5957223515011195e-05, + "loss": 11.8954, + "step": 26797 + }, + { + "epoch": 1.459258200431548, + "grad_norm": 0.5355427766129638, + "learning_rate": 3.595045117940991e-05, + "loss": 11.8718, + "step": 26798 + }, + { + "epoch": 1.459312654428131, + "grad_norm": 0.528865987818189, + "learning_rate": 3.594367934186865e-05, + "loss": 11.8158, + "step": 26799 + }, + { + "epoch": 1.459367108424714, + "grad_norm": 0.5227031212702511, + "learning_rate": 3.5936908002440105e-05, + "loss": 11.8405, + "step": 26800 + }, + { + "epoch": 1.459421562421297, + "grad_norm": 0.6284403378354587, + "learning_rate": 3.593013716117687e-05, + "loss": 11.9101, + "step": 26801 + }, + { + "epoch": 1.45947601641788, + "grad_norm": 0.6028463324801152, + "learning_rate": 3.592336681813163e-05, + "loss": 11.8608, + "step": 26802 + }, + { + "epoch": 1.459530470414463, + "grad_norm": 0.5399263416332223, + "learning_rate": 3.591659697335707e-05, + "loss": 11.7132, + "step": 26803 + }, + { + "epoch": 1.459584924411046, + "grad_norm": 0.5004537076128194, + "learning_rate": 3.590982762690578e-05, + "loss": 11.8438, + "step": 26804 + }, + { + "epoch": 1.459639378407629, + "grad_norm": 0.505222743084213, + "learning_rate": 3.5903058778830434e-05, + "loss": 11.9408, + "step": 26805 + }, + { + "epoch": 1.459693832404212, + "grad_norm": 0.6122278623686817, + "learning_rate": 3.5896290429183656e-05, + "loss": 11.9357, + "step": 26806 + }, + { + "epoch": 1.459748286400795, + "grad_norm": 0.5553954049374712, + "learning_rate": 3.5889522578018044e-05, + "loss": 11.8876, + "step": 26807 + }, + { + "epoch": 1.4598027403973781, + "grad_norm": 0.5401387797359927, + "learning_rate": 3.588275522538629e-05, + "loss": 11.9887, + "step": 26808 + }, + { + "epoch": 1.459857194393961, + "grad_norm": 0.5533280896905917, + "learning_rate": 3.587598837134095e-05, + "loss": 11.9955, + "step": 26809 + }, + { + "epoch": 1.459911648390544, + "grad_norm": 0.6550990427751122, + "learning_rate": 3.5869222015934715e-05, + "loss": 11.8289, + "step": 26810 + }, + { + "epoch": 1.459966102387127, + "grad_norm": 0.587633376029004, + "learning_rate": 3.5862456159220114e-05, + "loss": 11.8928, + "step": 26811 + }, + { + "epoch": 1.46002055638371, + "grad_norm": 0.557811769610465, + "learning_rate": 3.585569080124983e-05, + "loss": 11.8358, + "step": 26812 + }, + { + "epoch": 1.460075010380293, + "grad_norm": 0.5273554918722372, + "learning_rate": 3.584892594207647e-05, + "loss": 11.7675, + "step": 26813 + }, + { + "epoch": 1.460129464376876, + "grad_norm": 0.5055439436397466, + "learning_rate": 3.5842161581752596e-05, + "loss": 11.8395, + "step": 26814 + }, + { + "epoch": 1.460183918373459, + "grad_norm": 0.5862420016312873, + "learning_rate": 3.58353977203309e-05, + "loss": 11.9083, + "step": 26815 + }, + { + "epoch": 1.460238372370042, + "grad_norm": 0.5139011935152126, + "learning_rate": 3.5828634357863835e-05, + "loss": 11.9216, + "step": 26816 + }, + { + "epoch": 1.4602928263666253, + "grad_norm": 0.5439183726164122, + "learning_rate": 3.582187149440408e-05, + "loss": 11.8978, + "step": 26817 + }, + { + "epoch": 1.4603472803632083, + "grad_norm": 0.5520476594550109, + "learning_rate": 3.581510913000424e-05, + "loss": 11.9112, + "step": 26818 + }, + { + "epoch": 1.4604017343597913, + "grad_norm": 0.5260466044548181, + "learning_rate": 3.580834726471686e-05, + "loss": 11.7303, + "step": 26819 + }, + { + "epoch": 1.4604561883563743, + "grad_norm": 0.4896045580720908, + "learning_rate": 3.5801585898594545e-05, + "loss": 11.8574, + "step": 26820 + }, + { + "epoch": 1.4605106423529572, + "grad_norm": 0.5525963234266409, + "learning_rate": 3.579482503168985e-05, + "loss": 11.9571, + "step": 26821 + }, + { + "epoch": 1.4605650963495402, + "grad_norm": 0.5674735658381522, + "learning_rate": 3.5788064664055384e-05, + "loss": 11.9662, + "step": 26822 + }, + { + "epoch": 1.4606195503461232, + "grad_norm": 0.5309966560276173, + "learning_rate": 3.578130479574367e-05, + "loss": 11.8387, + "step": 26823 + }, + { + "epoch": 1.4606740043427062, + "grad_norm": 0.6262015901906925, + "learning_rate": 3.5774545426807296e-05, + "loss": 11.9699, + "step": 26824 + }, + { + "epoch": 1.4607284583392892, + "grad_norm": 0.5931571144835259, + "learning_rate": 3.57677865572989e-05, + "loss": 11.8291, + "step": 26825 + }, + { + "epoch": 1.4607829123358722, + "grad_norm": 0.5761147142916926, + "learning_rate": 3.5761028187270896e-05, + "loss": 11.8606, + "step": 26826 + }, + { + "epoch": 1.4608373663324552, + "grad_norm": 0.5143222559023625, + "learning_rate": 3.575427031677594e-05, + "loss": 11.8135, + "step": 26827 + }, + { + "epoch": 1.4608918203290382, + "grad_norm": 0.5204746049835881, + "learning_rate": 3.574751294586652e-05, + "loss": 11.9793, + "step": 26828 + }, + { + "epoch": 1.4609462743256212, + "grad_norm": 0.5909807716816972, + "learning_rate": 3.574075607459522e-05, + "loss": 11.751, + "step": 26829 + }, + { + "epoch": 1.4610007283222042, + "grad_norm": 0.6470226609947656, + "learning_rate": 3.5733999703014596e-05, + "loss": 11.9792, + "step": 26830 + }, + { + "epoch": 1.4610551823187872, + "grad_norm": 0.5570834055776428, + "learning_rate": 3.572724383117715e-05, + "loss": 11.8176, + "step": 26831 + }, + { + "epoch": 1.4611096363153704, + "grad_norm": 0.5649935117040239, + "learning_rate": 3.572048845913546e-05, + "loss": 11.872, + "step": 26832 + }, + { + "epoch": 1.4611640903119534, + "grad_norm": 0.5690616025433386, + "learning_rate": 3.5713733586942e-05, + "loss": 11.8906, + "step": 26833 + }, + { + "epoch": 1.4612185443085364, + "grad_norm": 0.5559128719296456, + "learning_rate": 3.570697921464936e-05, + "loss": 11.8206, + "step": 26834 + }, + { + "epoch": 1.4612729983051194, + "grad_norm": 0.568355227035462, + "learning_rate": 3.5700225342310045e-05, + "loss": 11.8729, + "step": 26835 + }, + { + "epoch": 1.4613274523017024, + "grad_norm": 0.5144879399681113, + "learning_rate": 3.5693471969976514e-05, + "loss": 11.8475, + "step": 26836 + }, + { + "epoch": 1.4613819062982853, + "grad_norm": 0.5461364187896164, + "learning_rate": 3.568671909770136e-05, + "loss": 11.8737, + "step": 26837 + }, + { + "epoch": 1.4614363602948683, + "grad_norm": 0.5846842650329348, + "learning_rate": 3.567996672553704e-05, + "loss": 11.7958, + "step": 26838 + }, + { + "epoch": 1.4614908142914513, + "grad_norm": 0.5830549694521869, + "learning_rate": 3.5673214853536074e-05, + "loss": 11.9108, + "step": 26839 + }, + { + "epoch": 1.4615452682880345, + "grad_norm": 0.5879849804474622, + "learning_rate": 3.566646348175102e-05, + "loss": 11.8296, + "step": 26840 + }, + { + "epoch": 1.4615997222846175, + "grad_norm": 0.5461265208976671, + "learning_rate": 3.5659712610234295e-05, + "loss": 11.9402, + "step": 26841 + }, + { + "epoch": 1.4616541762812005, + "grad_norm": 0.609963121610458, + "learning_rate": 3.565296223903848e-05, + "loss": 11.9677, + "step": 26842 + }, + { + "epoch": 1.4617086302777835, + "grad_norm": 0.5175958788029443, + "learning_rate": 3.564621236821597e-05, + "loss": 11.8477, + "step": 26843 + }, + { + "epoch": 1.4617630842743665, + "grad_norm": 0.5896009516127384, + "learning_rate": 3.563946299781935e-05, + "loss": 11.8115, + "step": 26844 + }, + { + "epoch": 1.4618175382709495, + "grad_norm": 0.5422785192947653, + "learning_rate": 3.5632714127901054e-05, + "loss": 11.9648, + "step": 26845 + }, + { + "epoch": 1.4618719922675325, + "grad_norm": 0.4731157840095486, + "learning_rate": 3.562596575851354e-05, + "loss": 11.7739, + "step": 26846 + }, + { + "epoch": 1.4619264462641155, + "grad_norm": 0.5296582183190118, + "learning_rate": 3.5619217889709346e-05, + "loss": 11.8756, + "step": 26847 + }, + { + "epoch": 1.4619809002606985, + "grad_norm": 0.5278908416265546, + "learning_rate": 3.561247052154086e-05, + "loss": 11.7552, + "step": 26848 + }, + { + "epoch": 1.4620353542572815, + "grad_norm": 0.5306894901091312, + "learning_rate": 3.5605723654060654e-05, + "loss": 11.8228, + "step": 26849 + }, + { + "epoch": 1.4620898082538645, + "grad_norm": 0.5127144944073668, + "learning_rate": 3.55989772873211e-05, + "loss": 11.9017, + "step": 26850 + }, + { + "epoch": 1.4621442622504475, + "grad_norm": 0.5350098128136271, + "learning_rate": 3.55922314213747e-05, + "loss": 11.879, + "step": 26851 + }, + { + "epoch": 1.4621987162470305, + "grad_norm": 0.5825422245832014, + "learning_rate": 3.5585486056273943e-05, + "loss": 11.9091, + "step": 26852 + }, + { + "epoch": 1.4622531702436135, + "grad_norm": 0.5155547722424547, + "learning_rate": 3.557874119207121e-05, + "loss": 11.8226, + "step": 26853 + }, + { + "epoch": 1.4623076242401964, + "grad_norm": 0.5701044575689473, + "learning_rate": 3.557199682881902e-05, + "loss": 11.8285, + "step": 26854 + }, + { + "epoch": 1.4623620782367797, + "grad_norm": 0.5585870580704279, + "learning_rate": 3.556525296656979e-05, + "loss": 11.8385, + "step": 26855 + }, + { + "epoch": 1.4624165322333627, + "grad_norm": 0.5699723718271553, + "learning_rate": 3.555850960537593e-05, + "loss": 11.8206, + "step": 26856 + }, + { + "epoch": 1.4624709862299456, + "grad_norm": 0.5263886877754508, + "learning_rate": 3.555176674528994e-05, + "loss": 11.8327, + "step": 26857 + }, + { + "epoch": 1.4625254402265286, + "grad_norm": 0.5653618440115944, + "learning_rate": 3.554502438636419e-05, + "loss": 11.8992, + "step": 26858 + }, + { + "epoch": 1.4625798942231116, + "grad_norm": 0.5660923805417012, + "learning_rate": 3.553828252865117e-05, + "loss": 11.8532, + "step": 26859 + }, + { + "epoch": 1.4626343482196946, + "grad_norm": 0.5369221715785782, + "learning_rate": 3.553154117220323e-05, + "loss": 11.8881, + "step": 26860 + }, + { + "epoch": 1.4626888022162776, + "grad_norm": 0.5552046678845857, + "learning_rate": 3.552480031707285e-05, + "loss": 11.8117, + "step": 26861 + }, + { + "epoch": 1.4627432562128606, + "grad_norm": 0.5309263407310856, + "learning_rate": 3.551805996331247e-05, + "loss": 11.8196, + "step": 26862 + }, + { + "epoch": 1.4627977102094436, + "grad_norm": 0.5603158701294209, + "learning_rate": 3.551132011097442e-05, + "loss": 11.928, + "step": 26863 + }, + { + "epoch": 1.4628521642060268, + "grad_norm": 0.5828384033150691, + "learning_rate": 3.55045807601112e-05, + "loss": 11.7837, + "step": 26864 + }, + { + "epoch": 1.4629066182026098, + "grad_norm": 0.5465292314854292, + "learning_rate": 3.549784191077519e-05, + "loss": 11.912, + "step": 26865 + }, + { + "epoch": 1.4629610721991928, + "grad_norm": 0.5273812907053981, + "learning_rate": 3.549110356301873e-05, + "loss": 11.895, + "step": 26866 + }, + { + "epoch": 1.4630155261957758, + "grad_norm": 0.5416311755046903, + "learning_rate": 3.548436571689431e-05, + "loss": 11.9237, + "step": 26867 + }, + { + "epoch": 1.4630699801923588, + "grad_norm": 0.5237906236370793, + "learning_rate": 3.547762837245424e-05, + "loss": 11.7981, + "step": 26868 + }, + { + "epoch": 1.4631244341889418, + "grad_norm": 0.5410911256517951, + "learning_rate": 3.547089152975098e-05, + "loss": 11.7256, + "step": 26869 + }, + { + "epoch": 1.4631788881855248, + "grad_norm": 0.497770829428523, + "learning_rate": 3.546415518883687e-05, + "loss": 11.8333, + "step": 26870 + }, + { + "epoch": 1.4632333421821078, + "grad_norm": 0.5575192326980732, + "learning_rate": 3.545741934976434e-05, + "loss": 11.8322, + "step": 26871 + }, + { + "epoch": 1.4632877961786908, + "grad_norm": 0.5396793254369858, + "learning_rate": 3.545068401258571e-05, + "loss": 11.8685, + "step": 26872 + }, + { + "epoch": 1.4633422501752738, + "grad_norm": 0.5514470270430754, + "learning_rate": 3.544394917735337e-05, + "loss": 11.8436, + "step": 26873 + }, + { + "epoch": 1.4633967041718567, + "grad_norm": 0.5739962542643732, + "learning_rate": 3.543721484411976e-05, + "loss": 11.8729, + "step": 26874 + }, + { + "epoch": 1.4634511581684397, + "grad_norm": 0.5346187295383611, + "learning_rate": 3.543048101293719e-05, + "loss": 11.9297, + "step": 26875 + }, + { + "epoch": 1.4635056121650227, + "grad_norm": 0.5533855270475371, + "learning_rate": 3.5423747683857986e-05, + "loss": 11.8589, + "step": 26876 + }, + { + "epoch": 1.4635600661616057, + "grad_norm": 0.5170854857149493, + "learning_rate": 3.5417014856934595e-05, + "loss": 11.9063, + "step": 26877 + }, + { + "epoch": 1.463614520158189, + "grad_norm": 0.5254200164626402, + "learning_rate": 3.541028253221929e-05, + "loss": 11.8278, + "step": 26878 + }, + { + "epoch": 1.463668974154772, + "grad_norm": 0.5644971590330566, + "learning_rate": 3.5403550709764486e-05, + "loss": 11.9211, + "step": 26879 + }, + { + "epoch": 1.463723428151355, + "grad_norm": 0.554265191206821, + "learning_rate": 3.539681938962248e-05, + "loss": 11.8722, + "step": 26880 + }, + { + "epoch": 1.463777882147938, + "grad_norm": 0.5473133514142409, + "learning_rate": 3.539008857184567e-05, + "loss": 11.8108, + "step": 26881 + }, + { + "epoch": 1.463832336144521, + "grad_norm": 0.5564749672755559, + "learning_rate": 3.538335825648633e-05, + "loss": 11.9022, + "step": 26882 + }, + { + "epoch": 1.463886790141104, + "grad_norm": 0.5924881033935094, + "learning_rate": 3.537662844359687e-05, + "loss": 11.88, + "step": 26883 + }, + { + "epoch": 1.463941244137687, + "grad_norm": 0.5988835492623468, + "learning_rate": 3.5369899133229554e-05, + "loss": 11.982, + "step": 26884 + }, + { + "epoch": 1.4639956981342699, + "grad_norm": 0.4902957174226832, + "learning_rate": 3.536317032543673e-05, + "loss": 11.7601, + "step": 26885 + }, + { + "epoch": 1.4640501521308529, + "grad_norm": 0.5221769566812132, + "learning_rate": 3.535644202027081e-05, + "loss": 11.8759, + "step": 26886 + }, + { + "epoch": 1.464104606127436, + "grad_norm": 0.5963895010595072, + "learning_rate": 3.534971421778397e-05, + "loss": 11.8333, + "step": 26887 + }, + { + "epoch": 1.464159060124019, + "grad_norm": 0.5724937814045064, + "learning_rate": 3.5342986918028584e-05, + "loss": 11.8156, + "step": 26888 + }, + { + "epoch": 1.464213514120602, + "grad_norm": 0.5734674796494019, + "learning_rate": 3.533626012105702e-05, + "loss": 11.9431, + "step": 26889 + }, + { + "epoch": 1.464267968117185, + "grad_norm": 0.5768639096487074, + "learning_rate": 3.532953382692151e-05, + "loss": 11.8823, + "step": 26890 + }, + { + "epoch": 1.464322422113768, + "grad_norm": 0.5828853823118294, + "learning_rate": 3.5322808035674414e-05, + "loss": 11.8061, + "step": 26891 + }, + { + "epoch": 1.464376876110351, + "grad_norm": 0.6427780669610677, + "learning_rate": 3.5316082747367984e-05, + "loss": 11.8337, + "step": 26892 + }, + { + "epoch": 1.464431330106934, + "grad_norm": 0.532718269284918, + "learning_rate": 3.530935796205457e-05, + "loss": 11.9549, + "step": 26893 + }, + { + "epoch": 1.464485784103517, + "grad_norm": 0.5845266960964168, + "learning_rate": 3.53026336797864e-05, + "loss": 11.9907, + "step": 26894 + }, + { + "epoch": 1.4645402381001, + "grad_norm": 0.5820803008122897, + "learning_rate": 3.529590990061581e-05, + "loss": 11.9355, + "step": 26895 + }, + { + "epoch": 1.464594692096683, + "grad_norm": 0.553904992996588, + "learning_rate": 3.5289186624595125e-05, + "loss": 11.8495, + "step": 26896 + }, + { + "epoch": 1.464649146093266, + "grad_norm": 0.502351866275459, + "learning_rate": 3.528246385177653e-05, + "loss": 11.8297, + "step": 26897 + }, + { + "epoch": 1.464703600089849, + "grad_norm": 0.5226287251745798, + "learning_rate": 3.5275741582212376e-05, + "loss": 11.8943, + "step": 26898 + }, + { + "epoch": 1.464758054086432, + "grad_norm": 0.5174506947448685, + "learning_rate": 3.526901981595487e-05, + "loss": 11.8376, + "step": 26899 + }, + { + "epoch": 1.464812508083015, + "grad_norm": 0.5745438889851867, + "learning_rate": 3.526229855305633e-05, + "loss": 11.875, + "step": 26900 + }, + { + "epoch": 1.4648669620795982, + "grad_norm": 0.5942905440575059, + "learning_rate": 3.525557779356904e-05, + "loss": 11.9326, + "step": 26901 + }, + { + "epoch": 1.4649214160761812, + "grad_norm": 0.5543504431445094, + "learning_rate": 3.52488575375452e-05, + "loss": 11.8438, + "step": 26902 + }, + { + "epoch": 1.4649758700727642, + "grad_norm": 0.6329014310092056, + "learning_rate": 3.524213778503714e-05, + "loss": 12.005, + "step": 26903 + }, + { + "epoch": 1.4650303240693472, + "grad_norm": 0.5071093456661097, + "learning_rate": 3.523541853609704e-05, + "loss": 11.6483, + "step": 26904 + }, + { + "epoch": 1.4650847780659302, + "grad_norm": 0.5729737739930458, + "learning_rate": 3.522869979077723e-05, + "loss": 11.9562, + "step": 26905 + }, + { + "epoch": 1.4651392320625132, + "grad_norm": 0.4999666939485268, + "learning_rate": 3.52219815491299e-05, + "loss": 11.8352, + "step": 26906 + }, + { + "epoch": 1.4651936860590962, + "grad_norm": 0.5454996946055474, + "learning_rate": 3.5215263811207276e-05, + "loss": 11.8997, + "step": 26907 + }, + { + "epoch": 1.4652481400556792, + "grad_norm": 0.5751158072911432, + "learning_rate": 3.5208546577061666e-05, + "loss": 11.7981, + "step": 26908 + }, + { + "epoch": 1.4653025940522622, + "grad_norm": 0.6659713664357423, + "learning_rate": 3.520182984674522e-05, + "loss": 11.8795, + "step": 26909 + }, + { + "epoch": 1.4653570480488454, + "grad_norm": 0.5503720926795402, + "learning_rate": 3.519511362031021e-05, + "loss": 11.8338, + "step": 26910 + }, + { + "epoch": 1.4654115020454284, + "grad_norm": 0.5594942983638554, + "learning_rate": 3.518839789780891e-05, + "loss": 11.9687, + "step": 26911 + }, + { + "epoch": 1.4654659560420114, + "grad_norm": 0.5921855345390399, + "learning_rate": 3.518168267929345e-05, + "loss": 11.8966, + "step": 26912 + }, + { + "epoch": 1.4655204100385943, + "grad_norm": 0.5396709006485998, + "learning_rate": 3.517496796481614e-05, + "loss": 11.9207, + "step": 26913 + }, + { + "epoch": 1.4655748640351773, + "grad_norm": 0.5722722756055318, + "learning_rate": 3.516825375442912e-05, + "loss": 11.943, + "step": 26914 + }, + { + "epoch": 1.4656293180317603, + "grad_norm": 0.541991721209282, + "learning_rate": 3.516154004818465e-05, + "loss": 11.8641, + "step": 26915 + }, + { + "epoch": 1.4656837720283433, + "grad_norm": 0.5808604813469752, + "learning_rate": 3.5154826846134925e-05, + "loss": 11.777, + "step": 26916 + }, + { + "epoch": 1.4657382260249263, + "grad_norm": 0.5337465519442687, + "learning_rate": 3.51481141483321e-05, + "loss": 11.8277, + "step": 26917 + }, + { + "epoch": 1.4657926800215093, + "grad_norm": 0.5614730591651275, + "learning_rate": 3.514140195482846e-05, + "loss": 11.8785, + "step": 26918 + }, + { + "epoch": 1.4658471340180923, + "grad_norm": 0.5212476666733246, + "learning_rate": 3.513469026567612e-05, + "loss": 11.8603, + "step": 26919 + }, + { + "epoch": 1.4659015880146753, + "grad_norm": 0.5539456935912866, + "learning_rate": 3.512797908092733e-05, + "loss": 11.8429, + "step": 26920 + }, + { + "epoch": 1.4659560420112583, + "grad_norm": 0.5525381910997136, + "learning_rate": 3.5121268400634235e-05, + "loss": 11.9267, + "step": 26921 + }, + { + "epoch": 1.4660104960078413, + "grad_norm": 0.5257762624339773, + "learning_rate": 3.5114558224849025e-05, + "loss": 11.8649, + "step": 26922 + }, + { + "epoch": 1.4660649500044243, + "grad_norm": 0.5861112674124763, + "learning_rate": 3.510784855362392e-05, + "loss": 11.9031, + "step": 26923 + }, + { + "epoch": 1.4661194040010073, + "grad_norm": 0.5735585913197729, + "learning_rate": 3.510113938701105e-05, + "loss": 11.8264, + "step": 26924 + }, + { + "epoch": 1.4661738579975905, + "grad_norm": 0.5952942463803762, + "learning_rate": 3.509443072506263e-05, + "loss": 11.8942, + "step": 26925 + }, + { + "epoch": 1.4662283119941735, + "grad_norm": 0.5060379458745126, + "learning_rate": 3.5087722567830796e-05, + "loss": 11.833, + "step": 26926 + }, + { + "epoch": 1.4662827659907565, + "grad_norm": 0.5227254367275919, + "learning_rate": 3.508101491536769e-05, + "loss": 11.8399, + "step": 26927 + }, + { + "epoch": 1.4663372199873395, + "grad_norm": 0.5728525605740037, + "learning_rate": 3.507430776772553e-05, + "loss": 11.8202, + "step": 26928 + }, + { + "epoch": 1.4663916739839225, + "grad_norm": 0.578307121397152, + "learning_rate": 3.506760112495642e-05, + "loss": 11.9659, + "step": 26929 + }, + { + "epoch": 1.4664461279805054, + "grad_norm": 0.5611285752605445, + "learning_rate": 3.5060894987112544e-05, + "loss": 11.6997, + "step": 26930 + }, + { + "epoch": 1.4665005819770884, + "grad_norm": 0.6009530812971448, + "learning_rate": 3.5054189354246015e-05, + "loss": 11.8917, + "step": 26931 + }, + { + "epoch": 1.4665550359736714, + "grad_norm": 0.49966910990061286, + "learning_rate": 3.504748422640904e-05, + "loss": 11.8914, + "step": 26932 + }, + { + "epoch": 1.4666094899702546, + "grad_norm": 0.6037952896568908, + "learning_rate": 3.5040779603653684e-05, + "loss": 11.8515, + "step": 26933 + }, + { + "epoch": 1.4666639439668376, + "grad_norm": 0.566280631163193, + "learning_rate": 3.5034075486032115e-05, + "loss": 11.7609, + "step": 26934 + }, + { + "epoch": 1.4667183979634206, + "grad_norm": 0.571709024512032, + "learning_rate": 3.502737187359651e-05, + "loss": 11.8101, + "step": 26935 + }, + { + "epoch": 1.4667728519600036, + "grad_norm": 0.5679542079821897, + "learning_rate": 3.5020668766398955e-05, + "loss": 11.8582, + "step": 26936 + }, + { + "epoch": 1.4668273059565866, + "grad_norm": 0.5674774159137708, + "learning_rate": 3.501396616449154e-05, + "loss": 11.8548, + "step": 26937 + }, + { + "epoch": 1.4668817599531696, + "grad_norm": 0.538239445264612, + "learning_rate": 3.500726406792646e-05, + "loss": 11.8107, + "step": 26938 + }, + { + "epoch": 1.4669362139497526, + "grad_norm": 0.6377770860059915, + "learning_rate": 3.500056247675575e-05, + "loss": 11.749, + "step": 26939 + }, + { + "epoch": 1.4669906679463356, + "grad_norm": 0.563975129858514, + "learning_rate": 3.499386139103161e-05, + "loss": 11.7855, + "step": 26940 + }, + { + "epoch": 1.4670451219429186, + "grad_norm": 0.53408722706937, + "learning_rate": 3.498716081080607e-05, + "loss": 11.9532, + "step": 26941 + }, + { + "epoch": 1.4670995759395016, + "grad_norm": 0.5466688182286152, + "learning_rate": 3.49804607361313e-05, + "loss": 11.8466, + "step": 26942 + }, + { + "epoch": 1.4671540299360846, + "grad_norm": 0.6010772436464253, + "learning_rate": 3.4973761167059346e-05, + "loss": 11.9362, + "step": 26943 + }, + { + "epoch": 1.4672084839326676, + "grad_norm": 0.5392786398843316, + "learning_rate": 3.496706210364232e-05, + "loss": 11.8418, + "step": 26944 + }, + { + "epoch": 1.4672629379292506, + "grad_norm": 0.6659633762584934, + "learning_rate": 3.4960363545932376e-05, + "loss": 11.8143, + "step": 26945 + }, + { + "epoch": 1.4673173919258335, + "grad_norm": 0.6231163485999179, + "learning_rate": 3.495366549398154e-05, + "loss": 11.775, + "step": 26946 + }, + { + "epoch": 1.4673718459224165, + "grad_norm": 0.5094187662757288, + "learning_rate": 3.49469679478419e-05, + "loss": 11.818, + "step": 26947 + }, + { + "epoch": 1.4674262999189998, + "grad_norm": 0.5494230557980302, + "learning_rate": 3.494027090756552e-05, + "loss": 11.9304, + "step": 26948 + }, + { + "epoch": 1.4674807539155827, + "grad_norm": 0.5640995906812241, + "learning_rate": 3.4933574373204515e-05, + "loss": 11.9446, + "step": 26949 + }, + { + "epoch": 1.4675352079121657, + "grad_norm": 0.5499026076881367, + "learning_rate": 3.4926878344810964e-05, + "loss": 11.9071, + "step": 26950 + }, + { + "epoch": 1.4675896619087487, + "grad_norm": 0.5215088938960376, + "learning_rate": 3.4920182822436895e-05, + "loss": 11.8015, + "step": 26951 + }, + { + "epoch": 1.4676441159053317, + "grad_norm": 0.5455027911489024, + "learning_rate": 3.491348780613444e-05, + "loss": 11.8753, + "step": 26952 + }, + { + "epoch": 1.4676985699019147, + "grad_norm": 0.6132704465202785, + "learning_rate": 3.490679329595558e-05, + "loss": 11.9248, + "step": 26953 + }, + { + "epoch": 1.4677530238984977, + "grad_norm": 0.5588797520801592, + "learning_rate": 3.4900099291952435e-05, + "loss": 11.9458, + "step": 26954 + }, + { + "epoch": 1.4678074778950807, + "grad_norm": 0.5575655682178964, + "learning_rate": 3.4893405794177e-05, + "loss": 11.7532, + "step": 26955 + }, + { + "epoch": 1.4678619318916637, + "grad_norm": 0.5791931871644307, + "learning_rate": 3.4886712802681406e-05, + "loss": 11.888, + "step": 26956 + }, + { + "epoch": 1.467916385888247, + "grad_norm": 0.520258128178175, + "learning_rate": 3.4880020317517645e-05, + "loss": 11.8153, + "step": 26957 + }, + { + "epoch": 1.46797083988483, + "grad_norm": 0.5632911075780257, + "learning_rate": 3.4873328338737746e-05, + "loss": 11.9586, + "step": 26958 + }, + { + "epoch": 1.468025293881413, + "grad_norm": 0.5242617532428729, + "learning_rate": 3.4866636866393757e-05, + "loss": 11.8639, + "step": 26959 + }, + { + "epoch": 1.4680797478779959, + "grad_norm": 0.5768426935427615, + "learning_rate": 3.4859945900537746e-05, + "loss": 11.8118, + "step": 26960 + }, + { + "epoch": 1.4681342018745789, + "grad_norm": 0.5871673347753809, + "learning_rate": 3.485325544122171e-05, + "loss": 11.9576, + "step": 26961 + }, + { + "epoch": 1.4681886558711619, + "grad_norm": 0.5450289652016302, + "learning_rate": 3.484656548849771e-05, + "loss": 11.6982, + "step": 26962 + }, + { + "epoch": 1.4682431098677449, + "grad_norm": 0.5727478440550319, + "learning_rate": 3.4839876042417695e-05, + "loss": 11.8777, + "step": 26963 + }, + { + "epoch": 1.4682975638643279, + "grad_norm": 0.547667895974564, + "learning_rate": 3.483318710303378e-05, + "loss": 11.9015, + "step": 26964 + }, + { + "epoch": 1.4683520178609109, + "grad_norm": 0.5668453309871445, + "learning_rate": 3.482649867039789e-05, + "loss": 11.9573, + "step": 26965 + }, + { + "epoch": 1.4684064718574938, + "grad_norm": 0.5466427199304342, + "learning_rate": 3.481981074456211e-05, + "loss": 11.9304, + "step": 26966 + }, + { + "epoch": 1.4684609258540768, + "grad_norm": 0.5118188742957974, + "learning_rate": 3.4813123325578415e-05, + "loss": 11.7512, + "step": 26967 + }, + { + "epoch": 1.4685153798506598, + "grad_norm": 0.5745895014674879, + "learning_rate": 3.480643641349877e-05, + "loss": 11.9672, + "step": 26968 + }, + { + "epoch": 1.4685698338472428, + "grad_norm": 0.5358808814029702, + "learning_rate": 3.479975000837524e-05, + "loss": 11.8356, + "step": 26969 + }, + { + "epoch": 1.4686242878438258, + "grad_norm": 0.5066478546280682, + "learning_rate": 3.4793064110259755e-05, + "loss": 11.8727, + "step": 26970 + }, + { + "epoch": 1.468678741840409, + "grad_norm": 0.5347859064300481, + "learning_rate": 3.478637871920435e-05, + "loss": 11.884, + "step": 26971 + }, + { + "epoch": 1.468733195836992, + "grad_norm": 0.5496757316401885, + "learning_rate": 3.4779693835261015e-05, + "loss": 11.8785, + "step": 26972 + }, + { + "epoch": 1.468787649833575, + "grad_norm": 0.5679918776674071, + "learning_rate": 3.47730094584817e-05, + "loss": 11.7205, + "step": 26973 + }, + { + "epoch": 1.468842103830158, + "grad_norm": 0.5752347630178402, + "learning_rate": 3.476632558891843e-05, + "loss": 11.7147, + "step": 26974 + }, + { + "epoch": 1.468896557826741, + "grad_norm": 0.5453418953600246, + "learning_rate": 3.475964222662311e-05, + "loss": 11.9417, + "step": 26975 + }, + { + "epoch": 1.468951011823324, + "grad_norm": 0.5675759849355251, + "learning_rate": 3.47529593716478e-05, + "loss": 11.823, + "step": 26976 + }, + { + "epoch": 1.469005465819907, + "grad_norm": 0.5538553395942424, + "learning_rate": 3.474627702404441e-05, + "loss": 11.8549, + "step": 26977 + }, + { + "epoch": 1.46905991981649, + "grad_norm": 0.4988055556439985, + "learning_rate": 3.473959518386488e-05, + "loss": 11.8171, + "step": 26978 + }, + { + "epoch": 1.469114373813073, + "grad_norm": 0.5308248569857584, + "learning_rate": 3.473291385116124e-05, + "loss": 11.8448, + "step": 26979 + }, + { + "epoch": 1.4691688278096562, + "grad_norm": 0.5654780362187766, + "learning_rate": 3.4726233025985375e-05, + "loss": 11.7154, + "step": 26980 + }, + { + "epoch": 1.4692232818062392, + "grad_norm": 0.4706152556619751, + "learning_rate": 3.47195527083893e-05, + "loss": 11.8449, + "step": 26981 + }, + { + "epoch": 1.4692777358028222, + "grad_norm": 0.5511450909412954, + "learning_rate": 3.47128728984249e-05, + "loss": 11.9013, + "step": 26982 + }, + { + "epoch": 1.4693321897994052, + "grad_norm": 0.5091049474093662, + "learning_rate": 3.4706193596144144e-05, + "loss": 11.8955, + "step": 26983 + }, + { + "epoch": 1.4693866437959882, + "grad_norm": 0.5733711417322171, + "learning_rate": 3.469951480159901e-05, + "loss": 11.8904, + "step": 26984 + }, + { + "epoch": 1.4694410977925711, + "grad_norm": 0.50403405339265, + "learning_rate": 3.469283651484136e-05, + "loss": 11.951, + "step": 26985 + }, + { + "epoch": 1.4694955517891541, + "grad_norm": 0.551467789854702, + "learning_rate": 3.4686158735923216e-05, + "loss": 11.9534, + "step": 26986 + }, + { + "epoch": 1.4695500057857371, + "grad_norm": 0.5554771604025316, + "learning_rate": 3.4679481464896446e-05, + "loss": 11.8636, + "step": 26987 + }, + { + "epoch": 1.4696044597823201, + "grad_norm": 0.5434516447398801, + "learning_rate": 3.4672804701812945e-05, + "loss": 11.9037, + "step": 26988 + }, + { + "epoch": 1.4696589137789031, + "grad_norm": 0.5314844299161825, + "learning_rate": 3.466612844672471e-05, + "loss": 11.8527, + "step": 26989 + }, + { + "epoch": 1.4697133677754861, + "grad_norm": 0.5504031324251071, + "learning_rate": 3.465945269968358e-05, + "loss": 12.0211, + "step": 26990 + }, + { + "epoch": 1.469767821772069, + "grad_norm": 0.5704168540629599, + "learning_rate": 3.465277746074154e-05, + "loss": 11.8902, + "step": 26991 + }, + { + "epoch": 1.469822275768652, + "grad_norm": 0.4879255321437034, + "learning_rate": 3.464610272995043e-05, + "loss": 11.7967, + "step": 26992 + }, + { + "epoch": 1.469876729765235, + "grad_norm": 0.5395397518607035, + "learning_rate": 3.4639428507362174e-05, + "loss": 11.7737, + "step": 26993 + }, + { + "epoch": 1.469931183761818, + "grad_norm": 0.5583342025955563, + "learning_rate": 3.463275479302872e-05, + "loss": 11.7656, + "step": 26994 + }, + { + "epoch": 1.4699856377584013, + "grad_norm": 0.5333749188630824, + "learning_rate": 3.46260815870019e-05, + "loss": 11.8344, + "step": 26995 + }, + { + "epoch": 1.4700400917549843, + "grad_norm": 0.5947621372013558, + "learning_rate": 3.4619408889333696e-05, + "loss": 11.835, + "step": 26996 + }, + { + "epoch": 1.4700945457515673, + "grad_norm": 0.5218967318383265, + "learning_rate": 3.461273670007587e-05, + "loss": 11.835, + "step": 26997 + }, + { + "epoch": 1.4701489997481503, + "grad_norm": 0.5692122610369998, + "learning_rate": 3.460606501928035e-05, + "loss": 11.8929, + "step": 26998 + }, + { + "epoch": 1.4702034537447333, + "grad_norm": 0.4741647367532856, + "learning_rate": 3.4599393846999087e-05, + "loss": 11.6957, + "step": 26999 + }, + { + "epoch": 1.4702579077413163, + "grad_norm": 0.5489315259413341, + "learning_rate": 3.459272318328387e-05, + "loss": 11.8165, + "step": 27000 + }, + { + "epoch": 1.4703123617378993, + "grad_norm": 0.5828263256267341, + "learning_rate": 3.4586053028186636e-05, + "loss": 11.8819, + "step": 27001 + }, + { + "epoch": 1.4703668157344822, + "grad_norm": 0.574516434369927, + "learning_rate": 3.457938338175919e-05, + "loss": 11.9278, + "step": 27002 + }, + { + "epoch": 1.4704212697310655, + "grad_norm": 0.5907239366505275, + "learning_rate": 3.457271424405346e-05, + "loss": 11.928, + "step": 27003 + }, + { + "epoch": 1.4704757237276485, + "grad_norm": 0.5465245700889383, + "learning_rate": 3.4566045615121246e-05, + "loss": 11.9369, + "step": 27004 + }, + { + "epoch": 1.4705301777242314, + "grad_norm": 0.5332316149709876, + "learning_rate": 3.455937749501445e-05, + "loss": 11.96, + "step": 27005 + }, + { + "epoch": 1.4705846317208144, + "grad_norm": 0.5861251101772974, + "learning_rate": 3.455270988378495e-05, + "loss": 11.8555, + "step": 27006 + }, + { + "epoch": 1.4706390857173974, + "grad_norm": 0.5417709300912483, + "learning_rate": 3.45460427814845e-05, + "loss": 11.8343, + "step": 27007 + }, + { + "epoch": 1.4706935397139804, + "grad_norm": 0.5464346475311613, + "learning_rate": 3.4539376188165e-05, + "loss": 12.0091, + "step": 27008 + }, + { + "epoch": 1.4707479937105634, + "grad_norm": 0.5347709733589778, + "learning_rate": 3.453271010387832e-05, + "loss": 11.8109, + "step": 27009 + }, + { + "epoch": 1.4708024477071464, + "grad_norm": 0.5393284119988231, + "learning_rate": 3.4526044528676225e-05, + "loss": 11.9394, + "step": 27010 + }, + { + "epoch": 1.4708569017037294, + "grad_norm": 0.5806957972972273, + "learning_rate": 3.4519379462610625e-05, + "loss": 11.7509, + "step": 27011 + }, + { + "epoch": 1.4709113557003124, + "grad_norm": 0.5465121406012963, + "learning_rate": 3.4512714905733277e-05, + "loss": 11.9348, + "step": 27012 + }, + { + "epoch": 1.4709658096968954, + "grad_norm": 0.5724799408772205, + "learning_rate": 3.450605085809607e-05, + "loss": 11.9405, + "step": 27013 + }, + { + "epoch": 1.4710202636934784, + "grad_norm": 0.5955513575638715, + "learning_rate": 3.449938731975078e-05, + "loss": 11.9518, + "step": 27014 + }, + { + "epoch": 1.4710747176900614, + "grad_norm": 0.6018328515717035, + "learning_rate": 3.4492724290749224e-05, + "loss": 11.9525, + "step": 27015 + }, + { + "epoch": 1.4711291716866444, + "grad_norm": 0.5015215623563867, + "learning_rate": 3.4486061771143296e-05, + "loss": 11.7717, + "step": 27016 + }, + { + "epoch": 1.4711836256832274, + "grad_norm": 0.49988841942712686, + "learning_rate": 3.4479399760984664e-05, + "loss": 11.8735, + "step": 27017 + }, + { + "epoch": 1.4712380796798106, + "grad_norm": 0.5245592975923902, + "learning_rate": 3.447273826032525e-05, + "loss": 11.8615, + "step": 27018 + }, + { + "epoch": 1.4712925336763936, + "grad_norm": 0.649376184336951, + "learning_rate": 3.446607726921679e-05, + "loss": 11.9087, + "step": 27019 + }, + { + "epoch": 1.4713469876729766, + "grad_norm": 0.5830823200606501, + "learning_rate": 3.4459416787711084e-05, + "loss": 11.947, + "step": 27020 + }, + { + "epoch": 1.4714014416695596, + "grad_norm": 0.5540168823321698, + "learning_rate": 3.445275681585999e-05, + "loss": 11.7104, + "step": 27021 + }, + { + "epoch": 1.4714558956661425, + "grad_norm": 0.5830936253241314, + "learning_rate": 3.444609735371521e-05, + "loss": 11.8026, + "step": 27022 + }, + { + "epoch": 1.4715103496627255, + "grad_norm": 0.5401627797490386, + "learning_rate": 3.4439438401328606e-05, + "loss": 11.8363, + "step": 27023 + }, + { + "epoch": 1.4715648036593085, + "grad_norm": 0.5427904249373643, + "learning_rate": 3.443277995875189e-05, + "loss": 11.7379, + "step": 27024 + }, + { + "epoch": 1.4716192576558915, + "grad_norm": 0.5524654577776319, + "learning_rate": 3.442612202603691e-05, + "loss": 11.9151, + "step": 27025 + }, + { + "epoch": 1.4716737116524745, + "grad_norm": 0.5471381024249221, + "learning_rate": 3.4419464603235364e-05, + "loss": 11.8217, + "step": 27026 + }, + { + "epoch": 1.4717281656490577, + "grad_norm": 0.5546715885633946, + "learning_rate": 3.4412807690399096e-05, + "loss": 11.7589, + "step": 27027 + }, + { + "epoch": 1.4717826196456407, + "grad_norm": 0.7117188961593346, + "learning_rate": 3.440615128757984e-05, + "loss": 11.8712, + "step": 27028 + }, + { + "epoch": 1.4718370736422237, + "grad_norm": 0.5420285753965889, + "learning_rate": 3.4399495394829305e-05, + "loss": 11.9132, + "step": 27029 + }, + { + "epoch": 1.4718915276388067, + "grad_norm": 0.543851358459557, + "learning_rate": 3.4392840012199304e-05, + "loss": 11.7777, + "step": 27030 + }, + { + "epoch": 1.4719459816353897, + "grad_norm": 0.49003595741190664, + "learning_rate": 3.438618513974161e-05, + "loss": 11.801, + "step": 27031 + }, + { + "epoch": 1.4720004356319727, + "grad_norm": 0.514922000773794, + "learning_rate": 3.437953077750791e-05, + "loss": 11.7983, + "step": 27032 + }, + { + "epoch": 1.4720548896285557, + "grad_norm": 0.5555140945535076, + "learning_rate": 3.437287692555003e-05, + "loss": 11.7921, + "step": 27033 + }, + { + "epoch": 1.4721093436251387, + "grad_norm": 0.5197018757079596, + "learning_rate": 3.436622358391961e-05, + "loss": 11.8424, + "step": 27034 + }, + { + "epoch": 1.4721637976217217, + "grad_norm": 0.5746961787467441, + "learning_rate": 3.435957075266849e-05, + "loss": 11.8774, + "step": 27035 + }, + { + "epoch": 1.4722182516183047, + "grad_norm": 0.5523608801389424, + "learning_rate": 3.4352918431848313e-05, + "loss": 11.9063, + "step": 27036 + }, + { + "epoch": 1.4722727056148877, + "grad_norm": 0.6098172617612233, + "learning_rate": 3.434626662151089e-05, + "loss": 11.9233, + "step": 27037 + }, + { + "epoch": 1.4723271596114706, + "grad_norm": 0.6189974377781735, + "learning_rate": 3.4339615321707904e-05, + "loss": 11.8561, + "step": 27038 + }, + { + "epoch": 1.4723816136080536, + "grad_norm": 0.6098684049964824, + "learning_rate": 3.433296453249105e-05, + "loss": 12.0428, + "step": 27039 + }, + { + "epoch": 1.4724360676046366, + "grad_norm": 0.5231442378536864, + "learning_rate": 3.4326314253912114e-05, + "loss": 11.9368, + "step": 27040 + }, + { + "epoch": 1.4724905216012198, + "grad_norm": 0.5554595187181036, + "learning_rate": 3.431966448602274e-05, + "loss": 11.8724, + "step": 27041 + }, + { + "epoch": 1.4725449755978028, + "grad_norm": 0.5093832129644811, + "learning_rate": 3.431301522887467e-05, + "loss": 11.8652, + "step": 27042 + }, + { + "epoch": 1.4725994295943858, + "grad_norm": 0.532916989178289, + "learning_rate": 3.4306366482519624e-05, + "loss": 11.7608, + "step": 27043 + }, + { + "epoch": 1.4726538835909688, + "grad_norm": 0.5299826967679284, + "learning_rate": 3.429971824700927e-05, + "loss": 11.7297, + "step": 27044 + }, + { + "epoch": 1.4727083375875518, + "grad_norm": 0.5470163791515646, + "learning_rate": 3.429307052239536e-05, + "loss": 11.8786, + "step": 27045 + }, + { + "epoch": 1.4727627915841348, + "grad_norm": 0.5429542506097446, + "learning_rate": 3.4286423308729524e-05, + "loss": 11.7596, + "step": 27046 + }, + { + "epoch": 1.4728172455807178, + "grad_norm": 0.5193121313872467, + "learning_rate": 3.4279776606063496e-05, + "loss": 11.8058, + "step": 27047 + }, + { + "epoch": 1.4728716995773008, + "grad_norm": 0.569015986442028, + "learning_rate": 3.427313041444896e-05, + "loss": 11.921, + "step": 27048 + }, + { + "epoch": 1.4729261535738838, + "grad_norm": 0.5396491219347833, + "learning_rate": 3.4266484733937546e-05, + "loss": 11.9579, + "step": 27049 + }, + { + "epoch": 1.472980607570467, + "grad_norm": 0.5072043815273053, + "learning_rate": 3.425983956458101e-05, + "loss": 11.8264, + "step": 27050 + }, + { + "epoch": 1.47303506156705, + "grad_norm": 0.5150497274906632, + "learning_rate": 3.425319490643094e-05, + "loss": 11.8747, + "step": 27051 + }, + { + "epoch": 1.473089515563633, + "grad_norm": 0.5666219037141724, + "learning_rate": 3.42465507595391e-05, + "loss": 11.9068, + "step": 27052 + }, + { + "epoch": 1.473143969560216, + "grad_norm": 0.4999371458346644, + "learning_rate": 3.423990712395707e-05, + "loss": 11.8077, + "step": 27053 + }, + { + "epoch": 1.473198423556799, + "grad_norm": 0.5375262988369534, + "learning_rate": 3.4233263999736543e-05, + "loss": 11.8238, + "step": 27054 + }, + { + "epoch": 1.473252877553382, + "grad_norm": 0.5524602798318212, + "learning_rate": 3.4226621386929224e-05, + "loss": 11.9338, + "step": 27055 + }, + { + "epoch": 1.473307331549965, + "grad_norm": 0.5441144221728975, + "learning_rate": 3.421997928558669e-05, + "loss": 11.8395, + "step": 27056 + }, + { + "epoch": 1.473361785546548, + "grad_norm": 0.5611645080034972, + "learning_rate": 3.421333769576067e-05, + "loss": 11.9071, + "step": 27057 + }, + { + "epoch": 1.473416239543131, + "grad_norm": 0.5502104309668763, + "learning_rate": 3.420669661750277e-05, + "loss": 11.7065, + "step": 27058 + }, + { + "epoch": 1.473470693539714, + "grad_norm": 0.5715937041666886, + "learning_rate": 3.4200056050864584e-05, + "loss": 11.7687, + "step": 27059 + }, + { + "epoch": 1.473525147536297, + "grad_norm": 0.6181960696255637, + "learning_rate": 3.419341599589784e-05, + "loss": 11.8218, + "step": 27060 + }, + { + "epoch": 1.47357960153288, + "grad_norm": 0.5294926288327422, + "learning_rate": 3.418677645265409e-05, + "loss": 11.7405, + "step": 27061 + }, + { + "epoch": 1.473634055529463, + "grad_norm": 0.8029805969217824, + "learning_rate": 3.4180137421185035e-05, + "loss": 11.9257, + "step": 27062 + }, + { + "epoch": 1.473688509526046, + "grad_norm": 0.5349932841043552, + "learning_rate": 3.417349890154224e-05, + "loss": 11.8885, + "step": 27063 + }, + { + "epoch": 1.473742963522629, + "grad_norm": 0.5586481148900625, + "learning_rate": 3.416686089377735e-05, + "loss": 11.8633, + "step": 27064 + }, + { + "epoch": 1.4737974175192121, + "grad_norm": 0.5273718362697052, + "learning_rate": 3.4160223397942037e-05, + "loss": 11.8968, + "step": 27065 + }, + { + "epoch": 1.473851871515795, + "grad_norm": 0.5669173366538417, + "learning_rate": 3.4153586414087824e-05, + "loss": 11.9404, + "step": 27066 + }, + { + "epoch": 1.473906325512378, + "grad_norm": 0.6056961832563036, + "learning_rate": 3.414694994226644e-05, + "loss": 11.8837, + "step": 27067 + }, + { + "epoch": 1.473960779508961, + "grad_norm": 0.5432162802028999, + "learning_rate": 3.414031398252935e-05, + "loss": 11.9145, + "step": 27068 + }, + { + "epoch": 1.474015233505544, + "grad_norm": 0.5311333608968709, + "learning_rate": 3.4133678534928216e-05, + "loss": 11.8465, + "step": 27069 + }, + { + "epoch": 1.474069687502127, + "grad_norm": 0.535893314131085, + "learning_rate": 3.412704359951467e-05, + "loss": 11.8586, + "step": 27070 + }, + { + "epoch": 1.47412414149871, + "grad_norm": 0.524757750295078, + "learning_rate": 3.412040917634026e-05, + "loss": 11.8801, + "step": 27071 + }, + { + "epoch": 1.474178595495293, + "grad_norm": 0.5756576209868174, + "learning_rate": 3.4113775265456626e-05, + "loss": 11.9087, + "step": 27072 + }, + { + "epoch": 1.4742330494918763, + "grad_norm": 0.6782042329146925, + "learning_rate": 3.4107141866915294e-05, + "loss": 11.9043, + "step": 27073 + }, + { + "epoch": 1.4742875034884593, + "grad_norm": 0.5520135168377776, + "learning_rate": 3.410050898076791e-05, + "loss": 11.9487, + "step": 27074 + }, + { + "epoch": 1.4743419574850423, + "grad_norm": 0.4930499779384862, + "learning_rate": 3.4093876607065976e-05, + "loss": 11.8126, + "step": 27075 + }, + { + "epoch": 1.4743964114816253, + "grad_norm": 0.5479393717310947, + "learning_rate": 3.4087244745861125e-05, + "loss": 11.8866, + "step": 27076 + }, + { + "epoch": 1.4744508654782083, + "grad_norm": 0.5562895428702546, + "learning_rate": 3.4080613397204977e-05, + "loss": 11.906, + "step": 27077 + }, + { + "epoch": 1.4745053194747912, + "grad_norm": 0.5444885985476847, + "learning_rate": 3.407398256114896e-05, + "loss": 11.6886, + "step": 27078 + }, + { + "epoch": 1.4745597734713742, + "grad_norm": 0.5548738264640611, + "learning_rate": 3.406735223774471e-05, + "loss": 11.836, + "step": 27079 + }, + { + "epoch": 1.4746142274679572, + "grad_norm": 0.5348139660551197, + "learning_rate": 3.406072242704382e-05, + "loss": 11.8202, + "step": 27080 + }, + { + "epoch": 1.4746686814645402, + "grad_norm": 0.511501103488106, + "learning_rate": 3.405409312909777e-05, + "loss": 11.6727, + "step": 27081 + }, + { + "epoch": 1.4747231354611232, + "grad_norm": 0.5598396385414746, + "learning_rate": 3.40474643439582e-05, + "loss": 11.8583, + "step": 27082 + }, + { + "epoch": 1.4747775894577062, + "grad_norm": 0.5076146583988852, + "learning_rate": 3.404083607167657e-05, + "loss": 11.9152, + "step": 27083 + }, + { + "epoch": 1.4748320434542892, + "grad_norm": 0.5537755440569728, + "learning_rate": 3.403420831230449e-05, + "loss": 11.8594, + "step": 27084 + }, + { + "epoch": 1.4748864974508722, + "grad_norm": 0.5293285022532415, + "learning_rate": 3.402758106589343e-05, + "loss": 11.9193, + "step": 27085 + }, + { + "epoch": 1.4749409514474552, + "grad_norm": 0.5408443603695325, + "learning_rate": 3.402095433249502e-05, + "loss": 11.7753, + "step": 27086 + }, + { + "epoch": 1.4749954054440382, + "grad_norm": 0.5749917278803268, + "learning_rate": 3.401432811216071e-05, + "loss": 11.9071, + "step": 27087 + }, + { + "epoch": 1.4750498594406214, + "grad_norm": 0.5156306979181503, + "learning_rate": 3.400770240494202e-05, + "loss": 11.749, + "step": 27088 + }, + { + "epoch": 1.4751043134372044, + "grad_norm": 0.559651083921084, + "learning_rate": 3.400107721089054e-05, + "loss": 11.765, + "step": 27089 + }, + { + "epoch": 1.4751587674337874, + "grad_norm": 0.612724934333478, + "learning_rate": 3.399445253005772e-05, + "loss": 11.8683, + "step": 27090 + }, + { + "epoch": 1.4752132214303704, + "grad_norm": 0.59789341134019, + "learning_rate": 3.39878283624951e-05, + "loss": 11.946, + "step": 27091 + }, + { + "epoch": 1.4752676754269534, + "grad_norm": 0.5544059016115417, + "learning_rate": 3.398120470825423e-05, + "loss": 11.8487, + "step": 27092 + }, + { + "epoch": 1.4753221294235364, + "grad_norm": 0.5617610908798364, + "learning_rate": 3.3974581567386564e-05, + "loss": 11.8222, + "step": 27093 + }, + { + "epoch": 1.4753765834201193, + "grad_norm": 0.5125756027948952, + "learning_rate": 3.396795893994365e-05, + "loss": 11.8395, + "step": 27094 + }, + { + "epoch": 1.4754310374167023, + "grad_norm": 0.5559725961950754, + "learning_rate": 3.396133682597692e-05, + "loss": 11.9392, + "step": 27095 + }, + { + "epoch": 1.4754854914132853, + "grad_norm": 0.5762186452667207, + "learning_rate": 3.395471522553795e-05, + "loss": 11.7305, + "step": 27096 + }, + { + "epoch": 1.4755399454098685, + "grad_norm": 0.531698633923273, + "learning_rate": 3.394809413867819e-05, + "loss": 11.8277, + "step": 27097 + }, + { + "epoch": 1.4755943994064515, + "grad_norm": 0.5482631295246632, + "learning_rate": 3.394147356544909e-05, + "loss": 11.8116, + "step": 27098 + }, + { + "epoch": 1.4756488534030345, + "grad_norm": 0.6278240149208363, + "learning_rate": 3.393485350590221e-05, + "loss": 11.8782, + "step": 27099 + }, + { + "epoch": 1.4757033073996175, + "grad_norm": 0.5759202327727695, + "learning_rate": 3.392823396008895e-05, + "loss": 11.7914, + "step": 27100 + }, + { + "epoch": 1.4757577613962005, + "grad_norm": 0.5773957919324828, + "learning_rate": 3.392161492806085e-05, + "loss": 11.8509, + "step": 27101 + }, + { + "epoch": 1.4758122153927835, + "grad_norm": 0.5272776305146423, + "learning_rate": 3.3914996409869335e-05, + "loss": 11.8905, + "step": 27102 + }, + { + "epoch": 1.4758666693893665, + "grad_norm": 0.6199382628951214, + "learning_rate": 3.3908378405565876e-05, + "loss": 11.9084, + "step": 27103 + }, + { + "epoch": 1.4759211233859495, + "grad_norm": 0.5278872701414912, + "learning_rate": 3.3901760915201995e-05, + "loss": 11.8079, + "step": 27104 + }, + { + "epoch": 1.4759755773825325, + "grad_norm": 0.542572280098002, + "learning_rate": 3.389514393882906e-05, + "loss": 11.9388, + "step": 27105 + }, + { + "epoch": 1.4760300313791155, + "grad_norm": 0.5820755273708499, + "learning_rate": 3.388852747649862e-05, + "loss": 11.899, + "step": 27106 + }, + { + "epoch": 1.4760844853756985, + "grad_norm": 0.5562626129763535, + "learning_rate": 3.388191152826207e-05, + "loss": 11.7676, + "step": 27107 + }, + { + "epoch": 1.4761389393722815, + "grad_norm": 0.5507773169644343, + "learning_rate": 3.3875296094170826e-05, + "loss": 11.6364, + "step": 27108 + }, + { + "epoch": 1.4761933933688645, + "grad_norm": 0.5315939561427894, + "learning_rate": 3.38686811742764e-05, + "loss": 11.7449, + "step": 27109 + }, + { + "epoch": 1.4762478473654475, + "grad_norm": 0.5459717502917776, + "learning_rate": 3.3862066768630184e-05, + "loss": 11.7552, + "step": 27110 + }, + { + "epoch": 1.4763023013620307, + "grad_norm": 0.5277619271971716, + "learning_rate": 3.385545287728364e-05, + "loss": 11.8546, + "step": 27111 + }, + { + "epoch": 1.4763567553586137, + "grad_norm": 0.5954012509501209, + "learning_rate": 3.3848839500288166e-05, + "loss": 11.8795, + "step": 27112 + }, + { + "epoch": 1.4764112093551967, + "grad_norm": 0.5124358491319496, + "learning_rate": 3.384222663769521e-05, + "loss": 11.8748, + "step": 27113 + }, + { + "epoch": 1.4764656633517796, + "grad_norm": 0.6025619047189447, + "learning_rate": 3.383561428955623e-05, + "loss": 11.9018, + "step": 27114 + }, + { + "epoch": 1.4765201173483626, + "grad_norm": 0.5699948163670611, + "learning_rate": 3.382900245592257e-05, + "loss": 11.9429, + "step": 27115 + }, + { + "epoch": 1.4765745713449456, + "grad_norm": 0.517490357130833, + "learning_rate": 3.382239113684571e-05, + "loss": 11.8018, + "step": 27116 + }, + { + "epoch": 1.4766290253415286, + "grad_norm": 0.5466848815284313, + "learning_rate": 3.3815780332377044e-05, + "loss": 11.7851, + "step": 27117 + }, + { + "epoch": 1.4766834793381116, + "grad_norm": 0.5606494793356981, + "learning_rate": 3.3809170042567925e-05, + "loss": 11.9099, + "step": 27118 + }, + { + "epoch": 1.4767379333346946, + "grad_norm": 0.5845511369310461, + "learning_rate": 3.3802560267469855e-05, + "loss": 11.9356, + "step": 27119 + }, + { + "epoch": 1.4767923873312778, + "grad_norm": 0.5967304989228428, + "learning_rate": 3.379595100713413e-05, + "loss": 11.924, + "step": 27120 + }, + { + "epoch": 1.4768468413278608, + "grad_norm": 0.513447914758633, + "learning_rate": 3.3789342261612224e-05, + "loss": 11.9511, + "step": 27121 + }, + { + "epoch": 1.4769012953244438, + "grad_norm": 0.565957331964041, + "learning_rate": 3.378273403095547e-05, + "loss": 11.6514, + "step": 27122 + }, + { + "epoch": 1.4769557493210268, + "grad_norm": 0.5842253427170512, + "learning_rate": 3.37761263152153e-05, + "loss": 11.962, + "step": 27123 + }, + { + "epoch": 1.4770102033176098, + "grad_norm": 0.5845901353598117, + "learning_rate": 3.376951911444306e-05, + "loss": 11.9118, + "step": 27124 + }, + { + "epoch": 1.4770646573141928, + "grad_norm": 0.5141469175908666, + "learning_rate": 3.376291242869014e-05, + "loss": 11.8073, + "step": 27125 + }, + { + "epoch": 1.4771191113107758, + "grad_norm": 0.5259914534495058, + "learning_rate": 3.3756306258007954e-05, + "loss": 11.8461, + "step": 27126 + }, + { + "epoch": 1.4771735653073588, + "grad_norm": 0.5459010431752896, + "learning_rate": 3.374970060244784e-05, + "loss": 11.8832, + "step": 27127 + }, + { + "epoch": 1.4772280193039418, + "grad_norm": 0.5528517960244562, + "learning_rate": 3.374309546206113e-05, + "loss": 11.823, + "step": 27128 + }, + { + "epoch": 1.4772824733005248, + "grad_norm": 0.5271159165239279, + "learning_rate": 3.373649083689926e-05, + "loss": 11.904, + "step": 27129 + }, + { + "epoch": 1.4773369272971077, + "grad_norm": 0.5966994098474412, + "learning_rate": 3.372988672701351e-05, + "loss": 11.9556, + "step": 27130 + }, + { + "epoch": 1.4773913812936907, + "grad_norm": 0.5744693204930843, + "learning_rate": 3.372328313245531e-05, + "loss": 11.8177, + "step": 27131 + }, + { + "epoch": 1.4774458352902737, + "grad_norm": 0.5765051641289127, + "learning_rate": 3.371668005327594e-05, + "loss": 11.8644, + "step": 27132 + }, + { + "epoch": 1.4775002892868567, + "grad_norm": 0.5233858411786155, + "learning_rate": 3.371007748952681e-05, + "loss": 11.8243, + "step": 27133 + }, + { + "epoch": 1.47755474328344, + "grad_norm": 0.6552093420357177, + "learning_rate": 3.370347544125921e-05, + "loss": 11.9979, + "step": 27134 + }, + { + "epoch": 1.477609197280023, + "grad_norm": 0.559480773537894, + "learning_rate": 3.369687390852454e-05, + "loss": 11.865, + "step": 27135 + }, + { + "epoch": 1.477663651276606, + "grad_norm": 0.5642823448196956, + "learning_rate": 3.369027289137405e-05, + "loss": 11.9018, + "step": 27136 + }, + { + "epoch": 1.477718105273189, + "grad_norm": 0.5322223232586138, + "learning_rate": 3.3683672389859166e-05, + "loss": 11.8025, + "step": 27137 + }, + { + "epoch": 1.477772559269772, + "grad_norm": 0.587293504402518, + "learning_rate": 3.367707240403115e-05, + "loss": 11.8876, + "step": 27138 + }, + { + "epoch": 1.477827013266355, + "grad_norm": 0.5464093118301003, + "learning_rate": 3.367047293394132e-05, + "loss": 11.9181, + "step": 27139 + }, + { + "epoch": 1.477881467262938, + "grad_norm": 0.5373653748455144, + "learning_rate": 3.366387397964101e-05, + "loss": 11.8001, + "step": 27140 + }, + { + "epoch": 1.477935921259521, + "grad_norm": 0.5878366181510546, + "learning_rate": 3.365727554118159e-05, + "loss": 11.9296, + "step": 27141 + }, + { + "epoch": 1.4779903752561039, + "grad_norm": 0.5284923666358852, + "learning_rate": 3.365067761861427e-05, + "loss": 11.8898, + "step": 27142 + }, + { + "epoch": 1.478044829252687, + "grad_norm": 0.5664966967686911, + "learning_rate": 3.364408021199045e-05, + "loss": 11.8015, + "step": 27143 + }, + { + "epoch": 1.47809928324927, + "grad_norm": 0.566726081494145, + "learning_rate": 3.363748332136135e-05, + "loss": 11.79, + "step": 27144 + }, + { + "epoch": 1.478153737245853, + "grad_norm": 0.508897171974987, + "learning_rate": 3.363088694677834e-05, + "loss": 11.7891, + "step": 27145 + }, + { + "epoch": 1.478208191242436, + "grad_norm": 0.5855396399672527, + "learning_rate": 3.362429108829266e-05, + "loss": 12.0, + "step": 27146 + }, + { + "epoch": 1.478262645239019, + "grad_norm": 0.5839440654246064, + "learning_rate": 3.3617695745955646e-05, + "loss": 11.7482, + "step": 27147 + }, + { + "epoch": 1.478317099235602, + "grad_norm": 0.5294692175472873, + "learning_rate": 3.361110091981857e-05, + "loss": 11.9796, + "step": 27148 + }, + { + "epoch": 1.478371553232185, + "grad_norm": 0.5490297186905433, + "learning_rate": 3.3604506609932674e-05, + "loss": 11.6764, + "step": 27149 + }, + { + "epoch": 1.478426007228768, + "grad_norm": 0.5190701710804367, + "learning_rate": 3.35979128163493e-05, + "loss": 12.0312, + "step": 27150 + }, + { + "epoch": 1.478480461225351, + "grad_norm": 0.5108576585974903, + "learning_rate": 3.3591319539119656e-05, + "loss": 11.8501, + "step": 27151 + }, + { + "epoch": 1.478534915221934, + "grad_norm": 0.5622735264811781, + "learning_rate": 3.358472677829505e-05, + "loss": 11.9249, + "step": 27152 + }, + { + "epoch": 1.478589369218517, + "grad_norm": 0.5439652232085131, + "learning_rate": 3.357813453392679e-05, + "loss": 11.8032, + "step": 27153 + }, + { + "epoch": 1.4786438232151, + "grad_norm": 0.5089868565051143, + "learning_rate": 3.3571542806066047e-05, + "loss": 11.8561, + "step": 27154 + }, + { + "epoch": 1.478698277211683, + "grad_norm": 0.5547826449591965, + "learning_rate": 3.356495159476416e-05, + "loss": 11.774, + "step": 27155 + }, + { + "epoch": 1.478752731208266, + "grad_norm": 0.5473036932778695, + "learning_rate": 3.3558360900072325e-05, + "loss": 11.8986, + "step": 27156 + }, + { + "epoch": 1.478807185204849, + "grad_norm": 0.4951983523661329, + "learning_rate": 3.355177072204184e-05, + "loss": 11.88, + "step": 27157 + }, + { + "epoch": 1.4788616392014322, + "grad_norm": 0.5196955795091953, + "learning_rate": 3.354518106072394e-05, + "loss": 11.845, + "step": 27158 + }, + { + "epoch": 1.4789160931980152, + "grad_norm": 0.5520316211913328, + "learning_rate": 3.353859191616982e-05, + "loss": 11.8677, + "step": 27159 + }, + { + "epoch": 1.4789705471945982, + "grad_norm": 0.6203878430456891, + "learning_rate": 3.353200328843079e-05, + "loss": 11.92, + "step": 27160 + }, + { + "epoch": 1.4790250011911812, + "grad_norm": 0.5786738160243766, + "learning_rate": 3.352541517755802e-05, + "loss": 11.9021, + "step": 27161 + }, + { + "epoch": 1.4790794551877642, + "grad_norm": 0.5143903061802476, + "learning_rate": 3.351882758360276e-05, + "loss": 11.8808, + "step": 27162 + }, + { + "epoch": 1.4791339091843472, + "grad_norm": 0.5973229100998572, + "learning_rate": 3.351224050661629e-05, + "loss": 11.9367, + "step": 27163 + }, + { + "epoch": 1.4791883631809302, + "grad_norm": 0.528959301008005, + "learning_rate": 3.350565394664974e-05, + "loss": 11.8738, + "step": 27164 + }, + { + "epoch": 1.4792428171775132, + "grad_norm": 0.5771028782922764, + "learning_rate": 3.349906790375442e-05, + "loss": 11.9089, + "step": 27165 + }, + { + "epoch": 1.4792972711740962, + "grad_norm": 0.591567422479549, + "learning_rate": 3.349248237798146e-05, + "loss": 11.9304, + "step": 27166 + }, + { + "epoch": 1.4793517251706794, + "grad_norm": 0.48787454573548533, + "learning_rate": 3.3485897369382146e-05, + "loss": 11.781, + "step": 27167 + }, + { + "epoch": 1.4794061791672624, + "grad_norm": 0.5086313821470303, + "learning_rate": 3.347931287800765e-05, + "loss": 11.7327, + "step": 27168 + }, + { + "epoch": 1.4794606331638454, + "grad_norm": 0.5750569091876963, + "learning_rate": 3.347272890390915e-05, + "loss": 11.9718, + "step": 27169 + }, + { + "epoch": 1.4795150871604283, + "grad_norm": 0.5910052756781702, + "learning_rate": 3.346614544713789e-05, + "loss": 11.7934, + "step": 27170 + }, + { + "epoch": 1.4795695411570113, + "grad_norm": 0.7537333335813179, + "learning_rate": 3.3459562507745e-05, + "loss": 11.8531, + "step": 27171 + }, + { + "epoch": 1.4796239951535943, + "grad_norm": 0.5711350890046221, + "learning_rate": 3.345298008578175e-05, + "loss": 11.8902, + "step": 27172 + }, + { + "epoch": 1.4796784491501773, + "grad_norm": 0.5146417451766456, + "learning_rate": 3.344639818129926e-05, + "loss": 11.8732, + "step": 27173 + }, + { + "epoch": 1.4797329031467603, + "grad_norm": 0.5226190157213103, + "learning_rate": 3.343981679434873e-05, + "loss": 11.7984, + "step": 27174 + }, + { + "epoch": 1.4797873571433433, + "grad_norm": 0.5976252647813438, + "learning_rate": 3.343323592498138e-05, + "loss": 11.995, + "step": 27175 + }, + { + "epoch": 1.4798418111399263, + "grad_norm": 0.6078734907301634, + "learning_rate": 3.3426655573248324e-05, + "loss": 11.7342, + "step": 27176 + }, + { + "epoch": 1.4798962651365093, + "grad_norm": 0.5852577048129345, + "learning_rate": 3.342007573920078e-05, + "loss": 11.8116, + "step": 27177 + }, + { + "epoch": 1.4799507191330923, + "grad_norm": 0.5894873191799361, + "learning_rate": 3.34134964228899e-05, + "loss": 11.8434, + "step": 27178 + }, + { + "epoch": 1.4800051731296753, + "grad_norm": 0.5910698681153836, + "learning_rate": 3.340691762436681e-05, + "loss": 11.688, + "step": 27179 + }, + { + "epoch": 1.4800596271262583, + "grad_norm": 0.6702628794636158, + "learning_rate": 3.340033934368273e-05, + "loss": 11.8892, + "step": 27180 + }, + { + "epoch": 1.4801140811228415, + "grad_norm": 0.5375409497292924, + "learning_rate": 3.3393761580888736e-05, + "loss": 11.7689, + "step": 27181 + }, + { + "epoch": 1.4801685351194245, + "grad_norm": 0.5686594922761874, + "learning_rate": 3.338718433603606e-05, + "loss": 11.7577, + "step": 27182 + }, + { + "epoch": 1.4802229891160075, + "grad_norm": 0.5169845193658228, + "learning_rate": 3.3380607609175785e-05, + "loss": 11.824, + "step": 27183 + }, + { + "epoch": 1.4802774431125905, + "grad_norm": 0.5274731427167206, + "learning_rate": 3.3374031400359095e-05, + "loss": 11.8404, + "step": 27184 + }, + { + "epoch": 1.4803318971091735, + "grad_norm": 0.567724516103824, + "learning_rate": 3.336745570963709e-05, + "loss": 11.7453, + "step": 27185 + }, + { + "epoch": 1.4803863511057564, + "grad_norm": 0.5501182615912221, + "learning_rate": 3.336088053706092e-05, + "loss": 11.8781, + "step": 27186 + }, + { + "epoch": 1.4804408051023394, + "grad_norm": 0.5725508482541988, + "learning_rate": 3.3354305882681746e-05, + "loss": 11.9257, + "step": 27187 + }, + { + "epoch": 1.4804952590989224, + "grad_norm": 0.6068025079210324, + "learning_rate": 3.334773174655067e-05, + "loss": 11.9728, + "step": 27188 + }, + { + "epoch": 1.4805497130955054, + "grad_norm": 0.5027437770027873, + "learning_rate": 3.3341158128718776e-05, + "loss": 11.7719, + "step": 27189 + }, + { + "epoch": 1.4806041670920886, + "grad_norm": 0.5177253109463887, + "learning_rate": 3.333458502923725e-05, + "loss": 11.7254, + "step": 27190 + }, + { + "epoch": 1.4806586210886716, + "grad_norm": 0.5774954096402204, + "learning_rate": 3.332801244815714e-05, + "loss": 11.8728, + "step": 27191 + }, + { + "epoch": 1.4807130750852546, + "grad_norm": 0.6294710587232561, + "learning_rate": 3.332144038552961e-05, + "loss": 12.0012, + "step": 27192 + }, + { + "epoch": 1.4807675290818376, + "grad_norm": 0.5595296786294132, + "learning_rate": 3.331486884140572e-05, + "loss": 11.8264, + "step": 27193 + }, + { + "epoch": 1.4808219830784206, + "grad_norm": 0.527091612607428, + "learning_rate": 3.330829781583662e-05, + "loss": 11.8461, + "step": 27194 + }, + { + "epoch": 1.4808764370750036, + "grad_norm": 0.5274658985174875, + "learning_rate": 3.330172730887334e-05, + "loss": 11.7673, + "step": 27195 + }, + { + "epoch": 1.4809308910715866, + "grad_norm": 0.5809138311464556, + "learning_rate": 3.329515732056703e-05, + "loss": 11.9266, + "step": 27196 + }, + { + "epoch": 1.4809853450681696, + "grad_norm": 0.5390868402140865, + "learning_rate": 3.328858785096878e-05, + "loss": 11.8711, + "step": 27197 + }, + { + "epoch": 1.4810397990647526, + "grad_norm": 0.4756542962327035, + "learning_rate": 3.328201890012966e-05, + "loss": 11.8575, + "step": 27198 + }, + { + "epoch": 1.4810942530613356, + "grad_norm": 0.5462471967775758, + "learning_rate": 3.3275450468100764e-05, + "loss": 11.8843, + "step": 27199 + }, + { + "epoch": 1.4811487070579186, + "grad_norm": 0.5040527822746074, + "learning_rate": 3.3268882554933114e-05, + "loss": 11.8946, + "step": 27200 + }, + { + "epoch": 1.4812031610545016, + "grad_norm": 0.5543580122913088, + "learning_rate": 3.326231516067782e-05, + "loss": 11.8207, + "step": 27201 + }, + { + "epoch": 1.4812576150510846, + "grad_norm": 0.6029877438599717, + "learning_rate": 3.325574828538599e-05, + "loss": 11.9576, + "step": 27202 + }, + { + "epoch": 1.4813120690476675, + "grad_norm": 0.5570732431349827, + "learning_rate": 3.324918192910863e-05, + "loss": 11.6534, + "step": 27203 + }, + { + "epoch": 1.4813665230442508, + "grad_norm": 0.5318652584892374, + "learning_rate": 3.324261609189685e-05, + "loss": 11.9389, + "step": 27204 + }, + { + "epoch": 1.4814209770408338, + "grad_norm": 0.5413052857900214, + "learning_rate": 3.323605077380164e-05, + "loss": 11.7172, + "step": 27205 + }, + { + "epoch": 1.4814754310374167, + "grad_norm": 0.5925252601545924, + "learning_rate": 3.322948597487414e-05, + "loss": 11.8808, + "step": 27206 + }, + { + "epoch": 1.4815298850339997, + "grad_norm": 0.5347771775967283, + "learning_rate": 3.322292169516532e-05, + "loss": 11.9355, + "step": 27207 + }, + { + "epoch": 1.4815843390305827, + "grad_norm": 0.5609134224122345, + "learning_rate": 3.3216357934726294e-05, + "loss": 11.8862, + "step": 27208 + }, + { + "epoch": 1.4816387930271657, + "grad_norm": 0.5312910095736958, + "learning_rate": 3.320979469360808e-05, + "loss": 11.8217, + "step": 27209 + }, + { + "epoch": 1.4816932470237487, + "grad_norm": 0.5650646943245703, + "learning_rate": 3.320323197186165e-05, + "loss": 11.8769, + "step": 27210 + }, + { + "epoch": 1.4817477010203317, + "grad_norm": 0.5883610555504796, + "learning_rate": 3.31966697695381e-05, + "loss": 11.8561, + "step": 27211 + }, + { + "epoch": 1.4818021550169147, + "grad_norm": 0.5382984898612555, + "learning_rate": 3.319010808668848e-05, + "loss": 11.7359, + "step": 27212 + }, + { + "epoch": 1.481856609013498, + "grad_norm": 0.6328079032372491, + "learning_rate": 3.318354692336375e-05, + "loss": 11.9215, + "step": 27213 + }, + { + "epoch": 1.481911063010081, + "grad_norm": 0.5521219231718056, + "learning_rate": 3.3176986279614994e-05, + "loss": 11.7316, + "step": 27214 + }, + { + "epoch": 1.481965517006664, + "grad_norm": 0.5810596107266891, + "learning_rate": 3.317042615549317e-05, + "loss": 12.0288, + "step": 27215 + }, + { + "epoch": 1.482019971003247, + "grad_norm": 0.5162672593751939, + "learning_rate": 3.316386655104935e-05, + "loss": 11.8178, + "step": 27216 + }, + { + "epoch": 1.4820744249998299, + "grad_norm": 0.5837226705935764, + "learning_rate": 3.315730746633448e-05, + "loss": 11.9309, + "step": 27217 + }, + { + "epoch": 1.4821288789964129, + "grad_norm": 0.5838394899791232, + "learning_rate": 3.3150748901399634e-05, + "loss": 11.8939, + "step": 27218 + }, + { + "epoch": 1.4821833329929959, + "grad_norm": 0.49102244179581195, + "learning_rate": 3.314419085629577e-05, + "loss": 11.893, + "step": 27219 + }, + { + "epoch": 1.4822377869895789, + "grad_norm": 0.5228855868017309, + "learning_rate": 3.3137633331073855e-05, + "loss": 11.7481, + "step": 27220 + }, + { + "epoch": 1.4822922409861619, + "grad_norm": 0.5828162795906983, + "learning_rate": 3.313107632578495e-05, + "loss": 11.9261, + "step": 27221 + }, + { + "epoch": 1.4823466949827449, + "grad_norm": 0.5306854490185376, + "learning_rate": 3.3124519840479985e-05, + "loss": 11.8847, + "step": 27222 + }, + { + "epoch": 1.4824011489793278, + "grad_norm": 0.49920585092632636, + "learning_rate": 3.3117963875209955e-05, + "loss": 11.9139, + "step": 27223 + }, + { + "epoch": 1.4824556029759108, + "grad_norm": 0.5191636265092698, + "learning_rate": 3.3111408430025906e-05, + "loss": 11.8197, + "step": 27224 + }, + { + "epoch": 1.4825100569724938, + "grad_norm": 0.5373763862972796, + "learning_rate": 3.310485350497871e-05, + "loss": 11.9339, + "step": 27225 + }, + { + "epoch": 1.4825645109690768, + "grad_norm": 0.5363460680014575, + "learning_rate": 3.3098299100119435e-05, + "loss": 11.8584, + "step": 27226 + }, + { + "epoch": 1.4826189649656598, + "grad_norm": 0.602641280895511, + "learning_rate": 3.309174521549897e-05, + "loss": 11.8895, + "step": 27227 + }, + { + "epoch": 1.482673418962243, + "grad_norm": 0.4825496469900355, + "learning_rate": 3.308519185116834e-05, + "loss": 11.8072, + "step": 27228 + }, + { + "epoch": 1.482727872958826, + "grad_norm": 0.5247850017060505, + "learning_rate": 3.3078639007178495e-05, + "loss": 11.8737, + "step": 27229 + }, + { + "epoch": 1.482782326955409, + "grad_norm": 0.5279798067277318, + "learning_rate": 3.307208668358033e-05, + "loss": 11.8489, + "step": 27230 + }, + { + "epoch": 1.482836780951992, + "grad_norm": 0.5190133333592362, + "learning_rate": 3.306553488042489e-05, + "loss": 11.8621, + "step": 27231 + }, + { + "epoch": 1.482891234948575, + "grad_norm": 0.5516550293797355, + "learning_rate": 3.3058983597763026e-05, + "loss": 11.9184, + "step": 27232 + }, + { + "epoch": 1.482945688945158, + "grad_norm": 0.4975954920367842, + "learning_rate": 3.3052432835645744e-05, + "loss": 11.8549, + "step": 27233 + }, + { + "epoch": 1.483000142941741, + "grad_norm": 0.5627263185035444, + "learning_rate": 3.304588259412399e-05, + "loss": 11.8656, + "step": 27234 + }, + { + "epoch": 1.483054596938324, + "grad_norm": 0.5895512964169496, + "learning_rate": 3.3039332873248666e-05, + "loss": 12.0113, + "step": 27235 + }, + { + "epoch": 1.4831090509349072, + "grad_norm": 0.54498354106306, + "learning_rate": 3.303278367307073e-05, + "loss": 11.7448, + "step": 27236 + }, + { + "epoch": 1.4831635049314902, + "grad_norm": 0.5789294053716956, + "learning_rate": 3.3026234993641095e-05, + "loss": 11.9806, + "step": 27237 + }, + { + "epoch": 1.4832179589280732, + "grad_norm": 0.5721138297062113, + "learning_rate": 3.30196868350107e-05, + "loss": 11.8589, + "step": 27238 + }, + { + "epoch": 1.4832724129246562, + "grad_norm": 0.5906386088344636, + "learning_rate": 3.301313919723046e-05, + "loss": 11.9199, + "step": 27239 + }, + { + "epoch": 1.4833268669212392, + "grad_norm": 0.5347278439720046, + "learning_rate": 3.300659208035124e-05, + "loss": 11.8485, + "step": 27240 + }, + { + "epoch": 1.4833813209178222, + "grad_norm": 0.5051752633238162, + "learning_rate": 3.300004548442404e-05, + "loss": 11.815, + "step": 27241 + }, + { + "epoch": 1.4834357749144051, + "grad_norm": 0.6329251790011808, + "learning_rate": 3.299349940949968e-05, + "loss": 11.9331, + "step": 27242 + }, + { + "epoch": 1.4834902289109881, + "grad_norm": 0.5489740006734455, + "learning_rate": 3.2986953855629145e-05, + "loss": 11.7848, + "step": 27243 + }, + { + "epoch": 1.4835446829075711, + "grad_norm": 0.5848667235338048, + "learning_rate": 3.2980408822863264e-05, + "loss": 11.8227, + "step": 27244 + }, + { + "epoch": 1.4835991369041541, + "grad_norm": 0.5789435272763168, + "learning_rate": 3.297386431125296e-05, + "loss": 11.8886, + "step": 27245 + }, + { + "epoch": 1.4836535909007371, + "grad_norm": 0.529242405696322, + "learning_rate": 3.296732032084916e-05, + "loss": 11.724, + "step": 27246 + }, + { + "epoch": 1.48370804489732, + "grad_norm": 0.5779624953404515, + "learning_rate": 3.296077685170269e-05, + "loss": 11.8703, + "step": 27247 + }, + { + "epoch": 1.483762498893903, + "grad_norm": 0.5223023362809981, + "learning_rate": 3.2954233903864515e-05, + "loss": 11.7858, + "step": 27248 + }, + { + "epoch": 1.483816952890486, + "grad_norm": 0.5962154089421761, + "learning_rate": 3.294769147738541e-05, + "loss": 11.888, + "step": 27249 + }, + { + "epoch": 1.483871406887069, + "grad_norm": 0.5160660724809407, + "learning_rate": 3.2941149572316286e-05, + "loss": 11.8307, + "step": 27250 + }, + { + "epoch": 1.4839258608836523, + "grad_norm": 0.5809806957806917, + "learning_rate": 3.2934608188708075e-05, + "loss": 11.8838, + "step": 27251 + }, + { + "epoch": 1.4839803148802353, + "grad_norm": 0.48361595968755866, + "learning_rate": 3.292806732661156e-05, + "loss": 11.8405, + "step": 27252 + }, + { + "epoch": 1.4840347688768183, + "grad_norm": 0.4974798421988643, + "learning_rate": 3.292152698607768e-05, + "loss": 11.8206, + "step": 27253 + }, + { + "epoch": 1.4840892228734013, + "grad_norm": 0.5968044592205556, + "learning_rate": 3.2914987167157206e-05, + "loss": 11.8325, + "step": 27254 + }, + { + "epoch": 1.4841436768699843, + "grad_norm": 0.5455372703310573, + "learning_rate": 3.2908447869901095e-05, + "loss": 11.8612, + "step": 27255 + }, + { + "epoch": 1.4841981308665673, + "grad_norm": 0.616961306700275, + "learning_rate": 3.2901909094360096e-05, + "loss": 12.0473, + "step": 27256 + }, + { + "epoch": 1.4842525848631503, + "grad_norm": 0.5819114537843179, + "learning_rate": 3.2895370840585104e-05, + "loss": 11.7873, + "step": 27257 + }, + { + "epoch": 1.4843070388597333, + "grad_norm": 0.5389869348792732, + "learning_rate": 3.2888833108627035e-05, + "loss": 11.851, + "step": 27258 + }, + { + "epoch": 1.4843614928563162, + "grad_norm": 0.5731211277536921, + "learning_rate": 3.2882295898536584e-05, + "loss": 11.8152, + "step": 27259 + }, + { + "epoch": 1.4844159468528995, + "grad_norm": 0.5025569124630986, + "learning_rate": 3.287575921036467e-05, + "loss": 11.8047, + "step": 27260 + }, + { + "epoch": 1.4844704008494825, + "grad_norm": 0.5881762750936129, + "learning_rate": 3.2869223044162126e-05, + "loss": 11.8476, + "step": 27261 + }, + { + "epoch": 1.4845248548460654, + "grad_norm": 0.5608032378207175, + "learning_rate": 3.286268739997973e-05, + "loss": 11.863, + "step": 27262 + }, + { + "epoch": 1.4845793088426484, + "grad_norm": 0.5110285164030312, + "learning_rate": 3.285615227786838e-05, + "loss": 11.8661, + "step": 27263 + }, + { + "epoch": 1.4846337628392314, + "grad_norm": 0.704162504125869, + "learning_rate": 3.2849617677878805e-05, + "loss": 11.7386, + "step": 27264 + }, + { + "epoch": 1.4846882168358144, + "grad_norm": 0.5746161075372516, + "learning_rate": 3.2843083600061895e-05, + "loss": 11.6459, + "step": 27265 + }, + { + "epoch": 1.4847426708323974, + "grad_norm": 0.5199599932562631, + "learning_rate": 3.28365500444684e-05, + "loss": 11.73, + "step": 27266 + }, + { + "epoch": 1.4847971248289804, + "grad_norm": 0.5906330491008318, + "learning_rate": 3.283001701114916e-05, + "loss": 12.0179, + "step": 27267 + }, + { + "epoch": 1.4848515788255634, + "grad_norm": 0.5395303467151444, + "learning_rate": 3.282348450015503e-05, + "loss": 12.0826, + "step": 27268 + }, + { + "epoch": 1.4849060328221464, + "grad_norm": 0.51679973935423, + "learning_rate": 3.2816952511536705e-05, + "loss": 11.7768, + "step": 27269 + }, + { + "epoch": 1.4849604868187294, + "grad_norm": 0.5479399524253692, + "learning_rate": 3.281042104534503e-05, + "loss": 11.8409, + "step": 27270 + }, + { + "epoch": 1.4850149408153124, + "grad_norm": 0.5411507669843482, + "learning_rate": 3.280389010163077e-05, + "loss": 11.9591, + "step": 27271 + }, + { + "epoch": 1.4850693948118954, + "grad_norm": 0.5923250906597739, + "learning_rate": 3.279735968044473e-05, + "loss": 11.976, + "step": 27272 + }, + { + "epoch": 1.4851238488084784, + "grad_norm": 0.4882082373529916, + "learning_rate": 3.279082978183772e-05, + "loss": 11.8757, + "step": 27273 + }, + { + "epoch": 1.4851783028050616, + "grad_norm": 0.5303569538341536, + "learning_rate": 3.2784300405860455e-05, + "loss": 11.9316, + "step": 27274 + }, + { + "epoch": 1.4852327568016446, + "grad_norm": 0.5347246877158316, + "learning_rate": 3.277777155256378e-05, + "loss": 11.7729, + "step": 27275 + }, + { + "epoch": 1.4852872107982276, + "grad_norm": 0.5595970988754599, + "learning_rate": 3.2771243221998384e-05, + "loss": 11.869, + "step": 27276 + }, + { + "epoch": 1.4853416647948106, + "grad_norm": 0.5650652753851264, + "learning_rate": 3.2764715414215106e-05, + "loss": 11.8208, + "step": 27277 + }, + { + "epoch": 1.4853961187913935, + "grad_norm": 0.5484065079949246, + "learning_rate": 3.275818812926469e-05, + "loss": 11.8264, + "step": 27278 + }, + { + "epoch": 1.4854505727879765, + "grad_norm": 0.5189689296845389, + "learning_rate": 3.2751661367197836e-05, + "loss": 11.667, + "step": 27279 + }, + { + "epoch": 1.4855050267845595, + "grad_norm": 0.5436340697901008, + "learning_rate": 3.2745135128065386e-05, + "loss": 11.914, + "step": 27280 + }, + { + "epoch": 1.4855594807811425, + "grad_norm": 0.5124001145040258, + "learning_rate": 3.2738609411918e-05, + "loss": 11.951, + "step": 27281 + }, + { + "epoch": 1.4856139347777255, + "grad_norm": 0.5277418345053874, + "learning_rate": 3.2732084218806467e-05, + "loss": 11.8618, + "step": 27282 + }, + { + "epoch": 1.4856683887743087, + "grad_norm": 0.6326896215702091, + "learning_rate": 3.272555954878157e-05, + "loss": 11.7973, + "step": 27283 + }, + { + "epoch": 1.4857228427708917, + "grad_norm": 0.5674423458562502, + "learning_rate": 3.271903540189396e-05, + "loss": 11.9219, + "step": 27284 + }, + { + "epoch": 1.4857772967674747, + "grad_norm": 0.5087084102268827, + "learning_rate": 3.271251177819446e-05, + "loss": 11.9181, + "step": 27285 + }, + { + "epoch": 1.4858317507640577, + "grad_norm": 0.8901096032229812, + "learning_rate": 3.27059886777337e-05, + "loss": 11.8686, + "step": 27286 + }, + { + "epoch": 1.4858862047606407, + "grad_norm": 0.5073872482333592, + "learning_rate": 3.2699466100562504e-05, + "loss": 11.7898, + "step": 27287 + }, + { + "epoch": 1.4859406587572237, + "grad_norm": 0.5211698973456306, + "learning_rate": 3.269294404673155e-05, + "loss": 11.8486, + "step": 27288 + }, + { + "epoch": 1.4859951127538067, + "grad_norm": 0.5368291141901312, + "learning_rate": 3.2686422516291504e-05, + "loss": 11.8866, + "step": 27289 + }, + { + "epoch": 1.4860495667503897, + "grad_norm": 0.5693300384485326, + "learning_rate": 3.2679901509293164e-05, + "loss": 11.8772, + "step": 27290 + }, + { + "epoch": 1.4861040207469727, + "grad_norm": 0.5215937928340507, + "learning_rate": 3.2673381025787165e-05, + "loss": 11.869, + "step": 27291 + }, + { + "epoch": 1.4861584747435557, + "grad_norm": 0.5708249218961806, + "learning_rate": 3.266686106582428e-05, + "loss": 11.8806, + "step": 27292 + }, + { + "epoch": 1.4862129287401387, + "grad_norm": 0.63677097635389, + "learning_rate": 3.266034162945515e-05, + "loss": 11.8537, + "step": 27293 + }, + { + "epoch": 1.4862673827367217, + "grad_norm": 0.5857846513150303, + "learning_rate": 3.265382271673048e-05, + "loss": 11.8732, + "step": 27294 + }, + { + "epoch": 1.4863218367333046, + "grad_norm": 0.520796712326528, + "learning_rate": 3.2647304327701034e-05, + "loss": 11.7786, + "step": 27295 + }, + { + "epoch": 1.4863762907298876, + "grad_norm": 0.5983854752980385, + "learning_rate": 3.264078646241739e-05, + "loss": 11.9918, + "step": 27296 + }, + { + "epoch": 1.4864307447264706, + "grad_norm": 0.5101282818263799, + "learning_rate": 3.263426912093033e-05, + "loss": 11.6416, + "step": 27297 + }, + { + "epoch": 1.4864851987230538, + "grad_norm": 0.5038209482467594, + "learning_rate": 3.2627752303290496e-05, + "loss": 11.7531, + "step": 27298 + }, + { + "epoch": 1.4865396527196368, + "grad_norm": 0.5320127817485272, + "learning_rate": 3.262123600954852e-05, + "loss": 11.7726, + "step": 27299 + }, + { + "epoch": 1.4865941067162198, + "grad_norm": 0.5398572104230178, + "learning_rate": 3.2614720239755135e-05, + "loss": 11.7259, + "step": 27300 + }, + { + "epoch": 1.4866485607128028, + "grad_norm": 0.5973062249520685, + "learning_rate": 3.2608204993960964e-05, + "loss": 11.9197, + "step": 27301 + }, + { + "epoch": 1.4867030147093858, + "grad_norm": 0.5978273206228052, + "learning_rate": 3.260169027221672e-05, + "loss": 11.8774, + "step": 27302 + }, + { + "epoch": 1.4867574687059688, + "grad_norm": 0.6872509022041908, + "learning_rate": 3.259517607457301e-05, + "loss": 11.906, + "step": 27303 + }, + { + "epoch": 1.4868119227025518, + "grad_norm": 0.5169416606520894, + "learning_rate": 3.258866240108054e-05, + "loss": 11.8721, + "step": 27304 + }, + { + "epoch": 1.4868663766991348, + "grad_norm": 0.5240611112350078, + "learning_rate": 3.25821492517899e-05, + "loss": 11.7707, + "step": 27305 + }, + { + "epoch": 1.486920830695718, + "grad_norm": 0.5654095582004188, + "learning_rate": 3.257563662675178e-05, + "loss": 11.8115, + "step": 27306 + }, + { + "epoch": 1.486975284692301, + "grad_norm": 0.5085421091455207, + "learning_rate": 3.256912452601685e-05, + "loss": 11.8982, + "step": 27307 + }, + { + "epoch": 1.487029738688884, + "grad_norm": 0.5707968444123652, + "learning_rate": 3.256261294963572e-05, + "loss": 11.7993, + "step": 27308 + }, + { + "epoch": 1.487084192685467, + "grad_norm": 0.50828426846241, + "learning_rate": 3.2556101897658976e-05, + "loss": 11.9359, + "step": 27309 + }, + { + "epoch": 1.48713864668205, + "grad_norm": 0.5131446602261969, + "learning_rate": 3.254959137013733e-05, + "loss": 11.7652, + "step": 27310 + }, + { + "epoch": 1.487193100678633, + "grad_norm": 0.5417230565673917, + "learning_rate": 3.254308136712135e-05, + "loss": 12.0064, + "step": 27311 + }, + { + "epoch": 1.487247554675216, + "grad_norm": 0.557403104913808, + "learning_rate": 3.2536571888661706e-05, + "loss": 11.9038, + "step": 27312 + }, + { + "epoch": 1.487302008671799, + "grad_norm": 0.5115471585697475, + "learning_rate": 3.253006293480897e-05, + "loss": 11.8616, + "step": 27313 + }, + { + "epoch": 1.487356462668382, + "grad_norm": 0.5656769013152173, + "learning_rate": 3.2523554505613796e-05, + "loss": 11.8502, + "step": 27314 + }, + { + "epoch": 1.487410916664965, + "grad_norm": 0.4902086284984136, + "learning_rate": 3.251704660112675e-05, + "loss": 11.773, + "step": 27315 + }, + { + "epoch": 1.487465370661548, + "grad_norm": 0.5067827185346057, + "learning_rate": 3.2510539221398475e-05, + "loss": 11.8763, + "step": 27316 + }, + { + "epoch": 1.487519824658131, + "grad_norm": 0.5197292409505939, + "learning_rate": 3.25040323664796e-05, + "loss": 11.8191, + "step": 27317 + }, + { + "epoch": 1.487574278654714, + "grad_norm": 0.5697866881209924, + "learning_rate": 3.2497526036420644e-05, + "loss": 11.8394, + "step": 27318 + }, + { + "epoch": 1.487628732651297, + "grad_norm": 0.5313578729803738, + "learning_rate": 3.2491020231272317e-05, + "loss": 11.7565, + "step": 27319 + }, + { + "epoch": 1.48768318664788, + "grad_norm": 0.5369869662143938, + "learning_rate": 3.2484514951085065e-05, + "loss": 11.9415, + "step": 27320 + }, + { + "epoch": 1.4877376406444631, + "grad_norm": 0.5238407392481245, + "learning_rate": 3.2478010195909556e-05, + "loss": 11.8026, + "step": 27321 + }, + { + "epoch": 1.4877920946410461, + "grad_norm": 0.5856891411990525, + "learning_rate": 3.247150596579639e-05, + "loss": 11.9195, + "step": 27322 + }, + { + "epoch": 1.487846548637629, + "grad_norm": 0.5187091122853142, + "learning_rate": 3.24650022607961e-05, + "loss": 11.9496, + "step": 27323 + }, + { + "epoch": 1.487901002634212, + "grad_norm": 0.5612849698542438, + "learning_rate": 3.245849908095929e-05, + "loss": 11.8396, + "step": 27324 + }, + { + "epoch": 1.487955456630795, + "grad_norm": 0.5008938401427246, + "learning_rate": 3.245199642633649e-05, + "loss": 11.7777, + "step": 27325 + }, + { + "epoch": 1.488009910627378, + "grad_norm": 0.5242050525716808, + "learning_rate": 3.244549429697834e-05, + "loss": 11.962, + "step": 27326 + }, + { + "epoch": 1.488064364623961, + "grad_norm": 0.5871022906923379, + "learning_rate": 3.243899269293531e-05, + "loss": 11.8592, + "step": 27327 + }, + { + "epoch": 1.488118818620544, + "grad_norm": 0.5388255460765787, + "learning_rate": 3.243249161425801e-05, + "loss": 11.6366, + "step": 27328 + }, + { + "epoch": 1.488173272617127, + "grad_norm": 0.5691555834204939, + "learning_rate": 3.242599106099704e-05, + "loss": 11.8457, + "step": 27329 + }, + { + "epoch": 1.4882277266137103, + "grad_norm": 0.53414967703767, + "learning_rate": 3.2419491033202843e-05, + "loss": 11.8391, + "step": 27330 + }, + { + "epoch": 1.4882821806102933, + "grad_norm": 0.5408069509464369, + "learning_rate": 3.241299153092601e-05, + "loss": 11.8947, + "step": 27331 + }, + { + "epoch": 1.4883366346068763, + "grad_norm": 0.5642851220105962, + "learning_rate": 3.240649255421713e-05, + "loss": 11.8473, + "step": 27332 + }, + { + "epoch": 1.4883910886034593, + "grad_norm": 0.5484846794469815, + "learning_rate": 3.239999410312665e-05, + "loss": 11.9208, + "step": 27333 + }, + { + "epoch": 1.4884455426000422, + "grad_norm": 0.47128622223821753, + "learning_rate": 3.23934961777052e-05, + "loss": 11.8681, + "step": 27334 + }, + { + "epoch": 1.4884999965966252, + "grad_norm": 0.5297904201959726, + "learning_rate": 3.238699877800322e-05, + "loss": 11.9234, + "step": 27335 + }, + { + "epoch": 1.4885544505932082, + "grad_norm": 0.4994056879425562, + "learning_rate": 3.2380501904071315e-05, + "loss": 11.8726, + "step": 27336 + }, + { + "epoch": 1.4886089045897912, + "grad_norm": 0.6012553319637874, + "learning_rate": 3.237400555595993e-05, + "loss": 11.8352, + "step": 27337 + }, + { + "epoch": 1.4886633585863742, + "grad_norm": 0.5717342952793103, + "learning_rate": 3.236750973371966e-05, + "loss": 11.7764, + "step": 27338 + }, + { + "epoch": 1.4887178125829572, + "grad_norm": 0.5721044207906831, + "learning_rate": 3.236101443740097e-05, + "loss": 11.9964, + "step": 27339 + }, + { + "epoch": 1.4887722665795402, + "grad_norm": 0.543351701834864, + "learning_rate": 3.235451966705434e-05, + "loss": 11.6909, + "step": 27340 + }, + { + "epoch": 1.4888267205761232, + "grad_norm": 0.5593434669192275, + "learning_rate": 3.234802542273034e-05, + "loss": 11.8715, + "step": 27341 + }, + { + "epoch": 1.4888811745727062, + "grad_norm": 0.5956820363149719, + "learning_rate": 3.234153170447941e-05, + "loss": 11.8743, + "step": 27342 + }, + { + "epoch": 1.4889356285692892, + "grad_norm": 0.5586765541934234, + "learning_rate": 3.233503851235208e-05, + "loss": 11.86, + "step": 27343 + }, + { + "epoch": 1.4889900825658724, + "grad_norm": 0.5375570701571059, + "learning_rate": 3.232854584639887e-05, + "loss": 11.8627, + "step": 27344 + }, + { + "epoch": 1.4890445365624554, + "grad_norm": 0.5788137075225459, + "learning_rate": 3.2322053706670194e-05, + "loss": 11.8417, + "step": 27345 + }, + { + "epoch": 1.4890989905590384, + "grad_norm": 0.5739870435953575, + "learning_rate": 3.231556209321662e-05, + "loss": 11.8072, + "step": 27346 + }, + { + "epoch": 1.4891534445556214, + "grad_norm": 0.5200634454133051, + "learning_rate": 3.2309071006088554e-05, + "loss": 11.8978, + "step": 27347 + }, + { + "epoch": 1.4892078985522044, + "grad_norm": 0.5267569385414985, + "learning_rate": 3.230258044533653e-05, + "loss": 11.791, + "step": 27348 + }, + { + "epoch": 1.4892623525487874, + "grad_norm": 0.5257919766051812, + "learning_rate": 3.229609041101099e-05, + "loss": 11.7765, + "step": 27349 + }, + { + "epoch": 1.4893168065453704, + "grad_norm": 0.5379618945486823, + "learning_rate": 3.228960090316239e-05, + "loss": 11.7958, + "step": 27350 + }, + { + "epoch": 1.4893712605419533, + "grad_norm": 0.5464085762061488, + "learning_rate": 3.228311192184122e-05, + "loss": 11.9128, + "step": 27351 + }, + { + "epoch": 1.4894257145385363, + "grad_norm": 0.5120607901818375, + "learning_rate": 3.227662346709791e-05, + "loss": 11.8575, + "step": 27352 + }, + { + "epoch": 1.4894801685351196, + "grad_norm": 0.555661724649654, + "learning_rate": 3.227013553898296e-05, + "loss": 11.8716, + "step": 27353 + }, + { + "epoch": 1.4895346225317025, + "grad_norm": 0.5199119991369252, + "learning_rate": 3.2263648137546765e-05, + "loss": 11.8153, + "step": 27354 + }, + { + "epoch": 1.4895890765282855, + "grad_norm": 0.514488265265781, + "learning_rate": 3.22571612628398e-05, + "loss": 11.7517, + "step": 27355 + }, + { + "epoch": 1.4896435305248685, + "grad_norm": 0.5500673005441195, + "learning_rate": 3.2250674914912536e-05, + "loss": 11.918, + "step": 27356 + }, + { + "epoch": 1.4896979845214515, + "grad_norm": 0.6316095639101168, + "learning_rate": 3.224418909381536e-05, + "loss": 11.861, + "step": 27357 + }, + { + "epoch": 1.4897524385180345, + "grad_norm": 0.5401492324875024, + "learning_rate": 3.223770379959876e-05, + "loss": 11.827, + "step": 27358 + }, + { + "epoch": 1.4898068925146175, + "grad_norm": 0.5351032328964, + "learning_rate": 3.2231219032313144e-05, + "loss": 11.917, + "step": 27359 + }, + { + "epoch": 1.4898613465112005, + "grad_norm": 0.5419580382352344, + "learning_rate": 3.222473479200889e-05, + "loss": 11.8503, + "step": 27360 + }, + { + "epoch": 1.4899158005077835, + "grad_norm": 0.5611074554911892, + "learning_rate": 3.2218251078736505e-05, + "loss": 11.9644, + "step": 27361 + }, + { + "epoch": 1.4899702545043665, + "grad_norm": 0.5479997393111928, + "learning_rate": 3.2211767892546344e-05, + "loss": 11.8396, + "step": 27362 + }, + { + "epoch": 1.4900247085009495, + "grad_norm": 0.566210287760261, + "learning_rate": 3.220528523348886e-05, + "loss": 11.7726, + "step": 27363 + }, + { + "epoch": 1.4900791624975325, + "grad_norm": 0.5335510246762156, + "learning_rate": 3.2198803101614425e-05, + "loss": 11.9138, + "step": 27364 + }, + { + "epoch": 1.4901336164941155, + "grad_norm": 0.5369260579828239, + "learning_rate": 3.2192321496973465e-05, + "loss": 11.6365, + "step": 27365 + }, + { + "epoch": 1.4901880704906985, + "grad_norm": 0.5763778858222389, + "learning_rate": 3.218584041961641e-05, + "loss": 11.9058, + "step": 27366 + }, + { + "epoch": 1.4902425244872815, + "grad_norm": 0.5695051398147223, + "learning_rate": 3.217935986959361e-05, + "loss": 11.9074, + "step": 27367 + }, + { + "epoch": 1.4902969784838647, + "grad_norm": 0.5087613081774589, + "learning_rate": 3.217287984695551e-05, + "loss": 11.7241, + "step": 27368 + }, + { + "epoch": 1.4903514324804477, + "grad_norm": 0.5395579202495738, + "learning_rate": 3.2166400351752465e-05, + "loss": 11.7826, + "step": 27369 + }, + { + "epoch": 1.4904058864770307, + "grad_norm": 0.5027300400031914, + "learning_rate": 3.215992138403484e-05, + "loss": 11.7519, + "step": 27370 + }, + { + "epoch": 1.4904603404736136, + "grad_norm": 0.5456917563778277, + "learning_rate": 3.215344294385307e-05, + "loss": 11.8791, + "step": 27371 + }, + { + "epoch": 1.4905147944701966, + "grad_norm": 0.5098160299614349, + "learning_rate": 3.214696503125748e-05, + "loss": 11.7655, + "step": 27372 + }, + { + "epoch": 1.4905692484667796, + "grad_norm": 0.5985508874443944, + "learning_rate": 3.21404876462985e-05, + "loss": 11.8313, + "step": 27373 + }, + { + "epoch": 1.4906237024633626, + "grad_norm": 0.5881527697539021, + "learning_rate": 3.213401078902644e-05, + "loss": 11.9214, + "step": 27374 + }, + { + "epoch": 1.4906781564599456, + "grad_norm": 0.5772487234516996, + "learning_rate": 3.212753445949172e-05, + "loss": 11.7601, + "step": 27375 + }, + { + "epoch": 1.4907326104565288, + "grad_norm": 0.5953524314552884, + "learning_rate": 3.212105865774464e-05, + "loss": 11.8854, + "step": 27376 + }, + { + "epoch": 1.4907870644531118, + "grad_norm": 0.5253379304485232, + "learning_rate": 3.2114583383835595e-05, + "loss": 11.7607, + "step": 27377 + }, + { + "epoch": 1.4908415184496948, + "grad_norm": 0.5686002338526486, + "learning_rate": 3.210810863781496e-05, + "loss": 11.7952, + "step": 27378 + }, + { + "epoch": 1.4908959724462778, + "grad_norm": 0.5781451795322011, + "learning_rate": 3.210163441973306e-05, + "loss": 11.8081, + "step": 27379 + }, + { + "epoch": 1.4909504264428608, + "grad_norm": 0.550138091581712, + "learning_rate": 3.209516072964022e-05, + "loss": 11.8952, + "step": 27380 + }, + { + "epoch": 1.4910048804394438, + "grad_norm": 0.5790573792479207, + "learning_rate": 3.2088687567586804e-05, + "loss": 11.7505, + "step": 27381 + }, + { + "epoch": 1.4910593344360268, + "grad_norm": 0.5289096816977832, + "learning_rate": 3.2082214933623134e-05, + "loss": 11.7907, + "step": 27382 + }, + { + "epoch": 1.4911137884326098, + "grad_norm": 0.46653037365099215, + "learning_rate": 3.207574282779957e-05, + "loss": 11.8671, + "step": 27383 + }, + { + "epoch": 1.4911682424291928, + "grad_norm": 0.5352656293213426, + "learning_rate": 3.2069271250166385e-05, + "loss": 11.8939, + "step": 27384 + }, + { + "epoch": 1.4912226964257758, + "grad_norm": 0.4975280705881872, + "learning_rate": 3.206280020077398e-05, + "loss": 11.9769, + "step": 27385 + }, + { + "epoch": 1.4912771504223588, + "grad_norm": 0.5429788044732909, + "learning_rate": 3.2056329679672606e-05, + "loss": 11.8679, + "step": 27386 + }, + { + "epoch": 1.4913316044189417, + "grad_norm": 0.551843620543023, + "learning_rate": 3.2049859686912584e-05, + "loss": 11.7358, + "step": 27387 + }, + { + "epoch": 1.4913860584155247, + "grad_norm": 0.52908555993108, + "learning_rate": 3.20433902225443e-05, + "loss": 11.7486, + "step": 27388 + }, + { + "epoch": 1.4914405124121077, + "grad_norm": 0.8751017592020649, + "learning_rate": 3.2036921286618e-05, + "loss": 11.9371, + "step": 27389 + }, + { + "epoch": 1.4914949664086907, + "grad_norm": 0.6231913141871868, + "learning_rate": 3.2030452879184014e-05, + "loss": 11.8704, + "step": 27390 + }, + { + "epoch": 1.491549420405274, + "grad_norm": 0.5603397525587256, + "learning_rate": 3.2023985000292575e-05, + "loss": 11.8782, + "step": 27391 + }, + { + "epoch": 1.491603874401857, + "grad_norm": 0.6754076910478701, + "learning_rate": 3.201751764999403e-05, + "loss": 11.9525, + "step": 27392 + }, + { + "epoch": 1.49165832839844, + "grad_norm": 0.5935902729391448, + "learning_rate": 3.201105082833872e-05, + "loss": 11.8441, + "step": 27393 + }, + { + "epoch": 1.491712782395023, + "grad_norm": 0.5449317540943305, + "learning_rate": 3.200458453537682e-05, + "loss": 11.8019, + "step": 27394 + }, + { + "epoch": 1.491767236391606, + "grad_norm": 0.5499611987491785, + "learning_rate": 3.199811877115873e-05, + "loss": 11.8566, + "step": 27395 + }, + { + "epoch": 1.491821690388189, + "grad_norm": 0.5725463749876839, + "learning_rate": 3.199165353573462e-05, + "loss": 11.7469, + "step": 27396 + }, + { + "epoch": 1.491876144384772, + "grad_norm": 0.5345352879477128, + "learning_rate": 3.198518882915487e-05, + "loss": 11.8076, + "step": 27397 + }, + { + "epoch": 1.4919305983813549, + "grad_norm": 0.5150328479226377, + "learning_rate": 3.1978724651469647e-05, + "loss": 11.8037, + "step": 27398 + }, + { + "epoch": 1.4919850523779379, + "grad_norm": 0.5672482133986002, + "learning_rate": 3.197226100272931e-05, + "loss": 11.9698, + "step": 27399 + }, + { + "epoch": 1.492039506374521, + "grad_norm": 0.540766156130379, + "learning_rate": 3.196579788298407e-05, + "loss": 11.7337, + "step": 27400 + }, + { + "epoch": 1.492093960371104, + "grad_norm": 0.5686484632273681, + "learning_rate": 3.1959335292284175e-05, + "loss": 11.9007, + "step": 27401 + }, + { + "epoch": 1.492148414367687, + "grad_norm": 0.5120570343678219, + "learning_rate": 3.1952873230679926e-05, + "loss": 11.7939, + "step": 27402 + }, + { + "epoch": 1.49220286836427, + "grad_norm": 0.5517134705371104, + "learning_rate": 3.194641169822152e-05, + "loss": 11.6883, + "step": 27403 + }, + { + "epoch": 1.492257322360853, + "grad_norm": 0.5242483670753166, + "learning_rate": 3.193995069495922e-05, + "loss": 11.7722, + "step": 27404 + }, + { + "epoch": 1.492311776357436, + "grad_norm": 0.6385521546705989, + "learning_rate": 3.193349022094332e-05, + "loss": 11.9355, + "step": 27405 + }, + { + "epoch": 1.492366230354019, + "grad_norm": 0.5338069368985067, + "learning_rate": 3.1927030276223966e-05, + "loss": 11.7156, + "step": 27406 + }, + { + "epoch": 1.492420684350602, + "grad_norm": 0.5373913974375919, + "learning_rate": 3.192057086085148e-05, + "loss": 11.8471, + "step": 27407 + }, + { + "epoch": 1.492475138347185, + "grad_norm": 0.5928493974126836, + "learning_rate": 3.1914111974876026e-05, + "loss": 11.7267, + "step": 27408 + }, + { + "epoch": 1.492529592343768, + "grad_norm": 0.5168927346027926, + "learning_rate": 3.1907653618347886e-05, + "loss": 11.8028, + "step": 27409 + }, + { + "epoch": 1.492584046340351, + "grad_norm": 0.5313684037282164, + "learning_rate": 3.190119579131725e-05, + "loss": 11.8494, + "step": 27410 + }, + { + "epoch": 1.492638500336934, + "grad_norm": 0.5136660413035197, + "learning_rate": 3.18947384938343e-05, + "loss": 11.7845, + "step": 27411 + }, + { + "epoch": 1.492692954333517, + "grad_norm": 0.5357556278925202, + "learning_rate": 3.188828172594932e-05, + "loss": 11.8182, + "step": 27412 + }, + { + "epoch": 1.4927474083301, + "grad_norm": 0.5275353878504169, + "learning_rate": 3.188182548771245e-05, + "loss": 11.7841, + "step": 27413 + }, + { + "epoch": 1.4928018623266832, + "grad_norm": 0.5064172315746848, + "learning_rate": 3.1875369779173924e-05, + "loss": 11.7459, + "step": 27414 + }, + { + "epoch": 1.4928563163232662, + "grad_norm": 0.5470100177461124, + "learning_rate": 3.1868914600383994e-05, + "loss": 11.9278, + "step": 27415 + }, + { + "epoch": 1.4929107703198492, + "grad_norm": 0.4988042980869149, + "learning_rate": 3.186245995139276e-05, + "loss": 11.8884, + "step": 27416 + }, + { + "epoch": 1.4929652243164322, + "grad_norm": 0.542600917398713, + "learning_rate": 3.18560058322505e-05, + "loss": 11.7948, + "step": 27417 + }, + { + "epoch": 1.4930196783130152, + "grad_norm": 0.5337346171684353, + "learning_rate": 3.184955224300734e-05, + "loss": 11.8023, + "step": 27418 + }, + { + "epoch": 1.4930741323095982, + "grad_norm": 0.5068117506775958, + "learning_rate": 3.184309918371352e-05, + "loss": 11.751, + "step": 27419 + }, + { + "epoch": 1.4931285863061812, + "grad_norm": 0.5335797519704031, + "learning_rate": 3.18366466544192e-05, + "loss": 11.76, + "step": 27420 + }, + { + "epoch": 1.4931830403027642, + "grad_norm": 0.5469394051136713, + "learning_rate": 3.1830194655174505e-05, + "loss": 11.7379, + "step": 27421 + }, + { + "epoch": 1.4932374942993472, + "grad_norm": 0.5380651376103969, + "learning_rate": 3.1823743186029675e-05, + "loss": 11.9291, + "step": 27422 + }, + { + "epoch": 1.4932919482959304, + "grad_norm": 0.50377001974569, + "learning_rate": 3.1817292247034824e-05, + "loss": 11.8882, + "step": 27423 + }, + { + "epoch": 1.4933464022925134, + "grad_norm": 0.5398332871826095, + "learning_rate": 3.181084183824018e-05, + "loss": 11.8276, + "step": 27424 + }, + { + "epoch": 1.4934008562890964, + "grad_norm": 0.5402220267565918, + "learning_rate": 3.180439195969582e-05, + "loss": 11.8085, + "step": 27425 + }, + { + "epoch": 1.4934553102856793, + "grad_norm": 0.5355265862902128, + "learning_rate": 3.179794261145196e-05, + "loss": 11.8738, + "step": 27426 + }, + { + "epoch": 1.4935097642822623, + "grad_norm": 0.55237386828305, + "learning_rate": 3.179149379355877e-05, + "loss": 11.8119, + "step": 27427 + }, + { + "epoch": 1.4935642182788453, + "grad_norm": 0.497504691561615, + "learning_rate": 3.178504550606631e-05, + "loss": 11.9204, + "step": 27428 + }, + { + "epoch": 1.4936186722754283, + "grad_norm": 0.5127452387862489, + "learning_rate": 3.177859774902483e-05, + "loss": 11.8487, + "step": 27429 + }, + { + "epoch": 1.4936731262720113, + "grad_norm": 0.5758347834761272, + "learning_rate": 3.17721505224844e-05, + "loss": 11.9149, + "step": 27430 + }, + { + "epoch": 1.4937275802685943, + "grad_norm": 0.5547243846772557, + "learning_rate": 3.1765703826495144e-05, + "loss": 11.7884, + "step": 27431 + }, + { + "epoch": 1.4937820342651773, + "grad_norm": 0.5571913268408929, + "learning_rate": 3.1759257661107245e-05, + "loss": 11.715, + "step": 27432 + }, + { + "epoch": 1.4938364882617603, + "grad_norm": 0.5442979747736589, + "learning_rate": 3.175281202637077e-05, + "loss": 11.8839, + "step": 27433 + }, + { + "epoch": 1.4938909422583433, + "grad_norm": 0.5704327202416556, + "learning_rate": 3.174636692233591e-05, + "loss": 11.8487, + "step": 27434 + }, + { + "epoch": 1.4939453962549263, + "grad_norm": 0.5372537584832451, + "learning_rate": 3.173992234905272e-05, + "loss": 11.8977, + "step": 27435 + }, + { + "epoch": 1.4939998502515093, + "grad_norm": 0.575209458369802, + "learning_rate": 3.173347830657133e-05, + "loss": 11.9214, + "step": 27436 + }, + { + "epoch": 1.4940543042480925, + "grad_norm": 0.5796686629508384, + "learning_rate": 3.17270347949419e-05, + "loss": 11.8689, + "step": 27437 + }, + { + "epoch": 1.4941087582446755, + "grad_norm": 0.5433666967461873, + "learning_rate": 3.1720591814214464e-05, + "loss": 11.9079, + "step": 27438 + }, + { + "epoch": 1.4941632122412585, + "grad_norm": 0.5236638767969538, + "learning_rate": 3.1714149364439215e-05, + "loss": 11.852, + "step": 27439 + }, + { + "epoch": 1.4942176662378415, + "grad_norm": 0.5577906353201473, + "learning_rate": 3.1707707445666135e-05, + "loss": 11.8241, + "step": 27440 + }, + { + "epoch": 1.4942721202344245, + "grad_norm": 0.49045688724140346, + "learning_rate": 3.1701266057945376e-05, + "loss": 11.8646, + "step": 27441 + }, + { + "epoch": 1.4943265742310075, + "grad_norm": 0.5170163146468425, + "learning_rate": 3.169482520132705e-05, + "loss": 11.8163, + "step": 27442 + }, + { + "epoch": 1.4943810282275904, + "grad_norm": 0.5074604276050942, + "learning_rate": 3.168838487586119e-05, + "loss": 11.8348, + "step": 27443 + }, + { + "epoch": 1.4944354822241734, + "grad_norm": 0.4994581872740865, + "learning_rate": 3.168194508159794e-05, + "loss": 11.8019, + "step": 27444 + }, + { + "epoch": 1.4944899362207564, + "grad_norm": 0.5539482960695543, + "learning_rate": 3.16755058185873e-05, + "loss": 11.9257, + "step": 27445 + }, + { + "epoch": 1.4945443902173396, + "grad_norm": 0.5394138641393187, + "learning_rate": 3.166906708687943e-05, + "loss": 11.8341, + "step": 27446 + }, + { + "epoch": 1.4945988442139226, + "grad_norm": 0.5958720947355558, + "learning_rate": 3.1662628886524314e-05, + "loss": 11.7851, + "step": 27447 + }, + { + "epoch": 1.4946532982105056, + "grad_norm": 0.5055716382702036, + "learning_rate": 3.165619121757206e-05, + "loss": 11.9507, + "step": 27448 + }, + { + "epoch": 1.4947077522070886, + "grad_norm": 0.5850583357709767, + "learning_rate": 3.164975408007279e-05, + "loss": 11.8939, + "step": 27449 + }, + { + "epoch": 1.4947622062036716, + "grad_norm": 0.5091230101697962, + "learning_rate": 3.1643317474076415e-05, + "loss": 11.8763, + "step": 27450 + }, + { + "epoch": 1.4948166602002546, + "grad_norm": 0.5378135812842038, + "learning_rate": 3.163688139963311e-05, + "loss": 11.8288, + "step": 27451 + }, + { + "epoch": 1.4948711141968376, + "grad_norm": 0.550914214934827, + "learning_rate": 3.163044585679286e-05, + "loss": 11.957, + "step": 27452 + }, + { + "epoch": 1.4949255681934206, + "grad_norm": 0.5402679003309571, + "learning_rate": 3.162401084560571e-05, + "loss": 11.8529, + "step": 27453 + }, + { + "epoch": 1.4949800221900036, + "grad_norm": 0.5627741211877516, + "learning_rate": 3.161757636612176e-05, + "loss": 11.8402, + "step": 27454 + }, + { + "epoch": 1.4950344761865866, + "grad_norm": 0.5389489879191961, + "learning_rate": 3.161114241839096e-05, + "loss": 11.7944, + "step": 27455 + }, + { + "epoch": 1.4950889301831696, + "grad_norm": 0.5577135803377378, + "learning_rate": 3.1604709002463426e-05, + "loss": 11.8685, + "step": 27456 + }, + { + "epoch": 1.4951433841797526, + "grad_norm": 0.5315667289223625, + "learning_rate": 3.159827611838912e-05, + "loss": 11.853, + "step": 27457 + }, + { + "epoch": 1.4951978381763356, + "grad_norm": 0.5531299342302193, + "learning_rate": 3.159184376621811e-05, + "loss": 11.8088, + "step": 27458 + }, + { + "epoch": 1.4952522921729186, + "grad_norm": 0.5639067210014592, + "learning_rate": 3.15854119460004e-05, + "loss": 11.87, + "step": 27459 + }, + { + "epoch": 1.4953067461695015, + "grad_norm": 0.5060470268627262, + "learning_rate": 3.1578980657785975e-05, + "loss": 11.9204, + "step": 27460 + }, + { + "epoch": 1.4953612001660848, + "grad_norm": 0.7839953306743349, + "learning_rate": 3.1572549901624895e-05, + "loss": 11.9339, + "step": 27461 + }, + { + "epoch": 1.4954156541626678, + "grad_norm": 0.5774357181060241, + "learning_rate": 3.156611967756711e-05, + "loss": 11.6995, + "step": 27462 + }, + { + "epoch": 1.4954701081592507, + "grad_norm": 0.5582189399702306, + "learning_rate": 3.155968998566264e-05, + "loss": 11.823, + "step": 27463 + }, + { + "epoch": 1.4955245621558337, + "grad_norm": 0.53568887973156, + "learning_rate": 3.155326082596155e-05, + "loss": 11.8139, + "step": 27464 + }, + { + "epoch": 1.4955790161524167, + "grad_norm": 0.5031229417344054, + "learning_rate": 3.1546832198513754e-05, + "loss": 11.9377, + "step": 27465 + }, + { + "epoch": 1.4956334701489997, + "grad_norm": 0.5423070313971351, + "learning_rate": 3.154040410336929e-05, + "loss": 11.8203, + "step": 27466 + }, + { + "epoch": 1.4956879241455827, + "grad_norm": 0.5539134542696412, + "learning_rate": 3.1533976540578095e-05, + "loss": 11.8151, + "step": 27467 + }, + { + "epoch": 1.4957423781421657, + "grad_norm": 0.6590430201518052, + "learning_rate": 3.152754951019021e-05, + "loss": 12.0469, + "step": 27468 + }, + { + "epoch": 1.495796832138749, + "grad_norm": 0.5380775390880129, + "learning_rate": 3.152112301225556e-05, + "loss": 11.8812, + "step": 27469 + }, + { + "epoch": 1.495851286135332, + "grad_norm": 0.5424455925133175, + "learning_rate": 3.151469704682416e-05, + "loss": 11.8462, + "step": 27470 + }, + { + "epoch": 1.495905740131915, + "grad_norm": 0.5286895412193657, + "learning_rate": 3.150827161394597e-05, + "loss": 11.8277, + "step": 27471 + }, + { + "epoch": 1.495960194128498, + "grad_norm": 0.5286892757526992, + "learning_rate": 3.150184671367091e-05, + "loss": 11.8305, + "step": 27472 + }, + { + "epoch": 1.496014648125081, + "grad_norm": 0.5745537850910586, + "learning_rate": 3.149542234604902e-05, + "loss": 11.9197, + "step": 27473 + }, + { + "epoch": 1.4960691021216639, + "grad_norm": 0.48156888306081325, + "learning_rate": 3.148899851113018e-05, + "loss": 11.7803, + "step": 27474 + }, + { + "epoch": 1.4961235561182469, + "grad_norm": 0.5226758497171498, + "learning_rate": 3.148257520896436e-05, + "loss": 11.8624, + "step": 27475 + }, + { + "epoch": 1.4961780101148299, + "grad_norm": 0.6284035486208138, + "learning_rate": 3.147615243960157e-05, + "loss": 11.9189, + "step": 27476 + }, + { + "epoch": 1.4962324641114129, + "grad_norm": 0.5525820937708962, + "learning_rate": 3.146973020309168e-05, + "loss": 11.8693, + "step": 27477 + }, + { + "epoch": 1.4962869181079959, + "grad_norm": 0.5280852682684858, + "learning_rate": 3.146330849948468e-05, + "loss": 11.8642, + "step": 27478 + }, + { + "epoch": 1.4963413721045788, + "grad_norm": 0.5388026049611822, + "learning_rate": 3.145688732883047e-05, + "loss": 11.8511, + "step": 27479 + }, + { + "epoch": 1.4963958261011618, + "grad_norm": 0.5225958946227364, + "learning_rate": 3.1450466691179014e-05, + "loss": 11.8485, + "step": 27480 + }, + { + "epoch": 1.4964502800977448, + "grad_norm": 0.5239720890394016, + "learning_rate": 3.144404658658024e-05, + "loss": 11.9028, + "step": 27481 + }, + { + "epoch": 1.4965047340943278, + "grad_norm": 0.5550563471128637, + "learning_rate": 3.1437627015084016e-05, + "loss": 11.8473, + "step": 27482 + }, + { + "epoch": 1.4965591880909108, + "grad_norm": 0.5317330658116989, + "learning_rate": 3.143120797674034e-05, + "loss": 11.8971, + "step": 27483 + }, + { + "epoch": 1.496613642087494, + "grad_norm": 0.5109791195007246, + "learning_rate": 3.142478947159906e-05, + "loss": 12.0197, + "step": 27484 + }, + { + "epoch": 1.496668096084077, + "grad_norm": 0.5453036897113615, + "learning_rate": 3.141837149971011e-05, + "loss": 11.6837, + "step": 27485 + }, + { + "epoch": 1.49672255008066, + "grad_norm": 0.5486934779186424, + "learning_rate": 3.141195406112344e-05, + "loss": 11.8049, + "step": 27486 + }, + { + "epoch": 1.496777004077243, + "grad_norm": 0.5505034490465567, + "learning_rate": 3.1405537155888876e-05, + "loss": 11.9074, + "step": 27487 + }, + { + "epoch": 1.496831458073826, + "grad_norm": 0.4991785681869255, + "learning_rate": 3.1399120784056404e-05, + "loss": 11.8099, + "step": 27488 + }, + { + "epoch": 1.496885912070409, + "grad_norm": 0.483806822887898, + "learning_rate": 3.1392704945675835e-05, + "loss": 11.6764, + "step": 27489 + }, + { + "epoch": 1.496940366066992, + "grad_norm": 0.5662450510412216, + "learning_rate": 3.1386289640797126e-05, + "loss": 11.9674, + "step": 27490 + }, + { + "epoch": 1.496994820063575, + "grad_norm": 0.4846114197861738, + "learning_rate": 3.137987486947015e-05, + "loss": 11.7917, + "step": 27491 + }, + { + "epoch": 1.497049274060158, + "grad_norm": 0.5326804141127223, + "learning_rate": 3.137346063174472e-05, + "loss": 11.8509, + "step": 27492 + }, + { + "epoch": 1.4971037280567412, + "grad_norm": 0.54381282457603, + "learning_rate": 3.1367046927670815e-05, + "loss": 11.7573, + "step": 27493 + }, + { + "epoch": 1.4971581820533242, + "grad_norm": 0.6531117726889754, + "learning_rate": 3.136063375729823e-05, + "loss": 12.0029, + "step": 27494 + }, + { + "epoch": 1.4972126360499072, + "grad_norm": 0.7901619653358373, + "learning_rate": 3.135422112067691e-05, + "loss": 11.8016, + "step": 27495 + }, + { + "epoch": 1.4972670900464902, + "grad_norm": 0.547921772757661, + "learning_rate": 3.134780901785663e-05, + "loss": 11.8898, + "step": 27496 + }, + { + "epoch": 1.4973215440430732, + "grad_norm": 0.581070980445588, + "learning_rate": 3.13413974488873e-05, + "loss": 11.8743, + "step": 27497 + }, + { + "epoch": 1.4973759980396562, + "grad_norm": 0.5626893852553062, + "learning_rate": 3.1334986413818826e-05, + "loss": 11.9637, + "step": 27498 + }, + { + "epoch": 1.4974304520362391, + "grad_norm": 0.56260109220869, + "learning_rate": 3.132857591270096e-05, + "loss": 11.9424, + "step": 27499 + }, + { + "epoch": 1.4974849060328221, + "grad_norm": 0.5682717579984866, + "learning_rate": 3.132216594558368e-05, + "loss": 11.9458, + "step": 27500 + }, + { + "epoch": 1.4975393600294051, + "grad_norm": 0.5087272968067512, + "learning_rate": 3.1315756512516694e-05, + "loss": 11.8205, + "step": 27501 + }, + { + "epoch": 1.4975938140259881, + "grad_norm": 0.5251530675231519, + "learning_rate": 3.130934761354989e-05, + "loss": 11.8179, + "step": 27502 + }, + { + "epoch": 1.4976482680225711, + "grad_norm": 0.6019276926576036, + "learning_rate": 3.1302939248733164e-05, + "loss": 11.9282, + "step": 27503 + }, + { + "epoch": 1.497702722019154, + "grad_norm": 0.5623108223998298, + "learning_rate": 3.129653141811626e-05, + "loss": 11.8528, + "step": 27504 + }, + { + "epoch": 1.497757176015737, + "grad_norm": 0.5165853157628106, + "learning_rate": 3.1290124121749087e-05, + "loss": 11.8269, + "step": 27505 + }, + { + "epoch": 1.49781163001232, + "grad_norm": 0.5454551760674319, + "learning_rate": 3.1283717359681394e-05, + "loss": 11.8371, + "step": 27506 + }, + { + "epoch": 1.4978660840089033, + "grad_norm": 0.513751383959271, + "learning_rate": 3.127731113196308e-05, + "loss": 11.8529, + "step": 27507 + }, + { + "epoch": 1.4979205380054863, + "grad_norm": 0.7296877830900464, + "learning_rate": 3.1270905438643885e-05, + "loss": 11.7038, + "step": 27508 + }, + { + "epoch": 1.4979749920020693, + "grad_norm": 0.5451329658973117, + "learning_rate": 3.126450027977366e-05, + "loss": 11.8812, + "step": 27509 + }, + { + "epoch": 1.4980294459986523, + "grad_norm": 0.5304672373545064, + "learning_rate": 3.125809565540225e-05, + "loss": 11.9342, + "step": 27510 + }, + { + "epoch": 1.4980838999952353, + "grad_norm": 0.5229017499272024, + "learning_rate": 3.1251691565579376e-05, + "loss": 11.8463, + "step": 27511 + }, + { + "epoch": 1.4981383539918183, + "grad_norm": 0.5460362783778945, + "learning_rate": 3.124528801035487e-05, + "loss": 11.8818, + "step": 27512 + }, + { + "epoch": 1.4981928079884013, + "grad_norm": 0.5116544705483668, + "learning_rate": 3.123888498977856e-05, + "loss": 11.9706, + "step": 27513 + }, + { + "epoch": 1.4982472619849843, + "grad_norm": 0.5778308106786579, + "learning_rate": 3.1232482503900185e-05, + "loss": 11.8801, + "step": 27514 + }, + { + "epoch": 1.4983017159815673, + "grad_norm": 0.6156861772561578, + "learning_rate": 3.122608055276959e-05, + "loss": 11.9433, + "step": 27515 + }, + { + "epoch": 1.4983561699781505, + "grad_norm": 0.6010348328230253, + "learning_rate": 3.1219679136436494e-05, + "loss": 11.933, + "step": 27516 + }, + { + "epoch": 1.4984106239747335, + "grad_norm": 0.6208290159700537, + "learning_rate": 3.121327825495074e-05, + "loss": 11.9482, + "step": 27517 + }, + { + "epoch": 1.4984650779713165, + "grad_norm": 0.5385137364443083, + "learning_rate": 3.120687790836204e-05, + "loss": 11.7787, + "step": 27518 + }, + { + "epoch": 1.4985195319678994, + "grad_norm": 0.5725944104336594, + "learning_rate": 3.1200478096720185e-05, + "loss": 11.8878, + "step": 27519 + }, + { + "epoch": 1.4985739859644824, + "grad_norm": 0.6063329120900592, + "learning_rate": 3.1194078820075026e-05, + "loss": 11.7532, + "step": 27520 + }, + { + "epoch": 1.4986284399610654, + "grad_norm": 0.5704832095542197, + "learning_rate": 3.118768007847618e-05, + "loss": 11.8878, + "step": 27521 + }, + { + "epoch": 1.4986828939576484, + "grad_norm": 0.5636863502859587, + "learning_rate": 3.1181281871973514e-05, + "loss": 11.9443, + "step": 27522 + }, + { + "epoch": 1.4987373479542314, + "grad_norm": 0.5147934802112546, + "learning_rate": 3.117488420061669e-05, + "loss": 11.9173, + "step": 27523 + }, + { + "epoch": 1.4987918019508144, + "grad_norm": 0.51464273422368, + "learning_rate": 3.1168487064455524e-05, + "loss": 11.9362, + "step": 27524 + }, + { + "epoch": 1.4988462559473974, + "grad_norm": 0.4749948706665351, + "learning_rate": 3.1162090463539773e-05, + "loss": 11.7872, + "step": 27525 + }, + { + "epoch": 1.4989007099439804, + "grad_norm": 0.5811691982443821, + "learning_rate": 3.115569439791911e-05, + "loss": 11.7787, + "step": 27526 + }, + { + "epoch": 1.4989551639405634, + "grad_norm": 0.5207677503485039, + "learning_rate": 3.114929886764335e-05, + "loss": 11.8194, + "step": 27527 + }, + { + "epoch": 1.4990096179371464, + "grad_norm": 0.5618206163224754, + "learning_rate": 3.114290387276216e-05, + "loss": 11.8607, + "step": 27528 + }, + { + "epoch": 1.4990640719337294, + "grad_norm": 0.6222593230654404, + "learning_rate": 3.113650941332533e-05, + "loss": 11.8066, + "step": 27529 + }, + { + "epoch": 1.4991185259303124, + "grad_norm": 0.5485997733806728, + "learning_rate": 3.113011548938255e-05, + "loss": 11.7379, + "step": 27530 + }, + { + "epoch": 1.4991729799268956, + "grad_norm": 0.5699558382653481, + "learning_rate": 3.112372210098351e-05, + "loss": 11.9523, + "step": 27531 + }, + { + "epoch": 1.4992274339234786, + "grad_norm": 0.5336572321578744, + "learning_rate": 3.1117329248177984e-05, + "loss": 11.8289, + "step": 27532 + }, + { + "epoch": 1.4992818879200616, + "grad_norm": 0.5966800422283287, + "learning_rate": 3.111093693101563e-05, + "loss": 11.9526, + "step": 27533 + }, + { + "epoch": 1.4993363419166446, + "grad_norm": 0.5315714128725967, + "learning_rate": 3.1104545149546184e-05, + "loss": 11.8079, + "step": 27534 + }, + { + "epoch": 1.4993907959132275, + "grad_norm": 0.5280188799287588, + "learning_rate": 3.109815390381938e-05, + "loss": 11.9168, + "step": 27535 + }, + { + "epoch": 1.4994452499098105, + "grad_norm": 0.4887410793402791, + "learning_rate": 3.109176319388485e-05, + "loss": 11.8738, + "step": 27536 + }, + { + "epoch": 1.4994997039063935, + "grad_norm": 0.606420131416671, + "learning_rate": 3.1085373019792366e-05, + "loss": 11.8706, + "step": 27537 + }, + { + "epoch": 1.4995541579029765, + "grad_norm": 0.5491822896002987, + "learning_rate": 3.107898338159153e-05, + "loss": 11.773, + "step": 27538 + }, + { + "epoch": 1.4996086118995597, + "grad_norm": 0.5137174202577975, + "learning_rate": 3.107259427933212e-05, + "loss": 11.7138, + "step": 27539 + }, + { + "epoch": 1.4996630658961427, + "grad_norm": 0.5568255752004765, + "learning_rate": 3.106620571306378e-05, + "loss": 11.78, + "step": 27540 + }, + { + "epoch": 1.4997175198927257, + "grad_norm": 0.5316097749800234, + "learning_rate": 3.105981768283614e-05, + "loss": 11.8818, + "step": 27541 + }, + { + "epoch": 1.4997719738893087, + "grad_norm": 0.5207641553822407, + "learning_rate": 3.1053430188698976e-05, + "loss": 11.7427, + "step": 27542 + }, + { + "epoch": 1.4998264278858917, + "grad_norm": 0.5309694357002798, + "learning_rate": 3.1047043230701844e-05, + "loss": 11.8229, + "step": 27543 + }, + { + "epoch": 1.4998808818824747, + "grad_norm": 0.5583661611807208, + "learning_rate": 3.1040656808894505e-05, + "loss": 11.8332, + "step": 27544 + }, + { + "epoch": 1.4999353358790577, + "grad_norm": 0.5743922278384787, + "learning_rate": 3.103427092332656e-05, + "loss": 11.8431, + "step": 27545 + }, + { + "epoch": 1.4999897898756407, + "grad_norm": 0.5261854774839165, + "learning_rate": 3.1027885574047687e-05, + "loss": 11.8617, + "step": 27546 + }, + { + "epoch": 1.5000442438722237, + "grad_norm": 0.5468380951743005, + "learning_rate": 3.102150076110757e-05, + "loss": 11.7561, + "step": 27547 + }, + { + "epoch": 1.5000986978688067, + "grad_norm": 0.5654126809901402, + "learning_rate": 3.101511648455579e-05, + "loss": 11.8551, + "step": 27548 + }, + { + "epoch": 1.5001531518653897, + "grad_norm": 0.5495863535567939, + "learning_rate": 3.100873274444208e-05, + "loss": 11.809, + "step": 27549 + }, + { + "epoch": 1.5002076058619727, + "grad_norm": 0.5647235853031947, + "learning_rate": 3.1002349540816036e-05, + "loss": 11.8476, + "step": 27550 + }, + { + "epoch": 1.5002620598585557, + "grad_norm": 0.5435244911535195, + "learning_rate": 3.0995966873727244e-05, + "loss": 11.7316, + "step": 27551 + }, + { + "epoch": 1.5003165138551386, + "grad_norm": 0.529410883315443, + "learning_rate": 3.098958474322543e-05, + "loss": 11.8691, + "step": 27552 + }, + { + "epoch": 1.5003709678517216, + "grad_norm": 0.570198110631626, + "learning_rate": 3.098320314936015e-05, + "loss": 11.7387, + "step": 27553 + }, + { + "epoch": 1.5004254218483046, + "grad_norm": 0.5399444238497815, + "learning_rate": 3.0976822092181076e-05, + "loss": 11.7691, + "step": 27554 + }, + { + "epoch": 1.5004798758448878, + "grad_norm": 0.6302267951639604, + "learning_rate": 3.097044157173778e-05, + "loss": 11.8268, + "step": 27555 + }, + { + "epoch": 1.5005343298414708, + "grad_norm": 0.5475409955087848, + "learning_rate": 3.096406158807995e-05, + "loss": 11.7685, + "step": 27556 + }, + { + "epoch": 1.5005887838380538, + "grad_norm": 0.5299594529500552, + "learning_rate": 3.0957682141257104e-05, + "loss": 11.8292, + "step": 27557 + }, + { + "epoch": 1.5006432378346368, + "grad_norm": 0.5538666764621011, + "learning_rate": 3.0951303231318916e-05, + "loss": 11.6462, + "step": 27558 + }, + { + "epoch": 1.5006976918312198, + "grad_norm": 0.5317166062254615, + "learning_rate": 3.0944924858314994e-05, + "loss": 11.7942, + "step": 27559 + }, + { + "epoch": 1.5007521458278028, + "grad_norm": 0.581170481397359, + "learning_rate": 3.093854702229493e-05, + "loss": 11.7875, + "step": 27560 + }, + { + "epoch": 1.500806599824386, + "grad_norm": 0.5594560718388232, + "learning_rate": 3.093216972330827e-05, + "loss": 11.7466, + "step": 27561 + }, + { + "epoch": 1.500861053820969, + "grad_norm": 0.5414119794473163, + "learning_rate": 3.092579296140467e-05, + "loss": 11.8654, + "step": 27562 + }, + { + "epoch": 1.500915507817552, + "grad_norm": 0.5203958651338018, + "learning_rate": 3.0919416736633646e-05, + "loss": 11.9641, + "step": 27563 + }, + { + "epoch": 1.500969961814135, + "grad_norm": 0.5535061061668877, + "learning_rate": 3.091304104904487e-05, + "loss": 11.8742, + "step": 27564 + }, + { + "epoch": 1.501024415810718, + "grad_norm": 0.5362178767221011, + "learning_rate": 3.0906665898687826e-05, + "loss": 11.8637, + "step": 27565 + }, + { + "epoch": 1.501078869807301, + "grad_norm": 0.5127350859931332, + "learning_rate": 3.090029128561218e-05, + "loss": 11.8934, + "step": 27566 + }, + { + "epoch": 1.501133323803884, + "grad_norm": 0.5298054538718383, + "learning_rate": 3.089391720986742e-05, + "loss": 11.8764, + "step": 27567 + }, + { + "epoch": 1.501187777800467, + "grad_norm": 0.6092771664732751, + "learning_rate": 3.088754367150315e-05, + "loss": 11.8243, + "step": 27568 + }, + { + "epoch": 1.50124223179705, + "grad_norm": 0.5284515644674422, + "learning_rate": 3.088117067056896e-05, + "loss": 11.8535, + "step": 27569 + }, + { + "epoch": 1.501296685793633, + "grad_norm": 0.571331530496941, + "learning_rate": 3.0874798207114374e-05, + "loss": 11.9048, + "step": 27570 + }, + { + "epoch": 1.501351139790216, + "grad_norm": 0.6116606438283967, + "learning_rate": 3.0868426281188953e-05, + "loss": 11.9523, + "step": 27571 + }, + { + "epoch": 1.501405593786799, + "grad_norm": 0.5388547942213677, + "learning_rate": 3.0862054892842215e-05, + "loss": 11.8108, + "step": 27572 + }, + { + "epoch": 1.501460047783382, + "grad_norm": 0.5460617994491654, + "learning_rate": 3.0855684042123734e-05, + "loss": 11.8827, + "step": 27573 + }, + { + "epoch": 1.501514501779965, + "grad_norm": 0.5466345431925912, + "learning_rate": 3.084931372908307e-05, + "loss": 11.662, + "step": 27574 + }, + { + "epoch": 1.501568955776548, + "grad_norm": 0.5919496457101667, + "learning_rate": 3.0842943953769724e-05, + "loss": 11.9597, + "step": 27575 + }, + { + "epoch": 1.501623409773131, + "grad_norm": 0.5290296969291574, + "learning_rate": 3.083657471623326e-05, + "loss": 11.8332, + "step": 27576 + }, + { + "epoch": 1.501677863769714, + "grad_norm": 0.5476704734827672, + "learning_rate": 3.0830206016523165e-05, + "loss": 11.913, + "step": 27577 + }, + { + "epoch": 1.501732317766297, + "grad_norm": 0.5052560576496058, + "learning_rate": 3.0823837854689016e-05, + "loss": 11.9057, + "step": 27578 + }, + { + "epoch": 1.5017867717628801, + "grad_norm": 0.5616683807127564, + "learning_rate": 3.081747023078028e-05, + "loss": 11.8198, + "step": 27579 + }, + { + "epoch": 1.501841225759463, + "grad_norm": 0.5784261645398846, + "learning_rate": 3.081110314484652e-05, + "loss": 11.697, + "step": 27580 + }, + { + "epoch": 1.501895679756046, + "grad_norm": 0.4839773202934603, + "learning_rate": 3.0804736596937225e-05, + "loss": 11.8623, + "step": 27581 + }, + { + "epoch": 1.501950133752629, + "grad_norm": 0.5926065659373663, + "learning_rate": 3.079837058710188e-05, + "loss": 11.8601, + "step": 27582 + }, + { + "epoch": 1.502004587749212, + "grad_norm": 0.5667121528056517, + "learning_rate": 3.079200511539e-05, + "loss": 11.8723, + "step": 27583 + }, + { + "epoch": 1.502059041745795, + "grad_norm": 0.5492222967762245, + "learning_rate": 3.0785640181851125e-05, + "loss": 11.8033, + "step": 27584 + }, + { + "epoch": 1.5021134957423783, + "grad_norm": 0.5894426805620854, + "learning_rate": 3.07792757865347e-05, + "loss": 11.8386, + "step": 27585 + }, + { + "epoch": 1.5021679497389613, + "grad_norm": 0.5891161954158775, + "learning_rate": 3.0772911929490265e-05, + "loss": 11.8987, + "step": 27586 + }, + { + "epoch": 1.5022224037355443, + "grad_norm": 0.5071649840710323, + "learning_rate": 3.076654861076723e-05, + "loss": 11.8914, + "step": 27587 + }, + { + "epoch": 1.5022768577321273, + "grad_norm": 0.5372359479170464, + "learning_rate": 3.076018583041517e-05, + "loss": 11.8784, + "step": 27588 + }, + { + "epoch": 1.5023313117287103, + "grad_norm": 0.53300440436641, + "learning_rate": 3.075382358848348e-05, + "loss": 11.8932, + "step": 27589 + }, + { + "epoch": 1.5023857657252933, + "grad_norm": 0.5549183348199073, + "learning_rate": 3.07474618850217e-05, + "loss": 11.8055, + "step": 27590 + }, + { + "epoch": 1.5024402197218762, + "grad_norm": 0.5364623809143403, + "learning_rate": 3.074110072007927e-05, + "loss": 11.8346, + "step": 27591 + }, + { + "epoch": 1.5024946737184592, + "grad_norm": 0.5980968037882382, + "learning_rate": 3.073474009370563e-05, + "loss": 11.8335, + "step": 27592 + }, + { + "epoch": 1.5025491277150422, + "grad_norm": 0.5319294149873626, + "learning_rate": 3.07283800059503e-05, + "loss": 11.7779, + "step": 27593 + }, + { + "epoch": 1.5026035817116252, + "grad_norm": 0.5135358376415784, + "learning_rate": 3.072202045686265e-05, + "loss": 11.7976, + "step": 27594 + }, + { + "epoch": 1.5026580357082082, + "grad_norm": 0.5577114057239492, + "learning_rate": 3.0715661446492217e-05, + "loss": 11.8907, + "step": 27595 + }, + { + "epoch": 1.5027124897047912, + "grad_norm": 0.5472486261440314, + "learning_rate": 3.070930297488843e-05, + "loss": 11.9129, + "step": 27596 + }, + { + "epoch": 1.5027669437013742, + "grad_norm": 0.5574978807079234, + "learning_rate": 3.0702945042100706e-05, + "loss": 11.8204, + "step": 27597 + }, + { + "epoch": 1.5028213976979572, + "grad_norm": 0.5722485819548241, + "learning_rate": 3.0696587648178523e-05, + "loss": 11.8558, + "step": 27598 + }, + { + "epoch": 1.5028758516945402, + "grad_norm": 0.5395526032028541, + "learning_rate": 3.069023079317127e-05, + "loss": 11.8176, + "step": 27599 + }, + { + "epoch": 1.5029303056911232, + "grad_norm": 0.5277757089922277, + "learning_rate": 3.0683874477128436e-05, + "loss": 11.8558, + "step": 27600 + }, + { + "epoch": 1.5029847596877062, + "grad_norm": 0.5240624033785303, + "learning_rate": 3.067751870009942e-05, + "loss": 11.9188, + "step": 27601 + }, + { + "epoch": 1.5030392136842894, + "grad_norm": 0.5084139043867187, + "learning_rate": 3.067116346213361e-05, + "loss": 11.8472, + "step": 27602 + }, + { + "epoch": 1.5030936676808724, + "grad_norm": 0.5409512735160412, + "learning_rate": 3.06648087632805e-05, + "loss": 11.8957, + "step": 27603 + }, + { + "epoch": 1.5031481216774554, + "grad_norm": 0.5711421532119695, + "learning_rate": 3.0658454603589416e-05, + "loss": 11.9514, + "step": 27604 + }, + { + "epoch": 1.5032025756740384, + "grad_norm": 0.5798785861361018, + "learning_rate": 3.065210098310985e-05, + "loss": 11.9032, + "step": 27605 + }, + { + "epoch": 1.5032570296706214, + "grad_norm": 0.5517773569951979, + "learning_rate": 3.0645747901891164e-05, + "loss": 11.7489, + "step": 27606 + }, + { + "epoch": 1.5033114836672044, + "grad_norm": 0.60294794538028, + "learning_rate": 3.063939535998276e-05, + "loss": 11.8538, + "step": 27607 + }, + { + "epoch": 1.5033659376637876, + "grad_norm": 0.5379968417713321, + "learning_rate": 3.0633043357434074e-05, + "loss": 11.7519, + "step": 27608 + }, + { + "epoch": 1.5034203916603706, + "grad_norm": 0.5558431679438995, + "learning_rate": 3.0626691894294456e-05, + "loss": 11.8692, + "step": 27609 + }, + { + "epoch": 1.5034748456569536, + "grad_norm": 0.56468562645044, + "learning_rate": 3.0620340970613345e-05, + "loss": 11.7864, + "step": 27610 + }, + { + "epoch": 1.5035292996535365, + "grad_norm": 0.5267122393320834, + "learning_rate": 3.06139905864401e-05, + "loss": 11.7804, + "step": 27611 + }, + { + "epoch": 1.5035837536501195, + "grad_norm": 0.5499475642516489, + "learning_rate": 3.060764074182406e-05, + "loss": 11.775, + "step": 27612 + }, + { + "epoch": 1.5036382076467025, + "grad_norm": 0.5143490316338917, + "learning_rate": 3.0601291436814684e-05, + "loss": 11.8559, + "step": 27613 + }, + { + "epoch": 1.5036926616432855, + "grad_norm": 0.5706754059374372, + "learning_rate": 3.059494267146127e-05, + "loss": 11.8854, + "step": 27614 + }, + { + "epoch": 1.5037471156398685, + "grad_norm": 0.523767343686082, + "learning_rate": 3.058859444581326e-05, + "loss": 11.885, + "step": 27615 + }, + { + "epoch": 1.5038015696364515, + "grad_norm": 0.5624296865878132, + "learning_rate": 3.058224675991993e-05, + "loss": 11.6968, + "step": 27616 + }, + { + "epoch": 1.5038560236330345, + "grad_norm": 0.49422260520353717, + "learning_rate": 3.0575899613830706e-05, + "loss": 11.7223, + "step": 27617 + }, + { + "epoch": 1.5039104776296175, + "grad_norm": 0.5247207589597501, + "learning_rate": 3.0569553007594956e-05, + "loss": 11.8117, + "step": 27618 + }, + { + "epoch": 1.5039649316262005, + "grad_norm": 0.5646454568774119, + "learning_rate": 3.056320694126197e-05, + "loss": 11.8483, + "step": 27619 + }, + { + "epoch": 1.5040193856227835, + "grad_norm": 0.5403338629392758, + "learning_rate": 3.0556861414881154e-05, + "loss": 11.8416, + "step": 27620 + }, + { + "epoch": 1.5040738396193665, + "grad_norm": 0.5505254544530112, + "learning_rate": 3.0550516428501854e-05, + "loss": 11.8749, + "step": 27621 + }, + { + "epoch": 1.5041282936159495, + "grad_norm": 0.4985326885128025, + "learning_rate": 3.054417198217333e-05, + "loss": 11.7613, + "step": 27622 + }, + { + "epoch": 1.5041827476125325, + "grad_norm": 0.5408151241099561, + "learning_rate": 3.0537828075945016e-05, + "loss": 11.9199, + "step": 27623 + }, + { + "epoch": 1.5042372016091154, + "grad_norm": 0.5483937991568044, + "learning_rate": 3.053148470986617e-05, + "loss": 11.9476, + "step": 27624 + }, + { + "epoch": 1.5042916556056987, + "grad_norm": 0.6080094206391734, + "learning_rate": 3.052514188398617e-05, + "loss": 11.7556, + "step": 27625 + }, + { + "epoch": 1.5043461096022817, + "grad_norm": 0.5883997187373496, + "learning_rate": 3.05187995983543e-05, + "loss": 11.8255, + "step": 27626 + }, + { + "epoch": 1.5044005635988646, + "grad_norm": 0.563012064872864, + "learning_rate": 3.0512457853019917e-05, + "loss": 11.9002, + "step": 27627 + }, + { + "epoch": 1.5044550175954476, + "grad_norm": 0.566417441781685, + "learning_rate": 3.050611664803228e-05, + "loss": 11.7473, + "step": 27628 + }, + { + "epoch": 1.5045094715920306, + "grad_norm": 0.5262720486523642, + "learning_rate": 3.049977598344075e-05, + "loss": 11.9355, + "step": 27629 + }, + { + "epoch": 1.5045639255886136, + "grad_norm": 0.5155669848516421, + "learning_rate": 3.0493435859294628e-05, + "loss": 11.7297, + "step": 27630 + }, + { + "epoch": 1.5046183795851968, + "grad_norm": 0.5817023527518846, + "learning_rate": 3.0487096275643224e-05, + "loss": 11.8661, + "step": 27631 + }, + { + "epoch": 1.5046728335817798, + "grad_norm": 0.5217576368753092, + "learning_rate": 3.0480757232535772e-05, + "loss": 11.8647, + "step": 27632 + }, + { + "epoch": 1.5047272875783628, + "grad_norm": 0.5230937332016243, + "learning_rate": 3.0474418730021648e-05, + "loss": 11.8024, + "step": 27633 + }, + { + "epoch": 1.5047817415749458, + "grad_norm": 0.5179976205184285, + "learning_rate": 3.0468080768150076e-05, + "loss": 11.7769, + "step": 27634 + }, + { + "epoch": 1.5048361955715288, + "grad_norm": 0.5224160824425776, + "learning_rate": 3.0461743346970395e-05, + "loss": 11.9731, + "step": 27635 + }, + { + "epoch": 1.5048906495681118, + "grad_norm": 0.5364814966305287, + "learning_rate": 3.045540646653182e-05, + "loss": 11.7945, + "step": 27636 + }, + { + "epoch": 1.5049451035646948, + "grad_norm": 0.5789394424236494, + "learning_rate": 3.0449070126883707e-05, + "loss": 11.8448, + "step": 27637 + }, + { + "epoch": 1.5049995575612778, + "grad_norm": 0.5730519306389764, + "learning_rate": 3.0442734328075263e-05, + "loss": 11.8303, + "step": 27638 + }, + { + "epoch": 1.5050540115578608, + "grad_norm": 0.5712768170623889, + "learning_rate": 3.0436399070155774e-05, + "loss": 11.9416, + "step": 27639 + }, + { + "epoch": 1.5051084655544438, + "grad_norm": 0.5352127555076341, + "learning_rate": 3.0430064353174538e-05, + "loss": 11.8445, + "step": 27640 + }, + { + "epoch": 1.5051629195510268, + "grad_norm": 0.5426642678899108, + "learning_rate": 3.0423730177180797e-05, + "loss": 11.8103, + "step": 27641 + }, + { + "epoch": 1.5052173735476098, + "grad_norm": 0.5461720622822323, + "learning_rate": 3.0417396542223798e-05, + "loss": 11.9051, + "step": 27642 + }, + { + "epoch": 1.5052718275441928, + "grad_norm": 0.5544016741604803, + "learning_rate": 3.041106344835275e-05, + "loss": 11.8508, + "step": 27643 + }, + { + "epoch": 1.5053262815407757, + "grad_norm": 0.5802545894786407, + "learning_rate": 3.040473089561695e-05, + "loss": 11.7987, + "step": 27644 + }, + { + "epoch": 1.5053807355373587, + "grad_norm": 0.5435056876544804, + "learning_rate": 3.039839888406567e-05, + "loss": 11.7582, + "step": 27645 + }, + { + "epoch": 1.5054351895339417, + "grad_norm": 0.5498441096741119, + "learning_rate": 3.0392067413748083e-05, + "loss": 11.8857, + "step": 27646 + }, + { + "epoch": 1.5054896435305247, + "grad_norm": 0.5868732495443124, + "learning_rate": 3.0385736484713477e-05, + "loss": 11.7373, + "step": 27647 + }, + { + "epoch": 1.5055440975271077, + "grad_norm": 0.5631848433435084, + "learning_rate": 3.037940609701102e-05, + "loss": 11.7864, + "step": 27648 + }, + { + "epoch": 1.505598551523691, + "grad_norm": 0.5860801684071715, + "learning_rate": 3.0373076250690026e-05, + "loss": 11.7146, + "step": 27649 + }, + { + "epoch": 1.505653005520274, + "grad_norm": 0.5585283707571668, + "learning_rate": 3.036674694579962e-05, + "loss": 11.8924, + "step": 27650 + }, + { + "epoch": 1.505707459516857, + "grad_norm": 0.5920091119800553, + "learning_rate": 3.0360418182389105e-05, + "loss": 11.9214, + "step": 27651 + }, + { + "epoch": 1.50576191351344, + "grad_norm": 0.5741591184526227, + "learning_rate": 3.035408996050766e-05, + "loss": 11.8055, + "step": 27652 + }, + { + "epoch": 1.505816367510023, + "grad_norm": 0.4973276984223243, + "learning_rate": 3.0347762280204462e-05, + "loss": 11.7137, + "step": 27653 + }, + { + "epoch": 1.5058708215066061, + "grad_norm": 0.5516625562789796, + "learning_rate": 3.0341435141528763e-05, + "loss": 11.7877, + "step": 27654 + }, + { + "epoch": 1.505925275503189, + "grad_norm": 0.5708887913135768, + "learning_rate": 3.033510854452972e-05, + "loss": 11.9488, + "step": 27655 + }, + { + "epoch": 1.505979729499772, + "grad_norm": 0.5741516223263851, + "learning_rate": 3.032878248925657e-05, + "loss": 11.9446, + "step": 27656 + }, + { + "epoch": 1.506034183496355, + "grad_norm": 0.5531204656209718, + "learning_rate": 3.0322456975758505e-05, + "loss": 11.8402, + "step": 27657 + }, + { + "epoch": 1.506088637492938, + "grad_norm": 0.6218445852612702, + "learning_rate": 3.0316132004084674e-05, + "loss": 11.9344, + "step": 27658 + }, + { + "epoch": 1.506143091489521, + "grad_norm": 0.5769309911162305, + "learning_rate": 3.030980757428432e-05, + "loss": 11.9579, + "step": 27659 + }, + { + "epoch": 1.506197545486104, + "grad_norm": 0.5402758990034201, + "learning_rate": 3.0303483686406555e-05, + "loss": 11.889, + "step": 27660 + }, + { + "epoch": 1.506251999482687, + "grad_norm": 0.6158061534664355, + "learning_rate": 3.029716034050062e-05, + "loss": 11.792, + "step": 27661 + }, + { + "epoch": 1.50630645347927, + "grad_norm": 0.5213652094806597, + "learning_rate": 3.0290837536615656e-05, + "loss": 11.7095, + "step": 27662 + }, + { + "epoch": 1.506360907475853, + "grad_norm": 0.5524929572467947, + "learning_rate": 3.0284515274800807e-05, + "loss": 11.8645, + "step": 27663 + }, + { + "epoch": 1.506415361472436, + "grad_norm": 0.5287964101011609, + "learning_rate": 3.0278193555105283e-05, + "loss": 11.9561, + "step": 27664 + }, + { + "epoch": 1.506469815469019, + "grad_norm": 0.5456827261862065, + "learning_rate": 3.027187237757818e-05, + "loss": 11.7935, + "step": 27665 + }, + { + "epoch": 1.506524269465602, + "grad_norm": 0.5560675880179424, + "learning_rate": 3.0265551742268693e-05, + "loss": 11.8534, + "step": 27666 + }, + { + "epoch": 1.506578723462185, + "grad_norm": 0.4839734380980397, + "learning_rate": 3.0259231649226015e-05, + "loss": 11.7421, + "step": 27667 + }, + { + "epoch": 1.506633177458768, + "grad_norm": 0.5314535741315, + "learning_rate": 3.0252912098499207e-05, + "loss": 11.7904, + "step": 27668 + }, + { + "epoch": 1.506687631455351, + "grad_norm": 0.5362603181572028, + "learning_rate": 3.0246593090137476e-05, + "loss": 11.803, + "step": 27669 + }, + { + "epoch": 1.506742085451934, + "grad_norm": 0.5808088826584501, + "learning_rate": 3.0240274624189913e-05, + "loss": 11.9047, + "step": 27670 + }, + { + "epoch": 1.506796539448517, + "grad_norm": 0.578753416107352, + "learning_rate": 3.02339567007057e-05, + "loss": 11.8853, + "step": 27671 + }, + { + "epoch": 1.5068509934451002, + "grad_norm": 0.508534977626483, + "learning_rate": 3.0227639319733936e-05, + "loss": 11.8025, + "step": 27672 + }, + { + "epoch": 1.5069054474416832, + "grad_norm": 0.5839536048259578, + "learning_rate": 3.0221322481323723e-05, + "loss": 11.8539, + "step": 27673 + }, + { + "epoch": 1.5069599014382662, + "grad_norm": 0.5056608957084763, + "learning_rate": 3.021500618552424e-05, + "loss": 11.8683, + "step": 27674 + }, + { + "epoch": 1.5070143554348492, + "grad_norm": 0.48905232113237707, + "learning_rate": 3.0208690432384546e-05, + "loss": 11.7323, + "step": 27675 + }, + { + "epoch": 1.5070688094314322, + "grad_norm": 0.5823855342076538, + "learning_rate": 3.0202375221953805e-05, + "loss": 11.8104, + "step": 27676 + }, + { + "epoch": 1.5071232634280152, + "grad_norm": 0.545191595494048, + "learning_rate": 3.019606055428106e-05, + "loss": 11.9079, + "step": 27677 + }, + { + "epoch": 1.5071777174245984, + "grad_norm": 0.5396027113593479, + "learning_rate": 3.0189746429415468e-05, + "loss": 11.7344, + "step": 27678 + }, + { + "epoch": 1.5072321714211814, + "grad_norm": 0.5411755554388232, + "learning_rate": 3.0183432847406134e-05, + "loss": 12.0117, + "step": 27679 + }, + { + "epoch": 1.5072866254177644, + "grad_norm": 0.5251088975317232, + "learning_rate": 3.0177119808302113e-05, + "loss": 11.8839, + "step": 27680 + }, + { + "epoch": 1.5073410794143474, + "grad_norm": 0.5613355898052612, + "learning_rate": 3.0170807312152537e-05, + "loss": 11.7858, + "step": 27681 + }, + { + "epoch": 1.5073955334109304, + "grad_norm": 0.5848409840479958, + "learning_rate": 3.0164495359006484e-05, + "loss": 11.9158, + "step": 27682 + }, + { + "epoch": 1.5074499874075133, + "grad_norm": 0.5313528696351432, + "learning_rate": 3.0158183948912988e-05, + "loss": 11.8243, + "step": 27683 + }, + { + "epoch": 1.5075044414040963, + "grad_norm": 0.5753534768744759, + "learning_rate": 3.0151873081921213e-05, + "loss": 11.905, + "step": 27684 + }, + { + "epoch": 1.5075588954006793, + "grad_norm": 0.5783521566718891, + "learning_rate": 3.0145562758080137e-05, + "loss": 11.8615, + "step": 27685 + }, + { + "epoch": 1.5076133493972623, + "grad_norm": 0.5092573533801059, + "learning_rate": 3.0139252977438914e-05, + "loss": 11.8508, + "step": 27686 + }, + { + "epoch": 1.5076678033938453, + "grad_norm": 0.5199298621276031, + "learning_rate": 3.013294374004655e-05, + "loss": 11.6786, + "step": 27687 + }, + { + "epoch": 1.5077222573904283, + "grad_norm": 0.562591696967236, + "learning_rate": 3.0126635045952133e-05, + "loss": 11.7552, + "step": 27688 + }, + { + "epoch": 1.5077767113870113, + "grad_norm": 0.567123205368442, + "learning_rate": 3.0120326895204753e-05, + "loss": 11.9026, + "step": 27689 + }, + { + "epoch": 1.5078311653835943, + "grad_norm": 0.5685875255988978, + "learning_rate": 3.011401928785339e-05, + "loss": 11.7773, + "step": 27690 + }, + { + "epoch": 1.5078856193801773, + "grad_norm": 0.5327316258202117, + "learning_rate": 3.0107712223947203e-05, + "loss": 11.7722, + "step": 27691 + }, + { + "epoch": 1.5079400733767603, + "grad_norm": 0.5250633768345059, + "learning_rate": 3.0101405703535103e-05, + "loss": 11.8296, + "step": 27692 + }, + { + "epoch": 1.5079945273733433, + "grad_norm": 0.5514445851102612, + "learning_rate": 3.0095099726666187e-05, + "loss": 11.8859, + "step": 27693 + }, + { + "epoch": 1.5080489813699263, + "grad_norm": 0.6038059950739845, + "learning_rate": 3.0088794293389532e-05, + "loss": 11.8462, + "step": 27694 + }, + { + "epoch": 1.5081034353665095, + "grad_norm": 0.5387197878737624, + "learning_rate": 3.008248940375411e-05, + "loss": 11.9464, + "step": 27695 + }, + { + "epoch": 1.5081578893630925, + "grad_norm": 0.5045327878282647, + "learning_rate": 3.0076185057809003e-05, + "loss": 11.8414, + "step": 27696 + }, + { + "epoch": 1.5082123433596755, + "grad_norm": 0.543435502192343, + "learning_rate": 3.0069881255603182e-05, + "loss": 11.7743, + "step": 27697 + }, + { + "epoch": 1.5082667973562585, + "grad_norm": 0.49555653997269605, + "learning_rate": 3.006357799718572e-05, + "loss": 11.8638, + "step": 27698 + }, + { + "epoch": 1.5083212513528415, + "grad_norm": 0.6831213653704303, + "learning_rate": 3.005727528260557e-05, + "loss": 11.8205, + "step": 27699 + }, + { + "epoch": 1.5083757053494244, + "grad_norm": 0.5093461458203183, + "learning_rate": 3.0050973111911772e-05, + "loss": 11.8062, + "step": 27700 + }, + { + "epoch": 1.5084301593460077, + "grad_norm": 0.5545139771784615, + "learning_rate": 3.004467148515341e-05, + "loss": 11.8289, + "step": 27701 + }, + { + "epoch": 1.5084846133425907, + "grad_norm": 0.6564560130369709, + "learning_rate": 3.0038370402379344e-05, + "loss": 11.7958, + "step": 27702 + }, + { + "epoch": 1.5085390673391736, + "grad_norm": 0.5399395695077479, + "learning_rate": 3.0032069863638678e-05, + "loss": 11.8155, + "step": 27703 + }, + { + "epoch": 1.5085935213357566, + "grad_norm": 0.5239996833966908, + "learning_rate": 3.0025769868980335e-05, + "loss": 11.8158, + "step": 27704 + }, + { + "epoch": 1.5086479753323396, + "grad_norm": 0.527652555018816, + "learning_rate": 3.0019470418453345e-05, + "loss": 11.877, + "step": 27705 + }, + { + "epoch": 1.5087024293289226, + "grad_norm": 0.5482037668581788, + "learning_rate": 3.0013171512106718e-05, + "loss": 11.9077, + "step": 27706 + }, + { + "epoch": 1.5087568833255056, + "grad_norm": 0.5593091392044128, + "learning_rate": 3.0006873149989377e-05, + "loss": 11.8471, + "step": 27707 + }, + { + "epoch": 1.5088113373220886, + "grad_norm": 0.5489678883026025, + "learning_rate": 3.000057533215036e-05, + "loss": 11.9846, + "step": 27708 + }, + { + "epoch": 1.5088657913186716, + "grad_norm": 0.5048261198947238, + "learning_rate": 2.999427805863858e-05, + "loss": 11.767, + "step": 27709 + }, + { + "epoch": 1.5089202453152546, + "grad_norm": 0.5385155346180013, + "learning_rate": 2.9987981329503056e-05, + "loss": 11.802, + "step": 27710 + }, + { + "epoch": 1.5089746993118376, + "grad_norm": 0.5682527811224917, + "learning_rate": 2.9981685144792737e-05, + "loss": 11.7803, + "step": 27711 + }, + { + "epoch": 1.5090291533084206, + "grad_norm": 0.5867884195522941, + "learning_rate": 2.9975389504556538e-05, + "loss": 11.7611, + "step": 27712 + }, + { + "epoch": 1.5090836073050036, + "grad_norm": 0.5388582827725946, + "learning_rate": 2.996909440884349e-05, + "loss": 11.8301, + "step": 27713 + }, + { + "epoch": 1.5091380613015866, + "grad_norm": 0.5467698982091564, + "learning_rate": 2.9962799857702474e-05, + "loss": 11.7841, + "step": 27714 + }, + { + "epoch": 1.5091925152981696, + "grad_norm": 0.6212780696901338, + "learning_rate": 2.995650585118247e-05, + "loss": 11.985, + "step": 27715 + }, + { + "epoch": 1.5092469692947525, + "grad_norm": 0.5645021872395943, + "learning_rate": 2.9950212389332466e-05, + "loss": 11.8428, + "step": 27716 + }, + { + "epoch": 1.5093014232913355, + "grad_norm": 0.5347293351712962, + "learning_rate": 2.994391947220131e-05, + "loss": 11.7633, + "step": 27717 + }, + { + "epoch": 1.5093558772879185, + "grad_norm": 0.6065627903828362, + "learning_rate": 2.993762709983803e-05, + "loss": 11.8202, + "step": 27718 + }, + { + "epoch": 1.5094103312845017, + "grad_norm": 0.5478554910871952, + "learning_rate": 2.9931335272291472e-05, + "loss": 11.8343, + "step": 27719 + }, + { + "epoch": 1.5094647852810847, + "grad_norm": 0.5887533294741611, + "learning_rate": 2.9925043989610635e-05, + "loss": 11.9709, + "step": 27720 + }, + { + "epoch": 1.5095192392776677, + "grad_norm": 0.5367691135942377, + "learning_rate": 2.99187532518444e-05, + "loss": 11.7524, + "step": 27721 + }, + { + "epoch": 1.5095736932742507, + "grad_norm": 0.5920294223461078, + "learning_rate": 2.9912463059041673e-05, + "loss": 11.8158, + "step": 27722 + }, + { + "epoch": 1.5096281472708337, + "grad_norm": 0.5799886961127216, + "learning_rate": 2.9906173411251414e-05, + "loss": 11.9282, + "step": 27723 + }, + { + "epoch": 1.509682601267417, + "grad_norm": 0.5420188804440307, + "learning_rate": 2.9899884308522475e-05, + "loss": 11.7917, + "step": 27724 + }, + { + "epoch": 1.509737055264, + "grad_norm": 0.6077614752035385, + "learning_rate": 2.9893595750903813e-05, + "loss": 11.8512, + "step": 27725 + }, + { + "epoch": 1.509791509260583, + "grad_norm": 0.500352912740398, + "learning_rate": 2.9887307738444293e-05, + "loss": 11.8185, + "step": 27726 + }, + { + "epoch": 1.509845963257166, + "grad_norm": 0.6414151378311607, + "learning_rate": 2.9881020271192806e-05, + "loss": 11.8646, + "step": 27727 + }, + { + "epoch": 1.509900417253749, + "grad_norm": 0.562359974934004, + "learning_rate": 2.9874733349198315e-05, + "loss": 11.8485, + "step": 27728 + }, + { + "epoch": 1.509954871250332, + "grad_norm": 0.5324743453276087, + "learning_rate": 2.9868446972509612e-05, + "loss": 11.9092, + "step": 27729 + }, + { + "epoch": 1.510009325246915, + "grad_norm": 0.6062978713611572, + "learning_rate": 2.986216114117566e-05, + "loss": 11.7869, + "step": 27730 + }, + { + "epoch": 1.5100637792434979, + "grad_norm": 0.5741654731406654, + "learning_rate": 2.985587585524532e-05, + "loss": 11.872, + "step": 27731 + }, + { + "epoch": 1.5101182332400809, + "grad_norm": 0.5104704082013496, + "learning_rate": 2.9849591114767406e-05, + "loss": 11.7474, + "step": 27732 + }, + { + "epoch": 1.5101726872366639, + "grad_norm": 0.674178818188482, + "learning_rate": 2.984330691979087e-05, + "loss": 11.9635, + "step": 27733 + }, + { + "epoch": 1.5102271412332469, + "grad_norm": 0.5652969158957791, + "learning_rate": 2.9837023270364506e-05, + "loss": 11.9251, + "step": 27734 + }, + { + "epoch": 1.5102815952298299, + "grad_norm": 0.5609797645699056, + "learning_rate": 2.9830740166537264e-05, + "loss": 11.8837, + "step": 27735 + }, + { + "epoch": 1.5103360492264128, + "grad_norm": 0.5159221518667632, + "learning_rate": 2.9824457608357902e-05, + "loss": 11.8842, + "step": 27736 + }, + { + "epoch": 1.5103905032229958, + "grad_norm": 0.5129693179919498, + "learning_rate": 2.9818175595875342e-05, + "loss": 11.6384, + "step": 27737 + }, + { + "epoch": 1.5104449572195788, + "grad_norm": 0.606667141464947, + "learning_rate": 2.9811894129138452e-05, + "loss": 11.9409, + "step": 27738 + }, + { + "epoch": 1.5104994112161618, + "grad_norm": 0.5693205394135664, + "learning_rate": 2.9805613208196003e-05, + "loss": 11.8966, + "step": 27739 + }, + { + "epoch": 1.5105538652127448, + "grad_norm": 0.5250548845680687, + "learning_rate": 2.9799332833096906e-05, + "loss": 11.711, + "step": 27740 + }, + { + "epoch": 1.5106083192093278, + "grad_norm": 0.5524343060557678, + "learning_rate": 2.979305300388997e-05, + "loss": 11.8614, + "step": 27741 + }, + { + "epoch": 1.510662773205911, + "grad_norm": 0.5485382355577155, + "learning_rate": 2.9786773720624007e-05, + "loss": 11.8623, + "step": 27742 + }, + { + "epoch": 1.510717227202494, + "grad_norm": 0.5195175777305084, + "learning_rate": 2.9780494983347885e-05, + "loss": 11.8537, + "step": 27743 + }, + { + "epoch": 1.510771681199077, + "grad_norm": 0.5646632657230068, + "learning_rate": 2.9774216792110386e-05, + "loss": 11.9142, + "step": 27744 + }, + { + "epoch": 1.51082613519566, + "grad_norm": 0.5326355239330797, + "learning_rate": 2.976793914696039e-05, + "loss": 11.7431, + "step": 27745 + }, + { + "epoch": 1.510880589192243, + "grad_norm": 0.5640738171778821, + "learning_rate": 2.9761662047946638e-05, + "loss": 11.8633, + "step": 27746 + }, + { + "epoch": 1.510935043188826, + "grad_norm": 0.6548038237412614, + "learning_rate": 2.9755385495118014e-05, + "loss": 11.8872, + "step": 27747 + }, + { + "epoch": 1.5109894971854092, + "grad_norm": 0.6111004721881695, + "learning_rate": 2.9749109488523265e-05, + "loss": 11.8724, + "step": 27748 + }, + { + "epoch": 1.5110439511819922, + "grad_norm": 0.5840157481701392, + "learning_rate": 2.9742834028211207e-05, + "loss": 11.888, + "step": 27749 + }, + { + "epoch": 1.5110984051785752, + "grad_norm": 0.512270983141964, + "learning_rate": 2.97365591142307e-05, + "loss": 11.8493, + "step": 27750 + }, + { + "epoch": 1.5111528591751582, + "grad_norm": 0.5047354628998809, + "learning_rate": 2.9730284746630454e-05, + "loss": 11.7814, + "step": 27751 + }, + { + "epoch": 1.5112073131717412, + "grad_norm": 0.5192742936666448, + "learning_rate": 2.9724010925459368e-05, + "loss": 11.9035, + "step": 27752 + }, + { + "epoch": 1.5112617671683242, + "grad_norm": 0.5700787671591359, + "learning_rate": 2.9717737650766085e-05, + "loss": 11.7511, + "step": 27753 + }, + { + "epoch": 1.5113162211649072, + "grad_norm": 0.4802551680400022, + "learning_rate": 2.9711464922599474e-05, + "loss": 11.6913, + "step": 27754 + }, + { + "epoch": 1.5113706751614902, + "grad_norm": 0.5204942213256752, + "learning_rate": 2.9705192741008325e-05, + "loss": 11.7288, + "step": 27755 + }, + { + "epoch": 1.5114251291580731, + "grad_norm": 0.5405846651405644, + "learning_rate": 2.9698921106041354e-05, + "loss": 11.797, + "step": 27756 + }, + { + "epoch": 1.5114795831546561, + "grad_norm": 0.5781230033029797, + "learning_rate": 2.969265001774739e-05, + "loss": 11.7932, + "step": 27757 + }, + { + "epoch": 1.5115340371512391, + "grad_norm": 0.5185954624122529, + "learning_rate": 2.968637947617514e-05, + "loss": 11.756, + "step": 27758 + }, + { + "epoch": 1.5115884911478221, + "grad_norm": 0.5698397862436131, + "learning_rate": 2.968010948137343e-05, + "loss": 11.8523, + "step": 27759 + }, + { + "epoch": 1.5116429451444051, + "grad_norm": 0.5756019090718169, + "learning_rate": 2.9673840033390943e-05, + "loss": 11.8631, + "step": 27760 + }, + { + "epoch": 1.511697399140988, + "grad_norm": 0.5585769282172337, + "learning_rate": 2.9667571132276474e-05, + "loss": 11.8178, + "step": 27761 + }, + { + "epoch": 1.511751853137571, + "grad_norm": 0.5598159438504424, + "learning_rate": 2.9661302778078826e-05, + "loss": 11.8843, + "step": 27762 + }, + { + "epoch": 1.511806307134154, + "grad_norm": 0.5718899983595637, + "learning_rate": 2.965503497084663e-05, + "loss": 11.9957, + "step": 27763 + }, + { + "epoch": 1.511860761130737, + "grad_norm": 0.5435589226540075, + "learning_rate": 2.9648767710628665e-05, + "loss": 11.9282, + "step": 27764 + }, + { + "epoch": 1.5119152151273203, + "grad_norm": 0.5583589531844237, + "learning_rate": 2.964250099747372e-05, + "loss": 11.8914, + "step": 27765 + }, + { + "epoch": 1.5119696691239033, + "grad_norm": 0.6002200385594159, + "learning_rate": 2.9636234831430454e-05, + "loss": 11.7539, + "step": 27766 + }, + { + "epoch": 1.5120241231204863, + "grad_norm": 0.5466406512985008, + "learning_rate": 2.962996921254766e-05, + "loss": 11.797, + "step": 27767 + }, + { + "epoch": 1.5120785771170693, + "grad_norm": 0.49651751791199267, + "learning_rate": 2.9623704140873986e-05, + "loss": 11.8796, + "step": 27768 + }, + { + "epoch": 1.5121330311136523, + "grad_norm": 0.5640010234271187, + "learning_rate": 2.961743961645823e-05, + "loss": 11.9479, + "step": 27769 + }, + { + "epoch": 1.5121874851102353, + "grad_norm": 0.4978289056008298, + "learning_rate": 2.961117563934903e-05, + "loss": 11.758, + "step": 27770 + }, + { + "epoch": 1.5122419391068185, + "grad_norm": 0.5258343399477016, + "learning_rate": 2.9604912209595136e-05, + "loss": 11.7486, + "step": 27771 + }, + { + "epoch": 1.5122963931034015, + "grad_norm": 0.5353552731019385, + "learning_rate": 2.959864932724532e-05, + "loss": 11.8828, + "step": 27772 + }, + { + "epoch": 1.5123508470999845, + "grad_norm": 0.5380838597696318, + "learning_rate": 2.959238699234814e-05, + "loss": 11.9841, + "step": 27773 + }, + { + "epoch": 1.5124053010965675, + "grad_norm": 0.5472535488504849, + "learning_rate": 2.9586125204952398e-05, + "loss": 11.7373, + "step": 27774 + }, + { + "epoch": 1.5124597550931504, + "grad_norm": 0.49168558506863086, + "learning_rate": 2.9579863965106724e-05, + "loss": 11.677, + "step": 27775 + }, + { + "epoch": 1.5125142090897334, + "grad_norm": 0.6033380433241394, + "learning_rate": 2.957360327285984e-05, + "loss": 11.9156, + "step": 27776 + }, + { + "epoch": 1.5125686630863164, + "grad_norm": 0.5972369825776153, + "learning_rate": 2.956734312826046e-05, + "loss": 11.9468, + "step": 27777 + }, + { + "epoch": 1.5126231170828994, + "grad_norm": 0.5765631999622778, + "learning_rate": 2.9561083531357203e-05, + "loss": 11.8256, + "step": 27778 + }, + { + "epoch": 1.5126775710794824, + "grad_norm": 0.5718339356550207, + "learning_rate": 2.9554824482198794e-05, + "loss": 11.9699, + "step": 27779 + }, + { + "epoch": 1.5127320250760654, + "grad_norm": 0.5322174718543619, + "learning_rate": 2.9548565980833854e-05, + "loss": 11.8512, + "step": 27780 + }, + { + "epoch": 1.5127864790726484, + "grad_norm": 0.5482797675858028, + "learning_rate": 2.9542308027311116e-05, + "loss": 11.9528, + "step": 27781 + }, + { + "epoch": 1.5128409330692314, + "grad_norm": 0.5400244281915337, + "learning_rate": 2.953605062167921e-05, + "loss": 11.7887, + "step": 27782 + }, + { + "epoch": 1.5128953870658144, + "grad_norm": 0.6011909227453015, + "learning_rate": 2.9529793763986758e-05, + "loss": 11.8717, + "step": 27783 + }, + { + "epoch": 1.5129498410623974, + "grad_norm": 0.5506693073369622, + "learning_rate": 2.952353745428247e-05, + "loss": 11.7844, + "step": 27784 + }, + { + "epoch": 1.5130042950589804, + "grad_norm": 0.5740683367698717, + "learning_rate": 2.9517281692614952e-05, + "loss": 11.9149, + "step": 27785 + }, + { + "epoch": 1.5130587490555634, + "grad_norm": 0.5624485248051482, + "learning_rate": 2.9511026479032867e-05, + "loss": 11.9163, + "step": 27786 + }, + { + "epoch": 1.5131132030521464, + "grad_norm": 0.54204018303972, + "learning_rate": 2.9504771813584887e-05, + "loss": 11.9727, + "step": 27787 + }, + { + "epoch": 1.5131676570487296, + "grad_norm": 0.5594505465196601, + "learning_rate": 2.9498517696319605e-05, + "loss": 11.9219, + "step": 27788 + }, + { + "epoch": 1.5132221110453126, + "grad_norm": 0.6689923677952158, + "learning_rate": 2.9492264127285695e-05, + "loss": 12.0173, + "step": 27789 + }, + { + "epoch": 1.5132765650418956, + "grad_norm": 0.5000079040973773, + "learning_rate": 2.948601110653173e-05, + "loss": 11.8615, + "step": 27790 + }, + { + "epoch": 1.5133310190384786, + "grad_norm": 0.5409981726138181, + "learning_rate": 2.94797586341064e-05, + "loss": 11.7796, + "step": 27791 + }, + { + "epoch": 1.5133854730350615, + "grad_norm": 0.5024907674996787, + "learning_rate": 2.947350671005831e-05, + "loss": 11.8399, + "step": 27792 + }, + { + "epoch": 1.5134399270316445, + "grad_norm": 0.5657093388356965, + "learning_rate": 2.9467255334436006e-05, + "loss": 11.8278, + "step": 27793 + }, + { + "epoch": 1.5134943810282278, + "grad_norm": 0.512819115467977, + "learning_rate": 2.9461004507288194e-05, + "loss": 11.8364, + "step": 27794 + }, + { + "epoch": 1.5135488350248107, + "grad_norm": 0.5853750031511613, + "learning_rate": 2.9454754228663407e-05, + "loss": 11.7854, + "step": 27795 + }, + { + "epoch": 1.5136032890213937, + "grad_norm": 0.5658294543457406, + "learning_rate": 2.9448504498610307e-05, + "loss": 11.9533, + "step": 27796 + }, + { + "epoch": 1.5136577430179767, + "grad_norm": 0.516569400584985, + "learning_rate": 2.9442255317177448e-05, + "loss": 11.7878, + "step": 27797 + }, + { + "epoch": 1.5137121970145597, + "grad_norm": 0.5623119523680566, + "learning_rate": 2.9436006684413444e-05, + "loss": 11.8565, + "step": 27798 + }, + { + "epoch": 1.5137666510111427, + "grad_norm": 0.5305491088570766, + "learning_rate": 2.9429758600366906e-05, + "loss": 11.7577, + "step": 27799 + }, + { + "epoch": 1.5138211050077257, + "grad_norm": 0.4661915877437278, + "learning_rate": 2.9423511065086375e-05, + "loss": 11.6817, + "step": 27800 + }, + { + "epoch": 1.5138755590043087, + "grad_norm": 0.6026615705906947, + "learning_rate": 2.941726407862049e-05, + "loss": 11.9328, + "step": 27801 + }, + { + "epoch": 1.5139300130008917, + "grad_norm": 0.5188263326926184, + "learning_rate": 2.941101764101779e-05, + "loss": 11.8143, + "step": 27802 + }, + { + "epoch": 1.5139844669974747, + "grad_norm": 0.5143942788249989, + "learning_rate": 2.940477175232683e-05, + "loss": 11.8826, + "step": 27803 + }, + { + "epoch": 1.5140389209940577, + "grad_norm": 0.5289844667408242, + "learning_rate": 2.9398526412596228e-05, + "loss": 11.6707, + "step": 27804 + }, + { + "epoch": 1.5140933749906407, + "grad_norm": 0.5190835605898211, + "learning_rate": 2.9392281621874495e-05, + "loss": 11.8505, + "step": 27805 + }, + { + "epoch": 1.5141478289872237, + "grad_norm": 0.6307976520850446, + "learning_rate": 2.938603738021026e-05, + "loss": 11.8924, + "step": 27806 + }, + { + "epoch": 1.5142022829838067, + "grad_norm": 0.5463165588990535, + "learning_rate": 2.9379793687652003e-05, + "loss": 11.8728, + "step": 27807 + }, + { + "epoch": 1.5142567369803897, + "grad_norm": 0.5091881255534446, + "learning_rate": 2.937355054424835e-05, + "loss": 11.7606, + "step": 27808 + }, + { + "epoch": 1.5143111909769726, + "grad_norm": 0.5107749718863238, + "learning_rate": 2.9367307950047775e-05, + "loss": 11.6613, + "step": 27809 + }, + { + "epoch": 1.5143656449735556, + "grad_norm": 0.605009771388253, + "learning_rate": 2.9361065905098862e-05, + "loss": 11.8747, + "step": 27810 + }, + { + "epoch": 1.5144200989701386, + "grad_norm": 0.6343331401690552, + "learning_rate": 2.9354824409450165e-05, + "loss": 11.9454, + "step": 27811 + }, + { + "epoch": 1.5144745529667218, + "grad_norm": 0.5961261935633988, + "learning_rate": 2.9348583463150215e-05, + "loss": 11.9897, + "step": 27812 + }, + { + "epoch": 1.5145290069633048, + "grad_norm": 0.5260408723621902, + "learning_rate": 2.934234306624749e-05, + "loss": 11.8594, + "step": 27813 + }, + { + "epoch": 1.5145834609598878, + "grad_norm": 0.5676865393284748, + "learning_rate": 2.933610321879059e-05, + "loss": 11.6873, + "step": 27814 + }, + { + "epoch": 1.5146379149564708, + "grad_norm": 0.5789886000789263, + "learning_rate": 2.9329863920827973e-05, + "loss": 12.0022, + "step": 27815 + }, + { + "epoch": 1.5146923689530538, + "grad_norm": 0.5660964656367419, + "learning_rate": 2.932362517240822e-05, + "loss": 11.9032, + "step": 27816 + }, + { + "epoch": 1.5147468229496368, + "grad_norm": 0.5911587393227951, + "learning_rate": 2.9317386973579764e-05, + "loss": 11.9297, + "step": 27817 + }, + { + "epoch": 1.51480127694622, + "grad_norm": 0.5413495498279537, + "learning_rate": 2.931114932439121e-05, + "loss": 11.8286, + "step": 27818 + }, + { + "epoch": 1.514855730942803, + "grad_norm": 0.5625383941350024, + "learning_rate": 2.930491222489097e-05, + "loss": 11.887, + "step": 27819 + }, + { + "epoch": 1.514910184939386, + "grad_norm": 0.5705753135611374, + "learning_rate": 2.9298675675127586e-05, + "loss": 11.9516, + "step": 27820 + }, + { + "epoch": 1.514964638935969, + "grad_norm": 0.5495970961301904, + "learning_rate": 2.9292439675149587e-05, + "loss": 11.8987, + "step": 27821 + }, + { + "epoch": 1.515019092932552, + "grad_norm": 0.5799716159335289, + "learning_rate": 2.928620422500544e-05, + "loss": 11.807, + "step": 27822 + }, + { + "epoch": 1.515073546929135, + "grad_norm": 0.5299167729543424, + "learning_rate": 2.927996932474363e-05, + "loss": 11.8671, + "step": 27823 + }, + { + "epoch": 1.515128000925718, + "grad_norm": 0.549514754283632, + "learning_rate": 2.9273734974412605e-05, + "loss": 11.9484, + "step": 27824 + }, + { + "epoch": 1.515182454922301, + "grad_norm": 0.6565246978050049, + "learning_rate": 2.9267501174060873e-05, + "loss": 11.8285, + "step": 27825 + }, + { + "epoch": 1.515236908918884, + "grad_norm": 0.5094424060972945, + "learning_rate": 2.9261267923736958e-05, + "loss": 11.8731, + "step": 27826 + }, + { + "epoch": 1.515291362915467, + "grad_norm": 0.5401169161084873, + "learning_rate": 2.9255035223489236e-05, + "loss": 11.8489, + "step": 27827 + }, + { + "epoch": 1.51534581691205, + "grad_norm": 0.534441901673353, + "learning_rate": 2.9248803073366272e-05, + "loss": 11.8566, + "step": 27828 + }, + { + "epoch": 1.515400270908633, + "grad_norm": 0.5261033805036803, + "learning_rate": 2.9242571473416436e-05, + "loss": 11.8294, + "step": 27829 + }, + { + "epoch": 1.515454724905216, + "grad_norm": 0.6081218612704858, + "learning_rate": 2.9236340423688268e-05, + "loss": 11.9544, + "step": 27830 + }, + { + "epoch": 1.515509178901799, + "grad_norm": 0.5769409886346919, + "learning_rate": 2.9230109924230152e-05, + "loss": 11.925, + "step": 27831 + }, + { + "epoch": 1.515563632898382, + "grad_norm": 0.5710254215367669, + "learning_rate": 2.9223879975090606e-05, + "loss": 11.9083, + "step": 27832 + }, + { + "epoch": 1.515618086894965, + "grad_norm": 0.5514124110800704, + "learning_rate": 2.9217650576318036e-05, + "loss": 11.8565, + "step": 27833 + }, + { + "epoch": 1.515672540891548, + "grad_norm": 0.5206379089906886, + "learning_rate": 2.9211421727960854e-05, + "loss": 11.7541, + "step": 27834 + }, + { + "epoch": 1.5157269948881311, + "grad_norm": 0.537492180588417, + "learning_rate": 2.9205193430067525e-05, + "loss": 11.8885, + "step": 27835 + }, + { + "epoch": 1.515781448884714, + "grad_norm": 0.5459418827556682, + "learning_rate": 2.919896568268652e-05, + "loss": 11.741, + "step": 27836 + }, + { + "epoch": 1.515835902881297, + "grad_norm": 0.5690263196747039, + "learning_rate": 2.919273848586619e-05, + "loss": 11.7847, + "step": 27837 + }, + { + "epoch": 1.51589035687788, + "grad_norm": 0.518363252654648, + "learning_rate": 2.918651183965504e-05, + "loss": 11.7857, + "step": 27838 + }, + { + "epoch": 1.515944810874463, + "grad_norm": 0.5698791668369638, + "learning_rate": 2.918028574410141e-05, + "loss": 11.724, + "step": 27839 + }, + { + "epoch": 1.515999264871046, + "grad_norm": 0.6821101984848678, + "learning_rate": 2.9174060199253794e-05, + "loss": 11.8832, + "step": 27840 + }, + { + "epoch": 1.5160537188676293, + "grad_norm": 0.5393663450828089, + "learning_rate": 2.9167835205160532e-05, + "loss": 11.7887, + "step": 27841 + }, + { + "epoch": 1.5161081728642123, + "grad_norm": 0.6874066226660146, + "learning_rate": 2.9161610761870082e-05, + "loss": 11.7755, + "step": 27842 + }, + { + "epoch": 1.5161626268607953, + "grad_norm": 0.5503605609436028, + "learning_rate": 2.9155386869430847e-05, + "loss": 11.8203, + "step": 27843 + }, + { + "epoch": 1.5162170808573783, + "grad_norm": 0.5233963511568601, + "learning_rate": 2.9149163527891154e-05, + "loss": 11.8154, + "step": 27844 + }, + { + "epoch": 1.5162715348539613, + "grad_norm": 0.635581410885624, + "learning_rate": 2.9142940737299485e-05, + "loss": 11.8079, + "step": 27845 + }, + { + "epoch": 1.5163259888505443, + "grad_norm": 0.6003910004603072, + "learning_rate": 2.9136718497704164e-05, + "loss": 11.8677, + "step": 27846 + }, + { + "epoch": 1.5163804428471273, + "grad_norm": 0.55421737256066, + "learning_rate": 2.9130496809153597e-05, + "loss": 11.8329, + "step": 27847 + }, + { + "epoch": 1.5164348968437102, + "grad_norm": 0.5580708026299369, + "learning_rate": 2.9124275671696212e-05, + "loss": 11.9821, + "step": 27848 + }, + { + "epoch": 1.5164893508402932, + "grad_norm": 0.521589600014856, + "learning_rate": 2.9118055085380303e-05, + "loss": 11.8927, + "step": 27849 + }, + { + "epoch": 1.5165438048368762, + "grad_norm": 0.5953309181436236, + "learning_rate": 2.9111835050254323e-05, + "loss": 11.9729, + "step": 27850 + }, + { + "epoch": 1.5165982588334592, + "grad_norm": 0.5634892078543181, + "learning_rate": 2.9105615566366563e-05, + "loss": 11.8954, + "step": 27851 + }, + { + "epoch": 1.5166527128300422, + "grad_norm": 0.5375423234571668, + "learning_rate": 2.9099396633765464e-05, + "loss": 11.7917, + "step": 27852 + }, + { + "epoch": 1.5167071668266252, + "grad_norm": 0.7058152558283332, + "learning_rate": 2.9093178252499344e-05, + "loss": 12.0143, + "step": 27853 + }, + { + "epoch": 1.5167616208232082, + "grad_norm": 0.627399877302454, + "learning_rate": 2.9086960422616527e-05, + "loss": 11.8652, + "step": 27854 + }, + { + "epoch": 1.5168160748197912, + "grad_norm": 0.5432207752183109, + "learning_rate": 2.9080743144165433e-05, + "loss": 11.9182, + "step": 27855 + }, + { + "epoch": 1.5168705288163742, + "grad_norm": 0.5795550600974179, + "learning_rate": 2.9074526417194347e-05, + "loss": 11.918, + "step": 27856 + }, + { + "epoch": 1.5169249828129572, + "grad_norm": 0.6469021761797537, + "learning_rate": 2.906831024175166e-05, + "loss": 11.8331, + "step": 27857 + }, + { + "epoch": 1.5169794368095404, + "grad_norm": 0.5351030400617772, + "learning_rate": 2.906209461788566e-05, + "loss": 11.8457, + "step": 27858 + }, + { + "epoch": 1.5170338908061234, + "grad_norm": 0.5242390532435152, + "learning_rate": 2.9055879545644716e-05, + "loss": 11.8352, + "step": 27859 + }, + { + "epoch": 1.5170883448027064, + "grad_norm": 0.536360864304305, + "learning_rate": 2.9049665025077178e-05, + "loss": 11.7729, + "step": 27860 + }, + { + "epoch": 1.5171427987992894, + "grad_norm": 0.5182475054363096, + "learning_rate": 2.9043451056231295e-05, + "loss": 11.8093, + "step": 27861 + }, + { + "epoch": 1.5171972527958724, + "grad_norm": 0.5674892112315953, + "learning_rate": 2.9037237639155492e-05, + "loss": 11.9089, + "step": 27862 + }, + { + "epoch": 1.5172517067924554, + "grad_norm": 0.48246487052542764, + "learning_rate": 2.9031024773898018e-05, + "loss": 11.7977, + "step": 27863 + }, + { + "epoch": 1.5173061607890386, + "grad_norm": 0.6185070494429271, + "learning_rate": 2.9024812460507154e-05, + "loss": 11.7855, + "step": 27864 + }, + { + "epoch": 1.5173606147856216, + "grad_norm": 0.5718141434432262, + "learning_rate": 2.9018600699031294e-05, + "loss": 11.9102, + "step": 27865 + }, + { + "epoch": 1.5174150687822046, + "grad_norm": 0.5511896418875497, + "learning_rate": 2.9012389489518677e-05, + "loss": 11.823, + "step": 27866 + }, + { + "epoch": 1.5174695227787875, + "grad_norm": 0.5754939719354365, + "learning_rate": 2.9006178832017638e-05, + "loss": 11.9012, + "step": 27867 + }, + { + "epoch": 1.5175239767753705, + "grad_norm": 0.5322506960199332, + "learning_rate": 2.8999968726576442e-05, + "loss": 11.8449, + "step": 27868 + }, + { + "epoch": 1.5175784307719535, + "grad_norm": 0.5396397627818631, + "learning_rate": 2.8993759173243386e-05, + "loss": 11.8189, + "step": 27869 + }, + { + "epoch": 1.5176328847685365, + "grad_norm": 0.5185878582681657, + "learning_rate": 2.89875501720668e-05, + "loss": 11.895, + "step": 27870 + }, + { + "epoch": 1.5176873387651195, + "grad_norm": 0.5792286258753058, + "learning_rate": 2.8981341723094902e-05, + "loss": 11.8972, + "step": 27871 + }, + { + "epoch": 1.5177417927617025, + "grad_norm": 0.5322116546301725, + "learning_rate": 2.8975133826376057e-05, + "loss": 11.9257, + "step": 27872 + }, + { + "epoch": 1.5177962467582855, + "grad_norm": 0.5851757597525883, + "learning_rate": 2.8968926481958424e-05, + "loss": 11.8221, + "step": 27873 + }, + { + "epoch": 1.5178507007548685, + "grad_norm": 0.5829732504888988, + "learning_rate": 2.896271968989034e-05, + "loss": 11.9159, + "step": 27874 + }, + { + "epoch": 1.5179051547514515, + "grad_norm": 0.5144869012805328, + "learning_rate": 2.895651345022008e-05, + "loss": 11.6957, + "step": 27875 + }, + { + "epoch": 1.5179596087480345, + "grad_norm": 0.5280357587995841, + "learning_rate": 2.8950307762995853e-05, + "loss": 11.8242, + "step": 27876 + }, + { + "epoch": 1.5180140627446175, + "grad_norm": 0.5394983149057343, + "learning_rate": 2.894410262826599e-05, + "loss": 11.8521, + "step": 27877 + }, + { + "epoch": 1.5180685167412005, + "grad_norm": 0.5396967352632152, + "learning_rate": 2.893789804607866e-05, + "loss": 11.8221, + "step": 27878 + }, + { + "epoch": 1.5181229707377835, + "grad_norm": 0.5640942151315489, + "learning_rate": 2.893169401648218e-05, + "loss": 11.9148, + "step": 27879 + }, + { + "epoch": 1.5181774247343665, + "grad_norm": 0.5478598688414913, + "learning_rate": 2.8925490539524746e-05, + "loss": 11.8535, + "step": 27880 + }, + { + "epoch": 1.5182318787309494, + "grad_norm": 0.5769870862339115, + "learning_rate": 2.891928761525461e-05, + "loss": 12.0046, + "step": 27881 + }, + { + "epoch": 1.5182863327275327, + "grad_norm": 0.5464094118829763, + "learning_rate": 2.8913085243720085e-05, + "loss": 11.9371, + "step": 27882 + }, + { + "epoch": 1.5183407867241157, + "grad_norm": 0.5639966166131771, + "learning_rate": 2.8906883424969257e-05, + "loss": 11.9645, + "step": 27883 + }, + { + "epoch": 1.5183952407206986, + "grad_norm": 0.561523889345088, + "learning_rate": 2.890068215905043e-05, + "loss": 11.8737, + "step": 27884 + }, + { + "epoch": 1.5184496947172816, + "grad_norm": 0.5584318488250063, + "learning_rate": 2.8894481446011847e-05, + "loss": 11.7879, + "step": 27885 + }, + { + "epoch": 1.5185041487138646, + "grad_norm": 0.5195656150142299, + "learning_rate": 2.8888281285901674e-05, + "loss": 11.8961, + "step": 27886 + }, + { + "epoch": 1.5185586027104476, + "grad_norm": 0.5258584797947724, + "learning_rate": 2.8882081678768193e-05, + "loss": 11.8562, + "step": 27887 + }, + { + "epoch": 1.5186130567070308, + "grad_norm": 0.5705927095621556, + "learning_rate": 2.8875882624659524e-05, + "loss": 11.9243, + "step": 27888 + }, + { + "epoch": 1.5186675107036138, + "grad_norm": 0.5482577633922507, + "learning_rate": 2.8869684123623963e-05, + "loss": 11.8742, + "step": 27889 + }, + { + "epoch": 1.5187219647001968, + "grad_norm": 0.5771281166583392, + "learning_rate": 2.886348617570963e-05, + "loss": 11.8703, + "step": 27890 + }, + { + "epoch": 1.5187764186967798, + "grad_norm": 0.5262282254611751, + "learning_rate": 2.8857288780964753e-05, + "loss": 11.8149, + "step": 27891 + }, + { + "epoch": 1.5188308726933628, + "grad_norm": 0.5398936561700859, + "learning_rate": 2.8851091939437602e-05, + "loss": 11.7759, + "step": 27892 + }, + { + "epoch": 1.5188853266899458, + "grad_norm": 0.5436349613015419, + "learning_rate": 2.8844895651176218e-05, + "loss": 11.9125, + "step": 27893 + }, + { + "epoch": 1.5189397806865288, + "grad_norm": 0.5211540971772278, + "learning_rate": 2.8838699916228894e-05, + "loss": 11.8642, + "step": 27894 + }, + { + "epoch": 1.5189942346831118, + "grad_norm": 0.5274338892558537, + "learning_rate": 2.883250473464374e-05, + "loss": 11.8229, + "step": 27895 + }, + { + "epoch": 1.5190486886796948, + "grad_norm": 0.5211417927003539, + "learning_rate": 2.8826310106468968e-05, + "loss": 11.7099, + "step": 27896 + }, + { + "epoch": 1.5191031426762778, + "grad_norm": 0.5494026998814043, + "learning_rate": 2.8820116031752774e-05, + "loss": 11.807, + "step": 27897 + }, + { + "epoch": 1.5191575966728608, + "grad_norm": 0.6055377194136393, + "learning_rate": 2.8813922510543267e-05, + "loss": 11.8004, + "step": 27898 + }, + { + "epoch": 1.5192120506694438, + "grad_norm": 0.5225621783353847, + "learning_rate": 2.8807729542888662e-05, + "loss": 11.7967, + "step": 27899 + }, + { + "epoch": 1.5192665046660268, + "grad_norm": 0.5567172433033473, + "learning_rate": 2.8801537128837065e-05, + "loss": 11.7802, + "step": 27900 + }, + { + "epoch": 1.5193209586626097, + "grad_norm": 0.5196774715295803, + "learning_rate": 2.879534526843668e-05, + "loss": 11.8116, + "step": 27901 + }, + { + "epoch": 1.5193754126591927, + "grad_norm": 0.5388871073186456, + "learning_rate": 2.8789153961735605e-05, + "loss": 11.8597, + "step": 27902 + }, + { + "epoch": 1.5194298666557757, + "grad_norm": 0.5610179797228153, + "learning_rate": 2.8782963208782042e-05, + "loss": 11.847, + "step": 27903 + }, + { + "epoch": 1.5194843206523587, + "grad_norm": 0.5846937180348672, + "learning_rate": 2.87767730096241e-05, + "loss": 11.8652, + "step": 27904 + }, + { + "epoch": 1.519538774648942, + "grad_norm": 0.5537353097572987, + "learning_rate": 2.877058336430989e-05, + "loss": 11.8942, + "step": 27905 + }, + { + "epoch": 1.519593228645525, + "grad_norm": 0.5957336635846588, + "learning_rate": 2.8764394272887584e-05, + "loss": 11.8114, + "step": 27906 + }, + { + "epoch": 1.519647682642108, + "grad_norm": 0.48896804956338424, + "learning_rate": 2.8758205735405276e-05, + "loss": 11.8408, + "step": 27907 + }, + { + "epoch": 1.519702136638691, + "grad_norm": 0.5693470935341843, + "learning_rate": 2.875201775191111e-05, + "loss": 11.6823, + "step": 27908 + }, + { + "epoch": 1.519756590635274, + "grad_norm": 0.5409164269378668, + "learning_rate": 2.8745830322453226e-05, + "loss": 11.8604, + "step": 27909 + }, + { + "epoch": 1.519811044631857, + "grad_norm": 0.5218839293379983, + "learning_rate": 2.873964344707968e-05, + "loss": 11.8008, + "step": 27910 + }, + { + "epoch": 1.5198654986284401, + "grad_norm": 0.5377627599129235, + "learning_rate": 2.8733457125838658e-05, + "loss": 11.7381, + "step": 27911 + }, + { + "epoch": 1.519919952625023, + "grad_norm": 0.5765601938153482, + "learning_rate": 2.8727271358778185e-05, + "loss": 11.8799, + "step": 27912 + }, + { + "epoch": 1.519974406621606, + "grad_norm": 0.5392376194512571, + "learning_rate": 2.872108614594644e-05, + "loss": 11.766, + "step": 27913 + }, + { + "epoch": 1.520028860618189, + "grad_norm": 0.5889251853275724, + "learning_rate": 2.8714901487391477e-05, + "loss": 11.8027, + "step": 27914 + }, + { + "epoch": 1.520083314614772, + "grad_norm": 0.5743403154080624, + "learning_rate": 2.8708717383161366e-05, + "loss": 11.8232, + "step": 27915 + }, + { + "epoch": 1.520137768611355, + "grad_norm": 0.6280411365971715, + "learning_rate": 2.8702533833304256e-05, + "loss": 11.9877, + "step": 27916 + }, + { + "epoch": 1.520192222607938, + "grad_norm": 0.5101934537252294, + "learning_rate": 2.8696350837868168e-05, + "loss": 11.8981, + "step": 27917 + }, + { + "epoch": 1.520246676604521, + "grad_norm": 0.5393295124478232, + "learning_rate": 2.869016839690122e-05, + "loss": 11.7954, + "step": 27918 + }, + { + "epoch": 1.520301130601104, + "grad_norm": 0.5665424341421378, + "learning_rate": 2.86839865104515e-05, + "loss": 11.9649, + "step": 27919 + }, + { + "epoch": 1.520355584597687, + "grad_norm": 0.5372327587059348, + "learning_rate": 2.867780517856703e-05, + "loss": 11.8867, + "step": 27920 + }, + { + "epoch": 1.52041003859427, + "grad_norm": 0.49457536122256185, + "learning_rate": 2.8671624401295947e-05, + "loss": 11.7953, + "step": 27921 + }, + { + "epoch": 1.520464492590853, + "grad_norm": 0.5682484408441838, + "learning_rate": 2.866544417868624e-05, + "loss": 11.865, + "step": 27922 + }, + { + "epoch": 1.520518946587436, + "grad_norm": 0.5497355483267916, + "learning_rate": 2.865926451078603e-05, + "loss": 11.7453, + "step": 27923 + }, + { + "epoch": 1.520573400584019, + "grad_norm": 0.5312350452988377, + "learning_rate": 2.8653085397643355e-05, + "loss": 11.7752, + "step": 27924 + }, + { + "epoch": 1.520627854580602, + "grad_norm": 0.5337459565912904, + "learning_rate": 2.864690683930621e-05, + "loss": 11.7479, + "step": 27925 + }, + { + "epoch": 1.520682308577185, + "grad_norm": 0.5492033095210955, + "learning_rate": 2.8640728835822715e-05, + "loss": 11.8602, + "step": 27926 + }, + { + "epoch": 1.520736762573768, + "grad_norm": 0.5882903796976917, + "learning_rate": 2.863455138724085e-05, + "loss": 11.9274, + "step": 27927 + }, + { + "epoch": 1.5207912165703512, + "grad_norm": 0.5464474040295534, + "learning_rate": 2.862837449360871e-05, + "loss": 11.8038, + "step": 27928 + }, + { + "epoch": 1.5208456705669342, + "grad_norm": 0.5259964522455184, + "learning_rate": 2.862219815497428e-05, + "loss": 11.8672, + "step": 27929 + }, + { + "epoch": 1.5209001245635172, + "grad_norm": 0.5164772302638427, + "learning_rate": 2.86160223713856e-05, + "loss": 11.8341, + "step": 27930 + }, + { + "epoch": 1.5209545785601002, + "grad_norm": 0.5307716165980209, + "learning_rate": 2.860984714289072e-05, + "loss": 11.7679, + "step": 27931 + }, + { + "epoch": 1.5210090325566832, + "grad_norm": 0.6043738213104092, + "learning_rate": 2.8603672469537622e-05, + "loss": 11.8279, + "step": 27932 + }, + { + "epoch": 1.5210634865532662, + "grad_norm": 0.5445614445443454, + "learning_rate": 2.8597498351374376e-05, + "loss": 11.7401, + "step": 27933 + }, + { + "epoch": 1.5211179405498494, + "grad_norm": 0.5289380992588216, + "learning_rate": 2.8591324788448948e-05, + "loss": 11.8751, + "step": 27934 + }, + { + "epoch": 1.5211723945464324, + "grad_norm": 0.533654251638464, + "learning_rate": 2.8585151780809328e-05, + "loss": 11.8401, + "step": 27935 + }, + { + "epoch": 1.5212268485430154, + "grad_norm": 0.5220964872865659, + "learning_rate": 2.8578979328503576e-05, + "loss": 11.8242, + "step": 27936 + }, + { + "epoch": 1.5212813025395984, + "grad_norm": 0.5114364612323417, + "learning_rate": 2.857280743157962e-05, + "loss": 11.8001, + "step": 27937 + }, + { + "epoch": 1.5213357565361814, + "grad_norm": 0.5523660842971195, + "learning_rate": 2.856663609008553e-05, + "loss": 11.8084, + "step": 27938 + }, + { + "epoch": 1.5213902105327644, + "grad_norm": 0.52333885065501, + "learning_rate": 2.856046530406922e-05, + "loss": 11.8418, + "step": 27939 + }, + { + "epoch": 1.5214446645293473, + "grad_norm": 0.5295371740160958, + "learning_rate": 2.8554295073578708e-05, + "loss": 11.8021, + "step": 27940 + }, + { + "epoch": 1.5214991185259303, + "grad_norm": 0.4960778756772864, + "learning_rate": 2.8548125398662017e-05, + "loss": 11.8328, + "step": 27941 + }, + { + "epoch": 1.5215535725225133, + "grad_norm": 0.46425175646156275, + "learning_rate": 2.8541956279367053e-05, + "loss": 11.729, + "step": 27942 + }, + { + "epoch": 1.5216080265190963, + "grad_norm": 0.5012255929276607, + "learning_rate": 2.853578771574189e-05, + "loss": 11.8035, + "step": 27943 + }, + { + "epoch": 1.5216624805156793, + "grad_norm": 0.5888817324911219, + "learning_rate": 2.8529619707834355e-05, + "loss": 11.8283, + "step": 27944 + }, + { + "epoch": 1.5217169345122623, + "grad_norm": 0.6008255980546373, + "learning_rate": 2.8523452255692485e-05, + "loss": 11.9394, + "step": 27945 + }, + { + "epoch": 1.5217713885088453, + "grad_norm": 0.6249937246002554, + "learning_rate": 2.8517285359364266e-05, + "loss": 11.92, + "step": 27946 + }, + { + "epoch": 1.5218258425054283, + "grad_norm": 0.5441509701530162, + "learning_rate": 2.851111901889759e-05, + "loss": 11.9523, + "step": 27947 + }, + { + "epoch": 1.5218802965020113, + "grad_norm": 0.5510325862380181, + "learning_rate": 2.850495323434048e-05, + "loss": 11.8965, + "step": 27948 + }, + { + "epoch": 1.5219347504985943, + "grad_norm": 0.5338604703249146, + "learning_rate": 2.8498788005740816e-05, + "loss": 11.863, + "step": 27949 + }, + { + "epoch": 1.5219892044951773, + "grad_norm": 0.6182806408308447, + "learning_rate": 2.8492623333146585e-05, + "loss": 12.011, + "step": 27950 + }, + { + "epoch": 1.5220436584917603, + "grad_norm": 0.5586257192807768, + "learning_rate": 2.8486459216605688e-05, + "loss": 12.0284, + "step": 27951 + }, + { + "epoch": 1.5220981124883435, + "grad_norm": 0.5386383629515035, + "learning_rate": 2.848029565616607e-05, + "loss": 11.6983, + "step": 27952 + }, + { + "epoch": 1.5221525664849265, + "grad_norm": 0.6378948206143068, + "learning_rate": 2.847413265187573e-05, + "loss": 12.0341, + "step": 27953 + }, + { + "epoch": 1.5222070204815095, + "grad_norm": 0.6468948500218227, + "learning_rate": 2.8467970203782467e-05, + "loss": 11.8029, + "step": 27954 + }, + { + "epoch": 1.5222614744780925, + "grad_norm": 0.5759773580548446, + "learning_rate": 2.846180831193429e-05, + "loss": 11.8949, + "step": 27955 + }, + { + "epoch": 1.5223159284746755, + "grad_norm": 0.6236305897816501, + "learning_rate": 2.8455646976379068e-05, + "loss": 11.9684, + "step": 27956 + }, + { + "epoch": 1.5223703824712587, + "grad_norm": 0.6303340620397581, + "learning_rate": 2.844948619716473e-05, + "loss": 11.6356, + "step": 27957 + }, + { + "epoch": 1.5224248364678417, + "grad_norm": 0.5853771207077727, + "learning_rate": 2.844332597433921e-05, + "loss": 11.7633, + "step": 27958 + }, + { + "epoch": 1.5224792904644247, + "grad_norm": 0.5212751154680605, + "learning_rate": 2.8437166307950368e-05, + "loss": 11.9005, + "step": 27959 + }, + { + "epoch": 1.5225337444610076, + "grad_norm": 0.5753545971375285, + "learning_rate": 2.8431007198046144e-05, + "loss": 11.848, + "step": 27960 + }, + { + "epoch": 1.5225881984575906, + "grad_norm": 0.5314260131438653, + "learning_rate": 2.8424848644674385e-05, + "loss": 11.6512, + "step": 27961 + }, + { + "epoch": 1.5226426524541736, + "grad_norm": 0.5560664066073572, + "learning_rate": 2.841869064788303e-05, + "loss": 11.8517, + "step": 27962 + }, + { + "epoch": 1.5226971064507566, + "grad_norm": 0.5235610838337695, + "learning_rate": 2.841253320771994e-05, + "loss": 11.8405, + "step": 27963 + }, + { + "epoch": 1.5227515604473396, + "grad_norm": 0.6135400382910773, + "learning_rate": 2.8406376324232974e-05, + "loss": 11.6442, + "step": 27964 + }, + { + "epoch": 1.5228060144439226, + "grad_norm": 0.6182186923824203, + "learning_rate": 2.8400219997470056e-05, + "loss": 11.7927, + "step": 27965 + }, + { + "epoch": 1.5228604684405056, + "grad_norm": 0.5503423157192251, + "learning_rate": 2.839406422747901e-05, + "loss": 11.8811, + "step": 27966 + }, + { + "epoch": 1.5229149224370886, + "grad_norm": 0.5613776072349622, + "learning_rate": 2.8387909014307722e-05, + "loss": 11.8915, + "step": 27967 + }, + { + "epoch": 1.5229693764336716, + "grad_norm": 0.5330578499553932, + "learning_rate": 2.8381754358004097e-05, + "loss": 11.7234, + "step": 27968 + }, + { + "epoch": 1.5230238304302546, + "grad_norm": 0.556692852189812, + "learning_rate": 2.837560025861593e-05, + "loss": 11.6645, + "step": 27969 + }, + { + "epoch": 1.5230782844268376, + "grad_norm": 0.529759914988266, + "learning_rate": 2.836944671619114e-05, + "loss": 11.8636, + "step": 27970 + }, + { + "epoch": 1.5231327384234206, + "grad_norm": 0.5558955504395425, + "learning_rate": 2.8363293730777517e-05, + "loss": 11.9139, + "step": 27971 + }, + { + "epoch": 1.5231871924200036, + "grad_norm": 0.5073574032386341, + "learning_rate": 2.8357141302422962e-05, + "loss": 11.8774, + "step": 27972 + }, + { + "epoch": 1.5232416464165865, + "grad_norm": 0.5408827877794644, + "learning_rate": 2.835098943117529e-05, + "loss": 11.8919, + "step": 27973 + }, + { + "epoch": 1.5232961004131695, + "grad_norm": 0.5485433703136475, + "learning_rate": 2.8344838117082306e-05, + "loss": 11.6821, + "step": 27974 + }, + { + "epoch": 1.5233505544097528, + "grad_norm": 0.5524202991264017, + "learning_rate": 2.8338687360191906e-05, + "loss": 11.7799, + "step": 27975 + }, + { + "epoch": 1.5234050084063357, + "grad_norm": 0.5407395598972906, + "learning_rate": 2.8332537160551864e-05, + "loss": 11.7938, + "step": 27976 + }, + { + "epoch": 1.5234594624029187, + "grad_norm": 0.5985515785085527, + "learning_rate": 2.8326387518210063e-05, + "loss": 11.9475, + "step": 27977 + }, + { + "epoch": 1.5235139163995017, + "grad_norm": 0.5743204461344149, + "learning_rate": 2.8320238433214263e-05, + "loss": 11.9857, + "step": 27978 + }, + { + "epoch": 1.5235683703960847, + "grad_norm": 0.5341807793487325, + "learning_rate": 2.8314089905612306e-05, + "loss": 11.8742, + "step": 27979 + }, + { + "epoch": 1.5236228243926677, + "grad_norm": 0.5275568888986025, + "learning_rate": 2.8307941935452043e-05, + "loss": 11.8501, + "step": 27980 + }, + { + "epoch": 1.523677278389251, + "grad_norm": 0.5393791917241146, + "learning_rate": 2.830179452278121e-05, + "loss": 11.766, + "step": 27981 + }, + { + "epoch": 1.523731732385834, + "grad_norm": 0.5768720464720104, + "learning_rate": 2.8295647667647685e-05, + "loss": 11.9164, + "step": 27982 + }, + { + "epoch": 1.523786186382417, + "grad_norm": 0.5896321753178678, + "learning_rate": 2.8289501370099225e-05, + "loss": 11.7657, + "step": 27983 + }, + { + "epoch": 1.523840640379, + "grad_norm": 0.5534454910447092, + "learning_rate": 2.8283355630183593e-05, + "loss": 11.8392, + "step": 27984 + }, + { + "epoch": 1.523895094375583, + "grad_norm": 0.5516639208822186, + "learning_rate": 2.8277210447948653e-05, + "loss": 11.8244, + "step": 27985 + }, + { + "epoch": 1.523949548372166, + "grad_norm": 0.686931899725569, + "learning_rate": 2.8271065823442123e-05, + "loss": 11.8538, + "step": 27986 + }, + { + "epoch": 1.524004002368749, + "grad_norm": 0.5315660720617317, + "learning_rate": 2.8264921756711837e-05, + "loss": 11.9026, + "step": 27987 + }, + { + "epoch": 1.5240584563653319, + "grad_norm": 0.6007169368610384, + "learning_rate": 2.8258778247805517e-05, + "loss": 11.8662, + "step": 27988 + }, + { + "epoch": 1.5241129103619149, + "grad_norm": 0.5597611080799025, + "learning_rate": 2.825263529677097e-05, + "loss": 11.8968, + "step": 27989 + }, + { + "epoch": 1.5241673643584979, + "grad_norm": 0.5920000808107831, + "learning_rate": 2.8246492903655998e-05, + "loss": 11.8771, + "step": 27990 + }, + { + "epoch": 1.5242218183550809, + "grad_norm": 0.5544028723552854, + "learning_rate": 2.8240351068508297e-05, + "loss": 11.8372, + "step": 27991 + }, + { + "epoch": 1.5242762723516639, + "grad_norm": 0.5826707811122956, + "learning_rate": 2.82342097913757e-05, + "loss": 11.8462, + "step": 27992 + }, + { + "epoch": 1.5243307263482468, + "grad_norm": 0.5593061795049994, + "learning_rate": 2.8228069072305907e-05, + "loss": 11.7339, + "step": 27993 + }, + { + "epoch": 1.5243851803448298, + "grad_norm": 0.555009710492283, + "learning_rate": 2.8221928911346663e-05, + "loss": 11.7679, + "step": 27994 + }, + { + "epoch": 1.5244396343414128, + "grad_norm": 0.588777087305147, + "learning_rate": 2.821578930854577e-05, + "loss": 11.8769, + "step": 27995 + }, + { + "epoch": 1.5244940883379958, + "grad_norm": 0.5612662130441314, + "learning_rate": 2.8209650263950905e-05, + "loss": 11.6895, + "step": 27996 + }, + { + "epoch": 1.5245485423345788, + "grad_norm": 0.5517453495317268, + "learning_rate": 2.8203511777609858e-05, + "loss": 11.7565, + "step": 27997 + }, + { + "epoch": 1.524602996331162, + "grad_norm": 0.5360699191588418, + "learning_rate": 2.819737384957033e-05, + "loss": 11.8972, + "step": 27998 + }, + { + "epoch": 1.524657450327745, + "grad_norm": 0.5216156089956738, + "learning_rate": 2.819123647988009e-05, + "loss": 11.7221, + "step": 27999 + }, + { + "epoch": 1.524711904324328, + "grad_norm": 0.5835597939173611, + "learning_rate": 2.81850996685868e-05, + "loss": 11.8809, + "step": 28000 + }, + { + "epoch": 1.524766358320911, + "grad_norm": 0.5840532266995864, + "learning_rate": 2.8178963415738225e-05, + "loss": 11.8049, + "step": 28001 + }, + { + "epoch": 1.524820812317494, + "grad_norm": 0.6183406611645063, + "learning_rate": 2.81728277213821e-05, + "loss": 11.8851, + "step": 28002 + }, + { + "epoch": 1.524875266314077, + "grad_norm": 0.5232583144111126, + "learning_rate": 2.816669258556611e-05, + "loss": 11.8841, + "step": 28003 + }, + { + "epoch": 1.5249297203106602, + "grad_norm": 0.5773888316303224, + "learning_rate": 2.8160558008337977e-05, + "loss": 11.8857, + "step": 28004 + }, + { + "epoch": 1.5249841743072432, + "grad_norm": 0.576444798531799, + "learning_rate": 2.815442398974536e-05, + "loss": 11.8206, + "step": 28005 + }, + { + "epoch": 1.5250386283038262, + "grad_norm": 0.475299752941559, + "learning_rate": 2.8148290529835987e-05, + "loss": 11.8208, + "step": 28006 + }, + { + "epoch": 1.5250930823004092, + "grad_norm": 0.5790512256596804, + "learning_rate": 2.8142157628657583e-05, + "loss": 11.8499, + "step": 28007 + }, + { + "epoch": 1.5251475362969922, + "grad_norm": 0.5234532016890456, + "learning_rate": 2.8136025286257784e-05, + "loss": 11.8159, + "step": 28008 + }, + { + "epoch": 1.5252019902935752, + "grad_norm": 0.5630963999270417, + "learning_rate": 2.8129893502684334e-05, + "loss": 11.8672, + "step": 28009 + }, + { + "epoch": 1.5252564442901582, + "grad_norm": 0.5455318970182873, + "learning_rate": 2.8123762277984856e-05, + "loss": 11.8057, + "step": 28010 + }, + { + "epoch": 1.5253108982867412, + "grad_norm": 0.5171199728328628, + "learning_rate": 2.8117631612207084e-05, + "loss": 11.721, + "step": 28011 + }, + { + "epoch": 1.5253653522833241, + "grad_norm": 0.5221933409063921, + "learning_rate": 2.811150150539863e-05, + "loss": 11.8984, + "step": 28012 + }, + { + "epoch": 1.5254198062799071, + "grad_norm": 0.5187042053234086, + "learning_rate": 2.8105371957607228e-05, + "loss": 11.8053, + "step": 28013 + }, + { + "epoch": 1.5254742602764901, + "grad_norm": 0.5429878060486782, + "learning_rate": 2.8099242968880512e-05, + "loss": 11.8103, + "step": 28014 + }, + { + "epoch": 1.5255287142730731, + "grad_norm": 0.6503001961805629, + "learning_rate": 2.809311453926612e-05, + "loss": 11.9886, + "step": 28015 + }, + { + "epoch": 1.5255831682696561, + "grad_norm": 0.5230869261194098, + "learning_rate": 2.808698666881171e-05, + "loss": 11.8418, + "step": 28016 + }, + { + "epoch": 1.5256376222662391, + "grad_norm": 0.5723371956038334, + "learning_rate": 2.8080859357564993e-05, + "loss": 11.9124, + "step": 28017 + }, + { + "epoch": 1.525692076262822, + "grad_norm": 0.5966982453619117, + "learning_rate": 2.807473260557355e-05, + "loss": 11.9901, + "step": 28018 + }, + { + "epoch": 1.525746530259405, + "grad_norm": 0.49105321129198276, + "learning_rate": 2.806860641288507e-05, + "loss": 11.849, + "step": 28019 + }, + { + "epoch": 1.525800984255988, + "grad_norm": 0.5817851378221217, + "learning_rate": 2.806248077954714e-05, + "loss": 11.7378, + "step": 28020 + }, + { + "epoch": 1.525855438252571, + "grad_norm": 0.5241202288140655, + "learning_rate": 2.8056355705607452e-05, + "loss": 11.7037, + "step": 28021 + }, + { + "epoch": 1.5259098922491543, + "grad_norm": 0.49166570019066175, + "learning_rate": 2.8050231191113573e-05, + "loss": 11.795, + "step": 28022 + }, + { + "epoch": 1.5259643462457373, + "grad_norm": 0.5136362026482226, + "learning_rate": 2.8044107236113203e-05, + "loss": 11.8897, + "step": 28023 + }, + { + "epoch": 1.5260188002423203, + "grad_norm": 0.5261733293137251, + "learning_rate": 2.8037983840653913e-05, + "loss": 11.8093, + "step": 28024 + }, + { + "epoch": 1.5260732542389033, + "grad_norm": 0.4760619588143287, + "learning_rate": 2.80318610047833e-05, + "loss": 11.7524, + "step": 28025 + }, + { + "epoch": 1.5261277082354863, + "grad_norm": 0.5216907813608113, + "learning_rate": 2.8025738728549046e-05, + "loss": 11.8057, + "step": 28026 + }, + { + "epoch": 1.5261821622320695, + "grad_norm": 0.4826666149391475, + "learning_rate": 2.8019617011998677e-05, + "loss": 11.759, + "step": 28027 + }, + { + "epoch": 1.5262366162286525, + "grad_norm": 0.5486646520149847, + "learning_rate": 2.8013495855179837e-05, + "loss": 11.8722, + "step": 28028 + }, + { + "epoch": 1.5262910702252355, + "grad_norm": 0.5339442745290974, + "learning_rate": 2.8007375258140156e-05, + "loss": 11.8051, + "step": 28029 + }, + { + "epoch": 1.5263455242218185, + "grad_norm": 0.5702150244759141, + "learning_rate": 2.8001255220927168e-05, + "loss": 11.8569, + "step": 28030 + }, + { + "epoch": 1.5263999782184015, + "grad_norm": 0.5393840986818391, + "learning_rate": 2.7995135743588528e-05, + "loss": 11.8107, + "step": 28031 + }, + { + "epoch": 1.5264544322149844, + "grad_norm": 0.5263479220858477, + "learning_rate": 2.7989016826171755e-05, + "loss": 11.9613, + "step": 28032 + }, + { + "epoch": 1.5265088862115674, + "grad_norm": 0.5609368998722691, + "learning_rate": 2.7982898468724493e-05, + "loss": 11.9058, + "step": 28033 + }, + { + "epoch": 1.5265633402081504, + "grad_norm": 0.5145333261289492, + "learning_rate": 2.797678067129429e-05, + "loss": 11.8689, + "step": 28034 + }, + { + "epoch": 1.5266177942047334, + "grad_norm": 0.5112366708808339, + "learning_rate": 2.797066343392869e-05, + "loss": 11.7795, + "step": 28035 + }, + { + "epoch": 1.5266722482013164, + "grad_norm": 0.5292153657436052, + "learning_rate": 2.7964546756675313e-05, + "loss": 11.8977, + "step": 28036 + }, + { + "epoch": 1.5267267021978994, + "grad_norm": 0.5187700953438499, + "learning_rate": 2.795843063958168e-05, + "loss": 11.885, + "step": 28037 + }, + { + "epoch": 1.5267811561944824, + "grad_norm": 0.5260785610507742, + "learning_rate": 2.7952315082695367e-05, + "loss": 11.7061, + "step": 28038 + }, + { + "epoch": 1.5268356101910654, + "grad_norm": 0.5926034175885343, + "learning_rate": 2.794620008606397e-05, + "loss": 11.8106, + "step": 28039 + }, + { + "epoch": 1.5268900641876484, + "grad_norm": 0.5698490004315689, + "learning_rate": 2.794008564973497e-05, + "loss": 11.8, + "step": 28040 + }, + { + "epoch": 1.5269445181842314, + "grad_norm": 0.6678278354852758, + "learning_rate": 2.7933971773755986e-05, + "loss": 11.8865, + "step": 28041 + }, + { + "epoch": 1.5269989721808144, + "grad_norm": 0.5213235120952687, + "learning_rate": 2.7927858458174483e-05, + "loss": 11.752, + "step": 28042 + }, + { + "epoch": 1.5270534261773974, + "grad_norm": 0.5463359333407151, + "learning_rate": 2.7921745703038072e-05, + "loss": 11.6669, + "step": 28043 + }, + { + "epoch": 1.5271078801739804, + "grad_norm": 0.5344727131368201, + "learning_rate": 2.7915633508394258e-05, + "loss": 11.8769, + "step": 28044 + }, + { + "epoch": 1.5271623341705636, + "grad_norm": 0.5402139975432181, + "learning_rate": 2.7909521874290524e-05, + "loss": 11.9086, + "step": 28045 + }, + { + "epoch": 1.5272167881671466, + "grad_norm": 0.6095364033149826, + "learning_rate": 2.7903410800774478e-05, + "loss": 11.901, + "step": 28046 + }, + { + "epoch": 1.5272712421637296, + "grad_norm": 0.5795695853077685, + "learning_rate": 2.7897300287893568e-05, + "loss": 11.9074, + "step": 28047 + }, + { + "epoch": 1.5273256961603126, + "grad_norm": 0.541051857417764, + "learning_rate": 2.7891190335695373e-05, + "loss": 11.8282, + "step": 28048 + }, + { + "epoch": 1.5273801501568955, + "grad_norm": 0.6023298925941373, + "learning_rate": 2.788508094422735e-05, + "loss": 11.9771, + "step": 28049 + }, + { + "epoch": 1.5274346041534785, + "grad_norm": 0.590164425957543, + "learning_rate": 2.7878972113537017e-05, + "loss": 11.8233, + "step": 28050 + }, + { + "epoch": 1.5274890581500618, + "grad_norm": 0.5883901291502481, + "learning_rate": 2.787286384367194e-05, + "loss": 11.859, + "step": 28051 + }, + { + "epoch": 1.5275435121466447, + "grad_norm": 0.5598461273887382, + "learning_rate": 2.7866756134679528e-05, + "loss": 11.8201, + "step": 28052 + }, + { + "epoch": 1.5275979661432277, + "grad_norm": 0.5510855141903137, + "learning_rate": 2.786064898660734e-05, + "loss": 11.9274, + "step": 28053 + }, + { + "epoch": 1.5276524201398107, + "grad_norm": 0.6171418019889008, + "learning_rate": 2.785454239950286e-05, + "loss": 11.9351, + "step": 28054 + }, + { + "epoch": 1.5277068741363937, + "grad_norm": 0.5441721929752072, + "learning_rate": 2.784843637341351e-05, + "loss": 11.8461, + "step": 28055 + }, + { + "epoch": 1.5277613281329767, + "grad_norm": 0.550427509481184, + "learning_rate": 2.784233090838686e-05, + "loss": 11.7901, + "step": 28056 + }, + { + "epoch": 1.5278157821295597, + "grad_norm": 0.5388240833165424, + "learning_rate": 2.7836226004470322e-05, + "loss": 11.9641, + "step": 28057 + }, + { + "epoch": 1.5278702361261427, + "grad_norm": 0.554171740403026, + "learning_rate": 2.7830121661711405e-05, + "loss": 11.837, + "step": 28058 + }, + { + "epoch": 1.5279246901227257, + "grad_norm": 0.5383805673000276, + "learning_rate": 2.782401788015755e-05, + "loss": 11.8651, + "step": 28059 + }, + { + "epoch": 1.5279791441193087, + "grad_norm": 0.5657370833017105, + "learning_rate": 2.781791465985626e-05, + "loss": 11.8957, + "step": 28060 + }, + { + "epoch": 1.5280335981158917, + "grad_norm": 0.5216301810143582, + "learning_rate": 2.781181200085494e-05, + "loss": 11.7694, + "step": 28061 + }, + { + "epoch": 1.5280880521124747, + "grad_norm": 0.4991395494787023, + "learning_rate": 2.7805709903201073e-05, + "loss": 11.8382, + "step": 28062 + }, + { + "epoch": 1.5281425061090577, + "grad_norm": 0.5198154381445942, + "learning_rate": 2.7799608366942143e-05, + "loss": 11.8512, + "step": 28063 + }, + { + "epoch": 1.5281969601056407, + "grad_norm": 0.5222690177492786, + "learning_rate": 2.779350739212557e-05, + "loss": 11.8377, + "step": 28064 + }, + { + "epoch": 1.5282514141022236, + "grad_norm": 0.4910046177255441, + "learning_rate": 2.778740697879877e-05, + "loss": 11.7966, + "step": 28065 + }, + { + "epoch": 1.5283058680988066, + "grad_norm": 0.5890142611054503, + "learning_rate": 2.7781307127009226e-05, + "loss": 11.8705, + "step": 28066 + }, + { + "epoch": 1.5283603220953896, + "grad_norm": 0.5150666867395313, + "learning_rate": 2.7775207836804317e-05, + "loss": 11.635, + "step": 28067 + }, + { + "epoch": 1.5284147760919728, + "grad_norm": 0.5877921587720709, + "learning_rate": 2.7769109108231528e-05, + "loss": 11.8718, + "step": 28068 + }, + { + "epoch": 1.5284692300885558, + "grad_norm": 0.5905701907420984, + "learning_rate": 2.776301094133824e-05, + "loss": 11.8247, + "step": 28069 + }, + { + "epoch": 1.5285236840851388, + "grad_norm": 0.5319275146743632, + "learning_rate": 2.7756913336171917e-05, + "loss": 11.885, + "step": 28070 + }, + { + "epoch": 1.5285781380817218, + "grad_norm": 0.5710362753324053, + "learning_rate": 2.7750816292779926e-05, + "loss": 11.9469, + "step": 28071 + }, + { + "epoch": 1.5286325920783048, + "grad_norm": 0.5839829945359638, + "learning_rate": 2.7744719811209695e-05, + "loss": 11.8371, + "step": 28072 + }, + { + "epoch": 1.5286870460748878, + "grad_norm": 0.5087264130086645, + "learning_rate": 2.7738623891508688e-05, + "loss": 11.7981, + "step": 28073 + }, + { + "epoch": 1.528741500071471, + "grad_norm": 0.6097435097493412, + "learning_rate": 2.7732528533724245e-05, + "loss": 12.018, + "step": 28074 + }, + { + "epoch": 1.528795954068054, + "grad_norm": 0.581255067665826, + "learning_rate": 2.772643373790378e-05, + "loss": 11.9142, + "step": 28075 + }, + { + "epoch": 1.528850408064637, + "grad_norm": 0.5529909667015764, + "learning_rate": 2.772033950409466e-05, + "loss": 11.6942, + "step": 28076 + }, + { + "epoch": 1.52890486206122, + "grad_norm": 0.5507846355961354, + "learning_rate": 2.7714245832344298e-05, + "loss": 11.9129, + "step": 28077 + }, + { + "epoch": 1.528959316057803, + "grad_norm": 0.5263795313275622, + "learning_rate": 2.770815272270012e-05, + "loss": 11.8114, + "step": 28078 + }, + { + "epoch": 1.529013770054386, + "grad_norm": 0.5915882574630816, + "learning_rate": 2.7702060175209433e-05, + "loss": 11.947, + "step": 28079 + }, + { + "epoch": 1.529068224050969, + "grad_norm": 0.5420336359228751, + "learning_rate": 2.7695968189919684e-05, + "loss": 11.6884, + "step": 28080 + }, + { + "epoch": 1.529122678047552, + "grad_norm": 0.5761177601026898, + "learning_rate": 2.7689876766878177e-05, + "loss": 11.9456, + "step": 28081 + }, + { + "epoch": 1.529177132044135, + "grad_norm": 0.6179788515082738, + "learning_rate": 2.7683785906132353e-05, + "loss": 11.9586, + "step": 28082 + }, + { + "epoch": 1.529231586040718, + "grad_norm": 0.5376348725469609, + "learning_rate": 2.7677695607729505e-05, + "loss": 11.7374, + "step": 28083 + }, + { + "epoch": 1.529286040037301, + "grad_norm": 0.5875898380240233, + "learning_rate": 2.7671605871717044e-05, + "loss": 11.8818, + "step": 28084 + }, + { + "epoch": 1.529340494033884, + "grad_norm": 0.5319121946915005, + "learning_rate": 2.7665516698142314e-05, + "loss": 11.812, + "step": 28085 + }, + { + "epoch": 1.529394948030467, + "grad_norm": 0.5232715675086153, + "learning_rate": 2.765942808705263e-05, + "loss": 11.6767, + "step": 28086 + }, + { + "epoch": 1.52944940202705, + "grad_norm": 0.587033249086594, + "learning_rate": 2.7653340038495358e-05, + "loss": 11.9199, + "step": 28087 + }, + { + "epoch": 1.529503856023633, + "grad_norm": 0.5670701469594571, + "learning_rate": 2.7647252552517878e-05, + "loss": 11.8395, + "step": 28088 + }, + { + "epoch": 1.529558310020216, + "grad_norm": 0.5456216942507206, + "learning_rate": 2.7641165629167464e-05, + "loss": 11.8476, + "step": 28089 + }, + { + "epoch": 1.529612764016799, + "grad_norm": 0.6133476880784436, + "learning_rate": 2.7635079268491516e-05, + "loss": 11.7352, + "step": 28090 + }, + { + "epoch": 1.5296672180133821, + "grad_norm": 0.5689587520370178, + "learning_rate": 2.762899347053729e-05, + "loss": 11.9563, + "step": 28091 + }, + { + "epoch": 1.5297216720099651, + "grad_norm": 0.5608064813813687, + "learning_rate": 2.7622908235352185e-05, + "loss": 11.9223, + "step": 28092 + }, + { + "epoch": 1.529776126006548, + "grad_norm": 0.5265785301100568, + "learning_rate": 2.7616823562983453e-05, + "loss": 11.8348, + "step": 28093 + }, + { + "epoch": 1.529830580003131, + "grad_norm": 0.5261958437030664, + "learning_rate": 2.761073945347846e-05, + "loss": 11.901, + "step": 28094 + }, + { + "epoch": 1.529885033999714, + "grad_norm": 0.5544943092947802, + "learning_rate": 2.7604655906884502e-05, + "loss": 11.8705, + "step": 28095 + }, + { + "epoch": 1.529939487996297, + "grad_norm": 0.5932595399334087, + "learning_rate": 2.7598572923248857e-05, + "loss": 11.9126, + "step": 28096 + }, + { + "epoch": 1.5299939419928803, + "grad_norm": 0.5241433903541007, + "learning_rate": 2.759249050261887e-05, + "loss": 11.8082, + "step": 28097 + }, + { + "epoch": 1.5300483959894633, + "grad_norm": 0.508394728574014, + "learning_rate": 2.7586408645041796e-05, + "loss": 11.8364, + "step": 28098 + }, + { + "epoch": 1.5301028499860463, + "grad_norm": 0.5660475859610954, + "learning_rate": 2.758032735056495e-05, + "loss": 11.9086, + "step": 28099 + }, + { + "epoch": 1.5301573039826293, + "grad_norm": 0.5854390292446331, + "learning_rate": 2.7574246619235654e-05, + "loss": 11.892, + "step": 28100 + }, + { + "epoch": 1.5302117579792123, + "grad_norm": 0.6135300347394426, + "learning_rate": 2.756816645110113e-05, + "loss": 11.8013, + "step": 28101 + }, + { + "epoch": 1.5302662119757953, + "grad_norm": 0.47022733445491777, + "learning_rate": 2.7562086846208723e-05, + "loss": 11.7453, + "step": 28102 + }, + { + "epoch": 1.5303206659723783, + "grad_norm": 0.5604935484080049, + "learning_rate": 2.7556007804605654e-05, + "loss": 11.8919, + "step": 28103 + }, + { + "epoch": 1.5303751199689613, + "grad_norm": 0.5560539287446726, + "learning_rate": 2.754992932633924e-05, + "loss": 11.9655, + "step": 28104 + }, + { + "epoch": 1.5304295739655442, + "grad_norm": 0.5035252030556944, + "learning_rate": 2.7543851411456722e-05, + "loss": 11.845, + "step": 28105 + }, + { + "epoch": 1.5304840279621272, + "grad_norm": 0.5998247291009688, + "learning_rate": 2.753777406000534e-05, + "loss": 11.8718, + "step": 28106 + }, + { + "epoch": 1.5305384819587102, + "grad_norm": 0.6277000221854228, + "learning_rate": 2.7531697272032408e-05, + "loss": 11.8319, + "step": 28107 + }, + { + "epoch": 1.5305929359552932, + "grad_norm": 0.5311575974019177, + "learning_rate": 2.7525621047585115e-05, + "loss": 11.862, + "step": 28108 + }, + { + "epoch": 1.5306473899518762, + "grad_norm": 0.5240917378904438, + "learning_rate": 2.7519545386710787e-05, + "loss": 11.6807, + "step": 28109 + }, + { + "epoch": 1.5307018439484592, + "grad_norm": 0.5482481864827582, + "learning_rate": 2.75134702894566e-05, + "loss": 11.7755, + "step": 28110 + }, + { + "epoch": 1.5307562979450422, + "grad_norm": 0.5199580600118068, + "learning_rate": 2.7507395755869813e-05, + "loss": 11.7715, + "step": 28111 + }, + { + "epoch": 1.5308107519416252, + "grad_norm": 0.46853550384032233, + "learning_rate": 2.7501321785997713e-05, + "loss": 11.8175, + "step": 28112 + }, + { + "epoch": 1.5308652059382082, + "grad_norm": 0.5378687057721647, + "learning_rate": 2.7495248379887462e-05, + "loss": 11.794, + "step": 28113 + }, + { + "epoch": 1.5309196599347912, + "grad_norm": 0.6076040383716326, + "learning_rate": 2.7489175537586342e-05, + "loss": 11.9345, + "step": 28114 + }, + { + "epoch": 1.5309741139313744, + "grad_norm": 0.5302827915958461, + "learning_rate": 2.748310325914155e-05, + "loss": 11.78, + "step": 28115 + }, + { + "epoch": 1.5310285679279574, + "grad_norm": 0.5206254842623453, + "learning_rate": 2.7477031544600297e-05, + "loss": 11.844, + "step": 28116 + }, + { + "epoch": 1.5310830219245404, + "grad_norm": 0.5782492612681784, + "learning_rate": 2.7470960394009826e-05, + "loss": 11.6907, + "step": 28117 + }, + { + "epoch": 1.5311374759211234, + "grad_norm": 0.5816187740888729, + "learning_rate": 2.7464889807417293e-05, + "loss": 11.7165, + "step": 28118 + }, + { + "epoch": 1.5311919299177064, + "grad_norm": 0.5147939602641817, + "learning_rate": 2.7458819784869983e-05, + "loss": 11.7735, + "step": 28119 + }, + { + "epoch": 1.5312463839142894, + "grad_norm": 0.5354121280135299, + "learning_rate": 2.745275032641502e-05, + "loss": 11.6952, + "step": 28120 + }, + { + "epoch": 1.5313008379108726, + "grad_norm": 0.5344169272867186, + "learning_rate": 2.7446681432099642e-05, + "loss": 11.8144, + "step": 28121 + }, + { + "epoch": 1.5313552919074556, + "grad_norm": 0.5251465455208807, + "learning_rate": 2.744061310197106e-05, + "loss": 11.766, + "step": 28122 + }, + { + "epoch": 1.5314097459040386, + "grad_norm": 0.5721180654907436, + "learning_rate": 2.743454533607641e-05, + "loss": 11.814, + "step": 28123 + }, + { + "epoch": 1.5314641999006215, + "grad_norm": 0.5198986708643252, + "learning_rate": 2.7428478134462963e-05, + "loss": 11.801, + "step": 28124 + }, + { + "epoch": 1.5315186538972045, + "grad_norm": 0.6349634894030276, + "learning_rate": 2.742241149717779e-05, + "loss": 11.8605, + "step": 28125 + }, + { + "epoch": 1.5315731078937875, + "grad_norm": 0.5476776064162383, + "learning_rate": 2.7416345424268107e-05, + "loss": 11.8861, + "step": 28126 + }, + { + "epoch": 1.5316275618903705, + "grad_norm": 0.5389532605450704, + "learning_rate": 2.7410279915781123e-05, + "loss": 11.9331, + "step": 28127 + }, + { + "epoch": 1.5316820158869535, + "grad_norm": 0.7137800699458078, + "learning_rate": 2.7404214971763952e-05, + "loss": 11.8757, + "step": 28128 + }, + { + "epoch": 1.5317364698835365, + "grad_norm": 0.6023696650918151, + "learning_rate": 2.7398150592263805e-05, + "loss": 11.7711, + "step": 28129 + }, + { + "epoch": 1.5317909238801195, + "grad_norm": 0.5275100704514966, + "learning_rate": 2.7392086777327787e-05, + "loss": 11.9261, + "step": 28130 + }, + { + "epoch": 1.5318453778767025, + "grad_norm": 0.5377998200518751, + "learning_rate": 2.738602352700311e-05, + "loss": 11.8072, + "step": 28131 + }, + { + "epoch": 1.5318998318732855, + "grad_norm": 0.5857096227630232, + "learning_rate": 2.737996084133686e-05, + "loss": 11.9087, + "step": 28132 + }, + { + "epoch": 1.5319542858698685, + "grad_norm": 0.608249892848749, + "learning_rate": 2.7373898720376212e-05, + "loss": 11.8523, + "step": 28133 + }, + { + "epoch": 1.5320087398664515, + "grad_norm": 0.5743496564238021, + "learning_rate": 2.7367837164168375e-05, + "loss": 11.7277, + "step": 28134 + }, + { + "epoch": 1.5320631938630345, + "grad_norm": 0.5774906890183263, + "learning_rate": 2.7361776172760346e-05, + "loss": 11.8523, + "step": 28135 + }, + { + "epoch": 1.5321176478596175, + "grad_norm": 0.5693847254039437, + "learning_rate": 2.7355715746199327e-05, + "loss": 11.8029, + "step": 28136 + }, + { + "epoch": 1.5321721018562005, + "grad_norm": 0.48642514926955, + "learning_rate": 2.734965588453248e-05, + "loss": 11.7881, + "step": 28137 + }, + { + "epoch": 1.5322265558527837, + "grad_norm": 0.5208125835512221, + "learning_rate": 2.7343596587806865e-05, + "loss": 11.769, + "step": 28138 + }, + { + "epoch": 1.5322810098493667, + "grad_norm": 0.5756246884970783, + "learning_rate": 2.7337537856069652e-05, + "loss": 11.8422, + "step": 28139 + }, + { + "epoch": 1.5323354638459497, + "grad_norm": 0.5510921702394357, + "learning_rate": 2.7331479689367888e-05, + "loss": 11.8868, + "step": 28140 + }, + { + "epoch": 1.5323899178425326, + "grad_norm": 0.6054078622464308, + "learning_rate": 2.7325422087748776e-05, + "loss": 11.8649, + "step": 28141 + }, + { + "epoch": 1.5324443718391156, + "grad_norm": 0.5215026850061641, + "learning_rate": 2.7319365051259326e-05, + "loss": 11.848, + "step": 28142 + }, + { + "epoch": 1.5324988258356986, + "grad_norm": 0.5448259488081963, + "learning_rate": 2.7313308579946684e-05, + "loss": 11.8352, + "step": 28143 + }, + { + "epoch": 1.5325532798322818, + "grad_norm": 0.536583365184915, + "learning_rate": 2.7307252673858007e-05, + "loss": 11.7611, + "step": 28144 + }, + { + "epoch": 1.5326077338288648, + "grad_norm": 0.5328057211891392, + "learning_rate": 2.7301197333040264e-05, + "loss": 11.8555, + "step": 28145 + }, + { + "epoch": 1.5326621878254478, + "grad_norm": 0.5382358859349715, + "learning_rate": 2.729514255754063e-05, + "loss": 11.8034, + "step": 28146 + }, + { + "epoch": 1.5327166418220308, + "grad_norm": 0.54372606929296, + "learning_rate": 2.7289088347406135e-05, + "loss": 11.8172, + "step": 28147 + }, + { + "epoch": 1.5327710958186138, + "grad_norm": 0.5071314621892234, + "learning_rate": 2.7283034702683885e-05, + "loss": 11.771, + "step": 28148 + }, + { + "epoch": 1.5328255498151968, + "grad_norm": 0.47760232581095086, + "learning_rate": 2.7276981623420974e-05, + "loss": 11.7506, + "step": 28149 + }, + { + "epoch": 1.5328800038117798, + "grad_norm": 0.5779126067885992, + "learning_rate": 2.7270929109664423e-05, + "loss": 11.776, + "step": 28150 + }, + { + "epoch": 1.5329344578083628, + "grad_norm": 0.532756986419023, + "learning_rate": 2.7264877161461343e-05, + "loss": 11.8969, + "step": 28151 + }, + { + "epoch": 1.5329889118049458, + "grad_norm": 0.5301360810526304, + "learning_rate": 2.7258825778858764e-05, + "loss": 11.909, + "step": 28152 + }, + { + "epoch": 1.5330433658015288, + "grad_norm": 0.5409486462875552, + "learning_rate": 2.725277496190377e-05, + "loss": 11.855, + "step": 28153 + }, + { + "epoch": 1.5330978197981118, + "grad_norm": 0.5483761385219517, + "learning_rate": 2.724672471064341e-05, + "loss": 11.8397, + "step": 28154 + }, + { + "epoch": 1.5331522737946948, + "grad_norm": 0.5196613633243852, + "learning_rate": 2.7240675025124684e-05, + "loss": 11.8376, + "step": 28155 + }, + { + "epoch": 1.5332067277912778, + "grad_norm": 0.5921240861695857, + "learning_rate": 2.7234625905394697e-05, + "loss": 11.9667, + "step": 28156 + }, + { + "epoch": 1.5332611817878607, + "grad_norm": 0.5912985763166101, + "learning_rate": 2.7228577351500427e-05, + "loss": 11.991, + "step": 28157 + }, + { + "epoch": 1.5333156357844437, + "grad_norm": 0.54196239765597, + "learning_rate": 2.7222529363488992e-05, + "loss": 11.7573, + "step": 28158 + }, + { + "epoch": 1.5333700897810267, + "grad_norm": 0.5850025102559534, + "learning_rate": 2.7216481941407323e-05, + "loss": 11.9364, + "step": 28159 + }, + { + "epoch": 1.5334245437776097, + "grad_norm": 0.5977439025733949, + "learning_rate": 2.721043508530251e-05, + "loss": 11.9675, + "step": 28160 + }, + { + "epoch": 1.533478997774193, + "grad_norm": 0.5388939351892333, + "learning_rate": 2.720438879522158e-05, + "loss": 11.8457, + "step": 28161 + }, + { + "epoch": 1.533533451770776, + "grad_norm": 0.5213198958516342, + "learning_rate": 2.7198343071211508e-05, + "loss": 11.789, + "step": 28162 + }, + { + "epoch": 1.533587905767359, + "grad_norm": 0.5201233933753517, + "learning_rate": 2.7192297913319363e-05, + "loss": 11.7326, + "step": 28163 + }, + { + "epoch": 1.533642359763942, + "grad_norm": 0.5696029144591914, + "learning_rate": 2.7186253321592114e-05, + "loss": 11.8743, + "step": 28164 + }, + { + "epoch": 1.533696813760525, + "grad_norm": 0.5661831990271644, + "learning_rate": 2.718020929607673e-05, + "loss": 11.5473, + "step": 28165 + }, + { + "epoch": 1.533751267757108, + "grad_norm": 0.5733737118565293, + "learning_rate": 2.7174165836820297e-05, + "loss": 11.897, + "step": 28166 + }, + { + "epoch": 1.5338057217536911, + "grad_norm": 0.5742035351847482, + "learning_rate": 2.7168122943869723e-05, + "loss": 11.8856, + "step": 28167 + }, + { + "epoch": 1.5338601757502741, + "grad_norm": 0.5999607268231191, + "learning_rate": 2.716208061727208e-05, + "loss": 11.8658, + "step": 28168 + }, + { + "epoch": 1.533914629746857, + "grad_norm": 0.5218650452531289, + "learning_rate": 2.715603885707427e-05, + "loss": 11.7596, + "step": 28169 + }, + { + "epoch": 1.53396908374344, + "grad_norm": 0.5579045882793019, + "learning_rate": 2.7149997663323323e-05, + "loss": 11.8881, + "step": 28170 + }, + { + "epoch": 1.534023537740023, + "grad_norm": 0.5119751726399356, + "learning_rate": 2.7143957036066238e-05, + "loss": 11.8144, + "step": 28171 + }, + { + "epoch": 1.534077991736606, + "grad_norm": 0.528054243795056, + "learning_rate": 2.713791697534994e-05, + "loss": 11.7638, + "step": 28172 + }, + { + "epoch": 1.534132445733189, + "grad_norm": 0.6120665347017282, + "learning_rate": 2.7131877481221448e-05, + "loss": 11.6907, + "step": 28173 + }, + { + "epoch": 1.534186899729772, + "grad_norm": 0.5020852473806008, + "learning_rate": 2.712583855372769e-05, + "loss": 11.791, + "step": 28174 + }, + { + "epoch": 1.534241353726355, + "grad_norm": 0.5512264992726531, + "learning_rate": 2.71198001929156e-05, + "loss": 11.7513, + "step": 28175 + }, + { + "epoch": 1.534295807722938, + "grad_norm": 0.5514363641125494, + "learning_rate": 2.71137623988322e-05, + "loss": 11.8073, + "step": 28176 + }, + { + "epoch": 1.534350261719521, + "grad_norm": 0.5312078707535559, + "learning_rate": 2.7107725171524378e-05, + "loss": 11.8327, + "step": 28177 + }, + { + "epoch": 1.534404715716104, + "grad_norm": 0.5247512696484431, + "learning_rate": 2.7101688511039137e-05, + "loss": 11.7713, + "step": 28178 + }, + { + "epoch": 1.534459169712687, + "grad_norm": 0.5459385675423508, + "learning_rate": 2.7095652417423357e-05, + "loss": 11.8542, + "step": 28179 + }, + { + "epoch": 1.53451362370927, + "grad_norm": 0.5749601248779559, + "learning_rate": 2.7089616890724044e-05, + "loss": 11.8236, + "step": 28180 + }, + { + "epoch": 1.534568077705853, + "grad_norm": 0.5664622210684159, + "learning_rate": 2.7083581930988065e-05, + "loss": 11.8597, + "step": 28181 + }, + { + "epoch": 1.534622531702436, + "grad_norm": 0.5690130145344745, + "learning_rate": 2.7077547538262392e-05, + "loss": 11.6244, + "step": 28182 + }, + { + "epoch": 1.534676985699019, + "grad_norm": 0.6098614195320473, + "learning_rate": 2.7071513712593954e-05, + "loss": 11.8586, + "step": 28183 + }, + { + "epoch": 1.534731439695602, + "grad_norm": 0.5345772468011127, + "learning_rate": 2.7065480454029635e-05, + "loss": 11.8058, + "step": 28184 + }, + { + "epoch": 1.5347858936921852, + "grad_norm": 0.5004064107412757, + "learning_rate": 2.7059447762616396e-05, + "loss": 11.692, + "step": 28185 + }, + { + "epoch": 1.5348403476887682, + "grad_norm": 0.5509148116583732, + "learning_rate": 2.7053415638401125e-05, + "loss": 11.8781, + "step": 28186 + }, + { + "epoch": 1.5348948016853512, + "grad_norm": 0.5388835013163247, + "learning_rate": 2.7047384081430706e-05, + "loss": 11.9216, + "step": 28187 + }, + { + "epoch": 1.5349492556819342, + "grad_norm": 0.5831123752442173, + "learning_rate": 2.7041353091752076e-05, + "loss": 11.878, + "step": 28188 + }, + { + "epoch": 1.5350037096785172, + "grad_norm": 0.5942715795712129, + "learning_rate": 2.70353226694121e-05, + "loss": 11.8724, + "step": 28189 + }, + { + "epoch": 1.5350581636751004, + "grad_norm": 0.576071406714683, + "learning_rate": 2.7029292814457717e-05, + "loss": 11.8425, + "step": 28190 + }, + { + "epoch": 1.5351126176716834, + "grad_norm": 0.5763627257223947, + "learning_rate": 2.702326352693576e-05, + "loss": 11.7898, + "step": 28191 + }, + { + "epoch": 1.5351670716682664, + "grad_norm": 0.543371020897107, + "learning_rate": 2.7017234806893143e-05, + "loss": 11.8641, + "step": 28192 + }, + { + "epoch": 1.5352215256648494, + "grad_norm": 0.6112162290672735, + "learning_rate": 2.7011206654376787e-05, + "loss": 11.8811, + "step": 28193 + }, + { + "epoch": 1.5352759796614324, + "grad_norm": 0.5228178862268024, + "learning_rate": 2.7005179069433482e-05, + "loss": 11.7781, + "step": 28194 + }, + { + "epoch": 1.5353304336580154, + "grad_norm": 0.5842052961124035, + "learning_rate": 2.6999152052110222e-05, + "loss": 11.9354, + "step": 28195 + }, + { + "epoch": 1.5353848876545984, + "grad_norm": 0.5523614778190836, + "learning_rate": 2.6993125602453728e-05, + "loss": 11.8795, + "step": 28196 + }, + { + "epoch": 1.5354393416511813, + "grad_norm": 0.561699747227993, + "learning_rate": 2.6987099720510933e-05, + "loss": 11.7798, + "step": 28197 + }, + { + "epoch": 1.5354937956477643, + "grad_norm": 0.5862358470888648, + "learning_rate": 2.6981074406328732e-05, + "loss": 11.894, + "step": 28198 + }, + { + "epoch": 1.5355482496443473, + "grad_norm": 0.5816568902861724, + "learning_rate": 2.697504965995391e-05, + "loss": 11.6072, + "step": 28199 + }, + { + "epoch": 1.5356027036409303, + "grad_norm": 0.5415710531098747, + "learning_rate": 2.6969025481433375e-05, + "loss": 11.8295, + "step": 28200 + }, + { + "epoch": 1.5356571576375133, + "grad_norm": 0.5528077817902902, + "learning_rate": 2.6963001870813907e-05, + "loss": 11.9264, + "step": 28201 + }, + { + "epoch": 1.5357116116340963, + "grad_norm": 0.5498728942280875, + "learning_rate": 2.695697882814243e-05, + "loss": 11.808, + "step": 28202 + }, + { + "epoch": 1.5357660656306793, + "grad_norm": 0.5274560242694658, + "learning_rate": 2.6950956353465697e-05, + "loss": 11.847, + "step": 28203 + }, + { + "epoch": 1.5358205196272623, + "grad_norm": 0.7194185033400121, + "learning_rate": 2.6944934446830584e-05, + "loss": 11.9278, + "step": 28204 + }, + { + "epoch": 1.5358749736238453, + "grad_norm": 0.5294529582944018, + "learning_rate": 2.693891310828396e-05, + "loss": 11.8302, + "step": 28205 + }, + { + "epoch": 1.5359294276204283, + "grad_norm": 0.5764854018767049, + "learning_rate": 2.693289233787255e-05, + "loss": 11.8552, + "step": 28206 + }, + { + "epoch": 1.5359838816170113, + "grad_norm": 0.5182438175989933, + "learning_rate": 2.6926872135643244e-05, + "loss": 11.7975, + "step": 28207 + }, + { + "epoch": 1.5360383356135945, + "grad_norm": 0.5586890704804036, + "learning_rate": 2.6920852501642812e-05, + "loss": 11.9141, + "step": 28208 + }, + { + "epoch": 1.5360927896101775, + "grad_norm": 0.5515382589733375, + "learning_rate": 2.6914833435918085e-05, + "loss": 11.6309, + "step": 28209 + }, + { + "epoch": 1.5361472436067605, + "grad_norm": 0.5639640398732918, + "learning_rate": 2.6908814938515904e-05, + "loss": 11.7213, + "step": 28210 + }, + { + "epoch": 1.5362016976033435, + "grad_norm": 0.5368623491940676, + "learning_rate": 2.6902797009482992e-05, + "loss": 11.7836, + "step": 28211 + }, + { + "epoch": 1.5362561515999265, + "grad_norm": 0.5347723030275935, + "learning_rate": 2.6896779648866234e-05, + "loss": 11.7712, + "step": 28212 + }, + { + "epoch": 1.5363106055965094, + "grad_norm": 0.561264192490462, + "learning_rate": 2.6890762856712338e-05, + "loss": 11.9036, + "step": 28213 + }, + { + "epoch": 1.5363650595930927, + "grad_norm": 0.5754915513990667, + "learning_rate": 2.688474663306816e-05, + "loss": 11.867, + "step": 28214 + }, + { + "epoch": 1.5364195135896757, + "grad_norm": 0.5205203897511063, + "learning_rate": 2.687873097798046e-05, + "loss": 11.8342, + "step": 28215 + }, + { + "epoch": 1.5364739675862586, + "grad_norm": 0.5193764182969883, + "learning_rate": 2.6872715891495982e-05, + "loss": 11.7938, + "step": 28216 + }, + { + "epoch": 1.5365284215828416, + "grad_norm": 0.593055727163197, + "learning_rate": 2.6866701373661552e-05, + "loss": 11.9238, + "step": 28217 + }, + { + "epoch": 1.5365828755794246, + "grad_norm": 0.5334897617058839, + "learning_rate": 2.686068742452389e-05, + "loss": 11.8103, + "step": 28218 + }, + { + "epoch": 1.5366373295760076, + "grad_norm": 0.5530211029638681, + "learning_rate": 2.685467404412978e-05, + "loss": 11.9189, + "step": 28219 + }, + { + "epoch": 1.5366917835725906, + "grad_norm": 0.5918491619100299, + "learning_rate": 2.684866123252603e-05, + "loss": 11.916, + "step": 28220 + }, + { + "epoch": 1.5367462375691736, + "grad_norm": 0.6564766542400814, + "learning_rate": 2.684264898975932e-05, + "loss": 11.9384, + "step": 28221 + }, + { + "epoch": 1.5368006915657566, + "grad_norm": 0.5300217143991779, + "learning_rate": 2.683663731587648e-05, + "loss": 11.8974, + "step": 28222 + }, + { + "epoch": 1.5368551455623396, + "grad_norm": 0.6238055965656362, + "learning_rate": 2.6830626210924182e-05, + "loss": 11.8434, + "step": 28223 + }, + { + "epoch": 1.5369095995589226, + "grad_norm": 0.4747300939338794, + "learning_rate": 2.6824615674949227e-05, + "loss": 11.876, + "step": 28224 + }, + { + "epoch": 1.5369640535555056, + "grad_norm": 0.5202331247292236, + "learning_rate": 2.6818605707998345e-05, + "loss": 11.7362, + "step": 28225 + }, + { + "epoch": 1.5370185075520886, + "grad_norm": 0.6266174323981987, + "learning_rate": 2.681259631011822e-05, + "loss": 11.8971, + "step": 28226 + }, + { + "epoch": 1.5370729615486716, + "grad_norm": 0.5401195218607323, + "learning_rate": 2.680658748135566e-05, + "loss": 11.8222, + "step": 28227 + }, + { + "epoch": 1.5371274155452546, + "grad_norm": 0.5287654703216333, + "learning_rate": 2.6800579221757306e-05, + "loss": 11.8862, + "step": 28228 + }, + { + "epoch": 1.5371818695418376, + "grad_norm": 0.5382665462414589, + "learning_rate": 2.679457153136996e-05, + "loss": 11.838, + "step": 28229 + }, + { + "epoch": 1.5372363235384205, + "grad_norm": 0.5244955209680553, + "learning_rate": 2.678856441024028e-05, + "loss": 11.7849, + "step": 28230 + }, + { + "epoch": 1.5372907775350038, + "grad_norm": 0.5285389177552661, + "learning_rate": 2.6782557858414992e-05, + "loss": 11.8859, + "step": 28231 + }, + { + "epoch": 1.5373452315315868, + "grad_norm": 0.5646328371339733, + "learning_rate": 2.6776551875940847e-05, + "loss": 11.8143, + "step": 28232 + }, + { + "epoch": 1.5373996855281697, + "grad_norm": 0.5363893936188058, + "learning_rate": 2.6770546462864478e-05, + "loss": 11.8303, + "step": 28233 + }, + { + "epoch": 1.5374541395247527, + "grad_norm": 0.5184131565912657, + "learning_rate": 2.676454161923265e-05, + "loss": 11.9607, + "step": 28234 + }, + { + "epoch": 1.5375085935213357, + "grad_norm": 0.5467243664910497, + "learning_rate": 2.6758537345092037e-05, + "loss": 11.7396, + "step": 28235 + }, + { + "epoch": 1.5375630475179187, + "grad_norm": 0.5002391440336198, + "learning_rate": 2.6752533640489284e-05, + "loss": 11.6786, + "step": 28236 + }, + { + "epoch": 1.537617501514502, + "grad_norm": 0.5546305245961396, + "learning_rate": 2.6746530505471147e-05, + "loss": 11.8863, + "step": 28237 + }, + { + "epoch": 1.537671955511085, + "grad_norm": 0.5463276192766239, + "learning_rate": 2.6740527940084236e-05, + "loss": 11.7478, + "step": 28238 + }, + { + "epoch": 1.537726409507668, + "grad_norm": 0.6154320058044087, + "learning_rate": 2.67345259443753e-05, + "loss": 11.8903, + "step": 28239 + }, + { + "epoch": 1.537780863504251, + "grad_norm": 0.5863681216047066, + "learning_rate": 2.6728524518390952e-05, + "loss": 11.8566, + "step": 28240 + }, + { + "epoch": 1.537835317500834, + "grad_norm": 0.5603478206952835, + "learning_rate": 2.672252366217788e-05, + "loss": 11.8774, + "step": 28241 + }, + { + "epoch": 1.537889771497417, + "grad_norm": 0.5593682753283273, + "learning_rate": 2.6716523375782776e-05, + "loss": 11.7486, + "step": 28242 + }, + { + "epoch": 1.537944225494, + "grad_norm": 0.5579982858200164, + "learning_rate": 2.6710523659252252e-05, + "loss": 11.6957, + "step": 28243 + }, + { + "epoch": 1.5379986794905829, + "grad_norm": 0.5395499117612986, + "learning_rate": 2.670452451263301e-05, + "loss": 11.9338, + "step": 28244 + }, + { + "epoch": 1.5380531334871659, + "grad_norm": 0.5455376572211799, + "learning_rate": 2.6698525935971685e-05, + "loss": 11.7827, + "step": 28245 + }, + { + "epoch": 1.5381075874837489, + "grad_norm": 0.5247406524040752, + "learning_rate": 2.6692527929314893e-05, + "loss": 11.8625, + "step": 28246 + }, + { + "epoch": 1.5381620414803319, + "grad_norm": 0.5160767431977465, + "learning_rate": 2.6686530492709316e-05, + "loss": 11.7607, + "step": 28247 + }, + { + "epoch": 1.5382164954769149, + "grad_norm": 0.6140882322238727, + "learning_rate": 2.6680533626201544e-05, + "loss": 11.9011, + "step": 28248 + }, + { + "epoch": 1.5382709494734979, + "grad_norm": 0.5190865977042493, + "learning_rate": 2.6674537329838267e-05, + "loss": 11.8215, + "step": 28249 + }, + { + "epoch": 1.5383254034700808, + "grad_norm": 0.5606977311297707, + "learning_rate": 2.6668541603666052e-05, + "loss": 11.7624, + "step": 28250 + }, + { + "epoch": 1.5383798574666638, + "grad_norm": 0.5544608244481404, + "learning_rate": 2.6662546447731594e-05, + "loss": 11.866, + "step": 28251 + }, + { + "epoch": 1.5384343114632468, + "grad_norm": 0.5405060388636937, + "learning_rate": 2.6656551862081437e-05, + "loss": 11.8598, + "step": 28252 + }, + { + "epoch": 1.5384887654598298, + "grad_norm": 0.5814147566330362, + "learning_rate": 2.6650557846762237e-05, + "loss": 11.7744, + "step": 28253 + }, + { + "epoch": 1.5385432194564128, + "grad_norm": 0.4946928011055275, + "learning_rate": 2.6644564401820626e-05, + "loss": 11.7776, + "step": 28254 + }, + { + "epoch": 1.538597673452996, + "grad_norm": 0.5513304184837484, + "learning_rate": 2.663857152730319e-05, + "loss": 11.8948, + "step": 28255 + }, + { + "epoch": 1.538652127449579, + "grad_norm": 0.5325891969231507, + "learning_rate": 2.6632579223256515e-05, + "loss": 11.8826, + "step": 28256 + }, + { + "epoch": 1.538706581446162, + "grad_norm": 0.5272251203217627, + "learning_rate": 2.6626587489727184e-05, + "loss": 11.6687, + "step": 28257 + }, + { + "epoch": 1.538761035442745, + "grad_norm": 0.5104341806424597, + "learning_rate": 2.6620596326761814e-05, + "loss": 11.7976, + "step": 28258 + }, + { + "epoch": 1.538815489439328, + "grad_norm": 0.6044950523686626, + "learning_rate": 2.6614605734407016e-05, + "loss": 12.0231, + "step": 28259 + }, + { + "epoch": 1.5388699434359112, + "grad_norm": 0.509313261244652, + "learning_rate": 2.660861571270933e-05, + "loss": 11.8104, + "step": 28260 + }, + { + "epoch": 1.5389243974324942, + "grad_norm": 0.5832287795387571, + "learning_rate": 2.660262626171538e-05, + "loss": 11.9543, + "step": 28261 + }, + { + "epoch": 1.5389788514290772, + "grad_norm": 0.5313190113670253, + "learning_rate": 2.659663738147169e-05, + "loss": 11.9242, + "step": 28262 + }, + { + "epoch": 1.5390333054256602, + "grad_norm": 0.5440514986939275, + "learning_rate": 2.659064907202489e-05, + "loss": 11.9818, + "step": 28263 + }, + { + "epoch": 1.5390877594222432, + "grad_norm": 0.5605878741267645, + "learning_rate": 2.658466133342147e-05, + "loss": 11.9153, + "step": 28264 + }, + { + "epoch": 1.5391422134188262, + "grad_norm": 0.5532632315372058, + "learning_rate": 2.6578674165708073e-05, + "loss": 11.8579, + "step": 28265 + }, + { + "epoch": 1.5391966674154092, + "grad_norm": 0.5740420566631218, + "learning_rate": 2.657268756893122e-05, + "loss": 11.8121, + "step": 28266 + }, + { + "epoch": 1.5392511214119922, + "grad_norm": 0.5150297688552529, + "learning_rate": 2.656670154313744e-05, + "loss": 11.7826, + "step": 28267 + }, + { + "epoch": 1.5393055754085752, + "grad_norm": 0.5225642522836905, + "learning_rate": 2.6560716088373294e-05, + "loss": 11.7241, + "step": 28268 + }, + { + "epoch": 1.5393600294051581, + "grad_norm": 0.5335641716258503, + "learning_rate": 2.655473120468537e-05, + "loss": 11.8361, + "step": 28269 + }, + { + "epoch": 1.5394144834017411, + "grad_norm": 0.5868537446099409, + "learning_rate": 2.6548746892120136e-05, + "loss": 11.8187, + "step": 28270 + }, + { + "epoch": 1.5394689373983241, + "grad_norm": 0.5275408793199444, + "learning_rate": 2.6542763150724192e-05, + "loss": 11.8821, + "step": 28271 + }, + { + "epoch": 1.5395233913949071, + "grad_norm": 0.5738982718035953, + "learning_rate": 2.6536779980544024e-05, + "loss": 11.7754, + "step": 28272 + }, + { + "epoch": 1.5395778453914901, + "grad_norm": 0.550355726212609, + "learning_rate": 2.6530797381626194e-05, + "loss": 11.8614, + "step": 28273 + }, + { + "epoch": 1.5396322993880731, + "grad_norm": 0.5776006089965645, + "learning_rate": 2.6524815354017175e-05, + "loss": 11.8015, + "step": 28274 + }, + { + "epoch": 1.539686753384656, + "grad_norm": 0.5433736193633831, + "learning_rate": 2.651883389776355e-05, + "loss": 11.8397, + "step": 28275 + }, + { + "epoch": 1.539741207381239, + "grad_norm": 0.5378596487178605, + "learning_rate": 2.6512853012911787e-05, + "loss": 11.7706, + "step": 28276 + }, + { + "epoch": 1.539795661377822, + "grad_norm": 0.5774975567190883, + "learning_rate": 2.650687269950838e-05, + "loss": 11.8928, + "step": 28277 + }, + { + "epoch": 1.5398501153744053, + "grad_norm": 0.670412656161244, + "learning_rate": 2.650089295759989e-05, + "loss": 11.7932, + "step": 28278 + }, + { + "epoch": 1.5399045693709883, + "grad_norm": 0.6209052285760718, + "learning_rate": 2.6494913787232735e-05, + "loss": 12.094, + "step": 28279 + }, + { + "epoch": 1.5399590233675713, + "grad_norm": 0.5434472006128124, + "learning_rate": 2.648893518845347e-05, + "loss": 11.7558, + "step": 28280 + }, + { + "epoch": 1.5400134773641543, + "grad_norm": 0.507490407696909, + "learning_rate": 2.6482957161308598e-05, + "loss": 11.7851, + "step": 28281 + }, + { + "epoch": 1.5400679313607373, + "grad_norm": 0.5185829514338437, + "learning_rate": 2.6476979705844552e-05, + "loss": 11.8925, + "step": 28282 + }, + { + "epoch": 1.5401223853573203, + "grad_norm": 0.5818697985336496, + "learning_rate": 2.647100282210787e-05, + "loss": 11.7104, + "step": 28283 + }, + { + "epoch": 1.5401768393539035, + "grad_norm": 0.5285137941617885, + "learning_rate": 2.6465026510144975e-05, + "loss": 11.8305, + "step": 28284 + }, + { + "epoch": 1.5402312933504865, + "grad_norm": 0.5613587002440537, + "learning_rate": 2.645905077000239e-05, + "loss": 11.8033, + "step": 28285 + }, + { + "epoch": 1.5402857473470695, + "grad_norm": 0.5563490881689548, + "learning_rate": 2.6453075601726562e-05, + "loss": 11.981, + "step": 28286 + }, + { + "epoch": 1.5403402013436525, + "grad_norm": 0.5663931025263615, + "learning_rate": 2.6447101005363927e-05, + "loss": 11.826, + "step": 28287 + }, + { + "epoch": 1.5403946553402355, + "grad_norm": 0.5531980679284013, + "learning_rate": 2.6441126980960994e-05, + "loss": 11.8848, + "step": 28288 + }, + { + "epoch": 1.5404491093368184, + "grad_norm": 0.5424048762741972, + "learning_rate": 2.643515352856416e-05, + "loss": 11.7974, + "step": 28289 + }, + { + "epoch": 1.5405035633334014, + "grad_norm": 0.533448142546899, + "learning_rate": 2.642918064821992e-05, + "loss": 11.8026, + "step": 28290 + }, + { + "epoch": 1.5405580173299844, + "grad_norm": 0.5165036958476561, + "learning_rate": 2.6423208339974725e-05, + "loss": 11.9402, + "step": 28291 + }, + { + "epoch": 1.5406124713265674, + "grad_norm": 0.5354052136056343, + "learning_rate": 2.6417236603874982e-05, + "loss": 11.8617, + "step": 28292 + }, + { + "epoch": 1.5406669253231504, + "grad_norm": 0.592659981834093, + "learning_rate": 2.641126543996717e-05, + "loss": 11.8951, + "step": 28293 + }, + { + "epoch": 1.5407213793197334, + "grad_norm": 0.5192023515452038, + "learning_rate": 2.640529484829768e-05, + "loss": 11.7969, + "step": 28294 + }, + { + "epoch": 1.5407758333163164, + "grad_norm": 0.5714755675377791, + "learning_rate": 2.6399324828912973e-05, + "loss": 11.9247, + "step": 28295 + }, + { + "epoch": 1.5408302873128994, + "grad_norm": 0.5223828682124572, + "learning_rate": 2.6393355381859475e-05, + "loss": 11.7491, + "step": 28296 + }, + { + "epoch": 1.5408847413094824, + "grad_norm": 0.6046746840026958, + "learning_rate": 2.638738650718354e-05, + "loss": 11.8696, + "step": 28297 + }, + { + "epoch": 1.5409391953060654, + "grad_norm": 0.5080778644015773, + "learning_rate": 2.6381418204931684e-05, + "loss": 11.8089, + "step": 28298 + }, + { + "epoch": 1.5409936493026484, + "grad_norm": 0.5254248962893969, + "learning_rate": 2.637545047515022e-05, + "loss": 11.7532, + "step": 28299 + }, + { + "epoch": 1.5410481032992314, + "grad_norm": 0.5641932917252481, + "learning_rate": 2.6369483317885637e-05, + "loss": 11.8597, + "step": 28300 + }, + { + "epoch": 1.5411025572958146, + "grad_norm": 0.5175369706337605, + "learning_rate": 2.636351673318427e-05, + "loss": 11.7573, + "step": 28301 + }, + { + "epoch": 1.5411570112923976, + "grad_norm": 0.6062429143215914, + "learning_rate": 2.6357550721092538e-05, + "loss": 11.9393, + "step": 28302 + }, + { + "epoch": 1.5412114652889806, + "grad_norm": 0.5595891504648797, + "learning_rate": 2.6351585281656875e-05, + "loss": 11.7512, + "step": 28303 + }, + { + "epoch": 1.5412659192855636, + "grad_norm": 0.5788012801230924, + "learning_rate": 2.634562041492361e-05, + "loss": 11.9586, + "step": 28304 + }, + { + "epoch": 1.5413203732821465, + "grad_norm": 0.5805037700823928, + "learning_rate": 2.6339656120939182e-05, + "loss": 11.9215, + "step": 28305 + }, + { + "epoch": 1.5413748272787295, + "grad_norm": 0.5127059839878486, + "learning_rate": 2.6333692399749942e-05, + "loss": 11.8413, + "step": 28306 + }, + { + "epoch": 1.5414292812753128, + "grad_norm": 0.5395360288185435, + "learning_rate": 2.6327729251402233e-05, + "loss": 11.7857, + "step": 28307 + }, + { + "epoch": 1.5414837352718957, + "grad_norm": 0.5152761726323076, + "learning_rate": 2.6321766675942483e-05, + "loss": 11.8383, + "step": 28308 + }, + { + "epoch": 1.5415381892684787, + "grad_norm": 0.5600835552397703, + "learning_rate": 2.6315804673416998e-05, + "loss": 11.805, + "step": 28309 + }, + { + "epoch": 1.5415926432650617, + "grad_norm": 0.5454841709735719, + "learning_rate": 2.630984324387221e-05, + "loss": 11.8448, + "step": 28310 + }, + { + "epoch": 1.5416470972616447, + "grad_norm": 0.6253522379181685, + "learning_rate": 2.6303882387354406e-05, + "loss": 11.9069, + "step": 28311 + }, + { + "epoch": 1.5417015512582277, + "grad_norm": 0.6219127188901813, + "learning_rate": 2.629792210391e-05, + "loss": 11.8694, + "step": 28312 + }, + { + "epoch": 1.5417560052548107, + "grad_norm": 0.5797193422957406, + "learning_rate": 2.6291962393585278e-05, + "loss": 11.9399, + "step": 28313 + }, + { + "epoch": 1.5418104592513937, + "grad_norm": 0.5359935669224456, + "learning_rate": 2.6286003256426628e-05, + "loss": 11.7806, + "step": 28314 + }, + { + "epoch": 1.5418649132479767, + "grad_norm": 0.5482595131675084, + "learning_rate": 2.6280044692480432e-05, + "loss": 11.9322, + "step": 28315 + }, + { + "epoch": 1.5419193672445597, + "grad_norm": 0.5412211758477348, + "learning_rate": 2.627408670179291e-05, + "loss": 11.8118, + "step": 28316 + }, + { + "epoch": 1.5419738212411427, + "grad_norm": 0.5031157443018751, + "learning_rate": 2.6268129284410458e-05, + "loss": 11.724, + "step": 28317 + }, + { + "epoch": 1.5420282752377257, + "grad_norm": 0.48082930373461896, + "learning_rate": 2.6262172440379416e-05, + "loss": 11.9277, + "step": 28318 + }, + { + "epoch": 1.5420827292343087, + "grad_norm": 0.5756715142540825, + "learning_rate": 2.625621616974607e-05, + "loss": 11.8244, + "step": 28319 + }, + { + "epoch": 1.5421371832308917, + "grad_norm": 0.5387402752687224, + "learning_rate": 2.6250260472556777e-05, + "loss": 11.8202, + "step": 28320 + }, + { + "epoch": 1.5421916372274747, + "grad_norm": 0.5338966248872453, + "learning_rate": 2.62443053488578e-05, + "loss": 11.9323, + "step": 28321 + }, + { + "epoch": 1.5422460912240576, + "grad_norm": 0.49394165854848227, + "learning_rate": 2.623835079869551e-05, + "loss": 11.8239, + "step": 28322 + }, + { + "epoch": 1.5423005452206406, + "grad_norm": 0.5510301241991037, + "learning_rate": 2.6232396822116134e-05, + "loss": 11.7766, + "step": 28323 + }, + { + "epoch": 1.5423549992172239, + "grad_norm": 0.5920767057702554, + "learning_rate": 2.6226443419166012e-05, + "loss": 11.9232, + "step": 28324 + }, + { + "epoch": 1.5424094532138068, + "grad_norm": 0.5743929819507603, + "learning_rate": 2.62204905898915e-05, + "loss": 11.8195, + "step": 28325 + }, + { + "epoch": 1.5424639072103898, + "grad_norm": 0.5439752153702302, + "learning_rate": 2.6214538334338766e-05, + "loss": 11.9344, + "step": 28326 + }, + { + "epoch": 1.5425183612069728, + "grad_norm": 0.5646101969548867, + "learning_rate": 2.620858665255418e-05, + "loss": 11.8105, + "step": 28327 + }, + { + "epoch": 1.5425728152035558, + "grad_norm": 0.5761307663888735, + "learning_rate": 2.6202635544583975e-05, + "loss": 11.7824, + "step": 28328 + }, + { + "epoch": 1.5426272692001388, + "grad_norm": 0.532423942765136, + "learning_rate": 2.6196685010474452e-05, + "loss": 11.9044, + "step": 28329 + }, + { + "epoch": 1.542681723196722, + "grad_norm": 0.5435841278020633, + "learning_rate": 2.619073505027192e-05, + "loss": 11.9201, + "step": 28330 + }, + { + "epoch": 1.542736177193305, + "grad_norm": 0.5538508421217291, + "learning_rate": 2.618478566402257e-05, + "loss": 11.9541, + "step": 28331 + }, + { + "epoch": 1.542790631189888, + "grad_norm": 0.6162874260229597, + "learning_rate": 2.6178836851772736e-05, + "loss": 11.9137, + "step": 28332 + }, + { + "epoch": 1.542845085186471, + "grad_norm": 0.5373943280467628, + "learning_rate": 2.6172888613568612e-05, + "loss": 11.8199, + "step": 28333 + }, + { + "epoch": 1.542899539183054, + "grad_norm": 0.5179828636771145, + "learning_rate": 2.616694094945653e-05, + "loss": 11.8554, + "step": 28334 + }, + { + "epoch": 1.542953993179637, + "grad_norm": 0.48028407935061235, + "learning_rate": 2.6160993859482665e-05, + "loss": 11.9126, + "step": 28335 + }, + { + "epoch": 1.54300844717622, + "grad_norm": 0.5225454847238858, + "learning_rate": 2.6155047343693317e-05, + "loss": 11.8516, + "step": 28336 + }, + { + "epoch": 1.543062901172803, + "grad_norm": 0.563803946682016, + "learning_rate": 2.6149101402134702e-05, + "loss": 11.8689, + "step": 28337 + }, + { + "epoch": 1.543117355169386, + "grad_norm": 0.5273704956752464, + "learning_rate": 2.6143156034853034e-05, + "loss": 11.8562, + "step": 28338 + }, + { + "epoch": 1.543171809165969, + "grad_norm": 0.599175341602558, + "learning_rate": 2.6137211241894576e-05, + "loss": 11.8287, + "step": 28339 + }, + { + "epoch": 1.543226263162552, + "grad_norm": 0.5615828578947035, + "learning_rate": 2.613126702330557e-05, + "loss": 11.7832, + "step": 28340 + }, + { + "epoch": 1.543280717159135, + "grad_norm": 0.5481125681678248, + "learning_rate": 2.6125323379132193e-05, + "loss": 11.6501, + "step": 28341 + }, + { + "epoch": 1.543335171155718, + "grad_norm": 0.5813305801399807, + "learning_rate": 2.6119380309420727e-05, + "loss": 11.8198, + "step": 28342 + }, + { + "epoch": 1.543389625152301, + "grad_norm": 0.5535303105664144, + "learning_rate": 2.6113437814217313e-05, + "loss": 11.9125, + "step": 28343 + }, + { + "epoch": 1.543444079148884, + "grad_norm": 0.5820910869739198, + "learning_rate": 2.6107495893568236e-05, + "loss": 11.7817, + "step": 28344 + }, + { + "epoch": 1.543498533145467, + "grad_norm": 0.614254078685859, + "learning_rate": 2.6101554547519626e-05, + "loss": 11.8894, + "step": 28345 + }, + { + "epoch": 1.54355298714205, + "grad_norm": 0.5264325906590919, + "learning_rate": 2.6095613776117743e-05, + "loss": 11.8236, + "step": 28346 + }, + { + "epoch": 1.543607441138633, + "grad_norm": 0.5658556435382978, + "learning_rate": 2.608967357940877e-05, + "loss": 11.8537, + "step": 28347 + }, + { + "epoch": 1.5436618951352161, + "grad_norm": 0.5796891032107981, + "learning_rate": 2.608373395743886e-05, + "loss": 11.9766, + "step": 28348 + }, + { + "epoch": 1.5437163491317991, + "grad_norm": 0.5228768863041379, + "learning_rate": 2.607779491025426e-05, + "loss": 11.8632, + "step": 28349 + }, + { + "epoch": 1.543770803128382, + "grad_norm": 0.5562725029840014, + "learning_rate": 2.607185643790109e-05, + "loss": 11.7465, + "step": 28350 + }, + { + "epoch": 1.543825257124965, + "grad_norm": 0.524910704869663, + "learning_rate": 2.606591854042556e-05, + "loss": 11.8588, + "step": 28351 + }, + { + "epoch": 1.543879711121548, + "grad_norm": 0.5637855734704784, + "learning_rate": 2.6059981217873885e-05, + "loss": 11.9517, + "step": 28352 + }, + { + "epoch": 1.543934165118131, + "grad_norm": 0.6208971999782115, + "learning_rate": 2.605404447029216e-05, + "loss": 11.8617, + "step": 28353 + }, + { + "epoch": 1.5439886191147143, + "grad_norm": 0.5726753186084881, + "learning_rate": 2.6048108297726613e-05, + "loss": 11.9586, + "step": 28354 + }, + { + "epoch": 1.5440430731112973, + "grad_norm": 0.5358515479739011, + "learning_rate": 2.6042172700223357e-05, + "loss": 11.8994, + "step": 28355 + }, + { + "epoch": 1.5440975271078803, + "grad_norm": 0.548695997930056, + "learning_rate": 2.603623767782859e-05, + "loss": 11.9534, + "step": 28356 + }, + { + "epoch": 1.5441519811044633, + "grad_norm": 0.5525914459172369, + "learning_rate": 2.603030323058845e-05, + "loss": 11.8632, + "step": 28357 + }, + { + "epoch": 1.5442064351010463, + "grad_norm": 0.5346785184587497, + "learning_rate": 2.602436935854904e-05, + "loss": 11.7681, + "step": 28358 + }, + { + "epoch": 1.5442608890976293, + "grad_norm": 0.5612780373793211, + "learning_rate": 2.601843606175658e-05, + "loss": 11.9691, + "step": 28359 + }, + { + "epoch": 1.5443153430942123, + "grad_norm": 0.5176343562940072, + "learning_rate": 2.6012503340257123e-05, + "loss": 11.6743, + "step": 28360 + }, + { + "epoch": 1.5443697970907952, + "grad_norm": 0.5351190897668885, + "learning_rate": 2.6006571194096885e-05, + "loss": 11.8922, + "step": 28361 + }, + { + "epoch": 1.5444242510873782, + "grad_norm": 0.522983848147743, + "learning_rate": 2.6000639623321933e-05, + "loss": 11.5706, + "step": 28362 + }, + { + "epoch": 1.5444787050839612, + "grad_norm": 0.5916990401567385, + "learning_rate": 2.5994708627978415e-05, + "loss": 11.8936, + "step": 28363 + }, + { + "epoch": 1.5445331590805442, + "grad_norm": 0.5807024279553158, + "learning_rate": 2.5988778208112476e-05, + "loss": 11.8119, + "step": 28364 + }, + { + "epoch": 1.5445876130771272, + "grad_norm": 0.5416564826302289, + "learning_rate": 2.5982848363770184e-05, + "loss": 11.7792, + "step": 28365 + }, + { + "epoch": 1.5446420670737102, + "grad_norm": 0.545381216888862, + "learning_rate": 2.597691909499771e-05, + "loss": 11.7271, + "step": 28366 + }, + { + "epoch": 1.5446965210702932, + "grad_norm": 0.5237571080501712, + "learning_rate": 2.5970990401841123e-05, + "loss": 11.8412, + "step": 28367 + }, + { + "epoch": 1.5447509750668762, + "grad_norm": 0.5145817796750763, + "learning_rate": 2.5965062284346496e-05, + "loss": 11.6807, + "step": 28368 + }, + { + "epoch": 1.5448054290634592, + "grad_norm": 0.5881393218023219, + "learning_rate": 2.5959134742559998e-05, + "loss": 11.7546, + "step": 28369 + }, + { + "epoch": 1.5448598830600422, + "grad_norm": 0.5752119891535337, + "learning_rate": 2.5953207776527642e-05, + "loss": 11.8851, + "step": 28370 + }, + { + "epoch": 1.5449143370566254, + "grad_norm": 0.6303330141537662, + "learning_rate": 2.594728138629561e-05, + "loss": 11.8358, + "step": 28371 + }, + { + "epoch": 1.5449687910532084, + "grad_norm": 0.5962712900911351, + "learning_rate": 2.5941355571909886e-05, + "loss": 11.7304, + "step": 28372 + }, + { + "epoch": 1.5450232450497914, + "grad_norm": 0.5654189325468316, + "learning_rate": 2.593543033341661e-05, + "loss": 11.8266, + "step": 28373 + }, + { + "epoch": 1.5450776990463744, + "grad_norm": 0.5788037406236803, + "learning_rate": 2.5929505670861875e-05, + "loss": 11.7691, + "step": 28374 + }, + { + "epoch": 1.5451321530429574, + "grad_norm": 0.5022287286365017, + "learning_rate": 2.59235815842917e-05, + "loss": 11.7054, + "step": 28375 + }, + { + "epoch": 1.5451866070395404, + "grad_norm": 0.5976819508719686, + "learning_rate": 2.591765807375224e-05, + "loss": 11.8247, + "step": 28376 + }, + { + "epoch": 1.5452410610361236, + "grad_norm": 0.5356904181928719, + "learning_rate": 2.591173513928943e-05, + "loss": 11.7389, + "step": 28377 + }, + { + "epoch": 1.5452955150327066, + "grad_norm": 0.5956826321487316, + "learning_rate": 2.590581278094939e-05, + "loss": 11.8591, + "step": 28378 + }, + { + "epoch": 1.5453499690292896, + "grad_norm": 0.5405582684955066, + "learning_rate": 2.589989099877821e-05, + "loss": 11.7199, + "step": 28379 + }, + { + "epoch": 1.5454044230258726, + "grad_norm": 0.58390632451909, + "learning_rate": 2.589396979282187e-05, + "loss": 11.9566, + "step": 28380 + }, + { + "epoch": 1.5454588770224555, + "grad_norm": 0.5505994202984245, + "learning_rate": 2.588804916312648e-05, + "loss": 11.766, + "step": 28381 + }, + { + "epoch": 1.5455133310190385, + "grad_norm": 0.6379255049520043, + "learning_rate": 2.5882129109738018e-05, + "loss": 11.895, + "step": 28382 + }, + { + "epoch": 1.5455677850156215, + "grad_norm": 0.5558429276491551, + "learning_rate": 2.5876209632702586e-05, + "loss": 11.8519, + "step": 28383 + }, + { + "epoch": 1.5456222390122045, + "grad_norm": 0.5415143297445657, + "learning_rate": 2.587029073206614e-05, + "loss": 11.8384, + "step": 28384 + }, + { + "epoch": 1.5456766930087875, + "grad_norm": 0.5644130794259087, + "learning_rate": 2.586437240787475e-05, + "loss": 11.8423, + "step": 28385 + }, + { + "epoch": 1.5457311470053705, + "grad_norm": 0.5344614842194936, + "learning_rate": 2.585845466017448e-05, + "loss": 11.8722, + "step": 28386 + }, + { + "epoch": 1.5457856010019535, + "grad_norm": 0.5564494369049631, + "learning_rate": 2.585253748901125e-05, + "loss": 11.8582, + "step": 28387 + }, + { + "epoch": 1.5458400549985365, + "grad_norm": 0.6049125728363719, + "learning_rate": 2.5846620894431128e-05, + "loss": 11.7804, + "step": 28388 + }, + { + "epoch": 1.5458945089951195, + "grad_norm": 0.5961697799108411, + "learning_rate": 2.584070487648015e-05, + "loss": 11.9494, + "step": 28389 + }, + { + "epoch": 1.5459489629917025, + "grad_norm": 0.5841081208904166, + "learning_rate": 2.5834789435204243e-05, + "loss": 11.8902, + "step": 28390 + }, + { + "epoch": 1.5460034169882855, + "grad_norm": 0.5050965529600596, + "learning_rate": 2.5828874570649486e-05, + "loss": 11.9289, + "step": 28391 + }, + { + "epoch": 1.5460578709848685, + "grad_norm": 0.5294184189231204, + "learning_rate": 2.582296028286182e-05, + "loss": 11.7554, + "step": 28392 + }, + { + "epoch": 1.5461123249814515, + "grad_norm": 0.5135657146092494, + "learning_rate": 2.581704657188727e-05, + "loss": 11.7776, + "step": 28393 + }, + { + "epoch": 1.5461667789780347, + "grad_norm": 0.5651805158509523, + "learning_rate": 2.5811133437771785e-05, + "loss": 11.8092, + "step": 28394 + }, + { + "epoch": 1.5462212329746177, + "grad_norm": 0.5587154402497274, + "learning_rate": 2.5805220880561364e-05, + "loss": 11.8334, + "step": 28395 + }, + { + "epoch": 1.5462756869712007, + "grad_norm": 0.5270724338500411, + "learning_rate": 2.5799308900302044e-05, + "loss": 11.7817, + "step": 28396 + }, + { + "epoch": 1.5463301409677837, + "grad_norm": 0.5263739900704169, + "learning_rate": 2.5793397497039683e-05, + "loss": 11.7137, + "step": 28397 + }, + { + "epoch": 1.5463845949643666, + "grad_norm": 0.5602872287487679, + "learning_rate": 2.5787486670820348e-05, + "loss": 11.9086, + "step": 28398 + }, + { + "epoch": 1.5464390489609496, + "grad_norm": 0.517047938860014, + "learning_rate": 2.5781576421689923e-05, + "loss": 11.7993, + "step": 28399 + }, + { + "epoch": 1.5464935029575329, + "grad_norm": 0.5108722857351483, + "learning_rate": 2.5775666749694406e-05, + "loss": 11.8566, + "step": 28400 + }, + { + "epoch": 1.5465479569541158, + "grad_norm": 0.6174935293929322, + "learning_rate": 2.5769757654879777e-05, + "loss": 11.7961, + "step": 28401 + }, + { + "epoch": 1.5466024109506988, + "grad_norm": 0.5302922625882244, + "learning_rate": 2.576384913729194e-05, + "loss": 11.9029, + "step": 28402 + }, + { + "epoch": 1.5466568649472818, + "grad_norm": 0.5783597376976017, + "learning_rate": 2.5757941196976887e-05, + "loss": 11.8606, + "step": 28403 + }, + { + "epoch": 1.5467113189438648, + "grad_norm": 0.5795724963024235, + "learning_rate": 2.5752033833980515e-05, + "loss": 11.9066, + "step": 28404 + }, + { + "epoch": 1.5467657729404478, + "grad_norm": 0.5695245194364605, + "learning_rate": 2.57461270483488e-05, + "loss": 11.9406, + "step": 28405 + }, + { + "epoch": 1.5468202269370308, + "grad_norm": 0.5543568927912289, + "learning_rate": 2.574022084012765e-05, + "loss": 11.8522, + "step": 28406 + }, + { + "epoch": 1.5468746809336138, + "grad_norm": 0.5389756714339937, + "learning_rate": 2.573431520936297e-05, + "loss": 11.7696, + "step": 28407 + }, + { + "epoch": 1.5469291349301968, + "grad_norm": 0.5439451675427848, + "learning_rate": 2.5728410156100735e-05, + "loss": 11.7786, + "step": 28408 + }, + { + "epoch": 1.5469835889267798, + "grad_norm": 0.5777718417749071, + "learning_rate": 2.572250568038681e-05, + "loss": 11.8458, + "step": 28409 + }, + { + "epoch": 1.5470380429233628, + "grad_norm": 0.5440717683471554, + "learning_rate": 2.5716601782267168e-05, + "loss": 11.7257, + "step": 28410 + }, + { + "epoch": 1.5470924969199458, + "grad_norm": 0.5503235949718415, + "learning_rate": 2.5710698461787664e-05, + "loss": 11.875, + "step": 28411 + }, + { + "epoch": 1.5471469509165288, + "grad_norm": 0.5311155260575879, + "learning_rate": 2.5704795718994225e-05, + "loss": 11.854, + "step": 28412 + }, + { + "epoch": 1.5472014049131118, + "grad_norm": 0.5580733513631506, + "learning_rate": 2.5698893553932778e-05, + "loss": 11.8475, + "step": 28413 + }, + { + "epoch": 1.5472558589096947, + "grad_norm": 0.5629690841127116, + "learning_rate": 2.569299196664916e-05, + "loss": 11.8143, + "step": 28414 + }, + { + "epoch": 1.5473103129062777, + "grad_norm": 0.5358089708920208, + "learning_rate": 2.568709095718934e-05, + "loss": 11.8512, + "step": 28415 + }, + { + "epoch": 1.5473647669028607, + "grad_norm": 0.5409977297627018, + "learning_rate": 2.5681190525599163e-05, + "loss": 11.9244, + "step": 28416 + }, + { + "epoch": 1.5474192208994437, + "grad_norm": 0.638433176437136, + "learning_rate": 2.5675290671924468e-05, + "loss": 11.9127, + "step": 28417 + }, + { + "epoch": 1.547473674896027, + "grad_norm": 0.5756050355483072, + "learning_rate": 2.566939139621122e-05, + "loss": 11.9142, + "step": 28418 + }, + { + "epoch": 1.54752812889261, + "grad_norm": 0.5431040821555536, + "learning_rate": 2.566349269850522e-05, + "loss": 11.7574, + "step": 28419 + }, + { + "epoch": 1.547582582889193, + "grad_norm": 0.5511736086108341, + "learning_rate": 2.5657594578852386e-05, + "loss": 11.8797, + "step": 28420 + }, + { + "epoch": 1.547637036885776, + "grad_norm": 0.5323173722084245, + "learning_rate": 2.565169703729854e-05, + "loss": 11.8559, + "step": 28421 + }, + { + "epoch": 1.547691490882359, + "grad_norm": 0.5596644594630502, + "learning_rate": 2.5645800073889568e-05, + "loss": 11.823, + "step": 28422 + }, + { + "epoch": 1.547745944878942, + "grad_norm": 0.549082073241267, + "learning_rate": 2.5639903688671352e-05, + "loss": 11.9428, + "step": 28423 + }, + { + "epoch": 1.5478003988755251, + "grad_norm": 0.5475631173156968, + "learning_rate": 2.563400788168969e-05, + "loss": 11.6797, + "step": 28424 + }, + { + "epoch": 1.5478548528721081, + "grad_norm": 0.5149174796601396, + "learning_rate": 2.5628112652990478e-05, + "loss": 11.7562, + "step": 28425 + }, + { + "epoch": 1.547909306868691, + "grad_norm": 0.550137163536973, + "learning_rate": 2.5622218002619537e-05, + "loss": 11.9227, + "step": 28426 + }, + { + "epoch": 1.547963760865274, + "grad_norm": 0.6155296306761008, + "learning_rate": 2.5616323930622665e-05, + "loss": 12.0026, + "step": 28427 + }, + { + "epoch": 1.548018214861857, + "grad_norm": 0.5078000253024684, + "learning_rate": 2.5610430437045775e-05, + "loss": 11.7096, + "step": 28428 + }, + { + "epoch": 1.54807266885844, + "grad_norm": 0.5655854998697462, + "learning_rate": 2.5604537521934614e-05, + "loss": 11.7819, + "step": 28429 + }, + { + "epoch": 1.548127122855023, + "grad_norm": 0.5253510272861863, + "learning_rate": 2.559864518533508e-05, + "loss": 11.7842, + "step": 28430 + }, + { + "epoch": 1.548181576851606, + "grad_norm": 0.5411136871300276, + "learning_rate": 2.5592753427292927e-05, + "loss": 11.932, + "step": 28431 + }, + { + "epoch": 1.548236030848189, + "grad_norm": 0.6140542561957026, + "learning_rate": 2.5586862247854027e-05, + "loss": 11.8188, + "step": 28432 + }, + { + "epoch": 1.548290484844772, + "grad_norm": 0.5187459708397918, + "learning_rate": 2.5580971647064134e-05, + "loss": 11.7711, + "step": 28433 + }, + { + "epoch": 1.548344938841355, + "grad_norm": 0.5237170621281736, + "learning_rate": 2.5575081624969087e-05, + "loss": 11.9784, + "step": 28434 + }, + { + "epoch": 1.548399392837938, + "grad_norm": 0.5275190190654206, + "learning_rate": 2.556919218161472e-05, + "loss": 11.7082, + "step": 28435 + }, + { + "epoch": 1.548453846834521, + "grad_norm": 0.5589132428169242, + "learning_rate": 2.5563303317046795e-05, + "loss": 11.8246, + "step": 28436 + }, + { + "epoch": 1.548508300831104, + "grad_norm": 0.5084913066271427, + "learning_rate": 2.5557415031311082e-05, + "loss": 11.862, + "step": 28437 + }, + { + "epoch": 1.548562754827687, + "grad_norm": 0.522068060313236, + "learning_rate": 2.5551527324453417e-05, + "loss": 11.7411, + "step": 28438 + }, + { + "epoch": 1.54861720882427, + "grad_norm": 0.5739219829998441, + "learning_rate": 2.554564019651954e-05, + "loss": 11.8917, + "step": 28439 + }, + { + "epoch": 1.548671662820853, + "grad_norm": 0.5725865084331427, + "learning_rate": 2.5539753647555266e-05, + "loss": 11.9694, + "step": 28440 + }, + { + "epoch": 1.5487261168174362, + "grad_norm": 0.5222121633979407, + "learning_rate": 2.5533867677606337e-05, + "loss": 11.7907, + "step": 28441 + }, + { + "epoch": 1.5487805708140192, + "grad_norm": 0.5343862732679546, + "learning_rate": 2.5527982286718567e-05, + "loss": 11.9875, + "step": 28442 + }, + { + "epoch": 1.5488350248106022, + "grad_norm": 0.5173530999527631, + "learning_rate": 2.5522097474937658e-05, + "loss": 11.8422, + "step": 28443 + }, + { + "epoch": 1.5488894788071852, + "grad_norm": 0.5505206385587831, + "learning_rate": 2.5516213242309418e-05, + "loss": 11.8241, + "step": 28444 + }, + { + "epoch": 1.5489439328037682, + "grad_norm": 0.5601652186740096, + "learning_rate": 2.5510329588879624e-05, + "loss": 11.8184, + "step": 28445 + }, + { + "epoch": 1.5489983868003512, + "grad_norm": 0.5260787115235537, + "learning_rate": 2.5504446514693992e-05, + "loss": 11.9064, + "step": 28446 + }, + { + "epoch": 1.5490528407969344, + "grad_norm": 0.5527946293696209, + "learning_rate": 2.549856401979829e-05, + "loss": 11.8156, + "step": 28447 + }, + { + "epoch": 1.5491072947935174, + "grad_norm": 0.5783068020689694, + "learning_rate": 2.5492682104238207e-05, + "loss": 11.8586, + "step": 28448 + }, + { + "epoch": 1.5491617487901004, + "grad_norm": 0.5013085575154154, + "learning_rate": 2.5486800768059527e-05, + "loss": 11.7434, + "step": 28449 + }, + { + "epoch": 1.5492162027866834, + "grad_norm": 0.4907623554578605, + "learning_rate": 2.548092001130801e-05, + "loss": 11.788, + "step": 28450 + }, + { + "epoch": 1.5492706567832664, + "grad_norm": 0.60379728605464, + "learning_rate": 2.5475039834029324e-05, + "loss": 11.7984, + "step": 28451 + }, + { + "epoch": 1.5493251107798494, + "grad_norm": 0.5735537390763572, + "learning_rate": 2.546916023626925e-05, + "loss": 11.832, + "step": 28452 + }, + { + "epoch": 1.5493795647764323, + "grad_norm": 0.5880665310677062, + "learning_rate": 2.5463281218073466e-05, + "loss": 11.8112, + "step": 28453 + }, + { + "epoch": 1.5494340187730153, + "grad_norm": 0.5662966118919047, + "learning_rate": 2.5457402779487728e-05, + "loss": 12.0215, + "step": 28454 + }, + { + "epoch": 1.5494884727695983, + "grad_norm": 0.5487430492166419, + "learning_rate": 2.54515249205577e-05, + "loss": 11.8194, + "step": 28455 + }, + { + "epoch": 1.5495429267661813, + "grad_norm": 0.6073259437457524, + "learning_rate": 2.544564764132915e-05, + "loss": 11.9199, + "step": 28456 + }, + { + "epoch": 1.5495973807627643, + "grad_norm": 0.5110416125123185, + "learning_rate": 2.543977094184775e-05, + "loss": 11.9007, + "step": 28457 + }, + { + "epoch": 1.5496518347593473, + "grad_norm": 0.6010475075449443, + "learning_rate": 2.5433894822159155e-05, + "loss": 11.7367, + "step": 28458 + }, + { + "epoch": 1.5497062887559303, + "grad_norm": 0.5799295765125994, + "learning_rate": 2.54280192823091e-05, + "loss": 11.7556, + "step": 28459 + }, + { + "epoch": 1.5497607427525133, + "grad_norm": 0.5737195536592768, + "learning_rate": 2.542214432234331e-05, + "loss": 11.8661, + "step": 28460 + }, + { + "epoch": 1.5498151967490963, + "grad_norm": 0.5060511930665154, + "learning_rate": 2.5416269942307404e-05, + "loss": 11.8723, + "step": 28461 + }, + { + "epoch": 1.5498696507456793, + "grad_norm": 0.4857564571140649, + "learning_rate": 2.5410396142247107e-05, + "loss": 11.8094, + "step": 28462 + }, + { + "epoch": 1.5499241047422623, + "grad_norm": 0.5395258894578068, + "learning_rate": 2.5404522922208064e-05, + "loss": 11.8783, + "step": 28463 + }, + { + "epoch": 1.5499785587388455, + "grad_norm": 0.5520990718572701, + "learning_rate": 2.539865028223598e-05, + "loss": 11.8394, + "step": 28464 + }, + { + "epoch": 1.5500330127354285, + "grad_norm": 0.5408865396162762, + "learning_rate": 2.5392778222376467e-05, + "loss": 11.9677, + "step": 28465 + }, + { + "epoch": 1.5500874667320115, + "grad_norm": 0.5381429081715795, + "learning_rate": 2.5386906742675265e-05, + "loss": 11.8065, + "step": 28466 + }, + { + "epoch": 1.5501419207285945, + "grad_norm": 0.5616159830219871, + "learning_rate": 2.538103584317798e-05, + "loss": 11.9628, + "step": 28467 + }, + { + "epoch": 1.5501963747251775, + "grad_norm": 0.5400137642162649, + "learning_rate": 2.5375165523930256e-05, + "loss": 11.7801, + "step": 28468 + }, + { + "epoch": 1.5502508287217605, + "grad_norm": 0.5158123918662787, + "learning_rate": 2.5369295784977776e-05, + "loss": 11.8542, + "step": 28469 + }, + { + "epoch": 1.5503052827183437, + "grad_norm": 0.5271437661328544, + "learning_rate": 2.536342662636614e-05, + "loss": 11.7967, + "step": 28470 + }, + { + "epoch": 1.5503597367149267, + "grad_norm": 0.5534959222742973, + "learning_rate": 2.5357558048141017e-05, + "loss": 11.8952, + "step": 28471 + }, + { + "epoch": 1.5504141907115097, + "grad_norm": 0.5482585284101907, + "learning_rate": 2.5351690050348075e-05, + "loss": 11.9437, + "step": 28472 + }, + { + "epoch": 1.5504686447080926, + "grad_norm": 0.5579770962441959, + "learning_rate": 2.534582263303288e-05, + "loss": 11.8728, + "step": 28473 + }, + { + "epoch": 1.5505230987046756, + "grad_norm": 0.8902138915915079, + "learning_rate": 2.533995579624111e-05, + "loss": 11.7841, + "step": 28474 + }, + { + "epoch": 1.5505775527012586, + "grad_norm": 0.5406013223693736, + "learning_rate": 2.5334089540018336e-05, + "loss": 11.6726, + "step": 28475 + }, + { + "epoch": 1.5506320066978416, + "grad_norm": 0.6255594525097159, + "learning_rate": 2.5328223864410227e-05, + "loss": 11.8823, + "step": 28476 + }, + { + "epoch": 1.5506864606944246, + "grad_norm": 0.5325474589054756, + "learning_rate": 2.5322358769462383e-05, + "loss": 11.8107, + "step": 28477 + }, + { + "epoch": 1.5507409146910076, + "grad_norm": 0.5368601359246209, + "learning_rate": 2.531649425522037e-05, + "loss": 11.68, + "step": 28478 + }, + { + "epoch": 1.5507953686875906, + "grad_norm": 0.5529466446931488, + "learning_rate": 2.5310630321729846e-05, + "loss": 11.8102, + "step": 28479 + }, + { + "epoch": 1.5508498226841736, + "grad_norm": 0.6059228591987602, + "learning_rate": 2.5304766969036363e-05, + "loss": 11.9629, + "step": 28480 + }, + { + "epoch": 1.5509042766807566, + "grad_norm": 0.5819827990530834, + "learning_rate": 2.5298904197185557e-05, + "loss": 11.8608, + "step": 28481 + }, + { + "epoch": 1.5509587306773396, + "grad_norm": 0.5142102163683073, + "learning_rate": 2.5293042006222976e-05, + "loss": 11.8563, + "step": 28482 + }, + { + "epoch": 1.5510131846739226, + "grad_norm": 0.5319599480372926, + "learning_rate": 2.5287180396194233e-05, + "loss": 11.6907, + "step": 28483 + }, + { + "epoch": 1.5510676386705056, + "grad_norm": 0.5819049921814107, + "learning_rate": 2.528131936714493e-05, + "loss": 11.9163, + "step": 28484 + }, + { + "epoch": 1.5511220926670886, + "grad_norm": 0.5380853151840385, + "learning_rate": 2.5275458919120598e-05, + "loss": 11.7327, + "step": 28485 + }, + { + "epoch": 1.5511765466636716, + "grad_norm": 0.5537818658038224, + "learning_rate": 2.526959905216685e-05, + "loss": 11.7454, + "step": 28486 + }, + { + "epoch": 1.5512310006602545, + "grad_norm": 0.6021263448844547, + "learning_rate": 2.526373976632923e-05, + "loss": 11.9231, + "step": 28487 + }, + { + "epoch": 1.5512854546568378, + "grad_norm": 0.6145604956693037, + "learning_rate": 2.5257881061653277e-05, + "loss": 11.9234, + "step": 28488 + }, + { + "epoch": 1.5513399086534208, + "grad_norm": 0.5299598670829708, + "learning_rate": 2.5252022938184594e-05, + "loss": 11.8107, + "step": 28489 + }, + { + "epoch": 1.5513943626500037, + "grad_norm": 0.5900073754156276, + "learning_rate": 2.5246165395968702e-05, + "loss": 11.843, + "step": 28490 + }, + { + "epoch": 1.5514488166465867, + "grad_norm": 0.587429098817428, + "learning_rate": 2.5240308435051187e-05, + "loss": 11.9314, + "step": 28491 + }, + { + "epoch": 1.5515032706431697, + "grad_norm": 0.5925412366155998, + "learning_rate": 2.523445205547753e-05, + "loss": 11.9291, + "step": 28492 + }, + { + "epoch": 1.551557724639753, + "grad_norm": 0.4975812315264232, + "learning_rate": 2.5228596257293325e-05, + "loss": 11.6459, + "step": 28493 + }, + { + "epoch": 1.551612178636336, + "grad_norm": 0.5454202643079502, + "learning_rate": 2.522274104054413e-05, + "loss": 11.6712, + "step": 28494 + }, + { + "epoch": 1.551666632632919, + "grad_norm": 0.6599507266548589, + "learning_rate": 2.5216886405275396e-05, + "loss": 11.8713, + "step": 28495 + }, + { + "epoch": 1.551721086629502, + "grad_norm": 0.5091180273869543, + "learning_rate": 2.5211032351532738e-05, + "loss": 11.6595, + "step": 28496 + }, + { + "epoch": 1.551775540626085, + "grad_norm": 0.5563702209320073, + "learning_rate": 2.5205178879361624e-05, + "loss": 11.8779, + "step": 28497 + }, + { + "epoch": 1.551829994622668, + "grad_norm": 0.5664771627422197, + "learning_rate": 2.5199325988807553e-05, + "loss": 11.8834, + "step": 28498 + }, + { + "epoch": 1.551884448619251, + "grad_norm": 0.6076489917821639, + "learning_rate": 2.5193473679916103e-05, + "loss": 11.87, + "step": 28499 + }, + { + "epoch": 1.551938902615834, + "grad_norm": 0.5669662502693399, + "learning_rate": 2.518762195273272e-05, + "loss": 11.8221, + "step": 28500 + }, + { + "epoch": 1.5519933566124169, + "grad_norm": 0.5707755767582692, + "learning_rate": 2.5181770807302953e-05, + "loss": 11.7684, + "step": 28501 + }, + { + "epoch": 1.5520478106089999, + "grad_norm": 0.5518388417723734, + "learning_rate": 2.517592024367226e-05, + "loss": 11.8613, + "step": 28502 + }, + { + "epoch": 1.5521022646055829, + "grad_norm": 0.5008864884307273, + "learning_rate": 2.517007026188619e-05, + "loss": 11.7114, + "step": 28503 + }, + { + "epoch": 1.5521567186021659, + "grad_norm": 0.4934029122753554, + "learning_rate": 2.5164220861990174e-05, + "loss": 11.7556, + "step": 28504 + }, + { + "epoch": 1.5522111725987489, + "grad_norm": 0.5584005164458216, + "learning_rate": 2.515837204402972e-05, + "loss": 11.8764, + "step": 28505 + }, + { + "epoch": 1.5522656265953318, + "grad_norm": 0.5847427836113182, + "learning_rate": 2.5152523808050353e-05, + "loss": 11.7411, + "step": 28506 + }, + { + "epoch": 1.5523200805919148, + "grad_norm": 0.5587139265149721, + "learning_rate": 2.5146676154097504e-05, + "loss": 11.7873, + "step": 28507 + }, + { + "epoch": 1.5523745345884978, + "grad_norm": 0.5743506563430758, + "learning_rate": 2.5140829082216634e-05, + "loss": 11.9409, + "step": 28508 + }, + { + "epoch": 1.5524289885850808, + "grad_norm": 0.6032488970331529, + "learning_rate": 2.5134982592453248e-05, + "loss": 11.9309, + "step": 28509 + }, + { + "epoch": 1.5524834425816638, + "grad_norm": 0.5852034334633738, + "learning_rate": 2.5129136684852773e-05, + "loss": 11.8758, + "step": 28510 + }, + { + "epoch": 1.552537896578247, + "grad_norm": 0.5936254615136347, + "learning_rate": 2.512329135946072e-05, + "loss": 11.7461, + "step": 28511 + }, + { + "epoch": 1.55259235057483, + "grad_norm": 0.6038112322972872, + "learning_rate": 2.511744661632247e-05, + "loss": 11.9481, + "step": 28512 + }, + { + "epoch": 1.552646804571413, + "grad_norm": 0.57374905226439, + "learning_rate": 2.5111602455483552e-05, + "loss": 11.8354, + "step": 28513 + }, + { + "epoch": 1.552701258567996, + "grad_norm": 0.6271858490672713, + "learning_rate": 2.510575887698934e-05, + "loss": 11.9047, + "step": 28514 + }, + { + "epoch": 1.552755712564579, + "grad_norm": 0.5534109474902617, + "learning_rate": 2.5099915880885328e-05, + "loss": 11.8675, + "step": 28515 + }, + { + "epoch": 1.552810166561162, + "grad_norm": 0.6694132274300042, + "learning_rate": 2.5094073467216906e-05, + "loss": 11.8989, + "step": 28516 + }, + { + "epoch": 1.5528646205577452, + "grad_norm": 0.571900046650996, + "learning_rate": 2.508823163602957e-05, + "loss": 11.9213, + "step": 28517 + }, + { + "epoch": 1.5529190745543282, + "grad_norm": 0.526527084733199, + "learning_rate": 2.5082390387368694e-05, + "loss": 11.8168, + "step": 28518 + }, + { + "epoch": 1.5529735285509112, + "grad_norm": 0.5603474863790836, + "learning_rate": 2.5076549721279685e-05, + "loss": 11.9179, + "step": 28519 + }, + { + "epoch": 1.5530279825474942, + "grad_norm": 0.5400150236607307, + "learning_rate": 2.5070709637807998e-05, + "loss": 11.7396, + "step": 28520 + }, + { + "epoch": 1.5530824365440772, + "grad_norm": 0.5325796354521942, + "learning_rate": 2.5064870136999064e-05, + "loss": 11.785, + "step": 28521 + }, + { + "epoch": 1.5531368905406602, + "grad_norm": 0.5709758062936658, + "learning_rate": 2.5059031218898243e-05, + "loss": 11.92, + "step": 28522 + }, + { + "epoch": 1.5531913445372432, + "grad_norm": 0.5419729540075258, + "learning_rate": 2.505319288355098e-05, + "loss": 11.8677, + "step": 28523 + }, + { + "epoch": 1.5532457985338262, + "grad_norm": 0.5044308542791872, + "learning_rate": 2.5047355131002638e-05, + "loss": 11.9088, + "step": 28524 + }, + { + "epoch": 1.5533002525304092, + "grad_norm": 0.5875763905242495, + "learning_rate": 2.5041517961298655e-05, + "loss": 11.8522, + "step": 28525 + }, + { + "epoch": 1.5533547065269921, + "grad_norm": 0.5436673170751153, + "learning_rate": 2.5035681374484375e-05, + "loss": 11.6491, + "step": 28526 + }, + { + "epoch": 1.5534091605235751, + "grad_norm": 0.5018654095517917, + "learning_rate": 2.502984537060524e-05, + "loss": 11.8711, + "step": 28527 + }, + { + "epoch": 1.5534636145201581, + "grad_norm": 0.5694929635891186, + "learning_rate": 2.5024009949706596e-05, + "loss": 11.8014, + "step": 28528 + }, + { + "epoch": 1.5535180685167411, + "grad_norm": 0.555056874138216, + "learning_rate": 2.5018175111833796e-05, + "loss": 11.6892, + "step": 28529 + }, + { + "epoch": 1.5535725225133241, + "grad_norm": 0.54490926612928, + "learning_rate": 2.5012340857032267e-05, + "loss": 11.819, + "step": 28530 + }, + { + "epoch": 1.553626976509907, + "grad_norm": 0.5463372306875046, + "learning_rate": 2.5006507185347317e-05, + "loss": 11.8369, + "step": 28531 + }, + { + "epoch": 1.55368143050649, + "grad_norm": 0.5448397787529085, + "learning_rate": 2.500067409682435e-05, + "loss": 11.8249, + "step": 28532 + }, + { + "epoch": 1.553735884503073, + "grad_norm": 0.5675516686988226, + "learning_rate": 2.4994841591508754e-05, + "loss": 11.9393, + "step": 28533 + }, + { + "epoch": 1.5537903384996563, + "grad_norm": 0.5267371689428354, + "learning_rate": 2.4989009669445807e-05, + "loss": 11.5129, + "step": 28534 + }, + { + "epoch": 1.5538447924962393, + "grad_norm": 0.5425475610079252, + "learning_rate": 2.4983178330680936e-05, + "loss": 11.8204, + "step": 28535 + }, + { + "epoch": 1.5538992464928223, + "grad_norm": 0.549845211647986, + "learning_rate": 2.4977347575259425e-05, + "loss": 11.8349, + "step": 28536 + }, + { + "epoch": 1.5539537004894053, + "grad_norm": 0.4935824476753059, + "learning_rate": 2.4971517403226663e-05, + "loss": 11.867, + "step": 28537 + }, + { + "epoch": 1.5540081544859883, + "grad_norm": 0.6923932588157574, + "learning_rate": 2.496568781462797e-05, + "loss": 11.9196, + "step": 28538 + }, + { + "epoch": 1.5540626084825713, + "grad_norm": 0.5260881989908586, + "learning_rate": 2.495985880950863e-05, + "loss": 11.773, + "step": 28539 + }, + { + "epoch": 1.5541170624791545, + "grad_norm": 0.5364431472614594, + "learning_rate": 2.495403038791405e-05, + "loss": 11.7791, + "step": 28540 + }, + { + "epoch": 1.5541715164757375, + "grad_norm": 0.5812548888784491, + "learning_rate": 2.4948202549889488e-05, + "loss": 11.7457, + "step": 28541 + }, + { + "epoch": 1.5542259704723205, + "grad_norm": 0.5919997638028225, + "learning_rate": 2.494237529548029e-05, + "loss": 11.9624, + "step": 28542 + }, + { + "epoch": 1.5542804244689035, + "grad_norm": 0.5281734432144186, + "learning_rate": 2.4936548624731792e-05, + "loss": 11.9087, + "step": 28543 + }, + { + "epoch": 1.5543348784654865, + "grad_norm": 0.5807438940712529, + "learning_rate": 2.4930722537689256e-05, + "loss": 11.803, + "step": 28544 + }, + { + "epoch": 1.5543893324620695, + "grad_norm": 0.5800437925530684, + "learning_rate": 2.4924897034398033e-05, + "loss": 11.782, + "step": 28545 + }, + { + "epoch": 1.5544437864586524, + "grad_norm": 0.5268069571755901, + "learning_rate": 2.4919072114903374e-05, + "loss": 11.8909, + "step": 28546 + }, + { + "epoch": 1.5544982404552354, + "grad_norm": 0.545833204562664, + "learning_rate": 2.4913247779250636e-05, + "loss": 11.8384, + "step": 28547 + }, + { + "epoch": 1.5545526944518184, + "grad_norm": 0.6160011127069638, + "learning_rate": 2.490742402748507e-05, + "loss": 11.7852, + "step": 28548 + }, + { + "epoch": 1.5546071484484014, + "grad_norm": 0.5290535096343847, + "learning_rate": 2.490160085965194e-05, + "loss": 11.9017, + "step": 28549 + }, + { + "epoch": 1.5546616024449844, + "grad_norm": 0.571680133415217, + "learning_rate": 2.4895778275796587e-05, + "loss": 11.8059, + "step": 28550 + }, + { + "epoch": 1.5547160564415674, + "grad_norm": 0.6052844268196166, + "learning_rate": 2.4889956275964232e-05, + "loss": 11.8639, + "step": 28551 + }, + { + "epoch": 1.5547705104381504, + "grad_norm": 0.6723872804078066, + "learning_rate": 2.4884134860200194e-05, + "loss": 11.8993, + "step": 28552 + }, + { + "epoch": 1.5548249644347334, + "grad_norm": 0.5220939478677491, + "learning_rate": 2.4878314028549686e-05, + "loss": 11.6449, + "step": 28553 + }, + { + "epoch": 1.5548794184313164, + "grad_norm": 0.5959196608776058, + "learning_rate": 2.4872493781058015e-05, + "loss": 11.8946, + "step": 28554 + }, + { + "epoch": 1.5549338724278994, + "grad_norm": 0.5283774698462669, + "learning_rate": 2.486667411777046e-05, + "loss": 11.7955, + "step": 28555 + }, + { + "epoch": 1.5549883264244824, + "grad_norm": 0.5808798509977745, + "learning_rate": 2.4860855038732223e-05, + "loss": 11.8956, + "step": 28556 + }, + { + "epoch": 1.5550427804210654, + "grad_norm": 0.5713598747774712, + "learning_rate": 2.4855036543988598e-05, + "loss": 11.9146, + "step": 28557 + }, + { + "epoch": 1.5550972344176486, + "grad_norm": 0.5475176573672785, + "learning_rate": 2.4849218633584814e-05, + "loss": 11.8442, + "step": 28558 + }, + { + "epoch": 1.5551516884142316, + "grad_norm": 0.5350549055637662, + "learning_rate": 2.4843401307566082e-05, + "loss": 11.8207, + "step": 28559 + }, + { + "epoch": 1.5552061424108146, + "grad_norm": 0.704467150637401, + "learning_rate": 2.483758456597769e-05, + "loss": 11.8277, + "step": 28560 + }, + { + "epoch": 1.5552605964073976, + "grad_norm": 0.5585654171172186, + "learning_rate": 2.483176840886482e-05, + "loss": 11.8686, + "step": 28561 + }, + { + "epoch": 1.5553150504039805, + "grad_norm": 0.6053319547903152, + "learning_rate": 2.4825952836272747e-05, + "loss": 11.8209, + "step": 28562 + }, + { + "epoch": 1.5553695044005638, + "grad_norm": 0.58797267331446, + "learning_rate": 2.4820137848246648e-05, + "loss": 11.8193, + "step": 28563 + }, + { + "epoch": 1.5554239583971468, + "grad_norm": 0.5585981773514379, + "learning_rate": 2.4814323444831787e-05, + "loss": 11.7418, + "step": 28564 + }, + { + "epoch": 1.5554784123937297, + "grad_norm": 0.632299090688014, + "learning_rate": 2.480850962607334e-05, + "loss": 11.8569, + "step": 28565 + }, + { + "epoch": 1.5555328663903127, + "grad_norm": 0.5404204742042362, + "learning_rate": 2.4802696392016522e-05, + "loss": 11.7614, + "step": 28566 + }, + { + "epoch": 1.5555873203868957, + "grad_norm": 0.5307115052866692, + "learning_rate": 2.4796883742706613e-05, + "loss": 11.8431, + "step": 28567 + }, + { + "epoch": 1.5556417743834787, + "grad_norm": 0.5748895070945195, + "learning_rate": 2.479107167818868e-05, + "loss": 11.8516, + "step": 28568 + }, + { + "epoch": 1.5556962283800617, + "grad_norm": 0.5185364565502456, + "learning_rate": 2.4785260198507987e-05, + "loss": 11.7532, + "step": 28569 + }, + { + "epoch": 1.5557506823766447, + "grad_norm": 0.5824939631145019, + "learning_rate": 2.4779449303709758e-05, + "loss": 11.8828, + "step": 28570 + }, + { + "epoch": 1.5558051363732277, + "grad_norm": 0.5823898208825926, + "learning_rate": 2.477363899383911e-05, + "loss": 11.8927, + "step": 28571 + }, + { + "epoch": 1.5558595903698107, + "grad_norm": 0.498832121471464, + "learning_rate": 2.4767829268941302e-05, + "loss": 11.8058, + "step": 28572 + }, + { + "epoch": 1.5559140443663937, + "grad_norm": 0.5967633998287475, + "learning_rate": 2.4762020129061427e-05, + "loss": 11.8112, + "step": 28573 + }, + { + "epoch": 1.5559684983629767, + "grad_norm": 0.5864411539195628, + "learning_rate": 2.4756211574244735e-05, + "loss": 11.9025, + "step": 28574 + }, + { + "epoch": 1.5560229523595597, + "grad_norm": 0.5456037698325555, + "learning_rate": 2.4750403604536333e-05, + "loss": 11.8419, + "step": 28575 + }, + { + "epoch": 1.5560774063561427, + "grad_norm": 0.5512963607380309, + "learning_rate": 2.4744596219981418e-05, + "loss": 11.7866, + "step": 28576 + }, + { + "epoch": 1.5561318603527257, + "grad_norm": 0.5508067364195802, + "learning_rate": 2.4738789420625195e-05, + "loss": 11.7468, + "step": 28577 + }, + { + "epoch": 1.5561863143493087, + "grad_norm": 0.6184534697758933, + "learning_rate": 2.4732983206512706e-05, + "loss": 11.9362, + "step": 28578 + }, + { + "epoch": 1.5562407683458916, + "grad_norm": 0.5479859137355779, + "learning_rate": 2.47271775776892e-05, + "loss": 11.8337, + "step": 28579 + }, + { + "epoch": 1.5562952223424746, + "grad_norm": 0.5082027032151528, + "learning_rate": 2.472137253419974e-05, + "loss": 11.7432, + "step": 28580 + }, + { + "epoch": 1.5563496763390579, + "grad_norm": 0.49428153199139857, + "learning_rate": 2.4715568076089522e-05, + "loss": 11.7991, + "step": 28581 + }, + { + "epoch": 1.5564041303356408, + "grad_norm": 0.5589649387211867, + "learning_rate": 2.4709764203403697e-05, + "loss": 11.8683, + "step": 28582 + }, + { + "epoch": 1.5564585843322238, + "grad_norm": 0.53255396563382, + "learning_rate": 2.470396091618734e-05, + "loss": 11.7046, + "step": 28583 + }, + { + "epoch": 1.5565130383288068, + "grad_norm": 0.4977437695350165, + "learning_rate": 2.4698158214485644e-05, + "loss": 11.8066, + "step": 28584 + }, + { + "epoch": 1.5565674923253898, + "grad_norm": 0.5626315967222058, + "learning_rate": 2.4692356098343673e-05, + "loss": 11.8381, + "step": 28585 + }, + { + "epoch": 1.5566219463219728, + "grad_norm": 0.5286919067641952, + "learning_rate": 2.4686554567806597e-05, + "loss": 11.8094, + "step": 28586 + }, + { + "epoch": 1.556676400318556, + "grad_norm": 0.553498917671147, + "learning_rate": 2.4680753622919496e-05, + "loss": 11.8322, + "step": 28587 + }, + { + "epoch": 1.556730854315139, + "grad_norm": 0.5602149080639953, + "learning_rate": 2.4674953263727464e-05, + "loss": 11.9096, + "step": 28588 + }, + { + "epoch": 1.556785308311722, + "grad_norm": 0.5276623364621802, + "learning_rate": 2.4669153490275652e-05, + "loss": 11.7724, + "step": 28589 + }, + { + "epoch": 1.556839762308305, + "grad_norm": 0.49386661278834587, + "learning_rate": 2.466335430260911e-05, + "loss": 11.7903, + "step": 28590 + }, + { + "epoch": 1.556894216304888, + "grad_norm": 0.5721983479039223, + "learning_rate": 2.4657555700772962e-05, + "loss": 11.7868, + "step": 28591 + }, + { + "epoch": 1.556948670301471, + "grad_norm": 0.5269906038780431, + "learning_rate": 2.4651757684812317e-05, + "loss": 11.9258, + "step": 28592 + }, + { + "epoch": 1.557003124298054, + "grad_norm": 0.6267277822180448, + "learning_rate": 2.4645960254772215e-05, + "loss": 11.8316, + "step": 28593 + }, + { + "epoch": 1.557057578294637, + "grad_norm": 0.5675082189744377, + "learning_rate": 2.4640163410697793e-05, + "loss": 11.654, + "step": 28594 + }, + { + "epoch": 1.55711203229122, + "grad_norm": 0.5715847666698621, + "learning_rate": 2.4634367152634062e-05, + "loss": 11.7683, + "step": 28595 + }, + { + "epoch": 1.557166486287803, + "grad_norm": 0.5335686527000096, + "learning_rate": 2.4628571480626163e-05, + "loss": 11.8012, + "step": 28596 + }, + { + "epoch": 1.557220940284386, + "grad_norm": 0.5619368982704767, + "learning_rate": 2.4622776394719137e-05, + "loss": 11.7206, + "step": 28597 + }, + { + "epoch": 1.557275394280969, + "grad_norm": 0.5735706142849618, + "learning_rate": 2.4616981894958013e-05, + "loss": 11.8803, + "step": 28598 + }, + { + "epoch": 1.557329848277552, + "grad_norm": 0.5575511692095365, + "learning_rate": 2.4611187981387907e-05, + "loss": 11.717, + "step": 28599 + }, + { + "epoch": 1.557384302274135, + "grad_norm": 0.5332991999274577, + "learning_rate": 2.4605394654053814e-05, + "loss": 11.6635, + "step": 28600 + }, + { + "epoch": 1.557438756270718, + "grad_norm": 0.5375788513377285, + "learning_rate": 2.4599601913000846e-05, + "loss": 11.8682, + "step": 28601 + }, + { + "epoch": 1.557493210267301, + "grad_norm": 0.5642020985469542, + "learning_rate": 2.459380975827399e-05, + "loss": 11.8607, + "step": 28602 + }, + { + "epoch": 1.557547664263884, + "grad_norm": 0.5423063000863227, + "learning_rate": 2.4588018189918317e-05, + "loss": 11.8366, + "step": 28603 + }, + { + "epoch": 1.5576021182604671, + "grad_norm": 0.5343092928001854, + "learning_rate": 2.458222720797888e-05, + "loss": 11.944, + "step": 28604 + }, + { + "epoch": 1.5576565722570501, + "grad_norm": 0.4843757052977693, + "learning_rate": 2.457643681250068e-05, + "loss": 11.9183, + "step": 28605 + }, + { + "epoch": 1.5577110262536331, + "grad_norm": 0.548516745397688, + "learning_rate": 2.457064700352877e-05, + "loss": 11.8403, + "step": 28606 + }, + { + "epoch": 1.557765480250216, + "grad_norm": 0.5216081875371917, + "learning_rate": 2.4564857781108165e-05, + "loss": 11.7289, + "step": 28607 + }, + { + "epoch": 1.557819934246799, + "grad_norm": 0.508128508668198, + "learning_rate": 2.4559069145283852e-05, + "loss": 11.7655, + "step": 28608 + }, + { + "epoch": 1.557874388243382, + "grad_norm": 0.5711385745905865, + "learning_rate": 2.4553281096100888e-05, + "loss": 11.7131, + "step": 28609 + }, + { + "epoch": 1.5579288422399653, + "grad_norm": 0.6333255154336631, + "learning_rate": 2.4547493633604235e-05, + "loss": 11.7489, + "step": 28610 + }, + { + "epoch": 1.5579832962365483, + "grad_norm": 0.5312599057644889, + "learning_rate": 2.4541706757838957e-05, + "loss": 11.7314, + "step": 28611 + }, + { + "epoch": 1.5580377502331313, + "grad_norm": 0.5378413349580727, + "learning_rate": 2.4535920468849993e-05, + "loss": 11.8509, + "step": 28612 + }, + { + "epoch": 1.5580922042297143, + "grad_norm": 0.5637246262646303, + "learning_rate": 2.45301347666824e-05, + "loss": 11.8517, + "step": 28613 + }, + { + "epoch": 1.5581466582262973, + "grad_norm": 0.5419334294028164, + "learning_rate": 2.4524349651381095e-05, + "loss": 11.6795, + "step": 28614 + }, + { + "epoch": 1.5582011122228803, + "grad_norm": 0.6493976749345264, + "learning_rate": 2.4518565122991112e-05, + "loss": 11.8643, + "step": 28615 + }, + { + "epoch": 1.5582555662194633, + "grad_norm": 0.5473265317011902, + "learning_rate": 2.4512781181557455e-05, + "loss": 11.6634, + "step": 28616 + }, + { + "epoch": 1.5583100202160463, + "grad_norm": 0.5643660364536964, + "learning_rate": 2.4506997827125067e-05, + "loss": 11.879, + "step": 28617 + }, + { + "epoch": 1.5583644742126292, + "grad_norm": 0.5333052829511333, + "learning_rate": 2.4501215059738902e-05, + "loss": 11.8666, + "step": 28618 + }, + { + "epoch": 1.5584189282092122, + "grad_norm": 0.533843454397603, + "learning_rate": 2.449543287944397e-05, + "loss": 11.7767, + "step": 28619 + }, + { + "epoch": 1.5584733822057952, + "grad_norm": 0.5117656452129317, + "learning_rate": 2.4489651286285186e-05, + "loss": 11.8108, + "step": 28620 + }, + { + "epoch": 1.5585278362023782, + "grad_norm": 0.5018256139928625, + "learning_rate": 2.4483870280307576e-05, + "loss": 11.7305, + "step": 28621 + }, + { + "epoch": 1.5585822901989612, + "grad_norm": 0.5154778160752138, + "learning_rate": 2.4478089861556007e-05, + "loss": 11.7754, + "step": 28622 + }, + { + "epoch": 1.5586367441955442, + "grad_norm": 0.5008525101542722, + "learning_rate": 2.4472310030075518e-05, + "loss": 11.9024, + "step": 28623 + }, + { + "epoch": 1.5586911981921272, + "grad_norm": 0.5203044454204292, + "learning_rate": 2.446653078591099e-05, + "loss": 11.8236, + "step": 28624 + }, + { + "epoch": 1.5587456521887102, + "grad_norm": 0.5297525290030621, + "learning_rate": 2.4460752129107377e-05, + "loss": 11.865, + "step": 28625 + }, + { + "epoch": 1.5588001061852932, + "grad_norm": 0.5086130964742646, + "learning_rate": 2.4454974059709644e-05, + "loss": 11.7829, + "step": 28626 + }, + { + "epoch": 1.5588545601818764, + "grad_norm": 0.5712787289226434, + "learning_rate": 2.444919657776269e-05, + "loss": 11.8218, + "step": 28627 + }, + { + "epoch": 1.5589090141784594, + "grad_norm": 0.5381708369746944, + "learning_rate": 2.4443419683311496e-05, + "loss": 11.9287, + "step": 28628 + }, + { + "epoch": 1.5589634681750424, + "grad_norm": 0.5295786602324816, + "learning_rate": 2.4437643376400888e-05, + "loss": 11.8455, + "step": 28629 + }, + { + "epoch": 1.5590179221716254, + "grad_norm": 0.5203389601693851, + "learning_rate": 2.4431867657075846e-05, + "loss": 11.8296, + "step": 28630 + }, + { + "epoch": 1.5590723761682084, + "grad_norm": 0.6216417078768269, + "learning_rate": 2.4426092525381294e-05, + "loss": 11.8956, + "step": 28631 + }, + { + "epoch": 1.5591268301647914, + "grad_norm": 0.5659958596684848, + "learning_rate": 2.4420317981362094e-05, + "loss": 11.8099, + "step": 28632 + }, + { + "epoch": 1.5591812841613746, + "grad_norm": 0.5445559846984509, + "learning_rate": 2.4414544025063202e-05, + "loss": 11.8889, + "step": 28633 + }, + { + "epoch": 1.5592357381579576, + "grad_norm": 0.5817520990401447, + "learning_rate": 2.4408770656529468e-05, + "loss": 11.9012, + "step": 28634 + }, + { + "epoch": 1.5592901921545406, + "grad_norm": 0.5697807847156862, + "learning_rate": 2.440299787580583e-05, + "loss": 11.825, + "step": 28635 + }, + { + "epoch": 1.5593446461511236, + "grad_norm": 0.524748583111397, + "learning_rate": 2.439722568293713e-05, + "loss": 11.9108, + "step": 28636 + }, + { + "epoch": 1.5593991001477066, + "grad_norm": 0.5784674837697873, + "learning_rate": 2.439145407796829e-05, + "loss": 11.7483, + "step": 28637 + }, + { + "epoch": 1.5594535541442895, + "grad_norm": 0.5499970080038136, + "learning_rate": 2.4385683060944244e-05, + "loss": 11.6448, + "step": 28638 + }, + { + "epoch": 1.5595080081408725, + "grad_norm": 0.5523570417335587, + "learning_rate": 2.437991263190974e-05, + "loss": 11.8614, + "step": 28639 + }, + { + "epoch": 1.5595624621374555, + "grad_norm": 0.5371483363138093, + "learning_rate": 2.4374142790909716e-05, + "loss": 11.6866, + "step": 28640 + }, + { + "epoch": 1.5596169161340385, + "grad_norm": 0.5225912993809458, + "learning_rate": 2.4368373537989065e-05, + "loss": 11.7559, + "step": 28641 + }, + { + "epoch": 1.5596713701306215, + "grad_norm": 0.5885400975090556, + "learning_rate": 2.4362604873192608e-05, + "loss": 11.9619, + "step": 28642 + }, + { + "epoch": 1.5597258241272045, + "grad_norm": 0.6045698927878795, + "learning_rate": 2.4356836796565242e-05, + "loss": 11.9076, + "step": 28643 + }, + { + "epoch": 1.5597802781237875, + "grad_norm": 0.5977187037410151, + "learning_rate": 2.4351069308151775e-05, + "loss": 11.8175, + "step": 28644 + }, + { + "epoch": 1.5598347321203705, + "grad_norm": 0.629120694430693, + "learning_rate": 2.4345302407997106e-05, + "loss": 11.7188, + "step": 28645 + }, + { + "epoch": 1.5598891861169535, + "grad_norm": 0.5351879694628767, + "learning_rate": 2.433953609614602e-05, + "loss": 11.8421, + "step": 28646 + }, + { + "epoch": 1.5599436401135365, + "grad_norm": 0.526824232359522, + "learning_rate": 2.4333770372643382e-05, + "loss": 11.8083, + "step": 28647 + }, + { + "epoch": 1.5599980941101195, + "grad_norm": 0.5323823091066532, + "learning_rate": 2.4328005237534112e-05, + "loss": 11.7918, + "step": 28648 + }, + { + "epoch": 1.5600525481067025, + "grad_norm": 0.5136817842203153, + "learning_rate": 2.4322240690862896e-05, + "loss": 11.8293, + "step": 28649 + }, + { + "epoch": 1.5601070021032855, + "grad_norm": 0.6024753448170723, + "learning_rate": 2.4316476732674654e-05, + "loss": 11.8917, + "step": 28650 + }, + { + "epoch": 1.5601614560998687, + "grad_norm": 0.6061125754433051, + "learning_rate": 2.4310713363014147e-05, + "loss": 11.8552, + "step": 28651 + }, + { + "epoch": 1.5602159100964517, + "grad_norm": 0.5350605569422646, + "learning_rate": 2.4304950581926246e-05, + "loss": 11.7366, + "step": 28652 + }, + { + "epoch": 1.5602703640930347, + "grad_norm": 0.5338812927349402, + "learning_rate": 2.429918838945575e-05, + "loss": 11.6912, + "step": 28653 + }, + { + "epoch": 1.5603248180896176, + "grad_norm": 0.5487878983649543, + "learning_rate": 2.4293426785647444e-05, + "loss": 11.8467, + "step": 28654 + }, + { + "epoch": 1.5603792720862006, + "grad_norm": 0.6095213129678665, + "learning_rate": 2.4287665770546176e-05, + "loss": 11.8893, + "step": 28655 + }, + { + "epoch": 1.5604337260827836, + "grad_norm": 0.5485124508683139, + "learning_rate": 2.4281905344196686e-05, + "loss": 11.8722, + "step": 28656 + }, + { + "epoch": 1.5604881800793668, + "grad_norm": 0.5454587961981922, + "learning_rate": 2.4276145506643822e-05, + "loss": 11.8044, + "step": 28657 + }, + { + "epoch": 1.5605426340759498, + "grad_norm": 0.5391338900803166, + "learning_rate": 2.4270386257932353e-05, + "loss": 11.9079, + "step": 28658 + }, + { + "epoch": 1.5605970880725328, + "grad_norm": 0.5377075082659929, + "learning_rate": 2.4264627598107025e-05, + "loss": 11.8558, + "step": 28659 + }, + { + "epoch": 1.5606515420691158, + "grad_norm": 0.5359101307237248, + "learning_rate": 2.425886952721269e-05, + "loss": 11.8104, + "step": 28660 + }, + { + "epoch": 1.5607059960656988, + "grad_norm": 0.5198996882303529, + "learning_rate": 2.4253112045294056e-05, + "loss": 11.7534, + "step": 28661 + }, + { + "epoch": 1.5607604500622818, + "grad_norm": 0.6005535960545674, + "learning_rate": 2.4247355152395933e-05, + "loss": 11.753, + "step": 28662 + }, + { + "epoch": 1.5608149040588648, + "grad_norm": 0.5374859344460433, + "learning_rate": 2.4241598848563095e-05, + "loss": 11.8552, + "step": 28663 + }, + { + "epoch": 1.5608693580554478, + "grad_norm": 0.5435563746272594, + "learning_rate": 2.4235843133840276e-05, + "loss": 11.8205, + "step": 28664 + }, + { + "epoch": 1.5609238120520308, + "grad_norm": 0.514249889739879, + "learning_rate": 2.4230088008272267e-05, + "loss": 11.8019, + "step": 28665 + }, + { + "epoch": 1.5609782660486138, + "grad_norm": 0.5519709215934762, + "learning_rate": 2.422433347190377e-05, + "loss": 11.8763, + "step": 28666 + }, + { + "epoch": 1.5610327200451968, + "grad_norm": 0.6500519150146741, + "learning_rate": 2.4218579524779596e-05, + "loss": 11.8816, + "step": 28667 + }, + { + "epoch": 1.5610871740417798, + "grad_norm": 0.540962738663549, + "learning_rate": 2.4212826166944445e-05, + "loss": 11.7951, + "step": 28668 + }, + { + "epoch": 1.5611416280383628, + "grad_norm": 0.5331272631238472, + "learning_rate": 2.420707339844305e-05, + "loss": 11.7081, + "step": 28669 + }, + { + "epoch": 1.5611960820349458, + "grad_norm": 0.5311968185544254, + "learning_rate": 2.4201321219320194e-05, + "loss": 11.8584, + "step": 28670 + }, + { + "epoch": 1.5612505360315287, + "grad_norm": 0.5329710549646063, + "learning_rate": 2.419556962962054e-05, + "loss": 11.7512, + "step": 28671 + }, + { + "epoch": 1.5613049900281117, + "grad_norm": 0.5494203517761478, + "learning_rate": 2.4189818629388884e-05, + "loss": 11.9823, + "step": 28672 + }, + { + "epoch": 1.5613594440246947, + "grad_norm": 0.5331030500575514, + "learning_rate": 2.4184068218669877e-05, + "loss": 11.7479, + "step": 28673 + }, + { + "epoch": 1.561413898021278, + "grad_norm": 0.565972514856663, + "learning_rate": 2.417831839750827e-05, + "loss": 11.8592, + "step": 28674 + }, + { + "epoch": 1.561468352017861, + "grad_norm": 0.5680900215380795, + "learning_rate": 2.41725691659488e-05, + "loss": 12.0057, + "step": 28675 + }, + { + "epoch": 1.561522806014444, + "grad_norm": 0.5255724644831455, + "learning_rate": 2.416682052403613e-05, + "loss": 11.8153, + "step": 28676 + }, + { + "epoch": 1.561577260011027, + "grad_norm": 0.5279055876700216, + "learning_rate": 2.4161072471814995e-05, + "loss": 11.8486, + "step": 28677 + }, + { + "epoch": 1.56163171400761, + "grad_norm": 0.595591677256108, + "learning_rate": 2.415532500933009e-05, + "loss": 11.8528, + "step": 28678 + }, + { + "epoch": 1.561686168004193, + "grad_norm": 0.5061505186200804, + "learning_rate": 2.4149578136626062e-05, + "loss": 11.7846, + "step": 28679 + }, + { + "epoch": 1.5617406220007761, + "grad_norm": 0.5060773506766636, + "learning_rate": 2.4143831853747668e-05, + "loss": 11.8524, + "step": 28680 + }, + { + "epoch": 1.5617950759973591, + "grad_norm": 0.584715774655512, + "learning_rate": 2.4138086160739525e-05, + "loss": 11.7046, + "step": 28681 + }, + { + "epoch": 1.561849529993942, + "grad_norm": 0.6205679596133787, + "learning_rate": 2.413234105764638e-05, + "loss": 11.8545, + "step": 28682 + }, + { + "epoch": 1.561903983990525, + "grad_norm": 0.558499851873605, + "learning_rate": 2.4126596544512846e-05, + "loss": 11.7731, + "step": 28683 + }, + { + "epoch": 1.561958437987108, + "grad_norm": 0.49533448777764894, + "learning_rate": 2.4120852621383648e-05, + "loss": 11.9185, + "step": 28684 + }, + { + "epoch": 1.562012891983691, + "grad_norm": 0.5125488555076525, + "learning_rate": 2.4115109288303395e-05, + "loss": 11.7386, + "step": 28685 + }, + { + "epoch": 1.562067345980274, + "grad_norm": 0.5737346535227084, + "learning_rate": 2.410936654531678e-05, + "loss": 11.8959, + "step": 28686 + }, + { + "epoch": 1.562121799976857, + "grad_norm": 0.5614848846577719, + "learning_rate": 2.410362439246848e-05, + "loss": 11.6778, + "step": 28687 + }, + { + "epoch": 1.56217625397344, + "grad_norm": 0.5149510034252667, + "learning_rate": 2.409788282980313e-05, + "loss": 11.7197, + "step": 28688 + }, + { + "epoch": 1.562230707970023, + "grad_norm": 0.5971872967791887, + "learning_rate": 2.4092141857365348e-05, + "loss": 11.8822, + "step": 28689 + }, + { + "epoch": 1.562285161966606, + "grad_norm": 0.5987782035748999, + "learning_rate": 2.408640147519983e-05, + "loss": 11.716, + "step": 28690 + }, + { + "epoch": 1.562339615963189, + "grad_norm": 0.5242083412362638, + "learning_rate": 2.4080661683351147e-05, + "loss": 11.7833, + "step": 28691 + }, + { + "epoch": 1.562394069959772, + "grad_norm": 0.6180979334920832, + "learning_rate": 2.4074922481864005e-05, + "loss": 11.8354, + "step": 28692 + }, + { + "epoch": 1.562448523956355, + "grad_norm": 0.5467262921957935, + "learning_rate": 2.4069183870782962e-05, + "loss": 11.8422, + "step": 28693 + }, + { + "epoch": 1.562502977952938, + "grad_norm": 0.5297791975345009, + "learning_rate": 2.406344585015271e-05, + "loss": 11.7195, + "step": 28694 + }, + { + "epoch": 1.562557431949521, + "grad_norm": 0.5166941071554032, + "learning_rate": 2.405770842001781e-05, + "loss": 11.7458, + "step": 28695 + }, + { + "epoch": 1.562611885946104, + "grad_norm": 0.5626793895017317, + "learning_rate": 2.4051971580422917e-05, + "loss": 11.898, + "step": 28696 + }, + { + "epoch": 1.5626663399426872, + "grad_norm": 0.5460906787316868, + "learning_rate": 2.4046235331412648e-05, + "loss": 11.8376, + "step": 28697 + }, + { + "epoch": 1.5627207939392702, + "grad_norm": 0.5257022270095185, + "learning_rate": 2.404049967303158e-05, + "loss": 11.8347, + "step": 28698 + }, + { + "epoch": 1.5627752479358532, + "grad_norm": 0.5000313753235595, + "learning_rate": 2.4034764605324344e-05, + "loss": 11.8122, + "step": 28699 + }, + { + "epoch": 1.5628297019324362, + "grad_norm": 0.5569986287852403, + "learning_rate": 2.402903012833547e-05, + "loss": 11.793, + "step": 28700 + }, + { + "epoch": 1.5628841559290192, + "grad_norm": 0.5336683536513435, + "learning_rate": 2.4023296242109616e-05, + "loss": 11.8115, + "step": 28701 + }, + { + "epoch": 1.5629386099256022, + "grad_norm": 0.5475684687673211, + "learning_rate": 2.4017562946691376e-05, + "loss": 11.9115, + "step": 28702 + }, + { + "epoch": 1.5629930639221854, + "grad_norm": 0.5337789947071677, + "learning_rate": 2.4011830242125277e-05, + "loss": 11.8374, + "step": 28703 + }, + { + "epoch": 1.5630475179187684, + "grad_norm": 0.5904045267095109, + "learning_rate": 2.4006098128455957e-05, + "loss": 11.8028, + "step": 28704 + }, + { + "epoch": 1.5631019719153514, + "grad_norm": 0.5083725581979951, + "learning_rate": 2.400036660572793e-05, + "loss": 11.8354, + "step": 28705 + }, + { + "epoch": 1.5631564259119344, + "grad_norm": 0.5801147322330663, + "learning_rate": 2.3994635673985832e-05, + "loss": 11.8596, + "step": 28706 + }, + { + "epoch": 1.5632108799085174, + "grad_norm": 0.6197128490651064, + "learning_rate": 2.398890533327416e-05, + "loss": 11.724, + "step": 28707 + }, + { + "epoch": 1.5632653339051004, + "grad_norm": 0.509327581039951, + "learning_rate": 2.398317558363754e-05, + "loss": 11.7433, + "step": 28708 + }, + { + "epoch": 1.5633197879016834, + "grad_norm": 0.5578360607636319, + "learning_rate": 2.3977446425120498e-05, + "loss": 11.8905, + "step": 28709 + }, + { + "epoch": 1.5633742418982663, + "grad_norm": 0.568355764591363, + "learning_rate": 2.397171785776755e-05, + "loss": 11.8355, + "step": 28710 + }, + { + "epoch": 1.5634286958948493, + "grad_norm": 0.4980275299516433, + "learning_rate": 2.3965989881623263e-05, + "loss": 11.7844, + "step": 28711 + }, + { + "epoch": 1.5634831498914323, + "grad_norm": 0.541978878698937, + "learning_rate": 2.3960262496732234e-05, + "loss": 11.8814, + "step": 28712 + }, + { + "epoch": 1.5635376038880153, + "grad_norm": 0.501969711216694, + "learning_rate": 2.3954535703138914e-05, + "loss": 11.9145, + "step": 28713 + }, + { + "epoch": 1.5635920578845983, + "grad_norm": 0.5280477172839124, + "learning_rate": 2.3948809500887914e-05, + "loss": 11.7605, + "step": 28714 + }, + { + "epoch": 1.5636465118811813, + "grad_norm": 0.5200954490724421, + "learning_rate": 2.39430838900237e-05, + "loss": 11.7397, + "step": 28715 + }, + { + "epoch": 1.5637009658777643, + "grad_norm": 0.5243192315248342, + "learning_rate": 2.3937358870590842e-05, + "loss": 11.9174, + "step": 28716 + }, + { + "epoch": 1.5637554198743473, + "grad_norm": 0.5686220440936848, + "learning_rate": 2.393163444263381e-05, + "loss": 11.9139, + "step": 28717 + }, + { + "epoch": 1.5638098738709303, + "grad_norm": 0.5883168712787715, + "learning_rate": 2.392591060619718e-05, + "loss": 11.861, + "step": 28718 + }, + { + "epoch": 1.5638643278675133, + "grad_norm": 0.5133332182405562, + "learning_rate": 2.392018736132542e-05, + "loss": 11.7445, + "step": 28719 + }, + { + "epoch": 1.5639187818640963, + "grad_norm": 0.5413224039592875, + "learning_rate": 2.391446470806301e-05, + "loss": 11.9039, + "step": 28720 + }, + { + "epoch": 1.5639732358606795, + "grad_norm": 0.5726298650546662, + "learning_rate": 2.3908742646454517e-05, + "loss": 11.8146, + "step": 28721 + }, + { + "epoch": 1.5640276898572625, + "grad_norm": 0.5707113450541269, + "learning_rate": 2.3903021176544372e-05, + "loss": 11.8959, + "step": 28722 + }, + { + "epoch": 1.5640821438538455, + "grad_norm": 0.5102116110964123, + "learning_rate": 2.389730029837709e-05, + "loss": 11.8204, + "step": 28723 + }, + { + "epoch": 1.5641365978504285, + "grad_norm": 0.5654603646911585, + "learning_rate": 2.3891580011997194e-05, + "loss": 11.7731, + "step": 28724 + }, + { + "epoch": 1.5641910518470115, + "grad_norm": 0.5402573401043785, + "learning_rate": 2.3885860317449116e-05, + "loss": 11.7931, + "step": 28725 + }, + { + "epoch": 1.5642455058435947, + "grad_norm": 0.5231421169027981, + "learning_rate": 2.3880141214777375e-05, + "loss": 11.8217, + "step": 28726 + }, + { + "epoch": 1.5642999598401777, + "grad_norm": 0.5214264989303993, + "learning_rate": 2.387442270402639e-05, + "loss": 11.8492, + "step": 28727 + }, + { + "epoch": 1.5643544138367607, + "grad_norm": 0.5118071365436173, + "learning_rate": 2.3868704785240693e-05, + "loss": 11.8548, + "step": 28728 + }, + { + "epoch": 1.5644088678333437, + "grad_norm": 0.5094263838292173, + "learning_rate": 2.3862987458464704e-05, + "loss": 11.8949, + "step": 28729 + }, + { + "epoch": 1.5644633218299266, + "grad_norm": 0.5738672324059615, + "learning_rate": 2.3857270723742874e-05, + "loss": 11.8904, + "step": 28730 + }, + { + "epoch": 1.5645177758265096, + "grad_norm": 0.5148932033644436, + "learning_rate": 2.3851554581119708e-05, + "loss": 11.8232, + "step": 28731 + }, + { + "epoch": 1.5645722298230926, + "grad_norm": 0.5352406901516237, + "learning_rate": 2.3845839030639582e-05, + "loss": 11.822, + "step": 28732 + }, + { + "epoch": 1.5646266838196756, + "grad_norm": 0.49418822148510005, + "learning_rate": 2.3840124072347027e-05, + "loss": 11.8528, + "step": 28733 + }, + { + "epoch": 1.5646811378162586, + "grad_norm": 0.5375250399413598, + "learning_rate": 2.3834409706286397e-05, + "loss": 11.7332, + "step": 28734 + }, + { + "epoch": 1.5647355918128416, + "grad_norm": 0.58188569843298, + "learning_rate": 2.3828695932502177e-05, + "loss": 11.8946, + "step": 28735 + }, + { + "epoch": 1.5647900458094246, + "grad_norm": 0.5830529094803893, + "learning_rate": 2.3822982751038825e-05, + "loss": 11.894, + "step": 28736 + }, + { + "epoch": 1.5648444998060076, + "grad_norm": 0.5446066079976716, + "learning_rate": 2.3817270161940696e-05, + "loss": 11.9443, + "step": 28737 + }, + { + "epoch": 1.5648989538025906, + "grad_norm": 0.5943059947098568, + "learning_rate": 2.381155816525228e-05, + "loss": 11.8442, + "step": 28738 + }, + { + "epoch": 1.5649534077991736, + "grad_norm": 0.5221723910675296, + "learning_rate": 2.380584676101797e-05, + "loss": 11.7763, + "step": 28739 + }, + { + "epoch": 1.5650078617957566, + "grad_norm": 0.5839712836393786, + "learning_rate": 2.380013594928214e-05, + "loss": 11.8905, + "step": 28740 + }, + { + "epoch": 1.5650623157923396, + "grad_norm": 0.5390609129932482, + "learning_rate": 2.3794425730089263e-05, + "loss": 11.8606, + "step": 28741 + }, + { + "epoch": 1.5651167697889226, + "grad_norm": 0.507204504875137, + "learning_rate": 2.3788716103483677e-05, + "loss": 11.7389, + "step": 28742 + }, + { + "epoch": 1.5651712237855055, + "grad_norm": 0.5677803797688926, + "learning_rate": 2.3783007069509856e-05, + "loss": 11.9405, + "step": 28743 + }, + { + "epoch": 1.5652256777820888, + "grad_norm": 0.48342845713468824, + "learning_rate": 2.377729862821212e-05, + "loss": 11.7874, + "step": 28744 + }, + { + "epoch": 1.5652801317786718, + "grad_norm": 0.5616391977923335, + "learning_rate": 2.377159077963489e-05, + "loss": 11.9251, + "step": 28745 + }, + { + "epoch": 1.5653345857752547, + "grad_norm": 0.5282233862160536, + "learning_rate": 2.3765883523822575e-05, + "loss": 11.9397, + "step": 28746 + }, + { + "epoch": 1.5653890397718377, + "grad_norm": 0.5284359597713528, + "learning_rate": 2.3760176860819517e-05, + "loss": 11.7287, + "step": 28747 + }, + { + "epoch": 1.5654434937684207, + "grad_norm": 0.5204217061484021, + "learning_rate": 2.3754470790670158e-05, + "loss": 11.9262, + "step": 28748 + }, + { + "epoch": 1.5654979477650037, + "grad_norm": 0.5371181748420035, + "learning_rate": 2.3748765313418765e-05, + "loss": 11.8768, + "step": 28749 + }, + { + "epoch": 1.565552401761587, + "grad_norm": 0.5137552902370959, + "learning_rate": 2.3743060429109765e-05, + "loss": 11.727, + "step": 28750 + }, + { + "epoch": 1.56560685575817, + "grad_norm": 0.5572033761507299, + "learning_rate": 2.3737356137787548e-05, + "loss": 11.8423, + "step": 28751 + }, + { + "epoch": 1.565661309754753, + "grad_norm": 0.5129204105216301, + "learning_rate": 2.3731652439496398e-05, + "loss": 11.7398, + "step": 28752 + }, + { + "epoch": 1.565715763751336, + "grad_norm": 0.6097920625736223, + "learning_rate": 2.372594933428074e-05, + "loss": 11.8881, + "step": 28753 + }, + { + "epoch": 1.565770217747919, + "grad_norm": 0.5645443131079321, + "learning_rate": 2.3720246822184866e-05, + "loss": 11.824, + "step": 28754 + }, + { + "epoch": 1.565824671744502, + "grad_norm": 0.5537955200202854, + "learning_rate": 2.371454490325318e-05, + "loss": 11.908, + "step": 28755 + }, + { + "epoch": 1.565879125741085, + "grad_norm": 0.5893560843152212, + "learning_rate": 2.370884357752995e-05, + "loss": 11.9121, + "step": 28756 + }, + { + "epoch": 1.565933579737668, + "grad_norm": 0.5274576306479433, + "learning_rate": 2.3703142845059545e-05, + "loss": 11.8951, + "step": 28757 + }, + { + "epoch": 1.5659880337342509, + "grad_norm": 0.540132917346285, + "learning_rate": 2.369744270588635e-05, + "loss": 11.791, + "step": 28758 + }, + { + "epoch": 1.5660424877308339, + "grad_norm": 0.5511282991532422, + "learning_rate": 2.369174316005459e-05, + "loss": 11.7187, + "step": 28759 + }, + { + "epoch": 1.5660969417274169, + "grad_norm": 0.5231365561063569, + "learning_rate": 2.3686044207608626e-05, + "loss": 11.8112, + "step": 28760 + }, + { + "epoch": 1.5661513957239999, + "grad_norm": 0.48191017846474765, + "learning_rate": 2.368034584859281e-05, + "loss": 11.8063, + "step": 28761 + }, + { + "epoch": 1.5662058497205829, + "grad_norm": 0.5128220263647797, + "learning_rate": 2.3674648083051387e-05, + "loss": 11.7613, + "step": 28762 + }, + { + "epoch": 1.5662603037171658, + "grad_norm": 0.5198616359674489, + "learning_rate": 2.3668950911028743e-05, + "loss": 11.8377, + "step": 28763 + }, + { + "epoch": 1.5663147577137488, + "grad_norm": 0.5331805507713756, + "learning_rate": 2.3663254332569094e-05, + "loss": 11.8622, + "step": 28764 + }, + { + "epoch": 1.5663692117103318, + "grad_norm": 0.5312240906252309, + "learning_rate": 2.3657558347716824e-05, + "loss": 11.7363, + "step": 28765 + }, + { + "epoch": 1.5664236657069148, + "grad_norm": 0.5669592320093093, + "learning_rate": 2.3651862956516146e-05, + "loss": 11.8808, + "step": 28766 + }, + { + "epoch": 1.566478119703498, + "grad_norm": 0.5556726740596933, + "learning_rate": 2.3646168159011418e-05, + "loss": 11.6917, + "step": 28767 + }, + { + "epoch": 1.566532573700081, + "grad_norm": 0.6241357849272708, + "learning_rate": 2.3640473955246856e-05, + "loss": 11.6563, + "step": 28768 + }, + { + "epoch": 1.566587027696664, + "grad_norm": 0.5384150007817814, + "learning_rate": 2.3634780345266806e-05, + "loss": 11.8801, + "step": 28769 + }, + { + "epoch": 1.566641481693247, + "grad_norm": 0.6633948331576193, + "learning_rate": 2.3629087329115517e-05, + "loss": 11.907, + "step": 28770 + }, + { + "epoch": 1.56669593568983, + "grad_norm": 0.5392172734467195, + "learning_rate": 2.3623394906837216e-05, + "loss": 11.8092, + "step": 28771 + }, + { + "epoch": 1.566750389686413, + "grad_norm": 0.5751667334222652, + "learning_rate": 2.3617703078476204e-05, + "loss": 11.7699, + "step": 28772 + }, + { + "epoch": 1.5668048436829962, + "grad_norm": 0.5564432575087046, + "learning_rate": 2.3612011844076776e-05, + "loss": 11.7863, + "step": 28773 + }, + { + "epoch": 1.5668592976795792, + "grad_norm": 0.555151398696221, + "learning_rate": 2.3606321203683125e-05, + "loss": 11.758, + "step": 28774 + }, + { + "epoch": 1.5669137516761622, + "grad_norm": 0.5823342213344037, + "learning_rate": 2.360063115733956e-05, + "loss": 11.786, + "step": 28775 + }, + { + "epoch": 1.5669682056727452, + "grad_norm": 0.5084271925018297, + "learning_rate": 2.359494170509027e-05, + "loss": 11.8565, + "step": 28776 + }, + { + "epoch": 1.5670226596693282, + "grad_norm": 0.5624785039688724, + "learning_rate": 2.3589252846979547e-05, + "loss": 11.8468, + "step": 28777 + }, + { + "epoch": 1.5670771136659112, + "grad_norm": 0.5853940280131675, + "learning_rate": 2.3583564583051586e-05, + "loss": 11.8713, + "step": 28778 + }, + { + "epoch": 1.5671315676624942, + "grad_norm": 0.5133279565725016, + "learning_rate": 2.357787691335067e-05, + "loss": 11.7816, + "step": 28779 + }, + { + "epoch": 1.5671860216590772, + "grad_norm": 0.6258315850356592, + "learning_rate": 2.3572189837920998e-05, + "loss": 11.7263, + "step": 28780 + }, + { + "epoch": 1.5672404756556602, + "grad_norm": 0.5319551282542647, + "learning_rate": 2.3566503356806768e-05, + "loss": 11.7758, + "step": 28781 + }, + { + "epoch": 1.5672949296522432, + "grad_norm": 0.5462647364412913, + "learning_rate": 2.3560817470052253e-05, + "loss": 11.8098, + "step": 28782 + }, + { + "epoch": 1.5673493836488261, + "grad_norm": 0.5138441339635628, + "learning_rate": 2.3555132177701612e-05, + "loss": 11.8368, + "step": 28783 + }, + { + "epoch": 1.5674038376454091, + "grad_norm": 0.5074516251173836, + "learning_rate": 2.3549447479799093e-05, + "loss": 11.8303, + "step": 28784 + }, + { + "epoch": 1.5674582916419921, + "grad_norm": 0.5123121406713322, + "learning_rate": 2.3543763376388904e-05, + "loss": 11.8151, + "step": 28785 + }, + { + "epoch": 1.5675127456385751, + "grad_norm": 0.5743749465006065, + "learning_rate": 2.3538079867515217e-05, + "loss": 11.8573, + "step": 28786 + }, + { + "epoch": 1.5675671996351581, + "grad_norm": 0.5419491477759231, + "learning_rate": 2.3532396953222257e-05, + "loss": 11.8738, + "step": 28787 + }, + { + "epoch": 1.567621653631741, + "grad_norm": 0.5100219492146478, + "learning_rate": 2.352671463355418e-05, + "loss": 11.8024, + "step": 28788 + }, + { + "epoch": 1.567676107628324, + "grad_norm": 0.6024295216855875, + "learning_rate": 2.352103290855523e-05, + "loss": 11.8999, + "step": 28789 + }, + { + "epoch": 1.567730561624907, + "grad_norm": 0.5244918516681831, + "learning_rate": 2.3515351778269547e-05, + "loss": 11.8695, + "step": 28790 + }, + { + "epoch": 1.5677850156214903, + "grad_norm": 0.5251132217855241, + "learning_rate": 2.3509671242741293e-05, + "loss": 11.8705, + "step": 28791 + }, + { + "epoch": 1.5678394696180733, + "grad_norm": 0.5617429267671153, + "learning_rate": 2.3503991302014682e-05, + "loss": 11.7922, + "step": 28792 + }, + { + "epoch": 1.5678939236146563, + "grad_norm": 0.5013374606752228, + "learning_rate": 2.3498311956133845e-05, + "loss": 11.8182, + "step": 28793 + }, + { + "epoch": 1.5679483776112393, + "grad_norm": 0.529311636265472, + "learning_rate": 2.3492633205142957e-05, + "loss": 11.8551, + "step": 28794 + }, + { + "epoch": 1.5680028316078223, + "grad_norm": 0.5522262472677143, + "learning_rate": 2.348695504908621e-05, + "loss": 11.8119, + "step": 28795 + }, + { + "epoch": 1.5680572856044055, + "grad_norm": 0.5637816975019991, + "learning_rate": 2.34812774880077e-05, + "loss": 11.9208, + "step": 28796 + }, + { + "epoch": 1.5681117396009885, + "grad_norm": 0.5124110850407693, + "learning_rate": 2.3475600521951646e-05, + "loss": 11.8226, + "step": 28797 + }, + { + "epoch": 1.5681661935975715, + "grad_norm": 0.5061190377644172, + "learning_rate": 2.346992415096213e-05, + "loss": 11.7736, + "step": 28798 + }, + { + "epoch": 1.5682206475941545, + "grad_norm": 0.5952185542731735, + "learning_rate": 2.3464248375083343e-05, + "loss": 11.9282, + "step": 28799 + }, + { + "epoch": 1.5682751015907375, + "grad_norm": 0.5735068389300485, + "learning_rate": 2.345857319435939e-05, + "loss": 11.9166, + "step": 28800 + }, + { + "epoch": 1.5683295555873205, + "grad_norm": 0.5137494152933586, + "learning_rate": 2.3452898608834385e-05, + "loss": 11.8012, + "step": 28801 + }, + { + "epoch": 1.5683840095839034, + "grad_norm": 0.5625967930993502, + "learning_rate": 2.3447224618552498e-05, + "loss": 11.8578, + "step": 28802 + }, + { + "epoch": 1.5684384635804864, + "grad_norm": 0.5146389222455794, + "learning_rate": 2.344155122355781e-05, + "loss": 11.8517, + "step": 28803 + }, + { + "epoch": 1.5684929175770694, + "grad_norm": 0.5264964698827507, + "learning_rate": 2.3435878423894488e-05, + "loss": 11.7719, + "step": 28804 + }, + { + "epoch": 1.5685473715736524, + "grad_norm": 0.5464145866372988, + "learning_rate": 2.3430206219606578e-05, + "loss": 11.7619, + "step": 28805 + }, + { + "epoch": 1.5686018255702354, + "grad_norm": 0.5585292284540881, + "learning_rate": 2.3424534610738235e-05, + "loss": 11.8803, + "step": 28806 + }, + { + "epoch": 1.5686562795668184, + "grad_norm": 0.5819457914709306, + "learning_rate": 2.3418863597333573e-05, + "loss": 11.8771, + "step": 28807 + }, + { + "epoch": 1.5687107335634014, + "grad_norm": 0.5685507530431541, + "learning_rate": 2.3413193179436644e-05, + "loss": 11.8529, + "step": 28808 + }, + { + "epoch": 1.5687651875599844, + "grad_norm": 0.6031635409779674, + "learning_rate": 2.34075233570916e-05, + "loss": 11.8167, + "step": 28809 + }, + { + "epoch": 1.5688196415565674, + "grad_norm": 0.5577260190987443, + "learning_rate": 2.340185413034249e-05, + "loss": 11.9123, + "step": 28810 + }, + { + "epoch": 1.5688740955531504, + "grad_norm": 0.5429463809666294, + "learning_rate": 2.3396185499233393e-05, + "loss": 11.7829, + "step": 28811 + }, + { + "epoch": 1.5689285495497334, + "grad_norm": 0.5210444587902235, + "learning_rate": 2.3390517463808427e-05, + "loss": 11.7514, + "step": 28812 + }, + { + "epoch": 1.5689830035463164, + "grad_norm": 0.562737098318813, + "learning_rate": 2.3384850024111616e-05, + "loss": 11.8266, + "step": 28813 + }, + { + "epoch": 1.5690374575428996, + "grad_norm": 0.6035665160900674, + "learning_rate": 2.337918318018708e-05, + "loss": 11.8448, + "step": 28814 + }, + { + "epoch": 1.5690919115394826, + "grad_norm": 0.6524794324622821, + "learning_rate": 2.3373516932078843e-05, + "loss": 11.9193, + "step": 28815 + }, + { + "epoch": 1.5691463655360656, + "grad_norm": 0.5289140152754468, + "learning_rate": 2.3367851279830988e-05, + "loss": 11.5499, + "step": 28816 + }, + { + "epoch": 1.5692008195326486, + "grad_norm": 0.615430679097053, + "learning_rate": 2.3362186223487603e-05, + "loss": 11.8119, + "step": 28817 + }, + { + "epoch": 1.5692552735292316, + "grad_norm": 0.5461840473533941, + "learning_rate": 2.3356521763092687e-05, + "loss": 11.7733, + "step": 28818 + }, + { + "epoch": 1.5693097275258145, + "grad_norm": 0.5904324219559542, + "learning_rate": 2.335085789869036e-05, + "loss": 11.8883, + "step": 28819 + }, + { + "epoch": 1.5693641815223978, + "grad_norm": 0.5416018914944514, + "learning_rate": 2.3345194630324563e-05, + "loss": 11.7112, + "step": 28820 + }, + { + "epoch": 1.5694186355189808, + "grad_norm": 0.5592029661413549, + "learning_rate": 2.3339531958039384e-05, + "loss": 11.8843, + "step": 28821 + }, + { + "epoch": 1.5694730895155637, + "grad_norm": 0.6197204075331203, + "learning_rate": 2.333386988187889e-05, + "loss": 11.918, + "step": 28822 + }, + { + "epoch": 1.5695275435121467, + "grad_norm": 0.5527706793530026, + "learning_rate": 2.3328208401887054e-05, + "loss": 11.8334, + "step": 28823 + }, + { + "epoch": 1.5695819975087297, + "grad_norm": 0.5986971658936749, + "learning_rate": 2.332254751810795e-05, + "loss": 11.815, + "step": 28824 + }, + { + "epoch": 1.5696364515053127, + "grad_norm": 0.54743412591157, + "learning_rate": 2.3316887230585548e-05, + "loss": 11.7309, + "step": 28825 + }, + { + "epoch": 1.5696909055018957, + "grad_norm": 0.5232788361556617, + "learning_rate": 2.331122753936392e-05, + "loss": 11.884, + "step": 28826 + }, + { + "epoch": 1.5697453594984787, + "grad_norm": 0.5132268042937693, + "learning_rate": 2.3305568444487026e-05, + "loss": 11.7641, + "step": 28827 + }, + { + "epoch": 1.5697998134950617, + "grad_norm": 0.5243162193204141, + "learning_rate": 2.329990994599889e-05, + "loss": 11.8505, + "step": 28828 + }, + { + "epoch": 1.5698542674916447, + "grad_norm": 0.5632512571249487, + "learning_rate": 2.3294252043943565e-05, + "loss": 11.7132, + "step": 28829 + }, + { + "epoch": 1.5699087214882277, + "grad_norm": 0.558958465573504, + "learning_rate": 2.3288594738364955e-05, + "loss": 11.7075, + "step": 28830 + }, + { + "epoch": 1.5699631754848107, + "grad_norm": 0.5638982990361233, + "learning_rate": 2.328293802930712e-05, + "loss": 11.916, + "step": 28831 + }, + { + "epoch": 1.5700176294813937, + "grad_norm": 0.5584317871891189, + "learning_rate": 2.3277281916813998e-05, + "loss": 11.8094, + "step": 28832 + }, + { + "epoch": 1.5700720834779767, + "grad_norm": 0.534225741168446, + "learning_rate": 2.3271626400929592e-05, + "loss": 11.7414, + "step": 28833 + }, + { + "epoch": 1.5701265374745597, + "grad_norm": 0.5439174665096898, + "learning_rate": 2.3265971481697925e-05, + "loss": 11.8063, + "step": 28834 + }, + { + "epoch": 1.5701809914711427, + "grad_norm": 0.4844349434575177, + "learning_rate": 2.3260317159162893e-05, + "loss": 11.8095, + "step": 28835 + }, + { + "epoch": 1.5702354454677256, + "grad_norm": 0.5011890767572778, + "learning_rate": 2.325466343336854e-05, + "loss": 11.7949, + "step": 28836 + }, + { + "epoch": 1.5702898994643089, + "grad_norm": 0.5429282307084502, + "learning_rate": 2.3249010304358767e-05, + "loss": 11.8371, + "step": 28837 + }, + { + "epoch": 1.5703443534608919, + "grad_norm": 0.5885668486888262, + "learning_rate": 2.3243357772177587e-05, + "loss": 11.8381, + "step": 28838 + }, + { + "epoch": 1.5703988074574748, + "grad_norm": 0.516917885066894, + "learning_rate": 2.3237705836868928e-05, + "loss": 11.6619, + "step": 28839 + }, + { + "epoch": 1.5704532614540578, + "grad_norm": 0.5392968565847012, + "learning_rate": 2.323205449847671e-05, + "loss": 11.8768, + "step": 28840 + }, + { + "epoch": 1.5705077154506408, + "grad_norm": 0.5249524459624826, + "learning_rate": 2.3226403757044936e-05, + "loss": 11.8003, + "step": 28841 + }, + { + "epoch": 1.5705621694472238, + "grad_norm": 0.5771607004204238, + "learning_rate": 2.3220753612617495e-05, + "loss": 11.9423, + "step": 28842 + }, + { + "epoch": 1.570616623443807, + "grad_norm": 0.538561955421746, + "learning_rate": 2.3215104065238357e-05, + "loss": 11.784, + "step": 28843 + }, + { + "epoch": 1.57067107744039, + "grad_norm": 0.5394213655913459, + "learning_rate": 2.3209455114951474e-05, + "loss": 11.8195, + "step": 28844 + }, + { + "epoch": 1.570725531436973, + "grad_norm": 0.5664551279352269, + "learning_rate": 2.3203806761800717e-05, + "loss": 11.8103, + "step": 28845 + }, + { + "epoch": 1.570779985433556, + "grad_norm": 0.5625213965904582, + "learning_rate": 2.3198159005830055e-05, + "loss": 11.8841, + "step": 28846 + }, + { + "epoch": 1.570834439430139, + "grad_norm": 0.565816915360413, + "learning_rate": 2.3192511847083375e-05, + "loss": 11.7987, + "step": 28847 + }, + { + "epoch": 1.570888893426722, + "grad_norm": 0.5448686503596304, + "learning_rate": 2.318686528560462e-05, + "loss": 11.6822, + "step": 28848 + }, + { + "epoch": 1.570943347423305, + "grad_norm": 0.5553673942431313, + "learning_rate": 2.3181219321437687e-05, + "loss": 11.9259, + "step": 28849 + }, + { + "epoch": 1.570997801419888, + "grad_norm": 0.5950616031474728, + "learning_rate": 2.3175573954626452e-05, + "loss": 11.8688, + "step": 28850 + }, + { + "epoch": 1.571052255416471, + "grad_norm": 0.4919833826596297, + "learning_rate": 2.3169929185214856e-05, + "loss": 11.7691, + "step": 28851 + }, + { + "epoch": 1.571106709413054, + "grad_norm": 0.5778741528414014, + "learning_rate": 2.3164285013246755e-05, + "loss": 11.8501, + "step": 28852 + }, + { + "epoch": 1.571161163409637, + "grad_norm": 0.5534783101305234, + "learning_rate": 2.3158641438766083e-05, + "loss": 11.796, + "step": 28853 + }, + { + "epoch": 1.57121561740622, + "grad_norm": 0.569115913555047, + "learning_rate": 2.315299846181668e-05, + "loss": 11.9566, + "step": 28854 + }, + { + "epoch": 1.571270071402803, + "grad_norm": 0.5468155802080271, + "learning_rate": 2.314735608244244e-05, + "loss": 11.8586, + "step": 28855 + }, + { + "epoch": 1.571324525399386, + "grad_norm": 0.5401501043180141, + "learning_rate": 2.3141714300687278e-05, + "loss": 11.8065, + "step": 28856 + }, + { + "epoch": 1.571378979395969, + "grad_norm": 0.4796767885691581, + "learning_rate": 2.3136073116595003e-05, + "loss": 11.7506, + "step": 28857 + }, + { + "epoch": 1.571433433392552, + "grad_norm": 0.5759294664512107, + "learning_rate": 2.3130432530209545e-05, + "loss": 11.776, + "step": 28858 + }, + { + "epoch": 1.571487887389135, + "grad_norm": 0.5283198939236153, + "learning_rate": 2.3124792541574737e-05, + "loss": 11.7619, + "step": 28859 + }, + { + "epoch": 1.5715423413857181, + "grad_norm": 0.5819955804357554, + "learning_rate": 2.3119153150734397e-05, + "loss": 11.9188, + "step": 28860 + }, + { + "epoch": 1.5715967953823011, + "grad_norm": 0.5253213142452875, + "learning_rate": 2.311351435773246e-05, + "loss": 11.766, + "step": 28861 + }, + { + "epoch": 1.5716512493788841, + "grad_norm": 0.5531674322889195, + "learning_rate": 2.3107876162612684e-05, + "loss": 11.8874, + "step": 28862 + }, + { + "epoch": 1.5717057033754671, + "grad_norm": 0.5201859640964867, + "learning_rate": 2.3102238565419e-05, + "loss": 11.7146, + "step": 28863 + }, + { + "epoch": 1.57176015737205, + "grad_norm": 0.5856481414476795, + "learning_rate": 2.3096601566195163e-05, + "loss": 11.9887, + "step": 28864 + }, + { + "epoch": 1.571814611368633, + "grad_norm": 0.5451818598652349, + "learning_rate": 2.3090965164985058e-05, + "loss": 11.7176, + "step": 28865 + }, + { + "epoch": 1.5718690653652163, + "grad_norm": 0.5155272465145542, + "learning_rate": 2.308532936183253e-05, + "loss": 11.7139, + "step": 28866 + }, + { + "epoch": 1.5719235193617993, + "grad_norm": 0.5971233770583798, + "learning_rate": 2.3079694156781352e-05, + "loss": 11.9034, + "step": 28867 + }, + { + "epoch": 1.5719779733583823, + "grad_norm": 0.5359770605652687, + "learning_rate": 2.3074059549875393e-05, + "loss": 11.9564, + "step": 28868 + }, + { + "epoch": 1.5720324273549653, + "grad_norm": 0.6001300294057664, + "learning_rate": 2.3068425541158456e-05, + "loss": 11.7861, + "step": 28869 + }, + { + "epoch": 1.5720868813515483, + "grad_norm": 0.49531358904600387, + "learning_rate": 2.3062792130674314e-05, + "loss": 11.7003, + "step": 28870 + }, + { + "epoch": 1.5721413353481313, + "grad_norm": 0.588119631962856, + "learning_rate": 2.3057159318466836e-05, + "loss": 11.9108, + "step": 28871 + }, + { + "epoch": 1.5721957893447143, + "grad_norm": 0.5981029271313729, + "learning_rate": 2.305152710457975e-05, + "loss": 11.7787, + "step": 28872 + }, + { + "epoch": 1.5722502433412973, + "grad_norm": 0.5604200596126128, + "learning_rate": 2.3045895489056934e-05, + "loss": 11.8497, + "step": 28873 + }, + { + "epoch": 1.5723046973378803, + "grad_norm": 0.5760121192842886, + "learning_rate": 2.3040264471942108e-05, + "loss": 11.7729, + "step": 28874 + }, + { + "epoch": 1.5723591513344632, + "grad_norm": 0.5054891954410827, + "learning_rate": 2.3034634053279115e-05, + "loss": 11.691, + "step": 28875 + }, + { + "epoch": 1.5724136053310462, + "grad_norm": 0.6305847167877786, + "learning_rate": 2.3029004233111694e-05, + "loss": 11.8834, + "step": 28876 + }, + { + "epoch": 1.5724680593276292, + "grad_norm": 0.5231289623932586, + "learning_rate": 2.3023375011483638e-05, + "loss": 11.8407, + "step": 28877 + }, + { + "epoch": 1.5725225133242122, + "grad_norm": 0.5354428442379003, + "learning_rate": 2.3017746388438753e-05, + "loss": 11.9022, + "step": 28878 + }, + { + "epoch": 1.5725769673207952, + "grad_norm": 0.529924526298394, + "learning_rate": 2.3012118364020787e-05, + "loss": 11.8539, + "step": 28879 + }, + { + "epoch": 1.5726314213173782, + "grad_norm": 0.49937226386890826, + "learning_rate": 2.3006490938273507e-05, + "loss": 11.773, + "step": 28880 + }, + { + "epoch": 1.5726858753139612, + "grad_norm": 0.600246857127341, + "learning_rate": 2.3000864111240627e-05, + "loss": 11.9778, + "step": 28881 + }, + { + "epoch": 1.5727403293105442, + "grad_norm": 0.5150061147575434, + "learning_rate": 2.2995237882965936e-05, + "loss": 11.7229, + "step": 28882 + }, + { + "epoch": 1.5727947833071272, + "grad_norm": 0.5389269690688803, + "learning_rate": 2.298961225349322e-05, + "loss": 11.7591, + "step": 28883 + }, + { + "epoch": 1.5728492373037104, + "grad_norm": 0.5496834218271008, + "learning_rate": 2.2983987222866176e-05, + "loss": 11.8051, + "step": 28884 + }, + { + "epoch": 1.5729036913002934, + "grad_norm": 0.5408526547759474, + "learning_rate": 2.2978362791128582e-05, + "loss": 11.8004, + "step": 28885 + }, + { + "epoch": 1.5729581452968764, + "grad_norm": 0.48644510699505733, + "learning_rate": 2.2972738958324123e-05, + "loss": 11.7983, + "step": 28886 + }, + { + "epoch": 1.5730125992934594, + "grad_norm": 0.5125963878475169, + "learning_rate": 2.29671157244966e-05, + "loss": 11.7825, + "step": 28887 + }, + { + "epoch": 1.5730670532900424, + "grad_norm": 0.5456381862501376, + "learning_rate": 2.2961493089689677e-05, + "loss": 11.8283, + "step": 28888 + }, + { + "epoch": 1.5731215072866254, + "grad_norm": 0.5832132101324694, + "learning_rate": 2.295587105394713e-05, + "loss": 11.9061, + "step": 28889 + }, + { + "epoch": 1.5731759612832086, + "grad_norm": 0.5117851731701568, + "learning_rate": 2.2950249617312648e-05, + "loss": 11.8274, + "step": 28890 + }, + { + "epoch": 1.5732304152797916, + "grad_norm": 0.7988447022716587, + "learning_rate": 2.2944628779829913e-05, + "loss": 12.0, + "step": 28891 + }, + { + "epoch": 1.5732848692763746, + "grad_norm": 0.5515940665282327, + "learning_rate": 2.293900854154267e-05, + "loss": 11.8128, + "step": 28892 + }, + { + "epoch": 1.5733393232729576, + "grad_norm": 0.5458483583398712, + "learning_rate": 2.2933388902494646e-05, + "loss": 11.9411, + "step": 28893 + }, + { + "epoch": 1.5733937772695405, + "grad_norm": 0.5000229674779554, + "learning_rate": 2.2927769862729497e-05, + "loss": 11.8226, + "step": 28894 + }, + { + "epoch": 1.5734482312661235, + "grad_norm": 0.5421644200612199, + "learning_rate": 2.292215142229095e-05, + "loss": 11.9527, + "step": 28895 + }, + { + "epoch": 1.5735026852627065, + "grad_norm": 0.48402343152433586, + "learning_rate": 2.2916533581222655e-05, + "loss": 11.7499, + "step": 28896 + }, + { + "epoch": 1.5735571392592895, + "grad_norm": 0.5165782833986247, + "learning_rate": 2.2910916339568354e-05, + "loss": 11.8198, + "step": 28897 + }, + { + "epoch": 1.5736115932558725, + "grad_norm": 0.5627621105234702, + "learning_rate": 2.290529969737166e-05, + "loss": 11.7158, + "step": 28898 + }, + { + "epoch": 1.5736660472524555, + "grad_norm": 0.5331697384083023, + "learning_rate": 2.289968365467632e-05, + "loss": 11.7072, + "step": 28899 + }, + { + "epoch": 1.5737205012490385, + "grad_norm": 0.5808663226201757, + "learning_rate": 2.2894068211525964e-05, + "loss": 11.8476, + "step": 28900 + }, + { + "epoch": 1.5737749552456215, + "grad_norm": 0.5411353924438677, + "learning_rate": 2.2888453367964235e-05, + "loss": 11.8446, + "step": 28901 + }, + { + "epoch": 1.5738294092422045, + "grad_norm": 0.5439941125721316, + "learning_rate": 2.288283912403486e-05, + "loss": 11.8777, + "step": 28902 + }, + { + "epoch": 1.5738838632387875, + "grad_norm": 0.5354142726054871, + "learning_rate": 2.2877225479781427e-05, + "loss": 11.8446, + "step": 28903 + }, + { + "epoch": 1.5739383172353705, + "grad_norm": 0.5238275096977838, + "learning_rate": 2.2871612435247625e-05, + "loss": 11.8189, + "step": 28904 + }, + { + "epoch": 1.5739927712319535, + "grad_norm": 0.5314271423713879, + "learning_rate": 2.2865999990477128e-05, + "loss": 11.8378, + "step": 28905 + }, + { + "epoch": 1.5740472252285365, + "grad_norm": 0.5406769231803211, + "learning_rate": 2.286038814551351e-05, + "loss": 11.7432, + "step": 28906 + }, + { + "epoch": 1.5741016792251197, + "grad_norm": 0.5294794764373592, + "learning_rate": 2.28547769004005e-05, + "loss": 11.6909, + "step": 28907 + }, + { + "epoch": 1.5741561332217027, + "grad_norm": 0.5633527580065322, + "learning_rate": 2.284916625518163e-05, + "loss": 11.7931, + "step": 28908 + }, + { + "epoch": 1.5742105872182857, + "grad_norm": 0.5253651236233929, + "learning_rate": 2.2843556209900628e-05, + "loss": 11.8092, + "step": 28909 + }, + { + "epoch": 1.5742650412148687, + "grad_norm": 0.537188452610232, + "learning_rate": 2.283794676460107e-05, + "loss": 11.9092, + "step": 28910 + }, + { + "epoch": 1.5743194952114516, + "grad_norm": 0.5943887836728728, + "learning_rate": 2.283233791932654e-05, + "loss": 11.9433, + "step": 28911 + }, + { + "epoch": 1.5743739492080346, + "grad_norm": 0.5744023067687126, + "learning_rate": 2.2826729674120728e-05, + "loss": 11.8958, + "step": 28912 + }, + { + "epoch": 1.5744284032046179, + "grad_norm": 0.5141809582026872, + "learning_rate": 2.2821122029027176e-05, + "loss": 11.8696, + "step": 28913 + }, + { + "epoch": 1.5744828572012008, + "grad_norm": 0.5748075044435534, + "learning_rate": 2.2815514984089526e-05, + "loss": 11.858, + "step": 28914 + }, + { + "epoch": 1.5745373111977838, + "grad_norm": 0.560832134890702, + "learning_rate": 2.28099085393514e-05, + "loss": 11.8423, + "step": 28915 + }, + { + "epoch": 1.5745917651943668, + "grad_norm": 0.5371437284242473, + "learning_rate": 2.280430269485634e-05, + "loss": 11.9207, + "step": 28916 + }, + { + "epoch": 1.5746462191909498, + "grad_norm": 0.53380184172574, + "learning_rate": 2.2798697450648e-05, + "loss": 11.8518, + "step": 28917 + }, + { + "epoch": 1.5747006731875328, + "grad_norm": 0.540136552633758, + "learning_rate": 2.27930928067699e-05, + "loss": 11.8117, + "step": 28918 + }, + { + "epoch": 1.5747551271841158, + "grad_norm": 0.5279692252142438, + "learning_rate": 2.2787488763265697e-05, + "loss": 11.828, + "step": 28919 + }, + { + "epoch": 1.5748095811806988, + "grad_norm": 0.5667267210614443, + "learning_rate": 2.278188532017892e-05, + "loss": 11.7413, + "step": 28920 + }, + { + "epoch": 1.5748640351772818, + "grad_norm": 0.5641548462221883, + "learning_rate": 2.2776282477553125e-05, + "loss": 11.8776, + "step": 28921 + }, + { + "epoch": 1.5749184891738648, + "grad_norm": 0.5165989644735156, + "learning_rate": 2.277068023543194e-05, + "loss": 11.7896, + "step": 28922 + }, + { + "epoch": 1.5749729431704478, + "grad_norm": 0.5499267654843936, + "learning_rate": 2.2765078593858868e-05, + "loss": 11.8233, + "step": 28923 + }, + { + "epoch": 1.5750273971670308, + "grad_norm": 0.5689880773549624, + "learning_rate": 2.275947755287753e-05, + "loss": 11.8807, + "step": 28924 + }, + { + "epoch": 1.5750818511636138, + "grad_norm": 0.5704698987243558, + "learning_rate": 2.275387711253142e-05, + "loss": 11.9132, + "step": 28925 + }, + { + "epoch": 1.5751363051601968, + "grad_norm": 0.5903502315610777, + "learning_rate": 2.2748277272864106e-05, + "loss": 11.8719, + "step": 28926 + }, + { + "epoch": 1.5751907591567798, + "grad_norm": 0.5582185909323738, + "learning_rate": 2.2742678033919175e-05, + "loss": 11.83, + "step": 28927 + }, + { + "epoch": 1.5752452131533627, + "grad_norm": 0.5236327125435509, + "learning_rate": 2.2737079395740114e-05, + "loss": 11.8483, + "step": 28928 + }, + { + "epoch": 1.5752996671499457, + "grad_norm": 0.517888638304181, + "learning_rate": 2.2731481358370498e-05, + "loss": 11.7218, + "step": 28929 + }, + { + "epoch": 1.575354121146529, + "grad_norm": 0.5806616262289762, + "learning_rate": 2.272588392185384e-05, + "loss": 11.8007, + "step": 28930 + }, + { + "epoch": 1.575408575143112, + "grad_norm": 0.5388379469177362, + "learning_rate": 2.272028708623365e-05, + "loss": 11.7921, + "step": 28931 + }, + { + "epoch": 1.575463029139695, + "grad_norm": 0.5361134179981693, + "learning_rate": 2.2714690851553488e-05, + "loss": 11.7761, + "step": 28932 + }, + { + "epoch": 1.575517483136278, + "grad_norm": 0.5423441461487326, + "learning_rate": 2.2709095217856836e-05, + "loss": 11.8017, + "step": 28933 + }, + { + "epoch": 1.575571937132861, + "grad_norm": 0.5245884311961633, + "learning_rate": 2.270350018518723e-05, + "loss": 11.7721, + "step": 28934 + }, + { + "epoch": 1.575626391129444, + "grad_norm": 0.5602690809245169, + "learning_rate": 2.2697905753588156e-05, + "loss": 11.8095, + "step": 28935 + }, + { + "epoch": 1.5756808451260271, + "grad_norm": 0.6439708276703576, + "learning_rate": 2.269231192310315e-05, + "loss": 11.8879, + "step": 28936 + }, + { + "epoch": 1.5757352991226101, + "grad_norm": 0.6756873643787613, + "learning_rate": 2.2686718693775664e-05, + "loss": 11.8543, + "step": 28937 + }, + { + "epoch": 1.5757897531191931, + "grad_norm": 0.5729403690540208, + "learning_rate": 2.2681126065649227e-05, + "loss": 11.9723, + "step": 28938 + }, + { + "epoch": 1.575844207115776, + "grad_norm": 0.5200836804350855, + "learning_rate": 2.2675534038767342e-05, + "loss": 11.8063, + "step": 28939 + }, + { + "epoch": 1.575898661112359, + "grad_norm": 0.5864393205334614, + "learning_rate": 2.2669942613173465e-05, + "loss": 11.9725, + "step": 28940 + }, + { + "epoch": 1.575953115108942, + "grad_norm": 0.5406517164507786, + "learning_rate": 2.2664351788911063e-05, + "loss": 11.8057, + "step": 28941 + }, + { + "epoch": 1.576007569105525, + "grad_norm": 0.5329268958336405, + "learning_rate": 2.2658761566023655e-05, + "loss": 11.7887, + "step": 28942 + }, + { + "epoch": 1.576062023102108, + "grad_norm": 0.5332785095692129, + "learning_rate": 2.2653171944554662e-05, + "loss": 11.8428, + "step": 28943 + }, + { + "epoch": 1.576116477098691, + "grad_norm": 0.5469093885972705, + "learning_rate": 2.2647582924547606e-05, + "loss": 11.8983, + "step": 28944 + }, + { + "epoch": 1.576170931095274, + "grad_norm": 0.5199717403612628, + "learning_rate": 2.2641994506045883e-05, + "loss": 11.8231, + "step": 28945 + }, + { + "epoch": 1.576225385091857, + "grad_norm": 0.5879884774777993, + "learning_rate": 2.2636406689093014e-05, + "loss": 11.8622, + "step": 28946 + }, + { + "epoch": 1.57627983908844, + "grad_norm": 0.5686514620226485, + "learning_rate": 2.263081947373239e-05, + "loss": 11.649, + "step": 28947 + }, + { + "epoch": 1.576334293085023, + "grad_norm": 0.507483113874304, + "learning_rate": 2.2625232860007495e-05, + "loss": 11.8155, + "step": 28948 + }, + { + "epoch": 1.576388747081606, + "grad_norm": 0.6117196971041503, + "learning_rate": 2.2619646847961785e-05, + "loss": 11.8837, + "step": 28949 + }, + { + "epoch": 1.576443201078189, + "grad_norm": 0.5720458429327793, + "learning_rate": 2.261406143763868e-05, + "loss": 11.8197, + "step": 28950 + }, + { + "epoch": 1.576497655074772, + "grad_norm": 0.5534445260877011, + "learning_rate": 2.260847662908161e-05, + "loss": 11.8195, + "step": 28951 + }, + { + "epoch": 1.576552109071355, + "grad_norm": 0.5583281817128195, + "learning_rate": 2.2602892422333976e-05, + "loss": 11.6677, + "step": 28952 + }, + { + "epoch": 1.576606563067938, + "grad_norm": 0.531915341939756, + "learning_rate": 2.2597308817439232e-05, + "loss": 11.7879, + "step": 28953 + }, + { + "epoch": 1.5766610170645212, + "grad_norm": 0.5098542220047537, + "learning_rate": 2.2591725814440835e-05, + "loss": 11.7829, + "step": 28954 + }, + { + "epoch": 1.5767154710611042, + "grad_norm": 0.4992514116282411, + "learning_rate": 2.258614341338211e-05, + "loss": 11.7571, + "step": 28955 + }, + { + "epoch": 1.5767699250576872, + "grad_norm": 0.5449980587058868, + "learning_rate": 2.258056161430656e-05, + "loss": 11.9823, + "step": 28956 + }, + { + "epoch": 1.5768243790542702, + "grad_norm": 0.6281234219372464, + "learning_rate": 2.257498041725751e-05, + "loss": 11.8211, + "step": 28957 + }, + { + "epoch": 1.5768788330508532, + "grad_norm": 0.5436626228492178, + "learning_rate": 2.2569399822278425e-05, + "loss": 11.8588, + "step": 28958 + }, + { + "epoch": 1.5769332870474362, + "grad_norm": 0.631549265387338, + "learning_rate": 2.2563819829412647e-05, + "loss": 11.8126, + "step": 28959 + }, + { + "epoch": 1.5769877410440194, + "grad_norm": 0.5676873706665952, + "learning_rate": 2.2558240438703625e-05, + "loss": 11.7445, + "step": 28960 + }, + { + "epoch": 1.5770421950406024, + "grad_norm": 0.5769554586685568, + "learning_rate": 2.2552661650194706e-05, + "loss": 11.7879, + "step": 28961 + }, + { + "epoch": 1.5770966490371854, + "grad_norm": 0.5362000100321781, + "learning_rate": 2.2547083463929242e-05, + "loss": 11.8459, + "step": 28962 + }, + { + "epoch": 1.5771511030337684, + "grad_norm": 0.5134673533249084, + "learning_rate": 2.2541505879950665e-05, + "loss": 11.7598, + "step": 28963 + }, + { + "epoch": 1.5772055570303514, + "grad_norm": 0.5357515394780799, + "learning_rate": 2.253592889830234e-05, + "loss": 11.9005, + "step": 28964 + }, + { + "epoch": 1.5772600110269344, + "grad_norm": 0.524983362575471, + "learning_rate": 2.2530352519027596e-05, + "loss": 11.8162, + "step": 28965 + }, + { + "epoch": 1.5773144650235174, + "grad_norm": 0.6154698033924738, + "learning_rate": 2.252477674216986e-05, + "loss": 11.8156, + "step": 28966 + }, + { + "epoch": 1.5773689190201003, + "grad_norm": 0.5504062250391509, + "learning_rate": 2.2519201567772418e-05, + "loss": 11.6462, + "step": 28967 + }, + { + "epoch": 1.5774233730166833, + "grad_norm": 0.5430983524423318, + "learning_rate": 2.2513626995878688e-05, + "loss": 11.7889, + "step": 28968 + }, + { + "epoch": 1.5774778270132663, + "grad_norm": 0.5188498487490496, + "learning_rate": 2.2508053026531962e-05, + "loss": 11.7778, + "step": 28969 + }, + { + "epoch": 1.5775322810098493, + "grad_norm": 0.5121725059802743, + "learning_rate": 2.2502479659775632e-05, + "loss": 11.866, + "step": 28970 + }, + { + "epoch": 1.5775867350064323, + "grad_norm": 0.5290015315785176, + "learning_rate": 2.2496906895653026e-05, + "loss": 11.7747, + "step": 28971 + }, + { + "epoch": 1.5776411890030153, + "grad_norm": 0.6569043931659422, + "learning_rate": 2.2491334734207436e-05, + "loss": 11.9028, + "step": 28972 + }, + { + "epoch": 1.5776956429995983, + "grad_norm": 0.5543234105763295, + "learning_rate": 2.2485763175482255e-05, + "loss": 11.7604, + "step": 28973 + }, + { + "epoch": 1.5777500969961813, + "grad_norm": 0.49838727870874877, + "learning_rate": 2.2480192219520745e-05, + "loss": 11.7692, + "step": 28974 + }, + { + "epoch": 1.5778045509927643, + "grad_norm": 0.5135857097967986, + "learning_rate": 2.2474621866366265e-05, + "loss": 11.746, + "step": 28975 + }, + { + "epoch": 1.5778590049893473, + "grad_norm": 0.5333201044484813, + "learning_rate": 2.2469052116062162e-05, + "loss": 11.7192, + "step": 28976 + }, + { + "epoch": 1.5779134589859305, + "grad_norm": 0.529647847874188, + "learning_rate": 2.2463482968651673e-05, + "loss": 11.693, + "step": 28977 + }, + { + "epoch": 1.5779679129825135, + "grad_norm": 0.5440209011817407, + "learning_rate": 2.245791442417817e-05, + "loss": 11.7874, + "step": 28978 + }, + { + "epoch": 1.5780223669790965, + "grad_norm": 0.6376308003114978, + "learning_rate": 2.2452346482684904e-05, + "loss": 11.7973, + "step": 28979 + }, + { + "epoch": 1.5780768209756795, + "grad_norm": 0.5519943422912124, + "learning_rate": 2.2446779144215226e-05, + "loss": 11.8261, + "step": 28980 + }, + { + "epoch": 1.5781312749722625, + "grad_norm": 0.5295756531059973, + "learning_rate": 2.2441212408812406e-05, + "loss": 11.8154, + "step": 28981 + }, + { + "epoch": 1.5781857289688455, + "grad_norm": 0.6778275703681259, + "learning_rate": 2.2435646276519684e-05, + "loss": 11.8203, + "step": 28982 + }, + { + "epoch": 1.5782401829654287, + "grad_norm": 0.6029728680115989, + "learning_rate": 2.243008074738042e-05, + "loss": 11.6193, + "step": 28983 + }, + { + "epoch": 1.5782946369620117, + "grad_norm": 0.5648586442144191, + "learning_rate": 2.242451582143782e-05, + "loss": 11.8152, + "step": 28984 + }, + { + "epoch": 1.5783490909585947, + "grad_norm": 0.5750749028367127, + "learning_rate": 2.241895149873523e-05, + "loss": 11.8914, + "step": 28985 + }, + { + "epoch": 1.5784035449551777, + "grad_norm": 0.5815848033836704, + "learning_rate": 2.2413387779315854e-05, + "loss": 11.8193, + "step": 28986 + }, + { + "epoch": 1.5784579989517606, + "grad_norm": 0.5911850440102399, + "learning_rate": 2.2407824663222988e-05, + "loss": 11.9349, + "step": 28987 + }, + { + "epoch": 1.5785124529483436, + "grad_norm": 0.5667578798371958, + "learning_rate": 2.240226215049992e-05, + "loss": 11.8732, + "step": 28988 + }, + { + "epoch": 1.5785669069449266, + "grad_norm": 0.5261334268084475, + "learning_rate": 2.239670024118984e-05, + "loss": 11.7511, + "step": 28989 + }, + { + "epoch": 1.5786213609415096, + "grad_norm": 0.5301617304382541, + "learning_rate": 2.2391138935336065e-05, + "loss": 11.9436, + "step": 28990 + }, + { + "epoch": 1.5786758149380926, + "grad_norm": 0.6135552732462849, + "learning_rate": 2.2385578232981807e-05, + "loss": 12.0065, + "step": 28991 + }, + { + "epoch": 1.5787302689346756, + "grad_norm": 0.5642411080239568, + "learning_rate": 2.2380018134170278e-05, + "loss": 11.8878, + "step": 28992 + }, + { + "epoch": 1.5787847229312586, + "grad_norm": 0.5857130422423739, + "learning_rate": 2.2374458638944775e-05, + "loss": 11.8529, + "step": 28993 + }, + { + "epoch": 1.5788391769278416, + "grad_norm": 0.6379855297922655, + "learning_rate": 2.236889974734847e-05, + "loss": 11.9326, + "step": 28994 + }, + { + "epoch": 1.5788936309244246, + "grad_norm": 0.5219813063400816, + "learning_rate": 2.236334145942465e-05, + "loss": 11.7913, + "step": 28995 + }, + { + "epoch": 1.5789480849210076, + "grad_norm": 0.5256263252262056, + "learning_rate": 2.2357783775216478e-05, + "loss": 11.7567, + "step": 28996 + }, + { + "epoch": 1.5790025389175906, + "grad_norm": 0.6304368816838619, + "learning_rate": 2.2352226694767198e-05, + "loss": 11.8917, + "step": 28997 + }, + { + "epoch": 1.5790569929141736, + "grad_norm": 0.5439879866079371, + "learning_rate": 2.234667021812006e-05, + "loss": 11.8921, + "step": 28998 + }, + { + "epoch": 1.5791114469107566, + "grad_norm": 0.5229767730065559, + "learning_rate": 2.2341114345318192e-05, + "loss": 11.7656, + "step": 28999 + }, + { + "epoch": 1.5791659009073398, + "grad_norm": 0.5446884088217604, + "learning_rate": 2.233555907640491e-05, + "loss": 11.8311, + "step": 29000 + }, + { + "epoch": 1.5792203549039228, + "grad_norm": 0.506451489020867, + "learning_rate": 2.2330004411423288e-05, + "loss": 11.6914, + "step": 29001 + }, + { + "epoch": 1.5792748089005058, + "grad_norm": 0.5656469671271962, + "learning_rate": 2.232445035041658e-05, + "loss": 11.9168, + "step": 29002 + }, + { + "epoch": 1.5793292628970887, + "grad_norm": 0.6068004920605006, + "learning_rate": 2.2318896893427998e-05, + "loss": 11.7953, + "step": 29003 + }, + { + "epoch": 1.5793837168936717, + "grad_norm": 0.6278159447890614, + "learning_rate": 2.2313344040500673e-05, + "loss": 11.9229, + "step": 29004 + }, + { + "epoch": 1.5794381708902547, + "grad_norm": 0.5440617499970428, + "learning_rate": 2.2307791791677845e-05, + "loss": 11.7915, + "step": 29005 + }, + { + "epoch": 1.579492624886838, + "grad_norm": 0.781416627615001, + "learning_rate": 2.2302240147002616e-05, + "loss": 11.7311, + "step": 29006 + }, + { + "epoch": 1.579547078883421, + "grad_norm": 0.5246497992509671, + "learning_rate": 2.229668910651824e-05, + "loss": 11.6722, + "step": 29007 + }, + { + "epoch": 1.579601532880004, + "grad_norm": 0.5731466136407526, + "learning_rate": 2.2291138670267808e-05, + "loss": 11.9134, + "step": 29008 + }, + { + "epoch": 1.579655986876587, + "grad_norm": 0.5130475726398709, + "learning_rate": 2.2285588838294514e-05, + "loss": 11.8347, + "step": 29009 + }, + { + "epoch": 1.57971044087317, + "grad_norm": 0.5376548533099087, + "learning_rate": 2.2280039610641556e-05, + "loss": 11.7917, + "step": 29010 + }, + { + "epoch": 1.579764894869753, + "grad_norm": 0.6086318740399723, + "learning_rate": 2.2274490987352003e-05, + "loss": 11.901, + "step": 29011 + }, + { + "epoch": 1.579819348866336, + "grad_norm": 0.6488987762219234, + "learning_rate": 2.2268942968469043e-05, + "loss": 11.8067, + "step": 29012 + }, + { + "epoch": 1.579873802862919, + "grad_norm": 0.549697596344946, + "learning_rate": 2.226339555403584e-05, + "loss": 11.9265, + "step": 29013 + }, + { + "epoch": 1.579928256859502, + "grad_norm": 0.4918710492757763, + "learning_rate": 2.225784874409549e-05, + "loss": 11.8429, + "step": 29014 + }, + { + "epoch": 1.5799827108560849, + "grad_norm": 0.5244451401497985, + "learning_rate": 2.225230253869116e-05, + "loss": 11.7575, + "step": 29015 + }, + { + "epoch": 1.5800371648526679, + "grad_norm": 0.5493826641505074, + "learning_rate": 2.2246756937865943e-05, + "loss": 11.7747, + "step": 29016 + }, + { + "epoch": 1.5800916188492509, + "grad_norm": 0.5741686517478704, + "learning_rate": 2.224121194166301e-05, + "loss": 11.9342, + "step": 29017 + }, + { + "epoch": 1.5801460728458339, + "grad_norm": 0.5597580542195663, + "learning_rate": 2.2235667550125427e-05, + "loss": 11.8798, + "step": 29018 + }, + { + "epoch": 1.5802005268424169, + "grad_norm": 0.5389378037250077, + "learning_rate": 2.223012376329633e-05, + "loss": 11.8639, + "step": 29019 + }, + { + "epoch": 1.5802549808389998, + "grad_norm": 0.5279581187588729, + "learning_rate": 2.222458058121889e-05, + "loss": 11.712, + "step": 29020 + }, + { + "epoch": 1.5803094348355828, + "grad_norm": 0.5616077014528822, + "learning_rate": 2.221903800393611e-05, + "loss": 11.9719, + "step": 29021 + }, + { + "epoch": 1.5803638888321658, + "grad_norm": 0.5172516190327713, + "learning_rate": 2.2213496031491142e-05, + "loss": 11.7983, + "step": 29022 + }, + { + "epoch": 1.5804183428287488, + "grad_norm": 0.5774367667137277, + "learning_rate": 2.2207954663927066e-05, + "loss": 11.8706, + "step": 29023 + }, + { + "epoch": 1.580472796825332, + "grad_norm": 0.5558358176704967, + "learning_rate": 2.2202413901286968e-05, + "loss": 11.8833, + "step": 29024 + }, + { + "epoch": 1.580527250821915, + "grad_norm": 0.5207789713979787, + "learning_rate": 2.219687374361398e-05, + "loss": 11.7573, + "step": 29025 + }, + { + "epoch": 1.580581704818498, + "grad_norm": 0.5306067291703788, + "learning_rate": 2.2191334190951118e-05, + "loss": 11.8661, + "step": 29026 + }, + { + "epoch": 1.580636158815081, + "grad_norm": 0.5335870808746337, + "learning_rate": 2.218579524334151e-05, + "loss": 11.8255, + "step": 29027 + }, + { + "epoch": 1.580690612811664, + "grad_norm": 0.518464436909379, + "learning_rate": 2.218025690082819e-05, + "loss": 11.7333, + "step": 29028 + }, + { + "epoch": 1.5807450668082472, + "grad_norm": 0.5065462281628947, + "learning_rate": 2.217471916345427e-05, + "loss": 11.8099, + "step": 29029 + }, + { + "epoch": 1.5807995208048302, + "grad_norm": 0.5028103002169194, + "learning_rate": 2.2169182031262782e-05, + "loss": 11.7827, + "step": 29030 + }, + { + "epoch": 1.5808539748014132, + "grad_norm": 0.4958842347440862, + "learning_rate": 2.216364550429676e-05, + "loss": 11.8149, + "step": 29031 + }, + { + "epoch": 1.5809084287979962, + "grad_norm": 0.6093113753707403, + "learning_rate": 2.2158109582599305e-05, + "loss": 11.7194, + "step": 29032 + }, + { + "epoch": 1.5809628827945792, + "grad_norm": 0.5681429641056872, + "learning_rate": 2.2152574266213434e-05, + "loss": 11.9397, + "step": 29033 + }, + { + "epoch": 1.5810173367911622, + "grad_norm": 0.6107264754564226, + "learning_rate": 2.2147039555182216e-05, + "loss": 11.5987, + "step": 29034 + }, + { + "epoch": 1.5810717907877452, + "grad_norm": 0.5332209531813399, + "learning_rate": 2.214150544954865e-05, + "loss": 11.8597, + "step": 29035 + }, + { + "epoch": 1.5811262447843282, + "grad_norm": 0.5354953413126309, + "learning_rate": 2.213597194935578e-05, + "loss": 11.8647, + "step": 29036 + }, + { + "epoch": 1.5811806987809112, + "grad_norm": 0.5808049495861983, + "learning_rate": 2.21304390546467e-05, + "loss": 11.9228, + "step": 29037 + }, + { + "epoch": 1.5812351527774942, + "grad_norm": 0.5742406354612188, + "learning_rate": 2.2124906765464347e-05, + "loss": 11.8291, + "step": 29038 + }, + { + "epoch": 1.5812896067740771, + "grad_norm": 0.5044437219628712, + "learning_rate": 2.21193750818518e-05, + "loss": 11.8074, + "step": 29039 + }, + { + "epoch": 1.5813440607706601, + "grad_norm": 0.4933525140210394, + "learning_rate": 2.2113844003852057e-05, + "loss": 11.8077, + "step": 29040 + }, + { + "epoch": 1.5813985147672431, + "grad_norm": 0.5170722586985054, + "learning_rate": 2.2108313531508108e-05, + "loss": 11.7912, + "step": 29041 + }, + { + "epoch": 1.5814529687638261, + "grad_norm": 0.54294628014179, + "learning_rate": 2.2102783664862992e-05, + "loss": 11.812, + "step": 29042 + }, + { + "epoch": 1.5815074227604091, + "grad_norm": 0.5359145055436966, + "learning_rate": 2.2097254403959666e-05, + "loss": 11.7214, + "step": 29043 + }, + { + "epoch": 1.5815618767569921, + "grad_norm": 0.5144911463018499, + "learning_rate": 2.2091725748841187e-05, + "loss": 11.6378, + "step": 29044 + }, + { + "epoch": 1.581616330753575, + "grad_norm": 0.5511888396833232, + "learning_rate": 2.208619769955048e-05, + "loss": 11.8841, + "step": 29045 + }, + { + "epoch": 1.581670784750158, + "grad_norm": 0.5605769639697662, + "learning_rate": 2.2080670256130564e-05, + "loss": 11.7991, + "step": 29046 + }, + { + "epoch": 1.5817252387467413, + "grad_norm": 0.5613098356939531, + "learning_rate": 2.2075143418624454e-05, + "loss": 11.6674, + "step": 29047 + }, + { + "epoch": 1.5817796927433243, + "grad_norm": 0.5292845915904641, + "learning_rate": 2.2069617187075076e-05, + "loss": 11.6973, + "step": 29048 + }, + { + "epoch": 1.5818341467399073, + "grad_norm": 0.5340355912272015, + "learning_rate": 2.2064091561525445e-05, + "loss": 11.7223, + "step": 29049 + }, + { + "epoch": 1.5818886007364903, + "grad_norm": 0.5426519904518304, + "learning_rate": 2.205856654201851e-05, + "loss": 11.8386, + "step": 29050 + }, + { + "epoch": 1.5819430547330733, + "grad_norm": 0.5843536996058114, + "learning_rate": 2.20530421285972e-05, + "loss": 11.7543, + "step": 29051 + }, + { + "epoch": 1.5819975087296563, + "grad_norm": 0.5281374525024105, + "learning_rate": 2.2047518321304538e-05, + "loss": 11.8029, + "step": 29052 + }, + { + "epoch": 1.5820519627262395, + "grad_norm": 0.5468818828836998, + "learning_rate": 2.2041995120183424e-05, + "loss": 11.8517, + "step": 29053 + }, + { + "epoch": 1.5821064167228225, + "grad_norm": 0.5696897498571843, + "learning_rate": 2.2036472525276852e-05, + "loss": 11.7462, + "step": 29054 + }, + { + "epoch": 1.5821608707194055, + "grad_norm": 0.548390241746292, + "learning_rate": 2.2030950536627715e-05, + "loss": 11.9156, + "step": 29055 + }, + { + "epoch": 1.5822153247159885, + "grad_norm": 0.5841769023178146, + "learning_rate": 2.2025429154279008e-05, + "loss": 11.7717, + "step": 29056 + }, + { + "epoch": 1.5822697787125715, + "grad_norm": 0.536544838418208, + "learning_rate": 2.2019908378273612e-05, + "loss": 11.8411, + "step": 29057 + }, + { + "epoch": 1.5823242327091545, + "grad_norm": 0.546668705865174, + "learning_rate": 2.2014388208654492e-05, + "loss": 11.7562, + "step": 29058 + }, + { + "epoch": 1.5823786867057374, + "grad_norm": 0.5637228763955602, + "learning_rate": 2.2008868645464586e-05, + "loss": 11.7531, + "step": 29059 + }, + { + "epoch": 1.5824331407023204, + "grad_norm": 0.5349452483227524, + "learning_rate": 2.2003349688746776e-05, + "loss": 11.8587, + "step": 29060 + }, + { + "epoch": 1.5824875946989034, + "grad_norm": 0.5177356234154459, + "learning_rate": 2.199783133854403e-05, + "loss": 11.7372, + "step": 29061 + }, + { + "epoch": 1.5825420486954864, + "grad_norm": 0.5816474828550473, + "learning_rate": 2.1992313594899216e-05, + "loss": 11.8017, + "step": 29062 + }, + { + "epoch": 1.5825965026920694, + "grad_norm": 0.5381057753374475, + "learning_rate": 2.198679645785524e-05, + "loss": 11.8486, + "step": 29063 + }, + { + "epoch": 1.5826509566886524, + "grad_norm": 0.548465577977049, + "learning_rate": 2.1981279927455034e-05, + "loss": 11.8801, + "step": 29064 + }, + { + "epoch": 1.5827054106852354, + "grad_norm": 0.5347820313151733, + "learning_rate": 2.1975764003741462e-05, + "loss": 11.8356, + "step": 29065 + }, + { + "epoch": 1.5827598646818184, + "grad_norm": 0.5559862770546784, + "learning_rate": 2.1970248686757454e-05, + "loss": 11.8921, + "step": 29066 + }, + { + "epoch": 1.5828143186784014, + "grad_norm": 0.6062493227172571, + "learning_rate": 2.1964733976545847e-05, + "loss": 11.7938, + "step": 29067 + }, + { + "epoch": 1.5828687726749844, + "grad_norm": 0.5570617421374567, + "learning_rate": 2.195921987314956e-05, + "loss": 11.8624, + "step": 29068 + }, + { + "epoch": 1.5829232266715674, + "grad_norm": 0.5493201048040416, + "learning_rate": 2.1953706376611495e-05, + "loss": 11.853, + "step": 29069 + }, + { + "epoch": 1.5829776806681506, + "grad_norm": 0.6152634032043247, + "learning_rate": 2.1948193486974466e-05, + "loss": 11.793, + "step": 29070 + }, + { + "epoch": 1.5830321346647336, + "grad_norm": 0.5396840445424315, + "learning_rate": 2.1942681204281433e-05, + "loss": 11.9135, + "step": 29071 + }, + { + "epoch": 1.5830865886613166, + "grad_norm": 0.5363726241797955, + "learning_rate": 2.193716952857515e-05, + "loss": 11.81, + "step": 29072 + }, + { + "epoch": 1.5831410426578996, + "grad_norm": 0.5262139466133963, + "learning_rate": 2.1931658459898518e-05, + "loss": 11.844, + "step": 29073 + }, + { + "epoch": 1.5831954966544826, + "grad_norm": 0.5416015297460568, + "learning_rate": 2.1926147998294433e-05, + "loss": 11.7217, + "step": 29074 + }, + { + "epoch": 1.5832499506510656, + "grad_norm": 0.6406901360328437, + "learning_rate": 2.1920638143805695e-05, + "loss": 11.9512, + "step": 29075 + }, + { + "epoch": 1.5833044046476488, + "grad_norm": 0.58056937267003, + "learning_rate": 2.1915128896475188e-05, + "loss": 11.7572, + "step": 29076 + }, + { + "epoch": 1.5833588586442318, + "grad_norm": 0.5641658998258171, + "learning_rate": 2.1909620256345708e-05, + "loss": 11.7556, + "step": 29077 + }, + { + "epoch": 1.5834133126408148, + "grad_norm": 0.47841146649147465, + "learning_rate": 2.1904112223460138e-05, + "loss": 11.7228, + "step": 29078 + }, + { + "epoch": 1.5834677666373977, + "grad_norm": 0.5374175110604116, + "learning_rate": 2.1898604797861267e-05, + "loss": 11.7036, + "step": 29079 + }, + { + "epoch": 1.5835222206339807, + "grad_norm": 0.5960610157409361, + "learning_rate": 2.1893097979591937e-05, + "loss": 11.7403, + "step": 29080 + }, + { + "epoch": 1.5835766746305637, + "grad_norm": 0.5297496147641881, + "learning_rate": 2.1887591768695036e-05, + "loss": 11.7162, + "step": 29081 + }, + { + "epoch": 1.5836311286271467, + "grad_norm": 0.5736564245092398, + "learning_rate": 2.1882086165213268e-05, + "loss": 11.8737, + "step": 29082 + }, + { + "epoch": 1.5836855826237297, + "grad_norm": 0.5862986527536662, + "learning_rate": 2.1876581169189527e-05, + "loss": 11.7519, + "step": 29083 + }, + { + "epoch": 1.5837400366203127, + "grad_norm": 0.5418749097848605, + "learning_rate": 2.1871076780666556e-05, + "loss": 11.8964, + "step": 29084 + }, + { + "epoch": 1.5837944906168957, + "grad_norm": 0.4936289500691851, + "learning_rate": 2.18655729996872e-05, + "loss": 11.8897, + "step": 29085 + }, + { + "epoch": 1.5838489446134787, + "grad_norm": 0.5013235215036588, + "learning_rate": 2.186006982629427e-05, + "loss": 11.8063, + "step": 29086 + }, + { + "epoch": 1.5839033986100617, + "grad_norm": 0.5099163128588902, + "learning_rate": 2.185456726053052e-05, + "loss": 11.8442, + "step": 29087 + }, + { + "epoch": 1.5839578526066447, + "grad_norm": 0.5917029892828737, + "learning_rate": 2.1849065302438797e-05, + "loss": 11.8341, + "step": 29088 + }, + { + "epoch": 1.5840123066032277, + "grad_norm": 0.5021925925260688, + "learning_rate": 2.1843563952061808e-05, + "loss": 11.8131, + "step": 29089 + }, + { + "epoch": 1.5840667605998107, + "grad_norm": 0.497188094143562, + "learning_rate": 2.1838063209442407e-05, + "loss": 11.8229, + "step": 29090 + }, + { + "epoch": 1.5841212145963937, + "grad_norm": 0.626381462049018, + "learning_rate": 2.1832563074623335e-05, + "loss": 11.8201, + "step": 29091 + }, + { + "epoch": 1.5841756685929766, + "grad_norm": 0.5784104874652187, + "learning_rate": 2.182706354764733e-05, + "loss": 11.9355, + "step": 29092 + }, + { + "epoch": 1.5842301225895596, + "grad_norm": 0.5532987423111362, + "learning_rate": 2.182156462855721e-05, + "loss": 11.7749, + "step": 29093 + }, + { + "epoch": 1.5842845765861429, + "grad_norm": 0.5022861028858535, + "learning_rate": 2.18160663173957e-05, + "loss": 11.7734, + "step": 29094 + }, + { + "epoch": 1.5843390305827258, + "grad_norm": 0.5617752136216879, + "learning_rate": 2.1810568614205562e-05, + "loss": 11.8503, + "step": 29095 + }, + { + "epoch": 1.5843934845793088, + "grad_norm": 0.5429460549899313, + "learning_rate": 2.1805071519029586e-05, + "loss": 11.8458, + "step": 29096 + }, + { + "epoch": 1.5844479385758918, + "grad_norm": 0.5316821255156369, + "learning_rate": 2.1799575031910447e-05, + "loss": 11.8789, + "step": 29097 + }, + { + "epoch": 1.5845023925724748, + "grad_norm": 0.5462723679800302, + "learning_rate": 2.1794079152890966e-05, + "loss": 11.8165, + "step": 29098 + }, + { + "epoch": 1.584556846569058, + "grad_norm": 0.5281100326559773, + "learning_rate": 2.1788583882013812e-05, + "loss": 11.7381, + "step": 29099 + }, + { + "epoch": 1.584611300565641, + "grad_norm": 0.5652817255406261, + "learning_rate": 2.178308921932177e-05, + "loss": 11.8184, + "step": 29100 + }, + { + "epoch": 1.584665754562224, + "grad_norm": 0.559364291169176, + "learning_rate": 2.1777595164857544e-05, + "loss": 11.77, + "step": 29101 + }, + { + "epoch": 1.584720208558807, + "grad_norm": 0.5517743434517466, + "learning_rate": 2.1772101718663827e-05, + "loss": 11.7682, + "step": 29102 + }, + { + "epoch": 1.58477466255539, + "grad_norm": 0.6718982106305822, + "learning_rate": 2.17666088807834e-05, + "loss": 11.8081, + "step": 29103 + }, + { + "epoch": 1.584829116551973, + "grad_norm": 0.5600334935057784, + "learning_rate": 2.1761116651258918e-05, + "loss": 11.8599, + "step": 29104 + }, + { + "epoch": 1.584883570548556, + "grad_norm": 0.5398688980994854, + "learning_rate": 2.175562503013313e-05, + "loss": 11.8, + "step": 29105 + }, + { + "epoch": 1.584938024545139, + "grad_norm": 0.520637674409149, + "learning_rate": 2.17501340174487e-05, + "loss": 11.7779, + "step": 29106 + }, + { + "epoch": 1.584992478541722, + "grad_norm": 0.6543683148255448, + "learning_rate": 2.174464361324835e-05, + "loss": 11.6661, + "step": 29107 + }, + { + "epoch": 1.585046932538305, + "grad_norm": 0.4959228769340991, + "learning_rate": 2.1739153817574796e-05, + "loss": 11.7957, + "step": 29108 + }, + { + "epoch": 1.585101386534888, + "grad_norm": 0.5685651267143339, + "learning_rate": 2.1733664630470685e-05, + "loss": 11.8713, + "step": 29109 + }, + { + "epoch": 1.585155840531471, + "grad_norm": 0.5651378296195563, + "learning_rate": 2.1728176051978754e-05, + "loss": 11.9213, + "step": 29110 + }, + { + "epoch": 1.585210294528054, + "grad_norm": 0.5153589105640635, + "learning_rate": 2.1722688082141652e-05, + "loss": 11.795, + "step": 29111 + }, + { + "epoch": 1.585264748524637, + "grad_norm": 0.6036755918870512, + "learning_rate": 2.1717200721002017e-05, + "loss": 11.8628, + "step": 29112 + }, + { + "epoch": 1.58531920252122, + "grad_norm": 0.6678899420844301, + "learning_rate": 2.17117139686026e-05, + "loss": 11.8299, + "step": 29113 + }, + { + "epoch": 1.585373656517803, + "grad_norm": 0.5068025624269648, + "learning_rate": 2.170622782498598e-05, + "loss": 11.8039, + "step": 29114 + }, + { + "epoch": 1.585428110514386, + "grad_norm": 0.5893143485657308, + "learning_rate": 2.170074229019491e-05, + "loss": 11.7748, + "step": 29115 + }, + { + "epoch": 1.585482564510969, + "grad_norm": 0.5289392164569375, + "learning_rate": 2.1695257364271948e-05, + "loss": 11.7358, + "step": 29116 + }, + { + "epoch": 1.5855370185075521, + "grad_norm": 0.5227695138565449, + "learning_rate": 2.168977304725981e-05, + "loss": 11.8593, + "step": 29117 + }, + { + "epoch": 1.5855914725041351, + "grad_norm": 0.5659794584806885, + "learning_rate": 2.168428933920116e-05, + "loss": 11.9605, + "step": 29118 + }, + { + "epoch": 1.5856459265007181, + "grad_norm": 0.5498119223535392, + "learning_rate": 2.167880624013857e-05, + "loss": 11.7594, + "step": 29119 + }, + { + "epoch": 1.585700380497301, + "grad_norm": 0.5337225061114226, + "learning_rate": 2.167332375011476e-05, + "loss": 11.8674, + "step": 29120 + }, + { + "epoch": 1.585754834493884, + "grad_norm": 0.5580952724767342, + "learning_rate": 2.1667841869172313e-05, + "loss": 11.7955, + "step": 29121 + }, + { + "epoch": 1.585809288490467, + "grad_norm": 0.5462561153115113, + "learning_rate": 2.1662360597353826e-05, + "loss": 11.8734, + "step": 29122 + }, + { + "epoch": 1.5858637424870503, + "grad_norm": 0.5461428899994698, + "learning_rate": 2.1656879934702e-05, + "loss": 11.8885, + "step": 29123 + }, + { + "epoch": 1.5859181964836333, + "grad_norm": 0.5583712414235588, + "learning_rate": 2.165139988125938e-05, + "loss": 11.8426, + "step": 29124 + }, + { + "epoch": 1.5859726504802163, + "grad_norm": 0.6166205933702942, + "learning_rate": 2.1645920437068645e-05, + "loss": 11.6789, + "step": 29125 + }, + { + "epoch": 1.5860271044767993, + "grad_norm": 0.5771649355488141, + "learning_rate": 2.1640441602172347e-05, + "loss": 11.8718, + "step": 29126 + }, + { + "epoch": 1.5860815584733823, + "grad_norm": 0.5711758625196081, + "learning_rate": 2.1634963376613136e-05, + "loss": 11.7602, + "step": 29127 + }, + { + "epoch": 1.5861360124699653, + "grad_norm": 0.5728149068220495, + "learning_rate": 2.1629485760433575e-05, + "loss": 11.7312, + "step": 29128 + }, + { + "epoch": 1.5861904664665483, + "grad_norm": 0.6268439389283404, + "learning_rate": 2.1624008753676262e-05, + "loss": 11.8792, + "step": 29129 + }, + { + "epoch": 1.5862449204631313, + "grad_norm": 0.6069611203707442, + "learning_rate": 2.1618532356383835e-05, + "loss": 11.831, + "step": 29130 + }, + { + "epoch": 1.5862993744597143, + "grad_norm": 0.5386232493514849, + "learning_rate": 2.1613056568598843e-05, + "loss": 11.8296, + "step": 29131 + }, + { + "epoch": 1.5863538284562972, + "grad_norm": 0.5265429702693016, + "learning_rate": 2.1607581390363873e-05, + "loss": 11.6696, + "step": 29132 + }, + { + "epoch": 1.5864082824528802, + "grad_norm": 0.5530855200515045, + "learning_rate": 2.160210682172147e-05, + "loss": 11.7573, + "step": 29133 + }, + { + "epoch": 1.5864627364494632, + "grad_norm": 0.5583801460465538, + "learning_rate": 2.1596632862714228e-05, + "loss": 11.9413, + "step": 29134 + }, + { + "epoch": 1.5865171904460462, + "grad_norm": 0.565629203525206, + "learning_rate": 2.159115951338475e-05, + "loss": 11.8124, + "step": 29135 + }, + { + "epoch": 1.5865716444426292, + "grad_norm": 0.6122724835057962, + "learning_rate": 2.1585686773775525e-05, + "loss": 11.949, + "step": 29136 + }, + { + "epoch": 1.5866260984392122, + "grad_norm": 0.5596665136297907, + "learning_rate": 2.1580214643929187e-05, + "loss": 11.8034, + "step": 29137 + }, + { + "epoch": 1.5866805524357952, + "grad_norm": 0.5271993444831127, + "learning_rate": 2.1574743123888218e-05, + "loss": 11.7126, + "step": 29138 + }, + { + "epoch": 1.5867350064323782, + "grad_norm": 0.5547945712060374, + "learning_rate": 2.1569272213695236e-05, + "loss": 11.8732, + "step": 29139 + }, + { + "epoch": 1.5867894604289614, + "grad_norm": 0.5334795982773989, + "learning_rate": 2.156380191339271e-05, + "loss": 11.7308, + "step": 29140 + }, + { + "epoch": 1.5868439144255444, + "grad_norm": 0.5445794762733351, + "learning_rate": 2.1558332223023247e-05, + "loss": 11.6873, + "step": 29141 + }, + { + "epoch": 1.5868983684221274, + "grad_norm": 0.5507707318790223, + "learning_rate": 2.1552863142629344e-05, + "loss": 11.7586, + "step": 29142 + }, + { + "epoch": 1.5869528224187104, + "grad_norm": 0.5147203627377872, + "learning_rate": 2.1547394672253496e-05, + "loss": 11.765, + "step": 29143 + }, + { + "epoch": 1.5870072764152934, + "grad_norm": 0.7584100790072255, + "learning_rate": 2.1541926811938274e-05, + "loss": 11.6893, + "step": 29144 + }, + { + "epoch": 1.5870617304118764, + "grad_norm": 0.5401167545297461, + "learning_rate": 2.153645956172622e-05, + "loss": 11.7704, + "step": 29145 + }, + { + "epoch": 1.5871161844084596, + "grad_norm": 0.558777060546608, + "learning_rate": 2.1530992921659775e-05, + "loss": 11.7655, + "step": 29146 + }, + { + "epoch": 1.5871706384050426, + "grad_norm": 0.49229068922422564, + "learning_rate": 2.1525526891781522e-05, + "loss": 11.7972, + "step": 29147 + }, + { + "epoch": 1.5872250924016256, + "grad_norm": 0.5533981064329909, + "learning_rate": 2.1520061472133902e-05, + "loss": 11.8416, + "step": 29148 + }, + { + "epoch": 1.5872795463982086, + "grad_norm": 0.5013605542073992, + "learning_rate": 2.1514596662759467e-05, + "loss": 11.8212, + "step": 29149 + }, + { + "epoch": 1.5873340003947916, + "grad_norm": 0.5900255638176347, + "learning_rate": 2.1509132463700677e-05, + "loss": 11.6981, + "step": 29150 + }, + { + "epoch": 1.5873884543913745, + "grad_norm": 0.5330474399920727, + "learning_rate": 2.150366887500005e-05, + "loss": 11.8208, + "step": 29151 + }, + { + "epoch": 1.5874429083879575, + "grad_norm": 0.5134170634657004, + "learning_rate": 2.1498205896700063e-05, + "loss": 11.6273, + "step": 29152 + }, + { + "epoch": 1.5874973623845405, + "grad_norm": 0.5683029043787393, + "learning_rate": 2.1492743528843173e-05, + "loss": 11.8019, + "step": 29153 + }, + { + "epoch": 1.5875518163811235, + "grad_norm": 0.5519986747922824, + "learning_rate": 2.148728177147189e-05, + "loss": 11.6967, + "step": 29154 + }, + { + "epoch": 1.5876062703777065, + "grad_norm": 0.5351536096407199, + "learning_rate": 2.1481820624628644e-05, + "loss": 11.789, + "step": 29155 + }, + { + "epoch": 1.5876607243742895, + "grad_norm": 0.514282314891165, + "learning_rate": 2.1476360088355928e-05, + "loss": 11.8068, + "step": 29156 + }, + { + "epoch": 1.5877151783708725, + "grad_norm": 0.5509461107819521, + "learning_rate": 2.147090016269624e-05, + "loss": 11.8408, + "step": 29157 + }, + { + "epoch": 1.5877696323674555, + "grad_norm": 0.5855389748059536, + "learning_rate": 2.1465440847691975e-05, + "loss": 11.8164, + "step": 29158 + }, + { + "epoch": 1.5878240863640385, + "grad_norm": 0.5449832269316832, + "learning_rate": 2.1459982143385627e-05, + "loss": 11.8003, + "step": 29159 + }, + { + "epoch": 1.5878785403606215, + "grad_norm": 0.5963238684244109, + "learning_rate": 2.1454524049819613e-05, + "loss": 11.9584, + "step": 29160 + }, + { + "epoch": 1.5879329943572045, + "grad_norm": 0.5565491697938304, + "learning_rate": 2.1449066567036413e-05, + "loss": 11.7727, + "step": 29161 + }, + { + "epoch": 1.5879874483537875, + "grad_norm": 0.5219938308345949, + "learning_rate": 2.144360969507845e-05, + "loss": 11.875, + "step": 29162 + }, + { + "epoch": 1.5880419023503707, + "grad_norm": 0.5408579161222582, + "learning_rate": 2.1438153433988117e-05, + "loss": 11.8056, + "step": 29163 + }, + { + "epoch": 1.5880963563469537, + "grad_norm": 0.4886017432595483, + "learning_rate": 2.143269778380791e-05, + "loss": 11.8131, + "step": 29164 + }, + { + "epoch": 1.5881508103435367, + "grad_norm": 0.5771394631036637, + "learning_rate": 2.14272427445802e-05, + "loss": 11.8153, + "step": 29165 + }, + { + "epoch": 1.5882052643401197, + "grad_norm": 0.5217238601603127, + "learning_rate": 2.1421788316347415e-05, + "loss": 11.819, + "step": 29166 + }, + { + "epoch": 1.5882597183367027, + "grad_norm": 0.5630323168814597, + "learning_rate": 2.1416334499152013e-05, + "loss": 11.842, + "step": 29167 + }, + { + "epoch": 1.5883141723332856, + "grad_norm": 0.5376945597360916, + "learning_rate": 2.1410881293036344e-05, + "loss": 11.7791, + "step": 29168 + }, + { + "epoch": 1.5883686263298689, + "grad_norm": 0.5248237679813952, + "learning_rate": 2.1405428698042874e-05, + "loss": 11.7955, + "step": 29169 + }, + { + "epoch": 1.5884230803264519, + "grad_norm": 0.5436701563811874, + "learning_rate": 2.1399976714213942e-05, + "loss": 11.8776, + "step": 29170 + }, + { + "epoch": 1.5884775343230348, + "grad_norm": 0.5128821896212716, + "learning_rate": 2.1394525341591997e-05, + "loss": 11.8182, + "step": 29171 + }, + { + "epoch": 1.5885319883196178, + "grad_norm": 0.4992722527555482, + "learning_rate": 2.1389074580219402e-05, + "loss": 11.7359, + "step": 29172 + }, + { + "epoch": 1.5885864423162008, + "grad_norm": 0.5467252671302966, + "learning_rate": 2.138362443013853e-05, + "loss": 11.8906, + "step": 29173 + }, + { + "epoch": 1.5886408963127838, + "grad_norm": 0.5843163581667822, + "learning_rate": 2.137817489139179e-05, + "loss": 11.8127, + "step": 29174 + }, + { + "epoch": 1.5886953503093668, + "grad_norm": 0.629587246009668, + "learning_rate": 2.1372725964021534e-05, + "loss": 11.9161, + "step": 29175 + }, + { + "epoch": 1.5887498043059498, + "grad_norm": 0.5397705353121288, + "learning_rate": 2.136727764807016e-05, + "loss": 11.8118, + "step": 29176 + }, + { + "epoch": 1.5888042583025328, + "grad_norm": 0.540204443399126, + "learning_rate": 2.1361829943580004e-05, + "loss": 11.8334, + "step": 29177 + }, + { + "epoch": 1.5888587122991158, + "grad_norm": 0.6136391249747147, + "learning_rate": 2.135638285059345e-05, + "loss": 11.7844, + "step": 29178 + }, + { + "epoch": 1.5889131662956988, + "grad_norm": 0.5118044243007966, + "learning_rate": 2.1350936369152873e-05, + "loss": 11.9372, + "step": 29179 + }, + { + "epoch": 1.5889676202922818, + "grad_norm": 0.5571087194325227, + "learning_rate": 2.134549049930058e-05, + "loss": 11.773, + "step": 29180 + }, + { + "epoch": 1.5890220742888648, + "grad_norm": 0.5520172025077966, + "learning_rate": 2.134004524107899e-05, + "loss": 11.8087, + "step": 29181 + }, + { + "epoch": 1.5890765282854478, + "grad_norm": 0.5189616400894338, + "learning_rate": 2.1334600594530353e-05, + "loss": 11.8258, + "step": 29182 + }, + { + "epoch": 1.5891309822820308, + "grad_norm": 0.5060035468310032, + "learning_rate": 2.132915655969705e-05, + "loss": 11.8142, + "step": 29183 + }, + { + "epoch": 1.5891854362786138, + "grad_norm": 0.5444357482948762, + "learning_rate": 2.1323713136621447e-05, + "loss": 11.9714, + "step": 29184 + }, + { + "epoch": 1.5892398902751967, + "grad_norm": 0.5036453223751051, + "learning_rate": 2.1318270325345813e-05, + "loss": 11.7883, + "step": 29185 + }, + { + "epoch": 1.5892943442717797, + "grad_norm": 0.5272278688885228, + "learning_rate": 2.1312828125912542e-05, + "loss": 11.7632, + "step": 29186 + }, + { + "epoch": 1.589348798268363, + "grad_norm": 0.5516678752768197, + "learning_rate": 2.1307386538363872e-05, + "loss": 11.9416, + "step": 29187 + }, + { + "epoch": 1.589403252264946, + "grad_norm": 0.5390981805405269, + "learning_rate": 2.1301945562742198e-05, + "loss": 11.8479, + "step": 29188 + }, + { + "epoch": 1.589457706261529, + "grad_norm": 0.5455874018990825, + "learning_rate": 2.1296505199089767e-05, + "loss": 11.745, + "step": 29189 + }, + { + "epoch": 1.589512160258112, + "grad_norm": 0.5358756695688616, + "learning_rate": 2.129106544744891e-05, + "loss": 11.9746, + "step": 29190 + }, + { + "epoch": 1.589566614254695, + "grad_norm": 0.5572858884820283, + "learning_rate": 2.1285626307861985e-05, + "loss": 11.684, + "step": 29191 + }, + { + "epoch": 1.589621068251278, + "grad_norm": 0.5749974252294444, + "learning_rate": 2.1280187780371164e-05, + "loss": 11.7451, + "step": 29192 + }, + { + "epoch": 1.5896755222478611, + "grad_norm": 0.6094768452017864, + "learning_rate": 2.1274749865018817e-05, + "loss": 11.784, + "step": 29193 + }, + { + "epoch": 1.5897299762444441, + "grad_norm": 0.5155799518670798, + "learning_rate": 2.1269312561847243e-05, + "loss": 11.8573, + "step": 29194 + }, + { + "epoch": 1.5897844302410271, + "grad_norm": 0.554418934830859, + "learning_rate": 2.1263875870898663e-05, + "loss": 11.8116, + "step": 29195 + }, + { + "epoch": 1.58983888423761, + "grad_norm": 0.5677755816917791, + "learning_rate": 2.1258439792215424e-05, + "loss": 11.9431, + "step": 29196 + }, + { + "epoch": 1.589893338234193, + "grad_norm": 0.5491318303962497, + "learning_rate": 2.125300432583972e-05, + "loss": 11.8706, + "step": 29197 + }, + { + "epoch": 1.589947792230776, + "grad_norm": 0.5536616969800391, + "learning_rate": 2.12475694718139e-05, + "loss": 11.7946, + "step": 29198 + }, + { + "epoch": 1.590002246227359, + "grad_norm": 0.5421698118841684, + "learning_rate": 2.124213523018016e-05, + "loss": 11.7416, + "step": 29199 + }, + { + "epoch": 1.590056700223942, + "grad_norm": 0.5689022927176461, + "learning_rate": 2.1236701600980778e-05, + "loss": 11.9269, + "step": 29200 + }, + { + "epoch": 1.590111154220525, + "grad_norm": 0.6153413866925239, + "learning_rate": 2.1231268584258045e-05, + "loss": 11.8882, + "step": 29201 + }, + { + "epoch": 1.590165608217108, + "grad_norm": 0.5124053512110116, + "learning_rate": 2.122583618005417e-05, + "loss": 11.8088, + "step": 29202 + }, + { + "epoch": 1.590220062213691, + "grad_norm": 0.580719462557604, + "learning_rate": 2.122040438841141e-05, + "loss": 11.7163, + "step": 29203 + }, + { + "epoch": 1.590274516210274, + "grad_norm": 0.5151472244442663, + "learning_rate": 2.1214973209371968e-05, + "loss": 11.7885, + "step": 29204 + }, + { + "epoch": 1.590328970206857, + "grad_norm": 0.5076383471266717, + "learning_rate": 2.1209542642978108e-05, + "loss": 11.7529, + "step": 29205 + }, + { + "epoch": 1.59038342420344, + "grad_norm": 0.5193509259548776, + "learning_rate": 2.1204112689272084e-05, + "loss": 11.7794, + "step": 29206 + }, + { + "epoch": 1.590437878200023, + "grad_norm": 0.5828889025431375, + "learning_rate": 2.1198683348296066e-05, + "loss": 11.7939, + "step": 29207 + }, + { + "epoch": 1.590492332196606, + "grad_norm": 0.5650755250368186, + "learning_rate": 2.119325462009233e-05, + "loss": 11.8566, + "step": 29208 + }, + { + "epoch": 1.590546786193189, + "grad_norm": 0.5297085796436609, + "learning_rate": 2.1187826504703035e-05, + "loss": 11.887, + "step": 29209 + }, + { + "epoch": 1.5906012401897722, + "grad_norm": 0.5064611510433685, + "learning_rate": 2.118239900217044e-05, + "loss": 11.7344, + "step": 29210 + }, + { + "epoch": 1.5906556941863552, + "grad_norm": 0.5342162214270872, + "learning_rate": 2.1176972112536707e-05, + "loss": 11.8243, + "step": 29211 + }, + { + "epoch": 1.5907101481829382, + "grad_norm": 0.5392048206008597, + "learning_rate": 2.1171545835844074e-05, + "loss": 11.7277, + "step": 29212 + }, + { + "epoch": 1.5907646021795212, + "grad_norm": 0.5489343232386106, + "learning_rate": 2.116612017213473e-05, + "loss": 11.7642, + "step": 29213 + }, + { + "epoch": 1.5908190561761042, + "grad_norm": 0.5556368544405371, + "learning_rate": 2.1160695121450835e-05, + "loss": 11.9001, + "step": 29214 + }, + { + "epoch": 1.5908735101726872, + "grad_norm": 0.5637806875218498, + "learning_rate": 2.1155270683834583e-05, + "loss": 11.846, + "step": 29215 + }, + { + "epoch": 1.5909279641692704, + "grad_norm": 0.5675488791609382, + "learning_rate": 2.1149846859328204e-05, + "loss": 11.8661, + "step": 29216 + }, + { + "epoch": 1.5909824181658534, + "grad_norm": 0.5517703220420297, + "learning_rate": 2.1144423647973798e-05, + "loss": 11.8339, + "step": 29217 + }, + { + "epoch": 1.5910368721624364, + "grad_norm": 0.5623212189048679, + "learning_rate": 2.1139001049813623e-05, + "loss": 11.7671, + "step": 29218 + }, + { + "epoch": 1.5910913261590194, + "grad_norm": 0.49704509406640446, + "learning_rate": 2.1133579064889764e-05, + "loss": 11.8135, + "step": 29219 + }, + { + "epoch": 1.5911457801556024, + "grad_norm": 0.5886064269462271, + "learning_rate": 2.1128157693244454e-05, + "loss": 11.8631, + "step": 29220 + }, + { + "epoch": 1.5912002341521854, + "grad_norm": 0.5392727739873534, + "learning_rate": 2.1122736934919783e-05, + "loss": 11.7098, + "step": 29221 + }, + { + "epoch": 1.5912546881487684, + "grad_norm": 0.565132586191543, + "learning_rate": 2.1117316789957962e-05, + "loss": 11.8217, + "step": 29222 + }, + { + "epoch": 1.5913091421453514, + "grad_norm": 0.542112994531766, + "learning_rate": 2.1111897258401125e-05, + "loss": 11.795, + "step": 29223 + }, + { + "epoch": 1.5913635961419343, + "grad_norm": 0.5161550032683274, + "learning_rate": 2.110647834029137e-05, + "loss": 11.8477, + "step": 29224 + }, + { + "epoch": 1.5914180501385173, + "grad_norm": 0.5026830349343652, + "learning_rate": 2.1101060035670893e-05, + "loss": 11.8038, + "step": 29225 + }, + { + "epoch": 1.5914725041351003, + "grad_norm": 0.5379469062411203, + "learning_rate": 2.1095642344581778e-05, + "loss": 11.8555, + "step": 29226 + }, + { + "epoch": 1.5915269581316833, + "grad_norm": 0.5024891120704371, + "learning_rate": 2.1090225267066187e-05, + "loss": 11.6885, + "step": 29227 + }, + { + "epoch": 1.5915814121282663, + "grad_norm": 0.56469021355504, + "learning_rate": 2.1084808803166267e-05, + "loss": 11.9066, + "step": 29228 + }, + { + "epoch": 1.5916358661248493, + "grad_norm": 0.5676993194955461, + "learning_rate": 2.107939295292407e-05, + "loss": 11.7328, + "step": 29229 + }, + { + "epoch": 1.5916903201214323, + "grad_norm": 0.5052910014520902, + "learning_rate": 2.1073977716381787e-05, + "loss": 11.7752, + "step": 29230 + }, + { + "epoch": 1.5917447741180153, + "grad_norm": 0.5207248151621681, + "learning_rate": 2.106856309358145e-05, + "loss": 11.8814, + "step": 29231 + }, + { + "epoch": 1.5917992281145983, + "grad_norm": 0.5204707893107874, + "learning_rate": 2.1063149084565238e-05, + "loss": 11.8628, + "step": 29232 + }, + { + "epoch": 1.5918536821111815, + "grad_norm": 0.604036827527067, + "learning_rate": 2.1057735689375212e-05, + "loss": 11.9725, + "step": 29233 + }, + { + "epoch": 1.5919081361077645, + "grad_norm": 0.5089674645710892, + "learning_rate": 2.1052322908053457e-05, + "loss": 11.7684, + "step": 29234 + }, + { + "epoch": 1.5919625901043475, + "grad_norm": 0.5745763659233781, + "learning_rate": 2.104691074064209e-05, + "loss": 11.8978, + "step": 29235 + }, + { + "epoch": 1.5920170441009305, + "grad_norm": 0.591309404312403, + "learning_rate": 2.1041499187183167e-05, + "loss": 11.8323, + "step": 29236 + }, + { + "epoch": 1.5920714980975135, + "grad_norm": 0.5545029480301127, + "learning_rate": 2.103608824771881e-05, + "loss": 11.796, + "step": 29237 + }, + { + "epoch": 1.5921259520940965, + "grad_norm": 0.5184792550490716, + "learning_rate": 2.1030677922291054e-05, + "loss": 11.79, + "step": 29238 + }, + { + "epoch": 1.5921804060906797, + "grad_norm": 0.5539966035999682, + "learning_rate": 2.1025268210941984e-05, + "loss": 11.9322, + "step": 29239 + }, + { + "epoch": 1.5922348600872627, + "grad_norm": 0.576787261578154, + "learning_rate": 2.1019859113713702e-05, + "loss": 11.8311, + "step": 29240 + }, + { + "epoch": 1.5922893140838457, + "grad_norm": 0.5353127817297225, + "learning_rate": 2.101445063064821e-05, + "loss": 11.8213, + "step": 29241 + }, + { + "epoch": 1.5923437680804287, + "grad_norm": 0.5028518180042723, + "learning_rate": 2.1009042761787622e-05, + "loss": 11.6888, + "step": 29242 + }, + { + "epoch": 1.5923982220770116, + "grad_norm": 0.5227168638947924, + "learning_rate": 2.100363550717397e-05, + "loss": 11.7396, + "step": 29243 + }, + { + "epoch": 1.5924526760735946, + "grad_norm": 0.5207687491171965, + "learning_rate": 2.0998228866849258e-05, + "loss": 11.8756, + "step": 29244 + }, + { + "epoch": 1.5925071300701776, + "grad_norm": 0.5689433497115669, + "learning_rate": 2.0992822840855607e-05, + "loss": 11.9099, + "step": 29245 + }, + { + "epoch": 1.5925615840667606, + "grad_norm": 0.5170718065392603, + "learning_rate": 2.0987417429234978e-05, + "loss": 11.6968, + "step": 29246 + }, + { + "epoch": 1.5926160380633436, + "grad_norm": 0.7161047318327063, + "learning_rate": 2.098201263202948e-05, + "loss": 11.9096, + "step": 29247 + }, + { + "epoch": 1.5926704920599266, + "grad_norm": 0.5449667988099567, + "learning_rate": 2.0976608449281065e-05, + "loss": 11.7962, + "step": 29248 + }, + { + "epoch": 1.5927249460565096, + "grad_norm": 0.5422448402646335, + "learning_rate": 2.0971204881031804e-05, + "loss": 11.7914, + "step": 29249 + }, + { + "epoch": 1.5927794000530926, + "grad_norm": 0.5204730984461148, + "learning_rate": 2.0965801927323723e-05, + "loss": 11.8461, + "step": 29250 + }, + { + "epoch": 1.5928338540496756, + "grad_norm": 0.5911032488345888, + "learning_rate": 2.09603995881988e-05, + "loss": 11.9048, + "step": 29251 + }, + { + "epoch": 1.5928883080462586, + "grad_norm": 0.5380247928248235, + "learning_rate": 2.095499786369912e-05, + "loss": 11.8832, + "step": 29252 + }, + { + "epoch": 1.5929427620428416, + "grad_norm": 0.5702427190662462, + "learning_rate": 2.0949596753866573e-05, + "loss": 11.7932, + "step": 29253 + }, + { + "epoch": 1.5929972160394246, + "grad_norm": 0.5117243886033823, + "learning_rate": 2.094419625874322e-05, + "loss": 11.8682, + "step": 29254 + }, + { + "epoch": 1.5930516700360076, + "grad_norm": 0.562316585517797, + "learning_rate": 2.0938796378371084e-05, + "loss": 11.7828, + "step": 29255 + }, + { + "epoch": 1.5931061240325906, + "grad_norm": 0.564703827891049, + "learning_rate": 2.0933397112792097e-05, + "loss": 11.8802, + "step": 29256 + }, + { + "epoch": 1.5931605780291738, + "grad_norm": 0.5840082363291428, + "learning_rate": 2.0927998462048305e-05, + "loss": 11.8813, + "step": 29257 + }, + { + "epoch": 1.5932150320257568, + "grad_norm": 0.5169333620750146, + "learning_rate": 2.0922600426181627e-05, + "loss": 11.8397, + "step": 29258 + }, + { + "epoch": 1.5932694860223398, + "grad_norm": 0.5618592799405008, + "learning_rate": 2.09172030052341e-05, + "loss": 11.742, + "step": 29259 + }, + { + "epoch": 1.5933239400189227, + "grad_norm": 0.6014043592532105, + "learning_rate": 2.091180619924763e-05, + "loss": 11.7218, + "step": 29260 + }, + { + "epoch": 1.5933783940155057, + "grad_norm": 0.5450336197543947, + "learning_rate": 2.090641000826422e-05, + "loss": 11.8609, + "step": 29261 + }, + { + "epoch": 1.593432848012089, + "grad_norm": 0.5122395469798758, + "learning_rate": 2.0901014432325894e-05, + "loss": 11.758, + "step": 29262 + }, + { + "epoch": 1.593487302008672, + "grad_norm": 0.5099102735598482, + "learning_rate": 2.0895619471474482e-05, + "loss": 11.8422, + "step": 29263 + }, + { + "epoch": 1.593541756005255, + "grad_norm": 0.6136624409372344, + "learning_rate": 2.0890225125751996e-05, + "loss": 11.7182, + "step": 29264 + }, + { + "epoch": 1.593596210001838, + "grad_norm": 0.6155525934292949, + "learning_rate": 2.088483139520042e-05, + "loss": 11.9642, + "step": 29265 + }, + { + "epoch": 1.593650663998421, + "grad_norm": 0.5131323241283007, + "learning_rate": 2.087943827986163e-05, + "loss": 11.7876, + "step": 29266 + }, + { + "epoch": 1.593705117995004, + "grad_norm": 0.592275942631102, + "learning_rate": 2.087404577977763e-05, + "loss": 11.7975, + "step": 29267 + }, + { + "epoch": 1.593759571991587, + "grad_norm": 0.5590714359972297, + "learning_rate": 2.0868653894990286e-05, + "loss": 11.7961, + "step": 29268 + }, + { + "epoch": 1.59381402598817, + "grad_norm": 0.5687129917076879, + "learning_rate": 2.086326262554159e-05, + "loss": 11.8583, + "step": 29269 + }, + { + "epoch": 1.593868479984753, + "grad_norm": 0.5440672765535223, + "learning_rate": 2.0857871971473396e-05, + "loss": 11.7144, + "step": 29270 + }, + { + "epoch": 1.5939229339813359, + "grad_norm": 0.5096900234979592, + "learning_rate": 2.0852481932827683e-05, + "loss": 11.8566, + "step": 29271 + }, + { + "epoch": 1.5939773879779189, + "grad_norm": 0.6039441842701397, + "learning_rate": 2.0847092509646382e-05, + "loss": 11.8094, + "step": 29272 + }, + { + "epoch": 1.5940318419745019, + "grad_norm": 0.5224844081263593, + "learning_rate": 2.0841703701971317e-05, + "loss": 11.8094, + "step": 29273 + }, + { + "epoch": 1.5940862959710849, + "grad_norm": 0.5925389058157018, + "learning_rate": 2.0836315509844462e-05, + "loss": 11.8046, + "step": 29274 + }, + { + "epoch": 1.5941407499676679, + "grad_norm": 0.5479508959563254, + "learning_rate": 2.0830927933307666e-05, + "loss": 11.7138, + "step": 29275 + }, + { + "epoch": 1.5941952039642509, + "grad_norm": 0.637556042757401, + "learning_rate": 2.0825540972402858e-05, + "loss": 11.8174, + "step": 29276 + }, + { + "epoch": 1.5942496579608338, + "grad_norm": 0.5866111568680433, + "learning_rate": 2.082015462717194e-05, + "loss": 11.844, + "step": 29277 + }, + { + "epoch": 1.5943041119574168, + "grad_norm": 0.5123714898958694, + "learning_rate": 2.0814768897656754e-05, + "loss": 11.7639, + "step": 29278 + }, + { + "epoch": 1.5943585659539998, + "grad_norm": 0.6224809269400543, + "learning_rate": 2.080938378389923e-05, + "loss": 11.9527, + "step": 29279 + }, + { + "epoch": 1.594413019950583, + "grad_norm": 0.5313001344689005, + "learning_rate": 2.080399928594119e-05, + "loss": 11.8202, + "step": 29280 + }, + { + "epoch": 1.594467473947166, + "grad_norm": 0.592526489518852, + "learning_rate": 2.0798615403824562e-05, + "loss": 11.8038, + "step": 29281 + }, + { + "epoch": 1.594521927943749, + "grad_norm": 0.5396856684300272, + "learning_rate": 2.079323213759119e-05, + "loss": 11.8334, + "step": 29282 + }, + { + "epoch": 1.594576381940332, + "grad_norm": 0.5204311666070865, + "learning_rate": 2.0787849487282894e-05, + "loss": 11.8591, + "step": 29283 + }, + { + "epoch": 1.594630835936915, + "grad_norm": 0.5625748473460092, + "learning_rate": 2.078246745294159e-05, + "loss": 11.8796, + "step": 29284 + }, + { + "epoch": 1.594685289933498, + "grad_norm": 0.5504402028504651, + "learning_rate": 2.077708603460907e-05, + "loss": 11.8352, + "step": 29285 + }, + { + "epoch": 1.5947397439300812, + "grad_norm": 0.5938363247497671, + "learning_rate": 2.0771705232327253e-05, + "loss": 11.8516, + "step": 29286 + }, + { + "epoch": 1.5947941979266642, + "grad_norm": 0.5375808369991738, + "learning_rate": 2.0766325046137915e-05, + "loss": 11.7668, + "step": 29287 + }, + { + "epoch": 1.5948486519232472, + "grad_norm": 0.6255520458851344, + "learning_rate": 2.0760945476082914e-05, + "loss": 11.9198, + "step": 29288 + }, + { + "epoch": 1.5949031059198302, + "grad_norm": 0.6078822174774229, + "learning_rate": 2.075556652220413e-05, + "loss": 11.861, + "step": 29289 + }, + { + "epoch": 1.5949575599164132, + "grad_norm": 0.5727629181299997, + "learning_rate": 2.0750188184543306e-05, + "loss": 11.8628, + "step": 29290 + }, + { + "epoch": 1.5950120139129962, + "grad_norm": 0.567216252456352, + "learning_rate": 2.074481046314235e-05, + "loss": 11.7328, + "step": 29291 + }, + { + "epoch": 1.5950664679095792, + "grad_norm": 0.6022782443729904, + "learning_rate": 2.0739433358043026e-05, + "loss": 11.7829, + "step": 29292 + }, + { + "epoch": 1.5951209219061622, + "grad_norm": 0.5616427075100456, + "learning_rate": 2.0734056869287144e-05, + "loss": 11.8856, + "step": 29293 + }, + { + "epoch": 1.5951753759027452, + "grad_norm": 0.5327986980656418, + "learning_rate": 2.072868099691655e-05, + "loss": 11.9241, + "step": 29294 + }, + { + "epoch": 1.5952298298993282, + "grad_norm": 0.5593695576862636, + "learning_rate": 2.0723305740972996e-05, + "loss": 11.8909, + "step": 29295 + }, + { + "epoch": 1.5952842838959111, + "grad_norm": 0.5311426509712349, + "learning_rate": 2.0717931101498344e-05, + "loss": 11.9296, + "step": 29296 + }, + { + "epoch": 1.5953387378924941, + "grad_norm": 0.5176813535764708, + "learning_rate": 2.0712557078534335e-05, + "loss": 11.8577, + "step": 29297 + }, + { + "epoch": 1.5953931918890771, + "grad_norm": 0.5496162841096986, + "learning_rate": 2.0707183672122765e-05, + "loss": 11.8236, + "step": 29298 + }, + { + "epoch": 1.5954476458856601, + "grad_norm": 0.504755071941615, + "learning_rate": 2.0701810882305462e-05, + "loss": 11.7105, + "step": 29299 + }, + { + "epoch": 1.5955020998822431, + "grad_norm": 0.5039080260857132, + "learning_rate": 2.0696438709124157e-05, + "loss": 11.7858, + "step": 29300 + }, + { + "epoch": 1.5955565538788261, + "grad_norm": 0.6188281055076668, + "learning_rate": 2.069106715262067e-05, + "loss": 11.8878, + "step": 29301 + }, + { + "epoch": 1.595611007875409, + "grad_norm": 0.5333251973907444, + "learning_rate": 2.0685696212836737e-05, + "loss": 11.8109, + "step": 29302 + }, + { + "epoch": 1.5956654618719923, + "grad_norm": 0.5721325884762738, + "learning_rate": 2.0680325889814112e-05, + "loss": 11.85, + "step": 29303 + }, + { + "epoch": 1.5957199158685753, + "grad_norm": 0.5291263883005053, + "learning_rate": 2.0674956183594595e-05, + "loss": 11.8698, + "step": 29304 + }, + { + "epoch": 1.5957743698651583, + "grad_norm": 0.5646301205861213, + "learning_rate": 2.06695870942199e-05, + "loss": 11.8938, + "step": 29305 + }, + { + "epoch": 1.5958288238617413, + "grad_norm": 0.5432610891425843, + "learning_rate": 2.0664218621731823e-05, + "loss": 11.7214, + "step": 29306 + }, + { + "epoch": 1.5958832778583243, + "grad_norm": 0.539769048527323, + "learning_rate": 2.0658850766172054e-05, + "loss": 11.9207, + "step": 29307 + }, + { + "epoch": 1.5959377318549073, + "grad_norm": 0.588714758758604, + "learning_rate": 2.0653483527582408e-05, + "loss": 11.8795, + "step": 29308 + }, + { + "epoch": 1.5959921858514905, + "grad_norm": 0.5916651483477288, + "learning_rate": 2.0648116906004543e-05, + "loss": 11.7702, + "step": 29309 + }, + { + "epoch": 1.5960466398480735, + "grad_norm": 0.5108358492591021, + "learning_rate": 2.0642750901480233e-05, + "loss": 11.8893, + "step": 29310 + }, + { + "epoch": 1.5961010938446565, + "grad_norm": 0.49544440087693337, + "learning_rate": 2.0637385514051223e-05, + "loss": 11.7864, + "step": 29311 + }, + { + "epoch": 1.5961555478412395, + "grad_norm": 0.5423380161079329, + "learning_rate": 2.0632020743759217e-05, + "loss": 11.691, + "step": 29312 + }, + { + "epoch": 1.5962100018378225, + "grad_norm": 0.5046285314022753, + "learning_rate": 2.06266565906459e-05, + "loss": 11.9099, + "step": 29313 + }, + { + "epoch": 1.5962644558344055, + "grad_norm": 0.5434175528898965, + "learning_rate": 2.0621293054753032e-05, + "loss": 11.9292, + "step": 29314 + }, + { + "epoch": 1.5963189098309885, + "grad_norm": 0.513969180975846, + "learning_rate": 2.0615930136122286e-05, + "loss": 11.8333, + "step": 29315 + }, + { + "epoch": 1.5963733638275714, + "grad_norm": 0.5537772534660337, + "learning_rate": 2.061056783479539e-05, + "loss": 11.7639, + "step": 29316 + }, + { + "epoch": 1.5964278178241544, + "grad_norm": 0.5658981839435101, + "learning_rate": 2.0605206150814017e-05, + "loss": 11.9302, + "step": 29317 + }, + { + "epoch": 1.5964822718207374, + "grad_norm": 0.5156821553687051, + "learning_rate": 2.0599845084219905e-05, + "loss": 11.8463, + "step": 29318 + }, + { + "epoch": 1.5965367258173204, + "grad_norm": 0.6033784564546595, + "learning_rate": 2.059448463505468e-05, + "loss": 11.8912, + "step": 29319 + }, + { + "epoch": 1.5965911798139034, + "grad_norm": 0.5023804642949706, + "learning_rate": 2.058912480336006e-05, + "loss": 11.5914, + "step": 29320 + }, + { + "epoch": 1.5966456338104864, + "grad_norm": 0.565739164961108, + "learning_rate": 2.058376558917775e-05, + "loss": 11.7791, + "step": 29321 + }, + { + "epoch": 1.5967000878070694, + "grad_norm": 0.5280704510056182, + "learning_rate": 2.0578406992549405e-05, + "loss": 11.6932, + "step": 29322 + }, + { + "epoch": 1.5967545418036524, + "grad_norm": 0.5126183917450234, + "learning_rate": 2.0573049013516676e-05, + "loss": 11.8379, + "step": 29323 + }, + { + "epoch": 1.5968089958002354, + "grad_norm": 0.5440726386513677, + "learning_rate": 2.056769165212121e-05, + "loss": 11.8529, + "step": 29324 + }, + { + "epoch": 1.5968634497968184, + "grad_norm": 0.5233461159863342, + "learning_rate": 2.05623349084047e-05, + "loss": 11.727, + "step": 29325 + }, + { + "epoch": 1.5969179037934014, + "grad_norm": 0.5363070990828511, + "learning_rate": 2.055697878240882e-05, + "loss": 11.839, + "step": 29326 + }, + { + "epoch": 1.5969723577899846, + "grad_norm": 0.614792689281094, + "learning_rate": 2.0551623274175167e-05, + "loss": 11.7179, + "step": 29327 + }, + { + "epoch": 1.5970268117865676, + "grad_norm": 0.6002726534508523, + "learning_rate": 2.0546268383745447e-05, + "loss": 11.8518, + "step": 29328 + }, + { + "epoch": 1.5970812657831506, + "grad_norm": 0.6787932544133423, + "learning_rate": 2.0540914111161246e-05, + "loss": 11.7234, + "step": 29329 + }, + { + "epoch": 1.5971357197797336, + "grad_norm": 0.53390654356914, + "learning_rate": 2.0535560456464244e-05, + "loss": 11.7811, + "step": 29330 + }, + { + "epoch": 1.5971901737763166, + "grad_norm": 0.5723515378116586, + "learning_rate": 2.0530207419696014e-05, + "loss": 11.8642, + "step": 29331 + }, + { + "epoch": 1.5972446277728998, + "grad_norm": 0.5323370881988478, + "learning_rate": 2.052485500089826e-05, + "loss": 11.8854, + "step": 29332 + }, + { + "epoch": 1.5972990817694828, + "grad_norm": 0.5355162973079043, + "learning_rate": 2.0519503200112544e-05, + "loss": 11.8937, + "step": 29333 + }, + { + "epoch": 1.5973535357660658, + "grad_norm": 0.6096867267092838, + "learning_rate": 2.0514152017380482e-05, + "loss": 11.7952, + "step": 29334 + }, + { + "epoch": 1.5974079897626488, + "grad_norm": 0.6575767372477747, + "learning_rate": 2.050880145274373e-05, + "loss": 11.9411, + "step": 29335 + }, + { + "epoch": 1.5974624437592317, + "grad_norm": 0.5766491995228399, + "learning_rate": 2.0503451506243844e-05, + "loss": 11.8742, + "step": 29336 + }, + { + "epoch": 1.5975168977558147, + "grad_norm": 0.5564938888422738, + "learning_rate": 2.049810217792245e-05, + "loss": 11.6938, + "step": 29337 + }, + { + "epoch": 1.5975713517523977, + "grad_norm": 0.5633894880921775, + "learning_rate": 2.049275346782118e-05, + "loss": 11.8614, + "step": 29338 + }, + { + "epoch": 1.5976258057489807, + "grad_norm": 0.6026291967793118, + "learning_rate": 2.048740537598155e-05, + "loss": 11.9225, + "step": 29339 + }, + { + "epoch": 1.5976802597455637, + "grad_norm": 0.5454465730805981, + "learning_rate": 2.048205790244523e-05, + "loss": 11.806, + "step": 29340 + }, + { + "epoch": 1.5977347137421467, + "grad_norm": 0.5324393254097145, + "learning_rate": 2.047671104725373e-05, + "loss": 11.7547, + "step": 29341 + }, + { + "epoch": 1.5977891677387297, + "grad_norm": 0.539211253220576, + "learning_rate": 2.047136481044869e-05, + "loss": 11.8763, + "step": 29342 + }, + { + "epoch": 1.5978436217353127, + "grad_norm": 0.4994208877902171, + "learning_rate": 2.0466019192071652e-05, + "loss": 11.8807, + "step": 29343 + }, + { + "epoch": 1.5978980757318957, + "grad_norm": 0.5234971724937825, + "learning_rate": 2.0460674192164163e-05, + "loss": 11.8054, + "step": 29344 + }, + { + "epoch": 1.5979525297284787, + "grad_norm": 0.6142188536115024, + "learning_rate": 2.045532981076783e-05, + "loss": 11.7835, + "step": 29345 + }, + { + "epoch": 1.5980069837250617, + "grad_norm": 0.5165604775158151, + "learning_rate": 2.0449986047924173e-05, + "loss": 11.7835, + "step": 29346 + }, + { + "epoch": 1.5980614377216447, + "grad_norm": 0.5374935990097058, + "learning_rate": 2.044464290367476e-05, + "loss": 11.6989, + "step": 29347 + }, + { + "epoch": 1.5981158917182277, + "grad_norm": 0.5124067275860661, + "learning_rate": 2.0439300378061178e-05, + "loss": 11.7579, + "step": 29348 + }, + { + "epoch": 1.5981703457148106, + "grad_norm": 0.5376089506350612, + "learning_rate": 2.0433958471124902e-05, + "loss": 11.77, + "step": 29349 + }, + { + "epoch": 1.5982247997113939, + "grad_norm": 0.5845308154121257, + "learning_rate": 2.042861718290754e-05, + "loss": 11.831, + "step": 29350 + }, + { + "epoch": 1.5982792537079769, + "grad_norm": 0.5422491709840996, + "learning_rate": 2.0423276513450572e-05, + "loss": 11.8305, + "step": 29351 + }, + { + "epoch": 1.5983337077045598, + "grad_norm": 0.5882532223644485, + "learning_rate": 2.041793646279557e-05, + "loss": 11.8764, + "step": 29352 + }, + { + "epoch": 1.5983881617011428, + "grad_norm": 0.5068251702623046, + "learning_rate": 2.041259703098405e-05, + "loss": 11.8794, + "step": 29353 + }, + { + "epoch": 1.5984426156977258, + "grad_norm": 0.5423090314044817, + "learning_rate": 2.040725821805749e-05, + "loss": 11.6394, + "step": 29354 + }, + { + "epoch": 1.5984970696943088, + "grad_norm": 0.5244575659651973, + "learning_rate": 2.0401920024057464e-05, + "loss": 11.8177, + "step": 29355 + }, + { + "epoch": 1.598551523690892, + "grad_norm": 0.534760458178448, + "learning_rate": 2.0396582449025438e-05, + "loss": 11.8421, + "step": 29356 + }, + { + "epoch": 1.598605977687475, + "grad_norm": 0.595690836982202, + "learning_rate": 2.0391245493002952e-05, + "loss": 11.9205, + "step": 29357 + }, + { + "epoch": 1.598660431684058, + "grad_norm": 0.5297888333733959, + "learning_rate": 2.0385909156031467e-05, + "loss": 11.8155, + "step": 29358 + }, + { + "epoch": 1.598714885680641, + "grad_norm": 0.5589288586725722, + "learning_rate": 2.0380573438152507e-05, + "loss": 11.6377, + "step": 29359 + }, + { + "epoch": 1.598769339677224, + "grad_norm": 0.5116527023082919, + "learning_rate": 2.0375238339407577e-05, + "loss": 11.7126, + "step": 29360 + }, + { + "epoch": 1.598823793673807, + "grad_norm": 0.5354004677928198, + "learning_rate": 2.0369903859838135e-05, + "loss": 11.7904, + "step": 29361 + }, + { + "epoch": 1.59887824767039, + "grad_norm": 0.5822814331580956, + "learning_rate": 2.0364569999485695e-05, + "loss": 11.7983, + "step": 29362 + }, + { + "epoch": 1.598932701666973, + "grad_norm": 0.4890187835699303, + "learning_rate": 2.035923675839171e-05, + "loss": 11.8121, + "step": 29363 + }, + { + "epoch": 1.598987155663556, + "grad_norm": 0.5322577516379914, + "learning_rate": 2.0353904136597635e-05, + "loss": 11.7383, + "step": 29364 + }, + { + "epoch": 1.599041609660139, + "grad_norm": 0.5383542847029062, + "learning_rate": 2.034857213414497e-05, + "loss": 11.6958, + "step": 29365 + }, + { + "epoch": 1.599096063656722, + "grad_norm": 0.5210816974437572, + "learning_rate": 2.0343240751075154e-05, + "loss": 11.5566, + "step": 29366 + }, + { + "epoch": 1.599150517653305, + "grad_norm": 0.5457318379706793, + "learning_rate": 2.0337909987429683e-05, + "loss": 11.6954, + "step": 29367 + }, + { + "epoch": 1.599204971649888, + "grad_norm": 0.5168958739928255, + "learning_rate": 2.0332579843249954e-05, + "loss": 11.6211, + "step": 29368 + }, + { + "epoch": 1.599259425646471, + "grad_norm": 0.5850823126483844, + "learning_rate": 2.032725031857744e-05, + "loss": 11.737, + "step": 29369 + }, + { + "epoch": 1.599313879643054, + "grad_norm": 0.5253489496144531, + "learning_rate": 2.0321921413453627e-05, + "loss": 11.8412, + "step": 29370 + }, + { + "epoch": 1.599368333639637, + "grad_norm": 0.5565454633267117, + "learning_rate": 2.031659312791987e-05, + "loss": 11.8185, + "step": 29371 + }, + { + "epoch": 1.59942278763622, + "grad_norm": 0.5149642405061943, + "learning_rate": 2.0311265462017693e-05, + "loss": 11.8535, + "step": 29372 + }, + { + "epoch": 1.5994772416328031, + "grad_norm": 0.5864200899043692, + "learning_rate": 2.0305938415788472e-05, + "loss": 11.7961, + "step": 29373 + }, + { + "epoch": 1.5995316956293861, + "grad_norm": 0.5950075065925858, + "learning_rate": 2.030061198927361e-05, + "loss": 11.8858, + "step": 29374 + }, + { + "epoch": 1.5995861496259691, + "grad_norm": 0.6078560209968946, + "learning_rate": 2.0295286182514584e-05, + "loss": 11.9283, + "step": 29375 + }, + { + "epoch": 1.5996406036225521, + "grad_norm": 0.5447446069690238, + "learning_rate": 2.028996099555275e-05, + "loss": 11.7079, + "step": 29376 + }, + { + "epoch": 1.599695057619135, + "grad_norm": 0.5171646235522743, + "learning_rate": 2.028463642842957e-05, + "loss": 11.7078, + "step": 29377 + }, + { + "epoch": 1.599749511615718, + "grad_norm": 0.5406602315200316, + "learning_rate": 2.0279312481186407e-05, + "loss": 11.8501, + "step": 29378 + }, + { + "epoch": 1.5998039656123013, + "grad_norm": 0.554756835062345, + "learning_rate": 2.0273989153864703e-05, + "loss": 11.9458, + "step": 29379 + }, + { + "epoch": 1.5998584196088843, + "grad_norm": 0.6053443037557681, + "learning_rate": 2.0268666446505803e-05, + "loss": 11.8822, + "step": 29380 + }, + { + "epoch": 1.5999128736054673, + "grad_norm": 0.5052093775294951, + "learning_rate": 2.0263344359151114e-05, + "loss": 11.8362, + "step": 29381 + }, + { + "epoch": 1.5999673276020503, + "grad_norm": 0.5497470843840734, + "learning_rate": 2.0258022891842066e-05, + "loss": 11.7865, + "step": 29382 + }, + { + "epoch": 1.6000217815986333, + "grad_norm": 0.5696991347372337, + "learning_rate": 2.0252702044620007e-05, + "loss": 11.7343, + "step": 29383 + }, + { + "epoch": 1.6000762355952163, + "grad_norm": 0.5391041147615704, + "learning_rate": 2.024738181752631e-05, + "loss": 11.8043, + "step": 29384 + }, + { + "epoch": 1.6001306895917993, + "grad_norm": 0.5445163514808162, + "learning_rate": 2.0242062210602318e-05, + "loss": 11.8279, + "step": 29385 + }, + { + "epoch": 1.6001851435883823, + "grad_norm": 0.4901077404302885, + "learning_rate": 2.0236743223889422e-05, + "loss": 11.7292, + "step": 29386 + }, + { + "epoch": 1.6002395975849653, + "grad_norm": 0.5709519578301949, + "learning_rate": 2.0231424857429026e-05, + "loss": 11.8579, + "step": 29387 + }, + { + "epoch": 1.6002940515815482, + "grad_norm": 0.5521014017851856, + "learning_rate": 2.0226107111262404e-05, + "loss": 11.7892, + "step": 29388 + }, + { + "epoch": 1.6003485055781312, + "grad_norm": 0.5981844161381498, + "learning_rate": 2.0220789985430986e-05, + "loss": 11.8641, + "step": 29389 + }, + { + "epoch": 1.6004029595747142, + "grad_norm": 0.6060426741470232, + "learning_rate": 2.0215473479976067e-05, + "loss": 11.8387, + "step": 29390 + }, + { + "epoch": 1.6004574135712972, + "grad_norm": 0.5382350872141481, + "learning_rate": 2.0210157594939016e-05, + "loss": 11.7363, + "step": 29391 + }, + { + "epoch": 1.6005118675678802, + "grad_norm": 0.5575719557209478, + "learning_rate": 2.020484233036114e-05, + "loss": 11.9765, + "step": 29392 + }, + { + "epoch": 1.6005663215644632, + "grad_norm": 0.5301358191553304, + "learning_rate": 2.0199527686283827e-05, + "loss": 11.673, + "step": 29393 + }, + { + "epoch": 1.6006207755610462, + "grad_norm": 0.5860651116209377, + "learning_rate": 2.0194213662748362e-05, + "loss": 11.7878, + "step": 29394 + }, + { + "epoch": 1.6006752295576292, + "grad_norm": 0.5871124440341244, + "learning_rate": 2.018890025979604e-05, + "loss": 11.7139, + "step": 29395 + }, + { + "epoch": 1.6007296835542122, + "grad_norm": 0.5616677883384459, + "learning_rate": 2.0183587477468226e-05, + "loss": 11.8742, + "step": 29396 + }, + { + "epoch": 1.6007841375507954, + "grad_norm": 0.5191381149894279, + "learning_rate": 2.0178275315806237e-05, + "loss": 11.7855, + "step": 29397 + }, + { + "epoch": 1.6008385915473784, + "grad_norm": 0.5091419315561139, + "learning_rate": 2.017296377485135e-05, + "loss": 11.8683, + "step": 29398 + }, + { + "epoch": 1.6008930455439614, + "grad_norm": 0.5352089856035932, + "learning_rate": 2.0167652854644902e-05, + "loss": 11.8727, + "step": 29399 + }, + { + "epoch": 1.6009474995405444, + "grad_norm": 0.5537442666076666, + "learning_rate": 2.0162342555228152e-05, + "loss": 11.8444, + "step": 29400 + }, + { + "epoch": 1.6010019535371274, + "grad_norm": 0.5936712654447079, + "learning_rate": 2.0157032876642445e-05, + "loss": 11.7059, + "step": 29401 + }, + { + "epoch": 1.6010564075337106, + "grad_norm": 0.5149627350136136, + "learning_rate": 2.0151723818929002e-05, + "loss": 11.7851, + "step": 29402 + }, + { + "epoch": 1.6011108615302936, + "grad_norm": 0.5368922638871556, + "learning_rate": 2.014641538212918e-05, + "loss": 11.763, + "step": 29403 + }, + { + "epoch": 1.6011653155268766, + "grad_norm": 0.5987494576207734, + "learning_rate": 2.0141107566284224e-05, + "loss": 11.7101, + "step": 29404 + }, + { + "epoch": 1.6012197695234596, + "grad_norm": 0.5622285933443621, + "learning_rate": 2.0135800371435377e-05, + "loss": 11.7634, + "step": 29405 + }, + { + "epoch": 1.6012742235200426, + "grad_norm": 0.587119010358452, + "learning_rate": 2.013049379762397e-05, + "loss": 11.8866, + "step": 29406 + }, + { + "epoch": 1.6013286775166256, + "grad_norm": 0.5303848846970265, + "learning_rate": 2.0125187844891214e-05, + "loss": 11.8284, + "step": 29407 + }, + { + "epoch": 1.6013831315132085, + "grad_norm": 0.5169890060139251, + "learning_rate": 2.0119882513278376e-05, + "loss": 11.7693, + "step": 29408 + }, + { + "epoch": 1.6014375855097915, + "grad_norm": 0.4841309745431037, + "learning_rate": 2.011457780282677e-05, + "loss": 11.6597, + "step": 29409 + }, + { + "epoch": 1.6014920395063745, + "grad_norm": 0.5296250126575548, + "learning_rate": 2.010927371357757e-05, + "loss": 11.8657, + "step": 29410 + }, + { + "epoch": 1.6015464935029575, + "grad_norm": 0.5264449967244791, + "learning_rate": 2.010397024557209e-05, + "loss": 11.902, + "step": 29411 + }, + { + "epoch": 1.6016009474995405, + "grad_norm": 0.5242921438198742, + "learning_rate": 2.0098667398851502e-05, + "loss": 11.761, + "step": 29412 + }, + { + "epoch": 1.6016554014961235, + "grad_norm": 0.5567536072830178, + "learning_rate": 2.0093365173457102e-05, + "loss": 11.9027, + "step": 29413 + }, + { + "epoch": 1.6017098554927065, + "grad_norm": 0.5864266023755338, + "learning_rate": 2.0088063569430093e-05, + "loss": 11.9558, + "step": 29414 + }, + { + "epoch": 1.6017643094892895, + "grad_norm": 0.5302354177762099, + "learning_rate": 2.0082762586811686e-05, + "loss": 11.8522, + "step": 29415 + }, + { + "epoch": 1.6018187634858725, + "grad_norm": 0.5340893847277921, + "learning_rate": 2.007746222564314e-05, + "loss": 11.8762, + "step": 29416 + }, + { + "epoch": 1.6018732174824555, + "grad_norm": 0.5841347553224232, + "learning_rate": 2.0072162485965618e-05, + "loss": 11.5254, + "step": 29417 + }, + { + "epoch": 1.6019276714790385, + "grad_norm": 0.5277587417691794, + "learning_rate": 2.0066863367820376e-05, + "loss": 11.8683, + "step": 29418 + }, + { + "epoch": 1.6019821254756215, + "grad_norm": 0.5513675047556913, + "learning_rate": 2.006156487124863e-05, + "loss": 11.8087, + "step": 29419 + }, + { + "epoch": 1.6020365794722047, + "grad_norm": 0.523210258162122, + "learning_rate": 2.005626699629153e-05, + "loss": 11.8505, + "step": 29420 + }, + { + "epoch": 1.6020910334687877, + "grad_norm": 0.5376344267347557, + "learning_rate": 2.0050969742990346e-05, + "loss": 11.7899, + "step": 29421 + }, + { + "epoch": 1.6021454874653707, + "grad_norm": 0.535397752464809, + "learning_rate": 2.0045673111386187e-05, + "loss": 11.7908, + "step": 29422 + }, + { + "epoch": 1.6021999414619537, + "grad_norm": 0.5702444878502699, + "learning_rate": 2.0040377101520313e-05, + "loss": 11.7889, + "step": 29423 + }, + { + "epoch": 1.6022543954585367, + "grad_norm": 0.6231034304487845, + "learning_rate": 2.0035081713433878e-05, + "loss": 11.8418, + "step": 29424 + }, + { + "epoch": 1.6023088494551196, + "grad_norm": 0.49722995197810943, + "learning_rate": 2.0029786947168028e-05, + "loss": 11.8859, + "step": 29425 + }, + { + "epoch": 1.6023633034517029, + "grad_norm": 0.5580957798454991, + "learning_rate": 2.0024492802763996e-05, + "loss": 11.746, + "step": 29426 + }, + { + "epoch": 1.6024177574482859, + "grad_norm": 0.5889126367332558, + "learning_rate": 2.001919928026289e-05, + "loss": 11.8205, + "step": 29427 + }, + { + "epoch": 1.6024722114448688, + "grad_norm": 0.5209238836756184, + "learning_rate": 2.0013906379705925e-05, + "loss": 11.9455, + "step": 29428 + }, + { + "epoch": 1.6025266654414518, + "grad_norm": 0.5677628338776259, + "learning_rate": 2.0008614101134228e-05, + "loss": 11.8863, + "step": 29429 + }, + { + "epoch": 1.6025811194380348, + "grad_norm": 0.6074607586573398, + "learning_rate": 2.0003322444588945e-05, + "loss": 11.7899, + "step": 29430 + }, + { + "epoch": 1.6026355734346178, + "grad_norm": 0.5618640876632571, + "learning_rate": 1.9998031410111275e-05, + "loss": 11.7676, + "step": 29431 + }, + { + "epoch": 1.6026900274312008, + "grad_norm": 0.5084939224750649, + "learning_rate": 1.999274099774231e-05, + "loss": 11.7653, + "step": 29432 + }, + { + "epoch": 1.6027444814277838, + "grad_norm": 0.5814571428367679, + "learning_rate": 1.9987451207523245e-05, + "loss": 11.8396, + "step": 29433 + }, + { + "epoch": 1.6027989354243668, + "grad_norm": 0.5685096359853066, + "learning_rate": 1.9982162039495133e-05, + "loss": 11.818, + "step": 29434 + }, + { + "epoch": 1.6028533894209498, + "grad_norm": 0.5745809363328269, + "learning_rate": 1.9976873493699155e-05, + "loss": 11.6109, + "step": 29435 + }, + { + "epoch": 1.6029078434175328, + "grad_norm": 0.5289668498802075, + "learning_rate": 1.9971585570176443e-05, + "loss": 11.705, + "step": 29436 + }, + { + "epoch": 1.6029622974141158, + "grad_norm": 0.6143617425459307, + "learning_rate": 1.9966298268968086e-05, + "loss": 11.8005, + "step": 29437 + }, + { + "epoch": 1.6030167514106988, + "grad_norm": 0.49538255166652645, + "learning_rate": 1.996101159011524e-05, + "loss": 11.7992, + "step": 29438 + }, + { + "epoch": 1.6030712054072818, + "grad_norm": 0.5543726132688722, + "learning_rate": 1.9955725533658954e-05, + "loss": 11.7016, + "step": 29439 + }, + { + "epoch": 1.6031256594038648, + "grad_norm": 0.6354922580447147, + "learning_rate": 1.995044009964041e-05, + "loss": 11.8876, + "step": 29440 + }, + { + "epoch": 1.6031801134004477, + "grad_norm": 0.5291584829323384, + "learning_rate": 1.9945155288100636e-05, + "loss": 11.8345, + "step": 29441 + }, + { + "epoch": 1.6032345673970307, + "grad_norm": 0.5579322998389947, + "learning_rate": 1.9939871099080753e-05, + "loss": 11.8444, + "step": 29442 + }, + { + "epoch": 1.603289021393614, + "grad_norm": 0.5522996991215278, + "learning_rate": 1.993458753262192e-05, + "loss": 11.9134, + "step": 29443 + }, + { + "epoch": 1.603343475390197, + "grad_norm": 0.5340596804135831, + "learning_rate": 1.9929304588765107e-05, + "loss": 11.8661, + "step": 29444 + }, + { + "epoch": 1.60339792938678, + "grad_norm": 0.5900912133232146, + "learning_rate": 1.9924022267551444e-05, + "loss": 11.9336, + "step": 29445 + }, + { + "epoch": 1.603452383383363, + "grad_norm": 0.6204722787699938, + "learning_rate": 1.9918740569022042e-05, + "loss": 11.954, + "step": 29446 + }, + { + "epoch": 1.603506837379946, + "grad_norm": 0.5473150052355028, + "learning_rate": 1.9913459493217923e-05, + "loss": 11.7859, + "step": 29447 + }, + { + "epoch": 1.603561291376529, + "grad_norm": 0.6360911442389331, + "learning_rate": 1.9908179040180197e-05, + "loss": 11.8471, + "step": 29448 + }, + { + "epoch": 1.6036157453731121, + "grad_norm": 0.5718820184914463, + "learning_rate": 1.9902899209949865e-05, + "loss": 11.8596, + "step": 29449 + }, + { + "epoch": 1.6036701993696951, + "grad_norm": 0.5702905523345672, + "learning_rate": 1.9897620002568064e-05, + "loss": 11.9062, + "step": 29450 + }, + { + "epoch": 1.6037246533662781, + "grad_norm": 0.674056427309686, + "learning_rate": 1.9892341418075766e-05, + "loss": 12.0222, + "step": 29451 + }, + { + "epoch": 1.6037791073628611, + "grad_norm": 0.5430100968272906, + "learning_rate": 1.9887063456514055e-05, + "loss": 11.6578, + "step": 29452 + }, + { + "epoch": 1.603833561359444, + "grad_norm": 0.5437180308953273, + "learning_rate": 1.9881786117924027e-05, + "loss": 11.7666, + "step": 29453 + }, + { + "epoch": 1.603888015356027, + "grad_norm": 0.5978516870845708, + "learning_rate": 1.9876509402346623e-05, + "loss": 11.7347, + "step": 29454 + }, + { + "epoch": 1.60394246935261, + "grad_norm": 0.5734227760594006, + "learning_rate": 1.9871233309822935e-05, + "loss": 11.9542, + "step": 29455 + }, + { + "epoch": 1.603996923349193, + "grad_norm": 0.5099606994702084, + "learning_rate": 1.9865957840393945e-05, + "loss": 11.858, + "step": 29456 + }, + { + "epoch": 1.604051377345776, + "grad_norm": 0.5513875227445373, + "learning_rate": 1.986068299410071e-05, + "loss": 11.7734, + "step": 29457 + }, + { + "epoch": 1.604105831342359, + "grad_norm": 0.571443913331018, + "learning_rate": 1.9855408770984274e-05, + "loss": 11.7681, + "step": 29458 + }, + { + "epoch": 1.604160285338942, + "grad_norm": 0.6271376497212182, + "learning_rate": 1.9850135171085582e-05, + "loss": 11.7576, + "step": 29459 + }, + { + "epoch": 1.604214739335525, + "grad_norm": 0.5429512264294071, + "learning_rate": 1.9844862194445713e-05, + "loss": 11.8022, + "step": 29460 + }, + { + "epoch": 1.604269193332108, + "grad_norm": 0.5231861899643807, + "learning_rate": 1.9839589841105623e-05, + "loss": 11.8079, + "step": 29461 + }, + { + "epoch": 1.604323647328691, + "grad_norm": 0.530296903557611, + "learning_rate": 1.983431811110633e-05, + "loss": 11.8469, + "step": 29462 + }, + { + "epoch": 1.604378101325274, + "grad_norm": 0.5534393628654909, + "learning_rate": 1.9829047004488832e-05, + "loss": 11.7574, + "step": 29463 + }, + { + "epoch": 1.604432555321857, + "grad_norm": 0.5181474191870714, + "learning_rate": 1.982377652129409e-05, + "loss": 11.7387, + "step": 29464 + }, + { + "epoch": 1.60448700931844, + "grad_norm": 0.5370931947527635, + "learning_rate": 1.981850666156313e-05, + "loss": 11.7648, + "step": 29465 + }, + { + "epoch": 1.6045414633150232, + "grad_norm": 0.5592638879431615, + "learning_rate": 1.9813237425336883e-05, + "loss": 11.8377, + "step": 29466 + }, + { + "epoch": 1.6045959173116062, + "grad_norm": 0.6084962679463427, + "learning_rate": 1.9807968812656342e-05, + "loss": 11.7357, + "step": 29467 + }, + { + "epoch": 1.6046503713081892, + "grad_norm": 0.5625876362901717, + "learning_rate": 1.9802700823562514e-05, + "loss": 11.8689, + "step": 29468 + }, + { + "epoch": 1.6047048253047722, + "grad_norm": 0.5597426028816667, + "learning_rate": 1.9797433458096304e-05, + "loss": 11.825, + "step": 29469 + }, + { + "epoch": 1.6047592793013552, + "grad_norm": 0.5074386905928788, + "learning_rate": 1.979216671629873e-05, + "loss": 11.7852, + "step": 29470 + }, + { + "epoch": 1.6048137332979382, + "grad_norm": 0.5922805815533806, + "learning_rate": 1.97869005982107e-05, + "loss": 11.8745, + "step": 29471 + }, + { + "epoch": 1.6048681872945214, + "grad_norm": 0.5292795437817838, + "learning_rate": 1.978163510387321e-05, + "loss": 11.7987, + "step": 29472 + }, + { + "epoch": 1.6049226412911044, + "grad_norm": 0.5729941697010434, + "learning_rate": 1.977637023332717e-05, + "loss": 11.8809, + "step": 29473 + }, + { + "epoch": 1.6049770952876874, + "grad_norm": 0.6235051491056085, + "learning_rate": 1.9771105986613492e-05, + "loss": 11.8302, + "step": 29474 + }, + { + "epoch": 1.6050315492842704, + "grad_norm": 0.5094087735677347, + "learning_rate": 1.976584236377319e-05, + "loss": 11.7308, + "step": 29475 + }, + { + "epoch": 1.6050860032808534, + "grad_norm": 0.5222837277982497, + "learning_rate": 1.976057936484712e-05, + "loss": 11.7958, + "step": 29476 + }, + { + "epoch": 1.6051404572774364, + "grad_norm": 0.5484101553209316, + "learning_rate": 1.9755316989876273e-05, + "loss": 11.7901, + "step": 29477 + }, + { + "epoch": 1.6051949112740194, + "grad_norm": 0.5685191554377844, + "learning_rate": 1.975005523890151e-05, + "loss": 11.7697, + "step": 29478 + }, + { + "epoch": 1.6052493652706024, + "grad_norm": 0.538431149989638, + "learning_rate": 1.974479411196377e-05, + "loss": 11.7302, + "step": 29479 + }, + { + "epoch": 1.6053038192671854, + "grad_norm": 0.5511395883280289, + "learning_rate": 1.9739533609104e-05, + "loss": 11.8739, + "step": 29480 + }, + { + "epoch": 1.6053582732637683, + "grad_norm": 0.5394145093542255, + "learning_rate": 1.973427373036305e-05, + "loss": 11.7719, + "step": 29481 + }, + { + "epoch": 1.6054127272603513, + "grad_norm": 0.5821856532083591, + "learning_rate": 1.972901447578188e-05, + "loss": 11.6762, + "step": 29482 + }, + { + "epoch": 1.6054671812569343, + "grad_norm": 0.554589386980011, + "learning_rate": 1.9723755845401348e-05, + "loss": 11.6822, + "step": 29483 + }, + { + "epoch": 1.6055216352535173, + "grad_norm": 0.6017702453644517, + "learning_rate": 1.9718497839262327e-05, + "loss": 11.8896, + "step": 29484 + }, + { + "epoch": 1.6055760892501003, + "grad_norm": 0.5876700246333384, + "learning_rate": 1.9713240457405756e-05, + "loss": 11.8857, + "step": 29485 + }, + { + "epoch": 1.6056305432466833, + "grad_norm": 0.5051138645146366, + "learning_rate": 1.9707983699872467e-05, + "loss": 11.7911, + "step": 29486 + }, + { + "epoch": 1.6056849972432663, + "grad_norm": 0.5804991215949988, + "learning_rate": 1.9702727566703383e-05, + "loss": 11.8263, + "step": 29487 + }, + { + "epoch": 1.6057394512398493, + "grad_norm": 0.5762805663177812, + "learning_rate": 1.9697472057939336e-05, + "loss": 11.7438, + "step": 29488 + }, + { + "epoch": 1.6057939052364323, + "grad_norm": 0.5301925356628214, + "learning_rate": 1.969221717362124e-05, + "loss": 11.8418, + "step": 29489 + }, + { + "epoch": 1.6058483592330155, + "grad_norm": 0.4838981274097415, + "learning_rate": 1.9686962913789897e-05, + "loss": 11.8709, + "step": 29490 + }, + { + "epoch": 1.6059028132295985, + "grad_norm": 0.5256166653957722, + "learning_rate": 1.9681709278486204e-05, + "loss": 11.7395, + "step": 29491 + }, + { + "epoch": 1.6059572672261815, + "grad_norm": 0.5874138183369193, + "learning_rate": 1.967645626775103e-05, + "loss": 11.6483, + "step": 29492 + }, + { + "epoch": 1.6060117212227645, + "grad_norm": 0.619175014361116, + "learning_rate": 1.9671203881625187e-05, + "loss": 11.7709, + "step": 29493 + }, + { + "epoch": 1.6060661752193475, + "grad_norm": 0.5397721233361413, + "learning_rate": 1.9665952120149557e-05, + "loss": 11.841, + "step": 29494 + }, + { + "epoch": 1.6061206292159305, + "grad_norm": 0.5133565440623458, + "learning_rate": 1.9660700983364943e-05, + "loss": 11.635, + "step": 29495 + }, + { + "epoch": 1.6061750832125137, + "grad_norm": 0.5781675428524408, + "learning_rate": 1.9655450471312176e-05, + "loss": 11.8864, + "step": 29496 + }, + { + "epoch": 1.6062295372090967, + "grad_norm": 0.5758039603189183, + "learning_rate": 1.9650200584032118e-05, + "loss": 11.8107, + "step": 29497 + }, + { + "epoch": 1.6062839912056797, + "grad_norm": 0.5710336443251128, + "learning_rate": 1.9644951321565564e-05, + "loss": 11.8368, + "step": 29498 + }, + { + "epoch": 1.6063384452022627, + "grad_norm": 0.6379260319670289, + "learning_rate": 1.9639702683953355e-05, + "loss": 11.9761, + "step": 29499 + }, + { + "epoch": 1.6063928991988456, + "grad_norm": 0.5398940733126778, + "learning_rate": 1.963445467123628e-05, + "loss": 11.8037, + "step": 29500 + }, + { + "epoch": 1.6064473531954286, + "grad_norm": 0.560128835292989, + "learning_rate": 1.9629207283455152e-05, + "loss": 11.8461, + "step": 29501 + }, + { + "epoch": 1.6065018071920116, + "grad_norm": 0.5908217534337917, + "learning_rate": 1.9623960520650818e-05, + "loss": 11.8833, + "step": 29502 + }, + { + "epoch": 1.6065562611885946, + "grad_norm": 0.6062157160382261, + "learning_rate": 1.9618714382864022e-05, + "loss": 11.9305, + "step": 29503 + }, + { + "epoch": 1.6066107151851776, + "grad_norm": 0.5354432956893059, + "learning_rate": 1.9613468870135632e-05, + "loss": 11.7287, + "step": 29504 + }, + { + "epoch": 1.6066651691817606, + "grad_norm": 0.5175288607008623, + "learning_rate": 1.9608223982506336e-05, + "loss": 11.7869, + "step": 29505 + }, + { + "epoch": 1.6067196231783436, + "grad_norm": 0.5830635355602947, + "learning_rate": 1.9602979720016966e-05, + "loss": 11.9349, + "step": 29506 + }, + { + "epoch": 1.6067740771749266, + "grad_norm": 0.5880066495957835, + "learning_rate": 1.959773608270835e-05, + "loss": 11.6871, + "step": 29507 + }, + { + "epoch": 1.6068285311715096, + "grad_norm": 0.5211816113989604, + "learning_rate": 1.9592493070621186e-05, + "loss": 11.8174, + "step": 29508 + }, + { + "epoch": 1.6068829851680926, + "grad_norm": 0.5881013403450657, + "learning_rate": 1.95872506837963e-05, + "loss": 11.7841, + "step": 29509 + }, + { + "epoch": 1.6069374391646756, + "grad_norm": 0.5116156019886727, + "learning_rate": 1.9582008922274418e-05, + "loss": 11.8707, + "step": 29510 + }, + { + "epoch": 1.6069918931612586, + "grad_norm": 0.556690276586919, + "learning_rate": 1.9576767786096352e-05, + "loss": 11.7254, + "step": 29511 + }, + { + "epoch": 1.6070463471578416, + "grad_norm": 0.6215590417171741, + "learning_rate": 1.9571527275302792e-05, + "loss": 11.905, + "step": 29512 + }, + { + "epoch": 1.6071008011544248, + "grad_norm": 0.5608214710341549, + "learning_rate": 1.9566287389934535e-05, + "loss": 11.9164, + "step": 29513 + }, + { + "epoch": 1.6071552551510078, + "grad_norm": 0.5599770801378372, + "learning_rate": 1.9561048130032357e-05, + "loss": 11.8274, + "step": 29514 + }, + { + "epoch": 1.6072097091475908, + "grad_norm": 0.531641870382106, + "learning_rate": 1.9555809495636912e-05, + "loss": 11.6633, + "step": 29515 + }, + { + "epoch": 1.6072641631441738, + "grad_norm": 0.5398785670105353, + "learning_rate": 1.9550571486788984e-05, + "loss": 11.829, + "step": 29516 + }, + { + "epoch": 1.6073186171407567, + "grad_norm": 0.5244099261527798, + "learning_rate": 1.9545334103529324e-05, + "loss": 11.7315, + "step": 29517 + }, + { + "epoch": 1.6073730711373397, + "grad_norm": 0.5923949215629317, + "learning_rate": 1.954009734589861e-05, + "loss": 11.8393, + "step": 29518 + }, + { + "epoch": 1.607427525133923, + "grad_norm": 0.5406979365669363, + "learning_rate": 1.9534861213937627e-05, + "loss": 11.7673, + "step": 29519 + }, + { + "epoch": 1.607481979130506, + "grad_norm": 0.5557786652710994, + "learning_rate": 1.9529625707687027e-05, + "loss": 11.8586, + "step": 29520 + }, + { + "epoch": 1.607536433127089, + "grad_norm": 0.5494921665501551, + "learning_rate": 1.9524390827187577e-05, + "loss": 11.8828, + "step": 29521 + }, + { + "epoch": 1.607590887123672, + "grad_norm": 0.5415854479649429, + "learning_rate": 1.9519156572479934e-05, + "loss": 11.8841, + "step": 29522 + }, + { + "epoch": 1.607645341120255, + "grad_norm": 0.5048770497296734, + "learning_rate": 1.9513922943604834e-05, + "loss": 11.8813, + "step": 29523 + }, + { + "epoch": 1.607699795116838, + "grad_norm": 0.4881420841267677, + "learning_rate": 1.9508689940603008e-05, + "loss": 11.9356, + "step": 29524 + }, + { + "epoch": 1.607754249113421, + "grad_norm": 0.5381087130994184, + "learning_rate": 1.950345756351506e-05, + "loss": 11.8552, + "step": 29525 + }, + { + "epoch": 1.607808703110004, + "grad_norm": 0.5556975514909981, + "learning_rate": 1.9498225812381755e-05, + "loss": 11.7983, + "step": 29526 + }, + { + "epoch": 1.607863157106587, + "grad_norm": 0.5920559232220721, + "learning_rate": 1.9492994687243714e-05, + "loss": 11.876, + "step": 29527 + }, + { + "epoch": 1.6079176111031699, + "grad_norm": 0.5737121249998529, + "learning_rate": 1.9487764188141655e-05, + "loss": 11.7988, + "step": 29528 + }, + { + "epoch": 1.6079720650997529, + "grad_norm": 0.5837325675912988, + "learning_rate": 1.948253431511626e-05, + "loss": 11.8061, + "step": 29529 + }, + { + "epoch": 1.6080265190963359, + "grad_norm": 0.5208751115569004, + "learning_rate": 1.9477305068208163e-05, + "loss": 11.8655, + "step": 29530 + }, + { + "epoch": 1.6080809730929189, + "grad_norm": 0.5807413749379425, + "learning_rate": 1.9472076447458064e-05, + "loss": 11.8194, + "step": 29531 + }, + { + "epoch": 1.6081354270895019, + "grad_norm": 0.5780867307509088, + "learning_rate": 1.946684845290658e-05, + "loss": 11.748, + "step": 29532 + }, + { + "epoch": 1.6081898810860848, + "grad_norm": 0.5718270503096778, + "learning_rate": 1.9461621084594418e-05, + "loss": 11.6538, + "step": 29533 + }, + { + "epoch": 1.6082443350826678, + "grad_norm": 0.5256356144512199, + "learning_rate": 1.945639434256219e-05, + "loss": 11.8155, + "step": 29534 + }, + { + "epoch": 1.6082987890792508, + "grad_norm": 0.5758285504641771, + "learning_rate": 1.9451168226850524e-05, + "loss": 11.7021, + "step": 29535 + }, + { + "epoch": 1.608353243075834, + "grad_norm": 0.5559875475881851, + "learning_rate": 1.9445942737500113e-05, + "loss": 11.8845, + "step": 29536 + }, + { + "epoch": 1.608407697072417, + "grad_norm": 0.534015103734107, + "learning_rate": 1.9440717874551528e-05, + "loss": 11.883, + "step": 29537 + }, + { + "epoch": 1.608462151069, + "grad_norm": 0.5331950711834742, + "learning_rate": 1.9435493638045455e-05, + "loss": 11.7914, + "step": 29538 + }, + { + "epoch": 1.608516605065583, + "grad_norm": 0.5458694608076182, + "learning_rate": 1.943027002802247e-05, + "loss": 11.725, + "step": 29539 + }, + { + "epoch": 1.608571059062166, + "grad_norm": 0.5828304155005862, + "learning_rate": 1.9425047044523226e-05, + "loss": 11.8955, + "step": 29540 + }, + { + "epoch": 1.608625513058749, + "grad_norm": 0.5225958157818917, + "learning_rate": 1.941982468758834e-05, + "loss": 11.7342, + "step": 29541 + }, + { + "epoch": 1.6086799670553322, + "grad_norm": 0.5496488900618468, + "learning_rate": 1.941460295725839e-05, + "loss": 11.8381, + "step": 29542 + }, + { + "epoch": 1.6087344210519152, + "grad_norm": 0.5062961263757868, + "learning_rate": 1.9409381853574026e-05, + "loss": 11.8887, + "step": 29543 + }, + { + "epoch": 1.6087888750484982, + "grad_norm": 0.5302396278303994, + "learning_rate": 1.9404161376575835e-05, + "loss": 11.813, + "step": 29544 + }, + { + "epoch": 1.6088433290450812, + "grad_norm": 0.49799933580786143, + "learning_rate": 1.939894152630436e-05, + "loss": 11.8171, + "step": 29545 + }, + { + "epoch": 1.6088977830416642, + "grad_norm": 0.5289288469149018, + "learning_rate": 1.9393722302800266e-05, + "loss": 11.7155, + "step": 29546 + }, + { + "epoch": 1.6089522370382472, + "grad_norm": 0.6175431664817955, + "learning_rate": 1.9388503706104077e-05, + "loss": 11.8465, + "step": 29547 + }, + { + "epoch": 1.6090066910348302, + "grad_norm": 0.5713663245887997, + "learning_rate": 1.938328573625643e-05, + "loss": 11.9048, + "step": 29548 + }, + { + "epoch": 1.6090611450314132, + "grad_norm": 0.7576427256232343, + "learning_rate": 1.9378068393297844e-05, + "loss": 11.9706, + "step": 29549 + }, + { + "epoch": 1.6091155990279962, + "grad_norm": 0.5443327726839043, + "learning_rate": 1.937285167726892e-05, + "loss": 11.9013, + "step": 29550 + }, + { + "epoch": 1.6091700530245792, + "grad_norm": 0.5647505781259433, + "learning_rate": 1.9367635588210253e-05, + "loss": 11.9367, + "step": 29551 + }, + { + "epoch": 1.6092245070211622, + "grad_norm": 0.5655720292030478, + "learning_rate": 1.936242012616234e-05, + "loss": 11.8224, + "step": 29552 + }, + { + "epoch": 1.6092789610177451, + "grad_norm": 0.5978227737092479, + "learning_rate": 1.9357205291165802e-05, + "loss": 11.8153, + "step": 29553 + }, + { + "epoch": 1.6093334150143281, + "grad_norm": 0.5091386406530556, + "learning_rate": 1.9351991083261156e-05, + "loss": 11.828, + "step": 29554 + }, + { + "epoch": 1.6093878690109111, + "grad_norm": 0.5762205756489719, + "learning_rate": 1.9346777502488923e-05, + "loss": 11.8939, + "step": 29555 + }, + { + "epoch": 1.6094423230074941, + "grad_norm": 0.5396369529761386, + "learning_rate": 1.9341564548889713e-05, + "loss": 11.5902, + "step": 29556 + }, + { + "epoch": 1.6094967770040771, + "grad_norm": 0.5152461717621627, + "learning_rate": 1.9336352222503984e-05, + "loss": 11.713, + "step": 29557 + }, + { + "epoch": 1.60955123100066, + "grad_norm": 0.5538421050177275, + "learning_rate": 1.933114052337234e-05, + "loss": 11.8857, + "step": 29558 + }, + { + "epoch": 1.609605684997243, + "grad_norm": 0.5159800025316812, + "learning_rate": 1.932592945153524e-05, + "loss": 11.8314, + "step": 29559 + }, + { + "epoch": 1.6096601389938263, + "grad_norm": 0.5413113498923491, + "learning_rate": 1.9320719007033282e-05, + "loss": 11.8667, + "step": 29560 + }, + { + "epoch": 1.6097145929904093, + "grad_norm": 0.5531103888116603, + "learning_rate": 1.931550918990691e-05, + "loss": 11.7705, + "step": 29561 + }, + { + "epoch": 1.6097690469869923, + "grad_norm": 0.5782706319008367, + "learning_rate": 1.9310300000196668e-05, + "loss": 11.926, + "step": 29562 + }, + { + "epoch": 1.6098235009835753, + "grad_norm": 0.5584620417309196, + "learning_rate": 1.930509143794309e-05, + "loss": 11.8614, + "step": 29563 + }, + { + "epoch": 1.6098779549801583, + "grad_norm": 0.5381583087166442, + "learning_rate": 1.9299883503186656e-05, + "loss": 11.7382, + "step": 29564 + }, + { + "epoch": 1.6099324089767415, + "grad_norm": 0.5125804308240364, + "learning_rate": 1.9294676195967833e-05, + "loss": 11.8325, + "step": 29565 + }, + { + "epoch": 1.6099868629733245, + "grad_norm": 0.5481157917892993, + "learning_rate": 1.9289469516327175e-05, + "loss": 11.838, + "step": 29566 + }, + { + "epoch": 1.6100413169699075, + "grad_norm": 0.5936471282787849, + "learning_rate": 1.9284263464305108e-05, + "loss": 11.9087, + "step": 29567 + }, + { + "epoch": 1.6100957709664905, + "grad_norm": 0.5251934267782864, + "learning_rate": 1.9279058039942165e-05, + "loss": 11.8108, + "step": 29568 + }, + { + "epoch": 1.6101502249630735, + "grad_norm": 0.6441650840009321, + "learning_rate": 1.9273853243278782e-05, + "loss": 11.9299, + "step": 29569 + }, + { + "epoch": 1.6102046789596565, + "grad_norm": 0.5996784450447684, + "learning_rate": 1.9268649074355484e-05, + "loss": 11.8155, + "step": 29570 + }, + { + "epoch": 1.6102591329562395, + "grad_norm": 0.5240673711257662, + "learning_rate": 1.9263445533212677e-05, + "loss": 11.8043, + "step": 29571 + }, + { + "epoch": 1.6103135869528225, + "grad_norm": 0.5922263540896808, + "learning_rate": 1.9258242619890854e-05, + "loss": 11.8052, + "step": 29572 + }, + { + "epoch": 1.6103680409494054, + "grad_norm": 0.5400260135135027, + "learning_rate": 1.9253040334430505e-05, + "loss": 11.7609, + "step": 29573 + }, + { + "epoch": 1.6104224949459884, + "grad_norm": 0.5622988521635481, + "learning_rate": 1.9247838676872064e-05, + "loss": 11.7748, + "step": 29574 + }, + { + "epoch": 1.6104769489425714, + "grad_norm": 0.47277871171211544, + "learning_rate": 1.9242637647255967e-05, + "loss": 11.7018, + "step": 29575 + }, + { + "epoch": 1.6105314029391544, + "grad_norm": 0.5598215265791844, + "learning_rate": 1.9237437245622635e-05, + "loss": 11.977, + "step": 29576 + }, + { + "epoch": 1.6105858569357374, + "grad_norm": 0.5066079895315062, + "learning_rate": 1.923223747201254e-05, + "loss": 11.8769, + "step": 29577 + }, + { + "epoch": 1.6106403109323204, + "grad_norm": 0.5737905323530964, + "learning_rate": 1.922703832646613e-05, + "loss": 11.8221, + "step": 29578 + }, + { + "epoch": 1.6106947649289034, + "grad_norm": 0.5371741190676275, + "learning_rate": 1.922183980902379e-05, + "loss": 11.8263, + "step": 29579 + }, + { + "epoch": 1.6107492189254864, + "grad_norm": 0.5470637761312328, + "learning_rate": 1.9216641919725996e-05, + "loss": 11.829, + "step": 29580 + }, + { + "epoch": 1.6108036729220694, + "grad_norm": 0.5755290994175941, + "learning_rate": 1.9211444658613122e-05, + "loss": 11.8048, + "step": 29581 + }, + { + "epoch": 1.6108581269186524, + "grad_norm": 0.6298942677106962, + "learning_rate": 1.9206248025725614e-05, + "loss": 11.9161, + "step": 29582 + }, + { + "epoch": 1.6109125809152356, + "grad_norm": 0.5313163553259902, + "learning_rate": 1.920105202110386e-05, + "loss": 11.9056, + "step": 29583 + }, + { + "epoch": 1.6109670349118186, + "grad_norm": 0.5472964463557821, + "learning_rate": 1.91958566447883e-05, + "loss": 11.905, + "step": 29584 + }, + { + "epoch": 1.6110214889084016, + "grad_norm": 0.5912321516673279, + "learning_rate": 1.9190661896819307e-05, + "loss": 11.8834, + "step": 29585 + }, + { + "epoch": 1.6110759429049846, + "grad_norm": 0.6536873624048212, + "learning_rate": 1.9185467777237254e-05, + "loss": 11.923, + "step": 29586 + }, + { + "epoch": 1.6111303969015676, + "grad_norm": 0.5090252953567536, + "learning_rate": 1.9180274286082578e-05, + "loss": 11.833, + "step": 29587 + }, + { + "epoch": 1.6111848508981506, + "grad_norm": 0.5001710337858857, + "learning_rate": 1.9175081423395612e-05, + "loss": 11.8627, + "step": 29588 + }, + { + "epoch": 1.6112393048947338, + "grad_norm": 0.5291093489575781, + "learning_rate": 1.9169889189216783e-05, + "loss": 11.8392, + "step": 29589 + }, + { + "epoch": 1.6112937588913168, + "grad_norm": 0.5202906696704613, + "learning_rate": 1.9164697583586468e-05, + "loss": 11.776, + "step": 29590 + }, + { + "epoch": 1.6113482128878998, + "grad_norm": 0.5246389746290668, + "learning_rate": 1.9159506606544984e-05, + "loss": 11.8681, + "step": 29591 + }, + { + "epoch": 1.6114026668844827, + "grad_norm": 0.4898275564259432, + "learning_rate": 1.9154316258132777e-05, + "loss": 11.6848, + "step": 29592 + }, + { + "epoch": 1.6114571208810657, + "grad_norm": 0.5413377027556796, + "learning_rate": 1.9149126538390127e-05, + "loss": 11.8868, + "step": 29593 + }, + { + "epoch": 1.6115115748776487, + "grad_norm": 0.5192466163693963, + "learning_rate": 1.9143937447357462e-05, + "loss": 11.761, + "step": 29594 + }, + { + "epoch": 1.6115660288742317, + "grad_norm": 0.5587969031217869, + "learning_rate": 1.9138748985075094e-05, + "loss": 11.8258, + "step": 29595 + }, + { + "epoch": 1.6116204828708147, + "grad_norm": 0.5353248257573864, + "learning_rate": 1.9133561151583355e-05, + "loss": 11.7902, + "step": 29596 + }, + { + "epoch": 1.6116749368673977, + "grad_norm": 0.5227399056629031, + "learning_rate": 1.912837394692262e-05, + "loss": 11.7976, + "step": 29597 + }, + { + "epoch": 1.6117293908639807, + "grad_norm": 0.5508571534806835, + "learning_rate": 1.91231873711332e-05, + "loss": 11.6627, + "step": 29598 + }, + { + "epoch": 1.6117838448605637, + "grad_norm": 0.5102923687347977, + "learning_rate": 1.9118001424255427e-05, + "loss": 11.8478, + "step": 29599 + }, + { + "epoch": 1.6118382988571467, + "grad_norm": 0.531953171103753, + "learning_rate": 1.9112816106329667e-05, + "loss": 11.7117, + "step": 29600 + }, + { + "epoch": 1.6118927528537297, + "grad_norm": 0.5297788108626057, + "learning_rate": 1.9107631417396188e-05, + "loss": 11.8004, + "step": 29601 + }, + { + "epoch": 1.6119472068503127, + "grad_norm": 0.7068541638325369, + "learning_rate": 1.9102447357495355e-05, + "loss": 11.7185, + "step": 29602 + }, + { + "epoch": 1.6120016608468957, + "grad_norm": 0.49332287462322977, + "learning_rate": 1.909726392666744e-05, + "loss": 11.8006, + "step": 29603 + }, + { + "epoch": 1.6120561148434787, + "grad_norm": 0.5468416721993087, + "learning_rate": 1.9092081124952787e-05, + "loss": 11.7957, + "step": 29604 + }, + { + "epoch": 1.6121105688400617, + "grad_norm": 0.5464211206930017, + "learning_rate": 1.9086898952391686e-05, + "loss": 11.7933, + "step": 29605 + }, + { + "epoch": 1.6121650228366449, + "grad_norm": 0.5639125354698105, + "learning_rate": 1.90817174090244e-05, + "loss": 11.8746, + "step": 29606 + }, + { + "epoch": 1.6122194768332279, + "grad_norm": 0.5275710519658461, + "learning_rate": 1.9076536494891272e-05, + "loss": 11.8444, + "step": 29607 + }, + { + "epoch": 1.6122739308298109, + "grad_norm": 0.5395387033152028, + "learning_rate": 1.9071356210032544e-05, + "loss": 11.7913, + "step": 29608 + }, + { + "epoch": 1.6123283848263938, + "grad_norm": 0.627910051876697, + "learning_rate": 1.9066176554488545e-05, + "loss": 12.0437, + "step": 29609 + }, + { + "epoch": 1.6123828388229768, + "grad_norm": 0.6087750569054463, + "learning_rate": 1.9060997528299505e-05, + "loss": 11.8598, + "step": 29610 + }, + { + "epoch": 1.6124372928195598, + "grad_norm": 0.5898224923045239, + "learning_rate": 1.9055819131505724e-05, + "loss": 11.8917, + "step": 29611 + }, + { + "epoch": 1.612491746816143, + "grad_norm": 0.5226180741584564, + "learning_rate": 1.9050641364147493e-05, + "loss": 11.7502, + "step": 29612 + }, + { + "epoch": 1.612546200812726, + "grad_norm": 0.5586843605855791, + "learning_rate": 1.904546422626502e-05, + "loss": 11.7246, + "step": 29613 + }, + { + "epoch": 1.612600654809309, + "grad_norm": 0.5259545980686726, + "learning_rate": 1.904028771789863e-05, + "loss": 11.823, + "step": 29614 + }, + { + "epoch": 1.612655108805892, + "grad_norm": 0.5658945433406595, + "learning_rate": 1.9035111839088527e-05, + "loss": 11.8673, + "step": 29615 + }, + { + "epoch": 1.612709562802475, + "grad_norm": 0.5527330441003985, + "learning_rate": 1.902993658987495e-05, + "loss": 11.7979, + "step": 29616 + }, + { + "epoch": 1.612764016799058, + "grad_norm": 0.5096908852313172, + "learning_rate": 1.9024761970298187e-05, + "loss": 11.7417, + "step": 29617 + }, + { + "epoch": 1.612818470795641, + "grad_norm": 0.5566952959549946, + "learning_rate": 1.9019587980398445e-05, + "loss": 11.9029, + "step": 29618 + }, + { + "epoch": 1.612872924792224, + "grad_norm": 0.5164784893964677, + "learning_rate": 1.901441462021598e-05, + "loss": 11.7604, + "step": 29619 + }, + { + "epoch": 1.612927378788807, + "grad_norm": 0.505202894194634, + "learning_rate": 1.9009241889790984e-05, + "loss": 11.8244, + "step": 29620 + }, + { + "epoch": 1.61298183278539, + "grad_norm": 0.49931685006218834, + "learning_rate": 1.9004069789163702e-05, + "loss": 11.7195, + "step": 29621 + }, + { + "epoch": 1.613036286781973, + "grad_norm": 0.5455425210517945, + "learning_rate": 1.8998898318374382e-05, + "loss": 11.7581, + "step": 29622 + }, + { + "epoch": 1.613090740778556, + "grad_norm": 0.5432936625568962, + "learning_rate": 1.8993727477463197e-05, + "loss": 11.8066, + "step": 29623 + }, + { + "epoch": 1.613145194775139, + "grad_norm": 0.5828583096469647, + "learning_rate": 1.8988557266470408e-05, + "loss": 11.8331, + "step": 29624 + }, + { + "epoch": 1.613199648771722, + "grad_norm": 0.526464711439938, + "learning_rate": 1.898338768543615e-05, + "loss": 11.8147, + "step": 29625 + }, + { + "epoch": 1.613254102768305, + "grad_norm": 0.51089263905331, + "learning_rate": 1.8978218734400645e-05, + "loss": 11.8426, + "step": 29626 + }, + { + "epoch": 1.613308556764888, + "grad_norm": 0.5337080174071616, + "learning_rate": 1.8973050413404126e-05, + "loss": 11.8138, + "step": 29627 + }, + { + "epoch": 1.613363010761471, + "grad_norm": 0.6126107910181418, + "learning_rate": 1.896788272248673e-05, + "loss": 11.8685, + "step": 29628 + }, + { + "epoch": 1.613417464758054, + "grad_norm": 0.5444234316369566, + "learning_rate": 1.896271566168869e-05, + "loss": 11.7673, + "step": 29629 + }, + { + "epoch": 1.6134719187546371, + "grad_norm": 0.6123301985222828, + "learning_rate": 1.895754923105013e-05, + "loss": 11.7898, + "step": 29630 + }, + { + "epoch": 1.6135263727512201, + "grad_norm": 0.5254807159606458, + "learning_rate": 1.8952383430611298e-05, + "loss": 11.8347, + "step": 29631 + }, + { + "epoch": 1.6135808267478031, + "grad_norm": 0.5005848331393221, + "learning_rate": 1.894721826041229e-05, + "loss": 11.8329, + "step": 29632 + }, + { + "epoch": 1.6136352807443861, + "grad_norm": 0.5490648289902342, + "learning_rate": 1.8942053720493314e-05, + "loss": 11.8345, + "step": 29633 + }, + { + "epoch": 1.613689734740969, + "grad_norm": 0.5419209812540147, + "learning_rate": 1.8936889810894542e-05, + "loss": 11.8052, + "step": 29634 + }, + { + "epoch": 1.6137441887375523, + "grad_norm": 0.5832089670673982, + "learning_rate": 1.89317265316561e-05, + "loss": 11.6428, + "step": 29635 + }, + { + "epoch": 1.6137986427341353, + "grad_norm": 0.553729642072451, + "learning_rate": 1.892656388281816e-05, + "loss": 11.9131, + "step": 29636 + }, + { + "epoch": 1.6138530967307183, + "grad_norm": 0.5502991842329528, + "learning_rate": 1.892140186442083e-05, + "loss": 11.8427, + "step": 29637 + }, + { + "epoch": 1.6139075507273013, + "grad_norm": 0.555463008611924, + "learning_rate": 1.8916240476504276e-05, + "loss": 11.9569, + "step": 29638 + }, + { + "epoch": 1.6139620047238843, + "grad_norm": 0.6543402161789428, + "learning_rate": 1.891107971910865e-05, + "loss": 11.9743, + "step": 29639 + }, + { + "epoch": 1.6140164587204673, + "grad_norm": 0.5126772533103994, + "learning_rate": 1.8905919592274046e-05, + "loss": 11.5509, + "step": 29640 + }, + { + "epoch": 1.6140709127170503, + "grad_norm": 0.559791595103359, + "learning_rate": 1.890076009604064e-05, + "loss": 11.872, + "step": 29641 + }, + { + "epoch": 1.6141253667136333, + "grad_norm": 0.6019338330864715, + "learning_rate": 1.8895601230448502e-05, + "loss": 11.8674, + "step": 29642 + }, + { + "epoch": 1.6141798207102163, + "grad_norm": 0.5680816470425887, + "learning_rate": 1.8890442995537782e-05, + "loss": 11.8414, + "step": 29643 + }, + { + "epoch": 1.6142342747067993, + "grad_norm": 0.5496959065861055, + "learning_rate": 1.888528539134856e-05, + "loss": 11.805, + "step": 29644 + }, + { + "epoch": 1.6142887287033822, + "grad_norm": 0.5399562832626262, + "learning_rate": 1.8880128417920993e-05, + "loss": 11.8105, + "step": 29645 + }, + { + "epoch": 1.6143431826999652, + "grad_norm": 0.5282321670642584, + "learning_rate": 1.887497207529514e-05, + "loss": 11.7994, + "step": 29646 + }, + { + "epoch": 1.6143976366965482, + "grad_norm": 0.5414942430792258, + "learning_rate": 1.8869816363511095e-05, + "loss": 11.7747, + "step": 29647 + }, + { + "epoch": 1.6144520906931312, + "grad_norm": 0.5670112969663105, + "learning_rate": 1.8864661282608954e-05, + "loss": 11.8084, + "step": 29648 + }, + { + "epoch": 1.6145065446897142, + "grad_norm": 0.543724242751264, + "learning_rate": 1.8859506832628848e-05, + "loss": 11.6512, + "step": 29649 + }, + { + "epoch": 1.6145609986862972, + "grad_norm": 0.6046962502639556, + "learning_rate": 1.885435301361079e-05, + "loss": 11.881, + "step": 29650 + }, + { + "epoch": 1.6146154526828802, + "grad_norm": 0.5295983012501382, + "learning_rate": 1.8849199825594922e-05, + "loss": 11.6617, + "step": 29651 + }, + { + "epoch": 1.6146699066794632, + "grad_norm": 0.5356367137513399, + "learning_rate": 1.8844047268621256e-05, + "loss": 11.8382, + "step": 29652 + }, + { + "epoch": 1.6147243606760464, + "grad_norm": 0.5692716482021974, + "learning_rate": 1.8838895342729913e-05, + "loss": 11.7674, + "step": 29653 + }, + { + "epoch": 1.6147788146726294, + "grad_norm": 0.5406897308559924, + "learning_rate": 1.8833744047960912e-05, + "loss": 11.832, + "step": 29654 + }, + { + "epoch": 1.6148332686692124, + "grad_norm": 0.5131634859804876, + "learning_rate": 1.882859338435434e-05, + "loss": 11.7801, + "step": 29655 + }, + { + "epoch": 1.6148877226657954, + "grad_norm": 0.5414352540346842, + "learning_rate": 1.8823443351950254e-05, + "loss": 11.8916, + "step": 29656 + }, + { + "epoch": 1.6149421766623784, + "grad_norm": 0.6039948249554931, + "learning_rate": 1.8818293950788656e-05, + "loss": 11.8279, + "step": 29657 + }, + { + "epoch": 1.6149966306589614, + "grad_norm": 0.5631411197719803, + "learning_rate": 1.881314518090964e-05, + "loss": 11.771, + "step": 29658 + }, + { + "epoch": 1.6150510846555446, + "grad_norm": 0.5250399020585735, + "learning_rate": 1.8807997042353198e-05, + "loss": 11.7496, + "step": 29659 + }, + { + "epoch": 1.6151055386521276, + "grad_norm": 0.559565163232491, + "learning_rate": 1.8802849535159393e-05, + "loss": 11.8464, + "step": 29660 + }, + { + "epoch": 1.6151599926487106, + "grad_norm": 0.5732934200833762, + "learning_rate": 1.8797702659368265e-05, + "loss": 11.8802, + "step": 29661 + }, + { + "epoch": 1.6152144466452936, + "grad_norm": 0.5437281490005131, + "learning_rate": 1.879255641501979e-05, + "loss": 11.8055, + "step": 29662 + }, + { + "epoch": 1.6152689006418766, + "grad_norm": 0.5146443961078804, + "learning_rate": 1.8787410802154048e-05, + "loss": 11.7519, + "step": 29663 + }, + { + "epoch": 1.6153233546384596, + "grad_norm": 0.5901833914398004, + "learning_rate": 1.8782265820810995e-05, + "loss": 11.824, + "step": 29664 + }, + { + "epoch": 1.6153778086350425, + "grad_norm": 0.49409431228392275, + "learning_rate": 1.877712147103069e-05, + "loss": 11.8383, + "step": 29665 + }, + { + "epoch": 1.6154322626316255, + "grad_norm": 0.5510657008505888, + "learning_rate": 1.87719777528531e-05, + "loss": 11.843, + "step": 29666 + }, + { + "epoch": 1.6154867166282085, + "grad_norm": 0.5682934847729291, + "learning_rate": 1.876683466631821e-05, + "loss": 11.8976, + "step": 29667 + }, + { + "epoch": 1.6155411706247915, + "grad_norm": 0.5586429230938673, + "learning_rate": 1.8761692211466063e-05, + "loss": 11.6376, + "step": 29668 + }, + { + "epoch": 1.6155956246213745, + "grad_norm": 0.5575927369629163, + "learning_rate": 1.8756550388336603e-05, + "loss": 11.8644, + "step": 29669 + }, + { + "epoch": 1.6156500786179575, + "grad_norm": 0.5522321182844298, + "learning_rate": 1.8751409196969817e-05, + "loss": 11.8512, + "step": 29670 + }, + { + "epoch": 1.6157045326145405, + "grad_norm": 0.5691194886311696, + "learning_rate": 1.874626863740574e-05, + "loss": 11.7803, + "step": 29671 + }, + { + "epoch": 1.6157589866111235, + "grad_norm": 0.5387213643850569, + "learning_rate": 1.8741128709684264e-05, + "loss": 11.8652, + "step": 29672 + }, + { + "epoch": 1.6158134406077065, + "grad_norm": 0.5629550626198992, + "learning_rate": 1.8735989413845432e-05, + "loss": 11.8926, + "step": 29673 + }, + { + "epoch": 1.6158678946042895, + "grad_norm": 0.5400894612339091, + "learning_rate": 1.873085074992915e-05, + "loss": 11.6906, + "step": 29674 + }, + { + "epoch": 1.6159223486008725, + "grad_norm": 0.5797810667066575, + "learning_rate": 1.8725712717975418e-05, + "loss": 11.8288, + "step": 29675 + }, + { + "epoch": 1.6159768025974557, + "grad_norm": 0.583512170594587, + "learning_rate": 1.872057531802419e-05, + "loss": 11.9047, + "step": 29676 + }, + { + "epoch": 1.6160312565940387, + "grad_norm": 0.5796086398446647, + "learning_rate": 1.871543855011536e-05, + "loss": 11.8306, + "step": 29677 + }, + { + "epoch": 1.6160857105906217, + "grad_norm": 0.6026315869755731, + "learning_rate": 1.871030241428894e-05, + "loss": 11.9264, + "step": 29678 + }, + { + "epoch": 1.6161401645872047, + "grad_norm": 0.5549657449014572, + "learning_rate": 1.8705166910584815e-05, + "loss": 11.8103, + "step": 29679 + }, + { + "epoch": 1.6161946185837877, + "grad_norm": 0.5021729527965968, + "learning_rate": 1.870003203904297e-05, + "loss": 11.7316, + "step": 29680 + }, + { + "epoch": 1.6162490725803706, + "grad_norm": 0.5175615033962297, + "learning_rate": 1.8694897799703282e-05, + "loss": 11.9144, + "step": 29681 + }, + { + "epoch": 1.6163035265769539, + "grad_norm": 0.5262945184296868, + "learning_rate": 1.8689764192605707e-05, + "loss": 11.8049, + "step": 29682 + }, + { + "epoch": 1.6163579805735369, + "grad_norm": 0.4973390433921258, + "learning_rate": 1.8684631217790194e-05, + "loss": 11.8239, + "step": 29683 + }, + { + "epoch": 1.6164124345701198, + "grad_norm": 0.5495698019580566, + "learning_rate": 1.8679498875296588e-05, + "loss": 11.808, + "step": 29684 + }, + { + "epoch": 1.6164668885667028, + "grad_norm": 0.5129640458517766, + "learning_rate": 1.8674367165164898e-05, + "loss": 11.8014, + "step": 29685 + }, + { + "epoch": 1.6165213425632858, + "grad_norm": 0.5501871845800342, + "learning_rate": 1.8669236087434904e-05, + "loss": 11.7824, + "step": 29686 + }, + { + "epoch": 1.6165757965598688, + "grad_norm": 0.5263849480792924, + "learning_rate": 1.866410564214658e-05, + "loss": 11.651, + "step": 29687 + }, + { + "epoch": 1.6166302505564518, + "grad_norm": 0.5654000764992241, + "learning_rate": 1.8658975829339832e-05, + "loss": 11.6673, + "step": 29688 + }, + { + "epoch": 1.6166847045530348, + "grad_norm": 0.5329454968180634, + "learning_rate": 1.8653846649054497e-05, + "loss": 11.7625, + "step": 29689 + }, + { + "epoch": 1.6167391585496178, + "grad_norm": 0.5874770734532969, + "learning_rate": 1.864871810133053e-05, + "loss": 11.8525, + "step": 29690 + }, + { + "epoch": 1.6167936125462008, + "grad_norm": 0.5889592799692791, + "learning_rate": 1.8643590186207737e-05, + "loss": 11.8978, + "step": 29691 + }, + { + "epoch": 1.6168480665427838, + "grad_norm": 0.5247054306891863, + "learning_rate": 1.8638462903726062e-05, + "loss": 11.7815, + "step": 29692 + }, + { + "epoch": 1.6169025205393668, + "grad_norm": 0.5405937891158469, + "learning_rate": 1.8633336253925314e-05, + "loss": 11.8829, + "step": 29693 + }, + { + "epoch": 1.6169569745359498, + "grad_norm": 0.5864781772542362, + "learning_rate": 1.862821023684539e-05, + "loss": 11.8152, + "step": 29694 + }, + { + "epoch": 1.6170114285325328, + "grad_norm": 0.5940501313197909, + "learning_rate": 1.8623084852526208e-05, + "loss": 11.9119, + "step": 29695 + }, + { + "epoch": 1.6170658825291158, + "grad_norm": 0.5226921153191229, + "learning_rate": 1.8617960101007504e-05, + "loss": 11.761, + "step": 29696 + }, + { + "epoch": 1.6171203365256988, + "grad_norm": 0.5452004933781307, + "learning_rate": 1.861283598232919e-05, + "loss": 11.7848, + "step": 29697 + }, + { + "epoch": 1.6171747905222817, + "grad_norm": 0.5318178540595372, + "learning_rate": 1.8607712496531148e-05, + "loss": 11.7976, + "step": 29698 + }, + { + "epoch": 1.617229244518865, + "grad_norm": 0.5477669497775027, + "learning_rate": 1.8602589643653144e-05, + "loss": 11.8655, + "step": 29699 + }, + { + "epoch": 1.617283698515448, + "grad_norm": 0.5344043123632418, + "learning_rate": 1.8597467423735092e-05, + "loss": 11.8005, + "step": 29700 + }, + { + "epoch": 1.617338152512031, + "grad_norm": 0.5575845681308145, + "learning_rate": 1.859234583681675e-05, + "loss": 11.852, + "step": 29701 + }, + { + "epoch": 1.617392606508614, + "grad_norm": 0.5262517809572043, + "learning_rate": 1.8587224882938003e-05, + "loss": 11.78, + "step": 29702 + }, + { + "epoch": 1.617447060505197, + "grad_norm": 0.5882528780569757, + "learning_rate": 1.8582104562138637e-05, + "loss": 11.838, + "step": 29703 + }, + { + "epoch": 1.61750151450178, + "grad_norm": 0.6035131653248957, + "learning_rate": 1.857698487445846e-05, + "loss": 11.8683, + "step": 29704 + }, + { + "epoch": 1.6175559684983631, + "grad_norm": 0.5144802107133795, + "learning_rate": 1.857186581993737e-05, + "loss": 11.7094, + "step": 29705 + }, + { + "epoch": 1.6176104224949461, + "grad_norm": 0.5667786980611123, + "learning_rate": 1.8566747398615048e-05, + "loss": 11.8018, + "step": 29706 + }, + { + "epoch": 1.6176648764915291, + "grad_norm": 0.5400424489562431, + "learning_rate": 1.8561629610531385e-05, + "loss": 11.8653, + "step": 29707 + }, + { + "epoch": 1.6177193304881121, + "grad_norm": 0.563215643402406, + "learning_rate": 1.8556512455726115e-05, + "loss": 11.9103, + "step": 29708 + }, + { + "epoch": 1.617773784484695, + "grad_norm": 0.5516286598899117, + "learning_rate": 1.8551395934239068e-05, + "loss": 11.7368, + "step": 29709 + }, + { + "epoch": 1.617828238481278, + "grad_norm": 0.5328982898114879, + "learning_rate": 1.8546280046110042e-05, + "loss": 11.7824, + "step": 29710 + }, + { + "epoch": 1.617882692477861, + "grad_norm": 0.5870227609709985, + "learning_rate": 1.8541164791378786e-05, + "loss": 11.9064, + "step": 29711 + }, + { + "epoch": 1.617937146474444, + "grad_norm": 0.5749743102814574, + "learning_rate": 1.8536050170085117e-05, + "loss": 11.9272, + "step": 29712 + }, + { + "epoch": 1.617991600471027, + "grad_norm": 0.5111909105755295, + "learning_rate": 1.8530936182268755e-05, + "loss": 11.6833, + "step": 29713 + }, + { + "epoch": 1.61804605446761, + "grad_norm": 0.5118690492104214, + "learning_rate": 1.852582282796952e-05, + "loss": 11.7455, + "step": 29714 + }, + { + "epoch": 1.618100508464193, + "grad_norm": 0.5177070100658223, + "learning_rate": 1.8520710107227156e-05, + "loss": 11.8511, + "step": 29715 + }, + { + "epoch": 1.618154962460776, + "grad_norm": 0.5429501437094439, + "learning_rate": 1.8515598020081383e-05, + "loss": 11.9224, + "step": 29716 + }, + { + "epoch": 1.618209416457359, + "grad_norm": 0.5475021896792563, + "learning_rate": 1.8510486566572017e-05, + "loss": 11.6796, + "step": 29717 + }, + { + "epoch": 1.618263870453942, + "grad_norm": 0.5924743382317234, + "learning_rate": 1.850537574673874e-05, + "loss": 11.8485, + "step": 29718 + }, + { + "epoch": 1.618318324450525, + "grad_norm": 0.5013354626793133, + "learning_rate": 1.8500265560621334e-05, + "loss": 11.8067, + "step": 29719 + }, + { + "epoch": 1.618372778447108, + "grad_norm": 0.5114146283955324, + "learning_rate": 1.8495156008259552e-05, + "loss": 11.6471, + "step": 29720 + }, + { + "epoch": 1.618427232443691, + "grad_norm": 0.5786062154256025, + "learning_rate": 1.8490047089693084e-05, + "loss": 11.934, + "step": 29721 + }, + { + "epoch": 1.618481686440274, + "grad_norm": 0.5626412667315792, + "learning_rate": 1.8484938804961706e-05, + "loss": 11.8384, + "step": 29722 + }, + { + "epoch": 1.6185361404368572, + "grad_norm": 0.5544560528376635, + "learning_rate": 1.8479831154105075e-05, + "loss": 11.6767, + "step": 29723 + }, + { + "epoch": 1.6185905944334402, + "grad_norm": 0.5589589299532259, + "learning_rate": 1.847472413716298e-05, + "loss": 11.8483, + "step": 29724 + }, + { + "epoch": 1.6186450484300232, + "grad_norm": 0.5928141170937883, + "learning_rate": 1.8469617754175107e-05, + "loss": 11.7703, + "step": 29725 + }, + { + "epoch": 1.6186995024266062, + "grad_norm": 0.5576237658610851, + "learning_rate": 1.8464512005181133e-05, + "loss": 11.8894, + "step": 29726 + }, + { + "epoch": 1.6187539564231892, + "grad_norm": 0.5290010663145447, + "learning_rate": 1.845940689022081e-05, + "loss": 11.7963, + "step": 29727 + }, + { + "epoch": 1.6188084104197722, + "grad_norm": 0.5617072598563205, + "learning_rate": 1.845430240933379e-05, + "loss": 11.7636, + "step": 29728 + }, + { + "epoch": 1.6188628644163554, + "grad_norm": 0.5955959921049784, + "learning_rate": 1.844919856255981e-05, + "loss": 11.879, + "step": 29729 + }, + { + "epoch": 1.6189173184129384, + "grad_norm": 0.5005111701661928, + "learning_rate": 1.8444095349938518e-05, + "loss": 11.8425, + "step": 29730 + }, + { + "epoch": 1.6189717724095214, + "grad_norm": 0.5753867860748185, + "learning_rate": 1.843899277150961e-05, + "loss": 11.8455, + "step": 29731 + }, + { + "epoch": 1.6190262264061044, + "grad_norm": 0.6022810675614024, + "learning_rate": 1.8433890827312806e-05, + "loss": 11.7727, + "step": 29732 + }, + { + "epoch": 1.6190806804026874, + "grad_norm": 0.5737669885927515, + "learning_rate": 1.8428789517387723e-05, + "loss": 11.6892, + "step": 29733 + }, + { + "epoch": 1.6191351343992704, + "grad_norm": 0.5383511411239242, + "learning_rate": 1.8423688841774067e-05, + "loss": 11.7437, + "step": 29734 + }, + { + "epoch": 1.6191895883958534, + "grad_norm": 0.6560133762952048, + "learning_rate": 1.8418588800511493e-05, + "loss": 11.9499, + "step": 29735 + }, + { + "epoch": 1.6192440423924364, + "grad_norm": 0.5467763424292883, + "learning_rate": 1.841348939363963e-05, + "loss": 11.8131, + "step": 29736 + }, + { + "epoch": 1.6192984963890193, + "grad_norm": 0.5654688761193011, + "learning_rate": 1.840839062119818e-05, + "loss": 11.6356, + "step": 29737 + }, + { + "epoch": 1.6193529503856023, + "grad_norm": 0.5736787912331952, + "learning_rate": 1.840329248322673e-05, + "loss": 11.7675, + "step": 29738 + }, + { + "epoch": 1.6194074043821853, + "grad_norm": 0.7146954815898924, + "learning_rate": 1.8398194979765005e-05, + "loss": 11.8188, + "step": 29739 + }, + { + "epoch": 1.6194618583787683, + "grad_norm": 0.5270226949178375, + "learning_rate": 1.839309811085257e-05, + "loss": 11.651, + "step": 29740 + }, + { + "epoch": 1.6195163123753513, + "grad_norm": 0.5697433592279761, + "learning_rate": 1.8388001876529105e-05, + "loss": 11.8665, + "step": 29741 + }, + { + "epoch": 1.6195707663719343, + "grad_norm": 0.5792422543746626, + "learning_rate": 1.8382906276834212e-05, + "loss": 11.8301, + "step": 29742 + }, + { + "epoch": 1.6196252203685173, + "grad_norm": 0.5314354702809112, + "learning_rate": 1.8377811311807514e-05, + "loss": 11.7925, + "step": 29743 + }, + { + "epoch": 1.6196796743651003, + "grad_norm": 0.5408031908187323, + "learning_rate": 1.837271698148868e-05, + "loss": 11.9217, + "step": 29744 + }, + { + "epoch": 1.6197341283616833, + "grad_norm": 0.5682392557677713, + "learning_rate": 1.836762328591728e-05, + "loss": 11.7606, + "step": 29745 + }, + { + "epoch": 1.6197885823582665, + "grad_norm": 0.47152443855339354, + "learning_rate": 1.8362530225132902e-05, + "loss": 11.817, + "step": 29746 + }, + { + "epoch": 1.6198430363548495, + "grad_norm": 0.5890601620756605, + "learning_rate": 1.8357437799175216e-05, + "loss": 11.9369, + "step": 29747 + }, + { + "epoch": 1.6198974903514325, + "grad_norm": 0.512614243080736, + "learning_rate": 1.8352346008083753e-05, + "loss": 11.7113, + "step": 29748 + }, + { + "epoch": 1.6199519443480155, + "grad_norm": 0.5001521175061336, + "learning_rate": 1.8347254851898166e-05, + "loss": 11.7671, + "step": 29749 + }, + { + "epoch": 1.6200063983445985, + "grad_norm": 0.5526447643640972, + "learning_rate": 1.8342164330657997e-05, + "loss": 11.6839, + "step": 29750 + }, + { + "epoch": 1.6200608523411815, + "grad_norm": 0.5244986183923309, + "learning_rate": 1.833707444440288e-05, + "loss": 11.6831, + "step": 29751 + }, + { + "epoch": 1.6201153063377647, + "grad_norm": 0.5890321091260146, + "learning_rate": 1.833198519317233e-05, + "loss": 11.8685, + "step": 29752 + }, + { + "epoch": 1.6201697603343477, + "grad_norm": 0.5218310798077371, + "learning_rate": 1.832689657700597e-05, + "loss": 11.7616, + "step": 29753 + }, + { + "epoch": 1.6202242143309307, + "grad_norm": 0.5239361598665088, + "learning_rate": 1.832180859594338e-05, + "loss": 11.7665, + "step": 29754 + }, + { + "epoch": 1.6202786683275137, + "grad_norm": 0.5561360692793038, + "learning_rate": 1.8316721250024093e-05, + "loss": 11.8565, + "step": 29755 + }, + { + "epoch": 1.6203331223240967, + "grad_norm": 0.58588166861285, + "learning_rate": 1.8311634539287692e-05, + "loss": 11.8331, + "step": 29756 + }, + { + "epoch": 1.6203875763206796, + "grad_norm": 0.47225103007604813, + "learning_rate": 1.83065484637737e-05, + "loss": 11.7213, + "step": 29757 + }, + { + "epoch": 1.6204420303172626, + "grad_norm": 0.5730737085838575, + "learning_rate": 1.830146302352168e-05, + "loss": 11.7649, + "step": 29758 + }, + { + "epoch": 1.6204964843138456, + "grad_norm": 0.480864723884432, + "learning_rate": 1.8296378218571207e-05, + "loss": 11.7468, + "step": 29759 + }, + { + "epoch": 1.6205509383104286, + "grad_norm": 0.6027636164304947, + "learning_rate": 1.829129404896177e-05, + "loss": 11.8949, + "step": 29760 + }, + { + "epoch": 1.6206053923070116, + "grad_norm": 0.515908200547813, + "learning_rate": 1.8286210514732973e-05, + "loss": 11.6664, + "step": 29761 + }, + { + "epoch": 1.6206598463035946, + "grad_norm": 0.5588522111663349, + "learning_rate": 1.8281127615924264e-05, + "loss": 11.7865, + "step": 29762 + }, + { + "epoch": 1.6207143003001776, + "grad_norm": 0.6389388471645864, + "learning_rate": 1.8276045352575244e-05, + "loss": 11.8541, + "step": 29763 + }, + { + "epoch": 1.6207687542967606, + "grad_norm": 0.5136481223944924, + "learning_rate": 1.8270963724725375e-05, + "loss": 11.8994, + "step": 29764 + }, + { + "epoch": 1.6208232082933436, + "grad_norm": 0.5349080581632016, + "learning_rate": 1.826588273241423e-05, + "loss": 11.7461, + "step": 29765 + }, + { + "epoch": 1.6208776622899266, + "grad_norm": 0.5986740150879933, + "learning_rate": 1.8260802375681286e-05, + "loss": 11.9312, + "step": 29766 + }, + { + "epoch": 1.6209321162865096, + "grad_norm": 0.5560229739189517, + "learning_rate": 1.825572265456602e-05, + "loss": 11.8586, + "step": 29767 + }, + { + "epoch": 1.6209865702830926, + "grad_norm": 0.582516417404331, + "learning_rate": 1.825064356910796e-05, + "loss": 11.8453, + "step": 29768 + }, + { + "epoch": 1.6210410242796758, + "grad_norm": 0.6281607864308909, + "learning_rate": 1.8245565119346632e-05, + "loss": 11.795, + "step": 29769 + }, + { + "epoch": 1.6210954782762588, + "grad_norm": 0.5695811605645611, + "learning_rate": 1.8240487305321473e-05, + "loss": 11.7029, + "step": 29770 + }, + { + "epoch": 1.6211499322728418, + "grad_norm": 0.5451215618444492, + "learning_rate": 1.8235410127072027e-05, + "loss": 11.7372, + "step": 29771 + }, + { + "epoch": 1.6212043862694248, + "grad_norm": 0.5705223579328994, + "learning_rate": 1.8230333584637716e-05, + "loss": 11.9475, + "step": 29772 + }, + { + "epoch": 1.6212588402660078, + "grad_norm": 0.5490309796689397, + "learning_rate": 1.8225257678058072e-05, + "loss": 11.8289, + "step": 29773 + }, + { + "epoch": 1.6213132942625907, + "grad_norm": 0.5834407945646138, + "learning_rate": 1.822018240737251e-05, + "loss": 11.9484, + "step": 29774 + }, + { + "epoch": 1.621367748259174, + "grad_norm": 0.5774574060540624, + "learning_rate": 1.8215107772620554e-05, + "loss": 11.9275, + "step": 29775 + }, + { + "epoch": 1.621422202255757, + "grad_norm": 0.5749672081949425, + "learning_rate": 1.8210033773841627e-05, + "loss": 11.8054, + "step": 29776 + }, + { + "epoch": 1.62147665625234, + "grad_norm": 0.5108241365433569, + "learning_rate": 1.820496041107518e-05, + "loss": 11.7704, + "step": 29777 + }, + { + "epoch": 1.621531110248923, + "grad_norm": 0.5355894214491904, + "learning_rate": 1.8199887684360696e-05, + "loss": 11.7478, + "step": 29778 + }, + { + "epoch": 1.621585564245506, + "grad_norm": 0.5344683838749086, + "learning_rate": 1.819481559373757e-05, + "loss": 11.721, + "step": 29779 + }, + { + "epoch": 1.621640018242089, + "grad_norm": 0.5484251651408244, + "learning_rate": 1.818974413924528e-05, + "loss": 11.7141, + "step": 29780 + }, + { + "epoch": 1.621694472238672, + "grad_norm": 0.5330372194509918, + "learning_rate": 1.8184673320923296e-05, + "loss": 11.847, + "step": 29781 + }, + { + "epoch": 1.621748926235255, + "grad_norm": 0.54969047449176, + "learning_rate": 1.8179603138810973e-05, + "loss": 11.9113, + "step": 29782 + }, + { + "epoch": 1.621803380231838, + "grad_norm": 0.5540264519426792, + "learning_rate": 1.8174533592947817e-05, + "loss": 11.8589, + "step": 29783 + }, + { + "epoch": 1.621857834228421, + "grad_norm": 0.5027733970076889, + "learning_rate": 1.8169464683373172e-05, + "loss": 11.8258, + "step": 29784 + }, + { + "epoch": 1.6219122882250039, + "grad_norm": 0.5338662523738564, + "learning_rate": 1.8164396410126527e-05, + "loss": 11.8409, + "step": 29785 + }, + { + "epoch": 1.6219667422215869, + "grad_norm": 0.5355778796105455, + "learning_rate": 1.8159328773247254e-05, + "loss": 11.6876, + "step": 29786 + }, + { + "epoch": 1.6220211962181699, + "grad_norm": 0.5506582771023127, + "learning_rate": 1.8154261772774738e-05, + "loss": 11.7884, + "step": 29787 + }, + { + "epoch": 1.6220756502147529, + "grad_norm": 0.5706165051098933, + "learning_rate": 1.8149195408748443e-05, + "loss": 11.8531, + "step": 29788 + }, + { + "epoch": 1.6221301042113359, + "grad_norm": 0.5373574049088574, + "learning_rate": 1.8144129681207688e-05, + "loss": 11.7053, + "step": 29789 + }, + { + "epoch": 1.6221845582079188, + "grad_norm": 0.5458616252224401, + "learning_rate": 1.8139064590191946e-05, + "loss": 11.7965, + "step": 29790 + }, + { + "epoch": 1.6222390122045018, + "grad_norm": 0.5840944206279342, + "learning_rate": 1.8134000135740536e-05, + "loss": 11.8291, + "step": 29791 + }, + { + "epoch": 1.6222934662010848, + "grad_norm": 0.5795284200751835, + "learning_rate": 1.8128936317892875e-05, + "loss": 11.7689, + "step": 29792 + }, + { + "epoch": 1.622347920197668, + "grad_norm": 0.5292672808196972, + "learning_rate": 1.8123873136688364e-05, + "loss": 11.774, + "step": 29793 + }, + { + "epoch": 1.622402374194251, + "grad_norm": 0.505826063359344, + "learning_rate": 1.8118810592166314e-05, + "loss": 11.8555, + "step": 29794 + }, + { + "epoch": 1.622456828190834, + "grad_norm": 0.5177697343648606, + "learning_rate": 1.811374868436615e-05, + "loss": 11.7862, + "step": 29795 + }, + { + "epoch": 1.622511282187417, + "grad_norm": 0.6289377222011117, + "learning_rate": 1.810868741332721e-05, + "loss": 11.8154, + "step": 29796 + }, + { + "epoch": 1.622565736184, + "grad_norm": 0.5185660813581372, + "learning_rate": 1.810362677908882e-05, + "loss": 11.7476, + "step": 29797 + }, + { + "epoch": 1.622620190180583, + "grad_norm": 0.8677192061925832, + "learning_rate": 1.8098566781690397e-05, + "loss": 11.6404, + "step": 29798 + }, + { + "epoch": 1.6226746441771662, + "grad_norm": 0.5562605901423329, + "learning_rate": 1.8093507421171217e-05, + "loss": 11.8005, + "step": 29799 + }, + { + "epoch": 1.6227290981737492, + "grad_norm": 0.5416724699517385, + "learning_rate": 1.8088448697570693e-05, + "loss": 11.8058, + "step": 29800 + }, + { + "epoch": 1.6227835521703322, + "grad_norm": 0.5413622332538577, + "learning_rate": 1.8083390610928097e-05, + "loss": 11.8775, + "step": 29801 + }, + { + "epoch": 1.6228380061669152, + "grad_norm": 0.5086184504807458, + "learning_rate": 1.807833316128279e-05, + "loss": 11.8459, + "step": 29802 + }, + { + "epoch": 1.6228924601634982, + "grad_norm": 0.5248310507755106, + "learning_rate": 1.807327634867414e-05, + "loss": 11.7233, + "step": 29803 + }, + { + "epoch": 1.6229469141600812, + "grad_norm": 0.5570920371789084, + "learning_rate": 1.8068220173141394e-05, + "loss": 11.8058, + "step": 29804 + }, + { + "epoch": 1.6230013681566642, + "grad_norm": 0.504956431398303, + "learning_rate": 1.806316463472394e-05, + "loss": 11.7385, + "step": 29805 + }, + { + "epoch": 1.6230558221532472, + "grad_norm": 0.5343066618336901, + "learning_rate": 1.805810973346106e-05, + "loss": 11.8999, + "step": 29806 + }, + { + "epoch": 1.6231102761498302, + "grad_norm": 0.5745609893400554, + "learning_rate": 1.8053055469392034e-05, + "loss": 11.9275, + "step": 29807 + }, + { + "epoch": 1.6231647301464132, + "grad_norm": 0.5751834413970852, + "learning_rate": 1.8048001842556216e-05, + "loss": 11.8573, + "step": 29808 + }, + { + "epoch": 1.6232191841429962, + "grad_norm": 0.5415861978949785, + "learning_rate": 1.804294885299286e-05, + "loss": 11.792, + "step": 29809 + }, + { + "epoch": 1.6232736381395791, + "grad_norm": 0.5979319586553563, + "learning_rate": 1.8037896500741292e-05, + "loss": 11.8604, + "step": 29810 + }, + { + "epoch": 1.6233280921361621, + "grad_norm": 0.5332030842511283, + "learning_rate": 1.8032844785840765e-05, + "loss": 11.825, + "step": 29811 + }, + { + "epoch": 1.6233825461327451, + "grad_norm": 0.5672826300173477, + "learning_rate": 1.802779370833061e-05, + "loss": 11.7771, + "step": 29812 + }, + { + "epoch": 1.6234370001293281, + "grad_norm": 0.5456811727174755, + "learning_rate": 1.8022743268250053e-05, + "loss": 11.7762, + "step": 29813 + }, + { + "epoch": 1.6234914541259111, + "grad_norm": 0.5188778840167558, + "learning_rate": 1.8017693465638385e-05, + "loss": 11.8128, + "step": 29814 + }, + { + "epoch": 1.623545908122494, + "grad_norm": 0.6029305953430277, + "learning_rate": 1.8012644300534908e-05, + "loss": 11.9242, + "step": 29815 + }, + { + "epoch": 1.6236003621190773, + "grad_norm": 0.576940166603129, + "learning_rate": 1.800759577297887e-05, + "loss": 11.8113, + "step": 29816 + }, + { + "epoch": 1.6236548161156603, + "grad_norm": 0.5358719089609392, + "learning_rate": 1.800254788300948e-05, + "loss": 11.8687, + "step": 29817 + }, + { + "epoch": 1.6237092701122433, + "grad_norm": 0.5202429387029124, + "learning_rate": 1.7997500630666053e-05, + "loss": 11.6968, + "step": 29818 + }, + { + "epoch": 1.6237637241088263, + "grad_norm": 0.5431275583205528, + "learning_rate": 1.7992454015987793e-05, + "loss": 11.7837, + "step": 29819 + }, + { + "epoch": 1.6238181781054093, + "grad_norm": 0.5793584771153201, + "learning_rate": 1.7987408039013987e-05, + "loss": 11.7659, + "step": 29820 + }, + { + "epoch": 1.6238726321019923, + "grad_norm": 0.5579550905480155, + "learning_rate": 1.7982362699783827e-05, + "loss": 11.6604, + "step": 29821 + }, + { + "epoch": 1.6239270860985755, + "grad_norm": 0.5744584766801781, + "learning_rate": 1.7977317998336597e-05, + "loss": 12.0137, + "step": 29822 + }, + { + "epoch": 1.6239815400951585, + "grad_norm": 0.6315104965522051, + "learning_rate": 1.797227393471147e-05, + "loss": 11.8804, + "step": 29823 + }, + { + "epoch": 1.6240359940917415, + "grad_norm": 0.562141376184874, + "learning_rate": 1.7967230508947687e-05, + "loss": 11.8801, + "step": 29824 + }, + { + "epoch": 1.6240904480883245, + "grad_norm": 0.538474797622595, + "learning_rate": 1.796218772108451e-05, + "loss": 11.7563, + "step": 29825 + }, + { + "epoch": 1.6241449020849075, + "grad_norm": 0.5280226524983459, + "learning_rate": 1.7957145571161117e-05, + "loss": 11.8238, + "step": 29826 + }, + { + "epoch": 1.6241993560814905, + "grad_norm": 0.5367644806566167, + "learning_rate": 1.7952104059216722e-05, + "loss": 11.8238, + "step": 29827 + }, + { + "epoch": 1.6242538100780735, + "grad_norm": 0.536063174405373, + "learning_rate": 1.7947063185290502e-05, + "loss": 11.8473, + "step": 29828 + }, + { + "epoch": 1.6243082640746564, + "grad_norm": 0.5080321495151783, + "learning_rate": 1.7942022949421665e-05, + "loss": 11.8336, + "step": 29829 + }, + { + "epoch": 1.6243627180712394, + "grad_norm": 0.5543109604420468, + "learning_rate": 1.793698335164946e-05, + "loss": 11.8157, + "step": 29830 + }, + { + "epoch": 1.6244171720678224, + "grad_norm": 0.5446841505257469, + "learning_rate": 1.7931944392013e-05, + "loss": 11.7777, + "step": 29831 + }, + { + "epoch": 1.6244716260644054, + "grad_norm": 0.5390297565914725, + "learning_rate": 1.792690607055153e-05, + "loss": 11.6937, + "step": 29832 + }, + { + "epoch": 1.6245260800609884, + "grad_norm": 0.6024428999512876, + "learning_rate": 1.7921868387304187e-05, + "loss": 11.715, + "step": 29833 + }, + { + "epoch": 1.6245805340575714, + "grad_norm": 0.4685025058250226, + "learning_rate": 1.791683134231017e-05, + "loss": 11.8521, + "step": 29834 + }, + { + "epoch": 1.6246349880541544, + "grad_norm": 0.537252562932653, + "learning_rate": 1.7911794935608616e-05, + "loss": 11.8172, + "step": 29835 + }, + { + "epoch": 1.6246894420507374, + "grad_norm": 0.5740644328894018, + "learning_rate": 1.790675916723874e-05, + "loss": 11.7572, + "step": 29836 + }, + { + "epoch": 1.6247438960473204, + "grad_norm": 0.49532925050483767, + "learning_rate": 1.7901724037239675e-05, + "loss": 11.7963, + "step": 29837 + }, + { + "epoch": 1.6247983500439034, + "grad_norm": 0.4812212648939318, + "learning_rate": 1.7896689545650548e-05, + "loss": 11.6105, + "step": 29838 + }, + { + "epoch": 1.6248528040404866, + "grad_norm": 0.5808507902498412, + "learning_rate": 1.7891655692510547e-05, + "loss": 11.7792, + "step": 29839 + }, + { + "epoch": 1.6249072580370696, + "grad_norm": 0.5716854153082225, + "learning_rate": 1.7886622477858783e-05, + "loss": 11.8356, + "step": 29840 + }, + { + "epoch": 1.6249617120336526, + "grad_norm": 0.5438784820571364, + "learning_rate": 1.7881589901734408e-05, + "loss": 11.7404, + "step": 29841 + }, + { + "epoch": 1.6250161660302356, + "grad_norm": 0.5522321049595004, + "learning_rate": 1.7876557964176587e-05, + "loss": 11.8787, + "step": 29842 + }, + { + "epoch": 1.6250706200268186, + "grad_norm": 0.5385128981210782, + "learning_rate": 1.7871526665224404e-05, + "loss": 11.9742, + "step": 29843 + }, + { + "epoch": 1.6251250740234016, + "grad_norm": 0.5642394369165362, + "learning_rate": 1.786649600491702e-05, + "loss": 11.8231, + "step": 29844 + }, + { + "epoch": 1.6251795280199848, + "grad_norm": 0.5282025949759572, + "learning_rate": 1.7861465983293514e-05, + "loss": 11.8951, + "step": 29845 + }, + { + "epoch": 1.6252339820165678, + "grad_norm": 0.5067521434221105, + "learning_rate": 1.7856436600393046e-05, + "loss": 11.8502, + "step": 29846 + }, + { + "epoch": 1.6252884360131508, + "grad_norm": 0.5880429059922319, + "learning_rate": 1.785140785625471e-05, + "loss": 11.8383, + "step": 29847 + }, + { + "epoch": 1.6253428900097338, + "grad_norm": 0.532984901509907, + "learning_rate": 1.7846379750917576e-05, + "loss": 11.8345, + "step": 29848 + }, + { + "epoch": 1.6253973440063167, + "grad_norm": 0.5544920082952747, + "learning_rate": 1.7841352284420786e-05, + "loss": 11.7894, + "step": 29849 + }, + { + "epoch": 1.6254517980028997, + "grad_norm": 0.5217674602501226, + "learning_rate": 1.7836325456803404e-05, + "loss": 11.7569, + "step": 29850 + }, + { + "epoch": 1.6255062519994827, + "grad_norm": 0.5172372220391482, + "learning_rate": 1.783129926810453e-05, + "loss": 11.7676, + "step": 29851 + }, + { + "epoch": 1.6255607059960657, + "grad_norm": 0.5156677079660713, + "learning_rate": 1.7826273718363274e-05, + "loss": 11.6364, + "step": 29852 + }, + { + "epoch": 1.6256151599926487, + "grad_norm": 0.5500854937460841, + "learning_rate": 1.7821248807618674e-05, + "loss": 11.7937, + "step": 29853 + }, + { + "epoch": 1.6256696139892317, + "grad_norm": 0.6009562225192525, + "learning_rate": 1.7816224535909854e-05, + "loss": 11.8653, + "step": 29854 + }, + { + "epoch": 1.6257240679858147, + "grad_norm": 0.5897278157514485, + "learning_rate": 1.7811200903275826e-05, + "loss": 11.9097, + "step": 29855 + }, + { + "epoch": 1.6257785219823977, + "grad_norm": 0.5196931484303855, + "learning_rate": 1.7806177909755706e-05, + "loss": 11.7866, + "step": 29856 + }, + { + "epoch": 1.6258329759789807, + "grad_norm": 0.5259285433385095, + "learning_rate": 1.7801155555388528e-05, + "loss": 11.9078, + "step": 29857 + }, + { + "epoch": 1.6258874299755637, + "grad_norm": 0.5379667007032992, + "learning_rate": 1.7796133840213323e-05, + "loss": 11.8614, + "step": 29858 + }, + { + "epoch": 1.6259418839721467, + "grad_norm": 0.5596329701440825, + "learning_rate": 1.77911127642692e-05, + "loss": 11.7767, + "step": 29859 + }, + { + "epoch": 1.6259963379687297, + "grad_norm": 0.5160637975508303, + "learning_rate": 1.778609232759514e-05, + "loss": 11.8189, + "step": 29860 + }, + { + "epoch": 1.6260507919653127, + "grad_norm": 0.565438200037967, + "learning_rate": 1.7781072530230236e-05, + "loss": 11.7822, + "step": 29861 + }, + { + "epoch": 1.6261052459618957, + "grad_norm": 0.58634501271915, + "learning_rate": 1.7776053372213486e-05, + "loss": 11.8256, + "step": 29862 + }, + { + "epoch": 1.6261596999584789, + "grad_norm": 0.557731556141301, + "learning_rate": 1.7771034853583923e-05, + "loss": 11.7614, + "step": 29863 + }, + { + "epoch": 1.6262141539550619, + "grad_norm": 0.5498987337364991, + "learning_rate": 1.7766016974380606e-05, + "loss": 11.8434, + "step": 29864 + }, + { + "epoch": 1.6262686079516449, + "grad_norm": 0.5814591000650591, + "learning_rate": 1.7760999734642513e-05, + "loss": 11.8587, + "step": 29865 + }, + { + "epoch": 1.6263230619482278, + "grad_norm": 0.5252663649406915, + "learning_rate": 1.7755983134408703e-05, + "loss": 11.7993, + "step": 29866 + }, + { + "epoch": 1.6263775159448108, + "grad_norm": 0.5483473578770199, + "learning_rate": 1.775096717371816e-05, + "loss": 11.8305, + "step": 29867 + }, + { + "epoch": 1.626431969941394, + "grad_norm": 0.5179828168760365, + "learning_rate": 1.7745951852609865e-05, + "loss": 11.8467, + "step": 29868 + }, + { + "epoch": 1.626486423937977, + "grad_norm": 0.6214975217796868, + "learning_rate": 1.7740937171122864e-05, + "loss": 11.7297, + "step": 29869 + }, + { + "epoch": 1.62654087793456, + "grad_norm": 0.5395443085330129, + "learning_rate": 1.7735923129296107e-05, + "loss": 11.805, + "step": 29870 + }, + { + "epoch": 1.626595331931143, + "grad_norm": 0.6039017919913058, + "learning_rate": 1.7730909727168632e-05, + "loss": 11.8849, + "step": 29871 + }, + { + "epoch": 1.626649785927726, + "grad_norm": 0.5699838813347412, + "learning_rate": 1.7725896964779375e-05, + "loss": 11.8384, + "step": 29872 + }, + { + "epoch": 1.626704239924309, + "grad_norm": 0.5390644551119237, + "learning_rate": 1.772088484216734e-05, + "loss": 11.8132, + "step": 29873 + }, + { + "epoch": 1.626758693920892, + "grad_norm": 0.581202814320493, + "learning_rate": 1.7715873359371526e-05, + "loss": 11.885, + "step": 29874 + }, + { + "epoch": 1.626813147917475, + "grad_norm": 0.4968017133864991, + "learning_rate": 1.7710862516430858e-05, + "loss": 11.6651, + "step": 29875 + }, + { + "epoch": 1.626867601914058, + "grad_norm": 0.5312199139071078, + "learning_rate": 1.7705852313384374e-05, + "loss": 11.8351, + "step": 29876 + }, + { + "epoch": 1.626922055910641, + "grad_norm": 0.5009578686172519, + "learning_rate": 1.770084275027093e-05, + "loss": 11.773, + "step": 29877 + }, + { + "epoch": 1.626976509907224, + "grad_norm": 0.5437621951448142, + "learning_rate": 1.7695833827129538e-05, + "loss": 11.7318, + "step": 29878 + }, + { + "epoch": 1.627030963903807, + "grad_norm": 0.5263257599612827, + "learning_rate": 1.7690825543999167e-05, + "loss": 11.86, + "step": 29879 + }, + { + "epoch": 1.62708541790039, + "grad_norm": 0.529554063485207, + "learning_rate": 1.7685817900918723e-05, + "loss": 11.7946, + "step": 29880 + }, + { + "epoch": 1.627139871896973, + "grad_norm": 0.5981416094577404, + "learning_rate": 1.768081089792717e-05, + "loss": 11.9005, + "step": 29881 + }, + { + "epoch": 1.627194325893556, + "grad_norm": 0.589911703001778, + "learning_rate": 1.7675804535063424e-05, + "loss": 11.8847, + "step": 29882 + }, + { + "epoch": 1.627248779890139, + "grad_norm": 0.5577245965792366, + "learning_rate": 1.767079881236645e-05, + "loss": 11.843, + "step": 29883 + }, + { + "epoch": 1.627303233886722, + "grad_norm": 0.5370008557756406, + "learning_rate": 1.7665793729875125e-05, + "loss": 11.7251, + "step": 29884 + }, + { + "epoch": 1.627357687883305, + "grad_norm": 0.55426542388383, + "learning_rate": 1.76607892876284e-05, + "loss": 11.8755, + "step": 29885 + }, + { + "epoch": 1.6274121418798881, + "grad_norm": 0.5663218347031582, + "learning_rate": 1.7655785485665233e-05, + "loss": 11.7826, + "step": 29886 + }, + { + "epoch": 1.6274665958764711, + "grad_norm": 0.5194411723218578, + "learning_rate": 1.765078232402444e-05, + "loss": 11.7238, + "step": 29887 + }, + { + "epoch": 1.6275210498730541, + "grad_norm": 0.5452031387886116, + "learning_rate": 1.7645779802744965e-05, + "loss": 11.8034, + "step": 29888 + }, + { + "epoch": 1.6275755038696371, + "grad_norm": 0.625163580505798, + "learning_rate": 1.7640777921865746e-05, + "loss": 11.7914, + "step": 29889 + }, + { + "epoch": 1.6276299578662201, + "grad_norm": 0.562152490599785, + "learning_rate": 1.7635776681425632e-05, + "loss": 11.8573, + "step": 29890 + }, + { + "epoch": 1.627684411862803, + "grad_norm": 0.5631411373166973, + "learning_rate": 1.763077608146355e-05, + "loss": 11.8009, + "step": 29891 + }, + { + "epoch": 1.6277388658593863, + "grad_norm": 0.5004948928729843, + "learning_rate": 1.7625776122018333e-05, + "loss": 11.824, + "step": 29892 + }, + { + "epoch": 1.6277933198559693, + "grad_norm": 0.5780593933229375, + "learning_rate": 1.7620776803128936e-05, + "loss": 11.9005, + "step": 29893 + }, + { + "epoch": 1.6278477738525523, + "grad_norm": 0.543401389222193, + "learning_rate": 1.761577812483416e-05, + "loss": 11.7474, + "step": 29894 + }, + { + "epoch": 1.6279022278491353, + "grad_norm": 0.5595408395569539, + "learning_rate": 1.761078008717293e-05, + "loss": 11.8585, + "step": 29895 + }, + { + "epoch": 1.6279566818457183, + "grad_norm": 0.5635121012342469, + "learning_rate": 1.7605782690184103e-05, + "loss": 11.9676, + "step": 29896 + }, + { + "epoch": 1.6280111358423013, + "grad_norm": 0.5949180173017401, + "learning_rate": 1.7600785933906494e-05, + "loss": 11.9263, + "step": 29897 + }, + { + "epoch": 1.6280655898388843, + "grad_norm": 0.6192375097089666, + "learning_rate": 1.7595789818379017e-05, + "loss": 11.8609, + "step": 29898 + }, + { + "epoch": 1.6281200438354673, + "grad_norm": 0.5377687262894562, + "learning_rate": 1.7590794343640472e-05, + "loss": 11.7447, + "step": 29899 + }, + { + "epoch": 1.6281744978320503, + "grad_norm": 0.5328658151317262, + "learning_rate": 1.7585799509729727e-05, + "loss": 11.8867, + "step": 29900 + }, + { + "epoch": 1.6282289518286333, + "grad_norm": 0.6000130189421203, + "learning_rate": 1.7580805316685666e-05, + "loss": 11.7832, + "step": 29901 + }, + { + "epoch": 1.6282834058252162, + "grad_norm": 0.6347525803748942, + "learning_rate": 1.7575811764547045e-05, + "loss": 11.9967, + "step": 29902 + }, + { + "epoch": 1.6283378598217992, + "grad_norm": 0.507984416260509, + "learning_rate": 1.7570818853352766e-05, + "loss": 11.6879, + "step": 29903 + }, + { + "epoch": 1.6283923138183822, + "grad_norm": 0.5272719736698143, + "learning_rate": 1.75658265831416e-05, + "loss": 11.7738, + "step": 29904 + }, + { + "epoch": 1.6284467678149652, + "grad_norm": 0.5171362303261697, + "learning_rate": 1.756083495395241e-05, + "loss": 11.7618, + "step": 29905 + }, + { + "epoch": 1.6285012218115482, + "grad_norm": 0.573823690687866, + "learning_rate": 1.7555843965823992e-05, + "loss": 11.8225, + "step": 29906 + }, + { + "epoch": 1.6285556758081312, + "grad_norm": 0.6025332639191686, + "learning_rate": 1.7550853618795125e-05, + "loss": 11.8546, + "step": 29907 + }, + { + "epoch": 1.6286101298047142, + "grad_norm": 0.5876088671905414, + "learning_rate": 1.7545863912904693e-05, + "loss": 11.9995, + "step": 29908 + }, + { + "epoch": 1.6286645838012974, + "grad_norm": 0.6659103166319886, + "learning_rate": 1.7540874848191413e-05, + "loss": 11.7968, + "step": 29909 + }, + { + "epoch": 1.6287190377978804, + "grad_norm": 0.5783262871274817, + "learning_rate": 1.753588642469414e-05, + "loss": 11.7612, + "step": 29910 + }, + { + "epoch": 1.6287734917944634, + "grad_norm": 0.49121689900863397, + "learning_rate": 1.753089864245162e-05, + "loss": 11.7456, + "step": 29911 + }, + { + "epoch": 1.6288279457910464, + "grad_norm": 0.554789154861225, + "learning_rate": 1.7525911501502666e-05, + "loss": 11.6595, + "step": 29912 + }, + { + "epoch": 1.6288823997876294, + "grad_norm": 0.4976445127465602, + "learning_rate": 1.752092500188608e-05, + "loss": 11.8202, + "step": 29913 + }, + { + "epoch": 1.6289368537842124, + "grad_norm": 0.574762160355171, + "learning_rate": 1.7515939143640592e-05, + "loss": 11.7653, + "step": 29914 + }, + { + "epoch": 1.6289913077807956, + "grad_norm": 0.54272193204427, + "learning_rate": 1.7510953926805006e-05, + "loss": 11.8093, + "step": 29915 + }, + { + "epoch": 1.6290457617773786, + "grad_norm": 0.5370533238410519, + "learning_rate": 1.7505969351418083e-05, + "loss": 11.7391, + "step": 29916 + }, + { + "epoch": 1.6291002157739616, + "grad_norm": 0.5176173028365498, + "learning_rate": 1.7500985417518555e-05, + "loss": 11.7328, + "step": 29917 + }, + { + "epoch": 1.6291546697705446, + "grad_norm": 0.5533274271616844, + "learning_rate": 1.7496002125145217e-05, + "loss": 11.7789, + "step": 29918 + }, + { + "epoch": 1.6292091237671276, + "grad_norm": 0.5672350726348384, + "learning_rate": 1.7491019474336788e-05, + "loss": 11.8602, + "step": 29919 + }, + { + "epoch": 1.6292635777637106, + "grad_norm": 0.5632535034997334, + "learning_rate": 1.7486037465132042e-05, + "loss": 11.8678, + "step": 29920 + }, + { + "epoch": 1.6293180317602936, + "grad_norm": 0.5189339183346685, + "learning_rate": 1.7481056097569692e-05, + "loss": 11.8659, + "step": 29921 + }, + { + "epoch": 1.6293724857568765, + "grad_norm": 0.5896665295666436, + "learning_rate": 1.7476075371688484e-05, + "loss": 11.8734, + "step": 29922 + }, + { + "epoch": 1.6294269397534595, + "grad_norm": 0.5639517663215442, + "learning_rate": 1.7471095287527173e-05, + "loss": 11.8643, + "step": 29923 + }, + { + "epoch": 1.6294813937500425, + "grad_norm": 0.532895021995113, + "learning_rate": 1.7466115845124443e-05, + "loss": 11.783, + "step": 29924 + }, + { + "epoch": 1.6295358477466255, + "grad_norm": 0.5485912463971948, + "learning_rate": 1.746113704451906e-05, + "loss": 11.7881, + "step": 29925 + }, + { + "epoch": 1.6295903017432085, + "grad_norm": 0.5370351342773633, + "learning_rate": 1.7456158885749698e-05, + "loss": 11.8659, + "step": 29926 + }, + { + "epoch": 1.6296447557397915, + "grad_norm": 0.5385903916694121, + "learning_rate": 1.7451181368855108e-05, + "loss": 11.8075, + "step": 29927 + }, + { + "epoch": 1.6296992097363745, + "grad_norm": 0.5417105813799838, + "learning_rate": 1.7446204493873985e-05, + "loss": 11.8404, + "step": 29928 + }, + { + "epoch": 1.6297536637329575, + "grad_norm": 0.5365161137756895, + "learning_rate": 1.7441228260844977e-05, + "loss": 11.8451, + "step": 29929 + }, + { + "epoch": 1.6298081177295405, + "grad_norm": 0.5404649118432637, + "learning_rate": 1.7436252669806864e-05, + "loss": 11.6875, + "step": 29930 + }, + { + "epoch": 1.6298625717261235, + "grad_norm": 0.5416142284054279, + "learning_rate": 1.7431277720798266e-05, + "loss": 11.7902, + "step": 29931 + }, + { + "epoch": 1.6299170257227065, + "grad_norm": 0.5846514279150229, + "learning_rate": 1.7426303413857924e-05, + "loss": 11.7458, + "step": 29932 + }, + { + "epoch": 1.6299714797192897, + "grad_norm": 0.526056418719975, + "learning_rate": 1.742132974902446e-05, + "loss": 11.6389, + "step": 29933 + }, + { + "epoch": 1.6300259337158727, + "grad_norm": 0.5459289100184062, + "learning_rate": 1.74163567263366e-05, + "loss": 11.7018, + "step": 29934 + }, + { + "epoch": 1.6300803877124557, + "grad_norm": 0.4947918508224737, + "learning_rate": 1.7411384345833016e-05, + "loss": 11.775, + "step": 29935 + }, + { + "epoch": 1.6301348417090387, + "grad_norm": 0.5410534306996849, + "learning_rate": 1.7406412607552335e-05, + "loss": 11.7166, + "step": 29936 + }, + { + "epoch": 1.6301892957056217, + "grad_norm": 0.6530609830229392, + "learning_rate": 1.7401441511533268e-05, + "loss": 11.9182, + "step": 29937 + }, + { + "epoch": 1.6302437497022049, + "grad_norm": 0.5322161878776143, + "learning_rate": 1.7396471057814433e-05, + "loss": 11.8298, + "step": 29938 + }, + { + "epoch": 1.6302982036987879, + "grad_norm": 0.5067145283136489, + "learning_rate": 1.7391501246434472e-05, + "loss": 11.8654, + "step": 29939 + }, + { + "epoch": 1.6303526576953709, + "grad_norm": 0.5367518130966582, + "learning_rate": 1.7386532077432084e-05, + "loss": 11.7867, + "step": 29940 + }, + { + "epoch": 1.6304071116919538, + "grad_norm": 0.5940582748469746, + "learning_rate": 1.7381563550845836e-05, + "loss": 11.8191, + "step": 29941 + }, + { + "epoch": 1.6304615656885368, + "grad_norm": 0.5854313763757858, + "learning_rate": 1.7376595666714444e-05, + "loss": 11.7924, + "step": 29942 + }, + { + "epoch": 1.6305160196851198, + "grad_norm": 0.586362723890138, + "learning_rate": 1.737162842507647e-05, + "loss": 11.6086, + "step": 29943 + }, + { + "epoch": 1.6305704736817028, + "grad_norm": 0.513156294977651, + "learning_rate": 1.73666618259706e-05, + "loss": 11.7184, + "step": 29944 + }, + { + "epoch": 1.6306249276782858, + "grad_norm": 0.553937100181963, + "learning_rate": 1.736169586943539e-05, + "loss": 12.0196, + "step": 29945 + }, + { + "epoch": 1.6306793816748688, + "grad_norm": 0.5563188406615613, + "learning_rate": 1.7356730555509494e-05, + "loss": 11.8782, + "step": 29946 + }, + { + "epoch": 1.6307338356714518, + "grad_norm": 0.5886290555930446, + "learning_rate": 1.7351765884231575e-05, + "loss": 11.9051, + "step": 29947 + }, + { + "epoch": 1.6307882896680348, + "grad_norm": 0.5713424153146086, + "learning_rate": 1.7346801855640138e-05, + "loss": 11.9171, + "step": 29948 + }, + { + "epoch": 1.6308427436646178, + "grad_norm": 0.5629399749142001, + "learning_rate": 1.7341838469773818e-05, + "loss": 11.7363, + "step": 29949 + }, + { + "epoch": 1.6308971976612008, + "grad_norm": 0.5192354403945891, + "learning_rate": 1.7336875726671265e-05, + "loss": 11.9123, + "step": 29950 + }, + { + "epoch": 1.6309516516577838, + "grad_norm": 0.4996096798522264, + "learning_rate": 1.733191362637099e-05, + "loss": 11.8283, + "step": 29951 + }, + { + "epoch": 1.6310061056543668, + "grad_norm": 0.5517685381132719, + "learning_rate": 1.732695216891165e-05, + "loss": 11.7819, + "step": 29952 + }, + { + "epoch": 1.6310605596509498, + "grad_norm": 0.5743685710000227, + "learning_rate": 1.7321991354331768e-05, + "loss": 11.8194, + "step": 29953 + }, + { + "epoch": 1.6311150136475328, + "grad_norm": 0.502849034050244, + "learning_rate": 1.7317031182669975e-05, + "loss": 11.7695, + "step": 29954 + }, + { + "epoch": 1.6311694676441157, + "grad_norm": 0.557595428652575, + "learning_rate": 1.7312071653964778e-05, + "loss": 11.9792, + "step": 29955 + }, + { + "epoch": 1.631223921640699, + "grad_norm": 0.5255916282851473, + "learning_rate": 1.7307112768254785e-05, + "loss": 11.7699, + "step": 29956 + }, + { + "epoch": 1.631278375637282, + "grad_norm": 0.5822218988876974, + "learning_rate": 1.7302154525578607e-05, + "loss": 11.6688, + "step": 29957 + }, + { + "epoch": 1.631332829633865, + "grad_norm": 0.5970869466234149, + "learning_rate": 1.729719692597468e-05, + "loss": 11.9408, + "step": 29958 + }, + { + "epoch": 1.631387283630448, + "grad_norm": 0.5221473922026875, + "learning_rate": 1.729223996948166e-05, + "loss": 11.7715, + "step": 29959 + }, + { + "epoch": 1.631441737627031, + "grad_norm": 0.5946245539765873, + "learning_rate": 1.728728365613801e-05, + "loss": 11.7498, + "step": 29960 + }, + { + "epoch": 1.631496191623614, + "grad_norm": 0.5729044952909914, + "learning_rate": 1.728232798598233e-05, + "loss": 11.9128, + "step": 29961 + }, + { + "epoch": 1.6315506456201971, + "grad_norm": 0.6188856348056977, + "learning_rate": 1.727737295905315e-05, + "loss": 11.8017, + "step": 29962 + }, + { + "epoch": 1.6316050996167801, + "grad_norm": 0.5219248367702782, + "learning_rate": 1.727241857538897e-05, + "loss": 11.7027, + "step": 29963 + }, + { + "epoch": 1.6316595536133631, + "grad_norm": 0.5771117140186215, + "learning_rate": 1.7267464835028368e-05, + "loss": 11.757, + "step": 29964 + }, + { + "epoch": 1.6317140076099461, + "grad_norm": 0.534418423431781, + "learning_rate": 1.72625117380098e-05, + "loss": 11.8091, + "step": 29965 + }, + { + "epoch": 1.631768461606529, + "grad_norm": 0.5382609954575446, + "learning_rate": 1.725755928437185e-05, + "loss": 11.6708, + "step": 29966 + }, + { + "epoch": 1.631822915603112, + "grad_norm": 0.573478177342807, + "learning_rate": 1.725260747415299e-05, + "loss": 11.787, + "step": 29967 + }, + { + "epoch": 1.631877369599695, + "grad_norm": 0.5501560654196728, + "learning_rate": 1.7247656307391703e-05, + "loss": 11.8144, + "step": 29968 + }, + { + "epoch": 1.631931823596278, + "grad_norm": 0.5329772046645241, + "learning_rate": 1.7242705784126545e-05, + "loss": 11.7624, + "step": 29969 + }, + { + "epoch": 1.631986277592861, + "grad_norm": 0.5403872408862898, + "learning_rate": 1.723775590439596e-05, + "loss": 11.843, + "step": 29970 + }, + { + "epoch": 1.632040731589444, + "grad_norm": 0.6013170532934077, + "learning_rate": 1.7232806668238466e-05, + "loss": 11.8641, + "step": 29971 + }, + { + "epoch": 1.632095185586027, + "grad_norm": 0.5806003289194821, + "learning_rate": 1.7227858075692572e-05, + "loss": 11.8855, + "step": 29972 + }, + { + "epoch": 1.63214963958261, + "grad_norm": 0.5536369154366305, + "learning_rate": 1.722291012679671e-05, + "loss": 11.6345, + "step": 29973 + }, + { + "epoch": 1.632204093579193, + "grad_norm": 0.5051355613369085, + "learning_rate": 1.7217962821589405e-05, + "loss": 11.8649, + "step": 29974 + }, + { + "epoch": 1.632258547575776, + "grad_norm": 0.5162418393512304, + "learning_rate": 1.7213016160109075e-05, + "loss": 11.8208, + "step": 29975 + }, + { + "epoch": 1.632313001572359, + "grad_norm": 0.5359214026692442, + "learning_rate": 1.720807014239425e-05, + "loss": 11.8198, + "step": 29976 + }, + { + "epoch": 1.632367455568942, + "grad_norm": 0.5661563304303698, + "learning_rate": 1.7203124768483347e-05, + "loss": 11.8668, + "step": 29977 + }, + { + "epoch": 1.632421909565525, + "grad_norm": 0.5385703886541486, + "learning_rate": 1.7198180038414814e-05, + "loss": 11.81, + "step": 29978 + }, + { + "epoch": 1.6324763635621082, + "grad_norm": 0.5266639322681721, + "learning_rate": 1.7193235952227127e-05, + "loss": 11.8471, + "step": 29979 + }, + { + "epoch": 1.6325308175586912, + "grad_norm": 0.5411447754752525, + "learning_rate": 1.7188292509958714e-05, + "loss": 11.7058, + "step": 29980 + }, + { + "epoch": 1.6325852715552742, + "grad_norm": 0.5698511022891626, + "learning_rate": 1.7183349711648045e-05, + "loss": 11.795, + "step": 29981 + }, + { + "epoch": 1.6326397255518572, + "grad_norm": 0.5468815026330077, + "learning_rate": 1.717840755733352e-05, + "loss": 11.7333, + "step": 29982 + }, + { + "epoch": 1.6326941795484402, + "grad_norm": 0.5504093319599296, + "learning_rate": 1.7173466047053576e-05, + "loss": 11.7767, + "step": 29983 + }, + { + "epoch": 1.6327486335450232, + "grad_norm": 0.6155818544855903, + "learning_rate": 1.7168525180846683e-05, + "loss": 11.9369, + "step": 29984 + }, + { + "epoch": 1.6328030875416064, + "grad_norm": 0.5670589884714078, + "learning_rate": 1.71635849587512e-05, + "loss": 11.8164, + "step": 29985 + }, + { + "epoch": 1.6328575415381894, + "grad_norm": 0.607814367470063, + "learning_rate": 1.7158645380805595e-05, + "loss": 11.7662, + "step": 29986 + }, + { + "epoch": 1.6329119955347724, + "grad_norm": 0.5492429874772824, + "learning_rate": 1.7153706447048256e-05, + "loss": 11.8007, + "step": 29987 + }, + { + "epoch": 1.6329664495313554, + "grad_norm": 0.5837794851697979, + "learning_rate": 1.714876815751757e-05, + "loss": 11.8114, + "step": 29988 + }, + { + "epoch": 1.6330209035279384, + "grad_norm": 0.6224607450061648, + "learning_rate": 1.714383051225198e-05, + "loss": 11.7728, + "step": 29989 + }, + { + "epoch": 1.6330753575245214, + "grad_norm": 0.5772532331114856, + "learning_rate": 1.7138893511289833e-05, + "loss": 11.8376, + "step": 29990 + }, + { + "epoch": 1.6331298115211044, + "grad_norm": 0.5929361195225834, + "learning_rate": 1.7133957154669566e-05, + "loss": 11.7147, + "step": 29991 + }, + { + "epoch": 1.6331842655176874, + "grad_norm": 0.5619559270786373, + "learning_rate": 1.7129021442429526e-05, + "loss": 11.8793, + "step": 29992 + }, + { + "epoch": 1.6332387195142704, + "grad_norm": 0.5084116920288405, + "learning_rate": 1.712408637460814e-05, + "loss": 11.6987, + "step": 29993 + }, + { + "epoch": 1.6332931735108533, + "grad_norm": 0.6164379057478121, + "learning_rate": 1.711915195124373e-05, + "loss": 11.8974, + "step": 29994 + }, + { + "epoch": 1.6333476275074363, + "grad_norm": 0.8771008723158041, + "learning_rate": 1.7114218172374687e-05, + "loss": 11.771, + "step": 29995 + }, + { + "epoch": 1.6334020815040193, + "grad_norm": 0.6433309937496873, + "learning_rate": 1.7109285038039414e-05, + "loss": 11.9548, + "step": 29996 + }, + { + "epoch": 1.6334565355006023, + "grad_norm": 0.5610217464026294, + "learning_rate": 1.7104352548276238e-05, + "loss": 11.8948, + "step": 29997 + }, + { + "epoch": 1.6335109894971853, + "grad_norm": 0.5950032725760158, + "learning_rate": 1.7099420703123492e-05, + "loss": 11.9114, + "step": 29998 + }, + { + "epoch": 1.6335654434937683, + "grad_norm": 0.5461232231821463, + "learning_rate": 1.7094489502619583e-05, + "loss": 11.7725, + "step": 29999 + }, + { + "epoch": 1.6336198974903513, + "grad_norm": 0.6059168275598296, + "learning_rate": 1.7089558946802808e-05, + "loss": 11.9028, + "step": 30000 + }, + { + "epoch": 1.6336743514869343, + "grad_norm": 0.6105096510741903, + "learning_rate": 1.708462903571154e-05, + "loss": 11.7295, + "step": 30001 + }, + { + "epoch": 1.6337288054835175, + "grad_norm": 0.5402355451457493, + "learning_rate": 1.7079699769384084e-05, + "loss": 11.8044, + "step": 30002 + }, + { + "epoch": 1.6337832594801005, + "grad_norm": 0.5329796961915664, + "learning_rate": 1.7074771147858805e-05, + "loss": 11.8456, + "step": 30003 + }, + { + "epoch": 1.6338377134766835, + "grad_norm": 0.6321166473939128, + "learning_rate": 1.7069843171173993e-05, + "loss": 11.8749, + "step": 30004 + }, + { + "epoch": 1.6338921674732665, + "grad_norm": 0.5538225118784644, + "learning_rate": 1.706491583936799e-05, + "loss": 11.7861, + "step": 30005 + }, + { + "epoch": 1.6339466214698495, + "grad_norm": 0.5914003584658366, + "learning_rate": 1.7059989152479126e-05, + "loss": 11.7819, + "step": 30006 + }, + { + "epoch": 1.6340010754664325, + "grad_norm": 0.5376738789432051, + "learning_rate": 1.7055063110545698e-05, + "loss": 11.7034, + "step": 30007 + }, + { + "epoch": 1.6340555294630157, + "grad_norm": 0.5155795542157379, + "learning_rate": 1.705013771360602e-05, + "loss": 11.8084, + "step": 30008 + }, + { + "epoch": 1.6341099834595987, + "grad_norm": 0.5703404727879696, + "learning_rate": 1.7045212961698342e-05, + "loss": 11.8154, + "step": 30009 + }, + { + "epoch": 1.6341644374561817, + "grad_norm": 0.5368328682726928, + "learning_rate": 1.704028885486101e-05, + "loss": 11.849, + "step": 30010 + }, + { + "epoch": 1.6342188914527647, + "grad_norm": 0.5565666632153429, + "learning_rate": 1.7035365393132328e-05, + "loss": 11.8997, + "step": 30011 + }, + { + "epoch": 1.6342733454493477, + "grad_norm": 0.5732944831513871, + "learning_rate": 1.7030442576550533e-05, + "loss": 11.8533, + "step": 30012 + }, + { + "epoch": 1.6343277994459307, + "grad_norm": 0.563069851826406, + "learning_rate": 1.702552040515395e-05, + "loss": 11.7898, + "step": 30013 + }, + { + "epoch": 1.6343822534425136, + "grad_norm": 0.6865294138891473, + "learning_rate": 1.702059887898081e-05, + "loss": 11.859, + "step": 30014 + }, + { + "epoch": 1.6344367074390966, + "grad_norm": 0.6114203703375761, + "learning_rate": 1.7015677998069436e-05, + "loss": 11.898, + "step": 30015 + }, + { + "epoch": 1.6344911614356796, + "grad_norm": 0.5619269326249757, + "learning_rate": 1.701075776245804e-05, + "loss": 11.8784, + "step": 30016 + }, + { + "epoch": 1.6345456154322626, + "grad_norm": 0.5496292266512631, + "learning_rate": 1.7005838172184942e-05, + "loss": 11.6915, + "step": 30017 + }, + { + "epoch": 1.6346000694288456, + "grad_norm": 0.61453076425597, + "learning_rate": 1.700091922728836e-05, + "loss": 11.9333, + "step": 30018 + }, + { + "epoch": 1.6346545234254286, + "grad_norm": 0.5557187034857363, + "learning_rate": 1.6996000927806522e-05, + "loss": 11.7656, + "step": 30019 + }, + { + "epoch": 1.6347089774220116, + "grad_norm": 0.544668931532488, + "learning_rate": 1.69910832737777e-05, + "loss": 11.7086, + "step": 30020 + }, + { + "epoch": 1.6347634314185946, + "grad_norm": 0.5320903825442154, + "learning_rate": 1.6986166265240165e-05, + "loss": 11.9186, + "step": 30021 + }, + { + "epoch": 1.6348178854151776, + "grad_norm": 0.5975450327333364, + "learning_rate": 1.698124990223209e-05, + "loss": 11.8909, + "step": 30022 + }, + { + "epoch": 1.6348723394117606, + "grad_norm": 0.6641660955700938, + "learning_rate": 1.6976334184791766e-05, + "loss": 11.7259, + "step": 30023 + }, + { + "epoch": 1.6349267934083436, + "grad_norm": 0.565918647424538, + "learning_rate": 1.6971419112957366e-05, + "loss": 11.833, + "step": 30024 + }, + { + "epoch": 1.6349812474049266, + "grad_norm": 0.5664943585391604, + "learning_rate": 1.696650468676716e-05, + "loss": 11.8858, + "step": 30025 + }, + { + "epoch": 1.6350357014015098, + "grad_norm": 0.5758799834049017, + "learning_rate": 1.6961590906259317e-05, + "loss": 11.7644, + "step": 30026 + }, + { + "epoch": 1.6350901553980928, + "grad_norm": 0.5871389275039496, + "learning_rate": 1.6956677771472086e-05, + "loss": 11.6974, + "step": 30027 + }, + { + "epoch": 1.6351446093946758, + "grad_norm": 0.5864890693843933, + "learning_rate": 1.6951765282443665e-05, + "loss": 11.8469, + "step": 30028 + }, + { + "epoch": 1.6351990633912588, + "grad_norm": 0.5496732357600271, + "learning_rate": 1.6946853439212208e-05, + "loss": 11.7803, + "step": 30029 + }, + { + "epoch": 1.6352535173878417, + "grad_norm": 0.54321256875059, + "learning_rate": 1.6941942241815976e-05, + "loss": 11.8153, + "step": 30030 + }, + { + "epoch": 1.6353079713844247, + "grad_norm": 0.49234013000421545, + "learning_rate": 1.6937031690293114e-05, + "loss": 11.8611, + "step": 30031 + }, + { + "epoch": 1.635362425381008, + "grad_norm": 0.5436932201135376, + "learning_rate": 1.693212178468181e-05, + "loss": 11.7982, + "step": 30032 + }, + { + "epoch": 1.635416879377591, + "grad_norm": 0.6021818624904028, + "learning_rate": 1.6927212525020286e-05, + "loss": 11.8615, + "step": 30033 + }, + { + "epoch": 1.635471333374174, + "grad_norm": 0.5271136833375871, + "learning_rate": 1.6922303911346672e-05, + "loss": 11.6763, + "step": 30034 + }, + { + "epoch": 1.635525787370757, + "grad_norm": 0.5546249424141428, + "learning_rate": 1.6917395943699167e-05, + "loss": 11.7943, + "step": 30035 + }, + { + "epoch": 1.63558024136734, + "grad_norm": 0.5544407333531863, + "learning_rate": 1.6912488622115906e-05, + "loss": 11.798, + "step": 30036 + }, + { + "epoch": 1.635634695363923, + "grad_norm": 0.55837627104082, + "learning_rate": 1.69075819466351e-05, + "loss": 11.7274, + "step": 30037 + }, + { + "epoch": 1.635689149360506, + "grad_norm": 0.4966642026542986, + "learning_rate": 1.6902675917294864e-05, + "loss": 11.7882, + "step": 30038 + }, + { + "epoch": 1.635743603357089, + "grad_norm": 0.5733304750743955, + "learning_rate": 1.689777053413333e-05, + "loss": 11.9074, + "step": 30039 + }, + { + "epoch": 1.635798057353672, + "grad_norm": 0.5606136053844597, + "learning_rate": 1.6892865797188683e-05, + "loss": 11.854, + "step": 30040 + }, + { + "epoch": 1.635852511350255, + "grad_norm": 0.5457936238075886, + "learning_rate": 1.6887961706499033e-05, + "loss": 11.8125, + "step": 30041 + }, + { + "epoch": 1.6359069653468379, + "grad_norm": 0.5306080361527131, + "learning_rate": 1.6883058262102558e-05, + "loss": 11.8243, + "step": 30042 + }, + { + "epoch": 1.6359614193434209, + "grad_norm": 0.5193432870429455, + "learning_rate": 1.6878155464037338e-05, + "loss": 11.7919, + "step": 30043 + }, + { + "epoch": 1.6360158733400039, + "grad_norm": 0.6253338911875089, + "learning_rate": 1.687325331234152e-05, + "loss": 11.9306, + "step": 30044 + }, + { + "epoch": 1.6360703273365869, + "grad_norm": 0.5701814352057775, + "learning_rate": 1.686835180705324e-05, + "loss": 11.8268, + "step": 30045 + }, + { + "epoch": 1.6361247813331699, + "grad_norm": 0.5394099316510441, + "learning_rate": 1.686345094821058e-05, + "loss": 11.8104, + "step": 30046 + }, + { + "epoch": 1.6361792353297528, + "grad_norm": 0.5045219247526627, + "learning_rate": 1.6858550735851695e-05, + "loss": 11.6289, + "step": 30047 + }, + { + "epoch": 1.6362336893263358, + "grad_norm": 0.5671328828710982, + "learning_rate": 1.685365117001466e-05, + "loss": 11.6058, + "step": 30048 + }, + { + "epoch": 1.636288143322919, + "grad_norm": 0.538110442919234, + "learning_rate": 1.6848752250737554e-05, + "loss": 11.8118, + "step": 30049 + }, + { + "epoch": 1.636342597319502, + "grad_norm": 0.522312921593881, + "learning_rate": 1.684385397805851e-05, + "loss": 11.7837, + "step": 30050 + }, + { + "epoch": 1.636397051316085, + "grad_norm": 0.5498876554399803, + "learning_rate": 1.683895635201559e-05, + "loss": 11.7524, + "step": 30051 + }, + { + "epoch": 1.636451505312668, + "grad_norm": 0.5556707509641179, + "learning_rate": 1.6834059372646904e-05, + "loss": 11.8493, + "step": 30052 + }, + { + "epoch": 1.636505959309251, + "grad_norm": 0.5582342262947769, + "learning_rate": 1.6829163039990503e-05, + "loss": 11.7411, + "step": 30053 + }, + { + "epoch": 1.636560413305834, + "grad_norm": 0.5381200686560274, + "learning_rate": 1.682426735408448e-05, + "loss": 11.8162, + "step": 30054 + }, + { + "epoch": 1.6366148673024172, + "grad_norm": 0.5421399558650414, + "learning_rate": 1.6819372314966932e-05, + "loss": 11.8233, + "step": 30055 + }, + { + "epoch": 1.6366693212990002, + "grad_norm": 0.5242904898004394, + "learning_rate": 1.6814477922675853e-05, + "loss": 11.7721, + "step": 30056 + }, + { + "epoch": 1.6367237752955832, + "grad_norm": 0.5682402609569833, + "learning_rate": 1.6809584177249405e-05, + "loss": 11.8661, + "step": 30057 + }, + { + "epoch": 1.6367782292921662, + "grad_norm": 0.5518682391718505, + "learning_rate": 1.6804691078725533e-05, + "loss": 11.7571, + "step": 30058 + }, + { + "epoch": 1.6368326832887492, + "grad_norm": 0.5388755418216244, + "learning_rate": 1.6799798627142326e-05, + "loss": 11.8295, + "step": 30059 + }, + { + "epoch": 1.6368871372853322, + "grad_norm": 0.5676185457478856, + "learning_rate": 1.679490682253787e-05, + "loss": 11.8375, + "step": 30060 + }, + { + "epoch": 1.6369415912819152, + "grad_norm": 0.5274055204742559, + "learning_rate": 1.679001566495014e-05, + "loss": 11.8399, + "step": 30061 + }, + { + "epoch": 1.6369960452784982, + "grad_norm": 0.5763032762948113, + "learning_rate": 1.6785125154417235e-05, + "loss": 11.8082, + "step": 30062 + }, + { + "epoch": 1.6370504992750812, + "grad_norm": 0.5071738550733418, + "learning_rate": 1.678023529097712e-05, + "loss": 11.8153, + "step": 30063 + }, + { + "epoch": 1.6371049532716642, + "grad_norm": 0.5430073858298432, + "learning_rate": 1.6775346074667887e-05, + "loss": 11.736, + "step": 30064 + }, + { + "epoch": 1.6371594072682472, + "grad_norm": 0.5404312233014523, + "learning_rate": 1.677045750552748e-05, + "loss": 11.8809, + "step": 30065 + }, + { + "epoch": 1.6372138612648302, + "grad_norm": 0.5368471376442372, + "learning_rate": 1.6765569583593964e-05, + "loss": 11.7555, + "step": 30066 + }, + { + "epoch": 1.6372683152614131, + "grad_norm": 0.537025994929866, + "learning_rate": 1.676068230890536e-05, + "loss": 11.6391, + "step": 30067 + }, + { + "epoch": 1.6373227692579961, + "grad_norm": 0.5311610970612696, + "learning_rate": 1.6755795681499653e-05, + "loss": 11.7425, + "step": 30068 + }, + { + "epoch": 1.6373772232545791, + "grad_norm": 0.5370816819715026, + "learning_rate": 1.6750909701414808e-05, + "loss": 11.7935, + "step": 30069 + }, + { + "epoch": 1.6374316772511621, + "grad_norm": 0.5189362084418445, + "learning_rate": 1.6746024368688872e-05, + "loss": 11.8864, + "step": 30070 + }, + { + "epoch": 1.6374861312477451, + "grad_norm": 0.5576943627089798, + "learning_rate": 1.6741139683359796e-05, + "loss": 11.616, + "step": 30071 + }, + { + "epoch": 1.6375405852443283, + "grad_norm": 0.5894375453579125, + "learning_rate": 1.6736255645465602e-05, + "loss": 11.7119, + "step": 30072 + }, + { + "epoch": 1.6375950392409113, + "grad_norm": 0.5471688132992194, + "learning_rate": 1.673137225504422e-05, + "loss": 11.6751, + "step": 30073 + }, + { + "epoch": 1.6376494932374943, + "grad_norm": 0.5673981566929037, + "learning_rate": 1.672648951213368e-05, + "loss": 11.8766, + "step": 30074 + }, + { + "epoch": 1.6377039472340773, + "grad_norm": 0.5408324528312617, + "learning_rate": 1.672160741677189e-05, + "loss": 11.7351, + "step": 30075 + }, + { + "epoch": 1.6377584012306603, + "grad_norm": 0.5820788572291198, + "learning_rate": 1.671672596899686e-05, + "loss": 11.8543, + "step": 30076 + }, + { + "epoch": 1.6378128552272433, + "grad_norm": 0.557561109908252, + "learning_rate": 1.6711845168846553e-05, + "loss": 11.6013, + "step": 30077 + }, + { + "epoch": 1.6378673092238265, + "grad_norm": 0.5336726556037551, + "learning_rate": 1.67069650163589e-05, + "loss": 11.845, + "step": 30078 + }, + { + "epoch": 1.6379217632204095, + "grad_norm": 0.6094915021122238, + "learning_rate": 1.6702085511571863e-05, + "loss": 11.8543, + "step": 30079 + }, + { + "epoch": 1.6379762172169925, + "grad_norm": 0.5317362788737234, + "learning_rate": 1.6697206654523344e-05, + "loss": 11.8369, + "step": 30080 + }, + { + "epoch": 1.6380306712135755, + "grad_norm": 0.5327956438040591, + "learning_rate": 1.6692328445251325e-05, + "loss": 11.7825, + "step": 30081 + }, + { + "epoch": 1.6380851252101585, + "grad_norm": 0.5461396379068768, + "learning_rate": 1.6687450883793753e-05, + "loss": 11.8955, + "step": 30082 + }, + { + "epoch": 1.6381395792067415, + "grad_norm": 0.5050090392465167, + "learning_rate": 1.668257397018851e-05, + "loss": 11.8116, + "step": 30083 + }, + { + "epoch": 1.6381940332033245, + "grad_norm": 0.5690084558942501, + "learning_rate": 1.667769770447357e-05, + "loss": 11.6996, + "step": 30084 + }, + { + "epoch": 1.6382484871999075, + "grad_norm": 0.5827578713703137, + "learning_rate": 1.6672822086686803e-05, + "loss": 11.8359, + "step": 30085 + }, + { + "epoch": 1.6383029411964904, + "grad_norm": 0.542803027702863, + "learning_rate": 1.6667947116866168e-05, + "loss": 11.7655, + "step": 30086 + }, + { + "epoch": 1.6383573951930734, + "grad_norm": 0.5515404128187872, + "learning_rate": 1.6663072795049515e-05, + "loss": 11.8565, + "step": 30087 + }, + { + "epoch": 1.6384118491896564, + "grad_norm": 0.5164036976239055, + "learning_rate": 1.665819912127482e-05, + "loss": 11.8333, + "step": 30088 + }, + { + "epoch": 1.6384663031862394, + "grad_norm": 0.592115041557363, + "learning_rate": 1.6653326095579946e-05, + "loss": 11.6284, + "step": 30089 + }, + { + "epoch": 1.6385207571828224, + "grad_norm": 0.5585561045423707, + "learning_rate": 1.6648453718002765e-05, + "loss": 11.8202, + "step": 30090 + }, + { + "epoch": 1.6385752111794054, + "grad_norm": 0.5429523758392679, + "learning_rate": 1.664358198858117e-05, + "loss": 11.7837, + "step": 30091 + }, + { + "epoch": 1.6386296651759884, + "grad_norm": 0.5787009703870182, + "learning_rate": 1.6638710907353095e-05, + "loss": 11.9051, + "step": 30092 + }, + { + "epoch": 1.6386841191725714, + "grad_norm": 0.5372462408997021, + "learning_rate": 1.6633840474356366e-05, + "loss": 11.5681, + "step": 30093 + }, + { + "epoch": 1.6387385731691544, + "grad_norm": 0.5849190833703195, + "learning_rate": 1.6628970689628887e-05, + "loss": 11.9292, + "step": 30094 + }, + { + "epoch": 1.6387930271657374, + "grad_norm": 0.5209925881249817, + "learning_rate": 1.6624101553208492e-05, + "loss": 11.882, + "step": 30095 + }, + { + "epoch": 1.6388474811623206, + "grad_norm": 0.5124654393783703, + "learning_rate": 1.661923306513309e-05, + "loss": 11.7205, + "step": 30096 + }, + { + "epoch": 1.6389019351589036, + "grad_norm": 0.560753906045654, + "learning_rate": 1.6614365225440488e-05, + "loss": 11.805, + "step": 30097 + }, + { + "epoch": 1.6389563891554866, + "grad_norm": 0.5489227700795934, + "learning_rate": 1.6609498034168603e-05, + "loss": 11.9082, + "step": 30098 + }, + { + "epoch": 1.6390108431520696, + "grad_norm": 0.5765246165371151, + "learning_rate": 1.6604631491355226e-05, + "loss": 11.8043, + "step": 30099 + }, + { + "epoch": 1.6390652971486526, + "grad_norm": 0.5548932072939367, + "learning_rate": 1.6599765597038207e-05, + "loss": 11.8307, + "step": 30100 + }, + { + "epoch": 1.6391197511452358, + "grad_norm": 0.530195112560014, + "learning_rate": 1.6594900351255428e-05, + "loss": 11.841, + "step": 30101 + }, + { + "epoch": 1.6391742051418188, + "grad_norm": 0.5312002150356473, + "learning_rate": 1.659003575404465e-05, + "loss": 11.6799, + "step": 30102 + }, + { + "epoch": 1.6392286591384018, + "grad_norm": 0.5707074796569253, + "learning_rate": 1.6585171805443754e-05, + "loss": 11.8772, + "step": 30103 + }, + { + "epoch": 1.6392831131349848, + "grad_norm": 0.6623149603267436, + "learning_rate": 1.658030850549057e-05, + "loss": 11.9273, + "step": 30104 + }, + { + "epoch": 1.6393375671315678, + "grad_norm": 0.5247996329746817, + "learning_rate": 1.657544585422287e-05, + "loss": 11.7142, + "step": 30105 + }, + { + "epoch": 1.6393920211281507, + "grad_norm": 0.5231783955010734, + "learning_rate": 1.6570583851678524e-05, + "loss": 11.8849, + "step": 30106 + }, + { + "epoch": 1.6394464751247337, + "grad_norm": 0.5614133123226214, + "learning_rate": 1.6565722497895287e-05, + "loss": 11.7412, + "step": 30107 + }, + { + "epoch": 1.6395009291213167, + "grad_norm": 0.575766303853106, + "learning_rate": 1.6560861792910997e-05, + "loss": 11.9719, + "step": 30108 + }, + { + "epoch": 1.6395553831178997, + "grad_norm": 0.5500778176237447, + "learning_rate": 1.655600173676345e-05, + "loss": 11.7736, + "step": 30109 + }, + { + "epoch": 1.6396098371144827, + "grad_norm": 0.5503625238016333, + "learning_rate": 1.655114232949039e-05, + "loss": 11.8682, + "step": 30110 + }, + { + "epoch": 1.6396642911110657, + "grad_norm": 0.5653805119479136, + "learning_rate": 1.6546283571129672e-05, + "loss": 11.9648, + "step": 30111 + }, + { + "epoch": 1.6397187451076487, + "grad_norm": 0.5157511129896286, + "learning_rate": 1.6541425461719027e-05, + "loss": 11.7993, + "step": 30112 + }, + { + "epoch": 1.6397731991042317, + "grad_norm": 0.5705645957327735, + "learning_rate": 1.653656800129628e-05, + "loss": 11.7648, + "step": 30113 + }, + { + "epoch": 1.6398276531008147, + "grad_norm": 0.5515964624708107, + "learning_rate": 1.6531711189899145e-05, + "loss": 11.7286, + "step": 30114 + }, + { + "epoch": 1.6398821070973977, + "grad_norm": 0.5069048453469294, + "learning_rate": 1.6526855027565423e-05, + "loss": 11.8409, + "step": 30115 + }, + { + "epoch": 1.6399365610939807, + "grad_norm": 0.5606669819797543, + "learning_rate": 1.652199951433291e-05, + "loss": 11.8605, + "step": 30116 + }, + { + "epoch": 1.6399910150905637, + "grad_norm": 0.6099907291433857, + "learning_rate": 1.651714465023929e-05, + "loss": 11.9511, + "step": 30117 + }, + { + "epoch": 1.6400454690871467, + "grad_norm": 0.5727123118523072, + "learning_rate": 1.6512290435322385e-05, + "loss": 11.6288, + "step": 30118 + }, + { + "epoch": 1.6400999230837299, + "grad_norm": 0.4939985069499722, + "learning_rate": 1.6507436869619907e-05, + "loss": 11.7311, + "step": 30119 + }, + { + "epoch": 1.6401543770803129, + "grad_norm": 0.5129205285412832, + "learning_rate": 1.650258395316957e-05, + "loss": 11.6636, + "step": 30120 + }, + { + "epoch": 1.6402088310768959, + "grad_norm": 0.5085701211172086, + "learning_rate": 1.6497731686009176e-05, + "loss": 11.7575, + "step": 30121 + }, + { + "epoch": 1.6402632850734788, + "grad_norm": 0.5101237714626635, + "learning_rate": 1.6492880068176396e-05, + "loss": 11.7947, + "step": 30122 + }, + { + "epoch": 1.6403177390700618, + "grad_norm": 0.5512918945382997, + "learning_rate": 1.6488029099709023e-05, + "loss": 11.8164, + "step": 30123 + }, + { + "epoch": 1.6403721930666448, + "grad_norm": 0.5242432731708799, + "learning_rate": 1.6483178780644702e-05, + "loss": 11.7004, + "step": 30124 + }, + { + "epoch": 1.640426647063228, + "grad_norm": 0.568338929432154, + "learning_rate": 1.6478329111021185e-05, + "loss": 11.9587, + "step": 30125 + }, + { + "epoch": 1.640481101059811, + "grad_norm": 0.507992246282294, + "learning_rate": 1.6473480090876226e-05, + "loss": 11.7301, + "step": 30126 + }, + { + "epoch": 1.640535555056394, + "grad_norm": 0.5983717861836699, + "learning_rate": 1.646863172024746e-05, + "loss": 11.8204, + "step": 30127 + }, + { + "epoch": 1.640590009052977, + "grad_norm": 0.5772742570237682, + "learning_rate": 1.6463783999172665e-05, + "loss": 11.7378, + "step": 30128 + }, + { + "epoch": 1.64064446304956, + "grad_norm": 0.5491561143340316, + "learning_rate": 1.6458936927689462e-05, + "loss": 11.8633, + "step": 30129 + }, + { + "epoch": 1.640698917046143, + "grad_norm": 0.5221765408554284, + "learning_rate": 1.6454090505835562e-05, + "loss": 11.8537, + "step": 30130 + }, + { + "epoch": 1.640753371042726, + "grad_norm": 0.5578128299948817, + "learning_rate": 1.6449244733648695e-05, + "loss": 11.8589, + "step": 30131 + }, + { + "epoch": 1.640807825039309, + "grad_norm": 0.6028340515247499, + "learning_rate": 1.6444399611166484e-05, + "loss": 11.8208, + "step": 30132 + }, + { + "epoch": 1.640862279035892, + "grad_norm": 0.5399518459484299, + "learning_rate": 1.6439555138426666e-05, + "loss": 11.7228, + "step": 30133 + }, + { + "epoch": 1.640916733032475, + "grad_norm": 0.5902670897304149, + "learning_rate": 1.6434711315466844e-05, + "loss": 11.8582, + "step": 30134 + }, + { + "epoch": 1.640971187029058, + "grad_norm": 0.5743644022290435, + "learning_rate": 1.6429868142324757e-05, + "loss": 11.8942, + "step": 30135 + }, + { + "epoch": 1.641025641025641, + "grad_norm": 0.5386308223798555, + "learning_rate": 1.642502561903799e-05, + "loss": 11.7774, + "step": 30136 + }, + { + "epoch": 1.641080095022224, + "grad_norm": 0.5331663092826117, + "learning_rate": 1.6420183745644258e-05, + "loss": 11.8669, + "step": 30137 + }, + { + "epoch": 1.641134549018807, + "grad_norm": 0.5882156028799918, + "learning_rate": 1.6415342522181233e-05, + "loss": 11.8053, + "step": 30138 + }, + { + "epoch": 1.64118900301539, + "grad_norm": 0.5164712670058067, + "learning_rate": 1.6410501948686473e-05, + "loss": 11.8903, + "step": 30139 + }, + { + "epoch": 1.641243457011973, + "grad_norm": 0.596815269015819, + "learning_rate": 1.6405662025197665e-05, + "loss": 11.8447, + "step": 30140 + }, + { + "epoch": 1.641297911008556, + "grad_norm": 0.5299777840801392, + "learning_rate": 1.6400822751752475e-05, + "loss": 11.7045, + "step": 30141 + }, + { + "epoch": 1.6413523650051391, + "grad_norm": 0.5109550589165297, + "learning_rate": 1.6395984128388487e-05, + "loss": 11.8194, + "step": 30142 + }, + { + "epoch": 1.6414068190017221, + "grad_norm": 0.600931446336842, + "learning_rate": 1.6391146155143367e-05, + "loss": 11.857, + "step": 30143 + }, + { + "epoch": 1.6414612729983051, + "grad_norm": 0.5144424977095644, + "learning_rate": 1.63863088320547e-05, + "loss": 11.6053, + "step": 30144 + }, + { + "epoch": 1.6415157269948881, + "grad_norm": 0.5553563956708736, + "learning_rate": 1.6381472159160126e-05, + "loss": 11.8207, + "step": 30145 + }, + { + "epoch": 1.6415701809914711, + "grad_norm": 0.5406254046813125, + "learning_rate": 1.6376636136497235e-05, + "loss": 11.8224, + "step": 30146 + }, + { + "epoch": 1.641624634988054, + "grad_norm": 0.6141331659409237, + "learning_rate": 1.6371800764103663e-05, + "loss": 11.8021, + "step": 30147 + }, + { + "epoch": 1.6416790889846373, + "grad_norm": 0.5601358481483191, + "learning_rate": 1.6366966042016996e-05, + "loss": 11.6448, + "step": 30148 + }, + { + "epoch": 1.6417335429812203, + "grad_norm": 0.484738020824408, + "learning_rate": 1.6362131970274796e-05, + "loss": 11.8006, + "step": 30149 + }, + { + "epoch": 1.6417879969778033, + "grad_norm": 0.5427513378769239, + "learning_rate": 1.6357298548914722e-05, + "loss": 11.8511, + "step": 30150 + }, + { + "epoch": 1.6418424509743863, + "grad_norm": 0.5756763379337209, + "learning_rate": 1.635246577797428e-05, + "loss": 11.8875, + "step": 30151 + }, + { + "epoch": 1.6418969049709693, + "grad_norm": 0.5385630663061246, + "learning_rate": 1.63476336574911e-05, + "loss": 11.7996, + "step": 30152 + }, + { + "epoch": 1.6419513589675523, + "grad_norm": 0.6060538780518313, + "learning_rate": 1.6342802187502782e-05, + "loss": 11.7774, + "step": 30153 + }, + { + "epoch": 1.6420058129641353, + "grad_norm": 0.5927889484959951, + "learning_rate": 1.6337971368046823e-05, + "loss": 11.7934, + "step": 30154 + }, + { + "epoch": 1.6420602669607183, + "grad_norm": 0.5672123417583669, + "learning_rate": 1.633314119916086e-05, + "loss": 11.8023, + "step": 30155 + }, + { + "epoch": 1.6421147209573013, + "grad_norm": 0.5047756720524929, + "learning_rate": 1.63283116808824e-05, + "loss": 11.8242, + "step": 30156 + }, + { + "epoch": 1.6421691749538843, + "grad_norm": 0.5481889386605263, + "learning_rate": 1.6323482813249046e-05, + "loss": 11.8706, + "step": 30157 + }, + { + "epoch": 1.6422236289504673, + "grad_norm": 0.5739071727362628, + "learning_rate": 1.631865459629832e-05, + "loss": 11.9472, + "step": 30158 + }, + { + "epoch": 1.6422780829470502, + "grad_norm": 0.5230641715863068, + "learning_rate": 1.6313827030067742e-05, + "loss": 11.7824, + "step": 30159 + }, + { + "epoch": 1.6423325369436332, + "grad_norm": 0.533055946320717, + "learning_rate": 1.6309000114594907e-05, + "loss": 11.7953, + "step": 30160 + }, + { + "epoch": 1.6423869909402162, + "grad_norm": 0.5262768491491955, + "learning_rate": 1.6304173849917293e-05, + "loss": 11.7402, + "step": 30161 + }, + { + "epoch": 1.6424414449367992, + "grad_norm": 0.5180209889053046, + "learning_rate": 1.6299348236072487e-05, + "loss": 11.7873, + "step": 30162 + }, + { + "epoch": 1.6424958989333822, + "grad_norm": 0.5472664669211041, + "learning_rate": 1.629452327309796e-05, + "loss": 11.7478, + "step": 30163 + }, + { + "epoch": 1.6425503529299652, + "grad_norm": 0.5622909712539453, + "learning_rate": 1.6289698961031265e-05, + "loss": 11.917, + "step": 30164 + }, + { + "epoch": 1.6426048069265482, + "grad_norm": 0.5305997335205761, + "learning_rate": 1.628487529990992e-05, + "loss": 11.813, + "step": 30165 + }, + { + "epoch": 1.6426592609231314, + "grad_norm": 0.49457213538677725, + "learning_rate": 1.6280052289771407e-05, + "loss": 11.7294, + "step": 30166 + }, + { + "epoch": 1.6427137149197144, + "grad_norm": 0.5504786587383643, + "learning_rate": 1.627522993065327e-05, + "loss": 11.8793, + "step": 30167 + }, + { + "epoch": 1.6427681689162974, + "grad_norm": 0.5563024258222868, + "learning_rate": 1.6270408222592992e-05, + "loss": 11.7451, + "step": 30168 + }, + { + "epoch": 1.6428226229128804, + "grad_norm": 0.5538065961453419, + "learning_rate": 1.6265587165628026e-05, + "loss": 11.8441, + "step": 30169 + }, + { + "epoch": 1.6428770769094634, + "grad_norm": 0.5133949483620408, + "learning_rate": 1.6260766759795932e-05, + "loss": 11.8452, + "step": 30170 + }, + { + "epoch": 1.6429315309060466, + "grad_norm": 0.537967420213983, + "learning_rate": 1.625594700513413e-05, + "loss": 11.7883, + "step": 30171 + }, + { + "epoch": 1.6429859849026296, + "grad_norm": 0.5286085004048998, + "learning_rate": 1.6251127901680154e-05, + "loss": 11.8057, + "step": 30172 + }, + { + "epoch": 1.6430404388992126, + "grad_norm": 0.5052098441762602, + "learning_rate": 1.624630944947142e-05, + "loss": 11.702, + "step": 30173 + }, + { + "epoch": 1.6430948928957956, + "grad_norm": 0.6221990772921728, + "learning_rate": 1.6241491648545425e-05, + "loss": 12.0073, + "step": 30174 + }, + { + "epoch": 1.6431493468923786, + "grad_norm": 0.5572868259678778, + "learning_rate": 1.6236674498939674e-05, + "loss": 11.7741, + "step": 30175 + }, + { + "epoch": 1.6432038008889616, + "grad_norm": 0.6730592903415611, + "learning_rate": 1.623185800069157e-05, + "loss": 11.8505, + "step": 30176 + }, + { + "epoch": 1.6432582548855446, + "grad_norm": 0.5398495166799504, + "learning_rate": 1.6227042153838602e-05, + "loss": 11.8462, + "step": 30177 + }, + { + "epoch": 1.6433127088821275, + "grad_norm": 0.5434122483300948, + "learning_rate": 1.6222226958418196e-05, + "loss": 11.8473, + "step": 30178 + }, + { + "epoch": 1.6433671628787105, + "grad_norm": 0.5141177947597703, + "learning_rate": 1.621741241446778e-05, + "loss": 11.7393, + "step": 30179 + }, + { + "epoch": 1.6434216168752935, + "grad_norm": 0.5144025252010122, + "learning_rate": 1.6212598522024846e-05, + "loss": 11.7931, + "step": 30180 + }, + { + "epoch": 1.6434760708718765, + "grad_norm": 0.5644670811107217, + "learning_rate": 1.6207785281126776e-05, + "loss": 11.7436, + "step": 30181 + }, + { + "epoch": 1.6435305248684595, + "grad_norm": 0.5343302417104526, + "learning_rate": 1.6202972691811036e-05, + "loss": 11.8889, + "step": 30182 + }, + { + "epoch": 1.6435849788650425, + "grad_norm": 0.5538776457357333, + "learning_rate": 1.619816075411501e-05, + "loss": 11.7634, + "step": 30183 + }, + { + "epoch": 1.6436394328616255, + "grad_norm": 0.5331346245091763, + "learning_rate": 1.619334946807617e-05, + "loss": 11.6585, + "step": 30184 + }, + { + "epoch": 1.6436938868582085, + "grad_norm": 0.5369501897949668, + "learning_rate": 1.618853883373187e-05, + "loss": 11.5974, + "step": 30185 + }, + { + "epoch": 1.6437483408547915, + "grad_norm": 0.549389119772309, + "learning_rate": 1.618372885111955e-05, + "loss": 11.8103, + "step": 30186 + }, + { + "epoch": 1.6438027948513745, + "grad_norm": 0.5392353962985108, + "learning_rate": 1.6178919520276636e-05, + "loss": 11.8543, + "step": 30187 + }, + { + "epoch": 1.6438572488479575, + "grad_norm": 0.5745473003725531, + "learning_rate": 1.6174110841240498e-05, + "loss": 11.8401, + "step": 30188 + }, + { + "epoch": 1.6439117028445407, + "grad_norm": 0.5425281327084583, + "learning_rate": 1.6169302814048505e-05, + "loss": 11.756, + "step": 30189 + }, + { + "epoch": 1.6439661568411237, + "grad_norm": 0.5404762589688521, + "learning_rate": 1.6164495438738093e-05, + "loss": 11.7027, + "step": 30190 + }, + { + "epoch": 1.6440206108377067, + "grad_norm": 0.5224215118945175, + "learning_rate": 1.615968871534661e-05, + "loss": 11.7933, + "step": 30191 + }, + { + "epoch": 1.6440750648342897, + "grad_norm": 0.5558364736720943, + "learning_rate": 1.6154882643911462e-05, + "loss": 11.8388, + "step": 30192 + }, + { + "epoch": 1.6441295188308727, + "grad_norm": 0.5201611773068436, + "learning_rate": 1.6150077224469982e-05, + "loss": 11.7992, + "step": 30193 + }, + { + "epoch": 1.6441839728274557, + "grad_norm": 0.6008427608907387, + "learning_rate": 1.61452724570596e-05, + "loss": 11.9217, + "step": 30194 + }, + { + "epoch": 1.6442384268240389, + "grad_norm": 0.5242592097140267, + "learning_rate": 1.6140468341717606e-05, + "loss": 11.8247, + "step": 30195 + }, + { + "epoch": 1.6442928808206219, + "grad_norm": 0.5179011124989028, + "learning_rate": 1.6135664878481427e-05, + "loss": 11.7884, + "step": 30196 + }, + { + "epoch": 1.6443473348172049, + "grad_norm": 0.5513964123639697, + "learning_rate": 1.6130862067388353e-05, + "loss": 11.8346, + "step": 30197 + }, + { + "epoch": 1.6444017888137878, + "grad_norm": 0.5532758824760037, + "learning_rate": 1.6126059908475778e-05, + "loss": 11.8882, + "step": 30198 + }, + { + "epoch": 1.6444562428103708, + "grad_norm": 0.5063132610517528, + "learning_rate": 1.6121258401781035e-05, + "loss": 11.7788, + "step": 30199 + }, + { + "epoch": 1.6445106968069538, + "grad_norm": 0.5780013179652417, + "learning_rate": 1.6116457547341425e-05, + "loss": 11.7088, + "step": 30200 + }, + { + "epoch": 1.6445651508035368, + "grad_norm": 0.5981226697750344, + "learning_rate": 1.6111657345194308e-05, + "loss": 11.7495, + "step": 30201 + }, + { + "epoch": 1.6446196048001198, + "grad_norm": 0.5244753579200823, + "learning_rate": 1.6106857795377038e-05, + "loss": 11.8968, + "step": 30202 + }, + { + "epoch": 1.6446740587967028, + "grad_norm": 0.5259705428364313, + "learning_rate": 1.6102058897926886e-05, + "loss": 11.747, + "step": 30203 + }, + { + "epoch": 1.6447285127932858, + "grad_norm": 0.6379847630238716, + "learning_rate": 1.609726065288122e-05, + "loss": 11.8394, + "step": 30204 + }, + { + "epoch": 1.6447829667898688, + "grad_norm": 0.5361327029031906, + "learning_rate": 1.609246306027731e-05, + "loss": 11.8516, + "step": 30205 + }, + { + "epoch": 1.6448374207864518, + "grad_norm": 0.46549869099427416, + "learning_rate": 1.608766612015249e-05, + "loss": 11.7019, + "step": 30206 + }, + { + "epoch": 1.6448918747830348, + "grad_norm": 0.5261882995306035, + "learning_rate": 1.6082869832544022e-05, + "loss": 11.8061, + "step": 30207 + }, + { + "epoch": 1.6449463287796178, + "grad_norm": 0.49370947612749594, + "learning_rate": 1.607807419748927e-05, + "loss": 11.7223, + "step": 30208 + }, + { + "epoch": 1.6450007827762008, + "grad_norm": 0.5000206853764118, + "learning_rate": 1.6073279215025473e-05, + "loss": 11.816, + "step": 30209 + }, + { + "epoch": 1.6450552367727838, + "grad_norm": 0.5787497720942635, + "learning_rate": 1.6068484885189915e-05, + "loss": 11.9479, + "step": 30210 + }, + { + "epoch": 1.6451096907693668, + "grad_norm": 0.5372897855231669, + "learning_rate": 1.6063691208019917e-05, + "loss": 11.8536, + "step": 30211 + }, + { + "epoch": 1.64516414476595, + "grad_norm": 0.5002106263799222, + "learning_rate": 1.6058898183552708e-05, + "loss": 11.661, + "step": 30212 + }, + { + "epoch": 1.645218598762533, + "grad_norm": 0.5660293224008779, + "learning_rate": 1.6054105811825582e-05, + "loss": 11.8339, + "step": 30213 + }, + { + "epoch": 1.645273052759116, + "grad_norm": 0.5275471143776981, + "learning_rate": 1.6049314092875832e-05, + "loss": 11.8119, + "step": 30214 + }, + { + "epoch": 1.645327506755699, + "grad_norm": 0.5427198343184654, + "learning_rate": 1.6044523026740666e-05, + "loss": 11.8751, + "step": 30215 + }, + { + "epoch": 1.645381960752282, + "grad_norm": 0.5249795762334607, + "learning_rate": 1.603973261345739e-05, + "loss": 11.7967, + "step": 30216 + }, + { + "epoch": 1.645436414748865, + "grad_norm": 0.5720662556319112, + "learning_rate": 1.6034942853063218e-05, + "loss": 11.844, + "step": 30217 + }, + { + "epoch": 1.6454908687454481, + "grad_norm": 0.6051020981225518, + "learning_rate": 1.603015374559543e-05, + "loss": 11.9617, + "step": 30218 + }, + { + "epoch": 1.6455453227420311, + "grad_norm": 0.5839193438142962, + "learning_rate": 1.6025365291091253e-05, + "loss": 11.8775, + "step": 30219 + }, + { + "epoch": 1.6455997767386141, + "grad_norm": 0.5290094140171435, + "learning_rate": 1.6020577489587897e-05, + "loss": 11.8442, + "step": 30220 + }, + { + "epoch": 1.6456542307351971, + "grad_norm": 0.5457381959843188, + "learning_rate": 1.6015790341122637e-05, + "loss": 11.8472, + "step": 30221 + }, + { + "epoch": 1.6457086847317801, + "grad_norm": 0.5302315325245227, + "learning_rate": 1.601100384573264e-05, + "loss": 11.7743, + "step": 30222 + }, + { + "epoch": 1.645763138728363, + "grad_norm": 0.5443525039553707, + "learning_rate": 1.6006218003455175e-05, + "loss": 11.8231, + "step": 30223 + }, + { + "epoch": 1.645817592724946, + "grad_norm": 0.5151473320863345, + "learning_rate": 1.6001432814327467e-05, + "loss": 11.7664, + "step": 30224 + }, + { + "epoch": 1.645872046721529, + "grad_norm": 0.5958881746221502, + "learning_rate": 1.599664827838667e-05, + "loss": 11.846, + "step": 30225 + }, + { + "epoch": 1.645926500718112, + "grad_norm": 0.5797504547148997, + "learning_rate": 1.5991864395670052e-05, + "loss": 11.8341, + "step": 30226 + }, + { + "epoch": 1.645980954714695, + "grad_norm": 0.5883214560695113, + "learning_rate": 1.598708116621477e-05, + "loss": 11.8504, + "step": 30227 + }, + { + "epoch": 1.646035408711278, + "grad_norm": 0.6053640051559289, + "learning_rate": 1.598229859005804e-05, + "loss": 11.8396, + "step": 30228 + }, + { + "epoch": 1.646089862707861, + "grad_norm": 0.5922548361657342, + "learning_rate": 1.597751666723706e-05, + "loss": 11.8569, + "step": 30229 + }, + { + "epoch": 1.646144316704444, + "grad_norm": 0.5600807084813513, + "learning_rate": 1.5972735397788972e-05, + "loss": 11.7831, + "step": 30230 + }, + { + "epoch": 1.646198770701027, + "grad_norm": 0.509136169809699, + "learning_rate": 1.5967954781751004e-05, + "loss": 11.7968, + "step": 30231 + }, + { + "epoch": 1.64625322469761, + "grad_norm": 0.5757226218075859, + "learning_rate": 1.5963174819160297e-05, + "loss": 11.8568, + "step": 30232 + }, + { + "epoch": 1.646307678694193, + "grad_norm": 0.5645557112183426, + "learning_rate": 1.5958395510054058e-05, + "loss": 11.786, + "step": 30233 + }, + { + "epoch": 1.646362132690776, + "grad_norm": 0.5352800378522243, + "learning_rate": 1.59536168544694e-05, + "loss": 11.66, + "step": 30234 + }, + { + "epoch": 1.6464165866873592, + "grad_norm": 0.520343803438192, + "learning_rate": 1.5948838852443515e-05, + "loss": 11.8258, + "step": 30235 + }, + { + "epoch": 1.6464710406839422, + "grad_norm": 0.5734099053021384, + "learning_rate": 1.5944061504013584e-05, + "loss": 11.9015, + "step": 30236 + }, + { + "epoch": 1.6465254946805252, + "grad_norm": 0.5804734427098937, + "learning_rate": 1.593928480921669e-05, + "loss": 11.8885, + "step": 30237 + }, + { + "epoch": 1.6465799486771082, + "grad_norm": 0.5559074652720587, + "learning_rate": 1.593450876809005e-05, + "loss": 11.8093, + "step": 30238 + }, + { + "epoch": 1.6466344026736912, + "grad_norm": 0.5548681081568707, + "learning_rate": 1.5929733380670763e-05, + "loss": 11.7741, + "step": 30239 + }, + { + "epoch": 1.6466888566702742, + "grad_norm": 0.5482775809889404, + "learning_rate": 1.5924958646995948e-05, + "loss": 11.8452, + "step": 30240 + }, + { + "epoch": 1.6467433106668574, + "grad_norm": 0.515091895675061, + "learning_rate": 1.5920184567102768e-05, + "loss": 11.8557, + "step": 30241 + }, + { + "epoch": 1.6467977646634404, + "grad_norm": 0.6422393359974997, + "learning_rate": 1.5915411141028326e-05, + "loss": 11.6629, + "step": 30242 + }, + { + "epoch": 1.6468522186600234, + "grad_norm": 0.5747118067216739, + "learning_rate": 1.591063836880976e-05, + "loss": 11.7587, + "step": 30243 + }, + { + "epoch": 1.6469066726566064, + "grad_norm": 0.5208001241281782, + "learning_rate": 1.5905866250484146e-05, + "loss": 11.764, + "step": 30244 + }, + { + "epoch": 1.6469611266531894, + "grad_norm": 0.5943357567202925, + "learning_rate": 1.590109478608862e-05, + "loss": 11.6988, + "step": 30245 + }, + { + "epoch": 1.6470155806497724, + "grad_norm": 0.6174542726047334, + "learning_rate": 1.589632397566031e-05, + "loss": 11.8192, + "step": 30246 + }, + { + "epoch": 1.6470700346463554, + "grad_norm": 0.5589766098234975, + "learning_rate": 1.589155381923627e-05, + "loss": 11.8471, + "step": 30247 + }, + { + "epoch": 1.6471244886429384, + "grad_norm": 0.526663364675429, + "learning_rate": 1.588678431685363e-05, + "loss": 11.833, + "step": 30248 + }, + { + "epoch": 1.6471789426395214, + "grad_norm": 0.5420646816181991, + "learning_rate": 1.588201546854946e-05, + "loss": 11.7753, + "step": 30249 + }, + { + "epoch": 1.6472333966361044, + "grad_norm": 0.5585573894236098, + "learning_rate": 1.587724727436082e-05, + "loss": 11.8415, + "step": 30250 + }, + { + "epoch": 1.6472878506326873, + "grad_norm": 0.5718008066308626, + "learning_rate": 1.5872479734324843e-05, + "loss": 11.7366, + "step": 30251 + }, + { + "epoch": 1.6473423046292703, + "grad_norm": 0.4943534299757422, + "learning_rate": 1.5867712848478545e-05, + "loss": 11.7126, + "step": 30252 + }, + { + "epoch": 1.6473967586258533, + "grad_norm": 0.5023601942903341, + "learning_rate": 1.586294661685904e-05, + "loss": 11.7995, + "step": 30253 + }, + { + "epoch": 1.6474512126224363, + "grad_norm": 0.5113537105801732, + "learning_rate": 1.585818103950335e-05, + "loss": 11.8232, + "step": 30254 + }, + { + "epoch": 1.6475056666190193, + "grad_norm": 0.5339525225899784, + "learning_rate": 1.5853416116448582e-05, + "loss": 11.7476, + "step": 30255 + }, + { + "epoch": 1.6475601206156023, + "grad_norm": 0.5436259771386474, + "learning_rate": 1.5848651847731745e-05, + "loss": 11.7871, + "step": 30256 + }, + { + "epoch": 1.6476145746121853, + "grad_norm": 0.6170914582958583, + "learning_rate": 1.5843888233389893e-05, + "loss": 11.8158, + "step": 30257 + }, + { + "epoch": 1.6476690286087683, + "grad_norm": 0.6716163108483342, + "learning_rate": 1.58391252734601e-05, + "loss": 11.836, + "step": 30258 + }, + { + "epoch": 1.6477234826053515, + "grad_norm": 0.5310794370224393, + "learning_rate": 1.5834362967979387e-05, + "loss": 11.829, + "step": 30259 + }, + { + "epoch": 1.6477779366019345, + "grad_norm": 0.481759676350806, + "learning_rate": 1.582960131698479e-05, + "loss": 11.8247, + "step": 30260 + }, + { + "epoch": 1.6478323905985175, + "grad_norm": 0.5315280858473589, + "learning_rate": 1.582484032051329e-05, + "loss": 11.8731, + "step": 30261 + }, + { + "epoch": 1.6478868445951005, + "grad_norm": 0.5378813463110909, + "learning_rate": 1.5820079978601955e-05, + "loss": 11.8076, + "step": 30262 + }, + { + "epoch": 1.6479412985916835, + "grad_norm": 0.55903266225185, + "learning_rate": 1.5815320291287816e-05, + "loss": 11.7443, + "step": 30263 + }, + { + "epoch": 1.6479957525882665, + "grad_norm": 0.6012034498040094, + "learning_rate": 1.5810561258607847e-05, + "loss": 11.7531, + "step": 30264 + }, + { + "epoch": 1.6480502065848497, + "grad_norm": 0.5144155695918641, + "learning_rate": 1.5805802880599097e-05, + "loss": 11.9225, + "step": 30265 + }, + { + "epoch": 1.6481046605814327, + "grad_norm": 0.5474350811292334, + "learning_rate": 1.580104515729851e-05, + "loss": 11.8487, + "step": 30266 + }, + { + "epoch": 1.6481591145780157, + "grad_norm": 0.6174978287790941, + "learning_rate": 1.579628808874315e-05, + "loss": 11.8679, + "step": 30267 + }, + { + "epoch": 1.6482135685745987, + "grad_norm": 0.5420979287960093, + "learning_rate": 1.5791531674969938e-05, + "loss": 11.8612, + "step": 30268 + }, + { + "epoch": 1.6482680225711817, + "grad_norm": 0.5702792554346019, + "learning_rate": 1.5786775916015938e-05, + "loss": 11.8418, + "step": 30269 + }, + { + "epoch": 1.6483224765677646, + "grad_norm": 0.5200834366442612, + "learning_rate": 1.5782020811918075e-05, + "loss": 11.8059, + "step": 30270 + }, + { + "epoch": 1.6483769305643476, + "grad_norm": 0.5350795284900471, + "learning_rate": 1.5777266362713327e-05, + "loss": 11.8158, + "step": 30271 + }, + { + "epoch": 1.6484313845609306, + "grad_norm": 0.5144431219346606, + "learning_rate": 1.577251256843868e-05, + "loss": 11.8311, + "step": 30272 + }, + { + "epoch": 1.6484858385575136, + "grad_norm": 0.5051398509971821, + "learning_rate": 1.5767759429131123e-05, + "loss": 11.805, + "step": 30273 + }, + { + "epoch": 1.6485402925540966, + "grad_norm": 0.5139305998432078, + "learning_rate": 1.5763006944827564e-05, + "loss": 11.8075, + "step": 30274 + }, + { + "epoch": 1.6485947465506796, + "grad_norm": 0.5260812256509733, + "learning_rate": 1.5758255115565023e-05, + "loss": 11.7722, + "step": 30275 + }, + { + "epoch": 1.6486492005472626, + "grad_norm": 0.5193301853136946, + "learning_rate": 1.575350394138039e-05, + "loss": 11.7299, + "step": 30276 + }, + { + "epoch": 1.6487036545438456, + "grad_norm": 0.551471942030761, + "learning_rate": 1.5748753422310657e-05, + "loss": 11.7086, + "step": 30277 + }, + { + "epoch": 1.6487581085404286, + "grad_norm": 0.5041331920105072, + "learning_rate": 1.5744003558392727e-05, + "loss": 11.7779, + "step": 30278 + }, + { + "epoch": 1.6488125625370116, + "grad_norm": 0.5717935631152075, + "learning_rate": 1.573925434966358e-05, + "loss": 11.8936, + "step": 30279 + }, + { + "epoch": 1.6488670165335946, + "grad_norm": 0.5568158823108074, + "learning_rate": 1.5734505796160125e-05, + "loss": 11.9016, + "step": 30280 + }, + { + "epoch": 1.6489214705301776, + "grad_norm": 0.5757758848790586, + "learning_rate": 1.572975789791925e-05, + "loss": 11.8203, + "step": 30281 + }, + { + "epoch": 1.6489759245267608, + "grad_norm": 0.5473651209505923, + "learning_rate": 1.5725010654977944e-05, + "loss": 11.7335, + "step": 30282 + }, + { + "epoch": 1.6490303785233438, + "grad_norm": 0.5518298560932837, + "learning_rate": 1.572026406737306e-05, + "loss": 11.7585, + "step": 30283 + }, + { + "epoch": 1.6490848325199268, + "grad_norm": 0.5395694848191006, + "learning_rate": 1.5715518135141537e-05, + "loss": 11.8149, + "step": 30284 + }, + { + "epoch": 1.6491392865165098, + "grad_norm": 0.5357026261927198, + "learning_rate": 1.57107728583203e-05, + "loss": 11.8469, + "step": 30285 + }, + { + "epoch": 1.6491937405130928, + "grad_norm": 0.5788451144378695, + "learning_rate": 1.5706028236946213e-05, + "loss": 11.9288, + "step": 30286 + }, + { + "epoch": 1.6492481945096757, + "grad_norm": 0.49511969403709766, + "learning_rate": 1.5701284271056206e-05, + "loss": 11.7889, + "step": 30287 + }, + { + "epoch": 1.649302648506259, + "grad_norm": 0.5333781080811473, + "learning_rate": 1.5696540960687124e-05, + "loss": 11.8837, + "step": 30288 + }, + { + "epoch": 1.649357102502842, + "grad_norm": 0.485940058173729, + "learning_rate": 1.5691798305875892e-05, + "loss": 11.7135, + "step": 30289 + }, + { + "epoch": 1.649411556499425, + "grad_norm": 0.5588762528727661, + "learning_rate": 1.5687056306659385e-05, + "loss": 11.7117, + "step": 30290 + }, + { + "epoch": 1.649466010496008, + "grad_norm": 0.4969574205986947, + "learning_rate": 1.568231496307445e-05, + "loss": 11.8301, + "step": 30291 + }, + { + "epoch": 1.649520464492591, + "grad_norm": 0.5388298627014794, + "learning_rate": 1.567757427515799e-05, + "loss": 11.7702, + "step": 30292 + }, + { + "epoch": 1.649574918489174, + "grad_norm": 0.559164915027884, + "learning_rate": 1.5672834242946833e-05, + "loss": 11.8038, + "step": 30293 + }, + { + "epoch": 1.649629372485757, + "grad_norm": 0.5254196074212372, + "learning_rate": 1.5668094866477846e-05, + "loss": 11.7543, + "step": 30294 + }, + { + "epoch": 1.64968382648234, + "grad_norm": 0.5450844572147387, + "learning_rate": 1.5663356145787932e-05, + "loss": 11.8074, + "step": 30295 + }, + { + "epoch": 1.649738280478923, + "grad_norm": 0.5800732777198301, + "learning_rate": 1.5658618080913878e-05, + "loss": 11.9646, + "step": 30296 + }, + { + "epoch": 1.649792734475506, + "grad_norm": 0.5408316042486881, + "learning_rate": 1.5653880671892573e-05, + "loss": 11.7845, + "step": 30297 + }, + { + "epoch": 1.6498471884720889, + "grad_norm": 0.5049158407211579, + "learning_rate": 1.5649143918760822e-05, + "loss": 11.7873, + "step": 30298 + }, + { + "epoch": 1.6499016424686719, + "grad_norm": 0.5507430811172999, + "learning_rate": 1.5644407821555495e-05, + "loss": 11.9438, + "step": 30299 + }, + { + "epoch": 1.6499560964652549, + "grad_norm": 0.537260845691589, + "learning_rate": 1.5639672380313398e-05, + "loss": 11.714, + "step": 30300 + }, + { + "epoch": 1.6500105504618379, + "grad_norm": 0.5595521725432183, + "learning_rate": 1.563493759507133e-05, + "loss": 11.8271, + "step": 30301 + }, + { + "epoch": 1.6500650044584209, + "grad_norm": 0.5486761625586757, + "learning_rate": 1.5630203465866156e-05, + "loss": 11.8217, + "step": 30302 + }, + { + "epoch": 1.6501194584550039, + "grad_norm": 0.5580839278671575, + "learning_rate": 1.5625469992734642e-05, + "loss": 11.7532, + "step": 30303 + }, + { + "epoch": 1.6501739124515868, + "grad_norm": 0.5809952505539281, + "learning_rate": 1.5620737175713663e-05, + "loss": 11.8901, + "step": 30304 + }, + { + "epoch": 1.65022836644817, + "grad_norm": 0.543482058822921, + "learning_rate": 1.561600501483994e-05, + "loss": 11.7574, + "step": 30305 + }, + { + "epoch": 1.650282820444753, + "grad_norm": 0.5544393298231484, + "learning_rate": 1.5611273510150326e-05, + "loss": 11.7676, + "step": 30306 + }, + { + "epoch": 1.650337274441336, + "grad_norm": 0.5144533714967504, + "learning_rate": 1.5606542661681622e-05, + "loss": 11.79, + "step": 30307 + }, + { + "epoch": 1.650391728437919, + "grad_norm": 0.5170192773414732, + "learning_rate": 1.5601812469470565e-05, + "loss": 11.7147, + "step": 30308 + }, + { + "epoch": 1.650446182434502, + "grad_norm": 0.545316082602362, + "learning_rate": 1.559708293355402e-05, + "loss": 11.8639, + "step": 30309 + }, + { + "epoch": 1.650500636431085, + "grad_norm": 0.5979883381590966, + "learning_rate": 1.559235405396867e-05, + "loss": 11.9237, + "step": 30310 + }, + { + "epoch": 1.6505550904276682, + "grad_norm": 0.5953268062769637, + "learning_rate": 1.558762583075133e-05, + "loss": 11.7034, + "step": 30311 + }, + { + "epoch": 1.6506095444242512, + "grad_norm": 0.6066605854287557, + "learning_rate": 1.5582898263938784e-05, + "loss": 11.858, + "step": 30312 + }, + { + "epoch": 1.6506639984208342, + "grad_norm": 0.5447694093535638, + "learning_rate": 1.557817135356775e-05, + "loss": 11.8601, + "step": 30313 + }, + { + "epoch": 1.6507184524174172, + "grad_norm": 0.6123454736841145, + "learning_rate": 1.5573445099675045e-05, + "loss": 11.8336, + "step": 30314 + }, + { + "epoch": 1.6507729064140002, + "grad_norm": 0.5389667746472384, + "learning_rate": 1.556871950229737e-05, + "loss": 11.7927, + "step": 30315 + }, + { + "epoch": 1.6508273604105832, + "grad_norm": 0.5308657024967885, + "learning_rate": 1.5563994561471506e-05, + "loss": 11.675, + "step": 30316 + }, + { + "epoch": 1.6508818144071662, + "grad_norm": 0.525862668934587, + "learning_rate": 1.555927027723416e-05, + "loss": 11.7492, + "step": 30317 + }, + { + "epoch": 1.6509362684037492, + "grad_norm": 0.5386362426469454, + "learning_rate": 1.5554546649622094e-05, + "loss": 11.6782, + "step": 30318 + }, + { + "epoch": 1.6509907224003322, + "grad_norm": 0.5667891737255518, + "learning_rate": 1.5549823678672072e-05, + "loss": 11.9245, + "step": 30319 + }, + { + "epoch": 1.6510451763969152, + "grad_norm": 0.5157830108174235, + "learning_rate": 1.5545101364420744e-05, + "loss": 11.5794, + "step": 30320 + }, + { + "epoch": 1.6510996303934982, + "grad_norm": 0.6135712245199829, + "learning_rate": 1.5540379706904862e-05, + "loss": 11.9485, + "step": 30321 + }, + { + "epoch": 1.6511540843900812, + "grad_norm": 0.5558481003182353, + "learning_rate": 1.5535658706161183e-05, + "loss": 11.8048, + "step": 30322 + }, + { + "epoch": 1.6512085383866641, + "grad_norm": 0.5875586621045195, + "learning_rate": 1.5530938362226353e-05, + "loss": 11.8165, + "step": 30323 + }, + { + "epoch": 1.6512629923832471, + "grad_norm": 0.537000074397403, + "learning_rate": 1.552621867513715e-05, + "loss": 11.8471, + "step": 30324 + }, + { + "epoch": 1.6513174463798301, + "grad_norm": 0.557250138866359, + "learning_rate": 1.5521499644930203e-05, + "loss": 11.7105, + "step": 30325 + }, + { + "epoch": 1.6513719003764131, + "grad_norm": 0.5779101757630384, + "learning_rate": 1.5516781271642265e-05, + "loss": 11.6837, + "step": 30326 + }, + { + "epoch": 1.6514263543729961, + "grad_norm": 0.5137701020962699, + "learning_rate": 1.5512063555309985e-05, + "loss": 11.7143, + "step": 30327 + }, + { + "epoch": 1.6514808083695791, + "grad_norm": 0.5416251137432319, + "learning_rate": 1.5507346495970075e-05, + "loss": 11.7964, + "step": 30328 + }, + { + "epoch": 1.6515352623661623, + "grad_norm": 0.5257476400477481, + "learning_rate": 1.5502630093659243e-05, + "loss": 11.7226, + "step": 30329 + }, + { + "epoch": 1.6515897163627453, + "grad_norm": 0.5530633292834322, + "learning_rate": 1.549791434841409e-05, + "loss": 11.7645, + "step": 30330 + }, + { + "epoch": 1.6516441703593283, + "grad_norm": 0.4887682575319276, + "learning_rate": 1.549319926027134e-05, + "loss": 11.7486, + "step": 30331 + }, + { + "epoch": 1.6516986243559113, + "grad_norm": 0.5035435737874265, + "learning_rate": 1.5488484829267626e-05, + "loss": 11.6502, + "step": 30332 + }, + { + "epoch": 1.6517530783524943, + "grad_norm": 0.5236023357418601, + "learning_rate": 1.5483771055439634e-05, + "loss": 11.8346, + "step": 30333 + }, + { + "epoch": 1.6518075323490773, + "grad_norm": 0.5720862268818151, + "learning_rate": 1.5479057938824025e-05, + "loss": 11.7759, + "step": 30334 + }, + { + "epoch": 1.6518619863456605, + "grad_norm": 0.5189373404562976, + "learning_rate": 1.5474345479457408e-05, + "loss": 11.794, + "step": 30335 + }, + { + "epoch": 1.6519164403422435, + "grad_norm": 0.596743891031663, + "learning_rate": 1.5469633677376495e-05, + "loss": 11.8006, + "step": 30336 + }, + { + "epoch": 1.6519708943388265, + "grad_norm": 0.6333154097587338, + "learning_rate": 1.5464922532617853e-05, + "loss": 11.9669, + "step": 30337 + }, + { + "epoch": 1.6520253483354095, + "grad_norm": 0.5352705912976505, + "learning_rate": 1.546021204521817e-05, + "loss": 11.8758, + "step": 30338 + }, + { + "epoch": 1.6520798023319925, + "grad_norm": 0.5458485553451803, + "learning_rate": 1.5455502215214057e-05, + "loss": 11.8238, + "step": 30339 + }, + { + "epoch": 1.6521342563285755, + "grad_norm": 0.5712397342395055, + "learning_rate": 1.545079304264212e-05, + "loss": 11.7973, + "step": 30340 + }, + { + "epoch": 1.6521887103251585, + "grad_norm": 0.666926277135753, + "learning_rate": 1.544608452753902e-05, + "loss": 11.8262, + "step": 30341 + }, + { + "epoch": 1.6522431643217415, + "grad_norm": 0.543368776603423, + "learning_rate": 1.544137666994131e-05, + "loss": 11.8499, + "step": 30342 + }, + { + "epoch": 1.6522976183183244, + "grad_norm": 0.5593660809025408, + "learning_rate": 1.5436669469885645e-05, + "loss": 11.8503, + "step": 30343 + }, + { + "epoch": 1.6523520723149074, + "grad_norm": 0.5784121439909142, + "learning_rate": 1.543196292740864e-05, + "loss": 11.7, + "step": 30344 + }, + { + "epoch": 1.6524065263114904, + "grad_norm": 0.5811856404692756, + "learning_rate": 1.542725704254685e-05, + "loss": 11.8399, + "step": 30345 + }, + { + "epoch": 1.6524609803080734, + "grad_norm": 0.5337992690973378, + "learning_rate": 1.5422551815336916e-05, + "loss": 11.7704, + "step": 30346 + }, + { + "epoch": 1.6525154343046564, + "grad_norm": 0.5361280886355042, + "learning_rate": 1.5417847245815387e-05, + "loss": 11.6659, + "step": 30347 + }, + { + "epoch": 1.6525698883012394, + "grad_norm": 0.5805437058398001, + "learning_rate": 1.5413143334018875e-05, + "loss": 11.8149, + "step": 30348 + }, + { + "epoch": 1.6526243422978224, + "grad_norm": 0.5204293584801625, + "learning_rate": 1.5408440079983945e-05, + "loss": 11.7907, + "step": 30349 + }, + { + "epoch": 1.6526787962944054, + "grad_norm": 0.5598694718774125, + "learning_rate": 1.540373748374715e-05, + "loss": 11.7298, + "step": 30350 + }, + { + "epoch": 1.6527332502909884, + "grad_norm": 0.5730257959846164, + "learning_rate": 1.539903554534511e-05, + "loss": 11.7879, + "step": 30351 + }, + { + "epoch": 1.6527877042875716, + "grad_norm": 0.5471553983808888, + "learning_rate": 1.5394334264814326e-05, + "loss": 11.796, + "step": 30352 + }, + { + "epoch": 1.6528421582841546, + "grad_norm": 0.6051570299155735, + "learning_rate": 1.5389633642191415e-05, + "loss": 11.8685, + "step": 30353 + }, + { + "epoch": 1.6528966122807376, + "grad_norm": 0.5979375283709545, + "learning_rate": 1.5384933677512868e-05, + "loss": 11.8454, + "step": 30354 + }, + { + "epoch": 1.6529510662773206, + "grad_norm": 0.5348757715511375, + "learning_rate": 1.538023437081528e-05, + "loss": 11.883, + "step": 30355 + }, + { + "epoch": 1.6530055202739036, + "grad_norm": 0.705201060188786, + "learning_rate": 1.5375535722135203e-05, + "loss": 11.8147, + "step": 30356 + }, + { + "epoch": 1.6530599742704866, + "grad_norm": 0.581476274725346, + "learning_rate": 1.5370837731509125e-05, + "loss": 11.8792, + "step": 30357 + }, + { + "epoch": 1.6531144282670698, + "grad_norm": 0.5791998452877999, + "learning_rate": 1.5366140398973627e-05, + "loss": 11.8992, + "step": 30358 + }, + { + "epoch": 1.6531688822636528, + "grad_norm": 0.56410517208088, + "learning_rate": 1.5361443724565183e-05, + "loss": 11.8791, + "step": 30359 + }, + { + "epoch": 1.6532233362602358, + "grad_norm": 0.6183107415967306, + "learning_rate": 1.5356747708320386e-05, + "loss": 11.7988, + "step": 30360 + }, + { + "epoch": 1.6532777902568188, + "grad_norm": 0.6875640496560392, + "learning_rate": 1.5352052350275703e-05, + "loss": 11.8829, + "step": 30361 + }, + { + "epoch": 1.6533322442534018, + "grad_norm": 0.5497935646659076, + "learning_rate": 1.534735765046763e-05, + "loss": 11.791, + "step": 30362 + }, + { + "epoch": 1.6533866982499847, + "grad_norm": 0.5619957708161861, + "learning_rate": 1.5342663608932738e-05, + "loss": 11.8213, + "step": 30363 + }, + { + "epoch": 1.6534411522465677, + "grad_norm": 0.5555121280641989, + "learning_rate": 1.5337970225707456e-05, + "loss": 11.882, + "step": 30364 + }, + { + "epoch": 1.6534956062431507, + "grad_norm": 0.48188241317873043, + "learning_rate": 1.5333277500828337e-05, + "loss": 11.8077, + "step": 30365 + }, + { + "epoch": 1.6535500602397337, + "grad_norm": 0.6001084996859429, + "learning_rate": 1.5328585434331834e-05, + "loss": 11.8115, + "step": 30366 + }, + { + "epoch": 1.6536045142363167, + "grad_norm": 0.5428956276223575, + "learning_rate": 1.5323894026254448e-05, + "loss": 11.8563, + "step": 30367 + }, + { + "epoch": 1.6536589682328997, + "grad_norm": 0.5685570349966749, + "learning_rate": 1.531920327663269e-05, + "loss": 11.7507, + "step": 30368 + }, + { + "epoch": 1.6537134222294827, + "grad_norm": 0.5485690021047516, + "learning_rate": 1.5314513185502976e-05, + "loss": 11.8139, + "step": 30369 + }, + { + "epoch": 1.6537678762260657, + "grad_norm": 0.5509489858868376, + "learning_rate": 1.530982375290184e-05, + "loss": 11.8292, + "step": 30370 + }, + { + "epoch": 1.6538223302226487, + "grad_norm": 0.5656163722575241, + "learning_rate": 1.530513497886571e-05, + "loss": 11.7971, + "step": 30371 + }, + { + "epoch": 1.6538767842192317, + "grad_norm": 0.5823733713054023, + "learning_rate": 1.530044686343104e-05, + "loss": 11.8057, + "step": 30372 + }, + { + "epoch": 1.6539312382158147, + "grad_norm": 0.5984367437471964, + "learning_rate": 1.5295759406634312e-05, + "loss": 11.8007, + "step": 30373 + }, + { + "epoch": 1.6539856922123977, + "grad_norm": 0.5833388668835244, + "learning_rate": 1.5291072608511946e-05, + "loss": 11.7687, + "step": 30374 + }, + { + "epoch": 1.6540401462089809, + "grad_norm": 0.614076766546919, + "learning_rate": 1.5286386469100434e-05, + "loss": 11.9744, + "step": 30375 + }, + { + "epoch": 1.6540946002055639, + "grad_norm": 0.5085048132639166, + "learning_rate": 1.5281700988436153e-05, + "loss": 11.6403, + "step": 30376 + }, + { + "epoch": 1.6541490542021469, + "grad_norm": 0.5215632385558154, + "learning_rate": 1.527701616655558e-05, + "loss": 11.7214, + "step": 30377 + }, + { + "epoch": 1.6542035081987299, + "grad_norm": 0.532165653013137, + "learning_rate": 1.527233200349516e-05, + "loss": 11.7481, + "step": 30378 + }, + { + "epoch": 1.6542579621953128, + "grad_norm": 0.6044869715802822, + "learning_rate": 1.5267648499291266e-05, + "loss": 11.8685, + "step": 30379 + }, + { + "epoch": 1.6543124161918958, + "grad_norm": 0.5182815276832565, + "learning_rate": 1.52629656539804e-05, + "loss": 11.8329, + "step": 30380 + }, + { + "epoch": 1.654366870188479, + "grad_norm": 0.5775482579320368, + "learning_rate": 1.5258283467598878e-05, + "loss": 11.8831, + "step": 30381 + }, + { + "epoch": 1.654421324185062, + "grad_norm": 0.5034491015701649, + "learning_rate": 1.525360194018315e-05, + "loss": 11.7911, + "step": 30382 + }, + { + "epoch": 1.654475778181645, + "grad_norm": 0.52738638170991, + "learning_rate": 1.524892107176964e-05, + "loss": 11.6845, + "step": 30383 + }, + { + "epoch": 1.654530232178228, + "grad_norm": 0.587513577590588, + "learning_rate": 1.5244240862394721e-05, + "loss": 11.806, + "step": 30384 + }, + { + "epoch": 1.654584686174811, + "grad_norm": 0.6151047411750562, + "learning_rate": 1.523956131209482e-05, + "loss": 11.7422, + "step": 30385 + }, + { + "epoch": 1.654639140171394, + "grad_norm": 0.4971705865165115, + "learning_rate": 1.5234882420906282e-05, + "loss": 11.7241, + "step": 30386 + }, + { + "epoch": 1.654693594167977, + "grad_norm": 0.6124434570136407, + "learning_rate": 1.5230204188865538e-05, + "loss": 11.8893, + "step": 30387 + }, + { + "epoch": 1.65474804816456, + "grad_norm": 0.5647274186823761, + "learning_rate": 1.5225526616008912e-05, + "loss": 11.7533, + "step": 30388 + }, + { + "epoch": 1.654802502161143, + "grad_norm": 0.5205825535187503, + "learning_rate": 1.5220849702372819e-05, + "loss": 11.8398, + "step": 30389 + }, + { + "epoch": 1.654856956157726, + "grad_norm": 0.5137468990488969, + "learning_rate": 1.5216173447993654e-05, + "loss": 11.7658, + "step": 30390 + }, + { + "epoch": 1.654911410154309, + "grad_norm": 0.5846701781531567, + "learning_rate": 1.5211497852907697e-05, + "loss": 11.712, + "step": 30391 + }, + { + "epoch": 1.654965864150892, + "grad_norm": 0.5596722748678072, + "learning_rate": 1.5206822917151353e-05, + "loss": 11.8297, + "step": 30392 + }, + { + "epoch": 1.655020318147475, + "grad_norm": 0.6166217791286542, + "learning_rate": 1.5202148640760993e-05, + "loss": 11.888, + "step": 30393 + }, + { + "epoch": 1.655074772144058, + "grad_norm": 0.5143980147394284, + "learning_rate": 1.5197475023772933e-05, + "loss": 11.7012, + "step": 30394 + }, + { + "epoch": 1.655129226140641, + "grad_norm": 0.6011240633541712, + "learning_rate": 1.5192802066223543e-05, + "loss": 11.9945, + "step": 30395 + }, + { + "epoch": 1.655183680137224, + "grad_norm": 0.5237221533972257, + "learning_rate": 1.5188129768149129e-05, + "loss": 11.7944, + "step": 30396 + }, + { + "epoch": 1.655238134133807, + "grad_norm": 0.5219806364379025, + "learning_rate": 1.518345812958606e-05, + "loss": 11.7039, + "step": 30397 + }, + { + "epoch": 1.65529258813039, + "grad_norm": 0.5315686986719056, + "learning_rate": 1.5178787150570617e-05, + "loss": 11.8195, + "step": 30398 + }, + { + "epoch": 1.6553470421269731, + "grad_norm": 0.5243077468399364, + "learning_rate": 1.5174116831139185e-05, + "loss": 11.7269, + "step": 30399 + }, + { + "epoch": 1.6554014961235561, + "grad_norm": 0.4962328515717563, + "learning_rate": 1.5169447171328032e-05, + "loss": 11.7835, + "step": 30400 + }, + { + "epoch": 1.6554559501201391, + "grad_norm": 0.5297610904104818, + "learning_rate": 1.5164778171173465e-05, + "loss": 12.0246, + "step": 30401 + }, + { + "epoch": 1.6555104041167221, + "grad_norm": 0.5402781603697728, + "learning_rate": 1.5160109830711832e-05, + "loss": 11.855, + "step": 30402 + }, + { + "epoch": 1.6555648581133051, + "grad_norm": 0.7292855827356006, + "learning_rate": 1.5155442149979393e-05, + "loss": 11.8603, + "step": 30403 + }, + { + "epoch": 1.6556193121098883, + "grad_norm": 0.574548793554654, + "learning_rate": 1.5150775129012473e-05, + "loss": 11.7707, + "step": 30404 + }, + { + "epoch": 1.6556737661064713, + "grad_norm": 0.5715973146866069, + "learning_rate": 1.5146108767847367e-05, + "loss": 11.8918, + "step": 30405 + }, + { + "epoch": 1.6557282201030543, + "grad_norm": 0.5483522380286712, + "learning_rate": 1.5141443066520323e-05, + "loss": 11.7617, + "step": 30406 + }, + { + "epoch": 1.6557826740996373, + "grad_norm": 0.49708881440381997, + "learning_rate": 1.5136778025067678e-05, + "loss": 11.7585, + "step": 30407 + }, + { + "epoch": 1.6558371280962203, + "grad_norm": 0.5767574518299988, + "learning_rate": 1.5132113643525658e-05, + "loss": 11.7842, + "step": 30408 + }, + { + "epoch": 1.6558915820928033, + "grad_norm": 0.553466801046347, + "learning_rate": 1.512744992193057e-05, + "loss": 11.7271, + "step": 30409 + }, + { + "epoch": 1.6559460360893863, + "grad_norm": 0.5508168131996588, + "learning_rate": 1.5122786860318671e-05, + "loss": 11.7983, + "step": 30410 + }, + { + "epoch": 1.6560004900859693, + "grad_norm": 0.5412522124930113, + "learning_rate": 1.511812445872619e-05, + "loss": 11.8493, + "step": 30411 + }, + { + "epoch": 1.6560549440825523, + "grad_norm": 0.762679716974236, + "learning_rate": 1.511346271718943e-05, + "loss": 11.753, + "step": 30412 + }, + { + "epoch": 1.6561093980791353, + "grad_norm": 0.5304688621580921, + "learning_rate": 1.5108801635744596e-05, + "loss": 11.9053, + "step": 30413 + }, + { + "epoch": 1.6561638520757183, + "grad_norm": 0.5631865236162934, + "learning_rate": 1.510414121442798e-05, + "loss": 11.7591, + "step": 30414 + }, + { + "epoch": 1.6562183060723012, + "grad_norm": 0.5897639777853539, + "learning_rate": 1.5099481453275777e-05, + "loss": 11.7983, + "step": 30415 + }, + { + "epoch": 1.6562727600688842, + "grad_norm": 0.5405331228809653, + "learning_rate": 1.5094822352324246e-05, + "loss": 11.7762, + "step": 30416 + }, + { + "epoch": 1.6563272140654672, + "grad_norm": 0.5257861697223148, + "learning_rate": 1.5090163911609633e-05, + "loss": 11.7752, + "step": 30417 + }, + { + "epoch": 1.6563816680620502, + "grad_norm": 0.5135910112359288, + "learning_rate": 1.5085506131168125e-05, + "loss": 11.6758, + "step": 30418 + }, + { + "epoch": 1.6564361220586332, + "grad_norm": 0.5315099764161411, + "learning_rate": 1.508084901103598e-05, + "loss": 11.7313, + "step": 30419 + }, + { + "epoch": 1.6564905760552162, + "grad_norm": 0.5490301029222859, + "learning_rate": 1.5076192551249402e-05, + "loss": 11.8143, + "step": 30420 + }, + { + "epoch": 1.6565450300517992, + "grad_norm": 0.5209767159520465, + "learning_rate": 1.5071536751844562e-05, + "loss": 11.7725, + "step": 30421 + }, + { + "epoch": 1.6565994840483824, + "grad_norm": 0.5816017991475442, + "learning_rate": 1.506688161285772e-05, + "loss": 11.8754, + "step": 30422 + }, + { + "epoch": 1.6566539380449654, + "grad_norm": 0.5109652757838231, + "learning_rate": 1.5062227134325024e-05, + "loss": 11.5619, + "step": 30423 + }, + { + "epoch": 1.6567083920415484, + "grad_norm": 0.5540279756642714, + "learning_rate": 1.5057573316282725e-05, + "loss": 11.7301, + "step": 30424 + }, + { + "epoch": 1.6567628460381314, + "grad_norm": 0.524685815297852, + "learning_rate": 1.5052920158766948e-05, + "loss": 11.8003, + "step": 30425 + }, + { + "epoch": 1.6568173000347144, + "grad_norm": 0.5097022926707467, + "learning_rate": 1.5048267661813919e-05, + "loss": 11.7585, + "step": 30426 + }, + { + "epoch": 1.6568717540312974, + "grad_norm": 0.545251790768531, + "learning_rate": 1.5043615825459823e-05, + "loss": 11.8773, + "step": 30427 + }, + { + "epoch": 1.6569262080278806, + "grad_norm": 0.5084444355519825, + "learning_rate": 1.5038964649740806e-05, + "loss": 11.8479, + "step": 30428 + }, + { + "epoch": 1.6569806620244636, + "grad_norm": 0.5251794274872539, + "learning_rate": 1.5034314134693061e-05, + "loss": 11.6698, + "step": 30429 + }, + { + "epoch": 1.6570351160210466, + "grad_norm": 0.6197933700010179, + "learning_rate": 1.502966428035274e-05, + "loss": 11.9312, + "step": 30430 + }, + { + "epoch": 1.6570895700176296, + "grad_norm": 0.5446950480380597, + "learning_rate": 1.5025015086755968e-05, + "loss": 11.8244, + "step": 30431 + }, + { + "epoch": 1.6571440240142126, + "grad_norm": 0.49786773049131056, + "learning_rate": 1.5020366553938958e-05, + "loss": 11.7073, + "step": 30432 + }, + { + "epoch": 1.6571984780107956, + "grad_norm": 0.5520210459942693, + "learning_rate": 1.5015718681937818e-05, + "loss": 11.867, + "step": 30433 + }, + { + "epoch": 1.6572529320073786, + "grad_norm": 0.5915950089523143, + "learning_rate": 1.5011071470788706e-05, + "loss": 11.8472, + "step": 30434 + }, + { + "epoch": 1.6573073860039615, + "grad_norm": 0.5431760172320985, + "learning_rate": 1.500642492052774e-05, + "loss": 11.9128, + "step": 30435 + }, + { + "epoch": 1.6573618400005445, + "grad_norm": 0.5358030956140224, + "learning_rate": 1.5001779031191087e-05, + "loss": 11.7488, + "step": 30436 + }, + { + "epoch": 1.6574162939971275, + "grad_norm": 0.6157510286347626, + "learning_rate": 1.4997133802814845e-05, + "loss": 11.8955, + "step": 30437 + }, + { + "epoch": 1.6574707479937105, + "grad_norm": 0.5774760449390086, + "learning_rate": 1.4992489235435126e-05, + "loss": 11.8392, + "step": 30438 + }, + { + "epoch": 1.6575252019902935, + "grad_norm": 0.5401170132973687, + "learning_rate": 1.4987845329088101e-05, + "loss": 11.7123, + "step": 30439 + }, + { + "epoch": 1.6575796559868765, + "grad_norm": 0.5495759029012905, + "learning_rate": 1.4983202083809844e-05, + "loss": 11.9677, + "step": 30440 + }, + { + "epoch": 1.6576341099834595, + "grad_norm": 0.5041097356224132, + "learning_rate": 1.4978559499636447e-05, + "loss": 11.7353, + "step": 30441 + }, + { + "epoch": 1.6576885639800425, + "grad_norm": 0.5175543807236078, + "learning_rate": 1.4973917576604046e-05, + "loss": 11.7952, + "step": 30442 + }, + { + "epoch": 1.6577430179766255, + "grad_norm": 0.5684874834799495, + "learning_rate": 1.4969276314748692e-05, + "loss": 11.7802, + "step": 30443 + }, + { + "epoch": 1.6577974719732085, + "grad_norm": 0.5264629618291342, + "learning_rate": 1.4964635714106545e-05, + "loss": 11.7771, + "step": 30444 + }, + { + "epoch": 1.6578519259697917, + "grad_norm": 0.6101260725520791, + "learning_rate": 1.495999577471362e-05, + "loss": 11.7677, + "step": 30445 + }, + { + "epoch": 1.6579063799663747, + "grad_norm": 0.5461996462767426, + "learning_rate": 1.4955356496606042e-05, + "loss": 11.8557, + "step": 30446 + }, + { + "epoch": 1.6579608339629577, + "grad_norm": 0.5690858666358461, + "learning_rate": 1.4950717879819864e-05, + "loss": 11.7328, + "step": 30447 + }, + { + "epoch": 1.6580152879595407, + "grad_norm": 0.5042772098131838, + "learning_rate": 1.4946079924391165e-05, + "loss": 11.7353, + "step": 30448 + }, + { + "epoch": 1.6580697419561237, + "grad_norm": 0.5551249340875368, + "learning_rate": 1.4941442630356028e-05, + "loss": 11.8102, + "step": 30449 + }, + { + "epoch": 1.6581241959527067, + "grad_norm": 0.6822976424863264, + "learning_rate": 1.4936805997750503e-05, + "loss": 11.9755, + "step": 30450 + }, + { + "epoch": 1.6581786499492899, + "grad_norm": 0.5217195502125209, + "learning_rate": 1.4932170026610636e-05, + "loss": 11.7261, + "step": 30451 + }, + { + "epoch": 1.6582331039458729, + "grad_norm": 0.5350002273576602, + "learning_rate": 1.4927534716972468e-05, + "loss": 11.8111, + "step": 30452 + }, + { + "epoch": 1.6582875579424559, + "grad_norm": 0.5409593969053802, + "learning_rate": 1.4922900068872048e-05, + "loss": 11.8507, + "step": 30453 + }, + { + "epoch": 1.6583420119390389, + "grad_norm": 0.5343336682199202, + "learning_rate": 1.4918266082345444e-05, + "loss": 11.8041, + "step": 30454 + }, + { + "epoch": 1.6583964659356218, + "grad_norm": 0.5439666242708063, + "learning_rate": 1.4913632757428652e-05, + "loss": 11.8396, + "step": 30455 + }, + { + "epoch": 1.6584509199322048, + "grad_norm": 0.5078023638394096, + "learning_rate": 1.4909000094157743e-05, + "loss": 11.7087, + "step": 30456 + }, + { + "epoch": 1.6585053739287878, + "grad_norm": 0.5040080596064688, + "learning_rate": 1.4904368092568699e-05, + "loss": 11.7599, + "step": 30457 + }, + { + "epoch": 1.6585598279253708, + "grad_norm": 0.5456376162897237, + "learning_rate": 1.489973675269758e-05, + "loss": 11.7348, + "step": 30458 + }, + { + "epoch": 1.6586142819219538, + "grad_norm": 0.5226602620290465, + "learning_rate": 1.4895106074580355e-05, + "loss": 11.8768, + "step": 30459 + }, + { + "epoch": 1.6586687359185368, + "grad_norm": 0.5421098455850706, + "learning_rate": 1.4890476058253089e-05, + "loss": 11.8537, + "step": 30460 + }, + { + "epoch": 1.6587231899151198, + "grad_norm": 0.52238904348751, + "learning_rate": 1.4885846703751749e-05, + "loss": 11.8057, + "step": 30461 + }, + { + "epoch": 1.6587776439117028, + "grad_norm": 0.5643276611856208, + "learning_rate": 1.4881218011112308e-05, + "loss": 11.84, + "step": 30462 + }, + { + "epoch": 1.6588320979082858, + "grad_norm": 0.5108632330040422, + "learning_rate": 1.4876589980370825e-05, + "loss": 11.7637, + "step": 30463 + }, + { + "epoch": 1.6588865519048688, + "grad_norm": 0.5748429324447424, + "learning_rate": 1.4871962611563218e-05, + "loss": 11.6599, + "step": 30464 + }, + { + "epoch": 1.6589410059014518, + "grad_norm": 0.5685506754204068, + "learning_rate": 1.486733590472551e-05, + "loss": 11.7119, + "step": 30465 + }, + { + "epoch": 1.6589954598980348, + "grad_norm": 0.5518315507953145, + "learning_rate": 1.4862709859893708e-05, + "loss": 11.8669, + "step": 30466 + }, + { + "epoch": 1.6590499138946178, + "grad_norm": 0.5161797691907695, + "learning_rate": 1.4858084477103717e-05, + "loss": 11.8546, + "step": 30467 + }, + { + "epoch": 1.6591043678912007, + "grad_norm": 0.5500540514779502, + "learning_rate": 1.4853459756391564e-05, + "loss": 11.891, + "step": 30468 + }, + { + "epoch": 1.659158821887784, + "grad_norm": 0.5261406948857814, + "learning_rate": 1.4848835697793174e-05, + "loss": 11.8261, + "step": 30469 + }, + { + "epoch": 1.659213275884367, + "grad_norm": 0.5221450123639628, + "learning_rate": 1.484421230134453e-05, + "loss": 11.8626, + "step": 30470 + }, + { + "epoch": 1.65926772988095, + "grad_norm": 0.5370308777913975, + "learning_rate": 1.483958956708157e-05, + "loss": 11.7647, + "step": 30471 + }, + { + "epoch": 1.659322183877533, + "grad_norm": 0.5451917737767658, + "learning_rate": 1.483496749504022e-05, + "loss": 11.7655, + "step": 30472 + }, + { + "epoch": 1.659376637874116, + "grad_norm": 0.5512427098784394, + "learning_rate": 1.4830346085256475e-05, + "loss": 11.728, + "step": 30473 + }, + { + "epoch": 1.6594310918706991, + "grad_norm": 0.48879493807625274, + "learning_rate": 1.4825725337766217e-05, + "loss": 11.7241, + "step": 30474 + }, + { + "epoch": 1.6594855458672821, + "grad_norm": 0.5323193600760803, + "learning_rate": 1.4821105252605404e-05, + "loss": 11.6926, + "step": 30475 + }, + { + "epoch": 1.6595399998638651, + "grad_norm": 0.5357552343034483, + "learning_rate": 1.481648582980998e-05, + "loss": 11.873, + "step": 30476 + }, + { + "epoch": 1.6595944538604481, + "grad_norm": 0.5843543350729047, + "learning_rate": 1.481186706941583e-05, + "loss": 11.749, + "step": 30477 + }, + { + "epoch": 1.6596489078570311, + "grad_norm": 0.6312598436209041, + "learning_rate": 1.4807248971458898e-05, + "loss": 11.9846, + "step": 30478 + }, + { + "epoch": 1.6597033618536141, + "grad_norm": 0.6070430916918202, + "learning_rate": 1.4802631535975076e-05, + "loss": 11.8512, + "step": 30479 + }, + { + "epoch": 1.659757815850197, + "grad_norm": 0.5799074330813278, + "learning_rate": 1.47980147630003e-05, + "loss": 11.8646, + "step": 30480 + }, + { + "epoch": 1.65981226984678, + "grad_norm": 0.5673598018127015, + "learning_rate": 1.4793398652570445e-05, + "loss": 11.7869, + "step": 30481 + }, + { + "epoch": 1.659866723843363, + "grad_norm": 0.5577040261014342, + "learning_rate": 1.478878320472139e-05, + "loss": 11.8044, + "step": 30482 + }, + { + "epoch": 1.659921177839946, + "grad_norm": 0.5914851761444279, + "learning_rate": 1.4784168419489065e-05, + "loss": 11.926, + "step": 30483 + }, + { + "epoch": 1.659975631836529, + "grad_norm": 0.5864248361351323, + "learning_rate": 1.4779554296909315e-05, + "loss": 11.8559, + "step": 30484 + }, + { + "epoch": 1.660030085833112, + "grad_norm": 0.5727600462604755, + "learning_rate": 1.477494083701807e-05, + "loss": 11.7601, + "step": 30485 + }, + { + "epoch": 1.660084539829695, + "grad_norm": 0.5837628149486116, + "learning_rate": 1.4770328039851155e-05, + "loss": 11.8826, + "step": 30486 + }, + { + "epoch": 1.660138993826278, + "grad_norm": 0.5257373739101987, + "learning_rate": 1.4765715905444455e-05, + "loss": 11.7619, + "step": 30487 + }, + { + "epoch": 1.660193447822861, + "grad_norm": 0.5569929326785522, + "learning_rate": 1.4761104433833873e-05, + "loss": 11.7252, + "step": 30488 + }, + { + "epoch": 1.660247901819444, + "grad_norm": 0.5596793453691836, + "learning_rate": 1.4756493625055211e-05, + "loss": 11.5962, + "step": 30489 + }, + { + "epoch": 1.660302355816027, + "grad_norm": 0.5268308993081, + "learning_rate": 1.475188347914438e-05, + "loss": 11.8873, + "step": 30490 + }, + { + "epoch": 1.66035680981261, + "grad_norm": 0.5503512179842955, + "learning_rate": 1.474727399613719e-05, + "loss": 11.9062, + "step": 30491 + }, + { + "epoch": 1.6604112638091932, + "grad_norm": 0.5652508792980745, + "learning_rate": 1.474266517606947e-05, + "loss": 11.92, + "step": 30492 + }, + { + "epoch": 1.6604657178057762, + "grad_norm": 0.4919906257929225, + "learning_rate": 1.4738057018977114e-05, + "loss": 11.8253, + "step": 30493 + }, + { + "epoch": 1.6605201718023592, + "grad_norm": 0.5038695577291233, + "learning_rate": 1.4733449524895893e-05, + "loss": 11.8345, + "step": 30494 + }, + { + "epoch": 1.6605746257989422, + "grad_norm": 0.5230842360938279, + "learning_rate": 1.47288426938617e-05, + "loss": 11.7675, + "step": 30495 + }, + { + "epoch": 1.6606290797955252, + "grad_norm": 0.5818535948817662, + "learning_rate": 1.4724236525910296e-05, + "loss": 11.876, + "step": 30496 + }, + { + "epoch": 1.6606835337921082, + "grad_norm": 0.5106333140474635, + "learning_rate": 1.471963102107753e-05, + "loss": 11.5331, + "step": 30497 + }, + { + "epoch": 1.6607379877886914, + "grad_norm": 0.5344151145727453, + "learning_rate": 1.4715026179399239e-05, + "loss": 11.7591, + "step": 30498 + }, + { + "epoch": 1.6607924417852744, + "grad_norm": 0.500887120599743, + "learning_rate": 1.4710422000911183e-05, + "loss": 11.7601, + "step": 30499 + }, + { + "epoch": 1.6608468957818574, + "grad_norm": 0.5596855613702822, + "learning_rate": 1.4705818485649237e-05, + "loss": 11.6184, + "step": 30500 + }, + { + "epoch": 1.6609013497784404, + "grad_norm": 0.6390268041634257, + "learning_rate": 1.4701215633649102e-05, + "loss": 11.8665, + "step": 30501 + }, + { + "epoch": 1.6609558037750234, + "grad_norm": 0.5300312005429197, + "learning_rate": 1.4696613444946627e-05, + "loss": 11.7044, + "step": 30502 + }, + { + "epoch": 1.6610102577716064, + "grad_norm": 0.5479748636290885, + "learning_rate": 1.4692011919577609e-05, + "loss": 11.6887, + "step": 30503 + }, + { + "epoch": 1.6610647117681894, + "grad_norm": 0.5865166959549861, + "learning_rate": 1.4687411057577782e-05, + "loss": 11.8067, + "step": 30504 + }, + { + "epoch": 1.6611191657647724, + "grad_norm": 0.631559322131889, + "learning_rate": 1.4682810858982986e-05, + "loss": 11.8915, + "step": 30505 + }, + { + "epoch": 1.6611736197613554, + "grad_norm": 0.6867109202353935, + "learning_rate": 1.4678211323828939e-05, + "loss": 11.8789, + "step": 30506 + }, + { + "epoch": 1.6612280737579384, + "grad_norm": 0.5306137175143503, + "learning_rate": 1.4673612452151441e-05, + "loss": 11.8295, + "step": 30507 + }, + { + "epoch": 1.6612825277545213, + "grad_norm": 0.5384906241963868, + "learning_rate": 1.4669014243986224e-05, + "loss": 11.7728, + "step": 30508 + }, + { + "epoch": 1.6613369817511043, + "grad_norm": 0.5363580618189482, + "learning_rate": 1.4664416699369065e-05, + "loss": 11.643, + "step": 30509 + }, + { + "epoch": 1.6613914357476873, + "grad_norm": 0.5276766457965232, + "learning_rate": 1.4659819818335729e-05, + "loss": 11.7706, + "step": 30510 + }, + { + "epoch": 1.6614458897442703, + "grad_norm": 0.5972428001730457, + "learning_rate": 1.4655223600921952e-05, + "loss": 11.7991, + "step": 30511 + }, + { + "epoch": 1.6615003437408533, + "grad_norm": 0.5463440321434727, + "learning_rate": 1.4650628047163472e-05, + "loss": 11.7979, + "step": 30512 + }, + { + "epoch": 1.6615547977374363, + "grad_norm": 0.5086481462827724, + "learning_rate": 1.4646033157095996e-05, + "loss": 11.8116, + "step": 30513 + }, + { + "epoch": 1.6616092517340193, + "grad_norm": 0.553074260326025, + "learning_rate": 1.4641438930755268e-05, + "loss": 11.7544, + "step": 30514 + }, + { + "epoch": 1.6616637057306025, + "grad_norm": 0.5367150667301452, + "learning_rate": 1.4636845368177065e-05, + "loss": 11.8423, + "step": 30515 + }, + { + "epoch": 1.6617181597271855, + "grad_norm": 0.56086017051629, + "learning_rate": 1.4632252469397034e-05, + "loss": 11.8746, + "step": 30516 + }, + { + "epoch": 1.6617726137237685, + "grad_norm": 0.5090796710024545, + "learning_rate": 1.4627660234450957e-05, + "loss": 11.8088, + "step": 30517 + }, + { + "epoch": 1.6618270677203515, + "grad_norm": 0.5647977886294014, + "learning_rate": 1.4623068663374484e-05, + "loss": 11.8778, + "step": 30518 + }, + { + "epoch": 1.6618815217169345, + "grad_norm": 0.5852868407151046, + "learning_rate": 1.4618477756203364e-05, + "loss": 11.7201, + "step": 30519 + }, + { + "epoch": 1.6619359757135175, + "grad_norm": 0.6172226511052394, + "learning_rate": 1.4613887512973257e-05, + "loss": 11.7215, + "step": 30520 + }, + { + "epoch": 1.6619904297101007, + "grad_norm": 0.5306618304665179, + "learning_rate": 1.4609297933719913e-05, + "loss": 11.7218, + "step": 30521 + }, + { + "epoch": 1.6620448837066837, + "grad_norm": 0.5640490787690716, + "learning_rate": 1.460470901847898e-05, + "loss": 11.8726, + "step": 30522 + }, + { + "epoch": 1.6620993377032667, + "grad_norm": 0.589495949418483, + "learning_rate": 1.460012076728613e-05, + "loss": 11.8359, + "step": 30523 + }, + { + "epoch": 1.6621537916998497, + "grad_norm": 0.5767911074678742, + "learning_rate": 1.4595533180177057e-05, + "loss": 11.8568, + "step": 30524 + }, + { + "epoch": 1.6622082456964327, + "grad_norm": 0.5925122392277705, + "learning_rate": 1.4590946257187465e-05, + "loss": 11.7018, + "step": 30525 + }, + { + "epoch": 1.6622626996930157, + "grad_norm": 0.5372430755545777, + "learning_rate": 1.4586359998352984e-05, + "loss": 11.6798, + "step": 30526 + }, + { + "epoch": 1.6623171536895986, + "grad_norm": 0.5798060665421161, + "learning_rate": 1.4581774403709303e-05, + "loss": 11.8282, + "step": 30527 + }, + { + "epoch": 1.6623716076861816, + "grad_norm": 0.5419721798492204, + "learning_rate": 1.4577189473292053e-05, + "loss": 11.8512, + "step": 30528 + }, + { + "epoch": 1.6624260616827646, + "grad_norm": 0.5229119328459965, + "learning_rate": 1.4572605207136925e-05, + "loss": 11.8269, + "step": 30529 + }, + { + "epoch": 1.6624805156793476, + "grad_norm": 0.5830340866341808, + "learning_rate": 1.4568021605279525e-05, + "loss": 11.8343, + "step": 30530 + }, + { + "epoch": 1.6625349696759306, + "grad_norm": 0.5839352903112112, + "learning_rate": 1.4563438667755536e-05, + "loss": 11.7647, + "step": 30531 + }, + { + "epoch": 1.6625894236725136, + "grad_norm": 0.5442136620128387, + "learning_rate": 1.4558856394600573e-05, + "loss": 11.7684, + "step": 30532 + }, + { + "epoch": 1.6626438776690966, + "grad_norm": 0.5287246160487524, + "learning_rate": 1.455427478585024e-05, + "loss": 11.7827, + "step": 30533 + }, + { + "epoch": 1.6626983316656796, + "grad_norm": 0.6048785959870898, + "learning_rate": 1.4549693841540235e-05, + "loss": 11.8265, + "step": 30534 + }, + { + "epoch": 1.6627527856622626, + "grad_norm": 0.5371811579078356, + "learning_rate": 1.4545113561706114e-05, + "loss": 11.9294, + "step": 30535 + }, + { + "epoch": 1.6628072396588456, + "grad_norm": 0.5189119070180531, + "learning_rate": 1.4540533946383516e-05, + "loss": 11.7807, + "step": 30536 + }, + { + "epoch": 1.6628616936554286, + "grad_norm": 0.5601368480764733, + "learning_rate": 1.4535954995608081e-05, + "loss": 11.7246, + "step": 30537 + }, + { + "epoch": 1.6629161476520118, + "grad_norm": 0.5494316509945332, + "learning_rate": 1.4531376709415368e-05, + "loss": 11.8384, + "step": 30538 + }, + { + "epoch": 1.6629706016485948, + "grad_norm": 0.521643862118866, + "learning_rate": 1.4526799087841037e-05, + "loss": 11.7458, + "step": 30539 + }, + { + "epoch": 1.6630250556451778, + "grad_norm": 0.5813182171851603, + "learning_rate": 1.4522222130920616e-05, + "loss": 11.8722, + "step": 30540 + }, + { + "epoch": 1.6630795096417608, + "grad_norm": 0.515015291051856, + "learning_rate": 1.4517645838689754e-05, + "loss": 11.6671, + "step": 30541 + }, + { + "epoch": 1.6631339636383438, + "grad_norm": 0.6300347934852151, + "learning_rate": 1.451307021118401e-05, + "loss": 11.933, + "step": 30542 + }, + { + "epoch": 1.6631884176349268, + "grad_norm": 0.5629815631744695, + "learning_rate": 1.4508495248438958e-05, + "loss": 11.8467, + "step": 30543 + }, + { + "epoch": 1.66324287163151, + "grad_norm": 0.5630607797950447, + "learning_rate": 1.4503920950490202e-05, + "loss": 11.7637, + "step": 30544 + }, + { + "epoch": 1.663297325628093, + "grad_norm": 0.5409344508512145, + "learning_rate": 1.4499347317373268e-05, + "loss": 11.8202, + "step": 30545 + }, + { + "epoch": 1.663351779624676, + "grad_norm": 0.5470298260052968, + "learning_rate": 1.4494774349123741e-05, + "loss": 11.834, + "step": 30546 + }, + { + "epoch": 1.663406233621259, + "grad_norm": 0.5327489775189177, + "learning_rate": 1.4490202045777224e-05, + "loss": 11.6929, + "step": 30547 + }, + { + "epoch": 1.663460687617842, + "grad_norm": 0.5302068927531721, + "learning_rate": 1.4485630407369211e-05, + "loss": 11.6785, + "step": 30548 + }, + { + "epoch": 1.663515141614425, + "grad_norm": 0.523235391529584, + "learning_rate": 1.4481059433935296e-05, + "loss": 11.812, + "step": 30549 + }, + { + "epoch": 1.663569595611008, + "grad_norm": 0.5708136537775274, + "learning_rate": 1.4476489125510973e-05, + "loss": 11.728, + "step": 30550 + }, + { + "epoch": 1.663624049607591, + "grad_norm": 0.5440375571530032, + "learning_rate": 1.4471919482131846e-05, + "loss": 11.7661, + "step": 30551 + }, + { + "epoch": 1.663678503604174, + "grad_norm": 0.5273290339034447, + "learning_rate": 1.4467350503833421e-05, + "loss": 11.7998, + "step": 30552 + }, + { + "epoch": 1.663732957600757, + "grad_norm": 0.837060976833387, + "learning_rate": 1.4462782190651191e-05, + "loss": 11.6781, + "step": 30553 + }, + { + "epoch": 1.66378741159734, + "grad_norm": 0.5517925530154327, + "learning_rate": 1.4458214542620729e-05, + "loss": 11.889, + "step": 30554 + }, + { + "epoch": 1.6638418655939229, + "grad_norm": 0.5398113363513517, + "learning_rate": 1.4453647559777527e-05, + "loss": 11.8162, + "step": 30555 + }, + { + "epoch": 1.6638963195905059, + "grad_norm": 0.48781454370799227, + "learning_rate": 1.4449081242157126e-05, + "loss": 11.6628, + "step": 30556 + }, + { + "epoch": 1.6639507735870889, + "grad_norm": 0.5324761965955556, + "learning_rate": 1.4444515589794982e-05, + "loss": 11.7681, + "step": 30557 + }, + { + "epoch": 1.6640052275836719, + "grad_norm": 0.5190118095315441, + "learning_rate": 1.4439950602726648e-05, + "loss": 11.7628, + "step": 30558 + }, + { + "epoch": 1.6640596815802549, + "grad_norm": 0.562217111189288, + "learning_rate": 1.4435386280987618e-05, + "loss": 11.9103, + "step": 30559 + }, + { + "epoch": 1.6641141355768378, + "grad_norm": 0.5883842957215601, + "learning_rate": 1.443082262461336e-05, + "loss": 11.9338, + "step": 30560 + }, + { + "epoch": 1.6641685895734208, + "grad_norm": 0.6249725874539235, + "learning_rate": 1.4426259633639416e-05, + "loss": 11.8424, + "step": 30561 + }, + { + "epoch": 1.664223043570004, + "grad_norm": 0.4985586351457106, + "learning_rate": 1.4421697308101178e-05, + "loss": 11.7519, + "step": 30562 + }, + { + "epoch": 1.664277497566587, + "grad_norm": 0.5391981641148595, + "learning_rate": 1.4417135648034186e-05, + "loss": 11.8427, + "step": 30563 + }, + { + "epoch": 1.66433195156317, + "grad_norm": 0.5288522672615051, + "learning_rate": 1.441257465347391e-05, + "loss": 11.7846, + "step": 30564 + }, + { + "epoch": 1.664386405559753, + "grad_norm": 0.5518269616252821, + "learning_rate": 1.4408014324455787e-05, + "loss": 11.7347, + "step": 30565 + }, + { + "epoch": 1.664440859556336, + "grad_norm": 0.6027580192488022, + "learning_rate": 1.4403454661015326e-05, + "loss": 11.8817, + "step": 30566 + }, + { + "epoch": 1.664495313552919, + "grad_norm": 0.5729240918869509, + "learning_rate": 1.4398895663187927e-05, + "loss": 11.7502, + "step": 30567 + }, + { + "epoch": 1.6645497675495022, + "grad_norm": 0.5349704726100336, + "learning_rate": 1.43943373310091e-05, + "loss": 11.6887, + "step": 30568 + }, + { + "epoch": 1.6646042215460852, + "grad_norm": 0.6587897180079224, + "learning_rate": 1.4389779664514235e-05, + "loss": 11.7798, + "step": 30569 + }, + { + "epoch": 1.6646586755426682, + "grad_norm": 0.5412107808277605, + "learning_rate": 1.4385222663738796e-05, + "loss": 11.8893, + "step": 30570 + }, + { + "epoch": 1.6647131295392512, + "grad_norm": 0.5234676013799723, + "learning_rate": 1.4380666328718274e-05, + "loss": 11.8058, + "step": 30571 + }, + { + "epoch": 1.6647675835358342, + "grad_norm": 0.5545345381822373, + "learning_rate": 1.4376110659488006e-05, + "loss": 11.8276, + "step": 30572 + }, + { + "epoch": 1.6648220375324172, + "grad_norm": 0.5127483290730295, + "learning_rate": 1.4371555656083457e-05, + "loss": 11.7903, + "step": 30573 + }, + { + "epoch": 1.6648764915290002, + "grad_norm": 0.6074805114055226, + "learning_rate": 1.4367001318540075e-05, + "loss": 11.7282, + "step": 30574 + }, + { + "epoch": 1.6649309455255832, + "grad_norm": 0.524109377181651, + "learning_rate": 1.4362447646893218e-05, + "loss": 11.774, + "step": 30575 + }, + { + "epoch": 1.6649853995221662, + "grad_norm": 0.5469659882901008, + "learning_rate": 1.4357894641178371e-05, + "loss": 11.7587, + "step": 30576 + }, + { + "epoch": 1.6650398535187492, + "grad_norm": 0.5216704136518607, + "learning_rate": 1.435334230143086e-05, + "loss": 11.8727, + "step": 30577 + }, + { + "epoch": 1.6650943075153322, + "grad_norm": 0.5255879142118899, + "learning_rate": 1.4348790627686149e-05, + "loss": 11.8424, + "step": 30578 + }, + { + "epoch": 1.6651487615119152, + "grad_norm": 0.5120144482353609, + "learning_rate": 1.4344239619979583e-05, + "loss": 11.7848, + "step": 30579 + }, + { + "epoch": 1.6652032155084981, + "grad_norm": 0.6554859104398678, + "learning_rate": 1.433968927834658e-05, + "loss": 11.7708, + "step": 30580 + }, + { + "epoch": 1.6652576695050811, + "grad_norm": 0.5535995453122468, + "learning_rate": 1.4335139602822557e-05, + "loss": 11.6216, + "step": 30581 + }, + { + "epoch": 1.6653121235016641, + "grad_norm": 0.5198100048389207, + "learning_rate": 1.4330590593442817e-05, + "loss": 11.5929, + "step": 30582 + }, + { + "epoch": 1.6653665774982471, + "grad_norm": 0.5358411842760149, + "learning_rate": 1.4326042250242789e-05, + "loss": 11.8566, + "step": 30583 + }, + { + "epoch": 1.6654210314948301, + "grad_norm": 0.5785564122028076, + "learning_rate": 1.43214945732578e-05, + "loss": 11.7175, + "step": 30584 + }, + { + "epoch": 1.6654754854914133, + "grad_norm": 0.5435001051217152, + "learning_rate": 1.431694756252323e-05, + "loss": 11.7581, + "step": 30585 + }, + { + "epoch": 1.6655299394879963, + "grad_norm": 0.5384336380334687, + "learning_rate": 1.4312401218074478e-05, + "loss": 11.7637, + "step": 30586 + }, + { + "epoch": 1.6655843934845793, + "grad_norm": 0.5429834110530761, + "learning_rate": 1.4307855539946847e-05, + "loss": 11.8063, + "step": 30587 + }, + { + "epoch": 1.6656388474811623, + "grad_norm": 0.5496962632683134, + "learning_rate": 1.4303310528175717e-05, + "loss": 11.8142, + "step": 30588 + }, + { + "epoch": 1.6656933014777453, + "grad_norm": 0.5093237154352173, + "learning_rate": 1.4298766182796386e-05, + "loss": 11.7745, + "step": 30589 + }, + { + "epoch": 1.6657477554743283, + "grad_norm": 0.5365372478799709, + "learning_rate": 1.4294222503844257e-05, + "loss": 11.8519, + "step": 30590 + }, + { + "epoch": 1.6658022094709115, + "grad_norm": 0.5333590014600259, + "learning_rate": 1.4289679491354613e-05, + "loss": 11.8323, + "step": 30591 + }, + { + "epoch": 1.6658566634674945, + "grad_norm": 0.5855486768965955, + "learning_rate": 1.4285137145362781e-05, + "loss": 11.8671, + "step": 30592 + }, + { + "epoch": 1.6659111174640775, + "grad_norm": 0.5117628893373161, + "learning_rate": 1.4280595465904123e-05, + "loss": 11.8224, + "step": 30593 + }, + { + "epoch": 1.6659655714606605, + "grad_norm": 0.5177405067725183, + "learning_rate": 1.42760544530139e-05, + "loss": 11.727, + "step": 30594 + }, + { + "epoch": 1.6660200254572435, + "grad_norm": 0.590421014802051, + "learning_rate": 1.4271514106727458e-05, + "loss": 11.8403, + "step": 30595 + }, + { + "epoch": 1.6660744794538265, + "grad_norm": 0.5205639485560986, + "learning_rate": 1.4266974427080115e-05, + "loss": 11.8655, + "step": 30596 + }, + { + "epoch": 1.6661289334504095, + "grad_norm": 0.6257887917244348, + "learning_rate": 1.4262435414107134e-05, + "loss": 11.9091, + "step": 30597 + }, + { + "epoch": 1.6661833874469925, + "grad_norm": 0.5213222188455704, + "learning_rate": 1.4257897067843862e-05, + "loss": 11.7735, + "step": 30598 + }, + { + "epoch": 1.6662378414435755, + "grad_norm": 0.6016607595260474, + "learning_rate": 1.4253359388325537e-05, + "loss": 11.7408, + "step": 30599 + }, + { + "epoch": 1.6662922954401584, + "grad_norm": 0.5728991967714202, + "learning_rate": 1.4248822375587489e-05, + "loss": 11.7922, + "step": 30600 + }, + { + "epoch": 1.6663467494367414, + "grad_norm": 0.6421991607919589, + "learning_rate": 1.4244286029664988e-05, + "loss": 11.9538, + "step": 30601 + }, + { + "epoch": 1.6664012034333244, + "grad_norm": 0.527930619823098, + "learning_rate": 1.4239750350593273e-05, + "loss": 11.8458, + "step": 30602 + }, + { + "epoch": 1.6664556574299074, + "grad_norm": 0.5772038038487892, + "learning_rate": 1.4235215338407658e-05, + "loss": 11.7478, + "step": 30603 + }, + { + "epoch": 1.6665101114264904, + "grad_norm": 0.5274345854094674, + "learning_rate": 1.4230680993143376e-05, + "loss": 11.7769, + "step": 30604 + }, + { + "epoch": 1.6665645654230734, + "grad_norm": 0.5411191221571823, + "learning_rate": 1.4226147314835714e-05, + "loss": 11.7858, + "step": 30605 + }, + { + "epoch": 1.6666190194196564, + "grad_norm": 0.4910491243074338, + "learning_rate": 1.4221614303519904e-05, + "loss": 11.7864, + "step": 30606 + }, + { + "epoch": 1.6666734734162394, + "grad_norm": 0.5490921718782192, + "learning_rate": 1.4217081959231204e-05, + "loss": 11.8052, + "step": 30607 + }, + { + "epoch": 1.6667279274128226, + "grad_norm": 0.5370539524170274, + "learning_rate": 1.4212550282004878e-05, + "loss": 11.8518, + "step": 30608 + }, + { + "epoch": 1.6667823814094056, + "grad_norm": 0.5763943784481144, + "learning_rate": 1.4208019271876128e-05, + "loss": 11.8761, + "step": 30609 + }, + { + "epoch": 1.6668368354059886, + "grad_norm": 0.5489631288190735, + "learning_rate": 1.4203488928880226e-05, + "loss": 11.7234, + "step": 30610 + }, + { + "epoch": 1.6668912894025716, + "grad_norm": 0.6224037636667891, + "learning_rate": 1.4198959253052391e-05, + "loss": 11.8845, + "step": 30611 + }, + { + "epoch": 1.6669457433991546, + "grad_norm": 0.5296441792635029, + "learning_rate": 1.4194430244427802e-05, + "loss": 11.8248, + "step": 30612 + }, + { + "epoch": 1.6670001973957376, + "grad_norm": 0.6968604979288429, + "learning_rate": 1.4189901903041746e-05, + "loss": 11.8933, + "step": 30613 + }, + { + "epoch": 1.6670546513923208, + "grad_norm": 0.5899802956762564, + "learning_rate": 1.4185374228929382e-05, + "loss": 11.9042, + "step": 30614 + }, + { + "epoch": 1.6671091053889038, + "grad_norm": 0.5404336158935368, + "learning_rate": 1.4180847222125959e-05, + "loss": 11.834, + "step": 30615 + }, + { + "epoch": 1.6671635593854868, + "grad_norm": 0.5609721932057188, + "learning_rate": 1.4176320882666627e-05, + "loss": 11.8251, + "step": 30616 + }, + { + "epoch": 1.6672180133820698, + "grad_norm": 0.5526477556977242, + "learning_rate": 1.4171795210586658e-05, + "loss": 11.7074, + "step": 30617 + }, + { + "epoch": 1.6672724673786528, + "grad_norm": 0.5548255554874686, + "learning_rate": 1.4167270205921169e-05, + "loss": 11.7813, + "step": 30618 + }, + { + "epoch": 1.6673269213752357, + "grad_norm": 0.5542599397274358, + "learning_rate": 1.4162745868705373e-05, + "loss": 11.7384, + "step": 30619 + }, + { + "epoch": 1.6673813753718187, + "grad_norm": 0.5542399215659537, + "learning_rate": 1.4158222198974502e-05, + "loss": 11.8433, + "step": 30620 + }, + { + "epoch": 1.6674358293684017, + "grad_norm": 0.5338091516642297, + "learning_rate": 1.415369919676368e-05, + "loss": 11.8447, + "step": 30621 + }, + { + "epoch": 1.6674902833649847, + "grad_norm": 0.5192170780996689, + "learning_rate": 1.414917686210806e-05, + "loss": 11.8986, + "step": 30622 + }, + { + "epoch": 1.6675447373615677, + "grad_norm": 0.563720811981106, + "learning_rate": 1.4144655195042877e-05, + "loss": 11.8853, + "step": 30623 + }, + { + "epoch": 1.6675991913581507, + "grad_norm": 0.5031909712320997, + "learning_rate": 1.4140134195603216e-05, + "loss": 11.7591, + "step": 30624 + }, + { + "epoch": 1.6676536453547337, + "grad_norm": 0.6025047116870932, + "learning_rate": 1.4135613863824304e-05, + "loss": 11.7564, + "step": 30625 + }, + { + "epoch": 1.6677080993513167, + "grad_norm": 0.4733156463199147, + "learning_rate": 1.4131094199741224e-05, + "loss": 11.8403, + "step": 30626 + }, + { + "epoch": 1.6677625533478997, + "grad_norm": 0.5721359322113635, + "learning_rate": 1.4126575203389181e-05, + "loss": 11.8232, + "step": 30627 + }, + { + "epoch": 1.6678170073444827, + "grad_norm": 0.534762221395335, + "learning_rate": 1.412205687480328e-05, + "loss": 11.779, + "step": 30628 + }, + { + "epoch": 1.6678714613410657, + "grad_norm": 0.5322262854907802, + "learning_rate": 1.411753921401865e-05, + "loss": 11.7902, + "step": 30629 + }, + { + "epoch": 1.6679259153376487, + "grad_norm": 0.5514355863362038, + "learning_rate": 1.4113022221070472e-05, + "loss": 11.8879, + "step": 30630 + }, + { + "epoch": 1.6679803693342317, + "grad_norm": 0.502506217205064, + "learning_rate": 1.410850589599383e-05, + "loss": 11.7194, + "step": 30631 + }, + { + "epoch": 1.6680348233308149, + "grad_norm": 0.5694251018926452, + "learning_rate": 1.410399023882385e-05, + "loss": 11.8151, + "step": 30632 + }, + { + "epoch": 1.6680892773273979, + "grad_norm": 0.5125036313871394, + "learning_rate": 1.4099475249595628e-05, + "loss": 11.8312, + "step": 30633 + }, + { + "epoch": 1.6681437313239809, + "grad_norm": 0.5450231604275171, + "learning_rate": 1.4094960928344292e-05, + "loss": 11.7312, + "step": 30634 + }, + { + "epoch": 1.6681981853205639, + "grad_norm": 0.5849536903654996, + "learning_rate": 1.4090447275104968e-05, + "loss": 11.8918, + "step": 30635 + }, + { + "epoch": 1.6682526393171468, + "grad_norm": 0.5251727664436304, + "learning_rate": 1.4085934289912706e-05, + "loss": 11.8149, + "step": 30636 + }, + { + "epoch": 1.66830709331373, + "grad_norm": 0.6149918819302681, + "learning_rate": 1.4081421972802655e-05, + "loss": 11.8117, + "step": 30637 + }, + { + "epoch": 1.668361547310313, + "grad_norm": 0.5665962930161643, + "learning_rate": 1.4076910323809845e-05, + "loss": 11.5848, + "step": 30638 + }, + { + "epoch": 1.668416001306896, + "grad_norm": 0.5111726167768292, + "learning_rate": 1.4072399342969422e-05, + "loss": 11.8496, + "step": 30639 + }, + { + "epoch": 1.668470455303479, + "grad_norm": 0.552531943121508, + "learning_rate": 1.4067889030316406e-05, + "loss": 11.6825, + "step": 30640 + }, + { + "epoch": 1.668524909300062, + "grad_norm": 0.5175033775069464, + "learning_rate": 1.4063379385885911e-05, + "loss": 11.7762, + "step": 30641 + }, + { + "epoch": 1.668579363296645, + "grad_norm": 0.6186484060029801, + "learning_rate": 1.4058870409713e-05, + "loss": 12.0445, + "step": 30642 + }, + { + "epoch": 1.668633817293228, + "grad_norm": 0.5427269369432087, + "learning_rate": 1.4054362101832696e-05, + "loss": 11.8152, + "step": 30643 + }, + { + "epoch": 1.668688271289811, + "grad_norm": 0.5530534526730182, + "learning_rate": 1.4049854462280088e-05, + "loss": 11.8655, + "step": 30644 + }, + { + "epoch": 1.668742725286394, + "grad_norm": 0.5780781935152243, + "learning_rate": 1.4045347491090254e-05, + "loss": 11.7665, + "step": 30645 + }, + { + "epoch": 1.668797179282977, + "grad_norm": 0.5181392305023769, + "learning_rate": 1.4040841188298182e-05, + "loss": 11.7054, + "step": 30646 + }, + { + "epoch": 1.66885163327956, + "grad_norm": 0.636847789134479, + "learning_rate": 1.4036335553938962e-05, + "loss": 11.805, + "step": 30647 + }, + { + "epoch": 1.668906087276143, + "grad_norm": 0.5660885977660897, + "learning_rate": 1.4031830588047601e-05, + "loss": 11.8311, + "step": 30648 + }, + { + "epoch": 1.668960541272726, + "grad_norm": 0.6833788173018199, + "learning_rate": 1.4027326290659159e-05, + "loss": 11.8768, + "step": 30649 + }, + { + "epoch": 1.669014995269309, + "grad_norm": 0.6139854201866528, + "learning_rate": 1.4022822661808621e-05, + "loss": 11.8321, + "step": 30650 + }, + { + "epoch": 1.669069449265892, + "grad_norm": 0.5552076093213949, + "learning_rate": 1.4018319701531035e-05, + "loss": 11.7677, + "step": 30651 + }, + { + "epoch": 1.669123903262475, + "grad_norm": 0.5209814904995312, + "learning_rate": 1.4013817409861463e-05, + "loss": 11.7055, + "step": 30652 + }, + { + "epoch": 1.669178357259058, + "grad_norm": 0.5339816384169245, + "learning_rate": 1.400931578683482e-05, + "loss": 11.7872, + "step": 30653 + }, + { + "epoch": 1.669232811255641, + "grad_norm": 0.5683988407237588, + "learning_rate": 1.400481483248618e-05, + "loss": 11.8208, + "step": 30654 + }, + { + "epoch": 1.6692872652522242, + "grad_norm": 0.5907841922901259, + "learning_rate": 1.4000314546850502e-05, + "loss": 11.8629, + "step": 30655 + }, + { + "epoch": 1.6693417192488071, + "grad_norm": 0.5407176440878831, + "learning_rate": 1.3995814929962791e-05, + "loss": 11.8693, + "step": 30656 + }, + { + "epoch": 1.6693961732453901, + "grad_norm": 0.5387535016711819, + "learning_rate": 1.3991315981858077e-05, + "loss": 11.8178, + "step": 30657 + }, + { + "epoch": 1.6694506272419731, + "grad_norm": 0.5622576450184463, + "learning_rate": 1.3986817702571286e-05, + "loss": 11.8224, + "step": 30658 + }, + { + "epoch": 1.6695050812385561, + "grad_norm": 0.5118073203746123, + "learning_rate": 1.3982320092137447e-05, + "loss": 11.8534, + "step": 30659 + }, + { + "epoch": 1.6695595352351391, + "grad_norm": 0.540542840608444, + "learning_rate": 1.3977823150591496e-05, + "loss": 11.8103, + "step": 30660 + }, + { + "epoch": 1.6696139892317223, + "grad_norm": 0.5482135433940559, + "learning_rate": 1.3973326877968429e-05, + "loss": 11.7786, + "step": 30661 + }, + { + "epoch": 1.6696684432283053, + "grad_norm": 0.553369146661749, + "learning_rate": 1.3968831274303206e-05, + "loss": 11.8318, + "step": 30662 + }, + { + "epoch": 1.6697228972248883, + "grad_norm": 0.5073757333984068, + "learning_rate": 1.3964336339630757e-05, + "loss": 11.7957, + "step": 30663 + }, + { + "epoch": 1.6697773512214713, + "grad_norm": 0.5055837732603168, + "learning_rate": 1.3959842073986085e-05, + "loss": 11.7707, + "step": 30664 + }, + { + "epoch": 1.6698318052180543, + "grad_norm": 0.559876593933698, + "learning_rate": 1.3955348477404072e-05, + "loss": 11.7811, + "step": 30665 + }, + { + "epoch": 1.6698862592146373, + "grad_norm": 0.5333976863578087, + "learning_rate": 1.395085554991974e-05, + "loss": 11.8541, + "step": 30666 + }, + { + "epoch": 1.6699407132112203, + "grad_norm": 0.5454917870028871, + "learning_rate": 1.3946363291567944e-05, + "loss": 11.797, + "step": 30667 + }, + { + "epoch": 1.6699951672078033, + "grad_norm": 0.530962618090314, + "learning_rate": 1.3941871702383669e-05, + "loss": 11.8457, + "step": 30668 + }, + { + "epoch": 1.6700496212043863, + "grad_norm": 0.488430907721536, + "learning_rate": 1.3937380782401855e-05, + "loss": 11.7797, + "step": 30669 + }, + { + "epoch": 1.6701040752009693, + "grad_norm": 0.5307821084548326, + "learning_rate": 1.3932890531657373e-05, + "loss": 11.796, + "step": 30670 + }, + { + "epoch": 1.6701585291975523, + "grad_norm": 0.5233877365304789, + "learning_rate": 1.3928400950185194e-05, + "loss": 11.8846, + "step": 30671 + }, + { + "epoch": 1.6702129831941352, + "grad_norm": 0.5273684616147256, + "learning_rate": 1.3923912038020204e-05, + "loss": 11.7697, + "step": 30672 + }, + { + "epoch": 1.6702674371907182, + "grad_norm": 0.6265352816580725, + "learning_rate": 1.3919423795197284e-05, + "loss": 11.7663, + "step": 30673 + }, + { + "epoch": 1.6703218911873012, + "grad_norm": 0.5377756673137262, + "learning_rate": 1.3914936221751384e-05, + "loss": 11.7565, + "step": 30674 + }, + { + "epoch": 1.6703763451838842, + "grad_norm": 0.5160009649868486, + "learning_rate": 1.3910449317717356e-05, + "loss": 11.7871, + "step": 30675 + }, + { + "epoch": 1.6704307991804672, + "grad_norm": 0.5258348428317202, + "learning_rate": 1.3905963083130135e-05, + "loss": 11.843, + "step": 30676 + }, + { + "epoch": 1.6704852531770502, + "grad_norm": 0.5179978593831852, + "learning_rate": 1.3901477518024552e-05, + "loss": 11.7681, + "step": 30677 + }, + { + "epoch": 1.6705397071736334, + "grad_norm": 0.5239347580002798, + "learning_rate": 1.3896992622435523e-05, + "loss": 11.7877, + "step": 30678 + }, + { + "epoch": 1.6705941611702164, + "grad_norm": 0.5381226775521556, + "learning_rate": 1.3892508396397941e-05, + "loss": 11.7176, + "step": 30679 + }, + { + "epoch": 1.6706486151667994, + "grad_norm": 0.5557597232340132, + "learning_rate": 1.3888024839946635e-05, + "loss": 11.8947, + "step": 30680 + }, + { + "epoch": 1.6707030691633824, + "grad_norm": 0.5284770407387012, + "learning_rate": 1.3883541953116508e-05, + "loss": 11.7098, + "step": 30681 + }, + { + "epoch": 1.6707575231599654, + "grad_norm": 0.5503145099653577, + "learning_rate": 1.3879059735942401e-05, + "loss": 11.8881, + "step": 30682 + }, + { + "epoch": 1.6708119771565484, + "grad_norm": 0.6382435746497301, + "learning_rate": 1.3874578188459153e-05, + "loss": 11.8654, + "step": 30683 + }, + { + "epoch": 1.6708664311531316, + "grad_norm": 0.5822616467219236, + "learning_rate": 1.3870097310701636e-05, + "loss": 11.9282, + "step": 30684 + }, + { + "epoch": 1.6709208851497146, + "grad_norm": 0.5727434133047872, + "learning_rate": 1.3865617102704676e-05, + "loss": 11.7462, + "step": 30685 + }, + { + "epoch": 1.6709753391462976, + "grad_norm": 0.515647104081518, + "learning_rate": 1.3861137564503135e-05, + "loss": 11.6965, + "step": 30686 + }, + { + "epoch": 1.6710297931428806, + "grad_norm": 0.5183354230310832, + "learning_rate": 1.385665869613182e-05, + "loss": 11.7704, + "step": 30687 + }, + { + "epoch": 1.6710842471394636, + "grad_norm": 0.6054209867642117, + "learning_rate": 1.3852180497625588e-05, + "loss": 11.7721, + "step": 30688 + }, + { + "epoch": 1.6711387011360466, + "grad_norm": 0.5805249737938416, + "learning_rate": 1.384770296901924e-05, + "loss": 11.8295, + "step": 30689 + }, + { + "epoch": 1.6711931551326296, + "grad_norm": 0.577409391021253, + "learning_rate": 1.3843226110347584e-05, + "loss": 11.714, + "step": 30690 + }, + { + "epoch": 1.6712476091292126, + "grad_norm": 0.5656187536623469, + "learning_rate": 1.3838749921645477e-05, + "loss": 11.8524, + "step": 30691 + }, + { + "epoch": 1.6713020631257955, + "grad_norm": 0.5594468803485352, + "learning_rate": 1.3834274402947711e-05, + "loss": 11.7565, + "step": 30692 + }, + { + "epoch": 1.6713565171223785, + "grad_norm": 0.5624642442627585, + "learning_rate": 1.3829799554289036e-05, + "loss": 11.851, + "step": 30693 + }, + { + "epoch": 1.6714109711189615, + "grad_norm": 0.6382464063271283, + "learning_rate": 1.3825325375704323e-05, + "loss": 11.7675, + "step": 30694 + }, + { + "epoch": 1.6714654251155445, + "grad_norm": 0.5776129033826342, + "learning_rate": 1.3820851867228313e-05, + "loss": 11.8955, + "step": 30695 + }, + { + "epoch": 1.6715198791121275, + "grad_norm": 0.5297254642489581, + "learning_rate": 1.3816379028895832e-05, + "loss": 11.8662, + "step": 30696 + }, + { + "epoch": 1.6715743331087105, + "grad_norm": 0.5479233691876345, + "learning_rate": 1.3811906860741608e-05, + "loss": 11.8863, + "step": 30697 + }, + { + "epoch": 1.6716287871052935, + "grad_norm": 0.6292991158702002, + "learning_rate": 1.3807435362800481e-05, + "loss": 11.7588, + "step": 30698 + }, + { + "epoch": 1.6716832411018765, + "grad_norm": 0.517464277885571, + "learning_rate": 1.3802964535107177e-05, + "loss": 11.8557, + "step": 30699 + }, + { + "epoch": 1.6717376950984595, + "grad_norm": 0.5444021249152353, + "learning_rate": 1.3798494377696459e-05, + "loss": 11.8088, + "step": 30700 + }, + { + "epoch": 1.6717921490950425, + "grad_norm": 0.5918704828124932, + "learning_rate": 1.379402489060314e-05, + "loss": 11.7818, + "step": 30701 + }, + { + "epoch": 1.6718466030916257, + "grad_norm": 0.5443720411630387, + "learning_rate": 1.3789556073861931e-05, + "loss": 11.9303, + "step": 30702 + }, + { + "epoch": 1.6719010570882087, + "grad_norm": 0.5499557398483298, + "learning_rate": 1.37850879275076e-05, + "loss": 11.7787, + "step": 30703 + }, + { + "epoch": 1.6719555110847917, + "grad_norm": 0.49798541994472717, + "learning_rate": 1.3780620451574855e-05, + "loss": 11.7193, + "step": 30704 + }, + { + "epoch": 1.6720099650813747, + "grad_norm": 0.5402006959711375, + "learning_rate": 1.3776153646098467e-05, + "loss": 11.7147, + "step": 30705 + }, + { + "epoch": 1.6720644190779577, + "grad_norm": 0.5664303938988359, + "learning_rate": 1.3771687511113186e-05, + "loss": 11.8356, + "step": 30706 + }, + { + "epoch": 1.6721188730745409, + "grad_norm": 0.5531096500165288, + "learning_rate": 1.3767222046653705e-05, + "loss": 11.8854, + "step": 30707 + }, + { + "epoch": 1.6721733270711239, + "grad_norm": 0.5545658808717064, + "learning_rate": 1.3762757252754788e-05, + "loss": 11.8637, + "step": 30708 + }, + { + "epoch": 1.6722277810677069, + "grad_norm": 0.569745662900167, + "learning_rate": 1.3758293129451116e-05, + "loss": 11.884, + "step": 30709 + }, + { + "epoch": 1.6722822350642899, + "grad_norm": 0.5800772003933736, + "learning_rate": 1.375382967677743e-05, + "loss": 11.8379, + "step": 30710 + }, + { + "epoch": 1.6723366890608728, + "grad_norm": 0.5226995107064963, + "learning_rate": 1.3749366894768412e-05, + "loss": 11.8163, + "step": 30711 + }, + { + "epoch": 1.6723911430574558, + "grad_norm": 0.5779024817809948, + "learning_rate": 1.37449047834588e-05, + "loss": 11.8702, + "step": 30712 + }, + { + "epoch": 1.6724455970540388, + "grad_norm": 0.5495962175139693, + "learning_rate": 1.374044334288328e-05, + "loss": 11.742, + "step": 30713 + }, + { + "epoch": 1.6725000510506218, + "grad_norm": 0.5564836727694991, + "learning_rate": 1.3735982573076511e-05, + "loss": 11.757, + "step": 30714 + }, + { + "epoch": 1.6725545050472048, + "grad_norm": 0.5740085108021452, + "learning_rate": 1.3731522474073233e-05, + "loss": 11.7604, + "step": 30715 + }, + { + "epoch": 1.6726089590437878, + "grad_norm": 0.5160515161411331, + "learning_rate": 1.3727063045908084e-05, + "loss": 11.8244, + "step": 30716 + }, + { + "epoch": 1.6726634130403708, + "grad_norm": 0.543489470428873, + "learning_rate": 1.3722604288615759e-05, + "loss": 11.7809, + "step": 30717 + }, + { + "epoch": 1.6727178670369538, + "grad_norm": 0.5175571927238948, + "learning_rate": 1.3718146202230953e-05, + "loss": 11.8779, + "step": 30718 + }, + { + "epoch": 1.6727723210335368, + "grad_norm": 0.5198347335821604, + "learning_rate": 1.371368878678828e-05, + "loss": 11.7775, + "step": 30719 + }, + { + "epoch": 1.6728267750301198, + "grad_norm": 0.5414702976639753, + "learning_rate": 1.3709232042322472e-05, + "loss": 11.8345, + "step": 30720 + }, + { + "epoch": 1.6728812290267028, + "grad_norm": 0.6141249785605588, + "learning_rate": 1.3704775968868111e-05, + "loss": 11.9421, + "step": 30721 + }, + { + "epoch": 1.6729356830232858, + "grad_norm": 0.7390792488911934, + "learning_rate": 1.3700320566459912e-05, + "loss": 11.7716, + "step": 30722 + }, + { + "epoch": 1.6729901370198688, + "grad_norm": 0.5173868267241312, + "learning_rate": 1.3695865835132493e-05, + "loss": 11.7361, + "step": 30723 + }, + { + "epoch": 1.6730445910164518, + "grad_norm": 0.5474029527500227, + "learning_rate": 1.3691411774920471e-05, + "loss": 11.8611, + "step": 30724 + }, + { + "epoch": 1.673099045013035, + "grad_norm": 0.5514893474856272, + "learning_rate": 1.3686958385858529e-05, + "loss": 11.8024, + "step": 30725 + }, + { + "epoch": 1.673153499009618, + "grad_norm": 0.577040005904867, + "learning_rate": 1.368250566798125e-05, + "loss": 11.7137, + "step": 30726 + }, + { + "epoch": 1.673207953006201, + "grad_norm": 0.5475240789568734, + "learning_rate": 1.3678053621323283e-05, + "loss": 11.7408, + "step": 30727 + }, + { + "epoch": 1.673262407002784, + "grad_norm": 0.530659666503884, + "learning_rate": 1.367360224591927e-05, + "loss": 11.8441, + "step": 30728 + }, + { + "epoch": 1.673316860999367, + "grad_norm": 0.5611999792445183, + "learning_rate": 1.3669151541803771e-05, + "loss": 11.8717, + "step": 30729 + }, + { + "epoch": 1.67337131499595, + "grad_norm": 0.5306339523517843, + "learning_rate": 1.3664701509011457e-05, + "loss": 11.806, + "step": 30730 + }, + { + "epoch": 1.6734257689925331, + "grad_norm": 0.5219800102886839, + "learning_rate": 1.3660252147576879e-05, + "loss": 11.6249, + "step": 30731 + }, + { + "epoch": 1.6734802229891161, + "grad_norm": 0.574986750708085, + "learning_rate": 1.3655803457534688e-05, + "loss": 11.7464, + "step": 30732 + }, + { + "epoch": 1.6735346769856991, + "grad_norm": 0.6243519582976768, + "learning_rate": 1.3651355438919444e-05, + "loss": 11.9312, + "step": 30733 + }, + { + "epoch": 1.6735891309822821, + "grad_norm": 0.5939908201828773, + "learning_rate": 1.364690809176572e-05, + "loss": 11.8336, + "step": 30734 + }, + { + "epoch": 1.6736435849788651, + "grad_norm": 0.5744204053403791, + "learning_rate": 1.3642461416108142e-05, + "loss": 11.6633, + "step": 30735 + }, + { + "epoch": 1.673698038975448, + "grad_norm": 0.5276219268542305, + "learning_rate": 1.3638015411981242e-05, + "loss": 11.7969, + "step": 30736 + }, + { + "epoch": 1.673752492972031, + "grad_norm": 0.4895900857711987, + "learning_rate": 1.3633570079419644e-05, + "loss": 11.5451, + "step": 30737 + }, + { + "epoch": 1.673806946968614, + "grad_norm": 0.6192604787760138, + "learning_rate": 1.3629125418457867e-05, + "loss": 11.8738, + "step": 30738 + }, + { + "epoch": 1.673861400965197, + "grad_norm": 0.5183316054407755, + "learning_rate": 1.3624681429130493e-05, + "loss": 11.8157, + "step": 30739 + }, + { + "epoch": 1.67391585496178, + "grad_norm": 0.5352572930781914, + "learning_rate": 1.3620238111472095e-05, + "loss": 11.7011, + "step": 30740 + }, + { + "epoch": 1.673970308958363, + "grad_norm": 0.6608333963970415, + "learning_rate": 1.3615795465517201e-05, + "loss": 11.9022, + "step": 30741 + }, + { + "epoch": 1.674024762954946, + "grad_norm": 0.5645761140980992, + "learning_rate": 1.3611353491300383e-05, + "loss": 11.8001, + "step": 30742 + }, + { + "epoch": 1.674079216951529, + "grad_norm": 0.5175858894914743, + "learning_rate": 1.360691218885617e-05, + "loss": 11.7656, + "step": 30743 + }, + { + "epoch": 1.674133670948112, + "grad_norm": 0.5380103375369542, + "learning_rate": 1.3602471558219076e-05, + "loss": 11.7123, + "step": 30744 + }, + { + "epoch": 1.674188124944695, + "grad_norm": 0.5079453659542924, + "learning_rate": 1.3598031599423666e-05, + "loss": 11.7998, + "step": 30745 + }, + { + "epoch": 1.674242578941278, + "grad_norm": 0.5437587357002618, + "learning_rate": 1.3593592312504444e-05, + "loss": 11.8777, + "step": 30746 + }, + { + "epoch": 1.674297032937861, + "grad_norm": 0.5996576782136743, + "learning_rate": 1.3589153697495948e-05, + "loss": 11.8434, + "step": 30747 + }, + { + "epoch": 1.6743514869344442, + "grad_norm": 0.5709378066021393, + "learning_rate": 1.3584715754432664e-05, + "loss": 11.79, + "step": 30748 + }, + { + "epoch": 1.6744059409310272, + "grad_norm": 0.5601905928731062, + "learning_rate": 1.3580278483349129e-05, + "loss": 11.7178, + "step": 30749 + }, + { + "epoch": 1.6744603949276102, + "grad_norm": 0.5323236474538788, + "learning_rate": 1.3575841884279861e-05, + "loss": 11.7674, + "step": 30750 + }, + { + "epoch": 1.6745148489241932, + "grad_norm": 0.507929985239759, + "learning_rate": 1.3571405957259309e-05, + "loss": 11.7714, + "step": 30751 + }, + { + "epoch": 1.6745693029207762, + "grad_norm": 0.48647216203174226, + "learning_rate": 1.3566970702322058e-05, + "loss": 11.8004, + "step": 30752 + }, + { + "epoch": 1.6746237569173592, + "grad_norm": 0.5567592249661721, + "learning_rate": 1.3562536119502477e-05, + "loss": 11.5576, + "step": 30753 + }, + { + "epoch": 1.6746782109139424, + "grad_norm": 0.611344391102703, + "learning_rate": 1.3558102208835121e-05, + "loss": 11.7727, + "step": 30754 + }, + { + "epoch": 1.6747326649105254, + "grad_norm": 0.5741730647314339, + "learning_rate": 1.3553668970354483e-05, + "loss": 11.8161, + "step": 30755 + }, + { + "epoch": 1.6747871189071084, + "grad_norm": 0.5089711773059751, + "learning_rate": 1.3549236404094978e-05, + "loss": 11.6753, + "step": 30756 + }, + { + "epoch": 1.6748415729036914, + "grad_norm": 0.5474695100402294, + "learning_rate": 1.3544804510091136e-05, + "loss": 11.7483, + "step": 30757 + }, + { + "epoch": 1.6748960269002744, + "grad_norm": 0.5201051846364081, + "learning_rate": 1.3540373288377372e-05, + "loss": 11.6236, + "step": 30758 + }, + { + "epoch": 1.6749504808968574, + "grad_norm": 0.6008503605801372, + "learning_rate": 1.3535942738988194e-05, + "loss": 11.911, + "step": 30759 + }, + { + "epoch": 1.6750049348934404, + "grad_norm": 0.5490633818844283, + "learning_rate": 1.3531512861957995e-05, + "loss": 11.8329, + "step": 30760 + }, + { + "epoch": 1.6750593888900234, + "grad_norm": 0.5956673616462292, + "learning_rate": 1.3527083657321248e-05, + "loss": 11.6763, + "step": 30761 + }, + { + "epoch": 1.6751138428866064, + "grad_norm": 0.5871668559190035, + "learning_rate": 1.3522655125112449e-05, + "loss": 11.8177, + "step": 30762 + }, + { + "epoch": 1.6751682968831894, + "grad_norm": 0.5252319294898933, + "learning_rate": 1.3518227265365945e-05, + "loss": 11.8208, + "step": 30763 + }, + { + "epoch": 1.6752227508797723, + "grad_norm": 0.6494343850464059, + "learning_rate": 1.3513800078116234e-05, + "loss": 11.8865, + "step": 30764 + }, + { + "epoch": 1.6752772048763553, + "grad_norm": 0.5577113683637119, + "learning_rate": 1.3509373563397688e-05, + "loss": 11.8043, + "step": 30765 + }, + { + "epoch": 1.6753316588729383, + "grad_norm": 0.5479970067312949, + "learning_rate": 1.3504947721244753e-05, + "loss": 11.8447, + "step": 30766 + }, + { + "epoch": 1.6753861128695213, + "grad_norm": 0.5144120454661337, + "learning_rate": 1.3500522551691885e-05, + "loss": 11.7687, + "step": 30767 + }, + { + "epoch": 1.6754405668661043, + "grad_norm": 0.5836956491599675, + "learning_rate": 1.3496098054773432e-05, + "loss": 11.8723, + "step": 30768 + }, + { + "epoch": 1.6754950208626873, + "grad_norm": 0.5516998216749677, + "learning_rate": 1.3491674230523842e-05, + "loss": 11.7775, + "step": 30769 + }, + { + "epoch": 1.6755494748592703, + "grad_norm": 0.6221994938327313, + "learning_rate": 1.3487251078977492e-05, + "loss": 11.9036, + "step": 30770 + }, + { + "epoch": 1.6756039288558535, + "grad_norm": 0.4966220869176989, + "learning_rate": 1.3482828600168795e-05, + "loss": 11.7278, + "step": 30771 + }, + { + "epoch": 1.6756583828524365, + "grad_norm": 0.5655745151714466, + "learning_rate": 1.3478406794132137e-05, + "loss": 11.791, + "step": 30772 + }, + { + "epoch": 1.6757128368490195, + "grad_norm": 0.5737705742226658, + "learning_rate": 1.3473985660901878e-05, + "loss": 11.8163, + "step": 30773 + }, + { + "epoch": 1.6757672908456025, + "grad_norm": 0.5724829551180888, + "learning_rate": 1.3469565200512434e-05, + "loss": 11.8458, + "step": 30774 + }, + { + "epoch": 1.6758217448421855, + "grad_norm": 0.6026529526748534, + "learning_rate": 1.3465145412998148e-05, + "loss": 11.7957, + "step": 30775 + }, + { + "epoch": 1.6758761988387685, + "grad_norm": 0.5668312546691957, + "learning_rate": 1.34607262983934e-05, + "loss": 11.7239, + "step": 30776 + }, + { + "epoch": 1.6759306528353517, + "grad_norm": 0.5320837458259533, + "learning_rate": 1.3456307856732575e-05, + "loss": 11.7107, + "step": 30777 + }, + { + "epoch": 1.6759851068319347, + "grad_norm": 0.5371011898510129, + "learning_rate": 1.345189008804999e-05, + "loss": 11.7957, + "step": 30778 + }, + { + "epoch": 1.6760395608285177, + "grad_norm": 0.60628546475273, + "learning_rate": 1.344747299238005e-05, + "loss": 11.8693, + "step": 30779 + }, + { + "epoch": 1.6760940148251007, + "grad_norm": 0.5945148256243175, + "learning_rate": 1.344305656975705e-05, + "loss": 11.8287, + "step": 30780 + }, + { + "epoch": 1.6761484688216837, + "grad_norm": 0.5422130819635125, + "learning_rate": 1.3438640820215376e-05, + "loss": 11.7867, + "step": 30781 + }, + { + "epoch": 1.6762029228182667, + "grad_norm": 0.502865469110512, + "learning_rate": 1.3434225743789352e-05, + "loss": 11.6306, + "step": 30782 + }, + { + "epoch": 1.6762573768148497, + "grad_norm": 0.55338832640472, + "learning_rate": 1.3429811340513287e-05, + "loss": 11.7155, + "step": 30783 + }, + { + "epoch": 1.6763118308114326, + "grad_norm": 0.5342679761102067, + "learning_rate": 1.3425397610421541e-05, + "loss": 11.7603, + "step": 30784 + }, + { + "epoch": 1.6763662848080156, + "grad_norm": 0.5849556399975631, + "learning_rate": 1.34209845535484e-05, + "loss": 11.7956, + "step": 30785 + }, + { + "epoch": 1.6764207388045986, + "grad_norm": 0.5567148521287311, + "learning_rate": 1.3416572169928233e-05, + "loss": 11.56, + "step": 30786 + }, + { + "epoch": 1.6764751928011816, + "grad_norm": 0.5522869294628729, + "learning_rate": 1.341216045959529e-05, + "loss": 11.6991, + "step": 30787 + }, + { + "epoch": 1.6765296467977646, + "grad_norm": 0.5534686532411146, + "learning_rate": 1.3407749422583915e-05, + "loss": 11.8312, + "step": 30788 + }, + { + "epoch": 1.6765841007943476, + "grad_norm": 0.59627571237323, + "learning_rate": 1.3403339058928422e-05, + "loss": 11.9478, + "step": 30789 + }, + { + "epoch": 1.6766385547909306, + "grad_norm": 0.4876074041422157, + "learning_rate": 1.339892936866306e-05, + "loss": 11.7527, + "step": 30790 + }, + { + "epoch": 1.6766930087875136, + "grad_norm": 0.5432630204509983, + "learning_rate": 1.339452035182217e-05, + "loss": 11.819, + "step": 30791 + }, + { + "epoch": 1.6767474627840966, + "grad_norm": 0.5341624086670461, + "learning_rate": 1.3390112008439992e-05, + "loss": 11.6182, + "step": 30792 + }, + { + "epoch": 1.6768019167806796, + "grad_norm": 0.5474969809089916, + "learning_rate": 1.3385704338550853e-05, + "loss": 11.7691, + "step": 30793 + }, + { + "epoch": 1.6768563707772626, + "grad_norm": 0.5223113431713742, + "learning_rate": 1.3381297342189004e-05, + "loss": 11.7281, + "step": 30794 + }, + { + "epoch": 1.6769108247738458, + "grad_norm": 0.5546933604599444, + "learning_rate": 1.3376891019388682e-05, + "loss": 11.8756, + "step": 30795 + }, + { + "epoch": 1.6769652787704288, + "grad_norm": 0.5150681125434099, + "learning_rate": 1.3372485370184218e-05, + "loss": 11.8583, + "step": 30796 + }, + { + "epoch": 1.6770197327670118, + "grad_norm": 0.5345053336037571, + "learning_rate": 1.3368080394609794e-05, + "loss": 11.747, + "step": 30797 + }, + { + "epoch": 1.6770741867635948, + "grad_norm": 0.5870743626595543, + "learning_rate": 1.3363676092699718e-05, + "loss": 11.9117, + "step": 30798 + }, + { + "epoch": 1.6771286407601778, + "grad_norm": 0.5435534278676204, + "learning_rate": 1.3359272464488248e-05, + "loss": 11.8437, + "step": 30799 + }, + { + "epoch": 1.6771830947567608, + "grad_norm": 0.4971165194270236, + "learning_rate": 1.3354869510009583e-05, + "loss": 11.7076, + "step": 30800 + }, + { + "epoch": 1.677237548753344, + "grad_norm": 0.5501541279707652, + "learning_rate": 1.3350467229298002e-05, + "loss": 11.7002, + "step": 30801 + }, + { + "epoch": 1.677292002749927, + "grad_norm": 0.5257373542374613, + "learning_rate": 1.3346065622387694e-05, + "loss": 11.7226, + "step": 30802 + }, + { + "epoch": 1.67734645674651, + "grad_norm": 0.4854746856865274, + "learning_rate": 1.3341664689312939e-05, + "loss": 11.7435, + "step": 30803 + }, + { + "epoch": 1.677400910743093, + "grad_norm": 0.5627802516140279, + "learning_rate": 1.3337264430107921e-05, + "loss": 11.8204, + "step": 30804 + }, + { + "epoch": 1.677455364739676, + "grad_norm": 0.5372233772155028, + "learning_rate": 1.3332864844806859e-05, + "loss": 11.8013, + "step": 30805 + }, + { + "epoch": 1.677509818736259, + "grad_norm": 0.5283210218300569, + "learning_rate": 1.332846593344399e-05, + "loss": 11.8013, + "step": 30806 + }, + { + "epoch": 1.677564272732842, + "grad_norm": 0.4962134052480816, + "learning_rate": 1.3324067696053488e-05, + "loss": 11.6823, + "step": 30807 + }, + { + "epoch": 1.677618726729425, + "grad_norm": 0.56457898796017, + "learning_rate": 1.3319670132669593e-05, + "loss": 11.7457, + "step": 30808 + }, + { + "epoch": 1.677673180726008, + "grad_norm": 0.5815024347259764, + "learning_rate": 1.3315273243326454e-05, + "loss": 11.825, + "step": 30809 + }, + { + "epoch": 1.677727634722591, + "grad_norm": 0.5140054589877632, + "learning_rate": 1.3310877028058277e-05, + "loss": 11.7688, + "step": 30810 + }, + { + "epoch": 1.677782088719174, + "grad_norm": 0.5833100953870268, + "learning_rate": 1.3306481486899292e-05, + "loss": 11.7954, + "step": 30811 + }, + { + "epoch": 1.6778365427157569, + "grad_norm": 0.6723215588131805, + "learning_rate": 1.3302086619883625e-05, + "loss": 12.0623, + "step": 30812 + }, + { + "epoch": 1.6778909967123399, + "grad_norm": 0.530142616716068, + "learning_rate": 1.3297692427045516e-05, + "loss": 11.8834, + "step": 30813 + }, + { + "epoch": 1.6779454507089229, + "grad_norm": 0.5224907506133561, + "learning_rate": 1.329329890841904e-05, + "loss": 11.7784, + "step": 30814 + }, + { + "epoch": 1.6779999047055059, + "grad_norm": 0.5218275472829298, + "learning_rate": 1.328890606403842e-05, + "loss": 11.8317, + "step": 30815 + }, + { + "epoch": 1.6780543587020889, + "grad_norm": 0.5489846031224762, + "learning_rate": 1.3284513893937822e-05, + "loss": 11.826, + "step": 30816 + }, + { + "epoch": 1.6781088126986718, + "grad_norm": 0.5204790083568857, + "learning_rate": 1.3280122398151363e-05, + "loss": 11.7844, + "step": 30817 + }, + { + "epoch": 1.678163266695255, + "grad_norm": 0.5194773603284347, + "learning_rate": 1.3275731576713247e-05, + "loss": 11.7438, + "step": 30818 + }, + { + "epoch": 1.678217720691838, + "grad_norm": 0.5871451905002218, + "learning_rate": 1.3271341429657557e-05, + "loss": 11.5825, + "step": 30819 + }, + { + "epoch": 1.678272174688421, + "grad_norm": 0.5159076570751091, + "learning_rate": 1.3266951957018481e-05, + "loss": 11.7884, + "step": 30820 + }, + { + "epoch": 1.678326628685004, + "grad_norm": 0.6150612967492541, + "learning_rate": 1.326256315883011e-05, + "loss": 11.9415, + "step": 30821 + }, + { + "epoch": 1.678381082681587, + "grad_norm": 0.4931136989370211, + "learning_rate": 1.3258175035126596e-05, + "loss": 11.8024, + "step": 30822 + }, + { + "epoch": 1.67843553667817, + "grad_norm": 0.5725825767857251, + "learning_rate": 1.3253787585942112e-05, + "loss": 11.7904, + "step": 30823 + }, + { + "epoch": 1.6784899906747532, + "grad_norm": 0.6390060461074529, + "learning_rate": 1.3249400811310663e-05, + "loss": 12.0097, + "step": 30824 + }, + { + "epoch": 1.6785444446713362, + "grad_norm": 0.5314803990215526, + "learning_rate": 1.3245014711266435e-05, + "loss": 11.7944, + "step": 30825 + }, + { + "epoch": 1.6785988986679192, + "grad_norm": 0.5038522679184477, + "learning_rate": 1.3240629285843542e-05, + "loss": 11.7378, + "step": 30826 + }, + { + "epoch": 1.6786533526645022, + "grad_norm": 0.5454704986853873, + "learning_rate": 1.3236244535076036e-05, + "loss": 11.9079, + "step": 30827 + }, + { + "epoch": 1.6787078066610852, + "grad_norm": 0.557951993907346, + "learning_rate": 1.3231860458998069e-05, + "loss": 11.8218, + "step": 30828 + }, + { + "epoch": 1.6787622606576682, + "grad_norm": 0.555182406506991, + "learning_rate": 1.3227477057643677e-05, + "loss": 11.6933, + "step": 30829 + }, + { + "epoch": 1.6788167146542512, + "grad_norm": 0.5695684130966864, + "learning_rate": 1.3223094331047004e-05, + "loss": 11.656, + "step": 30830 + }, + { + "epoch": 1.6788711686508342, + "grad_norm": 0.5020493245916736, + "learning_rate": 1.3218712279242084e-05, + "loss": 11.8005, + "step": 30831 + }, + { + "epoch": 1.6789256226474172, + "grad_norm": 0.5481198553243628, + "learning_rate": 1.3214330902263006e-05, + "loss": 11.8994, + "step": 30832 + }, + { + "epoch": 1.6789800766440002, + "grad_norm": 0.5018290173423888, + "learning_rate": 1.3209950200143884e-05, + "loss": 11.7898, + "step": 30833 + }, + { + "epoch": 1.6790345306405832, + "grad_norm": 0.502488793789876, + "learning_rate": 1.3205570172918702e-05, + "loss": 11.6313, + "step": 30834 + }, + { + "epoch": 1.6790889846371662, + "grad_norm": 0.5579112840523157, + "learning_rate": 1.320119082062159e-05, + "loss": 11.8107, + "step": 30835 + }, + { + "epoch": 1.6791434386337492, + "grad_norm": 0.6770367140611824, + "learning_rate": 1.3196812143286551e-05, + "loss": 11.8248, + "step": 30836 + }, + { + "epoch": 1.6791978926303321, + "grad_norm": 0.5036100044091174, + "learning_rate": 1.3192434140947652e-05, + "loss": 11.7979, + "step": 30837 + }, + { + "epoch": 1.6792523466269151, + "grad_norm": 0.5353437836758715, + "learning_rate": 1.3188056813638971e-05, + "loss": 11.7918, + "step": 30838 + }, + { + "epoch": 1.6793068006234981, + "grad_norm": 0.5288200963388185, + "learning_rate": 1.3183680161394495e-05, + "loss": 11.6297, + "step": 30839 + }, + { + "epoch": 1.6793612546200811, + "grad_norm": 0.5658695122389732, + "learning_rate": 1.3179304184248298e-05, + "loss": 11.8004, + "step": 30840 + }, + { + "epoch": 1.6794157086166643, + "grad_norm": 0.544517652730449, + "learning_rate": 1.3174928882234373e-05, + "loss": 11.8579, + "step": 30841 + }, + { + "epoch": 1.6794701626132473, + "grad_norm": 0.4962557362885047, + "learning_rate": 1.3170554255386791e-05, + "loss": 11.8731, + "step": 30842 + }, + { + "epoch": 1.6795246166098303, + "grad_norm": 0.5188071007258321, + "learning_rate": 1.3166180303739528e-05, + "loss": 11.6215, + "step": 30843 + }, + { + "epoch": 1.6795790706064133, + "grad_norm": 0.5188097641128784, + "learning_rate": 1.316180702732659e-05, + "loss": 11.8047, + "step": 30844 + }, + { + "epoch": 1.6796335246029963, + "grad_norm": 0.5264623739112867, + "learning_rate": 1.3157434426182025e-05, + "loss": 11.8645, + "step": 30845 + }, + { + "epoch": 1.6796879785995793, + "grad_norm": 0.5310097958857362, + "learning_rate": 1.3153062500339796e-05, + "loss": 11.7249, + "step": 30846 + }, + { + "epoch": 1.6797424325961625, + "grad_norm": 0.5991591247823416, + "learning_rate": 1.3148691249833922e-05, + "loss": 11.8467, + "step": 30847 + }, + { + "epoch": 1.6797968865927455, + "grad_norm": 0.5257352301499815, + "learning_rate": 1.3144320674698396e-05, + "loss": 11.8047, + "step": 30848 + }, + { + "epoch": 1.6798513405893285, + "grad_norm": 0.5647495279696361, + "learning_rate": 1.3139950774967192e-05, + "loss": 11.8984, + "step": 30849 + }, + { + "epoch": 1.6799057945859115, + "grad_norm": 0.5275975143919737, + "learning_rate": 1.3135581550674315e-05, + "loss": 11.709, + "step": 30850 + }, + { + "epoch": 1.6799602485824945, + "grad_norm": 0.5936834863768895, + "learning_rate": 1.3131213001853692e-05, + "loss": 11.8145, + "step": 30851 + }, + { + "epoch": 1.6800147025790775, + "grad_norm": 0.5436783695171021, + "learning_rate": 1.3126845128539356e-05, + "loss": 11.6715, + "step": 30852 + }, + { + "epoch": 1.6800691565756605, + "grad_norm": 0.5401272411344255, + "learning_rate": 1.3122477930765243e-05, + "loss": 11.8293, + "step": 30853 + }, + { + "epoch": 1.6801236105722435, + "grad_norm": 0.5071004671666826, + "learning_rate": 1.311811140856528e-05, + "loss": 11.7706, + "step": 30854 + }, + { + "epoch": 1.6801780645688265, + "grad_norm": 0.5213372358570623, + "learning_rate": 1.3113745561973478e-05, + "loss": 11.8414, + "step": 30855 + }, + { + "epoch": 1.6802325185654094, + "grad_norm": 0.5677481754254107, + "learning_rate": 1.310938039102374e-05, + "loss": 11.7646, + "step": 30856 + }, + { + "epoch": 1.6802869725619924, + "grad_norm": 0.5898449492356662, + "learning_rate": 1.310501589575005e-05, + "loss": 11.7354, + "step": 30857 + }, + { + "epoch": 1.6803414265585754, + "grad_norm": 0.5417890917817078, + "learning_rate": 1.3100652076186314e-05, + "loss": 11.7539, + "step": 30858 + }, + { + "epoch": 1.6803958805551584, + "grad_norm": 0.5769759359254978, + "learning_rate": 1.3096288932366474e-05, + "loss": 11.7423, + "step": 30859 + }, + { + "epoch": 1.6804503345517414, + "grad_norm": 0.6294268045759392, + "learning_rate": 1.3091926464324488e-05, + "loss": 11.8246, + "step": 30860 + }, + { + "epoch": 1.6805047885483244, + "grad_norm": 0.5385612538260942, + "learning_rate": 1.3087564672094243e-05, + "loss": 11.7478, + "step": 30861 + }, + { + "epoch": 1.6805592425449074, + "grad_norm": 0.5374740654419663, + "learning_rate": 1.3083203555709677e-05, + "loss": 11.7969, + "step": 30862 + }, + { + "epoch": 1.6806136965414904, + "grad_norm": 0.527415784310097, + "learning_rate": 1.3078843115204709e-05, + "loss": 11.71, + "step": 30863 + }, + { + "epoch": 1.6806681505380734, + "grad_norm": 0.5145511706149598, + "learning_rate": 1.3074483350613209e-05, + "loss": 11.73, + "step": 30864 + }, + { + "epoch": 1.6807226045346566, + "grad_norm": 0.5551752795785434, + "learning_rate": 1.307012426196912e-05, + "loss": 11.866, + "step": 30865 + }, + { + "epoch": 1.6807770585312396, + "grad_norm": 0.5474346226968603, + "learning_rate": 1.3065765849306311e-05, + "loss": 11.8227, + "step": 30866 + }, + { + "epoch": 1.6808315125278226, + "grad_norm": 0.6086897915503154, + "learning_rate": 1.3061408112658703e-05, + "loss": 11.8433, + "step": 30867 + }, + { + "epoch": 1.6808859665244056, + "grad_norm": 0.5770849409133813, + "learning_rate": 1.3057051052060143e-05, + "loss": 11.8486, + "step": 30868 + }, + { + "epoch": 1.6809404205209886, + "grad_norm": 0.5439636611188174, + "learning_rate": 1.3052694667544551e-05, + "loss": 11.8306, + "step": 30869 + }, + { + "epoch": 1.6809948745175716, + "grad_norm": 0.5498289907417206, + "learning_rate": 1.3048338959145766e-05, + "loss": 11.723, + "step": 30870 + }, + { + "epoch": 1.6810493285141548, + "grad_norm": 0.5625196472309748, + "learning_rate": 1.3043983926897684e-05, + "loss": 11.8165, + "step": 30871 + }, + { + "epoch": 1.6811037825107378, + "grad_norm": 0.5064244434293584, + "learning_rate": 1.303962957083419e-05, + "loss": 11.7715, + "step": 30872 + }, + { + "epoch": 1.6811582365073208, + "grad_norm": 0.5110432577084042, + "learning_rate": 1.3035275890989107e-05, + "loss": 11.6278, + "step": 30873 + }, + { + "epoch": 1.6812126905039038, + "grad_norm": 0.5213051899119718, + "learning_rate": 1.3030922887396291e-05, + "loss": 11.785, + "step": 30874 + }, + { + "epoch": 1.6812671445004868, + "grad_norm": 0.5560173869588457, + "learning_rate": 1.3026570560089624e-05, + "loss": 11.6741, + "step": 30875 + }, + { + "epoch": 1.6813215984970697, + "grad_norm": 0.5389905003713072, + "learning_rate": 1.3022218909102901e-05, + "loss": 11.8545, + "step": 30876 + }, + { + "epoch": 1.6813760524936527, + "grad_norm": 0.5600843065563837, + "learning_rate": 1.3017867934470018e-05, + "loss": 11.7851, + "step": 30877 + }, + { + "epoch": 1.6814305064902357, + "grad_norm": 0.5502635265922394, + "learning_rate": 1.301351763622476e-05, + "loss": 11.8169, + "step": 30878 + }, + { + "epoch": 1.6814849604868187, + "grad_norm": 0.5284691962359876, + "learning_rate": 1.3009168014400997e-05, + "loss": 11.7361, + "step": 30879 + }, + { + "epoch": 1.6815394144834017, + "grad_norm": 0.4789944056438769, + "learning_rate": 1.3004819069032514e-05, + "loss": 11.721, + "step": 30880 + }, + { + "epoch": 1.6815938684799847, + "grad_norm": 0.5517250120970683, + "learning_rate": 1.3000470800153141e-05, + "loss": 11.7812, + "step": 30881 + }, + { + "epoch": 1.6816483224765677, + "grad_norm": 0.5518930652654063, + "learning_rate": 1.2996123207796718e-05, + "loss": 11.8482, + "step": 30882 + }, + { + "epoch": 1.6817027764731507, + "grad_norm": 0.5485477433553917, + "learning_rate": 1.2991776291997037e-05, + "loss": 11.802, + "step": 30883 + }, + { + "epoch": 1.6817572304697337, + "grad_norm": 0.5263391224168971, + "learning_rate": 1.2987430052787885e-05, + "loss": 11.6938, + "step": 30884 + }, + { + "epoch": 1.6818116844663167, + "grad_norm": 0.5706755975080879, + "learning_rate": 1.2983084490203056e-05, + "loss": 11.8174, + "step": 30885 + }, + { + "epoch": 1.6818661384628997, + "grad_norm": 0.5285214526502554, + "learning_rate": 1.2978739604276357e-05, + "loss": 11.7778, + "step": 30886 + }, + { + "epoch": 1.6819205924594827, + "grad_norm": 0.5909773457621915, + "learning_rate": 1.2974395395041595e-05, + "loss": 11.8949, + "step": 30887 + }, + { + "epoch": 1.6819750464560659, + "grad_norm": 0.5747792388283205, + "learning_rate": 1.2970051862532496e-05, + "loss": 11.8241, + "step": 30888 + }, + { + "epoch": 1.6820295004526489, + "grad_norm": 0.5925829241717875, + "learning_rate": 1.29657090067829e-05, + "loss": 11.8375, + "step": 30889 + }, + { + "epoch": 1.6820839544492319, + "grad_norm": 0.5220072443568061, + "learning_rate": 1.2961366827826515e-05, + "loss": 11.8994, + "step": 30890 + }, + { + "epoch": 1.6821384084458149, + "grad_norm": 0.5577779274779051, + "learning_rate": 1.295702532569717e-05, + "loss": 11.8704, + "step": 30891 + }, + { + "epoch": 1.6821928624423979, + "grad_norm": 0.5051562087141184, + "learning_rate": 1.2952684500428558e-05, + "loss": 11.7841, + "step": 30892 + }, + { + "epoch": 1.6822473164389808, + "grad_norm": 0.5518716408252478, + "learning_rate": 1.294834435205451e-05, + "loss": 11.7912, + "step": 30893 + }, + { + "epoch": 1.682301770435564, + "grad_norm": 0.5066884801095428, + "learning_rate": 1.2944004880608718e-05, + "loss": 11.7375, + "step": 30894 + }, + { + "epoch": 1.682356224432147, + "grad_norm": 0.5492426517534502, + "learning_rate": 1.2939666086124936e-05, + "loss": 11.8867, + "step": 30895 + }, + { + "epoch": 1.68241067842873, + "grad_norm": 0.5693621790149476, + "learning_rate": 1.29353279686369e-05, + "loss": 11.8318, + "step": 30896 + }, + { + "epoch": 1.682465132425313, + "grad_norm": 0.5165803167555809, + "learning_rate": 1.2930990528178377e-05, + "loss": 11.8264, + "step": 30897 + }, + { + "epoch": 1.682519586421896, + "grad_norm": 0.5596161858222217, + "learning_rate": 1.2926653764783047e-05, + "loss": 11.8619, + "step": 30898 + }, + { + "epoch": 1.682574040418479, + "grad_norm": 0.547827742362886, + "learning_rate": 1.2922317678484697e-05, + "loss": 11.7555, + "step": 30899 + }, + { + "epoch": 1.682628494415062, + "grad_norm": 0.535879415983803, + "learning_rate": 1.2917982269316975e-05, + "loss": 11.7463, + "step": 30900 + }, + { + "epoch": 1.682682948411645, + "grad_norm": 0.535413866828367, + "learning_rate": 1.2913647537313644e-05, + "loss": 11.8689, + "step": 30901 + }, + { + "epoch": 1.682737402408228, + "grad_norm": 0.5805696529246693, + "learning_rate": 1.2909313482508379e-05, + "loss": 11.9161, + "step": 30902 + }, + { + "epoch": 1.682791856404811, + "grad_norm": 0.5260264998409206, + "learning_rate": 1.2904980104934917e-05, + "loss": 11.8405, + "step": 30903 + }, + { + "epoch": 1.682846310401394, + "grad_norm": 0.5854381143859504, + "learning_rate": 1.2900647404626943e-05, + "loss": 11.792, + "step": 30904 + }, + { + "epoch": 1.682900764397977, + "grad_norm": 0.5253860655671488, + "learning_rate": 1.2896315381618107e-05, + "loss": 11.7813, + "step": 30905 + }, + { + "epoch": 1.68295521839456, + "grad_norm": 0.554908637873242, + "learning_rate": 1.2891984035942162e-05, + "loss": 11.7716, + "step": 30906 + }, + { + "epoch": 1.683009672391143, + "grad_norm": 0.5782336746781361, + "learning_rate": 1.2887653367632735e-05, + "loss": 11.6716, + "step": 30907 + }, + { + "epoch": 1.683064126387726, + "grad_norm": 0.5665593192445647, + "learning_rate": 1.2883323376723521e-05, + "loss": 11.7545, + "step": 30908 + }, + { + "epoch": 1.683118580384309, + "grad_norm": 0.6995263762196952, + "learning_rate": 1.2878994063248218e-05, + "loss": 11.9942, + "step": 30909 + }, + { + "epoch": 1.683173034380892, + "grad_norm": 0.535471529613836, + "learning_rate": 1.287466542724044e-05, + "loss": 11.6526, + "step": 30910 + }, + { + "epoch": 1.6832274883774752, + "grad_norm": 0.48303961442288756, + "learning_rate": 1.2870337468733895e-05, + "loss": 11.7168, + "step": 30911 + }, + { + "epoch": 1.6832819423740581, + "grad_norm": 0.5542509936891705, + "learning_rate": 1.28660101877622e-05, + "loss": 11.8746, + "step": 30912 + }, + { + "epoch": 1.6833363963706411, + "grad_norm": 0.5820860719517761, + "learning_rate": 1.2861683584359041e-05, + "loss": 11.7746, + "step": 30913 + }, + { + "epoch": 1.6833908503672241, + "grad_norm": 0.6270610000900988, + "learning_rate": 1.2857357658558056e-05, + "loss": 11.691, + "step": 30914 + }, + { + "epoch": 1.6834453043638071, + "grad_norm": 0.5736399326213332, + "learning_rate": 1.285303241039284e-05, + "loss": 11.8652, + "step": 30915 + }, + { + "epoch": 1.6834997583603901, + "grad_norm": 0.5619805326157901, + "learning_rate": 1.2848707839897078e-05, + "loss": 11.7971, + "step": 30916 + }, + { + "epoch": 1.6835542123569733, + "grad_norm": 0.479098233802878, + "learning_rate": 1.2844383947104354e-05, + "loss": 11.7201, + "step": 30917 + }, + { + "epoch": 1.6836086663535563, + "grad_norm": 0.5459321419597545, + "learning_rate": 1.2840060732048353e-05, + "loss": 11.8616, + "step": 30918 + }, + { + "epoch": 1.6836631203501393, + "grad_norm": 0.6303937182194855, + "learning_rate": 1.2835738194762626e-05, + "loss": 11.7714, + "step": 30919 + }, + { + "epoch": 1.6837175743467223, + "grad_norm": 0.6063994481799019, + "learning_rate": 1.283141633528081e-05, + "loss": 11.8242, + "step": 30920 + }, + { + "epoch": 1.6837720283433053, + "grad_norm": 0.5507727194406049, + "learning_rate": 1.2827095153636548e-05, + "loss": 11.8543, + "step": 30921 + }, + { + "epoch": 1.6838264823398883, + "grad_norm": 0.5404127369170493, + "learning_rate": 1.282277464986339e-05, + "loss": 11.8502, + "step": 30922 + }, + { + "epoch": 1.6838809363364713, + "grad_norm": 0.522691500546519, + "learning_rate": 1.2818454823994974e-05, + "loss": 11.7252, + "step": 30923 + }, + { + "epoch": 1.6839353903330543, + "grad_norm": 0.5330287746979502, + "learning_rate": 1.2814135676064887e-05, + "loss": 11.8818, + "step": 30924 + }, + { + "epoch": 1.6839898443296373, + "grad_norm": 0.5324819084756363, + "learning_rate": 1.2809817206106667e-05, + "loss": 11.7668, + "step": 30925 + }, + { + "epoch": 1.6840442983262203, + "grad_norm": 0.5679799851672759, + "learning_rate": 1.2805499414153954e-05, + "loss": 11.7543, + "step": 30926 + }, + { + "epoch": 1.6840987523228033, + "grad_norm": 0.5641950613952643, + "learning_rate": 1.2801182300240277e-05, + "loss": 11.8477, + "step": 30927 + }, + { + "epoch": 1.6841532063193863, + "grad_norm": 0.5684721307222901, + "learning_rate": 1.2796865864399254e-05, + "loss": 11.9336, + "step": 30928 + }, + { + "epoch": 1.6842076603159692, + "grad_norm": 0.5958884951547424, + "learning_rate": 1.2792550106664415e-05, + "loss": 11.8898, + "step": 30929 + }, + { + "epoch": 1.6842621143125522, + "grad_norm": 0.5768648008155474, + "learning_rate": 1.2788235027069318e-05, + "loss": 11.8193, + "step": 30930 + }, + { + "epoch": 1.6843165683091352, + "grad_norm": 0.5808263521474171, + "learning_rate": 1.278392062564755e-05, + "loss": 11.8038, + "step": 30931 + }, + { + "epoch": 1.6843710223057182, + "grad_norm": 0.5670701911435472, + "learning_rate": 1.2779606902432627e-05, + "loss": 11.7317, + "step": 30932 + }, + { + "epoch": 1.6844254763023012, + "grad_norm": 0.5128089869119639, + "learning_rate": 1.2775293857458148e-05, + "loss": 11.6964, + "step": 30933 + }, + { + "epoch": 1.6844799302988842, + "grad_norm": 0.48434730096323336, + "learning_rate": 1.2770981490757572e-05, + "loss": 11.6702, + "step": 30934 + }, + { + "epoch": 1.6845343842954674, + "grad_norm": 0.5543601897213132, + "learning_rate": 1.2766669802364473e-05, + "loss": 11.5906, + "step": 30935 + }, + { + "epoch": 1.6845888382920504, + "grad_norm": 0.5149236057309247, + "learning_rate": 1.276235879231239e-05, + "loss": 11.8646, + "step": 30936 + }, + { + "epoch": 1.6846432922886334, + "grad_norm": 0.5751090349175441, + "learning_rate": 1.275804846063482e-05, + "loss": 11.8037, + "step": 30937 + }, + { + "epoch": 1.6846977462852164, + "grad_norm": 0.5163364506804589, + "learning_rate": 1.2753738807365322e-05, + "loss": 11.7896, + "step": 30938 + }, + { + "epoch": 1.6847522002817994, + "grad_norm": 0.557297852222947, + "learning_rate": 1.2749429832537352e-05, + "loss": 11.8165, + "step": 30939 + }, + { + "epoch": 1.6848066542783826, + "grad_norm": 0.5754185383229934, + "learning_rate": 1.2745121536184468e-05, + "loss": 11.8465, + "step": 30940 + }, + { + "epoch": 1.6848611082749656, + "grad_norm": 0.5189316542291988, + "learning_rate": 1.2740813918340144e-05, + "loss": 11.6276, + "step": 30941 + }, + { + "epoch": 1.6849155622715486, + "grad_norm": 0.5397203483892539, + "learning_rate": 1.2736506979037866e-05, + "loss": 11.814, + "step": 30942 + }, + { + "epoch": 1.6849700162681316, + "grad_norm": 0.5248558922205516, + "learning_rate": 1.2732200718311183e-05, + "loss": 11.655, + "step": 30943 + }, + { + "epoch": 1.6850244702647146, + "grad_norm": 0.569772147300932, + "learning_rate": 1.2727895136193535e-05, + "loss": 11.8354, + "step": 30944 + }, + { + "epoch": 1.6850789242612976, + "grad_norm": 0.5107304223006778, + "learning_rate": 1.2723590232718386e-05, + "loss": 11.7712, + "step": 30945 + }, + { + "epoch": 1.6851333782578806, + "grad_norm": 0.5517829892272162, + "learning_rate": 1.2719286007919252e-05, + "loss": 11.853, + "step": 30946 + }, + { + "epoch": 1.6851878322544636, + "grad_norm": 0.5516534462677495, + "learning_rate": 1.2714982461829571e-05, + "loss": 11.9019, + "step": 30947 + }, + { + "epoch": 1.6852422862510466, + "grad_norm": 0.5973626906897074, + "learning_rate": 1.2710679594482855e-05, + "loss": 11.8328, + "step": 30948 + }, + { + "epoch": 1.6852967402476295, + "grad_norm": 0.486708149672646, + "learning_rate": 1.2706377405912496e-05, + "loss": 11.7596, + "step": 30949 + }, + { + "epoch": 1.6853511942442125, + "grad_norm": 0.5392361116153961, + "learning_rate": 1.270207589615201e-05, + "loss": 11.8083, + "step": 30950 + }, + { + "epoch": 1.6854056482407955, + "grad_norm": 0.6011225344433534, + "learning_rate": 1.2697775065234807e-05, + "loss": 11.8283, + "step": 30951 + }, + { + "epoch": 1.6854601022373785, + "grad_norm": 0.4660341428272207, + "learning_rate": 1.2693474913194347e-05, + "loss": 11.7044, + "step": 30952 + }, + { + "epoch": 1.6855145562339615, + "grad_norm": 0.5744173327336871, + "learning_rate": 1.2689175440064083e-05, + "loss": 11.8485, + "step": 30953 + }, + { + "epoch": 1.6855690102305445, + "grad_norm": 0.49386697404015156, + "learning_rate": 1.268487664587743e-05, + "loss": 11.8396, + "step": 30954 + }, + { + "epoch": 1.6856234642271275, + "grad_norm": 0.5649774813196295, + "learning_rate": 1.268057853066783e-05, + "loss": 11.7956, + "step": 30955 + }, + { + "epoch": 1.6856779182237105, + "grad_norm": 0.5066835993067511, + "learning_rate": 1.2676281094468667e-05, + "loss": 11.9316, + "step": 30956 + }, + { + "epoch": 1.6857323722202935, + "grad_norm": 0.5387885700361347, + "learning_rate": 1.2671984337313381e-05, + "loss": 11.809, + "step": 30957 + }, + { + "epoch": 1.6857868262168767, + "grad_norm": 0.6158151256337171, + "learning_rate": 1.266768825923541e-05, + "loss": 11.7484, + "step": 30958 + }, + { + "epoch": 1.6858412802134597, + "grad_norm": 0.5642077699526209, + "learning_rate": 1.2663392860268131e-05, + "loss": 11.7662, + "step": 30959 + }, + { + "epoch": 1.6858957342100427, + "grad_norm": 0.5004019979844113, + "learning_rate": 1.265909814044497e-05, + "loss": 11.8103, + "step": 30960 + }, + { + "epoch": 1.6859501882066257, + "grad_norm": 0.5326111269189654, + "learning_rate": 1.265480409979929e-05, + "loss": 11.7948, + "step": 30961 + }, + { + "epoch": 1.6860046422032087, + "grad_norm": 0.503972500382625, + "learning_rate": 1.2650510738364518e-05, + "loss": 11.6654, + "step": 30962 + }, + { + "epoch": 1.6860590961997917, + "grad_norm": 0.5222908121815988, + "learning_rate": 1.2646218056174009e-05, + "loss": 11.9125, + "step": 30963 + }, + { + "epoch": 1.6861135501963749, + "grad_norm": 0.5209559935917166, + "learning_rate": 1.2641926053261177e-05, + "loss": 11.6175, + "step": 30964 + }, + { + "epoch": 1.6861680041929579, + "grad_norm": 0.5519284255939755, + "learning_rate": 1.2637634729659375e-05, + "loss": 11.7419, + "step": 30965 + }, + { + "epoch": 1.6862224581895409, + "grad_norm": 0.5532925618193903, + "learning_rate": 1.2633344085401955e-05, + "loss": 11.776, + "step": 30966 + }, + { + "epoch": 1.6862769121861239, + "grad_norm": 0.5466818222668209, + "learning_rate": 1.262905412052232e-05, + "loss": 11.7007, + "step": 30967 + }, + { + "epoch": 1.6863313661827068, + "grad_norm": 0.5760458209075585, + "learning_rate": 1.2624764835053804e-05, + "loss": 11.7261, + "step": 30968 + }, + { + "epoch": 1.6863858201792898, + "grad_norm": 0.5607843698283759, + "learning_rate": 1.2620476229029753e-05, + "loss": 11.7256, + "step": 30969 + }, + { + "epoch": 1.6864402741758728, + "grad_norm": 0.5876410562875248, + "learning_rate": 1.2616188302483567e-05, + "loss": 11.7445, + "step": 30970 + }, + { + "epoch": 1.6864947281724558, + "grad_norm": 0.5333042910383061, + "learning_rate": 1.2611901055448528e-05, + "loss": 11.7903, + "step": 30971 + }, + { + "epoch": 1.6865491821690388, + "grad_norm": 0.5375697937719466, + "learning_rate": 1.260761448795802e-05, + "loss": 11.795, + "step": 30972 + }, + { + "epoch": 1.6866036361656218, + "grad_norm": 0.5320807055716856, + "learning_rate": 1.2603328600045338e-05, + "loss": 11.7174, + "step": 30973 + }, + { + "epoch": 1.6866580901622048, + "grad_norm": 0.5754043768594431, + "learning_rate": 1.259904339174386e-05, + "loss": 11.7903, + "step": 30974 + }, + { + "epoch": 1.6867125441587878, + "grad_norm": 0.5422487035895022, + "learning_rate": 1.2594758863086865e-05, + "loss": 11.8148, + "step": 30975 + }, + { + "epoch": 1.6867669981553708, + "grad_norm": 0.5434807244508277, + "learning_rate": 1.2590475014107661e-05, + "loss": 11.8619, + "step": 30976 + }, + { + "epoch": 1.6868214521519538, + "grad_norm": 0.5788963942280565, + "learning_rate": 1.2586191844839612e-05, + "loss": 11.8867, + "step": 30977 + }, + { + "epoch": 1.6868759061485368, + "grad_norm": 0.6179581524250197, + "learning_rate": 1.2581909355315968e-05, + "loss": 11.7265, + "step": 30978 + }, + { + "epoch": 1.6869303601451198, + "grad_norm": 0.6156278283170895, + "learning_rate": 1.2577627545570059e-05, + "loss": 11.7749, + "step": 30979 + }, + { + "epoch": 1.6869848141417028, + "grad_norm": 0.5364324405623805, + "learning_rate": 1.2573346415635201e-05, + "loss": 11.7738, + "step": 30980 + }, + { + "epoch": 1.687039268138286, + "grad_norm": 0.6067647389487533, + "learning_rate": 1.2569065965544636e-05, + "loss": 11.8974, + "step": 30981 + }, + { + "epoch": 1.687093722134869, + "grad_norm": 0.5466475366529785, + "learning_rate": 1.2564786195331702e-05, + "loss": 11.771, + "step": 30982 + }, + { + "epoch": 1.687148176131452, + "grad_norm": 0.5538879810462987, + "learning_rate": 1.256050710502964e-05, + "loss": 11.8921, + "step": 30983 + }, + { + "epoch": 1.687202630128035, + "grad_norm": 0.7236317356514967, + "learning_rate": 1.2556228694671746e-05, + "loss": 11.8492, + "step": 30984 + }, + { + "epoch": 1.687257084124618, + "grad_norm": 0.5389627890915959, + "learning_rate": 1.2551950964291292e-05, + "loss": 11.7868, + "step": 30985 + }, + { + "epoch": 1.687311538121201, + "grad_norm": 0.5477972562887624, + "learning_rate": 1.2547673913921499e-05, + "loss": 11.661, + "step": 30986 + }, + { + "epoch": 1.6873659921177842, + "grad_norm": 0.5549818238153731, + "learning_rate": 1.2543397543595692e-05, + "loss": 11.9308, + "step": 30987 + }, + { + "epoch": 1.6874204461143671, + "grad_norm": 0.5392778514406316, + "learning_rate": 1.2539121853347069e-05, + "loss": 11.6602, + "step": 30988 + }, + { + "epoch": 1.6874749001109501, + "grad_norm": 0.5639723548899724, + "learning_rate": 1.2534846843208925e-05, + "loss": 11.7974, + "step": 30989 + }, + { + "epoch": 1.6875293541075331, + "grad_norm": 0.5938283471144208, + "learning_rate": 1.2530572513214446e-05, + "loss": 11.8409, + "step": 30990 + }, + { + "epoch": 1.6875838081041161, + "grad_norm": 0.5400408892509663, + "learning_rate": 1.2526298863396912e-05, + "loss": 11.7953, + "step": 30991 + }, + { + "epoch": 1.6876382621006991, + "grad_norm": 0.6553367130312735, + "learning_rate": 1.2522025893789569e-05, + "loss": 11.8586, + "step": 30992 + }, + { + "epoch": 1.687692716097282, + "grad_norm": 0.5834404463377707, + "learning_rate": 1.2517753604425608e-05, + "loss": 11.7478, + "step": 30993 + }, + { + "epoch": 1.687747170093865, + "grad_norm": 0.57803278579205, + "learning_rate": 1.2513481995338284e-05, + "loss": 11.8328, + "step": 30994 + }, + { + "epoch": 1.687801624090448, + "grad_norm": 0.6286824435063781, + "learning_rate": 1.2509211066560788e-05, + "loss": 11.813, + "step": 30995 + }, + { + "epoch": 1.687856078087031, + "grad_norm": 0.5277799842411208, + "learning_rate": 1.250494081812632e-05, + "loss": 11.7461, + "step": 30996 + }, + { + "epoch": 1.687910532083614, + "grad_norm": 0.528310544407678, + "learning_rate": 1.250067125006813e-05, + "loss": 11.6591, + "step": 30997 + }, + { + "epoch": 1.687964986080197, + "grad_norm": 0.5240383164639838, + "learning_rate": 1.2496402362419369e-05, + "loss": 11.8203, + "step": 30998 + }, + { + "epoch": 1.68801944007678, + "grad_norm": 0.5499002892758581, + "learning_rate": 1.2492134155213275e-05, + "loss": 11.6102, + "step": 30999 + }, + { + "epoch": 1.688073894073363, + "grad_norm": 0.5024318789840799, + "learning_rate": 1.2487866628483014e-05, + "loss": 11.7725, + "step": 31000 + }, + { + "epoch": 1.688128348069946, + "grad_norm": 0.5838408608040243, + "learning_rate": 1.2483599782261768e-05, + "loss": 11.7318, + "step": 31001 + }, + { + "epoch": 1.688182802066529, + "grad_norm": 0.6098789562032324, + "learning_rate": 1.2479333616582744e-05, + "loss": 11.8629, + "step": 31002 + }, + { + "epoch": 1.688237256063112, + "grad_norm": 0.5656585730421902, + "learning_rate": 1.2475068131479084e-05, + "loss": 11.792, + "step": 31003 + }, + { + "epoch": 1.688291710059695, + "grad_norm": 0.5392994469720828, + "learning_rate": 1.2470803326984016e-05, + "loss": 11.7874, + "step": 31004 + }, + { + "epoch": 1.6883461640562782, + "grad_norm": 0.5252149296259485, + "learning_rate": 1.2466539203130612e-05, + "loss": 11.784, + "step": 31005 + }, + { + "epoch": 1.6884006180528612, + "grad_norm": 0.507835131305577, + "learning_rate": 1.246227575995208e-05, + "loss": 11.7961, + "step": 31006 + }, + { + "epoch": 1.6884550720494442, + "grad_norm": 0.5172020169125301, + "learning_rate": 1.2458012997481594e-05, + "loss": 11.8493, + "step": 31007 + }, + { + "epoch": 1.6885095260460272, + "grad_norm": 0.580931403383934, + "learning_rate": 1.245375091575226e-05, + "loss": 11.8756, + "step": 31008 + }, + { + "epoch": 1.6885639800426102, + "grad_norm": 0.5486886961906027, + "learning_rate": 1.2449489514797264e-05, + "loss": 11.7934, + "step": 31009 + }, + { + "epoch": 1.6886184340391934, + "grad_norm": 0.47751340387962543, + "learning_rate": 1.24452287946497e-05, + "loss": 11.7613, + "step": 31010 + }, + { + "epoch": 1.6886728880357764, + "grad_norm": 0.6112902242248104, + "learning_rate": 1.244096875534273e-05, + "loss": 11.7245, + "step": 31011 + }, + { + "epoch": 1.6887273420323594, + "grad_norm": 0.5524698751702133, + "learning_rate": 1.2436709396909451e-05, + "loss": 11.817, + "step": 31012 + }, + { + "epoch": 1.6887817960289424, + "grad_norm": 0.5503354050461463, + "learning_rate": 1.2432450719383015e-05, + "loss": 11.8147, + "step": 31013 + }, + { + "epoch": 1.6888362500255254, + "grad_norm": 0.5508965272505504, + "learning_rate": 1.242819272279656e-05, + "loss": 11.6423, + "step": 31014 + }, + { + "epoch": 1.6888907040221084, + "grad_norm": 0.5367811120380197, + "learning_rate": 1.242393540718313e-05, + "loss": 11.8595, + "step": 31015 + }, + { + "epoch": 1.6889451580186914, + "grad_norm": 0.5561738870650693, + "learning_rate": 1.2419678772575882e-05, + "loss": 11.8342, + "step": 31016 + }, + { + "epoch": 1.6889996120152744, + "grad_norm": 0.5301874112024781, + "learning_rate": 1.2415422819007871e-05, + "loss": 11.8866, + "step": 31017 + }, + { + "epoch": 1.6890540660118574, + "grad_norm": 0.6467900675327487, + "learning_rate": 1.2411167546512226e-05, + "loss": 11.9131, + "step": 31018 + }, + { + "epoch": 1.6891085200084404, + "grad_norm": 0.5411147604998521, + "learning_rate": 1.2406912955122052e-05, + "loss": 11.9355, + "step": 31019 + }, + { + "epoch": 1.6891629740050234, + "grad_norm": 0.47049751414523905, + "learning_rate": 1.240265904487039e-05, + "loss": 11.7164, + "step": 31020 + }, + { + "epoch": 1.6892174280016063, + "grad_norm": 0.5459693348498853, + "learning_rate": 1.2398405815790371e-05, + "loss": 11.8325, + "step": 31021 + }, + { + "epoch": 1.6892718819981893, + "grad_norm": 0.5441759555883625, + "learning_rate": 1.2394153267915009e-05, + "loss": 11.8041, + "step": 31022 + }, + { + "epoch": 1.6893263359947723, + "grad_norm": 0.5060146303218898, + "learning_rate": 1.2389901401277426e-05, + "loss": 11.7788, + "step": 31023 + }, + { + "epoch": 1.6893807899913553, + "grad_norm": 0.6361849047081553, + "learning_rate": 1.238565021591066e-05, + "loss": 11.8789, + "step": 31024 + }, + { + "epoch": 1.6894352439879383, + "grad_norm": 0.5062576448789375, + "learning_rate": 1.2381399711847751e-05, + "loss": 11.7049, + "step": 31025 + }, + { + "epoch": 1.6894896979845213, + "grad_norm": 0.554103616111281, + "learning_rate": 1.2377149889121797e-05, + "loss": 11.7653, + "step": 31026 + }, + { + "epoch": 1.6895441519811043, + "grad_norm": 0.4929483825124844, + "learning_rate": 1.2372900747765791e-05, + "loss": 11.6667, + "step": 31027 + }, + { + "epoch": 1.6895986059776875, + "grad_norm": 0.5440798486434106, + "learning_rate": 1.236865228781281e-05, + "loss": 11.8324, + "step": 31028 + }, + { + "epoch": 1.6896530599742705, + "grad_norm": 0.565908417010825, + "learning_rate": 1.2364404509295902e-05, + "loss": 11.7977, + "step": 31029 + }, + { + "epoch": 1.6897075139708535, + "grad_norm": 0.5461142968907308, + "learning_rate": 1.2360157412248064e-05, + "loss": 11.7253, + "step": 31030 + }, + { + "epoch": 1.6897619679674365, + "grad_norm": 0.5625146515850067, + "learning_rate": 1.2355910996702358e-05, + "loss": 11.9029, + "step": 31031 + }, + { + "epoch": 1.6898164219640195, + "grad_norm": 0.4987479890051444, + "learning_rate": 1.2351665262691758e-05, + "loss": 11.644, + "step": 31032 + }, + { + "epoch": 1.6898708759606025, + "grad_norm": 0.5498233010714532, + "learning_rate": 1.2347420210249339e-05, + "loss": 11.7239, + "step": 31033 + }, + { + "epoch": 1.6899253299571857, + "grad_norm": 0.579645915344141, + "learning_rate": 1.2343175839408072e-05, + "loss": 11.8525, + "step": 31034 + }, + { + "epoch": 1.6899797839537687, + "grad_norm": 0.5109973452372204, + "learning_rate": 1.2338932150200955e-05, + "loss": 11.7003, + "step": 31035 + }, + { + "epoch": 1.6900342379503517, + "grad_norm": 0.5250080212971248, + "learning_rate": 1.2334689142661015e-05, + "loss": 11.7213, + "step": 31036 + }, + { + "epoch": 1.6900886919469347, + "grad_norm": 0.5140151033662399, + "learning_rate": 1.2330446816821217e-05, + "loss": 11.7804, + "step": 31037 + }, + { + "epoch": 1.6901431459435177, + "grad_norm": 0.5814524810375874, + "learning_rate": 1.232620517271459e-05, + "loss": 11.7545, + "step": 31038 + }, + { + "epoch": 1.6901975999401007, + "grad_norm": 0.5844885144223078, + "learning_rate": 1.2321964210374081e-05, + "loss": 11.8049, + "step": 31039 + }, + { + "epoch": 1.6902520539366837, + "grad_norm": 0.5273357932651959, + "learning_rate": 1.231772392983267e-05, + "loss": 11.825, + "step": 31040 + }, + { + "epoch": 1.6903065079332666, + "grad_norm": 0.5230934720052108, + "learning_rate": 1.2313484331123371e-05, + "loss": 11.7785, + "step": 31041 + }, + { + "epoch": 1.6903609619298496, + "grad_norm": 0.5066650193755428, + "learning_rate": 1.2309245414279092e-05, + "loss": 11.6719, + "step": 31042 + }, + { + "epoch": 1.6904154159264326, + "grad_norm": 0.48386011572223003, + "learning_rate": 1.230500717933285e-05, + "loss": 11.7318, + "step": 31043 + }, + { + "epoch": 1.6904698699230156, + "grad_norm": 0.5816903007676951, + "learning_rate": 1.2300769626317587e-05, + "loss": 11.7068, + "step": 31044 + }, + { + "epoch": 1.6905243239195986, + "grad_norm": 0.5322336672555599, + "learning_rate": 1.2296532755266222e-05, + "loss": 11.8342, + "step": 31045 + }, + { + "epoch": 1.6905787779161816, + "grad_norm": 0.5174156072071366, + "learning_rate": 1.2292296566211737e-05, + "loss": 11.7864, + "step": 31046 + }, + { + "epoch": 1.6906332319127646, + "grad_norm": 0.6314504146664143, + "learning_rate": 1.228806105918705e-05, + "loss": 11.8131, + "step": 31047 + }, + { + "epoch": 1.6906876859093476, + "grad_norm": 0.5293592883679874, + "learning_rate": 1.2283826234225137e-05, + "loss": 11.7786, + "step": 31048 + }, + { + "epoch": 1.6907421399059306, + "grad_norm": 0.5495173292261814, + "learning_rate": 1.2279592091358871e-05, + "loss": 11.795, + "step": 31049 + }, + { + "epoch": 1.6907965939025136, + "grad_norm": 0.5284385447491805, + "learning_rate": 1.2275358630621214e-05, + "loss": 11.7026, + "step": 31050 + }, + { + "epoch": 1.6908510478990968, + "grad_norm": 0.5072889494758992, + "learning_rate": 1.2271125852045095e-05, + "loss": 11.8435, + "step": 31051 + }, + { + "epoch": 1.6909055018956798, + "grad_norm": 0.541842245256738, + "learning_rate": 1.2266893755663388e-05, + "loss": 11.7366, + "step": 31052 + }, + { + "epoch": 1.6909599558922628, + "grad_norm": 0.5077504448857384, + "learning_rate": 1.2262662341509057e-05, + "loss": 11.8134, + "step": 31053 + }, + { + "epoch": 1.6910144098888458, + "grad_norm": 0.5042125936553743, + "learning_rate": 1.2258431609614973e-05, + "loss": 11.6919, + "step": 31054 + }, + { + "epoch": 1.6910688638854288, + "grad_norm": 0.5543434981239634, + "learning_rate": 1.2254201560014022e-05, + "loss": 11.7729, + "step": 31055 + }, + { + "epoch": 1.6911233178820118, + "grad_norm": 0.5241217795547016, + "learning_rate": 1.2249972192739135e-05, + "loss": 11.7454, + "step": 31056 + }, + { + "epoch": 1.691177771878595, + "grad_norm": 0.5353220571388403, + "learning_rate": 1.224574350782316e-05, + "loss": 11.8802, + "step": 31057 + }, + { + "epoch": 1.691232225875178, + "grad_norm": 0.5159713338830011, + "learning_rate": 1.2241515505299016e-05, + "loss": 11.7998, + "step": 31058 + }, + { + "epoch": 1.691286679871761, + "grad_norm": 0.5840084513881622, + "learning_rate": 1.2237288185199547e-05, + "loss": 11.8563, + "step": 31059 + }, + { + "epoch": 1.691341133868344, + "grad_norm": 0.5622320436290875, + "learning_rate": 1.2233061547557667e-05, + "loss": 11.7928, + "step": 31060 + }, + { + "epoch": 1.691395587864927, + "grad_norm": 0.5405334827035437, + "learning_rate": 1.2228835592406196e-05, + "loss": 11.7847, + "step": 31061 + }, + { + "epoch": 1.69145004186151, + "grad_norm": 0.5921790358345382, + "learning_rate": 1.2224610319778018e-05, + "loss": 11.7002, + "step": 31062 + }, + { + "epoch": 1.691504495858093, + "grad_norm": 0.5404621715199007, + "learning_rate": 1.2220385729706007e-05, + "loss": 11.8761, + "step": 31063 + }, + { + "epoch": 1.691558949854676, + "grad_norm": 0.6224685543905782, + "learning_rate": 1.2216161822222993e-05, + "loss": 11.8591, + "step": 31064 + }, + { + "epoch": 1.691613403851259, + "grad_norm": 0.5297868569126959, + "learning_rate": 1.2211938597361839e-05, + "loss": 11.4864, + "step": 31065 + }, + { + "epoch": 1.691667857847842, + "grad_norm": 0.7639102393028403, + "learning_rate": 1.2207716055155349e-05, + "loss": 11.7668, + "step": 31066 + }, + { + "epoch": 1.691722311844425, + "grad_norm": 0.5075137608088227, + "learning_rate": 1.2203494195636379e-05, + "loss": 11.8143, + "step": 31067 + }, + { + "epoch": 1.691776765841008, + "grad_norm": 0.5915385553745713, + "learning_rate": 1.2199273018837775e-05, + "loss": 11.8833, + "step": 31068 + }, + { + "epoch": 1.6918312198375909, + "grad_norm": 0.5144594337393157, + "learning_rate": 1.2195052524792339e-05, + "loss": 11.7613, + "step": 31069 + }, + { + "epoch": 1.6918856738341739, + "grad_norm": 0.4984695264655388, + "learning_rate": 1.2190832713532917e-05, + "loss": 11.7809, + "step": 31070 + }, + { + "epoch": 1.6919401278307569, + "grad_norm": 0.5117894029324008, + "learning_rate": 1.2186613585092277e-05, + "loss": 11.7135, + "step": 31071 + }, + { + "epoch": 1.6919945818273399, + "grad_norm": 0.5383639911433917, + "learning_rate": 1.2182395139503289e-05, + "loss": 11.8268, + "step": 31072 + }, + { + "epoch": 1.6920490358239229, + "grad_norm": 0.5174906912576328, + "learning_rate": 1.2178177376798693e-05, + "loss": 11.8227, + "step": 31073 + }, + { + "epoch": 1.692103489820506, + "grad_norm": 0.575912544748499, + "learning_rate": 1.2173960297011344e-05, + "loss": 11.6925, + "step": 31074 + }, + { + "epoch": 1.692157943817089, + "grad_norm": 0.5235764455622189, + "learning_rate": 1.2169743900174003e-05, + "loss": 11.6668, + "step": 31075 + }, + { + "epoch": 1.692212397813672, + "grad_norm": 0.5566910111289741, + "learning_rate": 1.2165528186319453e-05, + "loss": 11.8516, + "step": 31076 + }, + { + "epoch": 1.692266851810255, + "grad_norm": 0.535604012565439, + "learning_rate": 1.2161313155480469e-05, + "loss": 11.8037, + "step": 31077 + }, + { + "epoch": 1.692321305806838, + "grad_norm": 0.5247715048292585, + "learning_rate": 1.2157098807689882e-05, + "loss": 11.8749, + "step": 31078 + }, + { + "epoch": 1.692375759803421, + "grad_norm": 0.568802881873654, + "learning_rate": 1.2152885142980397e-05, + "loss": 11.8276, + "step": 31079 + }, + { + "epoch": 1.6924302138000042, + "grad_norm": 0.5621694856576721, + "learning_rate": 1.2148672161384823e-05, + "loss": 11.8172, + "step": 31080 + }, + { + "epoch": 1.6924846677965872, + "grad_norm": 0.5564010752224365, + "learning_rate": 1.2144459862935898e-05, + "loss": 11.8701, + "step": 31081 + }, + { + "epoch": 1.6925391217931702, + "grad_norm": 0.5462549488719423, + "learning_rate": 1.2140248247666397e-05, + "loss": 11.8552, + "step": 31082 + }, + { + "epoch": 1.6925935757897532, + "grad_norm": 0.5615880949210886, + "learning_rate": 1.213603731560905e-05, + "loss": 11.7695, + "step": 31083 + }, + { + "epoch": 1.6926480297863362, + "grad_norm": 0.5923126679584949, + "learning_rate": 1.2131827066796608e-05, + "loss": 11.7957, + "step": 31084 + }, + { + "epoch": 1.6927024837829192, + "grad_norm": 0.5783788676073419, + "learning_rate": 1.2127617501261845e-05, + "loss": 11.691, + "step": 31085 + }, + { + "epoch": 1.6927569377795022, + "grad_norm": 0.5420336454809602, + "learning_rate": 1.2123408619037434e-05, + "loss": 11.7871, + "step": 31086 + }, + { + "epoch": 1.6928113917760852, + "grad_norm": 0.5781991952400373, + "learning_rate": 1.211920042015615e-05, + "loss": 11.773, + "step": 31087 + }, + { + "epoch": 1.6928658457726682, + "grad_norm": 0.5485901242604088, + "learning_rate": 1.2114992904650691e-05, + "loss": 11.8089, + "step": 31088 + }, + { + "epoch": 1.6929202997692512, + "grad_norm": 0.5246036213674602, + "learning_rate": 1.211078607255377e-05, + "loss": 11.8005, + "step": 31089 + }, + { + "epoch": 1.6929747537658342, + "grad_norm": 0.651142675260831, + "learning_rate": 1.2106579923898154e-05, + "loss": 11.7684, + "step": 31090 + }, + { + "epoch": 1.6930292077624172, + "grad_norm": 0.5184941021250798, + "learning_rate": 1.2102374458716481e-05, + "loss": 11.8613, + "step": 31091 + }, + { + "epoch": 1.6930836617590002, + "grad_norm": 0.5544019442943551, + "learning_rate": 1.2098169677041504e-05, + "loss": 11.7862, + "step": 31092 + }, + { + "epoch": 1.6931381157555832, + "grad_norm": 0.6027250340795689, + "learning_rate": 1.2093965578905875e-05, + "loss": 11.855, + "step": 31093 + }, + { + "epoch": 1.6931925697521661, + "grad_norm": 0.6490606591017354, + "learning_rate": 1.2089762164342344e-05, + "loss": 11.853, + "step": 31094 + }, + { + "epoch": 1.6932470237487491, + "grad_norm": 0.6449716678820732, + "learning_rate": 1.2085559433383565e-05, + "loss": 11.7931, + "step": 31095 + }, + { + "epoch": 1.6933014777453321, + "grad_norm": 0.5988995642660392, + "learning_rate": 1.2081357386062186e-05, + "loss": 11.8962, + "step": 31096 + }, + { + "epoch": 1.6933559317419151, + "grad_norm": 0.5489172557999267, + "learning_rate": 1.2077156022410952e-05, + "loss": 11.7662, + "step": 31097 + }, + { + "epoch": 1.6934103857384983, + "grad_norm": 0.5388743072112442, + "learning_rate": 1.2072955342462466e-05, + "loss": 11.737, + "step": 31098 + }, + { + "epoch": 1.6934648397350813, + "grad_norm": 0.5502970655509305, + "learning_rate": 1.2068755346249427e-05, + "loss": 11.8435, + "step": 31099 + }, + { + "epoch": 1.6935192937316643, + "grad_norm": 0.5266090068269315, + "learning_rate": 1.2064556033804508e-05, + "loss": 11.656, + "step": 31100 + }, + { + "epoch": 1.6935737477282473, + "grad_norm": 0.5333254402942086, + "learning_rate": 1.206035740516034e-05, + "loss": 11.8281, + "step": 31101 + }, + { + "epoch": 1.6936282017248303, + "grad_norm": 0.5587803731499055, + "learning_rate": 1.2056159460349592e-05, + "loss": 11.842, + "step": 31102 + }, + { + "epoch": 1.6936826557214133, + "grad_norm": 0.5887906236010659, + "learning_rate": 1.2051962199404876e-05, + "loss": 11.79, + "step": 31103 + }, + { + "epoch": 1.6937371097179965, + "grad_norm": 0.5052092030785456, + "learning_rate": 1.2047765622358875e-05, + "loss": 11.7987, + "step": 31104 + }, + { + "epoch": 1.6937915637145795, + "grad_norm": 0.5189554623571955, + "learning_rate": 1.2043569729244198e-05, + "loss": 11.7302, + "step": 31105 + }, + { + "epoch": 1.6938460177111625, + "grad_norm": 0.536995774187871, + "learning_rate": 1.203937452009345e-05, + "loss": 11.9114, + "step": 31106 + }, + { + "epoch": 1.6939004717077455, + "grad_norm": 0.5514921998254492, + "learning_rate": 1.2035179994939295e-05, + "loss": 11.733, + "step": 31107 + }, + { + "epoch": 1.6939549257043285, + "grad_norm": 0.5376014961174918, + "learning_rate": 1.2030986153814316e-05, + "loss": 11.649, + "step": 31108 + }, + { + "epoch": 1.6940093797009115, + "grad_norm": 0.6520060717512525, + "learning_rate": 1.2026792996751157e-05, + "loss": 11.9269, + "step": 31109 + }, + { + "epoch": 1.6940638336974945, + "grad_norm": 0.5537771651695272, + "learning_rate": 1.202260052378239e-05, + "loss": 11.8471, + "step": 31110 + }, + { + "epoch": 1.6941182876940775, + "grad_norm": 0.5484192303934142, + "learning_rate": 1.2018408734940644e-05, + "loss": 11.7694, + "step": 31111 + }, + { + "epoch": 1.6941727416906605, + "grad_norm": 0.5503157610454154, + "learning_rate": 1.2014217630258518e-05, + "loss": 11.8576, + "step": 31112 + }, + { + "epoch": 1.6942271956872434, + "grad_norm": 0.6120296218848519, + "learning_rate": 1.201002720976857e-05, + "loss": 11.7632, + "step": 31113 + }, + { + "epoch": 1.6942816496838264, + "grad_norm": 0.5567956856022628, + "learning_rate": 1.2005837473503434e-05, + "loss": 11.9131, + "step": 31114 + }, + { + "epoch": 1.6943361036804094, + "grad_norm": 0.5460710209147294, + "learning_rate": 1.2001648421495648e-05, + "loss": 11.6285, + "step": 31115 + }, + { + "epoch": 1.6943905576769924, + "grad_norm": 0.48387214905013837, + "learning_rate": 1.1997460053777799e-05, + "loss": 11.6831, + "step": 31116 + }, + { + "epoch": 1.6944450116735754, + "grad_norm": 0.6240826432684845, + "learning_rate": 1.1993272370382469e-05, + "loss": 11.9838, + "step": 31117 + }, + { + "epoch": 1.6944994656701584, + "grad_norm": 0.5618401324838624, + "learning_rate": 1.198908537134219e-05, + "loss": 11.8195, + "step": 31118 + }, + { + "epoch": 1.6945539196667414, + "grad_norm": 0.49165678973523896, + "learning_rate": 1.1984899056689569e-05, + "loss": 11.692, + "step": 31119 + }, + { + "epoch": 1.6946083736633244, + "grad_norm": 0.5751672989876923, + "learning_rate": 1.1980713426457102e-05, + "loss": 11.9483, + "step": 31120 + }, + { + "epoch": 1.6946628276599076, + "grad_norm": 0.5234085505286293, + "learning_rate": 1.1976528480677396e-05, + "loss": 11.6454, + "step": 31121 + }, + { + "epoch": 1.6947172816564906, + "grad_norm": 0.5742276405421017, + "learning_rate": 1.1972344219382947e-05, + "loss": 11.7931, + "step": 31122 + }, + { + "epoch": 1.6947717356530736, + "grad_norm": 0.5736738705491817, + "learning_rate": 1.1968160642606308e-05, + "loss": 11.8693, + "step": 31123 + }, + { + "epoch": 1.6948261896496566, + "grad_norm": 0.5227934599039821, + "learning_rate": 1.196397775038004e-05, + "loss": 11.7932, + "step": 31124 + }, + { + "epoch": 1.6948806436462396, + "grad_norm": 0.5585268892314317, + "learning_rate": 1.195979554273664e-05, + "loss": 11.8074, + "step": 31125 + }, + { + "epoch": 1.6949350976428226, + "grad_norm": 0.5272249447494851, + "learning_rate": 1.1955614019708616e-05, + "loss": 11.8543, + "step": 31126 + }, + { + "epoch": 1.6949895516394058, + "grad_norm": 0.6150872977246788, + "learning_rate": 1.195143318132852e-05, + "loss": 11.8697, + "step": 31127 + }, + { + "epoch": 1.6950440056359888, + "grad_norm": 0.523893250614035, + "learning_rate": 1.1947253027628824e-05, + "loss": 11.7275, + "step": 31128 + }, + { + "epoch": 1.6950984596325718, + "grad_norm": 0.5466708874336663, + "learning_rate": 1.1943073558642082e-05, + "loss": 11.6989, + "step": 31129 + }, + { + "epoch": 1.6951529136291548, + "grad_norm": 0.5297247842485884, + "learning_rate": 1.1938894774400743e-05, + "loss": 11.7921, + "step": 31130 + }, + { + "epoch": 1.6952073676257378, + "grad_norm": 0.668330003281475, + "learning_rate": 1.1934716674937352e-05, + "loss": 11.7513, + "step": 31131 + }, + { + "epoch": 1.6952618216223208, + "grad_norm": 0.5890816140468341, + "learning_rate": 1.1930539260284346e-05, + "loss": 11.9074, + "step": 31132 + }, + { + "epoch": 1.6953162756189037, + "grad_norm": 0.5921423566789797, + "learning_rate": 1.1926362530474234e-05, + "loss": 11.7829, + "step": 31133 + }, + { + "epoch": 1.6953707296154867, + "grad_norm": 0.5604002994266877, + "learning_rate": 1.1922186485539522e-05, + "loss": 11.7451, + "step": 31134 + }, + { + "epoch": 1.6954251836120697, + "grad_norm": 0.5386171188268375, + "learning_rate": 1.1918011125512651e-05, + "loss": 11.6358, + "step": 31135 + }, + { + "epoch": 1.6954796376086527, + "grad_norm": 0.6047722369859541, + "learning_rate": 1.1913836450426108e-05, + "loss": 11.7773, + "step": 31136 + }, + { + "epoch": 1.6955340916052357, + "grad_norm": 0.5636338677013298, + "learning_rate": 1.190966246031232e-05, + "loss": 11.8731, + "step": 31137 + }, + { + "epoch": 1.6955885456018187, + "grad_norm": 0.5289540230140203, + "learning_rate": 1.1905489155203764e-05, + "loss": 11.6416, + "step": 31138 + }, + { + "epoch": 1.6956429995984017, + "grad_norm": 0.5610172695635461, + "learning_rate": 1.1901316535132922e-05, + "loss": 11.8599, + "step": 31139 + }, + { + "epoch": 1.6956974535949847, + "grad_norm": 0.5620621465242817, + "learning_rate": 1.1897144600132203e-05, + "loss": 11.8455, + "step": 31140 + }, + { + "epoch": 1.6957519075915677, + "grad_norm": 0.5286016597676234, + "learning_rate": 1.1892973350234072e-05, + "loss": 11.7321, + "step": 31141 + }, + { + "epoch": 1.6958063615881507, + "grad_norm": 0.5538988834899655, + "learning_rate": 1.1888802785470943e-05, + "loss": 11.7554, + "step": 31142 + }, + { + "epoch": 1.6958608155847337, + "grad_norm": 0.5220134673972876, + "learning_rate": 1.1884632905875281e-05, + "loss": 11.6936, + "step": 31143 + }, + { + "epoch": 1.6959152695813169, + "grad_norm": 0.5659510239305034, + "learning_rate": 1.188046371147946e-05, + "loss": 11.8246, + "step": 31144 + }, + { + "epoch": 1.6959697235778999, + "grad_norm": 0.5261749517964932, + "learning_rate": 1.1876295202315957e-05, + "loss": 11.7013, + "step": 31145 + }, + { + "epoch": 1.6960241775744829, + "grad_norm": 0.555824132148923, + "learning_rate": 1.1872127378417163e-05, + "loss": 11.9716, + "step": 31146 + }, + { + "epoch": 1.6960786315710659, + "grad_norm": 0.5117819884497045, + "learning_rate": 1.1867960239815467e-05, + "loss": 11.8115, + "step": 31147 + }, + { + "epoch": 1.6961330855676489, + "grad_norm": 0.5933197441541157, + "learning_rate": 1.1863793786543288e-05, + "loss": 11.7341, + "step": 31148 + }, + { + "epoch": 1.6961875395642318, + "grad_norm": 0.5199894150538277, + "learning_rate": 1.1859628018633052e-05, + "loss": 11.7156, + "step": 31149 + }, + { + "epoch": 1.696241993560815, + "grad_norm": 0.5452589369395606, + "learning_rate": 1.1855462936117101e-05, + "loss": 11.7479, + "step": 31150 + }, + { + "epoch": 1.696296447557398, + "grad_norm": 0.5794796909162606, + "learning_rate": 1.1851298539027889e-05, + "loss": 11.7396, + "step": 31151 + }, + { + "epoch": 1.696350901553981, + "grad_norm": 0.6460472069428692, + "learning_rate": 1.184713482739772e-05, + "loss": 11.6926, + "step": 31152 + }, + { + "epoch": 1.696405355550564, + "grad_norm": 0.6811738661338196, + "learning_rate": 1.184297180125905e-05, + "loss": 11.8275, + "step": 31153 + }, + { + "epoch": 1.696459809547147, + "grad_norm": 0.597358354833702, + "learning_rate": 1.1838809460644195e-05, + "loss": 11.8919, + "step": 31154 + }, + { + "epoch": 1.69651426354373, + "grad_norm": 0.5613360676396864, + "learning_rate": 1.1834647805585552e-05, + "loss": 11.7637, + "step": 31155 + }, + { + "epoch": 1.696568717540313, + "grad_norm": 0.5514983697671271, + "learning_rate": 1.1830486836115483e-05, + "loss": 11.7227, + "step": 31156 + }, + { + "epoch": 1.696623171536896, + "grad_norm": 0.5435911315202359, + "learning_rate": 1.1826326552266308e-05, + "loss": 11.8893, + "step": 31157 + }, + { + "epoch": 1.696677625533479, + "grad_norm": 0.5338506575659464, + "learning_rate": 1.1822166954070434e-05, + "loss": 11.6075, + "step": 31158 + }, + { + "epoch": 1.696732079530062, + "grad_norm": 0.4986525568452677, + "learning_rate": 1.1818008041560147e-05, + "loss": 11.8006, + "step": 31159 + }, + { + "epoch": 1.696786533526645, + "grad_norm": 0.5745697895053549, + "learning_rate": 1.1813849814767818e-05, + "loss": 11.898, + "step": 31160 + }, + { + "epoch": 1.696840987523228, + "grad_norm": 0.5029913364089251, + "learning_rate": 1.1809692273725803e-05, + "loss": 11.7185, + "step": 31161 + }, + { + "epoch": 1.696895441519811, + "grad_norm": 0.5740095031314133, + "learning_rate": 1.1805535418466395e-05, + "loss": 11.9046, + "step": 31162 + }, + { + "epoch": 1.696949895516394, + "grad_norm": 0.5590567368795529, + "learning_rate": 1.1801379249021948e-05, + "loss": 11.7892, + "step": 31163 + }, + { + "epoch": 1.697004349512977, + "grad_norm": 0.4763955732081574, + "learning_rate": 1.1797223765424747e-05, + "loss": 11.8285, + "step": 31164 + }, + { + "epoch": 1.69705880350956, + "grad_norm": 0.5061884999654044, + "learning_rate": 1.1793068967707154e-05, + "loss": 11.7972, + "step": 31165 + }, + { + "epoch": 1.697113257506143, + "grad_norm": 0.5523447234983317, + "learning_rate": 1.1788914855901445e-05, + "loss": 11.7633, + "step": 31166 + }, + { + "epoch": 1.697167711502726, + "grad_norm": 0.5401459164104537, + "learning_rate": 1.1784761430039904e-05, + "loss": 11.7394, + "step": 31167 + }, + { + "epoch": 1.6972221654993092, + "grad_norm": 0.5376616400063358, + "learning_rate": 1.178060869015487e-05, + "loss": 11.766, + "step": 31168 + }, + { + "epoch": 1.6972766194958921, + "grad_norm": 0.5910258722714671, + "learning_rate": 1.1776456636278598e-05, + "loss": 11.7837, + "step": 31169 + }, + { + "epoch": 1.6973310734924751, + "grad_norm": 0.5302378668243817, + "learning_rate": 1.1772305268443417e-05, + "loss": 11.8017, + "step": 31170 + }, + { + "epoch": 1.6973855274890581, + "grad_norm": 0.6109088340488431, + "learning_rate": 1.1768154586681568e-05, + "loss": 11.8214, + "step": 31171 + }, + { + "epoch": 1.6974399814856411, + "grad_norm": 0.5527344866619177, + "learning_rate": 1.1764004591025346e-05, + "loss": 11.6997, + "step": 31172 + }, + { + "epoch": 1.6974944354822243, + "grad_norm": 0.625763945079985, + "learning_rate": 1.1759855281507037e-05, + "loss": 11.9015, + "step": 31173 + }, + { + "epoch": 1.6975488894788073, + "grad_norm": 0.4966233521673928, + "learning_rate": 1.1755706658158872e-05, + "loss": 11.7, + "step": 31174 + }, + { + "epoch": 1.6976033434753903, + "grad_norm": 0.5206706851319974, + "learning_rate": 1.1751558721013157e-05, + "loss": 11.7349, + "step": 31175 + }, + { + "epoch": 1.6976577974719733, + "grad_norm": 0.5466083767639491, + "learning_rate": 1.174741147010211e-05, + "loss": 11.7965, + "step": 31176 + }, + { + "epoch": 1.6977122514685563, + "grad_norm": 0.5555311654612501, + "learning_rate": 1.1743264905457973e-05, + "loss": 11.8305, + "step": 31177 + }, + { + "epoch": 1.6977667054651393, + "grad_norm": 0.5827110532366812, + "learning_rate": 1.1739119027113033e-05, + "loss": 11.9338, + "step": 31178 + }, + { + "epoch": 1.6978211594617223, + "grad_norm": 0.546713353054429, + "learning_rate": 1.1734973835099483e-05, + "loss": 11.8532, + "step": 31179 + }, + { + "epoch": 1.6978756134583053, + "grad_norm": 0.5617743954800166, + "learning_rate": 1.17308293294496e-05, + "loss": 12.0388, + "step": 31180 + }, + { + "epoch": 1.6979300674548883, + "grad_norm": 0.5302951201507142, + "learning_rate": 1.1726685510195567e-05, + "loss": 11.8557, + "step": 31181 + }, + { + "epoch": 1.6979845214514713, + "grad_norm": 0.517914976928299, + "learning_rate": 1.1722542377369639e-05, + "loss": 11.8541, + "step": 31182 + }, + { + "epoch": 1.6980389754480543, + "grad_norm": 0.5714738519216818, + "learning_rate": 1.1718399931004043e-05, + "loss": 11.8515, + "step": 31183 + }, + { + "epoch": 1.6980934294446373, + "grad_norm": 0.6066301526252882, + "learning_rate": 1.1714258171130954e-05, + "loss": 11.713, + "step": 31184 + }, + { + "epoch": 1.6981478834412203, + "grad_norm": 0.562355052368481, + "learning_rate": 1.1710117097782635e-05, + "loss": 11.8545, + "step": 31185 + }, + { + "epoch": 1.6982023374378032, + "grad_norm": 0.5294204864785601, + "learning_rate": 1.1705976710991206e-05, + "loss": 11.7689, + "step": 31186 + }, + { + "epoch": 1.6982567914343862, + "grad_norm": 0.5444777783774468, + "learning_rate": 1.170183701078892e-05, + "loss": 11.6812, + "step": 31187 + }, + { + "epoch": 1.6983112454309692, + "grad_norm": 0.57113633311093, + "learning_rate": 1.1697697997207979e-05, + "loss": 11.7576, + "step": 31188 + }, + { + "epoch": 1.6983656994275522, + "grad_norm": 0.5507576347384435, + "learning_rate": 1.1693559670280518e-05, + "loss": 11.5738, + "step": 31189 + }, + { + "epoch": 1.6984201534241352, + "grad_norm": 0.5175553432450869, + "learning_rate": 1.1689422030038765e-05, + "loss": 11.7401, + "step": 31190 + }, + { + "epoch": 1.6984746074207184, + "grad_norm": 0.5095825569788507, + "learning_rate": 1.1685285076514863e-05, + "loss": 11.6847, + "step": 31191 + }, + { + "epoch": 1.6985290614173014, + "grad_norm": 0.5677070131155337, + "learning_rate": 1.1681148809741016e-05, + "loss": 11.8285, + "step": 31192 + }, + { + "epoch": 1.6985835154138844, + "grad_norm": 0.5335949677262533, + "learning_rate": 1.1677013229749334e-05, + "loss": 11.7886, + "step": 31193 + }, + { + "epoch": 1.6986379694104674, + "grad_norm": 0.560539418745713, + "learning_rate": 1.1672878336572024e-05, + "loss": 11.7421, + "step": 31194 + }, + { + "epoch": 1.6986924234070504, + "grad_norm": 0.5705922297864177, + "learning_rate": 1.166874413024126e-05, + "loss": 11.8745, + "step": 31195 + }, + { + "epoch": 1.6987468774036334, + "grad_norm": 0.554963106832216, + "learning_rate": 1.1664610610789106e-05, + "loss": 11.9591, + "step": 31196 + }, + { + "epoch": 1.6988013314002166, + "grad_norm": 0.548341727094421, + "learning_rate": 1.1660477778247758e-05, + "loss": 11.7741, + "step": 31197 + }, + { + "epoch": 1.6988557853967996, + "grad_norm": 0.5568708477835408, + "learning_rate": 1.165634563264938e-05, + "loss": 11.7128, + "step": 31198 + }, + { + "epoch": 1.6989102393933826, + "grad_norm": 0.5605668718638176, + "learning_rate": 1.1652214174026045e-05, + "loss": 11.8779, + "step": 31199 + }, + { + "epoch": 1.6989646933899656, + "grad_norm": 0.5776693095633945, + "learning_rate": 1.1648083402409927e-05, + "loss": 11.8465, + "step": 31200 + }, + { + "epoch": 1.6990191473865486, + "grad_norm": 0.5691267610329024, + "learning_rate": 1.1643953317833112e-05, + "loss": 11.7833, + "step": 31201 + }, + { + "epoch": 1.6990736013831316, + "grad_norm": 0.5488503787332141, + "learning_rate": 1.1639823920327753e-05, + "loss": 11.7771, + "step": 31202 + }, + { + "epoch": 1.6991280553797146, + "grad_norm": 0.5433140690219086, + "learning_rate": 1.1635695209925922e-05, + "loss": 11.8082, + "step": 31203 + }, + { + "epoch": 1.6991825093762976, + "grad_norm": 0.5317624685769947, + "learning_rate": 1.1631567186659741e-05, + "loss": 11.8112, + "step": 31204 + }, + { + "epoch": 1.6992369633728805, + "grad_norm": 0.5585805141630426, + "learning_rate": 1.1627439850561351e-05, + "loss": 11.8755, + "step": 31205 + }, + { + "epoch": 1.6992914173694635, + "grad_norm": 0.5323604788938309, + "learning_rate": 1.1623313201662778e-05, + "loss": 11.7716, + "step": 31206 + }, + { + "epoch": 1.6993458713660465, + "grad_norm": 0.5593327894143041, + "learning_rate": 1.1619187239996154e-05, + "loss": 11.8379, + "step": 31207 + }, + { + "epoch": 1.6994003253626295, + "grad_norm": 0.513432314602708, + "learning_rate": 1.1615061965593533e-05, + "loss": 11.8205, + "step": 31208 + }, + { + "epoch": 1.6994547793592125, + "grad_norm": 0.5476954780496204, + "learning_rate": 1.1610937378487008e-05, + "loss": 11.826, + "step": 31209 + }, + { + "epoch": 1.6995092333557955, + "grad_norm": 0.6067927322440799, + "learning_rate": 1.1606813478708689e-05, + "loss": 11.8313, + "step": 31210 + }, + { + "epoch": 1.6995636873523785, + "grad_norm": 0.528739121928077, + "learning_rate": 1.1602690266290583e-05, + "loss": 11.7791, + "step": 31211 + }, + { + "epoch": 1.6996181413489615, + "grad_norm": 0.4979464794477796, + "learning_rate": 1.15985677412648e-05, + "loss": 11.68, + "step": 31212 + }, + { + "epoch": 1.6996725953455445, + "grad_norm": 0.5699731103190695, + "learning_rate": 1.1594445903663365e-05, + "loss": 11.783, + "step": 31213 + }, + { + "epoch": 1.6997270493421277, + "grad_norm": 0.5240174059605114, + "learning_rate": 1.1590324753518367e-05, + "loss": 11.7972, + "step": 31214 + }, + { + "epoch": 1.6997815033387107, + "grad_norm": 0.5348642485513088, + "learning_rate": 1.1586204290861824e-05, + "loss": 11.8107, + "step": 31215 + }, + { + "epoch": 1.6998359573352937, + "grad_norm": 0.5532560565183934, + "learning_rate": 1.1582084515725766e-05, + "loss": 11.8337, + "step": 31216 + }, + { + "epoch": 1.6998904113318767, + "grad_norm": 0.5466933677368346, + "learning_rate": 1.1577965428142257e-05, + "loss": 11.7897, + "step": 31217 + }, + { + "epoch": 1.6999448653284597, + "grad_norm": 0.5572512944589181, + "learning_rate": 1.1573847028143315e-05, + "loss": 11.8394, + "step": 31218 + }, + { + "epoch": 1.6999993193250427, + "grad_norm": 0.5551068691720326, + "learning_rate": 1.1569729315760969e-05, + "loss": 11.8603, + "step": 31219 + }, + { + "epoch": 1.7000537733216259, + "grad_norm": 0.5692280362315412, + "learning_rate": 1.1565612291027228e-05, + "loss": 11.7198, + "step": 31220 + }, + { + "epoch": 1.7001082273182089, + "grad_norm": 0.5044517711240781, + "learning_rate": 1.1561495953974122e-05, + "loss": 11.7405, + "step": 31221 + }, + { + "epoch": 1.7001626813147919, + "grad_norm": 0.6662775759620126, + "learning_rate": 1.1557380304633659e-05, + "loss": 11.8044, + "step": 31222 + }, + { + "epoch": 1.7002171353113749, + "grad_norm": 0.5468430216285007, + "learning_rate": 1.1553265343037833e-05, + "loss": 11.6875, + "step": 31223 + }, + { + "epoch": 1.7002715893079579, + "grad_norm": 0.510436662117331, + "learning_rate": 1.1549151069218655e-05, + "loss": 11.7209, + "step": 31224 + }, + { + "epoch": 1.7003260433045408, + "grad_norm": 0.571369443119801, + "learning_rate": 1.1545037483208098e-05, + "loss": 11.8318, + "step": 31225 + }, + { + "epoch": 1.7003804973011238, + "grad_norm": 0.5187043516404863, + "learning_rate": 1.1540924585038193e-05, + "loss": 11.7688, + "step": 31226 + }, + { + "epoch": 1.7004349512977068, + "grad_norm": 0.5264191141204417, + "learning_rate": 1.153681237474088e-05, + "loss": 11.8329, + "step": 31227 + }, + { + "epoch": 1.7004894052942898, + "grad_norm": 0.5478201588307893, + "learning_rate": 1.1532700852348132e-05, + "loss": 11.8485, + "step": 31228 + }, + { + "epoch": 1.7005438592908728, + "grad_norm": 0.5377739250236913, + "learning_rate": 1.1528590017891961e-05, + "loss": 11.6057, + "step": 31229 + }, + { + "epoch": 1.7005983132874558, + "grad_norm": 0.5774197328841263, + "learning_rate": 1.1524479871404293e-05, + "loss": 11.7849, + "step": 31230 + }, + { + "epoch": 1.7006527672840388, + "grad_norm": 0.5434830326628208, + "learning_rate": 1.1520370412917103e-05, + "loss": 11.7581, + "step": 31231 + }, + { + "epoch": 1.7007072212806218, + "grad_norm": 0.5541688132906321, + "learning_rate": 1.1516261642462366e-05, + "loss": 11.781, + "step": 31232 + }, + { + "epoch": 1.7007616752772048, + "grad_norm": 0.6036717618407496, + "learning_rate": 1.1512153560072003e-05, + "loss": 11.846, + "step": 31233 + }, + { + "epoch": 1.7008161292737878, + "grad_norm": 0.5431214809691001, + "learning_rate": 1.1508046165777997e-05, + "loss": 11.79, + "step": 31234 + }, + { + "epoch": 1.7008705832703708, + "grad_norm": 0.5744119753557512, + "learning_rate": 1.1503939459612234e-05, + "loss": 11.72, + "step": 31235 + }, + { + "epoch": 1.7009250372669538, + "grad_norm": 0.6004856798370535, + "learning_rate": 1.149983344160671e-05, + "loss": 11.8934, + "step": 31236 + }, + { + "epoch": 1.7009794912635368, + "grad_norm": 0.6745525890407313, + "learning_rate": 1.1495728111793313e-05, + "loss": 11.9227, + "step": 31237 + }, + { + "epoch": 1.70103394526012, + "grad_norm": 0.5476790313785479, + "learning_rate": 1.149162347020396e-05, + "loss": 11.7708, + "step": 31238 + }, + { + "epoch": 1.701088399256703, + "grad_norm": 0.5726140051205252, + "learning_rate": 1.1487519516870604e-05, + "loss": 11.7045, + "step": 31239 + }, + { + "epoch": 1.701142853253286, + "grad_norm": 0.5206056900957082, + "learning_rate": 1.148341625182512e-05, + "loss": 11.7224, + "step": 31240 + }, + { + "epoch": 1.701197307249869, + "grad_norm": 0.5515522092270686, + "learning_rate": 1.1479313675099456e-05, + "loss": 11.6924, + "step": 31241 + }, + { + "epoch": 1.701251761246452, + "grad_norm": 0.5844488582553043, + "learning_rate": 1.1475211786725482e-05, + "loss": 11.8821, + "step": 31242 + }, + { + "epoch": 1.7013062152430352, + "grad_norm": 0.5717821331759587, + "learning_rate": 1.1471110586735101e-05, + "loss": 11.7941, + "step": 31243 + }, + { + "epoch": 1.7013606692396182, + "grad_norm": 0.5688372934165855, + "learning_rate": 1.1467010075160223e-05, + "loss": 11.965, + "step": 31244 + }, + { + "epoch": 1.7014151232362011, + "grad_norm": 0.5840128309900645, + "learning_rate": 1.1462910252032711e-05, + "loss": 11.7605, + "step": 31245 + }, + { + "epoch": 1.7014695772327841, + "grad_norm": 0.5091680797575212, + "learning_rate": 1.1458811117384472e-05, + "loss": 11.8584, + "step": 31246 + }, + { + "epoch": 1.7015240312293671, + "grad_norm": 0.4746440996748854, + "learning_rate": 1.145471267124737e-05, + "loss": 11.6595, + "step": 31247 + }, + { + "epoch": 1.7015784852259501, + "grad_norm": 0.4875825178627739, + "learning_rate": 1.1450614913653246e-05, + "loss": 11.6376, + "step": 31248 + }, + { + "epoch": 1.7016329392225331, + "grad_norm": 0.5855410815440408, + "learning_rate": 1.1446517844634009e-05, + "loss": 11.8022, + "step": 31249 + }, + { + "epoch": 1.701687393219116, + "grad_norm": 0.6982611521644404, + "learning_rate": 1.1442421464221487e-05, + "loss": 11.8648, + "step": 31250 + }, + { + "epoch": 1.701741847215699, + "grad_norm": 0.578953360743314, + "learning_rate": 1.1438325772447566e-05, + "loss": 11.7539, + "step": 31251 + }, + { + "epoch": 1.701796301212282, + "grad_norm": 0.5357224343657475, + "learning_rate": 1.1434230769344046e-05, + "loss": 11.9149, + "step": 31252 + }, + { + "epoch": 1.701850755208865, + "grad_norm": 0.5423134189518319, + "learning_rate": 1.1430136454942808e-05, + "loss": 11.6547, + "step": 31253 + }, + { + "epoch": 1.701905209205448, + "grad_norm": 0.5542201498977888, + "learning_rate": 1.1426042829275706e-05, + "loss": 11.7096, + "step": 31254 + }, + { + "epoch": 1.701959663202031, + "grad_norm": 0.5109266878922101, + "learning_rate": 1.1421949892374516e-05, + "loss": 11.7829, + "step": 31255 + }, + { + "epoch": 1.702014117198614, + "grad_norm": 0.5398111518041445, + "learning_rate": 1.1417857644271158e-05, + "loss": 11.7906, + "step": 31256 + }, + { + "epoch": 1.702068571195197, + "grad_norm": 0.5654600725757558, + "learning_rate": 1.1413766084997335e-05, + "loss": 11.88, + "step": 31257 + }, + { + "epoch": 1.70212302519178, + "grad_norm": 0.6032763644933257, + "learning_rate": 1.1409675214584925e-05, + "loss": 11.8079, + "step": 31258 + }, + { + "epoch": 1.702177479188363, + "grad_norm": 0.5796129509260738, + "learning_rate": 1.140558503306577e-05, + "loss": 11.7964, + "step": 31259 + }, + { + "epoch": 1.702231933184946, + "grad_norm": 0.5547492772467061, + "learning_rate": 1.1401495540471607e-05, + "loss": 11.7957, + "step": 31260 + }, + { + "epoch": 1.7022863871815292, + "grad_norm": 0.5234253960011268, + "learning_rate": 1.1397406736834305e-05, + "loss": 11.7468, + "step": 31261 + }, + { + "epoch": 1.7023408411781122, + "grad_norm": 0.5560471414455337, + "learning_rate": 1.13933186221856e-05, + "loss": 11.8825, + "step": 31262 + }, + { + "epoch": 1.7023952951746952, + "grad_norm": 0.556439278089486, + "learning_rate": 1.1389231196557337e-05, + "loss": 11.8036, + "step": 31263 + }, + { + "epoch": 1.7024497491712782, + "grad_norm": 0.5913091902537242, + "learning_rate": 1.1385144459981245e-05, + "loss": 11.8411, + "step": 31264 + }, + { + "epoch": 1.7025042031678612, + "grad_norm": 0.5001200998457481, + "learning_rate": 1.1381058412489132e-05, + "loss": 11.801, + "step": 31265 + }, + { + "epoch": 1.7025586571644442, + "grad_norm": 0.5743565103017917, + "learning_rate": 1.1376973054112816e-05, + "loss": 11.8137, + "step": 31266 + }, + { + "epoch": 1.7026131111610274, + "grad_norm": 0.5366117216556852, + "learning_rate": 1.1372888384883973e-05, + "loss": 11.7674, + "step": 31267 + }, + { + "epoch": 1.7026675651576104, + "grad_norm": 0.5544635367823163, + "learning_rate": 1.1368804404834431e-05, + "loss": 11.7378, + "step": 31268 + }, + { + "epoch": 1.7027220191541934, + "grad_norm": 0.5414820553650712, + "learning_rate": 1.1364721113995914e-05, + "loss": 11.7492, + "step": 31269 + }, + { + "epoch": 1.7027764731507764, + "grad_norm": 0.5411382474896068, + "learning_rate": 1.1360638512400179e-05, + "loss": 11.7101, + "step": 31270 + }, + { + "epoch": 1.7028309271473594, + "grad_norm": 0.5423226520013353, + "learning_rate": 1.1356556600079016e-05, + "loss": 11.8066, + "step": 31271 + }, + { + "epoch": 1.7028853811439424, + "grad_norm": 0.5846380472738664, + "learning_rate": 1.135247537706411e-05, + "loss": 11.8038, + "step": 31272 + }, + { + "epoch": 1.7029398351405254, + "grad_norm": 0.5268004901861915, + "learning_rate": 1.1348394843387244e-05, + "loss": 11.8386, + "step": 31273 + }, + { + "epoch": 1.7029942891371084, + "grad_norm": 0.5801831134073427, + "learning_rate": 1.1344314999080107e-05, + "loss": 11.8378, + "step": 31274 + }, + { + "epoch": 1.7030487431336914, + "grad_norm": 0.5959497144288662, + "learning_rate": 1.1340235844174463e-05, + "loss": 11.8477, + "step": 31275 + }, + { + "epoch": 1.7031031971302744, + "grad_norm": 0.5313650368151611, + "learning_rate": 1.1336157378702018e-05, + "loss": 11.729, + "step": 31276 + }, + { + "epoch": 1.7031576511268574, + "grad_norm": 0.5720033144718367, + "learning_rate": 1.1332079602694446e-05, + "loss": 11.7933, + "step": 31277 + }, + { + "epoch": 1.7032121051234403, + "grad_norm": 0.5260727974708785, + "learning_rate": 1.1328002516183522e-05, + "loss": 11.7659, + "step": 31278 + }, + { + "epoch": 1.7032665591200233, + "grad_norm": 0.530997427908885, + "learning_rate": 1.1323926119200912e-05, + "loss": 11.6772, + "step": 31279 + }, + { + "epoch": 1.7033210131166063, + "grad_norm": 0.4760019346679841, + "learning_rate": 1.131985041177831e-05, + "loss": 11.6817, + "step": 31280 + }, + { + "epoch": 1.7033754671131893, + "grad_norm": 0.5251093535938762, + "learning_rate": 1.1315775393947447e-05, + "loss": 11.7829, + "step": 31281 + }, + { + "epoch": 1.7034299211097723, + "grad_norm": 0.6078293290441132, + "learning_rate": 1.1311701065739965e-05, + "loss": 11.7932, + "step": 31282 + }, + { + "epoch": 1.7034843751063553, + "grad_norm": 0.5385094091574454, + "learning_rate": 1.1307627427187595e-05, + "loss": 11.7556, + "step": 31283 + }, + { + "epoch": 1.7035388291029385, + "grad_norm": 0.5637869318010038, + "learning_rate": 1.1303554478321954e-05, + "loss": 11.8293, + "step": 31284 + }, + { + "epoch": 1.7035932830995215, + "grad_norm": 0.5672741725345258, + "learning_rate": 1.1299482219174784e-05, + "loss": 11.8961, + "step": 31285 + }, + { + "epoch": 1.7036477370961045, + "grad_norm": 0.5191819917234376, + "learning_rate": 1.1295410649777704e-05, + "loss": 11.7897, + "step": 31286 + }, + { + "epoch": 1.7037021910926875, + "grad_norm": 0.5211313978753638, + "learning_rate": 1.1291339770162368e-05, + "loss": 11.8367, + "step": 31287 + }, + { + "epoch": 1.7037566450892705, + "grad_norm": 0.6807269684400129, + "learning_rate": 1.1287269580360471e-05, + "loss": 11.9386, + "step": 31288 + }, + { + "epoch": 1.7038110990858535, + "grad_norm": 0.5270099005856907, + "learning_rate": 1.1283200080403632e-05, + "loss": 11.7162, + "step": 31289 + }, + { + "epoch": 1.7038655530824367, + "grad_norm": 0.5911195333290961, + "learning_rate": 1.1279131270323518e-05, + "loss": 11.8088, + "step": 31290 + }, + { + "epoch": 1.7039200070790197, + "grad_norm": 0.5900182950918653, + "learning_rate": 1.1275063150151744e-05, + "loss": 11.8025, + "step": 31291 + }, + { + "epoch": 1.7039744610756027, + "grad_norm": 0.5600283388219753, + "learning_rate": 1.1270995719919952e-05, + "loss": 11.9367, + "step": 31292 + }, + { + "epoch": 1.7040289150721857, + "grad_norm": 0.5049451539233827, + "learning_rate": 1.1266928979659796e-05, + "loss": 11.7135, + "step": 31293 + }, + { + "epoch": 1.7040833690687687, + "grad_norm": 0.6294900510590399, + "learning_rate": 1.1262862929402862e-05, + "loss": 11.8334, + "step": 31294 + }, + { + "epoch": 1.7041378230653517, + "grad_norm": 0.5269043995110961, + "learning_rate": 1.1258797569180812e-05, + "loss": 11.6721, + "step": 31295 + }, + { + "epoch": 1.7041922770619347, + "grad_norm": 0.5255994567283027, + "learning_rate": 1.1254732899025222e-05, + "loss": 11.8429, + "step": 31296 + }, + { + "epoch": 1.7042467310585176, + "grad_norm": 0.5035362961618848, + "learning_rate": 1.1250668918967711e-05, + "loss": 11.742, + "step": 31297 + }, + { + "epoch": 1.7043011850551006, + "grad_norm": 0.5254803888413105, + "learning_rate": 1.1246605629039886e-05, + "loss": 11.9251, + "step": 31298 + }, + { + "epoch": 1.7043556390516836, + "grad_norm": 0.50652490422515, + "learning_rate": 1.1242543029273322e-05, + "loss": 11.7973, + "step": 31299 + }, + { + "epoch": 1.7044100930482666, + "grad_norm": 0.5293692893456606, + "learning_rate": 1.123848111969965e-05, + "loss": 11.8231, + "step": 31300 + }, + { + "epoch": 1.7044645470448496, + "grad_norm": 0.4859020321205245, + "learning_rate": 1.1234419900350413e-05, + "loss": 11.6815, + "step": 31301 + }, + { + "epoch": 1.7045190010414326, + "grad_norm": 0.49308515674546355, + "learning_rate": 1.1230359371257215e-05, + "loss": 11.7344, + "step": 31302 + }, + { + "epoch": 1.7045734550380156, + "grad_norm": 0.5485698988111734, + "learning_rate": 1.1226299532451657e-05, + "loss": 11.7352, + "step": 31303 + }, + { + "epoch": 1.7046279090345986, + "grad_norm": 0.5742764621579853, + "learning_rate": 1.1222240383965243e-05, + "loss": 11.7522, + "step": 31304 + }, + { + "epoch": 1.7046823630311816, + "grad_norm": 0.4777860565935414, + "learning_rate": 1.1218181925829607e-05, + "loss": 11.7434, + "step": 31305 + }, + { + "epoch": 1.7047368170277646, + "grad_norm": 0.5040213787829656, + "learning_rate": 1.1214124158076278e-05, + "loss": 11.8565, + "step": 31306 + }, + { + "epoch": 1.7047912710243478, + "grad_norm": 0.5030122128680107, + "learning_rate": 1.1210067080736786e-05, + "loss": 11.6438, + "step": 31307 + }, + { + "epoch": 1.7048457250209308, + "grad_norm": 0.5981916364110439, + "learning_rate": 1.1206010693842727e-05, + "loss": 11.9581, + "step": 31308 + }, + { + "epoch": 1.7049001790175138, + "grad_norm": 0.5167265446436821, + "learning_rate": 1.120195499742559e-05, + "loss": 11.8135, + "step": 31309 + }, + { + "epoch": 1.7049546330140968, + "grad_norm": 0.5268983740408758, + "learning_rate": 1.1197899991516968e-05, + "loss": 11.7719, + "step": 31310 + }, + { + "epoch": 1.7050090870106798, + "grad_norm": 0.5874469871352266, + "learning_rate": 1.119384567614835e-05, + "loss": 11.7836, + "step": 31311 + }, + { + "epoch": 1.7050635410072628, + "grad_norm": 0.5373827277132782, + "learning_rate": 1.11897920513513e-05, + "loss": 11.7336, + "step": 31312 + }, + { + "epoch": 1.705117995003846, + "grad_norm": 0.5108124772621323, + "learning_rate": 1.1185739117157301e-05, + "loss": 11.8133, + "step": 31313 + }, + { + "epoch": 1.705172449000429, + "grad_norm": 0.5876733779798438, + "learning_rate": 1.1181686873597886e-05, + "loss": 11.9087, + "step": 31314 + }, + { + "epoch": 1.705226902997012, + "grad_norm": 0.5517447147814664, + "learning_rate": 1.1177635320704594e-05, + "loss": 11.9218, + "step": 31315 + }, + { + "epoch": 1.705281356993595, + "grad_norm": 0.6209837257303116, + "learning_rate": 1.1173584458508901e-05, + "loss": 11.9754, + "step": 31316 + }, + { + "epoch": 1.705335810990178, + "grad_norm": 0.5512831780592493, + "learning_rate": 1.1169534287042283e-05, + "loss": 11.6591, + "step": 31317 + }, + { + "epoch": 1.705390264986761, + "grad_norm": 0.5985583538413535, + "learning_rate": 1.1165484806336302e-05, + "loss": 11.7803, + "step": 31318 + }, + { + "epoch": 1.705444718983344, + "grad_norm": 0.575165712108634, + "learning_rate": 1.1161436016422378e-05, + "loss": 11.8899, + "step": 31319 + }, + { + "epoch": 1.705499172979927, + "grad_norm": 0.5819721823764681, + "learning_rate": 1.115738791733204e-05, + "loss": 11.7869, + "step": 31320 + }, + { + "epoch": 1.70555362697651, + "grad_norm": 0.5609460168421133, + "learning_rate": 1.1153340509096744e-05, + "loss": 11.7868, + "step": 31321 + }, + { + "epoch": 1.705608080973093, + "grad_norm": 0.5771190839784344, + "learning_rate": 1.1149293791747984e-05, + "loss": 11.841, + "step": 31322 + }, + { + "epoch": 1.705662534969676, + "grad_norm": 0.5188743409995102, + "learning_rate": 1.114524776531719e-05, + "loss": 11.8366, + "step": 31323 + }, + { + "epoch": 1.705716988966259, + "grad_norm": 0.5913309838941305, + "learning_rate": 1.1141202429835885e-05, + "loss": 11.7614, + "step": 31324 + }, + { + "epoch": 1.7057714429628419, + "grad_norm": 0.5552822084419874, + "learning_rate": 1.1137157785335462e-05, + "loss": 11.8259, + "step": 31325 + }, + { + "epoch": 1.7058258969594249, + "grad_norm": 0.5318717769413936, + "learning_rate": 1.1133113831847431e-05, + "loss": 11.7007, + "step": 31326 + }, + { + "epoch": 1.7058803509560079, + "grad_norm": 0.5681071697112415, + "learning_rate": 1.1129070569403199e-05, + "loss": 11.896, + "step": 31327 + }, + { + "epoch": 1.7059348049525909, + "grad_norm": 0.5089217670051894, + "learning_rate": 1.112502799803421e-05, + "loss": 11.7835, + "step": 31328 + }, + { + "epoch": 1.7059892589491739, + "grad_norm": 0.6497385507303329, + "learning_rate": 1.1120986117771892e-05, + "loss": 11.7724, + "step": 31329 + }, + { + "epoch": 1.7060437129457569, + "grad_norm": 0.5022658093666803, + "learning_rate": 1.1116944928647722e-05, + "loss": 11.7518, + "step": 31330 + }, + { + "epoch": 1.70609816694234, + "grad_norm": 0.5996853030420862, + "learning_rate": 1.1112904430693071e-05, + "loss": 11.7625, + "step": 31331 + }, + { + "epoch": 1.706152620938923, + "grad_norm": 0.5378810116332772, + "learning_rate": 1.1108864623939408e-05, + "loss": 11.7291, + "step": 31332 + }, + { + "epoch": 1.706207074935506, + "grad_norm": 0.5509921075397711, + "learning_rate": 1.1104825508418094e-05, + "loss": 11.8393, + "step": 31333 + }, + { + "epoch": 1.706261528932089, + "grad_norm": 0.5440243687930099, + "learning_rate": 1.110078708416058e-05, + "loss": 11.7172, + "step": 31334 + }, + { + "epoch": 1.706315982928672, + "grad_norm": 0.5716647565322295, + "learning_rate": 1.1096749351198243e-05, + "loss": 11.7178, + "step": 31335 + }, + { + "epoch": 1.706370436925255, + "grad_norm": 0.59309097214083, + "learning_rate": 1.1092712309562503e-05, + "loss": 11.7863, + "step": 31336 + }, + { + "epoch": 1.7064248909218382, + "grad_norm": 0.5770068869056281, + "learning_rate": 1.1088675959284756e-05, + "loss": 11.7634, + "step": 31337 + }, + { + "epoch": 1.7064793449184212, + "grad_norm": 0.5370075067079356, + "learning_rate": 1.1084640300396343e-05, + "loss": 11.6557, + "step": 31338 + }, + { + "epoch": 1.7065337989150042, + "grad_norm": 0.5773533412198194, + "learning_rate": 1.1080605332928706e-05, + "loss": 11.8865, + "step": 31339 + }, + { + "epoch": 1.7065882529115872, + "grad_norm": 0.6064211803671088, + "learning_rate": 1.1076571056913166e-05, + "loss": 11.9042, + "step": 31340 + }, + { + "epoch": 1.7066427069081702, + "grad_norm": 0.5828648957159318, + "learning_rate": 1.1072537472381127e-05, + "loss": 11.7946, + "step": 31341 + }, + { + "epoch": 1.7066971609047532, + "grad_norm": 0.5706479761074192, + "learning_rate": 1.106850457936397e-05, + "loss": 11.8108, + "step": 31342 + }, + { + "epoch": 1.7067516149013362, + "grad_norm": 0.5552683982108595, + "learning_rate": 1.106447237789301e-05, + "loss": 11.8471, + "step": 31343 + }, + { + "epoch": 1.7068060688979192, + "grad_norm": 0.5314013550341861, + "learning_rate": 1.1060440867999655e-05, + "loss": 11.7732, + "step": 31344 + }, + { + "epoch": 1.7068605228945022, + "grad_norm": 0.5327937711703757, + "learning_rate": 1.1056410049715193e-05, + "loss": 11.7761, + "step": 31345 + }, + { + "epoch": 1.7069149768910852, + "grad_norm": 0.5575654136767733, + "learning_rate": 1.1052379923071033e-05, + "loss": 11.7641, + "step": 31346 + }, + { + "epoch": 1.7069694308876682, + "grad_norm": 0.6278233816868286, + "learning_rate": 1.1048350488098491e-05, + "loss": 11.9154, + "step": 31347 + }, + { + "epoch": 1.7070238848842512, + "grad_norm": 0.5061958734375708, + "learning_rate": 1.1044321744828857e-05, + "loss": 11.7967, + "step": 31348 + }, + { + "epoch": 1.7070783388808342, + "grad_norm": 0.5369718885477847, + "learning_rate": 1.1040293693293524e-05, + "loss": 11.7805, + "step": 31349 + }, + { + "epoch": 1.7071327928774171, + "grad_norm": 0.5771905146793501, + "learning_rate": 1.103626633352376e-05, + "loss": 11.8721, + "step": 31350 + }, + { + "epoch": 1.7071872468740001, + "grad_norm": 0.5150195865782898, + "learning_rate": 1.1032239665550915e-05, + "loss": 11.7464, + "step": 31351 + }, + { + "epoch": 1.7072417008705831, + "grad_norm": 0.5898189938885433, + "learning_rate": 1.1028213689406309e-05, + "loss": 11.8358, + "step": 31352 + }, + { + "epoch": 1.7072961548671661, + "grad_norm": 0.6005633282977264, + "learning_rate": 1.1024188405121216e-05, + "loss": 11.8762, + "step": 31353 + }, + { + "epoch": 1.7073506088637493, + "grad_norm": 0.7128329843736801, + "learning_rate": 1.102016381272698e-05, + "loss": 11.8238, + "step": 31354 + }, + { + "epoch": 1.7074050628603323, + "grad_norm": 0.587798147854613, + "learning_rate": 1.1016139912254841e-05, + "loss": 11.9425, + "step": 31355 + }, + { + "epoch": 1.7074595168569153, + "grad_norm": 0.5373226019184096, + "learning_rate": 1.1012116703736153e-05, + "loss": 11.7459, + "step": 31356 + }, + { + "epoch": 1.7075139708534983, + "grad_norm": 0.5513145666790776, + "learning_rate": 1.1008094187202166e-05, + "loss": 11.6954, + "step": 31357 + }, + { + "epoch": 1.7075684248500813, + "grad_norm": 0.5214057416926633, + "learning_rate": 1.1004072362684136e-05, + "loss": 11.7938, + "step": 31358 + }, + { + "epoch": 1.7076228788466643, + "grad_norm": 0.4945392891809306, + "learning_rate": 1.1000051230213393e-05, + "loss": 11.7933, + "step": 31359 + }, + { + "epoch": 1.7076773328432475, + "grad_norm": 0.5116429250355039, + "learning_rate": 1.0996030789821143e-05, + "loss": 11.8087, + "step": 31360 + }, + { + "epoch": 1.7077317868398305, + "grad_norm": 0.5063420571560326, + "learning_rate": 1.0992011041538719e-05, + "loss": 11.8815, + "step": 31361 + }, + { + "epoch": 1.7077862408364135, + "grad_norm": 0.5517296998105604, + "learning_rate": 1.0987991985397317e-05, + "loss": 11.768, + "step": 31362 + }, + { + "epoch": 1.7078406948329965, + "grad_norm": 0.541663286380688, + "learning_rate": 1.0983973621428212e-05, + "loss": 11.7052, + "step": 31363 + }, + { + "epoch": 1.7078951488295795, + "grad_norm": 0.538583345271869, + "learning_rate": 1.097995594966268e-05, + "loss": 11.7137, + "step": 31364 + }, + { + "epoch": 1.7079496028261625, + "grad_norm": 0.5956429163476145, + "learning_rate": 1.0975938970131915e-05, + "loss": 11.8822, + "step": 31365 + }, + { + "epoch": 1.7080040568227455, + "grad_norm": 0.528189305513491, + "learning_rate": 1.0971922682867209e-05, + "loss": 11.9011, + "step": 31366 + }, + { + "epoch": 1.7080585108193285, + "grad_norm": 0.5457842011110622, + "learning_rate": 1.0967907087899754e-05, + "loss": 11.9135, + "step": 31367 + }, + { + "epoch": 1.7081129648159115, + "grad_norm": 0.5762829539871385, + "learning_rate": 1.0963892185260771e-05, + "loss": 11.8354, + "step": 31368 + }, + { + "epoch": 1.7081674188124945, + "grad_norm": 0.5604912212589517, + "learning_rate": 1.0959877974981503e-05, + "loss": 11.7911, + "step": 31369 + }, + { + "epoch": 1.7082218728090774, + "grad_norm": 0.5599812857831126, + "learning_rate": 1.0955864457093145e-05, + "loss": 11.807, + "step": 31370 + }, + { + "epoch": 1.7082763268056604, + "grad_norm": 0.5705282747501176, + "learning_rate": 1.0951851631626931e-05, + "loss": 11.7675, + "step": 31371 + }, + { + "epoch": 1.7083307808022434, + "grad_norm": 0.5512783051901707, + "learning_rate": 1.0947839498614032e-05, + "loss": 11.8431, + "step": 31372 + }, + { + "epoch": 1.7083852347988264, + "grad_norm": 0.5521971898170139, + "learning_rate": 1.0943828058085693e-05, + "loss": 11.774, + "step": 31373 + }, + { + "epoch": 1.7084396887954094, + "grad_norm": 0.49511653281167395, + "learning_rate": 1.0939817310073064e-05, + "loss": 11.7015, + "step": 31374 + }, + { + "epoch": 1.7084941427919924, + "grad_norm": 0.5341521834294026, + "learning_rate": 1.0935807254607344e-05, + "loss": 11.737, + "step": 31375 + }, + { + "epoch": 1.7085485967885754, + "grad_norm": 0.49326756019797263, + "learning_rate": 1.0931797891719742e-05, + "loss": 11.6405, + "step": 31376 + }, + { + "epoch": 1.7086030507851586, + "grad_norm": 0.5888327481424543, + "learning_rate": 1.092778922144142e-05, + "loss": 11.7765, + "step": 31377 + }, + { + "epoch": 1.7086575047817416, + "grad_norm": 0.5394675366004322, + "learning_rate": 1.0923781243803533e-05, + "loss": 11.7522, + "step": 31378 + }, + { + "epoch": 1.7087119587783246, + "grad_norm": 0.6782495984309581, + "learning_rate": 1.0919773958837266e-05, + "loss": 11.8773, + "step": 31379 + }, + { + "epoch": 1.7087664127749076, + "grad_norm": 0.5195367539642743, + "learning_rate": 1.091576736657377e-05, + "loss": 11.8558, + "step": 31380 + }, + { + "epoch": 1.7088208667714906, + "grad_norm": 0.5677555429094424, + "learning_rate": 1.0911761467044212e-05, + "loss": 11.7823, + "step": 31381 + }, + { + "epoch": 1.7088753207680736, + "grad_norm": 0.5161460961865567, + "learning_rate": 1.0907756260279734e-05, + "loss": 11.8444, + "step": 31382 + }, + { + "epoch": 1.7089297747646568, + "grad_norm": 0.5625333123438224, + "learning_rate": 1.0903751746311485e-05, + "loss": 11.7917, + "step": 31383 + }, + { + "epoch": 1.7089842287612398, + "grad_norm": 0.5187215304958623, + "learning_rate": 1.08997479251706e-05, + "loss": 11.8216, + "step": 31384 + }, + { + "epoch": 1.7090386827578228, + "grad_norm": 0.5733631636985802, + "learning_rate": 1.0895744796888207e-05, + "loss": 11.7785, + "step": 31385 + }, + { + "epoch": 1.7090931367544058, + "grad_norm": 0.5444131973315395, + "learning_rate": 1.0891742361495472e-05, + "loss": 11.7844, + "step": 31386 + }, + { + "epoch": 1.7091475907509888, + "grad_norm": 0.5296870045464985, + "learning_rate": 1.0887740619023489e-05, + "loss": 11.8324, + "step": 31387 + }, + { + "epoch": 1.7092020447475718, + "grad_norm": 0.5362580175654252, + "learning_rate": 1.0883739569503382e-05, + "loss": 11.8728, + "step": 31388 + }, + { + "epoch": 1.7092564987441548, + "grad_norm": 0.5050825661817893, + "learning_rate": 1.0879739212966233e-05, + "loss": 11.6665, + "step": 31389 + }, + { + "epoch": 1.7093109527407377, + "grad_norm": 0.5089122476644603, + "learning_rate": 1.0875739549443186e-05, + "loss": 11.6489, + "step": 31390 + }, + { + "epoch": 1.7093654067373207, + "grad_norm": 0.5137015693380984, + "learning_rate": 1.087174057896535e-05, + "loss": 11.7657, + "step": 31391 + }, + { + "epoch": 1.7094198607339037, + "grad_norm": 0.6671127362235856, + "learning_rate": 1.0867742301563788e-05, + "loss": 11.8941, + "step": 31392 + }, + { + "epoch": 1.7094743147304867, + "grad_norm": 0.5283914427623861, + "learning_rate": 1.086374471726963e-05, + "loss": 11.7407, + "step": 31393 + }, + { + "epoch": 1.7095287687270697, + "grad_norm": 0.5791147656313416, + "learning_rate": 1.085974782611392e-05, + "loss": 11.9786, + "step": 31394 + }, + { + "epoch": 1.7095832227236527, + "grad_norm": 0.5413001101306327, + "learning_rate": 1.0855751628127775e-05, + "loss": 11.7702, + "step": 31395 + }, + { + "epoch": 1.7096376767202357, + "grad_norm": 0.5410949087423563, + "learning_rate": 1.0851756123342239e-05, + "loss": 11.8466, + "step": 31396 + }, + { + "epoch": 1.7096921307168187, + "grad_norm": 0.5234902815370354, + "learning_rate": 1.0847761311788418e-05, + "loss": 11.8398, + "step": 31397 + }, + { + "epoch": 1.7097465847134017, + "grad_norm": 0.5276060194192779, + "learning_rate": 1.0843767193497356e-05, + "loss": 11.8515, + "step": 31398 + }, + { + "epoch": 1.7098010387099847, + "grad_norm": 0.5028690735956702, + "learning_rate": 1.0839773768500095e-05, + "loss": 11.8355, + "step": 31399 + }, + { + "epoch": 1.7098554927065677, + "grad_norm": 0.5399868910470234, + "learning_rate": 1.0835781036827697e-05, + "loss": 11.8094, + "step": 31400 + }, + { + "epoch": 1.7099099467031509, + "grad_norm": 0.6186854069007485, + "learning_rate": 1.0831788998511238e-05, + "loss": 11.8729, + "step": 31401 + }, + { + "epoch": 1.7099644006997339, + "grad_norm": 0.5169071866840222, + "learning_rate": 1.0827797653581728e-05, + "loss": 11.7763, + "step": 31402 + }, + { + "epoch": 1.7100188546963169, + "grad_norm": 0.5749566765483729, + "learning_rate": 1.082380700207024e-05, + "loss": 11.8186, + "step": 31403 + }, + { + "epoch": 1.7100733086928999, + "grad_norm": 0.5346734206873114, + "learning_rate": 1.0819817044007764e-05, + "loss": 11.8243, + "step": 31404 + }, + { + "epoch": 1.7101277626894829, + "grad_norm": 0.5721397827847031, + "learning_rate": 1.0815827779425359e-05, + "loss": 11.702, + "step": 31405 + }, + { + "epoch": 1.7101822166860658, + "grad_norm": 0.5159035606249273, + "learning_rate": 1.0811839208354014e-05, + "loss": 11.8388, + "step": 31406 + }, + { + "epoch": 1.710236670682649, + "grad_norm": 0.6457503850999484, + "learning_rate": 1.0807851330824792e-05, + "loss": 11.7218, + "step": 31407 + }, + { + "epoch": 1.710291124679232, + "grad_norm": 0.5418568786335738, + "learning_rate": 1.0803864146868669e-05, + "loss": 11.9103, + "step": 31408 + }, + { + "epoch": 1.710345578675815, + "grad_norm": 0.5107847895235661, + "learning_rate": 1.0799877656516633e-05, + "loss": 11.8648, + "step": 31409 + }, + { + "epoch": 1.710400032672398, + "grad_norm": 0.5459476998412349, + "learning_rate": 1.0795891859799734e-05, + "loss": 11.758, + "step": 31410 + }, + { + "epoch": 1.710454486668981, + "grad_norm": 0.6011175290544278, + "learning_rate": 1.0791906756748916e-05, + "loss": 11.8125, + "step": 31411 + }, + { + "epoch": 1.710508940665564, + "grad_norm": 0.5648296862396707, + "learning_rate": 1.0787922347395195e-05, + "loss": 11.8706, + "step": 31412 + }, + { + "epoch": 1.710563394662147, + "grad_norm": 0.6056109388881363, + "learning_rate": 1.0783938631769564e-05, + "loss": 11.8285, + "step": 31413 + }, + { + "epoch": 1.71061784865873, + "grad_norm": 0.5171633534586365, + "learning_rate": 1.077995560990297e-05, + "loss": 11.8017, + "step": 31414 + }, + { + "epoch": 1.710672302655313, + "grad_norm": 0.5490682626692884, + "learning_rate": 1.0775973281826424e-05, + "loss": 11.8877, + "step": 31415 + }, + { + "epoch": 1.710726756651896, + "grad_norm": 0.5379869833675414, + "learning_rate": 1.0771991647570856e-05, + "loss": 11.7853, + "step": 31416 + }, + { + "epoch": 1.710781210648479, + "grad_norm": 0.6122121634699095, + "learning_rate": 1.0768010707167264e-05, + "loss": 11.8365, + "step": 31417 + }, + { + "epoch": 1.710835664645062, + "grad_norm": 0.6593521026631303, + "learning_rate": 1.0764030460646579e-05, + "loss": 11.9166, + "step": 31418 + }, + { + "epoch": 1.710890118641645, + "grad_norm": 0.5613742894427809, + "learning_rate": 1.0760050908039742e-05, + "loss": 11.8424, + "step": 31419 + }, + { + "epoch": 1.710944572638228, + "grad_norm": 0.5385458958570473, + "learning_rate": 1.0756072049377742e-05, + "loss": 11.6688, + "step": 31420 + }, + { + "epoch": 1.710999026634811, + "grad_norm": 0.5693692277686349, + "learning_rate": 1.075209388469146e-05, + "loss": 11.8753, + "step": 31421 + }, + { + "epoch": 1.711053480631394, + "grad_norm": 0.5054031846875073, + "learning_rate": 1.0748116414011888e-05, + "loss": 11.7141, + "step": 31422 + }, + { + "epoch": 1.711107934627977, + "grad_norm": 0.5666660153708234, + "learning_rate": 1.074413963736991e-05, + "loss": 11.884, + "step": 31423 + }, + { + "epoch": 1.7111623886245602, + "grad_norm": 0.5416718000326497, + "learning_rate": 1.0740163554796478e-05, + "loss": 11.8489, + "step": 31424 + }, + { + "epoch": 1.7112168426211432, + "grad_norm": 0.5800941045494119, + "learning_rate": 1.0736188166322513e-05, + "loss": 11.8722, + "step": 31425 + }, + { + "epoch": 1.7112712966177261, + "grad_norm": 0.5321302466771534, + "learning_rate": 1.0732213471978902e-05, + "loss": 11.7248, + "step": 31426 + }, + { + "epoch": 1.7113257506143091, + "grad_norm": 0.6105716607849028, + "learning_rate": 1.0728239471796586e-05, + "loss": 11.9043, + "step": 31427 + }, + { + "epoch": 1.7113802046108921, + "grad_norm": 0.512140905505257, + "learning_rate": 1.072426616580645e-05, + "loss": 11.7281, + "step": 31428 + }, + { + "epoch": 1.7114346586074751, + "grad_norm": 0.5692570433891122, + "learning_rate": 1.0720293554039374e-05, + "loss": 11.8318, + "step": 31429 + }, + { + "epoch": 1.7114891126040583, + "grad_norm": 0.5329991484708254, + "learning_rate": 1.0716321636526295e-05, + "loss": 11.7278, + "step": 31430 + }, + { + "epoch": 1.7115435666006413, + "grad_norm": 0.5407219564206193, + "learning_rate": 1.0712350413298045e-05, + "loss": 11.8869, + "step": 31431 + }, + { + "epoch": 1.7115980205972243, + "grad_norm": 0.5466139944200886, + "learning_rate": 1.0708379884385545e-05, + "loss": 11.8161, + "step": 31432 + }, + { + "epoch": 1.7116524745938073, + "grad_norm": 0.5260275128237987, + "learning_rate": 1.0704410049819647e-05, + "loss": 11.7968, + "step": 31433 + }, + { + "epoch": 1.7117069285903903, + "grad_norm": 0.5113930171897116, + "learning_rate": 1.0700440909631226e-05, + "loss": 11.7194, + "step": 31434 + }, + { + "epoch": 1.7117613825869733, + "grad_norm": 0.5484905628324327, + "learning_rate": 1.0696472463851181e-05, + "loss": 11.7503, + "step": 31435 + }, + { + "epoch": 1.7118158365835563, + "grad_norm": 0.6031907273961457, + "learning_rate": 1.069250471251031e-05, + "loss": 11.7069, + "step": 31436 + }, + { + "epoch": 1.7118702905801393, + "grad_norm": 0.5901844988234785, + "learning_rate": 1.0688537655639552e-05, + "loss": 11.76, + "step": 31437 + }, + { + "epoch": 1.7119247445767223, + "grad_norm": 0.5501451352658523, + "learning_rate": 1.0684571293269652e-05, + "loss": 11.7001, + "step": 31438 + }, + { + "epoch": 1.7119791985733053, + "grad_norm": 0.5508893112889037, + "learning_rate": 1.0680605625431506e-05, + "loss": 11.7408, + "step": 31439 + }, + { + "epoch": 1.7120336525698883, + "grad_norm": 0.5197146997984214, + "learning_rate": 1.0676640652155979e-05, + "loss": 11.7606, + "step": 31440 + }, + { + "epoch": 1.7120881065664713, + "grad_norm": 0.6284286242143241, + "learning_rate": 1.0672676373473845e-05, + "loss": 11.8299, + "step": 31441 + }, + { + "epoch": 1.7121425605630542, + "grad_norm": 0.5178726043955564, + "learning_rate": 1.066871278941598e-05, + "loss": 11.7827, + "step": 31442 + }, + { + "epoch": 1.7121970145596372, + "grad_norm": 0.5233572969675926, + "learning_rate": 1.066474990001316e-05, + "loss": 11.7396, + "step": 31443 + }, + { + "epoch": 1.7122514685562202, + "grad_norm": 0.5556031437831046, + "learning_rate": 1.066078770529626e-05, + "loss": 11.7558, + "step": 31444 + }, + { + "epoch": 1.7123059225528032, + "grad_norm": 0.5105555233505923, + "learning_rate": 1.0656826205296021e-05, + "loss": 11.6405, + "step": 31445 + }, + { + "epoch": 1.7123603765493862, + "grad_norm": 0.5691749437304826, + "learning_rate": 1.0652865400043299e-05, + "loss": 11.6957, + "step": 31446 + }, + { + "epoch": 1.7124148305459694, + "grad_norm": 0.592482491596476, + "learning_rate": 1.0648905289568912e-05, + "loss": 11.9148, + "step": 31447 + }, + { + "epoch": 1.7124692845425524, + "grad_norm": 0.5470973254217619, + "learning_rate": 1.0644945873903579e-05, + "loss": 11.8666, + "step": 31448 + }, + { + "epoch": 1.7125237385391354, + "grad_norm": 0.5457120239878783, + "learning_rate": 1.064098715307813e-05, + "loss": 11.8624, + "step": 31449 + }, + { + "epoch": 1.7125781925357184, + "grad_norm": 0.5779835299810903, + "learning_rate": 1.0637029127123377e-05, + "loss": 11.8025, + "step": 31450 + }, + { + "epoch": 1.7126326465323014, + "grad_norm": 0.5365774391121249, + "learning_rate": 1.063307179607006e-05, + "loss": 11.7081, + "step": 31451 + }, + { + "epoch": 1.7126871005288844, + "grad_norm": 0.5367586795865453, + "learning_rate": 1.0629115159948966e-05, + "loss": 11.7081, + "step": 31452 + }, + { + "epoch": 1.7127415545254676, + "grad_norm": 0.5022251677931849, + "learning_rate": 1.0625159218790847e-05, + "loss": 11.6737, + "step": 31453 + }, + { + "epoch": 1.7127960085220506, + "grad_norm": 0.5299995102224879, + "learning_rate": 1.0621203972626504e-05, + "loss": 11.7842, + "step": 31454 + }, + { + "epoch": 1.7128504625186336, + "grad_norm": 0.5703988649620033, + "learning_rate": 1.0617249421486642e-05, + "loss": 11.8321, + "step": 31455 + }, + { + "epoch": 1.7129049165152166, + "grad_norm": 0.5735867008308759, + "learning_rate": 1.0613295565402038e-05, + "loss": 11.8049, + "step": 31456 + }, + { + "epoch": 1.7129593705117996, + "grad_norm": 0.5642239560494303, + "learning_rate": 1.0609342404403477e-05, + "loss": 11.7006, + "step": 31457 + }, + { + "epoch": 1.7130138245083826, + "grad_norm": 0.5533283490365261, + "learning_rate": 1.0605389938521626e-05, + "loss": 11.826, + "step": 31458 + }, + { + "epoch": 1.7130682785049656, + "grad_norm": 0.5759973187992862, + "learning_rate": 1.060143816778727e-05, + "loss": 11.9006, + "step": 31459 + }, + { + "epoch": 1.7131227325015486, + "grad_norm": 0.5610058868292839, + "learning_rate": 1.0597487092231096e-05, + "loss": 11.8556, + "step": 31460 + }, + { + "epoch": 1.7131771864981316, + "grad_norm": 0.6075491934975943, + "learning_rate": 1.0593536711883866e-05, + "loss": 11.8305, + "step": 31461 + }, + { + "epoch": 1.7132316404947145, + "grad_norm": 0.5660814834145764, + "learning_rate": 1.0589587026776304e-05, + "loss": 11.6515, + "step": 31462 + }, + { + "epoch": 1.7132860944912975, + "grad_norm": 0.5660420940437678, + "learning_rate": 1.0585638036939083e-05, + "loss": 11.661, + "step": 31463 + }, + { + "epoch": 1.7133405484878805, + "grad_norm": 0.5647383866968578, + "learning_rate": 1.0581689742402968e-05, + "loss": 11.7554, + "step": 31464 + }, + { + "epoch": 1.7133950024844635, + "grad_norm": 0.5337713886613089, + "learning_rate": 1.05777421431986e-05, + "loss": 11.8057, + "step": 31465 + }, + { + "epoch": 1.7134494564810465, + "grad_norm": 0.6145542922416759, + "learning_rate": 1.057379523935672e-05, + "loss": 11.6535, + "step": 31466 + }, + { + "epoch": 1.7135039104776295, + "grad_norm": 0.5585886103670066, + "learning_rate": 1.056984903090802e-05, + "loss": 11.8565, + "step": 31467 + }, + { + "epoch": 1.7135583644742125, + "grad_norm": 0.5203912155534972, + "learning_rate": 1.0565903517883135e-05, + "loss": 11.7955, + "step": 31468 + }, + { + "epoch": 1.7136128184707955, + "grad_norm": 0.5443404081948386, + "learning_rate": 1.0561958700312812e-05, + "loss": 11.8629, + "step": 31469 + }, + { + "epoch": 1.7136672724673785, + "grad_norm": 0.5563624426346335, + "learning_rate": 1.0558014578227671e-05, + "loss": 11.7182, + "step": 31470 + }, + { + "epoch": 1.7137217264639617, + "grad_norm": 0.5407574650049557, + "learning_rate": 1.055407115165844e-05, + "loss": 11.7053, + "step": 31471 + }, + { + "epoch": 1.7137761804605447, + "grad_norm": 0.6411292787717409, + "learning_rate": 1.0550128420635719e-05, + "loss": 11.8317, + "step": 31472 + }, + { + "epoch": 1.7138306344571277, + "grad_norm": 0.5362348810926708, + "learning_rate": 1.0546186385190204e-05, + "loss": 11.8486, + "step": 31473 + }, + { + "epoch": 1.7138850884537107, + "grad_norm": 0.5584341649374477, + "learning_rate": 1.0542245045352572e-05, + "loss": 11.7925, + "step": 31474 + }, + { + "epoch": 1.7139395424502937, + "grad_norm": 0.5238947057269379, + "learning_rate": 1.053830440115341e-05, + "loss": 11.8164, + "step": 31475 + }, + { + "epoch": 1.7139939964468769, + "grad_norm": 0.627934181047507, + "learning_rate": 1.0534364452623424e-05, + "loss": 11.9633, + "step": 31476 + }, + { + "epoch": 1.7140484504434599, + "grad_norm": 0.5471972408288592, + "learning_rate": 1.0530425199793226e-05, + "loss": 11.8643, + "step": 31477 + }, + { + "epoch": 1.7141029044400429, + "grad_norm": 0.5573526478679477, + "learning_rate": 1.0526486642693423e-05, + "loss": 11.7174, + "step": 31478 + }, + { + "epoch": 1.7141573584366259, + "grad_norm": 0.5196324123573932, + "learning_rate": 1.0522548781354691e-05, + "loss": 11.711, + "step": 31479 + }, + { + "epoch": 1.7142118124332089, + "grad_norm": 0.5325676929214292, + "learning_rate": 1.0518611615807594e-05, + "loss": 11.7579, + "step": 31480 + }, + { + "epoch": 1.7142662664297919, + "grad_norm": 0.574989754468607, + "learning_rate": 1.0514675146082808e-05, + "loss": 11.7042, + "step": 31481 + }, + { + "epoch": 1.7143207204263748, + "grad_norm": 0.5475284050653473, + "learning_rate": 1.0510739372210888e-05, + "loss": 11.7333, + "step": 31482 + }, + { + "epoch": 1.7143751744229578, + "grad_norm": 0.5267157853026788, + "learning_rate": 1.0506804294222473e-05, + "loss": 11.8675, + "step": 31483 + }, + { + "epoch": 1.7144296284195408, + "grad_norm": 0.5417543576027039, + "learning_rate": 1.0502869912148172e-05, + "loss": 11.777, + "step": 31484 + }, + { + "epoch": 1.7144840824161238, + "grad_norm": 0.5439329150693312, + "learning_rate": 1.049893622601854e-05, + "loss": 11.803, + "step": 31485 + }, + { + "epoch": 1.7145385364127068, + "grad_norm": 0.6262666704501959, + "learning_rate": 1.0495003235864209e-05, + "loss": 11.8466, + "step": 31486 + }, + { + "epoch": 1.7145929904092898, + "grad_norm": 0.5329814841722585, + "learning_rate": 1.0491070941715752e-05, + "loss": 11.748, + "step": 31487 + }, + { + "epoch": 1.7146474444058728, + "grad_norm": 0.5956165880943686, + "learning_rate": 1.0487139343603702e-05, + "loss": 11.7892, + "step": 31488 + }, + { + "epoch": 1.7147018984024558, + "grad_norm": 0.5532086445924472, + "learning_rate": 1.04832084415587e-05, + "loss": 11.7403, + "step": 31489 + }, + { + "epoch": 1.7147563523990388, + "grad_norm": 0.5494991855615453, + "learning_rate": 1.0479278235611267e-05, + "loss": 11.7501, + "step": 31490 + }, + { + "epoch": 1.7148108063956218, + "grad_norm": 0.5586482148519528, + "learning_rate": 1.047534872579199e-05, + "loss": 11.7576, + "step": 31491 + }, + { + "epoch": 1.7148652603922048, + "grad_norm": 0.6106049331093408, + "learning_rate": 1.0471419912131396e-05, + "loss": 11.7508, + "step": 31492 + }, + { + "epoch": 1.7149197143887878, + "grad_norm": 0.5640251305216247, + "learning_rate": 1.0467491794660066e-05, + "loss": 11.7891, + "step": 31493 + }, + { + "epoch": 1.714974168385371, + "grad_norm": 0.5420770331907155, + "learning_rate": 1.046356437340853e-05, + "loss": 11.8908, + "step": 31494 + }, + { + "epoch": 1.715028622381954, + "grad_norm": 0.5660027284710281, + "learning_rate": 1.0459637648407328e-05, + "loss": 11.8144, + "step": 31495 + }, + { + "epoch": 1.715083076378537, + "grad_norm": 0.580241096977362, + "learning_rate": 1.0455711619687014e-05, + "loss": 11.8834, + "step": 31496 + }, + { + "epoch": 1.71513753037512, + "grad_norm": 0.5168530731885738, + "learning_rate": 1.0451786287278097e-05, + "loss": 11.8216, + "step": 31497 + }, + { + "epoch": 1.715191984371703, + "grad_norm": 0.6908020811749591, + "learning_rate": 1.0447861651211099e-05, + "loss": 11.9295, + "step": 31498 + }, + { + "epoch": 1.715246438368286, + "grad_norm": 0.523079280413465, + "learning_rate": 1.0443937711516571e-05, + "loss": 11.7284, + "step": 31499 + }, + { + "epoch": 1.7153008923648692, + "grad_norm": 0.6298982791860973, + "learning_rate": 1.0440014468224968e-05, + "loss": 11.745, + "step": 31500 + }, + { + "epoch": 1.7153553463614521, + "grad_norm": 0.5342697318723727, + "learning_rate": 1.0436091921366863e-05, + "loss": 11.8669, + "step": 31501 + }, + { + "epoch": 1.7154098003580351, + "grad_norm": 0.6044304543034712, + "learning_rate": 1.0432170070972702e-05, + "loss": 11.8372, + "step": 31502 + }, + { + "epoch": 1.7154642543546181, + "grad_norm": 0.5340018605607262, + "learning_rate": 1.0428248917073025e-05, + "loss": 11.7431, + "step": 31503 + }, + { + "epoch": 1.7155187083512011, + "grad_norm": 0.6007453410235755, + "learning_rate": 1.0424328459698285e-05, + "loss": 11.8048, + "step": 31504 + }, + { + "epoch": 1.7155731623477841, + "grad_norm": 0.5575050090038742, + "learning_rate": 1.0420408698878992e-05, + "loss": 11.8508, + "step": 31505 + }, + { + "epoch": 1.7156276163443671, + "grad_norm": 0.5591948108676068, + "learning_rate": 1.0416489634645644e-05, + "loss": 11.813, + "step": 31506 + }, + { + "epoch": 1.71568207034095, + "grad_norm": 0.5707980019885782, + "learning_rate": 1.0412571267028704e-05, + "loss": 11.8364, + "step": 31507 + }, + { + "epoch": 1.715736524337533, + "grad_norm": 0.5400907530965886, + "learning_rate": 1.0408653596058615e-05, + "loss": 11.7201, + "step": 31508 + }, + { + "epoch": 1.715790978334116, + "grad_norm": 0.5476077115514427, + "learning_rate": 1.0404736621765854e-05, + "loss": 11.7321, + "step": 31509 + }, + { + "epoch": 1.715845432330699, + "grad_norm": 0.5132813267877665, + "learning_rate": 1.0400820344180884e-05, + "loss": 11.8285, + "step": 31510 + }, + { + "epoch": 1.715899886327282, + "grad_norm": 0.5453485699056855, + "learning_rate": 1.039690476333418e-05, + "loss": 11.8487, + "step": 31511 + }, + { + "epoch": 1.715954340323865, + "grad_norm": 0.5177560533775238, + "learning_rate": 1.0392989879256142e-05, + "loss": 11.6971, + "step": 31512 + }, + { + "epoch": 1.716008794320448, + "grad_norm": 0.5625019384813693, + "learning_rate": 1.038907569197728e-05, + "loss": 11.7111, + "step": 31513 + }, + { + "epoch": 1.716063248317031, + "grad_norm": 0.5604026690973885, + "learning_rate": 1.0385162201527954e-05, + "loss": 11.8006, + "step": 31514 + }, + { + "epoch": 1.716117702313614, + "grad_norm": 0.6216431233268295, + "learning_rate": 1.0381249407938664e-05, + "loss": 11.7218, + "step": 31515 + }, + { + "epoch": 1.716172156310197, + "grad_norm": 0.5785475638762526, + "learning_rate": 1.0377337311239787e-05, + "loss": 11.8406, + "step": 31516 + }, + { + "epoch": 1.7162266103067803, + "grad_norm": 0.6282864620618059, + "learning_rate": 1.0373425911461764e-05, + "loss": 11.9044, + "step": 31517 + }, + { + "epoch": 1.7162810643033632, + "grad_norm": 0.5517508946106106, + "learning_rate": 1.0369515208635061e-05, + "loss": 11.6555, + "step": 31518 + }, + { + "epoch": 1.7163355182999462, + "grad_norm": 0.5468075888658445, + "learning_rate": 1.0365605202789986e-05, + "loss": 11.6919, + "step": 31519 + }, + { + "epoch": 1.7163899722965292, + "grad_norm": 0.5434999254704422, + "learning_rate": 1.0361695893957001e-05, + "loss": 11.801, + "step": 31520 + }, + { + "epoch": 1.7164444262931122, + "grad_norm": 0.49435438127943954, + "learning_rate": 1.035778728216652e-05, + "loss": 11.8131, + "step": 31521 + }, + { + "epoch": 1.7164988802896952, + "grad_norm": 0.5170349555973794, + "learning_rate": 1.0353879367448905e-05, + "loss": 11.6788, + "step": 31522 + }, + { + "epoch": 1.7165533342862784, + "grad_norm": 0.5494413014674582, + "learning_rate": 1.0349972149834574e-05, + "loss": 11.8655, + "step": 31523 + }, + { + "epoch": 1.7166077882828614, + "grad_norm": 0.532723027958204, + "learning_rate": 1.0346065629353874e-05, + "loss": 11.7205, + "step": 31524 + }, + { + "epoch": 1.7166622422794444, + "grad_norm": 0.5396919734135337, + "learning_rate": 1.034215980603721e-05, + "loss": 11.6964, + "step": 31525 + }, + { + "epoch": 1.7167166962760274, + "grad_norm": 0.5429105082128733, + "learning_rate": 1.0338254679914939e-05, + "loss": 11.8505, + "step": 31526 + }, + { + "epoch": 1.7167711502726104, + "grad_norm": 0.5963999813032983, + "learning_rate": 1.0334350251017455e-05, + "loss": 11.7538, + "step": 31527 + }, + { + "epoch": 1.7168256042691934, + "grad_norm": 0.6194946283530616, + "learning_rate": 1.0330446519375104e-05, + "loss": 11.8054, + "step": 31528 + }, + { + "epoch": 1.7168800582657764, + "grad_norm": 0.4968158802263791, + "learning_rate": 1.0326543485018214e-05, + "loss": 11.7892, + "step": 31529 + }, + { + "epoch": 1.7169345122623594, + "grad_norm": 0.5616479182415575, + "learning_rate": 1.0322641147977185e-05, + "loss": 11.7664, + "step": 31530 + }, + { + "epoch": 1.7169889662589424, + "grad_norm": 0.5527259225655293, + "learning_rate": 1.0318739508282305e-05, + "loss": 11.869, + "step": 31531 + }, + { + "epoch": 1.7170434202555254, + "grad_norm": 0.5166681923881729, + "learning_rate": 1.0314838565963958e-05, + "loss": 11.9058, + "step": 31532 + }, + { + "epoch": 1.7170978742521084, + "grad_norm": 0.5237575025103077, + "learning_rate": 1.0310938321052477e-05, + "loss": 11.8324, + "step": 31533 + }, + { + "epoch": 1.7171523282486914, + "grad_norm": 0.5178726626154517, + "learning_rate": 1.030703877357817e-05, + "loss": 11.8695, + "step": 31534 + }, + { + "epoch": 1.7172067822452743, + "grad_norm": 0.5688868199917441, + "learning_rate": 1.0303139923571393e-05, + "loss": 11.833, + "step": 31535 + }, + { + "epoch": 1.7172612362418573, + "grad_norm": 0.5133095572706747, + "learning_rate": 1.0299241771062406e-05, + "loss": 11.7907, + "step": 31536 + }, + { + "epoch": 1.7173156902384403, + "grad_norm": 0.5517265001372719, + "learning_rate": 1.029534431608159e-05, + "loss": 11.8537, + "step": 31537 + }, + { + "epoch": 1.7173701442350233, + "grad_norm": 0.6458168601278518, + "learning_rate": 1.0291447558659218e-05, + "loss": 11.8367, + "step": 31538 + }, + { + "epoch": 1.7174245982316063, + "grad_norm": 0.6796053357856466, + "learning_rate": 1.0287551498825575e-05, + "loss": 11.7266, + "step": 31539 + }, + { + "epoch": 1.7174790522281893, + "grad_norm": 0.5672932565031154, + "learning_rate": 1.0283656136610997e-05, + "loss": 11.7974, + "step": 31540 + }, + { + "epoch": 1.7175335062247725, + "grad_norm": 0.5717451313671872, + "learning_rate": 1.0279761472045735e-05, + "loss": 11.744, + "step": 31541 + }, + { + "epoch": 1.7175879602213555, + "grad_norm": 0.5334852185932999, + "learning_rate": 1.0275867505160108e-05, + "loss": 11.8491, + "step": 31542 + }, + { + "epoch": 1.7176424142179385, + "grad_norm": 0.590529124136015, + "learning_rate": 1.0271974235984372e-05, + "loss": 11.8387, + "step": 31543 + }, + { + "epoch": 1.7176968682145215, + "grad_norm": 0.5984197892987017, + "learning_rate": 1.0268081664548802e-05, + "loss": 11.8591, + "step": 31544 + }, + { + "epoch": 1.7177513222111045, + "grad_norm": 0.5299065613184486, + "learning_rate": 1.0264189790883693e-05, + "loss": 11.5362, + "step": 31545 + }, + { + "epoch": 1.7178057762076877, + "grad_norm": 0.5150687628781443, + "learning_rate": 1.0260298615019281e-05, + "loss": 11.6615, + "step": 31546 + }, + { + "epoch": 1.7178602302042707, + "grad_norm": 0.575100554428203, + "learning_rate": 1.0256408136985862e-05, + "loss": 11.8952, + "step": 31547 + }, + { + "epoch": 1.7179146842008537, + "grad_norm": 0.5113283756035647, + "learning_rate": 1.0252518356813657e-05, + "loss": 11.7354, + "step": 31548 + }, + { + "epoch": 1.7179691381974367, + "grad_norm": 0.5525406438555145, + "learning_rate": 1.0248629274532895e-05, + "loss": 11.8523, + "step": 31549 + }, + { + "epoch": 1.7180235921940197, + "grad_norm": 0.5285956182745611, + "learning_rate": 1.0244740890173865e-05, + "loss": 11.6948, + "step": 31550 + }, + { + "epoch": 1.7180780461906027, + "grad_norm": 0.5017862131373222, + "learning_rate": 1.0240853203766764e-05, + "loss": 11.7097, + "step": 31551 + }, + { + "epoch": 1.7181325001871857, + "grad_norm": 0.6261660800804799, + "learning_rate": 1.023696621534187e-05, + "loss": 11.8221, + "step": 31552 + }, + { + "epoch": 1.7181869541837687, + "grad_norm": 0.5545612695863086, + "learning_rate": 1.0233079924929346e-05, + "loss": 11.922, + "step": 31553 + }, + { + "epoch": 1.7182414081803516, + "grad_norm": 0.5104973443489229, + "learning_rate": 1.0229194332559456e-05, + "loss": 11.9069, + "step": 31554 + }, + { + "epoch": 1.7182958621769346, + "grad_norm": 0.5420941901474664, + "learning_rate": 1.0225309438262421e-05, + "loss": 11.7832, + "step": 31555 + }, + { + "epoch": 1.7183503161735176, + "grad_norm": 0.553564900845983, + "learning_rate": 1.0221425242068417e-05, + "loss": 11.9343, + "step": 31556 + }, + { + "epoch": 1.7184047701701006, + "grad_norm": 0.5617533403323572, + "learning_rate": 1.0217541744007687e-05, + "loss": 11.7281, + "step": 31557 + }, + { + "epoch": 1.7184592241666836, + "grad_norm": 0.5345195734226562, + "learning_rate": 1.0213658944110404e-05, + "loss": 11.7387, + "step": 31558 + }, + { + "epoch": 1.7185136781632666, + "grad_norm": 0.5405528268589259, + "learning_rate": 1.020977684240675e-05, + "loss": 11.7239, + "step": 31559 + }, + { + "epoch": 1.7185681321598496, + "grad_norm": 0.5258780473731511, + "learning_rate": 1.0205895438926949e-05, + "loss": 11.6759, + "step": 31560 + }, + { + "epoch": 1.7186225861564326, + "grad_norm": 0.5641976631201479, + "learning_rate": 1.0202014733701138e-05, + "loss": 11.7516, + "step": 31561 + }, + { + "epoch": 1.7186770401530156, + "grad_norm": 0.5717027721752065, + "learning_rate": 1.0198134726759545e-05, + "loss": 11.7368, + "step": 31562 + }, + { + "epoch": 1.7187314941495986, + "grad_norm": 0.6088941775585593, + "learning_rate": 1.0194255418132292e-05, + "loss": 11.9148, + "step": 31563 + }, + { + "epoch": 1.7187859481461818, + "grad_norm": 0.5165406297448353, + "learning_rate": 1.01903768078496e-05, + "loss": 11.7316, + "step": 31564 + }, + { + "epoch": 1.7188404021427648, + "grad_norm": 0.5166590946357978, + "learning_rate": 1.0186498895941566e-05, + "loss": 11.7439, + "step": 31565 + }, + { + "epoch": 1.7188948561393478, + "grad_norm": 0.5265213241462661, + "learning_rate": 1.0182621682438386e-05, + "loss": 11.8408, + "step": 31566 + }, + { + "epoch": 1.7189493101359308, + "grad_norm": 0.5631334509084922, + "learning_rate": 1.0178745167370218e-05, + "loss": 11.8852, + "step": 31567 + }, + { + "epoch": 1.7190037641325138, + "grad_norm": 0.579931971715743, + "learning_rate": 1.017486935076719e-05, + "loss": 11.8404, + "step": 31568 + }, + { + "epoch": 1.7190582181290968, + "grad_norm": 0.6461344153584181, + "learning_rate": 1.0170994232659425e-05, + "loss": 11.8433, + "step": 31569 + }, + { + "epoch": 1.71911267212568, + "grad_norm": 0.5633979225841509, + "learning_rate": 1.0167119813077097e-05, + "loss": 11.7002, + "step": 31570 + }, + { + "epoch": 1.719167126122263, + "grad_norm": 0.5530773534330833, + "learning_rate": 1.0163246092050283e-05, + "loss": 11.7217, + "step": 31571 + }, + { + "epoch": 1.719221580118846, + "grad_norm": 0.5637321739375956, + "learning_rate": 1.0159373069609157e-05, + "loss": 11.7768, + "step": 31572 + }, + { + "epoch": 1.719276034115429, + "grad_norm": 0.5270648099352024, + "learning_rate": 1.0155500745783797e-05, + "loss": 11.725, + "step": 31573 + }, + { + "epoch": 1.719330488112012, + "grad_norm": 0.5248390565762538, + "learning_rate": 1.0151629120604345e-05, + "loss": 11.8033, + "step": 31574 + }, + { + "epoch": 1.719384942108595, + "grad_norm": 0.53525601514702, + "learning_rate": 1.0147758194100864e-05, + "loss": 11.7964, + "step": 31575 + }, + { + "epoch": 1.719439396105178, + "grad_norm": 0.564528381781861, + "learning_rate": 1.0143887966303512e-05, + "loss": 11.7376, + "step": 31576 + }, + { + "epoch": 1.719493850101761, + "grad_norm": 0.5347051331277457, + "learning_rate": 1.0140018437242338e-05, + "loss": 11.7006, + "step": 31577 + }, + { + "epoch": 1.719548304098344, + "grad_norm": 0.5297048389539417, + "learning_rate": 1.0136149606947466e-05, + "loss": 11.7363, + "step": 31578 + }, + { + "epoch": 1.719602758094927, + "grad_norm": 0.5562468619462452, + "learning_rate": 1.0132281475448967e-05, + "loss": 11.7608, + "step": 31579 + }, + { + "epoch": 1.71965721209151, + "grad_norm": 0.5929683492025817, + "learning_rate": 1.0128414042776901e-05, + "loss": 11.92, + "step": 31580 + }, + { + "epoch": 1.719711666088093, + "grad_norm": 0.5721271283055273, + "learning_rate": 1.0124547308961352e-05, + "loss": 11.6951, + "step": 31581 + }, + { + "epoch": 1.7197661200846759, + "grad_norm": 0.5579167792201388, + "learning_rate": 1.0120681274032417e-05, + "loss": 11.7907, + "step": 31582 + }, + { + "epoch": 1.7198205740812589, + "grad_norm": 0.497027615107683, + "learning_rate": 1.0116815938020119e-05, + "loss": 11.7754, + "step": 31583 + }, + { + "epoch": 1.7198750280778419, + "grad_norm": 0.5914398622635827, + "learning_rate": 1.0112951300954554e-05, + "loss": 11.9825, + "step": 31584 + }, + { + "epoch": 1.7199294820744249, + "grad_norm": 0.4773284932550976, + "learning_rate": 1.0109087362865732e-05, + "loss": 11.7669, + "step": 31585 + }, + { + "epoch": 1.7199839360710079, + "grad_norm": 0.5543281504887684, + "learning_rate": 1.0105224123783742e-05, + "loss": 11.7636, + "step": 31586 + }, + { + "epoch": 1.720038390067591, + "grad_norm": 0.5437165446321306, + "learning_rate": 1.010136158373859e-05, + "loss": 11.7309, + "step": 31587 + }, + { + "epoch": 1.720092844064174, + "grad_norm": 0.5741886097642697, + "learning_rate": 1.0097499742760342e-05, + "loss": 11.8462, + "step": 31588 + }, + { + "epoch": 1.720147298060757, + "grad_norm": 0.5216243349159138, + "learning_rate": 1.0093638600879008e-05, + "loss": 11.6872, + "step": 31589 + }, + { + "epoch": 1.72020175205734, + "grad_norm": 0.5493658476374641, + "learning_rate": 1.0089778158124596e-05, + "loss": 11.7145, + "step": 31590 + }, + { + "epoch": 1.720256206053923, + "grad_norm": 0.4942337410589276, + "learning_rate": 1.0085918414527174e-05, + "loss": 11.5876, + "step": 31591 + }, + { + "epoch": 1.720310660050506, + "grad_norm": 0.49565025243563043, + "learning_rate": 1.0082059370116714e-05, + "loss": 11.7896, + "step": 31592 + }, + { + "epoch": 1.7203651140470892, + "grad_norm": 0.5346386801223755, + "learning_rate": 1.0078201024923228e-05, + "loss": 11.7027, + "step": 31593 + }, + { + "epoch": 1.7204195680436722, + "grad_norm": 0.5325015247587378, + "learning_rate": 1.0074343378976758e-05, + "loss": 11.767, + "step": 31594 + }, + { + "epoch": 1.7204740220402552, + "grad_norm": 0.5011686389998581, + "learning_rate": 1.0070486432307257e-05, + "loss": 11.7314, + "step": 31595 + }, + { + "epoch": 1.7205284760368382, + "grad_norm": 0.5931487508652592, + "learning_rate": 1.0066630184944748e-05, + "loss": 11.7359, + "step": 31596 + }, + { + "epoch": 1.7205829300334212, + "grad_norm": 0.5578300443420816, + "learning_rate": 1.0062774636919181e-05, + "loss": 11.7218, + "step": 31597 + }, + { + "epoch": 1.7206373840300042, + "grad_norm": 0.4853604650187636, + "learning_rate": 1.005891978826059e-05, + "loss": 11.648, + "step": 31598 + }, + { + "epoch": 1.7206918380265872, + "grad_norm": 0.5662665237519126, + "learning_rate": 1.0055065638998917e-05, + "loss": 11.785, + "step": 31599 + }, + { + "epoch": 1.7207462920231702, + "grad_norm": 0.583786061851586, + "learning_rate": 1.0051212189164117e-05, + "loss": 11.8159, + "step": 31600 + }, + { + "epoch": 1.7208007460197532, + "grad_norm": 0.5268813172833314, + "learning_rate": 1.0047359438786197e-05, + "loss": 11.766, + "step": 31601 + }, + { + "epoch": 1.7208552000163362, + "grad_norm": 0.5665939415294465, + "learning_rate": 1.004350738789508e-05, + "loss": 11.8127, + "step": 31602 + }, + { + "epoch": 1.7209096540129192, + "grad_norm": 0.5773183052607285, + "learning_rate": 1.0039656036520728e-05, + "loss": 11.7662, + "step": 31603 + }, + { + "epoch": 1.7209641080095022, + "grad_norm": 0.5510697257439507, + "learning_rate": 1.003580538469312e-05, + "loss": 11.8002, + "step": 31604 + }, + { + "epoch": 1.7210185620060852, + "grad_norm": 0.580086336691622, + "learning_rate": 1.0031955432442153e-05, + "loss": 11.912, + "step": 31605 + }, + { + "epoch": 1.7210730160026682, + "grad_norm": 0.5022698051681344, + "learning_rate": 1.0028106179797813e-05, + "loss": 11.6991, + "step": 31606 + }, + { + "epoch": 1.7211274699992511, + "grad_norm": 0.5138518962521057, + "learning_rate": 1.002425762678999e-05, + "loss": 11.7953, + "step": 31607 + }, + { + "epoch": 1.7211819239958341, + "grad_norm": 0.5711216960217186, + "learning_rate": 1.0020409773448637e-05, + "loss": 11.7749, + "step": 31608 + }, + { + "epoch": 1.7212363779924171, + "grad_norm": 0.5005861228314973, + "learning_rate": 1.0016562619803682e-05, + "loss": 11.5601, + "step": 31609 + }, + { + "epoch": 1.7212908319890003, + "grad_norm": 0.5337994292374386, + "learning_rate": 1.0012716165884994e-05, + "loss": 11.8073, + "step": 31610 + }, + { + "epoch": 1.7213452859855833, + "grad_norm": 0.6007049351191095, + "learning_rate": 1.0008870411722537e-05, + "loss": 11.7589, + "step": 31611 + }, + { + "epoch": 1.7213997399821663, + "grad_norm": 0.5666830835561676, + "learning_rate": 1.0005025357346187e-05, + "loss": 11.8024, + "step": 31612 + }, + { + "epoch": 1.7214541939787493, + "grad_norm": 0.6016147626228205, + "learning_rate": 1.0001181002785864e-05, + "loss": 11.9061, + "step": 31613 + }, + { + "epoch": 1.7215086479753323, + "grad_norm": 0.5700946700385966, + "learning_rate": 9.997337348071423e-06, + "loss": 11.7835, + "step": 31614 + }, + { + "epoch": 1.7215631019719153, + "grad_norm": 0.6276406480298716, + "learning_rate": 9.993494393232795e-06, + "loss": 11.8186, + "step": 31615 + }, + { + "epoch": 1.7216175559684985, + "grad_norm": 0.5382441499206801, + "learning_rate": 9.989652138299854e-06, + "loss": 11.6891, + "step": 31616 + }, + { + "epoch": 1.7216720099650815, + "grad_norm": 0.5526851264588184, + "learning_rate": 9.985810583302457e-06, + "loss": 11.7511, + "step": 31617 + }, + { + "epoch": 1.7217264639616645, + "grad_norm": 0.5077383918843621, + "learning_rate": 9.981969728270524e-06, + "loss": 11.8187, + "step": 31618 + }, + { + "epoch": 1.7217809179582475, + "grad_norm": 0.559844989293298, + "learning_rate": 9.978129573233875e-06, + "loss": 11.7314, + "step": 31619 + }, + { + "epoch": 1.7218353719548305, + "grad_norm": 0.5416457035722602, + "learning_rate": 9.974290118222374e-06, + "loss": 11.8057, + "step": 31620 + }, + { + "epoch": 1.7218898259514135, + "grad_norm": 0.49255162000816144, + "learning_rate": 9.970451363265909e-06, + "loss": 11.821, + "step": 31621 + }, + { + "epoch": 1.7219442799479965, + "grad_norm": 0.5756449643012322, + "learning_rate": 9.9666133083943e-06, + "loss": 11.7517, + "step": 31622 + }, + { + "epoch": 1.7219987339445795, + "grad_norm": 0.554707897770957, + "learning_rate": 9.962775953637416e-06, + "loss": 11.7847, + "step": 31623 + }, + { + "epoch": 1.7220531879411625, + "grad_norm": 0.5579892266497304, + "learning_rate": 9.95893929902506e-06, + "loss": 11.7983, + "step": 31624 + }, + { + "epoch": 1.7221076419377455, + "grad_norm": 0.5721832657330594, + "learning_rate": 9.955103344587125e-06, + "loss": 11.78, + "step": 31625 + }, + { + "epoch": 1.7221620959343285, + "grad_norm": 0.5286547776030073, + "learning_rate": 9.951268090353382e-06, + "loss": 11.8571, + "step": 31626 + }, + { + "epoch": 1.7222165499309114, + "grad_norm": 0.6085563065468181, + "learning_rate": 9.947433536353679e-06, + "loss": 11.9021, + "step": 31627 + }, + { + "epoch": 1.7222710039274944, + "grad_norm": 0.49485171895229996, + "learning_rate": 9.943599682617865e-06, + "loss": 11.7653, + "step": 31628 + }, + { + "epoch": 1.7223254579240774, + "grad_norm": 0.5236511628822097, + "learning_rate": 9.939766529175698e-06, + "loss": 11.6727, + "step": 31629 + }, + { + "epoch": 1.7223799119206604, + "grad_norm": 0.5440952775692672, + "learning_rate": 9.935934076057008e-06, + "loss": 11.8226, + "step": 31630 + }, + { + "epoch": 1.7224343659172434, + "grad_norm": 0.5611291784323829, + "learning_rate": 9.932102323291603e-06, + "loss": 11.8492, + "step": 31631 + }, + { + "epoch": 1.7224888199138264, + "grad_norm": 0.5407081392247196, + "learning_rate": 9.928271270909273e-06, + "loss": 11.7461, + "step": 31632 + }, + { + "epoch": 1.7225432739104094, + "grad_norm": 0.5431104522580013, + "learning_rate": 9.924440918939814e-06, + "loss": 11.8053, + "step": 31633 + }, + { + "epoch": 1.7225977279069926, + "grad_norm": 0.5648261138440481, + "learning_rate": 9.920611267413005e-06, + "loss": 11.7403, + "step": 31634 + }, + { + "epoch": 1.7226521819035756, + "grad_norm": 0.6562552367201177, + "learning_rate": 9.91678231635864e-06, + "loss": 11.8344, + "step": 31635 + }, + { + "epoch": 1.7227066359001586, + "grad_norm": 0.6069501439000724, + "learning_rate": 9.912954065806468e-06, + "loss": 11.7119, + "step": 31636 + }, + { + "epoch": 1.7227610898967416, + "grad_norm": 0.5926715108090882, + "learning_rate": 9.90912651578626e-06, + "loss": 11.8739, + "step": 31637 + }, + { + "epoch": 1.7228155438933246, + "grad_norm": 0.57333652997764, + "learning_rate": 9.905299666327838e-06, + "loss": 11.9253, + "step": 31638 + }, + { + "epoch": 1.7228699978899076, + "grad_norm": 0.5179009446000575, + "learning_rate": 9.90147351746088e-06, + "loss": 11.7079, + "step": 31639 + }, + { + "epoch": 1.7229244518864908, + "grad_norm": 0.5927678052247316, + "learning_rate": 9.897648069215193e-06, + "loss": 11.7076, + "step": 31640 + }, + { + "epoch": 1.7229789058830738, + "grad_norm": 0.5420398795968947, + "learning_rate": 9.893823321620488e-06, + "loss": 11.8555, + "step": 31641 + }, + { + "epoch": 1.7230333598796568, + "grad_norm": 0.7032982705210287, + "learning_rate": 9.889999274706518e-06, + "loss": 11.8216, + "step": 31642 + }, + { + "epoch": 1.7230878138762398, + "grad_norm": 0.5456092851715043, + "learning_rate": 9.886175928503038e-06, + "loss": 11.8308, + "step": 31643 + }, + { + "epoch": 1.7231422678728228, + "grad_norm": 0.5144023790602227, + "learning_rate": 9.882353283039758e-06, + "loss": 11.8272, + "step": 31644 + }, + { + "epoch": 1.7231967218694058, + "grad_norm": 0.5219399078420015, + "learning_rate": 9.87853133834643e-06, + "loss": 11.7801, + "step": 31645 + }, + { + "epoch": 1.7232511758659887, + "grad_norm": 0.5264655189569761, + "learning_rate": 9.874710094452733e-06, + "loss": 11.7471, + "step": 31646 + }, + { + "epoch": 1.7233056298625717, + "grad_norm": 0.5354346228133944, + "learning_rate": 9.870889551388419e-06, + "loss": 11.7905, + "step": 31647 + }, + { + "epoch": 1.7233600838591547, + "grad_norm": 0.5272933071145303, + "learning_rate": 9.867069709183186e-06, + "loss": 11.6881, + "step": 31648 + }, + { + "epoch": 1.7234145378557377, + "grad_norm": 0.5897080081996334, + "learning_rate": 9.863250567866721e-06, + "loss": 11.7322, + "step": 31649 + }, + { + "epoch": 1.7234689918523207, + "grad_norm": 0.5078046384524149, + "learning_rate": 9.859432127468748e-06, + "loss": 11.8635, + "step": 31650 + }, + { + "epoch": 1.7235234458489037, + "grad_norm": 0.637272507312113, + "learning_rate": 9.85561438801893e-06, + "loss": 11.7572, + "step": 31651 + }, + { + "epoch": 1.7235778998454867, + "grad_norm": 0.560055228726501, + "learning_rate": 9.851797349546976e-06, + "loss": 11.7605, + "step": 31652 + }, + { + "epoch": 1.7236323538420697, + "grad_norm": 0.5062532132317071, + "learning_rate": 9.847981012082574e-06, + "loss": 11.6805, + "step": 31653 + }, + { + "epoch": 1.7236868078386527, + "grad_norm": 0.5877989342666577, + "learning_rate": 9.844165375655379e-06, + "loss": 11.7646, + "step": 31654 + }, + { + "epoch": 1.7237412618352357, + "grad_norm": 0.5330469216308327, + "learning_rate": 9.840350440295088e-06, + "loss": 11.6925, + "step": 31655 + }, + { + "epoch": 1.7237957158318187, + "grad_norm": 0.5621134833465058, + "learning_rate": 9.836536206031333e-06, + "loss": 11.8281, + "step": 31656 + }, + { + "epoch": 1.723850169828402, + "grad_norm": 0.5405881843850385, + "learning_rate": 9.832722672893812e-06, + "loss": 11.6365, + "step": 31657 + }, + { + "epoch": 1.7239046238249849, + "grad_norm": 0.5139143593961488, + "learning_rate": 9.82890984091216e-06, + "loss": 11.7755, + "step": 31658 + }, + { + "epoch": 1.7239590778215679, + "grad_norm": 0.5470150575024245, + "learning_rate": 9.825097710116016e-06, + "loss": 11.8374, + "step": 31659 + }, + { + "epoch": 1.7240135318181509, + "grad_norm": 0.5799642034690877, + "learning_rate": 9.821286280535048e-06, + "loss": 11.8594, + "step": 31660 + }, + { + "epoch": 1.7240679858147339, + "grad_norm": 0.5471452229170947, + "learning_rate": 9.817475552198851e-06, + "loss": 11.9307, + "step": 31661 + }, + { + "epoch": 1.7241224398113169, + "grad_norm": 0.5604782843538757, + "learning_rate": 9.813665525137117e-06, + "loss": 11.8213, + "step": 31662 + }, + { + "epoch": 1.7241768938079, + "grad_norm": 0.5269929859812372, + "learning_rate": 9.80985619937943e-06, + "loss": 11.797, + "step": 31663 + }, + { + "epoch": 1.724231347804483, + "grad_norm": 0.5361352117910505, + "learning_rate": 9.806047574955413e-06, + "loss": 11.818, + "step": 31664 + }, + { + "epoch": 1.724285801801066, + "grad_norm": 0.5594370648934125, + "learning_rate": 9.80223965189473e-06, + "loss": 11.7579, + "step": 31665 + }, + { + "epoch": 1.724340255797649, + "grad_norm": 0.5921846916923454, + "learning_rate": 9.798432430226923e-06, + "loss": 11.9223, + "step": 31666 + }, + { + "epoch": 1.724394709794232, + "grad_norm": 0.5474470645296436, + "learning_rate": 9.794625909981659e-06, + "loss": 11.7333, + "step": 31667 + }, + { + "epoch": 1.724449163790815, + "grad_norm": 0.5149711932014519, + "learning_rate": 9.790820091188502e-06, + "loss": 11.8104, + "step": 31668 + }, + { + "epoch": 1.724503617787398, + "grad_norm": 0.5614818687022267, + "learning_rate": 9.787014973877062e-06, + "loss": 11.8931, + "step": 31669 + }, + { + "epoch": 1.724558071783981, + "grad_norm": 0.5239461447515954, + "learning_rate": 9.783210558076928e-06, + "loss": 11.7741, + "step": 31670 + }, + { + "epoch": 1.724612525780564, + "grad_norm": 0.5675347104846662, + "learning_rate": 9.77940684381765e-06, + "loss": 11.7387, + "step": 31671 + }, + { + "epoch": 1.724666979777147, + "grad_norm": 0.5974951287968702, + "learning_rate": 9.775603831128865e-06, + "loss": 11.8754, + "step": 31672 + }, + { + "epoch": 1.72472143377373, + "grad_norm": 0.5715581040755615, + "learning_rate": 9.77180152004009e-06, + "loss": 11.872, + "step": 31673 + }, + { + "epoch": 1.724775887770313, + "grad_norm": 0.5238339741704982, + "learning_rate": 9.767999910580916e-06, + "loss": 11.867, + "step": 31674 + }, + { + "epoch": 1.724830341766896, + "grad_norm": 0.6017754906156638, + "learning_rate": 9.764199002780927e-06, + "loss": 11.8095, + "step": 31675 + }, + { + "epoch": 1.724884795763479, + "grad_norm": 0.5378820399597718, + "learning_rate": 9.760398796669646e-06, + "loss": 11.8627, + "step": 31676 + }, + { + "epoch": 1.724939249760062, + "grad_norm": 0.5688165944084097, + "learning_rate": 9.756599292276646e-06, + "loss": 11.9054, + "step": 31677 + }, + { + "epoch": 1.724993703756645, + "grad_norm": 0.6024619360754615, + "learning_rate": 9.752800489631453e-06, + "loss": 11.9786, + "step": 31678 + }, + { + "epoch": 1.725048157753228, + "grad_norm": 0.5228581285698074, + "learning_rate": 9.74900238876364e-06, + "loss": 11.7096, + "step": 31679 + }, + { + "epoch": 1.7251026117498112, + "grad_norm": 0.5032368347871917, + "learning_rate": 9.745204989702705e-06, + "loss": 11.7223, + "step": 31680 + }, + { + "epoch": 1.7251570657463942, + "grad_norm": 0.528420938205925, + "learning_rate": 9.741408292478183e-06, + "loss": 11.8796, + "step": 31681 + }, + { + "epoch": 1.7252115197429772, + "grad_norm": 0.5160502954828745, + "learning_rate": 9.737612297119625e-06, + "loss": 11.7338, + "step": 31682 + }, + { + "epoch": 1.7252659737395601, + "grad_norm": 0.5349271245456051, + "learning_rate": 9.733817003656509e-06, + "loss": 11.7757, + "step": 31683 + }, + { + "epoch": 1.7253204277361431, + "grad_norm": 0.6173101001891643, + "learning_rate": 9.7300224121184e-06, + "loss": 11.8106, + "step": 31684 + }, + { + "epoch": 1.7253748817327261, + "grad_norm": 0.5553358185095433, + "learning_rate": 9.726228522534742e-06, + "loss": 11.8653, + "step": 31685 + }, + { + "epoch": 1.7254293357293093, + "grad_norm": 0.5359553888537437, + "learning_rate": 9.722435334935077e-06, + "loss": 11.7207, + "step": 31686 + }, + { + "epoch": 1.7254837897258923, + "grad_norm": 0.5705403304738097, + "learning_rate": 9.718642849348902e-06, + "loss": 11.9162, + "step": 31687 + }, + { + "epoch": 1.7255382437224753, + "grad_norm": 0.47644253008702425, + "learning_rate": 9.714851065805697e-06, + "loss": 11.6373, + "step": 31688 + }, + { + "epoch": 1.7255926977190583, + "grad_norm": 0.5633304336926984, + "learning_rate": 9.711059984334981e-06, + "loss": 11.7309, + "step": 31689 + }, + { + "epoch": 1.7256471517156413, + "grad_norm": 0.5367003899122796, + "learning_rate": 9.707269604966162e-06, + "loss": 11.7679, + "step": 31690 + }, + { + "epoch": 1.7257016057122243, + "grad_norm": 0.5448976654034257, + "learning_rate": 9.703479927728765e-06, + "loss": 11.7898, + "step": 31691 + }, + { + "epoch": 1.7257560597088073, + "grad_norm": 0.5791685649235563, + "learning_rate": 9.699690952652275e-06, + "loss": 11.8029, + "step": 31692 + }, + { + "epoch": 1.7258105137053903, + "grad_norm": 0.5271671684126381, + "learning_rate": 9.6959026797661e-06, + "loss": 11.7772, + "step": 31693 + }, + { + "epoch": 1.7258649677019733, + "grad_norm": 0.5772368975839575, + "learning_rate": 9.692115109099754e-06, + "loss": 11.8192, + "step": 31694 + }, + { + "epoch": 1.7259194216985563, + "grad_norm": 0.5436844798966903, + "learning_rate": 9.688328240682643e-06, + "loss": 11.8391, + "step": 31695 + }, + { + "epoch": 1.7259738756951393, + "grad_norm": 0.5332655215303967, + "learning_rate": 9.684542074544256e-06, + "loss": 11.8615, + "step": 31696 + }, + { + "epoch": 1.7260283296917223, + "grad_norm": 0.5085579615833358, + "learning_rate": 9.680756610714003e-06, + "loss": 11.8357, + "step": 31697 + }, + { + "epoch": 1.7260827836883053, + "grad_norm": 0.530327560360123, + "learning_rate": 9.676971849221328e-06, + "loss": 11.8331, + "step": 31698 + }, + { + "epoch": 1.7261372376848882, + "grad_norm": 0.5679448804061265, + "learning_rate": 9.673187790095706e-06, + "loss": 11.8893, + "step": 31699 + }, + { + "epoch": 1.7261916916814712, + "grad_norm": 0.5371532607973594, + "learning_rate": 9.66940443336648e-06, + "loss": 11.8694, + "step": 31700 + }, + { + "epoch": 1.7262461456780542, + "grad_norm": 0.5364368743944987, + "learning_rate": 9.665621779063127e-06, + "loss": 11.811, + "step": 31701 + }, + { + "epoch": 1.7263005996746372, + "grad_norm": 0.6120074762318003, + "learning_rate": 9.661839827215058e-06, + "loss": 11.9309, + "step": 31702 + }, + { + "epoch": 1.7263550536712202, + "grad_norm": 0.5521704409754, + "learning_rate": 9.658058577851658e-06, + "loss": 11.7711, + "step": 31703 + }, + { + "epoch": 1.7264095076678034, + "grad_norm": 0.49477285511138874, + "learning_rate": 9.654278031002361e-06, + "loss": 11.8737, + "step": 31704 + }, + { + "epoch": 1.7264639616643864, + "grad_norm": 0.5873796201042932, + "learning_rate": 9.650498186696522e-06, + "loss": 11.9124, + "step": 31705 + }, + { + "epoch": 1.7265184156609694, + "grad_norm": 0.5743880791251625, + "learning_rate": 9.646719044963593e-06, + "loss": 11.7838, + "step": 31706 + }, + { + "epoch": 1.7265728696575524, + "grad_norm": 0.5289910373412918, + "learning_rate": 9.642940605832906e-06, + "loss": 11.7521, + "step": 31707 + }, + { + "epoch": 1.7266273236541354, + "grad_norm": 0.5643291751214005, + "learning_rate": 9.639162869333861e-06, + "loss": 11.7548, + "step": 31708 + }, + { + "epoch": 1.7266817776507186, + "grad_norm": 0.5648407876761691, + "learning_rate": 9.63538583549588e-06, + "loss": 11.8659, + "step": 31709 + }, + { + "epoch": 1.7267362316473016, + "grad_norm": 0.6448956523557557, + "learning_rate": 9.631609504348249e-06, + "loss": 11.8444, + "step": 31710 + }, + { + "epoch": 1.7267906856438846, + "grad_norm": 0.5195585514260834, + "learning_rate": 9.627833875920411e-06, + "loss": 11.7816, + "step": 31711 + }, + { + "epoch": 1.7268451396404676, + "grad_norm": 0.5592209932938418, + "learning_rate": 9.624058950241666e-06, + "loss": 11.9012, + "step": 31712 + }, + { + "epoch": 1.7268995936370506, + "grad_norm": 0.5576575172917734, + "learning_rate": 9.62028472734139e-06, + "loss": 12.0067, + "step": 31713 + }, + { + "epoch": 1.7269540476336336, + "grad_norm": 0.577147121452383, + "learning_rate": 9.616511207248957e-06, + "loss": 11.7964, + "step": 31714 + }, + { + "epoch": 1.7270085016302166, + "grad_norm": 0.5705233257587317, + "learning_rate": 9.61273838999367e-06, + "loss": 11.8475, + "step": 31715 + }, + { + "epoch": 1.7270629556267996, + "grad_norm": 0.6145207127089715, + "learning_rate": 9.608966275604913e-06, + "loss": 11.8718, + "step": 31716 + }, + { + "epoch": 1.7271174096233826, + "grad_norm": 0.5457264230962521, + "learning_rate": 9.605194864111967e-06, + "loss": 11.8942, + "step": 31717 + }, + { + "epoch": 1.7271718636199656, + "grad_norm": 0.5731032537776786, + "learning_rate": 9.601424155544214e-06, + "loss": 11.7933, + "step": 31718 + }, + { + "epoch": 1.7272263176165485, + "grad_norm": 0.5175332225203725, + "learning_rate": 9.597654149930934e-06, + "loss": 11.7578, + "step": 31719 + }, + { + "epoch": 1.7272807716131315, + "grad_norm": 0.6721303858954103, + "learning_rate": 9.593884847301437e-06, + "loss": 12.0066, + "step": 31720 + }, + { + "epoch": 1.7273352256097145, + "grad_norm": 0.6322488225549545, + "learning_rate": 9.590116247685089e-06, + "loss": 11.7774, + "step": 31721 + }, + { + "epoch": 1.7273896796062975, + "grad_norm": 0.5490412921685415, + "learning_rate": 9.586348351111118e-06, + "loss": 11.6418, + "step": 31722 + }, + { + "epoch": 1.7274441336028805, + "grad_norm": 0.5069457744941861, + "learning_rate": 9.582581157608883e-06, + "loss": 11.6878, + "step": 31723 + }, + { + "epoch": 1.7274985875994635, + "grad_norm": 0.510792647693245, + "learning_rate": 9.57881466720767e-06, + "loss": 11.7458, + "step": 31724 + }, + { + "epoch": 1.7275530415960465, + "grad_norm": 0.5589741024172136, + "learning_rate": 9.575048879936732e-06, + "loss": 11.8988, + "step": 31725 + }, + { + "epoch": 1.7276074955926295, + "grad_norm": 0.5440348207592137, + "learning_rate": 9.571283795825404e-06, + "loss": 11.7806, + "step": 31726 + }, + { + "epoch": 1.7276619495892127, + "grad_norm": 0.5228054075329174, + "learning_rate": 9.567519414902926e-06, + "loss": 11.7824, + "step": 31727 + }, + { + "epoch": 1.7277164035857957, + "grad_norm": 0.5812598052893078, + "learning_rate": 9.563755737198588e-06, + "loss": 11.8072, + "step": 31728 + }, + { + "epoch": 1.7277708575823787, + "grad_norm": 0.5124134013005092, + "learning_rate": 9.559992762741666e-06, + "loss": 11.7268, + "step": 31729 + }, + { + "epoch": 1.7278253115789617, + "grad_norm": 0.5835164472508388, + "learning_rate": 9.55623049156138e-06, + "loss": 11.7904, + "step": 31730 + }, + { + "epoch": 1.7278797655755447, + "grad_norm": 0.5085767572132961, + "learning_rate": 9.55246892368703e-06, + "loss": 11.8523, + "step": 31731 + }, + { + "epoch": 1.7279342195721277, + "grad_norm": 0.576927234951472, + "learning_rate": 9.548708059147827e-06, + "loss": 11.7906, + "step": 31732 + }, + { + "epoch": 1.7279886735687109, + "grad_norm": 0.597292849935191, + "learning_rate": 9.544947897973066e-06, + "loss": 11.7756, + "step": 31733 + }, + { + "epoch": 1.7280431275652939, + "grad_norm": 0.5796608200635165, + "learning_rate": 9.54118844019194e-06, + "loss": 11.8446, + "step": 31734 + }, + { + "epoch": 1.7280975815618769, + "grad_norm": 0.5473601123137064, + "learning_rate": 9.5374296858337e-06, + "loss": 11.9058, + "step": 31735 + }, + { + "epoch": 1.7281520355584599, + "grad_norm": 0.6127335719945055, + "learning_rate": 9.533671634927599e-06, + "loss": 11.8116, + "step": 31736 + }, + { + "epoch": 1.7282064895550429, + "grad_norm": 0.532967238794352, + "learning_rate": 9.529914287502816e-06, + "loss": 11.7317, + "step": 31737 + }, + { + "epoch": 1.7282609435516259, + "grad_norm": 0.6523433605397009, + "learning_rate": 9.526157643588618e-06, + "loss": 11.8911, + "step": 31738 + }, + { + "epoch": 1.7283153975482088, + "grad_norm": 0.5082996924963534, + "learning_rate": 9.52240170321418e-06, + "loss": 11.7689, + "step": 31739 + }, + { + "epoch": 1.7283698515447918, + "grad_norm": 0.5489672555722254, + "learning_rate": 9.518646466408709e-06, + "loss": 11.8655, + "step": 31740 + }, + { + "epoch": 1.7284243055413748, + "grad_norm": 0.47865030614903176, + "learning_rate": 9.51489193320143e-06, + "loss": 11.7239, + "step": 31741 + }, + { + "epoch": 1.7284787595379578, + "grad_norm": 0.5008943163819279, + "learning_rate": 9.511138103621508e-06, + "loss": 11.6818, + "step": 31742 + }, + { + "epoch": 1.7285332135345408, + "grad_norm": 0.5616135003176408, + "learning_rate": 9.507384977698175e-06, + "loss": 11.8098, + "step": 31743 + }, + { + "epoch": 1.7285876675311238, + "grad_norm": 0.6008478960011499, + "learning_rate": 9.503632555460574e-06, + "loss": 11.8165, + "step": 31744 + }, + { + "epoch": 1.7286421215277068, + "grad_norm": 0.5864006875525177, + "learning_rate": 9.499880836937913e-06, + "loss": 11.9344, + "step": 31745 + }, + { + "epoch": 1.7286965755242898, + "grad_norm": 0.6029267443455991, + "learning_rate": 9.496129822159338e-06, + "loss": 11.7735, + "step": 31746 + }, + { + "epoch": 1.7287510295208728, + "grad_norm": 0.5100562454781388, + "learning_rate": 9.492379511154036e-06, + "loss": 11.7855, + "step": 31747 + }, + { + "epoch": 1.7288054835174558, + "grad_norm": 0.5145890233843773, + "learning_rate": 9.488629903951197e-06, + "loss": 11.7564, + "step": 31748 + }, + { + "epoch": 1.7288599375140388, + "grad_norm": 0.5558111178106622, + "learning_rate": 9.484881000579937e-06, + "loss": 11.8101, + "step": 31749 + }, + { + "epoch": 1.728914391510622, + "grad_norm": 0.5288825719725364, + "learning_rate": 9.481132801069403e-06, + "loss": 11.7869, + "step": 31750 + }, + { + "epoch": 1.728968845507205, + "grad_norm": 0.5379104324708626, + "learning_rate": 9.477385305448794e-06, + "loss": 11.8337, + "step": 31751 + }, + { + "epoch": 1.729023299503788, + "grad_norm": 0.6405719500892642, + "learning_rate": 9.473638513747184e-06, + "loss": 11.9674, + "step": 31752 + }, + { + "epoch": 1.729077753500371, + "grad_norm": 0.614330814369286, + "learning_rate": 9.469892425993764e-06, + "loss": 11.7954, + "step": 31753 + }, + { + "epoch": 1.729132207496954, + "grad_norm": 0.5079721712198427, + "learning_rate": 9.466147042217632e-06, + "loss": 11.652, + "step": 31754 + }, + { + "epoch": 1.729186661493537, + "grad_norm": 0.5782492986472985, + "learning_rate": 9.46240236244793e-06, + "loss": 11.77, + "step": 31755 + }, + { + "epoch": 1.7292411154901202, + "grad_norm": 0.5521552723003574, + "learning_rate": 9.45865838671376e-06, + "loss": 11.839, + "step": 31756 + }, + { + "epoch": 1.7292955694867032, + "grad_norm": 0.5549373008724034, + "learning_rate": 9.45491511504425e-06, + "loss": 11.8042, + "step": 31757 + }, + { + "epoch": 1.7293500234832861, + "grad_norm": 0.5215075361656176, + "learning_rate": 9.451172547468512e-06, + "loss": 11.6191, + "step": 31758 + }, + { + "epoch": 1.7294044774798691, + "grad_norm": 0.5692800819500581, + "learning_rate": 9.447430684015645e-06, + "loss": 11.7545, + "step": 31759 + }, + { + "epoch": 1.7294589314764521, + "grad_norm": 0.5437205367979392, + "learning_rate": 9.44368952471475e-06, + "loss": 11.8703, + "step": 31760 + }, + { + "epoch": 1.7295133854730351, + "grad_norm": 0.5414735539849145, + "learning_rate": 9.439949069594888e-06, + "loss": 11.8, + "step": 31761 + }, + { + "epoch": 1.7295678394696181, + "grad_norm": 0.5091608588791786, + "learning_rate": 9.436209318685163e-06, + "loss": 11.7333, + "step": 31762 + }, + { + "epoch": 1.729622293466201, + "grad_norm": 0.5716410024395188, + "learning_rate": 9.432470272014681e-06, + "loss": 11.8264, + "step": 31763 + }, + { + "epoch": 1.729676747462784, + "grad_norm": 0.5155399898054873, + "learning_rate": 9.428731929612488e-06, + "loss": 11.8019, + "step": 31764 + }, + { + "epoch": 1.729731201459367, + "grad_norm": 0.5338373149824597, + "learning_rate": 9.424994291507682e-06, + "loss": 11.8044, + "step": 31765 + }, + { + "epoch": 1.72978565545595, + "grad_norm": 0.5280839117958512, + "learning_rate": 9.421257357729284e-06, + "loss": 11.7311, + "step": 31766 + }, + { + "epoch": 1.729840109452533, + "grad_norm": 0.5590267339098637, + "learning_rate": 9.417521128306406e-06, + "loss": 11.9218, + "step": 31767 + }, + { + "epoch": 1.729894563449116, + "grad_norm": 0.5411376474594606, + "learning_rate": 9.413785603268055e-06, + "loss": 11.7603, + "step": 31768 + }, + { + "epoch": 1.729949017445699, + "grad_norm": 0.5774859340004413, + "learning_rate": 9.41005078264331e-06, + "loss": 11.7246, + "step": 31769 + }, + { + "epoch": 1.730003471442282, + "grad_norm": 0.5115390727274177, + "learning_rate": 9.406316666461202e-06, + "loss": 11.7678, + "step": 31770 + }, + { + "epoch": 1.730057925438865, + "grad_norm": 0.5870685307456822, + "learning_rate": 9.402583254750752e-06, + "loss": 11.8025, + "step": 31771 + }, + { + "epoch": 1.730112379435448, + "grad_norm": 0.5992386089876499, + "learning_rate": 9.398850547541015e-06, + "loss": 11.8757, + "step": 31772 + }, + { + "epoch": 1.730166833432031, + "grad_norm": 0.5789352698885607, + "learning_rate": 9.395118544861026e-06, + "loss": 11.7844, + "step": 31773 + }, + { + "epoch": 1.7302212874286143, + "grad_norm": 0.5334465536363548, + "learning_rate": 9.391387246739758e-06, + "loss": 11.6402, + "step": 31774 + }, + { + "epoch": 1.7302757414251972, + "grad_norm": 0.5767553661427227, + "learning_rate": 9.387656653206289e-06, + "loss": 11.8415, + "step": 31775 + }, + { + "epoch": 1.7303301954217802, + "grad_norm": 0.5284614688540364, + "learning_rate": 9.383926764289574e-06, + "loss": 11.7814, + "step": 31776 + }, + { + "epoch": 1.7303846494183632, + "grad_norm": 0.5561196040121275, + "learning_rate": 9.380197580018667e-06, + "loss": 11.7798, + "step": 31777 + }, + { + "epoch": 1.7304391034149462, + "grad_norm": 0.5335937976136347, + "learning_rate": 9.376469100422513e-06, + "loss": 11.4178, + "step": 31778 + }, + { + "epoch": 1.7304935574115294, + "grad_norm": 0.5712066405402608, + "learning_rate": 9.372741325530154e-06, + "loss": 11.875, + "step": 31779 + }, + { + "epoch": 1.7305480114081124, + "grad_norm": 0.5862121634867449, + "learning_rate": 9.369014255370557e-06, + "loss": 11.9227, + "step": 31780 + }, + { + "epoch": 1.7306024654046954, + "grad_norm": 0.5074207143718371, + "learning_rate": 9.365287889972686e-06, + "loss": 11.6428, + "step": 31781 + }, + { + "epoch": 1.7306569194012784, + "grad_norm": 0.5267269417926154, + "learning_rate": 9.361562229365561e-06, + "loss": 11.9004, + "step": 31782 + }, + { + "epoch": 1.7307113733978614, + "grad_norm": 0.5822986060977571, + "learning_rate": 9.357837273578096e-06, + "loss": 11.7974, + "step": 31783 + }, + { + "epoch": 1.7307658273944444, + "grad_norm": 0.5779219084655686, + "learning_rate": 9.3541130226393e-06, + "loss": 11.8296, + "step": 31784 + }, + { + "epoch": 1.7308202813910274, + "grad_norm": 0.5272081635596604, + "learning_rate": 9.350389476578137e-06, + "loss": 11.6917, + "step": 31785 + }, + { + "epoch": 1.7308747353876104, + "grad_norm": 0.5631118552644984, + "learning_rate": 9.34666663542353e-06, + "loss": 11.7668, + "step": 31786 + }, + { + "epoch": 1.7309291893841934, + "grad_norm": 0.5549072649547676, + "learning_rate": 9.342944499204465e-06, + "loss": 11.6803, + "step": 31787 + }, + { + "epoch": 1.7309836433807764, + "grad_norm": 0.7167830541880525, + "learning_rate": 9.339223067949843e-06, + "loss": 11.804, + "step": 31788 + }, + { + "epoch": 1.7310380973773594, + "grad_norm": 0.5556018355130229, + "learning_rate": 9.335502341688652e-06, + "loss": 11.6957, + "step": 31789 + }, + { + "epoch": 1.7310925513739424, + "grad_norm": 0.5446655057382435, + "learning_rate": 9.33178232044979e-06, + "loss": 11.8085, + "step": 31790 + }, + { + "epoch": 1.7311470053705253, + "grad_norm": 0.5432132415105937, + "learning_rate": 9.328063004262177e-06, + "loss": 11.8292, + "step": 31791 + }, + { + "epoch": 1.7312014593671083, + "grad_norm": 0.5688969977007066, + "learning_rate": 9.324344393154783e-06, + "loss": 11.8716, + "step": 31792 + }, + { + "epoch": 1.7312559133636913, + "grad_norm": 0.5212004715882294, + "learning_rate": 9.320626487156459e-06, + "loss": 11.7897, + "step": 31793 + }, + { + "epoch": 1.7313103673602743, + "grad_norm": 0.516186582647028, + "learning_rate": 9.316909286296183e-06, + "loss": 11.873, + "step": 31794 + }, + { + "epoch": 1.7313648213568573, + "grad_norm": 0.5027105438080042, + "learning_rate": 9.313192790602798e-06, + "loss": 11.7418, + "step": 31795 + }, + { + "epoch": 1.7314192753534403, + "grad_norm": 0.5349807289495488, + "learning_rate": 9.309477000105237e-06, + "loss": 11.6966, + "step": 31796 + }, + { + "epoch": 1.7314737293500235, + "grad_norm": 0.5307829903497532, + "learning_rate": 9.305761914832412e-06, + "loss": 11.8554, + "step": 31797 + }, + { + "epoch": 1.7315281833466065, + "grad_norm": 0.4943187880420814, + "learning_rate": 9.302047534813174e-06, + "loss": 11.7604, + "step": 31798 + }, + { + "epoch": 1.7315826373431895, + "grad_norm": 0.537687020744414, + "learning_rate": 9.298333860076435e-06, + "loss": 11.8353, + "step": 31799 + }, + { + "epoch": 1.7316370913397725, + "grad_norm": 0.5634131204889544, + "learning_rate": 9.294620890651074e-06, + "loss": 11.7512, + "step": 31800 + }, + { + "epoch": 1.7316915453363555, + "grad_norm": 0.5880410043981958, + "learning_rate": 9.29090862656593e-06, + "loss": 11.9291, + "step": 31801 + }, + { + "epoch": 1.7317459993329385, + "grad_norm": 0.5863121013597389, + "learning_rate": 9.287197067849907e-06, + "loss": 11.7841, + "step": 31802 + }, + { + "epoch": 1.7318004533295217, + "grad_norm": 0.5473313432392998, + "learning_rate": 9.283486214531833e-06, + "loss": 11.8744, + "step": 31803 + }, + { + "epoch": 1.7318549073261047, + "grad_norm": 0.535849865589238, + "learning_rate": 9.27977606664061e-06, + "loss": 11.8094, + "step": 31804 + }, + { + "epoch": 1.7319093613226877, + "grad_norm": 0.5661076997331189, + "learning_rate": 9.276066624205038e-06, + "loss": 11.8197, + "step": 31805 + }, + { + "epoch": 1.7319638153192707, + "grad_norm": 0.5627804408755753, + "learning_rate": 9.27235788725399e-06, + "loss": 11.7295, + "step": 31806 + }, + { + "epoch": 1.7320182693158537, + "grad_norm": 0.5157923365260186, + "learning_rate": 9.268649855816313e-06, + "loss": 11.8166, + "step": 31807 + }, + { + "epoch": 1.7320727233124367, + "grad_norm": 0.5736871896204039, + "learning_rate": 9.264942529920817e-06, + "loss": 11.8427, + "step": 31808 + }, + { + "epoch": 1.7321271773090197, + "grad_norm": 0.5814916522615083, + "learning_rate": 9.261235909596367e-06, + "loss": 11.8641, + "step": 31809 + }, + { + "epoch": 1.7321816313056027, + "grad_norm": 0.5461754129138099, + "learning_rate": 9.25752999487176e-06, + "loss": 11.8515, + "step": 31810 + }, + { + "epoch": 1.7322360853021856, + "grad_norm": 0.5126673278482509, + "learning_rate": 9.253824785775799e-06, + "loss": 11.7771, + "step": 31811 + }, + { + "epoch": 1.7322905392987686, + "grad_norm": 0.5560593970273288, + "learning_rate": 9.250120282337326e-06, + "loss": 11.7364, + "step": 31812 + }, + { + "epoch": 1.7323449932953516, + "grad_norm": 0.5395423087362559, + "learning_rate": 9.24641648458513e-06, + "loss": 11.8862, + "step": 31813 + }, + { + "epoch": 1.7323994472919346, + "grad_norm": 0.49090752907695995, + "learning_rate": 9.24271339254803e-06, + "loss": 11.7974, + "step": 31814 + }, + { + "epoch": 1.7324539012885176, + "grad_norm": 0.4910345344878293, + "learning_rate": 9.239011006254794e-06, + "loss": 11.7537, + "step": 31815 + }, + { + "epoch": 1.7325083552851006, + "grad_norm": 0.5468577576444289, + "learning_rate": 9.235309325734242e-06, + "loss": 11.8825, + "step": 31816 + }, + { + "epoch": 1.7325628092816836, + "grad_norm": 0.541162242652814, + "learning_rate": 9.23160835101513e-06, + "loss": 11.8388, + "step": 31817 + }, + { + "epoch": 1.7326172632782666, + "grad_norm": 0.6230660625624617, + "learning_rate": 9.227908082126258e-06, + "loss": 11.9004, + "step": 31818 + }, + { + "epoch": 1.7326717172748496, + "grad_norm": 0.5382764596129828, + "learning_rate": 9.22420851909641e-06, + "loss": 11.9146, + "step": 31819 + }, + { + "epoch": 1.7327261712714328, + "grad_norm": 0.5263115047747339, + "learning_rate": 9.220509661954346e-06, + "loss": 11.6744, + "step": 31820 + }, + { + "epoch": 1.7327806252680158, + "grad_norm": 0.49533064461987614, + "learning_rate": 9.216811510728795e-06, + "loss": 11.765, + "step": 31821 + }, + { + "epoch": 1.7328350792645988, + "grad_norm": 0.5375916987433108, + "learning_rate": 9.213114065448559e-06, + "loss": 11.7109, + "step": 31822 + }, + { + "epoch": 1.7328895332611818, + "grad_norm": 0.5771549502666888, + "learning_rate": 9.209417326142367e-06, + "loss": 11.9013, + "step": 31823 + }, + { + "epoch": 1.7329439872577648, + "grad_norm": 0.5788564287638096, + "learning_rate": 9.205721292838976e-06, + "loss": 11.7073, + "step": 31824 + }, + { + "epoch": 1.7329984412543478, + "grad_norm": 0.5620934116215915, + "learning_rate": 9.202025965567118e-06, + "loss": 11.9465, + "step": 31825 + }, + { + "epoch": 1.733052895250931, + "grad_norm": 0.5288264080461414, + "learning_rate": 9.198331344355537e-06, + "loss": 11.7418, + "step": 31826 + }, + { + "epoch": 1.733107349247514, + "grad_norm": 0.5991099826495264, + "learning_rate": 9.194637429232955e-06, + "loss": 11.8112, + "step": 31827 + }, + { + "epoch": 1.733161803244097, + "grad_norm": 0.49794012698116397, + "learning_rate": 9.190944220228093e-06, + "loss": 11.7288, + "step": 31828 + }, + { + "epoch": 1.73321625724068, + "grad_norm": 0.5295308583277801, + "learning_rate": 9.187251717369695e-06, + "loss": 11.8134, + "step": 31829 + }, + { + "epoch": 1.733270711237263, + "grad_norm": 0.5052311824340352, + "learning_rate": 9.183559920686457e-06, + "loss": 11.8452, + "step": 31830 + }, + { + "epoch": 1.733325165233846, + "grad_norm": 0.524407725961568, + "learning_rate": 9.179868830207084e-06, + "loss": 11.6978, + "step": 31831 + }, + { + "epoch": 1.733379619230429, + "grad_norm": 0.5772553659785641, + "learning_rate": 9.17617844596027e-06, + "loss": 11.7124, + "step": 31832 + }, + { + "epoch": 1.733434073227012, + "grad_norm": 0.5215331440394445, + "learning_rate": 9.172488767974718e-06, + "loss": 11.688, + "step": 31833 + }, + { + "epoch": 1.733488527223595, + "grad_norm": 0.5672522888550658, + "learning_rate": 9.168799796279148e-06, + "loss": 11.7624, + "step": 31834 + }, + { + "epoch": 1.733542981220178, + "grad_norm": 0.5529480895985476, + "learning_rate": 9.165111530902204e-06, + "loss": 11.7447, + "step": 31835 + }, + { + "epoch": 1.733597435216761, + "grad_norm": 0.5738203143307091, + "learning_rate": 9.161423971872606e-06, + "loss": 11.8681, + "step": 31836 + }, + { + "epoch": 1.733651889213344, + "grad_norm": 0.48516725773577646, + "learning_rate": 9.15773711921898e-06, + "loss": 11.8038, + "step": 31837 + }, + { + "epoch": 1.733706343209927, + "grad_norm": 0.559581261836742, + "learning_rate": 9.154050972970052e-06, + "loss": 11.8667, + "step": 31838 + }, + { + "epoch": 1.7337607972065099, + "grad_norm": 0.5619177232601096, + "learning_rate": 9.150365533154437e-06, + "loss": 11.7478, + "step": 31839 + }, + { + "epoch": 1.7338152512030929, + "grad_norm": 0.5469247544641608, + "learning_rate": 9.146680799800834e-06, + "loss": 11.8148, + "step": 31840 + }, + { + "epoch": 1.7338697051996759, + "grad_norm": 0.5634893522774611, + "learning_rate": 9.142996772937884e-06, + "loss": 11.8232, + "step": 31841 + }, + { + "epoch": 1.7339241591962589, + "grad_norm": 0.60737512667563, + "learning_rate": 9.1393134525942e-06, + "loss": 11.7279, + "step": 31842 + }, + { + "epoch": 1.733978613192842, + "grad_norm": 0.5891923500901595, + "learning_rate": 9.13563083879847e-06, + "loss": 11.8399, + "step": 31843 + }, + { + "epoch": 1.734033067189425, + "grad_norm": 0.549736623254649, + "learning_rate": 9.131948931579303e-06, + "loss": 11.6969, + "step": 31844 + }, + { + "epoch": 1.734087521186008, + "grad_norm": 0.5923333227224441, + "learning_rate": 9.128267730965334e-06, + "loss": 11.9202, + "step": 31845 + }, + { + "epoch": 1.734141975182591, + "grad_norm": 0.5112563342606398, + "learning_rate": 9.124587236985216e-06, + "loss": 11.8206, + "step": 31846 + }, + { + "epoch": 1.734196429179174, + "grad_norm": 0.5577007296796052, + "learning_rate": 9.120907449667527e-06, + "loss": 11.7966, + "step": 31847 + }, + { + "epoch": 1.734250883175757, + "grad_norm": 0.5229001121047716, + "learning_rate": 9.11722836904092e-06, + "loss": 11.7849, + "step": 31848 + }, + { + "epoch": 1.7343053371723403, + "grad_norm": 0.5792622385692741, + "learning_rate": 9.113549995133964e-06, + "loss": 11.7612, + "step": 31849 + }, + { + "epoch": 1.7343597911689232, + "grad_norm": 0.5338103589125868, + "learning_rate": 9.10987232797531e-06, + "loss": 11.7377, + "step": 31850 + }, + { + "epoch": 1.7344142451655062, + "grad_norm": 0.5622271457193666, + "learning_rate": 9.106195367593528e-06, + "loss": 11.7544, + "step": 31851 + }, + { + "epoch": 1.7344686991620892, + "grad_norm": 0.5177335513682215, + "learning_rate": 9.102519114017194e-06, + "loss": 11.7878, + "step": 31852 + }, + { + "epoch": 1.7345231531586722, + "grad_norm": 0.5389755869777442, + "learning_rate": 9.09884356727494e-06, + "loss": 11.6675, + "step": 31853 + }, + { + "epoch": 1.7345776071552552, + "grad_norm": 0.5404158838421443, + "learning_rate": 9.095168727395298e-06, + "loss": 11.8193, + "step": 31854 + }, + { + "epoch": 1.7346320611518382, + "grad_norm": 0.5943710794913865, + "learning_rate": 9.091494594406868e-06, + "loss": 11.7008, + "step": 31855 + }, + { + "epoch": 1.7346865151484212, + "grad_norm": 0.5339501381814628, + "learning_rate": 9.087821168338239e-06, + "loss": 11.6924, + "step": 31856 + }, + { + "epoch": 1.7347409691450042, + "grad_norm": 0.5678115108341879, + "learning_rate": 9.084148449217945e-06, + "loss": 11.7249, + "step": 31857 + }, + { + "epoch": 1.7347954231415872, + "grad_norm": 0.5668293287245962, + "learning_rate": 9.080476437074569e-06, + "loss": 11.8283, + "step": 31858 + }, + { + "epoch": 1.7348498771381702, + "grad_norm": 0.5547633744492095, + "learning_rate": 9.07680513193665e-06, + "loss": 11.7417, + "step": 31859 + }, + { + "epoch": 1.7349043311347532, + "grad_norm": 0.5436193098513749, + "learning_rate": 9.07313453383275e-06, + "loss": 11.6788, + "step": 31860 + }, + { + "epoch": 1.7349587851313362, + "grad_norm": 0.5690684965834453, + "learning_rate": 9.069464642791403e-06, + "loss": 11.8594, + "step": 31861 + }, + { + "epoch": 1.7350132391279192, + "grad_norm": 0.5921293204556438, + "learning_rate": 9.065795458841143e-06, + "loss": 11.8938, + "step": 31862 + }, + { + "epoch": 1.7350676931245022, + "grad_norm": 0.5523228562514196, + "learning_rate": 9.06212698201051e-06, + "loss": 11.8822, + "step": 31863 + }, + { + "epoch": 1.7351221471210851, + "grad_norm": 0.5686848873771777, + "learning_rate": 9.058459212328018e-06, + "loss": 11.8903, + "step": 31864 + }, + { + "epoch": 1.7351766011176681, + "grad_norm": 0.5264078035947367, + "learning_rate": 9.054792149822222e-06, + "loss": 11.8165, + "step": 31865 + }, + { + "epoch": 1.7352310551142511, + "grad_norm": 0.5488165106359091, + "learning_rate": 9.051125794521587e-06, + "loss": 11.7877, + "step": 31866 + }, + { + "epoch": 1.7352855091108343, + "grad_norm": 0.5730019632075116, + "learning_rate": 9.047460146454644e-06, + "loss": 11.8723, + "step": 31867 + }, + { + "epoch": 1.7353399631074173, + "grad_norm": 0.6138328700026215, + "learning_rate": 9.04379520564993e-06, + "loss": 11.7798, + "step": 31868 + }, + { + "epoch": 1.7353944171040003, + "grad_norm": 0.6531888711113734, + "learning_rate": 9.040130972135907e-06, + "loss": 11.7869, + "step": 31869 + }, + { + "epoch": 1.7354488711005833, + "grad_norm": 0.575646539004118, + "learning_rate": 9.036467445941089e-06, + "loss": 11.7694, + "step": 31870 + }, + { + "epoch": 1.7355033250971663, + "grad_norm": 0.5919880430843911, + "learning_rate": 9.03280462709395e-06, + "loss": 11.8656, + "step": 31871 + }, + { + "epoch": 1.7355577790937493, + "grad_norm": 0.524589233508476, + "learning_rate": 9.029142515622968e-06, + "loss": 11.7044, + "step": 31872 + }, + { + "epoch": 1.7356122330903325, + "grad_norm": 0.5689243356154033, + "learning_rate": 9.025481111556645e-06, + "loss": 11.8084, + "step": 31873 + }, + { + "epoch": 1.7356666870869155, + "grad_norm": 0.5010574702545381, + "learning_rate": 9.021820414923421e-06, + "loss": 11.6926, + "step": 31874 + }, + { + "epoch": 1.7357211410834985, + "grad_norm": 0.5140298930794341, + "learning_rate": 9.018160425751787e-06, + "loss": 11.7479, + "step": 31875 + }, + { + "epoch": 1.7357755950800815, + "grad_norm": 0.5598431460374912, + "learning_rate": 9.014501144070187e-06, + "loss": 11.8489, + "step": 31876 + }, + { + "epoch": 1.7358300490766645, + "grad_norm": 0.5498528068954908, + "learning_rate": 9.010842569907086e-06, + "loss": 11.7145, + "step": 31877 + }, + { + "epoch": 1.7358845030732475, + "grad_norm": 0.5128889158274546, + "learning_rate": 9.00718470329095e-06, + "loss": 11.8008, + "step": 31878 + }, + { + "epoch": 1.7359389570698305, + "grad_norm": 0.5558278832011802, + "learning_rate": 9.003527544250178e-06, + "loss": 11.8823, + "step": 31879 + }, + { + "epoch": 1.7359934110664135, + "grad_norm": 0.5685858614518955, + "learning_rate": 8.999871092813272e-06, + "loss": 11.8429, + "step": 31880 + }, + { + "epoch": 1.7360478650629965, + "grad_norm": 0.5850625674664518, + "learning_rate": 8.996215349008608e-06, + "loss": 11.8085, + "step": 31881 + }, + { + "epoch": 1.7361023190595795, + "grad_norm": 0.5071773126399107, + "learning_rate": 8.992560312864617e-06, + "loss": 11.804, + "step": 31882 + }, + { + "epoch": 1.7361567730561625, + "grad_norm": 0.5433957365845743, + "learning_rate": 8.988905984409768e-06, + "loss": 11.7221, + "step": 31883 + }, + { + "epoch": 1.7362112270527454, + "grad_norm": 0.501478910362739, + "learning_rate": 8.985252363672426e-06, + "loss": 11.7778, + "step": 31884 + }, + { + "epoch": 1.7362656810493284, + "grad_norm": 0.5460184950510293, + "learning_rate": 8.981599450681043e-06, + "loss": 11.6809, + "step": 31885 + }, + { + "epoch": 1.7363201350459114, + "grad_norm": 0.5794270108992531, + "learning_rate": 8.977947245463991e-06, + "loss": 11.7472, + "step": 31886 + }, + { + "epoch": 1.7363745890424944, + "grad_norm": 0.513443854355982, + "learning_rate": 8.97429574804971e-06, + "loss": 11.6999, + "step": 31887 + }, + { + "epoch": 1.7364290430390774, + "grad_norm": 0.5719591196016566, + "learning_rate": 8.970644958466534e-06, + "loss": 11.948, + "step": 31888 + }, + { + "epoch": 1.7364834970356604, + "grad_norm": 0.5466542800558982, + "learning_rate": 8.966994876742907e-06, + "loss": 11.6543, + "step": 31889 + }, + { + "epoch": 1.7365379510322436, + "grad_norm": 0.5855541183710491, + "learning_rate": 8.963345502907216e-06, + "loss": 11.8503, + "step": 31890 + }, + { + "epoch": 1.7365924050288266, + "grad_norm": 0.4972010693033175, + "learning_rate": 8.959696836987796e-06, + "loss": 11.739, + "step": 31891 + }, + { + "epoch": 1.7366468590254096, + "grad_norm": 0.6368720737829793, + "learning_rate": 8.956048879013045e-06, + "loss": 11.7997, + "step": 31892 + }, + { + "epoch": 1.7367013130219926, + "grad_norm": 0.6061320673296801, + "learning_rate": 8.95240162901132e-06, + "loss": 11.8223, + "step": 31893 + }, + { + "epoch": 1.7367557670185756, + "grad_norm": 0.5468653568716522, + "learning_rate": 8.948755087010973e-06, + "loss": 11.6784, + "step": 31894 + }, + { + "epoch": 1.7368102210151586, + "grad_norm": 0.6787007297548442, + "learning_rate": 8.945109253040407e-06, + "loss": 11.9298, + "step": 31895 + }, + { + "epoch": 1.7368646750117418, + "grad_norm": 0.4826222858430997, + "learning_rate": 8.941464127127918e-06, + "loss": 11.7985, + "step": 31896 + }, + { + "epoch": 1.7369191290083248, + "grad_norm": 0.5567254485433407, + "learning_rate": 8.937819709301898e-06, + "loss": 11.7621, + "step": 31897 + }, + { + "epoch": 1.7369735830049078, + "grad_norm": 0.5272206331795003, + "learning_rate": 8.934175999590633e-06, + "loss": 11.7968, + "step": 31898 + }, + { + "epoch": 1.7370280370014908, + "grad_norm": 0.5289801539308546, + "learning_rate": 8.930532998022512e-06, + "loss": 11.7091, + "step": 31899 + }, + { + "epoch": 1.7370824909980738, + "grad_norm": 0.5294484475452663, + "learning_rate": 8.926890704625845e-06, + "loss": 11.7452, + "step": 31900 + }, + { + "epoch": 1.7371369449946568, + "grad_norm": 0.5283246401411331, + "learning_rate": 8.923249119428922e-06, + "loss": 11.758, + "step": 31901 + }, + { + "epoch": 1.7371913989912398, + "grad_norm": 0.6418452457116163, + "learning_rate": 8.919608242460108e-06, + "loss": 11.8581, + "step": 31902 + }, + { + "epoch": 1.7372458529878227, + "grad_norm": 0.5248087601600387, + "learning_rate": 8.915968073747682e-06, + "loss": 11.8406, + "step": 31903 + }, + { + "epoch": 1.7373003069844057, + "grad_norm": 0.5388132701630212, + "learning_rate": 8.912328613319953e-06, + "loss": 11.8169, + "step": 31904 + }, + { + "epoch": 1.7373547609809887, + "grad_norm": 0.5051992040933106, + "learning_rate": 8.908689861205255e-06, + "loss": 11.7077, + "step": 31905 + }, + { + "epoch": 1.7374092149775717, + "grad_norm": 0.5759347931673632, + "learning_rate": 8.905051817431853e-06, + "loss": 11.8287, + "step": 31906 + }, + { + "epoch": 1.7374636689741547, + "grad_norm": 0.4963267385136898, + "learning_rate": 8.901414482028047e-06, + "loss": 11.8085, + "step": 31907 + }, + { + "epoch": 1.7375181229707377, + "grad_norm": 0.5020050777137453, + "learning_rate": 8.897777855022105e-06, + "loss": 11.7405, + "step": 31908 + }, + { + "epoch": 1.7375725769673207, + "grad_norm": 0.5810868966399029, + "learning_rate": 8.894141936442346e-06, + "loss": 11.7657, + "step": 31909 + }, + { + "epoch": 1.7376270309639037, + "grad_norm": 0.5735157174973503, + "learning_rate": 8.890506726317005e-06, + "loss": 11.7871, + "step": 31910 + }, + { + "epoch": 1.7376814849604867, + "grad_norm": 0.5594504910573439, + "learning_rate": 8.886872224674359e-06, + "loss": 11.6382, + "step": 31911 + }, + { + "epoch": 1.7377359389570697, + "grad_norm": 0.5037711618953534, + "learning_rate": 8.883238431542684e-06, + "loss": 11.791, + "step": 31912 + }, + { + "epoch": 1.737790392953653, + "grad_norm": 0.5332720651748264, + "learning_rate": 8.879605346950203e-06, + "loss": 11.7786, + "step": 31913 + }, + { + "epoch": 1.7378448469502359, + "grad_norm": 0.5461210602637525, + "learning_rate": 8.875972970925229e-06, + "loss": 11.8422, + "step": 31914 + }, + { + "epoch": 1.7378993009468189, + "grad_norm": 0.5556292310379968, + "learning_rate": 8.872341303495935e-06, + "loss": 11.7714, + "step": 31915 + }, + { + "epoch": 1.7379537549434019, + "grad_norm": 0.554320969484622, + "learning_rate": 8.868710344690601e-06, + "loss": 11.7083, + "step": 31916 + }, + { + "epoch": 1.7380082089399849, + "grad_norm": 0.6041289850317021, + "learning_rate": 8.86508009453748e-06, + "loss": 11.7377, + "step": 31917 + }, + { + "epoch": 1.7380626629365679, + "grad_norm": 0.5841586540888242, + "learning_rate": 8.861450553064765e-06, + "loss": 11.7625, + "step": 31918 + }, + { + "epoch": 1.738117116933151, + "grad_norm": 0.5567303583460421, + "learning_rate": 8.857821720300697e-06, + "loss": 11.6293, + "step": 31919 + }, + { + "epoch": 1.738171570929734, + "grad_norm": 0.5983747172622275, + "learning_rate": 8.854193596273509e-06, + "loss": 11.8934, + "step": 31920 + }, + { + "epoch": 1.738226024926317, + "grad_norm": 0.7066976709933921, + "learning_rate": 8.850566181011366e-06, + "loss": 11.9972, + "step": 31921 + }, + { + "epoch": 1.7382804789229, + "grad_norm": 0.5893838894800378, + "learning_rate": 8.846939474542538e-06, + "loss": 11.7745, + "step": 31922 + }, + { + "epoch": 1.738334932919483, + "grad_norm": 0.5110780693151147, + "learning_rate": 8.843313476895165e-06, + "loss": 11.7757, + "step": 31923 + }, + { + "epoch": 1.738389386916066, + "grad_norm": 0.6209174833375607, + "learning_rate": 8.839688188097495e-06, + "loss": 11.822, + "step": 31924 + }, + { + "epoch": 1.738443840912649, + "grad_norm": 0.5819825506408214, + "learning_rate": 8.83606360817768e-06, + "loss": 11.7911, + "step": 31925 + }, + { + "epoch": 1.738498294909232, + "grad_norm": 0.6126769758037125, + "learning_rate": 8.832439737163923e-06, + "loss": 11.7689, + "step": 31926 + }, + { + "epoch": 1.738552748905815, + "grad_norm": 0.5648979642562577, + "learning_rate": 8.82881657508442e-06, + "loss": 11.8178, + "step": 31927 + }, + { + "epoch": 1.738607202902398, + "grad_norm": 0.5358949345609733, + "learning_rate": 8.825194121967307e-06, + "loss": 11.7572, + "step": 31928 + }, + { + "epoch": 1.738661656898981, + "grad_norm": 0.5717128670773614, + "learning_rate": 8.821572377840803e-06, + "loss": 11.8206, + "step": 31929 + }, + { + "epoch": 1.738716110895564, + "grad_norm": 0.5853436900609554, + "learning_rate": 8.817951342733032e-06, + "loss": 11.8308, + "step": 31930 + }, + { + "epoch": 1.738770564892147, + "grad_norm": 0.5471095256239284, + "learning_rate": 8.81433101667215e-06, + "loss": 11.7714, + "step": 31931 + }, + { + "epoch": 1.73882501888873, + "grad_norm": 0.546559421596322, + "learning_rate": 8.810711399686334e-06, + "loss": 11.8059, + "step": 31932 + }, + { + "epoch": 1.738879472885313, + "grad_norm": 0.5367329275204307, + "learning_rate": 8.807092491803715e-06, + "loss": 11.7581, + "step": 31933 + }, + { + "epoch": 1.738933926881896, + "grad_norm": 0.5698202345816502, + "learning_rate": 8.803474293052438e-06, + "loss": 11.9099, + "step": 31934 + }, + { + "epoch": 1.738988380878479, + "grad_norm": 0.5262262135844425, + "learning_rate": 8.799856803460627e-06, + "loss": 11.6944, + "step": 31935 + }, + { + "epoch": 1.739042834875062, + "grad_norm": 0.5930909117868646, + "learning_rate": 8.796240023056445e-06, + "loss": 11.8409, + "step": 31936 + }, + { + "epoch": 1.7390972888716452, + "grad_norm": 0.5720385926628261, + "learning_rate": 8.792623951867985e-06, + "loss": 11.9001, + "step": 31937 + }, + { + "epoch": 1.7391517428682282, + "grad_norm": 0.587627436208008, + "learning_rate": 8.789008589923364e-06, + "loss": 11.8586, + "step": 31938 + }, + { + "epoch": 1.7392061968648111, + "grad_norm": 0.5999198699587912, + "learning_rate": 8.78539393725073e-06, + "loss": 11.8785, + "step": 31939 + }, + { + "epoch": 1.7392606508613941, + "grad_norm": 0.5202491532771909, + "learning_rate": 8.781779993878169e-06, + "loss": 11.5571, + "step": 31940 + }, + { + "epoch": 1.7393151048579771, + "grad_norm": 0.6831119204493081, + "learning_rate": 8.778166759833784e-06, + "loss": 11.8553, + "step": 31941 + }, + { + "epoch": 1.7393695588545601, + "grad_norm": 0.5254518224668623, + "learning_rate": 8.774554235145648e-06, + "loss": 11.8292, + "step": 31942 + }, + { + "epoch": 1.7394240128511433, + "grad_norm": 0.5153392436022605, + "learning_rate": 8.770942419841888e-06, + "loss": 11.7068, + "step": 31943 + }, + { + "epoch": 1.7394784668477263, + "grad_norm": 0.5577738292986892, + "learning_rate": 8.767331313950588e-06, + "loss": 11.6506, + "step": 31944 + }, + { + "epoch": 1.7395329208443093, + "grad_norm": 0.5793486381665118, + "learning_rate": 8.763720917499807e-06, + "loss": 11.6731, + "step": 31945 + }, + { + "epoch": 1.7395873748408923, + "grad_norm": 0.5667946436974896, + "learning_rate": 8.760111230517653e-06, + "loss": 11.8109, + "step": 31946 + }, + { + "epoch": 1.7396418288374753, + "grad_norm": 0.5475771598830302, + "learning_rate": 8.75650225303215e-06, + "loss": 11.7654, + "step": 31947 + }, + { + "epoch": 1.7396962828340583, + "grad_norm": 0.5202563000046072, + "learning_rate": 8.75289398507141e-06, + "loss": 11.8181, + "step": 31948 + }, + { + "epoch": 1.7397507368306413, + "grad_norm": 0.5646446723735778, + "learning_rate": 8.74928642666345e-06, + "loss": 11.762, + "step": 31949 + }, + { + "epoch": 1.7398051908272243, + "grad_norm": 0.6338854791429734, + "learning_rate": 8.745679577836342e-06, + "loss": 11.8404, + "step": 31950 + }, + { + "epoch": 1.7398596448238073, + "grad_norm": 0.627681073375901, + "learning_rate": 8.74207343861817e-06, + "loss": 11.7785, + "step": 31951 + }, + { + "epoch": 1.7399140988203903, + "grad_norm": 0.5419673933021651, + "learning_rate": 8.738468009036893e-06, + "loss": 11.724, + "step": 31952 + }, + { + "epoch": 1.7399685528169733, + "grad_norm": 0.5212431761118159, + "learning_rate": 8.73486328912061e-06, + "loss": 11.6219, + "step": 31953 + }, + { + "epoch": 1.7400230068135563, + "grad_norm": 0.60439168942001, + "learning_rate": 8.731259278897341e-06, + "loss": 11.8018, + "step": 31954 + }, + { + "epoch": 1.7400774608101393, + "grad_norm": 0.5415075829565839, + "learning_rate": 8.727655978395089e-06, + "loss": 11.7804, + "step": 31955 + }, + { + "epoch": 1.7401319148067222, + "grad_norm": 0.5351938086840057, + "learning_rate": 8.724053387641906e-06, + "loss": 11.5577, + "step": 31956 + }, + { + "epoch": 1.7401863688033052, + "grad_norm": 0.621849052620925, + "learning_rate": 8.720451506665783e-06, + "loss": 11.7246, + "step": 31957 + }, + { + "epoch": 1.7402408227998882, + "grad_norm": 0.5492982585708485, + "learning_rate": 8.716850335494742e-06, + "loss": 11.7424, + "step": 31958 + }, + { + "epoch": 1.7402952767964712, + "grad_norm": 0.5920599598120099, + "learning_rate": 8.71324987415677e-06, + "loss": 11.8464, + "step": 31959 + }, + { + "epoch": 1.7403497307930544, + "grad_norm": 0.4947891504692365, + "learning_rate": 8.70965012267988e-06, + "loss": 11.7205, + "step": 31960 + }, + { + "epoch": 1.7404041847896374, + "grad_norm": 0.5455025699705034, + "learning_rate": 8.706051081092092e-06, + "loss": 11.6804, + "step": 31961 + }, + { + "epoch": 1.7404586387862204, + "grad_norm": 0.557191036658413, + "learning_rate": 8.70245274942132e-06, + "loss": 11.6383, + "step": 31962 + }, + { + "epoch": 1.7405130927828034, + "grad_norm": 0.5340196800062514, + "learning_rate": 8.698855127695605e-06, + "loss": 11.7639, + "step": 31963 + }, + { + "epoch": 1.7405675467793864, + "grad_norm": 0.5767153667836261, + "learning_rate": 8.695258215942893e-06, + "loss": 11.9129, + "step": 31964 + }, + { + "epoch": 1.7406220007759694, + "grad_norm": 0.5694295553676907, + "learning_rate": 8.691662014191159e-06, + "loss": 11.8254, + "step": 31965 + }, + { + "epoch": 1.7406764547725526, + "grad_norm": 0.6535280581606852, + "learning_rate": 8.688066522468397e-06, + "loss": 11.91, + "step": 31966 + }, + { + "epoch": 1.7407309087691356, + "grad_norm": 0.5542515877471047, + "learning_rate": 8.684471740802514e-06, + "loss": 11.7171, + "step": 31967 + }, + { + "epoch": 1.7407853627657186, + "grad_norm": 0.5526873203872726, + "learning_rate": 8.680877669221522e-06, + "loss": 11.912, + "step": 31968 + }, + { + "epoch": 1.7408398167623016, + "grad_norm": 0.6768706919925701, + "learning_rate": 8.67728430775332e-06, + "loss": 11.9056, + "step": 31969 + }, + { + "epoch": 1.7408942707588846, + "grad_norm": 0.5126020406449916, + "learning_rate": 8.673691656425885e-06, + "loss": 11.8263, + "step": 31970 + }, + { + "epoch": 1.7409487247554676, + "grad_norm": 0.5439267010646247, + "learning_rate": 8.670099715267132e-06, + "loss": 11.7099, + "step": 31971 + }, + { + "epoch": 1.7410031787520506, + "grad_norm": 0.4976696273610106, + "learning_rate": 8.666508484304992e-06, + "loss": 11.7321, + "step": 31972 + }, + { + "epoch": 1.7410576327486336, + "grad_norm": 0.5508934392557755, + "learning_rate": 8.662917963567418e-06, + "loss": 11.8543, + "step": 31973 + }, + { + "epoch": 1.7411120867452166, + "grad_norm": 0.5740364595544567, + "learning_rate": 8.65932815308228e-06, + "loss": 11.842, + "step": 31974 + }, + { + "epoch": 1.7411665407417996, + "grad_norm": 0.5215353855019623, + "learning_rate": 8.655739052877532e-06, + "loss": 11.7305, + "step": 31975 + }, + { + "epoch": 1.7412209947383825, + "grad_norm": 0.5614963324268551, + "learning_rate": 8.652150662981095e-06, + "loss": 11.8178, + "step": 31976 + }, + { + "epoch": 1.7412754487349655, + "grad_norm": 0.5653651352357203, + "learning_rate": 8.648562983420839e-06, + "loss": 11.9051, + "step": 31977 + }, + { + "epoch": 1.7413299027315485, + "grad_norm": 0.5334463137538656, + "learning_rate": 8.644976014224692e-06, + "loss": 11.7374, + "step": 31978 + }, + { + "epoch": 1.7413843567281315, + "grad_norm": 0.576985678978604, + "learning_rate": 8.641389755420515e-06, + "loss": 11.729, + "step": 31979 + }, + { + "epoch": 1.7414388107247145, + "grad_norm": 0.5234658387895171, + "learning_rate": 8.637804207036226e-06, + "loss": 11.7497, + "step": 31980 + }, + { + "epoch": 1.7414932647212975, + "grad_norm": 0.6089813444056918, + "learning_rate": 8.634219369099694e-06, + "loss": 11.8424, + "step": 31981 + }, + { + "epoch": 1.7415477187178805, + "grad_norm": 0.5935612717936091, + "learning_rate": 8.630635241638773e-06, + "loss": 11.8468, + "step": 31982 + }, + { + "epoch": 1.7416021727144637, + "grad_norm": 0.5886032923980896, + "learning_rate": 8.627051824681376e-06, + "loss": 11.8731, + "step": 31983 + }, + { + "epoch": 1.7416566267110467, + "grad_norm": 0.5198412867645322, + "learning_rate": 8.623469118255334e-06, + "loss": 11.8138, + "step": 31984 + }, + { + "epoch": 1.7417110807076297, + "grad_norm": 0.49262438340279746, + "learning_rate": 8.619887122388525e-06, + "loss": 11.775, + "step": 31985 + }, + { + "epoch": 1.7417655347042127, + "grad_norm": 0.5374674231944098, + "learning_rate": 8.616305837108795e-06, + "loss": 11.7372, + "step": 31986 + }, + { + "epoch": 1.7418199887007957, + "grad_norm": 0.5033764033578693, + "learning_rate": 8.612725262443989e-06, + "loss": 11.7487, + "step": 31987 + }, + { + "epoch": 1.7418744426973787, + "grad_norm": 0.6043909876110837, + "learning_rate": 8.60914539842198e-06, + "loss": 11.8433, + "step": 31988 + }, + { + "epoch": 1.741928896693962, + "grad_norm": 0.5455036964850147, + "learning_rate": 8.605566245070552e-06, + "loss": 11.7161, + "step": 31989 + }, + { + "epoch": 1.7419833506905449, + "grad_norm": 0.5409583913962226, + "learning_rate": 8.601987802417599e-06, + "loss": 11.7858, + "step": 31990 + }, + { + "epoch": 1.7420378046871279, + "grad_norm": 0.5815138530271966, + "learning_rate": 8.598410070490915e-06, + "loss": 11.7551, + "step": 31991 + }, + { + "epoch": 1.7420922586837109, + "grad_norm": 0.5446835716470613, + "learning_rate": 8.594833049318297e-06, + "loss": 11.7497, + "step": 31992 + }, + { + "epoch": 1.7421467126802939, + "grad_norm": 0.5786796657330281, + "learning_rate": 8.591256738927611e-06, + "loss": 11.717, + "step": 31993 + }, + { + "epoch": 1.7422011666768769, + "grad_norm": 0.5645904802426776, + "learning_rate": 8.587681139346615e-06, + "loss": 11.7751, + "step": 31994 + }, + { + "epoch": 1.7422556206734598, + "grad_norm": 0.5709068463833342, + "learning_rate": 8.584106250603164e-06, + "loss": 11.8462, + "step": 31995 + }, + { + "epoch": 1.7423100746700428, + "grad_norm": 0.5340152565702709, + "learning_rate": 8.580532072725012e-06, + "loss": 11.7117, + "step": 31996 + }, + { + "epoch": 1.7423645286666258, + "grad_norm": 0.5387086854060601, + "learning_rate": 8.576958605740004e-06, + "loss": 11.817, + "step": 31997 + }, + { + "epoch": 1.7424189826632088, + "grad_norm": 0.5140277553920145, + "learning_rate": 8.573385849675863e-06, + "loss": 11.7447, + "step": 31998 + }, + { + "epoch": 1.7424734366597918, + "grad_norm": 0.5629323046688717, + "learning_rate": 8.56981380456041e-06, + "loss": 11.8892, + "step": 31999 + }, + { + "epoch": 1.7425278906563748, + "grad_norm": 0.5431528813466309, + "learning_rate": 8.566242470421448e-06, + "loss": 11.7875, + "step": 32000 + }, + { + "epoch": 1.7425823446529578, + "grad_norm": 0.5154491026682231, + "learning_rate": 8.562671847286707e-06, + "loss": 11.6852, + "step": 32001 + }, + { + "epoch": 1.7426367986495408, + "grad_norm": 0.5892238812937414, + "learning_rate": 8.559101935183944e-06, + "loss": 11.7823, + "step": 32002 + }, + { + "epoch": 1.7426912526461238, + "grad_norm": 0.5680434408571875, + "learning_rate": 8.555532734140959e-06, + "loss": 11.7483, + "step": 32003 + }, + { + "epoch": 1.7427457066427068, + "grad_norm": 0.5022921114212071, + "learning_rate": 8.551964244185474e-06, + "loss": 11.7121, + "step": 32004 + }, + { + "epoch": 1.7428001606392898, + "grad_norm": 0.5543481251024199, + "learning_rate": 8.548396465345265e-06, + "loss": 11.7405, + "step": 32005 + }, + { + "epoch": 1.7428546146358728, + "grad_norm": 0.5417179749654835, + "learning_rate": 8.544829397648046e-06, + "loss": 11.7747, + "step": 32006 + }, + { + "epoch": 1.742909068632456, + "grad_norm": 0.5116342225880156, + "learning_rate": 8.541263041121584e-06, + "loss": 11.7312, + "step": 32007 + }, + { + "epoch": 1.742963522629039, + "grad_norm": 0.512291100772166, + "learning_rate": 8.537697395793586e-06, + "loss": 11.7384, + "step": 32008 + }, + { + "epoch": 1.743017976625622, + "grad_norm": 0.6432909811509387, + "learning_rate": 8.534132461691779e-06, + "loss": 11.7537, + "step": 32009 + }, + { + "epoch": 1.743072430622205, + "grad_norm": 0.5673547760782457, + "learning_rate": 8.530568238843928e-06, + "loss": 11.782, + "step": 32010 + }, + { + "epoch": 1.743126884618788, + "grad_norm": 0.5607694313388282, + "learning_rate": 8.52700472727771e-06, + "loss": 11.6333, + "step": 32011 + }, + { + "epoch": 1.7431813386153712, + "grad_norm": 0.4913740142745817, + "learning_rate": 8.523441927020848e-06, + "loss": 11.7803, + "step": 32012 + }, + { + "epoch": 1.7432357926119542, + "grad_norm": 0.5694761407372012, + "learning_rate": 8.519879838101031e-06, + "loss": 11.7153, + "step": 32013 + }, + { + "epoch": 1.7432902466085372, + "grad_norm": 0.6028894497907186, + "learning_rate": 8.516318460545958e-06, + "loss": 11.9751, + "step": 32014 + }, + { + "epoch": 1.7433447006051201, + "grad_norm": 0.5507114254422816, + "learning_rate": 8.512757794383353e-06, + "loss": 11.8017, + "step": 32015 + }, + { + "epoch": 1.7433991546017031, + "grad_norm": 0.5972184955355894, + "learning_rate": 8.50919783964087e-06, + "loss": 11.739, + "step": 32016 + }, + { + "epoch": 1.7434536085982861, + "grad_norm": 0.5857228924141316, + "learning_rate": 8.505638596346233e-06, + "loss": 11.6848, + "step": 32017 + }, + { + "epoch": 1.7435080625948691, + "grad_norm": 0.562031072838741, + "learning_rate": 8.502080064527063e-06, + "loss": 11.7777, + "step": 32018 + }, + { + "epoch": 1.7435625165914521, + "grad_norm": 0.7034782977047336, + "learning_rate": 8.498522244211093e-06, + "loss": 11.835, + "step": 32019 + }, + { + "epoch": 1.743616970588035, + "grad_norm": 0.5649368271146247, + "learning_rate": 8.494965135425937e-06, + "loss": 11.8731, + "step": 32020 + }, + { + "epoch": 1.743671424584618, + "grad_norm": 0.5436977892164904, + "learning_rate": 8.491408738199291e-06, + "loss": 11.8892, + "step": 32021 + }, + { + "epoch": 1.743725878581201, + "grad_norm": 0.5559043028084236, + "learning_rate": 8.487853052558791e-06, + "loss": 11.9061, + "step": 32022 + }, + { + "epoch": 1.743780332577784, + "grad_norm": 0.5133931781143667, + "learning_rate": 8.484298078532083e-06, + "loss": 11.7108, + "step": 32023 + }, + { + "epoch": 1.743834786574367, + "grad_norm": 0.5480535266969013, + "learning_rate": 8.480743816146818e-06, + "loss": 11.7956, + "step": 32024 + }, + { + "epoch": 1.74388924057095, + "grad_norm": 0.5323917940609546, + "learning_rate": 8.477190265430668e-06, + "loss": 11.8297, + "step": 32025 + }, + { + "epoch": 1.743943694567533, + "grad_norm": 0.5391719565254227, + "learning_rate": 8.473637426411196e-06, + "loss": 11.6837, + "step": 32026 + }, + { + "epoch": 1.743998148564116, + "grad_norm": 0.5212312196844444, + "learning_rate": 8.470085299116103e-06, + "loss": 11.7863, + "step": 32027 + }, + { + "epoch": 1.744052602560699, + "grad_norm": 0.5708364673038417, + "learning_rate": 8.466533883572947e-06, + "loss": 11.8323, + "step": 32028 + }, + { + "epoch": 1.744107056557282, + "grad_norm": 0.5751547927852145, + "learning_rate": 8.462983179809391e-06, + "loss": 11.8152, + "step": 32029 + }, + { + "epoch": 1.7441615105538653, + "grad_norm": 0.48722597878671126, + "learning_rate": 8.459433187853016e-06, + "loss": 11.8481, + "step": 32030 + }, + { + "epoch": 1.7442159645504483, + "grad_norm": 0.5742185937981873, + "learning_rate": 8.455883907731465e-06, + "loss": 11.7642, + "step": 32031 + }, + { + "epoch": 1.7442704185470312, + "grad_norm": 0.6257192235986521, + "learning_rate": 8.452335339472305e-06, + "loss": 11.8823, + "step": 32032 + }, + { + "epoch": 1.7443248725436142, + "grad_norm": 0.527745318790352, + "learning_rate": 8.448787483103116e-06, + "loss": 11.7927, + "step": 32033 + }, + { + "epoch": 1.7443793265401972, + "grad_norm": 0.6222700022992099, + "learning_rate": 8.445240338651527e-06, + "loss": 11.871, + "step": 32034 + }, + { + "epoch": 1.7444337805367802, + "grad_norm": 0.5487261637021156, + "learning_rate": 8.441693906145088e-06, + "loss": 11.5871, + "step": 32035 + }, + { + "epoch": 1.7444882345333634, + "grad_norm": 0.5047141327693643, + "learning_rate": 8.438148185611395e-06, + "loss": 11.6439, + "step": 32036 + }, + { + "epoch": 1.7445426885299464, + "grad_norm": 0.5784583151736736, + "learning_rate": 8.434603177078027e-06, + "loss": 11.7275, + "step": 32037 + }, + { + "epoch": 1.7445971425265294, + "grad_norm": 0.5229644104779019, + "learning_rate": 8.43105888057253e-06, + "loss": 11.6475, + "step": 32038 + }, + { + "epoch": 1.7446515965231124, + "grad_norm": 0.5699041812977644, + "learning_rate": 8.427515296122491e-06, + "loss": 11.7752, + "step": 32039 + }, + { + "epoch": 1.7447060505196954, + "grad_norm": 0.5957268317513674, + "learning_rate": 8.423972423755433e-06, + "loss": 11.9105, + "step": 32040 + }, + { + "epoch": 1.7447605045162784, + "grad_norm": 0.539938225679523, + "learning_rate": 8.420430263498935e-06, + "loss": 11.7252, + "step": 32041 + }, + { + "epoch": 1.7448149585128614, + "grad_norm": 0.5709212899677252, + "learning_rate": 8.41688881538053e-06, + "loss": 11.739, + "step": 32042 + }, + { + "epoch": 1.7448694125094444, + "grad_norm": 0.5266446459566502, + "learning_rate": 8.41334807942774e-06, + "loss": 11.7513, + "step": 32043 + }, + { + "epoch": 1.7449238665060274, + "grad_norm": 0.5677609274402388, + "learning_rate": 8.409808055668134e-06, + "loss": 11.8223, + "step": 32044 + }, + { + "epoch": 1.7449783205026104, + "grad_norm": 0.5532136403402126, + "learning_rate": 8.406268744129209e-06, + "loss": 11.8705, + "step": 32045 + }, + { + "epoch": 1.7450327744991934, + "grad_norm": 0.5151713012528072, + "learning_rate": 8.4027301448385e-06, + "loss": 11.7093, + "step": 32046 + }, + { + "epoch": 1.7450872284957764, + "grad_norm": 0.6215215255628963, + "learning_rate": 8.399192257823518e-06, + "loss": 11.7937, + "step": 32047 + }, + { + "epoch": 1.7451416824923593, + "grad_norm": 0.6201131430276449, + "learning_rate": 8.395655083111776e-06, + "loss": 11.8427, + "step": 32048 + }, + { + "epoch": 1.7451961364889423, + "grad_norm": 0.5209650258702061, + "learning_rate": 8.392118620730794e-06, + "loss": 11.7063, + "step": 32049 + }, + { + "epoch": 1.7452505904855253, + "grad_norm": 0.5612328279856644, + "learning_rate": 8.38858287070805e-06, + "loss": 11.9087, + "step": 32050 + }, + { + "epoch": 1.7453050444821083, + "grad_norm": 0.564227927612779, + "learning_rate": 8.385047833071058e-06, + "loss": 11.6975, + "step": 32051 + }, + { + "epoch": 1.7453594984786913, + "grad_norm": 0.567497662223573, + "learning_rate": 8.381513507847306e-06, + "loss": 11.8284, + "step": 32052 + }, + { + "epoch": 1.7454139524752745, + "grad_norm": 0.5363259550569481, + "learning_rate": 8.377979895064248e-06, + "loss": 11.7159, + "step": 32053 + }, + { + "epoch": 1.7454684064718575, + "grad_norm": 0.590642468655495, + "learning_rate": 8.374446994749396e-06, + "loss": 11.7312, + "step": 32054 + }, + { + "epoch": 1.7455228604684405, + "grad_norm": 0.5465002820785123, + "learning_rate": 8.370914806930198e-06, + "loss": 11.6307, + "step": 32055 + }, + { + "epoch": 1.7455773144650235, + "grad_norm": 0.5888990470494139, + "learning_rate": 8.367383331634148e-06, + "loss": 11.8342, + "step": 32056 + }, + { + "epoch": 1.7456317684616065, + "grad_norm": 0.5746100887956731, + "learning_rate": 8.363852568888686e-06, + "loss": 11.6764, + "step": 32057 + }, + { + "epoch": 1.7456862224581895, + "grad_norm": 0.5887223194598696, + "learning_rate": 8.360322518721265e-06, + "loss": 11.8901, + "step": 32058 + }, + { + "epoch": 1.7457406764547727, + "grad_norm": 0.5108734790458672, + "learning_rate": 8.35679318115935e-06, + "loss": 11.7426, + "step": 32059 + }, + { + "epoch": 1.7457951304513557, + "grad_norm": 0.5690170304757952, + "learning_rate": 8.353264556230378e-06, + "loss": 11.6658, + "step": 32060 + }, + { + "epoch": 1.7458495844479387, + "grad_norm": 0.5176612902090224, + "learning_rate": 8.349736643961813e-06, + "loss": 11.6612, + "step": 32061 + }, + { + "epoch": 1.7459040384445217, + "grad_norm": 0.5799500284494544, + "learning_rate": 8.346209444381048e-06, + "loss": 11.8357, + "step": 32062 + }, + { + "epoch": 1.7459584924411047, + "grad_norm": 0.5396245797154937, + "learning_rate": 8.342682957515513e-06, + "loss": 11.8793, + "step": 32063 + }, + { + "epoch": 1.7460129464376877, + "grad_norm": 0.5222696040881784, + "learning_rate": 8.339157183392666e-06, + "loss": 11.7805, + "step": 32064 + }, + { + "epoch": 1.7460674004342707, + "grad_norm": 0.550740507352698, + "learning_rate": 8.335632122039893e-06, + "loss": 11.7638, + "step": 32065 + }, + { + "epoch": 1.7461218544308537, + "grad_norm": 0.5574886696018079, + "learning_rate": 8.332107773484633e-06, + "loss": 11.7661, + "step": 32066 + }, + { + "epoch": 1.7461763084274367, + "grad_norm": 0.6567091073380188, + "learning_rate": 8.328584137754259e-06, + "loss": 11.9116, + "step": 32067 + }, + { + "epoch": 1.7462307624240196, + "grad_norm": 0.5897784852784564, + "learning_rate": 8.325061214876195e-06, + "loss": 11.6664, + "step": 32068 + }, + { + "epoch": 1.7462852164206026, + "grad_norm": 0.5467779883960694, + "learning_rate": 8.32153900487782e-06, + "loss": 11.6983, + "step": 32069 + }, + { + "epoch": 1.7463396704171856, + "grad_norm": 0.5084861598687963, + "learning_rate": 8.318017507786535e-06, + "loss": 11.6828, + "step": 32070 + }, + { + "epoch": 1.7463941244137686, + "grad_norm": 0.5764218627558026, + "learning_rate": 8.31449672362975e-06, + "loss": 11.6875, + "step": 32071 + }, + { + "epoch": 1.7464485784103516, + "grad_norm": 0.5343029023386447, + "learning_rate": 8.310976652434776e-06, + "loss": 11.8479, + "step": 32072 + }, + { + "epoch": 1.7465030324069346, + "grad_norm": 0.5455757670649288, + "learning_rate": 8.307457294229038e-06, + "loss": 11.7445, + "step": 32073 + }, + { + "epoch": 1.7465574864035176, + "grad_norm": 0.5637366874826736, + "learning_rate": 8.303938649039888e-06, + "loss": 11.5759, + "step": 32074 + }, + { + "epoch": 1.7466119404001006, + "grad_norm": 0.541970138016841, + "learning_rate": 8.300420716894686e-06, + "loss": 11.8475, + "step": 32075 + }, + { + "epoch": 1.7466663943966836, + "grad_norm": 0.568620399398183, + "learning_rate": 8.296903497820808e-06, + "loss": 11.8766, + "step": 32076 + }, + { + "epoch": 1.7467208483932668, + "grad_norm": 0.5413186586308619, + "learning_rate": 8.293386991845553e-06, + "loss": 11.8918, + "step": 32077 + }, + { + "epoch": 1.7467753023898498, + "grad_norm": 0.5654869300695888, + "learning_rate": 8.289871198996334e-06, + "loss": 11.8282, + "step": 32078 + }, + { + "epoch": 1.7468297563864328, + "grad_norm": 0.599210642396545, + "learning_rate": 8.286356119300432e-06, + "loss": 11.8196, + "step": 32079 + }, + { + "epoch": 1.7468842103830158, + "grad_norm": 0.5681529131139145, + "learning_rate": 8.28284175278521e-06, + "loss": 11.7322, + "step": 32080 + }, + { + "epoch": 1.7469386643795988, + "grad_norm": 0.5764930737237418, + "learning_rate": 8.279328099478023e-06, + "loss": 11.8125, + "step": 32081 + }, + { + "epoch": 1.746993118376182, + "grad_norm": 0.6045864425024923, + "learning_rate": 8.27581515940612e-06, + "loss": 11.7184, + "step": 32082 + }, + { + "epoch": 1.747047572372765, + "grad_norm": 0.5662377367856796, + "learning_rate": 8.272302932596888e-06, + "loss": 11.9217, + "step": 32083 + }, + { + "epoch": 1.747102026369348, + "grad_norm": 0.6057743589010771, + "learning_rate": 8.268791419077592e-06, + "loss": 11.8011, + "step": 32084 + }, + { + "epoch": 1.747156480365931, + "grad_norm": 0.5613648064944241, + "learning_rate": 8.265280618875559e-06, + "loss": 11.8049, + "step": 32085 + }, + { + "epoch": 1.747210934362514, + "grad_norm": 0.48816779325586207, + "learning_rate": 8.261770532018098e-06, + "loss": 11.6951, + "step": 32086 + }, + { + "epoch": 1.747265388359097, + "grad_norm": 0.6015002130304541, + "learning_rate": 8.258261158532487e-06, + "loss": 11.8388, + "step": 32087 + }, + { + "epoch": 1.74731984235568, + "grad_norm": 0.5379049160926599, + "learning_rate": 8.254752498446028e-06, + "loss": 11.8134, + "step": 32088 + }, + { + "epoch": 1.747374296352263, + "grad_norm": 0.536591873539549, + "learning_rate": 8.251244551785987e-06, + "loss": 11.7769, + "step": 32089 + }, + { + "epoch": 1.747428750348846, + "grad_norm": 0.5370086668121891, + "learning_rate": 8.247737318579673e-06, + "loss": 11.8478, + "step": 32090 + }, + { + "epoch": 1.747483204345429, + "grad_norm": 0.5393090808096153, + "learning_rate": 8.244230798854347e-06, + "loss": 11.6601, + "step": 32091 + }, + { + "epoch": 1.747537658342012, + "grad_norm": 0.6406910306153049, + "learning_rate": 8.24072499263724e-06, + "loss": 11.8621, + "step": 32092 + }, + { + "epoch": 1.747592112338595, + "grad_norm": 0.5317494391056862, + "learning_rate": 8.237219899955662e-06, + "loss": 11.7992, + "step": 32093 + }, + { + "epoch": 1.747646566335178, + "grad_norm": 0.5756798275330044, + "learning_rate": 8.233715520836837e-06, + "loss": 11.7773, + "step": 32094 + }, + { + "epoch": 1.747701020331761, + "grad_norm": 0.5776534959828046, + "learning_rate": 8.230211855308057e-06, + "loss": 11.8673, + "step": 32095 + }, + { + "epoch": 1.7477554743283439, + "grad_norm": 0.551942844959109, + "learning_rate": 8.226708903396507e-06, + "loss": 11.7723, + "step": 32096 + }, + { + "epoch": 1.7478099283249269, + "grad_norm": 0.5698823976918336, + "learning_rate": 8.223206665129468e-06, + "loss": 11.649, + "step": 32097 + }, + { + "epoch": 1.7478643823215099, + "grad_norm": 0.5585603105322928, + "learning_rate": 8.219705140534173e-06, + "loss": 11.837, + "step": 32098 + }, + { + "epoch": 1.7479188363180929, + "grad_norm": 0.5582876530335303, + "learning_rate": 8.216204329637834e-06, + "loss": 11.7641, + "step": 32099 + }, + { + "epoch": 1.747973290314676, + "grad_norm": 0.6105735570584604, + "learning_rate": 8.212704232467694e-06, + "loss": 11.7557, + "step": 32100 + }, + { + "epoch": 1.748027744311259, + "grad_norm": 0.5067331004793603, + "learning_rate": 8.209204849050944e-06, + "loss": 11.7947, + "step": 32101 + }, + { + "epoch": 1.748082198307842, + "grad_norm": 0.5115845141990338, + "learning_rate": 8.205706179414829e-06, + "loss": 11.7104, + "step": 32102 + }, + { + "epoch": 1.748136652304425, + "grad_norm": 0.5998195876428078, + "learning_rate": 8.202208223586538e-06, + "loss": 11.8674, + "step": 32103 + }, + { + "epoch": 1.748191106301008, + "grad_norm": 0.5431466281900563, + "learning_rate": 8.198710981593249e-06, + "loss": 11.8214, + "step": 32104 + }, + { + "epoch": 1.748245560297591, + "grad_norm": 0.5681776478049104, + "learning_rate": 8.195214453462196e-06, + "loss": 11.7089, + "step": 32105 + }, + { + "epoch": 1.7483000142941743, + "grad_norm": 0.5113870119080456, + "learning_rate": 8.191718639220536e-06, + "loss": 11.7082, + "step": 32106 + }, + { + "epoch": 1.7483544682907572, + "grad_norm": 0.5602182821619474, + "learning_rate": 8.188223538895456e-06, + "loss": 11.7769, + "step": 32107 + }, + { + "epoch": 1.7484089222873402, + "grad_norm": 0.5190456682225881, + "learning_rate": 8.184729152514182e-06, + "loss": 11.6851, + "step": 32108 + }, + { + "epoch": 1.7484633762839232, + "grad_norm": 0.5102597510300653, + "learning_rate": 8.181235480103822e-06, + "loss": 11.7892, + "step": 32109 + }, + { + "epoch": 1.7485178302805062, + "grad_norm": 0.5306065048933504, + "learning_rate": 8.1777425216916e-06, + "loss": 11.8607, + "step": 32110 + }, + { + "epoch": 1.7485722842770892, + "grad_norm": 0.5225156835424806, + "learning_rate": 8.174250277304628e-06, + "loss": 11.7497, + "step": 32111 + }, + { + "epoch": 1.7486267382736722, + "grad_norm": 0.5087099348069014, + "learning_rate": 8.170758746970097e-06, + "loss": 11.7138, + "step": 32112 + }, + { + "epoch": 1.7486811922702552, + "grad_norm": 0.5325545484352462, + "learning_rate": 8.167267930715161e-06, + "loss": 11.6696, + "step": 32113 + }, + { + "epoch": 1.7487356462668382, + "grad_norm": 0.5834730509714643, + "learning_rate": 8.163777828566921e-06, + "loss": 11.8353, + "step": 32114 + }, + { + "epoch": 1.7487901002634212, + "grad_norm": 0.5603422351374289, + "learning_rate": 8.160288440552565e-06, + "loss": 11.6679, + "step": 32115 + }, + { + "epoch": 1.7488445542600042, + "grad_norm": 0.509828969330385, + "learning_rate": 8.156799766699186e-06, + "loss": 11.6698, + "step": 32116 + }, + { + "epoch": 1.7488990082565872, + "grad_norm": 0.5374525294611626, + "learning_rate": 8.153311807033958e-06, + "loss": 11.6937, + "step": 32117 + }, + { + "epoch": 1.7489534622531702, + "grad_norm": 0.554043448104786, + "learning_rate": 8.149824561583962e-06, + "loss": 11.734, + "step": 32118 + }, + { + "epoch": 1.7490079162497532, + "grad_norm": 0.5397036647979724, + "learning_rate": 8.146338030376332e-06, + "loss": 11.7771, + "step": 32119 + }, + { + "epoch": 1.7490623702463362, + "grad_norm": 0.5295951749240384, + "learning_rate": 8.14285221343819e-06, + "loss": 11.7543, + "step": 32120 + }, + { + "epoch": 1.7491168242429191, + "grad_norm": 0.5525421236400031, + "learning_rate": 8.139367110796626e-06, + "loss": 11.6253, + "step": 32121 + }, + { + "epoch": 1.7491712782395021, + "grad_norm": 0.5072333533669457, + "learning_rate": 8.135882722478772e-06, + "loss": 11.7333, + "step": 32122 + }, + { + "epoch": 1.7492257322360854, + "grad_norm": 0.4910920049852444, + "learning_rate": 8.132399048511685e-06, + "loss": 11.7351, + "step": 32123 + }, + { + "epoch": 1.7492801862326683, + "grad_norm": 0.5801308156761865, + "learning_rate": 8.128916088922467e-06, + "loss": 11.6644, + "step": 32124 + }, + { + "epoch": 1.7493346402292513, + "grad_norm": 0.6177879020362912, + "learning_rate": 8.125433843738206e-06, + "loss": 11.7532, + "step": 32125 + }, + { + "epoch": 1.7493890942258343, + "grad_norm": 0.5496992994543061, + "learning_rate": 8.12195231298597e-06, + "loss": 11.8126, + "step": 32126 + }, + { + "epoch": 1.7494435482224173, + "grad_norm": 0.5578115138425902, + "learning_rate": 8.118471496692859e-06, + "loss": 11.8202, + "step": 32127 + }, + { + "epoch": 1.7494980022190003, + "grad_norm": 0.5027642151831847, + "learning_rate": 8.114991394885908e-06, + "loss": 11.6968, + "step": 32128 + }, + { + "epoch": 1.7495524562155835, + "grad_norm": 0.51993964938592, + "learning_rate": 8.111512007592181e-06, + "loss": 11.7713, + "step": 32129 + }, + { + "epoch": 1.7496069102121665, + "grad_norm": 0.5210930704701255, + "learning_rate": 8.108033334838771e-06, + "loss": 11.7823, + "step": 32130 + }, + { + "epoch": 1.7496613642087495, + "grad_norm": 0.532312018316325, + "learning_rate": 8.104555376652689e-06, + "loss": 11.5907, + "step": 32131 + }, + { + "epoch": 1.7497158182053325, + "grad_norm": 0.5969973555377257, + "learning_rate": 8.101078133061025e-06, + "loss": 11.8562, + "step": 32132 + }, + { + "epoch": 1.7497702722019155, + "grad_norm": 0.5720486340891017, + "learning_rate": 8.097601604090765e-06, + "loss": 11.6376, + "step": 32133 + }, + { + "epoch": 1.7498247261984985, + "grad_norm": 0.5672396675148712, + "learning_rate": 8.094125789768959e-06, + "loss": 11.8103, + "step": 32134 + }, + { + "epoch": 1.7498791801950815, + "grad_norm": 0.5272982053436494, + "learning_rate": 8.090650690122659e-06, + "loss": 11.6549, + "step": 32135 + }, + { + "epoch": 1.7499336341916645, + "grad_norm": 0.5139232150430204, + "learning_rate": 8.087176305178856e-06, + "loss": 11.7585, + "step": 32136 + }, + { + "epoch": 1.7499880881882475, + "grad_norm": 0.5509708901767132, + "learning_rate": 8.083702634964595e-06, + "loss": 11.7857, + "step": 32137 + }, + { + "epoch": 1.7500425421848305, + "grad_norm": 0.5744520180512087, + "learning_rate": 8.080229679506868e-06, + "loss": 11.7313, + "step": 32138 + }, + { + "epoch": 1.7500969961814135, + "grad_norm": 0.5655000383945293, + "learning_rate": 8.076757438832706e-06, + "loss": 11.9291, + "step": 32139 + }, + { + "epoch": 1.7501514501779964, + "grad_norm": 0.5862390754138455, + "learning_rate": 8.073285912969075e-06, + "loss": 11.8236, + "step": 32140 + }, + { + "epoch": 1.7502059041745794, + "grad_norm": 0.5722237955712688, + "learning_rate": 8.06981510194299e-06, + "loss": 11.9508, + "step": 32141 + }, + { + "epoch": 1.7502603581711624, + "grad_norm": 0.5175385689487015, + "learning_rate": 8.066345005781473e-06, + "loss": 11.7662, + "step": 32142 + }, + { + "epoch": 1.7503148121677454, + "grad_norm": 0.5907442384318156, + "learning_rate": 8.062875624511446e-06, + "loss": 11.767, + "step": 32143 + }, + { + "epoch": 1.7503692661643284, + "grad_norm": 0.5211366070284962, + "learning_rate": 8.05940695815992e-06, + "loss": 11.8284, + "step": 32144 + }, + { + "epoch": 1.7504237201609114, + "grad_norm": 0.5708486403797793, + "learning_rate": 8.055939006753855e-06, + "loss": 11.8896, + "step": 32145 + }, + { + "epoch": 1.7504781741574946, + "grad_norm": 0.5930456298382185, + "learning_rate": 8.052471770320236e-06, + "loss": 11.7559, + "step": 32146 + }, + { + "epoch": 1.7505326281540776, + "grad_norm": 0.5351082534180633, + "learning_rate": 8.049005248886021e-06, + "loss": 11.7867, + "step": 32147 + }, + { + "epoch": 1.7505870821506606, + "grad_norm": 0.5900071507189579, + "learning_rate": 8.045539442478144e-06, + "loss": 11.8933, + "step": 32148 + }, + { + "epoch": 1.7506415361472436, + "grad_norm": 0.5572461041020155, + "learning_rate": 8.042074351123596e-06, + "loss": 11.9106, + "step": 32149 + }, + { + "epoch": 1.7506959901438266, + "grad_norm": 0.6163914801817144, + "learning_rate": 8.038609974849276e-06, + "loss": 11.9965, + "step": 32150 + }, + { + "epoch": 1.7507504441404096, + "grad_norm": 0.7564647393300263, + "learning_rate": 8.035146313682173e-06, + "loss": 11.8039, + "step": 32151 + }, + { + "epoch": 1.7508048981369928, + "grad_norm": 0.48232305983174045, + "learning_rate": 8.03168336764919e-06, + "loss": 11.7294, + "step": 32152 + }, + { + "epoch": 1.7508593521335758, + "grad_norm": 0.4935820149526705, + "learning_rate": 8.028221136777237e-06, + "loss": 11.8181, + "step": 32153 + }, + { + "epoch": 1.7509138061301588, + "grad_norm": 0.5416017889021689, + "learning_rate": 8.024759621093281e-06, + "loss": 11.8525, + "step": 32154 + }, + { + "epoch": 1.7509682601267418, + "grad_norm": 0.5129958786548087, + "learning_rate": 8.021298820624212e-06, + "loss": 11.7635, + "step": 32155 + }, + { + "epoch": 1.7510227141233248, + "grad_norm": 0.5283839041939262, + "learning_rate": 8.017838735396932e-06, + "loss": 11.7942, + "step": 32156 + }, + { + "epoch": 1.7510771681199078, + "grad_norm": 0.5214212576874, + "learning_rate": 8.014379365438396e-06, + "loss": 11.8274, + "step": 32157 + }, + { + "epoch": 1.7511316221164908, + "grad_norm": 0.5444012761202118, + "learning_rate": 8.010920710775437e-06, + "loss": 11.7991, + "step": 32158 + }, + { + "epoch": 1.7511860761130738, + "grad_norm": 0.5768476318510125, + "learning_rate": 8.007462771435015e-06, + "loss": 11.79, + "step": 32159 + }, + { + "epoch": 1.7512405301096567, + "grad_norm": 0.5243840732132846, + "learning_rate": 8.00400554744397e-06, + "loss": 11.6581, + "step": 32160 + }, + { + "epoch": 1.7512949841062397, + "grad_norm": 0.5432265632365408, + "learning_rate": 8.00054903882922e-06, + "loss": 11.8268, + "step": 32161 + }, + { + "epoch": 1.7513494381028227, + "grad_norm": 0.49905626161452704, + "learning_rate": 7.997093245617638e-06, + "loss": 11.8295, + "step": 32162 + }, + { + "epoch": 1.7514038920994057, + "grad_norm": 0.5509428819257189, + "learning_rate": 7.99363816783606e-06, + "loss": 11.8169, + "step": 32163 + }, + { + "epoch": 1.7514583460959887, + "grad_norm": 0.5070706101941347, + "learning_rate": 7.990183805511398e-06, + "loss": 11.6974, + "step": 32164 + }, + { + "epoch": 1.7515128000925717, + "grad_norm": 0.5655243994198613, + "learning_rate": 7.986730158670485e-06, + "loss": 11.8078, + "step": 32165 + }, + { + "epoch": 1.7515672540891547, + "grad_norm": 0.6123305922211647, + "learning_rate": 7.983277227340203e-06, + "loss": 11.9129, + "step": 32166 + }, + { + "epoch": 1.7516217080857377, + "grad_norm": 0.538501649298484, + "learning_rate": 7.979825011547381e-06, + "loss": 11.8663, + "step": 32167 + }, + { + "epoch": 1.7516761620823207, + "grad_norm": 0.5830568664258794, + "learning_rate": 7.976373511318857e-06, + "loss": 11.7276, + "step": 32168 + }, + { + "epoch": 1.7517306160789037, + "grad_norm": 0.5367117301081419, + "learning_rate": 7.972922726681508e-06, + "loss": 11.7983, + "step": 32169 + }, + { + "epoch": 1.751785070075487, + "grad_norm": 0.528729286250279, + "learning_rate": 7.969472657662136e-06, + "loss": 11.6916, + "step": 32170 + }, + { + "epoch": 1.7518395240720699, + "grad_norm": 0.48942031309086037, + "learning_rate": 7.966023304287585e-06, + "loss": 11.7504, + "step": 32171 + }, + { + "epoch": 1.7518939780686529, + "grad_norm": 0.5260973860763183, + "learning_rate": 7.962574666584676e-06, + "loss": 11.8194, + "step": 32172 + }, + { + "epoch": 1.7519484320652359, + "grad_norm": 0.6013993876569956, + "learning_rate": 7.959126744580203e-06, + "loss": 11.9182, + "step": 32173 + }, + { + "epoch": 1.7520028860618189, + "grad_norm": 0.5291298509488411, + "learning_rate": 7.955679538301008e-06, + "loss": 11.6776, + "step": 32174 + }, + { + "epoch": 1.7520573400584019, + "grad_norm": 0.5724929330919186, + "learning_rate": 7.952233047773871e-06, + "loss": 11.6852, + "step": 32175 + }, + { + "epoch": 1.752111794054985, + "grad_norm": 0.5458027967153894, + "learning_rate": 7.948787273025626e-06, + "loss": 11.8398, + "step": 32176 + }, + { + "epoch": 1.752166248051568, + "grad_norm": 0.5948731986463153, + "learning_rate": 7.945342214083029e-06, + "loss": 11.8019, + "step": 32177 + }, + { + "epoch": 1.752220702048151, + "grad_norm": 0.5175419612460501, + "learning_rate": 7.941897870972881e-06, + "loss": 11.8234, + "step": 32178 + }, + { + "epoch": 1.752275156044734, + "grad_norm": 0.5192168255454845, + "learning_rate": 7.938454243722004e-06, + "loss": 11.7962, + "step": 32179 + }, + { + "epoch": 1.752329610041317, + "grad_norm": 0.5530401944175701, + "learning_rate": 7.935011332357112e-06, + "loss": 11.8509, + "step": 32180 + }, + { + "epoch": 1.7523840640379, + "grad_norm": 0.5483546100052389, + "learning_rate": 7.931569136905048e-06, + "loss": 11.8307, + "step": 32181 + }, + { + "epoch": 1.752438518034483, + "grad_norm": 0.5308029554244464, + "learning_rate": 7.928127657392526e-06, + "loss": 11.7503, + "step": 32182 + }, + { + "epoch": 1.752492972031066, + "grad_norm": 0.6463200199516159, + "learning_rate": 7.92468689384631e-06, + "loss": 11.9085, + "step": 32183 + }, + { + "epoch": 1.752547426027649, + "grad_norm": 0.6052874888046295, + "learning_rate": 7.921246846293195e-06, + "loss": 11.8176, + "step": 32184 + }, + { + "epoch": 1.752601880024232, + "grad_norm": 0.5277665412074546, + "learning_rate": 7.917807514759879e-06, + "loss": 11.7246, + "step": 32185 + }, + { + "epoch": 1.752656334020815, + "grad_norm": 0.5577879226699427, + "learning_rate": 7.914368899273161e-06, + "loss": 11.8003, + "step": 32186 + }, + { + "epoch": 1.752710788017398, + "grad_norm": 0.5877162949416089, + "learning_rate": 7.910930999859734e-06, + "loss": 11.836, + "step": 32187 + }, + { + "epoch": 1.752765242013981, + "grad_norm": 0.5768428727662831, + "learning_rate": 7.907493816546362e-06, + "loss": 11.7091, + "step": 32188 + }, + { + "epoch": 1.752819696010564, + "grad_norm": 0.5081975865111498, + "learning_rate": 7.90405734935975e-06, + "loss": 11.7753, + "step": 32189 + }, + { + "epoch": 1.752874150007147, + "grad_norm": 0.5409616280016912, + "learning_rate": 7.90062159832663e-06, + "loss": 11.8314, + "step": 32190 + }, + { + "epoch": 1.75292860400373, + "grad_norm": 0.6280703923892754, + "learning_rate": 7.897186563473735e-06, + "loss": 11.8578, + "step": 32191 + }, + { + "epoch": 1.752983058000313, + "grad_norm": 0.5890986189124914, + "learning_rate": 7.893752244827768e-06, + "loss": 11.787, + "step": 32192 + }, + { + "epoch": 1.7530375119968962, + "grad_norm": 0.5070755262227107, + "learning_rate": 7.89031864241543e-06, + "loss": 11.7177, + "step": 32193 + }, + { + "epoch": 1.7530919659934792, + "grad_norm": 0.49700882147831205, + "learning_rate": 7.886885756263407e-06, + "loss": 11.8008, + "step": 32194 + }, + { + "epoch": 1.7531464199900622, + "grad_norm": 0.5594109306667665, + "learning_rate": 7.883453586398404e-06, + "loss": 11.7856, + "step": 32195 + }, + { + "epoch": 1.7532008739866451, + "grad_norm": 0.5707705400748605, + "learning_rate": 7.88002213284712e-06, + "loss": 11.7651, + "step": 32196 + }, + { + "epoch": 1.7532553279832281, + "grad_norm": 0.52791498707753, + "learning_rate": 7.876591395636234e-06, + "loss": 11.799, + "step": 32197 + }, + { + "epoch": 1.7533097819798111, + "grad_norm": 0.5885877943751425, + "learning_rate": 7.873161374792426e-06, + "loss": 11.6264, + "step": 32198 + }, + { + "epoch": 1.7533642359763943, + "grad_norm": 0.5577006781823926, + "learning_rate": 7.86973207034235e-06, + "loss": 11.71, + "step": 32199 + }, + { + "epoch": 1.7534186899729773, + "grad_norm": 0.5008494017413169, + "learning_rate": 7.86630348231271e-06, + "loss": 11.7965, + "step": 32200 + }, + { + "epoch": 1.7534731439695603, + "grad_norm": 0.5661881923445657, + "learning_rate": 7.862875610730125e-06, + "loss": 11.884, + "step": 32201 + }, + { + "epoch": 1.7535275979661433, + "grad_norm": 0.6334994442026655, + "learning_rate": 7.859448455621288e-06, + "loss": 11.8528, + "step": 32202 + }, + { + "epoch": 1.7535820519627263, + "grad_norm": 0.5008431638330966, + "learning_rate": 7.856022017012832e-06, + "loss": 11.7394, + "step": 32203 + }, + { + "epoch": 1.7536365059593093, + "grad_norm": 0.5261220463483217, + "learning_rate": 7.852596294931391e-06, + "loss": 11.7669, + "step": 32204 + }, + { + "epoch": 1.7536909599558923, + "grad_norm": 0.5660318987309142, + "learning_rate": 7.8491712894036e-06, + "loss": 11.7858, + "step": 32205 + }, + { + "epoch": 1.7537454139524753, + "grad_norm": 0.5383905058425613, + "learning_rate": 7.845747000456138e-06, + "loss": 11.7916, + "step": 32206 + }, + { + "epoch": 1.7537998679490583, + "grad_norm": 0.5486025497378929, + "learning_rate": 7.842323428115572e-06, + "loss": 11.8162, + "step": 32207 + }, + { + "epoch": 1.7538543219456413, + "grad_norm": 0.5545640399730635, + "learning_rate": 7.838900572408581e-06, + "loss": 11.812, + "step": 32208 + }, + { + "epoch": 1.7539087759422243, + "grad_norm": 0.5135894571875912, + "learning_rate": 7.835478433361732e-06, + "loss": 11.7503, + "step": 32209 + }, + { + "epoch": 1.7539632299388073, + "grad_norm": 0.5727064257373675, + "learning_rate": 7.83205701100168e-06, + "loss": 11.7242, + "step": 32210 + }, + { + "epoch": 1.7540176839353903, + "grad_norm": 0.48632272167826107, + "learning_rate": 7.828636305354986e-06, + "loss": 11.8623, + "step": 32211 + }, + { + "epoch": 1.7540721379319733, + "grad_norm": 0.5449425851250413, + "learning_rate": 7.825216316448292e-06, + "loss": 11.7668, + "step": 32212 + }, + { + "epoch": 1.7541265919285562, + "grad_norm": 0.5409883419744331, + "learning_rate": 7.821797044308177e-06, + "loss": 11.8818, + "step": 32213 + }, + { + "epoch": 1.7541810459251392, + "grad_norm": 0.5330606175454291, + "learning_rate": 7.818378488961208e-06, + "loss": 11.8365, + "step": 32214 + }, + { + "epoch": 1.7542354999217222, + "grad_norm": 0.5497375834604316, + "learning_rate": 7.814960650434011e-06, + "loss": 11.8318, + "step": 32215 + }, + { + "epoch": 1.7542899539183054, + "grad_norm": 0.6446595210524717, + "learning_rate": 7.811543528753106e-06, + "loss": 11.8336, + "step": 32216 + }, + { + "epoch": 1.7543444079148884, + "grad_norm": 0.5698699944627266, + "learning_rate": 7.808127123945108e-06, + "loss": 11.6559, + "step": 32217 + }, + { + "epoch": 1.7543988619114714, + "grad_norm": 0.522611867715547, + "learning_rate": 7.804711436036593e-06, + "loss": 11.7201, + "step": 32218 + }, + { + "epoch": 1.7544533159080544, + "grad_norm": 0.5714231990359632, + "learning_rate": 7.801296465054087e-06, + "loss": 11.8032, + "step": 32219 + }, + { + "epoch": 1.7545077699046374, + "grad_norm": 0.540221801484937, + "learning_rate": 7.797882211024177e-06, + "loss": 11.8662, + "step": 32220 + }, + { + "epoch": 1.7545622239012204, + "grad_norm": 0.5585542781002193, + "learning_rate": 7.794468673973376e-06, + "loss": 11.6747, + "step": 32221 + }, + { + "epoch": 1.7546166778978036, + "grad_norm": 0.5320363297074058, + "learning_rate": 7.791055853928264e-06, + "loss": 11.7608, + "step": 32222 + }, + { + "epoch": 1.7546711318943866, + "grad_norm": 0.5462074931922277, + "learning_rate": 7.787643750915374e-06, + "loss": 11.559, + "step": 32223 + }, + { + "epoch": 1.7547255858909696, + "grad_norm": 0.535820239283221, + "learning_rate": 7.784232364961208e-06, + "loss": 11.7968, + "step": 32224 + }, + { + "epoch": 1.7547800398875526, + "grad_norm": 0.5547533369282873, + "learning_rate": 7.780821696092333e-06, + "loss": 11.877, + "step": 32225 + }, + { + "epoch": 1.7548344938841356, + "grad_norm": 0.6763425151967221, + "learning_rate": 7.777411744335238e-06, + "loss": 11.8638, + "step": 32226 + }, + { + "epoch": 1.7548889478807186, + "grad_norm": 0.5207995028623225, + "learning_rate": 7.77400250971645e-06, + "loss": 11.7608, + "step": 32227 + }, + { + "epoch": 1.7549434018773016, + "grad_norm": 0.5806136414228242, + "learning_rate": 7.77059399226251e-06, + "loss": 11.7124, + "step": 32228 + }, + { + "epoch": 1.7549978558738846, + "grad_norm": 0.5977322860435186, + "learning_rate": 7.767186191999876e-06, + "loss": 11.6959, + "step": 32229 + }, + { + "epoch": 1.7550523098704676, + "grad_norm": 0.5416504793801568, + "learning_rate": 7.763779108955094e-06, + "loss": 11.7832, + "step": 32230 + }, + { + "epoch": 1.7551067638670506, + "grad_norm": 0.567089416905431, + "learning_rate": 7.76037274315461e-06, + "loss": 11.8185, + "step": 32231 + }, + { + "epoch": 1.7551612178636335, + "grad_norm": 0.5591491809986737, + "learning_rate": 7.756967094624946e-06, + "loss": 11.7368, + "step": 32232 + }, + { + "epoch": 1.7552156718602165, + "grad_norm": 0.5325861996181607, + "learning_rate": 7.753562163392592e-06, + "loss": 11.7578, + "step": 32233 + }, + { + "epoch": 1.7552701258567995, + "grad_norm": 0.5506910665304571, + "learning_rate": 7.750157949483983e-06, + "loss": 11.8254, + "step": 32234 + }, + { + "epoch": 1.7553245798533825, + "grad_norm": 0.5056868603709823, + "learning_rate": 7.746754452925631e-06, + "loss": 11.7153, + "step": 32235 + }, + { + "epoch": 1.7553790338499655, + "grad_norm": 0.5344924082242175, + "learning_rate": 7.74335167374397e-06, + "loss": 11.7105, + "step": 32236 + }, + { + "epoch": 1.7554334878465485, + "grad_norm": 0.5599740681400854, + "learning_rate": 7.739949611965491e-06, + "loss": 11.8362, + "step": 32237 + }, + { + "epoch": 1.7554879418431315, + "grad_norm": 0.5271103339849061, + "learning_rate": 7.736548267616628e-06, + "loss": 11.5224, + "step": 32238 + }, + { + "epoch": 1.7555423958397145, + "grad_norm": 0.5685243285199002, + "learning_rate": 7.733147640723837e-06, + "loss": 11.946, + "step": 32239 + }, + { + "epoch": 1.7555968498362977, + "grad_norm": 0.5300780559426779, + "learning_rate": 7.729747731313574e-06, + "loss": 11.7598, + "step": 32240 + }, + { + "epoch": 1.7556513038328807, + "grad_norm": 0.5029465535898692, + "learning_rate": 7.726348539412254e-06, + "loss": 11.7824, + "step": 32241 + }, + { + "epoch": 1.7557057578294637, + "grad_norm": 0.5929412511542949, + "learning_rate": 7.72295006504633e-06, + "loss": 11.836, + "step": 32242 + }, + { + "epoch": 1.7557602118260467, + "grad_norm": 0.565940440276524, + "learning_rate": 7.719552308242239e-06, + "loss": 11.7975, + "step": 32243 + }, + { + "epoch": 1.7558146658226297, + "grad_norm": 0.5574182071318897, + "learning_rate": 7.716155269026349e-06, + "loss": 11.9, + "step": 32244 + }, + { + "epoch": 1.7558691198192127, + "grad_norm": 0.49909450072414313, + "learning_rate": 7.712758947425147e-06, + "loss": 11.7809, + "step": 32245 + }, + { + "epoch": 1.755923573815796, + "grad_norm": 0.5235263892340182, + "learning_rate": 7.709363343464982e-06, + "loss": 11.7689, + "step": 32246 + }, + { + "epoch": 1.7559780278123789, + "grad_norm": 0.5664352797917843, + "learning_rate": 7.705968457172297e-06, + "loss": 11.7751, + "step": 32247 + }, + { + "epoch": 1.7560324818089619, + "grad_norm": 0.5830060694994326, + "learning_rate": 7.702574288573461e-06, + "loss": 11.8712, + "step": 32248 + }, + { + "epoch": 1.7560869358055449, + "grad_norm": 0.5566821559668119, + "learning_rate": 7.699180837694908e-06, + "loss": 11.8874, + "step": 32249 + }, + { + "epoch": 1.7561413898021279, + "grad_norm": 0.5491128300321498, + "learning_rate": 7.695788104562984e-06, + "loss": 11.9275, + "step": 32250 + }, + { + "epoch": 1.7561958437987109, + "grad_norm": 0.5680822424895551, + "learning_rate": 7.692396089204101e-06, + "loss": 11.8738, + "step": 32251 + }, + { + "epoch": 1.7562502977952938, + "grad_norm": 0.6692234037475192, + "learning_rate": 7.68900479164464e-06, + "loss": 11.7829, + "step": 32252 + }, + { + "epoch": 1.7563047517918768, + "grad_norm": 0.5513177088985287, + "learning_rate": 7.685614211910951e-06, + "loss": 11.7933, + "step": 32253 + }, + { + "epoch": 1.7563592057884598, + "grad_norm": 0.6010922202352793, + "learning_rate": 7.682224350029387e-06, + "loss": 11.7873, + "step": 32254 + }, + { + "epoch": 1.7564136597850428, + "grad_norm": 0.5554814044252768, + "learning_rate": 7.678835206026357e-06, + "loss": 11.7286, + "step": 32255 + }, + { + "epoch": 1.7564681137816258, + "grad_norm": 0.5534693096387924, + "learning_rate": 7.675446779928163e-06, + "loss": 11.8511, + "step": 32256 + }, + { + "epoch": 1.7565225677782088, + "grad_norm": 0.5657564163925257, + "learning_rate": 7.672059071761196e-06, + "loss": 11.8573, + "step": 32257 + }, + { + "epoch": 1.7565770217747918, + "grad_norm": 0.5257295790399134, + "learning_rate": 7.668672081551765e-06, + "loss": 11.8729, + "step": 32258 + }, + { + "epoch": 1.7566314757713748, + "grad_norm": 0.5803789243855803, + "learning_rate": 7.665285809326239e-06, + "loss": 11.8916, + "step": 32259 + }, + { + "epoch": 1.7566859297679578, + "grad_norm": 0.6112079479631757, + "learning_rate": 7.66190025511091e-06, + "loss": 11.8393, + "step": 32260 + }, + { + "epoch": 1.7567403837645408, + "grad_norm": 0.5415121407551673, + "learning_rate": 7.658515418932144e-06, + "loss": 11.868, + "step": 32261 + }, + { + "epoch": 1.7567948377611238, + "grad_norm": 0.5504172781674589, + "learning_rate": 7.655131300816254e-06, + "loss": 11.7321, + "step": 32262 + }, + { + "epoch": 1.756849291757707, + "grad_norm": 0.5419586328626931, + "learning_rate": 7.651747900789551e-06, + "loss": 11.7968, + "step": 32263 + }, + { + "epoch": 1.75690374575429, + "grad_norm": 0.49878985276194937, + "learning_rate": 7.64836521887835e-06, + "loss": 11.8255, + "step": 32264 + }, + { + "epoch": 1.756958199750873, + "grad_norm": 0.5384417709443856, + "learning_rate": 7.644983255108928e-06, + "loss": 11.6968, + "step": 32265 + }, + { + "epoch": 1.757012653747456, + "grad_norm": 0.5099488674337725, + "learning_rate": 7.64160200950761e-06, + "loss": 11.6918, + "step": 32266 + }, + { + "epoch": 1.757067107744039, + "grad_norm": 0.5744231889499453, + "learning_rate": 7.638221482100694e-06, + "loss": 11.7467, + "step": 32267 + }, + { + "epoch": 1.757121561740622, + "grad_norm": 0.6547415147829072, + "learning_rate": 7.63484167291444e-06, + "loss": 11.7437, + "step": 32268 + }, + { + "epoch": 1.7571760157372052, + "grad_norm": 0.5257627177962527, + "learning_rate": 7.631462581975158e-06, + "loss": 11.6708, + "step": 32269 + }, + { + "epoch": 1.7572304697337882, + "grad_norm": 0.5846229757023956, + "learning_rate": 7.628084209309106e-06, + "loss": 11.8139, + "step": 32270 + }, + { + "epoch": 1.7572849237303712, + "grad_norm": 0.5961961171614157, + "learning_rate": 7.624706554942573e-06, + "loss": 11.8151, + "step": 32271 + }, + { + "epoch": 1.7573393777269541, + "grad_norm": 0.5271634897816913, + "learning_rate": 7.621329618901796e-06, + "loss": 11.8484, + "step": 32272 + }, + { + "epoch": 1.7573938317235371, + "grad_norm": 0.490433808307365, + "learning_rate": 7.617953401213074e-06, + "loss": 11.7982, + "step": 32273 + }, + { + "epoch": 1.7574482857201201, + "grad_norm": 0.5442882276878024, + "learning_rate": 7.614577901902631e-06, + "loss": 11.7319, + "step": 32274 + }, + { + "epoch": 1.7575027397167031, + "grad_norm": 0.5778531847508808, + "learning_rate": 7.6112031209967125e-06, + "loss": 11.8007, + "step": 32275 + }, + { + "epoch": 1.7575571937132861, + "grad_norm": 0.494578925015559, + "learning_rate": 7.607829058521576e-06, + "loss": 11.7508, + "step": 32276 + }, + { + "epoch": 1.757611647709869, + "grad_norm": 0.531911092636844, + "learning_rate": 7.604455714503467e-06, + "loss": 11.7154, + "step": 32277 + }, + { + "epoch": 1.757666101706452, + "grad_norm": 0.5440454427937312, + "learning_rate": 7.601083088968586e-06, + "loss": 11.8183, + "step": 32278 + }, + { + "epoch": 1.757720555703035, + "grad_norm": 0.4721846534624302, + "learning_rate": 7.59771118194319e-06, + "loss": 11.5942, + "step": 32279 + }, + { + "epoch": 1.757775009699618, + "grad_norm": 0.5711021111911527, + "learning_rate": 7.594339993453481e-06, + "loss": 11.7782, + "step": 32280 + }, + { + "epoch": 1.757829463696201, + "grad_norm": 0.5456321476474515, + "learning_rate": 7.590969523525693e-06, + "loss": 11.8084, + "step": 32281 + }, + { + "epoch": 1.757883917692784, + "grad_norm": 0.6153892555574109, + "learning_rate": 7.587599772186005e-06, + "loss": 11.7252, + "step": 32282 + }, + { + "epoch": 1.757938371689367, + "grad_norm": 0.5864348175280548, + "learning_rate": 7.584230739460663e-06, + "loss": 11.8638, + "step": 32283 + }, + { + "epoch": 1.75799282568595, + "grad_norm": 0.5530913718208338, + "learning_rate": 7.5808624253758345e-06, + "loss": 11.7893, + "step": 32284 + }, + { + "epoch": 1.758047279682533, + "grad_norm": 0.5258033004590266, + "learning_rate": 7.577494829957698e-06, + "loss": 11.8255, + "step": 32285 + }, + { + "epoch": 1.7581017336791163, + "grad_norm": 0.5209343895028691, + "learning_rate": 7.57412795323249e-06, + "loss": 11.7554, + "step": 32286 + }, + { + "epoch": 1.7581561876756993, + "grad_norm": 0.6293390912174309, + "learning_rate": 7.570761795226333e-06, + "loss": 11.704, + "step": 32287 + }, + { + "epoch": 1.7582106416722822, + "grad_norm": 0.5319007120541434, + "learning_rate": 7.567396355965439e-06, + "loss": 11.8324, + "step": 32288 + }, + { + "epoch": 1.7582650956688652, + "grad_norm": 0.580508535233131, + "learning_rate": 7.564031635475988e-06, + "loss": 11.9045, + "step": 32289 + }, + { + "epoch": 1.7583195496654482, + "grad_norm": 0.5068231410167017, + "learning_rate": 7.560667633784113e-06, + "loss": 11.7702, + "step": 32290 + }, + { + "epoch": 1.7583740036620312, + "grad_norm": 0.5250683827358754, + "learning_rate": 7.5573043509160055e-06, + "loss": 11.7798, + "step": 32291 + }, + { + "epoch": 1.7584284576586144, + "grad_norm": 0.5100140078716555, + "learning_rate": 7.553941786897778e-06, + "loss": 11.7662, + "step": 32292 + }, + { + "epoch": 1.7584829116551974, + "grad_norm": 0.5553137085994898, + "learning_rate": 7.5505799417556315e-06, + "loss": 11.6263, + "step": 32293 + }, + { + "epoch": 1.7585373656517804, + "grad_norm": 0.49266713690707487, + "learning_rate": 7.5472188155156666e-06, + "loss": 11.682, + "step": 32294 + }, + { + "epoch": 1.7585918196483634, + "grad_norm": 0.6093620570986684, + "learning_rate": 7.54385840820403e-06, + "loss": 11.726, + "step": 32295 + }, + { + "epoch": 1.7586462736449464, + "grad_norm": 0.5638044436141385, + "learning_rate": 7.540498719846856e-06, + "loss": 11.7721, + "step": 32296 + }, + { + "epoch": 1.7587007276415294, + "grad_norm": 0.5268638154805048, + "learning_rate": 7.537139750470268e-06, + "loss": 11.7714, + "step": 32297 + }, + { + "epoch": 1.7587551816381124, + "grad_norm": 0.5547729103595103, + "learning_rate": 7.5337815001003895e-06, + "loss": 11.6223, + "step": 32298 + }, + { + "epoch": 1.7588096356346954, + "grad_norm": 0.5643332879019087, + "learning_rate": 7.530423968763323e-06, + "loss": 11.7611, + "step": 32299 + }, + { + "epoch": 1.7588640896312784, + "grad_norm": 0.5092644379332668, + "learning_rate": 7.52706715648519e-06, + "loss": 11.7484, + "step": 32300 + }, + { + "epoch": 1.7589185436278614, + "grad_norm": 0.6123002738172678, + "learning_rate": 7.523711063292105e-06, + "loss": 12.0026, + "step": 32301 + }, + { + "epoch": 1.7589729976244444, + "grad_norm": 0.5364123042148184, + "learning_rate": 7.520355689210134e-06, + "loss": 11.615, + "step": 32302 + }, + { + "epoch": 1.7590274516210274, + "grad_norm": 0.5853044926867247, + "learning_rate": 7.517001034265403e-06, + "loss": 11.8361, + "step": 32303 + }, + { + "epoch": 1.7590819056176104, + "grad_norm": 0.5177055192904207, + "learning_rate": 7.513647098483978e-06, + "loss": 11.8876, + "step": 32304 + }, + { + "epoch": 1.7591363596141933, + "grad_norm": 0.709516664642074, + "learning_rate": 7.5102938818919274e-06, + "loss": 11.888, + "step": 32305 + }, + { + "epoch": 1.7591908136107763, + "grad_norm": 0.5587832619399481, + "learning_rate": 7.506941384515365e-06, + "loss": 11.7161, + "step": 32306 + }, + { + "epoch": 1.7592452676073593, + "grad_norm": 0.5509328295572602, + "learning_rate": 7.503589606380312e-06, + "loss": 11.8065, + "step": 32307 + }, + { + "epoch": 1.7592997216039423, + "grad_norm": 0.5763956979013817, + "learning_rate": 7.5002385475128835e-06, + "loss": 11.7853, + "step": 32308 + }, + { + "epoch": 1.7593541756005253, + "grad_norm": 0.5897447799682455, + "learning_rate": 7.4968882079390905e-06, + "loss": 11.8323, + "step": 32309 + }, + { + "epoch": 1.7594086295971085, + "grad_norm": 0.5614212276382561, + "learning_rate": 7.4935385876850114e-06, + "loss": 11.8205, + "step": 32310 + }, + { + "epoch": 1.7594630835936915, + "grad_norm": 0.5580453990328673, + "learning_rate": 7.4901896867766944e-06, + "loss": 11.8253, + "step": 32311 + }, + { + "epoch": 1.7595175375902745, + "grad_norm": 0.5768845641338056, + "learning_rate": 7.486841505240172e-06, + "loss": 11.8223, + "step": 32312 + }, + { + "epoch": 1.7595719915868575, + "grad_norm": 0.5870514910026379, + "learning_rate": 7.483494043101514e-06, + "loss": 11.848, + "step": 32313 + }, + { + "epoch": 1.7596264455834405, + "grad_norm": 0.5314168646702158, + "learning_rate": 7.4801473003866864e-06, + "loss": 11.86, + "step": 32314 + }, + { + "epoch": 1.7596808995800237, + "grad_norm": 0.620104276143737, + "learning_rate": 7.476801277121748e-06, + "loss": 11.8212, + "step": 32315 + }, + { + "epoch": 1.7597353535766067, + "grad_norm": 0.5435303021810113, + "learning_rate": 7.473455973332743e-06, + "loss": 11.7835, + "step": 32316 + }, + { + "epoch": 1.7597898075731897, + "grad_norm": 0.5963718671984589, + "learning_rate": 7.470111389045642e-06, + "loss": 11.7939, + "step": 32317 + }, + { + "epoch": 1.7598442615697727, + "grad_norm": 0.5304119456924522, + "learning_rate": 7.466767524286477e-06, + "loss": 11.8243, + "step": 32318 + }, + { + "epoch": 1.7598987155663557, + "grad_norm": 0.553422404781632, + "learning_rate": 7.463424379081241e-06, + "loss": 11.9248, + "step": 32319 + }, + { + "epoch": 1.7599531695629387, + "grad_norm": 0.5249786498460067, + "learning_rate": 7.460081953455955e-06, + "loss": 11.8219, + "step": 32320 + }, + { + "epoch": 1.7600076235595217, + "grad_norm": 0.5083800382337074, + "learning_rate": 7.456740247436567e-06, + "loss": 11.8487, + "step": 32321 + }, + { + "epoch": 1.7600620775561047, + "grad_norm": 0.5440220910278729, + "learning_rate": 7.4533992610490875e-06, + "loss": 11.7141, + "step": 32322 + }, + { + "epoch": 1.7601165315526877, + "grad_norm": 0.48060071553130085, + "learning_rate": 7.450058994319531e-06, + "loss": 11.7604, + "step": 32323 + }, + { + "epoch": 1.7601709855492707, + "grad_norm": 0.5719979749730558, + "learning_rate": 7.446719447273798e-06, + "loss": 11.8195, + "step": 32324 + }, + { + "epoch": 1.7602254395458536, + "grad_norm": 0.535382825884724, + "learning_rate": 7.443380619937901e-06, + "loss": 11.7395, + "step": 32325 + }, + { + "epoch": 1.7602798935424366, + "grad_norm": 0.6219871745107087, + "learning_rate": 7.440042512337808e-06, + "loss": 11.9031, + "step": 32326 + }, + { + "epoch": 1.7603343475390196, + "grad_norm": 0.5571690019897455, + "learning_rate": 7.4367051244994545e-06, + "loss": 11.8527, + "step": 32327 + }, + { + "epoch": 1.7603888015356026, + "grad_norm": 0.5644400894009494, + "learning_rate": 7.43336845644882e-06, + "loss": 11.9148, + "step": 32328 + }, + { + "epoch": 1.7604432555321856, + "grad_norm": 0.5039294177055477, + "learning_rate": 7.430032508211826e-06, + "loss": 11.8055, + "step": 32329 + }, + { + "epoch": 1.7604977095287686, + "grad_norm": 0.5505615147334901, + "learning_rate": 7.426697279814432e-06, + "loss": 11.7359, + "step": 32330 + }, + { + "epoch": 1.7605521635253516, + "grad_norm": 0.5532087565581637, + "learning_rate": 7.4233627712825495e-06, + "loss": 11.7698, + "step": 32331 + }, + { + "epoch": 1.7606066175219346, + "grad_norm": 0.5392862202807782, + "learning_rate": 7.420028982642124e-06, + "loss": 11.7708, + "step": 32332 + }, + { + "epoch": 1.7606610715185178, + "grad_norm": 0.5398926937844774, + "learning_rate": 7.416695913919114e-06, + "loss": 11.8584, + "step": 32333 + }, + { + "epoch": 1.7607155255151008, + "grad_norm": 0.5723571536725904, + "learning_rate": 7.413363565139364e-06, + "loss": 11.8024, + "step": 32334 + }, + { + "epoch": 1.7607699795116838, + "grad_norm": 0.5295461305273266, + "learning_rate": 7.410031936328854e-06, + "loss": 11.6689, + "step": 32335 + }, + { + "epoch": 1.7608244335082668, + "grad_norm": 0.5634649371573618, + "learning_rate": 7.40670102751343e-06, + "loss": 11.6344, + "step": 32336 + }, + { + "epoch": 1.7608788875048498, + "grad_norm": 0.5427299651954265, + "learning_rate": 7.403370838719037e-06, + "loss": 11.6804, + "step": 32337 + }, + { + "epoch": 1.7609333415014328, + "grad_norm": 0.5157469887731284, + "learning_rate": 7.400041369971577e-06, + "loss": 11.682, + "step": 32338 + }, + { + "epoch": 1.760987795498016, + "grad_norm": 0.6181380236442666, + "learning_rate": 7.3967126212969085e-06, + "loss": 11.8238, + "step": 32339 + }, + { + "epoch": 1.761042249494599, + "grad_norm": 0.5783597891575023, + "learning_rate": 7.393384592720942e-06, + "loss": 11.7492, + "step": 32340 + }, + { + "epoch": 1.761096703491182, + "grad_norm": 0.5322414404189337, + "learning_rate": 7.390057284269536e-06, + "loss": 11.8629, + "step": 32341 + }, + { + "epoch": 1.761151157487765, + "grad_norm": 0.5611333027155997, + "learning_rate": 7.3867306959685915e-06, + "loss": 11.8159, + "step": 32342 + }, + { + "epoch": 1.761205611484348, + "grad_norm": 0.5706571472987123, + "learning_rate": 7.383404827843965e-06, + "loss": 11.838, + "step": 32343 + }, + { + "epoch": 1.761260065480931, + "grad_norm": 0.6170261038711772, + "learning_rate": 7.380079679921493e-06, + "loss": 11.876, + "step": 32344 + }, + { + "epoch": 1.761314519477514, + "grad_norm": 0.5252955293287406, + "learning_rate": 7.376755252227063e-06, + "loss": 11.843, + "step": 32345 + }, + { + "epoch": 1.761368973474097, + "grad_norm": 0.5390535813073829, + "learning_rate": 7.373431544786513e-06, + "loss": 11.7998, + "step": 32346 + }, + { + "epoch": 1.76142342747068, + "grad_norm": 0.537408796287597, + "learning_rate": 7.3701085576256986e-06, + "loss": 11.8537, + "step": 32347 + }, + { + "epoch": 1.761477881467263, + "grad_norm": 0.5563103088766296, + "learning_rate": 7.366786290770445e-06, + "loss": 11.7829, + "step": 32348 + }, + { + "epoch": 1.761532335463846, + "grad_norm": 0.5364377866093762, + "learning_rate": 7.363464744246596e-06, + "loss": 11.8085, + "step": 32349 + }, + { + "epoch": 1.761586789460429, + "grad_norm": 0.5351507697297039, + "learning_rate": 7.360143918079987e-06, + "loss": 11.8374, + "step": 32350 + }, + { + "epoch": 1.761641243457012, + "grad_norm": 0.5204197436056024, + "learning_rate": 7.356823812296432e-06, + "loss": 11.8803, + "step": 32351 + }, + { + "epoch": 1.7616956974535949, + "grad_norm": 0.5480532340536863, + "learning_rate": 7.353504426921754e-06, + "loss": 11.7533, + "step": 32352 + }, + { + "epoch": 1.7617501514501779, + "grad_norm": 0.5295869301956416, + "learning_rate": 7.350185761981776e-06, + "loss": 11.8076, + "step": 32353 + }, + { + "epoch": 1.7618046054467609, + "grad_norm": 0.4870461952741857, + "learning_rate": 7.346867817502267e-06, + "loss": 11.8195, + "step": 32354 + }, + { + "epoch": 1.7618590594433439, + "grad_norm": 0.5677620771240898, + "learning_rate": 7.3435505935090635e-06, + "loss": 11.7581, + "step": 32355 + }, + { + "epoch": 1.761913513439927, + "grad_norm": 0.5574819696135239, + "learning_rate": 7.340234090027942e-06, + "loss": 11.6624, + "step": 32356 + }, + { + "epoch": 1.76196796743651, + "grad_norm": 0.590936554342043, + "learning_rate": 7.336918307084717e-06, + "loss": 11.7542, + "step": 32357 + }, + { + "epoch": 1.762022421433093, + "grad_norm": 0.5000063304337666, + "learning_rate": 7.333603244705134e-06, + "loss": 11.8136, + "step": 32358 + }, + { + "epoch": 1.762076875429676, + "grad_norm": 0.5676488735060264, + "learning_rate": 7.330288902914983e-06, + "loss": 11.8305, + "step": 32359 + }, + { + "epoch": 1.762131329426259, + "grad_norm": 0.6944711710574692, + "learning_rate": 7.326975281740078e-06, + "loss": 11.6905, + "step": 32360 + }, + { + "epoch": 1.762185783422842, + "grad_norm": 0.5067506588848002, + "learning_rate": 7.323662381206131e-06, + "loss": 11.891, + "step": 32361 + }, + { + "epoch": 1.7622402374194253, + "grad_norm": 0.5903404600956154, + "learning_rate": 7.320350201338944e-06, + "loss": 11.7509, + "step": 32362 + }, + { + "epoch": 1.7622946914160083, + "grad_norm": 0.6223132012732371, + "learning_rate": 7.31703874216425e-06, + "loss": 11.8712, + "step": 32363 + }, + { + "epoch": 1.7623491454125912, + "grad_norm": 0.5699465982485366, + "learning_rate": 7.3137280037077985e-06, + "loss": 11.7497, + "step": 32364 + }, + { + "epoch": 1.7624035994091742, + "grad_norm": 0.5382094454685953, + "learning_rate": 7.310417985995355e-06, + "loss": 11.7891, + "step": 32365 + }, + { + "epoch": 1.7624580534057572, + "grad_norm": 0.9520201525640292, + "learning_rate": 7.307108689052633e-06, + "loss": 11.7321, + "step": 32366 + }, + { + "epoch": 1.7625125074023402, + "grad_norm": 0.6391005015733988, + "learning_rate": 7.30380011290539e-06, + "loss": 11.9134, + "step": 32367 + }, + { + "epoch": 1.7625669613989232, + "grad_norm": 0.581569098823044, + "learning_rate": 7.300492257579327e-06, + "loss": 11.9759, + "step": 32368 + }, + { + "epoch": 1.7626214153955062, + "grad_norm": 0.5519937761675575, + "learning_rate": 7.297185123100192e-06, + "loss": 11.9339, + "step": 32369 + }, + { + "epoch": 1.7626758693920892, + "grad_norm": 0.5469487552409232, + "learning_rate": 7.293878709493684e-06, + "loss": 11.7986, + "step": 32370 + }, + { + "epoch": 1.7627303233886722, + "grad_norm": 0.5260441176529977, + "learning_rate": 7.290573016785518e-06, + "loss": 11.6944, + "step": 32371 + }, + { + "epoch": 1.7627847773852552, + "grad_norm": 0.5805763485213549, + "learning_rate": 7.287268045001428e-06, + "loss": 11.8504, + "step": 32372 + }, + { + "epoch": 1.7628392313818382, + "grad_norm": 0.5655698371088128, + "learning_rate": 7.283963794167081e-06, + "loss": 11.7975, + "step": 32373 + }, + { + "epoch": 1.7628936853784212, + "grad_norm": 0.5660394550762327, + "learning_rate": 7.28066026430817e-06, + "loss": 11.7062, + "step": 32374 + }, + { + "epoch": 1.7629481393750042, + "grad_norm": 0.4911911745799803, + "learning_rate": 7.277357455450407e-06, + "loss": 11.85, + "step": 32375 + }, + { + "epoch": 1.7630025933715872, + "grad_norm": 0.5423764304606848, + "learning_rate": 7.274055367619437e-06, + "loss": 11.7692, + "step": 32376 + }, + { + "epoch": 1.7630570473681701, + "grad_norm": 0.5731570304029355, + "learning_rate": 7.270754000840985e-06, + "loss": 11.6807, + "step": 32377 + }, + { + "epoch": 1.7631115013647531, + "grad_norm": 0.6037863315480049, + "learning_rate": 7.267453355140685e-06, + "loss": 11.5989, + "step": 32378 + }, + { + "epoch": 1.7631659553613361, + "grad_norm": 0.536057241950262, + "learning_rate": 7.26415343054424e-06, + "loss": 11.7712, + "step": 32379 + }, + { + "epoch": 1.7632204093579193, + "grad_norm": 0.5426477716803638, + "learning_rate": 7.260854227077274e-06, + "loss": 11.7726, + "step": 32380 + }, + { + "epoch": 1.7632748633545023, + "grad_norm": 0.48845889683067467, + "learning_rate": 7.257555744765454e-06, + "loss": 11.7433, + "step": 32381 + }, + { + "epoch": 1.7633293173510853, + "grad_norm": 0.9140697807030868, + "learning_rate": 7.25425798363445e-06, + "loss": 11.7978, + "step": 32382 + }, + { + "epoch": 1.7633837713476683, + "grad_norm": 0.4784156197793373, + "learning_rate": 7.250960943709884e-06, + "loss": 11.7587, + "step": 32383 + }, + { + "epoch": 1.7634382253442513, + "grad_norm": 0.6121897958137692, + "learning_rate": 7.247664625017403e-06, + "loss": 11.7678, + "step": 32384 + }, + { + "epoch": 1.7634926793408345, + "grad_norm": 0.5203835716886616, + "learning_rate": 7.244369027582621e-06, + "loss": 11.8362, + "step": 32385 + }, + { + "epoch": 1.7635471333374175, + "grad_norm": 0.7471726687272454, + "learning_rate": 7.241074151431182e-06, + "loss": 11.7367, + "step": 32386 + }, + { + "epoch": 1.7636015873340005, + "grad_norm": 0.5778906960302164, + "learning_rate": 7.237779996588723e-06, + "loss": 11.8412, + "step": 32387 + }, + { + "epoch": 1.7636560413305835, + "grad_norm": 0.6550759964786931, + "learning_rate": 7.234486563080823e-06, + "loss": 11.8367, + "step": 32388 + }, + { + "epoch": 1.7637104953271665, + "grad_norm": 0.5475345367993918, + "learning_rate": 7.231193850933127e-06, + "loss": 11.8007, + "step": 32389 + }, + { + "epoch": 1.7637649493237495, + "grad_norm": 0.5899456756389151, + "learning_rate": 7.227901860171215e-06, + "loss": 11.8182, + "step": 32390 + }, + { + "epoch": 1.7638194033203325, + "grad_norm": 0.5973111699437943, + "learning_rate": 7.224610590820713e-06, + "loss": 11.8454, + "step": 32391 + }, + { + "epoch": 1.7638738573169155, + "grad_norm": 0.5544637859212342, + "learning_rate": 7.221320042907176e-06, + "loss": 11.7157, + "step": 32392 + }, + { + "epoch": 1.7639283113134985, + "grad_norm": 0.5580414332870065, + "learning_rate": 7.2180302164562175e-06, + "loss": 11.7504, + "step": 32393 + }, + { + "epoch": 1.7639827653100815, + "grad_norm": 0.5193774247888508, + "learning_rate": 7.214741111493439e-06, + "loss": 11.6611, + "step": 32394 + }, + { + "epoch": 1.7640372193066645, + "grad_norm": 0.5531527796970839, + "learning_rate": 7.211452728044377e-06, + "loss": 11.7538, + "step": 32395 + }, + { + "epoch": 1.7640916733032475, + "grad_norm": 0.5379305994824084, + "learning_rate": 7.20816506613462e-06, + "loss": 11.7033, + "step": 32396 + }, + { + "epoch": 1.7641461272998304, + "grad_norm": 0.5246291274011378, + "learning_rate": 7.2048781257897265e-06, + "loss": 11.8177, + "step": 32397 + }, + { + "epoch": 1.7642005812964134, + "grad_norm": 0.5485662663006204, + "learning_rate": 7.2015919070352545e-06, + "loss": 11.7589, + "step": 32398 + }, + { + "epoch": 1.7642550352929964, + "grad_norm": 0.5439113601867139, + "learning_rate": 7.198306409896794e-06, + "loss": 11.8224, + "step": 32399 + }, + { + "epoch": 1.7643094892895794, + "grad_norm": 0.5510783818630199, + "learning_rate": 7.195021634399834e-06, + "loss": 11.9455, + "step": 32400 + }, + { + "epoch": 1.7643639432861624, + "grad_norm": 0.5078001366956315, + "learning_rate": 7.191737580569979e-06, + "loss": 11.7833, + "step": 32401 + }, + { + "epoch": 1.7644183972827454, + "grad_norm": 0.5463870607280911, + "learning_rate": 7.1884542484327076e-06, + "loss": 11.6649, + "step": 32402 + }, + { + "epoch": 1.7644728512793286, + "grad_norm": 0.5293179509605374, + "learning_rate": 7.18517163801361e-06, + "loss": 11.7558, + "step": 32403 + }, + { + "epoch": 1.7645273052759116, + "grad_norm": 0.5661948285546391, + "learning_rate": 7.181889749338178e-06, + "loss": 11.8284, + "step": 32404 + }, + { + "epoch": 1.7645817592724946, + "grad_norm": 0.4774295334887779, + "learning_rate": 7.178608582431912e-06, + "loss": 11.7392, + "step": 32405 + }, + { + "epoch": 1.7646362132690776, + "grad_norm": 0.5569778331117314, + "learning_rate": 7.1753281373203805e-06, + "loss": 11.7874, + "step": 32406 + }, + { + "epoch": 1.7646906672656606, + "grad_norm": 0.49605450321583283, + "learning_rate": 7.172048414029042e-06, + "loss": 11.7344, + "step": 32407 + }, + { + "epoch": 1.7647451212622436, + "grad_norm": 0.5286431509239733, + "learning_rate": 7.1687694125834205e-06, + "loss": 11.8066, + "step": 32408 + }, + { + "epoch": 1.7647995752588268, + "grad_norm": 0.6143050511893186, + "learning_rate": 7.165491133009039e-06, + "loss": 11.8055, + "step": 32409 + }, + { + "epoch": 1.7648540292554098, + "grad_norm": 0.5104935075402033, + "learning_rate": 7.1622135753313445e-06, + "loss": 11.7688, + "step": 32410 + }, + { + "epoch": 1.7649084832519928, + "grad_norm": 0.5085736202065824, + "learning_rate": 7.158936739575861e-06, + "loss": 11.7869, + "step": 32411 + }, + { + "epoch": 1.7649629372485758, + "grad_norm": 0.5548308218267389, + "learning_rate": 7.155660625768046e-06, + "loss": 11.647, + "step": 32412 + }, + { + "epoch": 1.7650173912451588, + "grad_norm": 0.5182332690613815, + "learning_rate": 7.152385233933389e-06, + "loss": 11.8057, + "step": 32413 + }, + { + "epoch": 1.7650718452417418, + "grad_norm": 0.5407855924686837, + "learning_rate": 7.149110564097372e-06, + "loss": 11.7418, + "step": 32414 + }, + { + "epoch": 1.7651262992383248, + "grad_norm": 0.6337329798454913, + "learning_rate": 7.145836616285406e-06, + "loss": 11.7943, + "step": 32415 + }, + { + "epoch": 1.7651807532349078, + "grad_norm": 0.48449606329339867, + "learning_rate": 7.142563390523016e-06, + "loss": 11.5754, + "step": 32416 + }, + { + "epoch": 1.7652352072314907, + "grad_norm": 0.5302133558003101, + "learning_rate": 7.139290886835604e-06, + "loss": 11.8318, + "step": 32417 + }, + { + "epoch": 1.7652896612280737, + "grad_norm": 0.5462321802071552, + "learning_rate": 7.136019105248659e-06, + "loss": 11.7628, + "step": 32418 + }, + { + "epoch": 1.7653441152246567, + "grad_norm": 0.593250827650166, + "learning_rate": 7.132748045787585e-06, + "loss": 11.936, + "step": 32419 + }, + { + "epoch": 1.7653985692212397, + "grad_norm": 0.534377515132765, + "learning_rate": 7.1294777084778385e-06, + "loss": 11.8307, + "step": 32420 + }, + { + "epoch": 1.7654530232178227, + "grad_norm": 0.5957141896429868, + "learning_rate": 7.126208093344855e-06, + "loss": 11.8671, + "step": 32421 + }, + { + "epoch": 1.7655074772144057, + "grad_norm": 0.5253310196150951, + "learning_rate": 7.122939200414047e-06, + "loss": 11.5939, + "step": 32422 + }, + { + "epoch": 1.7655619312109887, + "grad_norm": 0.5727741432569967, + "learning_rate": 7.119671029710851e-06, + "loss": 11.8215, + "step": 32423 + }, + { + "epoch": 1.7656163852075717, + "grad_norm": 0.5645552107529307, + "learning_rate": 7.116403581260666e-06, + "loss": 11.5887, + "step": 32424 + }, + { + "epoch": 1.7656708392041547, + "grad_norm": 0.5728926466432189, + "learning_rate": 7.113136855088887e-06, + "loss": 11.6244, + "step": 32425 + }, + { + "epoch": 1.765725293200738, + "grad_norm": 0.5505402048947162, + "learning_rate": 7.109870851220946e-06, + "loss": 11.8847, + "step": 32426 + }, + { + "epoch": 1.765779747197321, + "grad_norm": 0.48449133413234013, + "learning_rate": 7.106605569682223e-06, + "loss": 11.8173, + "step": 32427 + }, + { + "epoch": 1.7658342011939039, + "grad_norm": 0.6641475685607001, + "learning_rate": 7.103341010498121e-06, + "loss": 11.8476, + "step": 32428 + }, + { + "epoch": 1.7658886551904869, + "grad_norm": 0.5200225627804707, + "learning_rate": 7.100077173693998e-06, + "loss": 11.7741, + "step": 32429 + }, + { + "epoch": 1.7659431091870699, + "grad_norm": 0.6478267440136369, + "learning_rate": 7.0968140592952645e-06, + "loss": 11.7211, + "step": 32430 + }, + { + "epoch": 1.7659975631836529, + "grad_norm": 0.526273367586206, + "learning_rate": 7.093551667327292e-06, + "loss": 11.7279, + "step": 32431 + }, + { + "epoch": 1.766052017180236, + "grad_norm": 0.5293466186121332, + "learning_rate": 7.090289997815436e-06, + "loss": 11.6945, + "step": 32432 + }, + { + "epoch": 1.766106471176819, + "grad_norm": 0.5234387758570058, + "learning_rate": 7.087029050785077e-06, + "loss": 11.785, + "step": 32433 + }, + { + "epoch": 1.766160925173402, + "grad_norm": 0.5608956079550506, + "learning_rate": 7.083768826261561e-06, + "loss": 11.7521, + "step": 32434 + }, + { + "epoch": 1.766215379169985, + "grad_norm": 0.6132855594715134, + "learning_rate": 7.080509324270224e-06, + "loss": 11.8305, + "step": 32435 + }, + { + "epoch": 1.766269833166568, + "grad_norm": 0.579739800056128, + "learning_rate": 7.077250544836444e-06, + "loss": 11.8372, + "step": 32436 + }, + { + "epoch": 1.766324287163151, + "grad_norm": 0.4848196679170275, + "learning_rate": 7.073992487985537e-06, + "loss": 11.6512, + "step": 32437 + }, + { + "epoch": 1.766378741159734, + "grad_norm": 0.5769411461503011, + "learning_rate": 7.070735153742858e-06, + "loss": 11.7398, + "step": 32438 + }, + { + "epoch": 1.766433195156317, + "grad_norm": 0.5400025851720152, + "learning_rate": 7.06747854213371e-06, + "loss": 11.7714, + "step": 32439 + }, + { + "epoch": 1.7664876491529, + "grad_norm": 0.5430651435837082, + "learning_rate": 7.0642226531834495e-06, + "loss": 11.7308, + "step": 32440 + }, + { + "epoch": 1.766542103149483, + "grad_norm": 0.5546201747900835, + "learning_rate": 7.0609674869173695e-06, + "loss": 11.8682, + "step": 32441 + }, + { + "epoch": 1.766596557146066, + "grad_norm": 0.4697497270887615, + "learning_rate": 7.057713043360781e-06, + "loss": 11.7482, + "step": 32442 + }, + { + "epoch": 1.766651011142649, + "grad_norm": 0.5527719939885997, + "learning_rate": 7.054459322539031e-06, + "loss": 11.689, + "step": 32443 + }, + { + "epoch": 1.766705465139232, + "grad_norm": 0.5539698824192684, + "learning_rate": 7.051206324477389e-06, + "loss": 11.8499, + "step": 32444 + }, + { + "epoch": 1.766759919135815, + "grad_norm": 0.5806791927372109, + "learning_rate": 7.047954049201144e-06, + "loss": 11.8532, + "step": 32445 + }, + { + "epoch": 1.766814373132398, + "grad_norm": 0.5164966670585809, + "learning_rate": 7.044702496735589e-06, + "loss": 11.7236, + "step": 32446 + }, + { + "epoch": 1.766868827128981, + "grad_norm": 0.5388727806756906, + "learning_rate": 7.041451667106014e-06, + "loss": 11.8655, + "step": 32447 + }, + { + "epoch": 1.766923281125564, + "grad_norm": 0.5258345518559903, + "learning_rate": 7.038201560337721e-06, + "loss": 11.8075, + "step": 32448 + }, + { + "epoch": 1.7669777351221472, + "grad_norm": 0.5586393142600838, + "learning_rate": 7.034952176455945e-06, + "loss": 11.7938, + "step": 32449 + }, + { + "epoch": 1.7670321891187302, + "grad_norm": 0.6787701241470055, + "learning_rate": 7.031703515485988e-06, + "loss": 11.8707, + "step": 32450 + }, + { + "epoch": 1.7670866431153132, + "grad_norm": 0.5443562415419549, + "learning_rate": 7.028455577453074e-06, + "loss": 11.8743, + "step": 32451 + }, + { + "epoch": 1.7671410971118962, + "grad_norm": 0.548410018421957, + "learning_rate": 7.025208362382496e-06, + "loss": 11.7629, + "step": 32452 + }, + { + "epoch": 1.7671955511084791, + "grad_norm": 0.5662985835500658, + "learning_rate": 7.021961870299476e-06, + "loss": 11.868, + "step": 32453 + }, + { + "epoch": 1.7672500051050621, + "grad_norm": 0.5611080212631334, + "learning_rate": 7.0187161012292945e-06, + "loss": 11.789, + "step": 32454 + }, + { + "epoch": 1.7673044591016454, + "grad_norm": 0.5441373707393591, + "learning_rate": 7.015471055197164e-06, + "loss": 11.756, + "step": 32455 + }, + { + "epoch": 1.7673589130982283, + "grad_norm": 0.6761327254056887, + "learning_rate": 7.0122267322282995e-06, + "loss": 11.894, + "step": 32456 + }, + { + "epoch": 1.7674133670948113, + "grad_norm": 0.563502754538389, + "learning_rate": 7.008983132347968e-06, + "loss": 11.7751, + "step": 32457 + }, + { + "epoch": 1.7674678210913943, + "grad_norm": 0.5391698601908398, + "learning_rate": 7.005740255581395e-06, + "loss": 11.8214, + "step": 32458 + }, + { + "epoch": 1.7675222750879773, + "grad_norm": 0.5756762889962632, + "learning_rate": 7.002498101953758e-06, + "loss": 11.7541, + "step": 32459 + }, + { + "epoch": 1.7675767290845603, + "grad_norm": 0.5510747624109077, + "learning_rate": 6.999256671490306e-06, + "loss": 11.9001, + "step": 32460 + }, + { + "epoch": 1.7676311830811433, + "grad_norm": 0.5600344165461107, + "learning_rate": 6.996015964216218e-06, + "loss": 11.6302, + "step": 32461 + }, + { + "epoch": 1.7676856370777263, + "grad_norm": 0.5105209825942861, + "learning_rate": 6.992775980156718e-06, + "loss": 11.6263, + "step": 32462 + }, + { + "epoch": 1.7677400910743093, + "grad_norm": 0.5287327792460506, + "learning_rate": 6.989536719336976e-06, + "loss": 11.7988, + "step": 32463 + }, + { + "epoch": 1.7677945450708923, + "grad_norm": 0.5235574865193675, + "learning_rate": 6.986298181782213e-06, + "loss": 11.7791, + "step": 32464 + }, + { + "epoch": 1.7678489990674753, + "grad_norm": 0.5248584359930557, + "learning_rate": 6.983060367517591e-06, + "loss": 11.5514, + "step": 32465 + }, + { + "epoch": 1.7679034530640583, + "grad_norm": 0.53106990569008, + "learning_rate": 6.979823276568276e-06, + "loss": 11.73, + "step": 32466 + }, + { + "epoch": 1.7679579070606413, + "grad_norm": 0.5624229679426564, + "learning_rate": 6.97658690895947e-06, + "loss": 11.788, + "step": 32467 + }, + { + "epoch": 1.7680123610572243, + "grad_norm": 0.5983849167658366, + "learning_rate": 6.97335126471631e-06, + "loss": 11.8528, + "step": 32468 + }, + { + "epoch": 1.7680668150538073, + "grad_norm": 0.49099023311629675, + "learning_rate": 6.9701163438639635e-06, + "loss": 11.8061, + "step": 32469 + }, + { + "epoch": 1.7681212690503902, + "grad_norm": 0.5159192516763268, + "learning_rate": 6.9668821464276224e-06, + "loss": 11.8415, + "step": 32470 + }, + { + "epoch": 1.7681757230469732, + "grad_norm": 0.5505523919006803, + "learning_rate": 6.9636486724323765e-06, + "loss": 11.741, + "step": 32471 + }, + { + "epoch": 1.7682301770435562, + "grad_norm": 0.5282648311414133, + "learning_rate": 6.960415921903429e-06, + "loss": 11.7232, + "step": 32472 + }, + { + "epoch": 1.7682846310401394, + "grad_norm": 0.5803095678248418, + "learning_rate": 6.95718389486587e-06, + "loss": 11.7988, + "step": 32473 + }, + { + "epoch": 1.7683390850367224, + "grad_norm": 0.5393963369431248, + "learning_rate": 6.95395259134487e-06, + "loss": 11.7687, + "step": 32474 + }, + { + "epoch": 1.7683935390333054, + "grad_norm": 0.5067067948482463, + "learning_rate": 6.95072201136554e-06, + "loss": 11.7722, + "step": 32475 + }, + { + "epoch": 1.7684479930298884, + "grad_norm": 0.5390073898022799, + "learning_rate": 6.947492154952972e-06, + "loss": 11.7714, + "step": 32476 + }, + { + "epoch": 1.7685024470264714, + "grad_norm": 0.5290798418720969, + "learning_rate": 6.944263022132336e-06, + "loss": 11.8138, + "step": 32477 + }, + { + "epoch": 1.7685569010230544, + "grad_norm": 0.47873354783519273, + "learning_rate": 6.941034612928699e-06, + "loss": 11.7539, + "step": 32478 + }, + { + "epoch": 1.7686113550196376, + "grad_norm": 0.551475868516879, + "learning_rate": 6.9378069273671855e-06, + "loss": 11.7834, + "step": 32479 + }, + { + "epoch": 1.7686658090162206, + "grad_norm": 0.5259224440001655, + "learning_rate": 6.934579965472898e-06, + "loss": 11.6581, + "step": 32480 + }, + { + "epoch": 1.7687202630128036, + "grad_norm": 0.5675922206340825, + "learning_rate": 6.931353727270917e-06, + "loss": 11.7332, + "step": 32481 + }, + { + "epoch": 1.7687747170093866, + "grad_norm": 0.4966813392031055, + "learning_rate": 6.928128212786356e-06, + "loss": 11.7096, + "step": 32482 + }, + { + "epoch": 1.7688291710059696, + "grad_norm": 0.5851621807802333, + "learning_rate": 6.92490342204426e-06, + "loss": 11.7999, + "step": 32483 + }, + { + "epoch": 1.7688836250025526, + "grad_norm": 0.5693229515501992, + "learning_rate": 6.921679355069732e-06, + "loss": 11.8676, + "step": 32484 + }, + { + "epoch": 1.7689380789991356, + "grad_norm": 0.5154227529447211, + "learning_rate": 6.918456011887842e-06, + "loss": 11.7555, + "step": 32485 + }, + { + "epoch": 1.7689925329957186, + "grad_norm": 0.5550616391724824, + "learning_rate": 6.915233392523623e-06, + "loss": 11.8038, + "step": 32486 + }, + { + "epoch": 1.7690469869923016, + "grad_norm": 0.5692104693139884, + "learning_rate": 6.91201149700218e-06, + "loss": 11.762, + "step": 32487 + }, + { + "epoch": 1.7691014409888846, + "grad_norm": 0.5375555291794875, + "learning_rate": 6.908790325348536e-06, + "loss": 11.7449, + "step": 32488 + }, + { + "epoch": 1.7691558949854675, + "grad_norm": 0.5655107579397572, + "learning_rate": 6.905569877587759e-06, + "loss": 11.812, + "step": 32489 + }, + { + "epoch": 1.7692103489820505, + "grad_norm": 0.5203234674230789, + "learning_rate": 6.902350153744874e-06, + "loss": 11.7405, + "step": 32490 + }, + { + "epoch": 1.7692648029786335, + "grad_norm": 0.5442782451858077, + "learning_rate": 6.8991311538449175e-06, + "loss": 11.8278, + "step": 32491 + }, + { + "epoch": 1.7693192569752165, + "grad_norm": 0.5938194500026797, + "learning_rate": 6.895912877912958e-06, + "loss": 11.8093, + "step": 32492 + }, + { + "epoch": 1.7693737109717995, + "grad_norm": 0.49089936157676045, + "learning_rate": 6.892695325973963e-06, + "loss": 11.7251, + "step": 32493 + }, + { + "epoch": 1.7694281649683825, + "grad_norm": 0.5462910053179499, + "learning_rate": 6.8894784980530255e-06, + "loss": 11.7654, + "step": 32494 + }, + { + "epoch": 1.7694826189649655, + "grad_norm": 0.5416389054162575, + "learning_rate": 6.88626239417508e-06, + "loss": 11.8812, + "step": 32495 + }, + { + "epoch": 1.7695370729615487, + "grad_norm": 0.5772196206579886, + "learning_rate": 6.883047014365173e-06, + "loss": 11.8, + "step": 32496 + }, + { + "epoch": 1.7695915269581317, + "grad_norm": 0.5731945478590326, + "learning_rate": 6.8798323586483305e-06, + "loss": 11.8409, + "step": 32497 + }, + { + "epoch": 1.7696459809547147, + "grad_norm": 0.5237249111482807, + "learning_rate": 6.876618427049509e-06, + "loss": 11.6249, + "step": 32498 + }, + { + "epoch": 1.7697004349512977, + "grad_norm": 0.6041684509377229, + "learning_rate": 6.873405219593732e-06, + "loss": 11.8516, + "step": 32499 + }, + { + "epoch": 1.7697548889478807, + "grad_norm": 0.5715753407255886, + "learning_rate": 6.870192736305958e-06, + "loss": 11.8486, + "step": 32500 + }, + { + "epoch": 1.7698093429444637, + "grad_norm": 0.6189144514321696, + "learning_rate": 6.866980977211201e-06, + "loss": 11.8001, + "step": 32501 + }, + { + "epoch": 1.769863796941047, + "grad_norm": 0.5777239824397306, + "learning_rate": 6.8637699423344085e-06, + "loss": 11.8568, + "step": 32502 + }, + { + "epoch": 1.7699182509376299, + "grad_norm": 0.8850594492937024, + "learning_rate": 6.860559631700558e-06, + "loss": 11.7279, + "step": 32503 + }, + { + "epoch": 1.7699727049342129, + "grad_norm": 0.5851050335484635, + "learning_rate": 6.857350045334643e-06, + "loss": 11.8075, + "step": 32504 + }, + { + "epoch": 1.7700271589307959, + "grad_norm": 0.6013720013585722, + "learning_rate": 6.854141183261564e-06, + "loss": 11.9136, + "step": 32505 + }, + { + "epoch": 1.7700816129273789, + "grad_norm": 0.509443011363841, + "learning_rate": 6.850933045506302e-06, + "loss": 11.6482, + "step": 32506 + }, + { + "epoch": 1.7701360669239619, + "grad_norm": 0.5679414901449624, + "learning_rate": 6.847725632093815e-06, + "loss": 11.5742, + "step": 32507 + }, + { + "epoch": 1.7701905209205449, + "grad_norm": 0.5392368385181602, + "learning_rate": 6.844518943049027e-06, + "loss": 11.8295, + "step": 32508 + }, + { + "epoch": 1.7702449749171278, + "grad_norm": 0.5751937584515109, + "learning_rate": 6.841312978396896e-06, + "loss": 11.8178, + "step": 32509 + }, + { + "epoch": 1.7702994289137108, + "grad_norm": 0.5864386990441026, + "learning_rate": 6.838107738162325e-06, + "loss": 11.8829, + "step": 32510 + }, + { + "epoch": 1.7703538829102938, + "grad_norm": 0.5505025726782754, + "learning_rate": 6.83490322237027e-06, + "loss": 11.8084, + "step": 32511 + }, + { + "epoch": 1.7704083369068768, + "grad_norm": 0.5154488770079803, + "learning_rate": 6.831699431045602e-06, + "loss": 11.8203, + "step": 32512 + }, + { + "epoch": 1.7704627909034598, + "grad_norm": 0.5465051550207456, + "learning_rate": 6.828496364213277e-06, + "loss": 11.7996, + "step": 32513 + }, + { + "epoch": 1.7705172449000428, + "grad_norm": 0.5741377442620422, + "learning_rate": 6.825294021898221e-06, + "loss": 11.6627, + "step": 32514 + }, + { + "epoch": 1.7705716988966258, + "grad_norm": 0.5494894114487702, + "learning_rate": 6.822092404125258e-06, + "loss": 11.8403, + "step": 32515 + }, + { + "epoch": 1.7706261528932088, + "grad_norm": 0.5090681600776872, + "learning_rate": 6.818891510919356e-06, + "loss": 11.7853, + "step": 32516 + }, + { + "epoch": 1.7706806068897918, + "grad_norm": 0.5263071241968802, + "learning_rate": 6.815691342305364e-06, + "loss": 11.7875, + "step": 32517 + }, + { + "epoch": 1.7707350608863748, + "grad_norm": 0.549212220543552, + "learning_rate": 6.812491898308193e-06, + "loss": 11.7381, + "step": 32518 + }, + { + "epoch": 1.770789514882958, + "grad_norm": 0.5314527347005255, + "learning_rate": 6.809293178952714e-06, + "loss": 11.6578, + "step": 32519 + }, + { + "epoch": 1.770843968879541, + "grad_norm": 0.5252082284132608, + "learning_rate": 6.806095184263783e-06, + "loss": 11.8465, + "step": 32520 + }, + { + "epoch": 1.770898422876124, + "grad_norm": 0.49797130645623167, + "learning_rate": 6.802897914266315e-06, + "loss": 11.8213, + "step": 32521 + }, + { + "epoch": 1.770952876872707, + "grad_norm": 0.5635534863376405, + "learning_rate": 6.799701368985112e-06, + "loss": 11.9435, + "step": 32522 + }, + { + "epoch": 1.77100733086929, + "grad_norm": 0.5629515308452587, + "learning_rate": 6.7965055484450865e-06, + "loss": 11.8811, + "step": 32523 + }, + { + "epoch": 1.771061784865873, + "grad_norm": 0.5315500683196582, + "learning_rate": 6.7933104526710534e-06, + "loss": 11.709, + "step": 32524 + }, + { + "epoch": 1.7711162388624562, + "grad_norm": 0.5764388303605275, + "learning_rate": 6.790116081687858e-06, + "loss": 11.8658, + "step": 32525 + }, + { + "epoch": 1.7711706928590392, + "grad_norm": 0.6623134247580197, + "learning_rate": 6.786922435520371e-06, + "loss": 11.9887, + "step": 32526 + }, + { + "epoch": 1.7712251468556222, + "grad_norm": 0.5157210021703791, + "learning_rate": 6.783729514193382e-06, + "loss": 11.7958, + "step": 32527 + }, + { + "epoch": 1.7712796008522051, + "grad_norm": 0.5376612123642904, + "learning_rate": 6.78053731773175e-06, + "loss": 11.816, + "step": 32528 + }, + { + "epoch": 1.7713340548487881, + "grad_norm": 0.5261152942844866, + "learning_rate": 6.77734584616031e-06, + "loss": 11.7756, + "step": 32529 + }, + { + "epoch": 1.7713885088453711, + "grad_norm": 0.5143438841387615, + "learning_rate": 6.7741550995038535e-06, + "loss": 11.7241, + "step": 32530 + }, + { + "epoch": 1.7714429628419541, + "grad_norm": 0.6092866657952518, + "learning_rate": 6.770965077787206e-06, + "loss": 11.7917, + "step": 32531 + }, + { + "epoch": 1.7714974168385371, + "grad_norm": 0.5320070727590585, + "learning_rate": 6.767775781035157e-06, + "loss": 11.7141, + "step": 32532 + }, + { + "epoch": 1.7715518708351201, + "grad_norm": 0.5772470246873229, + "learning_rate": 6.764587209272533e-06, + "loss": 11.7088, + "step": 32533 + }, + { + "epoch": 1.771606324831703, + "grad_norm": 0.5277213074435889, + "learning_rate": 6.76139936252409e-06, + "loss": 11.6816, + "step": 32534 + }, + { + "epoch": 1.771660778828286, + "grad_norm": 0.5219917950417122, + "learning_rate": 6.7582122408146656e-06, + "loss": 11.7806, + "step": 32535 + }, + { + "epoch": 1.771715232824869, + "grad_norm": 0.5715357613118955, + "learning_rate": 6.755025844169027e-06, + "loss": 11.6965, + "step": 32536 + }, + { + "epoch": 1.771769686821452, + "grad_norm": 0.5850917935202838, + "learning_rate": 6.751840172611923e-06, + "loss": 11.5886, + "step": 32537 + }, + { + "epoch": 1.771824140818035, + "grad_norm": 0.6101323911637545, + "learning_rate": 6.748655226168155e-06, + "loss": 11.836, + "step": 32538 + }, + { + "epoch": 1.771878594814618, + "grad_norm": 0.5412089774253621, + "learning_rate": 6.745471004862481e-06, + "loss": 11.7079, + "step": 32539 + }, + { + "epoch": 1.771933048811201, + "grad_norm": 0.5299501793833984, + "learning_rate": 6.742287508719647e-06, + "loss": 11.7622, + "step": 32540 + }, + { + "epoch": 1.771987502807784, + "grad_norm": 0.5232663481832034, + "learning_rate": 6.739104737764457e-06, + "loss": 11.6454, + "step": 32541 + }, + { + "epoch": 1.772041956804367, + "grad_norm": 0.5395042908687733, + "learning_rate": 6.735922692021601e-06, + "loss": 11.8339, + "step": 32542 + }, + { + "epoch": 1.7720964108009503, + "grad_norm": 0.5607707941876608, + "learning_rate": 6.732741371515871e-06, + "loss": 11.7649, + "step": 32543 + }, + { + "epoch": 1.7721508647975333, + "grad_norm": 0.50940726724126, + "learning_rate": 6.72956077627197e-06, + "loss": 11.5963, + "step": 32544 + }, + { + "epoch": 1.7722053187941162, + "grad_norm": 0.5687962796539477, + "learning_rate": 6.7263809063146554e-06, + "loss": 11.8172, + "step": 32545 + }, + { + "epoch": 1.7722597727906992, + "grad_norm": 0.5552807747158219, + "learning_rate": 6.723201761668651e-06, + "loss": 11.7434, + "step": 32546 + }, + { + "epoch": 1.7723142267872822, + "grad_norm": 0.6312128068106516, + "learning_rate": 6.72002334235865e-06, + "loss": 11.7108, + "step": 32547 + }, + { + "epoch": 1.7723686807838654, + "grad_norm": 0.5364946785921043, + "learning_rate": 6.71684564840942e-06, + "loss": 11.7466, + "step": 32548 + }, + { + "epoch": 1.7724231347804484, + "grad_norm": 0.4954720453777132, + "learning_rate": 6.7136686798456084e-06, + "loss": 11.7325, + "step": 32549 + }, + { + "epoch": 1.7724775887770314, + "grad_norm": 0.5507651694484306, + "learning_rate": 6.710492436691984e-06, + "loss": 11.8744, + "step": 32550 + }, + { + "epoch": 1.7725320427736144, + "grad_norm": 0.5603768587860525, + "learning_rate": 6.707316918973194e-06, + "loss": 11.8327, + "step": 32551 + }, + { + "epoch": 1.7725864967701974, + "grad_norm": 0.5773301010353931, + "learning_rate": 6.704142126713953e-06, + "loss": 11.7658, + "step": 32552 + }, + { + "epoch": 1.7726409507667804, + "grad_norm": 0.5225682569384548, + "learning_rate": 6.700968059938962e-06, + "loss": 11.8519, + "step": 32553 + }, + { + "epoch": 1.7726954047633634, + "grad_norm": 0.5220744324181364, + "learning_rate": 6.697794718672878e-06, + "loss": 11.7751, + "step": 32554 + }, + { + "epoch": 1.7727498587599464, + "grad_norm": 0.5522113729084689, + "learning_rate": 6.694622102940396e-06, + "loss": 11.8958, + "step": 32555 + }, + { + "epoch": 1.7728043127565294, + "grad_norm": 0.5953742583947584, + "learning_rate": 6.691450212766193e-06, + "loss": 11.8135, + "step": 32556 + }, + { + "epoch": 1.7728587667531124, + "grad_norm": 0.582818102037976, + "learning_rate": 6.688279048174895e-06, + "loss": 11.8136, + "step": 32557 + }, + { + "epoch": 1.7729132207496954, + "grad_norm": 0.5999572791971642, + "learning_rate": 6.685108609191204e-06, + "loss": 11.8104, + "step": 32558 + }, + { + "epoch": 1.7729676747462784, + "grad_norm": 0.510340518078583, + "learning_rate": 6.681938895839746e-06, + "loss": 11.7312, + "step": 32559 + }, + { + "epoch": 1.7730221287428614, + "grad_norm": 0.7048680747222791, + "learning_rate": 6.678769908145188e-06, + "loss": 11.8127, + "step": 32560 + }, + { + "epoch": 1.7730765827394444, + "grad_norm": 0.5453801199230138, + "learning_rate": 6.675601646132157e-06, + "loss": 11.8925, + "step": 32561 + }, + { + "epoch": 1.7731310367360273, + "grad_norm": 0.5304706086726125, + "learning_rate": 6.672434109825299e-06, + "loss": 11.8602, + "step": 32562 + }, + { + "epoch": 1.7731854907326103, + "grad_norm": 0.6165774646844658, + "learning_rate": 6.66926729924926e-06, + "loss": 11.7511, + "step": 32563 + }, + { + "epoch": 1.7732399447291933, + "grad_norm": 0.5789662439924932, + "learning_rate": 6.6661012144286325e-06, + "loss": 11.8659, + "step": 32564 + }, + { + "epoch": 1.7732943987257763, + "grad_norm": 0.590542330161939, + "learning_rate": 6.662935855388086e-06, + "loss": 11.7068, + "step": 32565 + }, + { + "epoch": 1.7733488527223595, + "grad_norm": 0.5549403780924586, + "learning_rate": 6.659771222152167e-06, + "loss": 11.6931, + "step": 32566 + }, + { + "epoch": 1.7734033067189425, + "grad_norm": 0.6011244737428332, + "learning_rate": 6.656607314745522e-06, + "loss": 11.8375, + "step": 32567 + }, + { + "epoch": 1.7734577607155255, + "grad_norm": 0.5470194606503198, + "learning_rate": 6.653444133192777e-06, + "loss": 11.8163, + "step": 32568 + }, + { + "epoch": 1.7735122147121085, + "grad_norm": 0.5263834449077107, + "learning_rate": 6.650281677518477e-06, + "loss": 11.7583, + "step": 32569 + }, + { + "epoch": 1.7735666687086915, + "grad_norm": 0.6066447590285262, + "learning_rate": 6.64711994774726e-06, + "loss": 11.8415, + "step": 32570 + }, + { + "epoch": 1.7736211227052745, + "grad_norm": 0.564971141513668, + "learning_rate": 6.6439589439036834e-06, + "loss": 11.7612, + "step": 32571 + }, + { + "epoch": 1.7736755767018577, + "grad_norm": 0.5426395113191236, + "learning_rate": 6.640798666012349e-06, + "loss": 11.8211, + "step": 32572 + }, + { + "epoch": 1.7737300306984407, + "grad_norm": 0.5355788596137272, + "learning_rate": 6.637639114097804e-06, + "loss": 11.7272, + "step": 32573 + }, + { + "epoch": 1.7737844846950237, + "grad_norm": 0.5417248876945642, + "learning_rate": 6.6344802881846416e-06, + "loss": 11.7728, + "step": 32574 + }, + { + "epoch": 1.7738389386916067, + "grad_norm": 0.5176878213694106, + "learning_rate": 6.63132218829744e-06, + "loss": 11.5263, + "step": 32575 + }, + { + "epoch": 1.7738933926881897, + "grad_norm": 0.538084768549536, + "learning_rate": 6.628164814460702e-06, + "loss": 11.8603, + "step": 32576 + }, + { + "epoch": 1.7739478466847727, + "grad_norm": 0.5634966133398733, + "learning_rate": 6.62500816669902e-06, + "loss": 11.8694, + "step": 32577 + }, + { + "epoch": 1.7740023006813557, + "grad_norm": 0.5087286585919892, + "learning_rate": 6.621852245036952e-06, + "loss": 11.6815, + "step": 32578 + }, + { + "epoch": 1.7740567546779387, + "grad_norm": 0.5553469008752997, + "learning_rate": 6.618697049499001e-06, + "loss": 11.7431, + "step": 32579 + }, + { + "epoch": 1.7741112086745217, + "grad_norm": 0.5456400676770684, + "learning_rate": 6.615542580109735e-06, + "loss": 11.762, + "step": 32580 + }, + { + "epoch": 1.7741656626711046, + "grad_norm": 0.536257816136381, + "learning_rate": 6.612388836893657e-06, + "loss": 11.523, + "step": 32581 + }, + { + "epoch": 1.7742201166676876, + "grad_norm": 0.5835429825597319, + "learning_rate": 6.609235819875314e-06, + "loss": 11.7725, + "step": 32582 + }, + { + "epoch": 1.7742745706642706, + "grad_norm": 0.508417587120594, + "learning_rate": 6.60608352907921e-06, + "loss": 11.8408, + "step": 32583 + }, + { + "epoch": 1.7743290246608536, + "grad_norm": 0.5429689687452576, + "learning_rate": 6.6029319645298575e-06, + "loss": 11.8121, + "step": 32584 + }, + { + "epoch": 1.7743834786574366, + "grad_norm": 0.5955028609000584, + "learning_rate": 6.599781126251792e-06, + "loss": 11.7714, + "step": 32585 + }, + { + "epoch": 1.7744379326540196, + "grad_norm": 0.6138863765080108, + "learning_rate": 6.596631014269472e-06, + "loss": 11.8127, + "step": 32586 + }, + { + "epoch": 1.7744923866506026, + "grad_norm": 0.5274485178907716, + "learning_rate": 6.593481628607423e-06, + "loss": 11.7936, + "step": 32587 + }, + { + "epoch": 1.7745468406471856, + "grad_norm": 0.6001581529946779, + "learning_rate": 6.590332969290114e-06, + "loss": 11.8609, + "step": 32588 + }, + { + "epoch": 1.7746012946437688, + "grad_norm": 0.5261086031578917, + "learning_rate": 6.587185036342036e-06, + "loss": 11.8238, + "step": 32589 + }, + { + "epoch": 1.7746557486403518, + "grad_norm": 0.5738828339982922, + "learning_rate": 6.584037829787692e-06, + "loss": 11.8126, + "step": 32590 + }, + { + "epoch": 1.7747102026369348, + "grad_norm": 0.5554205995944045, + "learning_rate": 6.580891349651519e-06, + "loss": 11.8371, + "step": 32591 + }, + { + "epoch": 1.7747646566335178, + "grad_norm": 0.5183023933811208, + "learning_rate": 6.577745595958018e-06, + "loss": 11.7964, + "step": 32592 + }, + { + "epoch": 1.7748191106301008, + "grad_norm": 0.5084647466055651, + "learning_rate": 6.574600568731615e-06, + "loss": 11.7346, + "step": 32593 + }, + { + "epoch": 1.7748735646266838, + "grad_norm": 0.5481947987741224, + "learning_rate": 6.571456267996801e-06, + "loss": 11.7844, + "step": 32594 + }, + { + "epoch": 1.774928018623267, + "grad_norm": 0.5891789659361069, + "learning_rate": 6.568312693778011e-06, + "loss": 11.7306, + "step": 32595 + }, + { + "epoch": 1.77498247261985, + "grad_norm": 0.5276265723583782, + "learning_rate": 6.5651698460996834e-06, + "loss": 11.7326, + "step": 32596 + }, + { + "epoch": 1.775036926616433, + "grad_norm": 0.5704125501566568, + "learning_rate": 6.562027724986264e-06, + "loss": 11.6956, + "step": 32597 + }, + { + "epoch": 1.775091380613016, + "grad_norm": 0.562789157110727, + "learning_rate": 6.558886330462189e-06, + "loss": 11.789, + "step": 32598 + }, + { + "epoch": 1.775145834609599, + "grad_norm": 0.5928072781857274, + "learning_rate": 6.555745662551882e-06, + "loss": 11.6489, + "step": 32599 + }, + { + "epoch": 1.775200288606182, + "grad_norm": 0.5326225760551547, + "learning_rate": 6.5526057212797585e-06, + "loss": 11.7364, + "step": 32600 + }, + { + "epoch": 1.775254742602765, + "grad_norm": 0.5278390274475373, + "learning_rate": 6.549466506670244e-06, + "loss": 11.7725, + "step": 32601 + }, + { + "epoch": 1.775309196599348, + "grad_norm": 0.5295545231723466, + "learning_rate": 6.546328018747772e-06, + "loss": 11.7202, + "step": 32602 + }, + { + "epoch": 1.775363650595931, + "grad_norm": 0.4988386655783545, + "learning_rate": 6.543190257536702e-06, + "loss": 11.8298, + "step": 32603 + }, + { + "epoch": 1.775418104592514, + "grad_norm": 0.5897230884922778, + "learning_rate": 6.5400532230614706e-06, + "loss": 11.8236, + "step": 32604 + }, + { + "epoch": 1.775472558589097, + "grad_norm": 0.5374701214690483, + "learning_rate": 6.536916915346469e-06, + "loss": 11.7702, + "step": 32605 + }, + { + "epoch": 1.77552701258568, + "grad_norm": 0.5796462809143641, + "learning_rate": 6.533781334416056e-06, + "loss": 11.8336, + "step": 32606 + }, + { + "epoch": 1.775581466582263, + "grad_norm": 0.5378564911576558, + "learning_rate": 6.5306464802946445e-06, + "loss": 11.6341, + "step": 32607 + }, + { + "epoch": 1.775635920578846, + "grad_norm": 0.5606812729941905, + "learning_rate": 6.527512353006593e-06, + "loss": 11.9215, + "step": 32608 + }, + { + "epoch": 1.7756903745754289, + "grad_norm": 0.5763508853752508, + "learning_rate": 6.524378952576282e-06, + "loss": 11.539, + "step": 32609 + }, + { + "epoch": 1.7757448285720119, + "grad_norm": 0.6068411877977646, + "learning_rate": 6.52124627902807e-06, + "loss": 11.7649, + "step": 32610 + }, + { + "epoch": 1.7757992825685949, + "grad_norm": 0.6007422795674634, + "learning_rate": 6.518114332386327e-06, + "loss": 11.8686, + "step": 32611 + }, + { + "epoch": 1.7758537365651779, + "grad_norm": 0.5722685691181251, + "learning_rate": 6.5149831126754105e-06, + "loss": 11.9874, + "step": 32612 + }, + { + "epoch": 1.775908190561761, + "grad_norm": 0.551065592349112, + "learning_rate": 6.511852619919656e-06, + "loss": 11.6544, + "step": 32613 + }, + { + "epoch": 1.775962644558344, + "grad_norm": 0.5352143631889089, + "learning_rate": 6.508722854143423e-06, + "loss": 11.653, + "step": 32614 + }, + { + "epoch": 1.776017098554927, + "grad_norm": 0.5635059400041645, + "learning_rate": 6.505593815371047e-06, + "loss": 11.8178, + "step": 32615 + }, + { + "epoch": 1.77607155255151, + "grad_norm": 0.5749195072114188, + "learning_rate": 6.502465503626831e-06, + "loss": 11.843, + "step": 32616 + }, + { + "epoch": 1.776126006548093, + "grad_norm": 0.5009668994357556, + "learning_rate": 6.499337918935133e-06, + "loss": 11.7927, + "step": 32617 + }, + { + "epoch": 1.7761804605446763, + "grad_norm": 0.504484548633708, + "learning_rate": 6.496211061320256e-06, + "loss": 11.7142, + "step": 32618 + }, + { + "epoch": 1.7762349145412593, + "grad_norm": 0.5485633307366743, + "learning_rate": 6.493084930806537e-06, + "loss": 11.7661, + "step": 32619 + }, + { + "epoch": 1.7762893685378423, + "grad_norm": 0.4947471269227897, + "learning_rate": 6.489959527418255e-06, + "loss": 11.7114, + "step": 32620 + }, + { + "epoch": 1.7763438225344252, + "grad_norm": 0.5299280750765318, + "learning_rate": 6.486834851179746e-06, + "loss": 11.7021, + "step": 32621 + }, + { + "epoch": 1.7763982765310082, + "grad_norm": 0.6046416560381348, + "learning_rate": 6.4837109021152696e-06, + "loss": 11.8864, + "step": 32622 + }, + { + "epoch": 1.7764527305275912, + "grad_norm": 0.5971586014993002, + "learning_rate": 6.48058768024915e-06, + "loss": 11.8263, + "step": 32623 + }, + { + "epoch": 1.7765071845241742, + "grad_norm": 0.5535618459704909, + "learning_rate": 6.477465185605669e-06, + "loss": 11.8389, + "step": 32624 + }, + { + "epoch": 1.7765616385207572, + "grad_norm": 0.5770403531098282, + "learning_rate": 6.474343418209106e-06, + "loss": 11.7517, + "step": 32625 + }, + { + "epoch": 1.7766160925173402, + "grad_norm": 0.5941900106701521, + "learning_rate": 6.471222378083719e-06, + "loss": 11.784, + "step": 32626 + }, + { + "epoch": 1.7766705465139232, + "grad_norm": 0.5260095947151255, + "learning_rate": 6.468102065253812e-06, + "loss": 11.7068, + "step": 32627 + }, + { + "epoch": 1.7767250005105062, + "grad_norm": 0.4935738656016135, + "learning_rate": 6.464982479743598e-06, + "loss": 11.7772, + "step": 32628 + }, + { + "epoch": 1.7767794545070892, + "grad_norm": 0.5438689668500842, + "learning_rate": 6.461863621577391e-06, + "loss": 11.6524, + "step": 32629 + }, + { + "epoch": 1.7768339085036722, + "grad_norm": 0.5956595402759075, + "learning_rate": 6.458745490779405e-06, + "loss": 11.7141, + "step": 32630 + }, + { + "epoch": 1.7768883625002552, + "grad_norm": 0.530747206389311, + "learning_rate": 6.4556280873739214e-06, + "loss": 11.8797, + "step": 32631 + }, + { + "epoch": 1.7769428164968382, + "grad_norm": 0.5550826770474203, + "learning_rate": 6.452511411385131e-06, + "loss": 11.8071, + "step": 32632 + }, + { + "epoch": 1.7769972704934212, + "grad_norm": 0.51211618541263, + "learning_rate": 6.449395462837315e-06, + "loss": 11.6899, + "step": 32633 + }, + { + "epoch": 1.7770517244900041, + "grad_norm": 0.5216564382793045, + "learning_rate": 6.446280241754687e-06, + "loss": 11.7083, + "step": 32634 + }, + { + "epoch": 1.7771061784865871, + "grad_norm": 0.5227773404881256, + "learning_rate": 6.443165748161484e-06, + "loss": 11.791, + "step": 32635 + }, + { + "epoch": 1.7771606324831704, + "grad_norm": 0.5125099603910652, + "learning_rate": 6.440051982081918e-06, + "loss": 11.7994, + "step": 32636 + }, + { + "epoch": 1.7772150864797533, + "grad_norm": 0.5557595719417294, + "learning_rate": 6.436938943540172e-06, + "loss": 11.596, + "step": 32637 + }, + { + "epoch": 1.7772695404763363, + "grad_norm": 0.5539884593410619, + "learning_rate": 6.433826632560491e-06, + "loss": 11.7835, + "step": 32638 + }, + { + "epoch": 1.7773239944729193, + "grad_norm": 0.5064855042708822, + "learning_rate": 6.430715049167069e-06, + "loss": 11.7443, + "step": 32639 + }, + { + "epoch": 1.7773784484695023, + "grad_norm": 0.5309002760722017, + "learning_rate": 6.427604193384085e-06, + "loss": 11.9209, + "step": 32640 + }, + { + "epoch": 1.7774329024660853, + "grad_norm": 0.5919438412692817, + "learning_rate": 6.424494065235764e-06, + "loss": 11.6732, + "step": 32641 + }, + { + "epoch": 1.7774873564626685, + "grad_norm": 0.5355567293841104, + "learning_rate": 6.421384664746244e-06, + "loss": 11.7462, + "step": 32642 + }, + { + "epoch": 1.7775418104592515, + "grad_norm": 0.5255919046413203, + "learning_rate": 6.418275991939759e-06, + "loss": 11.8555, + "step": 32643 + }, + { + "epoch": 1.7775962644558345, + "grad_norm": 0.5583327883722515, + "learning_rate": 6.415168046840425e-06, + "loss": 11.7699, + "step": 32644 + }, + { + "epoch": 1.7776507184524175, + "grad_norm": 0.5817085883094335, + "learning_rate": 6.412060829472466e-06, + "loss": 11.7111, + "step": 32645 + }, + { + "epoch": 1.7777051724490005, + "grad_norm": 0.5088954993976289, + "learning_rate": 6.408954339860007e-06, + "loss": 11.8435, + "step": 32646 + }, + { + "epoch": 1.7777596264455835, + "grad_norm": 0.5786548787336618, + "learning_rate": 6.405848578027207e-06, + "loss": 11.7142, + "step": 32647 + }, + { + "epoch": 1.7778140804421665, + "grad_norm": 0.5514046851765381, + "learning_rate": 6.402743543998235e-06, + "loss": 11.6604, + "step": 32648 + }, + { + "epoch": 1.7778685344387495, + "grad_norm": 0.5248882071075454, + "learning_rate": 6.399639237797206e-06, + "loss": 11.727, + "step": 32649 + }, + { + "epoch": 1.7779229884353325, + "grad_norm": 0.5662896027745242, + "learning_rate": 6.396535659448288e-06, + "loss": 11.8725, + "step": 32650 + }, + { + "epoch": 1.7779774424319155, + "grad_norm": 0.5455941611714284, + "learning_rate": 6.393432808975619e-06, + "loss": 11.8148, + "step": 32651 + }, + { + "epoch": 1.7780318964284985, + "grad_norm": 0.5795116195227646, + "learning_rate": 6.3903306864032895e-06, + "loss": 11.7415, + "step": 32652 + }, + { + "epoch": 1.7780863504250815, + "grad_norm": 0.5435402068006697, + "learning_rate": 6.38722929175547e-06, + "loss": 11.7942, + "step": 32653 + }, + { + "epoch": 1.7781408044216644, + "grad_norm": 0.4898854798438047, + "learning_rate": 6.384128625056241e-06, + "loss": 11.6952, + "step": 32654 + }, + { + "epoch": 1.7781952584182474, + "grad_norm": 0.5502141616198264, + "learning_rate": 6.381028686329738e-06, + "loss": 11.687, + "step": 32655 + }, + { + "epoch": 1.7782497124148304, + "grad_norm": 0.5715235364798035, + "learning_rate": 6.377929475600064e-06, + "loss": 11.6627, + "step": 32656 + }, + { + "epoch": 1.7783041664114134, + "grad_norm": 0.5565606715258251, + "learning_rate": 6.37483099289129e-06, + "loss": 11.7002, + "step": 32657 + }, + { + "epoch": 1.7783586204079964, + "grad_norm": 0.559697312513193, + "learning_rate": 6.371733238227539e-06, + "loss": 11.8468, + "step": 32658 + }, + { + "epoch": 1.7784130744045796, + "grad_norm": 0.5666534709032173, + "learning_rate": 6.368636211632883e-06, + "loss": 11.7649, + "step": 32659 + }, + { + "epoch": 1.7784675284011626, + "grad_norm": 0.5574114448826537, + "learning_rate": 6.3655399131314245e-06, + "loss": 11.7239, + "step": 32660 + }, + { + "epoch": 1.7785219823977456, + "grad_norm": 0.5925011940755665, + "learning_rate": 6.362444342747232e-06, + "loss": 11.8493, + "step": 32661 + }, + { + "epoch": 1.7785764363943286, + "grad_norm": 0.5269764138423191, + "learning_rate": 6.359349500504364e-06, + "loss": 11.7507, + "step": 32662 + }, + { + "epoch": 1.7786308903909116, + "grad_norm": 0.5403476584558234, + "learning_rate": 6.356255386426912e-06, + "loss": 11.8175, + "step": 32663 + }, + { + "epoch": 1.7786853443874946, + "grad_norm": 0.61565991339433, + "learning_rate": 6.353162000538903e-06, + "loss": 12.0279, + "step": 32664 + }, + { + "epoch": 1.7787397983840778, + "grad_norm": 0.5591293243317734, + "learning_rate": 6.350069342864429e-06, + "loss": 11.7781, + "step": 32665 + }, + { + "epoch": 1.7787942523806608, + "grad_norm": 0.5662041809359237, + "learning_rate": 6.346977413427524e-06, + "loss": 11.9033, + "step": 32666 + }, + { + "epoch": 1.7788487063772438, + "grad_norm": 0.5844082084590769, + "learning_rate": 6.343886212252215e-06, + "loss": 11.8189, + "step": 32667 + }, + { + "epoch": 1.7789031603738268, + "grad_norm": 0.7302541912977605, + "learning_rate": 6.340795739362559e-06, + "loss": 11.8814, + "step": 32668 + }, + { + "epoch": 1.7789576143704098, + "grad_norm": 0.5493051291355272, + "learning_rate": 6.337705994782573e-06, + "loss": 11.7421, + "step": 32669 + }, + { + "epoch": 1.7790120683669928, + "grad_norm": 0.5271900879138697, + "learning_rate": 6.3346169785363115e-06, + "loss": 11.7093, + "step": 32670 + }, + { + "epoch": 1.7790665223635758, + "grad_norm": 0.5556375173986846, + "learning_rate": 6.331528690647759e-06, + "loss": 11.7206, + "step": 32671 + }, + { + "epoch": 1.7791209763601588, + "grad_norm": 0.5979757578752126, + "learning_rate": 6.328441131140938e-06, + "loss": 11.7581, + "step": 32672 + }, + { + "epoch": 1.7791754303567417, + "grad_norm": 0.5278252118688105, + "learning_rate": 6.325354300039887e-06, + "loss": 11.8388, + "step": 32673 + }, + { + "epoch": 1.7792298843533247, + "grad_norm": 0.5879647844127346, + "learning_rate": 6.322268197368586e-06, + "loss": 11.7815, + "step": 32674 + }, + { + "epoch": 1.7792843383499077, + "grad_norm": 0.5437098437859124, + "learning_rate": 6.3191828231510375e-06, + "loss": 11.7574, + "step": 32675 + }, + { + "epoch": 1.7793387923464907, + "grad_norm": 0.5610889898473906, + "learning_rate": 6.316098177411245e-06, + "loss": 11.7519, + "step": 32676 + }, + { + "epoch": 1.7793932463430737, + "grad_norm": 0.5288993721382848, + "learning_rate": 6.313014260173167e-06, + "loss": 11.641, + "step": 32677 + }, + { + "epoch": 1.7794477003396567, + "grad_norm": 0.5735405654787178, + "learning_rate": 6.309931071460806e-06, + "loss": 11.7809, + "step": 32678 + }, + { + "epoch": 1.7795021543362397, + "grad_norm": 0.550847871540742, + "learning_rate": 6.3068486112981325e-06, + "loss": 11.7952, + "step": 32679 + }, + { + "epoch": 1.7795566083328227, + "grad_norm": 0.6224556952124183, + "learning_rate": 6.303766879709117e-06, + "loss": 11.8511, + "step": 32680 + }, + { + "epoch": 1.7796110623294057, + "grad_norm": 0.49699209659575416, + "learning_rate": 6.300685876717716e-06, + "loss": 11.7905, + "step": 32681 + }, + { + "epoch": 1.779665516325989, + "grad_norm": 0.5197056696772931, + "learning_rate": 6.2976056023479e-06, + "loss": 11.7658, + "step": 32682 + }, + { + "epoch": 1.779719970322572, + "grad_norm": 0.5299205418333491, + "learning_rate": 6.294526056623617e-06, + "loss": 11.7874, + "step": 32683 + }, + { + "epoch": 1.779774424319155, + "grad_norm": 0.5006191990495225, + "learning_rate": 6.291447239568815e-06, + "loss": 11.6852, + "step": 32684 + }, + { + "epoch": 1.7798288783157379, + "grad_norm": 0.5241928736389975, + "learning_rate": 6.28836915120744e-06, + "loss": 11.7525, + "step": 32685 + }, + { + "epoch": 1.7798833323123209, + "grad_norm": 0.5512105472519098, + "learning_rate": 6.28529179156343e-06, + "loss": 11.8235, + "step": 32686 + }, + { + "epoch": 1.7799377863089039, + "grad_norm": 0.5758030156783916, + "learning_rate": 6.282215160660676e-06, + "loss": 11.8157, + "step": 32687 + }, + { + "epoch": 1.779992240305487, + "grad_norm": 0.5578829898472907, + "learning_rate": 6.279139258523159e-06, + "loss": 11.6964, + "step": 32688 + }, + { + "epoch": 1.78004669430207, + "grad_norm": 0.5467188643625759, + "learning_rate": 6.27606408517476e-06, + "loss": 11.7781, + "step": 32689 + }, + { + "epoch": 1.780101148298653, + "grad_norm": 0.6195338947234599, + "learning_rate": 6.2729896406394155e-06, + "loss": 11.8318, + "step": 32690 + }, + { + "epoch": 1.780155602295236, + "grad_norm": 0.555715689687519, + "learning_rate": 6.269915924941006e-06, + "loss": 11.7176, + "step": 32691 + }, + { + "epoch": 1.780210056291819, + "grad_norm": 0.5294021269516758, + "learning_rate": 6.266842938103468e-06, + "loss": 11.8282, + "step": 32692 + }, + { + "epoch": 1.780264510288402, + "grad_norm": 0.5760740376196016, + "learning_rate": 6.2637706801506605e-06, + "loss": 11.7588, + "step": 32693 + }, + { + "epoch": 1.780318964284985, + "grad_norm": 0.5384086038782483, + "learning_rate": 6.2606991511064865e-06, + "loss": 11.783, + "step": 32694 + }, + { + "epoch": 1.780373418281568, + "grad_norm": 0.5761383915638604, + "learning_rate": 6.257628350994859e-06, + "loss": 11.8015, + "step": 32695 + }, + { + "epoch": 1.780427872278151, + "grad_norm": 0.546854857346783, + "learning_rate": 6.254558279839628e-06, + "loss": 11.8423, + "step": 32696 + }, + { + "epoch": 1.780482326274734, + "grad_norm": 0.5394379748626792, + "learning_rate": 6.251488937664674e-06, + "loss": 11.6775, + "step": 32697 + }, + { + "epoch": 1.780536780271317, + "grad_norm": 0.5814153801090587, + "learning_rate": 6.248420324493853e-06, + "loss": 11.7156, + "step": 32698 + }, + { + "epoch": 1.7805912342679, + "grad_norm": 0.5081416254563221, + "learning_rate": 6.245352440351037e-06, + "loss": 11.8127, + "step": 32699 + }, + { + "epoch": 1.780645688264483, + "grad_norm": 0.5885158572104521, + "learning_rate": 6.242285285260097e-06, + "loss": 11.7697, + "step": 32700 + }, + { + "epoch": 1.780700142261066, + "grad_norm": 0.5903081283855978, + "learning_rate": 6.2392188592448554e-06, + "loss": 11.8192, + "step": 32701 + }, + { + "epoch": 1.780754596257649, + "grad_norm": 0.5813850653756711, + "learning_rate": 6.236153162329184e-06, + "loss": 11.6511, + "step": 32702 + }, + { + "epoch": 1.780809050254232, + "grad_norm": 0.6015448142938231, + "learning_rate": 6.233088194536896e-06, + "loss": 11.8689, + "step": 32703 + }, + { + "epoch": 1.780863504250815, + "grad_norm": 0.5436128411710512, + "learning_rate": 6.2300239558918506e-06, + "loss": 11.7093, + "step": 32704 + }, + { + "epoch": 1.780917958247398, + "grad_norm": 0.601270612565341, + "learning_rate": 6.226960446417851e-06, + "loss": 11.7905, + "step": 32705 + }, + { + "epoch": 1.7809724122439812, + "grad_norm": 0.5890192276838048, + "learning_rate": 6.223897666138756e-06, + "loss": 11.9191, + "step": 32706 + }, + { + "epoch": 1.7810268662405642, + "grad_norm": 0.5455178899138761, + "learning_rate": 6.220835615078346e-06, + "loss": 11.7916, + "step": 32707 + }, + { + "epoch": 1.7810813202371472, + "grad_norm": 0.552575852189147, + "learning_rate": 6.217774293260436e-06, + "loss": 11.7924, + "step": 32708 + }, + { + "epoch": 1.7811357742337302, + "grad_norm": 0.526128252261358, + "learning_rate": 6.2147137007088405e-06, + "loss": 11.794, + "step": 32709 + }, + { + "epoch": 1.7811902282303131, + "grad_norm": 0.5799430183200035, + "learning_rate": 6.2116538374473735e-06, + "loss": 11.7877, + "step": 32710 + }, + { + "epoch": 1.7812446822268961, + "grad_norm": 0.5552499412949681, + "learning_rate": 6.208594703499804e-06, + "loss": 11.8429, + "step": 32711 + }, + { + "epoch": 1.7812991362234794, + "grad_norm": 0.5965677348838014, + "learning_rate": 6.2055362988899355e-06, + "loss": 11.8804, + "step": 32712 + }, + { + "epoch": 1.7813535902200623, + "grad_norm": 0.5188102393525604, + "learning_rate": 6.202478623641539e-06, + "loss": 11.7638, + "step": 32713 + }, + { + "epoch": 1.7814080442166453, + "grad_norm": 0.5312133620617769, + "learning_rate": 6.1994216777784166e-06, + "loss": 11.6473, + "step": 32714 + }, + { + "epoch": 1.7814624982132283, + "grad_norm": 0.5897830592166103, + "learning_rate": 6.196365461324305e-06, + "loss": 11.805, + "step": 32715 + }, + { + "epoch": 1.7815169522098113, + "grad_norm": 0.546088818681991, + "learning_rate": 6.193309974302997e-06, + "loss": 11.8009, + "step": 32716 + }, + { + "epoch": 1.7815714062063943, + "grad_norm": 0.5243351059626951, + "learning_rate": 6.190255216738239e-06, + "loss": 11.7343, + "step": 32717 + }, + { + "epoch": 1.7816258602029773, + "grad_norm": 0.5256419405680276, + "learning_rate": 6.187201188653779e-06, + "loss": 11.7348, + "step": 32718 + }, + { + "epoch": 1.7816803141995603, + "grad_norm": 0.5594804599033242, + "learning_rate": 6.184147890073388e-06, + "loss": 11.692, + "step": 32719 + }, + { + "epoch": 1.7817347681961433, + "grad_norm": 0.5321778004033472, + "learning_rate": 6.18109532102078e-06, + "loss": 11.7293, + "step": 32720 + }, + { + "epoch": 1.7817892221927263, + "grad_norm": 0.5011377673136107, + "learning_rate": 6.178043481519713e-06, + "loss": 11.7447, + "step": 32721 + }, + { + "epoch": 1.7818436761893093, + "grad_norm": 0.5403480612341243, + "learning_rate": 6.174992371593924e-06, + "loss": 11.8208, + "step": 32722 + }, + { + "epoch": 1.7818981301858923, + "grad_norm": 0.5525945969178169, + "learning_rate": 6.171941991267116e-06, + "loss": 11.8392, + "step": 32723 + }, + { + "epoch": 1.7819525841824753, + "grad_norm": 0.5598211056078931, + "learning_rate": 6.168892340563037e-06, + "loss": 11.7512, + "step": 32724 + }, + { + "epoch": 1.7820070381790583, + "grad_norm": 0.5632965586119917, + "learning_rate": 6.165843419505379e-06, + "loss": 11.696, + "step": 32725 + }, + { + "epoch": 1.7820614921756412, + "grad_norm": 0.5739009722718315, + "learning_rate": 6.162795228117868e-06, + "loss": 11.8494, + "step": 32726 + }, + { + "epoch": 1.7821159461722242, + "grad_norm": 0.5474179586942003, + "learning_rate": 6.159747766424195e-06, + "loss": 11.8151, + "step": 32727 + }, + { + "epoch": 1.7821704001688072, + "grad_norm": 0.5117156862463175, + "learning_rate": 6.156701034448042e-06, + "loss": 11.7808, + "step": 32728 + }, + { + "epoch": 1.7822248541653904, + "grad_norm": 0.5650488556079817, + "learning_rate": 6.153655032213135e-06, + "loss": 11.7281, + "step": 32729 + }, + { + "epoch": 1.7822793081619734, + "grad_norm": 0.584714363894475, + "learning_rate": 6.150609759743131e-06, + "loss": 11.7672, + "step": 32730 + }, + { + "epoch": 1.7823337621585564, + "grad_norm": 0.5345534779849421, + "learning_rate": 6.147565217061735e-06, + "loss": 11.8112, + "step": 32731 + }, + { + "epoch": 1.7823882161551394, + "grad_norm": 0.5670964215084656, + "learning_rate": 6.144521404192616e-06, + "loss": 11.8892, + "step": 32732 + }, + { + "epoch": 1.7824426701517224, + "grad_norm": 0.6037630502503243, + "learning_rate": 6.141478321159422e-06, + "loss": 12.0556, + "step": 32733 + }, + { + "epoch": 1.7824971241483054, + "grad_norm": 0.5704912551789066, + "learning_rate": 6.138435967985845e-06, + "loss": 11.7784, + "step": 32734 + }, + { + "epoch": 1.7825515781448886, + "grad_norm": 0.4837017365684751, + "learning_rate": 6.135394344695522e-06, + "loss": 11.7829, + "step": 32735 + }, + { + "epoch": 1.7826060321414716, + "grad_norm": 0.5633150706322234, + "learning_rate": 6.132353451312134e-06, + "loss": 11.7697, + "step": 32736 + }, + { + "epoch": 1.7826604861380546, + "grad_norm": 0.5994231024040529, + "learning_rate": 6.129313287859295e-06, + "loss": 11.8579, + "step": 32737 + }, + { + "epoch": 1.7827149401346376, + "grad_norm": 0.5598219524080285, + "learning_rate": 6.126273854360653e-06, + "loss": 11.8211, + "step": 32738 + }, + { + "epoch": 1.7827693941312206, + "grad_norm": 0.5437292911132128, + "learning_rate": 6.123235150839857e-06, + "loss": 11.7411, + "step": 32739 + }, + { + "epoch": 1.7828238481278036, + "grad_norm": 0.5355352760056432, + "learning_rate": 6.120197177320508e-06, + "loss": 11.8826, + "step": 32740 + }, + { + "epoch": 1.7828783021243866, + "grad_norm": 0.5801272054397876, + "learning_rate": 6.117159933826277e-06, + "loss": 11.8695, + "step": 32741 + }, + { + "epoch": 1.7829327561209696, + "grad_norm": 0.6026029119665858, + "learning_rate": 6.114123420380724e-06, + "loss": 11.7992, + "step": 32742 + }, + { + "epoch": 1.7829872101175526, + "grad_norm": 0.5090448052521881, + "learning_rate": 6.111087637007506e-06, + "loss": 11.8298, + "step": 32743 + }, + { + "epoch": 1.7830416641141356, + "grad_norm": 0.5809190171254183, + "learning_rate": 6.1080525837302284e-06, + "loss": 11.8446, + "step": 32744 + }, + { + "epoch": 1.7830961181107186, + "grad_norm": 0.5084379390934818, + "learning_rate": 6.105018260572459e-06, + "loss": 11.6902, + "step": 32745 + }, + { + "epoch": 1.7831505721073015, + "grad_norm": 0.5102844516896258, + "learning_rate": 6.101984667557836e-06, + "loss": 11.7845, + "step": 32746 + }, + { + "epoch": 1.7832050261038845, + "grad_norm": 0.5393110489205621, + "learning_rate": 6.098951804709918e-06, + "loss": 11.7556, + "step": 32747 + }, + { + "epoch": 1.7832594801004675, + "grad_norm": 0.5610236594888275, + "learning_rate": 6.095919672052286e-06, + "loss": 11.8582, + "step": 32748 + }, + { + "epoch": 1.7833139340970505, + "grad_norm": 0.5295984301667501, + "learning_rate": 6.0928882696085435e-06, + "loss": 11.8565, + "step": 32749 + }, + { + "epoch": 1.7833683880936335, + "grad_norm": 0.5455010219919253, + "learning_rate": 6.0898575974022375e-06, + "loss": 11.7454, + "step": 32750 + }, + { + "epoch": 1.7834228420902165, + "grad_norm": 0.532496244299336, + "learning_rate": 6.0868276554569725e-06, + "loss": 11.7405, + "step": 32751 + }, + { + "epoch": 1.7834772960867997, + "grad_norm": 0.5289465348200701, + "learning_rate": 6.083798443796251e-06, + "loss": 11.7201, + "step": 32752 + }, + { + "epoch": 1.7835317500833827, + "grad_norm": 0.5679209360527092, + "learning_rate": 6.080769962443689e-06, + "loss": 11.7737, + "step": 32753 + }, + { + "epoch": 1.7835862040799657, + "grad_norm": 0.521273573393573, + "learning_rate": 6.077742211422799e-06, + "loss": 11.6698, + "step": 32754 + }, + { + "epoch": 1.7836406580765487, + "grad_norm": 0.5747886977133413, + "learning_rate": 6.07471519075713e-06, + "loss": 11.8493, + "step": 32755 + }, + { + "epoch": 1.7836951120731317, + "grad_norm": 0.5486564983341604, + "learning_rate": 6.071688900470251e-06, + "loss": 11.5643, + "step": 32756 + }, + { + "epoch": 1.7837495660697147, + "grad_norm": 0.5807463120178638, + "learning_rate": 6.068663340585656e-06, + "loss": 11.7643, + "step": 32757 + }, + { + "epoch": 1.783804020066298, + "grad_norm": 0.6299894467839595, + "learning_rate": 6.065638511126881e-06, + "loss": 11.8122, + "step": 32758 + }, + { + "epoch": 1.783858474062881, + "grad_norm": 0.5433739098086517, + "learning_rate": 6.062614412117462e-06, + "loss": 11.8107, + "step": 32759 + }, + { + "epoch": 1.7839129280594639, + "grad_norm": 0.5579663249929967, + "learning_rate": 6.059591043580892e-06, + "loss": 11.7298, + "step": 32760 + }, + { + "epoch": 1.7839673820560469, + "grad_norm": 0.5379817593054868, + "learning_rate": 6.056568405540719e-06, + "loss": 11.8564, + "step": 32761 + }, + { + "epoch": 1.7840218360526299, + "grad_norm": 0.5656778190490366, + "learning_rate": 6.053546498020401e-06, + "loss": 11.7822, + "step": 32762 + }, + { + "epoch": 1.7840762900492129, + "grad_norm": 0.517132446100167, + "learning_rate": 6.050525321043487e-06, + "loss": 11.677, + "step": 32763 + }, + { + "epoch": 1.7841307440457959, + "grad_norm": 0.5526752874623392, + "learning_rate": 6.047504874633414e-06, + "loss": 11.683, + "step": 32764 + }, + { + "epoch": 1.7841851980423789, + "grad_norm": 0.6635846424445987, + "learning_rate": 6.0444851588137065e-06, + "loss": 11.8886, + "step": 32765 + }, + { + "epoch": 1.7842396520389618, + "grad_norm": 0.5028456486693429, + "learning_rate": 6.041466173607868e-06, + "loss": 11.7425, + "step": 32766 + }, + { + "epoch": 1.7842941060355448, + "grad_norm": 0.5358399233934722, + "learning_rate": 6.038447919039314e-06, + "loss": 11.8024, + "step": 32767 + }, + { + "epoch": 1.7843485600321278, + "grad_norm": 0.6545409848659228, + "learning_rate": 6.03543039513157e-06, + "loss": 11.932, + "step": 32768 + }, + { + "epoch": 1.7844030140287108, + "grad_norm": 0.5409977093844731, + "learning_rate": 6.032413601908049e-06, + "loss": 11.7231, + "step": 32769 + }, + { + "epoch": 1.7844574680252938, + "grad_norm": 0.5547535912506245, + "learning_rate": 6.029397539392246e-06, + "loss": 11.8589, + "step": 32770 + }, + { + "epoch": 1.7845119220218768, + "grad_norm": 0.5452665682740018, + "learning_rate": 6.026382207607617e-06, + "loss": 11.8653, + "step": 32771 + }, + { + "epoch": 1.7845663760184598, + "grad_norm": 0.5795592234860748, + "learning_rate": 6.02336760657759e-06, + "loss": 11.8091, + "step": 32772 + }, + { + "epoch": 1.7846208300150428, + "grad_norm": 0.5758528998235982, + "learning_rate": 6.020353736325635e-06, + "loss": 11.7532, + "step": 32773 + }, + { + "epoch": 1.7846752840116258, + "grad_norm": 0.5091401476597546, + "learning_rate": 6.017340596875143e-06, + "loss": 11.6873, + "step": 32774 + }, + { + "epoch": 1.7847297380082088, + "grad_norm": 0.531788250732534, + "learning_rate": 6.014328188249596e-06, + "loss": 11.8192, + "step": 32775 + }, + { + "epoch": 1.784784192004792, + "grad_norm": 0.5296134574860314, + "learning_rate": 6.011316510472386e-06, + "loss": 11.7253, + "step": 32776 + }, + { + "epoch": 1.784838646001375, + "grad_norm": 0.527924853212081, + "learning_rate": 6.00830556356693e-06, + "loss": 11.8661, + "step": 32777 + }, + { + "epoch": 1.784893099997958, + "grad_norm": 0.5520828888280412, + "learning_rate": 6.005295347556672e-06, + "loss": 11.7979, + "step": 32778 + }, + { + "epoch": 1.784947553994541, + "grad_norm": 0.5351865975895744, + "learning_rate": 6.0022858624649735e-06, + "loss": 11.7371, + "step": 32779 + }, + { + "epoch": 1.785002007991124, + "grad_norm": 0.537562345824475, + "learning_rate": 5.999277108315271e-06, + "loss": 11.749, + "step": 32780 + }, + { + "epoch": 1.785056461987707, + "grad_norm": 0.6614506627680491, + "learning_rate": 5.9962690851309675e-06, + "loss": 11.9624, + "step": 32781 + }, + { + "epoch": 1.7851109159842902, + "grad_norm": 0.532982334405565, + "learning_rate": 5.993261792935423e-06, + "loss": 11.8047, + "step": 32782 + }, + { + "epoch": 1.7851653699808732, + "grad_norm": 0.565314211049647, + "learning_rate": 5.990255231752062e-06, + "loss": 11.8521, + "step": 32783 + }, + { + "epoch": 1.7852198239774562, + "grad_norm": 0.559505079002129, + "learning_rate": 5.9872494016042225e-06, + "loss": 11.9165, + "step": 32784 + }, + { + "epoch": 1.7852742779740391, + "grad_norm": 0.49870783123113654, + "learning_rate": 5.984244302515307e-06, + "loss": 11.6201, + "step": 32785 + }, + { + "epoch": 1.7853287319706221, + "grad_norm": 0.5204790619077684, + "learning_rate": 5.981239934508686e-06, + "loss": 11.8264, + "step": 32786 + }, + { + "epoch": 1.7853831859672051, + "grad_norm": 0.5996319298658219, + "learning_rate": 5.9782362976076865e-06, + "loss": 11.8328, + "step": 32787 + }, + { + "epoch": 1.7854376399637881, + "grad_norm": 0.5637128176984325, + "learning_rate": 5.97523339183571e-06, + "loss": 11.8389, + "step": 32788 + }, + { + "epoch": 1.7854920939603711, + "grad_norm": 0.5489482003779558, + "learning_rate": 5.972231217216062e-06, + "loss": 11.7052, + "step": 32789 + }, + { + "epoch": 1.785546547956954, + "grad_norm": 0.6024242860078325, + "learning_rate": 5.969229773772134e-06, + "loss": 11.9011, + "step": 32790 + }, + { + "epoch": 1.785601001953537, + "grad_norm": 0.5263928789986556, + "learning_rate": 5.966229061527229e-06, + "loss": 11.8157, + "step": 32791 + }, + { + "epoch": 1.78565545595012, + "grad_norm": 0.5241021630678591, + "learning_rate": 5.963229080504696e-06, + "loss": 11.7763, + "step": 32792 + }, + { + "epoch": 1.785709909946703, + "grad_norm": 0.5216179403005902, + "learning_rate": 5.960229830727882e-06, + "loss": 11.7932, + "step": 32793 + }, + { + "epoch": 1.785764363943286, + "grad_norm": 0.5423555008380151, + "learning_rate": 5.957231312220069e-06, + "loss": 11.8024, + "step": 32794 + }, + { + "epoch": 1.785818817939869, + "grad_norm": 0.5718442422207454, + "learning_rate": 5.9542335250046155e-06, + "loss": 11.5255, + "step": 32795 + }, + { + "epoch": 1.785873271936452, + "grad_norm": 0.5243168265178102, + "learning_rate": 5.951236469104815e-06, + "loss": 11.7026, + "step": 32796 + }, + { + "epoch": 1.785927725933035, + "grad_norm": 0.48359023598502987, + "learning_rate": 5.9482401445439595e-06, + "loss": 11.7673, + "step": 32797 + }, + { + "epoch": 1.785982179929618, + "grad_norm": 0.5389200397432837, + "learning_rate": 5.945244551345364e-06, + "loss": 11.7954, + "step": 32798 + }, + { + "epoch": 1.7860366339262013, + "grad_norm": 0.5026237159577307, + "learning_rate": 5.94224968953232e-06, + "loss": 11.6851, + "step": 32799 + }, + { + "epoch": 1.7860910879227843, + "grad_norm": 0.5157097858037699, + "learning_rate": 5.939255559128121e-06, + "loss": 11.7758, + "step": 32800 + }, + { + "epoch": 1.7861455419193673, + "grad_norm": 0.5293425019336567, + "learning_rate": 5.936262160156025e-06, + "loss": 11.7399, + "step": 32801 + }, + { + "epoch": 1.7861999959159502, + "grad_norm": 0.4931659322533404, + "learning_rate": 5.933269492639359e-06, + "loss": 11.6566, + "step": 32802 + }, + { + "epoch": 1.7862544499125332, + "grad_norm": 0.5246199651328718, + "learning_rate": 5.930277556601338e-06, + "loss": 11.7588, + "step": 32803 + }, + { + "epoch": 1.7863089039091162, + "grad_norm": 0.545291086389174, + "learning_rate": 5.927286352065253e-06, + "loss": 11.7092, + "step": 32804 + }, + { + "epoch": 1.7863633579056994, + "grad_norm": 0.6090685558430966, + "learning_rate": 5.9242958790543865e-06, + "loss": 11.859, + "step": 32805 + }, + { + "epoch": 1.7864178119022824, + "grad_norm": 0.5063771339523607, + "learning_rate": 5.921306137591975e-06, + "loss": 11.637, + "step": 32806 + }, + { + "epoch": 1.7864722658988654, + "grad_norm": 0.5353859595848508, + "learning_rate": 5.918317127701245e-06, + "loss": 11.8884, + "step": 32807 + }, + { + "epoch": 1.7865267198954484, + "grad_norm": 0.5738267069233514, + "learning_rate": 5.915328849405466e-06, + "loss": 11.8586, + "step": 32808 + }, + { + "epoch": 1.7865811738920314, + "grad_norm": 0.5902060104006173, + "learning_rate": 5.912341302727864e-06, + "loss": 11.7625, + "step": 32809 + }, + { + "epoch": 1.7866356278886144, + "grad_norm": 0.5447056975591531, + "learning_rate": 5.909354487691687e-06, + "loss": 11.8551, + "step": 32810 + }, + { + "epoch": 1.7866900818851974, + "grad_norm": 0.6176259087339802, + "learning_rate": 5.9063684043201285e-06, + "loss": 11.75, + "step": 32811 + }, + { + "epoch": 1.7867445358817804, + "grad_norm": 0.5873460302544379, + "learning_rate": 5.903383052636446e-06, + "loss": 11.7537, + "step": 32812 + }, + { + "epoch": 1.7867989898783634, + "grad_norm": 0.5253352394469801, + "learning_rate": 5.900398432663823e-06, + "loss": 11.84, + "step": 32813 + }, + { + "epoch": 1.7868534438749464, + "grad_norm": 0.583531651090751, + "learning_rate": 5.897414544425483e-06, + "loss": 11.9039, + "step": 32814 + }, + { + "epoch": 1.7869078978715294, + "grad_norm": 0.5496762027681773, + "learning_rate": 5.894431387944644e-06, + "loss": 11.6629, + "step": 32815 + }, + { + "epoch": 1.7869623518681124, + "grad_norm": 0.5588490667752386, + "learning_rate": 5.891448963244483e-06, + "loss": 11.7954, + "step": 32816 + }, + { + "epoch": 1.7870168058646954, + "grad_norm": 0.5534679792393347, + "learning_rate": 5.888467270348208e-06, + "loss": 11.7441, + "step": 32817 + }, + { + "epoch": 1.7870712598612783, + "grad_norm": 0.5075673098860951, + "learning_rate": 5.885486309278964e-06, + "loss": 11.7683, + "step": 32818 + }, + { + "epoch": 1.7871257138578613, + "grad_norm": 0.582522103117588, + "learning_rate": 5.882506080059968e-06, + "loss": 11.7187, + "step": 32819 + }, + { + "epoch": 1.7871801678544443, + "grad_norm": 0.560677545645884, + "learning_rate": 5.879526582714412e-06, + "loss": 11.7653, + "step": 32820 + }, + { + "epoch": 1.7872346218510273, + "grad_norm": 0.5288387157816162, + "learning_rate": 5.876547817265421e-06, + "loss": 11.7938, + "step": 32821 + }, + { + "epoch": 1.7872890758476105, + "grad_norm": 0.5107213676622757, + "learning_rate": 5.873569783736188e-06, + "loss": 11.748, + "step": 32822 + }, + { + "epoch": 1.7873435298441935, + "grad_norm": 0.5409498270161708, + "learning_rate": 5.87059248214985e-06, + "loss": 11.8192, + "step": 32823 + }, + { + "epoch": 1.7873979838407765, + "grad_norm": 0.4982023751157804, + "learning_rate": 5.867615912529589e-06, + "loss": 11.7029, + "step": 32824 + }, + { + "epoch": 1.7874524378373595, + "grad_norm": 0.5016788576366078, + "learning_rate": 5.864640074898509e-06, + "loss": 11.7278, + "step": 32825 + }, + { + "epoch": 1.7875068918339425, + "grad_norm": 0.6166524688272519, + "learning_rate": 5.861664969279779e-06, + "loss": 11.8109, + "step": 32826 + }, + { + "epoch": 1.7875613458305255, + "grad_norm": 0.6151538621448444, + "learning_rate": 5.858690595696559e-06, + "loss": 11.8117, + "step": 32827 + }, + { + "epoch": 1.7876157998271087, + "grad_norm": 0.5488559382736814, + "learning_rate": 5.855716954171919e-06, + "loss": 11.7301, + "step": 32828 + }, + { + "epoch": 1.7876702538236917, + "grad_norm": 0.5040495965532082, + "learning_rate": 5.852744044729008e-06, + "loss": 11.8371, + "step": 32829 + }, + { + "epoch": 1.7877247078202747, + "grad_norm": 0.5197784734437446, + "learning_rate": 5.849771867390974e-06, + "loss": 11.7459, + "step": 32830 + }, + { + "epoch": 1.7877791618168577, + "grad_norm": 0.5692484554854838, + "learning_rate": 5.846800422180876e-06, + "loss": 11.7582, + "step": 32831 + }, + { + "epoch": 1.7878336158134407, + "grad_norm": 0.5860774792490339, + "learning_rate": 5.843829709121862e-06, + "loss": 11.8864, + "step": 32832 + }, + { + "epoch": 1.7878880698100237, + "grad_norm": 0.5093837756154763, + "learning_rate": 5.840859728237013e-06, + "loss": 11.6715, + "step": 32833 + }, + { + "epoch": 1.7879425238066067, + "grad_norm": 0.529759685057493, + "learning_rate": 5.837890479549435e-06, + "loss": 11.8237, + "step": 32834 + }, + { + "epoch": 1.7879969778031897, + "grad_norm": 0.5358648716126443, + "learning_rate": 5.834921963082207e-06, + "loss": 11.7788, + "step": 32835 + }, + { + "epoch": 1.7880514317997727, + "grad_norm": 0.49667954301496087, + "learning_rate": 5.831954178858412e-06, + "loss": 11.6285, + "step": 32836 + }, + { + "epoch": 1.7881058857963557, + "grad_norm": 0.5044001812051533, + "learning_rate": 5.828987126901164e-06, + "loss": 11.6813, + "step": 32837 + }, + { + "epoch": 1.7881603397929386, + "grad_norm": 0.49883837962208377, + "learning_rate": 5.826020807233467e-06, + "loss": 11.8703, + "step": 32838 + }, + { + "epoch": 1.7882147937895216, + "grad_norm": 0.520880660530782, + "learning_rate": 5.823055219878448e-06, + "loss": 11.7585, + "step": 32839 + }, + { + "epoch": 1.7882692477861046, + "grad_norm": 0.5862275949805851, + "learning_rate": 5.820090364859132e-06, + "loss": 11.7042, + "step": 32840 + }, + { + "epoch": 1.7883237017826876, + "grad_norm": 0.6167287607529381, + "learning_rate": 5.817126242198578e-06, + "loss": 11.7581, + "step": 32841 + }, + { + "epoch": 1.7883781557792706, + "grad_norm": 0.5315434795855483, + "learning_rate": 5.814162851919869e-06, + "loss": 11.567, + "step": 32842 + }, + { + "epoch": 1.7884326097758536, + "grad_norm": 0.5537450764862275, + "learning_rate": 5.811200194046007e-06, + "loss": 11.7587, + "step": 32843 + }, + { + "epoch": 1.7884870637724366, + "grad_norm": 0.5531575408292612, + "learning_rate": 5.808238268600064e-06, + "loss": 11.929, + "step": 32844 + }, + { + "epoch": 1.7885415177690196, + "grad_norm": 0.5542420036282248, + "learning_rate": 5.8052770756050315e-06, + "loss": 11.7609, + "step": 32845 + }, + { + "epoch": 1.7885959717656028, + "grad_norm": 0.5390362602659625, + "learning_rate": 5.80231661508398e-06, + "loss": 11.7488, + "step": 32846 + }, + { + "epoch": 1.7886504257621858, + "grad_norm": 0.5517606227076052, + "learning_rate": 5.799356887059915e-06, + "loss": 11.844, + "step": 32847 + }, + { + "epoch": 1.7887048797587688, + "grad_norm": 0.5451060092260289, + "learning_rate": 5.7963978915558384e-06, + "loss": 11.7586, + "step": 32848 + }, + { + "epoch": 1.7887593337553518, + "grad_norm": 0.5593841263572491, + "learning_rate": 5.793439628594777e-06, + "loss": 11.7846, + "step": 32849 + }, + { + "epoch": 1.7888137877519348, + "grad_norm": 0.5517176516321998, + "learning_rate": 5.7904820981997125e-06, + "loss": 11.7455, + "step": 32850 + }, + { + "epoch": 1.788868241748518, + "grad_norm": 0.5971910759972061, + "learning_rate": 5.787525300393681e-06, + "loss": 11.8001, + "step": 32851 + }, + { + "epoch": 1.788922695745101, + "grad_norm": 0.5457154284841771, + "learning_rate": 5.784569235199633e-06, + "loss": 11.7141, + "step": 32852 + }, + { + "epoch": 1.788977149741684, + "grad_norm": 0.5036840400789974, + "learning_rate": 5.78161390264057e-06, + "loss": 11.8536, + "step": 32853 + }, + { + "epoch": 1.789031603738267, + "grad_norm": 0.5905699640468144, + "learning_rate": 5.778659302739497e-06, + "loss": 11.6751, + "step": 32854 + }, + { + "epoch": 1.78908605773485, + "grad_norm": 0.4858515758353979, + "learning_rate": 5.775705435519351e-06, + "loss": 11.6447, + "step": 32855 + }, + { + "epoch": 1.789140511731433, + "grad_norm": 0.49867023838384755, + "learning_rate": 5.772752301003148e-06, + "loss": 11.6954, + "step": 32856 + }, + { + "epoch": 1.789194965728016, + "grad_norm": 0.5234473695025423, + "learning_rate": 5.769799899213812e-06, + "loss": 11.8778, + "step": 32857 + }, + { + "epoch": 1.789249419724599, + "grad_norm": 0.5285744211556234, + "learning_rate": 5.766848230174304e-06, + "loss": 11.8652, + "step": 32858 + }, + { + "epoch": 1.789303873721182, + "grad_norm": 0.5694143238664334, + "learning_rate": 5.763897293907605e-06, + "loss": 11.6405, + "step": 32859 + }, + { + "epoch": 1.789358327717765, + "grad_norm": 0.5514348461301628, + "learning_rate": 5.760947090436619e-06, + "loss": 11.7148, + "step": 32860 + }, + { + "epoch": 1.789412781714348, + "grad_norm": 0.5305600285038201, + "learning_rate": 5.757997619784339e-06, + "loss": 11.7894, + "step": 32861 + }, + { + "epoch": 1.789467235710931, + "grad_norm": 0.5262665324909412, + "learning_rate": 5.755048881973657e-06, + "loss": 11.8313, + "step": 32862 + }, + { + "epoch": 1.789521689707514, + "grad_norm": 0.5583189119528796, + "learning_rate": 5.752100877027511e-06, + "loss": 11.8179, + "step": 32863 + }, + { + "epoch": 1.789576143704097, + "grad_norm": 0.5139461141436407, + "learning_rate": 5.74915360496886e-06, + "loss": 11.6874, + "step": 32864 + }, + { + "epoch": 1.78963059770068, + "grad_norm": 0.531811869166695, + "learning_rate": 5.746207065820575e-06, + "loss": 11.9065, + "step": 32865 + }, + { + "epoch": 1.7896850516972629, + "grad_norm": 0.5573450083285999, + "learning_rate": 5.743261259605603e-06, + "loss": 11.7443, + "step": 32866 + }, + { + "epoch": 1.7897395056938459, + "grad_norm": 0.5363820158103323, + "learning_rate": 5.740316186346839e-06, + "loss": 11.7429, + "step": 32867 + }, + { + "epoch": 1.7897939596904289, + "grad_norm": 0.5264741953493456, + "learning_rate": 5.737371846067174e-06, + "loss": 11.5681, + "step": 32868 + }, + { + "epoch": 1.789848413687012, + "grad_norm": 0.5184990745929842, + "learning_rate": 5.734428238789524e-06, + "loss": 11.8289, + "step": 32869 + }, + { + "epoch": 1.789902867683595, + "grad_norm": 0.5524714354586424, + "learning_rate": 5.73148536453676e-06, + "loss": 11.6553, + "step": 32870 + }, + { + "epoch": 1.789957321680178, + "grad_norm": 0.5484058375687554, + "learning_rate": 5.728543223331784e-06, + "loss": 11.8601, + "step": 32871 + }, + { + "epoch": 1.790011775676761, + "grad_norm": 0.5695995821207973, + "learning_rate": 5.725601815197445e-06, + "loss": 11.7591, + "step": 32872 + }, + { + "epoch": 1.790066229673344, + "grad_norm": 0.57874390247035, + "learning_rate": 5.72266114015666e-06, + "loss": 11.7745, + "step": 32873 + }, + { + "epoch": 1.790120683669927, + "grad_norm": 0.531637325728104, + "learning_rate": 5.719721198232253e-06, + "loss": 11.7771, + "step": 32874 + }, + { + "epoch": 1.7901751376665103, + "grad_norm": 0.5462136666414714, + "learning_rate": 5.716781989447106e-06, + "loss": 11.6699, + "step": 32875 + }, + { + "epoch": 1.7902295916630933, + "grad_norm": 0.503406205559153, + "learning_rate": 5.713843513824091e-06, + "loss": 11.6755, + "step": 32876 + }, + { + "epoch": 1.7902840456596762, + "grad_norm": 0.4759854353265222, + "learning_rate": 5.710905771386043e-06, + "loss": 11.6307, + "step": 32877 + }, + { + "epoch": 1.7903384996562592, + "grad_norm": 0.5647717676050596, + "learning_rate": 5.70796876215578e-06, + "loss": 11.7854, + "step": 32878 + }, + { + "epoch": 1.7903929536528422, + "grad_norm": 0.5050902283551958, + "learning_rate": 5.705032486156181e-06, + "loss": 11.5763, + "step": 32879 + }, + { + "epoch": 1.7904474076494252, + "grad_norm": 0.5145242576764114, + "learning_rate": 5.702096943410052e-06, + "loss": 11.7488, + "step": 32880 + }, + { + "epoch": 1.7905018616460082, + "grad_norm": 0.5428305930845646, + "learning_rate": 5.69916213394025e-06, + "loss": 11.7431, + "step": 32881 + }, + { + "epoch": 1.7905563156425912, + "grad_norm": 0.5071416773790963, + "learning_rate": 5.696228057769559e-06, + "loss": 11.7854, + "step": 32882 + }, + { + "epoch": 1.7906107696391742, + "grad_norm": 0.5550151678472932, + "learning_rate": 5.693294714920816e-06, + "loss": 11.8018, + "step": 32883 + }, + { + "epoch": 1.7906652236357572, + "grad_norm": 0.5399487750150528, + "learning_rate": 5.690362105416825e-06, + "loss": 11.7867, + "step": 32884 + }, + { + "epoch": 1.7907196776323402, + "grad_norm": 0.5485015365277324, + "learning_rate": 5.6874302292803995e-06, + "loss": 11.682, + "step": 32885 + }, + { + "epoch": 1.7907741316289232, + "grad_norm": 0.7571456953560544, + "learning_rate": 5.684499086534345e-06, + "loss": 11.9283, + "step": 32886 + }, + { + "epoch": 1.7908285856255062, + "grad_norm": 0.5273593563894615, + "learning_rate": 5.681568677201432e-06, + "loss": 11.7999, + "step": 32887 + }, + { + "epoch": 1.7908830396220892, + "grad_norm": 0.626783140819843, + "learning_rate": 5.678639001304464e-06, + "loss": 11.9423, + "step": 32888 + }, + { + "epoch": 1.7909374936186722, + "grad_norm": 0.5420035564587004, + "learning_rate": 5.675710058866202e-06, + "loss": 11.8615, + "step": 32889 + }, + { + "epoch": 1.7909919476152552, + "grad_norm": 0.518446198273281, + "learning_rate": 5.672781849909436e-06, + "loss": 11.6747, + "step": 32890 + }, + { + "epoch": 1.7910464016118381, + "grad_norm": 0.5310628896367843, + "learning_rate": 5.669854374456962e-06, + "loss": 11.7012, + "step": 32891 + }, + { + "epoch": 1.7911008556084214, + "grad_norm": 0.5225573995709512, + "learning_rate": 5.666927632531494e-06, + "loss": 11.8082, + "step": 32892 + }, + { + "epoch": 1.7911553096050044, + "grad_norm": 0.5447897525909504, + "learning_rate": 5.664001624155835e-06, + "loss": 11.8022, + "step": 32893 + }, + { + "epoch": 1.7912097636015873, + "grad_norm": 0.5324193042929436, + "learning_rate": 5.661076349352701e-06, + "loss": 11.9058, + "step": 32894 + }, + { + "epoch": 1.7912642175981703, + "grad_norm": 0.6308899564451923, + "learning_rate": 5.658151808144874e-06, + "loss": 11.6467, + "step": 32895 + }, + { + "epoch": 1.7913186715947533, + "grad_norm": 0.5388995465943365, + "learning_rate": 5.655228000555069e-06, + "loss": 11.7834, + "step": 32896 + }, + { + "epoch": 1.7913731255913363, + "grad_norm": 0.5674606228068833, + "learning_rate": 5.652304926606045e-06, + "loss": 11.7996, + "step": 32897 + }, + { + "epoch": 1.7914275795879195, + "grad_norm": 0.5652502506867298, + "learning_rate": 5.649382586320517e-06, + "loss": 11.7887, + "step": 32898 + }, + { + "epoch": 1.7914820335845025, + "grad_norm": 0.5457927536965536, + "learning_rate": 5.646460979721202e-06, + "loss": 11.6952, + "step": 32899 + }, + { + "epoch": 1.7915364875810855, + "grad_norm": 0.6501923471868624, + "learning_rate": 5.643540106830825e-06, + "loss": 11.9492, + "step": 32900 + }, + { + "epoch": 1.7915909415776685, + "grad_norm": 0.5184715250681738, + "learning_rate": 5.640619967672123e-06, + "loss": 11.789, + "step": 32901 + }, + { + "epoch": 1.7916453955742515, + "grad_norm": 0.5407044796019927, + "learning_rate": 5.637700562267767e-06, + "loss": 11.8146, + "step": 32902 + }, + { + "epoch": 1.7916998495708345, + "grad_norm": 0.5577398170497451, + "learning_rate": 5.634781890640484e-06, + "loss": 11.7791, + "step": 32903 + }, + { + "epoch": 1.7917543035674175, + "grad_norm": 0.564337721593817, + "learning_rate": 5.631863952812955e-06, + "loss": 11.9121, + "step": 32904 + }, + { + "epoch": 1.7918087575640005, + "grad_norm": 0.5491338197300244, + "learning_rate": 5.628946748807895e-06, + "loss": 11.7338, + "step": 32905 + }, + { + "epoch": 1.7918632115605835, + "grad_norm": 0.529156402740374, + "learning_rate": 5.626030278647954e-06, + "loss": 11.7318, + "step": 32906 + }, + { + "epoch": 1.7919176655571665, + "grad_norm": 0.6292734936144189, + "learning_rate": 5.623114542355845e-06, + "loss": 11.7865, + "step": 32907 + }, + { + "epoch": 1.7919721195537495, + "grad_norm": 0.558882568823229, + "learning_rate": 5.620199539954218e-06, + "loss": 11.9001, + "step": 32908 + }, + { + "epoch": 1.7920265735503325, + "grad_norm": 0.5287236011991581, + "learning_rate": 5.6172852714657335e-06, + "loss": 11.6772, + "step": 32909 + }, + { + "epoch": 1.7920810275469155, + "grad_norm": 0.5153919322195507, + "learning_rate": 5.614371736913082e-06, + "loss": 11.6605, + "step": 32910 + }, + { + "epoch": 1.7921354815434984, + "grad_norm": 0.49298910148421154, + "learning_rate": 5.6114589363188915e-06, + "loss": 11.7694, + "step": 32911 + }, + { + "epoch": 1.7921899355400814, + "grad_norm": 0.5639422226384082, + "learning_rate": 5.608546869705822e-06, + "loss": 11.7764, + "step": 32912 + }, + { + "epoch": 1.7922443895366644, + "grad_norm": 0.48621126242864204, + "learning_rate": 5.605635537096543e-06, + "loss": 11.6721, + "step": 32913 + }, + { + "epoch": 1.7922988435332474, + "grad_norm": 0.5956402582652661, + "learning_rate": 5.602724938513649e-06, + "loss": 11.8766, + "step": 32914 + }, + { + "epoch": 1.7923532975298304, + "grad_norm": 0.578792789996769, + "learning_rate": 5.599815073979819e-06, + "loss": 11.8459, + "step": 32915 + }, + { + "epoch": 1.7924077515264136, + "grad_norm": 0.520250136008144, + "learning_rate": 5.5969059435176386e-06, + "loss": 11.8288, + "step": 32916 + }, + { + "epoch": 1.7924622055229966, + "grad_norm": 0.6073175168381555, + "learning_rate": 5.593997547149765e-06, + "loss": 11.8408, + "step": 32917 + }, + { + "epoch": 1.7925166595195796, + "grad_norm": 0.5155050623384161, + "learning_rate": 5.5910898848987925e-06, + "loss": 11.831, + "step": 32918 + }, + { + "epoch": 1.7925711135161626, + "grad_norm": 0.5066378974759215, + "learning_rate": 5.5881829567873355e-06, + "loss": 11.7648, + "step": 32919 + }, + { + "epoch": 1.7926255675127456, + "grad_norm": 0.530946138852541, + "learning_rate": 5.585276762838009e-06, + "loss": 11.7931, + "step": 32920 + }, + { + "epoch": 1.7926800215093288, + "grad_norm": 0.5880682300761637, + "learning_rate": 5.582371303073386e-06, + "loss": 11.8818, + "step": 32921 + }, + { + "epoch": 1.7927344755059118, + "grad_norm": 0.7158473795371252, + "learning_rate": 5.57946657751609e-06, + "loss": 11.8053, + "step": 32922 + }, + { + "epoch": 1.7927889295024948, + "grad_norm": 0.5643935947936998, + "learning_rate": 5.576562586188694e-06, + "loss": 11.7546, + "step": 32923 + }, + { + "epoch": 1.7928433834990778, + "grad_norm": 0.5462105170578148, + "learning_rate": 5.573659329113767e-06, + "loss": 11.7147, + "step": 32924 + }, + { + "epoch": 1.7928978374956608, + "grad_norm": 0.5741835540156626, + "learning_rate": 5.570756806313926e-06, + "loss": 11.9257, + "step": 32925 + }, + { + "epoch": 1.7929522914922438, + "grad_norm": 0.5454007195822214, + "learning_rate": 5.567855017811696e-06, + "loss": 11.7167, + "step": 32926 + }, + { + "epoch": 1.7930067454888268, + "grad_norm": 0.565170862047559, + "learning_rate": 5.564953963629671e-06, + "loss": 11.7812, + "step": 32927 + }, + { + "epoch": 1.7930611994854098, + "grad_norm": 0.5751964823867437, + "learning_rate": 5.562053643790411e-06, + "loss": 11.8629, + "step": 32928 + }, + { + "epoch": 1.7931156534819928, + "grad_norm": 0.6041347164794797, + "learning_rate": 5.5591540583164406e-06, + "loss": 11.8195, + "step": 32929 + }, + { + "epoch": 1.7931701074785757, + "grad_norm": 0.542418557889861, + "learning_rate": 5.556255207230343e-06, + "loss": 11.802, + "step": 32930 + }, + { + "epoch": 1.7932245614751587, + "grad_norm": 0.5974718855396798, + "learning_rate": 5.553357090554623e-06, + "loss": 11.7662, + "step": 32931 + }, + { + "epoch": 1.7932790154717417, + "grad_norm": 0.49823910135020305, + "learning_rate": 5.550459708311862e-06, + "loss": 11.7352, + "step": 32932 + }, + { + "epoch": 1.7933334694683247, + "grad_norm": 0.5628189693596657, + "learning_rate": 5.547563060524541e-06, + "loss": 11.711, + "step": 32933 + }, + { + "epoch": 1.7933879234649077, + "grad_norm": 0.5985189818794429, + "learning_rate": 5.54466714721521e-06, + "loss": 11.6791, + "step": 32934 + }, + { + "epoch": 1.7934423774614907, + "grad_norm": 0.549031945653635, + "learning_rate": 5.541771968406406e-06, + "loss": 11.8105, + "step": 32935 + }, + { + "epoch": 1.7934968314580737, + "grad_norm": 0.585048829188477, + "learning_rate": 5.538877524120611e-06, + "loss": 11.8103, + "step": 32936 + }, + { + "epoch": 1.7935512854546567, + "grad_norm": 0.5897362453287507, + "learning_rate": 5.5359838143803635e-06, + "loss": 11.8408, + "step": 32937 + }, + { + "epoch": 1.7936057394512397, + "grad_norm": 0.519749619448922, + "learning_rate": 5.533090839208133e-06, + "loss": 11.714, + "step": 32938 + }, + { + "epoch": 1.793660193447823, + "grad_norm": 0.5693986335915681, + "learning_rate": 5.5301985986264234e-06, + "loss": 11.7528, + "step": 32939 + }, + { + "epoch": 1.793714647444406, + "grad_norm": 0.5171493502925723, + "learning_rate": 5.527307092657741e-06, + "loss": 11.6295, + "step": 32940 + }, + { + "epoch": 1.793769101440989, + "grad_norm": 0.5438200903069883, + "learning_rate": 5.5244163213245545e-06, + "loss": 11.8122, + "step": 32941 + }, + { + "epoch": 1.7938235554375719, + "grad_norm": 0.6501297040214571, + "learning_rate": 5.521526284649359e-06, + "loss": 11.8396, + "step": 32942 + }, + { + "epoch": 1.7938780094341549, + "grad_norm": 0.5484339027683268, + "learning_rate": 5.518636982654612e-06, + "loss": 11.7793, + "step": 32943 + }, + { + "epoch": 1.7939324634307379, + "grad_norm": 0.549753147222448, + "learning_rate": 5.515748415362798e-06, + "loss": 11.893, + "step": 32944 + }, + { + "epoch": 1.793986917427321, + "grad_norm": 0.5140662256940166, + "learning_rate": 5.512860582796353e-06, + "loss": 11.7629, + "step": 32945 + }, + { + "epoch": 1.794041371423904, + "grad_norm": 0.5795490634534858, + "learning_rate": 5.5099734849777595e-06, + "loss": 11.7468, + "step": 32946 + }, + { + "epoch": 1.794095825420487, + "grad_norm": 0.5368867896303627, + "learning_rate": 5.5070871219294776e-06, + "loss": 11.808, + "step": 32947 + }, + { + "epoch": 1.79415027941707, + "grad_norm": 0.5882965945355642, + "learning_rate": 5.50420149367391e-06, + "loss": 11.8415, + "step": 32948 + }, + { + "epoch": 1.794204733413653, + "grad_norm": 0.5701701114734583, + "learning_rate": 5.501316600233508e-06, + "loss": 11.7892, + "step": 32949 + }, + { + "epoch": 1.794259187410236, + "grad_norm": 0.5466941626380739, + "learning_rate": 5.4984324416307405e-06, + "loss": 11.8151, + "step": 32950 + }, + { + "epoch": 1.794313641406819, + "grad_norm": 0.5454520733639197, + "learning_rate": 5.495549017887991e-06, + "loss": 11.7103, + "step": 32951 + }, + { + "epoch": 1.794368095403402, + "grad_norm": 0.5774645190721264, + "learning_rate": 5.492666329027718e-06, + "loss": 11.7959, + "step": 32952 + }, + { + "epoch": 1.794422549399985, + "grad_norm": 0.5382498296973779, + "learning_rate": 5.4897843750723045e-06, + "loss": 11.7658, + "step": 32953 + }, + { + "epoch": 1.794477003396568, + "grad_norm": 0.5640751589546652, + "learning_rate": 5.486903156044187e-06, + "loss": 11.8879, + "step": 32954 + }, + { + "epoch": 1.794531457393151, + "grad_norm": 0.5619623001192191, + "learning_rate": 5.48402267196575e-06, + "loss": 11.9502, + "step": 32955 + }, + { + "epoch": 1.794585911389734, + "grad_norm": 0.6720658821772526, + "learning_rate": 5.481142922859428e-06, + "loss": 11.8907, + "step": 32956 + }, + { + "epoch": 1.794640365386317, + "grad_norm": 0.5309193801337244, + "learning_rate": 5.4782639087475714e-06, + "loss": 11.8927, + "step": 32957 + }, + { + "epoch": 1.7946948193829, + "grad_norm": 0.5060859659602768, + "learning_rate": 5.475385629652585e-06, + "loss": 11.7418, + "step": 32958 + }, + { + "epoch": 1.794749273379483, + "grad_norm": 0.5500296358704544, + "learning_rate": 5.472508085596861e-06, + "loss": 11.5995, + "step": 32959 + }, + { + "epoch": 1.794803727376066, + "grad_norm": 0.5415131342335241, + "learning_rate": 5.469631276602749e-06, + "loss": 11.7777, + "step": 32960 + }, + { + "epoch": 1.794858181372649, + "grad_norm": 0.4773191893379602, + "learning_rate": 5.4667552026926415e-06, + "loss": 11.7708, + "step": 32961 + }, + { + "epoch": 1.7949126353692322, + "grad_norm": 0.6345859726488672, + "learning_rate": 5.46387986388891e-06, + "loss": 11.9852, + "step": 32962 + }, + { + "epoch": 1.7949670893658152, + "grad_norm": 0.5511800044115259, + "learning_rate": 5.461005260213892e-06, + "loss": 11.807, + "step": 32963 + }, + { + "epoch": 1.7950215433623982, + "grad_norm": 0.6016446725644914, + "learning_rate": 5.458131391689958e-06, + "loss": 11.8363, + "step": 32964 + }, + { + "epoch": 1.7950759973589812, + "grad_norm": 0.5373914682548222, + "learning_rate": 5.455258258339446e-06, + "loss": 11.782, + "step": 32965 + }, + { + "epoch": 1.7951304513555641, + "grad_norm": 0.49659643496513534, + "learning_rate": 5.452385860184705e-06, + "loss": 11.7047, + "step": 32966 + }, + { + "epoch": 1.7951849053521471, + "grad_norm": 0.6386390216313624, + "learning_rate": 5.449514197248051e-06, + "loss": 11.6199, + "step": 32967 + }, + { + "epoch": 1.7952393593487304, + "grad_norm": 0.6626104470852224, + "learning_rate": 5.4466432695518545e-06, + "loss": 11.8896, + "step": 32968 + }, + { + "epoch": 1.7952938133453133, + "grad_norm": 0.5813216714212438, + "learning_rate": 5.443773077118419e-06, + "loss": 11.8182, + "step": 32969 + }, + { + "epoch": 1.7953482673418963, + "grad_norm": 0.5478349669179279, + "learning_rate": 5.4409036199700395e-06, + "loss": 11.8171, + "step": 32970 + }, + { + "epoch": 1.7954027213384793, + "grad_norm": 0.55359895922794, + "learning_rate": 5.438034898129063e-06, + "loss": 11.6894, + "step": 32971 + }, + { + "epoch": 1.7954571753350623, + "grad_norm": 0.5285163082513964, + "learning_rate": 5.4351669116177725e-06, + "loss": 11.7452, + "step": 32972 + }, + { + "epoch": 1.7955116293316453, + "grad_norm": 0.5531286289009327, + "learning_rate": 5.432299660458484e-06, + "loss": 11.7209, + "step": 32973 + }, + { + "epoch": 1.7955660833282283, + "grad_norm": 0.533561977470842, + "learning_rate": 5.429433144673512e-06, + "loss": 11.7901, + "step": 32974 + }, + { + "epoch": 1.7956205373248113, + "grad_norm": 0.5161039121828637, + "learning_rate": 5.426567364285107e-06, + "loss": 11.6132, + "step": 32975 + }, + { + "epoch": 1.7956749913213943, + "grad_norm": 0.5068217370561073, + "learning_rate": 5.423702319315593e-06, + "loss": 11.7355, + "step": 32976 + }, + { + "epoch": 1.7957294453179773, + "grad_norm": 0.6415120804812983, + "learning_rate": 5.42083800978721e-06, + "loss": 11.9336, + "step": 32977 + }, + { + "epoch": 1.7957838993145603, + "grad_norm": 0.526911813800498, + "learning_rate": 5.417974435722273e-06, + "loss": 11.7279, + "step": 32978 + }, + { + "epoch": 1.7958383533111433, + "grad_norm": 0.4895660059007519, + "learning_rate": 5.415111597143019e-06, + "loss": 11.8068, + "step": 32979 + }, + { + "epoch": 1.7958928073077263, + "grad_norm": 0.5158891977356518, + "learning_rate": 5.412249494071709e-06, + "loss": 11.8079, + "step": 32980 + }, + { + "epoch": 1.7959472613043093, + "grad_norm": 0.53320740878039, + "learning_rate": 5.4093881265306235e-06, + "loss": 11.7425, + "step": 32981 + }, + { + "epoch": 1.7960017153008923, + "grad_norm": 0.5302737779417936, + "learning_rate": 5.406527494541991e-06, + "loss": 11.7888, + "step": 32982 + }, + { + "epoch": 1.7960561692974752, + "grad_norm": 0.5772101317185414, + "learning_rate": 5.4036675981280485e-06, + "loss": 11.8371, + "step": 32983 + }, + { + "epoch": 1.7961106232940582, + "grad_norm": 0.5354878640147748, + "learning_rate": 5.400808437311078e-06, + "loss": 11.9267, + "step": 32984 + }, + { + "epoch": 1.7961650772906415, + "grad_norm": 0.5921124983555275, + "learning_rate": 5.397950012113273e-06, + "loss": 11.8837, + "step": 32985 + }, + { + "epoch": 1.7962195312872244, + "grad_norm": 0.5211389425899624, + "learning_rate": 5.395092322556883e-06, + "loss": 11.8441, + "step": 32986 + }, + { + "epoch": 1.7962739852838074, + "grad_norm": 0.5216618274321483, + "learning_rate": 5.3922353686641e-06, + "loss": 11.6793, + "step": 32987 + }, + { + "epoch": 1.7963284392803904, + "grad_norm": 0.5172682682443953, + "learning_rate": 5.389379150457186e-06, + "loss": 11.7679, + "step": 32988 + }, + { + "epoch": 1.7963828932769734, + "grad_norm": 0.4961430726988221, + "learning_rate": 5.38652366795831e-06, + "loss": 11.7149, + "step": 32989 + }, + { + "epoch": 1.7964373472735564, + "grad_norm": 0.6063725149669045, + "learning_rate": 5.383668921189689e-06, + "loss": 11.8792, + "step": 32990 + }, + { + "epoch": 1.7964918012701396, + "grad_norm": 0.5702487715606085, + "learning_rate": 5.380814910173548e-06, + "loss": 11.7312, + "step": 32991 + }, + { + "epoch": 1.7965462552667226, + "grad_norm": 0.5801547243275779, + "learning_rate": 5.377961634932027e-06, + "loss": 11.7412, + "step": 32992 + }, + { + "epoch": 1.7966007092633056, + "grad_norm": 0.5527701141612907, + "learning_rate": 5.375109095487374e-06, + "loss": 11.7354, + "step": 32993 + }, + { + "epoch": 1.7966551632598886, + "grad_norm": 0.5735356711884958, + "learning_rate": 5.372257291861715e-06, + "loss": 11.8257, + "step": 32994 + }, + { + "epoch": 1.7967096172564716, + "grad_norm": 0.5822056748790422, + "learning_rate": 5.369406224077256e-06, + "loss": 11.6295, + "step": 32995 + }, + { + "epoch": 1.7967640712530546, + "grad_norm": 0.5841452217221534, + "learning_rate": 5.366555892156178e-06, + "loss": 11.8433, + "step": 32996 + }, + { + "epoch": 1.7968185252496376, + "grad_norm": 0.5432889074077022, + "learning_rate": 5.363706296120618e-06, + "loss": 11.7912, + "step": 32997 + }, + { + "epoch": 1.7968729792462206, + "grad_norm": 0.5221669384389692, + "learning_rate": 5.360857435992772e-06, + "loss": 11.8043, + "step": 32998 + }, + { + "epoch": 1.7969274332428036, + "grad_norm": 0.5864699388637166, + "learning_rate": 5.358009311794754e-06, + "loss": 11.8788, + "step": 32999 + }, + { + "epoch": 1.7969818872393866, + "grad_norm": 0.5059116582391517, + "learning_rate": 5.355161923548724e-06, + "loss": 11.686, + "step": 33000 + }, + { + "epoch": 1.7970363412359696, + "grad_norm": 0.5752081796614689, + "learning_rate": 5.352315271276831e-06, + "loss": 11.7916, + "step": 33001 + }, + { + "epoch": 1.7970907952325526, + "grad_norm": 0.5372043231782221, + "learning_rate": 5.349469355001202e-06, + "loss": 11.8713, + "step": 33002 + }, + { + "epoch": 1.7971452492291355, + "grad_norm": 0.5803842771590753, + "learning_rate": 5.346624174743986e-06, + "loss": 11.9202, + "step": 33003 + }, + { + "epoch": 1.7971997032257185, + "grad_norm": 0.5032991441482636, + "learning_rate": 5.343779730527277e-06, + "loss": 11.6306, + "step": 33004 + }, + { + "epoch": 1.7972541572223015, + "grad_norm": 0.5803015960911285, + "learning_rate": 5.340936022373222e-06, + "loss": 11.8288, + "step": 33005 + }, + { + "epoch": 1.7973086112188845, + "grad_norm": 0.5305174714243117, + "learning_rate": 5.338093050303905e-06, + "loss": 11.6913, + "step": 33006 + }, + { + "epoch": 1.7973630652154675, + "grad_norm": 0.6319059823594511, + "learning_rate": 5.335250814341464e-06, + "loss": 12.0044, + "step": 33007 + }, + { + "epoch": 1.7974175192120505, + "grad_norm": 0.5374725158357982, + "learning_rate": 5.332409314508003e-06, + "loss": 11.6781, + "step": 33008 + }, + { + "epoch": 1.7974719732086337, + "grad_norm": 0.5578158580914994, + "learning_rate": 5.329568550825581e-06, + "loss": 11.8052, + "step": 33009 + }, + { + "epoch": 1.7975264272052167, + "grad_norm": 0.5957287702480674, + "learning_rate": 5.3267285233163045e-06, + "loss": 11.8027, + "step": 33010 + }, + { + "epoch": 1.7975808812017997, + "grad_norm": 0.4860268760649681, + "learning_rate": 5.3238892320022886e-06, + "loss": 11.8393, + "step": 33011 + }, + { + "epoch": 1.7976353351983827, + "grad_norm": 0.5436182457278895, + "learning_rate": 5.3210506769055705e-06, + "loss": 11.7452, + "step": 33012 + }, + { + "epoch": 1.7976897891949657, + "grad_norm": 0.5570223204020921, + "learning_rate": 5.318212858048244e-06, + "loss": 11.787, + "step": 33013 + }, + { + "epoch": 1.7977442431915487, + "grad_norm": 0.5986976690922344, + "learning_rate": 5.315375775452369e-06, + "loss": 11.8338, + "step": 33014 + }, + { + "epoch": 1.797798697188132, + "grad_norm": 0.551169818946494, + "learning_rate": 5.312539429140018e-06, + "loss": 11.8283, + "step": 33015 + }, + { + "epoch": 1.797853151184715, + "grad_norm": 0.6176384875769115, + "learning_rate": 5.309703819133238e-06, + "loss": 11.869, + "step": 33016 + }, + { + "epoch": 1.7979076051812979, + "grad_norm": 0.5242391249554645, + "learning_rate": 5.306868945454068e-06, + "loss": 11.8052, + "step": 33017 + }, + { + "epoch": 1.7979620591778809, + "grad_norm": 0.5367222734114467, + "learning_rate": 5.304034808124591e-06, + "loss": 11.7946, + "step": 33018 + }, + { + "epoch": 1.7980165131744639, + "grad_norm": 0.5272415705511996, + "learning_rate": 5.3012014071668e-06, + "loss": 11.882, + "step": 33019 + }, + { + "epoch": 1.7980709671710469, + "grad_norm": 0.5580051244643481, + "learning_rate": 5.298368742602766e-06, + "loss": 11.6822, + "step": 33020 + }, + { + "epoch": 1.7981254211676299, + "grad_norm": 0.6239567137405468, + "learning_rate": 5.295536814454472e-06, + "loss": 11.8083, + "step": 33021 + }, + { + "epoch": 1.7981798751642128, + "grad_norm": 0.5463087922129438, + "learning_rate": 5.292705622743977e-06, + "loss": 11.5613, + "step": 33022 + }, + { + "epoch": 1.7982343291607958, + "grad_norm": 0.5552737913614896, + "learning_rate": 5.289875167493286e-06, + "loss": 11.7769, + "step": 33023 + }, + { + "epoch": 1.7982887831573788, + "grad_norm": 0.5147142660490316, + "learning_rate": 5.287045448724404e-06, + "loss": 11.8173, + "step": 33024 + }, + { + "epoch": 1.7983432371539618, + "grad_norm": 0.5788489928901949, + "learning_rate": 5.284216466459357e-06, + "loss": 11.7764, + "step": 33025 + }, + { + "epoch": 1.7983976911505448, + "grad_norm": 0.5810937995182615, + "learning_rate": 5.281388220720107e-06, + "loss": 11.8385, + "step": 33026 + }, + { + "epoch": 1.7984521451471278, + "grad_norm": 0.541854898260314, + "learning_rate": 5.27856071152868e-06, + "loss": 11.7527, + "step": 33027 + }, + { + "epoch": 1.7985065991437108, + "grad_norm": 0.4937860521646309, + "learning_rate": 5.275733938907046e-06, + "loss": 11.7261, + "step": 33028 + }, + { + "epoch": 1.7985610531402938, + "grad_norm": 0.5547554882226353, + "learning_rate": 5.272907902877189e-06, + "loss": 11.7142, + "step": 33029 + }, + { + "epoch": 1.7986155071368768, + "grad_norm": 0.544133630270464, + "learning_rate": 5.27008260346109e-06, + "loss": 11.8562, + "step": 33030 + }, + { + "epoch": 1.7986699611334598, + "grad_norm": 0.5407408427436463, + "learning_rate": 5.2672580406807e-06, + "loss": 11.8199, + "step": 33031 + }, + { + "epoch": 1.798724415130043, + "grad_norm": 0.5375801738999948, + "learning_rate": 5.2644342145580005e-06, + "loss": 11.7831, + "step": 33032 + }, + { + "epoch": 1.798778869126626, + "grad_norm": 0.6789230656775038, + "learning_rate": 5.261611125114963e-06, + "loss": 11.8353, + "step": 33033 + }, + { + "epoch": 1.798833323123209, + "grad_norm": 0.6411747950919314, + "learning_rate": 5.258788772373513e-06, + "loss": 11.7864, + "step": 33034 + }, + { + "epoch": 1.798887777119792, + "grad_norm": 0.510044093749449, + "learning_rate": 5.255967156355623e-06, + "loss": 11.863, + "step": 33035 + }, + { + "epoch": 1.798942231116375, + "grad_norm": 0.5360965215681892, + "learning_rate": 5.253146277083199e-06, + "loss": 11.6881, + "step": 33036 + }, + { + "epoch": 1.798996685112958, + "grad_norm": 0.5428324202304564, + "learning_rate": 5.250326134578221e-06, + "loss": 11.6235, + "step": 33037 + }, + { + "epoch": 1.7990511391095412, + "grad_norm": 0.49448376491651336, + "learning_rate": 5.247506728862595e-06, + "loss": 11.7862, + "step": 33038 + }, + { + "epoch": 1.7991055931061242, + "grad_norm": 0.5633106550018803, + "learning_rate": 5.244688059958225e-06, + "loss": 11.8028, + "step": 33039 + }, + { + "epoch": 1.7991600471027072, + "grad_norm": 0.6164474620025211, + "learning_rate": 5.241870127887072e-06, + "loss": 11.7571, + "step": 33040 + }, + { + "epoch": 1.7992145010992902, + "grad_norm": 0.556047598832963, + "learning_rate": 5.239052932671018e-06, + "loss": 11.7722, + "step": 33041 + }, + { + "epoch": 1.7992689550958731, + "grad_norm": 0.561233804075412, + "learning_rate": 5.23623647433199e-06, + "loss": 11.833, + "step": 33042 + }, + { + "epoch": 1.7993234090924561, + "grad_norm": 0.5363343879344251, + "learning_rate": 5.23342075289186e-06, + "loss": 11.8106, + "step": 33043 + }, + { + "epoch": 1.7993778630890391, + "grad_norm": 0.5180780725819338, + "learning_rate": 5.230605768372554e-06, + "loss": 11.7159, + "step": 33044 + }, + { + "epoch": 1.7994323170856221, + "grad_norm": 0.5917444056953549, + "learning_rate": 5.227791520795955e-06, + "loss": 11.8704, + "step": 33045 + }, + { + "epoch": 1.7994867710822051, + "grad_norm": 0.6058205122319649, + "learning_rate": 5.2249780101839345e-06, + "loss": 11.776, + "step": 33046 + }, + { + "epoch": 1.799541225078788, + "grad_norm": 0.5910864790269835, + "learning_rate": 5.222165236558385e-06, + "loss": 11.8462, + "step": 33047 + }, + { + "epoch": 1.799595679075371, + "grad_norm": 0.5409314574371767, + "learning_rate": 5.2193531999411795e-06, + "loss": 11.8283, + "step": 33048 + }, + { + "epoch": 1.799650133071954, + "grad_norm": 0.703583447428384, + "learning_rate": 5.2165419003541545e-06, + "loss": 11.8901, + "step": 33049 + }, + { + "epoch": 1.799704587068537, + "grad_norm": 0.5390945416622668, + "learning_rate": 5.213731337819217e-06, + "loss": 11.7955, + "step": 33050 + }, + { + "epoch": 1.79975904106512, + "grad_norm": 0.49991864252572077, + "learning_rate": 5.210921512358191e-06, + "loss": 11.7977, + "step": 33051 + }, + { + "epoch": 1.799813495061703, + "grad_norm": 0.5359739836329991, + "learning_rate": 5.2081124239929395e-06, + "loss": 11.7865, + "step": 33052 + }, + { + "epoch": 1.799867949058286, + "grad_norm": 0.6323550774870728, + "learning_rate": 5.205304072745299e-06, + "loss": 11.9618, + "step": 33053 + }, + { + "epoch": 1.799922403054869, + "grad_norm": 0.5009089060164308, + "learning_rate": 5.202496458637118e-06, + "loss": 11.7868, + "step": 33054 + }, + { + "epoch": 1.7999768570514523, + "grad_norm": 0.5397962049504392, + "learning_rate": 5.199689581690204e-06, + "loss": 11.7871, + "step": 33055 + }, + { + "epoch": 1.8000313110480353, + "grad_norm": 0.7182613204472554, + "learning_rate": 5.196883441926415e-06, + "loss": 11.7472, + "step": 33056 + }, + { + "epoch": 1.8000857650446183, + "grad_norm": 0.5485250475107362, + "learning_rate": 5.194078039367556e-06, + "loss": 11.6487, + "step": 33057 + }, + { + "epoch": 1.8001402190412013, + "grad_norm": 0.5951127871722497, + "learning_rate": 5.191273374035455e-06, + "loss": 11.9036, + "step": 33058 + }, + { + "epoch": 1.8001946730377842, + "grad_norm": 0.5071896827292842, + "learning_rate": 5.1884694459519045e-06, + "loss": 11.6455, + "step": 33059 + }, + { + "epoch": 1.8002491270343672, + "grad_norm": 0.5661822993466582, + "learning_rate": 5.185666255138721e-06, + "loss": 11.8508, + "step": 33060 + }, + { + "epoch": 1.8003035810309505, + "grad_norm": 0.5037982708150142, + "learning_rate": 5.182863801617677e-06, + "loss": 11.8576, + "step": 33061 + }, + { + "epoch": 1.8003580350275334, + "grad_norm": 0.5182435953116797, + "learning_rate": 5.180062085410609e-06, + "loss": 11.7623, + "step": 33062 + }, + { + "epoch": 1.8004124890241164, + "grad_norm": 0.6543159068382637, + "learning_rate": 5.177261106539255e-06, + "loss": 11.9147, + "step": 33063 + }, + { + "epoch": 1.8004669430206994, + "grad_norm": 0.573379508773424, + "learning_rate": 5.174460865025443e-06, + "loss": 11.6139, + "step": 33064 + }, + { + "epoch": 1.8005213970172824, + "grad_norm": 0.5455433176042846, + "learning_rate": 5.171661360890911e-06, + "loss": 11.6488, + "step": 33065 + }, + { + "epoch": 1.8005758510138654, + "grad_norm": 0.616186317030893, + "learning_rate": 5.168862594157442e-06, + "loss": 11.7815, + "step": 33066 + }, + { + "epoch": 1.8006303050104484, + "grad_norm": 0.5576784896864911, + "learning_rate": 5.166064564846818e-06, + "loss": 11.7695, + "step": 33067 + }, + { + "epoch": 1.8006847590070314, + "grad_norm": 0.558524115267277, + "learning_rate": 5.163267272980776e-06, + "loss": 11.9219, + "step": 33068 + }, + { + "epoch": 1.8007392130036144, + "grad_norm": 0.5776225595460162, + "learning_rate": 5.160470718581068e-06, + "loss": 11.8872, + "step": 33069 + }, + { + "epoch": 1.8007936670001974, + "grad_norm": 0.5695110058292474, + "learning_rate": 5.157674901669441e-06, + "loss": 11.6353, + "step": 33070 + }, + { + "epoch": 1.8008481209967804, + "grad_norm": 0.68544646777409, + "learning_rate": 5.154879822267633e-06, + "loss": 11.8049, + "step": 33071 + }, + { + "epoch": 1.8009025749933634, + "grad_norm": 0.7136796969335726, + "learning_rate": 5.152085480397395e-06, + "loss": 11.7655, + "step": 33072 + }, + { + "epoch": 1.8009570289899464, + "grad_norm": 0.5283364656274697, + "learning_rate": 5.149291876080431e-06, + "loss": 11.7, + "step": 33073 + }, + { + "epoch": 1.8010114829865294, + "grad_norm": 0.5487399564934956, + "learning_rate": 5.146499009338501e-06, + "loss": 11.7431, + "step": 33074 + }, + { + "epoch": 1.8010659369831123, + "grad_norm": 0.573375986776589, + "learning_rate": 5.143706880193289e-06, + "loss": 11.6611, + "step": 33075 + }, + { + "epoch": 1.8011203909796953, + "grad_norm": 0.5127462304835643, + "learning_rate": 5.1409154886665315e-06, + "loss": 11.7831, + "step": 33076 + }, + { + "epoch": 1.8011748449762783, + "grad_norm": 0.5341124749966027, + "learning_rate": 5.138124834779901e-06, + "loss": 11.7424, + "step": 33077 + }, + { + "epoch": 1.8012292989728613, + "grad_norm": 0.558461180509717, + "learning_rate": 5.135334918555146e-06, + "loss": 11.5907, + "step": 33078 + }, + { + "epoch": 1.8012837529694445, + "grad_norm": 0.5695077286896717, + "learning_rate": 5.132545740013928e-06, + "loss": 11.812, + "step": 33079 + }, + { + "epoch": 1.8013382069660275, + "grad_norm": 0.6236338186733865, + "learning_rate": 5.129757299177928e-06, + "loss": 11.8106, + "step": 33080 + }, + { + "epoch": 1.8013926609626105, + "grad_norm": 0.6048148154238496, + "learning_rate": 5.126969596068853e-06, + "loss": 11.9336, + "step": 33081 + }, + { + "epoch": 1.8014471149591935, + "grad_norm": 0.6021874516734276, + "learning_rate": 5.124182630708385e-06, + "loss": 11.8336, + "step": 33082 + }, + { + "epoch": 1.8015015689557765, + "grad_norm": 0.5510384474859819, + "learning_rate": 5.1213964031181615e-06, + "loss": 11.8096, + "step": 33083 + }, + { + "epoch": 1.8015560229523597, + "grad_norm": 0.5608582089557207, + "learning_rate": 5.118610913319888e-06, + "loss": 11.8161, + "step": 33084 + }, + { + "epoch": 1.8016104769489427, + "grad_norm": 0.5780941252311154, + "learning_rate": 5.115826161335202e-06, + "loss": 11.7778, + "step": 33085 + }, + { + "epoch": 1.8016649309455257, + "grad_norm": 0.5049369266089829, + "learning_rate": 5.113042147185765e-06, + "loss": 11.7887, + "step": 33086 + }, + { + "epoch": 1.8017193849421087, + "grad_norm": 0.572217096407932, + "learning_rate": 5.110258870893225e-06, + "loss": 11.8803, + "step": 33087 + }, + { + "epoch": 1.8017738389386917, + "grad_norm": 0.5332739464871372, + "learning_rate": 5.1074763324792215e-06, + "loss": 11.8783, + "step": 33088 + }, + { + "epoch": 1.8018282929352747, + "grad_norm": 0.584070318484261, + "learning_rate": 5.104694531965415e-06, + "loss": 11.7572, + "step": 33089 + }, + { + "epoch": 1.8018827469318577, + "grad_norm": 0.5883338984393851, + "learning_rate": 5.101913469373387e-06, + "loss": 11.7152, + "step": 33090 + }, + { + "epoch": 1.8019372009284407, + "grad_norm": 0.5086451157941407, + "learning_rate": 5.099133144724821e-06, + "loss": 11.7435, + "step": 33091 + }, + { + "epoch": 1.8019916549250237, + "grad_norm": 0.5571779434488101, + "learning_rate": 5.096353558041289e-06, + "loss": 11.8092, + "step": 33092 + }, + { + "epoch": 1.8020461089216067, + "grad_norm": 0.5312922849609283, + "learning_rate": 5.0935747093444285e-06, + "loss": 11.6488, + "step": 33093 + }, + { + "epoch": 1.8021005629181897, + "grad_norm": 0.5166841817620795, + "learning_rate": 5.090796598655867e-06, + "loss": 11.718, + "step": 33094 + }, + { + "epoch": 1.8021550169147726, + "grad_norm": 0.5906951796143799, + "learning_rate": 5.0880192259971645e-06, + "loss": 11.7187, + "step": 33095 + }, + { + "epoch": 1.8022094709113556, + "grad_norm": 0.4857076515708823, + "learning_rate": 5.0852425913899605e-06, + "loss": 11.7128, + "step": 33096 + }, + { + "epoch": 1.8022639249079386, + "grad_norm": 0.548613000878737, + "learning_rate": 5.082466694855803e-06, + "loss": 11.8502, + "step": 33097 + }, + { + "epoch": 1.8023183789045216, + "grad_norm": 0.5440445244085107, + "learning_rate": 5.079691536416331e-06, + "loss": 11.7426, + "step": 33098 + }, + { + "epoch": 1.8023728329011046, + "grad_norm": 0.5765165755015134, + "learning_rate": 5.0769171160930824e-06, + "loss": 11.9014, + "step": 33099 + }, + { + "epoch": 1.8024272868976876, + "grad_norm": 0.5193150501470704, + "learning_rate": 5.074143433907641e-06, + "loss": 11.7797, + "step": 33100 + }, + { + "epoch": 1.8024817408942706, + "grad_norm": 0.6264881221743966, + "learning_rate": 5.071370489881589e-06, + "loss": 11.857, + "step": 33101 + }, + { + "epoch": 1.8025361948908538, + "grad_norm": 0.5292029358386698, + "learning_rate": 5.068598284036474e-06, + "loss": 11.6965, + "step": 33102 + }, + { + "epoch": 1.8025906488874368, + "grad_norm": 0.5577462168844834, + "learning_rate": 5.065826816393848e-06, + "loss": 11.718, + "step": 33103 + }, + { + "epoch": 1.8026451028840198, + "grad_norm": 0.5124702474738054, + "learning_rate": 5.063056086975293e-06, + "loss": 11.7704, + "step": 33104 + }, + { + "epoch": 1.8026995568806028, + "grad_norm": 0.5172760410745203, + "learning_rate": 5.0602860958023136e-06, + "loss": 11.7581, + "step": 33105 + }, + { + "epoch": 1.8027540108771858, + "grad_norm": 0.5681498001389979, + "learning_rate": 5.057516842896492e-06, + "loss": 11.7469, + "step": 33106 + }, + { + "epoch": 1.8028084648737688, + "grad_norm": 0.553499130275587, + "learning_rate": 5.054748328279324e-06, + "loss": 11.725, + "step": 33107 + }, + { + "epoch": 1.802862918870352, + "grad_norm": 0.5267487531792995, + "learning_rate": 5.051980551972369e-06, + "loss": 11.8036, + "step": 33108 + }, + { + "epoch": 1.802917372866935, + "grad_norm": 0.5835299047872015, + "learning_rate": 5.049213513997142e-06, + "loss": 11.7708, + "step": 33109 + }, + { + "epoch": 1.802971826863518, + "grad_norm": 0.564457165664405, + "learning_rate": 5.046447214375138e-06, + "loss": 11.9303, + "step": 33110 + }, + { + "epoch": 1.803026280860101, + "grad_norm": 0.5626347562366967, + "learning_rate": 5.043681653127885e-06, + "loss": 11.5876, + "step": 33111 + }, + { + "epoch": 1.803080734856684, + "grad_norm": 0.5262466057129509, + "learning_rate": 5.040916830276887e-06, + "loss": 11.8207, + "step": 33112 + }, + { + "epoch": 1.803135188853267, + "grad_norm": 0.5368449488581447, + "learning_rate": 5.03815274584366e-06, + "loss": 11.7659, + "step": 33113 + }, + { + "epoch": 1.80318964284985, + "grad_norm": 0.6010632105142093, + "learning_rate": 5.035389399849666e-06, + "loss": 11.9371, + "step": 33114 + }, + { + "epoch": 1.803244096846433, + "grad_norm": 0.5916678779008685, + "learning_rate": 5.032626792316408e-06, + "loss": 11.7108, + "step": 33115 + }, + { + "epoch": 1.803298550843016, + "grad_norm": 0.5369634232869531, + "learning_rate": 5.029864923265382e-06, + "loss": 11.8294, + "step": 33116 + }, + { + "epoch": 1.803353004839599, + "grad_norm": 0.5089388708774236, + "learning_rate": 5.027103792718036e-06, + "loss": 11.7105, + "step": 33117 + }, + { + "epoch": 1.803407458836182, + "grad_norm": 0.5112987221801994, + "learning_rate": 5.024343400695874e-06, + "loss": 11.6755, + "step": 33118 + }, + { + "epoch": 1.803461912832765, + "grad_norm": 0.5754809501452369, + "learning_rate": 5.021583747220349e-06, + "loss": 11.843, + "step": 33119 + }, + { + "epoch": 1.803516366829348, + "grad_norm": 0.569141860179444, + "learning_rate": 5.018824832312907e-06, + "loss": 11.7935, + "step": 33120 + }, + { + "epoch": 1.803570820825931, + "grad_norm": 0.5524580090942568, + "learning_rate": 5.016066655995022e-06, + "loss": 11.7522, + "step": 33121 + }, + { + "epoch": 1.803625274822514, + "grad_norm": 0.5185444764335322, + "learning_rate": 5.013309218288109e-06, + "loss": 11.7835, + "step": 33122 + }, + { + "epoch": 1.8036797288190969, + "grad_norm": 0.5046861956728936, + "learning_rate": 5.0105525192136515e-06, + "loss": 11.6846, + "step": 33123 + }, + { + "epoch": 1.8037341828156799, + "grad_norm": 0.549262063097627, + "learning_rate": 5.007796558793054e-06, + "loss": 11.7959, + "step": 33124 + }, + { + "epoch": 1.803788636812263, + "grad_norm": 0.5706250564929886, + "learning_rate": 5.005041337047778e-06, + "loss": 11.6666, + "step": 33125 + }, + { + "epoch": 1.803843090808846, + "grad_norm": 0.5280845153105259, + "learning_rate": 5.002286853999216e-06, + "loss": 11.7222, + "step": 33126 + }, + { + "epoch": 1.803897544805429, + "grad_norm": 0.6090800605198194, + "learning_rate": 4.999533109668797e-06, + "loss": 11.8456, + "step": 33127 + }, + { + "epoch": 1.803951998802012, + "grad_norm": 0.5449261964947809, + "learning_rate": 4.996780104077958e-06, + "loss": 11.8051, + "step": 33128 + }, + { + "epoch": 1.804006452798595, + "grad_norm": 0.5253676105361571, + "learning_rate": 4.994027837248094e-06, + "loss": 11.826, + "step": 33129 + }, + { + "epoch": 1.804060906795178, + "grad_norm": 0.5381580407631475, + "learning_rate": 4.991276309200588e-06, + "loss": 11.8203, + "step": 33130 + }, + { + "epoch": 1.8041153607917613, + "grad_norm": 0.5776018075832223, + "learning_rate": 4.988525519956855e-06, + "loss": 11.8983, + "step": 33131 + }, + { + "epoch": 1.8041698147883443, + "grad_norm": 0.5193345683995854, + "learning_rate": 4.985775469538268e-06, + "loss": 11.7223, + "step": 33132 + }, + { + "epoch": 1.8042242687849273, + "grad_norm": 0.5355267333610721, + "learning_rate": 4.983026157966242e-06, + "loss": 11.7857, + "step": 33133 + }, + { + "epoch": 1.8042787227815102, + "grad_norm": 0.5441305285446537, + "learning_rate": 4.980277585262128e-06, + "loss": 11.8036, + "step": 33134 + }, + { + "epoch": 1.8043331767780932, + "grad_norm": 0.557721790054812, + "learning_rate": 4.977529751447318e-06, + "loss": 11.7417, + "step": 33135 + }, + { + "epoch": 1.8043876307746762, + "grad_norm": 0.5690619179987269, + "learning_rate": 4.974782656543164e-06, + "loss": 11.7234, + "step": 33136 + }, + { + "epoch": 1.8044420847712592, + "grad_norm": 0.5062107400952486, + "learning_rate": 4.9720363005710365e-06, + "loss": 11.7422, + "step": 33137 + }, + { + "epoch": 1.8044965387678422, + "grad_norm": 0.5485248627223632, + "learning_rate": 4.969290683552297e-06, + "loss": 11.6907, + "step": 33138 + }, + { + "epoch": 1.8045509927644252, + "grad_norm": 0.5305898060488378, + "learning_rate": 4.966545805508293e-06, + "loss": 11.7244, + "step": 33139 + }, + { + "epoch": 1.8046054467610082, + "grad_norm": 0.5429745541015821, + "learning_rate": 4.963801666460377e-06, + "loss": 11.7969, + "step": 33140 + }, + { + "epoch": 1.8046599007575912, + "grad_norm": 0.592253568178117, + "learning_rate": 4.961058266429852e-06, + "loss": 11.6663, + "step": 33141 + }, + { + "epoch": 1.8047143547541742, + "grad_norm": 0.5559710956921947, + "learning_rate": 4.95831560543808e-06, + "loss": 11.7606, + "step": 33142 + }, + { + "epoch": 1.8047688087507572, + "grad_norm": 0.6063945203602211, + "learning_rate": 4.955573683506387e-06, + "loss": 11.7917, + "step": 33143 + }, + { + "epoch": 1.8048232627473402, + "grad_norm": 0.5676914394759548, + "learning_rate": 4.9528325006560905e-06, + "loss": 11.7848, + "step": 33144 + }, + { + "epoch": 1.8048777167439232, + "grad_norm": 0.5441310715019334, + "learning_rate": 4.950092056908518e-06, + "loss": 11.789, + "step": 33145 + }, + { + "epoch": 1.8049321707405062, + "grad_norm": 0.562846309247268, + "learning_rate": 4.947352352284962e-06, + "loss": 11.8551, + "step": 33146 + }, + { + "epoch": 1.8049866247370892, + "grad_norm": 0.5843194000520776, + "learning_rate": 4.94461338680674e-06, + "loss": 11.7459, + "step": 33147 + }, + { + "epoch": 1.8050410787336721, + "grad_norm": 0.5762016328871316, + "learning_rate": 4.941875160495135e-06, + "loss": 11.8354, + "step": 33148 + }, + { + "epoch": 1.8050955327302554, + "grad_norm": 0.5579528750347603, + "learning_rate": 4.939137673371452e-06, + "loss": 11.811, + "step": 33149 + }, + { + "epoch": 1.8051499867268384, + "grad_norm": 0.5200221724236684, + "learning_rate": 4.936400925456997e-06, + "loss": 11.7344, + "step": 33150 + }, + { + "epoch": 1.8052044407234213, + "grad_norm": 0.6106854710543946, + "learning_rate": 4.933664916773007e-06, + "loss": 11.8581, + "step": 33151 + }, + { + "epoch": 1.8052588947200043, + "grad_norm": 0.554498844373309, + "learning_rate": 4.930929647340776e-06, + "loss": 11.8799, + "step": 33152 + }, + { + "epoch": 1.8053133487165873, + "grad_norm": 0.5525246288148445, + "learning_rate": 4.9281951171816e-06, + "loss": 11.6564, + "step": 33153 + }, + { + "epoch": 1.8053678027131705, + "grad_norm": 0.535704884413912, + "learning_rate": 4.925461326316705e-06, + "loss": 11.6345, + "step": 33154 + }, + { + "epoch": 1.8054222567097535, + "grad_norm": 0.5484366647238916, + "learning_rate": 4.922728274767374e-06, + "loss": 11.9068, + "step": 33155 + }, + { + "epoch": 1.8054767107063365, + "grad_norm": 0.5386424025094163, + "learning_rate": 4.919995962554846e-06, + "loss": 11.7773, + "step": 33156 + }, + { + "epoch": 1.8055311647029195, + "grad_norm": 0.5393011097664493, + "learning_rate": 4.9172643897003936e-06, + "loss": 11.7834, + "step": 33157 + }, + { + "epoch": 1.8055856186995025, + "grad_norm": 0.5000349905754421, + "learning_rate": 4.9145335562252204e-06, + "loss": 11.6803, + "step": 33158 + }, + { + "epoch": 1.8056400726960855, + "grad_norm": 0.5346719670032192, + "learning_rate": 4.911803462150588e-06, + "loss": 11.715, + "step": 33159 + }, + { + "epoch": 1.8056945266926685, + "grad_norm": 0.5394760021643212, + "learning_rate": 4.9090741074977245e-06, + "loss": 11.715, + "step": 33160 + }, + { + "epoch": 1.8057489806892515, + "grad_norm": 0.5571987610832586, + "learning_rate": 4.906345492287834e-06, + "loss": 11.8169, + "step": 33161 + }, + { + "epoch": 1.8058034346858345, + "grad_norm": 0.5534356781641624, + "learning_rate": 4.903617616542156e-06, + "loss": 11.6854, + "step": 33162 + }, + { + "epoch": 1.8058578886824175, + "grad_norm": 0.5355276617833473, + "learning_rate": 4.900890480281883e-06, + "loss": 11.8339, + "step": 33163 + }, + { + "epoch": 1.8059123426790005, + "grad_norm": 0.529139284611768, + "learning_rate": 4.898164083528245e-06, + "loss": 11.6885, + "step": 33164 + }, + { + "epoch": 1.8059667966755835, + "grad_norm": 0.6012888361537658, + "learning_rate": 4.895438426302435e-06, + "loss": 11.7846, + "step": 33165 + }, + { + "epoch": 1.8060212506721665, + "grad_norm": 0.5267517422127683, + "learning_rate": 4.892713508625635e-06, + "loss": 11.7775, + "step": 33166 + }, + { + "epoch": 1.8060757046687494, + "grad_norm": 0.5625057355380759, + "learning_rate": 4.8899893305190514e-06, + "loss": 11.8971, + "step": 33167 + }, + { + "epoch": 1.8061301586653324, + "grad_norm": 0.5216984893198304, + "learning_rate": 4.887265892003856e-06, + "loss": 11.816, + "step": 33168 + }, + { + "epoch": 1.8061846126619154, + "grad_norm": 0.6246677665710533, + "learning_rate": 4.884543193101232e-06, + "loss": 11.8072, + "step": 33169 + }, + { + "epoch": 1.8062390666584984, + "grad_norm": 0.5301392317448178, + "learning_rate": 4.881821233832362e-06, + "loss": 11.6522, + "step": 33170 + }, + { + "epoch": 1.8062935206550814, + "grad_norm": 0.5864101392981528, + "learning_rate": 4.879100014218385e-06, + "loss": 11.7581, + "step": 33171 + }, + { + "epoch": 1.8063479746516646, + "grad_norm": 0.6302461544781593, + "learning_rate": 4.876379534280495e-06, + "loss": 11.8697, + "step": 33172 + }, + { + "epoch": 1.8064024286482476, + "grad_norm": 0.5126815062616424, + "learning_rate": 4.873659794039809e-06, + "loss": 11.8067, + "step": 33173 + }, + { + "epoch": 1.8064568826448306, + "grad_norm": 0.5338329742520125, + "learning_rate": 4.87094079351752e-06, + "loss": 11.8254, + "step": 33174 + }, + { + "epoch": 1.8065113366414136, + "grad_norm": 0.5558821238255813, + "learning_rate": 4.868222532734734e-06, + "loss": 11.8499, + "step": 33175 + }, + { + "epoch": 1.8065657906379966, + "grad_norm": 0.5297772250345244, + "learning_rate": 4.86550501171259e-06, + "loss": 11.6535, + "step": 33176 + }, + { + "epoch": 1.8066202446345796, + "grad_norm": 0.5301268819352307, + "learning_rate": 4.862788230472259e-06, + "loss": 11.5577, + "step": 33177 + }, + { + "epoch": 1.8066746986311628, + "grad_norm": 0.5195226774698151, + "learning_rate": 4.860072189034826e-06, + "loss": 11.604, + "step": 33178 + }, + { + "epoch": 1.8067291526277458, + "grad_norm": 0.5873862330329568, + "learning_rate": 4.857356887421438e-06, + "loss": 11.8314, + "step": 33179 + }, + { + "epoch": 1.8067836066243288, + "grad_norm": 0.5355624033162096, + "learning_rate": 4.854642325653202e-06, + "loss": 11.5848, + "step": 33180 + }, + { + "epoch": 1.8068380606209118, + "grad_norm": 0.5334199011435603, + "learning_rate": 4.851928503751202e-06, + "loss": 11.6847, + "step": 33181 + }, + { + "epoch": 1.8068925146174948, + "grad_norm": 0.5424485365186009, + "learning_rate": 4.849215421736586e-06, + "loss": 11.7574, + "step": 33182 + }, + { + "epoch": 1.8069469686140778, + "grad_norm": 0.5519686488107474, + "learning_rate": 4.846503079630404e-06, + "loss": 11.7667, + "step": 33183 + }, + { + "epoch": 1.8070014226106608, + "grad_norm": 0.5372690316473078, + "learning_rate": 4.843791477453785e-06, + "loss": 11.7687, + "step": 33184 + }, + { + "epoch": 1.8070558766072438, + "grad_norm": 0.5206480337445053, + "learning_rate": 4.8410806152278e-06, + "loss": 11.6573, + "step": 33185 + }, + { + "epoch": 1.8071103306038268, + "grad_norm": 0.5600968336725373, + "learning_rate": 4.838370492973521e-06, + "loss": 11.7819, + "step": 33186 + }, + { + "epoch": 1.8071647846004097, + "grad_norm": 0.5411745921831451, + "learning_rate": 4.835661110712042e-06, + "loss": 11.8349, + "step": 33187 + }, + { + "epoch": 1.8072192385969927, + "grad_norm": 0.5557196344027634, + "learning_rate": 4.832952468464413e-06, + "loss": 11.7677, + "step": 33188 + }, + { + "epoch": 1.8072736925935757, + "grad_norm": 0.5819464925228438, + "learning_rate": 4.830244566251729e-06, + "loss": 11.7541, + "step": 33189 + }, + { + "epoch": 1.8073281465901587, + "grad_norm": 0.573373616125001, + "learning_rate": 4.827537404095006e-06, + "loss": 11.7272, + "step": 33190 + }, + { + "epoch": 1.8073826005867417, + "grad_norm": 0.6061551048176003, + "learning_rate": 4.824830982015305e-06, + "loss": 11.8376, + "step": 33191 + }, + { + "epoch": 1.8074370545833247, + "grad_norm": 0.5699583363847336, + "learning_rate": 4.822125300033686e-06, + "loss": 11.5983, + "step": 33192 + }, + { + "epoch": 1.8074915085799077, + "grad_norm": 0.5608869196218322, + "learning_rate": 4.819420358171178e-06, + "loss": 11.8257, + "step": 33193 + }, + { + "epoch": 1.8075459625764907, + "grad_norm": 0.5604810907891347, + "learning_rate": 4.816716156448831e-06, + "loss": 11.6896, + "step": 33194 + }, + { + "epoch": 1.807600416573074, + "grad_norm": 0.5146847407936258, + "learning_rate": 4.814012694887649e-06, + "loss": 11.7387, + "step": 33195 + }, + { + "epoch": 1.807654870569657, + "grad_norm": 0.506587606170563, + "learning_rate": 4.811309973508682e-06, + "loss": 11.7663, + "step": 33196 + }, + { + "epoch": 1.80770932456624, + "grad_norm": 0.5397408178174148, + "learning_rate": 4.808607992332914e-06, + "loss": 11.8642, + "step": 33197 + }, + { + "epoch": 1.8077637785628229, + "grad_norm": 0.541070921712911, + "learning_rate": 4.805906751381373e-06, + "loss": 11.7178, + "step": 33198 + }, + { + "epoch": 1.8078182325594059, + "grad_norm": 0.5703390458843115, + "learning_rate": 4.803206250675097e-06, + "loss": 11.8087, + "step": 33199 + }, + { + "epoch": 1.8078726865559889, + "grad_norm": 0.5797365627006066, + "learning_rate": 4.800506490235013e-06, + "loss": 11.7641, + "step": 33200 + }, + { + "epoch": 1.807927140552572, + "grad_norm": 0.5177289704755285, + "learning_rate": 4.797807470082172e-06, + "loss": 11.7672, + "step": 33201 + }, + { + "epoch": 1.807981594549155, + "grad_norm": 0.5838144505341726, + "learning_rate": 4.795109190237557e-06, + "loss": 11.8404, + "step": 33202 + }, + { + "epoch": 1.808036048545738, + "grad_norm": 0.5445052167707678, + "learning_rate": 4.792411650722117e-06, + "loss": 11.8278, + "step": 33203 + }, + { + "epoch": 1.808090502542321, + "grad_norm": 0.5908857135536393, + "learning_rate": 4.78971485155687e-06, + "loss": 11.8719, + "step": 33204 + }, + { + "epoch": 1.808144956538904, + "grad_norm": 0.48831478538696144, + "learning_rate": 4.787018792762743e-06, + "loss": 11.7465, + "step": 33205 + }, + { + "epoch": 1.808199410535487, + "grad_norm": 0.548509157319017, + "learning_rate": 4.7843234743607525e-06, + "loss": 11.6314, + "step": 33206 + }, + { + "epoch": 1.80825386453207, + "grad_norm": 0.561310458372122, + "learning_rate": 4.781628896371815e-06, + "loss": 11.8118, + "step": 33207 + }, + { + "epoch": 1.808308318528653, + "grad_norm": 0.5234777153094029, + "learning_rate": 4.778935058816902e-06, + "loss": 11.7851, + "step": 33208 + }, + { + "epoch": 1.808362772525236, + "grad_norm": 0.6093699230494634, + "learning_rate": 4.776241961716965e-06, + "loss": 11.8338, + "step": 33209 + }, + { + "epoch": 1.808417226521819, + "grad_norm": 0.5867070046528813, + "learning_rate": 4.773549605092931e-06, + "loss": 11.8577, + "step": 33210 + }, + { + "epoch": 1.808471680518402, + "grad_norm": 0.5452169022255674, + "learning_rate": 4.77085798896576e-06, + "loss": 11.9047, + "step": 33211 + }, + { + "epoch": 1.808526134514985, + "grad_norm": 0.5196413237297726, + "learning_rate": 4.7681671133563476e-06, + "loss": 11.5996, + "step": 33212 + }, + { + "epoch": 1.808580588511568, + "grad_norm": 0.6086607823386528, + "learning_rate": 4.765476978285633e-06, + "loss": 11.8549, + "step": 33213 + }, + { + "epoch": 1.808635042508151, + "grad_norm": 0.5089794525742093, + "learning_rate": 4.762787583774564e-06, + "loss": 11.678, + "step": 33214 + }, + { + "epoch": 1.808689496504734, + "grad_norm": 0.574101671690218, + "learning_rate": 4.760098929844003e-06, + "loss": 11.6728, + "step": 33215 + }, + { + "epoch": 1.808743950501317, + "grad_norm": 0.5626940699284111, + "learning_rate": 4.757411016514912e-06, + "loss": 11.8825, + "step": 33216 + }, + { + "epoch": 1.8087984044979, + "grad_norm": 0.5615339223466838, + "learning_rate": 4.754723843808151e-06, + "loss": 11.8007, + "step": 33217 + }, + { + "epoch": 1.8088528584944832, + "grad_norm": 0.5455314024715735, + "learning_rate": 4.752037411744637e-06, + "loss": 11.8554, + "step": 33218 + }, + { + "epoch": 1.8089073124910662, + "grad_norm": 0.5402356367933193, + "learning_rate": 4.749351720345252e-06, + "loss": 11.8262, + "step": 33219 + }, + { + "epoch": 1.8089617664876492, + "grad_norm": 0.5582377215152176, + "learning_rate": 4.74666676963087e-06, + "loss": 11.8435, + "step": 33220 + }, + { + "epoch": 1.8090162204842322, + "grad_norm": 0.588828318117417, + "learning_rate": 4.743982559622395e-06, + "loss": 11.7713, + "step": 33221 + }, + { + "epoch": 1.8090706744808152, + "grad_norm": 0.5557691878795636, + "learning_rate": 4.741299090340678e-06, + "loss": 11.8135, + "step": 33222 + }, + { + "epoch": 1.8091251284773981, + "grad_norm": 0.6213764815908573, + "learning_rate": 4.7386163618066026e-06, + "loss": 11.8986, + "step": 33223 + }, + { + "epoch": 1.8091795824739814, + "grad_norm": 0.5406688724320587, + "learning_rate": 4.735934374041007e-06, + "loss": 11.8455, + "step": 33224 + }, + { + "epoch": 1.8092340364705644, + "grad_norm": 0.5716837753206888, + "learning_rate": 4.733253127064763e-06, + "loss": 11.7499, + "step": 33225 + }, + { + "epoch": 1.8092884904671473, + "grad_norm": 0.5579331798827912, + "learning_rate": 4.730572620898732e-06, + "loss": 11.7505, + "step": 33226 + }, + { + "epoch": 1.8093429444637303, + "grad_norm": 0.5274949203688987, + "learning_rate": 4.727892855563731e-06, + "loss": 11.5504, + "step": 33227 + }, + { + "epoch": 1.8093973984603133, + "grad_norm": 0.5336049351114056, + "learning_rate": 4.7252138310806324e-06, + "loss": 11.8481, + "step": 33228 + }, + { + "epoch": 1.8094518524568963, + "grad_norm": 0.6252957727507026, + "learning_rate": 4.722535547470242e-06, + "loss": 11.8298, + "step": 33229 + }, + { + "epoch": 1.8095063064534793, + "grad_norm": 0.5727061752869124, + "learning_rate": 4.719858004753375e-06, + "loss": 11.8586, + "step": 33230 + }, + { + "epoch": 1.8095607604500623, + "grad_norm": 0.5170299752091735, + "learning_rate": 4.717181202950893e-06, + "loss": 11.8325, + "step": 33231 + }, + { + "epoch": 1.8096152144466453, + "grad_norm": 0.5299384660261783, + "learning_rate": 4.71450514208357e-06, + "loss": 11.7124, + "step": 33232 + }, + { + "epoch": 1.8096696684432283, + "grad_norm": 0.5499786496689046, + "learning_rate": 4.711829822172254e-06, + "loss": 11.8386, + "step": 33233 + }, + { + "epoch": 1.8097241224398113, + "grad_norm": 0.5593279097519885, + "learning_rate": 4.709155243237706e-06, + "loss": 11.8838, + "step": 33234 + }, + { + "epoch": 1.8097785764363943, + "grad_norm": 0.5756440750572177, + "learning_rate": 4.706481405300756e-06, + "loss": 11.7977, + "step": 33235 + }, + { + "epoch": 1.8098330304329773, + "grad_norm": 0.5310462041528016, + "learning_rate": 4.703808308382196e-06, + "loss": 11.7635, + "step": 33236 + }, + { + "epoch": 1.8098874844295603, + "grad_norm": 0.5094785712825856, + "learning_rate": 4.701135952502788e-06, + "loss": 11.7599, + "step": 33237 + }, + { + "epoch": 1.8099419384261433, + "grad_norm": 0.528188634335068, + "learning_rate": 4.69846433768335e-06, + "loss": 11.7453, + "step": 33238 + }, + { + "epoch": 1.8099963924227263, + "grad_norm": 0.5171714685422232, + "learning_rate": 4.695793463944631e-06, + "loss": 11.6009, + "step": 33239 + }, + { + "epoch": 1.8100508464193092, + "grad_norm": 0.5283440178362762, + "learning_rate": 4.693123331307392e-06, + "loss": 11.7741, + "step": 33240 + }, + { + "epoch": 1.8101053004158922, + "grad_norm": 0.5691858803247282, + "learning_rate": 4.690453939792427e-06, + "loss": 11.7403, + "step": 33241 + }, + { + "epoch": 1.8101597544124755, + "grad_norm": 0.575870998897387, + "learning_rate": 4.687785289420454e-06, + "loss": 11.7369, + "step": 33242 + }, + { + "epoch": 1.8102142084090584, + "grad_norm": 0.583534904636657, + "learning_rate": 4.6851173802122675e-06, + "loss": 11.8699, + "step": 33243 + }, + { + "epoch": 1.8102686624056414, + "grad_norm": 0.5322452685176067, + "learning_rate": 4.6824502121885714e-06, + "loss": 11.733, + "step": 33244 + }, + { + "epoch": 1.8103231164022244, + "grad_norm": 0.5102198165469266, + "learning_rate": 4.6797837853701394e-06, + "loss": 11.7132, + "step": 33245 + }, + { + "epoch": 1.8103775703988074, + "grad_norm": 0.5272270547549536, + "learning_rate": 4.677118099777688e-06, + "loss": 11.7703, + "step": 33246 + }, + { + "epoch": 1.8104320243953904, + "grad_norm": 0.5301405200023217, + "learning_rate": 4.674453155431946e-06, + "loss": 11.7402, + "step": 33247 + }, + { + "epoch": 1.8104864783919736, + "grad_norm": 0.5920261063214396, + "learning_rate": 4.671788952353662e-06, + "loss": 11.7978, + "step": 33248 + }, + { + "epoch": 1.8105409323885566, + "grad_norm": 0.4839884667414224, + "learning_rate": 4.669125490563531e-06, + "loss": 11.7457, + "step": 33249 + }, + { + "epoch": 1.8105953863851396, + "grad_norm": 0.6332621013155387, + "learning_rate": 4.666462770082247e-06, + "loss": 11.7544, + "step": 33250 + }, + { + "epoch": 1.8106498403817226, + "grad_norm": 0.5678331737352504, + "learning_rate": 4.663800790930561e-06, + "loss": 11.7125, + "step": 33251 + }, + { + "epoch": 1.8107042943783056, + "grad_norm": 0.5643788351233826, + "learning_rate": 4.661139553129123e-06, + "loss": 11.8097, + "step": 33252 + }, + { + "epoch": 1.8107587483748886, + "grad_norm": 0.5241273300555964, + "learning_rate": 4.658479056698672e-06, + "loss": 11.6924, + "step": 33253 + }, + { + "epoch": 1.8108132023714716, + "grad_norm": 0.521911469572689, + "learning_rate": 4.655819301659869e-06, + "loss": 11.7294, + "step": 33254 + }, + { + "epoch": 1.8108676563680546, + "grad_norm": 0.5678487776401085, + "learning_rate": 4.653160288033409e-06, + "loss": 11.8871, + "step": 33255 + }, + { + "epoch": 1.8109221103646376, + "grad_norm": 0.49279398776295474, + "learning_rate": 4.650502015839953e-06, + "loss": 11.7573, + "step": 33256 + }, + { + "epoch": 1.8109765643612206, + "grad_norm": 0.49438942981577944, + "learning_rate": 4.647844485100184e-06, + "loss": 11.7171, + "step": 33257 + }, + { + "epoch": 1.8110310183578036, + "grad_norm": 0.7163185688421985, + "learning_rate": 4.645187695834774e-06, + "loss": 11.7511, + "step": 33258 + }, + { + "epoch": 1.8110854723543865, + "grad_norm": 0.5259278219870365, + "learning_rate": 4.642531648064374e-06, + "loss": 11.7328, + "step": 33259 + }, + { + "epoch": 1.8111399263509695, + "grad_norm": 0.5057567915034931, + "learning_rate": 4.639876341809657e-06, + "loss": 11.7507, + "step": 33260 + }, + { + "epoch": 1.8111943803475525, + "grad_norm": 0.5143063553239388, + "learning_rate": 4.637221777091227e-06, + "loss": 11.8169, + "step": 33261 + }, + { + "epoch": 1.8112488343441355, + "grad_norm": 0.5147328992008211, + "learning_rate": 4.634567953929758e-06, + "loss": 11.813, + "step": 33262 + }, + { + "epoch": 1.8113032883407185, + "grad_norm": 0.5632237736043315, + "learning_rate": 4.6319148723459e-06, + "loss": 11.6644, + "step": 33263 + }, + { + "epoch": 1.8113577423373015, + "grad_norm": 0.580805442319458, + "learning_rate": 4.6292625323602346e-06, + "loss": 11.7563, + "step": 33264 + }, + { + "epoch": 1.8114121963338847, + "grad_norm": 0.5617976680277361, + "learning_rate": 4.626610933993447e-06, + "loss": 11.7837, + "step": 33265 + }, + { + "epoch": 1.8114666503304677, + "grad_norm": 0.649113014473685, + "learning_rate": 4.62396007726611e-06, + "loss": 11.8386, + "step": 33266 + }, + { + "epoch": 1.8115211043270507, + "grad_norm": 0.5411429635084574, + "learning_rate": 4.621309962198861e-06, + "loss": 11.7343, + "step": 33267 + }, + { + "epoch": 1.8115755583236337, + "grad_norm": 0.5173265291460204, + "learning_rate": 4.618660588812284e-06, + "loss": 11.7112, + "step": 33268 + }, + { + "epoch": 1.8116300123202167, + "grad_norm": 0.6377719460421658, + "learning_rate": 4.616011957127009e-06, + "loss": 11.8151, + "step": 33269 + }, + { + "epoch": 1.8116844663167997, + "grad_norm": 0.47039995371666304, + "learning_rate": 4.613364067163639e-06, + "loss": 11.8002, + "step": 33270 + }, + { + "epoch": 1.811738920313383, + "grad_norm": 0.5676157980270421, + "learning_rate": 4.610716918942726e-06, + "loss": 11.788, + "step": 33271 + }, + { + "epoch": 1.811793374309966, + "grad_norm": 0.5539329546890714, + "learning_rate": 4.6080705124848854e-06, + "loss": 11.7338, + "step": 33272 + }, + { + "epoch": 1.811847828306549, + "grad_norm": 0.6242612650651835, + "learning_rate": 4.60542484781068e-06, + "loss": 11.7645, + "step": 33273 + }, + { + "epoch": 1.8119022823031319, + "grad_norm": 0.5256584462171094, + "learning_rate": 4.602779924940681e-06, + "loss": 11.6497, + "step": 33274 + }, + { + "epoch": 1.8119567362997149, + "grad_norm": 0.5091817870369126, + "learning_rate": 4.600135743895473e-06, + "loss": 11.6957, + "step": 33275 + }, + { + "epoch": 1.8120111902962979, + "grad_norm": 0.6179767147898236, + "learning_rate": 4.597492304695605e-06, + "loss": 11.9228, + "step": 33276 + }, + { + "epoch": 1.8120656442928809, + "grad_norm": 0.547809004844969, + "learning_rate": 4.5948496073616395e-06, + "loss": 11.8367, + "step": 33277 + }, + { + "epoch": 1.8121200982894639, + "grad_norm": 0.5145807759496834, + "learning_rate": 4.592207651914115e-06, + "loss": 11.7819, + "step": 33278 + }, + { + "epoch": 1.8121745522860468, + "grad_norm": 0.5736815125760386, + "learning_rate": 4.589566438373594e-06, + "loss": 11.7776, + "step": 33279 + }, + { + "epoch": 1.8122290062826298, + "grad_norm": 0.6243906740683176, + "learning_rate": 4.586925966760602e-06, + "loss": 11.7867, + "step": 33280 + }, + { + "epoch": 1.8122834602792128, + "grad_norm": 0.5354100915325727, + "learning_rate": 4.584286237095669e-06, + "loss": 11.7204, + "step": 33281 + }, + { + "epoch": 1.8123379142757958, + "grad_norm": 0.5680149612435472, + "learning_rate": 4.581647249399335e-06, + "loss": 11.6994, + "step": 33282 + }, + { + "epoch": 1.8123923682723788, + "grad_norm": 0.5192362144096057, + "learning_rate": 4.579009003692103e-06, + "loss": 11.7376, + "step": 33283 + }, + { + "epoch": 1.8124468222689618, + "grad_norm": 0.5867927421074767, + "learning_rate": 4.576371499994503e-06, + "loss": 11.8128, + "step": 33284 + }, + { + "epoch": 1.8125012762655448, + "grad_norm": 0.5088532240817195, + "learning_rate": 4.573734738327052e-06, + "loss": 11.7488, + "step": 33285 + }, + { + "epoch": 1.8125557302621278, + "grad_norm": 0.5126047136330019, + "learning_rate": 4.57109871871022e-06, + "loss": 11.6943, + "step": 33286 + }, + { + "epoch": 1.8126101842587108, + "grad_norm": 0.5445520444535854, + "learning_rate": 4.56846344116455e-06, + "loss": 11.7375, + "step": 33287 + }, + { + "epoch": 1.812664638255294, + "grad_norm": 0.5235453073984584, + "learning_rate": 4.565828905710501e-06, + "loss": 11.7828, + "step": 33288 + }, + { + "epoch": 1.812719092251877, + "grad_norm": 0.6082597676176608, + "learning_rate": 4.563195112368579e-06, + "loss": 11.7967, + "step": 33289 + }, + { + "epoch": 1.81277354624846, + "grad_norm": 0.5184887605400904, + "learning_rate": 4.560562061159257e-06, + "loss": 11.853, + "step": 33290 + }, + { + "epoch": 1.812828000245043, + "grad_norm": 0.5212808673359198, + "learning_rate": 4.557929752102996e-06, + "loss": 11.7158, + "step": 33291 + }, + { + "epoch": 1.812882454241626, + "grad_norm": 0.5108478368583238, + "learning_rate": 4.555298185220291e-06, + "loss": 11.7638, + "step": 33292 + }, + { + "epoch": 1.812936908238209, + "grad_norm": 0.5577709139267213, + "learning_rate": 4.552667360531582e-06, + "loss": 11.7567, + "step": 33293 + }, + { + "epoch": 1.8129913622347922, + "grad_norm": 0.5426431823864006, + "learning_rate": 4.5500372780573505e-06, + "loss": 11.8768, + "step": 33294 + }, + { + "epoch": 1.8130458162313752, + "grad_norm": 0.5837656771242806, + "learning_rate": 4.5474079378180265e-06, + "loss": 11.7182, + "step": 33295 + }, + { + "epoch": 1.8131002702279582, + "grad_norm": 0.5553445735645262, + "learning_rate": 4.54477933983406e-06, + "loss": 11.6822, + "step": 33296 + }, + { + "epoch": 1.8131547242245412, + "grad_norm": 0.5262483855216749, + "learning_rate": 4.5421514841259115e-06, + "loss": 11.7623, + "step": 33297 + }, + { + "epoch": 1.8132091782211242, + "grad_norm": 0.5803607492229638, + "learning_rate": 4.539524370713988e-06, + "loss": 11.7194, + "step": 33298 + }, + { + "epoch": 1.8132636322177071, + "grad_norm": 0.5781905249474677, + "learning_rate": 4.53689799961875e-06, + "loss": 11.9124, + "step": 33299 + }, + { + "epoch": 1.8133180862142901, + "grad_norm": 0.5236121957406582, + "learning_rate": 4.5342723708606036e-06, + "loss": 11.81, + "step": 33300 + }, + { + "epoch": 1.8133725402108731, + "grad_norm": 0.4992174506719999, + "learning_rate": 4.531647484459945e-06, + "loss": 11.7106, + "step": 33301 + }, + { + "epoch": 1.8134269942074561, + "grad_norm": 0.5549655380744889, + "learning_rate": 4.529023340437222e-06, + "loss": 11.6963, + "step": 33302 + }, + { + "epoch": 1.8134814482040391, + "grad_norm": 0.5603578588330804, + "learning_rate": 4.5263999388128085e-06, + "loss": 11.8157, + "step": 33303 + }, + { + "epoch": 1.813535902200622, + "grad_norm": 0.5322873348611824, + "learning_rate": 4.523777279607133e-06, + "loss": 11.7214, + "step": 33304 + }, + { + "epoch": 1.813590356197205, + "grad_norm": 0.5531494564247507, + "learning_rate": 4.521155362840568e-06, + "loss": 11.7389, + "step": 33305 + }, + { + "epoch": 1.813644810193788, + "grad_norm": 0.6302713973539736, + "learning_rate": 4.518534188533508e-06, + "loss": 11.6865, + "step": 33306 + }, + { + "epoch": 1.813699264190371, + "grad_norm": 0.49506686958018375, + "learning_rate": 4.515913756706347e-06, + "loss": 11.724, + "step": 33307 + }, + { + "epoch": 1.813753718186954, + "grad_norm": 0.5194636773562015, + "learning_rate": 4.513294067379448e-06, + "loss": 11.7975, + "step": 33308 + }, + { + "epoch": 1.813808172183537, + "grad_norm": 0.5697393268501594, + "learning_rate": 4.510675120573204e-06, + "loss": 11.8829, + "step": 33309 + }, + { + "epoch": 1.81386262618012, + "grad_norm": 0.5224354794587687, + "learning_rate": 4.508056916307957e-06, + "loss": 11.7556, + "step": 33310 + }, + { + "epoch": 1.813917080176703, + "grad_norm": 0.5405521840368618, + "learning_rate": 4.505439454604054e-06, + "loss": 11.7114, + "step": 33311 + }, + { + "epoch": 1.8139715341732863, + "grad_norm": 0.5638519595378328, + "learning_rate": 4.5028227354818935e-06, + "loss": 11.8164, + "step": 33312 + }, + { + "epoch": 1.8140259881698693, + "grad_norm": 0.568896695552425, + "learning_rate": 4.500206758961778e-06, + "loss": 11.7192, + "step": 33313 + }, + { + "epoch": 1.8140804421664523, + "grad_norm": 0.5615458323351968, + "learning_rate": 4.497591525064082e-06, + "loss": 11.7677, + "step": 33314 + }, + { + "epoch": 1.8141348961630352, + "grad_norm": 0.5293997293990172, + "learning_rate": 4.4949770338091225e-06, + "loss": 11.8222, + "step": 33315 + }, + { + "epoch": 1.8141893501596182, + "grad_norm": 0.568010811307435, + "learning_rate": 4.49236328521725e-06, + "loss": 11.9389, + "step": 33316 + }, + { + "epoch": 1.8142438041562012, + "grad_norm": 0.49981054646930256, + "learning_rate": 4.489750279308757e-06, + "loss": 11.8114, + "step": 33317 + }, + { + "epoch": 1.8142982581527844, + "grad_norm": 0.5225249272932014, + "learning_rate": 4.4871380161039865e-06, + "loss": 11.7547, + "step": 33318 + }, + { + "epoch": 1.8143527121493674, + "grad_norm": 0.5493646771922226, + "learning_rate": 4.484526495623265e-06, + "loss": 11.5598, + "step": 33319 + }, + { + "epoch": 1.8144071661459504, + "grad_norm": 0.6062000021701177, + "learning_rate": 4.481915717886886e-06, + "loss": 11.743, + "step": 33320 + }, + { + "epoch": 1.8144616201425334, + "grad_norm": 0.5753678436648219, + "learning_rate": 4.479305682915136e-06, + "loss": 11.9379, + "step": 33321 + }, + { + "epoch": 1.8145160741391164, + "grad_norm": 0.6455252697951465, + "learning_rate": 4.476696390728318e-06, + "loss": 11.7879, + "step": 33322 + }, + { + "epoch": 1.8145705281356994, + "grad_norm": 0.5407366107188332, + "learning_rate": 4.47408784134673e-06, + "loss": 11.8897, + "step": 33323 + }, + { + "epoch": 1.8146249821322824, + "grad_norm": 0.5738336666237459, + "learning_rate": 4.471480034790676e-06, + "loss": 11.8631, + "step": 33324 + }, + { + "epoch": 1.8146794361288654, + "grad_norm": 0.5893288333801068, + "learning_rate": 4.468872971080384e-06, + "loss": 11.8426, + "step": 33325 + }, + { + "epoch": 1.8147338901254484, + "grad_norm": 0.5472307563950716, + "learning_rate": 4.466266650236184e-06, + "loss": 11.7547, + "step": 33326 + }, + { + "epoch": 1.8147883441220314, + "grad_norm": 0.6296197773275323, + "learning_rate": 4.463661072278291e-06, + "loss": 11.8189, + "step": 33327 + }, + { + "epoch": 1.8148427981186144, + "grad_norm": 0.5841100972519099, + "learning_rate": 4.461056237227001e-06, + "loss": 11.7946, + "step": 33328 + }, + { + "epoch": 1.8148972521151974, + "grad_norm": 0.5151973204027636, + "learning_rate": 4.458452145102554e-06, + "loss": 11.7281, + "step": 33329 + }, + { + "epoch": 1.8149517061117804, + "grad_norm": 0.5503912756346909, + "learning_rate": 4.4558487959252215e-06, + "loss": 11.8112, + "step": 33330 + }, + { + "epoch": 1.8150061601083634, + "grad_norm": 0.5226465804257504, + "learning_rate": 4.453246189715232e-06, + "loss": 11.8009, + "step": 33331 + }, + { + "epoch": 1.8150606141049463, + "grad_norm": 0.6119326188396066, + "learning_rate": 4.450644326492803e-06, + "loss": 11.8591, + "step": 33332 + }, + { + "epoch": 1.8151150681015293, + "grad_norm": 0.5231959641530465, + "learning_rate": 4.448043206278196e-06, + "loss": 11.7045, + "step": 33333 + }, + { + "epoch": 1.8151695220981123, + "grad_norm": 0.4931963916129974, + "learning_rate": 4.445442829091629e-06, + "loss": 11.7716, + "step": 33334 + }, + { + "epoch": 1.8152239760946955, + "grad_norm": 0.5697086660320094, + "learning_rate": 4.442843194953317e-06, + "loss": 11.9071, + "step": 33335 + }, + { + "epoch": 1.8152784300912785, + "grad_norm": 0.5087142488913186, + "learning_rate": 4.440244303883501e-06, + "loss": 11.7693, + "step": 33336 + }, + { + "epoch": 1.8153328840878615, + "grad_norm": 0.5295033186600981, + "learning_rate": 4.437646155902353e-06, + "loss": 11.8257, + "step": 33337 + }, + { + "epoch": 1.8153873380844445, + "grad_norm": 0.5766055544264328, + "learning_rate": 4.435048751030102e-06, + "loss": 11.8032, + "step": 33338 + }, + { + "epoch": 1.8154417920810275, + "grad_norm": 0.5235030491962843, + "learning_rate": 4.432452089286931e-06, + "loss": 11.8001, + "step": 33339 + }, + { + "epoch": 1.8154962460776105, + "grad_norm": 0.5517239769619563, + "learning_rate": 4.429856170693047e-06, + "loss": 11.7633, + "step": 33340 + }, + { + "epoch": 1.8155507000741937, + "grad_norm": 0.5235495340085781, + "learning_rate": 4.427260995268634e-06, + "loss": 11.7956, + "step": 33341 + }, + { + "epoch": 1.8156051540707767, + "grad_norm": 0.47897288269388455, + "learning_rate": 4.424666563033853e-06, + "loss": 11.6569, + "step": 33342 + }, + { + "epoch": 1.8156596080673597, + "grad_norm": 0.5624732894903519, + "learning_rate": 4.422072874008909e-06, + "loss": 11.9135, + "step": 33343 + }, + { + "epoch": 1.8157140620639427, + "grad_norm": 0.5635515144033085, + "learning_rate": 4.419479928213932e-06, + "loss": 11.6486, + "step": 33344 + }, + { + "epoch": 1.8157685160605257, + "grad_norm": 0.5859999162358663, + "learning_rate": 4.4168877256691165e-06, + "loss": 11.7451, + "step": 33345 + }, + { + "epoch": 1.8158229700571087, + "grad_norm": 0.5564169979815358, + "learning_rate": 4.414296266394624e-06, + "loss": 11.8639, + "step": 33346 + }, + { + "epoch": 1.8158774240536917, + "grad_norm": 0.5819972700378305, + "learning_rate": 4.411705550410572e-06, + "loss": 11.7479, + "step": 33347 + }, + { + "epoch": 1.8159318780502747, + "grad_norm": 0.5720018007402299, + "learning_rate": 4.409115577737155e-06, + "loss": 11.8476, + "step": 33348 + }, + { + "epoch": 1.8159863320468577, + "grad_norm": 0.5472262518933869, + "learning_rate": 4.406526348394458e-06, + "loss": 11.8056, + "step": 33349 + }, + { + "epoch": 1.8160407860434407, + "grad_norm": 0.5890280566237442, + "learning_rate": 4.403937862402663e-06, + "loss": 11.7987, + "step": 33350 + }, + { + "epoch": 1.8160952400400237, + "grad_norm": 0.5684000945037254, + "learning_rate": 4.401350119781877e-06, + "loss": 11.8233, + "step": 33351 + }, + { + "epoch": 1.8161496940366066, + "grad_norm": 0.5102991266086635, + "learning_rate": 4.398763120552218e-06, + "loss": 11.6719, + "step": 33352 + }, + { + "epoch": 1.8162041480331896, + "grad_norm": 0.5504354368638585, + "learning_rate": 4.396176864733815e-06, + "loss": 11.8069, + "step": 33353 + }, + { + "epoch": 1.8162586020297726, + "grad_norm": 0.608197532885495, + "learning_rate": 4.39359135234676e-06, + "loss": 11.7731, + "step": 33354 + }, + { + "epoch": 1.8163130560263556, + "grad_norm": 0.6193116912963961, + "learning_rate": 4.3910065834111725e-06, + "loss": 11.8017, + "step": 33355 + }, + { + "epoch": 1.8163675100229386, + "grad_norm": 0.5551117452595495, + "learning_rate": 4.38842255794717e-06, + "loss": 11.8228, + "step": 33356 + }, + { + "epoch": 1.8164219640195216, + "grad_norm": 0.5526334834554533, + "learning_rate": 4.385839275974812e-06, + "loss": 11.7001, + "step": 33357 + }, + { + "epoch": 1.8164764180161048, + "grad_norm": 0.5410470492028854, + "learning_rate": 4.3832567375142186e-06, + "loss": 11.769, + "step": 33358 + }, + { + "epoch": 1.8165308720126878, + "grad_norm": 0.4824851732505219, + "learning_rate": 4.380674942585428e-06, + "loss": 11.7198, + "step": 33359 + }, + { + "epoch": 1.8165853260092708, + "grad_norm": 0.5371286258570362, + "learning_rate": 4.378093891208568e-06, + "loss": 11.7387, + "step": 33360 + }, + { + "epoch": 1.8166397800058538, + "grad_norm": 0.5261341840184692, + "learning_rate": 4.37551358340369e-06, + "loss": 11.6994, + "step": 33361 + }, + { + "epoch": 1.8166942340024368, + "grad_norm": 0.506506395276939, + "learning_rate": 4.372934019190833e-06, + "loss": 11.7797, + "step": 33362 + }, + { + "epoch": 1.8167486879990198, + "grad_norm": 0.5681386074969045, + "learning_rate": 4.370355198590103e-06, + "loss": 11.7945, + "step": 33363 + }, + { + "epoch": 1.816803141995603, + "grad_norm": 0.5927926920329334, + "learning_rate": 4.367777121621508e-06, + "loss": 11.9358, + "step": 33364 + }, + { + "epoch": 1.816857595992186, + "grad_norm": 0.5492525892142982, + "learning_rate": 4.365199788305119e-06, + "loss": 11.8749, + "step": 33365 + }, + { + "epoch": 1.816912049988769, + "grad_norm": 0.5868734938464674, + "learning_rate": 4.362623198660975e-06, + "loss": 11.8656, + "step": 33366 + }, + { + "epoch": 1.816966503985352, + "grad_norm": 0.5737197849021911, + "learning_rate": 4.360047352709107e-06, + "loss": 11.6498, + "step": 33367 + }, + { + "epoch": 1.817020957981935, + "grad_norm": 0.5010726442825639, + "learning_rate": 4.357472250469563e-06, + "loss": 11.7842, + "step": 33368 + }, + { + "epoch": 1.817075411978518, + "grad_norm": 0.5260934326391239, + "learning_rate": 4.354897891962339e-06, + "loss": 11.763, + "step": 33369 + }, + { + "epoch": 1.817129865975101, + "grad_norm": 0.5762939373781866, + "learning_rate": 4.352324277207498e-06, + "loss": 11.8411, + "step": 33370 + }, + { + "epoch": 1.817184319971684, + "grad_norm": 0.46623385380449567, + "learning_rate": 4.349751406224989e-06, + "loss": 11.7587, + "step": 33371 + }, + { + "epoch": 1.817238773968267, + "grad_norm": 0.5648296635165347, + "learning_rate": 4.3471792790348634e-06, + "loss": 11.7265, + "step": 33372 + }, + { + "epoch": 1.81729322796485, + "grad_norm": 0.5351545931153535, + "learning_rate": 4.344607895657127e-06, + "loss": 11.7363, + "step": 33373 + }, + { + "epoch": 1.817347681961433, + "grad_norm": 0.5870113109642406, + "learning_rate": 4.342037256111742e-06, + "loss": 11.701, + "step": 33374 + }, + { + "epoch": 1.817402135958016, + "grad_norm": 0.538544264144728, + "learning_rate": 4.3394673604187255e-06, + "loss": 11.7449, + "step": 33375 + }, + { + "epoch": 1.817456589954599, + "grad_norm": 0.5717901168737194, + "learning_rate": 4.33689820859805e-06, + "loss": 11.8415, + "step": 33376 + }, + { + "epoch": 1.817511043951182, + "grad_norm": 0.6177129381709924, + "learning_rate": 4.3343298006697005e-06, + "loss": 11.6388, + "step": 33377 + }, + { + "epoch": 1.817565497947765, + "grad_norm": 0.5550373898347775, + "learning_rate": 4.331762136653639e-06, + "loss": 11.7749, + "step": 33378 + }, + { + "epoch": 1.817619951944348, + "grad_norm": 0.4833398613095589, + "learning_rate": 4.329195216569837e-06, + "loss": 11.5995, + "step": 33379 + }, + { + "epoch": 1.8176744059409309, + "grad_norm": 0.5039485652407718, + "learning_rate": 4.32662904043829e-06, + "loss": 11.8422, + "step": 33380 + }, + { + "epoch": 1.8177288599375139, + "grad_norm": 0.5287346972619532, + "learning_rate": 4.324063608278883e-06, + "loss": 11.8981, + "step": 33381 + }, + { + "epoch": 1.817783313934097, + "grad_norm": 0.5433745401187149, + "learning_rate": 4.3214989201116105e-06, + "loss": 11.7002, + "step": 33382 + }, + { + "epoch": 1.81783776793068, + "grad_norm": 0.5623893756865836, + "learning_rate": 4.318934975956413e-06, + "loss": 11.7466, + "step": 33383 + }, + { + "epoch": 1.817892221927263, + "grad_norm": 0.5229503510293498, + "learning_rate": 4.316371775833217e-06, + "loss": 11.7498, + "step": 33384 + }, + { + "epoch": 1.817946675923846, + "grad_norm": 0.5196069408964588, + "learning_rate": 4.3138093197619634e-06, + "loss": 11.6792, + "step": 33385 + }, + { + "epoch": 1.818001129920429, + "grad_norm": 0.5270375830935387, + "learning_rate": 4.311247607762569e-06, + "loss": 11.7954, + "step": 33386 + }, + { + "epoch": 1.8180555839170123, + "grad_norm": 0.523893051620918, + "learning_rate": 4.308686639854975e-06, + "loss": 11.7321, + "step": 33387 + }, + { + "epoch": 1.8181100379135953, + "grad_norm": 0.5614365727548793, + "learning_rate": 4.306126416059064e-06, + "loss": 11.7834, + "step": 33388 + }, + { + "epoch": 1.8181644919101783, + "grad_norm": 0.513632033751421, + "learning_rate": 4.303566936394765e-06, + "loss": 11.7146, + "step": 33389 + }, + { + "epoch": 1.8182189459067613, + "grad_norm": 0.5477296502040427, + "learning_rate": 4.301008200882006e-06, + "loss": 11.7553, + "step": 33390 + }, + { + "epoch": 1.8182733999033442, + "grad_norm": 0.5785223420716445, + "learning_rate": 4.298450209540628e-06, + "loss": 11.7638, + "step": 33391 + }, + { + "epoch": 1.8183278538999272, + "grad_norm": 0.5166285606492488, + "learning_rate": 4.295892962390558e-06, + "loss": 11.7441, + "step": 33392 + }, + { + "epoch": 1.8183823078965102, + "grad_norm": 0.6215707218826039, + "learning_rate": 4.293336459451669e-06, + "loss": 11.8865, + "step": 33393 + }, + { + "epoch": 1.8184367618930932, + "grad_norm": 0.5193871759628252, + "learning_rate": 4.290780700743846e-06, + "loss": 11.753, + "step": 33394 + }, + { + "epoch": 1.8184912158896762, + "grad_norm": 0.5457247022914097, + "learning_rate": 4.288225686286962e-06, + "loss": 11.8888, + "step": 33395 + }, + { + "epoch": 1.8185456698862592, + "grad_norm": 0.6053692002563366, + "learning_rate": 4.285671416100889e-06, + "loss": 11.7425, + "step": 33396 + }, + { + "epoch": 1.8186001238828422, + "grad_norm": 0.5378080864852702, + "learning_rate": 4.28311789020549e-06, + "loss": 11.7059, + "step": 33397 + }, + { + "epoch": 1.8186545778794252, + "grad_norm": 0.5961398522834784, + "learning_rate": 4.280565108620604e-06, + "loss": 11.8281, + "step": 33398 + }, + { + "epoch": 1.8187090318760082, + "grad_norm": 0.616068474741855, + "learning_rate": 4.278013071366116e-06, + "loss": 11.9312, + "step": 33399 + }, + { + "epoch": 1.8187634858725912, + "grad_norm": 0.5498839603826776, + "learning_rate": 4.275461778461831e-06, + "loss": 11.7726, + "step": 33400 + }, + { + "epoch": 1.8188179398691742, + "grad_norm": 0.5951827338815259, + "learning_rate": 4.272911229927634e-06, + "loss": 11.759, + "step": 33401 + }, + { + "epoch": 1.8188723938657572, + "grad_norm": 0.5330808372606821, + "learning_rate": 4.27036142578332e-06, + "loss": 11.8161, + "step": 33402 + }, + { + "epoch": 1.8189268478623402, + "grad_norm": 0.5826470579002819, + "learning_rate": 4.267812366048718e-06, + "loss": 11.8479, + "step": 33403 + }, + { + "epoch": 1.8189813018589231, + "grad_norm": 0.510313617563817, + "learning_rate": 4.265264050743667e-06, + "loss": 11.7532, + "step": 33404 + }, + { + "epoch": 1.8190357558555064, + "grad_norm": 0.5416770344837316, + "learning_rate": 4.2627164798879845e-06, + "loss": 11.8805, + "step": 33405 + }, + { + "epoch": 1.8190902098520894, + "grad_norm": 0.5713216715113655, + "learning_rate": 4.260169653501467e-06, + "loss": 11.8938, + "step": 33406 + }, + { + "epoch": 1.8191446638486723, + "grad_norm": 0.5185567635271205, + "learning_rate": 4.25762357160393e-06, + "loss": 11.7382, + "step": 33407 + }, + { + "epoch": 1.8191991178452553, + "grad_norm": 0.5088800254469613, + "learning_rate": 4.25507823421516e-06, + "loss": 11.7215, + "step": 33408 + }, + { + "epoch": 1.8192535718418383, + "grad_norm": 0.5321922930209237, + "learning_rate": 4.252533641354972e-06, + "loss": 11.7429, + "step": 33409 + }, + { + "epoch": 1.8193080258384213, + "grad_norm": 0.5342018890374852, + "learning_rate": 4.249989793043119e-06, + "loss": 11.6897, + "step": 33410 + }, + { + "epoch": 1.8193624798350045, + "grad_norm": 0.523277589494354, + "learning_rate": 4.247446689299428e-06, + "loss": 11.7073, + "step": 33411 + }, + { + "epoch": 1.8194169338315875, + "grad_norm": 0.5845380724600674, + "learning_rate": 4.24490433014364e-06, + "loss": 11.7868, + "step": 33412 + }, + { + "epoch": 1.8194713878281705, + "grad_norm": 0.5011164408601148, + "learning_rate": 4.242362715595527e-06, + "loss": 11.8852, + "step": 33413 + }, + { + "epoch": 1.8195258418247535, + "grad_norm": 0.5536522211802043, + "learning_rate": 4.239821845674874e-06, + "loss": 11.8753, + "step": 33414 + }, + { + "epoch": 1.8195802958213365, + "grad_norm": 0.49836783003234875, + "learning_rate": 4.23728172040141e-06, + "loss": 11.7772, + "step": 33415 + }, + { + "epoch": 1.8196347498179195, + "grad_norm": 0.5547445071794451, + "learning_rate": 4.2347423397948945e-06, + "loss": 11.8077, + "step": 33416 + }, + { + "epoch": 1.8196892038145025, + "grad_norm": 0.5113337040094863, + "learning_rate": 4.232203703875104e-06, + "loss": 11.6234, + "step": 33417 + }, + { + "epoch": 1.8197436578110855, + "grad_norm": 0.5791002775489162, + "learning_rate": 4.229665812661742e-06, + "loss": 11.9064, + "step": 33418 + }, + { + "epoch": 1.8197981118076685, + "grad_norm": 0.504620041642399, + "learning_rate": 4.227128666174574e-06, + "loss": 11.7447, + "step": 33419 + }, + { + "epoch": 1.8198525658042515, + "grad_norm": 0.6257757387641968, + "learning_rate": 4.224592264433302e-06, + "loss": 11.8402, + "step": 33420 + }, + { + "epoch": 1.8199070198008345, + "grad_norm": 0.6169789961814764, + "learning_rate": 4.222056607457669e-06, + "loss": 11.8489, + "step": 33421 + }, + { + "epoch": 1.8199614737974175, + "grad_norm": 0.7099164133328735, + "learning_rate": 4.219521695267392e-06, + "loss": 11.6361, + "step": 33422 + }, + { + "epoch": 1.8200159277940005, + "grad_norm": 0.5359550502325594, + "learning_rate": 4.216987527882166e-06, + "loss": 11.8551, + "step": 33423 + }, + { + "epoch": 1.8200703817905834, + "grad_norm": 0.5362127744955114, + "learning_rate": 4.214454105321719e-06, + "loss": 11.7809, + "step": 33424 + }, + { + "epoch": 1.8201248357871664, + "grad_norm": 0.5881300916325793, + "learning_rate": 4.211921427605725e-06, + "loss": 11.7678, + "step": 33425 + }, + { + "epoch": 1.8201792897837494, + "grad_norm": 0.5299651412201789, + "learning_rate": 4.209389494753902e-06, + "loss": 11.7097, + "step": 33426 + }, + { + "epoch": 1.8202337437803324, + "grad_norm": 0.5394203768634799, + "learning_rate": 4.206858306785922e-06, + "loss": 11.7668, + "step": 33427 + }, + { + "epoch": 1.8202881977769156, + "grad_norm": 0.55487418992176, + "learning_rate": 4.20432786372148e-06, + "loss": 11.7554, + "step": 33428 + }, + { + "epoch": 1.8203426517734986, + "grad_norm": 0.5553023300732464, + "learning_rate": 4.201798165580262e-06, + "loss": 11.747, + "step": 33429 + }, + { + "epoch": 1.8203971057700816, + "grad_norm": 0.5267832576306941, + "learning_rate": 4.199269212381906e-06, + "loss": 11.8648, + "step": 33430 + }, + { + "epoch": 1.8204515597666646, + "grad_norm": 0.5247482544173551, + "learning_rate": 4.196741004146121e-06, + "loss": 11.6874, + "step": 33431 + }, + { + "epoch": 1.8205060137632476, + "grad_norm": 0.5166248630521795, + "learning_rate": 4.194213540892544e-06, + "loss": 11.8875, + "step": 33432 + }, + { + "epoch": 1.8205604677598306, + "grad_norm": 0.507621012519032, + "learning_rate": 4.191686822640806e-06, + "loss": 11.7607, + "step": 33433 + }, + { + "epoch": 1.8206149217564138, + "grad_norm": 0.5069945546515978, + "learning_rate": 4.1891608494106e-06, + "loss": 11.7674, + "step": 33434 + }, + { + "epoch": 1.8206693757529968, + "grad_norm": 0.6204043248303384, + "learning_rate": 4.186635621221524e-06, + "loss": 11.8956, + "step": 33435 + }, + { + "epoch": 1.8207238297495798, + "grad_norm": 0.5070988944301096, + "learning_rate": 4.184111138093261e-06, + "loss": 11.7003, + "step": 33436 + }, + { + "epoch": 1.8207782837461628, + "grad_norm": 0.5401162562355685, + "learning_rate": 4.181587400045406e-06, + "loss": 11.772, + "step": 33437 + }, + { + "epoch": 1.8208327377427458, + "grad_norm": 0.6120771082563683, + "learning_rate": 4.179064407097588e-06, + "loss": 11.8083, + "step": 33438 + }, + { + "epoch": 1.8208871917393288, + "grad_norm": 0.5474527864695609, + "learning_rate": 4.176542159269448e-06, + "loss": 11.8253, + "step": 33439 + }, + { + "epoch": 1.8209416457359118, + "grad_norm": 0.4984756252724798, + "learning_rate": 4.17402065658058e-06, + "loss": 11.7179, + "step": 33440 + }, + { + "epoch": 1.8209960997324948, + "grad_norm": 0.5595777782453509, + "learning_rate": 4.171499899050612e-06, + "loss": 11.8789, + "step": 33441 + }, + { + "epoch": 1.8210505537290778, + "grad_norm": 0.551713785230797, + "learning_rate": 4.16897988669912e-06, + "loss": 11.7899, + "step": 33442 + }, + { + "epoch": 1.8211050077256608, + "grad_norm": 0.5359306983219716, + "learning_rate": 4.166460619545698e-06, + "loss": 11.6985, + "step": 33443 + }, + { + "epoch": 1.8211594617222437, + "grad_norm": 0.5220655482790552, + "learning_rate": 4.1639420976099745e-06, + "loss": 11.7098, + "step": 33444 + }, + { + "epoch": 1.8212139157188267, + "grad_norm": 0.5894974152081649, + "learning_rate": 4.1614243209114886e-06, + "loss": 11.7912, + "step": 33445 + }, + { + "epoch": 1.8212683697154097, + "grad_norm": 0.549315203766272, + "learning_rate": 4.158907289469849e-06, + "loss": 11.715, + "step": 33446 + }, + { + "epoch": 1.8213228237119927, + "grad_norm": 0.5489235924790603, + "learning_rate": 4.156391003304616e-06, + "loss": 11.8682, + "step": 33447 + }, + { + "epoch": 1.8213772777085757, + "grad_norm": 0.587180068057914, + "learning_rate": 4.153875462435363e-06, + "loss": 11.7768, + "step": 33448 + }, + { + "epoch": 1.8214317317051587, + "grad_norm": 0.6152518409633797, + "learning_rate": 4.151360666881643e-06, + "loss": 11.8247, + "step": 33449 + }, + { + "epoch": 1.8214861857017417, + "grad_norm": 0.5084308337759688, + "learning_rate": 4.1488466166630046e-06, + "loss": 11.7028, + "step": 33450 + }, + { + "epoch": 1.8215406396983247, + "grad_norm": 0.585755402914931, + "learning_rate": 4.146333311799044e-06, + "loss": 11.7917, + "step": 33451 + }, + { + "epoch": 1.821595093694908, + "grad_norm": 0.5121071994847662, + "learning_rate": 4.1438207523092466e-06, + "loss": 11.7782, + "step": 33452 + }, + { + "epoch": 1.821649547691491, + "grad_norm": 0.510229864272492, + "learning_rate": 4.141308938213162e-06, + "loss": 11.8306, + "step": 33453 + }, + { + "epoch": 1.821704001688074, + "grad_norm": 0.5967912187830556, + "learning_rate": 4.1387978695303645e-06, + "loss": 11.6707, + "step": 33454 + }, + { + "epoch": 1.8217584556846569, + "grad_norm": 0.592826957125138, + "learning_rate": 4.1362875462803264e-06, + "loss": 11.9532, + "step": 33455 + }, + { + "epoch": 1.8218129096812399, + "grad_norm": 0.6119896503146464, + "learning_rate": 4.13377796848261e-06, + "loss": 11.7899, + "step": 33456 + }, + { + "epoch": 1.821867363677823, + "grad_norm": 0.5436330551330625, + "learning_rate": 4.1312691361566905e-06, + "loss": 11.8297, + "step": 33457 + }, + { + "epoch": 1.821921817674406, + "grad_norm": 0.5556863848375361, + "learning_rate": 4.128761049322127e-06, + "loss": 11.7746, + "step": 33458 + }, + { + "epoch": 1.821976271670989, + "grad_norm": 0.6291168622184845, + "learning_rate": 4.126253707998373e-06, + "loss": 11.8717, + "step": 33459 + }, + { + "epoch": 1.822030725667572, + "grad_norm": 0.5158533575860381, + "learning_rate": 4.123747112204945e-06, + "loss": 11.7635, + "step": 33460 + }, + { + "epoch": 1.822085179664155, + "grad_norm": 0.5252329388785419, + "learning_rate": 4.121241261961372e-06, + "loss": 11.8638, + "step": 33461 + }, + { + "epoch": 1.822139633660738, + "grad_norm": 0.5511530083840109, + "learning_rate": 4.118736157287073e-06, + "loss": 11.8423, + "step": 33462 + }, + { + "epoch": 1.822194087657321, + "grad_norm": 0.5093725510377453, + "learning_rate": 4.116231798201586e-06, + "loss": 11.7447, + "step": 33463 + }, + { + "epoch": 1.822248541653904, + "grad_norm": 0.5513784833495462, + "learning_rate": 4.11372818472433e-06, + "loss": 11.8521, + "step": 33464 + }, + { + "epoch": 1.822302995650487, + "grad_norm": 0.5737361005938579, + "learning_rate": 4.111225316874823e-06, + "loss": 11.8044, + "step": 33465 + }, + { + "epoch": 1.82235744964707, + "grad_norm": 0.5736580509093636, + "learning_rate": 4.108723194672504e-06, + "loss": 11.7777, + "step": 33466 + }, + { + "epoch": 1.822411903643653, + "grad_norm": 0.4762929827191769, + "learning_rate": 4.106221818136835e-06, + "loss": 11.7381, + "step": 33467 + }, + { + "epoch": 1.822466357640236, + "grad_norm": 0.5685099642399056, + "learning_rate": 4.103721187287268e-06, + "loss": 11.7572, + "step": 33468 + }, + { + "epoch": 1.822520811636819, + "grad_norm": 0.5643956750201643, + "learning_rate": 4.101221302143244e-06, + "loss": 11.7392, + "step": 33469 + }, + { + "epoch": 1.822575265633402, + "grad_norm": 0.5591510287579061, + "learning_rate": 4.0987221627242114e-06, + "loss": 11.706, + "step": 33470 + }, + { + "epoch": 1.822629719629985, + "grad_norm": 0.5173483899573453, + "learning_rate": 4.0962237690496005e-06, + "loss": 11.6633, + "step": 33471 + }, + { + "epoch": 1.822684173626568, + "grad_norm": 0.5755603656146498, + "learning_rate": 4.093726121138819e-06, + "loss": 11.7247, + "step": 33472 + }, + { + "epoch": 1.822738627623151, + "grad_norm": 0.5355424571927285, + "learning_rate": 4.0912292190113275e-06, + "loss": 11.6119, + "step": 33473 + }, + { + "epoch": 1.822793081619734, + "grad_norm": 0.5545457762756087, + "learning_rate": 4.088733062686501e-06, + "loss": 11.7037, + "step": 33474 + }, + { + "epoch": 1.8228475356163172, + "grad_norm": 0.5973165584493365, + "learning_rate": 4.086237652183789e-06, + "loss": 11.7259, + "step": 33475 + }, + { + "epoch": 1.8229019896129002, + "grad_norm": 0.5586578768783266, + "learning_rate": 4.083742987522565e-06, + "loss": 11.7125, + "step": 33476 + }, + { + "epoch": 1.8229564436094832, + "grad_norm": 0.535720181937925, + "learning_rate": 4.081249068722237e-06, + "loss": 11.7799, + "step": 33477 + }, + { + "epoch": 1.8230108976060662, + "grad_norm": 0.5093407659815583, + "learning_rate": 4.078755895802222e-06, + "loss": 11.8156, + "step": 33478 + }, + { + "epoch": 1.8230653516026492, + "grad_norm": 0.5401439310621149, + "learning_rate": 4.076263468781871e-06, + "loss": 11.7707, + "step": 33479 + }, + { + "epoch": 1.8231198055992321, + "grad_norm": 0.5588026189918949, + "learning_rate": 4.073771787680591e-06, + "loss": 11.7741, + "step": 33480 + }, + { + "epoch": 1.8231742595958154, + "grad_norm": 0.7520636812294781, + "learning_rate": 4.0712808525177445e-06, + "loss": 11.898, + "step": 33481 + }, + { + "epoch": 1.8232287135923984, + "grad_norm": 0.5100512927896358, + "learning_rate": 4.068790663312705e-06, + "loss": 11.6486, + "step": 33482 + }, + { + "epoch": 1.8232831675889813, + "grad_norm": 0.5180993532143796, + "learning_rate": 4.066301220084834e-06, + "loss": 11.7956, + "step": 33483 + }, + { + "epoch": 1.8233376215855643, + "grad_norm": 0.552233970687271, + "learning_rate": 4.063812522853494e-06, + "loss": 11.7794, + "step": 33484 + }, + { + "epoch": 1.8233920755821473, + "grad_norm": 0.5829374629722772, + "learning_rate": 4.061324571638048e-06, + "loss": 11.8612, + "step": 33485 + }, + { + "epoch": 1.8234465295787303, + "grad_norm": 0.5354389819448515, + "learning_rate": 4.058837366457813e-06, + "loss": 11.827, + "step": 33486 + }, + { + "epoch": 1.8235009835753133, + "grad_norm": 0.5289263660728635, + "learning_rate": 4.056350907332141e-06, + "loss": 11.8566, + "step": 33487 + }, + { + "epoch": 1.8235554375718963, + "grad_norm": 0.5190264478511732, + "learning_rate": 4.053865194280404e-06, + "loss": 11.7686, + "step": 33488 + }, + { + "epoch": 1.8236098915684793, + "grad_norm": 0.6100448974671006, + "learning_rate": 4.051380227321877e-06, + "loss": 11.7739, + "step": 33489 + }, + { + "epoch": 1.8236643455650623, + "grad_norm": 0.5570678020624251, + "learning_rate": 4.048896006475922e-06, + "loss": 11.9415, + "step": 33490 + }, + { + "epoch": 1.8237187995616453, + "grad_norm": 0.6030830455316337, + "learning_rate": 4.046412531761845e-06, + "loss": 11.8224, + "step": 33491 + }, + { + "epoch": 1.8237732535582283, + "grad_norm": 0.5958088040953632, + "learning_rate": 4.043929803198943e-06, + "loss": 11.8535, + "step": 33492 + }, + { + "epoch": 1.8238277075548113, + "grad_norm": 0.5204784015270262, + "learning_rate": 4.041447820806543e-06, + "loss": 11.8078, + "step": 33493 + }, + { + "epoch": 1.8238821615513943, + "grad_norm": 0.5771124407140996, + "learning_rate": 4.038966584603932e-06, + "loss": 11.6476, + "step": 33494 + }, + { + "epoch": 1.8239366155479773, + "grad_norm": 0.5696700008076531, + "learning_rate": 4.036486094610414e-06, + "loss": 11.7869, + "step": 33495 + }, + { + "epoch": 1.8239910695445603, + "grad_norm": 0.555014384169201, + "learning_rate": 4.034006350845265e-06, + "loss": 11.8537, + "step": 33496 + }, + { + "epoch": 1.8240455235411432, + "grad_norm": 0.5408362132387518, + "learning_rate": 4.031527353327791e-06, + "loss": 11.7032, + "step": 33497 + }, + { + "epoch": 1.8240999775377265, + "grad_norm": 0.5366653302165357, + "learning_rate": 4.029049102077231e-06, + "loss": 11.7905, + "step": 33498 + }, + { + "epoch": 1.8241544315343095, + "grad_norm": 0.5751795087361428, + "learning_rate": 4.026571597112893e-06, + "loss": 11.7215, + "step": 33499 + }, + { + "epoch": 1.8242088855308924, + "grad_norm": 0.5635483618024176, + "learning_rate": 4.0240948384540286e-06, + "loss": 11.8326, + "step": 33500 + }, + { + "epoch": 1.8242633395274754, + "grad_norm": 0.5132812454468167, + "learning_rate": 4.02161882611991e-06, + "loss": 11.744, + "step": 33501 + }, + { + "epoch": 1.8243177935240584, + "grad_norm": 0.7318438973514497, + "learning_rate": 4.019143560129757e-06, + "loss": 11.9016, + "step": 33502 + }, + { + "epoch": 1.8243722475206414, + "grad_norm": 0.5615510206887165, + "learning_rate": 4.016669040502863e-06, + "loss": 11.8107, + "step": 33503 + }, + { + "epoch": 1.8244267015172246, + "grad_norm": 0.6124006959764241, + "learning_rate": 4.014195267258425e-06, + "loss": 11.89, + "step": 33504 + }, + { + "epoch": 1.8244811555138076, + "grad_norm": 0.5255157333733381, + "learning_rate": 4.011722240415716e-06, + "loss": 11.7617, + "step": 33505 + }, + { + "epoch": 1.8245356095103906, + "grad_norm": 0.5741861787237976, + "learning_rate": 4.009249959993943e-06, + "loss": 11.8035, + "step": 33506 + }, + { + "epoch": 1.8245900635069736, + "grad_norm": 0.557906623715009, + "learning_rate": 4.006778426012348e-06, + "loss": 11.767, + "step": 33507 + }, + { + "epoch": 1.8246445175035566, + "grad_norm": 0.510337835917614, + "learning_rate": 4.004307638490135e-06, + "loss": 11.8081, + "step": 33508 + }, + { + "epoch": 1.8246989715001396, + "grad_norm": 0.5642408994260448, + "learning_rate": 4.001837597446523e-06, + "loss": 11.8833, + "step": 33509 + }, + { + "epoch": 1.8247534254967226, + "grad_norm": 0.580121439087195, + "learning_rate": 3.99936830290073e-06, + "loss": 11.8494, + "step": 33510 + }, + { + "epoch": 1.8248078794933056, + "grad_norm": 0.5428320757164858, + "learning_rate": 3.996899754871952e-06, + "loss": 11.7593, + "step": 33511 + }, + { + "epoch": 1.8248623334898886, + "grad_norm": 0.5673038505201914, + "learning_rate": 3.994431953379374e-06, + "loss": 11.8183, + "step": 33512 + }, + { + "epoch": 1.8249167874864716, + "grad_norm": 0.5365934862740096, + "learning_rate": 3.991964898442191e-06, + "loss": 11.565, + "step": 33513 + }, + { + "epoch": 1.8249712414830546, + "grad_norm": 0.526265695858374, + "learning_rate": 3.989498590079577e-06, + "loss": 11.7453, + "step": 33514 + }, + { + "epoch": 1.8250256954796376, + "grad_norm": 0.5468074167815248, + "learning_rate": 3.987033028310749e-06, + "loss": 11.8768, + "step": 33515 + }, + { + "epoch": 1.8250801494762205, + "grad_norm": 0.5590618180762743, + "learning_rate": 3.984568213154827e-06, + "loss": 11.7486, + "step": 33516 + }, + { + "epoch": 1.8251346034728035, + "grad_norm": 0.5274691560683105, + "learning_rate": 3.982104144631027e-06, + "loss": 11.6954, + "step": 33517 + }, + { + "epoch": 1.8251890574693865, + "grad_norm": 0.5301098752998251, + "learning_rate": 3.979640822758468e-06, + "loss": 11.8421, + "step": 33518 + }, + { + "epoch": 1.8252435114659695, + "grad_norm": 0.6038447826816482, + "learning_rate": 3.977178247556346e-06, + "loss": 11.795, + "step": 33519 + }, + { + "epoch": 1.8252979654625525, + "grad_norm": 0.5864158844167162, + "learning_rate": 3.974716419043767e-06, + "loss": 11.7993, + "step": 33520 + }, + { + "epoch": 1.8253524194591357, + "grad_norm": 0.5217450081365466, + "learning_rate": 3.972255337239916e-06, + "loss": 11.757, + "step": 33521 + }, + { + "epoch": 1.8254068734557187, + "grad_norm": 0.5495316015868976, + "learning_rate": 3.969795002163912e-06, + "loss": 11.7986, + "step": 33522 + }, + { + "epoch": 1.8254613274523017, + "grad_norm": 0.5359550676514172, + "learning_rate": 3.967335413834872e-06, + "loss": 11.8066, + "step": 33523 + }, + { + "epoch": 1.8255157814488847, + "grad_norm": 0.528853344261235, + "learning_rate": 3.964876572271947e-06, + "loss": 11.7869, + "step": 33524 + }, + { + "epoch": 1.8255702354454677, + "grad_norm": 0.5651961918332623, + "learning_rate": 3.962418477494234e-06, + "loss": 11.8611, + "step": 33525 + }, + { + "epoch": 1.8256246894420507, + "grad_norm": 0.6233616935130436, + "learning_rate": 3.959961129520862e-06, + "loss": 11.7343, + "step": 33526 + }, + { + "epoch": 1.825679143438634, + "grad_norm": 0.5257494402201819, + "learning_rate": 3.957504528370948e-06, + "loss": 11.6419, + "step": 33527 + }, + { + "epoch": 1.825733597435217, + "grad_norm": 0.5632429576399808, + "learning_rate": 3.955048674063577e-06, + "loss": 11.8467, + "step": 33528 + }, + { + "epoch": 1.8257880514318, + "grad_norm": 0.5665512604678933, + "learning_rate": 3.952593566617868e-06, + "loss": 11.8056, + "step": 33529 + }, + { + "epoch": 1.825842505428383, + "grad_norm": 0.4963751260642664, + "learning_rate": 3.950139206052883e-06, + "loss": 11.6963, + "step": 33530 + }, + { + "epoch": 1.8258969594249659, + "grad_norm": 0.6040634228372773, + "learning_rate": 3.94768559238774e-06, + "loss": 11.6905, + "step": 33531 + }, + { + "epoch": 1.8259514134215489, + "grad_norm": 0.5819911824782974, + "learning_rate": 3.94523272564149e-06, + "loss": 11.8623, + "step": 33532 + }, + { + "epoch": 1.8260058674181319, + "grad_norm": 0.5712258398644969, + "learning_rate": 3.942780605833218e-06, + "loss": 11.8323, + "step": 33533 + }, + { + "epoch": 1.8260603214147149, + "grad_norm": 0.5260499541865548, + "learning_rate": 3.940329232981999e-06, + "loss": 11.7525, + "step": 33534 + }, + { + "epoch": 1.8261147754112979, + "grad_norm": 0.6315230831867228, + "learning_rate": 3.937878607106882e-06, + "loss": 11.7957, + "step": 33535 + }, + { + "epoch": 1.8261692294078808, + "grad_norm": 0.5747869563676422, + "learning_rate": 3.93542872822692e-06, + "loss": 11.7983, + "step": 33536 + }, + { + "epoch": 1.8262236834044638, + "grad_norm": 0.5092012466463031, + "learning_rate": 3.932979596361197e-06, + "loss": 11.7679, + "step": 33537 + }, + { + "epoch": 1.8262781374010468, + "grad_norm": 0.5208759833375679, + "learning_rate": 3.930531211528721e-06, + "loss": 11.8191, + "step": 33538 + }, + { + "epoch": 1.8263325913976298, + "grad_norm": 0.510786388246234, + "learning_rate": 3.928083573748554e-06, + "loss": 11.7865, + "step": 33539 + }, + { + "epoch": 1.8263870453942128, + "grad_norm": 0.5496315114605621, + "learning_rate": 3.925636683039713e-06, + "loss": 11.7737, + "step": 33540 + }, + { + "epoch": 1.8264414993907958, + "grad_norm": 0.5115296641116465, + "learning_rate": 3.92319053942124e-06, + "loss": 11.5915, + "step": 33541 + }, + { + "epoch": 1.8264959533873788, + "grad_norm": 0.5194726769345599, + "learning_rate": 3.920745142912152e-06, + "loss": 11.7564, + "step": 33542 + }, + { + "epoch": 1.8265504073839618, + "grad_norm": 0.5129502256690269, + "learning_rate": 3.9183004935314575e-06, + "loss": 11.8047, + "step": 33543 + }, + { + "epoch": 1.8266048613805448, + "grad_norm": 0.5156117474702039, + "learning_rate": 3.915856591298172e-06, + "loss": 11.6586, + "step": 33544 + }, + { + "epoch": 1.826659315377128, + "grad_norm": 0.5090345224098513, + "learning_rate": 3.913413436231284e-06, + "loss": 11.7061, + "step": 33545 + }, + { + "epoch": 1.826713769373711, + "grad_norm": 0.5375689023583371, + "learning_rate": 3.91097102834983e-06, + "loss": 11.7355, + "step": 33546 + }, + { + "epoch": 1.826768223370294, + "grad_norm": 0.5301205373658786, + "learning_rate": 3.908529367672764e-06, + "loss": 11.7086, + "step": 33547 + }, + { + "epoch": 1.826822677366877, + "grad_norm": 0.5711261212450155, + "learning_rate": 3.906088454219081e-06, + "loss": 11.7186, + "step": 33548 + }, + { + "epoch": 1.82687713136346, + "grad_norm": 0.552201056906126, + "learning_rate": 3.9036482880077885e-06, + "loss": 11.7314, + "step": 33549 + }, + { + "epoch": 1.826931585360043, + "grad_norm": 0.6254665150234314, + "learning_rate": 3.901208869057838e-06, + "loss": 11.717, + "step": 33550 + }, + { + "epoch": 1.8269860393566262, + "grad_norm": 0.5310749922950447, + "learning_rate": 3.898770197388202e-06, + "loss": 11.8462, + "step": 33551 + }, + { + "epoch": 1.8270404933532092, + "grad_norm": 0.5839481534108598, + "learning_rate": 3.896332273017844e-06, + "loss": 11.8887, + "step": 33552 + }, + { + "epoch": 1.8270949473497922, + "grad_norm": 0.5488042552274396, + "learning_rate": 3.8938950959657164e-06, + "loss": 11.7305, + "step": 33553 + }, + { + "epoch": 1.8271494013463752, + "grad_norm": 0.5482019480639584, + "learning_rate": 3.891458666250791e-06, + "loss": 11.8, + "step": 33554 + }, + { + "epoch": 1.8272038553429581, + "grad_norm": 0.5318702393227676, + "learning_rate": 3.889022983891988e-06, + "loss": 11.6851, + "step": 33555 + }, + { + "epoch": 1.8272583093395411, + "grad_norm": 0.5674620351831604, + "learning_rate": 3.886588048908268e-06, + "loss": 11.6989, + "step": 33556 + }, + { + "epoch": 1.8273127633361241, + "grad_norm": 0.5604734315062843, + "learning_rate": 3.8841538613185516e-06, + "loss": 11.7631, + "step": 33557 + }, + { + "epoch": 1.8273672173327071, + "grad_norm": 0.5664493019605271, + "learning_rate": 3.881720421141766e-06, + "loss": 11.774, + "step": 33558 + }, + { + "epoch": 1.8274216713292901, + "grad_norm": 0.5485116698974144, + "learning_rate": 3.8792877283968635e-06, + "loss": 11.8739, + "step": 33559 + }, + { + "epoch": 1.8274761253258731, + "grad_norm": 0.5468985098187719, + "learning_rate": 3.8768557831027175e-06, + "loss": 11.7811, + "step": 33560 + }, + { + "epoch": 1.827530579322456, + "grad_norm": 0.56447714643943, + "learning_rate": 3.87442458527828e-06, + "loss": 11.8985, + "step": 33561 + }, + { + "epoch": 1.827585033319039, + "grad_norm": 0.5239203982580789, + "learning_rate": 3.871994134942436e-06, + "loss": 11.786, + "step": 33562 + }, + { + "epoch": 1.827639487315622, + "grad_norm": 0.6344256070612694, + "learning_rate": 3.86956443211407e-06, + "loss": 11.7964, + "step": 33563 + }, + { + "epoch": 1.827693941312205, + "grad_norm": 0.5252318449258297, + "learning_rate": 3.867135476812101e-06, + "loss": 11.6403, + "step": 33564 + }, + { + "epoch": 1.827748395308788, + "grad_norm": 0.5342986990063621, + "learning_rate": 3.864707269055401e-06, + "loss": 11.6848, + "step": 33565 + }, + { + "epoch": 1.827802849305371, + "grad_norm": 0.5265795921368792, + "learning_rate": 3.862279808862878e-06, + "loss": 11.7295, + "step": 33566 + }, + { + "epoch": 1.827857303301954, + "grad_norm": 0.5534938744120571, + "learning_rate": 3.859853096253363e-06, + "loss": 11.8545, + "step": 33567 + }, + { + "epoch": 1.8279117572985373, + "grad_norm": 0.5314140467853407, + "learning_rate": 3.8574271312457725e-06, + "loss": 11.7534, + "step": 33568 + }, + { + "epoch": 1.8279662112951203, + "grad_norm": 0.589235670157932, + "learning_rate": 3.8550019138589465e-06, + "loss": 11.8212, + "step": 33569 + }, + { + "epoch": 1.8280206652917033, + "grad_norm": 0.6138541162915487, + "learning_rate": 3.852577444111738e-06, + "loss": 11.7081, + "step": 33570 + }, + { + "epoch": 1.8280751192882863, + "grad_norm": 0.5913178943141056, + "learning_rate": 3.850153722023031e-06, + "loss": 11.7072, + "step": 33571 + }, + { + "epoch": 1.8281295732848692, + "grad_norm": 0.5667341265681017, + "learning_rate": 3.8477307476116445e-06, + "loss": 11.7691, + "step": 33572 + }, + { + "epoch": 1.8281840272814522, + "grad_norm": 0.5785024779146959, + "learning_rate": 3.84530852089644e-06, + "loss": 11.9016, + "step": 33573 + }, + { + "epoch": 1.8282384812780355, + "grad_norm": 0.5261711522291102, + "learning_rate": 3.842887041896226e-06, + "loss": 11.7926, + "step": 33574 + }, + { + "epoch": 1.8282929352746184, + "grad_norm": 0.5538186553914405, + "learning_rate": 3.8404663106298426e-06, + "loss": 11.6854, + "step": 33575 + }, + { + "epoch": 1.8283473892712014, + "grad_norm": 0.5154450321743397, + "learning_rate": 3.8380463271161294e-06, + "loss": 11.8107, + "step": 33576 + }, + { + "epoch": 1.8284018432677844, + "grad_norm": 0.5768646424999689, + "learning_rate": 3.835627091373883e-06, + "loss": 11.7498, + "step": 33577 + }, + { + "epoch": 1.8284562972643674, + "grad_norm": 0.5540500575422419, + "learning_rate": 3.833208603421945e-06, + "loss": 11.6689, + "step": 33578 + }, + { + "epoch": 1.8285107512609504, + "grad_norm": 0.5255612682975356, + "learning_rate": 3.830790863279088e-06, + "loss": 11.6593, + "step": 33579 + }, + { + "epoch": 1.8285652052575334, + "grad_norm": 0.6457394825812683, + "learning_rate": 3.828373870964153e-06, + "loss": 11.7615, + "step": 33580 + }, + { + "epoch": 1.8286196592541164, + "grad_norm": 0.5497840083945347, + "learning_rate": 3.82595762649588e-06, + "loss": 11.8321, + "step": 33581 + }, + { + "epoch": 1.8286741132506994, + "grad_norm": 0.5115940301137489, + "learning_rate": 3.8235421298931096e-06, + "loss": 11.9497, + "step": 33582 + }, + { + "epoch": 1.8287285672472824, + "grad_norm": 0.5415655013355506, + "learning_rate": 3.8211273811746055e-06, + "loss": 11.6561, + "step": 33583 + }, + { + "epoch": 1.8287830212438654, + "grad_norm": 0.5508433141977883, + "learning_rate": 3.8187133803591295e-06, + "loss": 11.7448, + "step": 33584 + }, + { + "epoch": 1.8288374752404484, + "grad_norm": 0.5045897152001088, + "learning_rate": 3.816300127465466e-06, + "loss": 11.7792, + "step": 33585 + }, + { + "epoch": 1.8288919292370314, + "grad_norm": 0.5137576802657025, + "learning_rate": 3.8138876225124022e-06, + "loss": 11.7979, + "step": 33586 + }, + { + "epoch": 1.8289463832336144, + "grad_norm": 0.5264990376744149, + "learning_rate": 3.8114758655186657e-06, + "loss": 11.6629, + "step": 33587 + }, + { + "epoch": 1.8290008372301974, + "grad_norm": 0.5882370370095509, + "learning_rate": 3.8090648565030305e-06, + "loss": 11.8454, + "step": 33588 + }, + { + "epoch": 1.8290552912267803, + "grad_norm": 0.49267080795308765, + "learning_rate": 3.806654595484227e-06, + "loss": 11.7577, + "step": 33589 + }, + { + "epoch": 1.8291097452233633, + "grad_norm": 0.5659817523041979, + "learning_rate": 3.804245082481017e-06, + "loss": 11.845, + "step": 33590 + }, + { + "epoch": 1.8291641992199466, + "grad_norm": 0.5002647537304712, + "learning_rate": 3.8018363175121306e-06, + "loss": 11.7127, + "step": 33591 + }, + { + "epoch": 1.8292186532165295, + "grad_norm": 0.537488999759871, + "learning_rate": 3.7994283005962974e-06, + "loss": 11.7534, + "step": 33592 + }, + { + "epoch": 1.8292731072131125, + "grad_norm": 0.5781595998247397, + "learning_rate": 3.7970210317522457e-06, + "loss": 11.7543, + "step": 33593 + }, + { + "epoch": 1.8293275612096955, + "grad_norm": 0.514201095862873, + "learning_rate": 3.794614510998684e-06, + "loss": 11.7786, + "step": 33594 + }, + { + "epoch": 1.8293820152062785, + "grad_norm": 0.47920266652534765, + "learning_rate": 3.7922087383543414e-06, + "loss": 11.6605, + "step": 33595 + }, + { + "epoch": 1.8294364692028615, + "grad_norm": 0.5740164054785348, + "learning_rate": 3.7898037138379027e-06, + "loss": 11.8035, + "step": 33596 + }, + { + "epoch": 1.8294909231994447, + "grad_norm": 0.5942858605802906, + "learning_rate": 3.7873994374680864e-06, + "loss": 11.7615, + "step": 33597 + }, + { + "epoch": 1.8295453771960277, + "grad_norm": 0.5675241724473631, + "learning_rate": 3.7849959092636e-06, + "loss": 11.7158, + "step": 33598 + }, + { + "epoch": 1.8295998311926107, + "grad_norm": 0.5415942452650854, + "learning_rate": 3.782593129243117e-06, + "loss": 11.7951, + "step": 33599 + }, + { + "epoch": 1.8296542851891937, + "grad_norm": 0.5467909468926304, + "learning_rate": 3.780191097425323e-06, + "loss": 11.6424, + "step": 33600 + }, + { + "epoch": 1.8297087391857767, + "grad_norm": 0.5691203385637891, + "learning_rate": 3.7777898138289025e-06, + "loss": 11.7199, + "step": 33601 + }, + { + "epoch": 1.8297631931823597, + "grad_norm": 0.5605891023426018, + "learning_rate": 3.7753892784725188e-06, + "loss": 11.8701, + "step": 33602 + }, + { + "epoch": 1.8298176471789427, + "grad_norm": 0.584924898795815, + "learning_rate": 3.7729894913748564e-06, + "loss": 11.902, + "step": 33603 + }, + { + "epoch": 1.8298721011755257, + "grad_norm": 0.644131364469211, + "learning_rate": 3.7705904525545567e-06, + "loss": 11.7686, + "step": 33604 + }, + { + "epoch": 1.8299265551721087, + "grad_norm": 0.5328483942849503, + "learning_rate": 3.7681921620302818e-06, + "loss": 11.6705, + "step": 33605 + }, + { + "epoch": 1.8299810091686917, + "grad_norm": 0.5084748882255641, + "learning_rate": 3.765794619820684e-06, + "loss": 11.7388, + "step": 33606 + }, + { + "epoch": 1.8300354631652747, + "grad_norm": 0.5387853995100667, + "learning_rate": 3.7633978259444035e-06, + "loss": 11.8504, + "step": 33607 + }, + { + "epoch": 1.8300899171618576, + "grad_norm": 0.552683848284675, + "learning_rate": 3.7610017804200815e-06, + "loss": 11.8648, + "step": 33608 + }, + { + "epoch": 1.8301443711584406, + "grad_norm": 0.5490437201936529, + "learning_rate": 3.758606483266347e-06, + "loss": 11.7925, + "step": 33609 + }, + { + "epoch": 1.8301988251550236, + "grad_norm": 0.5589677545008368, + "learning_rate": 3.7562119345018408e-06, + "loss": 11.797, + "step": 33610 + }, + { + "epoch": 1.8302532791516066, + "grad_norm": 0.5426710002220848, + "learning_rate": 3.7538181341451593e-06, + "loss": 11.8016, + "step": 33611 + }, + { + "epoch": 1.8303077331481896, + "grad_norm": 0.5816641511814663, + "learning_rate": 3.751425082214932e-06, + "loss": 11.7449, + "step": 33612 + }, + { + "epoch": 1.8303621871447726, + "grad_norm": 0.54028323069145, + "learning_rate": 3.7490327787297664e-06, + "loss": 11.8756, + "step": 33613 + }, + { + "epoch": 1.8304166411413556, + "grad_norm": 0.5437505885248093, + "learning_rate": 3.746641223708258e-06, + "loss": 11.7894, + "step": 33614 + }, + { + "epoch": 1.8304710951379388, + "grad_norm": 0.5107128114373266, + "learning_rate": 3.744250417169015e-06, + "loss": 11.7212, + "step": 33615 + }, + { + "epoch": 1.8305255491345218, + "grad_norm": 0.553345160010139, + "learning_rate": 3.7418603591306112e-06, + "loss": 11.7638, + "step": 33616 + }, + { + "epoch": 1.8305800031311048, + "grad_norm": 0.5201727159599808, + "learning_rate": 3.7394710496116648e-06, + "loss": 11.8593, + "step": 33617 + }, + { + "epoch": 1.8306344571276878, + "grad_norm": 0.5548311846824785, + "learning_rate": 3.7370824886307056e-06, + "loss": 11.7792, + "step": 33618 + }, + { + "epoch": 1.8306889111242708, + "grad_norm": 0.4999218151319178, + "learning_rate": 3.734694676206352e-06, + "loss": 11.7706, + "step": 33619 + }, + { + "epoch": 1.830743365120854, + "grad_norm": 0.5730300704645586, + "learning_rate": 3.7323076123571556e-06, + "loss": 11.7754, + "step": 33620 + }, + { + "epoch": 1.830797819117437, + "grad_norm": 0.620387799682463, + "learning_rate": 3.7299212971016797e-06, + "loss": 11.8119, + "step": 33621 + }, + { + "epoch": 1.83085227311402, + "grad_norm": 0.5691015568130691, + "learning_rate": 3.7275357304584978e-06, + "loss": 11.8631, + "step": 33622 + }, + { + "epoch": 1.830906727110603, + "grad_norm": 0.6034637461747832, + "learning_rate": 3.725150912446118e-06, + "loss": 11.8568, + "step": 33623 + }, + { + "epoch": 1.830961181107186, + "grad_norm": 0.4916739862755106, + "learning_rate": 3.722766843083114e-06, + "loss": 11.644, + "step": 33624 + }, + { + "epoch": 1.831015635103769, + "grad_norm": 0.5358512196182962, + "learning_rate": 3.7203835223880268e-06, + "loss": 11.7572, + "step": 33625 + }, + { + "epoch": 1.831070089100352, + "grad_norm": 0.5223640356404262, + "learning_rate": 3.7180009503793743e-06, + "loss": 11.7622, + "step": 33626 + }, + { + "epoch": 1.831124543096935, + "grad_norm": 0.5357213005973265, + "learning_rate": 3.715619127075709e-06, + "loss": 11.7943, + "step": 33627 + }, + { + "epoch": 1.831178997093518, + "grad_norm": 0.5403553467227992, + "learning_rate": 3.713238052495516e-06, + "loss": 11.7865, + "step": 33628 + }, + { + "epoch": 1.831233451090101, + "grad_norm": 0.555072468847666, + "learning_rate": 3.7108577266573464e-06, + "loss": 11.7744, + "step": 33629 + }, + { + "epoch": 1.831287905086684, + "grad_norm": 0.5400711145683039, + "learning_rate": 3.708478149579686e-06, + "loss": 11.7561, + "step": 33630 + }, + { + "epoch": 1.831342359083267, + "grad_norm": 0.5609842697305087, + "learning_rate": 3.7060993212810535e-06, + "loss": 11.7684, + "step": 33631 + }, + { + "epoch": 1.83139681307985, + "grad_norm": 0.5558519570080996, + "learning_rate": 3.703721241779956e-06, + "loss": 11.7791, + "step": 33632 + }, + { + "epoch": 1.831451267076433, + "grad_norm": 0.5742799670780467, + "learning_rate": 3.7013439110948454e-06, + "loss": 11.6923, + "step": 33633 + }, + { + "epoch": 1.831505721073016, + "grad_norm": 0.4924399750888138, + "learning_rate": 3.6989673292442407e-06, + "loss": 11.8236, + "step": 33634 + }, + { + "epoch": 1.831560175069599, + "grad_norm": 0.5755479277658104, + "learning_rate": 3.6965914962466153e-06, + "loss": 11.6702, + "step": 33635 + }, + { + "epoch": 1.8316146290661819, + "grad_norm": 0.5668670997137093, + "learning_rate": 3.694216412120444e-06, + "loss": 11.7997, + "step": 33636 + }, + { + "epoch": 1.8316690830627649, + "grad_norm": 0.5988314770585237, + "learning_rate": 3.691842076884211e-06, + "loss": 11.7937, + "step": 33637 + }, + { + "epoch": 1.831723537059348, + "grad_norm": 0.5620322342811402, + "learning_rate": 3.689468490556347e-06, + "loss": 11.8267, + "step": 33638 + }, + { + "epoch": 1.831777991055931, + "grad_norm": 0.5555223649173702, + "learning_rate": 3.687095653155337e-06, + "loss": 11.7555, + "step": 33639 + }, + { + "epoch": 1.831832445052514, + "grad_norm": 0.5262201788231075, + "learning_rate": 3.6847235646996102e-06, + "loss": 11.7956, + "step": 33640 + }, + { + "epoch": 1.831886899049097, + "grad_norm": 0.6097477835124032, + "learning_rate": 3.6823522252076306e-06, + "loss": 11.8023, + "step": 33641 + }, + { + "epoch": 1.83194135304568, + "grad_norm": 0.5737853532295223, + "learning_rate": 3.67998163469786e-06, + "loss": 11.8284, + "step": 33642 + }, + { + "epoch": 1.831995807042263, + "grad_norm": 0.5420299003831286, + "learning_rate": 3.6776117931886733e-06, + "loss": 11.757, + "step": 33643 + }, + { + "epoch": 1.8320502610388463, + "grad_norm": 0.5757652343362268, + "learning_rate": 3.6752427006985446e-06, + "loss": 11.8109, + "step": 33644 + }, + { + "epoch": 1.8321047150354293, + "grad_norm": 0.5118663299731981, + "learning_rate": 3.67287435724587e-06, + "loss": 11.6178, + "step": 33645 + }, + { + "epoch": 1.8321591690320123, + "grad_norm": 0.5685451668477773, + "learning_rate": 3.67050676284908e-06, + "loss": 11.8246, + "step": 33646 + }, + { + "epoch": 1.8322136230285953, + "grad_norm": 0.5323133237344017, + "learning_rate": 3.668139917526592e-06, + "loss": 11.8171, + "step": 33647 + }, + { + "epoch": 1.8322680770251782, + "grad_norm": 0.5140507649072633, + "learning_rate": 3.665773821296792e-06, + "loss": 11.7333, + "step": 33648 + }, + { + "epoch": 1.8323225310217612, + "grad_norm": 0.5631979179573443, + "learning_rate": 3.6634084741781094e-06, + "loss": 11.7532, + "step": 33649 + }, + { + "epoch": 1.8323769850183442, + "grad_norm": 0.5506797082488566, + "learning_rate": 3.6610438761888965e-06, + "loss": 11.8504, + "step": 33650 + }, + { + "epoch": 1.8324314390149272, + "grad_norm": 0.514061050701803, + "learning_rate": 3.658680027347583e-06, + "loss": 11.7024, + "step": 33651 + }, + { + "epoch": 1.8324858930115102, + "grad_norm": 0.523363066878204, + "learning_rate": 3.6563169276725206e-06, + "loss": 11.801, + "step": 33652 + }, + { + "epoch": 1.8325403470080932, + "grad_norm": 0.5703393082583804, + "learning_rate": 3.6539545771820837e-06, + "loss": 11.7958, + "step": 33653 + }, + { + "epoch": 1.8325948010046762, + "grad_norm": 0.5645968935853388, + "learning_rate": 3.6515929758946798e-06, + "loss": 11.8249, + "step": 33654 + }, + { + "epoch": 1.8326492550012592, + "grad_norm": 0.5269401939551209, + "learning_rate": 3.6492321238286166e-06, + "loss": 11.8765, + "step": 33655 + }, + { + "epoch": 1.8327037089978422, + "grad_norm": 0.5474930751105823, + "learning_rate": 3.64687202100229e-06, + "loss": 11.814, + "step": 33656 + }, + { + "epoch": 1.8327581629944252, + "grad_norm": 0.5282810706860984, + "learning_rate": 3.6445126674340636e-06, + "loss": 11.8933, + "step": 33657 + }, + { + "epoch": 1.8328126169910082, + "grad_norm": 0.5192044567011344, + "learning_rate": 3.6421540631422447e-06, + "loss": 11.6787, + "step": 33658 + }, + { + "epoch": 1.8328670709875912, + "grad_norm": 0.607667276788634, + "learning_rate": 3.6397962081452074e-06, + "loss": 11.9255, + "step": 33659 + }, + { + "epoch": 1.8329215249841742, + "grad_norm": 0.5863241837845117, + "learning_rate": 3.6374391024612597e-06, + "loss": 11.7884, + "step": 33660 + }, + { + "epoch": 1.8329759789807574, + "grad_norm": 0.564734192758178, + "learning_rate": 3.6350827461087646e-06, + "loss": 11.6631, + "step": 33661 + }, + { + "epoch": 1.8330304329773404, + "grad_norm": 0.5771472327379159, + "learning_rate": 3.632727139106018e-06, + "loss": 11.7665, + "step": 33662 + }, + { + "epoch": 1.8330848869739234, + "grad_norm": 0.5001199290655215, + "learning_rate": 3.6303722814713503e-06, + "loss": 11.6977, + "step": 33663 + }, + { + "epoch": 1.8331393409705063, + "grad_norm": 0.5356334823763569, + "learning_rate": 3.628018173223069e-06, + "loss": 11.8373, + "step": 33664 + }, + { + "epoch": 1.8331937949670893, + "grad_norm": 0.5407584876816743, + "learning_rate": 3.6256648143794703e-06, + "loss": 11.7917, + "step": 33665 + }, + { + "epoch": 1.8332482489636723, + "grad_norm": 0.5262477456202952, + "learning_rate": 3.623312204958873e-06, + "loss": 11.8158, + "step": 33666 + }, + { + "epoch": 1.8333027029602555, + "grad_norm": 0.5384037636619359, + "learning_rate": 3.6209603449795515e-06, + "loss": 11.7585, + "step": 33667 + }, + { + "epoch": 1.8333571569568385, + "grad_norm": 0.5770060144243758, + "learning_rate": 3.6186092344598023e-06, + "loss": 11.7487, + "step": 33668 + }, + { + "epoch": 1.8334116109534215, + "grad_norm": 0.48745336548330126, + "learning_rate": 3.6162588734179326e-06, + "loss": 11.8087, + "step": 33669 + }, + { + "epoch": 1.8334660649500045, + "grad_norm": 0.5086000950835574, + "learning_rate": 3.6139092618721727e-06, + "loss": 11.7118, + "step": 33670 + }, + { + "epoch": 1.8335205189465875, + "grad_norm": 0.5192043453566273, + "learning_rate": 3.611560399840841e-06, + "loss": 11.6702, + "step": 33671 + }, + { + "epoch": 1.8335749729431705, + "grad_norm": 0.5433328063871803, + "learning_rate": 3.6092122873421783e-06, + "loss": 11.8374, + "step": 33672 + }, + { + "epoch": 1.8336294269397535, + "grad_norm": 0.5404471055371471, + "learning_rate": 3.6068649243944264e-06, + "loss": 11.7247, + "step": 33673 + }, + { + "epoch": 1.8336838809363365, + "grad_norm": 0.5311974378110095, + "learning_rate": 3.604518311015881e-06, + "loss": 11.655, + "step": 33674 + }, + { + "epoch": 1.8337383349329195, + "grad_norm": 0.5458367952834732, + "learning_rate": 3.6021724472247387e-06, + "loss": 11.7666, + "step": 33675 + }, + { + "epoch": 1.8337927889295025, + "grad_norm": 0.6863974465711044, + "learning_rate": 3.599827333039296e-06, + "loss": 11.8068, + "step": 33676 + }, + { + "epoch": 1.8338472429260855, + "grad_norm": 0.49571684168626645, + "learning_rate": 3.59748296847775e-06, + "loss": 11.6708, + "step": 33677 + }, + { + "epoch": 1.8339016969226685, + "grad_norm": 0.5016990640118166, + "learning_rate": 3.5951393535583413e-06, + "loss": 11.6921, + "step": 33678 + }, + { + "epoch": 1.8339561509192515, + "grad_norm": 0.5088780451289755, + "learning_rate": 3.592796488299299e-06, + "loss": 11.7353, + "step": 33679 + }, + { + "epoch": 1.8340106049158345, + "grad_norm": 0.5834436289501197, + "learning_rate": 3.5904543727188322e-06, + "loss": 11.8659, + "step": 33680 + }, + { + "epoch": 1.8340650589124174, + "grad_norm": 0.5486892229223033, + "learning_rate": 3.5881130068351698e-06, + "loss": 11.7854, + "step": 33681 + }, + { + "epoch": 1.8341195129090004, + "grad_norm": 0.5295833476695455, + "learning_rate": 3.5857723906665197e-06, + "loss": 11.7739, + "step": 33682 + }, + { + "epoch": 1.8341739669055834, + "grad_norm": 0.5252285010143786, + "learning_rate": 3.5834325242310453e-06, + "loss": 11.7262, + "step": 33683 + }, + { + "epoch": 1.8342284209021664, + "grad_norm": 0.6019713085614004, + "learning_rate": 3.581093407546987e-06, + "loss": 11.7305, + "step": 33684 + }, + { + "epoch": 1.8342828748987496, + "grad_norm": 0.5351547807703942, + "learning_rate": 3.5787550406325086e-06, + "loss": 11.7171, + "step": 33685 + }, + { + "epoch": 1.8343373288953326, + "grad_norm": 0.5282363304151417, + "learning_rate": 3.576417423505807e-06, + "loss": 11.6159, + "step": 33686 + }, + { + "epoch": 1.8343917828919156, + "grad_norm": 0.5251180636048071, + "learning_rate": 3.5740805561850445e-06, + "loss": 11.9031, + "step": 33687 + }, + { + "epoch": 1.8344462368884986, + "grad_norm": 0.622908629364273, + "learning_rate": 3.571744438688418e-06, + "loss": 12.027, + "step": 33688 + }, + { + "epoch": 1.8345006908850816, + "grad_norm": 0.5142626164041925, + "learning_rate": 3.5694090710340576e-06, + "loss": 11.6893, + "step": 33689 + }, + { + "epoch": 1.8345551448816648, + "grad_norm": 0.5283159116676002, + "learning_rate": 3.567074453240149e-06, + "loss": 11.6854, + "step": 33690 + }, + { + "epoch": 1.8346095988782478, + "grad_norm": 0.5511602178658775, + "learning_rate": 3.564740585324855e-06, + "loss": 11.6854, + "step": 33691 + }, + { + "epoch": 1.8346640528748308, + "grad_norm": 0.6081058223194803, + "learning_rate": 3.562407467306295e-06, + "loss": 11.8995, + "step": 33692 + }, + { + "epoch": 1.8347185068714138, + "grad_norm": 0.6237050189656914, + "learning_rate": 3.560075099202642e-06, + "loss": 11.8137, + "step": 33693 + }, + { + "epoch": 1.8347729608679968, + "grad_norm": 0.5849995900037084, + "learning_rate": 3.5577434810320055e-06, + "loss": 11.8858, + "step": 33694 + }, + { + "epoch": 1.8348274148645798, + "grad_norm": 0.5450710006351104, + "learning_rate": 3.5554126128125256e-06, + "loss": 11.9415, + "step": 33695 + }, + { + "epoch": 1.8348818688611628, + "grad_norm": 0.8510014482508347, + "learning_rate": 3.5530824945623542e-06, + "loss": 11.8374, + "step": 33696 + }, + { + "epoch": 1.8349363228577458, + "grad_norm": 0.5490630740266585, + "learning_rate": 3.5507531262995553e-06, + "loss": 11.8302, + "step": 33697 + }, + { + "epoch": 1.8349907768543288, + "grad_norm": 0.6000585345043453, + "learning_rate": 3.5484245080423027e-06, + "loss": 11.9073, + "step": 33698 + }, + { + "epoch": 1.8350452308509118, + "grad_norm": 0.5315158199283085, + "learning_rate": 3.5460966398086602e-06, + "loss": 11.8954, + "step": 33699 + }, + { + "epoch": 1.8350996848474947, + "grad_norm": 0.5903353983840656, + "learning_rate": 3.543769521616758e-06, + "loss": 11.7407, + "step": 33700 + }, + { + "epoch": 1.8351541388440777, + "grad_norm": 0.5577473741100369, + "learning_rate": 3.541443153484658e-06, + "loss": 11.8393, + "step": 33701 + }, + { + "epoch": 1.8352085928406607, + "grad_norm": 0.5673069611908624, + "learning_rate": 3.5391175354304807e-06, + "loss": 11.7718, + "step": 33702 + }, + { + "epoch": 1.8352630468372437, + "grad_norm": 0.6519382976389545, + "learning_rate": 3.5367926674723216e-06, + "loss": 11.9164, + "step": 33703 + }, + { + "epoch": 1.8353175008338267, + "grad_norm": 0.5968121862080892, + "learning_rate": 3.5344685496282227e-06, + "loss": 11.7602, + "step": 33704 + }, + { + "epoch": 1.8353719548304097, + "grad_norm": 0.6036063004325238, + "learning_rate": 3.532145181916269e-06, + "loss": 11.6866, + "step": 33705 + }, + { + "epoch": 1.8354264088269927, + "grad_norm": 0.5781734630937158, + "learning_rate": 3.5298225643545457e-06, + "loss": 11.8314, + "step": 33706 + }, + { + "epoch": 1.8354808628235757, + "grad_norm": 0.5272934985316827, + "learning_rate": 3.5275006969610835e-06, + "loss": 11.7894, + "step": 33707 + }, + { + "epoch": 1.835535316820159, + "grad_norm": 0.5774879294105703, + "learning_rate": 3.5251795797539676e-06, + "loss": 11.7908, + "step": 33708 + }, + { + "epoch": 1.835589770816742, + "grad_norm": 0.5483932207367126, + "learning_rate": 3.5228592127512285e-06, + "loss": 11.7713, + "step": 33709 + }, + { + "epoch": 1.835644224813325, + "grad_norm": 0.5514752429617766, + "learning_rate": 3.5205395959709286e-06, + "loss": 11.7251, + "step": 33710 + }, + { + "epoch": 1.835698678809908, + "grad_norm": 0.5734875429848707, + "learning_rate": 3.5182207294310654e-06, + "loss": 11.7756, + "step": 33711 + }, + { + "epoch": 1.8357531328064909, + "grad_norm": 0.5350863732956618, + "learning_rate": 3.5159026131497132e-06, + "loss": 11.7566, + "step": 33712 + }, + { + "epoch": 1.8358075868030739, + "grad_norm": 0.5154652665206887, + "learning_rate": 3.5135852471449016e-06, + "loss": 11.7158, + "step": 33713 + }, + { + "epoch": 1.835862040799657, + "grad_norm": 0.5818234333088478, + "learning_rate": 3.5112686314346054e-06, + "loss": 11.6471, + "step": 33714 + }, + { + "epoch": 1.83591649479624, + "grad_norm": 0.5309050324678231, + "learning_rate": 3.508952766036877e-06, + "loss": 11.8418, + "step": 33715 + }, + { + "epoch": 1.835970948792823, + "grad_norm": 0.5316833087325932, + "learning_rate": 3.506637650969702e-06, + "loss": 11.5931, + "step": 33716 + }, + { + "epoch": 1.836025402789406, + "grad_norm": 0.508372623903475, + "learning_rate": 3.5043232862510987e-06, + "loss": 11.7174, + "step": 33717 + }, + { + "epoch": 1.836079856785989, + "grad_norm": 0.5095887549123126, + "learning_rate": 3.5020096718990756e-06, + "loss": 11.6387, + "step": 33718 + }, + { + "epoch": 1.836134310782572, + "grad_norm": 0.5077160094496781, + "learning_rate": 3.499696807931585e-06, + "loss": 11.7219, + "step": 33719 + }, + { + "epoch": 1.836188764779155, + "grad_norm": 0.5156147150890515, + "learning_rate": 3.4973846943666568e-06, + "loss": 11.8598, + "step": 33720 + }, + { + "epoch": 1.836243218775738, + "grad_norm": 0.5657735402184215, + "learning_rate": 3.4950733312222315e-06, + "loss": 11.7291, + "step": 33721 + }, + { + "epoch": 1.836297672772321, + "grad_norm": 0.5027216645238088, + "learning_rate": 3.492762718516307e-06, + "loss": 11.7495, + "step": 33722 + }, + { + "epoch": 1.836352126768904, + "grad_norm": 0.575887213552128, + "learning_rate": 3.490452856266857e-06, + "loss": 11.8109, + "step": 33723 + }, + { + "epoch": 1.836406580765487, + "grad_norm": 0.6160079531206806, + "learning_rate": 3.488143744491801e-06, + "loss": 11.9721, + "step": 33724 + }, + { + "epoch": 1.83646103476207, + "grad_norm": 0.5444621174267315, + "learning_rate": 3.4858353832091463e-06, + "loss": 11.9605, + "step": 33725 + }, + { + "epoch": 1.836515488758653, + "grad_norm": 0.5286723973743807, + "learning_rate": 3.483527772436812e-06, + "loss": 11.7713, + "step": 33726 + }, + { + "epoch": 1.836569942755236, + "grad_norm": 0.5236218450241388, + "learning_rate": 3.481220912192762e-06, + "loss": 11.827, + "step": 33727 + }, + { + "epoch": 1.836624396751819, + "grad_norm": 0.517060319718189, + "learning_rate": 3.4789148024949035e-06, + "loss": 11.77, + "step": 33728 + }, + { + "epoch": 1.836678850748402, + "grad_norm": 0.5859984114605573, + "learning_rate": 3.4766094433612006e-06, + "loss": 11.8325, + "step": 33729 + }, + { + "epoch": 1.836733304744985, + "grad_norm": 0.6051276245372441, + "learning_rate": 3.474304834809583e-06, + "loss": 11.9341, + "step": 33730 + }, + { + "epoch": 1.8367877587415682, + "grad_norm": 0.5844962191411933, + "learning_rate": 3.4720009768579365e-06, + "loss": 11.7248, + "step": 33731 + }, + { + "epoch": 1.8368422127381512, + "grad_norm": 0.5563580638775164, + "learning_rate": 3.4696978695242132e-06, + "loss": 11.7365, + "step": 33732 + }, + { + "epoch": 1.8368966667347342, + "grad_norm": 0.4947205237272991, + "learning_rate": 3.46739551282631e-06, + "loss": 11.7052, + "step": 33733 + }, + { + "epoch": 1.8369511207313172, + "grad_norm": 0.6067653619978699, + "learning_rate": 3.465093906782124e-06, + "loss": 11.8687, + "step": 33734 + }, + { + "epoch": 1.8370055747279002, + "grad_norm": 0.5250863734251067, + "learning_rate": 3.462793051409552e-06, + "loss": 11.6991, + "step": 33735 + }, + { + "epoch": 1.8370600287244832, + "grad_norm": 0.5854503019613355, + "learning_rate": 3.46049294672649e-06, + "loss": 11.7324, + "step": 33736 + }, + { + "epoch": 1.8371144827210664, + "grad_norm": 0.5107861565301598, + "learning_rate": 3.4581935927508357e-06, + "loss": 11.7062, + "step": 33737 + }, + { + "epoch": 1.8371689367176494, + "grad_norm": 0.6099423778392281, + "learning_rate": 3.455894989500441e-06, + "loss": 11.8401, + "step": 33738 + }, + { + "epoch": 1.8372233907142324, + "grad_norm": 0.5488953774915811, + "learning_rate": 3.453597136993203e-06, + "loss": 11.752, + "step": 33739 + }, + { + "epoch": 1.8372778447108153, + "grad_norm": 0.5356423523111601, + "learning_rate": 3.4513000352469848e-06, + "loss": 11.7573, + "step": 33740 + }, + { + "epoch": 1.8373322987073983, + "grad_norm": 0.5279503509526772, + "learning_rate": 3.44900368427965e-06, + "loss": 11.8028, + "step": 33741 + }, + { + "epoch": 1.8373867527039813, + "grad_norm": 0.5434130940980396, + "learning_rate": 3.4467080841090628e-06, + "loss": 11.7407, + "step": 33742 + }, + { + "epoch": 1.8374412067005643, + "grad_norm": 0.5766316413889403, + "learning_rate": 3.4444132347530635e-06, + "loss": 11.8449, + "step": 33743 + }, + { + "epoch": 1.8374956606971473, + "grad_norm": 0.652101556792841, + "learning_rate": 3.4421191362294824e-06, + "loss": 11.93, + "step": 33744 + }, + { + "epoch": 1.8375501146937303, + "grad_norm": 0.5960265972082481, + "learning_rate": 3.4398257885561945e-06, + "loss": 11.8512, + "step": 33745 + }, + { + "epoch": 1.8376045686903133, + "grad_norm": 0.5640952024080866, + "learning_rate": 3.4375331917510077e-06, + "loss": 11.7464, + "step": 33746 + }, + { + "epoch": 1.8376590226868963, + "grad_norm": 0.5098863373297707, + "learning_rate": 3.435241345831752e-06, + "loss": 11.7593, + "step": 33747 + }, + { + "epoch": 1.8377134766834793, + "grad_norm": 0.5095515741351909, + "learning_rate": 3.4329502508162583e-06, + "loss": 11.6392, + "step": 33748 + }, + { + "epoch": 1.8377679306800623, + "grad_norm": 0.542932740237173, + "learning_rate": 3.4306599067223443e-06, + "loss": 11.7267, + "step": 33749 + }, + { + "epoch": 1.8378223846766453, + "grad_norm": 0.565006808595657, + "learning_rate": 3.428370313567797e-06, + "loss": 11.6211, + "step": 33750 + }, + { + "epoch": 1.8378768386732283, + "grad_norm": 0.6050045356351809, + "learning_rate": 3.426081471370435e-06, + "loss": 11.8577, + "step": 33751 + }, + { + "epoch": 1.8379312926698113, + "grad_norm": 0.583321263599445, + "learning_rate": 3.423793380148077e-06, + "loss": 11.7218, + "step": 33752 + }, + { + "epoch": 1.8379857466663942, + "grad_norm": 0.5872353698009192, + "learning_rate": 3.4215060399184986e-06, + "loss": 11.7619, + "step": 33753 + }, + { + "epoch": 1.8380402006629775, + "grad_norm": 0.5766620994994406, + "learning_rate": 3.4192194506994733e-06, + "loss": 11.9911, + "step": 33754 + }, + { + "epoch": 1.8380946546595605, + "grad_norm": 0.5074922654203639, + "learning_rate": 3.4169336125087994e-06, + "loss": 11.6697, + "step": 33755 + }, + { + "epoch": 1.8381491086561434, + "grad_norm": 0.5260217142404257, + "learning_rate": 3.414648525364239e-06, + "loss": 11.7082, + "step": 33756 + }, + { + "epoch": 1.8382035626527264, + "grad_norm": 0.5470496184281105, + "learning_rate": 3.41236418928359e-06, + "loss": 11.7668, + "step": 33757 + }, + { + "epoch": 1.8382580166493094, + "grad_norm": 0.5724853788041439, + "learning_rate": 3.410080604284571e-06, + "loss": 11.7921, + "step": 33758 + }, + { + "epoch": 1.8383124706458924, + "grad_norm": 0.6009617179526612, + "learning_rate": 3.4077977703849794e-06, + "loss": 11.7534, + "step": 33759 + }, + { + "epoch": 1.8383669246424756, + "grad_norm": 0.5783049663565869, + "learning_rate": 3.405515687602534e-06, + "loss": 11.736, + "step": 33760 + }, + { + "epoch": 1.8384213786390586, + "grad_norm": 0.5730574125204992, + "learning_rate": 3.4032343559549984e-06, + "loss": 11.7719, + "step": 33761 + }, + { + "epoch": 1.8384758326356416, + "grad_norm": 0.519669808350654, + "learning_rate": 3.400953775460136e-06, + "loss": 11.7636, + "step": 33762 + }, + { + "epoch": 1.8385302866322246, + "grad_norm": 0.5964376024612283, + "learning_rate": 3.398673946135644e-06, + "loss": 11.8903, + "step": 33763 + }, + { + "epoch": 1.8385847406288076, + "grad_norm": 0.5672682021379412, + "learning_rate": 3.396394867999264e-06, + "loss": 11.7035, + "step": 33764 + }, + { + "epoch": 1.8386391946253906, + "grad_norm": 0.5265068891237084, + "learning_rate": 3.3941165410687147e-06, + "loss": 11.8036, + "step": 33765 + }, + { + "epoch": 1.8386936486219736, + "grad_norm": 0.516687143355546, + "learning_rate": 3.3918389653617043e-06, + "loss": 11.7173, + "step": 33766 + }, + { + "epoch": 1.8387481026185566, + "grad_norm": 0.5810793523202998, + "learning_rate": 3.3895621408959745e-06, + "loss": 11.8949, + "step": 33767 + }, + { + "epoch": 1.8388025566151396, + "grad_norm": 0.578112063505665, + "learning_rate": 3.3872860676891995e-06, + "loss": 11.6485, + "step": 33768 + }, + { + "epoch": 1.8388570106117226, + "grad_norm": 0.5520149915577371, + "learning_rate": 3.3850107457590983e-06, + "loss": 11.7525, + "step": 33769 + }, + { + "epoch": 1.8389114646083056, + "grad_norm": 0.6627735492956288, + "learning_rate": 3.3827361751233465e-06, + "loss": 11.7934, + "step": 33770 + }, + { + "epoch": 1.8389659186048886, + "grad_norm": 0.5528380821301336, + "learning_rate": 3.3804623557996516e-06, + "loss": 11.6964, + "step": 33771 + }, + { + "epoch": 1.8390203726014716, + "grad_norm": 0.5441781566992544, + "learning_rate": 3.3781892878056777e-06, + "loss": 11.8651, + "step": 33772 + }, + { + "epoch": 1.8390748265980545, + "grad_norm": 0.5268215498162374, + "learning_rate": 3.37591697115911e-06, + "loss": 11.8521, + "step": 33773 + }, + { + "epoch": 1.8391292805946375, + "grad_norm": 0.5350371230736668, + "learning_rate": 3.3736454058776236e-06, + "loss": 11.8748, + "step": 33774 + }, + { + "epoch": 1.8391837345912205, + "grad_norm": 0.5694064059618958, + "learning_rate": 3.37137459197886e-06, + "loss": 11.8134, + "step": 33775 + }, + { + "epoch": 1.8392381885878035, + "grad_norm": 0.5457504647782292, + "learning_rate": 3.3691045294805047e-06, + "loss": 11.8112, + "step": 33776 + }, + { + "epoch": 1.8392926425843865, + "grad_norm": 0.5434586038544973, + "learning_rate": 3.3668352184001885e-06, + "loss": 11.7694, + "step": 33777 + }, + { + "epoch": 1.8393470965809697, + "grad_norm": 0.48286760906165666, + "learning_rate": 3.3645666587555635e-06, + "loss": 11.637, + "step": 33778 + }, + { + "epoch": 1.8394015505775527, + "grad_norm": 0.5622188256202755, + "learning_rate": 3.3622988505642826e-06, + "loss": 11.798, + "step": 33779 + }, + { + "epoch": 1.8394560045741357, + "grad_norm": 0.5629060755557711, + "learning_rate": 3.360031793843965e-06, + "loss": 11.7861, + "step": 33780 + }, + { + "epoch": 1.8395104585707187, + "grad_norm": 0.6039799015888979, + "learning_rate": 3.3577654886122524e-06, + "loss": 11.7994, + "step": 33781 + }, + { + "epoch": 1.8395649125673017, + "grad_norm": 0.5314482109570983, + "learning_rate": 3.3554999348867633e-06, + "loss": 11.7301, + "step": 33782 + }, + { + "epoch": 1.8396193665638847, + "grad_norm": 0.5141102963073315, + "learning_rate": 3.3532351326851174e-06, + "loss": 11.7489, + "step": 33783 + }, + { + "epoch": 1.839673820560468, + "grad_norm": 0.5368867589358306, + "learning_rate": 3.3509710820249228e-06, + "loss": 11.7278, + "step": 33784 + }, + { + "epoch": 1.839728274557051, + "grad_norm": 0.5577536904149286, + "learning_rate": 3.3487077829237655e-06, + "loss": 11.8324, + "step": 33785 + }, + { + "epoch": 1.839782728553634, + "grad_norm": 0.5442647554330254, + "learning_rate": 3.346445235399287e-06, + "loss": 11.7692, + "step": 33786 + }, + { + "epoch": 1.8398371825502169, + "grad_norm": 0.6038344235159461, + "learning_rate": 3.3441834394690507e-06, + "loss": 11.76, + "step": 33787 + }, + { + "epoch": 1.8398916365467999, + "grad_norm": 0.5063113196907719, + "learning_rate": 3.341922395150643e-06, + "loss": 11.8317, + "step": 33788 + }, + { + "epoch": 1.8399460905433829, + "grad_norm": 0.5978406466617064, + "learning_rate": 3.3396621024616714e-06, + "loss": 11.8848, + "step": 33789 + }, + { + "epoch": 1.8400005445399659, + "grad_norm": 0.5054431701294958, + "learning_rate": 3.337402561419689e-06, + "loss": 11.8459, + "step": 33790 + }, + { + "epoch": 1.8400549985365489, + "grad_norm": 0.5702263084054048, + "learning_rate": 3.3351437720422818e-06, + "loss": 11.7451, + "step": 33791 + }, + { + "epoch": 1.8401094525331319, + "grad_norm": 0.6097616800921006, + "learning_rate": 3.3328857343470023e-06, + "loss": 11.9097, + "step": 33792 + }, + { + "epoch": 1.8401639065297148, + "grad_norm": 0.5396892024496525, + "learning_rate": 3.330628448351414e-06, + "loss": 11.7652, + "step": 33793 + }, + { + "epoch": 1.8402183605262978, + "grad_norm": 0.6901732418284905, + "learning_rate": 3.328371914073081e-06, + "loss": 11.855, + "step": 33794 + }, + { + "epoch": 1.8402728145228808, + "grad_norm": 0.597373235272736, + "learning_rate": 3.3261161315295218e-06, + "loss": 11.8066, + "step": 33795 + }, + { + "epoch": 1.8403272685194638, + "grad_norm": 0.6055857919287467, + "learning_rate": 3.3238611007383124e-06, + "loss": 11.7263, + "step": 33796 + }, + { + "epoch": 1.8403817225160468, + "grad_norm": 0.5341443804936415, + "learning_rate": 3.3216068217169606e-06, + "loss": 11.8828, + "step": 33797 + }, + { + "epoch": 1.8404361765126298, + "grad_norm": 0.5368310268770802, + "learning_rate": 3.3193532944830185e-06, + "loss": 11.8119, + "step": 33798 + }, + { + "epoch": 1.8404906305092128, + "grad_norm": 0.5166187888250507, + "learning_rate": 3.317100519053984e-06, + "loss": 11.6499, + "step": 33799 + }, + { + "epoch": 1.8405450845057958, + "grad_norm": 0.5043569772314318, + "learning_rate": 3.314848495447387e-06, + "loss": 11.7501, + "step": 33800 + }, + { + "epoch": 1.840599538502379, + "grad_norm": 0.5723431816021484, + "learning_rate": 3.312597223680758e-06, + "loss": 11.8072, + "step": 33801 + }, + { + "epoch": 1.840653992498962, + "grad_norm": 0.5232662325831565, + "learning_rate": 3.310346703771583e-06, + "loss": 11.7075, + "step": 33802 + }, + { + "epoch": 1.840708446495545, + "grad_norm": 0.5849260386456648, + "learning_rate": 3.3080969357373703e-06, + "loss": 11.8087, + "step": 33803 + }, + { + "epoch": 1.840762900492128, + "grad_norm": 0.5810891250532745, + "learning_rate": 3.305847919595606e-06, + "loss": 11.6649, + "step": 33804 + }, + { + "epoch": 1.840817354488711, + "grad_norm": 0.5432501955593974, + "learning_rate": 3.3035996553637762e-06, + "loss": 11.7504, + "step": 33805 + }, + { + "epoch": 1.840871808485294, + "grad_norm": 0.5981020426794402, + "learning_rate": 3.3013521430593884e-06, + "loss": 11.736, + "step": 33806 + }, + { + "epoch": 1.8409262624818772, + "grad_norm": 0.540309161120785, + "learning_rate": 3.2991053826998853e-06, + "loss": 11.7747, + "step": 33807 + }, + { + "epoch": 1.8409807164784602, + "grad_norm": 0.5358183372556647, + "learning_rate": 3.2968593743027744e-06, + "loss": 11.8613, + "step": 33808 + }, + { + "epoch": 1.8410351704750432, + "grad_norm": 0.6148948096617408, + "learning_rate": 3.294614117885486e-06, + "loss": 11.7305, + "step": 33809 + }, + { + "epoch": 1.8410896244716262, + "grad_norm": 0.5049933350510138, + "learning_rate": 3.2923696134654957e-06, + "loss": 11.7738, + "step": 33810 + }, + { + "epoch": 1.8411440784682092, + "grad_norm": 0.5862344141498278, + "learning_rate": 3.290125861060267e-06, + "loss": 11.7892, + "step": 33811 + }, + { + "epoch": 1.8411985324647921, + "grad_norm": 0.5401173404633814, + "learning_rate": 3.287882860687219e-06, + "loss": 11.766, + "step": 33812 + }, + { + "epoch": 1.8412529864613751, + "grad_norm": 0.5384693365841544, + "learning_rate": 3.2856406123638496e-06, + "loss": 11.72, + "step": 33813 + }, + { + "epoch": 1.8413074404579581, + "grad_norm": 0.6024943728394331, + "learning_rate": 3.283399116107533e-06, + "loss": 11.8491, + "step": 33814 + }, + { + "epoch": 1.8413618944545411, + "grad_norm": 0.5547636411800385, + "learning_rate": 3.281158371935711e-06, + "loss": 11.8517, + "step": 33815 + }, + { + "epoch": 1.8414163484511241, + "grad_norm": 0.5204289649108008, + "learning_rate": 3.278918379865847e-06, + "loss": 11.7569, + "step": 33816 + }, + { + "epoch": 1.8414708024477071, + "grad_norm": 0.5472886969893096, + "learning_rate": 3.2766791399153175e-06, + "loss": 11.6995, + "step": 33817 + }, + { + "epoch": 1.84152525644429, + "grad_norm": 0.5206538728662238, + "learning_rate": 3.2744406521015627e-06, + "loss": 11.7028, + "step": 33818 + }, + { + "epoch": 1.841579710440873, + "grad_norm": 0.596221335886329, + "learning_rate": 3.272202916441969e-06, + "loss": 11.9266, + "step": 33819 + }, + { + "epoch": 1.841634164437456, + "grad_norm": 0.5045741931754221, + "learning_rate": 3.2699659329539557e-06, + "loss": 11.7785, + "step": 33820 + }, + { + "epoch": 1.841688618434039, + "grad_norm": 0.5872171316368902, + "learning_rate": 3.267729701654898e-06, + "loss": 11.7731, + "step": 33821 + }, + { + "epoch": 1.841743072430622, + "grad_norm": 0.5898172680419017, + "learning_rate": 3.265494222562193e-06, + "loss": 11.9146, + "step": 33822 + }, + { + "epoch": 1.841797526427205, + "grad_norm": 0.5707552746372736, + "learning_rate": 3.2632594956932603e-06, + "loss": 11.6587, + "step": 33823 + }, + { + "epoch": 1.8418519804237883, + "grad_norm": 0.52788861099732, + "learning_rate": 3.2610255210654082e-06, + "loss": 11.7568, + "step": 33824 + }, + { + "epoch": 1.8419064344203713, + "grad_norm": 0.5824919257319955, + "learning_rate": 3.2587922986960674e-06, + "loss": 11.8044, + "step": 33825 + }, + { + "epoch": 1.8419608884169543, + "grad_norm": 0.5163323818054658, + "learning_rate": 3.2565598286025566e-06, + "loss": 11.7038, + "step": 33826 + }, + { + "epoch": 1.8420153424135373, + "grad_norm": 0.5501110666957616, + "learning_rate": 3.2543281108022736e-06, + "loss": 11.6753, + "step": 33827 + }, + { + "epoch": 1.8420697964101203, + "grad_norm": 0.5496418342032785, + "learning_rate": 3.2520971453125715e-06, + "loss": 11.7697, + "step": 33828 + }, + { + "epoch": 1.8421242504067032, + "grad_norm": 0.5626608410252125, + "learning_rate": 3.2498669321507692e-06, + "loss": 11.8011, + "step": 33829 + }, + { + "epoch": 1.8421787044032865, + "grad_norm": 0.5368191917332996, + "learning_rate": 3.2476374713342304e-06, + "loss": 11.8879, + "step": 33830 + }, + { + "epoch": 1.8422331583998695, + "grad_norm": 0.5143398354378745, + "learning_rate": 3.2454087628802863e-06, + "loss": 11.7049, + "step": 33831 + }, + { + "epoch": 1.8422876123964524, + "grad_norm": 0.4836503017569531, + "learning_rate": 3.2431808068062786e-06, + "loss": 11.7691, + "step": 33832 + }, + { + "epoch": 1.8423420663930354, + "grad_norm": 0.542519706301802, + "learning_rate": 3.240953603129515e-06, + "loss": 11.8129, + "step": 33833 + }, + { + "epoch": 1.8423965203896184, + "grad_norm": 0.5948890937753298, + "learning_rate": 3.238727151867338e-06, + "loss": 11.9013, + "step": 33834 + }, + { + "epoch": 1.8424509743862014, + "grad_norm": 0.6205790799442409, + "learning_rate": 3.236501453037033e-06, + "loss": 11.7787, + "step": 33835 + }, + { + "epoch": 1.8425054283827844, + "grad_norm": 0.5399811529053937, + "learning_rate": 3.23427650665592e-06, + "loss": 11.7706, + "step": 33836 + }, + { + "epoch": 1.8425598823793674, + "grad_norm": 0.5837166679594895, + "learning_rate": 3.232052312741296e-06, + "loss": 11.8098, + "step": 33837 + }, + { + "epoch": 1.8426143363759504, + "grad_norm": 0.5234130122805322, + "learning_rate": 3.2298288713104695e-06, + "loss": 11.7363, + "step": 33838 + }, + { + "epoch": 1.8426687903725334, + "grad_norm": 0.5453969860580936, + "learning_rate": 3.227606182380716e-06, + "loss": 11.8542, + "step": 33839 + }, + { + "epoch": 1.8427232443691164, + "grad_norm": 0.5675145898980608, + "learning_rate": 3.2253842459693318e-06, + "loss": 11.7664, + "step": 33840 + }, + { + "epoch": 1.8427776983656994, + "grad_norm": 0.5845154079088531, + "learning_rate": 3.2231630620935814e-06, + "loss": 11.8287, + "step": 33841 + }, + { + "epoch": 1.8428321523622824, + "grad_norm": 0.5210611882694561, + "learning_rate": 3.2209426307707515e-06, + "loss": 11.5919, + "step": 33842 + }, + { + "epoch": 1.8428866063588654, + "grad_norm": 0.576083556110766, + "learning_rate": 3.218722952018094e-06, + "loss": 11.8641, + "step": 33843 + }, + { + "epoch": 1.8429410603554484, + "grad_norm": 0.506464158582543, + "learning_rate": 3.2165040258528844e-06, + "loss": 11.8904, + "step": 33844 + }, + { + "epoch": 1.8429955143520313, + "grad_norm": 0.5790154077641596, + "learning_rate": 3.2142858522923757e-06, + "loss": 11.6134, + "step": 33845 + }, + { + "epoch": 1.8430499683486143, + "grad_norm": 0.5140118067808708, + "learning_rate": 3.2120684313537985e-06, + "loss": 11.9575, + "step": 33846 + }, + { + "epoch": 1.8431044223451973, + "grad_norm": 0.5090120031802001, + "learning_rate": 3.2098517630544168e-06, + "loss": 11.7169, + "step": 33847 + }, + { + "epoch": 1.8431588763417805, + "grad_norm": 0.5135841133725867, + "learning_rate": 3.2076358474114498e-06, + "loss": 11.6483, + "step": 33848 + }, + { + "epoch": 1.8432133303383635, + "grad_norm": 0.5670370227024023, + "learning_rate": 3.2054206844421397e-06, + "loss": 11.8771, + "step": 33849 + }, + { + "epoch": 1.8432677843349465, + "grad_norm": 0.5503407544068725, + "learning_rate": 3.203206274163717e-06, + "loss": 11.8097, + "step": 33850 + }, + { + "epoch": 1.8433222383315295, + "grad_norm": 0.5307854885737221, + "learning_rate": 3.20099261659339e-06, + "loss": 11.7264, + "step": 33851 + }, + { + "epoch": 1.8433766923281125, + "grad_norm": 0.49715861760956076, + "learning_rate": 3.1987797117483786e-06, + "loss": 11.7581, + "step": 33852 + }, + { + "epoch": 1.8434311463246955, + "grad_norm": 0.5831303357984639, + "learning_rate": 3.196567559645891e-06, + "loss": 11.9091, + "step": 33853 + }, + { + "epoch": 1.8434856003212787, + "grad_norm": 0.5464664626966996, + "learning_rate": 3.194356160303136e-06, + "loss": 11.7487, + "step": 33854 + }, + { + "epoch": 1.8435400543178617, + "grad_norm": 0.5775957701878826, + "learning_rate": 3.192145513737299e-06, + "loss": 11.8228, + "step": 33855 + }, + { + "epoch": 1.8435945083144447, + "grad_norm": 0.619429479039788, + "learning_rate": 3.189935619965567e-06, + "loss": 11.7778, + "step": 33856 + }, + { + "epoch": 1.8436489623110277, + "grad_norm": 0.5248449823582793, + "learning_rate": 3.1877264790051377e-06, + "loss": 11.7079, + "step": 33857 + }, + { + "epoch": 1.8437034163076107, + "grad_norm": 0.5542833636643758, + "learning_rate": 3.185518090873174e-06, + "loss": 11.8628, + "step": 33858 + }, + { + "epoch": 1.8437578703041937, + "grad_norm": 0.5158652188160496, + "learning_rate": 3.183310455586852e-06, + "loss": 11.7808, + "step": 33859 + }, + { + "epoch": 1.8438123243007767, + "grad_norm": 0.544956279425335, + "learning_rate": 3.1811035731633576e-06, + "loss": 11.8081, + "step": 33860 + }, + { + "epoch": 1.8438667782973597, + "grad_norm": 0.6838284135909499, + "learning_rate": 3.1788974436198328e-06, + "loss": 11.8219, + "step": 33861 + }, + { + "epoch": 1.8439212322939427, + "grad_norm": 0.5159639415411428, + "learning_rate": 3.1766920669734414e-06, + "loss": 11.7425, + "step": 33862 + }, + { + "epoch": 1.8439756862905257, + "grad_norm": 0.5035792350524115, + "learning_rate": 3.1744874432413253e-06, + "loss": 11.7774, + "step": 33863 + }, + { + "epoch": 1.8440301402871087, + "grad_norm": 0.5575584635097174, + "learning_rate": 3.1722835724406374e-06, + "loss": 11.8778, + "step": 33864 + }, + { + "epoch": 1.8440845942836916, + "grad_norm": 0.521595897479169, + "learning_rate": 3.1700804545885087e-06, + "loss": 11.8565, + "step": 33865 + }, + { + "epoch": 1.8441390482802746, + "grad_norm": 0.5325072597234949, + "learning_rate": 3.16787808970207e-06, + "loss": 11.6101, + "step": 33866 + }, + { + "epoch": 1.8441935022768576, + "grad_norm": 0.526921449884404, + "learning_rate": 3.1656764777984625e-06, + "loss": 11.785, + "step": 33867 + }, + { + "epoch": 1.8442479562734406, + "grad_norm": 0.5604604463304292, + "learning_rate": 3.1634756188947736e-06, + "loss": 11.8091, + "step": 33868 + }, + { + "epoch": 1.8443024102700236, + "grad_norm": 0.5237033650127261, + "learning_rate": 3.161275513008155e-06, + "loss": 11.7208, + "step": 33869 + }, + { + "epoch": 1.8443568642666066, + "grad_norm": 0.6297718900730161, + "learning_rate": 3.159076160155683e-06, + "loss": 11.7882, + "step": 33870 + }, + { + "epoch": 1.8444113182631898, + "grad_norm": 0.5738234071997348, + "learning_rate": 3.1568775603544766e-06, + "loss": 11.8193, + "step": 33871 + }, + { + "epoch": 1.8444657722597728, + "grad_norm": 0.5641880449837706, + "learning_rate": 3.1546797136216443e-06, + "loss": 11.8338, + "step": 33872 + }, + { + "epoch": 1.8445202262563558, + "grad_norm": 0.5413760584223677, + "learning_rate": 3.1524826199742506e-06, + "loss": 11.6959, + "step": 33873 + }, + { + "epoch": 1.8445746802529388, + "grad_norm": 0.5616691097512236, + "learning_rate": 3.1502862794294152e-06, + "loss": 11.6755, + "step": 33874 + }, + { + "epoch": 1.8446291342495218, + "grad_norm": 0.6246207200975681, + "learning_rate": 3.1480906920041798e-06, + "loss": 11.8428, + "step": 33875 + }, + { + "epoch": 1.8446835882461048, + "grad_norm": 0.5401450050853716, + "learning_rate": 3.1458958577156195e-06, + "loss": 11.6716, + "step": 33876 + }, + { + "epoch": 1.844738042242688, + "grad_norm": 0.5416520671529061, + "learning_rate": 3.143701776580832e-06, + "loss": 11.7409, + "step": 33877 + }, + { + "epoch": 1.844792496239271, + "grad_norm": 0.49940488635913727, + "learning_rate": 3.141508448616859e-06, + "loss": 11.6558, + "step": 33878 + }, + { + "epoch": 1.844846950235854, + "grad_norm": 0.5851468047936453, + "learning_rate": 3.1393158738407645e-06, + "loss": 11.7389, + "step": 33879 + }, + { + "epoch": 1.844901404232437, + "grad_norm": 0.5432004950819189, + "learning_rate": 3.13712405226958e-06, + "loss": 11.9062, + "step": 33880 + }, + { + "epoch": 1.84495585822902, + "grad_norm": 0.5686325387673025, + "learning_rate": 3.1349329839203802e-06, + "loss": 11.8332, + "step": 33881 + }, + { + "epoch": 1.845010312225603, + "grad_norm": 0.5388373122055787, + "learning_rate": 3.1327426688101733e-06, + "loss": 11.6179, + "step": 33882 + }, + { + "epoch": 1.845064766222186, + "grad_norm": 0.519201245880715, + "learning_rate": 3.1305531069560025e-06, + "loss": 11.8168, + "step": 33883 + }, + { + "epoch": 1.845119220218769, + "grad_norm": 0.566674166251352, + "learning_rate": 3.1283642983749085e-06, + "loss": 11.7421, + "step": 33884 + }, + { + "epoch": 1.845173674215352, + "grad_norm": 0.5437773880958092, + "learning_rate": 3.1261762430838894e-06, + "loss": 11.8667, + "step": 33885 + }, + { + "epoch": 1.845228128211935, + "grad_norm": 0.5301400094827246, + "learning_rate": 3.1239889410999644e-06, + "loss": 11.7364, + "step": 33886 + }, + { + "epoch": 1.845282582208518, + "grad_norm": 0.5028840908671239, + "learning_rate": 3.121802392440165e-06, + "loss": 11.8648, + "step": 33887 + }, + { + "epoch": 1.845337036205101, + "grad_norm": 0.5392734574427442, + "learning_rate": 3.1196165971214553e-06, + "loss": 11.8577, + "step": 33888 + }, + { + "epoch": 1.845391490201684, + "grad_norm": 0.528793753122846, + "learning_rate": 3.1174315551608768e-06, + "loss": 11.8459, + "step": 33889 + }, + { + "epoch": 1.845445944198267, + "grad_norm": 0.5099363291544728, + "learning_rate": 3.1152472665753717e-06, + "loss": 11.8106, + "step": 33890 + }, + { + "epoch": 1.84550039819485, + "grad_norm": 0.5899545497919645, + "learning_rate": 3.113063731381971e-06, + "loss": 11.7961, + "step": 33891 + }, + { + "epoch": 1.845554852191433, + "grad_norm": 0.5196418330767255, + "learning_rate": 3.1108809495976275e-06, + "loss": 11.6222, + "step": 33892 + }, + { + "epoch": 1.8456093061880159, + "grad_norm": 0.5522045974661178, + "learning_rate": 3.108698921239317e-06, + "loss": 11.6702, + "step": 33893 + }, + { + "epoch": 1.845663760184599, + "grad_norm": 0.5347161207556804, + "learning_rate": 3.1065176463240364e-06, + "loss": 11.7245, + "step": 33894 + }, + { + "epoch": 1.845718214181182, + "grad_norm": 0.5985366129011804, + "learning_rate": 3.104337124868706e-06, + "loss": 11.9679, + "step": 33895 + }, + { + "epoch": 1.845772668177765, + "grad_norm": 0.5567806245290867, + "learning_rate": 3.1021573568903007e-06, + "loss": 11.8281, + "step": 33896 + }, + { + "epoch": 1.845827122174348, + "grad_norm": 0.4984262442091683, + "learning_rate": 3.0999783424057626e-06, + "loss": 11.6549, + "step": 33897 + }, + { + "epoch": 1.845881576170931, + "grad_norm": 0.5305676714975427, + "learning_rate": 3.0978000814320452e-06, + "loss": 11.8301, + "step": 33898 + }, + { + "epoch": 1.845936030167514, + "grad_norm": 0.5835378027254757, + "learning_rate": 3.09562257398609e-06, + "loss": 11.9314, + "step": 33899 + }, + { + "epoch": 1.8459904841640973, + "grad_norm": 0.538974559621088, + "learning_rate": 3.093445820084817e-06, + "loss": 11.8596, + "step": 33900 + }, + { + "epoch": 1.8460449381606803, + "grad_norm": 0.6174274770695499, + "learning_rate": 3.0912698197451796e-06, + "loss": 11.8054, + "step": 33901 + }, + { + "epoch": 1.8460993921572633, + "grad_norm": 0.4943718810476187, + "learning_rate": 3.0890945729840524e-06, + "loss": 11.7585, + "step": 33902 + }, + { + "epoch": 1.8461538461538463, + "grad_norm": 0.5133081439897508, + "learning_rate": 3.0869200798184004e-06, + "loss": 11.8115, + "step": 33903 + }, + { + "epoch": 1.8462083001504292, + "grad_norm": 0.57591531044245, + "learning_rate": 3.08474634026511e-06, + "loss": 11.8596, + "step": 33904 + }, + { + "epoch": 1.8462627541470122, + "grad_norm": 0.5128937353643369, + "learning_rate": 3.082573354341067e-06, + "loss": 11.854, + "step": 33905 + }, + { + "epoch": 1.8463172081435952, + "grad_norm": 0.5434425857331038, + "learning_rate": 3.0804011220632033e-06, + "loss": 11.7711, + "step": 33906 + }, + { + "epoch": 1.8463716621401782, + "grad_norm": 0.6067532377719885, + "learning_rate": 3.0782296434483825e-06, + "loss": 11.7875, + "step": 33907 + }, + { + "epoch": 1.8464261161367612, + "grad_norm": 0.5375928356579653, + "learning_rate": 3.0760589185135026e-06, + "loss": 11.8202, + "step": 33908 + }, + { + "epoch": 1.8464805701333442, + "grad_norm": 0.5511883926322493, + "learning_rate": 3.0738889472754493e-06, + "loss": 11.8206, + "step": 33909 + }, + { + "epoch": 1.8465350241299272, + "grad_norm": 0.5489226508382511, + "learning_rate": 3.071719729751077e-06, + "loss": 11.8662, + "step": 33910 + }, + { + "epoch": 1.8465894781265102, + "grad_norm": 0.48990277090348355, + "learning_rate": 3.0695512659572823e-06, + "loss": 11.7124, + "step": 33911 + }, + { + "epoch": 1.8466439321230932, + "grad_norm": 0.5500921390007175, + "learning_rate": 3.0673835559109075e-06, + "loss": 11.6711, + "step": 33912 + }, + { + "epoch": 1.8466983861196762, + "grad_norm": 0.53862484676813, + "learning_rate": 3.0652165996288174e-06, + "loss": 11.8484, + "step": 33913 + }, + { + "epoch": 1.8467528401162592, + "grad_norm": 0.5906898883728187, + "learning_rate": 3.0630503971278646e-06, + "loss": 11.8781, + "step": 33914 + }, + { + "epoch": 1.8468072941128422, + "grad_norm": 0.6049249207166347, + "learning_rate": 3.060884948424869e-06, + "loss": 11.8281, + "step": 33915 + }, + { + "epoch": 1.8468617481094252, + "grad_norm": 0.5412072270855967, + "learning_rate": 3.0587202535367065e-06, + "loss": 11.5532, + "step": 33916 + }, + { + "epoch": 1.8469162021060082, + "grad_norm": 0.512322497233974, + "learning_rate": 3.0565563124801745e-06, + "loss": 11.7961, + "step": 33917 + }, + { + "epoch": 1.8469706561025914, + "grad_norm": 0.5129580644807363, + "learning_rate": 3.054393125272137e-06, + "loss": 11.6918, + "step": 33918 + }, + { + "epoch": 1.8470251100991744, + "grad_norm": 0.5634917997525085, + "learning_rate": 3.052230691929381e-06, + "loss": 11.814, + "step": 33919 + }, + { + "epoch": 1.8470795640957574, + "grad_norm": 0.5260412748198389, + "learning_rate": 3.050069012468737e-06, + "loss": 11.7143, + "step": 33920 + }, + { + "epoch": 1.8471340180923403, + "grad_norm": 0.5614031038340227, + "learning_rate": 3.0479080869070253e-06, + "loss": 11.8278, + "step": 33921 + }, + { + "epoch": 1.8471884720889233, + "grad_norm": 0.5044524238585407, + "learning_rate": 3.0457479152610324e-06, + "loss": 11.7716, + "step": 33922 + }, + { + "epoch": 1.8472429260855066, + "grad_norm": 0.4937868742172702, + "learning_rate": 3.043588497547567e-06, + "loss": 11.7005, + "step": 33923 + }, + { + "epoch": 1.8472973800820895, + "grad_norm": 0.5206714157371767, + "learning_rate": 3.041429833783427e-06, + "loss": 11.7411, + "step": 33924 + }, + { + "epoch": 1.8473518340786725, + "grad_norm": 0.5995436217804876, + "learning_rate": 3.039271923985365e-06, + "loss": 11.7636, + "step": 33925 + }, + { + "epoch": 1.8474062880752555, + "grad_norm": 0.6395912019597665, + "learning_rate": 3.037114768170202e-06, + "loss": 11.7916, + "step": 33926 + }, + { + "epoch": 1.8474607420718385, + "grad_norm": 0.5854515514875622, + "learning_rate": 3.03495836635469e-06, + "loss": 11.8173, + "step": 33927 + }, + { + "epoch": 1.8475151960684215, + "grad_norm": 0.5693612498369074, + "learning_rate": 3.0328027185556052e-06, + "loss": 11.802, + "step": 33928 + }, + { + "epoch": 1.8475696500650045, + "grad_norm": 0.5205507286134287, + "learning_rate": 3.0306478247897008e-06, + "loss": 11.7203, + "step": 33929 + }, + { + "epoch": 1.8476241040615875, + "grad_norm": 0.5217218520518706, + "learning_rate": 3.028493685073752e-06, + "loss": 11.8448, + "step": 33930 + }, + { + "epoch": 1.8476785580581705, + "grad_norm": 0.5338324715648306, + "learning_rate": 3.0263402994244794e-06, + "loss": 11.682, + "step": 33931 + }, + { + "epoch": 1.8477330120547535, + "grad_norm": 0.5709604337486814, + "learning_rate": 3.0241876678586577e-06, + "loss": 11.8978, + "step": 33932 + }, + { + "epoch": 1.8477874660513365, + "grad_norm": 0.5203366203077877, + "learning_rate": 3.0220357903930297e-06, + "loss": 11.8205, + "step": 33933 + }, + { + "epoch": 1.8478419200479195, + "grad_norm": 0.5372597897618722, + "learning_rate": 3.0198846670443037e-06, + "loss": 11.8024, + "step": 33934 + }, + { + "epoch": 1.8478963740445025, + "grad_norm": 0.543726202110921, + "learning_rate": 3.017734297829211e-06, + "loss": 11.7345, + "step": 33935 + }, + { + "epoch": 1.8479508280410855, + "grad_norm": 0.5597512685153038, + "learning_rate": 3.015584682764494e-06, + "loss": 11.8128, + "step": 33936 + }, + { + "epoch": 1.8480052820376685, + "grad_norm": 0.6448671965687806, + "learning_rate": 3.013435821866839e-06, + "loss": 11.6734, + "step": 33937 + }, + { + "epoch": 1.8480597360342514, + "grad_norm": 0.5558892060875587, + "learning_rate": 3.0112877151529884e-06, + "loss": 11.7819, + "step": 33938 + }, + { + "epoch": 1.8481141900308344, + "grad_norm": 0.5321331313157706, + "learning_rate": 3.0091403626396177e-06, + "loss": 11.7569, + "step": 33939 + }, + { + "epoch": 1.8481686440274174, + "grad_norm": 0.6038253896564069, + "learning_rate": 3.0069937643434354e-06, + "loss": 11.7305, + "step": 33940 + }, + { + "epoch": 1.8482230980240006, + "grad_norm": 0.5640110249845319, + "learning_rate": 3.0048479202811398e-06, + "loss": 11.7231, + "step": 33941 + }, + { + "epoch": 1.8482775520205836, + "grad_norm": 0.5658637186911623, + "learning_rate": 3.002702830469406e-06, + "loss": 11.7794, + "step": 33942 + }, + { + "epoch": 1.8483320060171666, + "grad_norm": 0.6092022829773311, + "learning_rate": 3.000558494924932e-06, + "loss": 11.894, + "step": 33943 + }, + { + "epoch": 1.8483864600137496, + "grad_norm": 0.5247351580081084, + "learning_rate": 2.9984149136643823e-06, + "loss": 11.7366, + "step": 33944 + }, + { + "epoch": 1.8484409140103326, + "grad_norm": 0.5024333038604187, + "learning_rate": 2.9962720867044326e-06, + "loss": 11.7122, + "step": 33945 + }, + { + "epoch": 1.8484953680069156, + "grad_norm": 0.536140477341649, + "learning_rate": 2.9941300140617246e-06, + "loss": 11.7737, + "step": 33946 + }, + { + "epoch": 1.8485498220034988, + "grad_norm": 0.5158297429179122, + "learning_rate": 2.991988695752923e-06, + "loss": 11.6573, + "step": 33947 + }, + { + "epoch": 1.8486042760000818, + "grad_norm": 0.48013086023660484, + "learning_rate": 2.9898481317947036e-06, + "loss": 11.7769, + "step": 33948 + }, + { + "epoch": 1.8486587299966648, + "grad_norm": 0.569550321927307, + "learning_rate": 2.987708322203675e-06, + "loss": 11.7834, + "step": 33949 + }, + { + "epoch": 1.8487131839932478, + "grad_norm": 0.5193882620624866, + "learning_rate": 2.985569266996513e-06, + "loss": 11.8766, + "step": 33950 + }, + { + "epoch": 1.8487676379898308, + "grad_norm": 0.5274351535341895, + "learning_rate": 2.9834309661898264e-06, + "loss": 11.6403, + "step": 33951 + }, + { + "epoch": 1.8488220919864138, + "grad_norm": 0.5220389440061838, + "learning_rate": 2.9812934198002463e-06, + "loss": 11.7715, + "step": 33952 + }, + { + "epoch": 1.8488765459829968, + "grad_norm": 0.5481192506398179, + "learning_rate": 2.979156627844404e-06, + "loss": 11.7027, + "step": 33953 + }, + { + "epoch": 1.8489309999795798, + "grad_norm": 0.5675706386525854, + "learning_rate": 2.977020590338908e-06, + "loss": 11.8059, + "step": 33954 + }, + { + "epoch": 1.8489854539761628, + "grad_norm": 0.5724062075791675, + "learning_rate": 2.974885307300379e-06, + "loss": 11.7369, + "step": 33955 + }, + { + "epoch": 1.8490399079727458, + "grad_norm": 0.6104988207407752, + "learning_rate": 2.9727507787454035e-06, + "loss": 11.8075, + "step": 33956 + }, + { + "epoch": 1.8490943619693287, + "grad_norm": 0.5096959022771089, + "learning_rate": 2.970617004690579e-06, + "loss": 11.7302, + "step": 33957 + }, + { + "epoch": 1.8491488159659117, + "grad_norm": 0.5879912890201504, + "learning_rate": 2.968483985152526e-06, + "loss": 11.8342, + "step": 33958 + }, + { + "epoch": 1.8492032699624947, + "grad_norm": 0.5704977956925356, + "learning_rate": 2.9663517201478084e-06, + "loss": 11.8169, + "step": 33959 + }, + { + "epoch": 1.8492577239590777, + "grad_norm": 0.5562458096238031, + "learning_rate": 2.964220209693014e-06, + "loss": 11.6883, + "step": 33960 + }, + { + "epoch": 1.8493121779556607, + "grad_norm": 0.5260987530055947, + "learning_rate": 2.9620894538047175e-06, + "loss": 11.6639, + "step": 33961 + }, + { + "epoch": 1.8493666319522437, + "grad_norm": 0.538078585004661, + "learning_rate": 2.9599594524994834e-06, + "loss": 11.8035, + "step": 33962 + }, + { + "epoch": 1.8494210859488267, + "grad_norm": 0.5188453130396761, + "learning_rate": 2.9578302057938766e-06, + "loss": 11.5551, + "step": 33963 + }, + { + "epoch": 1.84947553994541, + "grad_norm": 0.5379328298430154, + "learning_rate": 2.9557017137044617e-06, + "loss": 11.7338, + "step": 33964 + }, + { + "epoch": 1.849529993941993, + "grad_norm": 0.5754936847753943, + "learning_rate": 2.953573976247781e-06, + "loss": 11.719, + "step": 33965 + }, + { + "epoch": 1.849584447938576, + "grad_norm": 0.5249416751920055, + "learning_rate": 2.9514469934403876e-06, + "loss": 11.6886, + "step": 33966 + }, + { + "epoch": 1.849638901935159, + "grad_norm": 0.5380695841891844, + "learning_rate": 2.949320765298813e-06, + "loss": 11.7186, + "step": 33967 + }, + { + "epoch": 1.849693355931742, + "grad_norm": 0.5157242662529193, + "learning_rate": 2.9471952918395883e-06, + "loss": 11.6475, + "step": 33968 + }, + { + "epoch": 1.8497478099283249, + "grad_norm": 0.599758824239076, + "learning_rate": 2.945070573079256e-06, + "loss": 11.8665, + "step": 33969 + }, + { + "epoch": 1.849802263924908, + "grad_norm": 0.54743572491124, + "learning_rate": 2.942946609034336e-06, + "loss": 11.7804, + "step": 33970 + }, + { + "epoch": 1.849856717921491, + "grad_norm": 0.5556212574903687, + "learning_rate": 2.940823399721326e-06, + "loss": 11.7047, + "step": 33971 + }, + { + "epoch": 1.849911171918074, + "grad_norm": 0.5043351902681977, + "learning_rate": 2.938700945156769e-06, + "loss": 11.8355, + "step": 33972 + }, + { + "epoch": 1.849965625914657, + "grad_norm": 0.5355679036669441, + "learning_rate": 2.9365792453571404e-06, + "loss": 11.8139, + "step": 33973 + }, + { + "epoch": 1.85002007991124, + "grad_norm": 0.5586999119186564, + "learning_rate": 2.934458300338949e-06, + "loss": 11.7065, + "step": 33974 + }, + { + "epoch": 1.850074533907823, + "grad_norm": 0.5536597163442772, + "learning_rate": 2.9323381101186933e-06, + "loss": 11.7894, + "step": 33975 + }, + { + "epoch": 1.850128987904406, + "grad_norm": 0.5836834594475118, + "learning_rate": 2.9302186747128478e-06, + "loss": 11.8819, + "step": 33976 + }, + { + "epoch": 1.850183441900989, + "grad_norm": 0.5914927375099355, + "learning_rate": 2.9280999941379005e-06, + "loss": 11.6999, + "step": 33977 + }, + { + "epoch": 1.850237895897572, + "grad_norm": 0.4948145150744575, + "learning_rate": 2.9259820684103267e-06, + "loss": 11.7372, + "step": 33978 + }, + { + "epoch": 1.850292349894155, + "grad_norm": 0.5262052599923139, + "learning_rate": 2.923864897546602e-06, + "loss": 11.7061, + "step": 33979 + }, + { + "epoch": 1.850346803890738, + "grad_norm": 0.6282740310847362, + "learning_rate": 2.9217484815631803e-06, + "loss": 11.8636, + "step": 33980 + }, + { + "epoch": 1.850401257887321, + "grad_norm": 0.508782728275568, + "learning_rate": 2.9196328204765145e-06, + "loss": 11.8079, + "step": 33981 + }, + { + "epoch": 1.850455711883904, + "grad_norm": 0.5279491061551861, + "learning_rate": 2.9175179143030697e-06, + "loss": 11.7281, + "step": 33982 + }, + { + "epoch": 1.850510165880487, + "grad_norm": 0.49618291423348704, + "learning_rate": 2.915403763059288e-06, + "loss": 11.7713, + "step": 33983 + }, + { + "epoch": 1.85056461987707, + "grad_norm": 0.5385394130196661, + "learning_rate": 2.913290366761612e-06, + "loss": 11.8213, + "step": 33984 + }, + { + "epoch": 1.850619073873653, + "grad_norm": 0.5479233683614301, + "learning_rate": 2.9111777254264728e-06, + "loss": 11.7341, + "step": 33985 + }, + { + "epoch": 1.850673527870236, + "grad_norm": 0.49936015882159296, + "learning_rate": 2.9090658390702907e-06, + "loss": 11.7457, + "step": 33986 + }, + { + "epoch": 1.850727981866819, + "grad_norm": 0.5808296453329583, + "learning_rate": 2.9069547077094972e-06, + "loss": 11.773, + "step": 33987 + }, + { + "epoch": 1.8507824358634022, + "grad_norm": 0.5293009174682723, + "learning_rate": 2.904844331360501e-06, + "loss": 11.8333, + "step": 33988 + }, + { + "epoch": 1.8508368898599852, + "grad_norm": 0.5841705642277881, + "learning_rate": 2.902734710039723e-06, + "loss": 11.8187, + "step": 33989 + }, + { + "epoch": 1.8508913438565682, + "grad_norm": 0.5649771813488085, + "learning_rate": 2.9006258437635605e-06, + "loss": 11.7901, + "step": 33990 + }, + { + "epoch": 1.8509457978531512, + "grad_norm": 0.5302723487249812, + "learning_rate": 2.8985177325484113e-06, + "loss": 11.7943, + "step": 33991 + }, + { + "epoch": 1.8510002518497342, + "grad_norm": 0.5866540247940978, + "learning_rate": 2.8964103764106855e-06, + "loss": 11.8323, + "step": 33992 + }, + { + "epoch": 1.8510547058463174, + "grad_norm": 0.5551975874233183, + "learning_rate": 2.894303775366736e-06, + "loss": 11.9083, + "step": 33993 + }, + { + "epoch": 1.8511091598429004, + "grad_norm": 0.5270400714667117, + "learning_rate": 2.8921979294329825e-06, + "loss": 11.8537, + "step": 33994 + }, + { + "epoch": 1.8511636138394834, + "grad_norm": 0.5135052772250565, + "learning_rate": 2.8900928386257906e-06, + "loss": 11.71, + "step": 33995 + }, + { + "epoch": 1.8512180678360663, + "grad_norm": 0.5771000790059908, + "learning_rate": 2.887988502961503e-06, + "loss": 11.7572, + "step": 33996 + }, + { + "epoch": 1.8512725218326493, + "grad_norm": 0.5483909976668098, + "learning_rate": 2.8858849224565164e-06, + "loss": 11.8167, + "step": 33997 + }, + { + "epoch": 1.8513269758292323, + "grad_norm": 0.5226034055040285, + "learning_rate": 2.8837820971271634e-06, + "loss": 11.7273, + "step": 33998 + }, + { + "epoch": 1.8513814298258153, + "grad_norm": 0.541963955985011, + "learning_rate": 2.881680026989808e-06, + "loss": 11.7224, + "step": 33999 + }, + { + "epoch": 1.8514358838223983, + "grad_norm": 0.5181706792953558, + "learning_rate": 2.879578712060793e-06, + "loss": 11.8442, + "step": 34000 + }, + { + "epoch": 1.8514903378189813, + "grad_norm": 0.5399078815227443, + "learning_rate": 2.877478152356472e-06, + "loss": 11.7581, + "step": 34001 + }, + { + "epoch": 1.8515447918155643, + "grad_norm": 0.5359867665277845, + "learning_rate": 2.8753783478931653e-06, + "loss": 11.7962, + "step": 34002 + }, + { + "epoch": 1.8515992458121473, + "grad_norm": 0.5435760006232293, + "learning_rate": 2.8732792986871925e-06, + "loss": 11.7628, + "step": 34003 + }, + { + "epoch": 1.8516536998087303, + "grad_norm": 0.555593989298099, + "learning_rate": 2.8711810047549082e-06, + "loss": 11.8577, + "step": 34004 + }, + { + "epoch": 1.8517081538053133, + "grad_norm": 0.5285585549972435, + "learning_rate": 2.8690834661125988e-06, + "loss": 11.7192, + "step": 34005 + }, + { + "epoch": 1.8517626078018963, + "grad_norm": 0.5503586250886979, + "learning_rate": 2.8669866827765844e-06, + "loss": 11.6285, + "step": 34006 + }, + { + "epoch": 1.8518170617984793, + "grad_norm": 0.5363522402626031, + "learning_rate": 2.864890654763175e-06, + "loss": 11.7311, + "step": 34007 + }, + { + "epoch": 1.8518715157950623, + "grad_norm": 0.5241239077671972, + "learning_rate": 2.8627953820886567e-06, + "loss": 11.5543, + "step": 34008 + }, + { + "epoch": 1.8519259697916453, + "grad_norm": 0.5527898725009992, + "learning_rate": 2.860700864769339e-06, + "loss": 11.9241, + "step": 34009 + }, + { + "epoch": 1.8519804237882282, + "grad_norm": 0.5655947024463986, + "learning_rate": 2.8586071028214976e-06, + "loss": 11.8699, + "step": 34010 + }, + { + "epoch": 1.8520348777848115, + "grad_norm": 0.5864008975548566, + "learning_rate": 2.85651409626142e-06, + "loss": 11.8148, + "step": 34011 + }, + { + "epoch": 1.8520893317813945, + "grad_norm": 0.5272738235541787, + "learning_rate": 2.8544218451053816e-06, + "loss": 11.7192, + "step": 34012 + }, + { + "epoch": 1.8521437857779774, + "grad_norm": 0.545272321828681, + "learning_rate": 2.8523303493696364e-06, + "loss": 11.8962, + "step": 34013 + }, + { + "epoch": 1.8521982397745604, + "grad_norm": 0.5912552431684458, + "learning_rate": 2.850239609070482e-06, + "loss": 11.7549, + "step": 34014 + }, + { + "epoch": 1.8522526937711434, + "grad_norm": 0.4986214073980574, + "learning_rate": 2.8481496242241502e-06, + "loss": 11.8233, + "step": 34015 + }, + { + "epoch": 1.8523071477677264, + "grad_norm": 0.5315990421146979, + "learning_rate": 2.8460603948469057e-06, + "loss": 11.805, + "step": 34016 + }, + { + "epoch": 1.8523616017643096, + "grad_norm": 0.5277944879919373, + "learning_rate": 2.8439719209549687e-06, + "loss": 11.8399, + "step": 34017 + }, + { + "epoch": 1.8524160557608926, + "grad_norm": 0.5307441067634918, + "learning_rate": 2.841884202564604e-06, + "loss": 11.8005, + "step": 34018 + }, + { + "epoch": 1.8524705097574756, + "grad_norm": 0.5361733699957829, + "learning_rate": 2.839797239692055e-06, + "loss": 11.7477, + "step": 34019 + }, + { + "epoch": 1.8525249637540586, + "grad_norm": 0.50796473172156, + "learning_rate": 2.8377110323535293e-06, + "loss": 11.3824, + "step": 34020 + }, + { + "epoch": 1.8525794177506416, + "grad_norm": 0.5777575613399936, + "learning_rate": 2.83562558056526e-06, + "loss": 11.7881, + "step": 34021 + }, + { + "epoch": 1.8526338717472246, + "grad_norm": 0.5001863028163757, + "learning_rate": 2.833540884343455e-06, + "loss": 11.6736, + "step": 34022 + }, + { + "epoch": 1.8526883257438076, + "grad_norm": 0.5182572141374138, + "learning_rate": 2.831456943704336e-06, + "loss": 11.8024, + "step": 34023 + }, + { + "epoch": 1.8527427797403906, + "grad_norm": 0.6047831763564729, + "learning_rate": 2.829373758664089e-06, + "loss": 11.8276, + "step": 34024 + }, + { + "epoch": 1.8527972337369736, + "grad_norm": 0.5913716437327828, + "learning_rate": 2.8272913292389457e-06, + "loss": 11.8893, + "step": 34025 + }, + { + "epoch": 1.8528516877335566, + "grad_norm": 0.5208796587409614, + "learning_rate": 2.8252096554450824e-06, + "loss": 11.7287, + "step": 34026 + }, + { + "epoch": 1.8529061417301396, + "grad_norm": 0.535002590497656, + "learning_rate": 2.8231287372986635e-06, + "loss": 11.6704, + "step": 34027 + }, + { + "epoch": 1.8529605957267226, + "grad_norm": 0.5129719394648435, + "learning_rate": 2.82104857481591e-06, + "loss": 11.6365, + "step": 34028 + }, + { + "epoch": 1.8530150497233056, + "grad_norm": 0.5661020644053443, + "learning_rate": 2.818969168012975e-06, + "loss": 11.6811, + "step": 34029 + }, + { + "epoch": 1.8530695037198885, + "grad_norm": 0.5854461566190753, + "learning_rate": 2.8168905169060233e-06, + "loss": 11.6504, + "step": 34030 + }, + { + "epoch": 1.8531239577164715, + "grad_norm": 0.5553709904444278, + "learning_rate": 2.8148126215112425e-06, + "loss": 11.7144, + "step": 34031 + }, + { + "epoch": 1.8531784117130545, + "grad_norm": 0.5810422576221066, + "learning_rate": 2.812735481844764e-06, + "loss": 11.8553, + "step": 34032 + }, + { + "epoch": 1.8532328657096375, + "grad_norm": 0.5812792611574698, + "learning_rate": 2.810659097922763e-06, + "loss": 11.8353, + "step": 34033 + }, + { + "epoch": 1.8532873197062207, + "grad_norm": 0.5702078686054134, + "learning_rate": 2.8085834697613722e-06, + "loss": 11.7031, + "step": 34034 + }, + { + "epoch": 1.8533417737028037, + "grad_norm": 0.5366748486364615, + "learning_rate": 2.8065085973767445e-06, + "loss": 11.8238, + "step": 34035 + }, + { + "epoch": 1.8533962276993867, + "grad_norm": 0.5374314304561377, + "learning_rate": 2.804434480785001e-06, + "loss": 11.8128, + "step": 34036 + }, + { + "epoch": 1.8534506816959697, + "grad_norm": 0.5532755574199162, + "learning_rate": 2.8023611200022616e-06, + "loss": 11.6945, + "step": 34037 + }, + { + "epoch": 1.8535051356925527, + "grad_norm": 0.5401358258063391, + "learning_rate": 2.80028851504468e-06, + "loss": 11.8425, + "step": 34038 + }, + { + "epoch": 1.8535595896891357, + "grad_norm": 0.5763647670588261, + "learning_rate": 2.798216665928333e-06, + "loss": 11.7155, + "step": 34039 + }, + { + "epoch": 1.853614043685719, + "grad_norm": 0.5766449647815882, + "learning_rate": 2.796145572669362e-06, + "loss": 11.7909, + "step": 34040 + }, + { + "epoch": 1.853668497682302, + "grad_norm": 0.5725141589151067, + "learning_rate": 2.7940752352838773e-06, + "loss": 11.8821, + "step": 34041 + }, + { + "epoch": 1.853722951678885, + "grad_norm": 0.6125597265303008, + "learning_rate": 2.7920056537879547e-06, + "loss": 11.8031, + "step": 34042 + }, + { + "epoch": 1.853777405675468, + "grad_norm": 0.5361428075674681, + "learning_rate": 2.7899368281977034e-06, + "loss": 11.7827, + "step": 34043 + }, + { + "epoch": 1.8538318596720509, + "grad_norm": 0.5198248211482072, + "learning_rate": 2.7878687585291995e-06, + "loss": 11.7022, + "step": 34044 + }, + { + "epoch": 1.8538863136686339, + "grad_norm": 0.5079659696727752, + "learning_rate": 2.7858014447985414e-06, + "loss": 11.6905, + "step": 34045 + }, + { + "epoch": 1.8539407676652169, + "grad_norm": 0.5385895893421295, + "learning_rate": 2.783734887021783e-06, + "loss": 11.7728, + "step": 34046 + }, + { + "epoch": 1.8539952216617999, + "grad_norm": 0.5560019379130211, + "learning_rate": 2.781669085215011e-06, + "loss": 11.7799, + "step": 34047 + }, + { + "epoch": 1.8540496756583829, + "grad_norm": 0.5357157989539586, + "learning_rate": 2.779604039394279e-06, + "loss": 11.7575, + "step": 34048 + }, + { + "epoch": 1.8541041296549658, + "grad_norm": 0.5371506830369353, + "learning_rate": 2.7775397495756527e-06, + "loss": 11.7611, + "step": 34049 + }, + { + "epoch": 1.8541585836515488, + "grad_norm": 0.5161986694474797, + "learning_rate": 2.7754762157751857e-06, + "loss": 11.6191, + "step": 34050 + }, + { + "epoch": 1.8542130376481318, + "grad_norm": 0.5605193402336714, + "learning_rate": 2.7734134380089093e-06, + "loss": 11.8143, + "step": 34051 + }, + { + "epoch": 1.8542674916447148, + "grad_norm": 0.564727523872469, + "learning_rate": 2.771351416292878e-06, + "loss": 11.8044, + "step": 34052 + }, + { + "epoch": 1.8543219456412978, + "grad_norm": 0.6114687241225171, + "learning_rate": 2.7692901506431334e-06, + "loss": 11.8897, + "step": 34053 + }, + { + "epoch": 1.8543763996378808, + "grad_norm": 0.5843686735740672, + "learning_rate": 2.767229641075675e-06, + "loss": 11.6551, + "step": 34054 + }, + { + "epoch": 1.8544308536344638, + "grad_norm": 0.5647573311623332, + "learning_rate": 2.765169887606567e-06, + "loss": 11.725, + "step": 34055 + }, + { + "epoch": 1.8544853076310468, + "grad_norm": 0.5802162322802504, + "learning_rate": 2.7631108902517964e-06, + "loss": 11.7947, + "step": 34056 + }, + { + "epoch": 1.85453976162763, + "grad_norm": 0.652859205557321, + "learning_rate": 2.7610526490273738e-06, + "loss": 11.7925, + "step": 34057 + }, + { + "epoch": 1.854594215624213, + "grad_norm": 0.5545669111004263, + "learning_rate": 2.7589951639493296e-06, + "loss": 11.7274, + "step": 34058 + }, + { + "epoch": 1.854648669620796, + "grad_norm": 0.5583813046794736, + "learning_rate": 2.7569384350336293e-06, + "loss": 11.7297, + "step": 34059 + }, + { + "epoch": 1.854703123617379, + "grad_norm": 0.5484538012243809, + "learning_rate": 2.754882462296293e-06, + "loss": 11.6868, + "step": 34060 + }, + { + "epoch": 1.854757577613962, + "grad_norm": 0.532367958735592, + "learning_rate": 2.7528272457532865e-06, + "loss": 11.7704, + "step": 34061 + }, + { + "epoch": 1.854812031610545, + "grad_norm": 0.47254148780927535, + "learning_rate": 2.7507727854206076e-06, + "loss": 11.7237, + "step": 34062 + }, + { + "epoch": 1.8548664856071282, + "grad_norm": 0.558732675017002, + "learning_rate": 2.748719081314244e-06, + "loss": 11.7849, + "step": 34063 + }, + { + "epoch": 1.8549209396037112, + "grad_norm": 0.5876116258967456, + "learning_rate": 2.7466661334501266e-06, + "loss": 11.9338, + "step": 34064 + }, + { + "epoch": 1.8549753936002942, + "grad_norm": 0.5205508541048451, + "learning_rate": 2.7446139418442763e-06, + "loss": 11.8928, + "step": 34065 + }, + { + "epoch": 1.8550298475968772, + "grad_norm": 0.5501540084999282, + "learning_rate": 2.7425625065125917e-06, + "loss": 11.7058, + "step": 34066 + }, + { + "epoch": 1.8550843015934602, + "grad_norm": 0.5275910836032256, + "learning_rate": 2.7405118274710482e-06, + "loss": 11.9045, + "step": 34067 + }, + { + "epoch": 1.8551387555900432, + "grad_norm": 0.515798708229121, + "learning_rate": 2.738461904735612e-06, + "loss": 11.6111, + "step": 34068 + }, + { + "epoch": 1.8551932095866261, + "grad_norm": 0.5428290845476124, + "learning_rate": 2.7364127383221914e-06, + "loss": 11.8164, + "step": 34069 + }, + { + "epoch": 1.8552476635832091, + "grad_norm": 0.6048802628482521, + "learning_rate": 2.7343643282467413e-06, + "loss": 11.7328, + "step": 34070 + }, + { + "epoch": 1.8553021175797921, + "grad_norm": 0.5607030518340594, + "learning_rate": 2.7323166745251814e-06, + "loss": 11.7448, + "step": 34071 + }, + { + "epoch": 1.8553565715763751, + "grad_norm": 0.4982570699389137, + "learning_rate": 2.7302697771734553e-06, + "loss": 11.7343, + "step": 34072 + }, + { + "epoch": 1.8554110255729581, + "grad_norm": 0.4921227757229111, + "learning_rate": 2.72822363620745e-06, + "loss": 11.6926, + "step": 34073 + }, + { + "epoch": 1.855465479569541, + "grad_norm": 0.5654418146983228, + "learning_rate": 2.7261782516430854e-06, + "loss": 11.5966, + "step": 34074 + }, + { + "epoch": 1.855519933566124, + "grad_norm": 0.7153251081472566, + "learning_rate": 2.7241336234962944e-06, + "loss": 11.7875, + "step": 34075 + }, + { + "epoch": 1.855574387562707, + "grad_norm": 0.49568571008554196, + "learning_rate": 2.7220897517829303e-06, + "loss": 11.7951, + "step": 34076 + }, + { + "epoch": 1.85562884155929, + "grad_norm": 0.5459610465500246, + "learning_rate": 2.720046636518925e-06, + "loss": 11.7583, + "step": 34077 + }, + { + "epoch": 1.855683295555873, + "grad_norm": 0.52765938688209, + "learning_rate": 2.718004277720143e-06, + "loss": 11.7692, + "step": 34078 + }, + { + "epoch": 1.855737749552456, + "grad_norm": 0.5497646074992447, + "learning_rate": 2.7159626754024615e-06, + "loss": 11.7232, + "step": 34079 + }, + { + "epoch": 1.855792203549039, + "grad_norm": 0.510688996790669, + "learning_rate": 2.713921829581789e-06, + "loss": 11.6517, + "step": 34080 + }, + { + "epoch": 1.8558466575456223, + "grad_norm": 0.4958188034929215, + "learning_rate": 2.7118817402739695e-06, + "loss": 11.7161, + "step": 34081 + }, + { + "epoch": 1.8559011115422053, + "grad_norm": 0.49682322626932107, + "learning_rate": 2.7098424074948782e-06, + "loss": 11.7348, + "step": 34082 + }, + { + "epoch": 1.8559555655387883, + "grad_norm": 0.5405829211041254, + "learning_rate": 2.707803831260347e-06, + "loss": 11.6823, + "step": 34083 + }, + { + "epoch": 1.8560100195353713, + "grad_norm": 0.5487267444143137, + "learning_rate": 2.7057660115862748e-06, + "loss": 11.717, + "step": 34084 + }, + { + "epoch": 1.8560644735319543, + "grad_norm": 0.5829987937994934, + "learning_rate": 2.703728948488471e-06, + "loss": 11.6097, + "step": 34085 + }, + { + "epoch": 1.8561189275285372, + "grad_norm": 0.5683107597925958, + "learning_rate": 2.7016926419827892e-06, + "loss": 11.5456, + "step": 34086 + }, + { + "epoch": 1.8561733815251205, + "grad_norm": 0.5247377964798963, + "learning_rate": 2.6996570920850615e-06, + "loss": 11.6565, + "step": 34087 + }, + { + "epoch": 1.8562278355217035, + "grad_norm": 0.5438516098119133, + "learning_rate": 2.697622298811109e-06, + "loss": 11.7745, + "step": 34088 + }, + { + "epoch": 1.8562822895182864, + "grad_norm": 0.5375365510810982, + "learning_rate": 2.6955882621767627e-06, + "loss": 11.7952, + "step": 34089 + }, + { + "epoch": 1.8563367435148694, + "grad_norm": 0.5472913557677537, + "learning_rate": 2.6935549821978545e-06, + "loss": 11.8205, + "step": 34090 + }, + { + "epoch": 1.8563911975114524, + "grad_norm": 0.5682658780083004, + "learning_rate": 2.6915224588901723e-06, + "loss": 11.8902, + "step": 34091 + }, + { + "epoch": 1.8564456515080354, + "grad_norm": 0.5987486409175985, + "learning_rate": 2.689490692269536e-06, + "loss": 11.7818, + "step": 34092 + }, + { + "epoch": 1.8565001055046184, + "grad_norm": 0.5172981863585994, + "learning_rate": 2.687459682351734e-06, + "loss": 11.7375, + "step": 34093 + }, + { + "epoch": 1.8565545595012014, + "grad_norm": 0.5655138168649811, + "learning_rate": 2.685429429152575e-06, + "loss": 11.6837, + "step": 34094 + }, + { + "epoch": 1.8566090134977844, + "grad_norm": 0.566723333506846, + "learning_rate": 2.6833999326878244e-06, + "loss": 11.8561, + "step": 34095 + }, + { + "epoch": 1.8566634674943674, + "grad_norm": 0.508191075909932, + "learning_rate": 2.681371192973281e-06, + "loss": 11.499, + "step": 34096 + }, + { + "epoch": 1.8567179214909504, + "grad_norm": 0.5461943753485724, + "learning_rate": 2.6793432100247094e-06, + "loss": 11.6532, + "step": 34097 + }, + { + "epoch": 1.8567723754875334, + "grad_norm": 0.5362114674594104, + "learning_rate": 2.6773159838578867e-06, + "loss": 11.7125, + "step": 34098 + }, + { + "epoch": 1.8568268294841164, + "grad_norm": 0.566289879168616, + "learning_rate": 2.675289514488588e-06, + "loss": 11.7581, + "step": 34099 + }, + { + "epoch": 1.8568812834806994, + "grad_norm": 0.5526048342528732, + "learning_rate": 2.673263801932546e-06, + "loss": 11.8534, + "step": 34100 + }, + { + "epoch": 1.8569357374772824, + "grad_norm": 0.5463096012916056, + "learning_rate": 2.671238846205526e-06, + "loss": 11.8857, + "step": 34101 + }, + { + "epoch": 1.8569901914738653, + "grad_norm": 0.5366637513217796, + "learning_rate": 2.6692146473232816e-06, + "loss": 11.8138, + "step": 34102 + }, + { + "epoch": 1.8570446454704483, + "grad_norm": 0.6373991308782505, + "learning_rate": 2.667191205301545e-06, + "loss": 11.8892, + "step": 34103 + }, + { + "epoch": 1.8570990994670316, + "grad_norm": 0.5667846566917036, + "learning_rate": 2.6651685201560583e-06, + "loss": 11.7853, + "step": 34104 + }, + { + "epoch": 1.8571535534636145, + "grad_norm": 0.5419256198203309, + "learning_rate": 2.6631465919025433e-06, + "loss": 11.6945, + "step": 34105 + }, + { + "epoch": 1.8572080074601975, + "grad_norm": 0.5776209773524162, + "learning_rate": 2.661125420556709e-06, + "loss": 11.7391, + "step": 34106 + }, + { + "epoch": 1.8572624614567805, + "grad_norm": 0.5877709735118014, + "learning_rate": 2.6591050061342994e-06, + "loss": 11.8664, + "step": 34107 + }, + { + "epoch": 1.8573169154533635, + "grad_norm": 0.5876932201534659, + "learning_rate": 2.6570853486510115e-06, + "loss": 11.7654, + "step": 34108 + }, + { + "epoch": 1.8573713694499465, + "grad_norm": 0.5421207030376842, + "learning_rate": 2.6550664481225564e-06, + "loss": 11.6486, + "step": 34109 + }, + { + "epoch": 1.8574258234465297, + "grad_norm": 0.5299217307624858, + "learning_rate": 2.6530483045646205e-06, + "loss": 11.6776, + "step": 34110 + }, + { + "epoch": 1.8574802774431127, + "grad_norm": 0.5534014212715822, + "learning_rate": 2.6510309179929025e-06, + "loss": 11.808, + "step": 34111 + }, + { + "epoch": 1.8575347314396957, + "grad_norm": 0.5490184703527806, + "learning_rate": 2.6490142884231016e-06, + "loss": 11.7646, + "step": 34112 + }, + { + "epoch": 1.8575891854362787, + "grad_norm": 0.5146089147151446, + "learning_rate": 2.6469984158708825e-06, + "loss": 11.8306, + "step": 34113 + }, + { + "epoch": 1.8576436394328617, + "grad_norm": 0.5469835749781836, + "learning_rate": 2.6449833003519444e-06, + "loss": 11.8555, + "step": 34114 + }, + { + "epoch": 1.8576980934294447, + "grad_norm": 0.5818258261115045, + "learning_rate": 2.6429689418819403e-06, + "loss": 11.7264, + "step": 34115 + }, + { + "epoch": 1.8577525474260277, + "grad_norm": 0.5738309420067346, + "learning_rate": 2.6409553404765365e-06, + "loss": 11.7786, + "step": 34116 + }, + { + "epoch": 1.8578070014226107, + "grad_norm": 0.5532584211809648, + "learning_rate": 2.6389424961513866e-06, + "loss": 11.7642, + "step": 34117 + }, + { + "epoch": 1.8578614554191937, + "grad_norm": 0.5342596315396536, + "learning_rate": 2.6369304089221447e-06, + "loss": 11.7343, + "step": 34118 + }, + { + "epoch": 1.8579159094157767, + "grad_norm": 0.6193558740509785, + "learning_rate": 2.6349190788044652e-06, + "loss": 11.7922, + "step": 34119 + }, + { + "epoch": 1.8579703634123597, + "grad_norm": 0.5782423910365604, + "learning_rate": 2.63290850581398e-06, + "loss": 11.7755, + "step": 34120 + }, + { + "epoch": 1.8580248174089427, + "grad_norm": 0.5145585207300134, + "learning_rate": 2.6308986899663323e-06, + "loss": 11.6257, + "step": 34121 + }, + { + "epoch": 1.8580792714055256, + "grad_norm": 0.5637859053969433, + "learning_rate": 2.6288896312771315e-06, + "loss": 11.871, + "step": 34122 + }, + { + "epoch": 1.8581337254021086, + "grad_norm": 0.4941110555645744, + "learning_rate": 2.626881329762021e-06, + "loss": 11.6426, + "step": 34123 + }, + { + "epoch": 1.8581881793986916, + "grad_norm": 0.5744780816570595, + "learning_rate": 2.62487378543661e-06, + "loss": 11.8303, + "step": 34124 + }, + { + "epoch": 1.8582426333952746, + "grad_norm": 0.598086808279827, + "learning_rate": 2.622866998316509e-06, + "loss": 11.6945, + "step": 34125 + }, + { + "epoch": 1.8582970873918576, + "grad_norm": 0.5973845623077279, + "learning_rate": 2.6208609684173493e-06, + "loss": 11.6114, + "step": 34126 + }, + { + "epoch": 1.8583515413884408, + "grad_norm": 0.5666796603852995, + "learning_rate": 2.618855695754674e-06, + "loss": 11.8497, + "step": 34127 + }, + { + "epoch": 1.8584059953850238, + "grad_norm": 0.5428249603268561, + "learning_rate": 2.6168511803441154e-06, + "loss": 11.62, + "step": 34128 + }, + { + "epoch": 1.8584604493816068, + "grad_norm": 0.5146541338848857, + "learning_rate": 2.6148474222012608e-06, + "loss": 11.7109, + "step": 34129 + }, + { + "epoch": 1.8585149033781898, + "grad_norm": 0.5506291667570146, + "learning_rate": 2.612844421341676e-06, + "loss": 11.8329, + "step": 34130 + }, + { + "epoch": 1.8585693573747728, + "grad_norm": 0.5131219078540313, + "learning_rate": 2.610842177780948e-06, + "loss": 11.8199, + "step": 34131 + }, + { + "epoch": 1.8586238113713558, + "grad_norm": 0.567544050071326, + "learning_rate": 2.608840691534642e-06, + "loss": 11.7734, + "step": 34132 + }, + { + "epoch": 1.858678265367939, + "grad_norm": 0.5105797380134522, + "learning_rate": 2.6068399626183236e-06, + "loss": 11.7357, + "step": 34133 + }, + { + "epoch": 1.858732719364522, + "grad_norm": 0.5662576253554312, + "learning_rate": 2.6048399910475473e-06, + "loss": 11.7998, + "step": 34134 + }, + { + "epoch": 1.858787173361105, + "grad_norm": 0.5163416056105855, + "learning_rate": 2.602840776837867e-06, + "loss": 11.7284, + "step": 34135 + }, + { + "epoch": 1.858841627357688, + "grad_norm": 0.536067522575811, + "learning_rate": 2.6008423200048483e-06, + "loss": 11.8133, + "step": 34136 + }, + { + "epoch": 1.858896081354271, + "grad_norm": 0.5424786276923304, + "learning_rate": 2.5988446205640004e-06, + "loss": 11.7822, + "step": 34137 + }, + { + "epoch": 1.858950535350854, + "grad_norm": 0.5252799566672407, + "learning_rate": 2.596847678530867e-06, + "loss": 11.6414, + "step": 34138 + }, + { + "epoch": 1.859004989347437, + "grad_norm": 0.5157648847759491, + "learning_rate": 2.5948514939209912e-06, + "loss": 11.6841, + "step": 34139 + }, + { + "epoch": 1.85905944334402, + "grad_norm": 0.5395751269361226, + "learning_rate": 2.5928560667498713e-06, + "loss": 11.7793, + "step": 34140 + }, + { + "epoch": 1.859113897340603, + "grad_norm": 0.5638058456493286, + "learning_rate": 2.590861397033051e-06, + "loss": 11.7167, + "step": 34141 + }, + { + "epoch": 1.859168351337186, + "grad_norm": 0.5847364246820539, + "learning_rate": 2.5888674847860175e-06, + "loss": 11.6694, + "step": 34142 + }, + { + "epoch": 1.859222805333769, + "grad_norm": 0.5103492098472038, + "learning_rate": 2.5868743300242916e-06, + "loss": 11.7623, + "step": 34143 + }, + { + "epoch": 1.859277259330352, + "grad_norm": 0.5412922834575296, + "learning_rate": 2.584881932763361e-06, + "loss": 11.7553, + "step": 34144 + }, + { + "epoch": 1.859331713326935, + "grad_norm": 0.5538789553502556, + "learning_rate": 2.582890293018725e-06, + "loss": 11.7341, + "step": 34145 + }, + { + "epoch": 1.859386167323518, + "grad_norm": 0.5653406075227116, + "learning_rate": 2.5808994108058925e-06, + "loss": 11.7556, + "step": 34146 + }, + { + "epoch": 1.859440621320101, + "grad_norm": 0.6198355925560457, + "learning_rate": 2.5789092861403076e-06, + "loss": 11.7568, + "step": 34147 + }, + { + "epoch": 1.859495075316684, + "grad_norm": 0.5829700476640527, + "learning_rate": 2.5769199190374683e-06, + "loss": 11.8438, + "step": 34148 + }, + { + "epoch": 1.859549529313267, + "grad_norm": 0.6709208864682427, + "learning_rate": 2.574931309512818e-06, + "loss": 11.7601, + "step": 34149 + }, + { + "epoch": 1.8596039833098499, + "grad_norm": 0.576368123322428, + "learning_rate": 2.5729434575818557e-06, + "loss": 11.7127, + "step": 34150 + }, + { + "epoch": 1.859658437306433, + "grad_norm": 0.5050539730265304, + "learning_rate": 2.570956363260024e-06, + "loss": 11.732, + "step": 34151 + }, + { + "epoch": 1.859712891303016, + "grad_norm": 0.6046256561023708, + "learning_rate": 2.5689700265627668e-06, + "loss": 11.8598, + "step": 34152 + }, + { + "epoch": 1.859767345299599, + "grad_norm": 0.5269572616654264, + "learning_rate": 2.5669844475055492e-06, + "loss": 11.6571, + "step": 34153 + }, + { + "epoch": 1.859821799296182, + "grad_norm": 0.603658806901523, + "learning_rate": 2.564999626103781e-06, + "loss": 11.8524, + "step": 34154 + }, + { + "epoch": 1.859876253292765, + "grad_norm": 0.5321904211696481, + "learning_rate": 2.563015562372939e-06, + "loss": 11.7855, + "step": 34155 + }, + { + "epoch": 1.8599307072893483, + "grad_norm": 0.5061557068441046, + "learning_rate": 2.5610322563284216e-06, + "loss": 11.8365, + "step": 34156 + }, + { + "epoch": 1.8599851612859313, + "grad_norm": 0.5852691174880951, + "learning_rate": 2.559049707985639e-06, + "loss": 11.7605, + "step": 34157 + }, + { + "epoch": 1.8600396152825143, + "grad_norm": 0.5499744580013146, + "learning_rate": 2.5570679173600564e-06, + "loss": 11.8352, + "step": 34158 + }, + { + "epoch": 1.8600940692790973, + "grad_norm": 0.5637411575606855, + "learning_rate": 2.5550868844670283e-06, + "loss": 11.775, + "step": 34159 + }, + { + "epoch": 1.8601485232756803, + "grad_norm": 0.5444245383974731, + "learning_rate": 2.5531066093219978e-06, + "loss": 11.6991, + "step": 34160 + }, + { + "epoch": 1.8602029772722632, + "grad_norm": 0.5262136559054372, + "learning_rate": 2.551127091940353e-06, + "loss": 11.7058, + "step": 34161 + }, + { + "epoch": 1.8602574312688462, + "grad_norm": 0.6000498145071027, + "learning_rate": 2.549148332337481e-06, + "loss": 11.7554, + "step": 34162 + }, + { + "epoch": 1.8603118852654292, + "grad_norm": 0.5196297525263693, + "learning_rate": 2.5471703305287807e-06, + "loss": 11.7863, + "step": 34163 + }, + { + "epoch": 1.8603663392620122, + "grad_norm": 0.5747606300573432, + "learning_rate": 2.5451930865296183e-06, + "loss": 11.728, + "step": 34164 + }, + { + "epoch": 1.8604207932585952, + "grad_norm": 0.5196508024223219, + "learning_rate": 2.5432166003553914e-06, + "loss": 11.71, + "step": 34165 + }, + { + "epoch": 1.8604752472551782, + "grad_norm": 0.5213264834311555, + "learning_rate": 2.5412408720214443e-06, + "loss": 11.8188, + "step": 34166 + }, + { + "epoch": 1.8605297012517612, + "grad_norm": 0.5298991715536169, + "learning_rate": 2.539265901543153e-06, + "loss": 11.8251, + "step": 34167 + }, + { + "epoch": 1.8605841552483442, + "grad_norm": 0.5340164841269298, + "learning_rate": 2.5372916889358835e-06, + "loss": 11.7761, + "step": 34168 + }, + { + "epoch": 1.8606386092449272, + "grad_norm": 0.5569992179735073, + "learning_rate": 2.535318234214967e-06, + "loss": 11.7586, + "step": 34169 + }, + { + "epoch": 1.8606930632415102, + "grad_norm": 0.5866682630661555, + "learning_rate": 2.5333455373957705e-06, + "loss": 11.8225, + "step": 34170 + }, + { + "epoch": 1.8607475172380932, + "grad_norm": 0.5428021841429663, + "learning_rate": 2.5313735984936137e-06, + "loss": 11.7837, + "step": 34171 + }, + { + "epoch": 1.8608019712346762, + "grad_norm": 0.5527744381721219, + "learning_rate": 2.529402417523841e-06, + "loss": 11.699, + "step": 34172 + }, + { + "epoch": 1.8608564252312592, + "grad_norm": 0.5296158164996029, + "learning_rate": 2.527431994501783e-06, + "loss": 11.7278, + "step": 34173 + }, + { + "epoch": 1.8609108792278424, + "grad_norm": 0.576018565837309, + "learning_rate": 2.5254623294427626e-06, + "loss": 11.8261, + "step": 34174 + }, + { + "epoch": 1.8609653332244254, + "grad_norm": 0.5740271902952466, + "learning_rate": 2.5234934223621e-06, + "loss": 11.7174, + "step": 34175 + }, + { + "epoch": 1.8610197872210084, + "grad_norm": 0.4834014411927382, + "learning_rate": 2.521525273275094e-06, + "loss": 11.8187, + "step": 34176 + }, + { + "epoch": 1.8610742412175914, + "grad_norm": 0.5667849462284822, + "learning_rate": 2.5195578821970545e-06, + "loss": 11.766, + "step": 34177 + }, + { + "epoch": 1.8611286952141743, + "grad_norm": 0.5830541716647009, + "learning_rate": 2.517591249143281e-06, + "loss": 11.7631, + "step": 34178 + }, + { + "epoch": 1.8611831492107573, + "grad_norm": 0.5263017125893658, + "learning_rate": 2.5156253741290603e-06, + "loss": 11.7173, + "step": 34179 + }, + { + "epoch": 1.8612376032073406, + "grad_norm": 0.5570995837286824, + "learning_rate": 2.513660257169692e-06, + "loss": 11.7829, + "step": 34180 + }, + { + "epoch": 1.8612920572039235, + "grad_norm": 0.543429935542524, + "learning_rate": 2.5116958982804418e-06, + "loss": 11.812, + "step": 34181 + }, + { + "epoch": 1.8613465112005065, + "grad_norm": 0.5425701658297523, + "learning_rate": 2.509732297476608e-06, + "loss": 11.9944, + "step": 34182 + }, + { + "epoch": 1.8614009651970895, + "grad_norm": 0.5524225480267694, + "learning_rate": 2.507769454773434e-06, + "loss": 11.8507, + "step": 34183 + }, + { + "epoch": 1.8614554191936725, + "grad_norm": 0.5906724294506779, + "learning_rate": 2.5058073701861864e-06, + "loss": 11.8047, + "step": 34184 + }, + { + "epoch": 1.8615098731902555, + "grad_norm": 0.5278422417604384, + "learning_rate": 2.50384604373014e-06, + "loss": 11.7586, + "step": 34185 + }, + { + "epoch": 1.8615643271868385, + "grad_norm": 0.5351336110155367, + "learning_rate": 2.501885475420551e-06, + "loss": 11.7011, + "step": 34186 + }, + { + "epoch": 1.8616187811834215, + "grad_norm": 0.5940612577929096, + "learning_rate": 2.499925665272629e-06, + "loss": 11.7994, + "step": 34187 + }, + { + "epoch": 1.8616732351800045, + "grad_norm": 0.5494946382919955, + "learning_rate": 2.49796661330165e-06, + "loss": 11.827, + "step": 34188 + }, + { + "epoch": 1.8617276891765875, + "grad_norm": 0.5304235029645848, + "learning_rate": 2.4960083195228244e-06, + "loss": 11.8589, + "step": 34189 + }, + { + "epoch": 1.8617821431731705, + "grad_norm": 0.5175006356071478, + "learning_rate": 2.494050783951396e-06, + "loss": 11.7035, + "step": 34190 + }, + { + "epoch": 1.8618365971697535, + "grad_norm": 0.5503796944672893, + "learning_rate": 2.4920940066025635e-06, + "loss": 11.8346, + "step": 34191 + }, + { + "epoch": 1.8618910511663365, + "grad_norm": 0.5345422841907969, + "learning_rate": 2.490137987491581e-06, + "loss": 11.7668, + "step": 34192 + }, + { + "epoch": 1.8619455051629195, + "grad_norm": 0.5448337000854672, + "learning_rate": 2.4881827266336255e-06, + "loss": 11.8119, + "step": 34193 + }, + { + "epoch": 1.8619999591595024, + "grad_norm": 0.5575270582841075, + "learning_rate": 2.486228224043918e-06, + "loss": 11.9021, + "step": 34194 + }, + { + "epoch": 1.8620544131560854, + "grad_norm": 0.525244067361692, + "learning_rate": 2.484274479737658e-06, + "loss": 11.6603, + "step": 34195 + }, + { + "epoch": 1.8621088671526684, + "grad_norm": 0.5677293472556686, + "learning_rate": 2.482321493730033e-06, + "loss": 11.7894, + "step": 34196 + }, + { + "epoch": 1.8621633211492516, + "grad_norm": 0.5553831084579393, + "learning_rate": 2.480369266036231e-06, + "loss": 11.6958, + "step": 34197 + }, + { + "epoch": 1.8622177751458346, + "grad_norm": 0.5331850581872086, + "learning_rate": 2.4784177966714172e-06, + "loss": 11.9201, + "step": 34198 + }, + { + "epoch": 1.8622722291424176, + "grad_norm": 0.5978388936682295, + "learning_rate": 2.476467085650791e-06, + "loss": 11.8646, + "step": 34199 + }, + { + "epoch": 1.8623266831390006, + "grad_norm": 0.5316424538573272, + "learning_rate": 2.4745171329895065e-06, + "loss": 11.7897, + "step": 34200 + }, + { + "epoch": 1.8623811371355836, + "grad_norm": 0.5333208931118412, + "learning_rate": 2.47256793870273e-06, + "loss": 11.8553, + "step": 34201 + }, + { + "epoch": 1.8624355911321666, + "grad_norm": 0.5304161429780652, + "learning_rate": 2.470619502805638e-06, + "loss": 11.6802, + "step": 34202 + }, + { + "epoch": 1.8624900451287498, + "grad_norm": 0.5419035587078287, + "learning_rate": 2.46867182531334e-06, + "loss": 11.8046, + "step": 34203 + }, + { + "epoch": 1.8625444991253328, + "grad_norm": 0.5318397569660418, + "learning_rate": 2.466724906241025e-06, + "loss": 11.8833, + "step": 34204 + }, + { + "epoch": 1.8625989531219158, + "grad_norm": 0.5124298265960896, + "learning_rate": 2.464778745603802e-06, + "loss": 11.6758, + "step": 34205 + }, + { + "epoch": 1.8626534071184988, + "grad_norm": 0.5815998586177793, + "learning_rate": 2.462833343416826e-06, + "loss": 11.9073, + "step": 34206 + }, + { + "epoch": 1.8627078611150818, + "grad_norm": 0.5996786164517552, + "learning_rate": 2.4608886996952186e-06, + "loss": 11.8514, + "step": 34207 + }, + { + "epoch": 1.8627623151116648, + "grad_norm": 0.5399291136870131, + "learning_rate": 2.4589448144540783e-06, + "loss": 11.8548, + "step": 34208 + }, + { + "epoch": 1.8628167691082478, + "grad_norm": 0.585185179304002, + "learning_rate": 2.457001687708549e-06, + "loss": 11.6782, + "step": 34209 + }, + { + "epoch": 1.8628712231048308, + "grad_norm": 0.5246050621343618, + "learning_rate": 2.4550593194737404e-06, + "loss": 11.7869, + "step": 34210 + }, + { + "epoch": 1.8629256771014138, + "grad_norm": 0.5029747833277742, + "learning_rate": 2.4531177097647408e-06, + "loss": 11.8503, + "step": 34211 + }, + { + "epoch": 1.8629801310979968, + "grad_norm": 0.6242831355913958, + "learning_rate": 2.45117685859666e-06, + "loss": 11.8906, + "step": 34212 + }, + { + "epoch": 1.8630345850945798, + "grad_norm": 0.5425892746415845, + "learning_rate": 2.449236765984586e-06, + "loss": 11.8247, + "step": 34213 + }, + { + "epoch": 1.8630890390911627, + "grad_norm": 0.5155520131370054, + "learning_rate": 2.447297431943607e-06, + "loss": 11.8579, + "step": 34214 + }, + { + "epoch": 1.8631434930877457, + "grad_norm": 0.5262307966690408, + "learning_rate": 2.4453588564887996e-06, + "loss": 11.6946, + "step": 34215 + }, + { + "epoch": 1.8631979470843287, + "grad_norm": 0.5198146549994429, + "learning_rate": 2.443421039635252e-06, + "loss": 11.7159, + "step": 34216 + }, + { + "epoch": 1.8632524010809117, + "grad_norm": 0.5144561516247748, + "learning_rate": 2.4414839813980183e-06, + "loss": 11.7513, + "step": 34217 + }, + { + "epoch": 1.8633068550774947, + "grad_norm": 0.5160035491790321, + "learning_rate": 2.439547681792154e-06, + "loss": 11.7505, + "step": 34218 + }, + { + "epoch": 1.8633613090740777, + "grad_norm": 0.4955868170545605, + "learning_rate": 2.4376121408327458e-06, + "loss": 11.7747, + "step": 34219 + }, + { + "epoch": 1.8634157630706607, + "grad_norm": 0.5103738312816297, + "learning_rate": 2.435677358534816e-06, + "loss": 11.6458, + "step": 34220 + }, + { + "epoch": 1.863470217067244, + "grad_norm": 0.5371214183596122, + "learning_rate": 2.4337433349134077e-06, + "loss": 11.5142, + "step": 34221 + }, + { + "epoch": 1.863524671063827, + "grad_norm": 0.570535075632023, + "learning_rate": 2.4318100699835977e-06, + "loss": 11.8633, + "step": 34222 + }, + { + "epoch": 1.86357912506041, + "grad_norm": 0.5840620998834162, + "learning_rate": 2.4298775637603745e-06, + "loss": 11.7923, + "step": 34223 + }, + { + "epoch": 1.863633579056993, + "grad_norm": 0.5727108335166309, + "learning_rate": 2.427945816258803e-06, + "loss": 11.8758, + "step": 34224 + }, + { + "epoch": 1.8636880330535759, + "grad_norm": 0.5794853517027942, + "learning_rate": 2.426014827493872e-06, + "loss": 11.7628, + "step": 34225 + }, + { + "epoch": 1.863742487050159, + "grad_norm": 0.5708225316638832, + "learning_rate": 2.424084597480636e-06, + "loss": 11.7865, + "step": 34226 + }, + { + "epoch": 1.863796941046742, + "grad_norm": 0.6114812305209636, + "learning_rate": 2.4221551262340716e-06, + "loss": 11.8621, + "step": 34227 + }, + { + "epoch": 1.863851395043325, + "grad_norm": 0.5540456319701818, + "learning_rate": 2.4202264137691776e-06, + "loss": 11.6821, + "step": 34228 + }, + { + "epoch": 1.863905849039908, + "grad_norm": 0.5730136453861171, + "learning_rate": 2.418298460100987e-06, + "loss": 11.814, + "step": 34229 + }, + { + "epoch": 1.863960303036491, + "grad_norm": 0.523589859419443, + "learning_rate": 2.4163712652444547e-06, + "loss": 11.6111, + "step": 34230 + }, + { + "epoch": 1.864014757033074, + "grad_norm": 0.6112924156925106, + "learning_rate": 2.4144448292146017e-06, + "loss": 11.7233, + "step": 34231 + }, + { + "epoch": 1.864069211029657, + "grad_norm": 0.5374203790311808, + "learning_rate": 2.4125191520263713e-06, + "loss": 11.7915, + "step": 34232 + }, + { + "epoch": 1.86412366502624, + "grad_norm": 0.5362833093421417, + "learning_rate": 2.410594233694763e-06, + "loss": 11.7705, + "step": 34233 + }, + { + "epoch": 1.864178119022823, + "grad_norm": 0.5197452888719586, + "learning_rate": 2.4086700742347535e-06, + "loss": 11.7186, + "step": 34234 + }, + { + "epoch": 1.864232573019406, + "grad_norm": 0.5190742130390831, + "learning_rate": 2.406746673661275e-06, + "loss": 11.5895, + "step": 34235 + }, + { + "epoch": 1.864287027015989, + "grad_norm": 0.6293038113589222, + "learning_rate": 2.4048240319893055e-06, + "loss": 11.8104, + "step": 34236 + }, + { + "epoch": 1.864341481012572, + "grad_norm": 0.5279810593944538, + "learning_rate": 2.402902149233799e-06, + "loss": 11.6538, + "step": 34237 + }, + { + "epoch": 1.864395935009155, + "grad_norm": 0.527408394474537, + "learning_rate": 2.4009810254096875e-06, + "loss": 11.8652, + "step": 34238 + }, + { + "epoch": 1.864450389005738, + "grad_norm": 0.5098192001087642, + "learning_rate": 2.3990606605319156e-06, + "loss": 11.6551, + "step": 34239 + }, + { + "epoch": 1.864504843002321, + "grad_norm": 0.5070566259114763, + "learning_rate": 2.397141054615415e-06, + "loss": 11.735, + "step": 34240 + }, + { + "epoch": 1.864559296998904, + "grad_norm": 0.49966074639794694, + "learning_rate": 2.3952222076751184e-06, + "loss": 11.7903, + "step": 34241 + }, + { + "epoch": 1.864613750995487, + "grad_norm": 0.5308729556639347, + "learning_rate": 2.3933041197259253e-06, + "loss": 11.7152, + "step": 34242 + }, + { + "epoch": 1.86466820499207, + "grad_norm": 0.5165496154658284, + "learning_rate": 2.391386790782779e-06, + "loss": 11.671, + "step": 34243 + }, + { + "epoch": 1.8647226589886532, + "grad_norm": 0.6427412163321673, + "learning_rate": 2.389470220860579e-06, + "loss": 11.8475, + "step": 34244 + }, + { + "epoch": 1.8647771129852362, + "grad_norm": 0.5266160782859761, + "learning_rate": 2.387554409974224e-06, + "loss": 11.7488, + "step": 34245 + }, + { + "epoch": 1.8648315669818192, + "grad_norm": 0.5867827464115407, + "learning_rate": 2.385639358138636e-06, + "loss": 11.9609, + "step": 34246 + }, + { + "epoch": 1.8648860209784022, + "grad_norm": 0.5885272033914, + "learning_rate": 2.3837250653686583e-06, + "loss": 11.8016, + "step": 34247 + }, + { + "epoch": 1.8649404749749852, + "grad_norm": 0.5648847339188333, + "learning_rate": 2.3818115316792122e-06, + "loss": 11.7979, + "step": 34248 + }, + { + "epoch": 1.8649949289715682, + "grad_norm": 0.6350035996609225, + "learning_rate": 2.379898757085175e-06, + "loss": 11.8398, + "step": 34249 + }, + { + "epoch": 1.8650493829681514, + "grad_norm": 0.524575375399674, + "learning_rate": 2.3779867416014125e-06, + "loss": 11.899, + "step": 34250 + }, + { + "epoch": 1.8651038369647344, + "grad_norm": 0.5214220943150758, + "learning_rate": 2.3760754852428015e-06, + "loss": 11.8004, + "step": 34251 + }, + { + "epoch": 1.8651582909613174, + "grad_norm": 0.5475231764212184, + "learning_rate": 2.374164988024197e-06, + "loss": 11.7604, + "step": 34252 + }, + { + "epoch": 1.8652127449579003, + "grad_norm": 0.5265529373533434, + "learning_rate": 2.3722552499604645e-06, + "loss": 11.6525, + "step": 34253 + }, + { + "epoch": 1.8652671989544833, + "grad_norm": 0.5207911192734828, + "learning_rate": 2.370346271066426e-06, + "loss": 11.7003, + "step": 34254 + }, + { + "epoch": 1.8653216529510663, + "grad_norm": 0.6038112220169792, + "learning_rate": 2.368438051356958e-06, + "loss": 11.872, + "step": 34255 + }, + { + "epoch": 1.8653761069476493, + "grad_norm": 0.5391677969810922, + "learning_rate": 2.366530590846905e-06, + "loss": 11.6137, + "step": 34256 + }, + { + "epoch": 1.8654305609442323, + "grad_norm": 0.5426360851693741, + "learning_rate": 2.364623889551065e-06, + "loss": 11.8784, + "step": 34257 + }, + { + "epoch": 1.8654850149408153, + "grad_norm": 0.5505850702167205, + "learning_rate": 2.3627179474842833e-06, + "loss": 11.7918, + "step": 34258 + }, + { + "epoch": 1.8655394689373983, + "grad_norm": 0.5028812290055151, + "learning_rate": 2.360812764661391e-06, + "loss": 11.6313, + "step": 34259 + }, + { + "epoch": 1.8655939229339813, + "grad_norm": 0.5912852801999976, + "learning_rate": 2.3589083410971768e-06, + "loss": 11.7307, + "step": 34260 + }, + { + "epoch": 1.8656483769305643, + "grad_norm": 0.5048018768131038, + "learning_rate": 2.3570046768064847e-06, + "loss": 11.8072, + "step": 34261 + }, + { + "epoch": 1.8657028309271473, + "grad_norm": 0.5317571589028683, + "learning_rate": 2.35510177180408e-06, + "loss": 11.6593, + "step": 34262 + }, + { + "epoch": 1.8657572849237303, + "grad_norm": 0.4986764309703747, + "learning_rate": 2.353199626104796e-06, + "loss": 11.7164, + "step": 34263 + }, + { + "epoch": 1.8658117389203133, + "grad_norm": 0.5505031809051203, + "learning_rate": 2.3512982397233987e-06, + "loss": 11.7329, + "step": 34264 + }, + { + "epoch": 1.8658661929168963, + "grad_norm": 0.5392223950234498, + "learning_rate": 2.3493976126746754e-06, + "loss": 11.7521, + "step": 34265 + }, + { + "epoch": 1.8659206469134793, + "grad_norm": 0.5083800362848726, + "learning_rate": 2.347497744973437e-06, + "loss": 11.8374, + "step": 34266 + }, + { + "epoch": 1.8659751009100625, + "grad_norm": 0.5482753870093451, + "learning_rate": 2.345598636634405e-06, + "loss": 11.7542, + "step": 34267 + }, + { + "epoch": 1.8660295549066455, + "grad_norm": 0.580260195311508, + "learning_rate": 2.34370028767239e-06, + "loss": 11.9381, + "step": 34268 + }, + { + "epoch": 1.8660840089032285, + "grad_norm": 0.5747469685504321, + "learning_rate": 2.3418026981021357e-06, + "loss": 11.7568, + "step": 34269 + }, + { + "epoch": 1.8661384628998114, + "grad_norm": 0.5508804485965543, + "learning_rate": 2.3399058679383855e-06, + "loss": 11.816, + "step": 34270 + }, + { + "epoch": 1.8661929168963944, + "grad_norm": 0.5502687580181147, + "learning_rate": 2.338009797195928e-06, + "loss": 11.8374, + "step": 34271 + }, + { + "epoch": 1.8662473708929774, + "grad_norm": 0.5356402709789219, + "learning_rate": 2.3361144858894734e-06, + "loss": 11.7471, + "step": 34272 + }, + { + "epoch": 1.8663018248895606, + "grad_norm": 0.7388047894645445, + "learning_rate": 2.334219934033777e-06, + "loss": 11.7528, + "step": 34273 + }, + { + "epoch": 1.8663562788861436, + "grad_norm": 0.5753619368989512, + "learning_rate": 2.332326141643548e-06, + "loss": 11.817, + "step": 34274 + }, + { + "epoch": 1.8664107328827266, + "grad_norm": 0.7071034474573465, + "learning_rate": 2.3304331087335542e-06, + "loss": 11.8966, + "step": 34275 + }, + { + "epoch": 1.8664651868793096, + "grad_norm": 0.5041905253307493, + "learning_rate": 2.328540835318471e-06, + "loss": 11.6612, + "step": 34276 + }, + { + "epoch": 1.8665196408758926, + "grad_norm": 0.5531223309782023, + "learning_rate": 2.3266493214130437e-06, + "loss": 11.7908, + "step": 34277 + }, + { + "epoch": 1.8665740948724756, + "grad_norm": 0.5583570288124222, + "learning_rate": 2.3247585670319703e-06, + "loss": 11.788, + "step": 34278 + }, + { + "epoch": 1.8666285488690586, + "grad_norm": 0.5290098880893309, + "learning_rate": 2.322868572189951e-06, + "loss": 11.6824, + "step": 34279 + }, + { + "epoch": 1.8666830028656416, + "grad_norm": 0.5187326875973255, + "learning_rate": 2.3209793369016965e-06, + "loss": 11.7978, + "step": 34280 + }, + { + "epoch": 1.8667374568622246, + "grad_norm": 0.5672294546787048, + "learning_rate": 2.3190908611818717e-06, + "loss": 11.8205, + "step": 34281 + }, + { + "epoch": 1.8667919108588076, + "grad_norm": 0.4957828950353944, + "learning_rate": 2.3172031450451883e-06, + "loss": 11.5603, + "step": 34282 + }, + { + "epoch": 1.8668463648553906, + "grad_norm": 0.5370713289251441, + "learning_rate": 2.3153161885063113e-06, + "loss": 11.702, + "step": 34283 + }, + { + "epoch": 1.8669008188519736, + "grad_norm": 0.5764498160337324, + "learning_rate": 2.3134299915799184e-06, + "loss": 11.6951, + "step": 34284 + }, + { + "epoch": 1.8669552728485566, + "grad_norm": 0.5379404055186005, + "learning_rate": 2.3115445542806757e-06, + "loss": 11.7268, + "step": 34285 + }, + { + "epoch": 1.8670097268451396, + "grad_norm": 0.5687889360300659, + "learning_rate": 2.309659876623238e-06, + "loss": 11.808, + "step": 34286 + }, + { + "epoch": 1.8670641808417225, + "grad_norm": 0.545817895197542, + "learning_rate": 2.3077759586222825e-06, + "loss": 11.6021, + "step": 34287 + }, + { + "epoch": 1.8671186348383055, + "grad_norm": 0.5231374223122174, + "learning_rate": 2.305892800292442e-06, + "loss": 11.7889, + "step": 34288 + }, + { + "epoch": 1.8671730888348885, + "grad_norm": 0.5538354207890446, + "learning_rate": 2.304010401648349e-06, + "loss": 11.7504, + "step": 34289 + }, + { + "epoch": 1.8672275428314717, + "grad_norm": 0.5949262545531886, + "learning_rate": 2.3021287627046695e-06, + "loss": 11.7684, + "step": 34290 + }, + { + "epoch": 1.8672819968280547, + "grad_norm": 0.5877613945347738, + "learning_rate": 2.3002478834760035e-06, + "loss": 11.9002, + "step": 34291 + }, + { + "epoch": 1.8673364508246377, + "grad_norm": 0.5546132023592178, + "learning_rate": 2.298367763976994e-06, + "loss": 11.8494, + "step": 34292 + }, + { + "epoch": 1.8673909048212207, + "grad_norm": 0.5340674487277636, + "learning_rate": 2.296488404222286e-06, + "loss": 11.8007, + "step": 34293 + }, + { + "epoch": 1.8674453588178037, + "grad_norm": 0.5522883664425916, + "learning_rate": 2.294609804226444e-06, + "loss": 11.8901, + "step": 34294 + }, + { + "epoch": 1.8674998128143867, + "grad_norm": 0.5625909293605665, + "learning_rate": 2.292731964004113e-06, + "loss": 11.8434, + "step": 34295 + }, + { + "epoch": 1.86755426681097, + "grad_norm": 0.5467337569973905, + "learning_rate": 2.290854883569882e-06, + "loss": 11.8364, + "step": 34296 + }, + { + "epoch": 1.867608720807553, + "grad_norm": 0.5375065484188294, + "learning_rate": 2.2889785629383486e-06, + "loss": 11.8107, + "step": 34297 + }, + { + "epoch": 1.867663174804136, + "grad_norm": 0.5255140464611013, + "learning_rate": 2.2871030021241134e-06, + "loss": 11.6934, + "step": 34298 + }, + { + "epoch": 1.867717628800719, + "grad_norm": 0.49817229426717313, + "learning_rate": 2.2852282011417424e-06, + "loss": 11.7442, + "step": 34299 + }, + { + "epoch": 1.867772082797302, + "grad_norm": 0.5203996708735958, + "learning_rate": 2.283354160005824e-06, + "loss": 11.8311, + "step": 34300 + }, + { + "epoch": 1.8678265367938849, + "grad_norm": 0.5673202159230402, + "learning_rate": 2.2814808787309348e-06, + "loss": 11.7362, + "step": 34301 + }, + { + "epoch": 1.8678809907904679, + "grad_norm": 1.0016004923503519, + "learning_rate": 2.2796083573316306e-06, + "loss": 11.822, + "step": 34302 + }, + { + "epoch": 1.8679354447870509, + "grad_norm": 0.5657832098027313, + "learning_rate": 2.277736595822477e-06, + "loss": 11.8573, + "step": 34303 + }, + { + "epoch": 1.8679898987836339, + "grad_norm": 0.5360344128760475, + "learning_rate": 2.2758655942180407e-06, + "loss": 11.737, + "step": 34304 + }, + { + "epoch": 1.8680443527802169, + "grad_norm": 0.5493261703465715, + "learning_rate": 2.273995352532865e-06, + "loss": 11.6219, + "step": 34305 + }, + { + "epoch": 1.8680988067767998, + "grad_norm": 0.5148775070202533, + "learning_rate": 2.272125870781472e-06, + "loss": 11.7599, + "step": 34306 + }, + { + "epoch": 1.8681532607733828, + "grad_norm": 0.5306871466967943, + "learning_rate": 2.2702571489784388e-06, + "loss": 11.7628, + "step": 34307 + }, + { + "epoch": 1.8682077147699658, + "grad_norm": 0.5516223979441167, + "learning_rate": 2.2683891871382646e-06, + "loss": 11.7774, + "step": 34308 + }, + { + "epoch": 1.8682621687665488, + "grad_norm": 0.5264760278860711, + "learning_rate": 2.2665219852754825e-06, + "loss": 11.7439, + "step": 34309 + }, + { + "epoch": 1.8683166227631318, + "grad_norm": 0.5794411286558803, + "learning_rate": 2.264655543404626e-06, + "loss": 11.6774, + "step": 34310 + }, + { + "epoch": 1.8683710767597148, + "grad_norm": 0.701812098298003, + "learning_rate": 2.2627898615401933e-06, + "loss": 11.8844, + "step": 34311 + }, + { + "epoch": 1.8684255307562978, + "grad_norm": 0.5253276827866118, + "learning_rate": 2.260924939696696e-06, + "loss": 11.7294, + "step": 34312 + }, + { + "epoch": 1.8684799847528808, + "grad_norm": 0.5835653874817696, + "learning_rate": 2.259060777888633e-06, + "loss": 11.8683, + "step": 34313 + }, + { + "epoch": 1.868534438749464, + "grad_norm": 0.5507583791240651, + "learning_rate": 2.2571973761305045e-06, + "loss": 11.7279, + "step": 34314 + }, + { + "epoch": 1.868588892746047, + "grad_norm": 0.5577230961812434, + "learning_rate": 2.255334734436809e-06, + "loss": 11.753, + "step": 34315 + }, + { + "epoch": 1.86864334674263, + "grad_norm": 0.5313683324233448, + "learning_rate": 2.253472852822014e-06, + "loss": 11.6531, + "step": 34316 + }, + { + "epoch": 1.868697800739213, + "grad_norm": 0.6020399393248772, + "learning_rate": 2.251611731300629e-06, + "loss": 11.8591, + "step": 34317 + }, + { + "epoch": 1.868752254735796, + "grad_norm": 0.5795915911399782, + "learning_rate": 2.2497513698870766e-06, + "loss": 11.7954, + "step": 34318 + }, + { + "epoch": 1.868806708732379, + "grad_norm": 0.526170584593981, + "learning_rate": 2.2478917685958554e-06, + "loss": 11.7472, + "step": 34319 + }, + { + "epoch": 1.8688611627289622, + "grad_norm": 0.5060685953506894, + "learning_rate": 2.2460329274414328e-06, + "loss": 11.7706, + "step": 34320 + }, + { + "epoch": 1.8689156167255452, + "grad_norm": 0.49720238683926316, + "learning_rate": 2.2441748464382295e-06, + "loss": 11.7317, + "step": 34321 + }, + { + "epoch": 1.8689700707221282, + "grad_norm": 0.7488114605314281, + "learning_rate": 2.2423175256007345e-06, + "loss": 11.7128, + "step": 34322 + }, + { + "epoch": 1.8690245247187112, + "grad_norm": 0.5379971433796815, + "learning_rate": 2.2404609649433692e-06, + "loss": 11.749, + "step": 34323 + }, + { + "epoch": 1.8690789787152942, + "grad_norm": 0.5231756967868846, + "learning_rate": 2.2386051644805783e-06, + "loss": 11.7986, + "step": 34324 + }, + { + "epoch": 1.8691334327118772, + "grad_norm": 0.5753825588422949, + "learning_rate": 2.2367501242267717e-06, + "loss": 11.7385, + "step": 34325 + }, + { + "epoch": 1.8691878867084601, + "grad_norm": 0.5608086685297307, + "learning_rate": 2.234895844196394e-06, + "loss": 11.7776, + "step": 34326 + }, + { + "epoch": 1.8692423407050431, + "grad_norm": 0.5394895111798916, + "learning_rate": 2.233042324403889e-06, + "loss": 11.7824, + "step": 34327 + }, + { + "epoch": 1.8692967947016261, + "grad_norm": 0.5689546395263193, + "learning_rate": 2.2311895648636117e-06, + "loss": 11.7004, + "step": 34328 + }, + { + "epoch": 1.8693512486982091, + "grad_norm": 0.600586015788296, + "learning_rate": 2.229337565590006e-06, + "loss": 11.8067, + "step": 34329 + }, + { + "epoch": 1.8694057026947921, + "grad_norm": 0.5158412661360852, + "learning_rate": 2.227486326597483e-06, + "loss": 11.6357, + "step": 34330 + }, + { + "epoch": 1.869460156691375, + "grad_norm": 0.50905046108615, + "learning_rate": 2.225635847900409e-06, + "loss": 11.7642, + "step": 34331 + }, + { + "epoch": 1.869514610687958, + "grad_norm": 0.5485737885268168, + "learning_rate": 2.2237861295131946e-06, + "loss": 11.8075, + "step": 34332 + }, + { + "epoch": 1.869569064684541, + "grad_norm": 0.5234525241867425, + "learning_rate": 2.2219371714502058e-06, + "loss": 11.772, + "step": 34333 + }, + { + "epoch": 1.869623518681124, + "grad_norm": 0.5333371854004794, + "learning_rate": 2.2200889737258423e-06, + "loss": 11.7838, + "step": 34334 + }, + { + "epoch": 1.869677972677707, + "grad_norm": 0.5365931389998904, + "learning_rate": 2.2182415363544482e-06, + "loss": 11.8084, + "step": 34335 + }, + { + "epoch": 1.86973242667429, + "grad_norm": 0.6032294290204636, + "learning_rate": 2.2163948593504236e-06, + "loss": 11.8715, + "step": 34336 + }, + { + "epoch": 1.8697868806708733, + "grad_norm": 0.5684661051567209, + "learning_rate": 2.2145489427281117e-06, + "loss": 11.6568, + "step": 34337 + }, + { + "epoch": 1.8698413346674563, + "grad_norm": 0.5391462109251062, + "learning_rate": 2.2127037865018573e-06, + "loss": 11.7007, + "step": 34338 + }, + { + "epoch": 1.8698957886640393, + "grad_norm": 0.5558384979744345, + "learning_rate": 2.2108593906860155e-06, + "loss": 11.662, + "step": 34339 + }, + { + "epoch": 1.8699502426606223, + "grad_norm": 0.590831257116328, + "learning_rate": 2.2090157552949297e-06, + "loss": 11.8627, + "step": 34340 + }, + { + "epoch": 1.8700046966572053, + "grad_norm": 0.5359014405334612, + "learning_rate": 2.2071728803429337e-06, + "loss": 11.73, + "step": 34341 + }, + { + "epoch": 1.8700591506537882, + "grad_norm": 0.5653957194860302, + "learning_rate": 2.205330765844382e-06, + "loss": 11.8693, + "step": 34342 + }, + { + "epoch": 1.8701136046503715, + "grad_norm": 0.5562694823055013, + "learning_rate": 2.203489411813553e-06, + "loss": 11.6833, + "step": 34343 + }, + { + "epoch": 1.8701680586469545, + "grad_norm": 0.5149356771779124, + "learning_rate": 2.2016488182648122e-06, + "loss": 11.7022, + "step": 34344 + }, + { + "epoch": 1.8702225126435374, + "grad_norm": 0.5422724576588746, + "learning_rate": 2.199808985212437e-06, + "loss": 11.5619, + "step": 34345 + }, + { + "epoch": 1.8702769666401204, + "grad_norm": 0.5864744721307434, + "learning_rate": 2.19796991267075e-06, + "loss": 11.6507, + "step": 34346 + }, + { + "epoch": 1.8703314206367034, + "grad_norm": 0.46228481529677695, + "learning_rate": 2.196131600654061e-06, + "loss": 11.7805, + "step": 34347 + }, + { + "epoch": 1.8703858746332864, + "grad_norm": 0.5508542909946582, + "learning_rate": 2.194294049176637e-06, + "loss": 11.8564, + "step": 34348 + }, + { + "epoch": 1.8704403286298694, + "grad_norm": 0.5355241792843812, + "learning_rate": 2.1924572582528003e-06, + "loss": 11.8426, + "step": 34349 + }, + { + "epoch": 1.8704947826264524, + "grad_norm": 0.6120102724229697, + "learning_rate": 2.1906212278968053e-06, + "loss": 11.7656, + "step": 34350 + }, + { + "epoch": 1.8705492366230354, + "grad_norm": 0.5116721589038303, + "learning_rate": 2.188785958122963e-06, + "loss": 11.6986, + "step": 34351 + }, + { + "epoch": 1.8706036906196184, + "grad_norm": 0.5128777813311106, + "learning_rate": 2.1869514489454955e-06, + "loss": 11.6956, + "step": 34352 + }, + { + "epoch": 1.8706581446162014, + "grad_norm": 0.5499742456332106, + "learning_rate": 2.185117700378714e-06, + "loss": 11.8258, + "step": 34353 + }, + { + "epoch": 1.8707125986127844, + "grad_norm": 0.5396749535097277, + "learning_rate": 2.1832847124368614e-06, + "loss": 11.8081, + "step": 34354 + }, + { + "epoch": 1.8707670526093674, + "grad_norm": 0.5932369636036566, + "learning_rate": 2.1814524851341833e-06, + "loss": 11.8096, + "step": 34355 + }, + { + "epoch": 1.8708215066059504, + "grad_norm": 0.5744945478452651, + "learning_rate": 2.179621018484945e-06, + "loss": 11.8209, + "step": 34356 + }, + { + "epoch": 1.8708759606025334, + "grad_norm": 0.5524508983211879, + "learning_rate": 2.1777903125033694e-06, + "loss": 11.859, + "step": 34357 + }, + { + "epoch": 1.8709304145991164, + "grad_norm": 0.511186491903641, + "learning_rate": 2.1759603672037e-06, + "loss": 11.7553, + "step": 34358 + }, + { + "epoch": 1.8709848685956993, + "grad_norm": 0.597577069022772, + "learning_rate": 2.1741311826001808e-06, + "loss": 11.9734, + "step": 34359 + }, + { + "epoch": 1.8710393225922826, + "grad_norm": 0.5804922313953728, + "learning_rate": 2.172302758707001e-06, + "loss": 11.6796, + "step": 34360 + }, + { + "epoch": 1.8710937765888656, + "grad_norm": 0.5485987486874512, + "learning_rate": 2.170475095538427e-06, + "loss": 11.838, + "step": 34361 + }, + { + "epoch": 1.8711482305854485, + "grad_norm": 0.5347484606120664, + "learning_rate": 2.1686481931086244e-06, + "loss": 11.8154, + "step": 34362 + }, + { + "epoch": 1.8712026845820315, + "grad_norm": 0.5123476391533188, + "learning_rate": 2.166822051431816e-06, + "loss": 11.7624, + "step": 34363 + }, + { + "epoch": 1.8712571385786145, + "grad_norm": 0.6508918327409489, + "learning_rate": 2.1649966705222234e-06, + "loss": 11.745, + "step": 34364 + }, + { + "epoch": 1.8713115925751975, + "grad_norm": 0.5250660221748026, + "learning_rate": 2.1631720503940134e-06, + "loss": 11.6672, + "step": 34365 + }, + { + "epoch": 1.8713660465717807, + "grad_norm": 0.6354470342955629, + "learning_rate": 2.1613481910613963e-06, + "loss": 11.8038, + "step": 34366 + }, + { + "epoch": 1.8714205005683637, + "grad_norm": 0.48238835692212567, + "learning_rate": 2.159525092538539e-06, + "loss": 11.736, + "step": 34367 + }, + { + "epoch": 1.8714749545649467, + "grad_norm": 0.564512669862306, + "learning_rate": 2.157702754839608e-06, + "loss": 11.8659, + "step": 34368 + }, + { + "epoch": 1.8715294085615297, + "grad_norm": 0.5635828074828091, + "learning_rate": 2.1558811779788023e-06, + "loss": 11.7195, + "step": 34369 + }, + { + "epoch": 1.8715838625581127, + "grad_norm": 0.5534831164136985, + "learning_rate": 2.154060361970267e-06, + "loss": 11.7614, + "step": 34370 + }, + { + "epoch": 1.8716383165546957, + "grad_norm": 0.49612570940930945, + "learning_rate": 2.1522403068281795e-06, + "loss": 11.8315, + "step": 34371 + }, + { + "epoch": 1.8716927705512787, + "grad_norm": 0.6191222455780548, + "learning_rate": 2.1504210125666614e-06, + "loss": 11.8296, + "step": 34372 + }, + { + "epoch": 1.8717472245478617, + "grad_norm": 0.5328092438598759, + "learning_rate": 2.1486024791998903e-06, + "loss": 11.6856, + "step": 34373 + }, + { + "epoch": 1.8718016785444447, + "grad_norm": 0.4964358302001134, + "learning_rate": 2.1467847067419887e-06, + "loss": 11.7469, + "step": 34374 + }, + { + "epoch": 1.8718561325410277, + "grad_norm": 0.5587324996028472, + "learning_rate": 2.144967695207101e-06, + "loss": 11.8047, + "step": 34375 + }, + { + "epoch": 1.8719105865376107, + "grad_norm": 0.5908732823712312, + "learning_rate": 2.1431514446093594e-06, + "loss": 11.7955, + "step": 34376 + }, + { + "epoch": 1.8719650405341937, + "grad_norm": 0.5157730303173998, + "learning_rate": 2.1413359549628862e-06, + "loss": 11.7474, + "step": 34377 + }, + { + "epoch": 1.8720194945307767, + "grad_norm": 0.5848190070966441, + "learning_rate": 2.139521226281793e-06, + "loss": 11.8345, + "step": 34378 + }, + { + "epoch": 1.8720739485273596, + "grad_norm": 0.5816650124556637, + "learning_rate": 2.1377072585801906e-06, + "loss": 11.7985, + "step": 34379 + }, + { + "epoch": 1.8721284025239426, + "grad_norm": 0.5434419924222774, + "learning_rate": 2.1358940518721893e-06, + "loss": 11.7551, + "step": 34380 + }, + { + "epoch": 1.8721828565205256, + "grad_norm": 0.5277405683949002, + "learning_rate": 2.1340816061718893e-06, + "loss": 11.8552, + "step": 34381 + }, + { + "epoch": 1.8722373105171086, + "grad_norm": 0.48254009195127046, + "learning_rate": 2.1322699214933793e-06, + "loss": 11.7246, + "step": 34382 + }, + { + "epoch": 1.8722917645136916, + "grad_norm": 0.5822475979806188, + "learning_rate": 2.1304589978507595e-06, + "loss": 11.7647, + "step": 34383 + }, + { + "epoch": 1.8723462185102748, + "grad_norm": 0.6217129123806907, + "learning_rate": 2.1286488352580846e-06, + "loss": 11.765, + "step": 34384 + }, + { + "epoch": 1.8724006725068578, + "grad_norm": 0.6047053339299858, + "learning_rate": 2.126839433729466e-06, + "loss": 11.7751, + "step": 34385 + }, + { + "epoch": 1.8724551265034408, + "grad_norm": 0.600128254369047, + "learning_rate": 2.125030793278948e-06, + "loss": 11.8939, + "step": 34386 + }, + { + "epoch": 1.8725095805000238, + "grad_norm": 0.5384829704509522, + "learning_rate": 2.1232229139206196e-06, + "loss": 11.7675, + "step": 34387 + }, + { + "epoch": 1.8725640344966068, + "grad_norm": 0.546335750772106, + "learning_rate": 2.121415795668513e-06, + "loss": 11.7511, + "step": 34388 + }, + { + "epoch": 1.8726184884931898, + "grad_norm": 0.5903271827455143, + "learning_rate": 2.119609438536685e-06, + "loss": 11.7408, + "step": 34389 + }, + { + "epoch": 1.872672942489773, + "grad_norm": 0.5416514137835514, + "learning_rate": 2.11780384253919e-06, + "loss": 11.7675, + "step": 34390 + }, + { + "epoch": 1.872727396486356, + "grad_norm": 0.5236622722793727, + "learning_rate": 2.1159990076900727e-06, + "loss": 11.8181, + "step": 34391 + }, + { + "epoch": 1.872781850482939, + "grad_norm": 0.5594087362080239, + "learning_rate": 2.114194934003366e-06, + "loss": 11.7708, + "step": 34392 + }, + { + "epoch": 1.872836304479522, + "grad_norm": 0.5846820398290147, + "learning_rate": 2.112391621493093e-06, + "loss": 11.8291, + "step": 34393 + }, + { + "epoch": 1.872890758476105, + "grad_norm": 0.519282890580764, + "learning_rate": 2.1105890701732743e-06, + "loss": 11.7426, + "step": 34394 + }, + { + "epoch": 1.872945212472688, + "grad_norm": 0.5318798358420342, + "learning_rate": 2.1087872800579333e-06, + "loss": 11.7557, + "step": 34395 + }, + { + "epoch": 1.872999666469271, + "grad_norm": 0.5705674234138456, + "learning_rate": 2.1069862511610694e-06, + "loss": 11.6563, + "step": 34396 + }, + { + "epoch": 1.873054120465854, + "grad_norm": 0.4792590627200801, + "learning_rate": 2.1051859834967156e-06, + "loss": 11.6679, + "step": 34397 + }, + { + "epoch": 1.873108574462437, + "grad_norm": 0.6341616288108749, + "learning_rate": 2.1033864770788503e-06, + "loss": 11.7423, + "step": 34398 + }, + { + "epoch": 1.87316302845902, + "grad_norm": 0.5527814185344745, + "learning_rate": 2.1015877319214507e-06, + "loss": 11.9045, + "step": 34399 + }, + { + "epoch": 1.873217482455603, + "grad_norm": 0.5782207855250743, + "learning_rate": 2.0997897480385386e-06, + "loss": 11.7253, + "step": 34400 + }, + { + "epoch": 1.873271936452186, + "grad_norm": 0.5304157200611321, + "learning_rate": 2.09799252544407e-06, + "loss": 11.6856, + "step": 34401 + }, + { + "epoch": 1.873326390448769, + "grad_norm": 0.5394937941037234, + "learning_rate": 2.096196064152034e-06, + "loss": 11.8111, + "step": 34402 + }, + { + "epoch": 1.873380844445352, + "grad_norm": 0.5954734149265039, + "learning_rate": 2.0944003641763966e-06, + "loss": 11.843, + "step": 34403 + }, + { + "epoch": 1.873435298441935, + "grad_norm": 0.551728549063149, + "learning_rate": 2.0926054255311136e-06, + "loss": 11.7095, + "step": 34404 + }, + { + "epoch": 1.873489752438518, + "grad_norm": 0.5462890870139695, + "learning_rate": 2.0908112482301622e-06, + "loss": 11.7751, + "step": 34405 + }, + { + "epoch": 1.873544206435101, + "grad_norm": 0.5500798009257194, + "learning_rate": 2.0890178322874653e-06, + "loss": 11.8971, + "step": 34406 + }, + { + "epoch": 1.873598660431684, + "grad_norm": 0.6280786023691517, + "learning_rate": 2.0872251777170005e-06, + "loss": 11.8629, + "step": 34407 + }, + { + "epoch": 1.873653114428267, + "grad_norm": 0.5237086988875659, + "learning_rate": 2.0854332845327007e-06, + "loss": 11.7362, + "step": 34408 + }, + { + "epoch": 1.87370756842485, + "grad_norm": 0.5262139772734832, + "learning_rate": 2.083642152748466e-06, + "loss": 11.8662, + "step": 34409 + }, + { + "epoch": 1.873762022421433, + "grad_norm": 0.5298680180649677, + "learning_rate": 2.0818517823782745e-06, + "loss": 11.7746, + "step": 34410 + }, + { + "epoch": 1.873816476418016, + "grad_norm": 0.5205467443915855, + "learning_rate": 2.0800621734360035e-06, + "loss": 11.7789, + "step": 34411 + }, + { + "epoch": 1.873870930414599, + "grad_norm": 0.6486251825344184, + "learning_rate": 2.078273325935598e-06, + "loss": 11.7703, + "step": 34412 + }, + { + "epoch": 1.8739253844111823, + "grad_norm": 0.5099860177347235, + "learning_rate": 2.0764852398909684e-06, + "loss": 11.7349, + "step": 34413 + }, + { + "epoch": 1.8739798384077653, + "grad_norm": 0.5821970470073644, + "learning_rate": 2.074697915316004e-06, + "loss": 11.8652, + "step": 34414 + }, + { + "epoch": 1.8740342924043483, + "grad_norm": 0.5218286181112657, + "learning_rate": 2.0729113522246155e-06, + "loss": 11.8774, + "step": 34415 + }, + { + "epoch": 1.8740887464009313, + "grad_norm": 0.5745509749204943, + "learning_rate": 2.0711255506306814e-06, + "loss": 11.6911, + "step": 34416 + }, + { + "epoch": 1.8741432003975143, + "grad_norm": 0.6063070917365466, + "learning_rate": 2.069340510548112e-06, + "loss": 11.7745, + "step": 34417 + }, + { + "epoch": 1.8741976543940972, + "grad_norm": 0.5818062815628678, + "learning_rate": 2.067556231990775e-06, + "loss": 11.7646, + "step": 34418 + }, + { + "epoch": 1.8742521083906802, + "grad_norm": 0.5071365366911584, + "learning_rate": 2.065772714972525e-06, + "loss": 11.857, + "step": 34419 + }, + { + "epoch": 1.8743065623872632, + "grad_norm": 0.5530547851562596, + "learning_rate": 2.063989959507262e-06, + "loss": 11.7571, + "step": 34420 + }, + { + "epoch": 1.8743610163838462, + "grad_norm": 0.5200060839544695, + "learning_rate": 2.062207965608831e-06, + "loss": 11.7833, + "step": 34421 + }, + { + "epoch": 1.8744154703804292, + "grad_norm": 0.5272580332080232, + "learning_rate": 2.0604267332911096e-06, + "loss": 11.6509, + "step": 34422 + }, + { + "epoch": 1.8744699243770122, + "grad_norm": 0.567258798481717, + "learning_rate": 2.0586462625679203e-06, + "loss": 11.8684, + "step": 34423 + }, + { + "epoch": 1.8745243783735952, + "grad_norm": 0.5232239501305265, + "learning_rate": 2.0568665534531184e-06, + "loss": 11.739, + "step": 34424 + }, + { + "epoch": 1.8745788323701782, + "grad_norm": 0.5078560684131614, + "learning_rate": 2.0550876059605594e-06, + "loss": 11.7891, + "step": 34425 + }, + { + "epoch": 1.8746332863667612, + "grad_norm": 0.5451956301968227, + "learning_rate": 2.053309420104055e-06, + "loss": 11.7283, + "step": 34426 + }, + { + "epoch": 1.8746877403633442, + "grad_norm": 0.5816667572424823, + "learning_rate": 2.0515319958974487e-06, + "loss": 11.8666, + "step": 34427 + }, + { + "epoch": 1.8747421943599272, + "grad_norm": 0.5185057030445498, + "learning_rate": 2.0497553333545637e-06, + "loss": 11.7493, + "step": 34428 + }, + { + "epoch": 1.8747966483565102, + "grad_norm": 0.5482425338200828, + "learning_rate": 2.0479794324891887e-06, + "loss": 11.7361, + "step": 34429 + }, + { + "epoch": 1.8748511023530934, + "grad_norm": 0.5393914245802071, + "learning_rate": 2.046204293315168e-06, + "loss": 11.7319, + "step": 34430 + }, + { + "epoch": 1.8749055563496764, + "grad_norm": 0.5342755576887441, + "learning_rate": 2.04442991584628e-06, + "loss": 11.7162, + "step": 34431 + }, + { + "epoch": 1.8749600103462594, + "grad_norm": 0.515268639026471, + "learning_rate": 2.042656300096335e-06, + "loss": 11.7082, + "step": 34432 + }, + { + "epoch": 1.8750144643428424, + "grad_norm": 0.5224587268132403, + "learning_rate": 2.0408834460791227e-06, + "loss": 11.7594, + "step": 34433 + }, + { + "epoch": 1.8750689183394254, + "grad_norm": 0.5892220430621732, + "learning_rate": 2.0391113538084316e-06, + "loss": 11.8748, + "step": 34434 + }, + { + "epoch": 1.8751233723360083, + "grad_norm": 0.5619071835590458, + "learning_rate": 2.037340023298029e-06, + "loss": 11.742, + "step": 34435 + }, + { + "epoch": 1.8751778263325916, + "grad_norm": 0.5974927532265994, + "learning_rate": 2.035569454561692e-06, + "loss": 11.774, + "step": 34436 + }, + { + "epoch": 1.8752322803291746, + "grad_norm": 0.525811668882926, + "learning_rate": 2.0337996476132214e-06, + "loss": 11.7742, + "step": 34437 + }, + { + "epoch": 1.8752867343257575, + "grad_norm": 0.5553818211235978, + "learning_rate": 2.0320306024663395e-06, + "loss": 11.7991, + "step": 34438 + }, + { + "epoch": 1.8753411883223405, + "grad_norm": 0.5918921431889814, + "learning_rate": 2.0302623191348126e-06, + "loss": 11.8323, + "step": 34439 + }, + { + "epoch": 1.8753956423189235, + "grad_norm": 0.5258447972096325, + "learning_rate": 2.0284947976324077e-06, + "loss": 11.7244, + "step": 34440 + }, + { + "epoch": 1.8754500963155065, + "grad_norm": 0.532229743224184, + "learning_rate": 2.026728037972847e-06, + "loss": 11.7806, + "step": 34441 + }, + { + "epoch": 1.8755045503120895, + "grad_norm": 0.5235079017539482, + "learning_rate": 2.0249620401698865e-06, + "loss": 11.7687, + "step": 34442 + }, + { + "epoch": 1.8755590043086725, + "grad_norm": 0.6012943838447397, + "learning_rate": 2.023196804237237e-06, + "loss": 11.8524, + "step": 34443 + }, + { + "epoch": 1.8756134583052555, + "grad_norm": 0.5462113512508756, + "learning_rate": 2.021432330188655e-06, + "loss": 11.7749, + "step": 34444 + }, + { + "epoch": 1.8756679123018385, + "grad_norm": 0.496202303587155, + "learning_rate": 2.0196686180378397e-06, + "loss": 11.6893, + "step": 34445 + }, + { + "epoch": 1.8757223662984215, + "grad_norm": 0.5559307539325787, + "learning_rate": 2.017905667798514e-06, + "loss": 11.7268, + "step": 34446 + }, + { + "epoch": 1.8757768202950045, + "grad_norm": 0.513245831471796, + "learning_rate": 2.0161434794843893e-06, + "loss": 11.7377, + "step": 34447 + }, + { + "epoch": 1.8758312742915875, + "grad_norm": 0.5683770587521315, + "learning_rate": 2.014382053109165e-06, + "loss": 11.793, + "step": 34448 + }, + { + "epoch": 1.8758857282881705, + "grad_norm": 0.5482440403752477, + "learning_rate": 2.012621388686531e-06, + "loss": 11.8129, + "step": 34449 + }, + { + "epoch": 1.8759401822847535, + "grad_norm": 0.5094562828130171, + "learning_rate": 2.0108614862301866e-06, + "loss": 11.7884, + "step": 34450 + }, + { + "epoch": 1.8759946362813364, + "grad_norm": 0.6093824328480545, + "learning_rate": 2.0091023457538105e-06, + "loss": 11.8513, + "step": 34451 + }, + { + "epoch": 1.8760490902779194, + "grad_norm": 0.535910924486717, + "learning_rate": 2.0073439672711024e-06, + "loss": 11.878, + "step": 34452 + }, + { + "epoch": 1.8761035442745024, + "grad_norm": 0.5940137508258937, + "learning_rate": 2.005586350795707e-06, + "loss": 11.8551, + "step": 34453 + }, + { + "epoch": 1.8761579982710856, + "grad_norm": 0.5250192951520369, + "learning_rate": 2.003829496341325e-06, + "loss": 11.7418, + "step": 34454 + }, + { + "epoch": 1.8762124522676686, + "grad_norm": 0.5981081202272566, + "learning_rate": 2.0020734039215782e-06, + "loss": 11.7875, + "step": 34455 + }, + { + "epoch": 1.8762669062642516, + "grad_norm": 0.6114501315845203, + "learning_rate": 2.000318073550156e-06, + "loss": 11.8459, + "step": 34456 + }, + { + "epoch": 1.8763213602608346, + "grad_norm": 0.4906140205056725, + "learning_rate": 1.9985635052406915e-06, + "loss": 11.7518, + "step": 34457 + }, + { + "epoch": 1.8763758142574176, + "grad_norm": 0.5900485469254249, + "learning_rate": 1.99680969900683e-06, + "loss": 11.8525, + "step": 34458 + }, + { + "epoch": 1.8764302682540008, + "grad_norm": 0.5923781937237657, + "learning_rate": 1.995056654862215e-06, + "loss": 11.7313, + "step": 34459 + }, + { + "epoch": 1.8764847222505838, + "grad_norm": 0.5422296100505702, + "learning_rate": 1.9933043728204702e-06, + "loss": 11.7604, + "step": 34460 + }, + { + "epoch": 1.8765391762471668, + "grad_norm": 0.4869438152224834, + "learning_rate": 1.9915528528952175e-06, + "loss": 11.6852, + "step": 34461 + }, + { + "epoch": 1.8765936302437498, + "grad_norm": 0.5031597530798516, + "learning_rate": 1.9898020951001016e-06, + "loss": 11.7802, + "step": 34462 + }, + { + "epoch": 1.8766480842403328, + "grad_norm": 0.5869418858344352, + "learning_rate": 1.9880520994487115e-06, + "loss": 11.7233, + "step": 34463 + }, + { + "epoch": 1.8767025382369158, + "grad_norm": 0.5768040134503095, + "learning_rate": 1.986302865954681e-06, + "loss": 11.7805, + "step": 34464 + }, + { + "epoch": 1.8767569922334988, + "grad_norm": 0.5407664945084442, + "learning_rate": 1.984554394631577e-06, + "loss": 11.7714, + "step": 34465 + }, + { + "epoch": 1.8768114462300818, + "grad_norm": 0.5181589858084656, + "learning_rate": 1.9828066854930328e-06, + "loss": 11.8283, + "step": 34466 + }, + { + "epoch": 1.8768659002266648, + "grad_norm": 0.5918453354931499, + "learning_rate": 1.981059738552604e-06, + "loss": 11.7435, + "step": 34467 + }, + { + "epoch": 1.8769203542232478, + "grad_norm": 0.6205678159164849, + "learning_rate": 1.9793135538239028e-06, + "loss": 11.8655, + "step": 34468 + }, + { + "epoch": 1.8769748082198308, + "grad_norm": 0.5245735247157259, + "learning_rate": 1.9775681313204953e-06, + "loss": 11.8524, + "step": 34469 + }, + { + "epoch": 1.8770292622164138, + "grad_norm": 0.5423945105094565, + "learning_rate": 1.9758234710559485e-06, + "loss": 11.7362, + "step": 34470 + }, + { + "epoch": 1.8770837162129967, + "grad_norm": 0.5091349420645608, + "learning_rate": 1.9740795730438524e-06, + "loss": 11.7449, + "step": 34471 + }, + { + "epoch": 1.8771381702095797, + "grad_norm": 0.5295070002646127, + "learning_rate": 1.9723364372977394e-06, + "loss": 11.6581, + "step": 34472 + }, + { + "epoch": 1.8771926242061627, + "grad_norm": 0.5319138956302755, + "learning_rate": 1.970594063831177e-06, + "loss": 11.7964, + "step": 34473 + }, + { + "epoch": 1.8772470782027457, + "grad_norm": 0.564039425531116, + "learning_rate": 1.9688524526577213e-06, + "loss": 11.6605, + "step": 34474 + }, + { + "epoch": 1.8773015321993287, + "grad_norm": 0.5564437896006181, + "learning_rate": 1.9671116037909056e-06, + "loss": 11.6495, + "step": 34475 + }, + { + "epoch": 1.8773559861959117, + "grad_norm": 0.6194062080551602, + "learning_rate": 1.9653715172442743e-06, + "loss": 11.7993, + "step": 34476 + }, + { + "epoch": 1.877410440192495, + "grad_norm": 0.6026581706880969, + "learning_rate": 1.9636321930313507e-06, + "loss": 11.8409, + "step": 34477 + }, + { + "epoch": 1.877464894189078, + "grad_norm": 0.5308738460868556, + "learning_rate": 1.961893631165668e-06, + "loss": 11.8138, + "step": 34478 + }, + { + "epoch": 1.877519348185661, + "grad_norm": 0.545804917861025, + "learning_rate": 1.9601558316607482e-06, + "loss": 11.7917, + "step": 34479 + }, + { + "epoch": 1.877573802182244, + "grad_norm": 0.6841642340268604, + "learning_rate": 1.9584187945300812e-06, + "loss": 11.793, + "step": 34480 + }, + { + "epoch": 1.877628256178827, + "grad_norm": 0.5817327235030416, + "learning_rate": 1.9566825197872007e-06, + "loss": 11.8931, + "step": 34481 + }, + { + "epoch": 1.8776827101754099, + "grad_norm": 0.5477205931273549, + "learning_rate": 1.9549470074455957e-06, + "loss": 11.7111, + "step": 34482 + }, + { + "epoch": 1.877737164171993, + "grad_norm": 0.5515966502652739, + "learning_rate": 1.9532122575187663e-06, + "loss": 11.8316, + "step": 34483 + }, + { + "epoch": 1.877791618168576, + "grad_norm": 0.4810358389520921, + "learning_rate": 1.9514782700202018e-06, + "loss": 11.7007, + "step": 34484 + }, + { + "epoch": 1.877846072165159, + "grad_norm": 0.5236914559142065, + "learning_rate": 1.94974504496338e-06, + "loss": 11.7711, + "step": 34485 + }, + { + "epoch": 1.877900526161742, + "grad_norm": 0.5716203529750217, + "learning_rate": 1.948012582361791e-06, + "loss": 11.8149, + "step": 34486 + }, + { + "epoch": 1.877954980158325, + "grad_norm": 0.5472489722211471, + "learning_rate": 1.94628088222889e-06, + "loss": 11.862, + "step": 34487 + }, + { + "epoch": 1.878009434154908, + "grad_norm": 0.5891219500076248, + "learning_rate": 1.9445499445781666e-06, + "loss": 11.7981, + "step": 34488 + }, + { + "epoch": 1.878063888151491, + "grad_norm": 0.5271053617140028, + "learning_rate": 1.9428197694230543e-06, + "loss": 11.7337, + "step": 34489 + }, + { + "epoch": 1.878118342148074, + "grad_norm": 0.5517514029627272, + "learning_rate": 1.94109035677702e-06, + "loss": 11.9251, + "step": 34490 + }, + { + "epoch": 1.878172796144657, + "grad_norm": 0.5427870678584611, + "learning_rate": 1.9393617066535196e-06, + "loss": 11.676, + "step": 34491 + }, + { + "epoch": 1.87822725014124, + "grad_norm": 0.5426510252194713, + "learning_rate": 1.937633819065987e-06, + "loss": 11.7591, + "step": 34492 + }, + { + "epoch": 1.878281704137823, + "grad_norm": 0.5740285832375813, + "learning_rate": 1.935906694027856e-06, + "loss": 11.6884, + "step": 34493 + }, + { + "epoch": 1.878336158134406, + "grad_norm": 0.5414019091039056, + "learning_rate": 1.9341803315525488e-06, + "loss": 11.6635, + "step": 34494 + }, + { + "epoch": 1.878390612130989, + "grad_norm": 0.5739667806560893, + "learning_rate": 1.9324547316535104e-06, + "loss": 11.7823, + "step": 34495 + }, + { + "epoch": 1.878445066127572, + "grad_norm": 0.5395552988286576, + "learning_rate": 1.9307298943441523e-06, + "loss": 11.7178, + "step": 34496 + }, + { + "epoch": 1.878499520124155, + "grad_norm": 0.4898450664483993, + "learning_rate": 1.929005819637886e-06, + "loss": 11.7795, + "step": 34497 + }, + { + "epoch": 1.878553974120738, + "grad_norm": 0.4883211430185245, + "learning_rate": 1.9272825075481226e-06, + "loss": 11.73, + "step": 34498 + }, + { + "epoch": 1.878608428117321, + "grad_norm": 0.5283353580397148, + "learning_rate": 1.925559958088241e-06, + "loss": 11.7586, + "step": 34499 + }, + { + "epoch": 1.8786628821139042, + "grad_norm": 0.5234978536808704, + "learning_rate": 1.9238381712716636e-06, + "loss": 11.9098, + "step": 34500 + }, + { + "epoch": 1.8787173361104872, + "grad_norm": 0.5749481724090452, + "learning_rate": 1.9221171471117684e-06, + "loss": 11.7901, + "step": 34501 + }, + { + "epoch": 1.8787717901070702, + "grad_norm": 0.5375052414148336, + "learning_rate": 1.9203968856219224e-06, + "loss": 11.6708, + "step": 34502 + }, + { + "epoch": 1.8788262441036532, + "grad_norm": 0.5686279822901938, + "learning_rate": 1.918677386815537e-06, + "loss": 11.9018, + "step": 34503 + }, + { + "epoch": 1.8788806981002362, + "grad_norm": 0.5093394053034656, + "learning_rate": 1.9169586507059577e-06, + "loss": 11.8013, + "step": 34504 + }, + { + "epoch": 1.8789351520968192, + "grad_norm": 0.571703094684345, + "learning_rate": 1.9152406773065513e-06, + "loss": 11.8528, + "step": 34505 + }, + { + "epoch": 1.8789896060934024, + "grad_norm": 0.5498325117636943, + "learning_rate": 1.9135234666306844e-06, + "loss": 11.6962, + "step": 34506 + }, + { + "epoch": 1.8790440600899854, + "grad_norm": 0.5639003883480723, + "learning_rate": 1.9118070186917137e-06, + "loss": 11.8434, + "step": 34507 + }, + { + "epoch": 1.8790985140865684, + "grad_norm": 0.5960834375712886, + "learning_rate": 1.9100913335029833e-06, + "loss": 11.8433, + "step": 34508 + }, + { + "epoch": 1.8791529680831514, + "grad_norm": 0.5022991779269027, + "learning_rate": 1.908376411077828e-06, + "loss": 11.5936, + "step": 34509 + }, + { + "epoch": 1.8792074220797343, + "grad_norm": 0.5216442959501958, + "learning_rate": 1.9066622514295807e-06, + "loss": 11.7873, + "step": 34510 + }, + { + "epoch": 1.8792618760763173, + "grad_norm": 0.5401386107197366, + "learning_rate": 1.9049488545715865e-06, + "loss": 11.8959, + "step": 34511 + }, + { + "epoch": 1.8793163300729003, + "grad_norm": 0.5106782080787051, + "learning_rate": 1.9032362205171572e-06, + "loss": 11.7131, + "step": 34512 + }, + { + "epoch": 1.8793707840694833, + "grad_norm": 0.5643095554772389, + "learning_rate": 1.9015243492796154e-06, + "loss": 11.7519, + "step": 34513 + }, + { + "epoch": 1.8794252380660663, + "grad_norm": 0.5136996016727898, + "learning_rate": 1.8998132408722724e-06, + "loss": 11.7354, + "step": 34514 + }, + { + "epoch": 1.8794796920626493, + "grad_norm": 0.5778985080824569, + "learning_rate": 1.898102895308429e-06, + "loss": 11.7375, + "step": 34515 + }, + { + "epoch": 1.8795341460592323, + "grad_norm": 0.548016986559713, + "learning_rate": 1.8963933126013856e-06, + "loss": 11.6967, + "step": 34516 + }, + { + "epoch": 1.8795886000558153, + "grad_norm": 0.5332376768984646, + "learning_rate": 1.8946844927644425e-06, + "loss": 11.625, + "step": 34517 + }, + { + "epoch": 1.8796430540523983, + "grad_norm": 0.5273766765887075, + "learning_rate": 1.8929764358109003e-06, + "loss": 11.7244, + "step": 34518 + }, + { + "epoch": 1.8796975080489813, + "grad_norm": 0.5529812715269194, + "learning_rate": 1.8912691417540152e-06, + "loss": 11.5003, + "step": 34519 + }, + { + "epoch": 1.8797519620455643, + "grad_norm": 0.5449124996821897, + "learning_rate": 1.8895626106070763e-06, + "loss": 11.8011, + "step": 34520 + }, + { + "epoch": 1.8798064160421473, + "grad_norm": 0.5784908225647555, + "learning_rate": 1.88785684238334e-06, + "loss": 11.8468, + "step": 34521 + }, + { + "epoch": 1.8798608700387303, + "grad_norm": 0.5575582693831567, + "learning_rate": 1.886151837096084e-06, + "loss": 11.7628, + "step": 34522 + }, + { + "epoch": 1.8799153240353133, + "grad_norm": 0.5495828736806819, + "learning_rate": 1.884447594758576e-06, + "loss": 11.7958, + "step": 34523 + }, + { + "epoch": 1.8799697780318965, + "grad_norm": 0.5294812376046137, + "learning_rate": 1.8827441153840496e-06, + "loss": 11.7934, + "step": 34524 + }, + { + "epoch": 1.8800242320284795, + "grad_norm": 0.527139784511984, + "learning_rate": 1.8810413989857722e-06, + "loss": 11.7265, + "step": 34525 + }, + { + "epoch": 1.8800786860250625, + "grad_norm": 0.551246936249197, + "learning_rate": 1.8793394455769552e-06, + "loss": 11.6954, + "step": 34526 + }, + { + "epoch": 1.8801331400216454, + "grad_norm": 0.5171860086676685, + "learning_rate": 1.8776382551708548e-06, + "loss": 11.7661, + "step": 34527 + }, + { + "epoch": 1.8801875940182284, + "grad_norm": 0.5310445627915399, + "learning_rate": 1.875937827780705e-06, + "loss": 11.702, + "step": 34528 + }, + { + "epoch": 1.8802420480148117, + "grad_norm": 0.541717433002239, + "learning_rate": 1.874238163419706e-06, + "loss": 11.693, + "step": 34529 + }, + { + "epoch": 1.8802965020113946, + "grad_norm": 0.5230157604658559, + "learning_rate": 1.8725392621010917e-06, + "loss": 11.9111, + "step": 34530 + }, + { + "epoch": 1.8803509560079776, + "grad_norm": 0.5148423224131015, + "learning_rate": 1.870841123838063e-06, + "loss": 11.6381, + "step": 34531 + }, + { + "epoch": 1.8804054100045606, + "grad_norm": 0.5341492950235581, + "learning_rate": 1.8691437486438313e-06, + "loss": 11.8082, + "step": 34532 + }, + { + "epoch": 1.8804598640011436, + "grad_norm": 0.575986502869782, + "learning_rate": 1.867447136531597e-06, + "loss": 11.7954, + "step": 34533 + }, + { + "epoch": 1.8805143179977266, + "grad_norm": 0.5303804944689541, + "learning_rate": 1.8657512875145388e-06, + "loss": 11.7158, + "step": 34534 + }, + { + "epoch": 1.8805687719943096, + "grad_norm": 0.5393704055071213, + "learning_rate": 1.8640562016058794e-06, + "loss": 11.7661, + "step": 34535 + }, + { + "epoch": 1.8806232259908926, + "grad_norm": 0.5307814562574126, + "learning_rate": 1.8623618788187524e-06, + "loss": 11.7323, + "step": 34536 + }, + { + "epoch": 1.8806776799874756, + "grad_norm": 0.5245617896281497, + "learning_rate": 1.860668319166381e-06, + "loss": 11.6104, + "step": 34537 + }, + { + "epoch": 1.8807321339840586, + "grad_norm": 0.5890252640452627, + "learning_rate": 1.8589755226618987e-06, + "loss": 11.7918, + "step": 34538 + }, + { + "epoch": 1.8807865879806416, + "grad_norm": 0.639612477749031, + "learning_rate": 1.857283489318473e-06, + "loss": 11.7559, + "step": 34539 + }, + { + "epoch": 1.8808410419772246, + "grad_norm": 0.5893184885080097, + "learning_rate": 1.8555922191492825e-06, + "loss": 11.7902, + "step": 34540 + }, + { + "epoch": 1.8808954959738076, + "grad_norm": 0.542572726714157, + "learning_rate": 1.8539017121674495e-06, + "loss": 11.7784, + "step": 34541 + }, + { + "epoch": 1.8809499499703906, + "grad_norm": 0.5043011800731687, + "learning_rate": 1.8522119683861528e-06, + "loss": 11.7943, + "step": 34542 + }, + { + "epoch": 1.8810044039669735, + "grad_norm": 0.5480161379186976, + "learning_rate": 1.8505229878185038e-06, + "loss": 11.7191, + "step": 34543 + }, + { + "epoch": 1.8810588579635565, + "grad_norm": 0.5861594521838982, + "learning_rate": 1.8488347704776477e-06, + "loss": 11.9703, + "step": 34544 + }, + { + "epoch": 1.8811133119601395, + "grad_norm": 0.513443270616714, + "learning_rate": 1.8471473163767295e-06, + "loss": 11.7988, + "step": 34545 + }, + { + "epoch": 1.8811677659567225, + "grad_norm": 0.5734974278795244, + "learning_rate": 1.8454606255288386e-06, + "loss": 11.6857, + "step": 34546 + }, + { + "epoch": 1.8812222199533057, + "grad_norm": 0.601132744975943, + "learning_rate": 1.8437746979471093e-06, + "loss": 11.8225, + "step": 34547 + }, + { + "epoch": 1.8812766739498887, + "grad_norm": 0.4990396662536802, + "learning_rate": 1.842089533644653e-06, + "loss": 11.7413, + "step": 34548 + }, + { + "epoch": 1.8813311279464717, + "grad_norm": 0.5113258685520127, + "learning_rate": 1.8404051326345596e-06, + "loss": 11.7624, + "step": 34549 + }, + { + "epoch": 1.8813855819430547, + "grad_norm": 0.5341688777125242, + "learning_rate": 1.8387214949299514e-06, + "loss": 11.6345, + "step": 34550 + }, + { + "epoch": 1.8814400359396377, + "grad_norm": 0.531137488453673, + "learning_rate": 1.837038620543896e-06, + "loss": 11.7813, + "step": 34551 + }, + { + "epoch": 1.8814944899362207, + "grad_norm": 0.5573666641696239, + "learning_rate": 1.8353565094894941e-06, + "loss": 11.8014, + "step": 34552 + }, + { + "epoch": 1.881548943932804, + "grad_norm": 0.5511885283203259, + "learning_rate": 1.8336751617798132e-06, + "loss": 11.7881, + "step": 34553 + }, + { + "epoch": 1.881603397929387, + "grad_norm": 0.5616453379529558, + "learning_rate": 1.8319945774279534e-06, + "loss": 11.8446, + "step": 34554 + }, + { + "epoch": 1.88165785192597, + "grad_norm": 0.5507030940031911, + "learning_rate": 1.8303147564469492e-06, + "loss": 11.7357, + "step": 34555 + }, + { + "epoch": 1.881712305922553, + "grad_norm": 0.49553125219301664, + "learning_rate": 1.8286356988498898e-06, + "loss": 11.6769, + "step": 34556 + }, + { + "epoch": 1.881766759919136, + "grad_norm": 0.5741465435718031, + "learning_rate": 1.8269574046498205e-06, + "loss": 11.8025, + "step": 34557 + }, + { + "epoch": 1.8818212139157189, + "grad_norm": 0.520782133536226, + "learning_rate": 1.8252798738597866e-06, + "loss": 11.7877, + "step": 34558 + }, + { + "epoch": 1.8818756679123019, + "grad_norm": 0.574023474605489, + "learning_rate": 1.8236031064928548e-06, + "loss": 11.7481, + "step": 34559 + }, + { + "epoch": 1.8819301219088849, + "grad_norm": 0.5782810539166594, + "learning_rate": 1.8219271025620489e-06, + "loss": 11.833, + "step": 34560 + }, + { + "epoch": 1.8819845759054679, + "grad_norm": 0.6411418693401827, + "learning_rate": 1.820251862080391e-06, + "loss": 11.714, + "step": 34561 + }, + { + "epoch": 1.8820390299020509, + "grad_norm": 0.5110308301288894, + "learning_rate": 1.8185773850609267e-06, + "loss": 11.7653, + "step": 34562 + }, + { + "epoch": 1.8820934838986338, + "grad_norm": 0.5322964441196149, + "learning_rate": 1.8169036715166677e-06, + "loss": 11.744, + "step": 34563 + }, + { + "epoch": 1.8821479378952168, + "grad_norm": 0.6191140639186177, + "learning_rate": 1.8152307214606368e-06, + "loss": 11.7733, + "step": 34564 + }, + { + "epoch": 1.8822023918917998, + "grad_norm": 0.5484913916318791, + "learning_rate": 1.8135585349058236e-06, + "loss": 11.7513, + "step": 34565 + }, + { + "epoch": 1.8822568458883828, + "grad_norm": 0.5840247447273171, + "learning_rate": 1.8118871118652515e-06, + "loss": 11.8865, + "step": 34566 + }, + { + "epoch": 1.8823112998849658, + "grad_norm": 0.48549691500511627, + "learning_rate": 1.8102164523519206e-06, + "loss": 11.7408, + "step": 34567 + }, + { + "epoch": 1.8823657538815488, + "grad_norm": 0.5255324341211323, + "learning_rate": 1.8085465563787985e-06, + "loss": 11.8428, + "step": 34568 + }, + { + "epoch": 1.8824202078781318, + "grad_norm": 0.5819003900039607, + "learning_rate": 1.8068774239589082e-06, + "loss": 11.8125, + "step": 34569 + }, + { + "epoch": 1.882474661874715, + "grad_norm": 0.5291091749693664, + "learning_rate": 1.8052090551051837e-06, + "loss": 11.8378, + "step": 34570 + }, + { + "epoch": 1.882529115871298, + "grad_norm": 0.5638550577042505, + "learning_rate": 1.8035414498306258e-06, + "loss": 11.5874, + "step": 34571 + }, + { + "epoch": 1.882583569867881, + "grad_norm": 0.5451776922628029, + "learning_rate": 1.8018746081482018e-06, + "loss": 11.68, + "step": 34572 + }, + { + "epoch": 1.882638023864464, + "grad_norm": 0.6496991930191914, + "learning_rate": 1.800208530070857e-06, + "loss": 11.758, + "step": 34573 + }, + { + "epoch": 1.882692477861047, + "grad_norm": 0.5364858740991691, + "learning_rate": 1.798543215611581e-06, + "loss": 11.7761, + "step": 34574 + }, + { + "epoch": 1.88274693185763, + "grad_norm": 0.556211449417477, + "learning_rate": 1.7968786647832747e-06, + "loss": 11.8209, + "step": 34575 + }, + { + "epoch": 1.8828013858542132, + "grad_norm": 0.5515509456259532, + "learning_rate": 1.7952148775989275e-06, + "loss": 11.8054, + "step": 34576 + }, + { + "epoch": 1.8828558398507962, + "grad_norm": 0.6478233205323938, + "learning_rate": 1.7935518540714514e-06, + "loss": 11.8756, + "step": 34577 + }, + { + "epoch": 1.8829102938473792, + "grad_norm": 0.5265629244130572, + "learning_rate": 1.7918895942137804e-06, + "loss": 11.7566, + "step": 34578 + }, + { + "epoch": 1.8829647478439622, + "grad_norm": 0.5120578431869297, + "learning_rate": 1.7902280980388596e-06, + "loss": 11.676, + "step": 34579 + }, + { + "epoch": 1.8830192018405452, + "grad_norm": 0.5100364969390428, + "learning_rate": 1.7885673655595902e-06, + "loss": 11.8194, + "step": 34580 + }, + { + "epoch": 1.8830736558371282, + "grad_norm": 0.6390320572686244, + "learning_rate": 1.7869073967888839e-06, + "loss": 11.8057, + "step": 34581 + }, + { + "epoch": 1.8831281098337112, + "grad_norm": 0.6243437268660795, + "learning_rate": 1.7852481917396636e-06, + "loss": 11.8041, + "step": 34582 + }, + { + "epoch": 1.8831825638302941, + "grad_norm": 0.5538963090657304, + "learning_rate": 1.7835897504248078e-06, + "loss": 11.7161, + "step": 34583 + }, + { + "epoch": 1.8832370178268771, + "grad_norm": 0.5899599698293884, + "learning_rate": 1.7819320728572508e-06, + "loss": 11.8435, + "step": 34584 + }, + { + "epoch": 1.8832914718234601, + "grad_norm": 0.5630917006394409, + "learning_rate": 1.780275159049849e-06, + "loss": 11.7576, + "step": 34585 + }, + { + "epoch": 1.8833459258200431, + "grad_norm": 0.5680457632757285, + "learning_rate": 1.778619009015503e-06, + "loss": 11.817, + "step": 34586 + }, + { + "epoch": 1.8834003798166261, + "grad_norm": 0.6171949631287466, + "learning_rate": 1.7769636227670805e-06, + "loss": 11.7812, + "step": 34587 + }, + { + "epoch": 1.883454833813209, + "grad_norm": 0.5608462336342018, + "learning_rate": 1.7753090003174711e-06, + "loss": 11.6799, + "step": 34588 + }, + { + "epoch": 1.883509287809792, + "grad_norm": 0.614067725318175, + "learning_rate": 1.773655141679531e-06, + "loss": 11.876, + "step": 34589 + }, + { + "epoch": 1.883563741806375, + "grad_norm": 0.5843568433323433, + "learning_rate": 1.772002046866117e-06, + "loss": 11.8226, + "step": 34590 + }, + { + "epoch": 1.883618195802958, + "grad_norm": 0.5584145875563088, + "learning_rate": 1.770349715890085e-06, + "loss": 11.727, + "step": 34591 + }, + { + "epoch": 1.883672649799541, + "grad_norm": 0.553526472848038, + "learning_rate": 1.7686981487642918e-06, + "loss": 12.0033, + "step": 34592 + }, + { + "epoch": 1.8837271037961243, + "grad_norm": 0.5529915250229152, + "learning_rate": 1.76704734550156e-06, + "loss": 11.7626, + "step": 34593 + }, + { + "epoch": 1.8837815577927073, + "grad_norm": 0.5301679117743063, + "learning_rate": 1.7653973061147688e-06, + "loss": 11.8459, + "step": 34594 + }, + { + "epoch": 1.8838360117892903, + "grad_norm": 0.6135710586873316, + "learning_rate": 1.7637480306166964e-06, + "loss": 11.8688, + "step": 34595 + }, + { + "epoch": 1.8838904657858733, + "grad_norm": 0.511829590838322, + "learning_rate": 1.7620995190202105e-06, + "loss": 11.7596, + "step": 34596 + }, + { + "epoch": 1.8839449197824563, + "grad_norm": 0.507604732791635, + "learning_rate": 1.7604517713381008e-06, + "loss": 11.7183, + "step": 34597 + }, + { + "epoch": 1.8839993737790393, + "grad_norm": 0.5305830915378384, + "learning_rate": 1.7588047875832013e-06, + "loss": 11.7633, + "step": 34598 + }, + { + "epoch": 1.8840538277756225, + "grad_norm": 0.5432931869801424, + "learning_rate": 1.7571585677683133e-06, + "loss": 11.8153, + "step": 34599 + }, + { + "epoch": 1.8841082817722055, + "grad_norm": 0.585687679615243, + "learning_rate": 1.7555131119062374e-06, + "loss": 11.7612, + "step": 34600 + }, + { + "epoch": 1.8841627357687885, + "grad_norm": 0.5167908151283527, + "learning_rate": 1.7538684200097632e-06, + "loss": 11.7514, + "step": 34601 + }, + { + "epoch": 1.8842171897653714, + "grad_norm": 0.5467397447246971, + "learning_rate": 1.7522244920916698e-06, + "loss": 11.8294, + "step": 34602 + }, + { + "epoch": 1.8842716437619544, + "grad_norm": 0.6097082491675548, + "learning_rate": 1.7505813281647797e-06, + "loss": 11.8247, + "step": 34603 + }, + { + "epoch": 1.8843260977585374, + "grad_norm": 0.5857637570024662, + "learning_rate": 1.7489389282418278e-06, + "loss": 11.7459, + "step": 34604 + }, + { + "epoch": 1.8843805517551204, + "grad_norm": 0.5376543082440299, + "learning_rate": 1.7472972923356035e-06, + "loss": 11.845, + "step": 34605 + }, + { + "epoch": 1.8844350057517034, + "grad_norm": 0.5574582469953928, + "learning_rate": 1.7456564204588854e-06, + "loss": 11.8851, + "step": 34606 + }, + { + "epoch": 1.8844894597482864, + "grad_norm": 0.5628027342243649, + "learning_rate": 1.744016312624408e-06, + "loss": 11.8243, + "step": 34607 + }, + { + "epoch": 1.8845439137448694, + "grad_norm": 0.4962840768947248, + "learning_rate": 1.7423769688449388e-06, + "loss": 11.7812, + "step": 34608 + }, + { + "epoch": 1.8845983677414524, + "grad_norm": 0.5508468887795229, + "learning_rate": 1.740738389133234e-06, + "loss": 11.8607, + "step": 34609 + }, + { + "epoch": 1.8846528217380354, + "grad_norm": 0.5677877786463065, + "learning_rate": 1.7391005735020172e-06, + "loss": 11.6974, + "step": 34610 + }, + { + "epoch": 1.8847072757346184, + "grad_norm": 0.6195100329471449, + "learning_rate": 1.7374635219640334e-06, + "loss": 11.756, + "step": 34611 + }, + { + "epoch": 1.8847617297312014, + "grad_norm": 0.5438648655468336, + "learning_rate": 1.735827234532006e-06, + "loss": 11.7925, + "step": 34612 + }, + { + "epoch": 1.8848161837277844, + "grad_norm": 0.532194074554113, + "learning_rate": 1.7341917112186801e-06, + "loss": 11.7477, + "step": 34613 + }, + { + "epoch": 1.8848706377243674, + "grad_norm": 0.5519866347819824, + "learning_rate": 1.7325569520367458e-06, + "loss": 11.6991, + "step": 34614 + }, + { + "epoch": 1.8849250917209504, + "grad_norm": 0.5740721307059174, + "learning_rate": 1.730922956998926e-06, + "loss": 11.6997, + "step": 34615 + }, + { + "epoch": 1.8849795457175333, + "grad_norm": 0.5505885728565861, + "learning_rate": 1.7292897261179442e-06, + "loss": 11.6975, + "step": 34616 + }, + { + "epoch": 1.8850339997141166, + "grad_norm": 0.5209356291069546, + "learning_rate": 1.7276572594064677e-06, + "loss": 11.7307, + "step": 34617 + }, + { + "epoch": 1.8850884537106996, + "grad_norm": 0.5481467800813334, + "learning_rate": 1.72602555687722e-06, + "loss": 11.6987, + "step": 34618 + }, + { + "epoch": 1.8851429077072825, + "grad_norm": 0.5774504452202481, + "learning_rate": 1.7243946185428794e-06, + "loss": 11.8261, + "step": 34619 + }, + { + "epoch": 1.8851973617038655, + "grad_norm": 0.5343436440407358, + "learning_rate": 1.722764444416125e-06, + "loss": 11.7185, + "step": 34620 + }, + { + "epoch": 1.8852518157004485, + "grad_norm": 0.6282483195123607, + "learning_rate": 1.7211350345096355e-06, + "loss": 11.5196, + "step": 34621 + }, + { + "epoch": 1.8853062696970315, + "grad_norm": 0.5681866131835598, + "learning_rate": 1.7195063888360786e-06, + "loss": 11.767, + "step": 34622 + }, + { + "epoch": 1.8853607236936147, + "grad_norm": 0.5812783220152261, + "learning_rate": 1.7178785074081216e-06, + "loss": 11.9382, + "step": 34623 + }, + { + "epoch": 1.8854151776901977, + "grad_norm": 0.526674766439873, + "learning_rate": 1.7162513902384214e-06, + "loss": 11.6523, + "step": 34624 + }, + { + "epoch": 1.8854696316867807, + "grad_norm": 0.541344562149101, + "learning_rate": 1.7146250373396455e-06, + "loss": 11.8034, + "step": 34625 + }, + { + "epoch": 1.8855240856833637, + "grad_norm": 0.5301244783231273, + "learning_rate": 1.7129994487244061e-06, + "loss": 11.6972, + "step": 34626 + }, + { + "epoch": 1.8855785396799467, + "grad_norm": 0.5027119398328848, + "learning_rate": 1.7113746244053818e-06, + "loss": 11.7451, + "step": 34627 + }, + { + "epoch": 1.8856329936765297, + "grad_norm": 0.5125000611931756, + "learning_rate": 1.709750564395185e-06, + "loss": 11.7177, + "step": 34628 + }, + { + "epoch": 1.8856874476731127, + "grad_norm": 0.5237211434302128, + "learning_rate": 1.7081272687064609e-06, + "loss": 11.765, + "step": 34629 + }, + { + "epoch": 1.8857419016696957, + "grad_norm": 0.639306643617262, + "learning_rate": 1.7065047373518105e-06, + "loss": 11.6949, + "step": 34630 + }, + { + "epoch": 1.8857963556662787, + "grad_norm": 0.5658080562396036, + "learning_rate": 1.7048829703438685e-06, + "loss": 11.7816, + "step": 34631 + }, + { + "epoch": 1.8858508096628617, + "grad_norm": 0.5362130035284358, + "learning_rate": 1.7032619676952356e-06, + "loss": 11.815, + "step": 34632 + }, + { + "epoch": 1.8859052636594447, + "grad_norm": 0.5027667432043391, + "learning_rate": 1.7016417294185349e-06, + "loss": 11.6848, + "step": 34633 + }, + { + "epoch": 1.8859597176560277, + "grad_norm": 0.6115580945655306, + "learning_rate": 1.7000222555263346e-06, + "loss": 11.7466, + "step": 34634 + }, + { + "epoch": 1.8860141716526106, + "grad_norm": 0.5362984113650079, + "learning_rate": 1.6984035460312687e-06, + "loss": 11.7809, + "step": 34635 + }, + { + "epoch": 1.8860686256491936, + "grad_norm": 0.5479685299225547, + "learning_rate": 1.6967856009458826e-06, + "loss": 11.913, + "step": 34636 + }, + { + "epoch": 1.8861230796457766, + "grad_norm": 0.5556102697829234, + "learning_rate": 1.6951684202827888e-06, + "loss": 11.8064, + "step": 34637 + }, + { + "epoch": 1.8861775336423596, + "grad_norm": 0.5221456171550668, + "learning_rate": 1.6935520040545328e-06, + "loss": 11.798, + "step": 34638 + }, + { + "epoch": 1.8862319876389426, + "grad_norm": 0.5436265338598268, + "learning_rate": 1.6919363522737263e-06, + "loss": 11.7613, + "step": 34639 + }, + { + "epoch": 1.8862864416355258, + "grad_norm": 0.5368084362797071, + "learning_rate": 1.6903214649529043e-06, + "loss": 11.8183, + "step": 34640 + }, + { + "epoch": 1.8863408956321088, + "grad_norm": 0.6089297506825833, + "learning_rate": 1.6887073421046117e-06, + "loss": 11.7475, + "step": 34641 + }, + { + "epoch": 1.8863953496286918, + "grad_norm": 0.5182800471381385, + "learning_rate": 1.6870939837414278e-06, + "loss": 11.7802, + "step": 34642 + }, + { + "epoch": 1.8864498036252748, + "grad_norm": 0.5707836716971906, + "learning_rate": 1.685481389875887e-06, + "loss": 11.8407, + "step": 34643 + }, + { + "epoch": 1.8865042576218578, + "grad_norm": 0.5300192048478521, + "learning_rate": 1.6838695605205346e-06, + "loss": 11.6704, + "step": 34644 + }, + { + "epoch": 1.8865587116184408, + "grad_norm": 0.5188660417075077, + "learning_rate": 1.682258495687905e-06, + "loss": 11.7235, + "step": 34645 + }, + { + "epoch": 1.886613165615024, + "grad_norm": 0.571706838943987, + "learning_rate": 1.6806481953905106e-06, + "loss": 11.7495, + "step": 34646 + }, + { + "epoch": 1.886667619611607, + "grad_norm": 0.531707708010136, + "learning_rate": 1.6790386596408858e-06, + "loss": 11.6087, + "step": 34647 + }, + { + "epoch": 1.88672207360819, + "grad_norm": 0.5221425359013574, + "learning_rate": 1.6774298884515427e-06, + "loss": 11.7859, + "step": 34648 + }, + { + "epoch": 1.886776527604773, + "grad_norm": 0.512961130352283, + "learning_rate": 1.6758218818350046e-06, + "loss": 11.7506, + "step": 34649 + }, + { + "epoch": 1.886830981601356, + "grad_norm": 0.5830322974777441, + "learning_rate": 1.6742146398037617e-06, + "loss": 11.8445, + "step": 34650 + }, + { + "epoch": 1.886885435597939, + "grad_norm": 0.5815686863134284, + "learning_rate": 1.6726081623703038e-06, + "loss": 11.8466, + "step": 34651 + }, + { + "epoch": 1.886939889594522, + "grad_norm": 0.5212978927462005, + "learning_rate": 1.6710024495471433e-06, + "loss": 11.7543, + "step": 34652 + }, + { + "epoch": 1.886994343591105, + "grad_norm": 0.5131432598187302, + "learning_rate": 1.6693975013467478e-06, + "loss": 11.7428, + "step": 34653 + }, + { + "epoch": 1.887048797587688, + "grad_norm": 0.5679582808671314, + "learning_rate": 1.6677933177816184e-06, + "loss": 11.8561, + "step": 34654 + }, + { + "epoch": 1.887103251584271, + "grad_norm": 0.5255067247519414, + "learning_rate": 1.6661898988642123e-06, + "loss": 11.7917, + "step": 34655 + }, + { + "epoch": 1.887157705580854, + "grad_norm": 0.563327143505627, + "learning_rate": 1.6645872446070078e-06, + "loss": 11.8587, + "step": 34656 + }, + { + "epoch": 1.887212159577437, + "grad_norm": 0.5254916221724062, + "learning_rate": 1.6629853550224618e-06, + "loss": 11.7731, + "step": 34657 + }, + { + "epoch": 1.88726661357402, + "grad_norm": 0.569864926068086, + "learning_rate": 1.661384230123031e-06, + "loss": 11.6328, + "step": 34658 + }, + { + "epoch": 1.887321067570603, + "grad_norm": 0.5249439267573294, + "learning_rate": 1.659783869921172e-06, + "loss": 11.7202, + "step": 34659 + }, + { + "epoch": 1.887375521567186, + "grad_norm": 0.514860100244056, + "learning_rate": 1.6581842744293307e-06, + "loss": 11.8043, + "step": 34660 + }, + { + "epoch": 1.887429975563769, + "grad_norm": 0.5258641290207511, + "learning_rate": 1.6565854436599303e-06, + "loss": 11.7038, + "step": 34661 + }, + { + "epoch": 1.887484429560352, + "grad_norm": 0.5452153717444679, + "learning_rate": 1.6549873776254166e-06, + "loss": 11.8005, + "step": 34662 + }, + { + "epoch": 1.887538883556935, + "grad_norm": 0.5488223450922112, + "learning_rate": 1.6533900763382125e-06, + "loss": 11.7307, + "step": 34663 + }, + { + "epoch": 1.887593337553518, + "grad_norm": 0.5598097191485137, + "learning_rate": 1.651793539810742e-06, + "loss": 11.7573, + "step": 34664 + }, + { + "epoch": 1.887647791550101, + "grad_norm": 0.5145815345171996, + "learning_rate": 1.650197768055417e-06, + "loss": 11.7986, + "step": 34665 + }, + { + "epoch": 1.887702245546684, + "grad_norm": 0.5233328645199831, + "learning_rate": 1.6486027610846499e-06, + "loss": 11.7454, + "step": 34666 + }, + { + "epoch": 1.887756699543267, + "grad_norm": 0.5236644100545705, + "learning_rate": 1.647008518910842e-06, + "loss": 11.7264, + "step": 34667 + }, + { + "epoch": 1.88781115353985, + "grad_norm": 0.48575464511707533, + "learning_rate": 1.6454150415463832e-06, + "loss": 11.695, + "step": 34668 + }, + { + "epoch": 1.8878656075364333, + "grad_norm": 0.5971994981377021, + "learning_rate": 1.6438223290036747e-06, + "loss": 11.7124, + "step": 34669 + }, + { + "epoch": 1.8879200615330163, + "grad_norm": 0.5287751875479139, + "learning_rate": 1.6422303812951068e-06, + "loss": 11.7466, + "step": 34670 + }, + { + "epoch": 1.8879745155295993, + "grad_norm": 0.5880344382484398, + "learning_rate": 1.640639198433036e-06, + "loss": 11.9114, + "step": 34671 + }, + { + "epoch": 1.8880289695261823, + "grad_norm": 0.5139600890663518, + "learning_rate": 1.6390487804298527e-06, + "loss": 11.7664, + "step": 34672 + }, + { + "epoch": 1.8880834235227653, + "grad_norm": 0.5631639749303369, + "learning_rate": 1.6374591272979244e-06, + "loss": 11.7814, + "step": 34673 + }, + { + "epoch": 1.8881378775193483, + "grad_norm": 0.5110284170270594, + "learning_rate": 1.6358702390496084e-06, + "loss": 11.6061, + "step": 34674 + }, + { + "epoch": 1.8881923315159312, + "grad_norm": 0.5624146109225929, + "learning_rate": 1.634282115697261e-06, + "loss": 11.8418, + "step": 34675 + }, + { + "epoch": 1.8882467855125142, + "grad_norm": 0.5334597198508444, + "learning_rate": 1.6326947572532281e-06, + "loss": 11.7894, + "step": 34676 + }, + { + "epoch": 1.8883012395090972, + "grad_norm": 0.5403257393225684, + "learning_rate": 1.6311081637298665e-06, + "loss": 11.7361, + "step": 34677 + }, + { + "epoch": 1.8883556935056802, + "grad_norm": 0.5198175447217575, + "learning_rate": 1.6295223351394884e-06, + "loss": 11.7902, + "step": 34678 + }, + { + "epoch": 1.8884101475022632, + "grad_norm": 0.5247410821422873, + "learning_rate": 1.627937271494462e-06, + "loss": 11.6793, + "step": 34679 + }, + { + "epoch": 1.8884646014988462, + "grad_norm": 0.5245438830662863, + "learning_rate": 1.626352972807077e-06, + "loss": 11.8101, + "step": 34680 + }, + { + "epoch": 1.8885190554954292, + "grad_norm": 0.5197853217966982, + "learning_rate": 1.6247694390896685e-06, + "loss": 11.7402, + "step": 34681 + }, + { + "epoch": 1.8885735094920122, + "grad_norm": 0.6099667352408009, + "learning_rate": 1.6231866703545594e-06, + "loss": 11.7899, + "step": 34682 + }, + { + "epoch": 1.8886279634885952, + "grad_norm": 0.541271358248567, + "learning_rate": 1.6216046666140405e-06, + "loss": 11.7236, + "step": 34683 + }, + { + "epoch": 1.8886824174851782, + "grad_norm": 0.5240606070074448, + "learning_rate": 1.620023427880435e-06, + "loss": 11.8339, + "step": 34684 + }, + { + "epoch": 1.8887368714817612, + "grad_norm": 0.5808576552608417, + "learning_rate": 1.6184429541660106e-06, + "loss": 11.7861, + "step": 34685 + }, + { + "epoch": 1.8887913254783442, + "grad_norm": 0.4815287009993346, + "learning_rate": 1.6168632454830802e-06, + "loss": 11.8493, + "step": 34686 + }, + { + "epoch": 1.8888457794749274, + "grad_norm": 0.5570491946695187, + "learning_rate": 1.6152843018439111e-06, + "loss": 11.8726, + "step": 34687 + }, + { + "epoch": 1.8889002334715104, + "grad_norm": 0.5613414560471554, + "learning_rate": 1.6137061232607942e-06, + "loss": 11.6397, + "step": 34688 + }, + { + "epoch": 1.8889546874680934, + "grad_norm": 0.518802792322339, + "learning_rate": 1.6121287097460192e-06, + "loss": 11.6943, + "step": 34689 + }, + { + "epoch": 1.8890091414646764, + "grad_norm": 0.5352295290433896, + "learning_rate": 1.6105520613118097e-06, + "loss": 11.6351, + "step": 34690 + }, + { + "epoch": 1.8890635954612593, + "grad_norm": 0.6134100862756692, + "learning_rate": 1.6089761779704449e-06, + "loss": 11.9323, + "step": 34691 + }, + { + "epoch": 1.8891180494578426, + "grad_norm": 0.5595482894017889, + "learning_rate": 1.6074010597341927e-06, + "loss": 11.8118, + "step": 34692 + }, + { + "epoch": 1.8891725034544256, + "grad_norm": 0.49573460899014316, + "learning_rate": 1.6058267066152765e-06, + "loss": 11.7629, + "step": 34693 + }, + { + "epoch": 1.8892269574510085, + "grad_norm": 0.48920837880105894, + "learning_rate": 1.6042531186259646e-06, + "loss": 11.7556, + "step": 34694 + }, + { + "epoch": 1.8892814114475915, + "grad_norm": 0.5987163818653659, + "learning_rate": 1.6026802957784691e-06, + "loss": 11.7194, + "step": 34695 + }, + { + "epoch": 1.8893358654441745, + "grad_norm": 0.5411999354481595, + "learning_rate": 1.6011082380850472e-06, + "loss": 11.8184, + "step": 34696 + }, + { + "epoch": 1.8893903194407575, + "grad_norm": 0.5333057676812357, + "learning_rate": 1.5995369455579001e-06, + "loss": 11.8026, + "step": 34697 + }, + { + "epoch": 1.8894447734373405, + "grad_norm": 0.531896850009571, + "learning_rate": 1.5979664182092514e-06, + "loss": 11.8222, + "step": 34698 + }, + { + "epoch": 1.8894992274339235, + "grad_norm": 0.5833299775246741, + "learning_rate": 1.5963966560513465e-06, + "loss": 11.8024, + "step": 34699 + }, + { + "epoch": 1.8895536814305065, + "grad_norm": 0.49339265781876485, + "learning_rate": 1.5948276590963318e-06, + "loss": 11.7774, + "step": 34700 + }, + { + "epoch": 1.8896081354270895, + "grad_norm": 0.5774944496657055, + "learning_rate": 1.5932594273564528e-06, + "loss": 11.8376, + "step": 34701 + }, + { + "epoch": 1.8896625894236725, + "grad_norm": 0.5468619797962504, + "learning_rate": 1.5916919608438885e-06, + "loss": 11.5729, + "step": 34702 + }, + { + "epoch": 1.8897170434202555, + "grad_norm": 0.5493901200405292, + "learning_rate": 1.5901252595708293e-06, + "loss": 11.8508, + "step": 34703 + }, + { + "epoch": 1.8897714974168385, + "grad_norm": 0.5532054980662606, + "learning_rate": 1.5885593235494657e-06, + "loss": 11.6965, + "step": 34704 + }, + { + "epoch": 1.8898259514134215, + "grad_norm": 0.5317276365390414, + "learning_rate": 1.586994152791954e-06, + "loss": 11.7395, + "step": 34705 + }, + { + "epoch": 1.8898804054100045, + "grad_norm": 0.549413117040086, + "learning_rate": 1.5854297473104961e-06, + "loss": 11.7359, + "step": 34706 + }, + { + "epoch": 1.8899348594065875, + "grad_norm": 0.5398246119148612, + "learning_rate": 1.5838661071172268e-06, + "loss": 11.7162, + "step": 34707 + }, + { + "epoch": 1.8899893134031704, + "grad_norm": 0.5557072050995043, + "learning_rate": 1.5823032322243248e-06, + "loss": 11.7413, + "step": 34708 + }, + { + "epoch": 1.8900437673997534, + "grad_norm": 0.5064212637841524, + "learning_rate": 1.580741122643936e-06, + "loss": 11.7254, + "step": 34709 + }, + { + "epoch": 1.8900982213963367, + "grad_norm": 0.5484464114083785, + "learning_rate": 1.5791797783882178e-06, + "loss": 11.7821, + "step": 34710 + }, + { + "epoch": 1.8901526753929196, + "grad_norm": 0.6471395838410182, + "learning_rate": 1.5776191994693046e-06, + "loss": 11.8386, + "step": 34711 + }, + { + "epoch": 1.8902071293895026, + "grad_norm": 0.5416635492934879, + "learning_rate": 1.57605938589932e-06, + "loss": 11.7639, + "step": 34712 + }, + { + "epoch": 1.8902615833860856, + "grad_norm": 0.5751980565896976, + "learning_rate": 1.5745003376903987e-06, + "loss": 11.8457, + "step": 34713 + }, + { + "epoch": 1.8903160373826686, + "grad_norm": 0.4850482014936915, + "learning_rate": 1.5729420548546758e-06, + "loss": 11.775, + "step": 34714 + }, + { + "epoch": 1.8903704913792516, + "grad_norm": 0.5402958731500984, + "learning_rate": 1.5713845374042634e-06, + "loss": 11.7768, + "step": 34715 + }, + { + "epoch": 1.8904249453758348, + "grad_norm": 0.5487319009951255, + "learning_rate": 1.5698277853512634e-06, + "loss": 11.8579, + "step": 34716 + }, + { + "epoch": 1.8904793993724178, + "grad_norm": 0.5120235422922093, + "learning_rate": 1.5682717987077988e-06, + "loss": 11.6429, + "step": 34717 + }, + { + "epoch": 1.8905338533690008, + "grad_norm": 0.5177553700338551, + "learning_rate": 1.5667165774859604e-06, + "loss": 11.8002, + "step": 34718 + }, + { + "epoch": 1.8905883073655838, + "grad_norm": 0.5220602464741136, + "learning_rate": 1.5651621216978274e-06, + "loss": 11.7366, + "step": 34719 + }, + { + "epoch": 1.8906427613621668, + "grad_norm": 0.5365320626779672, + "learning_rate": 1.5636084313555122e-06, + "loss": 11.7242, + "step": 34720 + }, + { + "epoch": 1.8906972153587498, + "grad_norm": 0.5375619959670963, + "learning_rate": 1.5620555064710939e-06, + "loss": 11.7972, + "step": 34721 + }, + { + "epoch": 1.8907516693553328, + "grad_norm": 0.5818501230587102, + "learning_rate": 1.56050334705663e-06, + "loss": 11.6516, + "step": 34722 + }, + { + "epoch": 1.8908061233519158, + "grad_norm": 0.5517358627891079, + "learning_rate": 1.558951953124199e-06, + "loss": 11.7736, + "step": 34723 + }, + { + "epoch": 1.8908605773484988, + "grad_norm": 0.5677645282670712, + "learning_rate": 1.5574013246858587e-06, + "loss": 11.5162, + "step": 34724 + }, + { + "epoch": 1.8909150313450818, + "grad_norm": 0.5050136781134978, + "learning_rate": 1.5558514617536878e-06, + "loss": 11.78, + "step": 34725 + }, + { + "epoch": 1.8909694853416648, + "grad_norm": 0.5766250407978719, + "learning_rate": 1.5543023643397214e-06, + "loss": 11.815, + "step": 34726 + }, + { + "epoch": 1.8910239393382478, + "grad_norm": 0.5869011819929751, + "learning_rate": 1.5527540324560052e-06, + "loss": 11.8199, + "step": 34727 + }, + { + "epoch": 1.8910783933348307, + "grad_norm": 0.5516178557245982, + "learning_rate": 1.5512064661145854e-06, + "loss": 11.8658, + "step": 34728 + }, + { + "epoch": 1.8911328473314137, + "grad_norm": 0.5291889074957623, + "learning_rate": 1.5496596653274965e-06, + "loss": 11.7808, + "step": 34729 + }, + { + "epoch": 1.8911873013279967, + "grad_norm": 0.4693384375666813, + "learning_rate": 1.5481136301067623e-06, + "loss": 11.7714, + "step": 34730 + }, + { + "epoch": 1.8912417553245797, + "grad_norm": 0.5802511886742016, + "learning_rate": 1.5465683604644177e-06, + "loss": 11.8109, + "step": 34731 + }, + { + "epoch": 1.8912962093211627, + "grad_norm": 0.5188283475927622, + "learning_rate": 1.5450238564124531e-06, + "loss": 11.7134, + "step": 34732 + }, + { + "epoch": 1.891350663317746, + "grad_norm": 0.5295055212468985, + "learning_rate": 1.5434801179629034e-06, + "loss": 11.8336, + "step": 34733 + }, + { + "epoch": 1.891405117314329, + "grad_norm": 0.5805184217072353, + "learning_rate": 1.5419371451277476e-06, + "loss": 11.8499, + "step": 34734 + }, + { + "epoch": 1.891459571310912, + "grad_norm": 0.5713144011010932, + "learning_rate": 1.5403949379190096e-06, + "loss": 11.7572, + "step": 34735 + }, + { + "epoch": 1.891514025307495, + "grad_norm": 0.5472873777591046, + "learning_rate": 1.5388534963486801e-06, + "loss": 11.7151, + "step": 34736 + }, + { + "epoch": 1.891568479304078, + "grad_norm": 0.5995463581218854, + "learning_rate": 1.5373128204287268e-06, + "loss": 11.8759, + "step": 34737 + }, + { + "epoch": 1.891622933300661, + "grad_norm": 0.5159348677970845, + "learning_rate": 1.5357729101711517e-06, + "loss": 11.6406, + "step": 34738 + }, + { + "epoch": 1.891677387297244, + "grad_norm": 0.5494847554065374, + "learning_rate": 1.5342337655879112e-06, + "loss": 11.7725, + "step": 34739 + }, + { + "epoch": 1.891731841293827, + "grad_norm": 0.5773632038676855, + "learning_rate": 1.5326953866909855e-06, + "loss": 11.7697, + "step": 34740 + }, + { + "epoch": 1.89178629529041, + "grad_norm": 0.5364709842273305, + "learning_rate": 1.531157773492331e-06, + "loss": 11.7066, + "step": 34741 + }, + { + "epoch": 1.891840749286993, + "grad_norm": 0.4894740729461135, + "learning_rate": 1.529620926003905e-06, + "loss": 11.6814, + "step": 34742 + }, + { + "epoch": 1.891895203283576, + "grad_norm": 0.5604563994796176, + "learning_rate": 1.528084844237665e-06, + "loss": 11.8408, + "step": 34743 + }, + { + "epoch": 1.891949657280159, + "grad_norm": 0.5781736009590769, + "learning_rate": 1.5265495282055453e-06, + "loss": 11.8307, + "step": 34744 + }, + { + "epoch": 1.892004111276742, + "grad_norm": 0.542475410913117, + "learning_rate": 1.5250149779195034e-06, + "loss": 11.7901, + "step": 34745 + }, + { + "epoch": 1.892058565273325, + "grad_norm": 0.5735364564893207, + "learning_rate": 1.5234811933914406e-06, + "loss": 11.6456, + "step": 34746 + }, + { + "epoch": 1.892113019269908, + "grad_norm": 0.5097625218550237, + "learning_rate": 1.521948174633314e-06, + "loss": 11.7917, + "step": 34747 + }, + { + "epoch": 1.892167473266491, + "grad_norm": 0.5090108605281053, + "learning_rate": 1.5204159216570258e-06, + "loss": 11.698, + "step": 34748 + }, + { + "epoch": 1.892221927263074, + "grad_norm": 0.5908728465726407, + "learning_rate": 1.518884434474499e-06, + "loss": 11.8075, + "step": 34749 + }, + { + "epoch": 1.892276381259657, + "grad_norm": 0.5441837765889614, + "learning_rate": 1.5173537130976578e-06, + "loss": 11.7498, + "step": 34750 + }, + { + "epoch": 1.89233083525624, + "grad_norm": 0.523598287999606, + "learning_rate": 1.5158237575383816e-06, + "loss": 11.7017, + "step": 34751 + }, + { + "epoch": 1.892385289252823, + "grad_norm": 0.5610758206507106, + "learning_rate": 1.5142945678085719e-06, + "loss": 11.8497, + "step": 34752 + }, + { + "epoch": 1.892439743249406, + "grad_norm": 0.4921037485056164, + "learning_rate": 1.512766143920119e-06, + "loss": 11.7247, + "step": 34753 + }, + { + "epoch": 1.892494197245989, + "grad_norm": 0.5580199380516381, + "learning_rate": 1.5112384858849137e-06, + "loss": 11.666, + "step": 34754 + }, + { + "epoch": 1.892548651242572, + "grad_norm": 0.5300540095189908, + "learning_rate": 1.5097115937148464e-06, + "loss": 11.8728, + "step": 34755 + }, + { + "epoch": 1.892603105239155, + "grad_norm": 0.5143627162845545, + "learning_rate": 1.5081854674217632e-06, + "loss": 11.8203, + "step": 34756 + }, + { + "epoch": 1.8926575592357382, + "grad_norm": 0.5525976886598135, + "learning_rate": 1.5066601070175657e-06, + "loss": 11.8271, + "step": 34757 + }, + { + "epoch": 1.8927120132323212, + "grad_norm": 0.5606215905430598, + "learning_rate": 1.5051355125140775e-06, + "loss": 11.8279, + "step": 34758 + }, + { + "epoch": 1.8927664672289042, + "grad_norm": 0.5401105593454313, + "learning_rate": 1.5036116839231785e-06, + "loss": 11.7004, + "step": 34759 + }, + { + "epoch": 1.8928209212254872, + "grad_norm": 0.5605481407711291, + "learning_rate": 1.5020886212567254e-06, + "loss": 11.8379, + "step": 34760 + }, + { + "epoch": 1.8928753752220702, + "grad_norm": 0.4993905054794739, + "learning_rate": 1.5005663245265423e-06, + "loss": 11.7567, + "step": 34761 + }, + { + "epoch": 1.8929298292186534, + "grad_norm": 0.5246236421027616, + "learning_rate": 1.499044793744464e-06, + "loss": 11.7809, + "step": 34762 + }, + { + "epoch": 1.8929842832152364, + "grad_norm": 0.5593205693022092, + "learning_rate": 1.497524028922337e-06, + "loss": 11.7566, + "step": 34763 + }, + { + "epoch": 1.8930387372118194, + "grad_norm": 0.5288054314884846, + "learning_rate": 1.4960040300719846e-06, + "loss": 11.7922, + "step": 34764 + }, + { + "epoch": 1.8930931912084024, + "grad_norm": 0.60937038346665, + "learning_rate": 1.494484797205231e-06, + "loss": 11.8041, + "step": 34765 + }, + { + "epoch": 1.8931476452049854, + "grad_norm": 0.5091641163587646, + "learning_rate": 1.492966330333867e-06, + "loss": 11.5661, + "step": 34766 + }, + { + "epoch": 1.8932020992015683, + "grad_norm": 0.5661740741072653, + "learning_rate": 1.4914486294697271e-06, + "loss": 11.8103, + "step": 34767 + }, + { + "epoch": 1.8932565531981513, + "grad_norm": 0.5477024116714037, + "learning_rate": 1.4899316946246022e-06, + "loss": 11.7092, + "step": 34768 + }, + { + "epoch": 1.8933110071947343, + "grad_norm": 0.47994673754978207, + "learning_rate": 1.4884155258102828e-06, + "loss": 11.6568, + "step": 34769 + }, + { + "epoch": 1.8933654611913173, + "grad_norm": 0.5191042016630331, + "learning_rate": 1.4869001230385815e-06, + "loss": 11.7265, + "step": 34770 + }, + { + "epoch": 1.8934199151879003, + "grad_norm": 0.49992275049186446, + "learning_rate": 1.4853854863212447e-06, + "loss": 11.7216, + "step": 34771 + }, + { + "epoch": 1.8934743691844833, + "grad_norm": 0.537041236182039, + "learning_rate": 1.483871615670085e-06, + "loss": 11.7867, + "step": 34772 + }, + { + "epoch": 1.8935288231810663, + "grad_norm": 0.5737665435883587, + "learning_rate": 1.4823585110968486e-06, + "loss": 11.9449, + "step": 34773 + }, + { + "epoch": 1.8935832771776493, + "grad_norm": 0.5464830454808277, + "learning_rate": 1.480846172613315e-06, + "loss": 11.752, + "step": 34774 + }, + { + "epoch": 1.8936377311742323, + "grad_norm": 0.5219663064655751, + "learning_rate": 1.4793346002312524e-06, + "loss": 11.7249, + "step": 34775 + }, + { + "epoch": 1.8936921851708153, + "grad_norm": 0.5716351177460596, + "learning_rate": 1.4778237939623962e-06, + "loss": 11.7522, + "step": 34776 + }, + { + "epoch": 1.8937466391673983, + "grad_norm": 0.526522341466927, + "learning_rate": 1.4763137538185146e-06, + "loss": 11.6462, + "step": 34777 + }, + { + "epoch": 1.8938010931639813, + "grad_norm": 0.5344246023625869, + "learning_rate": 1.4748044798113315e-06, + "loss": 11.612, + "step": 34778 + }, + { + "epoch": 1.8938555471605643, + "grad_norm": 0.590258437698882, + "learning_rate": 1.473295971952604e-06, + "loss": 11.7494, + "step": 34779 + }, + { + "epoch": 1.8939100011571475, + "grad_norm": 0.49805653184473947, + "learning_rate": 1.4717882302540454e-06, + "loss": 11.7127, + "step": 34780 + }, + { + "epoch": 1.8939644551537305, + "grad_norm": 0.5563173794132958, + "learning_rate": 1.4702812547273793e-06, + "loss": 11.7098, + "step": 34781 + }, + { + "epoch": 1.8940189091503135, + "grad_norm": 0.5431409688264829, + "learning_rate": 1.4687750453843296e-06, + "loss": 11.8068, + "step": 34782 + }, + { + "epoch": 1.8940733631468964, + "grad_norm": 0.5453814382357687, + "learning_rate": 1.4672696022366095e-06, + "loss": 11.839, + "step": 34783 + }, + { + "epoch": 1.8941278171434794, + "grad_norm": 0.5824055691910709, + "learning_rate": 1.4657649252959204e-06, + "loss": 11.8123, + "step": 34784 + }, + { + "epoch": 1.8941822711400624, + "grad_norm": 0.5229939638108031, + "learning_rate": 1.4642610145739755e-06, + "loss": 11.6223, + "step": 34785 + }, + { + "epoch": 1.8942367251366456, + "grad_norm": 0.5515915628150705, + "learning_rate": 1.462757870082454e-06, + "loss": 11.7954, + "step": 34786 + }, + { + "epoch": 1.8942911791332286, + "grad_norm": 0.5806034845993756, + "learning_rate": 1.4612554918330579e-06, + "loss": 11.6721, + "step": 34787 + }, + { + "epoch": 1.8943456331298116, + "grad_norm": 0.5664725552415001, + "learning_rate": 1.4597538798374554e-06, + "loss": 11.6593, + "step": 34788 + }, + { + "epoch": 1.8944000871263946, + "grad_norm": 0.5032714377117913, + "learning_rate": 1.4582530341073487e-06, + "loss": 11.7285, + "step": 34789 + }, + { + "epoch": 1.8944545411229776, + "grad_norm": 0.5413865182310977, + "learning_rate": 1.4567529546543833e-06, + "loss": 11.8393, + "step": 34790 + }, + { + "epoch": 1.8945089951195606, + "grad_norm": 0.5406849285193681, + "learning_rate": 1.4552536414902285e-06, + "loss": 11.7651, + "step": 34791 + }, + { + "epoch": 1.8945634491161436, + "grad_norm": 0.5438037667417679, + "learning_rate": 1.453755094626552e-06, + "loss": 11.8675, + "step": 34792 + }, + { + "epoch": 1.8946179031127266, + "grad_norm": 0.48452529742500633, + "learning_rate": 1.4522573140750008e-06, + "loss": 11.7498, + "step": 34793 + }, + { + "epoch": 1.8946723571093096, + "grad_norm": 0.5504460965053491, + "learning_rate": 1.4507602998472204e-06, + "loss": 11.758, + "step": 34794 + }, + { + "epoch": 1.8947268111058926, + "grad_norm": 0.5358206341292236, + "learning_rate": 1.4492640519548572e-06, + "loss": 11.8275, + "step": 34795 + }, + { + "epoch": 1.8947812651024756, + "grad_norm": 0.5243397355325661, + "learning_rate": 1.4477685704095356e-06, + "loss": 11.8239, + "step": 34796 + }, + { + "epoch": 1.8948357190990586, + "grad_norm": 0.5464946328937796, + "learning_rate": 1.4462738552229128e-06, + "loss": 11.7493, + "step": 34797 + }, + { + "epoch": 1.8948901730956416, + "grad_norm": 0.6018134686648586, + "learning_rate": 1.444779906406568e-06, + "loss": 11.7378, + "step": 34798 + }, + { + "epoch": 1.8949446270922246, + "grad_norm": 0.550844955802657, + "learning_rate": 1.4432867239721592e-06, + "loss": 11.7504, + "step": 34799 + }, + { + "epoch": 1.8949990810888075, + "grad_norm": 0.5681524069504759, + "learning_rate": 1.4417943079312768e-06, + "loss": 11.6855, + "step": 34800 + }, + { + "epoch": 1.8950535350853905, + "grad_norm": 0.6080234433877981, + "learning_rate": 1.4403026582955337e-06, + "loss": 11.8908, + "step": 34801 + }, + { + "epoch": 1.8951079890819735, + "grad_norm": 0.5256429397977135, + "learning_rate": 1.4388117750765207e-06, + "loss": 11.6753, + "step": 34802 + }, + { + "epoch": 1.8951624430785567, + "grad_norm": 0.5545665709539822, + "learning_rate": 1.4373216582858285e-06, + "loss": 11.6976, + "step": 34803 + }, + { + "epoch": 1.8952168970751397, + "grad_norm": 0.5014566453021978, + "learning_rate": 1.43583230793507e-06, + "loss": 11.7029, + "step": 34804 + }, + { + "epoch": 1.8952713510717227, + "grad_norm": 0.538242420412618, + "learning_rate": 1.4343437240357916e-06, + "loss": 11.7059, + "step": 34805 + }, + { + "epoch": 1.8953258050683057, + "grad_norm": 0.5268630452561806, + "learning_rate": 1.432855906599595e-06, + "loss": 11.6758, + "step": 34806 + }, + { + "epoch": 1.8953802590648887, + "grad_norm": 0.586093835070148, + "learning_rate": 1.4313688556380377e-06, + "loss": 11.7773, + "step": 34807 + }, + { + "epoch": 1.8954347130614717, + "grad_norm": 0.5295547503407131, + "learning_rate": 1.4298825711626884e-06, + "loss": 11.683, + "step": 34808 + }, + { + "epoch": 1.895489167058055, + "grad_norm": 0.5564299490582887, + "learning_rate": 1.4283970531851044e-06, + "loss": 11.7742, + "step": 34809 + }, + { + "epoch": 1.895543621054638, + "grad_norm": 0.6002470713604036, + "learning_rate": 1.4269123017168318e-06, + "loss": 11.8173, + "step": 34810 + }, + { + "epoch": 1.895598075051221, + "grad_norm": 0.6225778379082046, + "learning_rate": 1.4254283167694172e-06, + "loss": 12.0001, + "step": 34811 + }, + { + "epoch": 1.895652529047804, + "grad_norm": 0.5628654435386629, + "learning_rate": 1.4239450983544068e-06, + "loss": 11.7122, + "step": 34812 + }, + { + "epoch": 1.895706983044387, + "grad_norm": 0.49712718917347337, + "learning_rate": 1.4224626464833246e-06, + "loss": 11.7467, + "step": 34813 + }, + { + "epoch": 1.8957614370409699, + "grad_norm": 0.5111721505786188, + "learning_rate": 1.420980961167717e-06, + "loss": 11.7691, + "step": 34814 + }, + { + "epoch": 1.8958158910375529, + "grad_norm": 0.5331358144303171, + "learning_rate": 1.4195000424190751e-06, + "loss": 11.8331, + "step": 34815 + }, + { + "epoch": 1.8958703450341359, + "grad_norm": 0.6406120784888542, + "learning_rate": 1.418019890248956e-06, + "loss": 11.6602, + "step": 34816 + }, + { + "epoch": 1.8959247990307189, + "grad_norm": 0.5092941387868579, + "learning_rate": 1.4165405046688285e-06, + "loss": 11.7415, + "step": 34817 + }, + { + "epoch": 1.8959792530273019, + "grad_norm": 0.5475993714739043, + "learning_rate": 1.4150618856902164e-06, + "loss": 11.6683, + "step": 34818 + }, + { + "epoch": 1.8960337070238849, + "grad_norm": 0.5204246077710504, + "learning_rate": 1.4135840333246219e-06, + "loss": 11.8169, + "step": 34819 + }, + { + "epoch": 1.8960881610204678, + "grad_norm": 0.5832453250973386, + "learning_rate": 1.4121069475835247e-06, + "loss": 11.8012, + "step": 34820 + }, + { + "epoch": 1.8961426150170508, + "grad_norm": 0.5604925075904181, + "learning_rate": 1.4106306284784265e-06, + "loss": 11.7356, + "step": 34821 + }, + { + "epoch": 1.8961970690136338, + "grad_norm": 0.5131718079393366, + "learning_rate": 1.409155076020785e-06, + "loss": 11.8441, + "step": 34822 + }, + { + "epoch": 1.8962515230102168, + "grad_norm": 0.512722172347258, + "learning_rate": 1.407680290222091e-06, + "loss": 11.812, + "step": 34823 + }, + { + "epoch": 1.8963059770067998, + "grad_norm": 0.6201351620301891, + "learning_rate": 1.406206271093824e-06, + "loss": 11.8832, + "step": 34824 + }, + { + "epoch": 1.8963604310033828, + "grad_norm": 0.5031496210608059, + "learning_rate": 1.4047330186474085e-06, + "loss": 11.6435, + "step": 34825 + }, + { + "epoch": 1.8964148849999658, + "grad_norm": 0.5593957168834991, + "learning_rate": 1.403260532894346e-06, + "loss": 11.8077, + "step": 34826 + }, + { + "epoch": 1.896469338996549, + "grad_norm": 0.5424184431281923, + "learning_rate": 1.4017888138460388e-06, + "loss": 11.7674, + "step": 34827 + }, + { + "epoch": 1.896523792993132, + "grad_norm": 0.5589082158369856, + "learning_rate": 1.4003178615139777e-06, + "loss": 11.7461, + "step": 34828 + }, + { + "epoch": 1.896578246989715, + "grad_norm": 0.5458886550404423, + "learning_rate": 1.3988476759095758e-06, + "loss": 11.7457, + "step": 34829 + }, + { + "epoch": 1.896632700986298, + "grad_norm": 0.5190695557372617, + "learning_rate": 1.3973782570442684e-06, + "loss": 11.7606, + "step": 34830 + }, + { + "epoch": 1.896687154982881, + "grad_norm": 0.5457732202297492, + "learning_rate": 1.3959096049294795e-06, + "loss": 11.7472, + "step": 34831 + }, + { + "epoch": 1.8967416089794642, + "grad_norm": 0.5009467775369594, + "learning_rate": 1.3944417195766335e-06, + "loss": 11.8235, + "step": 34832 + }, + { + "epoch": 1.8967960629760472, + "grad_norm": 0.5759621152852069, + "learning_rate": 1.3929746009971433e-06, + "loss": 11.7632, + "step": 34833 + }, + { + "epoch": 1.8968505169726302, + "grad_norm": 0.5048663985822598, + "learning_rate": 1.3915082492024334e-06, + "loss": 11.7819, + "step": 34834 + }, + { + "epoch": 1.8969049709692132, + "grad_norm": 0.49344584842703065, + "learning_rate": 1.3900426642038721e-06, + "loss": 11.6221, + "step": 34835 + }, + { + "epoch": 1.8969594249657962, + "grad_norm": 0.5586343453616, + "learning_rate": 1.388577846012884e-06, + "loss": 11.8319, + "step": 34836 + }, + { + "epoch": 1.8970138789623792, + "grad_norm": 0.5940360802026438, + "learning_rate": 1.3871137946408597e-06, + "loss": 11.9579, + "step": 34837 + }, + { + "epoch": 1.8970683329589622, + "grad_norm": 0.5076628703369054, + "learning_rate": 1.3856505100991678e-06, + "loss": 11.7297, + "step": 34838 + }, + { + "epoch": 1.8971227869555451, + "grad_norm": 0.6115903399663384, + "learning_rate": 1.3841879923991885e-06, + "loss": 11.8168, + "step": 34839 + }, + { + "epoch": 1.8971772409521281, + "grad_norm": 0.5392599585338174, + "learning_rate": 1.3827262415523124e-06, + "loss": 11.814, + "step": 34840 + }, + { + "epoch": 1.8972316949487111, + "grad_norm": 0.5822170653633048, + "learning_rate": 1.381265257569897e-06, + "loss": 11.772, + "step": 34841 + }, + { + "epoch": 1.8972861489452941, + "grad_norm": 0.5396554052281479, + "learning_rate": 1.3798050404633e-06, + "loss": 11.735, + "step": 34842 + }, + { + "epoch": 1.8973406029418771, + "grad_norm": 0.517078631050961, + "learning_rate": 1.3783455902438792e-06, + "loss": 11.692, + "step": 34843 + }, + { + "epoch": 1.8973950569384601, + "grad_norm": 0.5173106123180002, + "learning_rate": 1.3768869069229695e-06, + "loss": 11.8019, + "step": 34844 + }, + { + "epoch": 1.897449510935043, + "grad_norm": 0.5681062754518449, + "learning_rate": 1.3754289905119288e-06, + "loss": 11.6775, + "step": 34845 + }, + { + "epoch": 1.897503964931626, + "grad_norm": 0.6031698787342817, + "learning_rate": 1.3739718410221037e-06, + "loss": 11.831, + "step": 34846 + }, + { + "epoch": 1.897558418928209, + "grad_norm": 0.49968257780963066, + "learning_rate": 1.372515458464796e-06, + "loss": 11.8007, + "step": 34847 + }, + { + "epoch": 1.897612872924792, + "grad_norm": 0.5205453754959796, + "learning_rate": 1.3710598428513633e-06, + "loss": 11.6712, + "step": 34848 + }, + { + "epoch": 1.897667326921375, + "grad_norm": 0.5691755387365655, + "learning_rate": 1.3696049941930967e-06, + "loss": 11.8457, + "step": 34849 + }, + { + "epoch": 1.8977217809179583, + "grad_norm": 0.49082575073352686, + "learning_rate": 1.3681509125013314e-06, + "loss": 11.7527, + "step": 34850 + }, + { + "epoch": 1.8977762349145413, + "grad_norm": 0.5544832287565732, + "learning_rate": 1.3666975977873697e-06, + "loss": 11.8401, + "step": 34851 + }, + { + "epoch": 1.8978306889111243, + "grad_norm": 0.5811081783586601, + "learning_rate": 1.3652450500625026e-06, + "loss": 11.7448, + "step": 34852 + }, + { + "epoch": 1.8978851429077073, + "grad_norm": 0.5831570482655312, + "learning_rate": 1.3637932693380318e-06, + "loss": 11.7733, + "step": 34853 + }, + { + "epoch": 1.8979395969042903, + "grad_norm": 0.5353805915594402, + "learning_rate": 1.3623422556252375e-06, + "loss": 11.7708, + "step": 34854 + }, + { + "epoch": 1.8979940509008733, + "grad_norm": 0.6718657003779454, + "learning_rate": 1.3608920089354217e-06, + "loss": 11.8133, + "step": 34855 + }, + { + "epoch": 1.8980485048974565, + "grad_norm": 0.5680055538659191, + "learning_rate": 1.3594425292798418e-06, + "loss": 11.7966, + "step": 34856 + }, + { + "epoch": 1.8981029588940395, + "grad_norm": 0.5177834196991179, + "learning_rate": 1.3579938166697892e-06, + "loss": 11.726, + "step": 34857 + }, + { + "epoch": 1.8981574128906225, + "grad_norm": 0.5225753726577712, + "learning_rate": 1.3565458711165101e-06, + "loss": 11.7457, + "step": 34858 + }, + { + "epoch": 1.8982118668872054, + "grad_norm": 0.48728829065372314, + "learning_rate": 1.3550986926312736e-06, + "loss": 11.7121, + "step": 34859 + }, + { + "epoch": 1.8982663208837884, + "grad_norm": 0.5293433402269976, + "learning_rate": 1.353652281225337e-06, + "loss": 11.6661, + "step": 34860 + }, + { + "epoch": 1.8983207748803714, + "grad_norm": 0.6206070880327738, + "learning_rate": 1.3522066369099473e-06, + "loss": 11.8236, + "step": 34861 + }, + { + "epoch": 1.8983752288769544, + "grad_norm": 0.5734420945731776, + "learning_rate": 1.3507617596963396e-06, + "loss": 11.841, + "step": 34862 + }, + { + "epoch": 1.8984296828735374, + "grad_norm": 0.5792092352979687, + "learning_rate": 1.3493176495957493e-06, + "loss": 11.8518, + "step": 34863 + }, + { + "epoch": 1.8984841368701204, + "grad_norm": 0.6182898818515133, + "learning_rate": 1.3478743066194011e-06, + "loss": 11.7188, + "step": 34864 + }, + { + "epoch": 1.8985385908667034, + "grad_norm": 0.5467828552647913, + "learning_rate": 1.3464317307785413e-06, + "loss": 11.7695, + "step": 34865 + }, + { + "epoch": 1.8985930448632864, + "grad_norm": 0.5113883760743647, + "learning_rate": 1.344989922084361e-06, + "loss": 11.7157, + "step": 34866 + }, + { + "epoch": 1.8986474988598694, + "grad_norm": 0.5428993289081379, + "learning_rate": 1.3435488805480845e-06, + "loss": 11.8631, + "step": 34867 + }, + { + "epoch": 1.8987019528564524, + "grad_norm": 0.5534172518042819, + "learning_rate": 1.342108606180925e-06, + "loss": 11.7682, + "step": 34868 + }, + { + "epoch": 1.8987564068530354, + "grad_norm": 0.5327623657273541, + "learning_rate": 1.340669098994063e-06, + "loss": 11.6865, + "step": 34869 + }, + { + "epoch": 1.8988108608496184, + "grad_norm": 0.5025208902523031, + "learning_rate": 1.339230358998711e-06, + "loss": 11.8081, + "step": 34870 + }, + { + "epoch": 1.8988653148462014, + "grad_norm": 0.5593433130700441, + "learning_rate": 1.3377923862060493e-06, + "loss": 11.7972, + "step": 34871 + }, + { + "epoch": 1.8989197688427844, + "grad_norm": 0.5229594386061016, + "learning_rate": 1.3363551806272578e-06, + "loss": 11.7237, + "step": 34872 + }, + { + "epoch": 1.8989742228393676, + "grad_norm": 0.5664251049278367, + "learning_rate": 1.3349187422735166e-06, + "loss": 11.746, + "step": 34873 + }, + { + "epoch": 1.8990286768359506, + "grad_norm": 0.5076171833846765, + "learning_rate": 1.3334830711559832e-06, + "loss": 11.8043, + "step": 34874 + }, + { + "epoch": 1.8990831308325336, + "grad_norm": 0.611139018203511, + "learning_rate": 1.3320481672858487e-06, + "loss": 11.8499, + "step": 34875 + }, + { + "epoch": 1.8991375848291165, + "grad_norm": 0.5403750894731288, + "learning_rate": 1.3306140306742377e-06, + "loss": 11.7036, + "step": 34876 + }, + { + "epoch": 1.8991920388256995, + "grad_norm": 0.5165284539906411, + "learning_rate": 1.32918066133233e-06, + "loss": 11.7363, + "step": 34877 + }, + { + "epoch": 1.8992464928222825, + "grad_norm": 0.6575863872562808, + "learning_rate": 1.32774805927125e-06, + "loss": 11.8829, + "step": 34878 + }, + { + "epoch": 1.8993009468188657, + "grad_norm": 0.5447377032095552, + "learning_rate": 1.3263162245021553e-06, + "loss": 11.8546, + "step": 34879 + }, + { + "epoch": 1.8993554008154487, + "grad_norm": 0.5401548915126622, + "learning_rate": 1.324885157036171e-06, + "loss": 11.8171, + "step": 34880 + }, + { + "epoch": 1.8994098548120317, + "grad_norm": 0.6040337382301092, + "learning_rate": 1.3234548568844318e-06, + "loss": 11.8438, + "step": 34881 + }, + { + "epoch": 1.8994643088086147, + "grad_norm": 0.518418596072161, + "learning_rate": 1.3220253240580516e-06, + "loss": 11.7245, + "step": 34882 + }, + { + "epoch": 1.8995187628051977, + "grad_norm": 0.5037391785542648, + "learning_rate": 1.3205965585681545e-06, + "loss": 11.7445, + "step": 34883 + }, + { + "epoch": 1.8995732168017807, + "grad_norm": 0.530468799518843, + "learning_rate": 1.319168560425843e-06, + "loss": 11.7476, + "step": 34884 + }, + { + "epoch": 1.8996276707983637, + "grad_norm": 0.5375853461548761, + "learning_rate": 1.3177413296422303e-06, + "loss": 11.7333, + "step": 34885 + }, + { + "epoch": 1.8996821247949467, + "grad_norm": 0.5772703460003947, + "learning_rate": 1.3163148662284074e-06, + "loss": 11.837, + "step": 34886 + }, + { + "epoch": 1.8997365787915297, + "grad_norm": 0.4990338475383423, + "learning_rate": 1.3148891701954768e-06, + "loss": 11.7326, + "step": 34887 + }, + { + "epoch": 1.8997910327881127, + "grad_norm": 0.5248471910613208, + "learning_rate": 1.3134642415545074e-06, + "loss": 11.8689, + "step": 34888 + }, + { + "epoch": 1.8998454867846957, + "grad_norm": 0.5597959271870933, + "learning_rate": 1.31204008031659e-06, + "loss": 11.8377, + "step": 34889 + }, + { + "epoch": 1.8998999407812787, + "grad_norm": 0.5805332350728085, + "learning_rate": 1.3106166864928048e-06, + "loss": 11.7712, + "step": 34890 + }, + { + "epoch": 1.8999543947778617, + "grad_norm": 0.5226958549523403, + "learning_rate": 1.30919406009421e-06, + "loss": 11.865, + "step": 34891 + }, + { + "epoch": 1.9000088487744446, + "grad_norm": 0.4992119103294987, + "learning_rate": 1.3077722011318738e-06, + "loss": 11.5739, + "step": 34892 + }, + { + "epoch": 1.9000633027710276, + "grad_norm": 0.5891212049804477, + "learning_rate": 1.3063511096168546e-06, + "loss": 11.8646, + "step": 34893 + }, + { + "epoch": 1.9001177567676106, + "grad_norm": 0.5596632414305603, + "learning_rate": 1.3049307855601877e-06, + "loss": 11.864, + "step": 34894 + }, + { + "epoch": 1.9001722107641936, + "grad_norm": 0.5521483647653863, + "learning_rate": 1.3035112289729422e-06, + "loss": 11.6744, + "step": 34895 + }, + { + "epoch": 1.9002266647607768, + "grad_norm": 0.5352249635968738, + "learning_rate": 1.3020924398661426e-06, + "loss": 11.7585, + "step": 34896 + }, + { + "epoch": 1.9002811187573598, + "grad_norm": 0.7299639279554151, + "learning_rate": 1.3006744182508245e-06, + "loss": 11.6792, + "step": 34897 + }, + { + "epoch": 1.9003355727539428, + "grad_norm": 0.5527762706527218, + "learning_rate": 1.2992571641380124e-06, + "loss": 11.7747, + "step": 34898 + }, + { + "epoch": 1.9003900267505258, + "grad_norm": 0.5634854671760061, + "learning_rate": 1.297840677538731e-06, + "loss": 11.625, + "step": 34899 + }, + { + "epoch": 1.9004444807471088, + "grad_norm": 0.5102684204527475, + "learning_rate": 1.296424958463993e-06, + "loss": 11.72, + "step": 34900 + }, + { + "epoch": 1.9004989347436918, + "grad_norm": 0.5235308436576003, + "learning_rate": 1.2950100069248018e-06, + "loss": 11.8088, + "step": 34901 + }, + { + "epoch": 1.900553388740275, + "grad_norm": 0.5487990887186326, + "learning_rate": 1.2935958229321698e-06, + "loss": 11.6855, + "step": 34902 + }, + { + "epoch": 1.900607842736858, + "grad_norm": 0.5080858442369418, + "learning_rate": 1.292182406497089e-06, + "loss": 11.8006, + "step": 34903 + }, + { + "epoch": 1.900662296733441, + "grad_norm": 0.5764407085076949, + "learning_rate": 1.29076975763055e-06, + "loss": 11.6335, + "step": 34904 + }, + { + "epoch": 1.900716750730024, + "grad_norm": 0.5435858284155105, + "learning_rate": 1.2893578763435443e-06, + "loss": 11.6927, + "step": 34905 + }, + { + "epoch": 1.900771204726607, + "grad_norm": 0.611928884447569, + "learning_rate": 1.2879467626470298e-06, + "loss": 11.519, + "step": 34906 + }, + { + "epoch": 1.90082565872319, + "grad_norm": 0.520866001968785, + "learning_rate": 1.28653641655202e-06, + "loss": 11.747, + "step": 34907 + }, + { + "epoch": 1.900880112719773, + "grad_norm": 0.5933677414894604, + "learning_rate": 1.2851268380694393e-06, + "loss": 11.8656, + "step": 34908 + }, + { + "epoch": 1.900934566716356, + "grad_norm": 0.5794399435556268, + "learning_rate": 1.2837180272102678e-06, + "loss": 11.8481, + "step": 34909 + }, + { + "epoch": 1.900989020712939, + "grad_norm": 0.5638927687898467, + "learning_rate": 1.2823099839854636e-06, + "loss": 11.7772, + "step": 34910 + }, + { + "epoch": 1.901043474709522, + "grad_norm": 0.5723553513297639, + "learning_rate": 1.2809027084059733e-06, + "loss": 11.6843, + "step": 34911 + }, + { + "epoch": 1.901097928706105, + "grad_norm": 0.5956225606954836, + "learning_rate": 1.2794962004827437e-06, + "loss": 11.6345, + "step": 34912 + }, + { + "epoch": 1.901152382702688, + "grad_norm": 0.5187793448726408, + "learning_rate": 1.2780904602266997e-06, + "loss": 11.7602, + "step": 34913 + }, + { + "epoch": 1.901206836699271, + "grad_norm": 0.6625625031443545, + "learning_rate": 1.2766854876487766e-06, + "loss": 11.7441, + "step": 34914 + }, + { + "epoch": 1.901261290695854, + "grad_norm": 0.6301397190904372, + "learning_rate": 1.2752812827599103e-06, + "loss": 11.6538, + "step": 34915 + }, + { + "epoch": 1.901315744692437, + "grad_norm": 0.5569390578766883, + "learning_rate": 1.2738778455710031e-06, + "loss": 11.7446, + "step": 34916 + }, + { + "epoch": 1.90137019868902, + "grad_norm": 0.5747944201762731, + "learning_rate": 1.272475176092991e-06, + "loss": 11.7425, + "step": 34917 + }, + { + "epoch": 1.901424652685603, + "grad_norm": 0.5872191550488564, + "learning_rate": 1.2710732743367536e-06, + "loss": 11.7856, + "step": 34918 + }, + { + "epoch": 1.901479106682186, + "grad_norm": 0.5258704340142019, + "learning_rate": 1.2696721403132273e-06, + "loss": 11.8165, + "step": 34919 + }, + { + "epoch": 1.901533560678769, + "grad_norm": 0.5250324522894962, + "learning_rate": 1.2682717740332694e-06, + "loss": 11.764, + "step": 34920 + }, + { + "epoch": 1.901588014675352, + "grad_norm": 0.5188689434614271, + "learning_rate": 1.266872175507794e-06, + "loss": 11.6813, + "step": 34921 + }, + { + "epoch": 1.901642468671935, + "grad_norm": 0.526756402285969, + "learning_rate": 1.2654733447476807e-06, + "loss": 11.7247, + "step": 34922 + }, + { + "epoch": 1.901696922668518, + "grad_norm": 0.5317676067744854, + "learning_rate": 1.2640752817637992e-06, + "loss": 11.8025, + "step": 34923 + }, + { + "epoch": 1.901751376665101, + "grad_norm": 0.5196343589005462, + "learning_rate": 1.2626779865670403e-06, + "loss": 11.7772, + "step": 34924 + }, + { + "epoch": 1.901805830661684, + "grad_norm": 0.5156707167033562, + "learning_rate": 1.2612814591682399e-06, + "loss": 11.8267, + "step": 34925 + }, + { + "epoch": 1.9018602846582673, + "grad_norm": 0.6338894852183763, + "learning_rate": 1.2598856995782782e-06, + "loss": 11.8186, + "step": 34926 + }, + { + "epoch": 1.9019147386548503, + "grad_norm": 0.5402170246045844, + "learning_rate": 1.258490707808002e-06, + "loss": 11.8636, + "step": 34927 + }, + { + "epoch": 1.9019691926514333, + "grad_norm": 0.5225756205200797, + "learning_rate": 1.2570964838682586e-06, + "loss": 11.7981, + "step": 34928 + }, + { + "epoch": 1.9020236466480163, + "grad_norm": 0.5036981029513681, + "learning_rate": 1.2557030277698945e-06, + "loss": 11.8473, + "step": 34929 + }, + { + "epoch": 1.9020781006445993, + "grad_norm": 0.5302650787095671, + "learning_rate": 1.2543103395237454e-06, + "loss": 11.7383, + "step": 34930 + }, + { + "epoch": 1.9021325546411822, + "grad_norm": 0.5640592816036747, + "learning_rate": 1.2529184191406474e-06, + "loss": 11.8471, + "step": 34931 + }, + { + "epoch": 1.9021870086377652, + "grad_norm": 0.5283403228744005, + "learning_rate": 1.2515272666314027e-06, + "loss": 11.663, + "step": 34932 + }, + { + "epoch": 1.9022414626343482, + "grad_norm": 0.5660956425878506, + "learning_rate": 1.250136882006847e-06, + "loss": 11.7854, + "step": 34933 + }, + { + "epoch": 1.9022959166309312, + "grad_norm": 0.5202634141052236, + "learning_rate": 1.2487472652777832e-06, + "loss": 11.764, + "step": 34934 + }, + { + "epoch": 1.9023503706275142, + "grad_norm": 0.5103911372556909, + "learning_rate": 1.2473584164550245e-06, + "loss": 11.6607, + "step": 34935 + }, + { + "epoch": 1.9024048246240972, + "grad_norm": 0.5184577260363474, + "learning_rate": 1.2459703355493736e-06, + "loss": 11.7746, + "step": 34936 + }, + { + "epoch": 1.9024592786206802, + "grad_norm": 0.5519968341127948, + "learning_rate": 1.2445830225716104e-06, + "loss": 11.6335, + "step": 34937 + }, + { + "epoch": 1.9025137326172632, + "grad_norm": 0.5758726591964479, + "learning_rate": 1.2431964775325267e-06, + "loss": 11.8986, + "step": 34938 + }, + { + "epoch": 1.9025681866138462, + "grad_norm": 0.5901030153822988, + "learning_rate": 1.2418107004429135e-06, + "loss": 11.793, + "step": 34939 + }, + { + "epoch": 1.9026226406104292, + "grad_norm": 0.5969603883994249, + "learning_rate": 1.2404256913135404e-06, + "loss": 11.8338, + "step": 34940 + }, + { + "epoch": 1.9026770946070122, + "grad_norm": 0.5266099654687076, + "learning_rate": 1.2390414501551984e-06, + "loss": 11.7505, + "step": 34941 + }, + { + "epoch": 1.9027315486035952, + "grad_norm": 0.5501564057848704, + "learning_rate": 1.2376579769786124e-06, + "loss": 11.7582, + "step": 34942 + }, + { + "epoch": 1.9027860026001784, + "grad_norm": 0.5913755801523414, + "learning_rate": 1.2362752717945625e-06, + "loss": 11.7118, + "step": 34943 + }, + { + "epoch": 1.9028404565967614, + "grad_norm": 0.5740167269089241, + "learning_rate": 1.2348933346137958e-06, + "loss": 11.7911, + "step": 34944 + }, + { + "epoch": 1.9028949105933444, + "grad_norm": 0.49452260907208945, + "learning_rate": 1.2335121654470705e-06, + "loss": 11.6782, + "step": 34945 + }, + { + "epoch": 1.9029493645899274, + "grad_norm": 0.5383000224602161, + "learning_rate": 1.232131764305111e-06, + "loss": 11.844, + "step": 34946 + }, + { + "epoch": 1.9030038185865104, + "grad_norm": 0.5316669763904741, + "learning_rate": 1.2307521311986536e-06, + "loss": 11.8824, + "step": 34947 + }, + { + "epoch": 1.9030582725830933, + "grad_norm": 0.5807140241198657, + "learning_rate": 1.229373266138445e-06, + "loss": 11.7666, + "step": 34948 + }, + { + "epoch": 1.9031127265796766, + "grad_norm": 0.5643874459202864, + "learning_rate": 1.2279951691351876e-06, + "loss": 11.8332, + "step": 34949 + }, + { + "epoch": 1.9031671805762596, + "grad_norm": 0.5849060923044125, + "learning_rate": 1.2266178401995954e-06, + "loss": 11.7471, + "step": 34950 + }, + { + "epoch": 1.9032216345728425, + "grad_norm": 0.5169059632750649, + "learning_rate": 1.225241279342404e-06, + "loss": 11.8156, + "step": 34951 + }, + { + "epoch": 1.9032760885694255, + "grad_norm": 0.5676538317201736, + "learning_rate": 1.2238654865742938e-06, + "loss": 11.7099, + "step": 34952 + }, + { + "epoch": 1.9033305425660085, + "grad_norm": 0.555361791346935, + "learning_rate": 1.2224904619059673e-06, + "loss": 11.8238, + "step": 34953 + }, + { + "epoch": 1.9033849965625915, + "grad_norm": 0.5340937115298117, + "learning_rate": 1.2211162053481162e-06, + "loss": 11.7109, + "step": 34954 + }, + { + "epoch": 1.9034394505591745, + "grad_norm": 0.5149744115786244, + "learning_rate": 1.2197427169114317e-06, + "loss": 11.6893, + "step": 34955 + }, + { + "epoch": 1.9034939045557575, + "grad_norm": 0.500862523279399, + "learning_rate": 1.2183699966066052e-06, + "loss": 11.765, + "step": 34956 + }, + { + "epoch": 1.9035483585523405, + "grad_norm": 0.5919726300378692, + "learning_rate": 1.216998044444284e-06, + "loss": 11.7503, + "step": 34957 + }, + { + "epoch": 1.9036028125489235, + "grad_norm": 0.5216627565954235, + "learning_rate": 1.2156268604351707e-06, + "loss": 11.7937, + "step": 34958 + }, + { + "epoch": 1.9036572665455065, + "grad_norm": 0.5350789227643593, + "learning_rate": 1.2142564445898896e-06, + "loss": 11.7579, + "step": 34959 + }, + { + "epoch": 1.9037117205420895, + "grad_norm": 0.5672783880458488, + "learning_rate": 1.2128867969191326e-06, + "loss": 11.7553, + "step": 34960 + }, + { + "epoch": 1.9037661745386725, + "grad_norm": 0.4998683821936918, + "learning_rate": 1.2115179174335244e-06, + "loss": 11.796, + "step": 34961 + }, + { + "epoch": 1.9038206285352555, + "grad_norm": 0.49377638318305933, + "learning_rate": 1.2101498061437234e-06, + "loss": 11.7123, + "step": 34962 + }, + { + "epoch": 1.9038750825318385, + "grad_norm": 0.5317072407810388, + "learning_rate": 1.208782463060365e-06, + "loss": 11.7181, + "step": 34963 + }, + { + "epoch": 1.9039295365284215, + "grad_norm": 0.6067866019767924, + "learning_rate": 1.2074158881940744e-06, + "loss": 11.7319, + "step": 34964 + }, + { + "epoch": 1.9039839905250044, + "grad_norm": 0.5461599204792117, + "learning_rate": 1.2060500815554877e-06, + "loss": 11.7649, + "step": 34965 + }, + { + "epoch": 1.9040384445215877, + "grad_norm": 0.5512674428664004, + "learning_rate": 1.2046850431552293e-06, + "loss": 11.8201, + "step": 34966 + }, + { + "epoch": 1.9040928985181707, + "grad_norm": 0.551134458254267, + "learning_rate": 1.203320773003902e-06, + "loss": 11.7467, + "step": 34967 + }, + { + "epoch": 1.9041473525147536, + "grad_norm": 0.49863207282143507, + "learning_rate": 1.2019572711121196e-06, + "loss": 11.659, + "step": 34968 + }, + { + "epoch": 1.9042018065113366, + "grad_norm": 0.5695888542949131, + "learning_rate": 1.2005945374904958e-06, + "loss": 11.7296, + "step": 34969 + }, + { + "epoch": 1.9042562605079196, + "grad_norm": 0.5737764133068176, + "learning_rate": 1.199232572149611e-06, + "loss": 11.8685, + "step": 34970 + }, + { + "epoch": 1.9043107145045026, + "grad_norm": 0.6623615169926745, + "learning_rate": 1.197871375100068e-06, + "loss": 11.8326, + "step": 34971 + }, + { + "epoch": 1.9043651685010858, + "grad_norm": 0.5531808203617824, + "learning_rate": 1.196510946352436e-06, + "loss": 11.6211, + "step": 34972 + }, + { + "epoch": 1.9044196224976688, + "grad_norm": 0.5011029737950699, + "learning_rate": 1.1951512859173064e-06, + "loss": 11.7735, + "step": 34973 + }, + { + "epoch": 1.9044740764942518, + "grad_norm": 0.5635643958075988, + "learning_rate": 1.1937923938052598e-06, + "loss": 11.7972, + "step": 34974 + }, + { + "epoch": 1.9045285304908348, + "grad_norm": 0.4859862049418657, + "learning_rate": 1.1924342700268431e-06, + "loss": 11.7243, + "step": 34975 + }, + { + "epoch": 1.9045829844874178, + "grad_norm": 0.5534201210675502, + "learning_rate": 1.191076914592626e-06, + "loss": 11.9276, + "step": 34976 + }, + { + "epoch": 1.9046374384840008, + "grad_norm": 0.5011984913829557, + "learning_rate": 1.1897203275131664e-06, + "loss": 11.7358, + "step": 34977 + }, + { + "epoch": 1.9046918924805838, + "grad_norm": 0.5675792748840617, + "learning_rate": 1.1883645087990224e-06, + "loss": 11.769, + "step": 34978 + }, + { + "epoch": 1.9047463464771668, + "grad_norm": 0.5026309591187752, + "learning_rate": 1.1870094584607195e-06, + "loss": 11.5996, + "step": 34979 + }, + { + "epoch": 1.9048008004737498, + "grad_norm": 0.5581656370747227, + "learning_rate": 1.1856551765088042e-06, + "loss": 11.77, + "step": 34980 + }, + { + "epoch": 1.9048552544703328, + "grad_norm": 0.5663165010056022, + "learning_rate": 1.1843016629538128e-06, + "loss": 11.7731, + "step": 34981 + }, + { + "epoch": 1.9049097084669158, + "grad_norm": 0.5172392838909518, + "learning_rate": 1.1829489178062481e-06, + "loss": 11.695, + "step": 34982 + }, + { + "epoch": 1.9049641624634988, + "grad_norm": 0.5497732495772556, + "learning_rate": 1.181596941076646e-06, + "loss": 11.8683, + "step": 34983 + }, + { + "epoch": 1.9050186164600817, + "grad_norm": 0.5413920720232699, + "learning_rate": 1.18024573277552e-06, + "loss": 11.6641, + "step": 34984 + }, + { + "epoch": 1.9050730704566647, + "grad_norm": 0.547364218464008, + "learning_rate": 1.1788952929133846e-06, + "loss": 11.6054, + "step": 34985 + }, + { + "epoch": 1.9051275244532477, + "grad_norm": 0.5487273460843867, + "learning_rate": 1.1775456215007197e-06, + "loss": 11.7632, + "step": 34986 + }, + { + "epoch": 1.9051819784498307, + "grad_norm": 0.5150943124030233, + "learning_rate": 1.1761967185480394e-06, + "loss": 11.7124, + "step": 34987 + }, + { + "epoch": 1.9052364324464137, + "grad_norm": 0.5506035108844471, + "learning_rate": 1.174848584065824e-06, + "loss": 11.584, + "step": 34988 + }, + { + "epoch": 1.9052908864429967, + "grad_norm": 0.5648535428830471, + "learning_rate": 1.1735012180645543e-06, + "loss": 11.7641, + "step": 34989 + }, + { + "epoch": 1.90534534043958, + "grad_norm": 0.5546063878312949, + "learning_rate": 1.1721546205547218e-06, + "loss": 11.8059, + "step": 34990 + }, + { + "epoch": 1.905399794436163, + "grad_norm": 0.5946641378680784, + "learning_rate": 1.1708087915467846e-06, + "loss": 11.8523, + "step": 34991 + }, + { + "epoch": 1.905454248432746, + "grad_norm": 0.5573673365857627, + "learning_rate": 1.1694637310512125e-06, + "loss": 11.6701, + "step": 34992 + }, + { + "epoch": 1.905508702429329, + "grad_norm": 0.5404327531472225, + "learning_rate": 1.1681194390784634e-06, + "loss": 11.8141, + "step": 34993 + }, + { + "epoch": 1.905563156425912, + "grad_norm": 0.542847902451296, + "learning_rate": 1.1667759156389847e-06, + "loss": 11.7001, + "step": 34994 + }, + { + "epoch": 1.9056176104224951, + "grad_norm": 0.5595541408136204, + "learning_rate": 1.1654331607432457e-06, + "loss": 11.7201, + "step": 34995 + }, + { + "epoch": 1.905672064419078, + "grad_norm": 0.5288638373784262, + "learning_rate": 1.1640911744016602e-06, + "loss": 11.8266, + "step": 34996 + }, + { + "epoch": 1.905726518415661, + "grad_norm": 0.5970397485499701, + "learning_rate": 1.1627499566246869e-06, + "loss": 11.7677, + "step": 34997 + }, + { + "epoch": 1.905780972412244, + "grad_norm": 0.5616017549771714, + "learning_rate": 1.1614095074227282e-06, + "loss": 11.7861, + "step": 34998 + }, + { + "epoch": 1.905835426408827, + "grad_norm": 0.5635818877416472, + "learning_rate": 1.1600698268062315e-06, + "loss": 11.8511, + "step": 34999 + }, + { + "epoch": 1.90588988040541, + "grad_norm": 0.5023035649247424, + "learning_rate": 1.1587309147856217e-06, + "loss": 11.7506, + "step": 35000 + }, + { + "epoch": 1.905944334401993, + "grad_norm": 0.613569279577508, + "learning_rate": 1.1573927713712796e-06, + "loss": 11.7161, + "step": 35001 + }, + { + "epoch": 1.905998788398576, + "grad_norm": 0.5262782385154369, + "learning_rate": 1.156055396573641e-06, + "loss": 11.7928, + "step": 35002 + }, + { + "epoch": 1.906053242395159, + "grad_norm": 0.5671827952736994, + "learning_rate": 1.1547187904030866e-06, + "loss": 11.7986, + "step": 35003 + }, + { + "epoch": 1.906107696391742, + "grad_norm": 0.4781231166406721, + "learning_rate": 1.153382952870008e-06, + "loss": 11.6468, + "step": 35004 + }, + { + "epoch": 1.906162150388325, + "grad_norm": 0.579281572459171, + "learning_rate": 1.152047883984808e-06, + "loss": 11.825, + "step": 35005 + }, + { + "epoch": 1.906216604384908, + "grad_norm": 0.5480966016272341, + "learning_rate": 1.1507135837578453e-06, + "loss": 11.8069, + "step": 35006 + }, + { + "epoch": 1.906271058381491, + "grad_norm": 0.5126263720681261, + "learning_rate": 1.1493800521995334e-06, + "loss": 11.7038, + "step": 35007 + }, + { + "epoch": 1.906325512378074, + "grad_norm": 0.6769674666132978, + "learning_rate": 1.1480472893201977e-06, + "loss": 11.7818, + "step": 35008 + }, + { + "epoch": 1.906379966374657, + "grad_norm": 0.5307353123798643, + "learning_rate": 1.1467152951302407e-06, + "loss": 11.7825, + "step": 35009 + }, + { + "epoch": 1.90643442037124, + "grad_norm": 0.5434710656450639, + "learning_rate": 1.1453840696399986e-06, + "loss": 11.7149, + "step": 35010 + }, + { + "epoch": 1.906488874367823, + "grad_norm": 0.516724319134081, + "learning_rate": 1.144053612859819e-06, + "loss": 11.7374, + "step": 35011 + }, + { + "epoch": 1.906543328364406, + "grad_norm": 0.5074130646145986, + "learning_rate": 1.142723924800071e-06, + "loss": 11.7568, + "step": 35012 + }, + { + "epoch": 1.9065977823609892, + "grad_norm": 0.5503949663861676, + "learning_rate": 1.1413950054710687e-06, + "loss": 11.849, + "step": 35013 + }, + { + "epoch": 1.9066522363575722, + "grad_norm": 0.5871422964262129, + "learning_rate": 1.1400668548831595e-06, + "loss": 11.6125, + "step": 35014 + }, + { + "epoch": 1.9067066903541552, + "grad_norm": 0.5684713531708555, + "learning_rate": 1.1387394730466793e-06, + "loss": 11.7334, + "step": 35015 + }, + { + "epoch": 1.9067611443507382, + "grad_norm": 0.5339253585247791, + "learning_rate": 1.1374128599719314e-06, + "loss": 11.6277, + "step": 35016 + }, + { + "epoch": 1.9068155983473212, + "grad_norm": 0.6252833041214423, + "learning_rate": 1.1360870156692405e-06, + "loss": 11.7459, + "step": 35017 + }, + { + "epoch": 1.9068700523439042, + "grad_norm": 0.5034389970866846, + "learning_rate": 1.1347619401489206e-06, + "loss": 11.7586, + "step": 35018 + }, + { + "epoch": 1.9069245063404874, + "grad_norm": 0.47682379643605904, + "learning_rate": 1.1334376334212638e-06, + "loss": 11.759, + "step": 35019 + }, + { + "epoch": 1.9069789603370704, + "grad_norm": 0.5630438106014305, + "learning_rate": 1.1321140954965836e-06, + "loss": 11.7311, + "step": 35020 + }, + { + "epoch": 1.9070334143336534, + "grad_norm": 0.5712070275988911, + "learning_rate": 1.13079132638515e-06, + "loss": 11.7696, + "step": 35021 + }, + { + "epoch": 1.9070878683302364, + "grad_norm": 0.5284279871716235, + "learning_rate": 1.1294693260972877e-06, + "loss": 11.7122, + "step": 35022 + }, + { + "epoch": 1.9071423223268194, + "grad_norm": 0.5604633493767001, + "learning_rate": 1.1281480946432332e-06, + "loss": 11.8063, + "step": 35023 + }, + { + "epoch": 1.9071967763234023, + "grad_norm": 0.5505765064327027, + "learning_rate": 1.1268276320332893e-06, + "loss": 11.7465, + "step": 35024 + }, + { + "epoch": 1.9072512303199853, + "grad_norm": 0.5023365981449948, + "learning_rate": 1.1255079382777144e-06, + "loss": 11.7987, + "step": 35025 + }, + { + "epoch": 1.9073056843165683, + "grad_norm": 0.5342114329208129, + "learning_rate": 1.1241890133867671e-06, + "loss": 11.7392, + "step": 35026 + }, + { + "epoch": 1.9073601383131513, + "grad_norm": 0.5126656077229695, + "learning_rate": 1.1228708573707058e-06, + "loss": 11.819, + "step": 35027 + }, + { + "epoch": 1.9074145923097343, + "grad_norm": 0.5578011430400167, + "learning_rate": 1.121553470239789e-06, + "loss": 11.8733, + "step": 35028 + }, + { + "epoch": 1.9074690463063173, + "grad_norm": 0.5135564892159001, + "learning_rate": 1.1202368520042527e-06, + "loss": 11.7069, + "step": 35029 + }, + { + "epoch": 1.9075235003029003, + "grad_norm": 0.5355118246894365, + "learning_rate": 1.1189210026743225e-06, + "loss": 11.7699, + "step": 35030 + }, + { + "epoch": 1.9075779542994833, + "grad_norm": 0.599579854014161, + "learning_rate": 1.1176059222602676e-06, + "loss": 11.919, + "step": 35031 + }, + { + "epoch": 1.9076324082960663, + "grad_norm": 0.5283005049642556, + "learning_rate": 1.1162916107722798e-06, + "loss": 11.8298, + "step": 35032 + }, + { + "epoch": 1.9076868622926493, + "grad_norm": 0.5101769098963235, + "learning_rate": 1.1149780682205957e-06, + "loss": 11.7398, + "step": 35033 + }, + { + "epoch": 1.9077413162892323, + "grad_norm": 0.5498331710435225, + "learning_rate": 1.113665294615418e-06, + "loss": 11.8011, + "step": 35034 + }, + { + "epoch": 1.9077957702858153, + "grad_norm": 0.5832457360829273, + "learning_rate": 1.112353289966961e-06, + "loss": 11.6953, + "step": 35035 + }, + { + "epoch": 1.9078502242823985, + "grad_norm": 0.5279775767510355, + "learning_rate": 1.1110420542854384e-06, + "loss": 11.8075, + "step": 35036 + }, + { + "epoch": 1.9079046782789815, + "grad_norm": 0.5205927459750868, + "learning_rate": 1.109731587581031e-06, + "loss": 11.7359, + "step": 35037 + }, + { + "epoch": 1.9079591322755645, + "grad_norm": 0.5881479585714262, + "learning_rate": 1.108421889863931e-06, + "loss": 11.7509, + "step": 35038 + }, + { + "epoch": 1.9080135862721475, + "grad_norm": 0.5403880405130227, + "learning_rate": 1.1071129611443408e-06, + "loss": 11.8494, + "step": 35039 + }, + { + "epoch": 1.9080680402687304, + "grad_norm": 0.6171662767645459, + "learning_rate": 1.1058048014324084e-06, + "loss": 11.9234, + "step": 35040 + }, + { + "epoch": 1.9081224942653134, + "grad_norm": 0.5373698224508806, + "learning_rate": 1.1044974107383255e-06, + "loss": 11.6986, + "step": 35041 + }, + { + "epoch": 1.9081769482618967, + "grad_norm": 0.5334942218426715, + "learning_rate": 1.1031907890722615e-06, + "loss": 11.7383, + "step": 35042 + }, + { + "epoch": 1.9082314022584796, + "grad_norm": 0.5215693598715035, + "learning_rate": 1.101884936444364e-06, + "loss": 11.5175, + "step": 35043 + }, + { + "epoch": 1.9082858562550626, + "grad_norm": 0.5439630012830671, + "learning_rate": 1.1005798528648025e-06, + "loss": 11.6718, + "step": 35044 + }, + { + "epoch": 1.9083403102516456, + "grad_norm": 0.5169738020244817, + "learning_rate": 1.0992755383437137e-06, + "loss": 11.692, + "step": 35045 + }, + { + "epoch": 1.9083947642482286, + "grad_norm": 0.5267399179343627, + "learning_rate": 1.0979719928912446e-06, + "loss": 11.7187, + "step": 35046 + }, + { + "epoch": 1.9084492182448116, + "grad_norm": 0.5240608640848374, + "learning_rate": 1.0966692165175318e-06, + "loss": 11.7991, + "step": 35047 + }, + { + "epoch": 1.9085036722413946, + "grad_norm": 0.5479382086282198, + "learning_rate": 1.0953672092327006e-06, + "loss": 11.8021, + "step": 35048 + }, + { + "epoch": 1.9085581262379776, + "grad_norm": 0.5635261883703365, + "learning_rate": 1.094065971046887e-06, + "loss": 11.6825, + "step": 35049 + }, + { + "epoch": 1.9086125802345606, + "grad_norm": 0.5543134097400486, + "learning_rate": 1.0927655019701943e-06, + "loss": 11.9005, + "step": 35050 + }, + { + "epoch": 1.9086670342311436, + "grad_norm": 0.5595174971811848, + "learning_rate": 1.0914658020127477e-06, + "loss": 11.8898, + "step": 35051 + }, + { + "epoch": 1.9087214882277266, + "grad_norm": 0.5394529378112851, + "learning_rate": 1.0901668711846614e-06, + "loss": 11.7729, + "step": 35052 + }, + { + "epoch": 1.9087759422243096, + "grad_norm": 0.5580037430524944, + "learning_rate": 1.0888687094960049e-06, + "loss": 11.7652, + "step": 35053 + }, + { + "epoch": 1.9088303962208926, + "grad_norm": 0.5193653645595908, + "learning_rate": 1.0875713169569146e-06, + "loss": 11.7774, + "step": 35054 + }, + { + "epoch": 1.9088848502174756, + "grad_norm": 0.5520991921498845, + "learning_rate": 1.0862746935774381e-06, + "loss": 11.6531, + "step": 35055 + }, + { + "epoch": 1.9089393042140586, + "grad_norm": 0.5447856659242378, + "learning_rate": 1.0849788393676896e-06, + "loss": 11.8382, + "step": 35056 + }, + { + "epoch": 1.9089937582106415, + "grad_norm": 0.5596622529204539, + "learning_rate": 1.0836837543377165e-06, + "loss": 11.8003, + "step": 35057 + }, + { + "epoch": 1.9090482122072245, + "grad_norm": 0.5716859031427447, + "learning_rate": 1.082389438497622e-06, + "loss": 11.739, + "step": 35058 + }, + { + "epoch": 1.9091026662038075, + "grad_norm": 0.5529552089456344, + "learning_rate": 1.0810958918574533e-06, + "loss": 11.7993, + "step": 35059 + }, + { + "epoch": 1.9091571202003907, + "grad_norm": 0.590923171759506, + "learning_rate": 1.0798031144272691e-06, + "loss": 11.7941, + "step": 35060 + }, + { + "epoch": 1.9092115741969737, + "grad_norm": 0.503203522617532, + "learning_rate": 1.0785111062171282e-06, + "loss": 11.6206, + "step": 35061 + }, + { + "epoch": 1.9092660281935567, + "grad_norm": 0.5159708977929509, + "learning_rate": 1.0772198672370782e-06, + "loss": 11.775, + "step": 35062 + }, + { + "epoch": 1.9093204821901397, + "grad_norm": 0.5641158346192275, + "learning_rate": 1.0759293974971441e-06, + "loss": 11.8559, + "step": 35063 + }, + { + "epoch": 1.9093749361867227, + "grad_norm": 0.5603151273798148, + "learning_rate": 1.0746396970073846e-06, + "loss": 11.6271, + "step": 35064 + }, + { + "epoch": 1.909429390183306, + "grad_norm": 0.4953463292263194, + "learning_rate": 1.073350765777803e-06, + "loss": 11.814, + "step": 35065 + }, + { + "epoch": 1.909483844179889, + "grad_norm": 0.5693987504248924, + "learning_rate": 1.0720626038184467e-06, + "loss": 11.8154, + "step": 35066 + }, + { + "epoch": 1.909538298176472, + "grad_norm": 0.49598276689437376, + "learning_rate": 1.0707752111393189e-06, + "loss": 11.6372, + "step": 35067 + }, + { + "epoch": 1.909592752173055, + "grad_norm": 0.5512118148892549, + "learning_rate": 1.0694885877504334e-06, + "loss": 11.9031, + "step": 35068 + }, + { + "epoch": 1.909647206169638, + "grad_norm": 0.5783836042091899, + "learning_rate": 1.0682027336617939e-06, + "loss": 11.8721, + "step": 35069 + }, + { + "epoch": 1.909701660166221, + "grad_norm": 0.5378272435245747, + "learning_rate": 1.0669176488834032e-06, + "loss": 11.5571, + "step": 35070 + }, + { + "epoch": 1.9097561141628039, + "grad_norm": 0.5237454585595565, + "learning_rate": 1.0656333334252532e-06, + "loss": 11.6939, + "step": 35071 + }, + { + "epoch": 1.9098105681593869, + "grad_norm": 0.5758661119362838, + "learning_rate": 1.064349787297325e-06, + "loss": 11.7597, + "step": 35072 + }, + { + "epoch": 1.9098650221559699, + "grad_norm": 0.5862225418671749, + "learning_rate": 1.0630670105096108e-06, + "loss": 11.7829, + "step": 35073 + }, + { + "epoch": 1.9099194761525529, + "grad_norm": 0.6267844827987517, + "learning_rate": 1.0617850030720688e-06, + "loss": 11.8347, + "step": 35074 + }, + { + "epoch": 1.9099739301491359, + "grad_norm": 0.5373539663053456, + "learning_rate": 1.0605037649946915e-06, + "loss": 11.762, + "step": 35075 + }, + { + "epoch": 1.9100283841457188, + "grad_norm": 0.561820988168871, + "learning_rate": 1.059223296287415e-06, + "loss": 11.7992, + "step": 35076 + }, + { + "epoch": 1.9100828381423018, + "grad_norm": 0.5095559453567118, + "learning_rate": 1.0579435969602203e-06, + "loss": 11.7944, + "step": 35077 + }, + { + "epoch": 1.9101372921388848, + "grad_norm": 0.4888594334549358, + "learning_rate": 1.056664667023044e-06, + "loss": 11.8618, + "step": 35078 + }, + { + "epoch": 1.9101917461354678, + "grad_norm": 0.5353049185426352, + "learning_rate": 1.0553865064858448e-06, + "loss": 11.7997, + "step": 35079 + }, + { + "epoch": 1.9102462001320508, + "grad_norm": 0.5223524984841007, + "learning_rate": 1.0541091153585481e-06, + "loss": 11.6515, + "step": 35080 + }, + { + "epoch": 1.9103006541286338, + "grad_norm": 0.5663943238723873, + "learning_rate": 1.0528324936510902e-06, + "loss": 11.7447, + "step": 35081 + }, + { + "epoch": 1.9103551081252168, + "grad_norm": 0.5433977328800638, + "learning_rate": 1.0515566413733969e-06, + "loss": 11.8036, + "step": 35082 + }, + { + "epoch": 1.9104095621218, + "grad_norm": 0.5724687132210393, + "learning_rate": 1.0502815585354042e-06, + "loss": 11.7263, + "step": 35083 + }, + { + "epoch": 1.910464016118383, + "grad_norm": 0.5410814429033893, + "learning_rate": 1.0490072451470046e-06, + "loss": 11.8019, + "step": 35084 + }, + { + "epoch": 1.910518470114966, + "grad_norm": 0.5795403204573999, + "learning_rate": 1.0477337012181232e-06, + "loss": 11.864, + "step": 35085 + }, + { + "epoch": 1.910572924111549, + "grad_norm": 0.5843096261051132, + "learning_rate": 1.0464609267586522e-06, + "loss": 11.8717, + "step": 35086 + }, + { + "epoch": 1.910627378108132, + "grad_norm": 0.5547838288076281, + "learning_rate": 1.045188921778506e-06, + "loss": 11.7711, + "step": 35087 + }, + { + "epoch": 1.910681832104715, + "grad_norm": 0.5089451102713335, + "learning_rate": 1.0439176862875654e-06, + "loss": 11.6835, + "step": 35088 + }, + { + "epoch": 1.9107362861012982, + "grad_norm": 0.6201359920253151, + "learning_rate": 1.0426472202957004e-06, + "loss": 11.6396, + "step": 35089 + }, + { + "epoch": 1.9107907400978812, + "grad_norm": 0.5881591957683492, + "learning_rate": 1.0413775238128253e-06, + "loss": 11.8939, + "step": 35090 + }, + { + "epoch": 1.9108451940944642, + "grad_norm": 0.5522354787812104, + "learning_rate": 1.0401085968487766e-06, + "loss": 11.7936, + "step": 35091 + }, + { + "epoch": 1.9108996480910472, + "grad_norm": 0.5235783192898293, + "learning_rate": 1.0388404394134577e-06, + "loss": 11.7094, + "step": 35092 + }, + { + "epoch": 1.9109541020876302, + "grad_norm": 0.5104367125330096, + "learning_rate": 1.0375730515167047e-06, + "loss": 11.7649, + "step": 35093 + }, + { + "epoch": 1.9110085560842132, + "grad_norm": 0.6352311273546106, + "learning_rate": 1.0363064331683769e-06, + "loss": 11.9029, + "step": 35094 + }, + { + "epoch": 1.9110630100807962, + "grad_norm": 0.5808278235580587, + "learning_rate": 1.0350405843783216e-06, + "loss": 11.7317, + "step": 35095 + }, + { + "epoch": 1.9111174640773791, + "grad_norm": 0.5439018501035373, + "learning_rate": 1.0337755051563868e-06, + "loss": 11.7704, + "step": 35096 + }, + { + "epoch": 1.9111719180739621, + "grad_norm": 0.5535290140400511, + "learning_rate": 1.0325111955124201e-06, + "loss": 11.7334, + "step": 35097 + }, + { + "epoch": 1.9112263720705451, + "grad_norm": 0.5355060497331602, + "learning_rate": 1.0312476554562356e-06, + "loss": 11.7896, + "step": 35098 + }, + { + "epoch": 1.9112808260671281, + "grad_norm": 0.5190122938502716, + "learning_rate": 1.02998488499767e-06, + "loss": 11.8073, + "step": 35099 + }, + { + "epoch": 1.9113352800637111, + "grad_norm": 0.5204723389919724, + "learning_rate": 1.028722884146538e-06, + "loss": 11.7878, + "step": 35100 + }, + { + "epoch": 1.911389734060294, + "grad_norm": 0.5564546198705889, + "learning_rate": 1.0274616529126536e-06, + "loss": 11.8006, + "step": 35101 + }, + { + "epoch": 1.911444188056877, + "grad_norm": 0.48794837984224265, + "learning_rate": 1.0262011913058312e-06, + "loss": 11.6623, + "step": 35102 + }, + { + "epoch": 1.91149864205346, + "grad_norm": 0.5390515935615477, + "learning_rate": 1.024941499335863e-06, + "loss": 11.828, + "step": 35103 + }, + { + "epoch": 1.911553096050043, + "grad_norm": 0.5235486524429563, + "learning_rate": 1.0236825770125414e-06, + "loss": 11.7012, + "step": 35104 + }, + { + "epoch": 1.911607550046626, + "grad_norm": 0.6402393695134693, + "learning_rate": 1.0224244243456693e-06, + "loss": 11.7515, + "step": 35105 + }, + { + "epoch": 1.9116620040432093, + "grad_norm": 0.515391054436438, + "learning_rate": 1.0211670413450169e-06, + "loss": 11.8416, + "step": 35106 + }, + { + "epoch": 1.9117164580397923, + "grad_norm": 0.49968832184637235, + "learning_rate": 1.0199104280203763e-06, + "loss": 11.768, + "step": 35107 + }, + { + "epoch": 1.9117709120363753, + "grad_norm": 0.6274243618865167, + "learning_rate": 1.0186545843815065e-06, + "loss": 11.7803, + "step": 35108 + }, + { + "epoch": 1.9118253660329583, + "grad_norm": 0.5097495364866304, + "learning_rate": 1.0173995104381773e-06, + "loss": 11.779, + "step": 35109 + }, + { + "epoch": 1.9118798200295413, + "grad_norm": 0.5261417959977284, + "learning_rate": 1.0161452062001587e-06, + "loss": 11.8723, + "step": 35110 + }, + { + "epoch": 1.9119342740261243, + "grad_norm": 0.5459633910347668, + "learning_rate": 1.0148916716771761e-06, + "loss": 11.8145, + "step": 35111 + }, + { + "epoch": 1.9119887280227075, + "grad_norm": 0.5602467867364136, + "learning_rate": 1.0136389068790108e-06, + "loss": 11.779, + "step": 35112 + }, + { + "epoch": 1.9120431820192905, + "grad_norm": 0.5941671271494218, + "learning_rate": 1.0123869118153883e-06, + "loss": 11.6405, + "step": 35113 + }, + { + "epoch": 1.9120976360158735, + "grad_norm": 0.5483298253064384, + "learning_rate": 1.0111356864960341e-06, + "loss": 11.6138, + "step": 35114 + }, + { + "epoch": 1.9121520900124565, + "grad_norm": 0.5154942674498695, + "learning_rate": 1.0098852309307072e-06, + "loss": 11.8123, + "step": 35115 + }, + { + "epoch": 1.9122065440090394, + "grad_norm": 0.5525543639258955, + "learning_rate": 1.0086355451290997e-06, + "loss": 11.8114, + "step": 35116 + }, + { + "epoch": 1.9122609980056224, + "grad_norm": 0.5227600270568211, + "learning_rate": 1.007386629100948e-06, + "loss": 11.7644, + "step": 35117 + }, + { + "epoch": 1.9123154520022054, + "grad_norm": 0.5627995273649774, + "learning_rate": 1.006138482855956e-06, + "loss": 11.6594, + "step": 35118 + }, + { + "epoch": 1.9123699059987884, + "grad_norm": 0.5338480996883471, + "learning_rate": 1.0048911064038268e-06, + "loss": 11.7901, + "step": 35119 + }, + { + "epoch": 1.9124243599953714, + "grad_norm": 0.5442165476819657, + "learning_rate": 1.003644499754275e-06, + "loss": 11.6895, + "step": 35120 + }, + { + "epoch": 1.9124788139919544, + "grad_norm": 0.5476404177077041, + "learning_rate": 1.0023986629169813e-06, + "loss": 11.7744, + "step": 35121 + }, + { + "epoch": 1.9125332679885374, + "grad_norm": 0.5437739273626775, + "learning_rate": 1.0011535959016494e-06, + "loss": 11.7232, + "step": 35122 + }, + { + "epoch": 1.9125877219851204, + "grad_norm": 0.5599405951560065, + "learning_rate": 9.999092987179381e-07, + "loss": 11.7276, + "step": 35123 + }, + { + "epoch": 1.9126421759817034, + "grad_norm": 0.49533066232163964, + "learning_rate": 9.986657713755288e-07, + "loss": 11.7487, + "step": 35124 + }, + { + "epoch": 1.9126966299782864, + "grad_norm": 0.5326189323704272, + "learning_rate": 9.974230138841024e-07, + "loss": 11.7116, + "step": 35125 + }, + { + "epoch": 1.9127510839748694, + "grad_norm": 0.5294952348997704, + "learning_rate": 9.961810262533178e-07, + "loss": 11.8349, + "step": 35126 + }, + { + "epoch": 1.9128055379714524, + "grad_norm": 0.6010079103320684, + "learning_rate": 9.94939808492834e-07, + "loss": 11.8098, + "step": 35127 + }, + { + "epoch": 1.9128599919680354, + "grad_norm": 0.5693917939451096, + "learning_rate": 9.93699360612299e-07, + "loss": 11.6701, + "step": 35128 + }, + { + "epoch": 1.9129144459646186, + "grad_norm": 0.4904476688547059, + "learning_rate": 9.924596826213716e-07, + "loss": 11.6774, + "step": 35129 + }, + { + "epoch": 1.9129688999612016, + "grad_norm": 0.4995587318185474, + "learning_rate": 9.912207745296665e-07, + "loss": 11.7444, + "step": 35130 + }, + { + "epoch": 1.9130233539577846, + "grad_norm": 0.5644042562087479, + "learning_rate": 9.899826363468311e-07, + "loss": 11.774, + "step": 35131 + }, + { + "epoch": 1.9130778079543675, + "grad_norm": 0.6478245834824435, + "learning_rate": 9.887452680825138e-07, + "loss": 11.7276, + "step": 35132 + }, + { + "epoch": 1.9131322619509505, + "grad_norm": 0.5025718304468798, + "learning_rate": 9.875086697463066e-07, + "loss": 11.7485, + "step": 35133 + }, + { + "epoch": 1.9131867159475335, + "grad_norm": 0.5132824629887921, + "learning_rate": 9.86272841347835e-07, + "loss": 11.5666, + "step": 35134 + }, + { + "epoch": 1.9132411699441167, + "grad_norm": 0.5160447393687133, + "learning_rate": 9.85037782896714e-07, + "loss": 11.8409, + "step": 35135 + }, + { + "epoch": 1.9132956239406997, + "grad_norm": 0.5831760641182391, + "learning_rate": 9.838034944025354e-07, + "loss": 11.7491, + "step": 35136 + }, + { + "epoch": 1.9133500779372827, + "grad_norm": 0.5000670887401855, + "learning_rate": 9.82569975874914e-07, + "loss": 11.6315, + "step": 35137 + }, + { + "epoch": 1.9134045319338657, + "grad_norm": 0.5364252803555681, + "learning_rate": 9.813372273234311e-07, + "loss": 11.6952, + "step": 35138 + }, + { + "epoch": 1.9134589859304487, + "grad_norm": 0.5859474620258575, + "learning_rate": 9.80105248757679e-07, + "loss": 11.773, + "step": 35139 + }, + { + "epoch": 1.9135134399270317, + "grad_norm": 0.5183197641860716, + "learning_rate": 9.788740401872277e-07, + "loss": 11.8488, + "step": 35140 + }, + { + "epoch": 1.9135678939236147, + "grad_norm": 0.5311511630162272, + "learning_rate": 9.776436016216473e-07, + "loss": 11.885, + "step": 35141 + }, + { + "epoch": 1.9136223479201977, + "grad_norm": 0.5721275190221004, + "learning_rate": 9.764139330705412e-07, + "loss": 11.8198, + "step": 35142 + }, + { + "epoch": 1.9136768019167807, + "grad_norm": 0.5439242256473281, + "learning_rate": 9.751850345434355e-07, + "loss": 11.7999, + "step": 35143 + }, + { + "epoch": 1.9137312559133637, + "grad_norm": 0.5025895639682192, + "learning_rate": 9.739569060498998e-07, + "loss": 11.6802, + "step": 35144 + }, + { + "epoch": 1.9137857099099467, + "grad_norm": 0.522579173012891, + "learning_rate": 9.727295475994714e-07, + "loss": 11.7436, + "step": 35145 + }, + { + "epoch": 1.9138401639065297, + "grad_norm": 0.5540909247206703, + "learning_rate": 9.715029592017088e-07, + "loss": 11.8201, + "step": 35146 + }, + { + "epoch": 1.9138946179031127, + "grad_norm": 0.5299964103986897, + "learning_rate": 9.702771408661604e-07, + "loss": 11.8015, + "step": 35147 + }, + { + "epoch": 1.9139490718996957, + "grad_norm": 0.5460718503237835, + "learning_rate": 9.690520926023294e-07, + "loss": 11.7618, + "step": 35148 + }, + { + "epoch": 1.9140035258962786, + "grad_norm": 0.5642893204770933, + "learning_rate": 9.678278144197639e-07, + "loss": 11.7825, + "step": 35149 + }, + { + "epoch": 1.9140579798928616, + "grad_norm": 0.5199466557557606, + "learning_rate": 9.666043063279673e-07, + "loss": 11.8994, + "step": 35150 + }, + { + "epoch": 1.9141124338894446, + "grad_norm": 0.5713658464418989, + "learning_rate": 9.653815683364764e-07, + "loss": 11.8829, + "step": 35151 + }, + { + "epoch": 1.9141668878860276, + "grad_norm": 0.5318327808591264, + "learning_rate": 9.641596004547726e-07, + "loss": 11.7626, + "step": 35152 + }, + { + "epoch": 1.9142213418826108, + "grad_norm": 0.5282506846256024, + "learning_rate": 9.629384026923816e-07, + "loss": 11.8018, + "step": 35153 + }, + { + "epoch": 1.9142757958791938, + "grad_norm": 0.5720525814173348, + "learning_rate": 9.617179750587957e-07, + "loss": 11.8022, + "step": 35154 + }, + { + "epoch": 1.9143302498757768, + "grad_norm": 0.5258208866729774, + "learning_rate": 9.604983175634852e-07, + "loss": 11.9336, + "step": 35155 + }, + { + "epoch": 1.9143847038723598, + "grad_norm": 0.5159547586650071, + "learning_rate": 9.592794302159646e-07, + "loss": 11.7323, + "step": 35156 + }, + { + "epoch": 1.9144391578689428, + "grad_norm": 0.5610796327787909, + "learning_rate": 9.580613130256822e-07, + "loss": 11.8646, + "step": 35157 + }, + { + "epoch": 1.9144936118655258, + "grad_norm": 0.5767096618913562, + "learning_rate": 9.5684396600213e-07, + "loss": 11.8894, + "step": 35158 + }, + { + "epoch": 1.914548065862109, + "grad_norm": 0.5294956672415153, + "learning_rate": 9.556273891547673e-07, + "loss": 11.7973, + "step": 35159 + }, + { + "epoch": 1.914602519858692, + "grad_norm": 0.5391260841721528, + "learning_rate": 9.544115824930532e-07, + "loss": 11.7829, + "step": 35160 + }, + { + "epoch": 1.914656973855275, + "grad_norm": 0.5405415191523248, + "learning_rate": 9.53196546026458e-07, + "loss": 11.7448, + "step": 35161 + }, + { + "epoch": 1.914711427851858, + "grad_norm": 0.5841377470590796, + "learning_rate": 9.519822797643962e-07, + "loss": 11.8143, + "step": 35162 + }, + { + "epoch": 1.914765881848441, + "grad_norm": 0.6184059434122927, + "learning_rate": 9.507687837163492e-07, + "loss": 11.729, + "step": 35163 + }, + { + "epoch": 1.914820335845024, + "grad_norm": 0.5255925439390693, + "learning_rate": 9.495560578917318e-07, + "loss": 11.7779, + "step": 35164 + }, + { + "epoch": 1.914874789841607, + "grad_norm": 0.5253838804623853, + "learning_rate": 9.483441022999695e-07, + "loss": 11.7927, + "step": 35165 + }, + { + "epoch": 1.91492924383819, + "grad_norm": 0.5980539158184621, + "learning_rate": 9.471329169504995e-07, + "loss": 11.9218, + "step": 35166 + }, + { + "epoch": 1.914983697834773, + "grad_norm": 0.5617716604024158, + "learning_rate": 9.459225018527251e-07, + "loss": 11.8383, + "step": 35167 + }, + { + "epoch": 1.915038151831356, + "grad_norm": 0.5623123857783747, + "learning_rate": 9.447128570160723e-07, + "loss": 11.6922, + "step": 35168 + }, + { + "epoch": 1.915092605827939, + "grad_norm": 0.5506840989740919, + "learning_rate": 9.435039824499559e-07, + "loss": 11.8841, + "step": 35169 + }, + { + "epoch": 1.915147059824522, + "grad_norm": 0.53043677168556, + "learning_rate": 9.422958781637569e-07, + "loss": 11.6814, + "step": 35170 + }, + { + "epoch": 1.915201513821105, + "grad_norm": 0.5097510373721862, + "learning_rate": 9.410885441668793e-07, + "loss": 11.7102, + "step": 35171 + }, + { + "epoch": 1.915255967817688, + "grad_norm": 0.5199440180916941, + "learning_rate": 9.398819804687043e-07, + "loss": 11.7086, + "step": 35172 + }, + { + "epoch": 1.915310421814271, + "grad_norm": 0.5762046373965938, + "learning_rate": 9.386761870786243e-07, + "loss": 11.9591, + "step": 35173 + }, + { + "epoch": 1.915364875810854, + "grad_norm": 0.5586414190330928, + "learning_rate": 9.374711640060096e-07, + "loss": 11.8247, + "step": 35174 + }, + { + "epoch": 1.915419329807437, + "grad_norm": 0.5381081233376709, + "learning_rate": 9.362669112602307e-07, + "loss": 11.708, + "step": 35175 + }, + { + "epoch": 1.9154737838040201, + "grad_norm": 0.559612607472234, + "learning_rate": 9.350634288506466e-07, + "loss": 11.7455, + "step": 35176 + }, + { + "epoch": 1.915528237800603, + "grad_norm": 0.5597247638643995, + "learning_rate": 9.338607167866276e-07, + "loss": 11.7732, + "step": 35177 + }, + { + "epoch": 1.915582691797186, + "grad_norm": 0.5832729433958163, + "learning_rate": 9.326587750775329e-07, + "loss": 11.8304, + "step": 35178 + }, + { + "epoch": 1.915637145793769, + "grad_norm": 0.5401258334400405, + "learning_rate": 9.314576037326772e-07, + "loss": 11.7651, + "step": 35179 + }, + { + "epoch": 1.915691599790352, + "grad_norm": 0.546976026951646, + "learning_rate": 9.302572027614309e-07, + "loss": 11.8512, + "step": 35180 + }, + { + "epoch": 1.915746053786935, + "grad_norm": 0.5205921963085846, + "learning_rate": 9.290575721731198e-07, + "loss": 11.7567, + "step": 35181 + }, + { + "epoch": 1.9158005077835183, + "grad_norm": 0.5490305793584135, + "learning_rate": 9.278587119770698e-07, + "loss": 11.7593, + "step": 35182 + }, + { + "epoch": 1.9158549617801013, + "grad_norm": 0.5386055788617824, + "learning_rate": 9.266606221826069e-07, + "loss": 11.6951, + "step": 35183 + }, + { + "epoch": 1.9159094157766843, + "grad_norm": 0.4888484154079049, + "learning_rate": 9.254633027990456e-07, + "loss": 11.7217, + "step": 35184 + }, + { + "epoch": 1.9159638697732673, + "grad_norm": 0.5561620307709734, + "learning_rate": 9.242667538356898e-07, + "loss": 11.8107, + "step": 35185 + }, + { + "epoch": 1.9160183237698503, + "grad_norm": 0.533857419028927, + "learning_rate": 9.230709753018651e-07, + "loss": 11.804, + "step": 35186 + }, + { + "epoch": 1.9160727777664333, + "grad_norm": 0.5740980065069025, + "learning_rate": 9.218759672068422e-07, + "loss": 11.7448, + "step": 35187 + }, + { + "epoch": 1.9161272317630162, + "grad_norm": 0.5348810160177284, + "learning_rate": 9.206817295599246e-07, + "loss": 11.7366, + "step": 35188 + }, + { + "epoch": 1.9161816857595992, + "grad_norm": 0.5639451320436223, + "learning_rate": 9.194882623704049e-07, + "loss": 11.7164, + "step": 35189 + }, + { + "epoch": 1.9162361397561822, + "grad_norm": 0.49415545276392187, + "learning_rate": 9.182955656475645e-07, + "loss": 11.529, + "step": 35190 + }, + { + "epoch": 1.9162905937527652, + "grad_norm": 0.557829386350709, + "learning_rate": 9.171036394006737e-07, + "loss": 11.7855, + "step": 35191 + }, + { + "epoch": 1.9163450477493482, + "grad_norm": 0.5947507546188459, + "learning_rate": 9.159124836390032e-07, + "loss": 11.849, + "step": 35192 + }, + { + "epoch": 1.9163995017459312, + "grad_norm": 0.5565434666075539, + "learning_rate": 9.147220983718119e-07, + "loss": 11.7487, + "step": 35193 + }, + { + "epoch": 1.9164539557425142, + "grad_norm": 0.5687280597467993, + "learning_rate": 9.135324836083592e-07, + "loss": 11.7756, + "step": 35194 + }, + { + "epoch": 1.9165084097390972, + "grad_norm": 0.5369250923409661, + "learning_rate": 9.123436393578822e-07, + "loss": 11.7732, + "step": 35195 + }, + { + "epoch": 1.9165628637356802, + "grad_norm": 0.6092539975696989, + "learning_rate": 9.111555656296511e-07, + "loss": 11.692, + "step": 35196 + }, + { + "epoch": 1.9166173177322632, + "grad_norm": 0.5167452649292813, + "learning_rate": 9.09968262432892e-07, + "loss": 11.7265, + "step": 35197 + }, + { + "epoch": 1.9166717717288462, + "grad_norm": 0.5385597377763062, + "learning_rate": 9.087817297768308e-07, + "loss": 11.833, + "step": 35198 + }, + { + "epoch": 1.9167262257254294, + "grad_norm": 0.5416782185874432, + "learning_rate": 9.075959676707046e-07, + "loss": 11.6064, + "step": 35199 + }, + { + "epoch": 1.9167806797220124, + "grad_norm": 0.537139385306688, + "learning_rate": 9.064109761237282e-07, + "loss": 11.681, + "step": 35200 + }, + { + "epoch": 1.9168351337185954, + "grad_norm": 0.6357180096696138, + "learning_rate": 9.052267551451165e-07, + "loss": 11.808, + "step": 35201 + }, + { + "epoch": 1.9168895877151784, + "grad_norm": 0.6055184640562326, + "learning_rate": 9.040433047440844e-07, + "loss": 11.788, + "step": 35202 + }, + { + "epoch": 1.9169440417117614, + "grad_norm": 0.5045144401513638, + "learning_rate": 9.028606249298355e-07, + "loss": 11.7997, + "step": 35203 + }, + { + "epoch": 1.9169984957083444, + "grad_norm": 0.5459174889517361, + "learning_rate": 9.016787157115514e-07, + "loss": 11.8305, + "step": 35204 + }, + { + "epoch": 1.9170529497049276, + "grad_norm": 0.4910436462510843, + "learning_rate": 9.004975770984358e-07, + "loss": 11.7572, + "step": 35205 + }, + { + "epoch": 1.9171074037015106, + "grad_norm": 0.5478522802477646, + "learning_rate": 8.993172090996593e-07, + "loss": 11.8115, + "step": 35206 + }, + { + "epoch": 1.9171618576980936, + "grad_norm": 0.5559894155851322, + "learning_rate": 8.981376117244255e-07, + "loss": 11.8449, + "step": 35207 + }, + { + "epoch": 1.9172163116946765, + "grad_norm": 0.538612690716442, + "learning_rate": 8.969587849818828e-07, + "loss": 11.7906, + "step": 35208 + }, + { + "epoch": 1.9172707656912595, + "grad_norm": 0.5084388156015692, + "learning_rate": 8.957807288812126e-07, + "loss": 11.7268, + "step": 35209 + }, + { + "epoch": 1.9173252196878425, + "grad_norm": 0.4832681966615627, + "learning_rate": 8.946034434315742e-07, + "loss": 11.7395, + "step": 35210 + }, + { + "epoch": 1.9173796736844255, + "grad_norm": 0.5738225621440586, + "learning_rate": 8.934269286421159e-07, + "loss": 11.8553, + "step": 35211 + }, + { + "epoch": 1.9174341276810085, + "grad_norm": 0.48711473108013803, + "learning_rate": 8.922511845219971e-07, + "loss": 11.7085, + "step": 35212 + }, + { + "epoch": 1.9174885816775915, + "grad_norm": 0.5202528831488631, + "learning_rate": 8.910762110803439e-07, + "loss": 11.8102, + "step": 35213 + }, + { + "epoch": 1.9175430356741745, + "grad_norm": 0.519394116891388, + "learning_rate": 8.899020083263043e-07, + "loss": 11.7792, + "step": 35214 + }, + { + "epoch": 1.9175974896707575, + "grad_norm": 0.519271590028062, + "learning_rate": 8.887285762690156e-07, + "loss": 11.8252, + "step": 35215 + }, + { + "epoch": 1.9176519436673405, + "grad_norm": 0.5455615839571457, + "learning_rate": 8.875559149175816e-07, + "loss": 11.8203, + "step": 35216 + }, + { + "epoch": 1.9177063976639235, + "grad_norm": 0.5390491771381011, + "learning_rate": 8.863840242811394e-07, + "loss": 11.744, + "step": 35217 + }, + { + "epoch": 1.9177608516605065, + "grad_norm": 0.5617895119488668, + "learning_rate": 8.85212904368804e-07, + "loss": 11.7927, + "step": 35218 + }, + { + "epoch": 1.9178153056570895, + "grad_norm": 0.49923998977061224, + "learning_rate": 8.84042555189657e-07, + "loss": 11.7869, + "step": 35219 + }, + { + "epoch": 1.9178697596536725, + "grad_norm": 0.5392256851222319, + "learning_rate": 8.828729767528354e-07, + "loss": 11.7381, + "step": 35220 + }, + { + "epoch": 1.9179242136502554, + "grad_norm": 0.5580498866571791, + "learning_rate": 8.817041690673989e-07, + "loss": 11.7798, + "step": 35221 + }, + { + "epoch": 1.9179786676468384, + "grad_norm": 0.5553461907649563, + "learning_rate": 8.805361321424732e-07, + "loss": 11.8, + "step": 35222 + }, + { + "epoch": 1.9180331216434217, + "grad_norm": 0.538107620107467, + "learning_rate": 8.793688659871069e-07, + "loss": 11.6728, + "step": 35223 + }, + { + "epoch": 1.9180875756400046, + "grad_norm": 0.5358888643911297, + "learning_rate": 8.782023706103925e-07, + "loss": 11.8671, + "step": 35224 + }, + { + "epoch": 1.9181420296365876, + "grad_norm": 0.5473243382040065, + "learning_rate": 8.770366460214008e-07, + "loss": 11.7563, + "step": 35225 + }, + { + "epoch": 1.9181964836331706, + "grad_norm": 0.5619020179221044, + "learning_rate": 8.75871692229191e-07, + "loss": 11.8151, + "step": 35226 + }, + { + "epoch": 1.9182509376297536, + "grad_norm": 0.646298529253663, + "learning_rate": 8.747075092428336e-07, + "loss": 11.7864, + "step": 35227 + }, + { + "epoch": 1.9183053916263366, + "grad_norm": 0.5448105127013941, + "learning_rate": 8.73544097071366e-07, + "loss": 11.6923, + "step": 35228 + }, + { + "epoch": 1.9183598456229198, + "grad_norm": 0.5097718353891445, + "learning_rate": 8.723814557238474e-07, + "loss": 11.7472, + "step": 35229 + }, + { + "epoch": 1.9184142996195028, + "grad_norm": 0.6254578642322416, + "learning_rate": 8.712195852093152e-07, + "loss": 11.7263, + "step": 35230 + }, + { + "epoch": 1.9184687536160858, + "grad_norm": 0.5030137023454851, + "learning_rate": 8.700584855367955e-07, + "loss": 11.7914, + "step": 35231 + }, + { + "epoch": 1.9185232076126688, + "grad_norm": 0.524172881676483, + "learning_rate": 8.688981567153365e-07, + "loss": 11.7223, + "step": 35232 + }, + { + "epoch": 1.9185776616092518, + "grad_norm": 0.5398621754495531, + "learning_rate": 8.677385987539532e-07, + "loss": 11.9072, + "step": 35233 + }, + { + "epoch": 1.9186321156058348, + "grad_norm": 0.5669199821899846, + "learning_rate": 8.665798116616496e-07, + "loss": 11.6823, + "step": 35234 + }, + { + "epoch": 1.9186865696024178, + "grad_norm": 0.558354801015339, + "learning_rate": 8.654217954474408e-07, + "loss": 11.7699, + "step": 35235 + }, + { + "epoch": 1.9187410235990008, + "grad_norm": 0.5154310645832462, + "learning_rate": 8.642645501203416e-07, + "loss": 11.7, + "step": 35236 + }, + { + "epoch": 1.9187954775955838, + "grad_norm": 0.5253994038750188, + "learning_rate": 8.631080756893562e-07, + "loss": 11.734, + "step": 35237 + }, + { + "epoch": 1.9188499315921668, + "grad_norm": 0.582087935449611, + "learning_rate": 8.619523721634548e-07, + "loss": 11.8067, + "step": 35238 + }, + { + "epoch": 1.9189043855887498, + "grad_norm": 0.5274111672244881, + "learning_rate": 8.607974395516416e-07, + "loss": 11.7308, + "step": 35239 + }, + { + "epoch": 1.9189588395853328, + "grad_norm": 0.5722438827303705, + "learning_rate": 8.596432778628983e-07, + "loss": 11.6561, + "step": 35240 + }, + { + "epoch": 1.9190132935819157, + "grad_norm": 0.5250881599929415, + "learning_rate": 8.584898871061841e-07, + "loss": 11.7011, + "step": 35241 + }, + { + "epoch": 1.9190677475784987, + "grad_norm": 0.6673704896322545, + "learning_rate": 8.573372672904923e-07, + "loss": 11.8617, + "step": 35242 + }, + { + "epoch": 1.9191222015750817, + "grad_norm": 0.5550575162151639, + "learning_rate": 8.561854184247597e-07, + "loss": 11.6047, + "step": 35243 + }, + { + "epoch": 1.9191766555716647, + "grad_norm": 0.609772576805787, + "learning_rate": 8.550343405179573e-07, + "loss": 11.811, + "step": 35244 + }, + { + "epoch": 1.9192311095682477, + "grad_norm": 0.49943831048443377, + "learning_rate": 8.538840335790443e-07, + "loss": 11.7641, + "step": 35245 + }, + { + "epoch": 1.919285563564831, + "grad_norm": 0.5877683080457907, + "learning_rate": 8.527344976169471e-07, + "loss": 11.7762, + "step": 35246 + }, + { + "epoch": 1.919340017561414, + "grad_norm": 0.6439500745850081, + "learning_rate": 8.51585732640614e-07, + "loss": 11.7894, + "step": 35247 + }, + { + "epoch": 1.919394471557997, + "grad_norm": 0.537615234870368, + "learning_rate": 8.504377386589824e-07, + "loss": 11.7894, + "step": 35248 + }, + { + "epoch": 1.91944892555458, + "grad_norm": 0.5793091396641953, + "learning_rate": 8.492905156809671e-07, + "loss": 11.8368, + "step": 35249 + }, + { + "epoch": 1.919503379551163, + "grad_norm": 0.5705253878050031, + "learning_rate": 8.481440637154947e-07, + "loss": 11.8582, + "step": 35250 + }, + { + "epoch": 1.919557833547746, + "grad_norm": 0.4914867197880977, + "learning_rate": 8.469983827714911e-07, + "loss": 11.6931, + "step": 35251 + }, + { + "epoch": 1.919612287544329, + "grad_norm": 0.529209964279529, + "learning_rate": 8.458534728578494e-07, + "loss": 11.6899, + "step": 35252 + }, + { + "epoch": 1.919666741540912, + "grad_norm": 0.5437293401220309, + "learning_rate": 8.447093339834844e-07, + "loss": 11.7877, + "step": 35253 + }, + { + "epoch": 1.919721195537495, + "grad_norm": 0.5633118117687254, + "learning_rate": 8.435659661572892e-07, + "loss": 11.7408, + "step": 35254 + }, + { + "epoch": 1.919775649534078, + "grad_norm": 0.5199596714588441, + "learning_rate": 8.424233693881456e-07, + "loss": 11.7181, + "step": 35255 + }, + { + "epoch": 1.919830103530661, + "grad_norm": 0.4993925874144359, + "learning_rate": 8.412815436849575e-07, + "loss": 11.7928, + "step": 35256 + }, + { + "epoch": 1.919884557527244, + "grad_norm": 0.4955370450074922, + "learning_rate": 8.401404890565845e-07, + "loss": 11.5063, + "step": 35257 + }, + { + "epoch": 1.919939011523827, + "grad_norm": 0.5295129961762205, + "learning_rate": 8.390002055119084e-07, + "loss": 11.7691, + "step": 35258 + }, + { + "epoch": 1.91999346552041, + "grad_norm": 0.5341947484634186, + "learning_rate": 8.378606930597999e-07, + "loss": 11.9021, + "step": 35259 + }, + { + "epoch": 1.920047919516993, + "grad_norm": 0.49459724132531996, + "learning_rate": 8.367219517091074e-07, + "loss": 11.6327, + "step": 35260 + }, + { + "epoch": 1.920102373513576, + "grad_norm": 0.552824060456351, + "learning_rate": 8.355839814687017e-07, + "loss": 11.7846, + "step": 35261 + }, + { + "epoch": 1.920156827510159, + "grad_norm": 0.5945931968247791, + "learning_rate": 8.344467823474311e-07, + "loss": 11.7119, + "step": 35262 + }, + { + "epoch": 1.920211281506742, + "grad_norm": 0.5165917185399999, + "learning_rate": 8.33310354354122e-07, + "loss": 11.7467, + "step": 35263 + }, + { + "epoch": 1.920265735503325, + "grad_norm": 0.5398463526646301, + "learning_rate": 8.321746974976341e-07, + "loss": 11.689, + "step": 35264 + }, + { + "epoch": 1.920320189499908, + "grad_norm": 0.5122382129269146, + "learning_rate": 8.310398117867713e-07, + "loss": 11.686, + "step": 35265 + }, + { + "epoch": 1.920374643496491, + "grad_norm": 0.4963068238640571, + "learning_rate": 8.299056972303821e-07, + "loss": 11.769, + "step": 35266 + }, + { + "epoch": 1.920429097493074, + "grad_norm": 0.5939624631643763, + "learning_rate": 8.287723538372705e-07, + "loss": 11.841, + "step": 35267 + }, + { + "epoch": 1.920483551489657, + "grad_norm": 0.5601564948005896, + "learning_rate": 8.276397816162629e-07, + "loss": 11.6909, + "step": 35268 + }, + { + "epoch": 1.9205380054862402, + "grad_norm": 0.5345917124766969, + "learning_rate": 8.265079805761522e-07, + "loss": 11.6399, + "step": 35269 + }, + { + "epoch": 1.9205924594828232, + "grad_norm": 0.5148940207216658, + "learning_rate": 8.253769507257536e-07, + "loss": 11.6773, + "step": 35270 + }, + { + "epoch": 1.9206469134794062, + "grad_norm": 0.5404967793073, + "learning_rate": 8.242466920738601e-07, + "loss": 11.6103, + "step": 35271 + }, + { + "epoch": 1.9207013674759892, + "grad_norm": 0.5186321777187234, + "learning_rate": 8.231172046292424e-07, + "loss": 11.875, + "step": 35272 + }, + { + "epoch": 1.9207558214725722, + "grad_norm": 0.5180517870130527, + "learning_rate": 8.219884884007045e-07, + "loss": 11.7603, + "step": 35273 + }, + { + "epoch": 1.9208102754691552, + "grad_norm": 0.504696692025867, + "learning_rate": 8.208605433970173e-07, + "loss": 11.7911, + "step": 35274 + }, + { + "epoch": 1.9208647294657384, + "grad_norm": 0.5492580926459601, + "learning_rate": 8.197333696269515e-07, + "loss": 11.827, + "step": 35275 + }, + { + "epoch": 1.9209191834623214, + "grad_norm": 0.5867281708173184, + "learning_rate": 8.186069670992779e-07, + "loss": 11.7523, + "step": 35276 + }, + { + "epoch": 1.9209736374589044, + "grad_norm": 0.6153437883647792, + "learning_rate": 8.17481335822734e-07, + "loss": 11.7761, + "step": 35277 + }, + { + "epoch": 1.9210280914554874, + "grad_norm": 0.5026668114833792, + "learning_rate": 8.163564758060905e-07, + "loss": 11.6629, + "step": 35278 + }, + { + "epoch": 1.9210825454520704, + "grad_norm": 0.5320818665546363, + "learning_rate": 8.152323870581069e-07, + "loss": 11.7831, + "step": 35279 + }, + { + "epoch": 1.9211369994486533, + "grad_norm": 0.5681963567239882, + "learning_rate": 8.141090695874876e-07, + "loss": 11.8499, + "step": 35280 + }, + { + "epoch": 1.9211914534452363, + "grad_norm": 0.4949459466197695, + "learning_rate": 8.129865234030032e-07, + "loss": 11.7889, + "step": 35281 + }, + { + "epoch": 1.9212459074418193, + "grad_norm": 0.5349621663826487, + "learning_rate": 8.11864748513369e-07, + "loss": 11.7013, + "step": 35282 + }, + { + "epoch": 1.9213003614384023, + "grad_norm": 0.5774407758647135, + "learning_rate": 8.107437449273114e-07, + "loss": 11.8837, + "step": 35283 + }, + { + "epoch": 1.9213548154349853, + "grad_norm": 0.5460741780068611, + "learning_rate": 8.096235126535456e-07, + "loss": 11.8375, + "step": 35284 + }, + { + "epoch": 1.9214092694315683, + "grad_norm": 0.5667162986954193, + "learning_rate": 8.085040517007758e-07, + "loss": 11.7073, + "step": 35285 + }, + { + "epoch": 1.9214637234281513, + "grad_norm": 0.5757809426311653, + "learning_rate": 8.073853620777061e-07, + "loss": 11.8311, + "step": 35286 + }, + { + "epoch": 1.9215181774247343, + "grad_norm": 0.5300675981497448, + "learning_rate": 8.062674437930517e-07, + "loss": 11.6415, + "step": 35287 + }, + { + "epoch": 1.9215726314213173, + "grad_norm": 0.5504102236991174, + "learning_rate": 8.051502968554947e-07, + "loss": 11.8556, + "step": 35288 + }, + { + "epoch": 1.9216270854179003, + "grad_norm": 0.5219588919888506, + "learning_rate": 8.04033921273728e-07, + "loss": 11.7289, + "step": 35289 + }, + { + "epoch": 1.9216815394144833, + "grad_norm": 0.532605736327605, + "learning_rate": 8.029183170564225e-07, + "loss": 11.9089, + "step": 35290 + }, + { + "epoch": 1.9217359934110663, + "grad_norm": 0.5545374656691011, + "learning_rate": 8.018034842122601e-07, + "loss": 11.6906, + "step": 35291 + }, + { + "epoch": 1.9217904474076493, + "grad_norm": 0.5785721568173094, + "learning_rate": 8.006894227499118e-07, + "loss": 11.8068, + "step": 35292 + }, + { + "epoch": 1.9218449014042325, + "grad_norm": 0.5855414702112186, + "learning_rate": 7.995761326780371e-07, + "loss": 11.8148, + "step": 35293 + }, + { + "epoch": 1.9218993554008155, + "grad_norm": 0.5362638828443235, + "learning_rate": 7.984636140052959e-07, + "loss": 11.7047, + "step": 35294 + }, + { + "epoch": 1.9219538093973985, + "grad_norm": 0.5477454407750199, + "learning_rate": 7.973518667403368e-07, + "loss": 11.7107, + "step": 35295 + }, + { + "epoch": 1.9220082633939815, + "grad_norm": 0.4952478844872741, + "learning_rate": 7.962408908918085e-07, + "loss": 11.7421, + "step": 35296 + }, + { + "epoch": 1.9220627173905644, + "grad_norm": 0.5232201606893285, + "learning_rate": 7.951306864683372e-07, + "loss": 11.7441, + "step": 35297 + }, + { + "epoch": 1.9221171713871477, + "grad_norm": 0.5422497469933022, + "learning_rate": 7.940212534785718e-07, + "loss": 11.8066, + "step": 35298 + }, + { + "epoch": 1.9221716253837307, + "grad_norm": 0.5773271614851948, + "learning_rate": 7.929125919311387e-07, + "loss": 11.8527, + "step": 35299 + }, + { + "epoch": 1.9222260793803136, + "grad_norm": 0.5069161014283836, + "learning_rate": 7.918047018346419e-07, + "loss": 11.6907, + "step": 35300 + }, + { + "epoch": 1.9222805333768966, + "grad_norm": 0.5265502382157767, + "learning_rate": 7.906975831977303e-07, + "loss": 11.8117, + "step": 35301 + }, + { + "epoch": 1.9223349873734796, + "grad_norm": 0.5979082352160378, + "learning_rate": 7.895912360289748e-07, + "loss": 11.7516, + "step": 35302 + }, + { + "epoch": 1.9223894413700626, + "grad_norm": 0.4978611012246709, + "learning_rate": 7.884856603370017e-07, + "loss": 11.6019, + "step": 35303 + }, + { + "epoch": 1.9224438953666456, + "grad_norm": 0.5553903313803183, + "learning_rate": 7.873808561304041e-07, + "loss": 11.8605, + "step": 35304 + }, + { + "epoch": 1.9224983493632286, + "grad_norm": 0.486835429970468, + "learning_rate": 7.862768234177643e-07, + "loss": 11.6685, + "step": 35305 + }, + { + "epoch": 1.9225528033598116, + "grad_norm": 0.5372476764856517, + "learning_rate": 7.85173562207675e-07, + "loss": 11.7563, + "step": 35306 + }, + { + "epoch": 1.9226072573563946, + "grad_norm": 0.5329644864882591, + "learning_rate": 7.840710725087186e-07, + "loss": 11.6573, + "step": 35307 + }, + { + "epoch": 1.9226617113529776, + "grad_norm": 0.7622171642642938, + "learning_rate": 7.829693543294658e-07, + "loss": 11.8296, + "step": 35308 + }, + { + "epoch": 1.9227161653495606, + "grad_norm": 0.6431431136042206, + "learning_rate": 7.818684076784766e-07, + "loss": 11.9103, + "step": 35309 + }, + { + "epoch": 1.9227706193461436, + "grad_norm": 0.5997994841951583, + "learning_rate": 7.807682325643218e-07, + "loss": 11.7868, + "step": 35310 + }, + { + "epoch": 1.9228250733427266, + "grad_norm": 0.5578947978803397, + "learning_rate": 7.796688289955389e-07, + "loss": 11.8253, + "step": 35311 + }, + { + "epoch": 1.9228795273393096, + "grad_norm": 0.5286349018608756, + "learning_rate": 7.785701969806991e-07, + "loss": 11.6868, + "step": 35312 + }, + { + "epoch": 1.9229339813358926, + "grad_norm": 0.583396661750172, + "learning_rate": 7.77472336528351e-07, + "loss": 11.861, + "step": 35313 + }, + { + "epoch": 1.9229884353324755, + "grad_norm": 0.5394180114020597, + "learning_rate": 7.763752476469988e-07, + "loss": 11.696, + "step": 35314 + }, + { + "epoch": 1.9230428893290585, + "grad_norm": 0.6335493071578289, + "learning_rate": 7.752789303452024e-07, + "loss": 11.6758, + "step": 35315 + }, + { + "epoch": 1.9230973433256418, + "grad_norm": 0.5197597917635172, + "learning_rate": 7.741833846314773e-07, + "loss": 11.8524, + "step": 35316 + }, + { + "epoch": 1.9231517973222247, + "grad_norm": 0.5378542335438624, + "learning_rate": 7.730886105143387e-07, + "loss": 11.6057, + "step": 35317 + }, + { + "epoch": 1.9232062513188077, + "grad_norm": 0.5634133311170223, + "learning_rate": 7.719946080023021e-07, + "loss": 11.8391, + "step": 35318 + }, + { + "epoch": 1.9232607053153907, + "grad_norm": 0.5735120756466036, + "learning_rate": 7.70901377103872e-07, + "loss": 11.8891, + "step": 35319 + }, + { + "epoch": 1.9233151593119737, + "grad_norm": 0.5057175099043302, + "learning_rate": 7.698089178275635e-07, + "loss": 11.8658, + "step": 35320 + }, + { + "epoch": 1.9233696133085567, + "grad_norm": 0.5543803388535611, + "learning_rate": 7.68717230181859e-07, + "loss": 11.8473, + "step": 35321 + }, + { + "epoch": 1.92342406730514, + "grad_norm": 0.5872870913362307, + "learning_rate": 7.676263141752404e-07, + "loss": 11.817, + "step": 35322 + }, + { + "epoch": 1.923478521301723, + "grad_norm": 0.51143281922094, + "learning_rate": 7.665361698162121e-07, + "loss": 11.7323, + "step": 35323 + }, + { + "epoch": 1.923532975298306, + "grad_norm": 0.5044437056489144, + "learning_rate": 7.654467971132451e-07, + "loss": 11.7122, + "step": 35324 + }, + { + "epoch": 1.923587429294889, + "grad_norm": 0.5450926954154512, + "learning_rate": 7.643581960747992e-07, + "loss": 11.6908, + "step": 35325 + }, + { + "epoch": 1.923641883291472, + "grad_norm": 0.5756954108170521, + "learning_rate": 7.632703667093344e-07, + "loss": 11.848, + "step": 35326 + }, + { + "epoch": 1.923696337288055, + "grad_norm": 0.4863770442798545, + "learning_rate": 7.621833090253327e-07, + "loss": 11.6358, + "step": 35327 + }, + { + "epoch": 1.9237507912846379, + "grad_norm": 0.508350176005348, + "learning_rate": 7.61097023031232e-07, + "loss": 11.8138, + "step": 35328 + }, + { + "epoch": 1.9238052452812209, + "grad_norm": 0.5992019878117516, + "learning_rate": 7.60011508735492e-07, + "loss": 11.6911, + "step": 35329 + }, + { + "epoch": 1.9238596992778039, + "grad_norm": 0.5505014228180067, + "learning_rate": 7.589267661465393e-07, + "loss": 11.8331, + "step": 35330 + }, + { + "epoch": 1.9239141532743869, + "grad_norm": 0.5914248529883434, + "learning_rate": 7.578427952728117e-07, + "loss": 11.6986, + "step": 35331 + }, + { + "epoch": 1.9239686072709699, + "grad_norm": 0.5849041648920296, + "learning_rate": 7.567595961227469e-07, + "loss": 11.8056, + "step": 35332 + }, + { + "epoch": 1.9240230612675528, + "grad_norm": 0.5272009266257041, + "learning_rate": 7.556771687047492e-07, + "loss": 11.5029, + "step": 35333 + }, + { + "epoch": 1.9240775152641358, + "grad_norm": 0.5604447882902809, + "learning_rate": 7.545955130272675e-07, + "loss": 11.8475, + "step": 35334 + }, + { + "epoch": 1.9241319692607188, + "grad_norm": 0.5504769635070138, + "learning_rate": 7.535146290986839e-07, + "loss": 11.8832, + "step": 35335 + }, + { + "epoch": 1.9241864232573018, + "grad_norm": 0.5029963049925109, + "learning_rate": 7.524345169274139e-07, + "loss": 11.7606, + "step": 35336 + }, + { + "epoch": 1.9242408772538848, + "grad_norm": 0.5312068726749273, + "learning_rate": 7.513551765218618e-07, + "loss": 11.8833, + "step": 35337 + }, + { + "epoch": 1.9242953312504678, + "grad_norm": 0.5125757415662441, + "learning_rate": 7.5027660789041e-07, + "loss": 11.7914, + "step": 35338 + }, + { + "epoch": 1.924349785247051, + "grad_norm": 0.5649880153604923, + "learning_rate": 7.491988110414517e-07, + "loss": 11.6746, + "step": 35339 + }, + { + "epoch": 1.924404239243634, + "grad_norm": 0.5012386348472996, + "learning_rate": 7.4812178598338e-07, + "loss": 11.795, + "step": 35340 + }, + { + "epoch": 1.924458693240217, + "grad_norm": 0.5576508571761037, + "learning_rate": 7.47045532724544e-07, + "loss": 11.6963, + "step": 35341 + }, + { + "epoch": 1.9245131472368, + "grad_norm": 0.5388868489754646, + "learning_rate": 7.459700512733369e-07, + "loss": 11.8101, + "step": 35342 + }, + { + "epoch": 1.924567601233383, + "grad_norm": 0.5524929396746211, + "learning_rate": 7.448953416380966e-07, + "loss": 11.755, + "step": 35343 + }, + { + "epoch": 1.924622055229966, + "grad_norm": 0.5281468662307155, + "learning_rate": 7.438214038272162e-07, + "loss": 11.6712, + "step": 35344 + }, + { + "epoch": 1.9246765092265492, + "grad_norm": 0.5019786033804621, + "learning_rate": 7.427482378490114e-07, + "loss": 11.7936, + "step": 35345 + }, + { + "epoch": 1.9247309632231322, + "grad_norm": 0.4985147264600398, + "learning_rate": 7.416758437118421e-07, + "loss": 11.7349, + "step": 35346 + }, + { + "epoch": 1.9247854172197152, + "grad_norm": 0.6123380396788846, + "learning_rate": 7.406042214240572e-07, + "loss": 11.7973, + "step": 35347 + }, + { + "epoch": 1.9248398712162982, + "grad_norm": 0.5479284479493136, + "learning_rate": 7.395333709939722e-07, + "loss": 11.7835, + "step": 35348 + }, + { + "epoch": 1.9248943252128812, + "grad_norm": 0.611817874657679, + "learning_rate": 7.384632924299139e-07, + "loss": 11.841, + "step": 35349 + }, + { + "epoch": 1.9249487792094642, + "grad_norm": 0.5113991007685988, + "learning_rate": 7.373939857402201e-07, + "loss": 11.7846, + "step": 35350 + }, + { + "epoch": 1.9250032332060472, + "grad_norm": 0.852600629121004, + "learning_rate": 7.363254509331952e-07, + "loss": 11.6187, + "step": 35351 + }, + { + "epoch": 1.9250576872026302, + "grad_norm": 0.5040558355011653, + "learning_rate": 7.352576880171547e-07, + "loss": 11.821, + "step": 35352 + }, + { + "epoch": 1.9251121411992131, + "grad_norm": 0.5355425617423119, + "learning_rate": 7.34190697000392e-07, + "loss": 11.7523, + "step": 35353 + }, + { + "epoch": 1.9251665951957961, + "grad_norm": 0.49322277301141726, + "learning_rate": 7.331244778912116e-07, + "loss": 11.8026, + "step": 35354 + }, + { + "epoch": 1.9252210491923791, + "grad_norm": 0.5022982131103606, + "learning_rate": 7.32059030697907e-07, + "loss": 11.843, + "step": 35355 + }, + { + "epoch": 1.9252755031889621, + "grad_norm": 0.5167383581109334, + "learning_rate": 7.30994355428749e-07, + "loss": 11.7494, + "step": 35356 + }, + { + "epoch": 1.9253299571855451, + "grad_norm": 0.532425071896843, + "learning_rate": 7.299304520920313e-07, + "loss": 11.8206, + "step": 35357 + }, + { + "epoch": 1.925384411182128, + "grad_norm": 0.5259448588169953, + "learning_rate": 7.288673206960139e-07, + "loss": 11.7795, + "step": 35358 + }, + { + "epoch": 1.925438865178711, + "grad_norm": 0.5248042446606963, + "learning_rate": 7.278049612489901e-07, + "loss": 11.8046, + "step": 35359 + }, + { + "epoch": 1.925493319175294, + "grad_norm": 0.5425324117292638, + "learning_rate": 7.267433737591867e-07, + "loss": 11.8307, + "step": 35360 + }, + { + "epoch": 1.925547773171877, + "grad_norm": 0.5062761099637456, + "learning_rate": 7.256825582348748e-07, + "loss": 11.7454, + "step": 35361 + }, + { + "epoch": 1.92560222716846, + "grad_norm": 0.5352392168668737, + "learning_rate": 7.246225146843144e-07, + "loss": 11.7295, + "step": 35362 + }, + { + "epoch": 1.9256566811650433, + "grad_norm": 0.5184501313269085, + "learning_rate": 7.235632431157325e-07, + "loss": 11.7991, + "step": 35363 + }, + { + "epoch": 1.9257111351616263, + "grad_norm": 0.5216924404581125, + "learning_rate": 7.225047435373666e-07, + "loss": 11.8121, + "step": 35364 + }, + { + "epoch": 1.9257655891582093, + "grad_norm": 0.5641134648379601, + "learning_rate": 7.21447015957466e-07, + "loss": 11.8982, + "step": 35365 + }, + { + "epoch": 1.9258200431547923, + "grad_norm": 0.5949317025428467, + "learning_rate": 7.20390060384224e-07, + "loss": 11.7294, + "step": 35366 + }, + { + "epoch": 1.9258744971513753, + "grad_norm": 0.5402392675001839, + "learning_rate": 7.193338768259005e-07, + "loss": 11.7455, + "step": 35367 + }, + { + "epoch": 1.9259289511479585, + "grad_norm": 0.6061302741210017, + "learning_rate": 7.182784652906671e-07, + "loss": 11.7955, + "step": 35368 + }, + { + "epoch": 1.9259834051445415, + "grad_norm": 0.5470773797960535, + "learning_rate": 7.172238257867614e-07, + "loss": 11.7651, + "step": 35369 + }, + { + "epoch": 1.9260378591411245, + "grad_norm": 0.5421452448098459, + "learning_rate": 7.161699583223658e-07, + "loss": 11.7071, + "step": 35370 + }, + { + "epoch": 1.9260923131377075, + "grad_norm": 0.5261602671605133, + "learning_rate": 7.151168629056737e-07, + "loss": 11.8025, + "step": 35371 + }, + { + "epoch": 1.9261467671342904, + "grad_norm": 0.5599441292931336, + "learning_rate": 7.140645395449008e-07, + "loss": 11.7575, + "step": 35372 + }, + { + "epoch": 1.9262012211308734, + "grad_norm": 0.5600867048068385, + "learning_rate": 7.130129882482073e-07, + "loss": 11.6645, + "step": 35373 + }, + { + "epoch": 1.9262556751274564, + "grad_norm": 0.5715260293873371, + "learning_rate": 7.119622090237754e-07, + "loss": 11.8297, + "step": 35374 + }, + { + "epoch": 1.9263101291240394, + "grad_norm": 0.5468786873557178, + "learning_rate": 7.109122018797764e-07, + "loss": 11.8405, + "step": 35375 + }, + { + "epoch": 1.9263645831206224, + "grad_norm": 0.5285094818170534, + "learning_rate": 7.098629668243706e-07, + "loss": 11.7535, + "step": 35376 + }, + { + "epoch": 1.9264190371172054, + "grad_norm": 0.5607105169284818, + "learning_rate": 7.088145038657179e-07, + "loss": 11.6754, + "step": 35377 + }, + { + "epoch": 1.9264734911137884, + "grad_norm": 0.49940159785988447, + "learning_rate": 7.077668130119786e-07, + "loss": 11.7179, + "step": 35378 + }, + { + "epoch": 1.9265279451103714, + "grad_norm": 0.4907213726895333, + "learning_rate": 7.067198942713016e-07, + "loss": 11.7589, + "step": 35379 + }, + { + "epoch": 1.9265823991069544, + "grad_norm": 0.5659091368713514, + "learning_rate": 7.056737476518138e-07, + "loss": 11.9466, + "step": 35380 + }, + { + "epoch": 1.9266368531035374, + "grad_norm": 0.5643104997612685, + "learning_rate": 7.046283731616643e-07, + "loss": 11.901, + "step": 35381 + }, + { + "epoch": 1.9266913071001204, + "grad_norm": 0.5137125062711562, + "learning_rate": 7.035837708089799e-07, + "loss": 11.8287, + "step": 35382 + }, + { + "epoch": 1.9267457610967034, + "grad_norm": 0.5954182365678513, + "learning_rate": 7.025399406018652e-07, + "loss": 11.7802, + "step": 35383 + }, + { + "epoch": 1.9268002150932864, + "grad_norm": 0.5230969445519821, + "learning_rate": 7.014968825484691e-07, + "loss": 11.7085, + "step": 35384 + }, + { + "epoch": 1.9268546690898694, + "grad_norm": 0.5325847700152697, + "learning_rate": 7.004545966568854e-07, + "loss": 11.671, + "step": 35385 + }, + { + "epoch": 1.9269091230864526, + "grad_norm": 0.5498119132753048, + "learning_rate": 6.994130829352074e-07, + "loss": 11.873, + "step": 35386 + }, + { + "epoch": 1.9269635770830356, + "grad_norm": 0.5351749950341316, + "learning_rate": 6.98372341391551e-07, + "loss": 11.7139, + "step": 35387 + }, + { + "epoch": 1.9270180310796186, + "grad_norm": 0.5443566007650343, + "learning_rate": 6.973323720339986e-07, + "loss": 11.8042, + "step": 35388 + }, + { + "epoch": 1.9270724850762015, + "grad_norm": 0.5390561097638134, + "learning_rate": 6.962931748706547e-07, + "loss": 11.7709, + "step": 35389 + }, + { + "epoch": 1.9271269390727845, + "grad_norm": 0.565712985744551, + "learning_rate": 6.952547499095797e-07, + "loss": 11.8785, + "step": 35390 + }, + { + "epoch": 1.9271813930693675, + "grad_norm": 0.5805432028399905, + "learning_rate": 6.942170971588558e-07, + "loss": 11.8645, + "step": 35391 + }, + { + "epoch": 1.9272358470659507, + "grad_norm": 0.625499406559887, + "learning_rate": 6.931802166265544e-07, + "loss": 11.7966, + "step": 35392 + }, + { + "epoch": 1.9272903010625337, + "grad_norm": 0.517132198749326, + "learning_rate": 6.92144108320747e-07, + "loss": 11.7365, + "step": 35393 + }, + { + "epoch": 1.9273447550591167, + "grad_norm": 0.5861462659480324, + "learning_rate": 6.911087722494824e-07, + "loss": 11.8478, + "step": 35394 + }, + { + "epoch": 1.9273992090556997, + "grad_norm": 0.5619082696620521, + "learning_rate": 6.90074208420799e-07, + "loss": 11.7398, + "step": 35395 + }, + { + "epoch": 1.9274536630522827, + "grad_norm": 0.5148110059500736, + "learning_rate": 6.890404168427567e-07, + "loss": 11.7284, + "step": 35396 + }, + { + "epoch": 1.9275081170488657, + "grad_norm": 0.5645150423670221, + "learning_rate": 6.880073975233825e-07, + "loss": 11.9115, + "step": 35397 + }, + { + "epoch": 1.9275625710454487, + "grad_norm": 0.6195674293480592, + "learning_rate": 6.869751504707255e-07, + "loss": 11.7153, + "step": 35398 + }, + { + "epoch": 1.9276170250420317, + "grad_norm": 0.5095060571853789, + "learning_rate": 6.859436756928017e-07, + "loss": 11.7286, + "step": 35399 + }, + { + "epoch": 1.9276714790386147, + "grad_norm": 0.5364047275544772, + "learning_rate": 6.849129731976378e-07, + "loss": 11.7814, + "step": 35400 + }, + { + "epoch": 1.9277259330351977, + "grad_norm": 0.5484462847732213, + "learning_rate": 6.838830429932386e-07, + "loss": 11.6552, + "step": 35401 + }, + { + "epoch": 1.9277803870317807, + "grad_norm": 0.5992665760189813, + "learning_rate": 6.828538850876309e-07, + "loss": 11.8318, + "step": 35402 + }, + { + "epoch": 1.9278348410283637, + "grad_norm": 0.5201795017438869, + "learning_rate": 6.818254994887973e-07, + "loss": 11.8069, + "step": 35403 + }, + { + "epoch": 1.9278892950249467, + "grad_norm": 0.5738206542609812, + "learning_rate": 6.807978862047537e-07, + "loss": 11.8386, + "step": 35404 + }, + { + "epoch": 1.9279437490215297, + "grad_norm": 0.535493682482242, + "learning_rate": 6.797710452434714e-07, + "loss": 11.6852, + "step": 35405 + }, + { + "epoch": 1.9279982030181126, + "grad_norm": 0.5544308372407974, + "learning_rate": 6.78744976612955e-07, + "loss": 11.8048, + "step": 35406 + }, + { + "epoch": 1.9280526570146956, + "grad_norm": 0.6455312421116323, + "learning_rate": 6.777196803211649e-07, + "loss": 11.8757, + "step": 35407 + }, + { + "epoch": 1.9281071110112786, + "grad_norm": 0.49445511996845926, + "learning_rate": 6.766951563760948e-07, + "loss": 11.6895, + "step": 35408 + }, + { + "epoch": 1.9281615650078618, + "grad_norm": 0.5431249779727285, + "learning_rate": 6.756714047856938e-07, + "loss": 11.7307, + "step": 35409 + }, + { + "epoch": 1.9282160190044448, + "grad_norm": 0.5803088746264368, + "learning_rate": 6.746484255579222e-07, + "loss": 11.7811, + "step": 35410 + }, + { + "epoch": 1.9282704730010278, + "grad_norm": 0.514407079938101, + "learning_rate": 6.736262187007513e-07, + "loss": 11.5352, + "step": 35411 + }, + { + "epoch": 1.9283249269976108, + "grad_norm": 0.5249767434297249, + "learning_rate": 6.726047842221084e-07, + "loss": 11.8586, + "step": 35412 + }, + { + "epoch": 1.9283793809941938, + "grad_norm": 0.5934939878917341, + "learning_rate": 6.715841221299534e-07, + "loss": 11.6992, + "step": 35413 + }, + { + "epoch": 1.9284338349907768, + "grad_norm": 0.5314712080866695, + "learning_rate": 6.705642324322248e-07, + "loss": 11.7601, + "step": 35414 + }, + { + "epoch": 1.92848828898736, + "grad_norm": 0.5375466080781637, + "learning_rate": 6.695451151368381e-07, + "loss": 11.5534, + "step": 35415 + }, + { + "epoch": 1.928542742983943, + "grad_norm": 0.5091149042748288, + "learning_rate": 6.685267702517317e-07, + "loss": 11.7133, + "step": 35416 + }, + { + "epoch": 1.928597196980526, + "grad_norm": 0.5299152011881831, + "learning_rate": 6.675091977848102e-07, + "loss": 11.7254, + "step": 35417 + }, + { + "epoch": 1.928651650977109, + "grad_norm": 0.5082856489996755, + "learning_rate": 6.664923977440119e-07, + "loss": 11.7531, + "step": 35418 + }, + { + "epoch": 1.928706104973692, + "grad_norm": 0.5583785568851579, + "learning_rate": 6.65476370137208e-07, + "loss": 11.7428, + "step": 35419 + }, + { + "epoch": 1.928760558970275, + "grad_norm": 0.5235554945693908, + "learning_rate": 6.644611149723257e-07, + "loss": 11.6274, + "step": 35420 + }, + { + "epoch": 1.928815012966858, + "grad_norm": 0.5536106244421286, + "learning_rate": 6.634466322572586e-07, + "loss": 11.756, + "step": 35421 + }, + { + "epoch": 1.928869466963441, + "grad_norm": 0.598404157875678, + "learning_rate": 6.624329219998781e-07, + "loss": 11.7432, + "step": 35422 + }, + { + "epoch": 1.928923920960024, + "grad_norm": 0.5319733815319337, + "learning_rate": 6.614199842080893e-07, + "loss": 11.7326, + "step": 35423 + }, + { + "epoch": 1.928978374956607, + "grad_norm": 0.5583179663698711, + "learning_rate": 6.604078188897523e-07, + "loss": 11.8577, + "step": 35424 + }, + { + "epoch": 1.92903282895319, + "grad_norm": 0.6470860211071525, + "learning_rate": 6.593964260527385e-07, + "loss": 11.6878, + "step": 35425 + }, + { + "epoch": 1.929087282949773, + "grad_norm": 0.5941266616046202, + "learning_rate": 6.583858057049308e-07, + "loss": 11.9415, + "step": 35426 + }, + { + "epoch": 1.929141736946356, + "grad_norm": 0.5077404659532531, + "learning_rate": 6.57375957854156e-07, + "loss": 11.7247, + "step": 35427 + }, + { + "epoch": 1.929196190942939, + "grad_norm": 0.5371235103430241, + "learning_rate": 6.56366882508297e-07, + "loss": 11.697, + "step": 35428 + }, + { + "epoch": 1.929250644939522, + "grad_norm": 0.5335731010933777, + "learning_rate": 6.553585796751916e-07, + "loss": 11.8056, + "step": 35429 + }, + { + "epoch": 1.929305098936105, + "grad_norm": 0.5326915148899944, + "learning_rate": 6.543510493626781e-07, + "loss": 11.7094, + "step": 35430 + }, + { + "epoch": 1.929359552932688, + "grad_norm": 0.5458947629846782, + "learning_rate": 6.533442915785836e-07, + "loss": 11.866, + "step": 35431 + }, + { + "epoch": 1.9294140069292711, + "grad_norm": 0.7991401788652261, + "learning_rate": 6.523383063307465e-07, + "loss": 11.6802, + "step": 35432 + }, + { + "epoch": 1.9294684609258541, + "grad_norm": 0.55400984203346, + "learning_rate": 6.513330936269824e-07, + "loss": 11.7957, + "step": 35433 + }, + { + "epoch": 1.929522914922437, + "grad_norm": 0.5486984418403, + "learning_rate": 6.503286534751185e-07, + "loss": 11.8621, + "step": 35434 + }, + { + "epoch": 1.92957736891902, + "grad_norm": 0.5650074778730099, + "learning_rate": 6.493249858829597e-07, + "loss": 11.9049, + "step": 35435 + }, + { + "epoch": 1.929631822915603, + "grad_norm": 0.5348936191789725, + "learning_rate": 6.483220908583221e-07, + "loss": 11.7854, + "step": 35436 + }, + { + "epoch": 1.929686276912186, + "grad_norm": 0.5484853027327932, + "learning_rate": 6.47319968408977e-07, + "loss": 11.7551, + "step": 35437 + }, + { + "epoch": 1.9297407309087693, + "grad_norm": 0.596676607543935, + "learning_rate": 6.463186185427405e-07, + "loss": 11.9053, + "step": 35438 + }, + { + "epoch": 1.9297951849053523, + "grad_norm": 0.5244260156174037, + "learning_rate": 6.453180412673843e-07, + "loss": 11.8333, + "step": 35439 + }, + { + "epoch": 1.9298496389019353, + "grad_norm": 0.6116199511475648, + "learning_rate": 6.44318236590713e-07, + "loss": 11.8368, + "step": 35440 + }, + { + "epoch": 1.9299040928985183, + "grad_norm": 0.5096518970216495, + "learning_rate": 6.433192045204762e-07, + "loss": 11.8159, + "step": 35441 + }, + { + "epoch": 1.9299585468951013, + "grad_norm": 0.508541784617971, + "learning_rate": 6.423209450644452e-07, + "loss": 11.7323, + "step": 35442 + }, + { + "epoch": 1.9300130008916843, + "grad_norm": 0.5555457044378989, + "learning_rate": 6.413234582303918e-07, + "loss": 11.8258, + "step": 35443 + }, + { + "epoch": 1.9300674548882673, + "grad_norm": 0.5365538766671122, + "learning_rate": 6.403267440260763e-07, + "loss": 11.7692, + "step": 35444 + }, + { + "epoch": 1.9301219088848502, + "grad_norm": 0.5190823342156515, + "learning_rate": 6.39330802459237e-07, + "loss": 11.6126, + "step": 35445 + }, + { + "epoch": 1.9301763628814332, + "grad_norm": 0.5493325550371453, + "learning_rate": 6.383356335376234e-07, + "loss": 11.8537, + "step": 35446 + }, + { + "epoch": 1.9302308168780162, + "grad_norm": 0.568077630585568, + "learning_rate": 6.373412372689735e-07, + "loss": 11.887, + "step": 35447 + }, + { + "epoch": 1.9302852708745992, + "grad_norm": 0.5642403482181683, + "learning_rate": 6.363476136610369e-07, + "loss": 11.815, + "step": 35448 + }, + { + "epoch": 1.9303397248711822, + "grad_norm": 0.5813725648126096, + "learning_rate": 6.353547627215073e-07, + "loss": 11.8681, + "step": 35449 + }, + { + "epoch": 1.9303941788677652, + "grad_norm": 0.5534224653106757, + "learning_rate": 6.343626844581229e-07, + "loss": 11.7836, + "step": 35450 + }, + { + "epoch": 1.9304486328643482, + "grad_norm": 0.5909850161885183, + "learning_rate": 6.333713788785999e-07, + "loss": 11.7532, + "step": 35451 + }, + { + "epoch": 1.9305030868609312, + "grad_norm": 0.5617021735881333, + "learning_rate": 6.323808459906544e-07, + "loss": 11.8077, + "step": 35452 + }, + { + "epoch": 1.9305575408575142, + "grad_norm": 0.5088197444062463, + "learning_rate": 6.31391085801969e-07, + "loss": 11.7859, + "step": 35453 + }, + { + "epoch": 1.9306119948540972, + "grad_norm": 0.5667951383496671, + "learning_rate": 6.304020983202486e-07, + "loss": 11.7958, + "step": 35454 + }, + { + "epoch": 1.9306664488506802, + "grad_norm": 0.5631110617421782, + "learning_rate": 6.294138835531982e-07, + "loss": 11.7575, + "step": 35455 + }, + { + "epoch": 1.9307209028472634, + "grad_norm": 0.4771713502211208, + "learning_rate": 6.284264415084895e-07, + "loss": 11.6977, + "step": 35456 + }, + { + "epoch": 1.9307753568438464, + "grad_norm": 0.6243482940799704, + "learning_rate": 6.274397721937941e-07, + "loss": 11.8624, + "step": 35457 + }, + { + "epoch": 1.9308298108404294, + "grad_norm": 0.5242654353878011, + "learning_rate": 6.264538756167837e-07, + "loss": 11.8154, + "step": 35458 + }, + { + "epoch": 1.9308842648370124, + "grad_norm": 0.5354684200038378, + "learning_rate": 6.25468751785141e-07, + "loss": 11.8179, + "step": 35459 + }, + { + "epoch": 1.9309387188335954, + "grad_norm": 0.5434240838919755, + "learning_rate": 6.244844007065265e-07, + "loss": 11.8165, + "step": 35460 + }, + { + "epoch": 1.9309931728301784, + "grad_norm": 0.5358392178486855, + "learning_rate": 6.235008223885785e-07, + "loss": 11.8687, + "step": 35461 + }, + { + "epoch": 1.9310476268267616, + "grad_norm": 0.5295326054658354, + "learning_rate": 6.225180168389578e-07, + "loss": 11.6926, + "step": 35462 + }, + { + "epoch": 1.9311020808233446, + "grad_norm": 0.5109574104897429, + "learning_rate": 6.215359840652912e-07, + "loss": 11.5464, + "step": 35463 + }, + { + "epoch": 1.9311565348199276, + "grad_norm": 0.5126465925223631, + "learning_rate": 6.205547240752396e-07, + "loss": 11.9344, + "step": 35464 + }, + { + "epoch": 1.9312109888165105, + "grad_norm": 0.562043269587614, + "learning_rate": 6.195742368764191e-07, + "loss": 11.8007, + "step": 35465 + }, + { + "epoch": 1.9312654428130935, + "grad_norm": 0.516122612809263, + "learning_rate": 6.185945224764456e-07, + "loss": 11.8307, + "step": 35466 + }, + { + "epoch": 1.9313198968096765, + "grad_norm": 0.5150862949424221, + "learning_rate": 6.176155808829575e-07, + "loss": 11.7832, + "step": 35467 + }, + { + "epoch": 1.9313743508062595, + "grad_norm": 0.5312854379425083, + "learning_rate": 6.16637412103549e-07, + "loss": 11.5806, + "step": 35468 + }, + { + "epoch": 1.9314288048028425, + "grad_norm": 0.5217652543439314, + "learning_rate": 6.156600161458359e-07, + "loss": 11.6881, + "step": 35469 + }, + { + "epoch": 1.9314832587994255, + "grad_norm": 0.5782885527472418, + "learning_rate": 6.146833930174234e-07, + "loss": 11.8026, + "step": 35470 + }, + { + "epoch": 1.9315377127960085, + "grad_norm": 0.5043587481644304, + "learning_rate": 6.137075427258943e-07, + "loss": 11.7683, + "step": 35471 + }, + { + "epoch": 1.9315921667925915, + "grad_norm": 0.5348028428426579, + "learning_rate": 6.127324652788424e-07, + "loss": 11.6308, + "step": 35472 + }, + { + "epoch": 1.9316466207891745, + "grad_norm": 0.6041838035066207, + "learning_rate": 6.117581606838507e-07, + "loss": 11.7065, + "step": 35473 + }, + { + "epoch": 1.9317010747857575, + "grad_norm": 0.544692518559051, + "learning_rate": 6.10784628948502e-07, + "loss": 11.7577, + "step": 35474 + }, + { + "epoch": 1.9317555287823405, + "grad_norm": 0.5529879675505308, + "learning_rate": 6.098118700803568e-07, + "loss": 11.6863, + "step": 35475 + }, + { + "epoch": 1.9318099827789235, + "grad_norm": 0.5296035195851783, + "learning_rate": 6.088398840869758e-07, + "loss": 11.8179, + "step": 35476 + }, + { + "epoch": 1.9318644367755065, + "grad_norm": 0.7115354645674317, + "learning_rate": 6.078686709759307e-07, + "loss": 11.6767, + "step": 35477 + }, + { + "epoch": 1.9319188907720894, + "grad_norm": 0.5158612277346752, + "learning_rate": 6.068982307547599e-07, + "loss": 11.8101, + "step": 35478 + }, + { + "epoch": 1.9319733447686727, + "grad_norm": 0.550608074542295, + "learning_rate": 6.059285634310241e-07, + "loss": 11.7904, + "step": 35479 + }, + { + "epoch": 1.9320277987652557, + "grad_norm": 0.5920005190755275, + "learning_rate": 6.049596690122506e-07, + "loss": 11.7993, + "step": 35480 + }, + { + "epoch": 1.9320822527618386, + "grad_norm": 0.6001616860296559, + "learning_rate": 6.039915475059777e-07, + "loss": 11.7818, + "step": 35481 + }, + { + "epoch": 1.9321367067584216, + "grad_norm": 0.5419926409527603, + "learning_rate": 6.030241989197438e-07, + "loss": 11.8246, + "step": 35482 + }, + { + "epoch": 1.9321911607550046, + "grad_norm": 0.516740599127782, + "learning_rate": 6.020576232610542e-07, + "loss": 11.812, + "step": 35483 + }, + { + "epoch": 1.9322456147515876, + "grad_norm": 0.5638874430702876, + "learning_rate": 6.010918205374361e-07, + "loss": 11.7153, + "step": 35484 + }, + { + "epoch": 1.9323000687481708, + "grad_norm": 0.5943364813106105, + "learning_rate": 6.001267907564057e-07, + "loss": 11.6287, + "step": 35485 + }, + { + "epoch": 1.9323545227447538, + "grad_norm": 0.5856328089669531, + "learning_rate": 5.991625339254458e-07, + "loss": 11.7378, + "step": 35486 + }, + { + "epoch": 1.9324089767413368, + "grad_norm": 0.5527893443743865, + "learning_rate": 5.981990500520729e-07, + "loss": 11.8416, + "step": 35487 + }, + { + "epoch": 1.9324634307379198, + "grad_norm": 0.5339137887451216, + "learning_rate": 5.972363391437696e-07, + "loss": 11.8466, + "step": 35488 + }, + { + "epoch": 1.9325178847345028, + "grad_norm": 0.6167140816443636, + "learning_rate": 5.962744012080413e-07, + "loss": 11.7049, + "step": 35489 + }, + { + "epoch": 1.9325723387310858, + "grad_norm": 0.5891799387008039, + "learning_rate": 5.953132362523372e-07, + "loss": 11.6747, + "step": 35490 + }, + { + "epoch": 1.9326267927276688, + "grad_norm": 0.5361965009771246, + "learning_rate": 5.943528442841517e-07, + "loss": 11.7759, + "step": 35491 + }, + { + "epoch": 1.9326812467242518, + "grad_norm": 0.5822303092732038, + "learning_rate": 5.933932253109454e-07, + "loss": 11.8003, + "step": 35492 + }, + { + "epoch": 1.9327357007208348, + "grad_norm": 0.5725928998089853, + "learning_rate": 5.924343793401898e-07, + "loss": 11.9443, + "step": 35493 + }, + { + "epoch": 1.9327901547174178, + "grad_norm": 0.5570129172893064, + "learning_rate": 5.914763063793349e-07, + "loss": 11.7837, + "step": 35494 + }, + { + "epoch": 1.9328446087140008, + "grad_norm": 0.49833430172502596, + "learning_rate": 5.905190064358301e-07, + "loss": 11.7331, + "step": 35495 + }, + { + "epoch": 1.9328990627105838, + "grad_norm": 0.48798094462713315, + "learning_rate": 5.89562479517125e-07, + "loss": 11.7916, + "step": 35496 + }, + { + "epoch": 1.9329535167071668, + "grad_norm": 0.5102213066152852, + "learning_rate": 5.886067256306472e-07, + "loss": 11.8337, + "step": 35497 + }, + { + "epoch": 1.9330079707037497, + "grad_norm": 0.5262706629664062, + "learning_rate": 5.876517447838347e-07, + "loss": 11.7168, + "step": 35498 + }, + { + "epoch": 1.9330624247003327, + "grad_norm": 0.5274663062535434, + "learning_rate": 5.866975369841155e-07, + "loss": 11.7886, + "step": 35499 + }, + { + "epoch": 1.9331168786969157, + "grad_norm": 0.5142626794514161, + "learning_rate": 5.857441022389054e-07, + "loss": 11.7051, + "step": 35500 + }, + { + "epoch": 1.9331713326934987, + "grad_norm": 0.5462579250009353, + "learning_rate": 5.84791440555621e-07, + "loss": 11.8119, + "step": 35501 + }, + { + "epoch": 1.933225786690082, + "grad_norm": 0.5515835747959236, + "learning_rate": 5.838395519416784e-07, + "loss": 11.8155, + "step": 35502 + }, + { + "epoch": 1.933280240686665, + "grad_norm": 0.5028460130939895, + "learning_rate": 5.828884364044607e-07, + "loss": 11.8183, + "step": 35503 + }, + { + "epoch": 1.933334694683248, + "grad_norm": 0.5402563121058426, + "learning_rate": 5.819380939513841e-07, + "loss": 11.8034, + "step": 35504 + }, + { + "epoch": 1.933389148679831, + "grad_norm": 0.504151490597803, + "learning_rate": 5.809885245898206e-07, + "loss": 11.7673, + "step": 35505 + }, + { + "epoch": 1.933443602676414, + "grad_norm": 0.5367612822078753, + "learning_rate": 5.800397283271752e-07, + "loss": 11.8184, + "step": 35506 + }, + { + "epoch": 1.933498056672997, + "grad_norm": 0.6087798429679145, + "learning_rate": 5.790917051707978e-07, + "loss": 11.7996, + "step": 35507 + }, + { + "epoch": 1.9335525106695801, + "grad_norm": 0.5058173259283866, + "learning_rate": 5.781444551280823e-07, + "loss": 11.7636, + "step": 35508 + }, + { + "epoch": 1.933606964666163, + "grad_norm": 0.5104911424120063, + "learning_rate": 5.771979782063897e-07, + "loss": 11.7917, + "step": 35509 + }, + { + "epoch": 1.933661418662746, + "grad_norm": 0.5269449614424476, + "learning_rate": 5.762522744130805e-07, + "loss": 11.8466, + "step": 35510 + }, + { + "epoch": 1.933715872659329, + "grad_norm": 0.6238794507286992, + "learning_rate": 5.753073437555046e-07, + "loss": 11.8674, + "step": 35511 + }, + { + "epoch": 1.933770326655912, + "grad_norm": 0.5058354461907706, + "learning_rate": 5.743631862410115e-07, + "loss": 11.7186, + "step": 35512 + }, + { + "epoch": 1.933824780652495, + "grad_norm": 0.6166115157126538, + "learning_rate": 5.73419801876951e-07, + "loss": 11.8859, + "step": 35513 + }, + { + "epoch": 1.933879234649078, + "grad_norm": 0.5258356103363334, + "learning_rate": 5.724771906706505e-07, + "loss": 11.7401, + "step": 35514 + }, + { + "epoch": 1.933933688645661, + "grad_norm": 0.5825308617439735, + "learning_rate": 5.715353526294376e-07, + "loss": 11.8494, + "step": 35515 + }, + { + "epoch": 1.933988142642244, + "grad_norm": 0.5293163193035932, + "learning_rate": 5.705942877606396e-07, + "loss": 11.7866, + "step": 35516 + }, + { + "epoch": 1.934042596638827, + "grad_norm": 0.5160769179071342, + "learning_rate": 5.69653996071573e-07, + "loss": 11.5809, + "step": 35517 + }, + { + "epoch": 1.93409705063541, + "grad_norm": 0.49040082299889726, + "learning_rate": 5.687144775695541e-07, + "loss": 11.7248, + "step": 35518 + }, + { + "epoch": 1.934151504631993, + "grad_norm": 0.5418900609016448, + "learning_rate": 5.677757322618881e-07, + "loss": 11.6884, + "step": 35519 + }, + { + "epoch": 1.934205958628576, + "grad_norm": 0.4995072799842643, + "learning_rate": 5.668377601558694e-07, + "loss": 11.7995, + "step": 35520 + }, + { + "epoch": 1.934260412625159, + "grad_norm": 0.5595781752915366, + "learning_rate": 5.659005612587919e-07, + "loss": 11.9312, + "step": 35521 + }, + { + "epoch": 1.934314866621742, + "grad_norm": 0.5210870773252599, + "learning_rate": 5.649641355779501e-07, + "loss": 11.6057, + "step": 35522 + }, + { + "epoch": 1.934369320618325, + "grad_norm": 0.5302416990401119, + "learning_rate": 5.640284831206266e-07, + "loss": 11.8015, + "step": 35523 + }, + { + "epoch": 1.934423774614908, + "grad_norm": 0.5602177390948307, + "learning_rate": 5.630936038940826e-07, + "loss": 11.7848, + "step": 35524 + }, + { + "epoch": 1.934478228611491, + "grad_norm": 0.674124727988565, + "learning_rate": 5.621594979056122e-07, + "loss": 11.8302, + "step": 35525 + }, + { + "epoch": 1.9345326826080742, + "grad_norm": 0.525792940730806, + "learning_rate": 5.612261651624651e-07, + "loss": 11.7443, + "step": 35526 + }, + { + "epoch": 1.9345871366046572, + "grad_norm": 0.542087456639, + "learning_rate": 5.60293605671891e-07, + "loss": 11.7502, + "step": 35527 + }, + { + "epoch": 1.9346415906012402, + "grad_norm": 0.6032764142708296, + "learning_rate": 5.593618194411509e-07, + "loss": 11.8292, + "step": 35528 + }, + { + "epoch": 1.9346960445978232, + "grad_norm": 0.5052367945536791, + "learning_rate": 5.584308064774834e-07, + "loss": 11.6832, + "step": 35529 + }, + { + "epoch": 1.9347504985944062, + "grad_norm": 0.5790057680640368, + "learning_rate": 5.575005667881383e-07, + "loss": 11.8919, + "step": 35530 + }, + { + "epoch": 1.9348049525909894, + "grad_norm": 0.5269509280830994, + "learning_rate": 5.565711003803542e-07, + "loss": 11.7955, + "step": 35531 + }, + { + "epoch": 1.9348594065875724, + "grad_norm": 0.5238391545030523, + "learning_rate": 5.556424072613365e-07, + "loss": 11.7945, + "step": 35532 + }, + { + "epoch": 1.9349138605841554, + "grad_norm": 0.5270434170481811, + "learning_rate": 5.547144874383348e-07, + "loss": 11.8329, + "step": 35533 + }, + { + "epoch": 1.9349683145807384, + "grad_norm": 0.5729170709511119, + "learning_rate": 5.537873409185434e-07, + "loss": 11.6325, + "step": 35534 + }, + { + "epoch": 1.9350227685773214, + "grad_norm": 0.6404820091796056, + "learning_rate": 5.52860967709179e-07, + "loss": 11.8465, + "step": 35535 + }, + { + "epoch": 1.9350772225739044, + "grad_norm": 0.572087389665054, + "learning_rate": 5.519353678174465e-07, + "loss": 11.5826, + "step": 35536 + }, + { + "epoch": 1.9351316765704873, + "grad_norm": 0.5094313491671534, + "learning_rate": 5.510105412505406e-07, + "loss": 11.7929, + "step": 35537 + }, + { + "epoch": 1.9351861305670703, + "grad_norm": 0.5397780977723677, + "learning_rate": 5.500864880156553e-07, + "loss": 11.6658, + "step": 35538 + }, + { + "epoch": 1.9352405845636533, + "grad_norm": 0.5748822295368378, + "learning_rate": 5.491632081199626e-07, + "loss": 11.6088, + "step": 35539 + }, + { + "epoch": 1.9352950385602363, + "grad_norm": 0.6642626763132676, + "learning_rate": 5.482407015706681e-07, + "loss": 11.9099, + "step": 35540 + }, + { + "epoch": 1.9353494925568193, + "grad_norm": 0.49085089642363156, + "learning_rate": 5.473189683749325e-07, + "loss": 11.7104, + "step": 35541 + }, + { + "epoch": 1.9354039465534023, + "grad_norm": 0.5444184984950573, + "learning_rate": 5.463980085399167e-07, + "loss": 11.6211, + "step": 35542 + }, + { + "epoch": 1.9354584005499853, + "grad_norm": 0.561893596251195, + "learning_rate": 5.45477822072793e-07, + "loss": 11.7222, + "step": 35543 + }, + { + "epoch": 1.9355128545465683, + "grad_norm": 0.6352864392320118, + "learning_rate": 5.44558408980711e-07, + "loss": 11.7787, + "step": 35544 + }, + { + "epoch": 1.9355673085431513, + "grad_norm": 0.6272580156377827, + "learning_rate": 5.436397692708206e-07, + "loss": 11.8069, + "step": 35545 + }, + { + "epoch": 1.9356217625397343, + "grad_norm": 0.6382483803998975, + "learning_rate": 5.427219029502717e-07, + "loss": 11.8302, + "step": 35546 + }, + { + "epoch": 1.9356762165363173, + "grad_norm": 0.5368684556254641, + "learning_rate": 5.418048100261808e-07, + "loss": 11.8036, + "step": 35547 + }, + { + "epoch": 1.9357306705329003, + "grad_norm": 0.5487146134663398, + "learning_rate": 5.408884905057088e-07, + "loss": 11.8738, + "step": 35548 + }, + { + "epoch": 1.9357851245294835, + "grad_norm": 0.554506082062712, + "learning_rate": 5.399729443959611e-07, + "loss": 11.8091, + "step": 35549 + }, + { + "epoch": 1.9358395785260665, + "grad_norm": 0.5123244370176998, + "learning_rate": 5.390581717040766e-07, + "loss": 11.8091, + "step": 35550 + }, + { + "epoch": 1.9358940325226495, + "grad_norm": 0.5316891974945974, + "learning_rate": 5.381441724371384e-07, + "loss": 11.7525, + "step": 35551 + }, + { + "epoch": 1.9359484865192325, + "grad_norm": 0.5424051800823211, + "learning_rate": 5.37230946602274e-07, + "loss": 11.7651, + "step": 35552 + }, + { + "epoch": 1.9360029405158155, + "grad_norm": 0.5074854984584203, + "learning_rate": 5.363184942065891e-07, + "loss": 11.785, + "step": 35553 + }, + { + "epoch": 1.9360573945123984, + "grad_norm": 0.5125622814230949, + "learning_rate": 5.354068152571668e-07, + "loss": 11.773, + "step": 35554 + }, + { + "epoch": 1.9361118485089817, + "grad_norm": 0.5326360895208188, + "learning_rate": 5.344959097611014e-07, + "loss": 11.7576, + "step": 35555 + }, + { + "epoch": 1.9361663025055647, + "grad_norm": 0.5538480999750912, + "learning_rate": 5.33585777725476e-07, + "loss": 11.7114, + "step": 35556 + }, + { + "epoch": 1.9362207565021476, + "grad_norm": 0.5686752853133247, + "learning_rate": 5.326764191573741e-07, + "loss": 11.7576, + "step": 35557 + }, + { + "epoch": 1.9362752104987306, + "grad_norm": 0.5361564625880706, + "learning_rate": 5.317678340638566e-07, + "loss": 11.7045, + "step": 35558 + }, + { + "epoch": 1.9363296644953136, + "grad_norm": 0.5772105290727838, + "learning_rate": 5.308600224519844e-07, + "loss": 11.7416, + "step": 35559 + }, + { + "epoch": 1.9363841184918966, + "grad_norm": 0.554722701537153, + "learning_rate": 5.299529843288409e-07, + "loss": 11.7756, + "step": 35560 + }, + { + "epoch": 1.9364385724884796, + "grad_norm": 0.5684014070116883, + "learning_rate": 5.290467197014537e-07, + "loss": 11.8636, + "step": 35561 + }, + { + "epoch": 1.9364930264850626, + "grad_norm": 0.594703750715069, + "learning_rate": 5.281412285768839e-07, + "loss": 11.6712, + "step": 35562 + }, + { + "epoch": 1.9365474804816456, + "grad_norm": 0.5671834112957976, + "learning_rate": 5.272365109621702e-07, + "loss": 11.7228, + "step": 35563 + }, + { + "epoch": 1.9366019344782286, + "grad_norm": 0.533732680674147, + "learning_rate": 5.263325668643404e-07, + "loss": 11.7501, + "step": 35564 + }, + { + "epoch": 1.9366563884748116, + "grad_norm": 0.5474627287160215, + "learning_rate": 5.254293962904444e-07, + "loss": 11.8011, + "step": 35565 + }, + { + "epoch": 1.9367108424713946, + "grad_norm": 0.537744675436712, + "learning_rate": 5.245269992474766e-07, + "loss": 11.7322, + "step": 35566 + }, + { + "epoch": 1.9367652964679776, + "grad_norm": 0.6856842289761352, + "learning_rate": 5.236253757424758e-07, + "loss": 11.8531, + "step": 35567 + }, + { + "epoch": 1.9368197504645606, + "grad_norm": 0.528508075490259, + "learning_rate": 5.227245257824475e-07, + "loss": 11.8795, + "step": 35568 + }, + { + "epoch": 1.9368742044611436, + "grad_norm": 0.5458446312322265, + "learning_rate": 5.218244493743862e-07, + "loss": 11.7738, + "step": 35569 + }, + { + "epoch": 1.9369286584577265, + "grad_norm": 0.5817752403466625, + "learning_rate": 5.209251465253196e-07, + "loss": 11.8707, + "step": 35570 + }, + { + "epoch": 1.9369831124543095, + "grad_norm": 0.5757848563191925, + "learning_rate": 5.200266172422085e-07, + "loss": 11.8887, + "step": 35571 + }, + { + "epoch": 1.9370375664508928, + "grad_norm": 0.5366360202542961, + "learning_rate": 5.191288615320478e-07, + "loss": 11.7073, + "step": 35572 + }, + { + "epoch": 1.9370920204474757, + "grad_norm": 0.5009043441019304, + "learning_rate": 5.182318794018315e-07, + "loss": 11.6977, + "step": 35573 + }, + { + "epoch": 1.9371464744440587, + "grad_norm": 0.5453093343904455, + "learning_rate": 5.173356708585208e-07, + "loss": 11.8582, + "step": 35574 + }, + { + "epoch": 1.9372009284406417, + "grad_norm": 0.5530277793269945, + "learning_rate": 5.164402359090992e-07, + "loss": 11.8872, + "step": 35575 + }, + { + "epoch": 1.9372553824372247, + "grad_norm": 0.5171475565767923, + "learning_rate": 5.155455745605276e-07, + "loss": 11.764, + "step": 35576 + }, + { + "epoch": 1.9373098364338077, + "grad_norm": 0.5567248051119154, + "learning_rate": 5.146516868197448e-07, + "loss": 11.7506, + "step": 35577 + }, + { + "epoch": 1.937364290430391, + "grad_norm": 0.5453075674968821, + "learning_rate": 5.137585726937233e-07, + "loss": 11.7504, + "step": 35578 + }, + { + "epoch": 1.937418744426974, + "grad_norm": 0.5437802702069786, + "learning_rate": 5.128662321893906e-07, + "loss": 11.7658, + "step": 35579 + }, + { + "epoch": 1.937473198423557, + "grad_norm": 0.5910353216614329, + "learning_rate": 5.11974665313697e-07, + "loss": 11.8461, + "step": 35580 + }, + { + "epoch": 1.93752765242014, + "grad_norm": 0.5412227303444469, + "learning_rate": 5.1108387207357e-07, + "loss": 11.7498, + "step": 35581 + }, + { + "epoch": 1.937582106416723, + "grad_norm": 0.5293252489419342, + "learning_rate": 5.101938524759486e-07, + "loss": 11.7091, + "step": 35582 + }, + { + "epoch": 1.937636560413306, + "grad_norm": 0.5686496400691846, + "learning_rate": 5.093046065277385e-07, + "loss": 11.8361, + "step": 35583 + }, + { + "epoch": 1.937691014409889, + "grad_norm": 0.6288173121877694, + "learning_rate": 5.084161342358562e-07, + "loss": 11.7634, + "step": 35584 + }, + { + "epoch": 1.9377454684064719, + "grad_norm": 0.5355093804957634, + "learning_rate": 5.075284356072185e-07, + "loss": 11.798, + "step": 35585 + }, + { + "epoch": 1.9377999224030549, + "grad_norm": 0.5980710019276938, + "learning_rate": 5.06641510648731e-07, + "loss": 11.7094, + "step": 35586 + }, + { + "epoch": 1.9378543763996379, + "grad_norm": 0.5638489993134836, + "learning_rate": 5.05755359367277e-07, + "loss": 11.6973, + "step": 35587 + }, + { + "epoch": 1.9379088303962209, + "grad_norm": 0.5412255550250967, + "learning_rate": 5.048699817697511e-07, + "loss": 11.8987, + "step": 35588 + }, + { + "epoch": 1.9379632843928039, + "grad_norm": 0.53510416101725, + "learning_rate": 5.039853778630477e-07, + "loss": 11.7264, + "step": 35589 + }, + { + "epoch": 1.9380177383893868, + "grad_norm": 0.5512286243043325, + "learning_rate": 5.03101547654039e-07, + "loss": 11.6982, + "step": 35590 + }, + { + "epoch": 1.9380721923859698, + "grad_norm": 0.5852851572606463, + "learning_rate": 5.022184911495864e-07, + "loss": 11.6774, + "step": 35591 + }, + { + "epoch": 1.9381266463825528, + "grad_norm": 0.5660109327879069, + "learning_rate": 5.013362083565843e-07, + "loss": 11.816, + "step": 35592 + }, + { + "epoch": 1.9381811003791358, + "grad_norm": 0.5389142622214665, + "learning_rate": 5.004546992818715e-07, + "loss": 11.8273, + "step": 35593 + }, + { + "epoch": 1.9382355543757188, + "grad_norm": 0.5466833615710256, + "learning_rate": 4.995739639323094e-07, + "loss": 11.7037, + "step": 35594 + }, + { + "epoch": 1.9382900083723018, + "grad_norm": 0.5101710844844005, + "learning_rate": 4.98694002314748e-07, + "loss": 11.7182, + "step": 35595 + }, + { + "epoch": 1.938344462368885, + "grad_norm": 0.5598999360532281, + "learning_rate": 4.978148144360262e-07, + "loss": 11.8324, + "step": 35596 + }, + { + "epoch": 1.938398916365468, + "grad_norm": 0.5141143040876567, + "learning_rate": 4.969364003029941e-07, + "loss": 11.7442, + "step": 35597 + }, + { + "epoch": 1.938453370362051, + "grad_norm": 0.5282292840073158, + "learning_rate": 4.960587599224575e-07, + "loss": 11.8029, + "step": 35598 + }, + { + "epoch": 1.938507824358634, + "grad_norm": 0.5559970232317345, + "learning_rate": 4.951818933012553e-07, + "loss": 11.6902, + "step": 35599 + }, + { + "epoch": 1.938562278355217, + "grad_norm": 0.5585131601873601, + "learning_rate": 4.943058004462042e-07, + "loss": 11.8164, + "step": 35600 + }, + { + "epoch": 1.9386167323518002, + "grad_norm": 0.5361690102528642, + "learning_rate": 4.934304813641211e-07, + "loss": 11.7702, + "step": 35601 + }, + { + "epoch": 1.9386711863483832, + "grad_norm": 0.5611180396196582, + "learning_rate": 4.925559360618226e-07, + "loss": 11.6667, + "step": 35602 + }, + { + "epoch": 1.9387256403449662, + "grad_norm": 0.6096788073552318, + "learning_rate": 4.916821645460812e-07, + "loss": 11.8067, + "step": 35603 + }, + { + "epoch": 1.9387800943415492, + "grad_norm": 0.606285916864132, + "learning_rate": 4.908091668237136e-07, + "loss": 11.7109, + "step": 35604 + }, + { + "epoch": 1.9388345483381322, + "grad_norm": 0.5485993629123017, + "learning_rate": 4.899369429014922e-07, + "loss": 11.8023, + "step": 35605 + }, + { + "epoch": 1.9388890023347152, + "grad_norm": 0.5112756488971831, + "learning_rate": 4.890654927862226e-07, + "loss": 11.8408, + "step": 35606 + }, + { + "epoch": 1.9389434563312982, + "grad_norm": 0.5109369905202668, + "learning_rate": 4.881948164846661e-07, + "loss": 11.8175, + "step": 35607 + }, + { + "epoch": 1.9389979103278812, + "grad_norm": 0.5256283769630259, + "learning_rate": 4.87324914003584e-07, + "loss": 11.7667, + "step": 35608 + }, + { + "epoch": 1.9390523643244642, + "grad_norm": 0.5116411016231788, + "learning_rate": 4.864557853497597e-07, + "loss": 11.681, + "step": 35609 + }, + { + "epoch": 1.9391068183210471, + "grad_norm": 0.5106795253498314, + "learning_rate": 4.855874305299435e-07, + "loss": 11.7391, + "step": 35610 + }, + { + "epoch": 1.9391612723176301, + "grad_norm": 0.5515760517609177, + "learning_rate": 4.847198495508853e-07, + "loss": 11.8701, + "step": 35611 + }, + { + "epoch": 1.9392157263142131, + "grad_norm": 0.5935175150195036, + "learning_rate": 4.838530424193355e-07, + "loss": 11.7416, + "step": 35612 + }, + { + "epoch": 1.9392701803107961, + "grad_norm": 0.5456600415296303, + "learning_rate": 4.829870091420219e-07, + "loss": 11.7238, + "step": 35613 + }, + { + "epoch": 1.9393246343073791, + "grad_norm": 0.5820267786212897, + "learning_rate": 4.821217497257058e-07, + "loss": 11.8029, + "step": 35614 + }, + { + "epoch": 1.939379088303962, + "grad_norm": 0.5972214761178529, + "learning_rate": 4.812572641770929e-07, + "loss": 11.8013, + "step": 35615 + }, + { + "epoch": 1.939433542300545, + "grad_norm": 0.5705273882659945, + "learning_rate": 4.803935525029224e-07, + "loss": 11.7862, + "step": 35616 + }, + { + "epoch": 1.939487996297128, + "grad_norm": 0.6152412570596103, + "learning_rate": 4.795306147098999e-07, + "loss": 11.8496, + "step": 35617 + }, + { + "epoch": 1.939542450293711, + "grad_norm": 0.6127777368977805, + "learning_rate": 4.786684508047201e-07, + "loss": 11.8153, + "step": 35618 + }, + { + "epoch": 1.9395969042902943, + "grad_norm": 0.5332224816342236, + "learning_rate": 4.778070607941221e-07, + "loss": 11.8921, + "step": 35619 + }, + { + "epoch": 1.9396513582868773, + "grad_norm": 0.560218249044104, + "learning_rate": 4.769464446847782e-07, + "loss": 11.7938, + "step": 35620 + }, + { + "epoch": 1.9397058122834603, + "grad_norm": 0.5371161705150863, + "learning_rate": 4.760866024833943e-07, + "loss": 11.795, + "step": 35621 + }, + { + "epoch": 1.9397602662800433, + "grad_norm": 0.5914869001523443, + "learning_rate": 4.752275341966428e-07, + "loss": 11.7295, + "step": 35622 + }, + { + "epoch": 1.9398147202766263, + "grad_norm": 0.5783624654681653, + "learning_rate": 4.7436923983120717e-07, + "loss": 11.9032, + "step": 35623 + }, + { + "epoch": 1.9398691742732093, + "grad_norm": 0.5788289930007964, + "learning_rate": 4.735117193937821e-07, + "loss": 11.8802, + "step": 35624 + }, + { + "epoch": 1.9399236282697925, + "grad_norm": 0.5369306486402379, + "learning_rate": 4.726549728910179e-07, + "loss": 11.8194, + "step": 35625 + }, + { + "epoch": 1.9399780822663755, + "grad_norm": 0.5421521257155484, + "learning_rate": 4.717990003295758e-07, + "loss": 11.8647, + "step": 35626 + }, + { + "epoch": 1.9400325362629585, + "grad_norm": 0.5615799146355253, + "learning_rate": 4.709438017161172e-07, + "loss": 11.8358, + "step": 35627 + }, + { + "epoch": 1.9400869902595415, + "grad_norm": 0.5786992811693126, + "learning_rate": 4.700893770572812e-07, + "loss": 11.7863, + "step": 35628 + }, + { + "epoch": 1.9401414442561244, + "grad_norm": 0.5857360801939799, + "learning_rate": 4.6923572635974023e-07, + "loss": 11.7307, + "step": 35629 + }, + { + "epoch": 1.9401958982527074, + "grad_norm": 0.5265721571586228, + "learning_rate": 4.6838284963010016e-07, + "loss": 11.7108, + "step": 35630 + }, + { + "epoch": 1.9402503522492904, + "grad_norm": 0.5645426037271813, + "learning_rate": 4.675307468750112e-07, + "loss": 11.7081, + "step": 35631 + }, + { + "epoch": 1.9403048062458734, + "grad_norm": 0.5340102986397449, + "learning_rate": 4.6667941810109026e-07, + "loss": 11.704, + "step": 35632 + }, + { + "epoch": 1.9403592602424564, + "grad_norm": 0.5791237343761372, + "learning_rate": 4.6582886331496543e-07, + "loss": 11.7368, + "step": 35633 + }, + { + "epoch": 1.9404137142390394, + "grad_norm": 0.5726002459202131, + "learning_rate": 4.649790825232425e-07, + "loss": 11.8715, + "step": 35634 + }, + { + "epoch": 1.9404681682356224, + "grad_norm": 0.5064762737642653, + "learning_rate": 4.641300757325273e-07, + "loss": 11.8122, + "step": 35635 + }, + { + "epoch": 1.9405226222322054, + "grad_norm": 0.5245745992275329, + "learning_rate": 4.632818429494479e-07, + "loss": 11.7848, + "step": 35636 + }, + { + "epoch": 1.9405770762287884, + "grad_norm": 0.5583594530276754, + "learning_rate": 4.6243438418057674e-07, + "loss": 11.8176, + "step": 35637 + }, + { + "epoch": 1.9406315302253714, + "grad_norm": 0.5467704679271195, + "learning_rate": 4.6158769943249747e-07, + "loss": 11.828, + "step": 35638 + }, + { + "epoch": 1.9406859842219544, + "grad_norm": 0.6377707657926502, + "learning_rate": 4.6074178871181595e-07, + "loss": 11.9018, + "step": 35639 + }, + { + "epoch": 1.9407404382185374, + "grad_norm": 0.5554824544815146, + "learning_rate": 4.598966520250936e-07, + "loss": 11.6895, + "step": 35640 + }, + { + "epoch": 1.9407948922151204, + "grad_norm": 0.5584007560908515, + "learning_rate": 4.590522893789029e-07, + "loss": 11.8051, + "step": 35641 + }, + { + "epoch": 1.9408493462117036, + "grad_norm": 0.5427642025318676, + "learning_rate": 4.5820870077982747e-07, + "loss": 11.7008, + "step": 35642 + }, + { + "epoch": 1.9409038002082866, + "grad_norm": 0.5466973752396468, + "learning_rate": 4.5736588623440655e-07, + "loss": 11.8237, + "step": 35643 + }, + { + "epoch": 1.9409582542048696, + "grad_norm": 0.53261653244681, + "learning_rate": 4.565238457492016e-07, + "loss": 11.847, + "step": 35644 + }, + { + "epoch": 1.9410127082014526, + "grad_norm": 0.5893298575811762, + "learning_rate": 4.5568257933075175e-07, + "loss": 11.8655, + "step": 35645 + }, + { + "epoch": 1.9410671621980355, + "grad_norm": 0.529436593129781, + "learning_rate": 4.5484208698562957e-07, + "loss": 11.7339, + "step": 35646 + }, + { + "epoch": 1.9411216161946185, + "grad_norm": 0.4989824159338248, + "learning_rate": 4.5400236872032987e-07, + "loss": 11.7142, + "step": 35647 + }, + { + "epoch": 1.9411760701912018, + "grad_norm": 0.5639858010167177, + "learning_rate": 4.5316342454141403e-07, + "loss": 11.7917, + "step": 35648 + }, + { + "epoch": 1.9412305241877847, + "grad_norm": 0.5489765442045642, + "learning_rate": 4.5232525445538796e-07, + "loss": 11.6759, + "step": 35649 + }, + { + "epoch": 1.9412849781843677, + "grad_norm": 0.552480917792173, + "learning_rate": 4.514878584687687e-07, + "loss": 11.5201, + "step": 35650 + }, + { + "epoch": 1.9413394321809507, + "grad_norm": 0.5364813325782707, + "learning_rate": 4.506512365880844e-07, + "loss": 11.6857, + "step": 35651 + }, + { + "epoch": 1.9413938861775337, + "grad_norm": 0.522938519881032, + "learning_rate": 4.498153888198298e-07, + "loss": 11.7345, + "step": 35652 + }, + { + "epoch": 1.9414483401741167, + "grad_norm": 0.5362017031750769, + "learning_rate": 4.4898031517049965e-07, + "loss": 11.8485, + "step": 35653 + }, + { + "epoch": 1.9415027941706997, + "grad_norm": 0.5341845330736563, + "learning_rate": 4.4814601564659996e-07, + "loss": 11.7259, + "step": 35654 + }, + { + "epoch": 1.9415572481672827, + "grad_norm": 0.5555783654732127, + "learning_rate": 4.473124902546033e-07, + "loss": 11.8794, + "step": 35655 + }, + { + "epoch": 1.9416117021638657, + "grad_norm": 0.5586152100740498, + "learning_rate": 4.464797390010045e-07, + "loss": 11.6914, + "step": 35656 + }, + { + "epoch": 1.9416661561604487, + "grad_norm": 0.5447660271687738, + "learning_rate": 4.456477618922761e-07, + "loss": 11.7789, + "step": 35657 + }, + { + "epoch": 1.9417206101570317, + "grad_norm": 0.622647630281201, + "learning_rate": 4.448165589348796e-07, + "loss": 11.7603, + "step": 35658 + }, + { + "epoch": 1.9417750641536147, + "grad_norm": 0.5431923100601692, + "learning_rate": 4.4398613013528766e-07, + "loss": 11.792, + "step": 35659 + }, + { + "epoch": 1.9418295181501977, + "grad_norm": 0.5790240480772548, + "learning_rate": 4.4315647549996176e-07, + "loss": 11.9162, + "step": 35660 + }, + { + "epoch": 1.9418839721467807, + "grad_norm": 0.5117777821406759, + "learning_rate": 4.4232759503534115e-07, + "loss": 11.7656, + "step": 35661 + }, + { + "epoch": 1.9419384261433636, + "grad_norm": 0.5479628306654931, + "learning_rate": 4.414994887478763e-07, + "loss": 11.622, + "step": 35662 + }, + { + "epoch": 1.9419928801399466, + "grad_norm": 0.6233033102570642, + "learning_rate": 4.4067215664400643e-07, + "loss": 11.9512, + "step": 35663 + }, + { + "epoch": 1.9420473341365296, + "grad_norm": 0.5564504797952488, + "learning_rate": 4.3984559873017086e-07, + "loss": 11.673, + "step": 35664 + }, + { + "epoch": 1.9421017881331128, + "grad_norm": 0.5448089265380534, + "learning_rate": 4.3901981501278664e-07, + "loss": 11.9291, + "step": 35665 + }, + { + "epoch": 1.9421562421296958, + "grad_norm": 0.48524318686328877, + "learning_rate": 4.3819480549828205e-07, + "loss": 11.723, + "step": 35666 + }, + { + "epoch": 1.9422106961262788, + "grad_norm": 0.6561161077775467, + "learning_rate": 4.3737057019307413e-07, + "loss": 11.7131, + "step": 35667 + }, + { + "epoch": 1.9422651501228618, + "grad_norm": 0.652101449136463, + "learning_rate": 4.3654710910356886e-07, + "loss": 11.9336, + "step": 35668 + }, + { + "epoch": 1.9423196041194448, + "grad_norm": 0.4927048110208113, + "learning_rate": 4.3572442223617225e-07, + "loss": 11.6496, + "step": 35669 + }, + { + "epoch": 1.9423740581160278, + "grad_norm": 0.5433086032537734, + "learning_rate": 4.349025095972792e-07, + "loss": 11.7397, + "step": 35670 + }, + { + "epoch": 1.942428512112611, + "grad_norm": 0.5282950663203201, + "learning_rate": 4.340813711932734e-07, + "loss": 11.8879, + "step": 35671 + }, + { + "epoch": 1.942482966109194, + "grad_norm": 0.47735063570741465, + "learning_rate": 4.3326100703054983e-07, + "loss": 11.827, + "step": 35672 + }, + { + "epoch": 1.942537420105777, + "grad_norm": 0.5427272413004708, + "learning_rate": 4.3244141711549223e-07, + "loss": 11.7919, + "step": 35673 + }, + { + "epoch": 1.94259187410236, + "grad_norm": 0.5197529372982094, + "learning_rate": 4.316226014544622e-07, + "loss": 11.7244, + "step": 35674 + }, + { + "epoch": 1.942646328098943, + "grad_norm": 0.5386760180800727, + "learning_rate": 4.3080456005383243e-07, + "loss": 11.7665, + "step": 35675 + }, + { + "epoch": 1.942700782095526, + "grad_norm": 0.6072065150359209, + "learning_rate": 4.2998729291997553e-07, + "loss": 11.8564, + "step": 35676 + }, + { + "epoch": 1.942755236092109, + "grad_norm": 0.5637670787877125, + "learning_rate": 4.2917080005921985e-07, + "loss": 11.7804, + "step": 35677 + }, + { + "epoch": 1.942809690088692, + "grad_norm": 0.565105528113536, + "learning_rate": 4.28355081477938e-07, + "loss": 11.8302, + "step": 35678 + }, + { + "epoch": 1.942864144085275, + "grad_norm": 0.5648751695002734, + "learning_rate": 4.2754013718245836e-07, + "loss": 11.6565, + "step": 35679 + }, + { + "epoch": 1.942918598081858, + "grad_norm": 0.5920240674745916, + "learning_rate": 4.267259671791424e-07, + "loss": 11.7315, + "step": 35680 + }, + { + "epoch": 1.942973052078441, + "grad_norm": 0.5077946424213308, + "learning_rate": 4.259125714742851e-07, + "loss": 11.78, + "step": 35681 + }, + { + "epoch": 1.943027506075024, + "grad_norm": 0.545072177508665, + "learning_rate": 4.25099950074237e-07, + "loss": 11.7516, + "step": 35682 + }, + { + "epoch": 1.943081960071607, + "grad_norm": 0.5208080262270302, + "learning_rate": 4.242881029853041e-07, + "loss": 11.7015, + "step": 35683 + }, + { + "epoch": 1.94313641406819, + "grad_norm": 0.48932973275562786, + "learning_rate": 4.234770302138147e-07, + "loss": 11.6227, + "step": 35684 + }, + { + "epoch": 1.943190868064773, + "grad_norm": 0.5030720565581431, + "learning_rate": 4.2266673176606376e-07, + "loss": 11.9005, + "step": 35685 + }, + { + "epoch": 1.943245322061356, + "grad_norm": 0.5441757426810855, + "learning_rate": 4.218572076483573e-07, + "loss": 11.8475, + "step": 35686 + }, + { + "epoch": 1.943299776057939, + "grad_norm": 0.5935279381739246, + "learning_rate": 4.210484578669904e-07, + "loss": 11.826, + "step": 35687 + }, + { + "epoch": 1.943354230054522, + "grad_norm": 0.5280797930627416, + "learning_rate": 4.202404824282469e-07, + "loss": 11.7909, + "step": 35688 + }, + { + "epoch": 1.9434086840511051, + "grad_norm": 0.5701949166003447, + "learning_rate": 4.1943328133841056e-07, + "loss": 11.782, + "step": 35689 + }, + { + "epoch": 1.943463138047688, + "grad_norm": 0.6112172870600644, + "learning_rate": 4.1862685460376525e-07, + "loss": 11.8887, + "step": 35690 + }, + { + "epoch": 1.943517592044271, + "grad_norm": 0.5271576083084188, + "learning_rate": 4.1782120223057273e-07, + "loss": 11.697, + "step": 35691 + }, + { + "epoch": 1.943572046040854, + "grad_norm": 0.676886655921369, + "learning_rate": 4.1701632422510575e-07, + "loss": 11.8064, + "step": 35692 + }, + { + "epoch": 1.943626500037437, + "grad_norm": 0.58317774283948, + "learning_rate": 4.1621222059361477e-07, + "loss": 11.7945, + "step": 35693 + }, + { + "epoch": 1.94368095403402, + "grad_norm": 0.5382340106794387, + "learning_rate": 4.154088913423615e-07, + "loss": 11.6911, + "step": 35694 + }, + { + "epoch": 1.9437354080306033, + "grad_norm": 0.5199886284674525, + "learning_rate": 4.146063364775854e-07, + "loss": 11.7783, + "step": 35695 + }, + { + "epoch": 1.9437898620271863, + "grad_norm": 0.5757448211057639, + "learning_rate": 4.13804556005537e-07, + "loss": 11.8136, + "step": 35696 + }, + { + "epoch": 1.9438443160237693, + "grad_norm": 0.6443045661948109, + "learning_rate": 4.1300354993244475e-07, + "loss": 11.8579, + "step": 35697 + }, + { + "epoch": 1.9438987700203523, + "grad_norm": 0.5467342450522417, + "learning_rate": 4.122033182645368e-07, + "loss": 11.7118, + "step": 35698 + }, + { + "epoch": 1.9439532240169353, + "grad_norm": 0.5542789289943424, + "learning_rate": 4.1140386100803063e-07, + "loss": 11.8412, + "step": 35699 + }, + { + "epoch": 1.9440076780135183, + "grad_norm": 0.580831701424614, + "learning_rate": 4.1060517816916553e-07, + "loss": 11.8274, + "step": 35700 + }, + { + "epoch": 1.9440621320101013, + "grad_norm": 0.583477141632455, + "learning_rate": 4.098072697541144e-07, + "loss": 11.8083, + "step": 35701 + }, + { + "epoch": 1.9441165860066842, + "grad_norm": 0.561768365617742, + "learning_rate": 4.090101357691167e-07, + "loss": 11.7626, + "step": 35702 + }, + { + "epoch": 1.9441710400032672, + "grad_norm": 0.5796074077575896, + "learning_rate": 4.082137762203564e-07, + "loss": 11.7128, + "step": 35703 + }, + { + "epoch": 1.9442254939998502, + "grad_norm": 0.5634412532462876, + "learning_rate": 4.0741819111402846e-07, + "loss": 11.777, + "step": 35704 + }, + { + "epoch": 1.9442799479964332, + "grad_norm": 0.5646150578537383, + "learning_rate": 4.0662338045630576e-07, + "loss": 11.7639, + "step": 35705 + }, + { + "epoch": 1.9443344019930162, + "grad_norm": 0.5284997169497452, + "learning_rate": 4.058293442533945e-07, + "loss": 11.7069, + "step": 35706 + }, + { + "epoch": 1.9443888559895992, + "grad_norm": 0.5379691627878892, + "learning_rate": 4.050360825114563e-07, + "loss": 11.8555, + "step": 35707 + }, + { + "epoch": 1.9444433099861822, + "grad_norm": 0.5539307429360871, + "learning_rate": 4.04243595236653e-07, + "loss": 11.7842, + "step": 35708 + }, + { + "epoch": 1.9444977639827652, + "grad_norm": 0.5455320142949022, + "learning_rate": 4.0345188243515743e-07, + "loss": 11.7417, + "step": 35709 + }, + { + "epoch": 1.9445522179793482, + "grad_norm": 0.5941572163906921, + "learning_rate": 4.0266094411312016e-07, + "loss": 11.7816, + "step": 35710 + }, + { + "epoch": 1.9446066719759312, + "grad_norm": 0.5993469973740057, + "learning_rate": 4.018707802766919e-07, + "loss": 11.7944, + "step": 35711 + }, + { + "epoch": 1.9446611259725144, + "grad_norm": 0.5623196863215588, + "learning_rate": 4.0108139093202323e-07, + "loss": 11.7284, + "step": 35712 + }, + { + "epoch": 1.9447155799690974, + "grad_norm": 0.5040781196717365, + "learning_rate": 4.0029277608524265e-07, + "loss": 11.7071, + "step": 35713 + }, + { + "epoch": 1.9447700339656804, + "grad_norm": 0.524003300164236, + "learning_rate": 3.995049357425007e-07, + "loss": 11.7005, + "step": 35714 + }, + { + "epoch": 1.9448244879622634, + "grad_norm": 0.4866673847118549, + "learning_rate": 3.987178699098926e-07, + "loss": 11.7628, + "step": 35715 + }, + { + "epoch": 1.9448789419588464, + "grad_norm": 0.5821477240631966, + "learning_rate": 3.9793157859358e-07, + "loss": 11.8356, + "step": 35716 + }, + { + "epoch": 1.9449333959554294, + "grad_norm": 0.510507799479634, + "learning_rate": 3.971460617996359e-07, + "loss": 11.767, + "step": 35717 + }, + { + "epoch": 1.9449878499520126, + "grad_norm": 0.6280518385953502, + "learning_rate": 3.9636131953419975e-07, + "loss": 11.6978, + "step": 35718 + }, + { + "epoch": 1.9450423039485956, + "grad_norm": 0.5285910951440296, + "learning_rate": 3.9557735180335567e-07, + "loss": 11.8543, + "step": 35719 + }, + { + "epoch": 1.9450967579451786, + "grad_norm": 0.5458123047948893, + "learning_rate": 3.9479415861320977e-07, + "loss": 11.7192, + "step": 35720 + }, + { + "epoch": 1.9451512119417615, + "grad_norm": 0.5328846918990849, + "learning_rate": 3.9401173996983507e-07, + "loss": 11.8453, + "step": 35721 + }, + { + "epoch": 1.9452056659383445, + "grad_norm": 0.5329099872425184, + "learning_rate": 3.932300958793489e-07, + "loss": 11.7942, + "step": 35722 + }, + { + "epoch": 1.9452601199349275, + "grad_norm": 0.5598684641929673, + "learning_rate": 3.924492263477908e-07, + "loss": 11.7646, + "step": 35723 + }, + { + "epoch": 1.9453145739315105, + "grad_norm": 0.559382593285798, + "learning_rate": 3.9166913138126704e-07, + "loss": 11.7958, + "step": 35724 + }, + { + "epoch": 1.9453690279280935, + "grad_norm": 0.5545658316275914, + "learning_rate": 3.908898109858172e-07, + "loss": 11.7752, + "step": 35725 + }, + { + "epoch": 1.9454234819246765, + "grad_norm": 0.5180100870720471, + "learning_rate": 3.901112651675143e-07, + "loss": 11.6259, + "step": 35726 + }, + { + "epoch": 1.9454779359212595, + "grad_norm": 0.545965859639784, + "learning_rate": 3.8933349393242003e-07, + "loss": 11.8456, + "step": 35727 + }, + { + "epoch": 1.9455323899178425, + "grad_norm": 0.6642197509422173, + "learning_rate": 3.88556497286563e-07, + "loss": 11.721, + "step": 35728 + }, + { + "epoch": 1.9455868439144255, + "grad_norm": 0.4712969524763708, + "learning_rate": 3.877802752360049e-07, + "loss": 11.6598, + "step": 35729 + }, + { + "epoch": 1.9456412979110085, + "grad_norm": 0.5494635343391067, + "learning_rate": 3.8700482778676327e-07, + "loss": 11.8528, + "step": 35730 + }, + { + "epoch": 1.9456957519075915, + "grad_norm": 0.5428316624175658, + "learning_rate": 3.8623015494488877e-07, + "loss": 11.7118, + "step": 35731 + }, + { + "epoch": 1.9457502059041745, + "grad_norm": 0.4863273638061356, + "learning_rate": 3.854562567163766e-07, + "loss": 11.6769, + "step": 35732 + }, + { + "epoch": 1.9458046599007575, + "grad_norm": 0.5746758191981314, + "learning_rate": 3.8468313310727753e-07, + "loss": 11.8249, + "step": 35733 + }, + { + "epoch": 1.9458591138973405, + "grad_norm": 0.5685567238136227, + "learning_rate": 3.8391078412357563e-07, + "loss": 11.7821, + "step": 35734 + }, + { + "epoch": 1.9459135678939237, + "grad_norm": 0.5237326655910691, + "learning_rate": 3.831392097712994e-07, + "loss": 11.7451, + "step": 35735 + }, + { + "epoch": 1.9459680218905067, + "grad_norm": 0.6046829238409155, + "learning_rate": 3.823684100564329e-07, + "loss": 11.8291, + "step": 35736 + }, + { + "epoch": 1.9460224758870897, + "grad_norm": 0.533224507214216, + "learning_rate": 3.8159838498498247e-07, + "loss": 11.7393, + "step": 35737 + }, + { + "epoch": 1.9460769298836726, + "grad_norm": 0.5436070399414449, + "learning_rate": 3.8082913456292114e-07, + "loss": 11.8217, + "step": 35738 + }, + { + "epoch": 1.9461313838802556, + "grad_norm": 0.5453353709124602, + "learning_rate": 3.800606587962441e-07, + "loss": 11.8112, + "step": 35739 + }, + { + "epoch": 1.9461858378768386, + "grad_norm": 0.5812766822010963, + "learning_rate": 3.792929576909132e-07, + "loss": 11.8766, + "step": 35740 + }, + { + "epoch": 1.9462402918734218, + "grad_norm": 0.5537300880662903, + "learning_rate": 3.7852603125291265e-07, + "loss": 11.7217, + "step": 35741 + }, + { + "epoch": 1.9462947458700048, + "grad_norm": 0.5276328697901749, + "learning_rate": 3.7775987948819316e-07, + "loss": 11.8891, + "step": 35742 + }, + { + "epoch": 1.9463491998665878, + "grad_norm": 0.5260705314244255, + "learning_rate": 3.769945024027277e-07, + "loss": 11.6815, + "step": 35743 + }, + { + "epoch": 1.9464036538631708, + "grad_norm": 0.5785938476405321, + "learning_rate": 3.7622990000245606e-07, + "loss": 11.7479, + "step": 35744 + }, + { + "epoch": 1.9464581078597538, + "grad_norm": 0.5896680240690657, + "learning_rate": 3.75466072293329e-07, + "loss": 11.7817, + "step": 35745 + }, + { + "epoch": 1.9465125618563368, + "grad_norm": 0.6498796511110028, + "learning_rate": 3.747030192812862e-07, + "loss": 11.7938, + "step": 35746 + }, + { + "epoch": 1.9465670158529198, + "grad_norm": 0.7237580619834845, + "learning_rate": 3.7394074097226725e-07, + "loss": 11.8408, + "step": 35747 + }, + { + "epoch": 1.9466214698495028, + "grad_norm": 0.5563646385313362, + "learning_rate": 3.7317923737217876e-07, + "loss": 11.7168, + "step": 35748 + }, + { + "epoch": 1.9466759238460858, + "grad_norm": 0.5168367031565501, + "learning_rate": 3.7241850848696023e-07, + "loss": 11.7155, + "step": 35749 + }, + { + "epoch": 1.9467303778426688, + "grad_norm": 0.5130486735454791, + "learning_rate": 3.7165855432252926e-07, + "loss": 11.7459, + "step": 35750 + }, + { + "epoch": 1.9467848318392518, + "grad_norm": 0.6244948898279601, + "learning_rate": 3.708993748847811e-07, + "loss": 11.7626, + "step": 35751 + }, + { + "epoch": 1.9468392858358348, + "grad_norm": 0.5530923103096124, + "learning_rate": 3.701409701796332e-07, + "loss": 11.7108, + "step": 35752 + }, + { + "epoch": 1.9468937398324178, + "grad_norm": 0.5661141389051748, + "learning_rate": 3.6938334021296985e-07, + "loss": 11.8574, + "step": 35753 + }, + { + "epoch": 1.9469481938290008, + "grad_norm": 0.5182482736419168, + "learning_rate": 3.686264849906973e-07, + "loss": 11.5661, + "step": 35754 + }, + { + "epoch": 1.9470026478255837, + "grad_norm": 0.5639635402331181, + "learning_rate": 3.678704045186776e-07, + "loss": 11.5872, + "step": 35755 + }, + { + "epoch": 1.9470571018221667, + "grad_norm": 0.5171289199709851, + "learning_rate": 3.6711509880282823e-07, + "loss": 11.7741, + "step": 35756 + }, + { + "epoch": 1.9471115558187497, + "grad_norm": 0.516906244992106, + "learning_rate": 3.6636056784898896e-07, + "loss": 11.8151, + "step": 35757 + }, + { + "epoch": 1.9471660098153327, + "grad_norm": 0.5059858006377905, + "learning_rate": 3.656068116630329e-07, + "loss": 11.5865, + "step": 35758 + }, + { + "epoch": 1.947220463811916, + "grad_norm": 0.4941656244664837, + "learning_rate": 3.6485383025084416e-07, + "loss": 11.7932, + "step": 35759 + }, + { + "epoch": 1.947274917808499, + "grad_norm": 0.6434683748293571, + "learning_rate": 3.641016236182404e-07, + "loss": 11.7971, + "step": 35760 + }, + { + "epoch": 1.947329371805082, + "grad_norm": 0.5282323122304959, + "learning_rate": 3.6335019177110575e-07, + "loss": 11.7219, + "step": 35761 + }, + { + "epoch": 1.947383825801665, + "grad_norm": 0.502077551476034, + "learning_rate": 3.6259953471526885e-07, + "loss": 11.7572, + "step": 35762 + }, + { + "epoch": 1.947438279798248, + "grad_norm": 0.499740545919771, + "learning_rate": 3.618496524565584e-07, + "loss": 11.7284, + "step": 35763 + }, + { + "epoch": 1.947492733794831, + "grad_norm": 0.5412275970791999, + "learning_rate": 3.6110054500081425e-07, + "loss": 11.8111, + "step": 35764 + }, + { + "epoch": 1.9475471877914141, + "grad_norm": 0.6620992968835064, + "learning_rate": 3.6035221235387607e-07, + "loss": 11.8467, + "step": 35765 + }, + { + "epoch": 1.947601641787997, + "grad_norm": 0.5463680273203441, + "learning_rate": 3.5960465452152815e-07, + "loss": 11.8238, + "step": 35766 + }, + { + "epoch": 1.94765609578458, + "grad_norm": 0.6230451822896992, + "learning_rate": 3.588578715096214e-07, + "loss": 11.7563, + "step": 35767 + }, + { + "epoch": 1.947710549781163, + "grad_norm": 0.5500990160108249, + "learning_rate": 3.5811186332393997e-07, + "loss": 11.7255, + "step": 35768 + }, + { + "epoch": 1.947765003777746, + "grad_norm": 0.5274759632381156, + "learning_rate": 3.5736662997029046e-07, + "loss": 11.7417, + "step": 35769 + }, + { + "epoch": 1.947819457774329, + "grad_norm": 0.6134745912046852, + "learning_rate": 3.566221714544682e-07, + "loss": 11.8862, + "step": 35770 + }, + { + "epoch": 1.947873911770912, + "grad_norm": 0.5942901860692718, + "learning_rate": 3.558784877822574e-07, + "loss": 11.7984, + "step": 35771 + }, + { + "epoch": 1.947928365767495, + "grad_norm": 0.5324894676895175, + "learning_rate": 3.551355789594535e-07, + "loss": 11.5921, + "step": 35772 + }, + { + "epoch": 1.947982819764078, + "grad_norm": 0.5208748411718979, + "learning_rate": 3.543934449918185e-07, + "loss": 11.7375, + "step": 35773 + }, + { + "epoch": 1.948037273760661, + "grad_norm": 0.5884086872750396, + "learning_rate": 3.5365208588513666e-07, + "loss": 11.8309, + "step": 35774 + }, + { + "epoch": 1.948091727757244, + "grad_norm": 0.5344587420224608, + "learning_rate": 3.5291150164517006e-07, + "loss": 11.7614, + "step": 35775 + }, + { + "epoch": 1.948146181753827, + "grad_norm": 0.5807156630632645, + "learning_rate": 3.5217169227765857e-07, + "loss": 11.8448, + "step": 35776 + }, + { + "epoch": 1.94820063575041, + "grad_norm": 0.6509098455259031, + "learning_rate": 3.514326577883864e-07, + "loss": 11.9187, + "step": 35777 + }, + { + "epoch": 1.948255089746993, + "grad_norm": 0.6640034731038533, + "learning_rate": 3.5069439818308235e-07, + "loss": 11.8736, + "step": 35778 + }, + { + "epoch": 1.948309543743576, + "grad_norm": 0.5197435171253312, + "learning_rate": 3.499569134674863e-07, + "loss": 11.7151, + "step": 35779 + }, + { + "epoch": 1.948363997740159, + "grad_norm": 0.5382578311627094, + "learning_rate": 3.49220203647338e-07, + "loss": 11.711, + "step": 35780 + }, + { + "epoch": 1.948418451736742, + "grad_norm": 0.5033336945565409, + "learning_rate": 3.4848426872836626e-07, + "loss": 11.8021, + "step": 35781 + }, + { + "epoch": 1.9484729057333252, + "grad_norm": 0.5469208153599247, + "learning_rate": 3.477491087162887e-07, + "loss": 11.668, + "step": 35782 + }, + { + "epoch": 1.9485273597299082, + "grad_norm": 0.5779186555582223, + "learning_rate": 3.4701472361682307e-07, + "loss": 11.726, + "step": 35783 + }, + { + "epoch": 1.9485818137264912, + "grad_norm": 0.567181339386298, + "learning_rate": 3.462811134356869e-07, + "loss": 11.8276, + "step": 35784 + }, + { + "epoch": 1.9486362677230742, + "grad_norm": 0.5817138107091715, + "learning_rate": 3.455482781785868e-07, + "loss": 11.7262, + "step": 35785 + }, + { + "epoch": 1.9486907217196572, + "grad_norm": 0.5178025546351565, + "learning_rate": 3.448162178512071e-07, + "loss": 11.7248, + "step": 35786 + }, + { + "epoch": 1.9487451757162402, + "grad_norm": 0.6111691085854609, + "learning_rate": 3.440849324592543e-07, + "loss": 11.7132, + "step": 35787 + }, + { + "epoch": 1.9487996297128234, + "grad_norm": 0.4830768142712869, + "learning_rate": 3.4335442200840173e-07, + "loss": 11.6678, + "step": 35788 + }, + { + "epoch": 1.9488540837094064, + "grad_norm": 0.5205172880075745, + "learning_rate": 3.4262468650434474e-07, + "loss": 11.7703, + "step": 35789 + }, + { + "epoch": 1.9489085377059894, + "grad_norm": 0.5361165745380371, + "learning_rate": 3.4189572595274553e-07, + "loss": 11.7288, + "step": 35790 + }, + { + "epoch": 1.9489629917025724, + "grad_norm": 0.5943033370828161, + "learning_rate": 3.4116754035928845e-07, + "loss": 11.7242, + "step": 35791 + }, + { + "epoch": 1.9490174456991554, + "grad_norm": 0.5666927184201768, + "learning_rate": 3.4044012972961336e-07, + "loss": 11.771, + "step": 35792 + }, + { + "epoch": 1.9490718996957384, + "grad_norm": 0.5402614924496174, + "learning_rate": 3.397134940693936e-07, + "loss": 11.8572, + "step": 35793 + }, + { + "epoch": 1.9491263536923213, + "grad_norm": 0.5913378435730663, + "learning_rate": 3.38987633384269e-07, + "loss": 11.766, + "step": 35794 + }, + { + "epoch": 1.9491808076889043, + "grad_norm": 0.5183274891548123, + "learning_rate": 3.3826254767990174e-07, + "loss": 11.8697, + "step": 35795 + }, + { + "epoch": 1.9492352616854873, + "grad_norm": 0.5579197662683869, + "learning_rate": 3.3753823696190956e-07, + "loss": 11.7581, + "step": 35796 + }, + { + "epoch": 1.9492897156820703, + "grad_norm": 0.541985234741951, + "learning_rate": 3.368147012359324e-07, + "loss": 11.7521, + "step": 35797 + }, + { + "epoch": 1.9493441696786533, + "grad_norm": 0.565821080166775, + "learning_rate": 3.360919405075991e-07, + "loss": 11.7639, + "step": 35798 + }, + { + "epoch": 1.9493986236752363, + "grad_norm": 0.590903133530086, + "learning_rate": 3.353699547825273e-07, + "loss": 11.6631, + "step": 35799 + }, + { + "epoch": 1.9494530776718193, + "grad_norm": 0.5516842409895033, + "learning_rate": 3.3464874406634597e-07, + "loss": 11.8536, + "step": 35800 + }, + { + "epoch": 1.9495075316684023, + "grad_norm": 0.5342780122454649, + "learning_rate": 3.339283083646283e-07, + "loss": 11.7583, + "step": 35801 + }, + { + "epoch": 1.9495619856649853, + "grad_norm": 0.4950279828894907, + "learning_rate": 3.332086476830143e-07, + "loss": 11.6153, + "step": 35802 + }, + { + "epoch": 1.9496164396615683, + "grad_norm": 0.5697677897162219, + "learning_rate": 3.324897620270773e-07, + "loss": 11.7594, + "step": 35803 + }, + { + "epoch": 1.9496708936581513, + "grad_norm": 0.5429610345061743, + "learning_rate": 3.317716514024127e-07, + "loss": 11.7725, + "step": 35804 + }, + { + "epoch": 1.9497253476547345, + "grad_norm": 0.5510586216933023, + "learning_rate": 3.3105431581461623e-07, + "loss": 11.9017, + "step": 35805 + }, + { + "epoch": 1.9497798016513175, + "grad_norm": 0.5190511466084771, + "learning_rate": 3.3033775526923883e-07, + "loss": 11.739, + "step": 35806 + }, + { + "epoch": 1.9498342556479005, + "grad_norm": 0.5364381969782219, + "learning_rate": 3.296219697718872e-07, + "loss": 11.8013, + "step": 35807 + }, + { + "epoch": 1.9498887096444835, + "grad_norm": 0.5947973922439259, + "learning_rate": 3.289069593281013e-07, + "loss": 11.719, + "step": 35808 + }, + { + "epoch": 1.9499431636410665, + "grad_norm": 0.5521077995128394, + "learning_rate": 3.2819272394344346e-07, + "loss": 11.7002, + "step": 35809 + }, + { + "epoch": 1.9499976176376494, + "grad_norm": 0.5764012215332128, + "learning_rate": 3.274792636234869e-07, + "loss": 11.8677, + "step": 35810 + }, + { + "epoch": 1.9500520716342327, + "grad_norm": 0.5578297352939146, + "learning_rate": 3.267665783737606e-07, + "loss": 11.9374, + "step": 35811 + }, + { + "epoch": 1.9501065256308157, + "grad_norm": 0.5322309769436364, + "learning_rate": 3.260546681998156e-07, + "loss": 11.7566, + "step": 35812 + }, + { + "epoch": 1.9501609796273986, + "grad_norm": 0.5433238958891174, + "learning_rate": 3.253435331071808e-07, + "loss": 11.7498, + "step": 35813 + }, + { + "epoch": 1.9502154336239816, + "grad_norm": 0.48925847983033494, + "learning_rate": 3.2463317310138517e-07, + "loss": 11.7488, + "step": 35814 + }, + { + "epoch": 1.9502698876205646, + "grad_norm": 0.5589126774225989, + "learning_rate": 3.2392358818796873e-07, + "loss": 11.8701, + "step": 35815 + }, + { + "epoch": 1.9503243416171476, + "grad_norm": 0.5709214017534532, + "learning_rate": 3.2321477837242706e-07, + "loss": 11.7667, + "step": 35816 + }, + { + "epoch": 1.9503787956137306, + "grad_norm": 0.5940625147770527, + "learning_rate": 3.225067436603002e-07, + "loss": 11.7525, + "step": 35817 + }, + { + "epoch": 1.9504332496103136, + "grad_norm": 0.4917298206686435, + "learning_rate": 3.217994840570615e-07, + "loss": 11.7953, + "step": 35818 + }, + { + "epoch": 1.9504877036068966, + "grad_norm": 0.5999062352893816, + "learning_rate": 3.210929995682288e-07, + "loss": 11.7939, + "step": 35819 + }, + { + "epoch": 1.9505421576034796, + "grad_norm": 0.4887759734218869, + "learning_rate": 3.203872901992977e-07, + "loss": 11.7573, + "step": 35820 + }, + { + "epoch": 1.9505966116000626, + "grad_norm": 0.5424081437122078, + "learning_rate": 3.1968235595574156e-07, + "loss": 11.7486, + "step": 35821 + }, + { + "epoch": 1.9506510655966456, + "grad_norm": 0.5817441787724815, + "learning_rate": 3.1897819684305606e-07, + "loss": 11.8727, + "step": 35822 + }, + { + "epoch": 1.9507055195932286, + "grad_norm": 0.5180513380536177, + "learning_rate": 3.1827481286671454e-07, + "loss": 11.7364, + "step": 35823 + }, + { + "epoch": 1.9507599735898116, + "grad_norm": 0.49946414602183087, + "learning_rate": 3.1757220403219043e-07, + "loss": 11.8093, + "step": 35824 + }, + { + "epoch": 1.9508144275863946, + "grad_norm": 0.5077580998679988, + "learning_rate": 3.168703703449349e-07, + "loss": 11.7523, + "step": 35825 + }, + { + "epoch": 1.9508688815829776, + "grad_norm": 0.5229095469697806, + "learning_rate": 3.161693118104103e-07, + "loss": 11.7871, + "step": 35826 + }, + { + "epoch": 1.9509233355795605, + "grad_norm": 0.5624545584192372, + "learning_rate": 3.154690284340789e-07, + "loss": 11.8014, + "step": 35827 + }, + { + "epoch": 1.9509777895761435, + "grad_norm": 0.6411604832075658, + "learning_rate": 3.1476952022136964e-07, + "loss": 11.8469, + "step": 35828 + }, + { + "epoch": 1.9510322435727268, + "grad_norm": 0.5604763411659524, + "learning_rate": 3.1407078717773377e-07, + "loss": 11.7794, + "step": 35829 + }, + { + "epoch": 1.9510866975693097, + "grad_norm": 0.518215737932677, + "learning_rate": 3.1337282930860025e-07, + "loss": 11.7237, + "step": 35830 + }, + { + "epoch": 1.9511411515658927, + "grad_norm": 0.5534885203746889, + "learning_rate": 3.1267564661938697e-07, + "loss": 11.7628, + "step": 35831 + }, + { + "epoch": 1.9511956055624757, + "grad_norm": 0.6007370207747913, + "learning_rate": 3.119792391155341e-07, + "loss": 11.8422, + "step": 35832 + }, + { + "epoch": 1.9512500595590587, + "grad_norm": 0.5412034443034113, + "learning_rate": 3.112836068024483e-07, + "loss": 11.8291, + "step": 35833 + }, + { + "epoch": 1.951304513555642, + "grad_norm": 0.5023854244170816, + "learning_rate": 3.1058874968554754e-07, + "loss": 11.675, + "step": 35834 + }, + { + "epoch": 1.951358967552225, + "grad_norm": 0.5998728340734277, + "learning_rate": 3.0989466777021634e-07, + "loss": 11.8802, + "step": 35835 + }, + { + "epoch": 1.951413421548808, + "grad_norm": 0.5233710975280607, + "learning_rate": 3.0920136106186157e-07, + "loss": 11.7182, + "step": 35836 + }, + { + "epoch": 1.951467875545391, + "grad_norm": 0.5886865846789391, + "learning_rate": 3.085088295658789e-07, + "loss": 11.8063, + "step": 35837 + }, + { + "epoch": 1.951522329541974, + "grad_norm": 0.5820195424925961, + "learning_rate": 3.0781707328765285e-07, + "loss": 11.7188, + "step": 35838 + }, + { + "epoch": 1.951576783538557, + "grad_norm": 0.5645678801383038, + "learning_rate": 3.0712609223255696e-07, + "loss": 11.7586, + "step": 35839 + }, + { + "epoch": 1.95163123753514, + "grad_norm": 0.6150839369493637, + "learning_rate": 3.0643588640596467e-07, + "loss": 11.8566, + "step": 35840 + }, + { + "epoch": 1.9516856915317229, + "grad_norm": 0.5641827692573567, + "learning_rate": 3.0574645581323834e-07, + "loss": 11.6754, + "step": 35841 + }, + { + "epoch": 1.9517401455283059, + "grad_norm": 0.629641290932581, + "learning_rate": 3.050578004597626e-07, + "loss": 11.6847, + "step": 35842 + }, + { + "epoch": 1.9517945995248889, + "grad_norm": 0.5240048791950138, + "learning_rate": 3.0436992035086656e-07, + "loss": 11.8017, + "step": 35843 + }, + { + "epoch": 1.9518490535214719, + "grad_norm": 0.5207401473257164, + "learning_rate": 3.0368281549191245e-07, + "loss": 11.6369, + "step": 35844 + }, + { + "epoch": 1.9519035075180549, + "grad_norm": 0.5680741577814936, + "learning_rate": 3.0299648588822946e-07, + "loss": 11.8314, + "step": 35845 + }, + { + "epoch": 1.9519579615146379, + "grad_norm": 0.5415864232828316, + "learning_rate": 3.0231093154518e-07, + "loss": 11.8047, + "step": 35846 + }, + { + "epoch": 1.9520124155112208, + "grad_norm": 0.5483960674646408, + "learning_rate": 3.016261524680708e-07, + "loss": 11.8152, + "step": 35847 + }, + { + "epoch": 1.9520668695078038, + "grad_norm": 0.5239235695206559, + "learning_rate": 3.009421486622421e-07, + "loss": 11.8147, + "step": 35848 + }, + { + "epoch": 1.9521213235043868, + "grad_norm": 0.6451641908611242, + "learning_rate": 3.002589201330008e-07, + "loss": 11.765, + "step": 35849 + }, + { + "epoch": 1.9521757775009698, + "grad_norm": 0.5243196034409832, + "learning_rate": 2.995764668856649e-07, + "loss": 11.8682, + "step": 35850 + }, + { + "epoch": 1.9522302314975528, + "grad_norm": 0.562819873362613, + "learning_rate": 2.988947889255522e-07, + "loss": 11.8393, + "step": 35851 + }, + { + "epoch": 1.952284685494136, + "grad_norm": 0.5234802610463309, + "learning_rate": 2.9821388625794756e-07, + "loss": 11.7525, + "step": 35852 + }, + { + "epoch": 1.952339139490719, + "grad_norm": 0.5223475941794641, + "learning_rate": 2.975337588881466e-07, + "loss": 11.7858, + "step": 35853 + }, + { + "epoch": 1.952393593487302, + "grad_norm": 0.5188269331431005, + "learning_rate": 2.9685440682144515e-07, + "loss": 11.8606, + "step": 35854 + }, + { + "epoch": 1.952448047483885, + "grad_norm": 0.5103479857297905, + "learning_rate": 2.961758300631279e-07, + "loss": 11.6701, + "step": 35855 + }, + { + "epoch": 1.952502501480468, + "grad_norm": 0.5454324703986294, + "learning_rate": 2.954980286184683e-07, + "loss": 11.8446, + "step": 35856 + }, + { + "epoch": 1.952556955477051, + "grad_norm": 0.5400136085861927, + "learning_rate": 2.9482100249274005e-07, + "loss": 11.7596, + "step": 35857 + }, + { + "epoch": 1.9526114094736342, + "grad_norm": 0.5282519667995272, + "learning_rate": 2.941447516911944e-07, + "loss": 11.8004, + "step": 35858 + }, + { + "epoch": 1.9526658634702172, + "grad_norm": 0.5545893747726753, + "learning_rate": 2.9346927621910494e-07, + "loss": 11.553, + "step": 35859 + }, + { + "epoch": 1.9527203174668002, + "grad_norm": 0.5537199705247947, + "learning_rate": 2.927945760817119e-07, + "loss": 11.7516, + "step": 35860 + }, + { + "epoch": 1.9527747714633832, + "grad_norm": 0.5631832176629185, + "learning_rate": 2.921206512842778e-07, + "loss": 11.8641, + "step": 35861 + }, + { + "epoch": 1.9528292254599662, + "grad_norm": 0.5360749020353921, + "learning_rate": 2.9144750183203175e-07, + "loss": 11.624, + "step": 35862 + }, + { + "epoch": 1.9528836794565492, + "grad_norm": 0.5334187994037874, + "learning_rate": 2.90775127730214e-07, + "loss": 11.9112, + "step": 35863 + }, + { + "epoch": 1.9529381334531322, + "grad_norm": 0.5445344224613425, + "learning_rate": 2.901035289840537e-07, + "loss": 11.8131, + "step": 35864 + }, + { + "epoch": 1.9529925874497152, + "grad_norm": 0.5487306338296999, + "learning_rate": 2.894327055987578e-07, + "loss": 11.6989, + "step": 35865 + }, + { + "epoch": 1.9530470414462981, + "grad_norm": 0.4904048312942676, + "learning_rate": 2.887626575795666e-07, + "loss": 11.6938, + "step": 35866 + }, + { + "epoch": 1.9531014954428811, + "grad_norm": 0.5223437638123665, + "learning_rate": 2.880933849316647e-07, + "loss": 11.7829, + "step": 35867 + }, + { + "epoch": 1.9531559494394641, + "grad_norm": 0.5653797954884884, + "learning_rate": 2.874248876602814e-07, + "loss": 11.6763, + "step": 35868 + }, + { + "epoch": 1.9532104034360471, + "grad_norm": 0.5670962305740367, + "learning_rate": 2.867571657706014e-07, + "loss": 11.6804, + "step": 35869 + }, + { + "epoch": 1.9532648574326301, + "grad_norm": 0.5361234131465358, + "learning_rate": 2.8609021926782053e-07, + "loss": 11.8379, + "step": 35870 + }, + { + "epoch": 1.9533193114292131, + "grad_norm": 0.5745048326946705, + "learning_rate": 2.8542404815712354e-07, + "loss": 11.722, + "step": 35871 + }, + { + "epoch": 1.953373765425796, + "grad_norm": 0.5558435101449316, + "learning_rate": 2.847586524436951e-07, + "loss": 11.8554, + "step": 35872 + }, + { + "epoch": 1.953428219422379, + "grad_norm": 0.5234960261363741, + "learning_rate": 2.8409403213269794e-07, + "loss": 11.692, + "step": 35873 + }, + { + "epoch": 1.953482673418962, + "grad_norm": 0.5513330073098779, + "learning_rate": 2.8343018722930546e-07, + "loss": 11.875, + "step": 35874 + }, + { + "epoch": 1.9535371274155453, + "grad_norm": 0.5461841410829567, + "learning_rate": 2.8276711773869146e-07, + "loss": 11.871, + "step": 35875 + }, + { + "epoch": 1.9535915814121283, + "grad_norm": 0.5587509588084834, + "learning_rate": 2.821048236659962e-07, + "loss": 11.7609, + "step": 35876 + }, + { + "epoch": 1.9536460354087113, + "grad_norm": 0.5564115773370264, + "learning_rate": 2.814433050163823e-07, + "loss": 11.8441, + "step": 35877 + }, + { + "epoch": 1.9537004894052943, + "grad_norm": 0.5415376775011018, + "learning_rate": 2.8078256179498995e-07, + "loss": 11.7623, + "step": 35878 + }, + { + "epoch": 1.9537549434018773, + "grad_norm": 0.5500602009273343, + "learning_rate": 2.801225940069485e-07, + "loss": 11.8713, + "step": 35879 + }, + { + "epoch": 1.9538093973984603, + "grad_norm": 0.6269348301275441, + "learning_rate": 2.7946340165739825e-07, + "loss": 11.8026, + "step": 35880 + }, + { + "epoch": 1.9538638513950435, + "grad_norm": 0.5750371656321503, + "learning_rate": 2.788049847514684e-07, + "loss": 11.8651, + "step": 35881 + }, + { + "epoch": 1.9539183053916265, + "grad_norm": 0.5823462549100343, + "learning_rate": 2.7814734329426606e-07, + "loss": 11.7562, + "step": 35882 + }, + { + "epoch": 1.9539727593882095, + "grad_norm": 0.6173978633885092, + "learning_rate": 2.7749047729092036e-07, + "loss": 11.7826, + "step": 35883 + }, + { + "epoch": 1.9540272133847925, + "grad_norm": 0.5880719517248111, + "learning_rate": 2.768343867465273e-07, + "loss": 11.8653, + "step": 35884 + }, + { + "epoch": 1.9540816673813755, + "grad_norm": 0.5758611817619522, + "learning_rate": 2.7617907166619384e-07, + "loss": 11.722, + "step": 35885 + }, + { + "epoch": 1.9541361213779584, + "grad_norm": 0.5832879258558195, + "learning_rate": 2.7552453205501595e-07, + "loss": 11.7369, + "step": 35886 + }, + { + "epoch": 1.9541905753745414, + "grad_norm": 0.5287119368124059, + "learning_rate": 2.7487076791808954e-07, + "loss": 11.8529, + "step": 35887 + }, + { + "epoch": 1.9542450293711244, + "grad_norm": 0.5157165752512857, + "learning_rate": 2.7421777926048833e-07, + "loss": 11.8183, + "step": 35888 + }, + { + "epoch": 1.9542994833677074, + "grad_norm": 0.4865903755511943, + "learning_rate": 2.735655660872971e-07, + "loss": 11.7977, + "step": 35889 + }, + { + "epoch": 1.9543539373642904, + "grad_norm": 0.540523073617327, + "learning_rate": 2.7291412840357855e-07, + "loss": 11.7901, + "step": 35890 + }, + { + "epoch": 1.9544083913608734, + "grad_norm": 0.5487497903139256, + "learning_rate": 2.7226346621440633e-07, + "loss": 11.6646, + "step": 35891 + }, + { + "epoch": 1.9544628453574564, + "grad_norm": 0.5891033440618993, + "learning_rate": 2.7161357952483204e-07, + "loss": 11.7166, + "step": 35892 + }, + { + "epoch": 1.9545172993540394, + "grad_norm": 0.5863427885907998, + "learning_rate": 2.709644683399182e-07, + "loss": 11.8414, + "step": 35893 + }, + { + "epoch": 1.9545717533506224, + "grad_norm": 0.5819569928162447, + "learning_rate": 2.7031613266471636e-07, + "loss": 11.7669, + "step": 35894 + }, + { + "epoch": 1.9546262073472054, + "grad_norm": 0.5619965174082716, + "learning_rate": 2.696685725042558e-07, + "loss": 11.7991, + "step": 35895 + }, + { + "epoch": 1.9546806613437884, + "grad_norm": 0.563162459752338, + "learning_rate": 2.690217878635659e-07, + "loss": 11.8986, + "step": 35896 + }, + { + "epoch": 1.9547351153403714, + "grad_norm": 0.5085379508253828, + "learning_rate": 2.6837577874769817e-07, + "loss": 11.7301, + "step": 35897 + }, + { + "epoch": 1.9547895693369544, + "grad_norm": 0.5520974546880524, + "learning_rate": 2.6773054516167074e-07, + "loss": 11.7723, + "step": 35898 + }, + { + "epoch": 1.9548440233335376, + "grad_norm": 0.5518164002357201, + "learning_rate": 2.670860871104908e-07, + "loss": 11.7063, + "step": 35899 + }, + { + "epoch": 1.9548984773301206, + "grad_norm": 0.6009453695692543, + "learning_rate": 2.664424045991765e-07, + "loss": 11.789, + "step": 35900 + }, + { + "epoch": 1.9549529313267036, + "grad_norm": 0.5692161719869828, + "learning_rate": 2.657994976327238e-07, + "loss": 11.7154, + "step": 35901 + }, + { + "epoch": 1.9550073853232866, + "grad_norm": 0.6287841057724544, + "learning_rate": 2.6515736621615106e-07, + "loss": 11.7546, + "step": 35902 + }, + { + "epoch": 1.9550618393198695, + "grad_norm": 0.5583070790923353, + "learning_rate": 2.6451601035443196e-07, + "loss": 11.8544, + "step": 35903 + }, + { + "epoch": 1.9551162933164528, + "grad_norm": 0.507130766576949, + "learning_rate": 2.638754300525625e-07, + "loss": 11.6692, + "step": 35904 + }, + { + "epoch": 1.9551707473130358, + "grad_norm": 0.6557667819767078, + "learning_rate": 2.632356253155277e-07, + "loss": 11.7887, + "step": 35905 + }, + { + "epoch": 1.9552252013096187, + "grad_norm": 0.5321084664675033, + "learning_rate": 2.6259659614829013e-07, + "loss": 11.6028, + "step": 35906 + }, + { + "epoch": 1.9552796553062017, + "grad_norm": 0.5084910653859216, + "learning_rate": 2.6195834255583474e-07, + "loss": 11.7913, + "step": 35907 + }, + { + "epoch": 1.9553341093027847, + "grad_norm": 0.5389367268452145, + "learning_rate": 2.613208645431242e-07, + "loss": 11.8112, + "step": 35908 + }, + { + "epoch": 1.9553885632993677, + "grad_norm": 0.5369992545014218, + "learning_rate": 2.6068416211509906e-07, + "loss": 11.8744, + "step": 35909 + }, + { + "epoch": 1.9554430172959507, + "grad_norm": 0.5785829195847547, + "learning_rate": 2.6004823527672196e-07, + "loss": 11.79, + "step": 35910 + }, + { + "epoch": 1.9554974712925337, + "grad_norm": 0.6076617351802577, + "learning_rate": 2.594130840329334e-07, + "loss": 11.7503, + "step": 35911 + }, + { + "epoch": 1.9555519252891167, + "grad_norm": 0.5304492212541593, + "learning_rate": 2.587787083886739e-07, + "loss": 11.7637, + "step": 35912 + }, + { + "epoch": 1.9556063792856997, + "grad_norm": 0.5155447374843087, + "learning_rate": 2.5814510834888393e-07, + "loss": 11.7147, + "step": 35913 + }, + { + "epoch": 1.9556608332822827, + "grad_norm": 0.5760552732945751, + "learning_rate": 2.575122839184818e-07, + "loss": 11.8351, + "step": 35914 + }, + { + "epoch": 1.9557152872788657, + "grad_norm": 0.6113632142587743, + "learning_rate": 2.568802351023858e-07, + "loss": 11.8025, + "step": 35915 + }, + { + "epoch": 1.9557697412754487, + "grad_norm": 0.5589133512186288, + "learning_rate": 2.5624896190552526e-07, + "loss": 11.79, + "step": 35916 + }, + { + "epoch": 1.9558241952720317, + "grad_norm": 0.5391823032797487, + "learning_rate": 2.5561846433279633e-07, + "loss": 11.7124, + "step": 35917 + }, + { + "epoch": 1.9558786492686147, + "grad_norm": 0.5836982543128165, + "learning_rate": 2.549887423891062e-07, + "loss": 11.6308, + "step": 35918 + }, + { + "epoch": 1.9559331032651976, + "grad_norm": 0.6049656136525037, + "learning_rate": 2.543597960793509e-07, + "loss": 11.846, + "step": 35919 + }, + { + "epoch": 1.9559875572617806, + "grad_norm": 0.5440007353911324, + "learning_rate": 2.5373162540841547e-07, + "loss": 11.6707, + "step": 35920 + }, + { + "epoch": 1.9560420112583636, + "grad_norm": 0.5871422008088131, + "learning_rate": 2.531042303811959e-07, + "loss": 11.6263, + "step": 35921 + }, + { + "epoch": 1.9560964652549468, + "grad_norm": 0.5726808866474816, + "learning_rate": 2.5247761100256615e-07, + "loss": 11.7663, + "step": 35922 + }, + { + "epoch": 1.9561509192515298, + "grad_norm": 0.5255182273916505, + "learning_rate": 2.51851767277389e-07, + "loss": 11.695, + "step": 35923 + }, + { + "epoch": 1.9562053732481128, + "grad_norm": 0.7316910928448936, + "learning_rate": 2.512266992105494e-07, + "loss": 11.8428, + "step": 35924 + }, + { + "epoch": 1.9562598272446958, + "grad_norm": 0.5466159138712406, + "learning_rate": 2.5060240680689907e-07, + "loss": 11.7252, + "step": 35925 + }, + { + "epoch": 1.9563142812412788, + "grad_norm": 0.5120301825262785, + "learning_rate": 2.499788900712896e-07, + "loss": 11.8882, + "step": 35926 + }, + { + "epoch": 1.9563687352378618, + "grad_norm": 0.5531173590274187, + "learning_rate": 2.493561490085727e-07, + "loss": 11.7761, + "step": 35927 + }, + { + "epoch": 1.956423189234445, + "grad_norm": 0.6035921506793719, + "learning_rate": 2.48734183623589e-07, + "loss": 11.7222, + "step": 35928 + }, + { + "epoch": 1.956477643231028, + "grad_norm": 0.5275334665423085, + "learning_rate": 2.4811299392117906e-07, + "loss": 11.8672, + "step": 35929 + }, + { + "epoch": 1.956532097227611, + "grad_norm": 0.5079229798744881, + "learning_rate": 2.4749257990617224e-07, + "loss": 11.7034, + "step": 35930 + }, + { + "epoch": 1.956586551224194, + "grad_norm": 0.5632949809556068, + "learning_rate": 2.46872941583387e-07, + "loss": 11.7346, + "step": 35931 + }, + { + "epoch": 1.956641005220777, + "grad_norm": 0.538298404356519, + "learning_rate": 2.4625407895765285e-07, + "loss": 11.7376, + "step": 35932 + }, + { + "epoch": 1.95669545921736, + "grad_norm": 0.5876919351619007, + "learning_rate": 2.4563599203376584e-07, + "loss": 11.6859, + "step": 35933 + }, + { + "epoch": 1.956749913213943, + "grad_norm": 0.556354689695551, + "learning_rate": 2.450186808165555e-07, + "loss": 11.7755, + "step": 35934 + }, + { + "epoch": 1.956804367210526, + "grad_norm": 0.5260272536635366, + "learning_rate": 2.4440214531079586e-07, + "loss": 11.7413, + "step": 35935 + }, + { + "epoch": 1.956858821207109, + "grad_norm": 0.5286884270683888, + "learning_rate": 2.4378638552129404e-07, + "loss": 11.8724, + "step": 35936 + }, + { + "epoch": 1.956913275203692, + "grad_norm": 0.5494134903867526, + "learning_rate": 2.4317140145284634e-07, + "loss": 11.7719, + "step": 35937 + }, + { + "epoch": 1.956967729200275, + "grad_norm": 0.5925043697881837, + "learning_rate": 2.425571931102266e-07, + "loss": 11.6398, + "step": 35938 + }, + { + "epoch": 1.957022183196858, + "grad_norm": 0.5943765468098076, + "learning_rate": 2.419437604982089e-07, + "loss": 11.9939, + "step": 35939 + }, + { + "epoch": 1.957076637193441, + "grad_norm": 0.5535055704856124, + "learning_rate": 2.4133110362156705e-07, + "loss": 11.8611, + "step": 35940 + }, + { + "epoch": 1.957131091190024, + "grad_norm": 0.5040609159798455, + "learning_rate": 2.4071922248506404e-07, + "loss": 11.6314, + "step": 35941 + }, + { + "epoch": 1.957185545186607, + "grad_norm": 0.5475128848344688, + "learning_rate": 2.401081170934627e-07, + "loss": 11.7481, + "step": 35942 + }, + { + "epoch": 1.95723999918319, + "grad_norm": 0.4946371589393999, + "learning_rate": 2.394977874515036e-07, + "loss": 11.7724, + "step": 35943 + }, + { + "epoch": 1.957294453179773, + "grad_norm": 0.5377785856912255, + "learning_rate": 2.3888823356393864e-07, + "loss": 11.7273, + "step": 35944 + }, + { + "epoch": 1.9573489071763561, + "grad_norm": 0.5307371097819616, + "learning_rate": 2.3827945543551943e-07, + "loss": 11.5581, + "step": 35945 + }, + { + "epoch": 1.9574033611729391, + "grad_norm": 0.5651747604016063, + "learning_rate": 2.376714530709534e-07, + "loss": 11.8479, + "step": 35946 + }, + { + "epoch": 1.957457815169522, + "grad_norm": 0.5283998511994507, + "learning_rate": 2.3706422647499226e-07, + "loss": 11.7311, + "step": 35947 + }, + { + "epoch": 1.957512269166105, + "grad_norm": 0.5407110930783279, + "learning_rate": 2.3645777565235448e-07, + "loss": 11.639, + "step": 35948 + }, + { + "epoch": 1.957566723162688, + "grad_norm": 0.5639319181617664, + "learning_rate": 2.3585210060774742e-07, + "loss": 11.7106, + "step": 35949 + }, + { + "epoch": 1.957621177159271, + "grad_norm": 0.505142217294847, + "learning_rate": 2.3524720134587842e-07, + "loss": 11.6646, + "step": 35950 + }, + { + "epoch": 1.9576756311558543, + "grad_norm": 0.5746314781996184, + "learning_rate": 2.3464307787146588e-07, + "loss": 11.8535, + "step": 35951 + }, + { + "epoch": 1.9577300851524373, + "grad_norm": 0.5174578720733319, + "learning_rate": 2.34039730189195e-07, + "loss": 11.7564, + "step": 35952 + }, + { + "epoch": 1.9577845391490203, + "grad_norm": 0.5251376682898412, + "learning_rate": 2.33437158303762e-07, + "loss": 11.7435, + "step": 35953 + }, + { + "epoch": 1.9578389931456033, + "grad_norm": 0.5353396839108149, + "learning_rate": 2.3283536221986312e-07, + "loss": 11.7581, + "step": 35954 + }, + { + "epoch": 1.9578934471421863, + "grad_norm": 0.5340541763313547, + "learning_rate": 2.322343419421502e-07, + "loss": 11.7559, + "step": 35955 + }, + { + "epoch": 1.9579479011387693, + "grad_norm": 0.5204118864493482, + "learning_rate": 2.316340974753306e-07, + "loss": 11.8139, + "step": 35956 + }, + { + "epoch": 1.9580023551353523, + "grad_norm": 0.5972537787744839, + "learning_rate": 2.3103462882404503e-07, + "loss": 11.7654, + "step": 35957 + }, + { + "epoch": 1.9580568091319352, + "grad_norm": 0.505470952708302, + "learning_rate": 2.3043593599296753e-07, + "loss": 11.7382, + "step": 35958 + }, + { + "epoch": 1.9581112631285182, + "grad_norm": 0.517484750902238, + "learning_rate": 2.2983801898674996e-07, + "loss": 11.7658, + "step": 35959 + }, + { + "epoch": 1.9581657171251012, + "grad_norm": 0.6277265276756603, + "learning_rate": 2.2924087781004412e-07, + "loss": 11.8407, + "step": 35960 + }, + { + "epoch": 1.9582201711216842, + "grad_norm": 0.5175984169503505, + "learning_rate": 2.2864451246749074e-07, + "loss": 11.6994, + "step": 35961 + }, + { + "epoch": 1.9582746251182672, + "grad_norm": 0.5292819777409135, + "learning_rate": 2.280489229637417e-07, + "loss": 11.7636, + "step": 35962 + }, + { + "epoch": 1.9583290791148502, + "grad_norm": 0.5432526409607965, + "learning_rate": 2.274541093034044e-07, + "loss": 11.8115, + "step": 35963 + }, + { + "epoch": 1.9583835331114332, + "grad_norm": 0.5721332638844125, + "learning_rate": 2.2686007149111955e-07, + "loss": 11.7229, + "step": 35964 + }, + { + "epoch": 1.9584379871080162, + "grad_norm": 0.5306818464954678, + "learning_rate": 2.2626680953149459e-07, + "loss": 11.6406, + "step": 35965 + }, + { + "epoch": 1.9584924411045992, + "grad_norm": 0.5303782181200513, + "learning_rate": 2.2567432342915916e-07, + "loss": 11.8844, + "step": 35966 + }, + { + "epoch": 1.9585468951011822, + "grad_norm": 0.5247701926028948, + "learning_rate": 2.250826131887096e-07, + "loss": 11.8682, + "step": 35967 + }, + { + "epoch": 1.9586013490977654, + "grad_norm": 0.4854950431216873, + "learning_rate": 2.244916788147533e-07, + "loss": 11.7764, + "step": 35968 + }, + { + "epoch": 1.9586558030943484, + "grad_norm": 0.5879777289575042, + "learning_rate": 2.239015203118866e-07, + "loss": 11.8634, + "step": 35969 + }, + { + "epoch": 1.9587102570909314, + "grad_norm": 0.5213985472647493, + "learning_rate": 2.2331213768468363e-07, + "loss": 11.8124, + "step": 35970 + }, + { + "epoch": 1.9587647110875144, + "grad_norm": 0.5562151313233744, + "learning_rate": 2.2272353093774067e-07, + "loss": 11.8171, + "step": 35971 + }, + { + "epoch": 1.9588191650840974, + "grad_norm": 0.5000176824145144, + "learning_rate": 2.2213570007563188e-07, + "loss": 11.7175, + "step": 35972 + }, + { + "epoch": 1.9588736190806804, + "grad_norm": 0.5177771222958579, + "learning_rate": 2.215486451029314e-07, + "loss": 11.821, + "step": 35973 + }, + { + "epoch": 1.9589280730772636, + "grad_norm": 0.5053136477219986, + "learning_rate": 2.2096236602420216e-07, + "loss": 11.8311, + "step": 35974 + }, + { + "epoch": 1.9589825270738466, + "grad_norm": 0.5547501316099366, + "learning_rate": 2.2037686284399617e-07, + "loss": 11.744, + "step": 35975 + }, + { + "epoch": 1.9590369810704296, + "grad_norm": 0.5583075796416789, + "learning_rate": 2.197921355668875e-07, + "loss": 11.7493, + "step": 35976 + }, + { + "epoch": 1.9590914350670126, + "grad_norm": 0.4865546407423326, + "learning_rate": 2.1920818419739475e-07, + "loss": 11.8093, + "step": 35977 + }, + { + "epoch": 1.9591458890635955, + "grad_norm": 0.5159941282380012, + "learning_rate": 2.1862500874008097e-07, + "loss": 11.845, + "step": 35978 + }, + { + "epoch": 1.9592003430601785, + "grad_norm": 0.5738069207992845, + "learning_rate": 2.1804260919946472e-07, + "loss": 11.7668, + "step": 35979 + }, + { + "epoch": 1.9592547970567615, + "grad_norm": 0.544092997987775, + "learning_rate": 2.1746098558008688e-07, + "loss": 11.8062, + "step": 35980 + }, + { + "epoch": 1.9593092510533445, + "grad_norm": 0.5430659754606114, + "learning_rate": 2.16880137886466e-07, + "loss": 11.7779, + "step": 35981 + }, + { + "epoch": 1.9593637050499275, + "grad_norm": 0.5533043904573204, + "learning_rate": 2.163000661231096e-07, + "loss": 11.7627, + "step": 35982 + }, + { + "epoch": 1.9594181590465105, + "grad_norm": 0.5491360911113741, + "learning_rate": 2.157207702945474e-07, + "loss": 11.755, + "step": 35983 + }, + { + "epoch": 1.9594726130430935, + "grad_norm": 0.5939356269830557, + "learning_rate": 2.151422504052758e-07, + "loss": 11.8318, + "step": 35984 + }, + { + "epoch": 1.9595270670396765, + "grad_norm": 0.5346205594886096, + "learning_rate": 2.145645064597801e-07, + "loss": 11.7513, + "step": 35985 + }, + { + "epoch": 1.9595815210362595, + "grad_norm": 0.5220698824521048, + "learning_rate": 2.139875384625789e-07, + "loss": 11.7206, + "step": 35986 + }, + { + "epoch": 1.9596359750328425, + "grad_norm": 0.5531759666057698, + "learning_rate": 2.1341134641813532e-07, + "loss": 11.7065, + "step": 35987 + }, + { + "epoch": 1.9596904290294255, + "grad_norm": 0.5283570502275963, + "learning_rate": 2.1283593033094572e-07, + "loss": 11.8188, + "step": 35988 + }, + { + "epoch": 1.9597448830260085, + "grad_norm": 0.5472719714085682, + "learning_rate": 2.1226129020547324e-07, + "loss": 11.6957, + "step": 35989 + }, + { + "epoch": 1.9597993370225915, + "grad_norm": 0.5066518263393189, + "learning_rate": 2.1168742604619206e-07, + "loss": 11.7311, + "step": 35990 + }, + { + "epoch": 1.9598537910191745, + "grad_norm": 0.5613462019930039, + "learning_rate": 2.111143378575653e-07, + "loss": 11.7848, + "step": 35991 + }, + { + "epoch": 1.9599082450157577, + "grad_norm": 0.5879546238432012, + "learning_rate": 2.105420256440449e-07, + "loss": 11.8254, + "step": 35992 + }, + { + "epoch": 1.9599626990123407, + "grad_norm": 0.5143005623587931, + "learning_rate": 2.09970489410094e-07, + "loss": 11.7918, + "step": 35993 + }, + { + "epoch": 1.9600171530089237, + "grad_norm": 0.5088615201152062, + "learning_rate": 2.0939972916014238e-07, + "loss": 11.7431, + "step": 35994 + }, + { + "epoch": 1.9600716070055066, + "grad_norm": 0.6261982315383645, + "learning_rate": 2.0882974489863093e-07, + "loss": 11.7243, + "step": 35995 + }, + { + "epoch": 1.9601260610020896, + "grad_norm": 0.5667986517146926, + "learning_rate": 2.0826053662998946e-07, + "loss": 11.7451, + "step": 35996 + }, + { + "epoch": 1.9601805149986726, + "grad_norm": 0.5579407408108145, + "learning_rate": 2.0769210435865883e-07, + "loss": 11.6502, + "step": 35997 + }, + { + "epoch": 1.9602349689952558, + "grad_norm": 0.5572480299646217, + "learning_rate": 2.0712444808904662e-07, + "loss": 11.6107, + "step": 35998 + }, + { + "epoch": 1.9602894229918388, + "grad_norm": 0.498678713069211, + "learning_rate": 2.0655756782557157e-07, + "loss": 11.8045, + "step": 35999 + }, + { + "epoch": 1.9603438769884218, + "grad_norm": 0.521042250840242, + "learning_rate": 2.059914635726301e-07, + "loss": 11.7027, + "step": 36000 + }, + { + "epoch": 1.9603983309850048, + "grad_norm": 0.5331566584946978, + "learning_rate": 2.0542613533465204e-07, + "loss": 11.7363, + "step": 36001 + }, + { + "epoch": 1.9604527849815878, + "grad_norm": 0.5443088404153574, + "learning_rate": 2.0486158311600057e-07, + "loss": 11.7402, + "step": 36002 + }, + { + "epoch": 1.9605072389781708, + "grad_norm": 0.5525041906124101, + "learning_rate": 2.0429780692108325e-07, + "loss": 11.7649, + "step": 36003 + }, + { + "epoch": 1.9605616929747538, + "grad_norm": 0.6303177546887656, + "learning_rate": 2.0373480675428546e-07, + "loss": 11.7865, + "step": 36004 + }, + { + "epoch": 1.9606161469713368, + "grad_norm": 0.5915968681274381, + "learning_rate": 2.031725826199815e-07, + "loss": 11.8998, + "step": 36005 + }, + { + "epoch": 1.9606706009679198, + "grad_norm": 0.5011611063775976, + "learning_rate": 2.0261113452254566e-07, + "loss": 11.7123, + "step": 36006 + }, + { + "epoch": 1.9607250549645028, + "grad_norm": 0.5570439052407872, + "learning_rate": 2.0205046246634108e-07, + "loss": 11.7016, + "step": 36007 + }, + { + "epoch": 1.9607795089610858, + "grad_norm": 0.512534127629289, + "learning_rate": 2.0149056645573094e-07, + "loss": 11.788, + "step": 36008 + }, + { + "epoch": 1.9608339629576688, + "grad_norm": 0.5184480782611407, + "learning_rate": 2.0093144649505623e-07, + "loss": 11.7903, + "step": 36009 + }, + { + "epoch": 1.9608884169542518, + "grad_norm": 0.5857843005853354, + "learning_rate": 2.0037310258868013e-07, + "loss": 11.8534, + "step": 36010 + }, + { + "epoch": 1.9609428709508347, + "grad_norm": 0.5111709976498129, + "learning_rate": 1.998155347409436e-07, + "loss": 11.5414, + "step": 36011 + }, + { + "epoch": 1.9609973249474177, + "grad_norm": 0.5292095211027054, + "learning_rate": 1.9925874295617652e-07, + "loss": 11.732, + "step": 36012 + }, + { + "epoch": 1.9610517789440007, + "grad_norm": 0.5856080790092227, + "learning_rate": 1.9870272723870875e-07, + "loss": 11.8398, + "step": 36013 + }, + { + "epoch": 1.9611062329405837, + "grad_norm": 0.5033977054195551, + "learning_rate": 1.9814748759285906e-07, + "loss": 11.7954, + "step": 36014 + }, + { + "epoch": 1.961160686937167, + "grad_norm": 0.577376962998737, + "learning_rate": 1.9759302402295731e-07, + "loss": 11.8011, + "step": 36015 + }, + { + "epoch": 1.96121514093375, + "grad_norm": 0.5397605666328251, + "learning_rate": 1.9703933653331118e-07, + "loss": 11.757, + "step": 36016 + }, + { + "epoch": 1.961269594930333, + "grad_norm": 0.5541654365163368, + "learning_rate": 1.9648642512821724e-07, + "loss": 11.7338, + "step": 36017 + }, + { + "epoch": 1.961324048926916, + "grad_norm": 0.7217196531735636, + "learning_rate": 1.9593428981198315e-07, + "loss": 11.7926, + "step": 36018 + }, + { + "epoch": 1.961378502923499, + "grad_norm": 0.519998290054018, + "learning_rate": 1.9538293058890543e-07, + "loss": 11.6498, + "step": 36019 + }, + { + "epoch": 1.961432956920082, + "grad_norm": 0.5272176504867117, + "learning_rate": 1.948323474632696e-07, + "loss": 11.7032, + "step": 36020 + }, + { + "epoch": 1.9614874109166651, + "grad_norm": 0.5099707324800663, + "learning_rate": 1.9428254043935003e-07, + "loss": 11.451, + "step": 36021 + }, + { + "epoch": 1.9615418649132481, + "grad_norm": 0.5179851898636314, + "learning_rate": 1.93733509521421e-07, + "loss": 11.6959, + "step": 36022 + }, + { + "epoch": 1.961596318909831, + "grad_norm": 0.5372911964255624, + "learning_rate": 1.9318525471376802e-07, + "loss": 11.8403, + "step": 36023 + }, + { + "epoch": 1.961650772906414, + "grad_norm": 0.761199762720354, + "learning_rate": 1.9263777602064327e-07, + "loss": 11.727, + "step": 36024 + }, + { + "epoch": 1.961705226902997, + "grad_norm": 0.4977141123013079, + "learning_rate": 1.9209107344629885e-07, + "loss": 11.7505, + "step": 36025 + }, + { + "epoch": 1.96175968089958, + "grad_norm": 0.549922404983853, + "learning_rate": 1.9154514699499805e-07, + "loss": 11.7971, + "step": 36026 + }, + { + "epoch": 1.961814134896163, + "grad_norm": 0.5653266048395228, + "learning_rate": 1.909999966709819e-07, + "loss": 11.8807, + "step": 36027 + }, + { + "epoch": 1.961868588892746, + "grad_norm": 0.5776493133370724, + "learning_rate": 1.9045562247848036e-07, + "loss": 11.7873, + "step": 36028 + }, + { + "epoch": 1.961923042889329, + "grad_norm": 0.5485644438477592, + "learning_rate": 1.899120244217345e-07, + "loss": 11.7357, + "step": 36029 + }, + { + "epoch": 1.961977496885912, + "grad_norm": 0.5381699981451984, + "learning_rate": 1.8936920250497425e-07, + "loss": 11.6875, + "step": 36030 + }, + { + "epoch": 1.962031950882495, + "grad_norm": 0.5204981639255041, + "learning_rate": 1.8882715673241847e-07, + "loss": 11.7527, + "step": 36031 + }, + { + "epoch": 1.962086404879078, + "grad_norm": 0.4941649129436532, + "learning_rate": 1.8828588710827488e-07, + "loss": 11.7216, + "step": 36032 + }, + { + "epoch": 1.962140858875661, + "grad_norm": 0.5787425359730036, + "learning_rate": 1.8774539363676235e-07, + "loss": 11.8042, + "step": 36033 + }, + { + "epoch": 1.962195312872244, + "grad_norm": 0.5298592707443699, + "learning_rate": 1.8720567632207753e-07, + "loss": 11.7328, + "step": 36034 + }, + { + "epoch": 1.962249766868827, + "grad_norm": 0.6048682113144075, + "learning_rate": 1.8666673516842814e-07, + "loss": 11.7806, + "step": 36035 + }, + { + "epoch": 1.96230422086541, + "grad_norm": 0.5203259207628885, + "learning_rate": 1.8612857017998864e-07, + "loss": 11.7484, + "step": 36036 + }, + { + "epoch": 1.962358674861993, + "grad_norm": 0.5203654124111285, + "learning_rate": 1.8559118136095566e-07, + "loss": 11.7989, + "step": 36037 + }, + { + "epoch": 1.9624131288585762, + "grad_norm": 0.5574154978845504, + "learning_rate": 1.8505456871550364e-07, + "loss": 11.803, + "step": 36038 + }, + { + "epoch": 1.9624675828551592, + "grad_norm": 0.5187730591400143, + "learning_rate": 1.8451873224780703e-07, + "loss": 11.7569, + "step": 36039 + }, + { + "epoch": 1.9625220368517422, + "grad_norm": 0.5397345979656596, + "learning_rate": 1.8398367196202914e-07, + "loss": 11.5374, + "step": 36040 + }, + { + "epoch": 1.9625764908483252, + "grad_norm": 0.5249696001010945, + "learning_rate": 1.8344938786233334e-07, + "loss": 11.8053, + "step": 36041 + }, + { + "epoch": 1.9626309448449082, + "grad_norm": 0.6110236430723038, + "learning_rate": 1.8291587995288295e-07, + "loss": 11.8182, + "step": 36042 + }, + { + "epoch": 1.9626853988414912, + "grad_norm": 0.5413015579929863, + "learning_rate": 1.82383148237808e-07, + "loss": 11.7702, + "step": 36043 + }, + { + "epoch": 1.9627398528380744, + "grad_norm": 0.583288513988857, + "learning_rate": 1.8185119272126072e-07, + "loss": 11.714, + "step": 36044 + }, + { + "epoch": 1.9627943068346574, + "grad_norm": 0.5334439277205439, + "learning_rate": 1.8132001340739334e-07, + "loss": 11.8311, + "step": 36045 + }, + { + "epoch": 1.9628487608312404, + "grad_norm": 0.6313076412553729, + "learning_rate": 1.8078961030030262e-07, + "loss": 11.9019, + "step": 36046 + }, + { + "epoch": 1.9629032148278234, + "grad_norm": 0.5802374704330805, + "learning_rate": 1.8025998340414075e-07, + "loss": 11.7941, + "step": 36047 + }, + { + "epoch": 1.9629576688244064, + "grad_norm": 0.5530229621499704, + "learning_rate": 1.7973113272301556e-07, + "loss": 11.7996, + "step": 36048 + }, + { + "epoch": 1.9630121228209894, + "grad_norm": 0.611200710106464, + "learning_rate": 1.79203058261046e-07, + "loss": 11.7493, + "step": 36049 + }, + { + "epoch": 1.9630665768175724, + "grad_norm": 0.5538728300789155, + "learning_rate": 1.7867576002232878e-07, + "loss": 11.8603, + "step": 36050 + }, + { + "epoch": 1.9631210308141553, + "grad_norm": 0.5902849805325706, + "learning_rate": 1.781492380109606e-07, + "loss": 11.7395, + "step": 36051 + }, + { + "epoch": 1.9631754848107383, + "grad_norm": 0.5172885994447218, + "learning_rate": 1.7762349223106046e-07, + "loss": 11.7958, + "step": 36052 + }, + { + "epoch": 1.9632299388073213, + "grad_norm": 0.5611427830043566, + "learning_rate": 1.770985226866806e-07, + "loss": 11.7454, + "step": 36053 + }, + { + "epoch": 1.9632843928039043, + "grad_norm": 0.5455502001926145, + "learning_rate": 1.7657432938193997e-07, + "loss": 11.8378, + "step": 36054 + }, + { + "epoch": 1.9633388468004873, + "grad_norm": 0.5278946672983367, + "learning_rate": 1.7605091232089088e-07, + "loss": 11.6538, + "step": 36055 + }, + { + "epoch": 1.9633933007970703, + "grad_norm": 0.6774585307507369, + "learning_rate": 1.7552827150760788e-07, + "loss": 11.9573, + "step": 36056 + }, + { + "epoch": 1.9634477547936533, + "grad_norm": 0.5560376521290947, + "learning_rate": 1.7500640694616544e-07, + "loss": 11.8007, + "step": 36057 + }, + { + "epoch": 1.9635022087902363, + "grad_norm": 0.5959644389597665, + "learning_rate": 1.744853186406048e-07, + "loss": 11.8268, + "step": 36058 + }, + { + "epoch": 1.9635566627868193, + "grad_norm": 0.5390154979311883, + "learning_rate": 1.7396500659498938e-07, + "loss": 11.7693, + "step": 36059 + }, + { + "epoch": 1.9636111167834023, + "grad_norm": 0.5274492519851942, + "learning_rate": 1.7344547081337147e-07, + "loss": 11.7415, + "step": 36060 + }, + { + "epoch": 1.9636655707799853, + "grad_norm": 0.602824620451956, + "learning_rate": 1.7292671129977013e-07, + "loss": 11.7833, + "step": 36061 + }, + { + "epoch": 1.9637200247765685, + "grad_norm": 0.5294160814401556, + "learning_rate": 1.724087280582376e-07, + "loss": 11.6379, + "step": 36062 + }, + { + "epoch": 1.9637744787731515, + "grad_norm": 0.49065103439773283, + "learning_rate": 1.7189152109280405e-07, + "loss": 11.8398, + "step": 36063 + }, + { + "epoch": 1.9638289327697345, + "grad_norm": 0.5378153877668171, + "learning_rate": 1.7137509040746625e-07, + "loss": 11.861, + "step": 36064 + }, + { + "epoch": 1.9638833867663175, + "grad_norm": 0.520991498039108, + "learning_rate": 1.7085943600626542e-07, + "loss": 11.827, + "step": 36065 + }, + { + "epoch": 1.9639378407629005, + "grad_norm": 0.5675381880773707, + "learning_rate": 1.7034455789320947e-07, + "loss": 11.8059, + "step": 36066 + }, + { + "epoch": 1.9639922947594837, + "grad_norm": 0.48008876926079747, + "learning_rate": 1.698304560722841e-07, + "loss": 11.5584, + "step": 36067 + }, + { + "epoch": 1.9640467487560667, + "grad_norm": 0.542396205604003, + "learning_rate": 1.693171305475083e-07, + "loss": 11.7639, + "step": 36068 + }, + { + "epoch": 1.9641012027526497, + "grad_norm": 0.51188379760167, + "learning_rate": 1.688045813228567e-07, + "loss": 11.8636, + "step": 36069 + }, + { + "epoch": 1.9641556567492326, + "grad_norm": 0.5727338721667895, + "learning_rate": 1.682928084023261e-07, + "loss": 11.7454, + "step": 36070 + }, + { + "epoch": 1.9642101107458156, + "grad_norm": 0.5429382756620393, + "learning_rate": 1.6778181178989106e-07, + "loss": 11.7864, + "step": 36071 + }, + { + "epoch": 1.9642645647423986, + "grad_norm": 0.6027145212717409, + "learning_rate": 1.672715914895262e-07, + "loss": 11.7084, + "step": 36072 + }, + { + "epoch": 1.9643190187389816, + "grad_norm": 0.5824037743355904, + "learning_rate": 1.6676214750520613e-07, + "loss": 11.7379, + "step": 36073 + }, + { + "epoch": 1.9643734727355646, + "grad_norm": 0.5365915612304775, + "learning_rate": 1.6625347984088324e-07, + "loss": 11.7134, + "step": 36074 + }, + { + "epoch": 1.9644279267321476, + "grad_norm": 0.5111232224399088, + "learning_rate": 1.657455885005099e-07, + "loss": 11.6599, + "step": 36075 + }, + { + "epoch": 1.9644823807287306, + "grad_norm": 0.5047051101211683, + "learning_rate": 1.6523847348803856e-07, + "loss": 11.8254, + "step": 36076 + }, + { + "epoch": 1.9645368347253136, + "grad_norm": 0.5362279851786779, + "learning_rate": 1.6473213480742156e-07, + "loss": 11.7533, + "step": 36077 + }, + { + "epoch": 1.9645912887218966, + "grad_norm": 0.6139211616223916, + "learning_rate": 1.642265724625891e-07, + "loss": 11.8627, + "step": 36078 + }, + { + "epoch": 1.9646457427184796, + "grad_norm": 0.5461223989996234, + "learning_rate": 1.6372178645747138e-07, + "loss": 11.736, + "step": 36079 + }, + { + "epoch": 1.9647001967150626, + "grad_norm": 0.5359178466362011, + "learning_rate": 1.6321777679599858e-07, + "loss": 11.8026, + "step": 36080 + }, + { + "epoch": 1.9647546507116456, + "grad_norm": 0.560500157611614, + "learning_rate": 1.627145434820898e-07, + "loss": 11.7971, + "step": 36081 + }, + { + "epoch": 1.9648091047082286, + "grad_norm": 0.516300969854972, + "learning_rate": 1.6221208651964192e-07, + "loss": 11.6479, + "step": 36082 + }, + { + "epoch": 1.9648635587048116, + "grad_norm": 0.5579620078889198, + "learning_rate": 1.6171040591258512e-07, + "loss": 11.8091, + "step": 36083 + }, + { + "epoch": 1.9649180127013945, + "grad_norm": 0.6173431760203307, + "learning_rate": 1.612095016648163e-07, + "loss": 11.7821, + "step": 36084 + }, + { + "epoch": 1.9649724666979778, + "grad_norm": 1.0244614891651473, + "learning_rate": 1.607093737802101e-07, + "loss": 11.8587, + "step": 36085 + }, + { + "epoch": 1.9650269206945608, + "grad_norm": 0.537696805089532, + "learning_rate": 1.6021002226268567e-07, + "loss": 11.7809, + "step": 36086 + }, + { + "epoch": 1.9650813746911437, + "grad_norm": 0.5401388137255373, + "learning_rate": 1.5971144711610654e-07, + "loss": 11.7671, + "step": 36087 + }, + { + "epoch": 1.9651358286877267, + "grad_norm": 0.49885934768199663, + "learning_rate": 1.5921364834435847e-07, + "loss": 11.7048, + "step": 36088 + }, + { + "epoch": 1.9651902826843097, + "grad_norm": 0.5756231592654099, + "learning_rate": 1.587166259513051e-07, + "loss": 11.8952, + "step": 36089 + }, + { + "epoch": 1.9652447366808927, + "grad_norm": 0.5280664174641819, + "learning_rate": 1.5822037994080997e-07, + "loss": 11.6791, + "step": 36090 + }, + { + "epoch": 1.965299190677476, + "grad_norm": 0.5578361953831942, + "learning_rate": 1.5772491031673665e-07, + "loss": 11.7343, + "step": 36091 + }, + { + "epoch": 1.965353644674059, + "grad_norm": 0.5191451354626264, + "learning_rate": 1.572302170829376e-07, + "loss": 11.7964, + "step": 36092 + }, + { + "epoch": 1.965408098670642, + "grad_norm": 0.5843496016765073, + "learning_rate": 1.5673630024326536e-07, + "loss": 11.7899, + "step": 36093 + }, + { + "epoch": 1.965462552667225, + "grad_norm": 0.5260539209834371, + "learning_rate": 1.5624315980155014e-07, + "loss": 11.6003, + "step": 36094 + }, + { + "epoch": 1.965517006663808, + "grad_norm": 0.5505038763418566, + "learning_rate": 1.5575079576164443e-07, + "loss": 11.6972, + "step": 36095 + }, + { + "epoch": 1.965571460660391, + "grad_norm": 0.5464706381830436, + "learning_rate": 1.552592081273563e-07, + "loss": 11.871, + "step": 36096 + }, + { + "epoch": 1.965625914656974, + "grad_norm": 0.6060484170851027, + "learning_rate": 1.5476839690250488e-07, + "loss": 11.9239, + "step": 36097 + }, + { + "epoch": 1.9656803686535569, + "grad_norm": 0.5348211016419099, + "learning_rate": 1.542783620909316e-07, + "loss": 11.7832, + "step": 36098 + }, + { + "epoch": 1.9657348226501399, + "grad_norm": 0.527604278349885, + "learning_rate": 1.537891036964334e-07, + "loss": 11.8067, + "step": 36099 + }, + { + "epoch": 1.9657892766467229, + "grad_norm": 0.545937571217918, + "learning_rate": 1.5330062172281835e-07, + "loss": 11.8276, + "step": 36100 + }, + { + "epoch": 1.9658437306433059, + "grad_norm": 0.6159916248778422, + "learning_rate": 1.5281291617387227e-07, + "loss": 11.9024, + "step": 36101 + }, + { + "epoch": 1.9658981846398889, + "grad_norm": 0.5393725227542722, + "learning_rate": 1.5232598705340327e-07, + "loss": 11.8396, + "step": 36102 + }, + { + "epoch": 1.9659526386364718, + "grad_norm": 0.5850637917033171, + "learning_rate": 1.5183983436519722e-07, + "loss": 11.6428, + "step": 36103 + }, + { + "epoch": 1.9660070926330548, + "grad_norm": 0.6049639957187175, + "learning_rate": 1.5135445811301774e-07, + "loss": 11.8286, + "step": 36104 + }, + { + "epoch": 1.9660615466296378, + "grad_norm": 0.5391677320089677, + "learning_rate": 1.508698583006507e-07, + "loss": 11.6355, + "step": 36105 + }, + { + "epoch": 1.9661160006262208, + "grad_norm": 0.5963141921636924, + "learning_rate": 1.5038603493187087e-07, + "loss": 11.8, + "step": 36106 + }, + { + "epoch": 1.9661704546228038, + "grad_norm": 0.5607816036452667, + "learning_rate": 1.4990298801043079e-07, + "loss": 11.78, + "step": 36107 + }, + { + "epoch": 1.966224908619387, + "grad_norm": 0.5467884372063281, + "learning_rate": 1.4942071754009413e-07, + "loss": 11.8496, + "step": 36108 + }, + { + "epoch": 1.96627936261597, + "grad_norm": 0.5237521257677624, + "learning_rate": 1.4893922352460233e-07, + "loss": 11.8206, + "step": 36109 + }, + { + "epoch": 1.966333816612553, + "grad_norm": 0.5031099102344975, + "learning_rate": 1.4845850596770793e-07, + "loss": 11.7422, + "step": 36110 + }, + { + "epoch": 1.966388270609136, + "grad_norm": 0.5607691559665281, + "learning_rate": 1.4797856487314132e-07, + "loss": 11.8437, + "step": 36111 + }, + { + "epoch": 1.966442724605719, + "grad_norm": 0.6055633202044425, + "learning_rate": 1.4749940024464393e-07, + "loss": 11.9146, + "step": 36112 + }, + { + "epoch": 1.966497178602302, + "grad_norm": 0.5361187480751732, + "learning_rate": 1.47021012085935e-07, + "loss": 11.7409, + "step": 36113 + }, + { + "epoch": 1.9665516325988852, + "grad_norm": 0.653688839131671, + "learning_rate": 1.465434004007338e-07, + "loss": 11.9156, + "step": 36114 + }, + { + "epoch": 1.9666060865954682, + "grad_norm": 0.5149642296727405, + "learning_rate": 1.4606656519275952e-07, + "loss": 11.789, + "step": 36115 + }, + { + "epoch": 1.9666605405920512, + "grad_norm": 0.5676475229880306, + "learning_rate": 1.4559050646570925e-07, + "loss": 11.8223, + "step": 36116 + }, + { + "epoch": 1.9667149945886342, + "grad_norm": 0.5252849505699192, + "learning_rate": 1.4511522422330227e-07, + "loss": 11.7996, + "step": 36117 + }, + { + "epoch": 1.9667694485852172, + "grad_norm": 0.5877817757785959, + "learning_rate": 1.4464071846921333e-07, + "loss": 11.7568, + "step": 36118 + }, + { + "epoch": 1.9668239025818002, + "grad_norm": 0.5882393227807133, + "learning_rate": 1.4416698920716177e-07, + "loss": 11.8876, + "step": 36119 + }, + { + "epoch": 1.9668783565783832, + "grad_norm": 0.5704054098512796, + "learning_rate": 1.4369403644080014e-07, + "loss": 11.7115, + "step": 36120 + }, + { + "epoch": 1.9669328105749662, + "grad_norm": 0.5289613004063723, + "learning_rate": 1.4322186017382554e-07, + "loss": 11.9047, + "step": 36121 + }, + { + "epoch": 1.9669872645715492, + "grad_norm": 0.5453688217064268, + "learning_rate": 1.427504604098906e-07, + "loss": 11.7507, + "step": 36122 + }, + { + "epoch": 1.9670417185681321, + "grad_norm": 0.5198981806085811, + "learning_rate": 1.4227983715269232e-07, + "loss": 11.6551, + "step": 36123 + }, + { + "epoch": 1.9670961725647151, + "grad_norm": 0.5253699755300153, + "learning_rate": 1.4180999040586117e-07, + "loss": 11.7519, + "step": 36124 + }, + { + "epoch": 1.9671506265612981, + "grad_norm": 0.5393785358843216, + "learning_rate": 1.413409201730609e-07, + "loss": 11.859, + "step": 36125 + }, + { + "epoch": 1.9672050805578811, + "grad_norm": 0.5558675984047506, + "learning_rate": 1.408726264579441e-07, + "loss": 11.7202, + "step": 36126 + }, + { + "epoch": 1.9672595345544641, + "grad_norm": 0.5586216581087697, + "learning_rate": 1.404051092641523e-07, + "loss": 11.89, + "step": 36127 + }, + { + "epoch": 1.967313988551047, + "grad_norm": 0.537080041467235, + "learning_rate": 1.3993836859531596e-07, + "loss": 11.5992, + "step": 36128 + }, + { + "epoch": 1.96736844254763, + "grad_norm": 0.5485479140489351, + "learning_rate": 1.3947240445505437e-07, + "loss": 11.8157, + "step": 36129 + }, + { + "epoch": 1.967422896544213, + "grad_norm": 0.5432455194118807, + "learning_rate": 1.3900721684702023e-07, + "loss": 11.6892, + "step": 36130 + }, + { + "epoch": 1.967477350540796, + "grad_norm": 0.5711861910186576, + "learning_rate": 1.385428057747995e-07, + "loss": 11.8944, + "step": 36131 + }, + { + "epoch": 1.9675318045373793, + "grad_norm": 0.5397806512059342, + "learning_rate": 1.3807917124201154e-07, + "loss": 11.7456, + "step": 36132 + }, + { + "epoch": 1.9675862585339623, + "grad_norm": 0.5562135585852154, + "learning_rate": 1.3761631325227564e-07, + "loss": 11.8171, + "step": 36133 + }, + { + "epoch": 1.9676407125305453, + "grad_norm": 0.5412331934578432, + "learning_rate": 1.3715423180917786e-07, + "loss": 11.8402, + "step": 36134 + }, + { + "epoch": 1.9676951665271283, + "grad_norm": 0.4967746914567025, + "learning_rate": 1.366929269163264e-07, + "loss": 11.7487, + "step": 36135 + }, + { + "epoch": 1.9677496205237113, + "grad_norm": 0.5395412386790123, + "learning_rate": 1.362323985772851e-07, + "loss": 11.7525, + "step": 36136 + }, + { + "epoch": 1.9678040745202945, + "grad_norm": 0.6026132408976158, + "learning_rate": 1.3577264679566214e-07, + "loss": 11.8774, + "step": 36137 + }, + { + "epoch": 1.9678585285168775, + "grad_norm": 0.5580241116103313, + "learning_rate": 1.353136715749992e-07, + "loss": 11.8646, + "step": 36138 + }, + { + "epoch": 1.9679129825134605, + "grad_norm": 0.5345519822307682, + "learning_rate": 1.3485547291890442e-07, + "loss": 11.7214, + "step": 36139 + }, + { + "epoch": 1.9679674365100435, + "grad_norm": 0.5376399763605038, + "learning_rate": 1.3439805083090838e-07, + "loss": 11.6297, + "step": 36140 + }, + { + "epoch": 1.9680218905066265, + "grad_norm": 0.5098290078463686, + "learning_rate": 1.3394140531458598e-07, + "loss": 11.8686, + "step": 36141 + }, + { + "epoch": 1.9680763445032095, + "grad_norm": 0.49553484030839434, + "learning_rate": 1.3348553637347882e-07, + "loss": 11.746, + "step": 36142 + }, + { + "epoch": 1.9681307984997924, + "grad_norm": 0.5133708628717928, + "learning_rate": 1.3303044401112852e-07, + "loss": 11.6988, + "step": 36143 + }, + { + "epoch": 1.9681852524963754, + "grad_norm": 0.6003084040624407, + "learning_rate": 1.325761282310878e-07, + "loss": 11.9923, + "step": 36144 + }, + { + "epoch": 1.9682397064929584, + "grad_norm": 0.517819959884357, + "learning_rate": 1.3212258903688714e-07, + "loss": 11.729, + "step": 36145 + }, + { + "epoch": 1.9682941604895414, + "grad_norm": 0.5401976231040443, + "learning_rate": 1.3166982643204595e-07, + "loss": 11.6976, + "step": 36146 + }, + { + "epoch": 1.9683486144861244, + "grad_norm": 0.5557863424856615, + "learning_rate": 1.3121784042009478e-07, + "loss": 11.786, + "step": 36147 + }, + { + "epoch": 1.9684030684827074, + "grad_norm": 0.5617659525947073, + "learning_rate": 1.3076663100453079e-07, + "loss": 11.8253, + "step": 36148 + }, + { + "epoch": 1.9684575224792904, + "grad_norm": 0.5604156723002983, + "learning_rate": 1.303161981888734e-07, + "loss": 11.6716, + "step": 36149 + }, + { + "epoch": 1.9685119764758734, + "grad_norm": 0.5239089373389401, + "learning_rate": 1.2986654197664205e-07, + "loss": 11.8228, + "step": 36150 + }, + { + "epoch": 1.9685664304724564, + "grad_norm": 0.5449828924114282, + "learning_rate": 1.294176623713006e-07, + "loss": 11.6734, + "step": 36151 + }, + { + "epoch": 1.9686208844690394, + "grad_norm": 0.5389916291346643, + "learning_rate": 1.2896955937635734e-07, + "loss": 11.7565, + "step": 36152 + }, + { + "epoch": 1.9686753384656224, + "grad_norm": 0.5836005375121872, + "learning_rate": 1.2852223299529843e-07, + "loss": 11.9432, + "step": 36153 + }, + { + "epoch": 1.9687297924622054, + "grad_norm": 0.5841893821371371, + "learning_rate": 1.280756832315988e-07, + "loss": 11.8333, + "step": 36154 + }, + { + "epoch": 1.9687842464587886, + "grad_norm": 0.5275668060950642, + "learning_rate": 1.2762991008873348e-07, + "loss": 11.7275, + "step": 36155 + }, + { + "epoch": 1.9688387004553716, + "grad_norm": 0.5217971901470458, + "learning_rate": 1.2718491357016637e-07, + "loss": 11.8409, + "step": 36156 + }, + { + "epoch": 1.9688931544519546, + "grad_norm": 0.5498514013741188, + "learning_rate": 1.2674069367935026e-07, + "loss": 11.8383, + "step": 36157 + }, + { + "epoch": 1.9689476084485376, + "grad_norm": 0.5214066287859432, + "learning_rate": 1.2629725041976014e-07, + "loss": 11.7107, + "step": 36158 + }, + { + "epoch": 1.9690020624451205, + "grad_norm": 0.5597912773438624, + "learning_rate": 1.2585458379481552e-07, + "loss": 11.7649, + "step": 36159 + }, + { + "epoch": 1.9690565164417035, + "grad_norm": 0.5673417262170881, + "learning_rate": 1.2541269380799136e-07, + "loss": 11.7681, + "step": 36160 + }, + { + "epoch": 1.9691109704382868, + "grad_norm": 0.5287815407454016, + "learning_rate": 1.249715804626961e-07, + "loss": 11.7399, + "step": 36161 + }, + { + "epoch": 1.9691654244348697, + "grad_norm": 0.5069560083107458, + "learning_rate": 1.2453124376237134e-07, + "loss": 11.7218, + "step": 36162 + }, + { + "epoch": 1.9692198784314527, + "grad_norm": 0.518075331035977, + "learning_rate": 1.2409168371044776e-07, + "loss": 11.6591, + "step": 36163 + }, + { + "epoch": 1.9692743324280357, + "grad_norm": 0.5758893774532786, + "learning_rate": 1.2365290031032263e-07, + "loss": 11.7872, + "step": 36164 + }, + { + "epoch": 1.9693287864246187, + "grad_norm": 0.5899421541870227, + "learning_rate": 1.2321489356543758e-07, + "loss": 11.7815, + "step": 36165 + }, + { + "epoch": 1.9693832404212017, + "grad_norm": 0.5995609830507865, + "learning_rate": 1.2277766347917886e-07, + "loss": 11.9585, + "step": 36166 + }, + { + "epoch": 1.9694376944177847, + "grad_norm": 0.5431532295982318, + "learning_rate": 1.223412100549437e-07, + "loss": 11.7666, + "step": 36167 + }, + { + "epoch": 1.9694921484143677, + "grad_norm": 0.5715447701570818, + "learning_rate": 1.219055332961405e-07, + "loss": 11.8167, + "step": 36168 + }, + { + "epoch": 1.9695466024109507, + "grad_norm": 0.5283972325969607, + "learning_rate": 1.2147063320614438e-07, + "loss": 11.7003, + "step": 36169 + }, + { + "epoch": 1.9696010564075337, + "grad_norm": 0.6585150801629824, + "learning_rate": 1.2103650978834147e-07, + "loss": 11.8573, + "step": 36170 + }, + { + "epoch": 1.9696555104041167, + "grad_norm": 0.565393904228365, + "learning_rate": 1.2060316304610686e-07, + "loss": 11.7446, + "step": 36171 + }, + { + "epoch": 1.9697099644006997, + "grad_norm": 0.5742858779716876, + "learning_rate": 1.2017059298281564e-07, + "loss": 11.7469, + "step": 36172 + }, + { + "epoch": 1.9697644183972827, + "grad_norm": 0.5166168818491959, + "learning_rate": 1.1973879960183176e-07, + "loss": 11.7861, + "step": 36173 + }, + { + "epoch": 1.9698188723938657, + "grad_norm": 0.5302467696516732, + "learning_rate": 1.1930778290650814e-07, + "loss": 11.7151, + "step": 36174 + }, + { + "epoch": 1.9698733263904487, + "grad_norm": 0.5700285546687027, + "learning_rate": 1.188775429001865e-07, + "loss": 11.7277, + "step": 36175 + }, + { + "epoch": 1.9699277803870316, + "grad_norm": 0.5419987626982037, + "learning_rate": 1.1844807958623083e-07, + "loss": 11.7204, + "step": 36176 + }, + { + "epoch": 1.9699822343836146, + "grad_norm": 0.4873817336037878, + "learning_rate": 1.1801939296797182e-07, + "loss": 11.6233, + "step": 36177 + }, + { + "epoch": 1.9700366883801979, + "grad_norm": 0.49949415826014854, + "learning_rate": 1.1759148304875122e-07, + "loss": 11.6539, + "step": 36178 + }, + { + "epoch": 1.9700911423767808, + "grad_norm": 0.5398713427126732, + "learning_rate": 1.1716434983187751e-07, + "loss": 11.7904, + "step": 36179 + }, + { + "epoch": 1.9701455963733638, + "grad_norm": 0.5627717589213662, + "learning_rate": 1.1673799332069247e-07, + "loss": 11.7619, + "step": 36180 + }, + { + "epoch": 1.9702000503699468, + "grad_norm": 0.5212479404199305, + "learning_rate": 1.1631241351850453e-07, + "loss": 11.7151, + "step": 36181 + }, + { + "epoch": 1.9702545043665298, + "grad_norm": 0.5318582358039198, + "learning_rate": 1.1588761042862218e-07, + "loss": 11.7379, + "step": 36182 + }, + { + "epoch": 1.9703089583631128, + "grad_norm": 0.5851594414369534, + "learning_rate": 1.1546358405434277e-07, + "loss": 11.77, + "step": 36183 + }, + { + "epoch": 1.970363412359696, + "grad_norm": 0.5343863641122449, + "learning_rate": 1.1504033439896367e-07, + "loss": 11.7686, + "step": 36184 + }, + { + "epoch": 1.970417866356279, + "grad_norm": 0.5995527303681142, + "learning_rate": 1.1461786146579334e-07, + "loss": 11.7558, + "step": 36185 + }, + { + "epoch": 1.970472320352862, + "grad_norm": 0.5154038835467099, + "learning_rate": 1.1419616525809584e-07, + "loss": 11.6241, + "step": 36186 + }, + { + "epoch": 1.970526774349445, + "grad_norm": 0.5522059941507242, + "learning_rate": 1.1377524577916854e-07, + "loss": 11.7807, + "step": 36187 + }, + { + "epoch": 1.970581228346028, + "grad_norm": 0.577810220383659, + "learning_rate": 1.1335510303226437e-07, + "loss": 11.767, + "step": 36188 + }, + { + "epoch": 1.970635682342611, + "grad_norm": 0.5724667765123155, + "learning_rate": 1.1293573702068072e-07, + "loss": 11.6692, + "step": 36189 + }, + { + "epoch": 1.970690136339194, + "grad_norm": 0.6336511842378724, + "learning_rate": 1.1251714774764833e-07, + "loss": 11.7558, + "step": 36190 + }, + { + "epoch": 1.970744590335777, + "grad_norm": 0.5545459044883255, + "learning_rate": 1.1209933521643124e-07, + "loss": 11.7489, + "step": 36191 + }, + { + "epoch": 1.97079904433236, + "grad_norm": 0.5551963448600635, + "learning_rate": 1.1168229943028241e-07, + "loss": 11.7147, + "step": 36192 + }, + { + "epoch": 1.970853498328943, + "grad_norm": 0.4912865617046046, + "learning_rate": 1.112660403924437e-07, + "loss": 11.7612, + "step": 36193 + }, + { + "epoch": 1.970907952325526, + "grad_norm": 0.5470037838420644, + "learning_rate": 1.1085055810615696e-07, + "loss": 11.6291, + "step": 36194 + }, + { + "epoch": 1.970962406322109, + "grad_norm": 0.5738744663975652, + "learning_rate": 1.1043585257464184e-07, + "loss": 11.8226, + "step": 36195 + }, + { + "epoch": 1.971016860318692, + "grad_norm": 0.5450275334591849, + "learning_rate": 1.1002192380112908e-07, + "loss": 11.8505, + "step": 36196 + }, + { + "epoch": 1.971071314315275, + "grad_norm": 0.553970158224054, + "learning_rate": 1.0960877178883833e-07, + "loss": 11.7523, + "step": 36197 + }, + { + "epoch": 1.971125768311858, + "grad_norm": 0.5110423403723059, + "learning_rate": 1.0919639654097813e-07, + "loss": 11.7543, + "step": 36198 + }, + { + "epoch": 1.971180222308441, + "grad_norm": 0.5611192989680622, + "learning_rate": 1.0878479806076813e-07, + "loss": 11.8191, + "step": 36199 + }, + { + "epoch": 1.971234676305024, + "grad_norm": 0.5475503858666368, + "learning_rate": 1.0837397635139468e-07, + "loss": 11.7071, + "step": 36200 + }, + { + "epoch": 1.9712891303016071, + "grad_norm": 0.5519847935598601, + "learning_rate": 1.0796393141605521e-07, + "loss": 11.6902, + "step": 36201 + }, + { + "epoch": 1.9713435842981901, + "grad_norm": 0.5856589134744672, + "learning_rate": 1.0755466325793606e-07, + "loss": 11.6871, + "step": 36202 + }, + { + "epoch": 1.9713980382947731, + "grad_norm": 0.518004116995887, + "learning_rate": 1.0714617188022357e-07, + "loss": 11.6014, + "step": 36203 + }, + { + "epoch": 1.971452492291356, + "grad_norm": 0.6271548455564989, + "learning_rate": 1.067384572861041e-07, + "loss": 11.8362, + "step": 36204 + }, + { + "epoch": 1.971506946287939, + "grad_norm": 0.594355934820692, + "learning_rate": 1.0633151947873065e-07, + "loss": 11.708, + "step": 36205 + }, + { + "epoch": 1.971561400284522, + "grad_norm": 0.530694315035782, + "learning_rate": 1.0592535846127849e-07, + "loss": 11.7427, + "step": 36206 + }, + { + "epoch": 1.9716158542811053, + "grad_norm": 0.5497407668420948, + "learning_rate": 1.0551997423690063e-07, + "loss": 11.7952, + "step": 36207 + }, + { + "epoch": 1.9716703082776883, + "grad_norm": 0.5237390509891918, + "learning_rate": 1.0511536680876122e-07, + "loss": 11.7361, + "step": 36208 + }, + { + "epoch": 1.9717247622742713, + "grad_norm": 0.5521301418158289, + "learning_rate": 1.0471153617999108e-07, + "loss": 11.7663, + "step": 36209 + }, + { + "epoch": 1.9717792162708543, + "grad_norm": 0.6364183485173556, + "learning_rate": 1.0430848235373214e-07, + "loss": 11.9224, + "step": 36210 + }, + { + "epoch": 1.9718336702674373, + "grad_norm": 0.5466973097287856, + "learning_rate": 1.0390620533312634e-07, + "loss": 11.8913, + "step": 36211 + }, + { + "epoch": 1.9718881242640203, + "grad_norm": 0.5324499588525479, + "learning_rate": 1.035047051212934e-07, + "loss": 11.8186, + "step": 36212 + }, + { + "epoch": 1.9719425782606033, + "grad_norm": 0.5269022730122065, + "learning_rate": 1.0310398172136415e-07, + "loss": 11.7582, + "step": 36213 + }, + { + "epoch": 1.9719970322571863, + "grad_norm": 0.7373916070420619, + "learning_rate": 1.0270403513645832e-07, + "loss": 11.7461, + "step": 36214 + }, + { + "epoch": 1.9720514862537692, + "grad_norm": 0.519722595597763, + "learning_rate": 1.0230486536967344e-07, + "loss": 11.7973, + "step": 36215 + }, + { + "epoch": 1.9721059402503522, + "grad_norm": 0.5892860532731992, + "learning_rate": 1.0190647242411811e-07, + "loss": 11.6721, + "step": 36216 + }, + { + "epoch": 1.9721603942469352, + "grad_norm": 0.5406902008203212, + "learning_rate": 1.0150885630288987e-07, + "loss": 11.6886, + "step": 36217 + }, + { + "epoch": 1.9722148482435182, + "grad_norm": 0.557050987792401, + "learning_rate": 1.0111201700908624e-07, + "loss": 11.8016, + "step": 36218 + }, + { + "epoch": 1.9722693022401012, + "grad_norm": 0.5353839650173511, + "learning_rate": 1.0071595454578253e-07, + "loss": 11.8251, + "step": 36219 + }, + { + "epoch": 1.9723237562366842, + "grad_norm": 0.5437374207443233, + "learning_rate": 1.0032066891606517e-07, + "loss": 11.8201, + "step": 36220 + }, + { + "epoch": 1.9723782102332672, + "grad_norm": 0.5638026184592565, + "learning_rate": 9.992616012300949e-08, + "loss": 11.9145, + "step": 36221 + }, + { + "epoch": 1.9724326642298502, + "grad_norm": 0.5387061459151387, + "learning_rate": 9.953242816967968e-08, + "loss": 11.7386, + "step": 36222 + }, + { + "epoch": 1.9724871182264332, + "grad_norm": 0.5329417947855982, + "learning_rate": 9.913947305913995e-08, + "loss": 11.7285, + "step": 36223 + }, + { + "epoch": 1.9725415722230162, + "grad_norm": 0.5437957574204663, + "learning_rate": 9.874729479444345e-08, + "loss": 11.7789, + "step": 36224 + }, + { + "epoch": 1.9725960262195994, + "grad_norm": 0.5532036869654136, + "learning_rate": 9.835589337864326e-08, + "loss": 11.8299, + "step": 36225 + }, + { + "epoch": 1.9726504802161824, + "grad_norm": 0.5942121282855267, + "learning_rate": 9.796526881478141e-08, + "loss": 11.8215, + "step": 36226 + }, + { + "epoch": 1.9727049342127654, + "grad_norm": 0.5400951194990719, + "learning_rate": 9.75754211058999e-08, + "loss": 11.8526, + "step": 36227 + }, + { + "epoch": 1.9727593882093484, + "grad_norm": 0.5148944871312742, + "learning_rate": 9.718635025501855e-08, + "loss": 11.8274, + "step": 36228 + }, + { + "epoch": 1.9728138422059314, + "grad_norm": 0.5032515065594267, + "learning_rate": 9.679805626517934e-08, + "loss": 11.6671, + "step": 36229 + }, + { + "epoch": 1.9728682962025144, + "grad_norm": 0.5470610301875037, + "learning_rate": 9.641053913937992e-08, + "loss": 11.6036, + "step": 36230 + }, + { + "epoch": 1.9729227501990976, + "grad_norm": 0.53765309824049, + "learning_rate": 9.602379888065116e-08, + "loss": 11.8478, + "step": 36231 + }, + { + "epoch": 1.9729772041956806, + "grad_norm": 0.5587061458410771, + "learning_rate": 9.563783549200178e-08, + "loss": 11.7793, + "step": 36232 + }, + { + "epoch": 1.9730316581922636, + "grad_norm": 0.5366956153541028, + "learning_rate": 9.525264897641828e-08, + "loss": 11.6919, + "step": 36233 + }, + { + "epoch": 1.9730861121888466, + "grad_norm": 0.511307339449537, + "learning_rate": 9.486823933689826e-08, + "loss": 11.7315, + "step": 36234 + }, + { + "epoch": 1.9731405661854295, + "grad_norm": 0.5064735221613555, + "learning_rate": 9.448460657645042e-08, + "loss": 11.7467, + "step": 36235 + }, + { + "epoch": 1.9731950201820125, + "grad_norm": 0.5419981151780828, + "learning_rate": 9.410175069803906e-08, + "loss": 11.7226, + "step": 36236 + }, + { + "epoch": 1.9732494741785955, + "grad_norm": 0.6664414139562269, + "learning_rate": 9.371967170463958e-08, + "loss": 11.7607, + "step": 36237 + }, + { + "epoch": 1.9733039281751785, + "grad_norm": 0.5885293274527131, + "learning_rate": 9.333836959923847e-08, + "loss": 11.8146, + "step": 36238 + }, + { + "epoch": 1.9733583821717615, + "grad_norm": 0.5063013234728269, + "learning_rate": 9.295784438478894e-08, + "loss": 11.7359, + "step": 36239 + }, + { + "epoch": 1.9734128361683445, + "grad_norm": 0.5694645873013854, + "learning_rate": 9.257809606426638e-08, + "loss": 11.8281, + "step": 36240 + }, + { + "epoch": 1.9734672901649275, + "grad_norm": 0.5171677678543232, + "learning_rate": 9.219912464060176e-08, + "loss": 11.6704, + "step": 36241 + }, + { + "epoch": 1.9735217441615105, + "grad_norm": 0.559913902747464, + "learning_rate": 9.182093011674831e-08, + "loss": 11.6763, + "step": 36242 + }, + { + "epoch": 1.9735761981580935, + "grad_norm": 0.5667602243596593, + "learning_rate": 9.144351249564809e-08, + "loss": 11.7827, + "step": 36243 + }, + { + "epoch": 1.9736306521546765, + "grad_norm": 0.556631289938428, + "learning_rate": 9.106687178024321e-08, + "loss": 11.7439, + "step": 36244 + }, + { + "epoch": 1.9736851061512595, + "grad_norm": 0.5730097167908303, + "learning_rate": 9.069100797345353e-08, + "loss": 11.6917, + "step": 36245 + }, + { + "epoch": 1.9737395601478425, + "grad_norm": 0.542573366011134, + "learning_rate": 9.031592107821008e-08, + "loss": 11.8264, + "step": 36246 + }, + { + "epoch": 1.9737940141444255, + "grad_norm": 0.5295022511165232, + "learning_rate": 8.99416110974216e-08, + "loss": 11.5568, + "step": 36247 + }, + { + "epoch": 1.9738484681410087, + "grad_norm": 0.6218076862655832, + "learning_rate": 8.956807803399691e-08, + "loss": 11.8278, + "step": 36248 + }, + { + "epoch": 1.9739029221375917, + "grad_norm": 0.5000786730024737, + "learning_rate": 8.919532189085589e-08, + "loss": 11.7464, + "step": 36249 + }, + { + "epoch": 1.9739573761341747, + "grad_norm": 0.5658771570438295, + "learning_rate": 8.88233426708851e-08, + "loss": 11.8297, + "step": 36250 + }, + { + "epoch": 1.9740118301307576, + "grad_norm": 0.5070732070418988, + "learning_rate": 8.845214037697113e-08, + "loss": 11.7376, + "step": 36251 + }, + { + "epoch": 1.9740662841273406, + "grad_norm": 0.5577908444636813, + "learning_rate": 8.808171501201167e-08, + "loss": 11.85, + "step": 36252 + }, + { + "epoch": 1.9741207381239236, + "grad_norm": 0.48970841671500775, + "learning_rate": 8.77120665788933e-08, + "loss": 11.7533, + "step": 36253 + }, + { + "epoch": 1.9741751921205068, + "grad_norm": 0.5301730065638747, + "learning_rate": 8.734319508046929e-08, + "loss": 11.6283, + "step": 36254 + }, + { + "epoch": 1.9742296461170898, + "grad_norm": 0.5573255084230732, + "learning_rate": 8.697510051962621e-08, + "loss": 11.7985, + "step": 36255 + }, + { + "epoch": 1.9742841001136728, + "grad_norm": 0.5134263743120279, + "learning_rate": 8.660778289921734e-08, + "loss": 11.7688, + "step": 36256 + }, + { + "epoch": 1.9743385541102558, + "grad_norm": 0.5559404718408812, + "learning_rate": 8.624124222210705e-08, + "loss": 11.7263, + "step": 36257 + }, + { + "epoch": 1.9743930081068388, + "grad_norm": 0.5465372986633922, + "learning_rate": 8.587547849112642e-08, + "loss": 11.7493, + "step": 36258 + }, + { + "epoch": 1.9744474621034218, + "grad_norm": 0.6107175971590492, + "learning_rate": 8.551049170915094e-08, + "loss": 11.8658, + "step": 36259 + }, + { + "epoch": 1.9745019161000048, + "grad_norm": 0.5091342860021256, + "learning_rate": 8.514628187898944e-08, + "loss": 11.6125, + "step": 36260 + }, + { + "epoch": 1.9745563700965878, + "grad_norm": 0.48977344812386725, + "learning_rate": 8.47828490034952e-08, + "loss": 11.7845, + "step": 36261 + }, + { + "epoch": 1.9746108240931708, + "grad_norm": 0.5106483975923595, + "learning_rate": 8.442019308547711e-08, + "loss": 11.7983, + "step": 36262 + }, + { + "epoch": 1.9746652780897538, + "grad_norm": 0.5654591778208484, + "learning_rate": 8.405831412776621e-08, + "loss": 11.8042, + "step": 36263 + }, + { + "epoch": 1.9747197320863368, + "grad_norm": 0.5390632403146243, + "learning_rate": 8.369721213318248e-08, + "loss": 11.6458, + "step": 36264 + }, + { + "epoch": 1.9747741860829198, + "grad_norm": 0.6122913704702639, + "learning_rate": 8.333688710451259e-08, + "loss": 11.7465, + "step": 36265 + }, + { + "epoch": 1.9748286400795028, + "grad_norm": 0.5812981482406246, + "learning_rate": 8.29773390445765e-08, + "loss": 11.8522, + "step": 36266 + }, + { + "epoch": 1.9748830940760858, + "grad_norm": 0.5626605446982994, + "learning_rate": 8.261856795617195e-08, + "loss": 11.5391, + "step": 36267 + }, + { + "epoch": 1.9749375480726687, + "grad_norm": 0.5645370918586674, + "learning_rate": 8.226057384206343e-08, + "loss": 11.7902, + "step": 36268 + }, + { + "epoch": 1.9749920020692517, + "grad_norm": 0.5304535637631037, + "learning_rate": 8.190335670507088e-08, + "loss": 11.7177, + "step": 36269 + }, + { + "epoch": 1.9750464560658347, + "grad_norm": 0.5090654073949913, + "learning_rate": 8.154691654794766e-08, + "loss": 11.7826, + "step": 36270 + }, + { + "epoch": 1.975100910062418, + "grad_norm": 0.5589764265176818, + "learning_rate": 8.119125337346934e-08, + "loss": 11.8097, + "step": 36271 + }, + { + "epoch": 1.975155364059001, + "grad_norm": 0.4915100717076199, + "learning_rate": 8.083636718441145e-08, + "loss": 11.8505, + "step": 36272 + }, + { + "epoch": 1.975209818055584, + "grad_norm": 0.555226556038492, + "learning_rate": 8.048225798351628e-08, + "loss": 11.7101, + "step": 36273 + }, + { + "epoch": 1.975264272052167, + "grad_norm": 0.5304210386952574, + "learning_rate": 8.012892577354824e-08, + "loss": 11.8985, + "step": 36274 + }, + { + "epoch": 1.97531872604875, + "grad_norm": 0.5224139660861945, + "learning_rate": 7.977637055726073e-08, + "loss": 11.6573, + "step": 36275 + }, + { + "epoch": 1.975373180045333, + "grad_norm": 0.5965846458124298, + "learning_rate": 7.942459233738486e-08, + "loss": 11.838, + "step": 36276 + }, + { + "epoch": 1.9754276340419161, + "grad_norm": 0.5522995528529425, + "learning_rate": 7.90735911166629e-08, + "loss": 11.7547, + "step": 36277 + }, + { + "epoch": 1.9754820880384991, + "grad_norm": 0.574676307176989, + "learning_rate": 7.87233668978149e-08, + "loss": 11.627, + "step": 36278 + }, + { + "epoch": 1.975536542035082, + "grad_norm": 0.5173689839251362, + "learning_rate": 7.83739196835831e-08, + "loss": 11.8449, + "step": 36279 + }, + { + "epoch": 1.975590996031665, + "grad_norm": 0.5135224400747184, + "learning_rate": 7.802524947666534e-08, + "loss": 11.6768, + "step": 36280 + }, + { + "epoch": 1.975645450028248, + "grad_norm": 0.5251815678512176, + "learning_rate": 7.767735627978168e-08, + "loss": 11.7705, + "step": 36281 + }, + { + "epoch": 1.975699904024831, + "grad_norm": 0.586207223170747, + "learning_rate": 7.733024009562994e-08, + "loss": 11.8765, + "step": 36282 + }, + { + "epoch": 1.975754358021414, + "grad_norm": 0.5574241248148462, + "learning_rate": 7.698390092691909e-08, + "loss": 11.8348, + "step": 36283 + }, + { + "epoch": 1.975808812017997, + "grad_norm": 0.7273289480528171, + "learning_rate": 7.663833877634696e-08, + "loss": 11.933, + "step": 36284 + }, + { + "epoch": 1.97586326601458, + "grad_norm": 0.5668389575448375, + "learning_rate": 7.629355364657809e-08, + "loss": 11.8707, + "step": 36285 + }, + { + "epoch": 1.975917720011163, + "grad_norm": 0.50882618487503, + "learning_rate": 7.59495455403103e-08, + "loss": 11.7991, + "step": 36286 + }, + { + "epoch": 1.975972174007746, + "grad_norm": 0.5920375125195722, + "learning_rate": 7.560631446023036e-08, + "loss": 11.8265, + "step": 36287 + }, + { + "epoch": 1.976026628004329, + "grad_norm": 0.5267960479256812, + "learning_rate": 7.52638604089806e-08, + "loss": 11.7645, + "step": 36288 + }, + { + "epoch": 1.976081082000912, + "grad_norm": 0.5596102602300225, + "learning_rate": 7.492218338923662e-08, + "loss": 11.7543, + "step": 36289 + }, + { + "epoch": 1.976135535997495, + "grad_norm": 0.5709911412573924, + "learning_rate": 7.458128340366299e-08, + "loss": 11.7865, + "step": 36290 + }, + { + "epoch": 1.976189989994078, + "grad_norm": 0.5582750815536331, + "learning_rate": 7.424116045489094e-08, + "loss": 11.8644, + "step": 36291 + }, + { + "epoch": 1.976244443990661, + "grad_norm": 0.5237505920322937, + "learning_rate": 7.390181454558498e-08, + "loss": 11.7387, + "step": 36292 + }, + { + "epoch": 1.976298897987244, + "grad_norm": 0.5097076329400535, + "learning_rate": 7.356324567837636e-08, + "loss": 11.7185, + "step": 36293 + }, + { + "epoch": 1.976353351983827, + "grad_norm": 0.5429137780618095, + "learning_rate": 7.32254538558963e-08, + "loss": 11.8103, + "step": 36294 + }, + { + "epoch": 1.9764078059804102, + "grad_norm": 0.5841830201353465, + "learning_rate": 7.288843908076492e-08, + "loss": 11.7246, + "step": 36295 + }, + { + "epoch": 1.9764622599769932, + "grad_norm": 0.5782744501294761, + "learning_rate": 7.255220135562457e-08, + "loss": 11.8672, + "step": 36296 + }, + { + "epoch": 1.9765167139735762, + "grad_norm": 0.5426315506050473, + "learning_rate": 7.221674068306206e-08, + "loss": 11.7543, + "step": 36297 + }, + { + "epoch": 1.9765711679701592, + "grad_norm": 0.5466101557634807, + "learning_rate": 7.188205706570861e-08, + "loss": 11.5791, + "step": 36298 + }, + { + "epoch": 1.9766256219667422, + "grad_norm": 0.5375236649410577, + "learning_rate": 7.154815050616215e-08, + "loss": 11.7321, + "step": 36299 + }, + { + "epoch": 1.9766800759633252, + "grad_norm": 0.4964365523418336, + "learning_rate": 7.121502100700949e-08, + "loss": 11.6377, + "step": 36300 + }, + { + "epoch": 1.9767345299599084, + "grad_norm": 0.6305503821042118, + "learning_rate": 7.088266857084858e-08, + "loss": 11.8237, + "step": 36301 + }, + { + "epoch": 1.9767889839564914, + "grad_norm": 0.5358256091857745, + "learning_rate": 7.055109320025511e-08, + "loss": 11.7467, + "step": 36302 + }, + { + "epoch": 1.9768434379530744, + "grad_norm": 0.5497933257512864, + "learning_rate": 7.0220294897827e-08, + "loss": 11.8914, + "step": 36303 + }, + { + "epoch": 1.9768978919496574, + "grad_norm": 0.5984924652247515, + "learning_rate": 6.989027366611778e-08, + "loss": 11.7903, + "step": 36304 + }, + { + "epoch": 1.9769523459462404, + "grad_norm": 0.550647246077146, + "learning_rate": 6.956102950770315e-08, + "loss": 11.8169, + "step": 36305 + }, + { + "epoch": 1.9770067999428234, + "grad_norm": 0.5377030961924383, + "learning_rate": 6.923256242513664e-08, + "loss": 11.7392, + "step": 36306 + }, + { + "epoch": 1.9770612539394063, + "grad_norm": 0.5272190434435017, + "learning_rate": 6.890487242097177e-08, + "loss": 11.7926, + "step": 36307 + }, + { + "epoch": 1.9771157079359893, + "grad_norm": 0.5148450472477232, + "learning_rate": 6.857795949776203e-08, + "loss": 11.7307, + "step": 36308 + }, + { + "epoch": 1.9771701619325723, + "grad_norm": 0.5522228817044328, + "learning_rate": 6.825182365806093e-08, + "loss": 11.8029, + "step": 36309 + }, + { + "epoch": 1.9772246159291553, + "grad_norm": 0.5048907866642403, + "learning_rate": 6.792646490437759e-08, + "loss": 11.5917, + "step": 36310 + }, + { + "epoch": 1.9772790699257383, + "grad_norm": 0.5817331170603239, + "learning_rate": 6.760188323926552e-08, + "loss": 11.7732, + "step": 36311 + }, + { + "epoch": 1.9773335239223213, + "grad_norm": 0.5569403418667223, + "learning_rate": 6.727807866523384e-08, + "loss": 11.8178, + "step": 36312 + }, + { + "epoch": 1.9773879779189043, + "grad_norm": 0.5254249993695699, + "learning_rate": 6.695505118481382e-08, + "loss": 11.7582, + "step": 36313 + }, + { + "epoch": 1.9774424319154873, + "grad_norm": 0.5173435106520328, + "learning_rate": 6.66328008005035e-08, + "loss": 11.7492, + "step": 36314 + }, + { + "epoch": 1.9774968859120703, + "grad_norm": 0.531210626335476, + "learning_rate": 6.631132751482305e-08, + "loss": 11.8797, + "step": 36315 + }, + { + "epoch": 1.9775513399086533, + "grad_norm": 0.5094864697529042, + "learning_rate": 6.59906313302594e-08, + "loss": 11.7976, + "step": 36316 + }, + { + "epoch": 1.9776057939052363, + "grad_norm": 0.5615927347246377, + "learning_rate": 6.567071224931054e-08, + "loss": 11.7936, + "step": 36317 + }, + { + "epoch": 1.9776602479018195, + "grad_norm": 0.5895509348544351, + "learning_rate": 6.535157027446337e-08, + "loss": 11.6687, + "step": 36318 + }, + { + "epoch": 1.9777147018984025, + "grad_norm": 0.5208164665987108, + "learning_rate": 6.503320540820479e-08, + "loss": 11.8263, + "step": 36319 + }, + { + "epoch": 1.9777691558949855, + "grad_norm": 0.5077054033679513, + "learning_rate": 6.47156176530106e-08, + "loss": 11.704, + "step": 36320 + }, + { + "epoch": 1.9778236098915685, + "grad_norm": 0.5650257572135685, + "learning_rate": 6.439880701134548e-08, + "loss": 11.8266, + "step": 36321 + }, + { + "epoch": 1.9778780638881515, + "grad_norm": 0.5194485004209728, + "learning_rate": 6.408277348567415e-08, + "loss": 11.6489, + "step": 36322 + }, + { + "epoch": 1.9779325178847345, + "grad_norm": 0.5101688774812396, + "learning_rate": 6.37675170784502e-08, + "loss": 11.753, + "step": 36323 + }, + { + "epoch": 1.9779869718813177, + "grad_norm": 0.49336019726026487, + "learning_rate": 6.34530377921272e-08, + "loss": 11.6734, + "step": 36324 + }, + { + "epoch": 1.9780414258779007, + "grad_norm": 0.5976201422843452, + "learning_rate": 6.313933562915874e-08, + "loss": 11.8844, + "step": 36325 + }, + { + "epoch": 1.9780958798744837, + "grad_norm": 0.5111855462812013, + "learning_rate": 6.282641059197625e-08, + "loss": 11.7917, + "step": 36326 + }, + { + "epoch": 1.9781503338710666, + "grad_norm": 0.5422945022187791, + "learning_rate": 6.251426268301108e-08, + "loss": 11.7145, + "step": 36327 + }, + { + "epoch": 1.9782047878676496, + "grad_norm": 0.510816362739602, + "learning_rate": 6.220289190470575e-08, + "loss": 11.8235, + "step": 36328 + }, + { + "epoch": 1.9782592418642326, + "grad_norm": 0.5620730410934067, + "learning_rate": 6.18922982594583e-08, + "loss": 11.8108, + "step": 36329 + }, + { + "epoch": 1.9783136958608156, + "grad_norm": 0.5609138016276113, + "learning_rate": 6.158248174970017e-08, + "loss": 11.7199, + "step": 36330 + }, + { + "epoch": 1.9783681498573986, + "grad_norm": 0.5202221186895, + "learning_rate": 6.12734423778405e-08, + "loss": 11.7697, + "step": 36331 + }, + { + "epoch": 1.9784226038539816, + "grad_norm": 0.572701285802484, + "learning_rate": 6.096518014627738e-08, + "loss": 11.9467, + "step": 36332 + }, + { + "epoch": 1.9784770578505646, + "grad_norm": 0.5556647868062884, + "learning_rate": 6.065769505740892e-08, + "loss": 11.754, + "step": 36333 + }, + { + "epoch": 1.9785315118471476, + "grad_norm": 0.5417592795573949, + "learning_rate": 6.035098711362209e-08, + "loss": 11.7906, + "step": 36334 + }, + { + "epoch": 1.9785859658437306, + "grad_norm": 0.5681967901406085, + "learning_rate": 6.004505631730383e-08, + "loss": 11.6744, + "step": 36335 + }, + { + "epoch": 1.9786404198403136, + "grad_norm": 0.5218432259138275, + "learning_rate": 5.973990267084118e-08, + "loss": 11.8088, + "step": 36336 + }, + { + "epoch": 1.9786948738368966, + "grad_norm": 0.509235367162828, + "learning_rate": 5.9435526176598863e-08, + "loss": 11.7769, + "step": 36337 + }, + { + "epoch": 1.9787493278334796, + "grad_norm": 0.5982462132848184, + "learning_rate": 5.913192683694169e-08, + "loss": 11.9151, + "step": 36338 + }, + { + "epoch": 1.9788037818300626, + "grad_norm": 0.6370102271443775, + "learning_rate": 5.882910465424551e-08, + "loss": 11.8847, + "step": 36339 + }, + { + "epoch": 1.9788582358266456, + "grad_norm": 0.5820695328139325, + "learning_rate": 5.852705963084182e-08, + "loss": 11.7506, + "step": 36340 + }, + { + "epoch": 1.9789126898232288, + "grad_norm": 0.5480070882482397, + "learning_rate": 5.8225791769106476e-08, + "loss": 11.7188, + "step": 36341 + }, + { + "epoch": 1.9789671438198118, + "grad_norm": 0.5234889592020289, + "learning_rate": 5.7925301071348746e-08, + "loss": 11.8682, + "step": 36342 + }, + { + "epoch": 1.9790215978163948, + "grad_norm": 0.5489082558769252, + "learning_rate": 5.7625587539944514e-08, + "loss": 11.7167, + "step": 36343 + }, + { + "epoch": 1.9790760518129777, + "grad_norm": 0.5211445365558279, + "learning_rate": 5.732665117719194e-08, + "loss": 11.866, + "step": 36344 + }, + { + "epoch": 1.9791305058095607, + "grad_norm": 0.5313983787771561, + "learning_rate": 5.702849198542248e-08, + "loss": 11.6887, + "step": 36345 + }, + { + "epoch": 1.9791849598061437, + "grad_norm": 0.5935234732911259, + "learning_rate": 5.673110996696762e-08, + "loss": 11.9406, + "step": 36346 + }, + { + "epoch": 1.979239413802727, + "grad_norm": 0.5406266078476016, + "learning_rate": 5.6434505124136616e-08, + "loss": 11.7714, + "step": 36347 + }, + { + "epoch": 1.97929386779931, + "grad_norm": 0.577116098170172, + "learning_rate": 5.613867745922763e-08, + "loss": 11.7744, + "step": 36348 + }, + { + "epoch": 1.979348321795893, + "grad_norm": 0.5639332310482835, + "learning_rate": 5.584362697453882e-08, + "loss": 11.8665, + "step": 36349 + }, + { + "epoch": 1.979402775792476, + "grad_norm": 0.586943512552007, + "learning_rate": 5.554935367237946e-08, + "loss": 11.7265, + "step": 36350 + }, + { + "epoch": 1.979457229789059, + "grad_norm": 0.5458017642790904, + "learning_rate": 5.525585755502549e-08, + "loss": 11.8465, + "step": 36351 + }, + { + "epoch": 1.979511683785642, + "grad_norm": 0.6195057890363825, + "learning_rate": 5.496313862476399e-08, + "loss": 11.7799, + "step": 36352 + }, + { + "epoch": 1.979566137782225, + "grad_norm": 0.5576611249620743, + "learning_rate": 5.46711968838598e-08, + "loss": 11.7103, + "step": 36353 + }, + { + "epoch": 1.979620591778808, + "grad_norm": 0.5828726139288403, + "learning_rate": 5.438003233459998e-08, + "loss": 11.8346, + "step": 36354 + }, + { + "epoch": 1.9796750457753909, + "grad_norm": 0.5360299264781121, + "learning_rate": 5.40896449792494e-08, + "loss": 11.8725, + "step": 36355 + }, + { + "epoch": 1.9797294997719739, + "grad_norm": 0.5294166679762473, + "learning_rate": 5.3800034820050696e-08, + "loss": 11.7766, + "step": 36356 + }, + { + "epoch": 1.9797839537685569, + "grad_norm": 0.5134185955352489, + "learning_rate": 5.3511201859268725e-08, + "loss": 11.6572, + "step": 36357 + }, + { + "epoch": 1.9798384077651399, + "grad_norm": 0.6311800638950488, + "learning_rate": 5.3223146099135035e-08, + "loss": 11.9058, + "step": 36358 + }, + { + "epoch": 1.9798928617617229, + "grad_norm": 0.5834818119326419, + "learning_rate": 5.2935867541914487e-08, + "loss": 11.7748, + "step": 36359 + }, + { + "epoch": 1.9799473157583058, + "grad_norm": 0.5675728564670772, + "learning_rate": 5.264936618981642e-08, + "loss": 11.692, + "step": 36360 + }, + { + "epoch": 1.9800017697548888, + "grad_norm": 0.5399396559574882, + "learning_rate": 5.236364204507238e-08, + "loss": 11.7302, + "step": 36361 + }, + { + "epoch": 1.9800562237514718, + "grad_norm": 0.5459817402648429, + "learning_rate": 5.207869510992502e-08, + "loss": 11.7319, + "step": 36362 + }, + { + "epoch": 1.9801106777480548, + "grad_norm": 0.5013970198304216, + "learning_rate": 5.179452538656149e-08, + "loss": 11.8196, + "step": 36363 + }, + { + "epoch": 1.9801651317446378, + "grad_norm": 0.5595841064832009, + "learning_rate": 5.151113287721332e-08, + "loss": 11.8034, + "step": 36364 + }, + { + "epoch": 1.980219585741221, + "grad_norm": 0.5352657501973164, + "learning_rate": 5.122851758406766e-08, + "loss": 11.787, + "step": 36365 + }, + { + "epoch": 1.980274039737804, + "grad_norm": 0.5478585939635453, + "learning_rate": 5.094667950933385e-08, + "loss": 11.7097, + "step": 36366 + }, + { + "epoch": 1.980328493734387, + "grad_norm": 0.5369698837798761, + "learning_rate": 5.0665618655210136e-08, + "loss": 11.7658, + "step": 36367 + }, + { + "epoch": 1.98038294773097, + "grad_norm": 0.5646834408333486, + "learning_rate": 5.038533502386145e-08, + "loss": 11.8147, + "step": 36368 + }, + { + "epoch": 1.980437401727553, + "grad_norm": 0.5748268256684115, + "learning_rate": 5.0105828617474925e-08, + "loss": 11.7697, + "step": 36369 + }, + { + "epoch": 1.9804918557241362, + "grad_norm": 0.5403407442121676, + "learning_rate": 4.982709943823771e-08, + "loss": 11.7193, + "step": 36370 + }, + { + "epoch": 1.9805463097207192, + "grad_norm": 0.558221577878052, + "learning_rate": 4.9549147488303635e-08, + "loss": 11.7359, + "step": 36371 + }, + { + "epoch": 1.9806007637173022, + "grad_norm": 0.513380065672353, + "learning_rate": 4.927197276982653e-08, + "loss": 11.7409, + "step": 36372 + }, + { + "epoch": 1.9806552177138852, + "grad_norm": 0.5309627229713108, + "learning_rate": 4.899557528498244e-08, + "loss": 11.7988, + "step": 36373 + }, + { + "epoch": 1.9807096717104682, + "grad_norm": 0.5099223751420987, + "learning_rate": 4.871995503591409e-08, + "loss": 11.7858, + "step": 36374 + }, + { + "epoch": 1.9807641257070512, + "grad_norm": 0.5365808849213447, + "learning_rate": 4.8445112024753106e-08, + "loss": 11.7678, + "step": 36375 + }, + { + "epoch": 1.9808185797036342, + "grad_norm": 0.5243160670745014, + "learning_rate": 4.817104625364221e-08, + "loss": 11.8326, + "step": 36376 + }, + { + "epoch": 1.9808730337002172, + "grad_norm": 0.5720339580511433, + "learning_rate": 4.789775772472416e-08, + "loss": 11.835, + "step": 36377 + }, + { + "epoch": 1.9809274876968002, + "grad_norm": 0.5196927828287613, + "learning_rate": 4.762524644010835e-08, + "loss": 11.7265, + "step": 36378 + }, + { + "epoch": 1.9809819416933832, + "grad_norm": 0.564518442792885, + "learning_rate": 4.735351240192642e-08, + "loss": 11.863, + "step": 36379 + }, + { + "epoch": 1.9810363956899661, + "grad_norm": 0.5165776029013883, + "learning_rate": 4.7082555612287804e-08, + "loss": 11.7244, + "step": 36380 + }, + { + "epoch": 1.9810908496865491, + "grad_norm": 0.5488738136409199, + "learning_rate": 4.6812376073290806e-08, + "loss": 11.6977, + "step": 36381 + }, + { + "epoch": 1.9811453036831321, + "grad_norm": 0.5675803755394957, + "learning_rate": 4.6542973787044866e-08, + "loss": 11.7042, + "step": 36382 + }, + { + "epoch": 1.9811997576797151, + "grad_norm": 0.5550562381550627, + "learning_rate": 4.6274348755637185e-08, + "loss": 11.9232, + "step": 36383 + }, + { + "epoch": 1.9812542116762981, + "grad_norm": 0.5136672127163762, + "learning_rate": 4.60065009811661e-08, + "loss": 11.7147, + "step": 36384 + }, + { + "epoch": 1.981308665672881, + "grad_norm": 0.5152296456485161, + "learning_rate": 4.5739430465718826e-08, + "loss": 11.805, + "step": 36385 + }, + { + "epoch": 1.981363119669464, + "grad_norm": 0.5471769060936504, + "learning_rate": 4.547313721136037e-08, + "loss": 11.7434, + "step": 36386 + }, + { + "epoch": 1.981417573666047, + "grad_norm": 0.5797772295236849, + "learning_rate": 4.520762122015576e-08, + "loss": 11.7883, + "step": 36387 + }, + { + "epoch": 1.9814720276626303, + "grad_norm": 0.5224920669972644, + "learning_rate": 4.4942882494192204e-08, + "loss": 11.8305, + "step": 36388 + }, + { + "epoch": 1.9815264816592133, + "grad_norm": 0.596610670032578, + "learning_rate": 4.467892103550142e-08, + "loss": 11.7957, + "step": 36389 + }, + { + "epoch": 1.9815809356557963, + "grad_norm": 0.49203453727486535, + "learning_rate": 4.441573684615952e-08, + "loss": 11.7003, + "step": 36390 + }, + { + "epoch": 1.9816353896523793, + "grad_norm": 0.5130185467092806, + "learning_rate": 4.415332992820931e-08, + "loss": 11.7963, + "step": 36391 + }, + { + "epoch": 1.9816898436489623, + "grad_norm": 0.4771372129257163, + "learning_rate": 4.38917002836714e-08, + "loss": 11.6706, + "step": 36392 + }, + { + "epoch": 1.9817442976455453, + "grad_norm": 0.519489722446061, + "learning_rate": 4.36308479145997e-08, + "loss": 11.7753, + "step": 36393 + }, + { + "epoch": 1.9817987516421285, + "grad_norm": 0.5289121310084038, + "learning_rate": 4.3370772823014824e-08, + "loss": 11.7648, + "step": 36394 + }, + { + "epoch": 1.9818532056387115, + "grad_norm": 0.5192001536176016, + "learning_rate": 4.3111475010948474e-08, + "loss": 11.7289, + "step": 36395 + }, + { + "epoch": 1.9819076596352945, + "grad_norm": 0.5346676956395546, + "learning_rate": 4.285295448041016e-08, + "loss": 11.7567, + "step": 36396 + }, + { + "epoch": 1.9819621136318775, + "grad_norm": 0.5346781785191166, + "learning_rate": 4.259521123339827e-08, + "loss": 11.8591, + "step": 36397 + }, + { + "epoch": 1.9820165676284605, + "grad_norm": 0.5311257454039943, + "learning_rate": 4.233824527194452e-08, + "loss": 11.807, + "step": 36398 + }, + { + "epoch": 1.9820710216250434, + "grad_norm": 0.5162823434128065, + "learning_rate": 4.208205659802511e-08, + "loss": 11.6991, + "step": 36399 + }, + { + "epoch": 1.9821254756216264, + "grad_norm": 0.5467079572874044, + "learning_rate": 4.182664521362734e-08, + "loss": 11.8395, + "step": 36400 + }, + { + "epoch": 1.9821799296182094, + "grad_norm": 0.5242538245582777, + "learning_rate": 4.15720111207607e-08, + "loss": 11.7732, + "step": 36401 + }, + { + "epoch": 1.9822343836147924, + "grad_norm": 0.5335046877696261, + "learning_rate": 4.13181543213903e-08, + "loss": 11.8929, + "step": 36402 + }, + { + "epoch": 1.9822888376113754, + "grad_norm": 0.6177807711391469, + "learning_rate": 4.106507481749233e-08, + "loss": 11.9371, + "step": 36403 + }, + { + "epoch": 1.9823432916079584, + "grad_norm": 0.5412095866919604, + "learning_rate": 4.0812772611042993e-08, + "loss": 11.8959, + "step": 36404 + }, + { + "epoch": 1.9823977456045414, + "grad_norm": 0.6220907738212006, + "learning_rate": 4.0561247703985174e-08, + "loss": 11.9239, + "step": 36405 + }, + { + "epoch": 1.9824521996011244, + "grad_norm": 0.5125961095662214, + "learning_rate": 4.0310500098295066e-08, + "loss": 11.7893, + "step": 36406 + }, + { + "epoch": 1.9825066535977074, + "grad_norm": 0.5520342049921835, + "learning_rate": 4.0060529795904466e-08, + "loss": 11.7016, + "step": 36407 + }, + { + "epoch": 1.9825611075942904, + "grad_norm": 0.5570034617747267, + "learning_rate": 3.9811336798778466e-08, + "loss": 11.7271, + "step": 36408 + }, + { + "epoch": 1.9826155615908734, + "grad_norm": 0.5632854091028757, + "learning_rate": 3.9562921108837745e-08, + "loss": 11.8128, + "step": 36409 + }, + { + "epoch": 1.9826700155874564, + "grad_norm": 0.5297255578447356, + "learning_rate": 3.93152827280141e-08, + "loss": 11.7802, + "step": 36410 + }, + { + "epoch": 1.9827244695840396, + "grad_norm": 0.5445943732363671, + "learning_rate": 3.906842165823932e-08, + "loss": 11.9233, + "step": 36411 + }, + { + "epoch": 1.9827789235806226, + "grad_norm": 0.5533317778112701, + "learning_rate": 3.882233790143408e-08, + "loss": 11.7697, + "step": 36412 + }, + { + "epoch": 1.9828333775772056, + "grad_norm": 0.5434511729045822, + "learning_rate": 3.8577031459519075e-08, + "loss": 11.9149, + "step": 36413 + }, + { + "epoch": 1.9828878315737886, + "grad_norm": 0.5692900136423962, + "learning_rate": 3.8332502334381684e-08, + "loss": 11.7982, + "step": 36414 + }, + { + "epoch": 1.9829422855703716, + "grad_norm": 0.531192699353065, + "learning_rate": 3.808875052793148e-08, + "loss": 11.7621, + "step": 36415 + }, + { + "epoch": 1.9829967395669545, + "grad_norm": 0.6144391606959894, + "learning_rate": 3.7845776042078064e-08, + "loss": 11.7797, + "step": 36416 + }, + { + "epoch": 1.9830511935635378, + "grad_norm": 0.5129640212616524, + "learning_rate": 3.7603578878686597e-08, + "loss": 11.7144, + "step": 36417 + }, + { + "epoch": 1.9831056475601208, + "grad_norm": 0.5452520192810797, + "learning_rate": 3.736215903966667e-08, + "loss": 11.7853, + "step": 36418 + }, + { + "epoch": 1.9831601015567037, + "grad_norm": 0.539465194466536, + "learning_rate": 3.7121516526872345e-08, + "loss": 11.7376, + "step": 36419 + }, + { + "epoch": 1.9832145555532867, + "grad_norm": 0.5754108188862822, + "learning_rate": 3.688165134219102e-08, + "loss": 11.8138, + "step": 36420 + }, + { + "epoch": 1.9832690095498697, + "grad_norm": 0.8407767216307679, + "learning_rate": 3.664256348747674e-08, + "loss": 11.8776, + "step": 36421 + }, + { + "epoch": 1.9833234635464527, + "grad_norm": 0.5060928623516017, + "learning_rate": 3.640425296459471e-08, + "loss": 11.7418, + "step": 36422 + }, + { + "epoch": 1.9833779175430357, + "grad_norm": 0.5452587918783603, + "learning_rate": 3.616671977539898e-08, + "loss": 11.8999, + "step": 36423 + }, + { + "epoch": 1.9834323715396187, + "grad_norm": 0.47998421597627455, + "learning_rate": 3.5929963921732534e-08, + "loss": 11.7164, + "step": 36424 + }, + { + "epoch": 1.9834868255362017, + "grad_norm": 0.5355787736840985, + "learning_rate": 3.569398540544944e-08, + "loss": 11.7603, + "step": 36425 + }, + { + "epoch": 1.9835412795327847, + "grad_norm": 0.5291167248010584, + "learning_rate": 3.545878422835935e-08, + "loss": 11.7304, + "step": 36426 + }, + { + "epoch": 1.9835957335293677, + "grad_norm": 0.5715189458160215, + "learning_rate": 3.522436039231636e-08, + "loss": 11.8027, + "step": 36427 + }, + { + "epoch": 1.9836501875259507, + "grad_norm": 0.5517468892746402, + "learning_rate": 3.499071389911901e-08, + "loss": 11.8163, + "step": 36428 + }, + { + "epoch": 1.9837046415225337, + "grad_norm": 0.49261450619098257, + "learning_rate": 3.475784475061028e-08, + "loss": 11.7235, + "step": 36429 + }, + { + "epoch": 1.9837590955191167, + "grad_norm": 0.6400201614758865, + "learning_rate": 3.452575294858873e-08, + "loss": 11.8131, + "step": 36430 + }, + { + "epoch": 1.9838135495156997, + "grad_norm": 0.5098108880897478, + "learning_rate": 3.429443849485292e-08, + "loss": 11.7697, + "step": 36431 + }, + { + "epoch": 1.9838680035122827, + "grad_norm": 0.5896519442328669, + "learning_rate": 3.4063901391212514e-08, + "loss": 11.7852, + "step": 36432 + }, + { + "epoch": 1.9839224575088656, + "grad_norm": 0.5735672501499843, + "learning_rate": 3.3834141639454976e-08, + "loss": 11.8876, + "step": 36433 + }, + { + "epoch": 1.9839769115054486, + "grad_norm": 0.5641724388866606, + "learning_rate": 3.360515924136776e-08, + "loss": 11.7415, + "step": 36434 + }, + { + "epoch": 1.9840313655020319, + "grad_norm": 0.529777008549507, + "learning_rate": 3.337695419872722e-08, + "loss": 11.716, + "step": 36435 + }, + { + "epoch": 1.9840858194986148, + "grad_norm": 0.5489142191689512, + "learning_rate": 3.314952651330971e-08, + "loss": 11.7486, + "step": 36436 + }, + { + "epoch": 1.9841402734951978, + "grad_norm": 0.5623179765079697, + "learning_rate": 3.29228761868916e-08, + "loss": 11.7828, + "step": 36437 + }, + { + "epoch": 1.9841947274917808, + "grad_norm": 0.535042210829423, + "learning_rate": 3.269700322122704e-08, + "loss": 11.7817, + "step": 36438 + }, + { + "epoch": 1.9842491814883638, + "grad_norm": 0.5466478260566233, + "learning_rate": 3.247190761808128e-08, + "loss": 11.7459, + "step": 36439 + }, + { + "epoch": 1.984303635484947, + "grad_norm": 0.527203248150186, + "learning_rate": 3.2247589379197364e-08, + "loss": 11.6313, + "step": 36440 + }, + { + "epoch": 1.98435808948153, + "grad_norm": 0.5582220042491767, + "learning_rate": 3.202404850631835e-08, + "loss": 11.8305, + "step": 36441 + }, + { + "epoch": 1.984412543478113, + "grad_norm": 0.506057199041528, + "learning_rate": 3.180128500117619e-08, + "loss": 11.7391, + "step": 36442 + }, + { + "epoch": 1.984466997474696, + "grad_norm": 0.5030049299910746, + "learning_rate": 3.1579298865525017e-08, + "loss": 11.8705, + "step": 36443 + }, + { + "epoch": 1.984521451471279, + "grad_norm": 0.5600930442138133, + "learning_rate": 3.13580901010746e-08, + "loss": 11.7956, + "step": 36444 + }, + { + "epoch": 1.984575905467862, + "grad_norm": 0.542997673197189, + "learning_rate": 3.113765870954577e-08, + "loss": 11.7903, + "step": 36445 + }, + { + "epoch": 1.984630359464445, + "grad_norm": 0.5415007233739847, + "learning_rate": 3.091800469264827e-08, + "loss": 11.7567, + "step": 36446 + }, + { + "epoch": 1.984684813461028, + "grad_norm": 0.6795185988670197, + "learning_rate": 3.0699128052114056e-08, + "loss": 11.9329, + "step": 36447 + }, + { + "epoch": 1.984739267457611, + "grad_norm": 0.5924422573874469, + "learning_rate": 3.048102878961956e-08, + "loss": 11.8243, + "step": 36448 + }, + { + "epoch": 1.984793721454194, + "grad_norm": 0.6158281607939818, + "learning_rate": 3.026370690686342e-08, + "loss": 11.8351, + "step": 36449 + }, + { + "epoch": 1.984848175450777, + "grad_norm": 0.7523505407724791, + "learning_rate": 3.0047162405544284e-08, + "loss": 11.673, + "step": 36450 + }, + { + "epoch": 1.98490262944736, + "grad_norm": 0.5584705033824234, + "learning_rate": 2.983139528734968e-08, + "loss": 11.6747, + "step": 36451 + }, + { + "epoch": 1.984957083443943, + "grad_norm": 0.5710800115087628, + "learning_rate": 2.9616405553944958e-08, + "loss": 11.7849, + "step": 36452 + }, + { + "epoch": 1.985011537440526, + "grad_norm": 0.5356196072968289, + "learning_rate": 2.9402193207017648e-08, + "loss": 11.625, + "step": 36453 + }, + { + "epoch": 1.985065991437109, + "grad_norm": 0.47829264998905807, + "learning_rate": 2.918875824821088e-08, + "loss": 11.7912, + "step": 36454 + }, + { + "epoch": 1.985120445433692, + "grad_norm": 0.5136787925091855, + "learning_rate": 2.8976100679212192e-08, + "loss": 11.7793, + "step": 36455 + }, + { + "epoch": 1.985174899430275, + "grad_norm": 0.5177346165727557, + "learning_rate": 2.8764220501642514e-08, + "loss": 11.7661, + "step": 36456 + }, + { + "epoch": 1.985229353426858, + "grad_norm": 0.5417904388014813, + "learning_rate": 2.855311771717828e-08, + "loss": 11.8325, + "step": 36457 + }, + { + "epoch": 1.9852838074234411, + "grad_norm": 0.5065989398736146, + "learning_rate": 2.8342792327451517e-08, + "loss": 11.7289, + "step": 36458 + }, + { + "epoch": 1.9853382614200241, + "grad_norm": 0.5506387088913992, + "learning_rate": 2.8133244334094255e-08, + "loss": 11.7723, + "step": 36459 + }, + { + "epoch": 1.9853927154166071, + "grad_norm": 0.5698689998099923, + "learning_rate": 2.7924473738738522e-08, + "loss": 11.7432, + "step": 36460 + }, + { + "epoch": 1.98544716941319, + "grad_norm": 0.5959147601358578, + "learning_rate": 2.771648054300524e-08, + "loss": 11.7884, + "step": 36461 + }, + { + "epoch": 1.985501623409773, + "grad_norm": 0.5890098564455121, + "learning_rate": 2.750926474851534e-08, + "loss": 11.9101, + "step": 36462 + }, + { + "epoch": 1.985556077406356, + "grad_norm": 0.5885155735101597, + "learning_rate": 2.7302826356878642e-08, + "loss": 11.8218, + "step": 36463 + }, + { + "epoch": 1.9856105314029393, + "grad_norm": 0.5177049109306143, + "learning_rate": 2.7097165369704967e-08, + "loss": 11.5807, + "step": 36464 + }, + { + "epoch": 1.9856649853995223, + "grad_norm": 0.5207191332899337, + "learning_rate": 2.689228178858194e-08, + "loss": 11.7763, + "step": 36465 + }, + { + "epoch": 1.9857194393961053, + "grad_norm": 0.5097256203044499, + "learning_rate": 2.6688175615119383e-08, + "loss": 11.8482, + "step": 36466 + }, + { + "epoch": 1.9857738933926883, + "grad_norm": 0.5937435099745028, + "learning_rate": 2.6484846850882705e-08, + "loss": 11.8284, + "step": 36467 + }, + { + "epoch": 1.9858283473892713, + "grad_norm": 0.5430212541291283, + "learning_rate": 2.6282295497470634e-08, + "loss": 11.6684, + "step": 36468 + }, + { + "epoch": 1.9858828013858543, + "grad_norm": 0.46856519532394764, + "learning_rate": 2.608052155645968e-08, + "loss": 11.619, + "step": 36469 + }, + { + "epoch": 1.9859372553824373, + "grad_norm": 0.5376089571158441, + "learning_rate": 2.587952502940416e-08, + "loss": 11.7194, + "step": 36470 + }, + { + "epoch": 1.9859917093790203, + "grad_norm": 0.5370737918031686, + "learning_rate": 2.5679305917880592e-08, + "loss": 11.7159, + "step": 36471 + }, + { + "epoch": 1.9860461633756032, + "grad_norm": 0.534456341970588, + "learning_rate": 2.547986422343218e-08, + "loss": 11.721, + "step": 36472 + }, + { + "epoch": 1.9861006173721862, + "grad_norm": 0.5693198990854652, + "learning_rate": 2.5281199947624347e-08, + "loss": 11.8178, + "step": 36473 + }, + { + "epoch": 1.9861550713687692, + "grad_norm": 0.5395660060236315, + "learning_rate": 2.5083313091989192e-08, + "loss": 11.9495, + "step": 36474 + }, + { + "epoch": 1.9862095253653522, + "grad_norm": 0.5594244618991824, + "learning_rate": 2.488620365808103e-08, + "loss": 11.746, + "step": 36475 + }, + { + "epoch": 1.9862639793619352, + "grad_norm": 0.5300685926005979, + "learning_rate": 2.4689871647420872e-08, + "loss": 11.7432, + "step": 36476 + }, + { + "epoch": 1.9863184333585182, + "grad_norm": 0.5428098946373274, + "learning_rate": 2.4494317061540817e-08, + "loss": 11.7523, + "step": 36477 + }, + { + "epoch": 1.9863728873551012, + "grad_norm": 0.5526840885415635, + "learning_rate": 2.4299539901950773e-08, + "loss": 11.6715, + "step": 36478 + }, + { + "epoch": 1.9864273413516842, + "grad_norm": 0.5000964365460533, + "learning_rate": 2.410554017018285e-08, + "loss": 11.7401, + "step": 36479 + }, + { + "epoch": 1.9864817953482672, + "grad_norm": 0.5147461783983301, + "learning_rate": 2.3912317867724742e-08, + "loss": 11.7364, + "step": 36480 + }, + { + "epoch": 1.9865362493448504, + "grad_norm": 0.8427748965577434, + "learning_rate": 2.3719872996097457e-08, + "loss": 11.7933, + "step": 36481 + }, + { + "epoch": 1.9865907033414334, + "grad_norm": 0.4886377342918437, + "learning_rate": 2.3528205556788697e-08, + "loss": 11.8437, + "step": 36482 + }, + { + "epoch": 1.9866451573380164, + "grad_norm": 0.5983192921560333, + "learning_rate": 2.3337315551297257e-08, + "loss": 11.9141, + "step": 36483 + }, + { + "epoch": 1.9866996113345994, + "grad_norm": 0.553652626616461, + "learning_rate": 2.3147202981099738e-08, + "loss": 11.8146, + "step": 36484 + }, + { + "epoch": 1.9867540653311824, + "grad_norm": 0.6418428452775194, + "learning_rate": 2.2957867847661628e-08, + "loss": 11.8464, + "step": 36485 + }, + { + "epoch": 1.9868085193277654, + "grad_norm": 0.5455281673631264, + "learning_rate": 2.2769310152481736e-08, + "loss": 11.82, + "step": 36486 + }, + { + "epoch": 1.9868629733243486, + "grad_norm": 0.7700380742973841, + "learning_rate": 2.2581529897014454e-08, + "loss": 11.8567, + "step": 36487 + }, + { + "epoch": 1.9869174273209316, + "grad_norm": 0.508514097771357, + "learning_rate": 2.2394527082714168e-08, + "loss": 11.7757, + "step": 36488 + }, + { + "epoch": 1.9869718813175146, + "grad_norm": 0.5886567007444015, + "learning_rate": 2.2208301711046376e-08, + "loss": 11.8596, + "step": 36489 + }, + { + "epoch": 1.9870263353140976, + "grad_norm": 0.5549673988248771, + "learning_rate": 2.2022853783443263e-08, + "loss": 11.7638, + "step": 36490 + }, + { + "epoch": 1.9870807893106806, + "grad_norm": 0.5587925331336404, + "learning_rate": 2.183818330137033e-08, + "loss": 11.8355, + "step": 36491 + }, + { + "epoch": 1.9871352433072635, + "grad_norm": 0.5328003337131507, + "learning_rate": 2.1654290266237553e-08, + "loss": 11.6843, + "step": 36492 + }, + { + "epoch": 1.9871896973038465, + "grad_norm": 0.5352286581011919, + "learning_rate": 2.147117467949933e-08, + "loss": 11.8615, + "step": 36493 + }, + { + "epoch": 1.9872441513004295, + "grad_norm": 0.521618915135555, + "learning_rate": 2.1288836542554537e-08, + "loss": 11.6261, + "step": 36494 + }, + { + "epoch": 1.9872986052970125, + "grad_norm": 0.5312015028180979, + "learning_rate": 2.1107275856846466e-08, + "loss": 11.6559, + "step": 36495 + }, + { + "epoch": 1.9873530592935955, + "grad_norm": 0.5332096331879077, + "learning_rate": 2.0926492623762894e-08, + "loss": 11.8785, + "step": 36496 + }, + { + "epoch": 1.9874075132901785, + "grad_norm": 0.5424721875130121, + "learning_rate": 2.074648684472491e-08, + "loss": 11.6787, + "step": 36497 + }, + { + "epoch": 1.9874619672867615, + "grad_norm": 0.5406570202646456, + "learning_rate": 2.0567258521131394e-08, + "loss": 11.6913, + "step": 36498 + }, + { + "epoch": 1.9875164212833445, + "grad_norm": 0.5796295827097839, + "learning_rate": 2.0388807654370125e-08, + "loss": 11.8214, + "step": 36499 + }, + { + "epoch": 1.9875708752799275, + "grad_norm": 0.5368021958895797, + "learning_rate": 2.021113424583998e-08, + "loss": 11.8478, + "step": 36500 + }, + { + "epoch": 1.9876253292765105, + "grad_norm": 0.48074724813872993, + "learning_rate": 2.0034238296906537e-08, + "loss": 11.5588, + "step": 36501 + }, + { + "epoch": 1.9876797832730935, + "grad_norm": 0.5065848752096026, + "learning_rate": 1.9858119808957575e-08, + "loss": 11.7973, + "step": 36502 + }, + { + "epoch": 1.9877342372696765, + "grad_norm": 0.5759036275316106, + "learning_rate": 1.968277878335867e-08, + "loss": 11.7654, + "step": 36503 + }, + { + "epoch": 1.9877886912662597, + "grad_norm": 0.5591605320109206, + "learning_rate": 1.950821522147539e-08, + "loss": 11.6315, + "step": 36504 + }, + { + "epoch": 1.9878431452628427, + "grad_norm": 0.5598232048830637, + "learning_rate": 1.933442912465111e-08, + "loss": 11.8587, + "step": 36505 + }, + { + "epoch": 1.9878975992594257, + "grad_norm": 0.5734144479691698, + "learning_rate": 1.9161420494262505e-08, + "loss": 11.6655, + "step": 36506 + }, + { + "epoch": 1.9879520532560087, + "grad_norm": 0.5181470637516574, + "learning_rate": 1.8989189331641843e-08, + "loss": 11.8563, + "step": 36507 + }, + { + "epoch": 1.9880065072525916, + "grad_norm": 0.5678972236587222, + "learning_rate": 1.881773563812139e-08, + "loss": 11.7367, + "step": 36508 + }, + { + "epoch": 1.9880609612491746, + "grad_norm": 0.5275689525463271, + "learning_rate": 1.864705941503342e-08, + "loss": 11.5711, + "step": 36509 + }, + { + "epoch": 1.9881154152457579, + "grad_norm": 0.5226776011583704, + "learning_rate": 1.8477160663721295e-08, + "loss": 11.8611, + "step": 36510 + }, + { + "epoch": 1.9881698692423408, + "grad_norm": 0.5907563497459758, + "learning_rate": 1.830803938550618e-08, + "loss": 11.9167, + "step": 36511 + }, + { + "epoch": 1.9882243232389238, + "grad_norm": 0.5558311979413025, + "learning_rate": 1.8139695581687043e-08, + "loss": 11.8081, + "step": 36512 + }, + { + "epoch": 1.9882787772355068, + "grad_norm": 0.5408375610092016, + "learning_rate": 1.7972129253573942e-08, + "loss": 11.8025, + "step": 36513 + }, + { + "epoch": 1.9883332312320898, + "grad_norm": 0.6487321049336314, + "learning_rate": 1.7805340402488046e-08, + "loss": 11.9208, + "step": 36514 + }, + { + "epoch": 1.9883876852286728, + "grad_norm": 0.5331738608573782, + "learning_rate": 1.763932902970611e-08, + "loss": 11.8855, + "step": 36515 + }, + { + "epoch": 1.9884421392252558, + "grad_norm": 0.49844084490852403, + "learning_rate": 1.7474095136538193e-08, + "loss": 11.746, + "step": 36516 + }, + { + "epoch": 1.9884965932218388, + "grad_norm": 0.5258561191422564, + "learning_rate": 1.7309638724249954e-08, + "loss": 11.7594, + "step": 36517 + }, + { + "epoch": 1.9885510472184218, + "grad_norm": 0.477293214613775, + "learning_rate": 1.714595979412925e-08, + "loss": 11.6868, + "step": 36518 + }, + { + "epoch": 1.9886055012150048, + "grad_norm": 0.5407193248464377, + "learning_rate": 1.698305834745284e-08, + "loss": 11.7135, + "step": 36519 + }, + { + "epoch": 1.9886599552115878, + "grad_norm": 0.5378190096474488, + "learning_rate": 1.6820934385475272e-08, + "loss": 11.7109, + "step": 36520 + }, + { + "epoch": 1.9887144092081708, + "grad_norm": 0.5317356370743387, + "learning_rate": 1.6659587909473307e-08, + "loss": 11.742, + "step": 36521 + }, + { + "epoch": 1.9887688632047538, + "grad_norm": 0.5760190090144073, + "learning_rate": 1.649901892069039e-08, + "loss": 11.8167, + "step": 36522 + }, + { + "epoch": 1.9888233172013368, + "grad_norm": 0.5291254998874673, + "learning_rate": 1.6339227420381076e-08, + "loss": 11.7414, + "step": 36523 + }, + { + "epoch": 1.9888777711979198, + "grad_norm": 0.5248665097299513, + "learning_rate": 1.618021340977771e-08, + "loss": 11.7631, + "step": 36524 + }, + { + "epoch": 1.9889322251945027, + "grad_norm": 0.5435504748659484, + "learning_rate": 1.602197689013485e-08, + "loss": 11.6174, + "step": 36525 + }, + { + "epoch": 1.9889866791910857, + "grad_norm": 0.5212935552363418, + "learning_rate": 1.5864517862662632e-08, + "loss": 11.8869, + "step": 36526 + }, + { + "epoch": 1.9890411331876687, + "grad_norm": 0.5796327661169799, + "learning_rate": 1.5707836328604508e-08, + "loss": 11.7896, + "step": 36527 + }, + { + "epoch": 1.989095587184252, + "grad_norm": 0.535665979283029, + "learning_rate": 1.5551932289170623e-08, + "loss": 11.73, + "step": 36528 + }, + { + "epoch": 1.989150041180835, + "grad_norm": 0.5853900510831671, + "learning_rate": 1.5396805745560018e-08, + "loss": 11.657, + "step": 36529 + }, + { + "epoch": 1.989204495177418, + "grad_norm": 1.653872634805348, + "learning_rate": 1.5242456699005037e-08, + "loss": 11.9419, + "step": 36530 + }, + { + "epoch": 1.989258949174001, + "grad_norm": 0.5540657531119669, + "learning_rate": 1.508888515068252e-08, + "loss": 11.7679, + "step": 36531 + }, + { + "epoch": 1.989313403170584, + "grad_norm": 0.5260239814293589, + "learning_rate": 1.4936091101802608e-08, + "loss": 11.8596, + "step": 36532 + }, + { + "epoch": 1.989367857167167, + "grad_norm": 0.5370210294461955, + "learning_rate": 1.4784074553542138e-08, + "loss": 11.7527, + "step": 36533 + }, + { + "epoch": 1.9894223111637501, + "grad_norm": 0.5940283262279022, + "learning_rate": 1.4632835507089049e-08, + "loss": 11.9127, + "step": 36534 + }, + { + "epoch": 1.9894767651603331, + "grad_norm": 0.5591647115083277, + "learning_rate": 1.4482373963620177e-08, + "loss": 11.7366, + "step": 36535 + }, + { + "epoch": 1.989531219156916, + "grad_norm": 0.5998355448929859, + "learning_rate": 1.4332689924301257e-08, + "loss": 11.8799, + "step": 36536 + }, + { + "epoch": 1.989585673153499, + "grad_norm": 0.5143044566579497, + "learning_rate": 1.4183783390298022e-08, + "loss": 11.778, + "step": 36537 + }, + { + "epoch": 1.989640127150082, + "grad_norm": 0.5242737544104311, + "learning_rate": 1.4035654362776207e-08, + "loss": 11.6855, + "step": 36538 + }, + { + "epoch": 1.989694581146665, + "grad_norm": 0.5185734968571442, + "learning_rate": 1.3888302842868239e-08, + "loss": 11.6778, + "step": 36539 + }, + { + "epoch": 1.989749035143248, + "grad_norm": 0.5775111635310636, + "learning_rate": 1.3741728831750955e-08, + "loss": 11.7708, + "step": 36540 + }, + { + "epoch": 1.989803489139831, + "grad_norm": 0.533025138209696, + "learning_rate": 1.3595932330534578e-08, + "loss": 11.7419, + "step": 36541 + }, + { + "epoch": 1.989857943136414, + "grad_norm": 0.5913819883562282, + "learning_rate": 1.3450913340362637e-08, + "loss": 11.8257, + "step": 36542 + }, + { + "epoch": 1.989912397132997, + "grad_norm": 0.5666547350860178, + "learning_rate": 1.330667186236756e-08, + "loss": 11.7689, + "step": 36543 + }, + { + "epoch": 1.98996685112958, + "grad_norm": 0.5617870311779647, + "learning_rate": 1.3163207897659569e-08, + "loss": 11.7593, + "step": 36544 + }, + { + "epoch": 1.990021305126163, + "grad_norm": 0.5278297550209919, + "learning_rate": 1.3020521447371093e-08, + "loss": 11.761, + "step": 36545 + }, + { + "epoch": 1.990075759122746, + "grad_norm": 0.55048504306361, + "learning_rate": 1.2878612512601251e-08, + "loss": 11.7731, + "step": 36546 + }, + { + "epoch": 1.990130213119329, + "grad_norm": 0.5374731388214943, + "learning_rate": 1.2737481094460269e-08, + "loss": 11.7788, + "step": 36547 + }, + { + "epoch": 1.990184667115912, + "grad_norm": 0.577594614368669, + "learning_rate": 1.2597127194036163e-08, + "loss": 11.8198, + "step": 36548 + }, + { + "epoch": 1.990239121112495, + "grad_norm": 0.5727313297973341, + "learning_rate": 1.2457550812416951e-08, + "loss": 11.7288, + "step": 36549 + }, + { + "epoch": 1.990293575109078, + "grad_norm": 0.5253241306983374, + "learning_rate": 1.2318751950701757e-08, + "loss": 11.7712, + "step": 36550 + }, + { + "epoch": 1.9903480291056612, + "grad_norm": 0.5600070923392805, + "learning_rate": 1.2180730609967495e-08, + "loss": 11.6915, + "step": 36551 + }, + { + "epoch": 1.9904024831022442, + "grad_norm": 0.5692639945535776, + "learning_rate": 1.2043486791279978e-08, + "loss": 11.8965, + "step": 36552 + }, + { + "epoch": 1.9904569370988272, + "grad_norm": 0.5878580842199417, + "learning_rate": 1.1907020495716126e-08, + "loss": 11.6622, + "step": 36553 + }, + { + "epoch": 1.9905113910954102, + "grad_norm": 0.5272999207693095, + "learning_rate": 1.1771331724319545e-08, + "loss": 11.6701, + "step": 36554 + }, + { + "epoch": 1.9905658450919932, + "grad_norm": 0.5069394035366999, + "learning_rate": 1.1636420478167154e-08, + "loss": 11.6037, + "step": 36555 + }, + { + "epoch": 1.9906202990885762, + "grad_norm": 0.5307850074672694, + "learning_rate": 1.1502286758291459e-08, + "loss": 11.7663, + "step": 36556 + }, + { + "epoch": 1.9906747530851594, + "grad_norm": 0.5553898205062658, + "learning_rate": 1.136893056574717e-08, + "loss": 11.8379, + "step": 36557 + }, + { + "epoch": 1.9907292070817424, + "grad_norm": 0.5542884498644787, + "learning_rate": 1.12363519015668e-08, + "loss": 11.769, + "step": 36558 + }, + { + "epoch": 1.9907836610783254, + "grad_norm": 0.5387905649061595, + "learning_rate": 1.110455076678285e-08, + "loss": 11.7206, + "step": 36559 + }, + { + "epoch": 1.9908381150749084, + "grad_norm": 0.5412154444315345, + "learning_rate": 1.0973527162405628e-08, + "loss": 11.8113, + "step": 36560 + }, + { + "epoch": 1.9908925690714914, + "grad_norm": 0.5987705883005071, + "learning_rate": 1.0843281089478741e-08, + "loss": 11.843, + "step": 36561 + }, + { + "epoch": 1.9909470230680744, + "grad_norm": 0.6542224568495725, + "learning_rate": 1.071381254900139e-08, + "loss": 11.8524, + "step": 36562 + }, + { + "epoch": 1.9910014770646574, + "grad_norm": 0.5760613911943637, + "learning_rate": 1.0585121541972776e-08, + "loss": 11.6326, + "step": 36563 + }, + { + "epoch": 1.9910559310612403, + "grad_norm": 0.5359531398178513, + "learning_rate": 1.0457208069414304e-08, + "loss": 11.8028, + "step": 36564 + }, + { + "epoch": 1.9911103850578233, + "grad_norm": 0.471814764239132, + "learning_rate": 1.033007213230297e-08, + "loss": 11.7115, + "step": 36565 + }, + { + "epoch": 1.9911648390544063, + "grad_norm": 0.5769235638220788, + "learning_rate": 1.0203713731626873e-08, + "loss": 11.8379, + "step": 36566 + }, + { + "epoch": 1.9912192930509893, + "grad_norm": 0.5238240480289093, + "learning_rate": 1.0078132868385215e-08, + "loss": 11.8243, + "step": 36567 + }, + { + "epoch": 1.9912737470475723, + "grad_norm": 0.5364322950304945, + "learning_rate": 9.953329543543888e-09, + "loss": 11.6682, + "step": 36568 + }, + { + "epoch": 1.9913282010441553, + "grad_norm": 0.593512180407724, + "learning_rate": 9.829303758068786e-09, + "loss": 11.8686, + "step": 36569 + }, + { + "epoch": 1.9913826550407383, + "grad_norm": 0.5605312415944048, + "learning_rate": 9.706055512925804e-09, + "loss": 11.8327, + "step": 36570 + }, + { + "epoch": 1.9914371090373213, + "grad_norm": 0.5514882556214906, + "learning_rate": 9.583584809080837e-09, + "loss": 11.7296, + "step": 36571 + }, + { + "epoch": 1.9914915630339043, + "grad_norm": 0.5429878775553492, + "learning_rate": 9.461891647488675e-09, + "loss": 11.7423, + "step": 36572 + }, + { + "epoch": 1.9915460170304873, + "grad_norm": 0.5691855765732323, + "learning_rate": 9.340976029081905e-09, + "loss": 11.7875, + "step": 36573 + }, + { + "epoch": 1.9916004710270705, + "grad_norm": 0.5521564499331408, + "learning_rate": 9.22083795481532e-09, + "loss": 11.8163, + "step": 36574 + }, + { + "epoch": 1.9916549250236535, + "grad_norm": 0.5701652563157238, + "learning_rate": 9.101477425610405e-09, + "loss": 11.8349, + "step": 36575 + }, + { + "epoch": 1.9917093790202365, + "grad_norm": 0.5855597507007764, + "learning_rate": 8.982894442399747e-09, + "loss": 11.807, + "step": 36576 + }, + { + "epoch": 1.9917638330168195, + "grad_norm": 0.5525490524446289, + "learning_rate": 8.865089006104832e-09, + "loss": 11.7881, + "step": 36577 + }, + { + "epoch": 1.9918182870134025, + "grad_norm": 0.5457276094545386, + "learning_rate": 8.748061117647144e-09, + "loss": 11.7254, + "step": 36578 + }, + { + "epoch": 1.9918727410099855, + "grad_norm": 0.6052376554592634, + "learning_rate": 8.631810777937067e-09, + "loss": 11.7223, + "step": 36579 + }, + { + "epoch": 1.9919271950065687, + "grad_norm": 0.5890097010004732, + "learning_rate": 8.516337987862777e-09, + "loss": 11.8369, + "step": 36580 + }, + { + "epoch": 1.9919816490031517, + "grad_norm": 0.5788997216648428, + "learning_rate": 8.401642748345762e-09, + "loss": 11.7819, + "step": 36581 + }, + { + "epoch": 1.9920361029997347, + "grad_norm": 0.6038637351962896, + "learning_rate": 8.287725060263097e-09, + "loss": 11.5915, + "step": 36582 + }, + { + "epoch": 1.9920905569963177, + "grad_norm": 0.5987582867117255, + "learning_rate": 8.17458492450296e-09, + "loss": 11.79, + "step": 36583 + }, + { + "epoch": 1.9921450109929006, + "grad_norm": 0.5258476968420109, + "learning_rate": 8.062222341942427e-09, + "loss": 11.7373, + "step": 36584 + }, + { + "epoch": 1.9921994649894836, + "grad_norm": 0.5378773063919398, + "learning_rate": 7.950637313458576e-09, + "loss": 11.83, + "step": 36585 + }, + { + "epoch": 1.9922539189860666, + "grad_norm": 0.5388884010314146, + "learning_rate": 7.839829839928481e-09, + "loss": 11.6727, + "step": 36586 + }, + { + "epoch": 1.9923083729826496, + "grad_norm": 0.52512221499079, + "learning_rate": 7.729799922195912e-09, + "loss": 11.7655, + "step": 36587 + }, + { + "epoch": 1.9923628269792326, + "grad_norm": 0.548364727159414, + "learning_rate": 7.620547561126845e-09, + "loss": 11.6767, + "step": 36588 + }, + { + "epoch": 1.9924172809758156, + "grad_norm": 0.5262306886882376, + "learning_rate": 7.51207275757615e-09, + "loss": 11.7849, + "step": 36589 + }, + { + "epoch": 1.9924717349723986, + "grad_norm": 0.5296609115846772, + "learning_rate": 7.404375512365391e-09, + "loss": 11.727, + "step": 36590 + }, + { + "epoch": 1.9925261889689816, + "grad_norm": 0.5197247939044222, + "learning_rate": 7.297455826360544e-09, + "loss": 11.74, + "step": 36591 + }, + { + "epoch": 1.9925806429655646, + "grad_norm": 0.5061356816214125, + "learning_rate": 7.191313700372071e-09, + "loss": 11.5471, + "step": 36592 + }, + { + "epoch": 1.9926350969621476, + "grad_norm": 0.6189454476855245, + "learning_rate": 7.0859491352437415e-09, + "loss": 11.8413, + "step": 36593 + }, + { + "epoch": 1.9926895509587306, + "grad_norm": 0.5116906400228126, + "learning_rate": 6.981362131774916e-09, + "loss": 11.6078, + "step": 36594 + }, + { + "epoch": 1.9927440049553136, + "grad_norm": 0.5443973249715622, + "learning_rate": 6.8775526907871594e-09, + "loss": 11.802, + "step": 36595 + }, + { + "epoch": 1.9927984589518966, + "grad_norm": 0.5226089155217758, + "learning_rate": 6.7745208130909345e-09, + "loss": 11.8135, + "step": 36596 + }, + { + "epoch": 1.9928529129484795, + "grad_norm": 0.5249070542445562, + "learning_rate": 6.6722664994744995e-09, + "loss": 11.753, + "step": 36597 + }, + { + "epoch": 1.9929073669450628, + "grad_norm": 0.5731575805681722, + "learning_rate": 6.570789750759421e-09, + "loss": 11.7456, + "step": 36598 + }, + { + "epoch": 1.9929618209416458, + "grad_norm": 0.6046180240218758, + "learning_rate": 6.470090567711751e-09, + "loss": 11.8848, + "step": 36599 + }, + { + "epoch": 1.9930162749382287, + "grad_norm": 0.5238906110334236, + "learning_rate": 6.370168951119748e-09, + "loss": 11.6064, + "step": 36600 + }, + { + "epoch": 1.9930707289348117, + "grad_norm": 0.5442536443886653, + "learning_rate": 6.27102490176057e-09, + "loss": 11.8224, + "step": 36601 + }, + { + "epoch": 1.9931251829313947, + "grad_norm": 0.6052913011545425, + "learning_rate": 6.17265842041137e-09, + "loss": 11.8681, + "step": 36602 + }, + { + "epoch": 1.993179636927978, + "grad_norm": 0.5277548338212886, + "learning_rate": 6.075069507838205e-09, + "loss": 11.7395, + "step": 36603 + }, + { + "epoch": 1.993234090924561, + "grad_norm": 0.6245102734589224, + "learning_rate": 5.978258164784922e-09, + "loss": 11.8652, + "step": 36604 + }, + { + "epoch": 1.993288544921144, + "grad_norm": 0.5429962968186917, + "learning_rate": 5.882224392017577e-09, + "loss": 11.5717, + "step": 36605 + }, + { + "epoch": 1.993342998917727, + "grad_norm": 0.5307810892273391, + "learning_rate": 5.786968190280018e-09, + "loss": 11.7404, + "step": 36606 + }, + { + "epoch": 1.99339745291431, + "grad_norm": 0.515158080338061, + "learning_rate": 5.692489560304992e-09, + "loss": 11.6824, + "step": 36607 + }, + { + "epoch": 1.993451906910893, + "grad_norm": 0.5087382019976419, + "learning_rate": 5.598788502847452e-09, + "loss": 11.787, + "step": 36608 + }, + { + "epoch": 1.993506360907476, + "grad_norm": 0.5465389543262851, + "learning_rate": 5.505865018606837e-09, + "loss": 11.7854, + "step": 36609 + }, + { + "epoch": 1.993560814904059, + "grad_norm": 0.5229986948256299, + "learning_rate": 5.4137191083381e-09, + "loss": 11.747, + "step": 36610 + }, + { + "epoch": 1.993615268900642, + "grad_norm": 0.5524090689207177, + "learning_rate": 5.322350772729579e-09, + "loss": 11.8494, + "step": 36611 + }, + { + "epoch": 1.9936697228972249, + "grad_norm": 0.6229786307791355, + "learning_rate": 5.231760012502917e-09, + "loss": 11.9247, + "step": 36612 + }, + { + "epoch": 1.9937241768938079, + "grad_norm": 0.5276054388819608, + "learning_rate": 5.14194682836866e-09, + "loss": 11.7887, + "step": 36613 + }, + { + "epoch": 1.9937786308903909, + "grad_norm": 0.5685137900197232, + "learning_rate": 5.052911221015144e-09, + "loss": 11.843, + "step": 36614 + }, + { + "epoch": 1.9938330848869739, + "grad_norm": 0.6003547945268849, + "learning_rate": 4.96465319114181e-09, + "loss": 11.8157, + "step": 36615 + }, + { + "epoch": 1.9938875388835569, + "grad_norm": 0.503537665410977, + "learning_rate": 4.877172739436997e-09, + "loss": 11.7783, + "step": 36616 + }, + { + "epoch": 1.9939419928801398, + "grad_norm": 0.5524766885056016, + "learning_rate": 4.7904698665779405e-09, + "loss": 11.8226, + "step": 36617 + }, + { + "epoch": 1.9939964468767228, + "grad_norm": 0.5839317510664969, + "learning_rate": 4.704544573230774e-09, + "loss": 11.7921, + "step": 36618 + }, + { + "epoch": 1.9940509008733058, + "grad_norm": 0.4937412077674242, + "learning_rate": 4.619396860072734e-09, + "loss": 11.7857, + "step": 36619 + }, + { + "epoch": 1.9941053548698888, + "grad_norm": 0.619428771642005, + "learning_rate": 4.535026727758851e-09, + "loss": 11.7916, + "step": 36620 + }, + { + "epoch": 1.994159808866472, + "grad_norm": 0.5241669488118829, + "learning_rate": 4.45143417695526e-09, + "loss": 11.7554, + "step": 36621 + }, + { + "epoch": 1.994214262863055, + "grad_norm": 0.6134667912022922, + "learning_rate": 4.3686192083058904e-09, + "loss": 11.9199, + "step": 36622 + }, + { + "epoch": 1.994268716859638, + "grad_norm": 0.5666420771301303, + "learning_rate": 4.286581822454672e-09, + "loss": 11.688, + "step": 36623 + }, + { + "epoch": 1.994323170856221, + "grad_norm": 0.6716040216061332, + "learning_rate": 4.20532202003443e-09, + "loss": 11.901, + "step": 36624 + }, + { + "epoch": 1.994377624852804, + "grad_norm": 0.5038869191413636, + "learning_rate": 4.124839801689095e-09, + "loss": 11.7773, + "step": 36625 + }, + { + "epoch": 1.994432078849387, + "grad_norm": 0.5304847927368819, + "learning_rate": 4.045135168040392e-09, + "loss": 11.7209, + "step": 36626 + }, + { + "epoch": 1.9944865328459702, + "grad_norm": 0.5834841566192898, + "learning_rate": 3.966208119698944e-09, + "loss": 11.7493, + "step": 36627 + }, + { + "epoch": 1.9945409868425532, + "grad_norm": 0.5297166483552079, + "learning_rate": 3.888058657297577e-09, + "loss": 11.8569, + "step": 36628 + }, + { + "epoch": 1.9945954408391362, + "grad_norm": 0.5238660667981947, + "learning_rate": 3.8106867814136086e-09, + "loss": 11.8502, + "step": 36629 + }, + { + "epoch": 1.9946498948357192, + "grad_norm": 0.5234309470475411, + "learning_rate": 3.734092492679864e-09, + "loss": 11.6249, + "step": 36630 + }, + { + "epoch": 1.9947043488323022, + "grad_norm": 0.5139694921264127, + "learning_rate": 3.658275791684762e-09, + "loss": 11.7672, + "step": 36631 + }, + { + "epoch": 1.9947588028288852, + "grad_norm": 0.5264909707605255, + "learning_rate": 3.583236679005619e-09, + "loss": 11.7835, + "step": 36632 + }, + { + "epoch": 1.9948132568254682, + "grad_norm": 0.5242808653399074, + "learning_rate": 3.5089751552308537e-09, + "loss": 11.6735, + "step": 36633 + }, + { + "epoch": 1.9948677108220512, + "grad_norm": 0.5194141757764744, + "learning_rate": 3.435491220937781e-09, + "loss": 11.7064, + "step": 36634 + }, + { + "epoch": 1.9949221648186342, + "grad_norm": 0.5211938629199598, + "learning_rate": 3.362784876703717e-09, + "loss": 11.7095, + "step": 36635 + }, + { + "epoch": 1.9949766188152172, + "grad_norm": 0.6333757755520796, + "learning_rate": 3.290856123083774e-09, + "loss": 11.7457, + "step": 36636 + }, + { + "epoch": 1.9950310728118001, + "grad_norm": 0.5435958077205914, + "learning_rate": 3.219704960655268e-09, + "loss": 11.7806, + "step": 36637 + }, + { + "epoch": 1.9950855268083831, + "grad_norm": 0.5210460749138116, + "learning_rate": 3.1493313899511046e-09, + "loss": 11.7653, + "step": 36638 + }, + { + "epoch": 1.9951399808049661, + "grad_norm": 0.5117830193675624, + "learning_rate": 3.0797354115263964e-09, + "loss": 11.8055, + "step": 36639 + }, + { + "epoch": 1.9951944348015491, + "grad_norm": 0.514988069970701, + "learning_rate": 3.0109170259251528e-09, + "loss": 11.6839, + "step": 36640 + }, + { + "epoch": 1.9952488887981321, + "grad_norm": 0.5542102234589764, + "learning_rate": 2.942876233680281e-09, + "loss": 11.7812, + "step": 36641 + }, + { + "epoch": 1.995303342794715, + "grad_norm": 0.516034683978556, + "learning_rate": 2.8756130353246867e-09, + "loss": 11.8199, + "step": 36642 + }, + { + "epoch": 1.995357796791298, + "grad_norm": 0.5155532397239551, + "learning_rate": 2.8091274313801762e-09, + "loss": 11.7568, + "step": 36643 + }, + { + "epoch": 1.9954122507878813, + "grad_norm": 0.6329309551328337, + "learning_rate": 2.743419422357452e-09, + "loss": 11.6283, + "step": 36644 + }, + { + "epoch": 1.9954667047844643, + "grad_norm": 0.4963986603524539, + "learning_rate": 2.6784890087672154e-09, + "loss": 11.7111, + "step": 36645 + }, + { + "epoch": 1.9955211587810473, + "grad_norm": 0.6076339871650054, + "learning_rate": 2.61433619112017e-09, + "loss": 11.8352, + "step": 36646 + }, + { + "epoch": 1.9955756127776303, + "grad_norm": 0.5234783332036234, + "learning_rate": 2.5509609699159163e-09, + "loss": 11.7935, + "step": 36647 + }, + { + "epoch": 1.9956300667742133, + "grad_norm": 0.5392005307077706, + "learning_rate": 2.4883633456429524e-09, + "loss": 11.7426, + "step": 36648 + }, + { + "epoch": 1.9956845207707963, + "grad_norm": 0.5887162033689871, + "learning_rate": 2.4265433187897757e-09, + "loss": 11.8386, + "step": 36649 + }, + { + "epoch": 1.9957389747673795, + "grad_norm": 0.5396301072348566, + "learning_rate": 2.365500889844885e-09, + "loss": 11.7071, + "step": 36650 + }, + { + "epoch": 1.9957934287639625, + "grad_norm": 0.575394282904709, + "learning_rate": 2.3052360592745738e-09, + "loss": 11.7565, + "step": 36651 + }, + { + "epoch": 1.9958478827605455, + "grad_norm": 0.770807824973402, + "learning_rate": 2.245748827534033e-09, + "loss": 11.8931, + "step": 36652 + }, + { + "epoch": 1.9959023367571285, + "grad_norm": 0.5378501184092315, + "learning_rate": 2.1870391951117618e-09, + "loss": 11.7475, + "step": 36653 + }, + { + "epoch": 1.9959567907537115, + "grad_norm": 0.5328546146445313, + "learning_rate": 2.129107162451849e-09, + "loss": 11.7567, + "step": 36654 + }, + { + "epoch": 1.9960112447502945, + "grad_norm": 0.5642278048320595, + "learning_rate": 2.0719527300094853e-09, + "loss": 11.7792, + "step": 36655 + }, + { + "epoch": 1.9960656987468774, + "grad_norm": 0.5407460073021433, + "learning_rate": 2.0155758982176588e-09, + "loss": 11.8449, + "step": 36656 + }, + { + "epoch": 1.9961201527434604, + "grad_norm": 0.5918623888172896, + "learning_rate": 1.959976667520458e-09, + "loss": 11.8475, + "step": 36657 + }, + { + "epoch": 1.9961746067400434, + "grad_norm": 0.5208077918237171, + "learning_rate": 1.905155038361972e-09, + "loss": 11.715, + "step": 36658 + }, + { + "epoch": 1.9962290607366264, + "grad_norm": 0.5547570054299232, + "learning_rate": 1.8511110111529839e-09, + "loss": 11.7199, + "step": 36659 + }, + { + "epoch": 1.9962835147332094, + "grad_norm": 0.5406713324055593, + "learning_rate": 1.7978445863153782e-09, + "loss": 11.8062, + "step": 36660 + }, + { + "epoch": 1.9963379687297924, + "grad_norm": 0.5336696006867239, + "learning_rate": 1.7453557642710393e-09, + "loss": 11.8141, + "step": 36661 + }, + { + "epoch": 1.9963924227263754, + "grad_norm": 0.5206807405477759, + "learning_rate": 1.6936445454307504e-09, + "loss": 11.7819, + "step": 36662 + }, + { + "epoch": 1.9964468767229584, + "grad_norm": 0.5767948374382369, + "learning_rate": 1.642710930183089e-09, + "loss": 11.772, + "step": 36663 + }, + { + "epoch": 1.9965013307195414, + "grad_norm": 0.585163110915863, + "learning_rate": 1.5925549189388377e-09, + "loss": 11.7973, + "step": 36664 + }, + { + "epoch": 1.9965557847161244, + "grad_norm": 0.5016347834194185, + "learning_rate": 1.5431765120754727e-09, + "loss": 11.7421, + "step": 36665 + }, + { + "epoch": 1.9966102387127074, + "grad_norm": 0.517762633363895, + "learning_rate": 1.4945757099815716e-09, + "loss": 11.7985, + "step": 36666 + }, + { + "epoch": 1.9966646927092904, + "grad_norm": 0.5241380060054428, + "learning_rate": 1.4467525130346105e-09, + "loss": 11.6881, + "step": 36667 + }, + { + "epoch": 1.9967191467058736, + "grad_norm": 0.5181787526820097, + "learning_rate": 1.399706921612065e-09, + "loss": 11.7816, + "step": 36668 + }, + { + "epoch": 1.9967736007024566, + "grad_norm": 0.5505127884394181, + "learning_rate": 1.3534389360692068e-09, + "loss": 11.6572, + "step": 36669 + }, + { + "epoch": 1.9968280546990396, + "grad_norm": 0.5693830525480125, + "learning_rate": 1.3079485567724093e-09, + "loss": 11.8402, + "step": 36670 + }, + { + "epoch": 1.9968825086956226, + "grad_norm": 0.5860973761223902, + "learning_rate": 1.2632357840769438e-09, + "loss": 11.8876, + "step": 36671 + }, + { + "epoch": 1.9969369626922056, + "grad_norm": 0.5704348010003918, + "learning_rate": 1.2193006183269794e-09, + "loss": 11.8041, + "step": 36672 + }, + { + "epoch": 1.9969914166887888, + "grad_norm": 0.6279119288513697, + "learning_rate": 1.1761430598666856e-09, + "loss": 11.8818, + "step": 36673 + }, + { + "epoch": 1.9970458706853718, + "grad_norm": 0.5198706796872268, + "learning_rate": 1.133763109029129e-09, + "loss": 11.7613, + "step": 36674 + }, + { + "epoch": 1.9971003246819548, + "grad_norm": 0.6034201104231257, + "learning_rate": 1.0921607661473765e-09, + "loss": 11.8964, + "step": 36675 + }, + { + "epoch": 1.9971547786785377, + "grad_norm": 0.49949102841187076, + "learning_rate": 1.051336031543393e-09, + "loss": 11.7939, + "step": 36676 + }, + { + "epoch": 1.9972092326751207, + "grad_norm": 0.5363153137472171, + "learning_rate": 1.0112889055391428e-09, + "loss": 11.7587, + "step": 36677 + }, + { + "epoch": 1.9972636866717037, + "grad_norm": 0.5810910374213107, + "learning_rate": 9.720193884343865e-10, + "loss": 11.724, + "step": 36678 + }, + { + "epoch": 1.9973181406682867, + "grad_norm": 0.5210939134204752, + "learning_rate": 9.335274805399863e-10, + "loss": 11.6994, + "step": 36679 + }, + { + "epoch": 1.9973725946648697, + "grad_norm": 0.5381576891492463, + "learning_rate": 8.958131821557025e-10, + "loss": 11.8286, + "step": 36680 + }, + { + "epoch": 1.9974270486614527, + "grad_norm": 0.530220337335392, + "learning_rate": 8.588764935812954e-10, + "loss": 11.755, + "step": 36681 + }, + { + "epoch": 1.9974815026580357, + "grad_norm": 0.5728679182487828, + "learning_rate": 8.227174150943206e-10, + "loss": 11.787, + "step": 36682 + }, + { + "epoch": 1.9975359566546187, + "grad_norm": 0.5205716047367589, + "learning_rate": 7.87335946972334e-10, + "loss": 11.8618, + "step": 36683 + }, + { + "epoch": 1.9975904106512017, + "grad_norm": 0.5241118339580726, + "learning_rate": 7.527320895150958e-10, + "loss": 11.5183, + "step": 36684 + }, + { + "epoch": 1.9976448646477847, + "grad_norm": 0.5088502198791645, + "learning_rate": 7.189058429557527e-10, + "loss": 11.7576, + "step": 36685 + }, + { + "epoch": 1.9976993186443677, + "grad_norm": 0.5470162434631054, + "learning_rate": 6.85857207594065e-10, + "loss": 11.6734, + "step": 36686 + }, + { + "epoch": 1.9977537726409507, + "grad_norm": 0.5110775201370584, + "learning_rate": 6.535861836520773e-10, + "loss": 11.7879, + "step": 36687 + }, + { + "epoch": 1.9978082266375337, + "grad_norm": 0.5608037502661866, + "learning_rate": 6.220927714073455e-10, + "loss": 11.6387, + "step": 36688 + }, + { + "epoch": 1.9978626806341166, + "grad_norm": 0.5188198153542338, + "learning_rate": 5.913769710930161e-10, + "loss": 11.8053, + "step": 36689 + }, + { + "epoch": 1.9979171346306996, + "grad_norm": 0.5757468699154831, + "learning_rate": 5.614387829533385e-10, + "loss": 11.568, + "step": 36690 + }, + { + "epoch": 1.9979715886272829, + "grad_norm": 0.5732020279908595, + "learning_rate": 5.322782072103571e-10, + "loss": 11.7973, + "step": 36691 + }, + { + "epoch": 1.9980260426238658, + "grad_norm": 0.5661612494643344, + "learning_rate": 5.038952441083211e-10, + "loss": 11.8387, + "step": 36692 + }, + { + "epoch": 1.9980804966204488, + "grad_norm": 0.5364384218893937, + "learning_rate": 4.762898938470706e-10, + "loss": 11.7906, + "step": 36693 + }, + { + "epoch": 1.9981349506170318, + "grad_norm": 0.6917035468321034, + "learning_rate": 4.494621566597523e-10, + "loss": 11.8456, + "step": 36694 + }, + { + "epoch": 1.9981894046136148, + "grad_norm": 0.5582074763770993, + "learning_rate": 4.2341203274620653e-10, + "loss": 11.7563, + "step": 36695 + }, + { + "epoch": 1.9982438586101978, + "grad_norm": 0.5400802746154967, + "learning_rate": 3.981395223062734e-10, + "loss": 11.8059, + "step": 36696 + }, + { + "epoch": 1.998298312606781, + "grad_norm": 0.6784869398514226, + "learning_rate": 3.736446255508952e-10, + "loss": 11.8526, + "step": 36697 + }, + { + "epoch": 1.998352766603364, + "grad_norm": 0.5680346951485059, + "learning_rate": 3.499273426466054e-10, + "loss": 11.8861, + "step": 36698 + }, + { + "epoch": 1.998407220599947, + "grad_norm": 0.5160730365803441, + "learning_rate": 3.2698767379324426e-10, + "loss": 11.7039, + "step": 36699 + }, + { + "epoch": 1.99846167459653, + "grad_norm": 0.5123216668387516, + "learning_rate": 3.0482561917954954e-10, + "loss": 11.7522, + "step": 36700 + }, + { + "epoch": 1.998516128593113, + "grad_norm": 0.5254371195560049, + "learning_rate": 2.8344117894985036e-10, + "loss": 11.8134, + "step": 36701 + }, + { + "epoch": 1.998570582589696, + "grad_norm": 0.5854105432065376, + "learning_rate": 2.6283435329288453e-10, + "loss": 11.7867, + "step": 36702 + }, + { + "epoch": 1.998625036586279, + "grad_norm": 0.5862551505034992, + "learning_rate": 2.4300514236408336e-10, + "loss": 11.9344, + "step": 36703 + }, + { + "epoch": 1.998679490582862, + "grad_norm": 0.5140052659654867, + "learning_rate": 2.23953546318878e-10, + "loss": 11.7432, + "step": 36704 + }, + { + "epoch": 1.998733944579445, + "grad_norm": 0.5604743133364141, + "learning_rate": 2.0567956529049527e-10, + "loss": 11.7821, + "step": 36705 + }, + { + "epoch": 1.998788398576028, + "grad_norm": 0.5240572928921524, + "learning_rate": 1.881831994454686e-10, + "loss": 11.6996, + "step": 36706 + }, + { + "epoch": 1.998842852572611, + "grad_norm": 0.5405593267654587, + "learning_rate": 1.7146444889482026e-10, + "loss": 11.711, + "step": 36707 + }, + { + "epoch": 1.998897306569194, + "grad_norm": 0.5587512939355626, + "learning_rate": 1.5552331379398155e-10, + "loss": 11.8544, + "step": 36708 + }, + { + "epoch": 1.998951760565777, + "grad_norm": 0.5475112401914279, + "learning_rate": 1.403597942428725e-10, + "loss": 11.7343, + "step": 36709 + }, + { + "epoch": 1.99900621456236, + "grad_norm": 0.5490761635001964, + "learning_rate": 1.2597389037471984e-10, + "loss": 11.8575, + "step": 36710 + }, + { + "epoch": 1.999060668558943, + "grad_norm": 0.5125875521480342, + "learning_rate": 1.1236560230054593e-10, + "loss": 11.6728, + "step": 36711 + }, + { + "epoch": 1.999115122555526, + "grad_norm": 0.5796859431852404, + "learning_rate": 9.953493010916859e-11, + "loss": 11.8984, + "step": 36712 + }, + { + "epoch": 1.999169576552109, + "grad_norm": 0.5733528789181468, + "learning_rate": 8.748187392271234e-11, + "loss": 11.8825, + "step": 36713 + }, + { + "epoch": 1.9992240305486921, + "grad_norm": 0.5596695605625579, + "learning_rate": 7.620643382999503e-11, + "loss": 11.7445, + "step": 36714 + }, + { + "epoch": 1.9992784845452751, + "grad_norm": 0.519350111406422, + "learning_rate": 6.570860990873229e-11, + "loss": 11.7687, + "step": 36715 + }, + { + "epoch": 1.9993329385418581, + "grad_norm": 0.5701236866048116, + "learning_rate": 5.598840224774193e-11, + "loss": 11.8148, + "step": 36716 + }, + { + "epoch": 1.999387392538441, + "grad_norm": 0.554621853887413, + "learning_rate": 4.7045810924739586e-11, + "loss": 11.7872, + "step": 36717 + }, + { + "epoch": 1.999441846535024, + "grad_norm": 0.531288056186138, + "learning_rate": 3.888083600633863e-11, + "loss": 11.7555, + "step": 36718 + }, + { + "epoch": 1.999496300531607, + "grad_norm": 0.541802893876769, + "learning_rate": 3.149347754805021e-11, + "loss": 11.7749, + "step": 36719 + }, + { + "epoch": 1.9995507545281903, + "grad_norm": 0.6008596151002624, + "learning_rate": 2.4883735627589944e-11, + "loss": 11.77, + "step": 36720 + }, + { + "epoch": 1.9996052085247733, + "grad_norm": 0.4876281226770037, + "learning_rate": 1.905161026716229e-11, + "loss": 11.7265, + "step": 36721 + }, + { + "epoch": 1.9996596625213563, + "grad_norm": 0.5839723800419259, + "learning_rate": 1.399710154448286e-11, + "loss": 11.7743, + "step": 36722 + }, + { + "epoch": 1.9997141165179393, + "grad_norm": 0.570160454135997, + "learning_rate": 9.720209470653885e-12, + "loss": 11.7272, + "step": 36723 + }, + { + "epoch": 1.9997685705145223, + "grad_norm": 0.5308431239774418, + "learning_rate": 6.220934101186515e-12, + "loss": 11.7632, + "step": 36724 + }, + { + "epoch": 1.9998230245111053, + "grad_norm": 0.5191686533476448, + "learning_rate": 3.4992754471829813e-12, + "loss": 11.7697, + "step": 36725 + }, + { + "epoch": 1.9998774785076883, + "grad_norm": 0.525138338749835, + "learning_rate": 1.5552335419499743e-12, + "loss": 11.7664, + "step": 36726 + }, + { + "epoch": 1.9999319325042713, + "grad_norm": 0.5327611039604181, + "learning_rate": 3.8880838548749357e-13, + "loss": 11.6781, + "step": 36727 + }, + { + "epoch": 1.9999863865008543, + "grad_norm": 0.5408051043597044, + "learning_rate": 0.0, + "loss": 11.8148, + "step": 36728 + }, + { + "epoch": 1.9999863865008543, + "step": 36728, + "total_flos": 4582719071862784.0, + "train_loss": 6.056929863945536, + "train_runtime": 133830.3994, + "train_samples_per_second": 35.128, + "train_steps_per_second": 0.274 + } + ], + "logging_steps": 1.0, + "max_steps": 36728, + "num_input_tokens_seen": 0, + "num_train_epochs": 2, + "save_steps": 250, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 4582719071862784.0, + "train_batch_size": 2, + "trial_name": null, + "trial_params": null +}